Skip to content

read_in_data

Read in data from a file and return a dataframe with the following columns: feature, {binding/expression}_effect, {binding/expression}_pvalue, {binding/expression}_source

Parameters:

Name Type Description Default
data_path str

path to the data file

required
identifier_col str

name of the feature identifier column in the data

required
effect_col str

name of the effect column in the data

required
pval_col str

name of the pvalue column in the data

required
source str

source of the data

required
data_type str

type of data, either ‘binding’ or ‘expression’

required

Returns:

Type Description
DataFrame

pd.DataFrame: dataframe with the following columns: feature, {binding/expression}_effect, {binding/expression}_pvalue, {binding/expression}_source

Raises:

Type Description
FileExistsError

if data_path does not exist

KeyError

if identifier_col, effect_col, or pval_col is not in the data, or if the identifier_col is something other than feature and the column feature also exists in the data

AttributeError

if there are NA values in the effect or pvalue columns

Source code in callingcardstools/Analysis/yeast/read_in_data.py
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
def read_in_data(
    data_path: str,
    identifier_col: str,
    effect_col: str,
    pval_col: str,
    source: str,
    data_type: Literal["binding", "expression"],
) -> pd.DataFrame:
    """
    Read a delimited file and return a standardized four-column dataframe.

    The file is read as tab-separated when its name ends in `.tsv`, `.txt`,
    `.tsv.gz` or `.txt.gz`, and as comma-separated otherwise. Names ending in
    `.gz` are read with gzip decompression.

    Args:
        data_path (str): path to the data file
        identifier_col (str): name of the feature identifier column in the data
        effect_col (str): name of the effect column in the data. If falsy
            (e.g. None or ""), the effect column is filled with `inf`
        pval_col (str): name of the pvalue column in the data. If falsy
            (e.g. None or ""), the pvalue column is filled with `0.0`
        source (str): source of the data
        data_type (str): type of data, either 'binding' or 'expression'.
            Used as the prefix of the output column names

    Returns:
        pd.DataFrame: dataframe with the following columns:
            feature, {data_type}_effect, {data_type}_pvalue,
            {data_type}_source

    Raises:
        FileExistsError: if data_path does not exist
        KeyError: if identifier_col, effect_col, or pval_col is not in the
            data, or if the `identifier_col` is something other than `feature`
            and the column `feature` also exists in the data
        AttributeError: if there are NA values in the effect or pvalue columns
    """
    # NOTE: FileNotFoundError would be the conventional choice, but the
    # exception type is kept because callers may already catch it.
    if not os.path.exists(data_path):
        raise FileExistsError(f"{data_path} does not exist")

    compressed = data_path.endswith(".gz")
    logger.debug("data compressed: %s", compressed)

    tab_extensions = (".tsv", ".txt", ".tsv.gz", ".txt.gz")
    sep = "\t" if data_path.endswith(tab_extensions) else ","
    logger.debug("data separator: %s", sep)

    df = pd.read_csv(data_path, sep=sep, compression="gzip" if compressed else None)

    if identifier_col not in df.columns:
        raise KeyError(f"Column `{identifier_col}` is not in {data_path}")
    # Renaming identifier_col to `feature` below would otherwise produce two
    # columns with the same name.
    if "feature" in df.columns and identifier_col != "feature":
        raise KeyError(
            f"Column `feature` exists in the data, but is not the "
            f"`identifier_col` {identifier_col}. Please rename the "
            f"current `feature` column to avoid confusion."
        )

    effect_colname = data_type + "_effect"
    if effect_col and effect_col not in df.columns:
        raise KeyError(
            f"Column {effect_col} is not `none` and does not exist in {data_path}"
        )
    # A falsy effect_col means "no effect column": fill with a constant.
    df[effect_colname] = df[effect_col] if effect_col else float("inf")
    if pd.isna(df[effect_colname]).any():
        raise AttributeError(
            f"NA values found in column {effect_colname}. This must not be."
        )

    pval_colname = data_type + "_pvalue"
    if pval_col and pval_col not in df.columns:
        raise KeyError(
            f"Column {pval_col} is not `none` and does not exist in {data_path}"
        )
    # A falsy pval_col means "no pvalue column": fill with a constant.
    df[pval_colname] = df[pval_col] if pval_col else 0.0
    if pd.isna(df[pval_colname]).any():
        raise AttributeError(
            f"NA values found in column {pval_colname}. This must not be."
        )

    if np.isinf(df[pval_colname]).any():
        # Look only at the finite values to decide whether the column is logged.
        finite_pvals = df[pval_colname].replace([np.inf, -np.inf], np.nan).dropna()

        # If no finite value is positive and at least one is negative, the
        # column is treated as log-transformed p-values. A positive infinity
        # is then assumed to be a sign error for log(0) and is flipped to
        # negative infinity (this is for the raw chipexo data).
        if (
            not finite_pvals.empty
            and finite_pvals.max() <= 0
            and finite_pvals.min() < 0
        ):
            logger.warning(
                "The pvalue column is logged, but there exist "
                "positive infinity values. This is assumed to be an error. "
                "Converting positive infinity to negative infinity."
            )
            # Assign the result back instead of using a chained inplace
            # replace, which may operate on a temporary copy.
            df[pval_colname] = df[pval_colname].replace(np.inf, -np.inf)

    source_colname = data_type + "_source"
    df[source_colname] = source

    df = df.rename(columns={identifier_col: "feature"})

    return df[["feature", effect_colname, pval_colname, source_colname]]