Skip to content

read_in_data

Read in data from a file and return a dataframe with the following columns: gene_id, {binding/expression}_effect, {binding/expression}_pvalue, source

Parameters:

Name Type Description Default
data_path str

path to the data file

required
identifier_col str

name of the feature identifier column in the data

required
effect_col str

name of the effect column in the data

required
pval_col str

name of the pvalue column in the data

required
source str

source of the data

required
data_type str

type of data, either ‘binding’ or ‘expression’

required

Returns:

Type Description
DataFrame

pd.DataFrame: dataframe with the following columns: feature, {binding/expression}_effect, {binding/expression}_pvalue, source

Raises:

Type Description
FileExistsError

if data_path does not exist

KeyError

if identifier_col, effect_col, or pval_col is not in the data, or if the identifier_col is something other than feature and the column feature also exists in the data

AttributeError

if there are NA values in the effect or pvalue columns

Source code in callingcardstools/Analysis/yeast/rank_response/read_in_data.py
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
def read_in_data(
        data_path: str,
        identifier_col: str,
        effect_col: str,
        pval_col: str,
        source: str,
        data_type: Literal['binding', 'expression']) -> pd.DataFrame:
    """
    Read in data from a file and return a dataframe with
    the following columns: gene_id, {binding/expression}_effect,
    {binding/expression}_pvalue, source

    Args:
        data_path (str): path to the data file
        identifier_col (str): name of the feature identifier column in the data
        effect_col (str): name of the effect column in the data
        pval_col (str): name of the pvalue column in the data
        source (str): source of the data
        data_type (str): type of data, either 'binding' or 'expression'

    Returns:
        pd.DataFrame: dataframe with the following columns:
            feature, {binding/expression}_effect, {binding/expression}_pvalue,
            source

    Raises:
        FileExistsError: if data_path does not exist
        KeyError: if identifier_col, effect_col, or pval_col is not in the
            data, or if the `identifier_col` is something other than `feature`
            and the column `feature` also exists in the data
        AttributeError: if there are NA values in the effect or pvalue columns
    """
    if not os.path.exists(data_path):
        raise FileExistsError(f"{data_path} does not exist")

    compressed = data_path.endswith('.gz')
    logger.debug("data compressed: %s", compressed)

    sep = '\t' if any(data_path.endswith(ext)
                      for ext in ['.tsv', '.txt', '.tsv.gz', 'txt.gz']) \
        else ','
    logger.debug("data separator: %s", sep)

    df = pd.read_csv(
        data_path,
        sep=sep,
        compression='gzip' if compressed else None)

    if identifier_col not in df.columns:
        raise KeyError(f"Column `{identifier_col}` is not in {data_path}")
    if 'feature' in df.columns and identifier_col != 'feature':
        raise KeyError(f"Column `feature` exists in the data, but is not the "
                       f"`identifier_col` {identifier_col}. Please rename the "
                       f"current `feature` column to avoid confusion.")

    try:
        effect_colname = data_type + '_effect'
        # Assuming df is your DataFrame and effect_col is a variable
        # indicating column name
        df[effect_colname] = df[effect_col] if effect_col else float('inf')

        # Check for NA values in the effect_colname
        if pd.isna(df[effect_colname]).any():
            raise AttributeError(f"NA values found in column {effect_colname}."
                                 " This must not be.")
    except KeyError as exc:
        raise KeyError(f"Column {effect_col} is not `none` and "
                       "does not exist in {data_path}") from exc

    try:
        pval_colname = data_type + '_pvalue'
        df[pval_colname] = df[pval_col] if pval_col else 0.0

        # Check for NA values in the pval_colname
        if pd.isna(df[pval_colname]).any():
            raise AttributeError(f"NA values found in column {pval_colname}. "
                                 "This must not be.")
    except KeyError as exc:
        raise KeyError(f"Column {pval_col} is not `none` and "
                       "does not exist in {data_path}") from exc

    source_colname = data_type + '_source'
    df[source_colname] = source

    df.rename(columns={identifier_col: 'feature'}, inplace=True)

    return df[['feature', effect_colname, pval_colname, source_colname]]