Read in the chrmap data from the `chrmap_data_path` and check that the `required_col_set` is a subset of the columns in the chrmap dataframe.
:param chrmap_data_path: Path to the chrmap file.
:type chrmap_data_path: str
:param required_col_set: The required columns in the chrmap dataframe.
:type required_col_set: set
:return: The chrmap dataframe.
:rtype: pd.DataFrame
:raises ValueError: If the `chrmap_data_path` does not exist or is not a file; if the `required_col_set` is not a subset of the columns in the chrmap dataframe.
:Example:
>>> import pandas as pd
>>> import os
>>> import tempfile
>>> tmp_chrmap = tempfile.NamedTemporaryFile(suffix='.csv').name
>>> with open(tmp_chrmap, 'w') as f:
...     _ = f.write('experiment_orig_chr_convention,'
...                 'promoter_orig_chr_convention,'
...                 'background_orig_chr_convention,'
...                 'unified_chr_convention\n')
...     _ = f.write('chr1,chr1,chr1,chrI\n')
>>> chrmap_df = read_in_chrmap(tmp_chrmap,
...                            {'experiment_orig_chr_convention',
...                             'promoter_orig_chr_convention',
...                             'background_orig_chr_convention',
...                             'unified_chr_convention'})
>>> list(chrmap_df.columns) == ['experiment_orig_chr_convention',
...                             'promoter_orig_chr_convention',
...                             'background_orig_chr_convention',
...                             'unified_chr_convention']
True
Source code in callingcardstools/PeakCalling/yeast/read_in_data.py
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
def read_in_chrmap(chrmap_data_path: str, required_col_set: set) -> pd.DataFrame:
    """
    Read in the chrmap data from the `chrmap_data_path` and check that the
    `required_col_set` is a subset of the columns in the chrmap dataframe.

    A `type` column is always required in addition to the columns named in
    `required_col_set`. The caller's `required_col_set` is not modified.
    Every column of the returned dataframe is cast to `str`.

    :param chrmap_data_path: Path to the chrmap file.
    :type chrmap_data_path: str
    :param required_col_set: The required columns in the chrmap dataframe.
    :type required_col_set: set

    :return: The chrmap dataframe, with all columns cast to `str`.
    :rtype: pd.DataFrame

    :raises ValueError: If the `chrmap_data_path` does not exist or
        is not a file; if the chrmap has no rows or fewer than two
        columns; if the `required_col_set` (plus `type`) is not a subset
        of the columns in the chrmap dataframe.

    :Example:

    >>> import pandas as pd
    >>> import os
    >>> import tempfile
    >>> tmp_chrmap = tempfile.NamedTemporaryFile(suffix='.csv').name
    >>> with open(tmp_chrmap, 'w') as f:
    ...     _ = f.write('experiment_orig_chr_convention,'
    ...                 'promoter_orig_chr_convention,'
    ...                 'background_orig_chr_convention,'
    ...                 'unified_chr_convention,'
    ...                 'type\\n')
    ...     _ = f.write('chr1,chr1,chr1,chrI,genomic\\n')
    >>> required = {'experiment_orig_chr_convention',
    ...             'promoter_orig_chr_convention',
    ...             'background_orig_chr_convention',
    ...             'unified_chr_convention'}
    >>> chrmap_df = read_in_chrmap(tmp_chrmap, required)
    >>> list(chrmap_df.columns) == ['experiment_orig_chr_convention',
    ...                             'promoter_orig_chr_convention',
    ...                             'background_orig_chr_convention',
    ...                             'unified_chr_convention',
    ...                             'type']
    True
    >>> 'type' in required
    False
    """
    if not os.path.exists(chrmap_data_path):
        raise ValueError(f"chrmap_data_path does " f"not exist: {chrmap_data_path}")
    # The documented contract also rejects paths that exist but are not
    # regular files (e.g. a directory), so check that explicitly instead of
    # letting pd.read_csv fail with a non-ValueError exception.
    if not os.path.isfile(chrmap_data_path):
        raise ValueError(f"chrmap_data_path is not a file: {chrmap_data_path}")

    # Work on a copy: adding "type" directly to `required_col_set` would
    # silently change the set object owned by the caller.
    required_cols = set(required_col_set) | {"type"}

    # read in the chr map
    chrmap_df = pd.read_csv(chrmap_data_path)

    # raise an error if the nrow of the chrmap is 0
    if chrmap_df.shape[0] == 0:
        raise ValueError("The chrmap file is empty")

    # raise an error if the ncol of the chrmap is less than 2
    if chrmap_df.shape[1] < 2:
        # Adjacent literals are concatenated into ONE message string; commas
        # between them would turn the exception args into a tuple.
        raise ValueError(
            "The chrmap file must have at least two columns, "
            "one for the original chromosome name and one for "
            "the new chromosome name"
        )

    # raise an error if any required column (including "type") is not in
    # the chrmap_df columns
    missing_chr_cols = required_cols.difference(chrmap_df.columns)
    if missing_chr_cols:
        raise ValueError(
            f"The following chromosome columns are missing "
            f"from the chrmap file: {missing_chr_cols}"
        )

    # cast all columns to str
    return chrmap_df.astype("str")
|