Given a `data_df` with column `chr`, and a `curr_chr_name_convention` and a `new_chr_name_convention` that are both columns of `chrmap_df`, join `chrmap_df` to `data_df` on the `curr_chr_name_convention` and swap the values in the `chr` column to the `new_chr_name_convention`. Relabel the `new_chr_name_convention` column to `chr` and return the dataframe with columns in the same order as the input dataframe. If `genomic_only` is set to `True`, only those chromosomes with type == 'genomic' are returned.
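:param data_df: The dataframe to relabel. Must have a `chr` column.
:type data_df: pd.DataFrame
:param chrmap_df: The chromosome map. Must contain both naming conventions as columns, plus the `type` column used by the `genomic_only` filter.
:type chrmap_df: pd.DataFrame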
:param curr_chr_name_convention: The current chromosome name convention.
:type curr_chr_name_convention: str
:param new_chr_name_convention: The new chromosome name convention.
:type new_chr_name_convention: str
:param genomic_only: Whether to return only records with type == "genomic".
:type genomic_only: bool
:return: The relabeled dataframe.
:rtype: pd.DataFrame
:raises ValueError: If the `curr_chr_name_convention` or `new_chr_name_convention` are not columns in `chrmap_df`, or if `chrmap_df` already has a column named `chr_curr`.
:Example:
>>> import pandas as pd
>>> data_df = pd.DataFrame({'chr': ['chr1', 'chr2', 'chr3'],
...                         'start': [1, 2, 3]})
>>> chrmap_df = pd.DataFrame({'curr_chr_name_convention':
...                           ['chr1', 'chr2', 'chr3'],
...                           'new_chr_name_convention':
...                           ['chrI', 'chrII', 'chrIII'],
...                           'type': ['genomic', 'genomic', 'genomic']})
>>> relabeled_df = relabel_chr_column(data_df, chrmap_df,
...                                   'curr_chr_name_convention',
...                                   'new_chr_name_convention')
>>> list(relabeled_df.columns) == ['chr', 'start']
True
>>> list(relabeled_df['chr']) == ['chrI', 'chrII', 'chrIII']
True
Source code in callingcardstools/PeakCalling/yeast/read_in_data.py
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
def relabel_chr_column(
    data_df: pd.DataFrame,
    chrmap_df: pd.DataFrame,
    curr_chr_name_convention: str,
    new_chr_name_convention: str,
    genomic_only: bool = True,
) -> pd.DataFrame:
    """
    Translate the `chr` column of `data_df` from one naming convention to another.

    `curr_chr_name_convention` and `new_chr_name_convention` must both be
    columns of `chrmap_df`. `chrmap_df` is inner-joined to `data_df` by
    matching `data_df['chr']` against the `curr_chr_name_convention` column,
    and the values in `chr` are replaced by the corresponding values from the
    `new_chr_name_convention` column. Rows of `data_df` whose `chr` value has
    no match in `chrmap_df` are dropped by the inner join. The returned
    dataframe has the same columns, in the same order, as `data_df`; the
    input dataframes are not modified.

    If `genomic_only` is `True`, only rows whose chromosome has
    type == 'genomic' in `chrmap_df` are returned.

    :param data_df: The dataframe to relabel. Must have a `chr` column.
    :type data_df: pd.DataFrame
    :param chrmap_df: The chromosome map. Must contain the
        `curr_chr_name_convention` and `new_chr_name_convention` columns and,
        when `genomic_only` is `True`, a `type` column.
    :type chrmap_df: pd.DataFrame
    :param curr_chr_name_convention: The current chromosome name convention.
    :type curr_chr_name_convention: str
    :param new_chr_name_convention: The new chromosome name convention.
    :type new_chr_name_convention: str
    :param genomic_only: Whether to return only records with type == "genomic".
    :type genomic_only: bool

    :return: The relabeled dataframe.
    :rtype: pd.DataFrame

    :raises ValueError: If the `curr_chr_name_convention` or
        `new_chr_name_convention` are not columns in `chrmap_df`, if
        `chrmap_df` has a column named `chr_curr`, or if `genomic_only` is
        `True` and `chrmap_df` has no `type` column.

    :Example:

    >>> import pandas as pd
    >>> data_df = pd.DataFrame({'chr': ['chr1', 'chr2', 'chr3'],
    ...                         'start': [1, 2, 3]})
    >>> chrmap_df = pd.DataFrame({'curr_chr_name_convention':
    ...                           ['chr1', 'chr2', 'chr3'],
    ...                           'new_chr_name_convention':
    ...                           ['chrI', 'chrII', 'chrIII'],
    ...                           'type': ['genomic', 'genomic', 'genomic']})
    >>> relabeled_df = relabel_chr_column(data_df, chrmap_df,
    ...                                   'curr_chr_name_convention',
    ...                                   'new_chr_name_convention')
    >>> list(relabeled_df.columns) == ['chr', 'start']
    True
    >>> list(relabeled_df['chr']) == ['chrI', 'chrII', 'chrIII']
    True
    """
    # check input
    # NOTE: the `chr_curr` restriction is kept so existing callers see the
    # same error as before, even though the join below no longer relies on a
    # column of that name.
    if "chr_curr" in chrmap_df.columns:
        raise ValueError(
            "chr_curr cannot be a column in chrmap_df for the "
            "purposes of relabelling. rename that column in "
            "chrmap_df and resubmit"
        )
    if curr_chr_name_convention not in chrmap_df.columns:
        raise ValueError("curr_chr_name_convention must be a column in chrmap_df")
    if new_chr_name_convention not in chrmap_df.columns:
        raise ValueError("new_chr_name_convention must be a column in chrmap_df")
    # `type` is only needed for the genomic filter, so only demand it then
    if genomic_only and "type" not in chrmap_df.columns:
        raise ValueError(
            "chrmap_df must have a 'type' column when genomic_only is True"
        )

    # Build the lookup table under private column names so that nothing
    # merged in can collide with (and get suffixed against) a column that
    # already exists in data_df, e.g. a data column called `type` or one
    # named after a convention.
    key_col = "_chrmap_curr"
    new_col = "_chrmap_new"
    type_col = "_chrmap_type"

    lookup = chrmap_df[[curr_chr_name_convention]].rename(
        columns={curr_chr_name_convention: key_col}
    )
    lookup[new_col] = chrmap_df[new_chr_name_convention]
    if genomic_only:
        lookup[type_col] = chrmap_df["type"]

    # inner join: left columns keep their order, lookup columns are appended
    relabeled = data_df.merge(lookup, left_on="chr", right_on=key_col, how="inner")

    # swap values in chr column (a no-op when both conventions are the same)
    relabeled["chr"] = relabeled[new_col]

    if genomic_only:
        relabeled = relabeled[relabeled[type_col] == "genomic"]

    # remove the helper columns, leaving exactly the input columns
    return relabeled.drop(columns=list(lookup.columns))
|