read_in_chrmap

Read in the chrmap data from the chrmap_data_path and check that the required_col_set is a subset of the columns in the chrmap dataframe.

:param chrmap_data_path: Path to the chrmap file.
:type chrmap_data_path: str
:param required_col_set: The required columns in the chrmap dataframe.
:type required_col_set: set
:return: The chrmap dataframe.
:rtype: pd.DataFrame

:raises ValueError: If the chrmap_data_path does not exist or is not a file; if the required_col_set is not a subset of the columns in the chrmap dataframe.

:Example:

>>> import pandas as pd
>>> import os
>>> import tempfile
>>> tmp_chrmap = tempfile.NamedTemporaryFile(suffix='.csv').name
>>> with open(tmp_chrmap, 'w') as f:
...    _ = f.write('experiment_orig_chr_convention,'
...                'promoter_orig_chr_convention,'
...                'background_orig_chr_convention,'
...                'unified_chr_convention,'
...                'type\n')
...    _ = f.write('chr1,chr1,chr1,chrI,genomic\n')
>>> chrmap_df = read_in_chrmap(tmp_chrmap,
...                            {'experiment_orig_chr_convention',
...                             'promoter_orig_chr_convention',
...                             'background_orig_chr_convention',
...                             'unified_chr_convention'})
>>> list(chrmap_df.columns) == ['experiment_orig_chr_convention',
...                             'promoter_orig_chr_convention',
...                             'background_orig_chr_convention',
...                             'unified_chr_convention',
...                             'type']
True

Source code in callingcardstools/PeakCalling/yeast/read_in_data.py

def read_in_chrmap(chrmap_data_path: str, required_col_set: set) -> pd.DataFrame:
    """
    Read in the chrmap data from the `chrmap_data_path` and check that the
    `required_col_set` is a subset of the columns in the chrmap dataframe.

    In addition to the columns named in `required_col_set`, a column called
    ``type`` is always required. The caller's `required_col_set` is not
    modified.

    :param chrmap_data_path: Path to the chrmap file (read with
        ``pd.read_csv`` using its default settings).
    :type chrmap_data_path: str
    :param required_col_set: The required columns in the chrmap dataframe.
    :type required_col_set: set
    :return: The chrmap dataframe, with every column cast to ``str``.
    :rtype: pd.DataFrame

    :raises ValueError: If the `chrmap_data_path` does not exist or
        is not a file; if the chrmap has no rows or fewer than two columns;
        if the `required_col_set` (plus ``type``) is not a subset of the
        columns in the chrmap dataframe.

    :Example:

    >>> import os
    >>> import tempfile
    >>> required = {'experiment_orig_chr_convention',
    ...             'promoter_orig_chr_convention',
    ...             'background_orig_chr_convention',
    ...             'unified_chr_convention'}
    >>> with tempfile.TemporaryDirectory() as tmp_dir:
    ...     tmp_chrmap = os.path.join(tmp_dir, 'chrmap.csv')
    ...     with open(tmp_chrmap, 'w') as f:
    ...         _ = f.write('experiment_orig_chr_convention,'
    ...                     'promoter_orig_chr_convention,'
    ...                     'background_orig_chr_convention,'
    ...                     'unified_chr_convention,'
    ...                     'type\\n')
    ...         _ = f.write('chr1,chr1,chr1,chrI,genomic\\n')
    ...     chrmap_df = read_in_chrmap(tmp_chrmap, required)
    >>> list(chrmap_df.columns) == ['experiment_orig_chr_convention',
    ...                             'promoter_orig_chr_convention',
    ...                             'background_orig_chr_convention',
    ...                             'unified_chr_convention',
    ...                             'type']
    True
    >>> 'type' in required
    False
    """
    if not os.path.exists(chrmap_data_path):
        raise ValueError(f"chrmap_data_path does not exist: {chrmap_data_path}")
    # A directory passes the existence check but cannot be parsed; reject it
    # here so the documented ValueError is raised instead of an OSError.
    if not os.path.isfile(chrmap_data_path):
        raise ValueError(f"chrmap_data_path is not a file: {chrmap_data_path}")

    # Work on a copy: adding "type" to the caller's set in place would leak
    # the extra requirement into whatever the caller does with it next.
    required_cols = set(required_col_set) | {"type"}

    # read in the chr map
    chrmap_df = pd.read_csv(chrmap_data_path)

    # raise an error if the nrow of the chrmap is 0
    if chrmap_df.shape[0] == 0:
        raise ValueError("The chrmap file is empty")
    # raise an error if the ncol of the chrmap is less than 2
    if chrmap_df.shape[1] < 2:
        # Single message string: separate arguments would render as a tuple.
        raise ValueError(
            "The chrmap file must have at least two columns, "
            "one for the original chromosome name and one for "
            "the new chromosome name"
        )

    # raise an error if any required column (including "type") is not in
    # the chrmap_df columns
    missing_chr_cols = required_cols.difference(chrmap_df.columns)
    if missing_chr_cols:
        raise ValueError(
            f"The following chromosome columns are missing "
            f"from the chrmap file: {missing_chr_cols}"
        )

    # cast all columns to str
    return chrmap_df.astype("str")

handler: python