read_in_promoter_data

Read in the promoter data. The promoter data should be a tab-separated file with extension `.bed` or `.bed.gz` and should have the following columns: `chr`, `start`, `end`, `name`, `score`, `strand`. The `chr` column is relabeled from the `curr_chr_name_convention` to the `new_chr_name_convention` using the `chrmap_df`.

:param promoter_data_path: Path to the promoter bed file (plus colnames)
:type promoter_data_path: str
:param curr_chr_name_convention: The current chromosome name convention.
:type curr_chr_name_convention: str
:param new_chr_name_convention: The new chromosome name convention.
:type new_chr_name_convention: str
:param chrmap_df: The chrmap dataframe.
:type chrmap_df: pd.DataFrame
:return: The promoter data as a dataframe with the `chr` column refactored to the `new_chr_name_convention`
:rtype: pd.DataFrame

:raises ValueError: If the promoter_data_path does not exist or is not a file; if the column headers exist but do not match expectation or if the datatypes do not match expectation.

:Example:

>>> import pandas as pd
>>> import os
>>> import tempfile
>>> tmp_bed = tempfile.NamedTemporaryFile(suffix='.bed').name
>>> with open(tmp_bed, 'w') as f:
...    _ = f.write('chr\tstart\tend\tname\tscore\tstrand\n')
...    _ = f.write('chr1\t1\t2\ttest\t1\t+\n')
>>> chrmap_df = pd.DataFrame({'curr_chr_name_convention':
...                            ['chr1', 'chr2', 'chr3'],
...                           'new_chr_name_convention':
...                            ['chrI', 'chrII', 'chrIII']})
>>> promoter_df = read_in_promoter_data(
... tmp_bed,
... 'curr_chr_name_convention',
... 'new_chr_name_convention',
... chrmap_df)
>>> list(promoter_df.columns) == ['chr', 'start', 'end', 'name',
...                               'score', 'strand']
True
>>> len(promoter_df) == 1
True

Source code in callingcardstools/PeakCalling/yeast/read_in_data.py

def read_in_promoter_data(
    promoter_data_path: str,
    curr_chr_name_convention: str,
    new_chr_name_convention: str,
    chrmap_df: pd.DataFrame,
) -> pd.DataFrame:
    """
    Read in the promoter data. The promoter data should be a tsv with extension
    `.bed` or `.bed.gz` and should have the following columns:
    `chr` `start`   `end`  `name` `score` `strand`. The `chr` column is
    refactored from the `curr_chr_name_convention` to the
    `new_chr_name_convention` using the `chrmap_df`.

    A header line is optional: the first line is treated as a header only if
    it matches the expected column names exactly; otherwise the file is read
    as headerless and every line is parsed as data. A path ending in `.gz` is
    read as gzip-compressed.

    :param promoter_data_path: Path to the promoter bed file (plus colnames)
    :type promoter_data_path: str
    :param curr_chr_name_convention: The current chromosome name convention.
    :type curr_chr_name_convention: str
    :param new_chr_name_convention: The new chromosome name convention.
    :type new_chr_name_convention: str
    :param chrmap_df: The chrmap dataframe.
    :type chrmap_df: pd.DataFrame
    :return: The promoter data as a dataframe with the `chr` column
        refactored to the `new_chr_name_convention`
    :rtype: pd.DataFrame

    :raises ValueError: If the `promoter_data_path` does not exist or
        is not a file; if the column headers exist but do not match expectation
        or if the datatypes do not match expectation; or if the file contains
        no data rows.

    :Example:

    >>> import pandas as pd
    >>> import os
    >>> import tempfile
    >>> tmp_bed = tempfile.NamedTemporaryFile(suffix='.bed').name
    >>> with open(tmp_bed, 'w') as f:
    ...    _ = f.write('chr\\tstart\\tend\\tname\\tscore\\tstrand\\n')
    ...    _ = f.write('chr1\\t1\\t2\\ttest\\t1\\t+\\n')
    >>> chrmap_df = pd.DataFrame({'curr_chr_name_convention':
    ...                            ['chr1', 'chr2', 'chr3'],
    ...                           'new_chr_name_convention':
    ...                            ['chrI', 'chrII', 'chrIII']})
    >>> promoter_df = read_in_promoter_data(
    ... tmp_bed,
    ... 'curr_chr_name_convention',
    ... 'new_chr_name_convention',
    ... chrmap_df)
    >>> list(promoter_df.columns) == ['chr', 'start', 'end', 'name',
    ...                               'score', 'strand']
    True
    >>> len(promoter_df) == 1
    True
    """
    # check input
    if not os.path.exists(promoter_data_path):
        raise ValueError("promoter_data_path must exist")
    if not os.path.isfile(promoter_data_path):
        raise ValueError("promoter_data_path must be a file")

    # single source of truth for the expected bed layout and its dtypes
    column_dtypes = {
        "chr": str,
        "start": int,
        "end": int,
        "name": str,
        "score": float,
        "strand": str,
    }
    expected_columns = list(column_dtypes)

    # compression is decided from the file extension alone
    compression = "gzip" if str(promoter_data_path).endswith(".gz") else None

    # peek at the first line only (nrows=0) to decide whether it is a header
    first_line = pd.read_csv(
        promoter_data_path, sep="\t", compression=compression, nrows=0
    )
    # header=0 consumes the first line as a header; header=None parses it as
    # data (a mismatching header row is then expected to fail the dtype
    # conversion below)
    header = 0 if first_line.columns.tolist() == expected_columns else None

    # read in data
    try:
        promoter_df = pd.read_csv(
            promoter_data_path,
            sep="\t",
            header=header,
            names=expected_columns,
            dtype=column_dtypes,
            compression=compression,
        )
    except ValueError as e:
        raise ValueError(
            "promoter_data_path must be a bed file "
            "with columns `chr`, `start`, `end`, `name`, "
            "`score`, and `strand`"
        ) from e

    # if the file is empty, raise an error.
    if promoter_df.shape[0] == 0:
        raise ValueError("The promoter file is empty -- no data to process")

    # relabel chr column
    return relabel_chr_column(
        promoter_df, chrmap_df, curr_chr_name_convention, new_chr_name_convention
    )