create_rank_response_table

Create a rank repsonse table from a dictionary which contains the configuration parameters. See docs at https://cmatkhan.github.io/callingCardsTools/file_format_specs/yeast_rank_response/ # noqa for details

Parameters:

Name	Type	Description	Default
`config_dict`	`dict`	A dictionary containing the configuration parameters	required

Returns:

Name	Type	Description
`tuple`	`tuple[DataFrame, DataFrame, DataFrame]`	A tuple containing three DataFrames (see rank_response_summarize): 1. A dataframe summarized where hte responsive_ratio is summarized by rank_bin 2. The merged, labelled and sorted dataframe with both binding and expression data 3. The random expectation dataframe

Raises:

Type	Description
`KeyError`	if the configuration dictionary is missing any of the required keys
`FileExistsError`	if the data files do not exist
`AttributeError`	if there are NA values in the effect or pvalue columns
`ValueError`	if there are incomplete cases in the data

Source code in callingcardstools/Analysis/yeast/rank_response.py

def create_rank_response_table(
    config_dict: dict,
) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Create a rank repsonse table from a dictionary which contains the
    configuration parameters. See docs at
    https://cmatkhan.github.io/callingCardsTools/file_format_specs/yeast_rank_response/ # noqa
    for details

    Args:
        config_dict (dict): A dictionary containing the configuration
            parameters

    Returns:
        tuple: A tuple containing three DataFrames
            (see rank_response_summarize):
               1. A dataframe summarized where hte responsive_ratio is summarized by
                    rank_bin
               2. The merged, labelled and sorted dataframe with both binding and
                    expression data
               3. The random expectation dataframe

    Raises:
        KeyError: if the configuration dictionary is missing any of the
            required keys
        FileExistsError: if the data files do not exist
        AttributeError: if there are NA values in the effect or pvalue columns
        ValueError: if there are incomplete cases in the data
    """
    # validate the configuration key/value pairs
    args = validate_config(config_dict)

    try:
        if len(args["binding_data_path"]) > 1:
            binding_data = combine_data(
                data_paths=args["binding_data_path"],
                identifier_col=args["binding_identifier_col"],
                effect_col=args["binding_effect_col"],
                pval_col=args["binding_pvalue_col"],
                source=args["binding_source"],
                data_type="binding",
            )
        else:
            binding_data = read_in_data(
                data_path=args["binding_data_path"][0],
                identifier_col=args["binding_identifier_col"],
                effect_col=args["binding_effect_col"],
                pval_col=args["binding_pvalue_col"],
                source=args["binding_source"],
                data_type="binding",
            )
    except (KeyError, FileExistsError, AttributeError) as exc:
        logger.error("Error reading in binding data: %s", exc)
        raise

    try:
        if len(args["expression_data_path"]) > 1:
            expression_data = combine_data(
                data_paths=args["expression_data_path"],
                identifier_col=args["expression_identifier_col"],
                effect_col=args["expression_effect_col"],
                pval_col=args["expression_pvalue_col"],
                source=args["expression_source"],
                data_type="expression",
            )
        else:
            expression_data = read_in_data(
                data_path=args["expression_data_path"][0],
                identifier_col=args["expression_identifier_col"],
                effect_col=args["expression_effect_col"],
                pval_col=args["expression_pvalue_col"],
                source=args["expression_source"],
                data_type="expression",
            )
    except (KeyError, FileExistsError, AttributeError) as exc:
        logger.error("Error reading in expression data: %s", exc)
        raise

    labeled_expression_data = label_responsive_genes(
        expression_data,
        args["expression_effect_thres"],
        args["expression_pvalue_thres"],
        args["normalization_cutoff"],
    )

    # Calculate counts for responsive and unresponsive
    responsive_unresponsive_counts = labeled_expression_data[
        "responsive"
    ].value_counts()

    # Create the DataFrame
    random_expectation_df = pd.DataFrame(
        {
            "unresponsive": [responsive_unresponsive_counts.get(False, 0)],
            "responsive": [responsive_unresponsive_counts.get(True, 0)],
        }
    )

    # Calculate the 'random' column
    total_expression_genes = random_expectation_df.sum(axis=1)
    random_expectation_df["random"] = (
        random_expectation_df["responsive"] / total_expression_genes
    )

    df = labeled_expression_data.merge(
        binding_data[["binding_effect", "binding_pvalue", "binding_source", "feature"]],
        how="inner",
        on="feature",
    )
    # test that there no incomplete cases. raise an error if there are
    if df.isnull().values.any():
        raise ValueError("There are incomplete cases in the data")

    logger.debug(
        "There are %s genes in the data after merging "
        "the %s binding data and "
        " %s expression data",
        str(df.shape[0]),
        args["binding_source"],
        args["expression_source"],
    )

    df_expression_labeled_binding_ranked = bin_by_binding_rank(
        df, args["rank_bin_size"], args["rank_by_binding_effect"]
    )

    df_expression_labeled_binding_ranked["random"] = random_expectation_df[
        "random"
    ].iloc[0]

    rank_response_df = compute_rank_response(df_expression_labeled_binding_ranked)

    return rank_response_df, df_expression_labeled_binding_ranked, random_expectation_df