
create_rank_response_table

Create a rank response table from a dictionary containing the configuration parameters. See the documentation at https://cmatkhan.github.io/callingCardsTools/file_format_specs/yeast_rank_response/ for details.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `config_dict` | `dict` | A dictionary containing the configuration parameters | *required* |

Returns:

| Name | Type | Description |
| --- | --- | --- |
| `tuple` | `tuple[DataFrame, DataFrame, DataFrame]` | A tuple containing three DataFrames (see rank_response_summarize): 1. a dataframe in which the responsive_ratio is summarized by rank_bin; 2. the merged, labelled, and sorted dataframe with both binding and expression data; 3. the random expectation dataframe |

Raises:

| Type | Description |
| --- | --- |
| `KeyError` | if the configuration dictionary is missing any of the required keys |
| `FileExistsError` | if the data files do not exist |
| `AttributeError` | if there are NA values in the effect or pvalue columns |
| `ValueError` | if there are incomplete cases in the data |
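
For orientation, here is a minimal sketch of a call. The key names below are those consumed in the function body; the exact required set is enforced by `validate_config` and documented in the yeast_rank_response format spec linked above. All values (paths, column names, thresholds) are placeholders, not recommended settings.

```python
from callingcardstools.Analysis.yeast.rank_response import create_rank_response_table

# All values are placeholders -- the required key set is enforced by
# validate_config() and described in the yeast_rank_response format spec.
config_dict = {
    "binding_data_path": ["binding_results.csv"],        # one or more files
    "binding_identifier_col": "feature",
    "binding_effect_col": "effect",
    "binding_pvalue_col": "pvalue",
    "binding_source": "callingcards",
    "expression_data_path": ["expression_results.csv"],  # one or more files
    "expression_identifier_col": "feature",
    "expression_effect_col": "log2FoldChange",
    "expression_pvalue_col": "padj",
    "expression_source": "rnaseq",
    "expression_effect_thres": 0.0,
    "expression_pvalue_thres": 0.05,
    "normalization_cutoff": -1,
    "rank_bin_size": 5,
    "rank_by_binding_effect": False,
}

# Raises KeyError / FileExistsError / AttributeError / ValueError as
# described above if the config, files, or data are malformed.
summary_df, ranked_df, random_expectation_df = create_rank_response_table(config_dict)
```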

Source code in callingcardstools/Analysis/yeast/rank_response.py
def create_rank_response_table(
    config_dict: dict,
) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Create a rank response table from a dictionary which contains the
    configuration parameters. See docs at
    https://cmatkhan.github.io/callingCardsTools/file_format_specs/yeast_rank_response/ # noqa
    for details

    Args:
        config_dict (dict): A dictionary containing the configuration
            parameters

    Returns:
        tuple: A tuple containing three DataFrames
            (see rank_response_summarize):
               1. A dataframe in which the responsive_ratio is summarized by
                    rank_bin
               2. The merged, labelled and sorted dataframe with both binding and
                    expression data
               3. The random expectation dataframe

    Raises:
        KeyError: if the configuration dictionary is missing any of the
            required keys
        FileExistsError: if the data files do not exist
        AttributeError: if there are NA values in the effect or pvalue columns
        ValueError: if there are incomplete cases in the data
    """
    # validate the configuration key/value pairs
    args = validate_config(config_dict)

    try:
        if len(args["binding_data_path"]) > 1:
            binding_data = combine_data(
                data_paths=args["binding_data_path"],
                identifier_col=args["binding_identifier_col"],
                effect_col=args["binding_effect_col"],
                pval_col=args["binding_pvalue_col"],
                source=args["binding_source"],
                data_type="binding",
            )
        else:
            binding_data = read_in_data(
                data_path=args["binding_data_path"][0],
                identifier_col=args["binding_identifier_col"],
                effect_col=args["binding_effect_col"],
                pval_col=args["binding_pvalue_col"],
                source=args["binding_source"],
                data_type="binding",
            )
    except (KeyError, FileExistsError, AttributeError) as exc:
        logger.error("Error reading in binding data: %s", exc)
        raise

    try:
        if len(args["expression_data_path"]) > 1:
            expression_data = combine_data(
                data_paths=args["expression_data_path"],
                identifier_col=args["expression_identifier_col"],
                effect_col=args["expression_effect_col"],
                pval_col=args["expression_pvalue_col"],
                source=args["expression_source"],
                data_type="expression",
            )
        else:
            expression_data = read_in_data(
                data_path=args["expression_data_path"][0],
                identifier_col=args["expression_identifier_col"],
                effect_col=args["expression_effect_col"],
                pval_col=args["expression_pvalue_col"],
                source=args["expression_source"],
                data_type="expression",
            )
    except (KeyError, FileExistsError, AttributeError) as exc:
        logger.error("Error reading in expression data: %s", exc)
        raise

    labeled_expression_data = label_responsive_genes(
        expression_data,
        args["expression_effect_thres"],
        args["expression_pvalue_thres"],
        args["normalization_cutoff"],
    )

    # Calculate counts for responsive and unresponsive
    responsive_unresponsive_counts = labeled_expression_data[
        "responsive"
    ].value_counts()

    # Create the random expectation DataFrame from the counts
    random_expectation_df = pd.DataFrame(
        {
            "unresponsive": [responsive_unresponsive_counts.get(False, 0)],
            "responsive": [responsive_unresponsive_counts.get(True, 0)],
        }
    )

    # Calculate the 'random' column: the genome-wide fraction of responsive
    # genes, which serves as the random expectation baseline
    total_expression_genes = random_expectation_df.sum(axis=1)
    random_expectation_df["random"] = (
        random_expectation_df["responsive"] / total_expression_genes
    )

    df = labeled_expression_data.merge(
        binding_data[["binding_effect", "binding_pvalue", "binding_source", "feature"]],
        how="inner",
        on="feature",
    )
    # test that there are no incomplete cases; raise an error if there are
    if df.isnull().values.any():
        raise ValueError("There are incomplete cases in the data")

    logger.debug(
        "There are %s genes in the data after merging "
        "the %s binding data and "
        "%s expression data",
        str(df.shape[0]),
        args["binding_source"],
        args["expression_source"],
    )

    df_expression_labeled_binding_ranked = bin_by_binding_rank(
        df, args["rank_bin_size"], args["rank_by_binding_effect"]
    )

    df_expression_labeled_binding_ranked["random"] = random_expectation_df[
        "random"
    ].iloc[0]

    rank_response_df = compute_rank_response(df_expression_labeled_binding_ranked)

    return rank_response_df, df_expression_labeled_binding_ranked, random_expectation_df
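
Continuing the sketch above, the three return values are typically combined into a rank response plot: the per-bin responsive fraction compared against the genome-wide random expectation. The summary-dataframe column names used here (`rank_bin`, `responsive_ratio`) are assumptions inferred from the docstring; the actual output schema of rank_response_summarize is not shown on this page.

```python
import matplotlib.pyplot as plt

# Assumed column names ("rank_bin", "responsive_ratio") are inferred
# from the docstring, not verified against rank_response_summarize.
fig, ax = plt.subplots()
ax.plot(summary_df["rank_bin"], summary_df["responsive_ratio"],
        label="rank response")
ax.axhline(random_expectation_df["random"].iloc[0],
           linestyle="--", label="random expectation")
ax.set_xlabel("rank_bin (top N genes ranked by binding signal)")
ax.set_ylabel("fraction responsive")
ax.legend()
plt.show()
```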