This should come after the previous QC steps (batch effects, sex and
expression). The metadata in the dds
objects should be
updated now with the current state of the database, meaning any
additional suspicious sex samples or WGS identified mislabels, for
instance, have been flagged in the database itself, and that updated
information added back to the dds
object.
library(tidyverse)
library(DESeq2)
library(edgeR)
library(here)
# set environmental variables
## set to false to use gene level quantification, true to use
## tx level quantification
TX_QUANTS = TRUE
## determines whether data is written to the project data directory
WRITE = TRUE
# read in data ----------------------------------------------------------------
dds_list = list(
gene = here("llfs_rnaseq_data/dds_gene_20231023.rds"),
tx = here("llfs_rnaseq_data/dds_tx_20231023.rds")
)
# note, clear out any design -- it isn't necessary for the manipulations below.
# if you want to set the design, then change the formula below. Make sure to
# cast any character fields to factors.
if(TX_QUANTS){
dds_full = readRDS(dds_list$tx)
design(dds_full) = ~1
dds_full$count_headers = paste0('library_',dds_full$library_id)
colnames(dds_full) = dds_full$count_headers
} else {
dds_full = readRDS(dds_list$gene)
design(dds_full) = ~1
dds_full$count_headers = paste0('library_',dds_full$library_id)
colnames(dds_full) = dds_full$count_headers
}
control
and
experiment
sets
# ensure that all samples are ones for which there is legal permission
stopifnot(unique(dds_full$legal) == 1)
# split into control and experimental sets ------------------------------------
dds_control = dds_full[,str_detect(dds_full$purpose,'control')]
dds_experiment = dds_full[,dds_full$purpose == 'experiment']
First, we remove samples with more than 0.08 intergenic reads. We also remove samples which are confidently mislabeled, but do not have an alternate match.
Next, we design the following criteria to choose a single sample,
where there are replicates. ‘Meaningful reads’ is here defined as the
protein_coding_total
minus the
rn7sl_total
.
Our preference is for samples with the least percent intergenic reads. If the library with the least intergenic reads is less than 1e6 smaller than the library with the most meaningful reads, OR the library with the least intergenic reads is at least in the 20th percentile of the overall empirical cumulative distribution of meaningful reads, then choose the library with the least intergenic reads. Else, we choose the library with the most meaningful reads.
We perform two checks to ensure that this filtering is correct. First, we confirm that the number of unique ‘subject/visit’s is the same in the original set and the subsequently filtered set. Second, we confirm that there is no more than 1 sample in each ’subject/visit’ group.
Below, we provide a minimal example
library(tidyverse)
# Create an example dataframe
df <- data.frame(
library_id = seq(1:10),
subject = c('A', 'A', 'A',
'A',
'B', 'B',
'B',
'C', 'C',
'D'),
visit = c(1, 1, 1,
2,
1, 1,
2,
1, 1,
1),
percent_intergenic = c(0.1, 0.15, 0.2,
0.2,
0.3, 0.1,
0.5,
0.4, 0.2,
0.15),
protein_coding_total = c(1e6, 9e6, 7e6,
7e6,
8e6, 8e6,
9e6,
6e6, 6e6,
7e6),
rn7sl_total = c(1e3, 9e4, 7e4,
7e2,
8e1, 8e2,
9e1,
6e2, 6e3,
7e4))
passing_ids = df %>%
# add a column which stores the protein_coding_total quantile of a given
# library, calculated against the empirical cumulative distribution of
# protein_coding_total in the set
mutate(protein_coding_tmp = protein_coding_total-rn7sl_total) %>%
mutate(protein_coding_tmp_quantile =
ecdf(.$protein_coding_tmp)(protein_coding_tmp)) %>%
group_by(subject, visit) %>%
# Sort, within group, by percent_intergenic
arrange(percent_intergenic, .by_group = TRUE) %>%
mutate(protein_coding_tmp_max_diff =
protein_coding_tmp[percent_intergenic==dplyr::first(percent_intergenic)][1] -
protein_coding_tmp[protein_coding_tmp==max(protein_coding_tmp)][1]) %>%
mutate(
select_flag = if_else(
protein_coding_tmp_max_diff > -1e6 |
dplyr::first(protein_coding_tmp_quantile) >= 0.2,
percent_intergenic == min(percent_intergenic),
protein_coding_tmp == max(protein_coding_tmp)
)) %>%
filter(select_flag) %>%
pull(library_id)
# two checks:
# 1. that the number of unique samples is the same before and after
# 2. that no one group of subject/visit has size > 1
stopifnot(length(passing_ids) ==
nrow(distinct(df, subject, visit)))
stopifnot(df %>%
filter(library_id %in% passing_ids) %>%
group_by(subject,visit) %>%
filter(n()>1) %>%
nrow() == 0)
And this is how we filter the experiment data
analysis_library_ids = colData(dds_experiment) %>%
as_tibble() %>%
# in the experiment set, a sample should either be mislabeled and relabeled
# or not mislabeled and not relabeled
filter(!suspicious_sex,relabel==mislabel, percent_intergenic < 0.08) %>%
mutate(protein_coding_minus_rn7sl = protein_coding_total - rn7sl_total) %>%
mutate(protein_coding_minus_rn7sl_quantile =
ecdf(.$protein_coding_minus_rn7sl)(protein_coding_minus_rn7sl)) %>%
group_by(subject, visit) %>%
arrange(percent_intergenic, .by_group = TRUE) %>%
mutate(protein_coding_minus_rn7sl_max_diff =
protein_coding_minus_rn7sl[percent_intergenic==dplyr::first(percent_intergenic)][1] -
protein_coding_minus_rn7sl[protein_coding_minus_rn7sl==max(protein_coding_minus_rn7sl)][1]) %>%
mutate(
select_flag = if_else(
protein_coding_minus_rn7sl_max_diff > -1e6 |
dplyr::first(protein_coding_minus_rn7sl_quantile) >= 0.2,
percent_intergenic == min(percent_intergenic),
protein_coding_minus_rn7sl == max(protein_coding_minus_rn7sl)
)) %>%
filter(select_flag) %>%
pull(library_id)
stopifnot(length(analysis_library_ids) ==
nrow(distinct(as_tibble(colData(dds_experiment)) %>%
filter(!suspicious_sex,
relabel==mislabel,
percent_intergenic < 0.08),
subject, visit)))
stopifnot(colData(dds_experiment) %>%
as_tibble() %>%
filter(library_id %in% analysis_library_ids) %>%
group_by(subject,visit) %>%
filter(n()>1) %>%
nrow() == 0)
## Expression Filter
# at the time of writing, num_samples comes out to 57
num_samples = floor(ncol(dds_experiment[,dds_experiment$library_id %in%
analysis_library_ids])*.015)
# filter out low expression genes
mid_expression_filter <-
rowSums(cpm(counts(
dds_experiment[,dds_experiment$library_id %in%
analysis_library_ids])) > 3) >= num_samples
## Apply Filters ----
# filter the deseq data object on the rows (genes) and samples (column)
dds_experiment_passing =
dds_experiment[mid_expression_filter,
dds_experiment$library_id %in% analysis_library_ids]
# add size factors to the dds object
dds_experiment_passing = estimateSizeFactors(dds_experiment_passing)
# extract data from dds object ------------------------------------------------
output_list = list(
raw_unfltr_counts = as_tibble(counts(dds_full)),
unfltr_feature_id = tibble(gene_id = rownames(dds_full)),
raw_fltr_counts = as_tibble(counts(dds_experiment_passing)),
norm_fltr_counts = as_tibble(counts(dds_experiment_passing, normalized=TRUE)),
fltr_feature_id = tibble(gene_id = rownames(dds_experiment_passing))
)
# write out -------------------------------------------------------------------
if(WRITE){
today = format(Sys.time(), "%Y%m%d")
if(TX_QUANTS){
add_sep = "_tx_"
} else{
add_sep = "_gene_"
}
sample_metadata_path = here("llfs_rnaseq_data/plain_text",
paste0("sample_meta_",
today, ".csv"))
# Do not overwrite -- if this is already been run, ie if you run the gene
# data and then the tx data, there is no reason to re-write the sample
# metadata. Pay attention to this if you do want to re-write on the same
# day
if(!file.exists(sample_metadata_path)){
write_csv(as_tibble(colData(dds_full)), sample_metadata_path)
}
map(names(output_list),
~write_csv(output_list[[.]],
file.path(here("llfs_rnaseq_data/plain_text"),
paste0(., add_sep, today, ".csv"))))
}