Read in data from VCF and parse into dataframe
# the following file is from daniels original pipeline, which used a
# hacked together set of aligners ngm + yaha
# system.file("BSA6.IGV.vcf.gz", package="BSA")
# The example uses the bwamem2 output from the current pipeline, and is
# recommended
input_paths = list(
vcf = "/mnt/scratch/bsa/run_21_bsa_results/variants/filtered/1_freebayes_filtered.vcf.gz",
meta = "~/Downloads/Doering lab BSA - 20240216.csv"
)
meta_df = read_csv(input_paths$meta) %>%
dplyr::rename(sample=tube_number)
tmp_dir = tempdir()
# note that after some investigation, I found that there is no freebayes labelled
# ALTERNATE with ref_percentage greater than .60. Hence, I lowered the ref_freq_thres
# to allow more data to be included in the analysis
raw_samples_df = vcf_to_qtlseqr_table(input_paths$vcf, tmp_dir,
parent_ref_sample = 'KN99a',
parent_alt_sample = 'C8',
ref_freq_thres = 0.7,
parent_filter = TRUE)
samples_df = raw_samples_df %>%
filter(!sample %in% c("KN99a", "C8")) %>%
arrange(sample, CHR, POS) %>%
left_join(meta_df) %>%
# note this!
mutate(bulk = ifelse(condition == "inoculum", "low", "high"))
summary(
filter(samples_df,
Filt_Genotype == "Alternative",
genotype != 1) %>%
select(sample, Depth, RealDepth, genotype)
)
filter(samples_df, Filt_Genotype == "Alternative", genotype != 1) %>%
ggplot(aes(sample, RealDepth)) +
geom_boxplot() +
ggtitle("Filt Genotype, VCF genotype mismatch. RealDepth Distributions")
samples_df %>%
filter(!is.na(group)) %>%
ggplot(aes( sample, log2(Depth))) +
geom_violin(aes(fill=factor(condition))) +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1),
text = element_text(size = 20)) +
scale_y_continuous(limits = c(8,18), breaks = seq(8, 18, 1)) +
facet_grid(group~sac_day,scales = "free_x")
Collapse pools
# Sum to 'conditions' by collapsing the pools
summed_conditions = collapse_bsa_data(samples_df, c('CHR','POS','REF_Allele','ALT1',
'bulk','condition','sac_day', 'culture_time'),
c('condition', 'sac_day', 'culture_time'))
summed_conditions_meta = summed_conditions %>%
select(sample, condition, sac_day, culture_time, bulk) %>%
distinct(sample, .keep_all = TRUE)
picker2
is one of Daniel’s functions
It basically reformats the data frame and adds some additional metrics which are expected by the qtlSeqR fucntions
# this function will run the `picker2` function on the oneMouse and sepMouse
# groups
run_picker2 = function(metadata_df,
sample_df,
grouping_var,
condition_var,
base_cond_in_each_group){
experiment_crosses =
sample_comparison_frame(metadata_df,
grouping_var = grouping_var,
condition_var = condition_var,
base_cond_in_each_group = base_cond_in_each_group)
out = foreach(
row = iterators::iter(experiment_crosses, by='row'),
.inorder = TRUE
) %do% {
input_df = sample_df %>%
filter(sample %in% c(row[['lowBulk']], row[['highBulk']]))
if (length(unique(input_df$bulk)) < 2){
futile.logger::flog.info(paste0('Both samples ',
row[['lowBulk']],
' and ',
row[['highBulk']],
' are `low` bulk. ',
'Setting the second sample ',
'to `high` for `picker2`'))
input_df = input_df %>%
mutate(bulk = ifelse(sample == row[['highBulk']], 'high', bulk))
}
tryCatch(
picker2(input_df),
error = function(e){
futile.logger::flog.error(e)
})
}
names(out) = experiment_crosses$highBulk
out
}
bsa_data_sets = list(
no_collapse = run_picker2(meta_df,
samples_df,
grouping_var = 'pool',
condition_var = 'condition',
base_cond_in_each_group = TRUE),
condition_collapse = run_picker2(
summed_conditions_meta,
summed_conditions,
grouping_var = 'pool',
condition_var = 'condition',
base_cond_in_each_group = FALSE
)
)
Another daniel function, analyzer
analyzer
is a wrapper of a number of slightly modified
qtlSeqR
functions. This is what adds the BSA metrics.
run_analyzer_on_bsa_set = function(bsa_set){
foreach(
# each df is for a given sample
set_name = names(bsa_set),
.inorder = TRUE
) %do% {
df = bsa_set[[set_name]]
tryCatch(
analyzer(df[!is.nan(df$deltaSNP),],
minDepthPercentile = 0.1,
maxDepthPercentile = 0.995,
outlierFilt = "Hampel",
filter_chr_list = c("CP022335.1", "G418", "NAT")),
error = function(e){
futile.logger::flog.error(paste0(
'Error in: ',
set_name, '; ',
e
))
},
finally = {})
}
}
filtered_bsa_data_sets = map(bsa_data_sets, run_analyzer_on_bsa_set)
names(filtered_bsa_data_sets$no_collapse) =
names(bsa_data_sets$no_collapse)
names(filtered_bsa_data_sets$condition_collapse) =
names(bsa_data_sets$condition_collapse)
Visualize
tiled_genome_df =
tileGenome(seqlengths(kn99_genome),
tilewidth = 1000,
cut.last.tile.in.chrom = TRUE) %>%
as_tibble() %>%
dplyr::rename(CHROM=seqnames)
# a helper function whose purpose is to execute the `bin_variants` function
# on each sample in either oneMouse or pooled_mouse sets
bin_variants_by_group = function(bsa_set){
out = foreach(
sample_name = names(bsa_set),
.inorder = TRUE
) %do% {
df = bsa_set[[sample_name]]
x = bin_variants(df,
tiled_genome_df,
seqlengths(kn99_genome),
"CHROM") %>%
mutate(sample = sample_name)
}
do.call('rbind', out)
}
bin_list = list(
no_collapse = bin_variants_by_group(filtered_bsa_data_sets$no_collapse),
condition = bin_variants_by_group(filtered_bsa_data_sets$condition_collapse)
)
graphing_df = bind_rows(bin_list,.id='collapse_level')
pool_min_max_df = graphing_df %>%
ungroup() %>%
filter(collapse_level == "no_collapse") %>%
left_join(meta_df %>% select(sample, condition, sac_day, culture_time, pool)) %>%
mutate(condition = paste(condition, sac_day, culture_time, sep = "_")) %>%
group_by(CHROM, tile, condition) %>%
summarize(pool_max_smoothed_allele_freq = max(mean_smoothed_delta_snp, na.rm = TRUE),
pool_min_smoothed_allele_freq = min(mean_smoothed_delta_snp, na.rm = TRUE),
pool_max_unsmoothed_allele_freq = max(mean_deltaSNP, na.rm = TRUE),
pool_min_unsmoothed_allele_freq = min(mean_deltaSNP, na.rm = TRUE),
.groups = 'keep') %>%
separate(tile, c("binFloor","binCeiling"), sep = ",") %>%
ungroup() %>%
pivot_longer(starts_with("pool")) %>%
mutate(min_max = paste0("pool_allele_freq_", str_extract(name, "max|min")),
smoothed = ifelse(str_extract(name, "smoothed|unsmoothed") == 'smoothed', TRUE, FALSE)) %>%
select(-name) %>%
pivot_wider(names_from = min_max, values_from = value) %>%
arrange(condition, CHROM, binFloor) %>%
arrange(smoothed) %>%
select(condition, CHROM,binFloor,binCeiling,smoothed,
pool_allele_freq_min, pool_allele_freq_max) %>%
mutate(binFloor = as.numeric(binFloor), binCeiling = as.numeric(binCeiling))
condition_collapsed_df = graphing_df %>%
ungroup() %>%
filter(collapse_level == "condition") %>%
mutate(condition = sample) %>%
select(condition,significance, CHROM, binFloor, binCeiling,
binMiddle, mean_deltaSNP, mean_smoothed_delta_snp) %>%
pivot_longer(c(mean_deltaSNP, mean_smoothed_delta_snp)) %>%
mutate(smoothed = ifelse(str_detect(name, "smoothed"), TRUE, FALSE)) %>%
dplyr::rename(condition_allele_freq=value) %>%
select(-name) %>%
arrange(condition, CHROM, binFloor) %>%
arrange(smoothed) %>%
select(condition,significance, CHROM,binFloor,
binCeiling,smoothed, condition_allele_freq)
no_collapse_df = graphing_df %>%
ungroup() %>%
filter(collapse_level == "no_collapse") %>%
left_join(meta_df %>% select(sample, condition, sac_day, culture_time, pool)) %>%
mutate(condition = paste(condition, sac_day, culture_time, sep = "_")) %>%
select(-c(sac_day,culture_time))%>%
select(condition,pool,significance, CHROM, binFloor, binCeiling,
binMiddle, mean_deltaSNP, mean_smoothed_delta_snp) %>%
pivot_longer(c(mean_deltaSNP, mean_smoothed_delta_snp)) %>%
mutate(smoothed = ifelse(str_detect(name, "smoothed"), TRUE, FALSE)) %>%
dplyr::rename(condition_allele_freq=value) %>%
select(-name) %>%
arrange(condition, CHROM, binFloor) %>%
arrange(smoothed) %>%
select(condition,pool,significance, CHROM,binFloor,
binCeiling,smoothed, condition_allele_freq)
x = pool_min_max_df %>%
mutate(binFloor = as.numeric(binFloor), binCeiling = as.numeric(binCeiling)) %>%
left_join(condition_collapsed_df) %>%
filter(complete.cases(.)) %>%
mutate(binMiddle = (binFloor+binCeiling )/ 2) %>%
select(condition,significance,CHROM,binFloor,
binCeiling,binMiddle,smoothed,pool_allele_freq_min,
pool_allele_freq_max, condition_allele_freq)
separate_pool_plotting_df = pool_min_max_df %>%
mutate(binFloor = as.numeric(binFloor),
binCeiling = as.numeric(binCeiling)) %>%
left_join(no_collapse_df) %>%
filter(complete.cases(.)) %>%
mutate(binMiddle = (binFloor+binCeiling )/ 2) %>%
select(condition,pool,significance,CHROM,binFloor,
binCeiling,binMiddle,smoothed,pool_allele_freq_min,
pool_allele_freq_max, condition_allele_freq) %>%
unite(condition_tmp, c(condition,pool), sep = "_") %>%
dplyr::rename(condition = condition_tmp)
# TODO this needs to be checked
#The SNPindex.LOW should be the same for YPD and Lungs....
MaxAndMins = filtered_bsa_data_sets$condition_collapse$YPD %>%
select(c(CHROM,POS,REF,ALT, SNPindex.LOW)) %>%
mutate(min_SNPchange = -1*SNPindex.LOW, max_SNPchange = 1-SNPindex.LOW) %>%
select(-SNPindex.LOW)
#This mutation was erased in the addition of the drug marker in the C8 parent (It was in the arm).
MaxAndMins=MaxAndMins %>%
filter(!(CHROM=="chr2" &
(POS==283162 | POS==283652|
(POS>466000 & POS<467000))))
plot_by_chrom = function(plotting_frame, chr_name, ypd_condition, exp_condition1, exp_condition2){
color_palette = c('grey', 'red', 'blue')
names(color_palette) = c(ypd_condition, exp_condition1, exp_condition2)
plotting_frame %>%
mutate(condition = factor(condition)) %>%
filter(CHROM==chr_name, condition %in% c(exp_condition1, exp_condition2, "YPD_0_24"),
smoothed == TRUE) %>%
ggplot(aes(binMiddle/1000, condition_allele_freq)) +
geom_line(data = filter(MaxAndMins, CHROM == chr_name),
aes(POS/1000, min_SNPchange)) +
geom_line(data = filter(MaxAndMins, CHROM == chr_name),
aes(POS/1000, max_SNPchange)) +
geom_pointrange(aes(ymin=pool_allele_freq_min,
ymax=pool_allele_freq_max,
x=binMiddle/1000,
y=condition_allele_freq,
colour=condition,
alpha=significance,
order="combine"),
na.rm = TRUE) +
scale_color_manual(values = color_palette) +
labs(title=paste0("Chromosome ", chr_name),
x=paste0("Position on chromosome ",
chr_name," (kb)"),
y="Change in allele frequency (BSA)",
color="Pool")+
geom_hline(yintercept = c(0), color="black", size=0.6)+
scale_x_continuous(breaks = seq(from=0,
to=seqlengths(kn99_genome)[[chr_name]]/1000,
by=100),
limits=c(0, seqlengths(kn99_genome)[[chr_name]]/1000),
expand = c(0, 0))+
scale_y_continuous(breaks = c(-1,-0.75,-0.5, -0.25, 0,0.25,0.5,0.75,1),
limits = c(-1.05,1.05),
expand = c(0,0)) +
theme(#legend.title = element_text(size=18),
legend.position = "right",
axis.text.x = element_text(angle = 45, vjust = 0.5, hjust=1),
plot.title = element_blank(),
legend.text = element_text(size=16),
axis.title=element_text(size=20),
axis.title.y =element_text(margin = margin(r=25, t=0, l=5, b=0)),
axis.title.x =element_text(margin = margin(r=0, t=25, l=0, b=5)),
axis.text=element_text(size = 20),
axis.ticks.length = unit(0.3,"cm"),
panel.grid.major=element_blank(),
panel.grid.minor=element_blank(),
panel.background = element_rect(fill = "white",colour = "black", size=1),
text=element_text(family="sans"))
}
#plt_list = map(seqlevels(kn99_genome), plot_by_chrom)
plot_by_chrom(x, seqnames(kn99_genome)[[3]], 'YPD_0_24', 'Lung_9_0', 'Lung_9_24')
# map(names(plt_list), ~ggsave(
# file.path(here('data/BSA3_plots'),paste0(.,'.png')),
# plt_list[[.]],
# width=15,
# height=7))
#
# ggsave('data/BSA3_plts/chr2.png', plt_list$chr2, width=20, height=9)