BSA_run_21

Setup

You are responsible for installing these packages prior to running this notebook

library(BSA)
library(tidyverse)
library(foreach)
library(GenomicRanges)
library(BSgenome.CneoformansVarGrubiiKN99.NCBI.ASM221672v1)

kn99_genome = BSgenome.CneoformansVarGrubiiKN99.NCBI.ASM221672v1

Read in data from VCF and parse into dataframe

# the following file is from daniels original pipeline, which used a 
# hacked together set of aligners ngm + yaha
# system.file("BSA6.IGV.vcf.gz", package="BSA")
# The example uses the bwamem2 output from the current pipeline, and is  
# recommended
input_paths = list(
  vcf = "/mnt/scratch/bsa/run_21_bsa_results/variants/filtered/1_freebayes_filtered.vcf.gz",
  meta = "~/Downloads/Doering lab BSA - 20240216.csv"
)

meta_df = read_csv(input_paths$meta) %>%
  dplyr::rename(sample=tube_number)

tmp_dir = tempdir()
# note that after some investigation, I found that there is no freebayes labelled
# ALTERNATE with ref_percentage greater than .60. Hence, I lowered the ref_freq_thres
# to allow more data to be included in the analysis
raw_samples_df = vcf_to_qtlseqr_table(input_paths$vcf, tmp_dir, 
                                      parent_ref_sample = 'KN99a', 
                                      parent_alt_sample = 'C8',
                                      ref_freq_thres = 0.7,
                                      parent_filter = TRUE) 

samples_df = raw_samples_df %>% 
  filter(!sample %in% c("KN99a", "C8")) %>%
  arrange(sample, CHR, POS) %>% 
  left_join(meta_df) %>%
  # note this!
  mutate(bulk = ifelse(condition == "inoculum", "low", "high"))

summary(
  filter(samples_df, 
         Filt_Genotype == "Alternative", 
         genotype != 1) %>% 
          select(sample, Depth, RealDepth, genotype)
  )

filter(samples_df, Filt_Genotype == "Alternative", genotype != 1) %>% 
  ggplot(aes(sample, RealDepth)) + 
  geom_boxplot() +
  ggtitle("Filt Genotype, VCF genotype mismatch. RealDepth Distributions")

samples_df %>% 
  filter(sample == "SLB0025") %>% 
  pull(RealDepth) %>%
  summary()

samples_df %>%
  filter(!is.na(group)) %>%
ggplot(aes( sample, log2(Depth))) + 
    geom_violin(aes(fill=factor(condition))) + 
    theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1),
          text = element_text(size = 20)) +
  scale_y_continuous(limits = c(8,18), breaks = seq(8, 18, 1)) +
    facet_grid(group~sac_day,scales = "free_x")

Collapse pools

# Sum to 'conditions' by collapsing the pools
summed_conditions = collapse_bsa_data(samples_df, c('CHR','POS','REF_Allele','ALT1',
                                'bulk','condition','sac_day', 'culture_time'),
                              c('condition', 'sac_day', 'culture_time'))

summed_conditions_meta = summed_conditions %>%
  select(sample, condition, sac_day, culture_time, bulk) %>% 
  distinct(sample, .keep_all = TRUE)

`picker2` is one of Daniel’s functions

It basically reformats the data frame and adds some additional metrics which are expected by the qtlSeqR fucntions

# this function will run the `picker2` function on the oneMouse and sepMouse 
# groups
run_picker2 = function(metadata_df,
                       sample_df, 
                       grouping_var,
                       condition_var,
                       base_cond_in_each_group){

  experiment_crosses =
    sample_comparison_frame(metadata_df,
                            grouping_var = grouping_var,
                            condition_var = condition_var,
                            base_cond_in_each_group = base_cond_in_each_group)

  out = foreach(
    row = iterators::iter(experiment_crosses, by='row'),
    .inorder = TRUE
  ) %do% {
    input_df = sample_df %>% 
      filter(sample %in% c(row[['lowBulk']], row[['highBulk']]))
    if (length(unique(input_df$bulk)) < 2){
      futile.logger::flog.info(paste0('Both samples ',
                                      row[['lowBulk']],
                                      ' and ',
                                      row[['highBulk']], 
                                      ' are `low` bulk. ',
                                      'Setting the second sample ',
                                      'to `high` for `picker2`'))
      input_df = input_df %>%
        mutate(bulk = ifelse(sample == row[['highBulk']], 'high', bulk))
    }
    tryCatch(
          picker2(input_df), 
             error = function(e){
               futile.logger::flog.error(e)
             })
  }
  names(out) = experiment_crosses$highBulk
  
  out
}

bsa_data_sets = list(
  no_collapse = run_picker2(meta_df,
                samples_df,
                grouping_var = 'pool',
                condition_var = 'condition',
                base_cond_in_each_group = TRUE),
  
  condition_collapse = run_picker2(
    summed_conditions_meta,
    summed_conditions,
    grouping_var = 'pool',
    condition_var = 'condition',
    base_cond_in_each_group = FALSE
  )
)

Another daniel function, `analyzer`

analyzer is a wrapper of a number of slightly modified qtlSeqR functions. This is what adds the BSA metrics.

run_analyzer_on_bsa_set = function(bsa_set){
  foreach(
        # each df is for a given sample
        set_name = names(bsa_set),
        .inorder = TRUE
      ) %do% { 
        df = bsa_set[[set_name]]
        tryCatch(
          analyzer(df[!is.nan(df$deltaSNP),], 
               minDepthPercentile = 0.1, 
               maxDepthPercentile = 0.995, 
               outlierFilt = "Hampel",
               filter_chr_list = c("CP022335.1", "G418", "NAT")),
                 error = function(e){
                   futile.logger::flog.error(paste0(
                     'Error in: ',
                     set_name, '; ',
                     e
                   ))
                 }, 
                 finally = {})

    }
}

filtered_bsa_data_sets = map(bsa_data_sets, run_analyzer_on_bsa_set)

names(filtered_bsa_data_sets$no_collapse) = 
  names(bsa_data_sets$no_collapse)

names(filtered_bsa_data_sets$condition_collapse) =
  names(bsa_data_sets$condition_collapse)

Visualize

tiled_genome_df = 
  tileGenome(seqlengths(kn99_genome), 
             tilewidth = 1000, 
             cut.last.tile.in.chrom = TRUE) %>% 
  as_tibble() %>% 
  dplyr::rename(CHROM=seqnames)

# a helper function whose purpose is to execute the `bin_variants` function
# on each sample in either oneMouse or pooled_mouse sets
bin_variants_by_group = function(bsa_set){
  out = foreach(
    sample_name = names(bsa_set),
    .inorder = TRUE
  ) %do% {
    df = bsa_set[[sample_name]]
    x = bin_variants(df, 
                 tiled_genome_df,
                 seqlengths(kn99_genome),
                 "CHROM") %>%
      mutate(sample = sample_name)
  }
  do.call('rbind', out)
}

bin_list = list(
  no_collapse = bin_variants_by_group(filtered_bsa_data_sets$no_collapse),
  condition = bin_variants_by_group(filtered_bsa_data_sets$condition_collapse)
) 

graphing_df = bind_rows(bin_list,.id='collapse_level')

pool_min_max_df = graphing_df %>% 
  ungroup() %>%
  filter(collapse_level == "no_collapse") %>%
  left_join(meta_df %>% select(sample, condition, sac_day, culture_time, pool)) %>%
  mutate(condition = paste(condition, sac_day, culture_time, sep = "_")) %>%
  group_by(CHROM, tile, condition) %>%
  summarize(pool_max_smoothed_allele_freq   = max(mean_smoothed_delta_snp, na.rm = TRUE),
            pool_min_smoothed_allele_freq   = min(mean_smoothed_delta_snp, na.rm = TRUE),
            pool_max_unsmoothed_allele_freq = max(mean_deltaSNP, na.rm = TRUE),
            pool_min_unsmoothed_allele_freq = min(mean_deltaSNP, na.rm = TRUE),
            .groups = 'keep') %>%
  separate(tile, c("binFloor","binCeiling"), sep = ",") %>%
  ungroup() %>%
  pivot_longer(starts_with("pool")) %>%
  mutate(min_max = paste0("pool_allele_freq_", str_extract(name, "max|min")),
         smoothed = ifelse(str_extract(name, "smoothed|unsmoothed") == 'smoothed', TRUE, FALSE)) %>%
  select(-name) %>%
  pivot_wider(names_from = min_max, values_from = value) %>%
  arrange(condition, CHROM, binFloor) %>%
  arrange(smoothed) %>%
  select(condition, CHROM,binFloor,binCeiling,smoothed, 
         pool_allele_freq_min, pool_allele_freq_max) %>%
  mutate(binFloor = as.numeric(binFloor), binCeiling = as.numeric(binCeiling))

condition_collapsed_df = graphing_df %>%
  ungroup() %>%
  filter(collapse_level == "condition") %>% 
  mutate(condition = sample) %>%
  select(condition,significance, CHROM, binFloor, binCeiling, 
         binMiddle, mean_deltaSNP, mean_smoothed_delta_snp) %>%
  pivot_longer(c(mean_deltaSNP, mean_smoothed_delta_snp)) %>%
  mutate(smoothed = ifelse(str_detect(name, "smoothed"), TRUE, FALSE)) %>%
  dplyr::rename(condition_allele_freq=value) %>%
  select(-name) %>%
  arrange(condition, CHROM, binFloor) %>%
  arrange(smoothed) %>%
  select(condition,significance, CHROM,binFloor, 
         binCeiling,smoothed, condition_allele_freq)

no_collapse_df = graphing_df %>%
  ungroup() %>%
  filter(collapse_level == "no_collapse") %>% 
  left_join(meta_df %>% select(sample, condition, sac_day, culture_time, pool)) %>%
  mutate(condition = paste(condition, sac_day, culture_time, sep = "_")) %>%
  select(-c(sac_day,culture_time))%>%
  select(condition,pool,significance, CHROM, binFloor, binCeiling, 
         binMiddle, mean_deltaSNP, mean_smoothed_delta_snp) %>%
  pivot_longer(c(mean_deltaSNP, mean_smoothed_delta_snp)) %>%
  mutate(smoothed = ifelse(str_detect(name, "smoothed"), TRUE, FALSE)) %>%
  dplyr::rename(condition_allele_freq=value) %>%
  select(-name) %>%
  arrange(condition, CHROM, binFloor) %>%
  arrange(smoothed) %>%
  select(condition,pool,significance, CHROM,binFloor, 
         binCeiling,smoothed, condition_allele_freq)

x = pool_min_max_df %>%  
  mutate(binFloor = as.numeric(binFloor), binCeiling = as.numeric(binCeiling)) %>%
  left_join(condition_collapsed_df) %>%
  filter(complete.cases(.)) %>%
  mutate(binMiddle = (binFloor+binCeiling )/ 2) %>%
    select(condition,significance,CHROM,binFloor, 
         binCeiling,binMiddle,smoothed,pool_allele_freq_min, 
         pool_allele_freq_max, condition_allele_freq)

separate_pool_plotting_df = pool_min_max_df %>%  
  mutate(binFloor = as.numeric(binFloor),
         binCeiling = as.numeric(binCeiling)) %>%
  left_join(no_collapse_df) %>%
  filter(complete.cases(.)) %>%
  mutate(binMiddle = (binFloor+binCeiling )/ 2) %>%
    select(condition,pool,significance,CHROM,binFloor, 
         binCeiling,binMiddle,smoothed,pool_allele_freq_min, 
         pool_allele_freq_max, condition_allele_freq) %>%
  unite(condition_tmp, c(condition,pool), sep = "_") %>%
  dplyr::rename(condition = condition_tmp)


# TODO this needs to be checked
#The SNPindex.LOW should be the same for YPD and Lungs....
MaxAndMins = filtered_bsa_data_sets$condition_collapse$YPD %>% 
  select(c(CHROM,POS,REF,ALT, SNPindex.LOW)) %>%
  mutate(min_SNPchange = -1*SNPindex.LOW, max_SNPchange = 1-SNPindex.LOW) %>% 
  select(-SNPindex.LOW)

#This mutation was erased in the addition of the drug marker in the C8 parent (It was in the arm).
MaxAndMins=MaxAndMins %>% 
  filter(!(CHROM=="chr2" & 
             (POS==283162 | POS==283652| 
                (POS>466000 & POS<467000))))

plot_by_chrom = function(plotting_frame, chr_name, ypd_condition, exp_condition1, exp_condition2){
  
  color_palette = c('grey', 'red', 'blue')
  names(color_palette) = c(ypd_condition, exp_condition1, exp_condition2)
  
  plotting_frame %>%
    mutate(condition = factor(condition)) %>%
  filter(CHROM==chr_name, condition %in% c(exp_condition1, exp_condition2, "YPD_0_24"), 
         smoothed == TRUE) %>%
  ggplot(aes(binMiddle/1000, condition_allele_freq)) +
    geom_line(data = filter(MaxAndMins, CHROM == chr_name), 
              aes(POS/1000, min_SNPchange)) +
    geom_line(data = filter(MaxAndMins, CHROM == chr_name), 
              aes(POS/1000, max_SNPchange)) +
    geom_pointrange(aes(ymin=pool_allele_freq_min,
                        ymax=pool_allele_freq_max,
                        x=binMiddle/1000,
                        y=condition_allele_freq,
                        colour=condition,
                        alpha=significance,
                        order="combine"), 
                    na.rm = TRUE) +
  scale_color_manual(values = color_palette) +
    labs(title=paste0("Chromosome ", chr_name), 
         x=paste0("Position on chromosome ",
                  chr_name," (kb)"),
         y="Change in allele frequency (BSA)", 
         color="Pool")+
    geom_hline(yintercept = c(0), color="black", size=0.6)+
    scale_x_continuous(breaks = seq(from=0,
                                    to=seqlengths(kn99_genome)[[chr_name]]/1000, 
                                    by=100), 
                       limits=c(0, seqlengths(kn99_genome)[[chr_name]]/1000), 
                       expand = c(0, 0))+
    scale_y_continuous(breaks = c(-1,-0.75,-0.5, -0.25, 0,0.25,0.5,0.75,1), 
                       limits = c(-1.05,1.05), 
                       expand = c(0,0)) +
    theme(#legend.title = element_text(size=18),
      legend.position = "right",
      axis.text.x = element_text(angle = 45, vjust = 0.5, hjust=1),
      plot.title = element_blank(),
      legend.text = element_text(size=16),
      axis.title=element_text(size=20),
      axis.title.y =element_text(margin = margin(r=25, t=0, l=5, b=0)),
      axis.title.x =element_text(margin = margin(r=0, t=25, l=0, b=5)),
      axis.text=element_text(size = 20),
      axis.ticks.length = unit(0.3,"cm"),
      panel.grid.major=element_blank(),
      panel.grid.minor=element_blank(),
      panel.background = element_rect(fill = "white",colour = "black", size=1),
      text=element_text(family="sans"))
}

#plt_list = map(seqlevels(kn99_genome), plot_by_chrom)

plot_by_chrom(x, seqnames(kn99_genome)[[3]], 'YPD_0_24', 'Lung_9_0', 'Lung_9_24')

# map(names(plt_list), ~ggsave(
#   file.path(here('data/BSA3_plots'),paste0(.,'.png')),
#   plt_list[[.]],
#   width=15,
#   height=7))
# 
# ggsave('data/BSA3_plts/chr2.png', plt_list$chr2, width=20, height=9)