BSA3

library(BSA)
library(tidyverse)
library(foreach)
library(GenomicRanges)
library(here)
library(BSgenome.CneoformansVarGrubiiKN99.NCBI.ASM221672v1)

kn99_genome = BSgenome.CneoformansVarGrubiiKN99.NCBI.ASM221672v1

Introduction

This vignette will not run from your computer if you simply copy/paste the code – it is intended to show how to modify the commands and code to run a BSA experiment of slightly different structure from BSA6.

However, the BSA6 vignette (also provided – see articles) will run for you if you simply copy/paste it from the screen to your computer after installing the BSA package. All the data to run that is provided in the package when you install it.

Read in data from VCF and parse into dataframe

Update the paths appropriately for your system and data.

chr_map = tibble(seqnames = seqnames(kn99_genome), 
                 CHR=c(seq(1:14), "M"))

input_paths = list(
  bsa3_vcf = "/mnt/scratch/bsa/bsa3_results/variant_annote/bwamem2/snpeff/group_1_merged_dusted.recode_sorted_sorted_snpEff.ann.vcf",
  bsa3_meta = "/mnt/scratch/bsa/bsa3_samplesheet_single.csv"
)

meta_df = read_csv(input_paths$bsa3_meta)

tmp_dir = tempdir()
raw_samples_df = vcf_to_qtlseqr_table(input_paths$bsa3_vcf, here("data"), 
                                      parent_ref_sample = 'KN99a', 
                                      parent_alt_sample = 'TDY1993',
                                      parent_filter = TRUE) 

samples_df = raw_samples_df %>%
  filter(!sample %in% c("KN99a", "TDY1993")) %>%
  arrange(sample, CHR, POS) %>% 
  left_join(meta_df) %>%
  # note this!
  mutate(bulk = ifelse(cond == "inoculum", "low", "high"))

meta_df = meta_df %>%
  filter(!sample %in% c("KN99a", "TDY1993"))

Take a looksee

samples_df %>%
  filter(!is.na(group)) %>%
ggplot(aes( sample, log2(Depth))) + 
    geom_violin(aes(fill=factor(cond))) + 
    theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1),
          text = element_text(size = 20)) +
  scale_y_continuous(limits = c(8,18), breaks = seq(8, 18, 1)) +
    facet_grid(group~day,scales = "free_x")

Collapse over replicates and conditions

# Sum over replicates
# For each given condition, the result is that there will be only one pool. So 
# if we had 2 replicates per pool, there will now be only one pool of YPD at 15 
# days. If there are 5 pools, each done in a different mouse, then there will be 
# 5 independent records (pools) of that YPD at 15 days
summed_replicates = collapse_bsa_data(samples_df, c('CHR','POS','REF_Allele','ALT1',
                                'bulk','cond','pool','day'),
                              c('pool', 'cond'))

summed_replicates_meta = summed_replicates %>%
  select(sample, pool, cond, bulk) %>% 
  distinct(sample, .keep_all = TRUE)

# Sum over both replicates AND pools
# This creates a single record for a given condition, eg YPD at 15 days
summed_conditions = collapse_bsa_data(samples_df,
                                  c('CHR','POS','REF_Allele','ALT1',
                                    'bulk', 'cond','day'),
                                  'cond')

summed_conditions_meta = summed_conditions %>%
  select(sample, cond, bulk) %>% 
  distinct(sample, .keep_all = TRUE)

`picker2` is one of Daniel’s functions

It basically reformats the data frame and adds some additional metrics which are expected by the qtlSeqR fucntions

# this function will run the `picker2` function on the oneMouse and sepMouse 
# groups
run_picker2 = function(metadata_df,
                       sample_df, 
                       grouping_var,
                       base_cond_in_each_group){

  experiment_crosses =
    sample_comparison_frame(metadata_df,
                            grouping_var = grouping_var,
                            base_cond_in_each_group = base_cond_in_each_group)

  out = foreach(
    row = iterators::iter(experiment_crosses, by='row'),
    .inorder = TRUE
  ) %do% {
    input_df = sample_df %>% 
      filter(sample %in% c(row[['lowBulk']], row[['highBulk']]))
    if (length(unique(input_df$bulk)) < 2){
      futile.logger::flog.info(paste0('Both samples ',
                                      row[['lowBulk']],
                                      ' and ',
                                      row[['highBulk']], 
                                      ' are `low` bulk. ',
                                      'Setting the second sample ',
                                      'to `high` for `picker2`'))
      input_df = input_df %>%
        mutate(bulk = ifelse(sample == row[['highBulk']], 'high', bulk))
    }
    tryCatch(
          picker2(input_df), 
             error = function(e){
               futile.logger::flog.error(e)
             })
  }
  names(out) = experiment_crosses$highBulk
  
  out
}

bsa_data_sets = list(
  no_collapse = run_picker2(meta_df,
                samples_df,
                grouping_var = 'pool',
                base_cond_in_each_group = TRUE),
  
  replicate_collapse = run_picker2(
    summed_replicates_meta,
    summed_replicates,
    grouping_var = 'pool',
    base_cond_in_each_group = TRUE
  ),
  condition_collapse = run_picker2(
    summed_conditions_meta,
    summed_conditions,
    'cond',
    base_cond_in_each_group = FALSE
  )
)

Another daniel function, `analyzer`

analyzer is a wrapper of a number of slightly modified qtlSeqR functions. This is what adds the BSA metrics.

run_analyzer_on_bsa_set = function(bsa_set){
  foreach(
        # each df is for a given sample
        set_name = names(bsa_set),
        .inorder = TRUE
      ) %do% { 
        df = bsa_set[[set_name]]
        tryCatch(
          analyzer(df[!is.nan(df$deltaSNP),], 
               minDepthPercentile = 0.1, 
               maxDepthPercentile = 0.995, 
               outlierFilt = "Hampel",
               filter_chr_list = c("CP022335.1", "G418", "NAT")),
                 error = function(e){
                   futile.logger::flog.error(paste0(
                     'Error in: ',
                     set_name, '; ',
                     e
                   ))
                 }, 
                 finally = {})

    }
}

filtered_bsa_data_sets = map(bsa_data_sets, run_analyzer_on_bsa_set)

names(filtered_bsa_data_sets$no_collapse) = 
  names(bsa_data_sets$no_collapse)

names(filtered_bsa_data_sets$replicate_collapse) = 
  names(bsa_data_sets$replicate_collapse)

names(filtered_bsa_data_sets$condition_collapse) =
  names(bsa_data_sets$condition_collapse)

Visualize

tiled_genome_df = 
  tileGenome(seqlengths(kn99_genome), 
             tilewidth = 1000, 
             cut.last.tile.in.chrom = TRUE) %>% 
  as_tibble() %>% 
  left_join(chr_map) %>%
  dplyr::rename(CHROM=seqnames)

bin_variants_by_group = function(bsa_set, collapse_level){
  foreach(
    sample_name = names(bsa_set),
    .inorder = TRUE
  ) %do% {
    df = bsa_set[[sample_name]]
    futile.logger::flog.debug(sample_name)
    x = bin_variants(df, 
                 tiled_genome_df,
                 seqlengths(kn99_genome),
                 "CHROM") %>%
      mutate(sample = sample_name,
             collapse_level = collapse_level)
  }
}

bin_list = list(
  
  binned_filtered_summedReplicates = 
    bin_variants_by_group(filtered_bsa_data_sets$replicate_collapse,
                          'replicate') %>%
  do.call('rbind',.),
  
  binned_filtered_summedConditions = 
    bin_variants_by_group(filtered_bsa_data_sets$condition_collapse,
                          'condition') %>%
    do.call('rbind', .)
) 

graphing_df = bin_list %>%
  do.call('rbind', .)

pool_min_max_df = graphing_df %>% 
  ungroup() %>%
  filter(collapse_level == "replicate") %>%
  separate(sample, c('pool', 'cond'), sep = "_", remove = FALSE) %>%
  select(-pool) %>%
  group_by(CHROM, tile, cond) %>% 
  summarize(pool_max_smoothed_allele_freq   = max(mean_smoothed_delta_snp, na.rm = TRUE),
            pool_min_smoothed_allele_freq   = min(mean_smoothed_delta_snp, na.rm = TRUE),
            pool_max_unsmoothed_allele_freq = max(mean_deltaSNP, na.rm = TRUE),
            pool_min_unsmoothed_allele_freq = min(mean_deltaSNP, na.rm = TRUE),
            .groups = 'keep') %>%
  separate(tile, c("binFloor","binCeiling"), sep = ",") %>%
  ungroup() %>%
  pivot_longer(starts_with("pool")) %>%
  mutate(min_max = paste0("pool_allele_freq_", str_extract(name, "max|min")),
         smoothed = ifelse(str_extract(name, "smoothed|unsmoothed") == 'smoothed', TRUE, FALSE)) %>%
  select(-name) %>%
  pivot_wider(names_from = min_max, values_from = value) %>%
  arrange(cond, CHROM, binFloor) %>%
  arrange(smoothed) %>%
  select(cond, CHROM,binFloor,binCeiling,smoothed, 
         pool_allele_freq_min, pool_allele_freq_max) %>%
  mutate(binFloor = as.numeric(binFloor), binCeiling = as.numeric(binCeiling))

condition_collapsed_df = graphing_df %>%
  ungroup() %>%
  filter(collapse_level == "condition") %>% 
  mutate(cond = sample) %>%
  select(cond,significance, CHROM, binFloor, binCeiling, 
         binMiddle, mean_deltaSNP, mean_smoothed_delta_snp) %>%
  pivot_longer(c(mean_deltaSNP, mean_smoothed_delta_snp)) %>%
  mutate(smoothed = ifelse(str_detect(name, "smoothed"), TRUE, FALSE)) %>%
  dplyr::rename(condition_allele_freq=value) %>%
  select(-name) %>%
  arrange(cond, CHROM, binFloor) %>%
  arrange(smoothed) %>%
  select(cond,significance, CHROM,binFloor, 
         binCeiling,smoothed, condition_allele_freq)

x = pool_min_max_df %>%  
  mutate(binFloor = as.numeric(binFloor), binCeiling = as.numeric(binCeiling)) %>%
  left_join(condition_collapsed_df) %>%
  filter(complete.cases(.)) %>%
  mutate(binMiddle = (binFloor+binCeiling )/ 2) %>%
    select(cond,significance,CHROM,binFloor, 
         binCeiling,binMiddle,smoothed,pool_allele_freq_min, 
         pool_allele_freq_max, condition_allele_freq)


# TODO this needs to be checked
#The SNPindex.LOW should be the same for YPD and Lungs....
MaxAndMins = filtered_bsa_data_sets$condition_collapse$YPD %>% 
  select(c(CHROM,POS,REF,ALT, SNPindex.LOW)) %>%
  mutate(min_SNPchange = -1*SNPindex.LOW, max_SNPchange = 1-SNPindex.LOW) %>% 
  select(-SNPindex.LOW)

#This mutation was erased in the addition of the drug marker in the C8 parent (It was in the arm).
MaxAndMins=MaxAndMins %>% 
  filter(!(CHROM=="chr2" & 
             (POS==283162 | POS==283652| 
                (POS>466000 & POS<467000))))

plot_by_chrom = function(chr_name){
  
  color_palette = c('YPD' = 'grey', 'Lung' = 'red')
  
  x %>%
    mutate(cond = factor(cond)) %>%
  filter(CHROM==chr_name, cond %in% c("Lung", "YPD"), 
         smoothed == TRUE) %>%
  ggplot(aes(binMiddle/1000, condition_allele_freq)) +
    geom_line(data = filter(MaxAndMins, CHROM == chr_name), 
              aes(POS/1000, min_SNPchange)) +
    geom_line(data = filter(MaxAndMins, CHROM == chr_name), 
              aes(POS/1000, max_SNPchange)) +
    geom_pointrange(aes(ymin=pool_allele_freq_min,
                        ymax=pool_allele_freq_max,
                        x=binMiddle/1000,
                        y=condition_allele_freq,
                        colour=cond,
                        alpha=significance,
                        order="combine"), 
                    na.rm = TRUE) +
  scale_color_manual(values = color_palette) +
    labs(title=paste0("Chromosome ", chr_name), 
         x=paste0("Position on chromosome ",
                  chr_name," (kb)"),
         y="Change in allele frequency (BSA)", 
         color="Pool")+
    geom_hline(yintercept = c(0), color="black", size=0.6)+
    scale_x_continuous(breaks = seq(from=0,
                                    to=seqlengths(kn99_genome)[[chr_name]]/1000, 
                                    by=100), 
                       limits=c(0, seqlengths(kn99_genome)[[chr_name]]/1000), 
                       expand = c(0, 0))+
    scale_y_continuous(breaks = c(-1,-0.75,-0.5, -0.25, 0,0.25,0.5,0.75,1), 
                       limits = c(-1.05,1.05), 
                       expand = c(0,0)) +
    theme(#legend.title = element_text(size=18),
      legend.position = "right",
      axis.text.x = element_text(angle = 45, vjust = 0.5, hjust=1),
      plot.title = element_blank(),
      legend.text = element_text(size=16),
      axis.title=element_text(size=20),
      axis.title.y =element_text(margin = margin(r=25, t=0, l=5, b=0)),
      axis.title.x =element_text(margin = margin(r=0, t=25, l=0, b=5)),
      axis.text=element_text(size = 20),
      axis.ticks.length = unit(0.3,"cm"),
      panel.grid.major=element_blank(),
      panel.grid.minor=element_blank(),
      panel.background = element_rect(fill = "white",colour = "black", size=1),
      text=element_text(family="sans"))
}

plt_list = map(seqlevels(kn99_genome), plot_by_chrom)

names(plt_list) = paste0("chr",chr_map$CHR)

map(names(plt_list), ~ggsave(
  file.path(here('data/BSA3_plots'),paste0(.,'.png')),
  plt_list[[.]],
  width=15,
  height=7))

ggsave('data/BSA3_plts/chr2.png', plt_list$chr2, width=20, height=9)