Skip to contents
# Give your self a nice description of what your script does -- this will be
# helpful to you-in-the-future
#
# This script serves as an example of how to extract data from a VCF file into
# a more usable format. In this case, it turns a VCF file into a GDS file,
# and then uses SeqArray and SeqVarTools to extract the variant info (chr, pos,
# depth, annotations, ...) and parse that data into a dataframe. The final
# product of this script is a dataframe that can be used to filter variants
# in the DK samples based on how their genotypes compare to the YSP2-8 genotype
# at the same location.

library(tidyverse)
library(SeqArray)
library(SeqVarTools)
library(here)

#' @title Tidy data from gds file
#' @description Tidy data to long format with column sample, variant, and the
#'   metric name. internal use only
#'
#' @param data_mat a data matrix in sample x variant format
#' @param values_cname name of the metric -- eg RealDepth
#' @param rnames vector to name the columns, eg sample.ids
#' @param cnames vector to name the rows, eg variant_ids
#'
#' @return a tidy dataframe in long format with columns sample, variant, metric
#'
#' @importFrom dplyr as_tibble
#' @importFrom tidyr pivot_longer
tidy_metrics = function(data_mat, values_cname, rnames, cnames){

  errors = list(
    dimension_error = paste0("vcf_to_qtlseqr_table error: rownames do not ",
                             "match data matrix. see tidy_metrics() function")
  )

  if(length(rnames) != dim(data_mat)[1] | length(cnames) != dim(data_mat)[2]){
    stop(errors$dimension_error)
  }

  data_mat %>%
    `rownames<-`(rnames) %>%
    `colnames<-`(cnames) %>%
    dplyr::as_tibble(rownames = "sample") %>%
    tidyr::pivot_longer(-sample, names_to = "variant", values_to = values_cname)
}

# this is already done -- just load the allClinical.ann.gds
# tryCatch(
#   SeqArray::seqVCF2GDS(
#     here("allClinical.ann.vcf"),
#     here("inst/allClinical.ann.gds")),
#   error = function(e){
#     message(e)
#   })


# After creating the GDS file, we now 'open' it. Note that this does not
# open the entire file, but rather creates a file handle. The difference is
# that we are not reading the data into RAM. This means that we can handle
# enormous VCF files, which is a common use case with this type of data
all_strains_gds = SeqArray::seqOpen(here("inst/allClinical.ann.gds"))

# set a filter such that we only look at loci with at most 1 alternative allele
# In the all_strains data, there is only a single position with more than 1 alt
message("Setting single allele loci filter...")
single_allele_fltr =
  SeqArray::seqGetData(all_strains_gds,
                       'variant.id')[SeqVarTools::nAlleles(all_strains_gds) == 2]
message(sprintf("...total variants: %s",
                length(SeqVarTools::nAlleles(all_strains_gds))))
message("...FALSE refers to the number of multi allelic loci")
print(table(SeqVarTools::nAlleles(all_strains_gds) == 2))
SeqArray::seqSetFilter(all_strains_gds,
                       variant.id=single_allele_fltr)

# select only KN99a and TDY1993 (C8)
# SeqArray::seqSetFilter(
#   all_strains_gds, 
#   sample.sel = SeqArray::seqGetData(
#     all_strains_gds, 'sample.id') %in% c('KN99a', 'TDY1993'))

# This is the set of 'fields' we're going to extract from the GDS file. Note
# that the return type of each of these 'fields' will not be the same -- eg,
# genotype returns a 3 dimensional matrix, chromosome returns a vector, ...
data_to_extract = c("chromosome", "position", "$ref", "$alt",
                    "annotation/qual", "genotype", "annotation/info/ANN",
                    "annotation/info/DP", "annotation/format/DP",
                    "annotation/format/RO", "annotation/format/AO",
                    "sample.id")

# `seqGetData` is one of the methods we can use to extract data from the
# GDS file. Using `seqGetData` extracts the data from the file and returns it
# as a list in memory. In the example below, we now have the annotations data
# in memory and can start to parse it
all_strains_annotations = SeqArray::seqGetData(
  all_strains_gds, data_to_extract)

# create unique IDs for the variants
variant_ids = paste0("var_",
                     seq(1,dim(all_strains_annotations$`annotation/format/DP`)[2]))


# The annotations are returned in a list of two items, `length` and `data`.
# The `length` list looks something like 3, 1, 6, 2, ... where the number
# represents the number of times the variant at that index is annotated to a
# different spnEFF annotation. These annotations are stored in the list `data`.
# So, variant 1 has 3 different annotations in `data`, variant 2 has 1
# annotation, and so on.
#
# The goal of the code below is to extract a "key" column from the
# annotations to the variants. If we consider the variant index to be the key,
# what we want to know is that the first 3 entries of the `data` map to the
# variant at index 1
variant_index = map2(
  seq(length(all_strains_annotations$`annotation/info/ANN`$length)),
  all_strains_annotations$`annotation/info/ANN`$length,
  ~rep(.x,.y)) %>%
  unlist()

# create a dataframe where the first column is the variant name, the second
# column is the full annotation string, and the `alt`, `effect`, `impact`,
# and `gene` columns are created by parsing out the relevant information from
# the annotation string

variant_annotation_df = tibble(
  variant = paste('var', variant_index, sep="_"),
  annotation = all_strains_annotations$`annotation/info/ANN`$data) %>%
  separate(annotation, into = c('alt', 'effect', 'impact', 'col4', 'gene', 
                                'col6', 'col7', 'protein_coding_effect', 
                                'col9', 'col10', 'aa', 'col12', 'col13', 
                                'col14', 'col15'), 
             sep = "\\|", fill = "right",remove = FALSE) %>%
  select(variant, annotation, alt, effect, impact, gene, 
         protein_coding_effect, aa) %>%
  mutate(protein_coding_effect = ifelse(
    protein_coding_effect == 'protein_coding', TRUE, FALSE))

# this is part of a function from the BSA package which was set up as a
# convenience to parse some of this data. It is reused here -- see
# R/tidy_metrics.R
data_to_transform = list(
  genotype     = all_strains_annotations$genotype[1,,],
  RealDepth    = all_strains_annotations$`annotation/format/DP`,
  Reference    = all_strains_annotations$`annotation/format/RO`,
  Alternative1 = all_strains_annotations$`annotation/format/AO`$data)
# result is a long data frame with columns sample, variant, RealDepth,
# Reference (which is the reference allele depth) and Alternative1, which is
# the Alternative depth. Note that if single_allele_loci_only is not set,
# then this is not actually Alternative1, but just Alternative
depth_metrics_df = suppressMessages(
  purrr::map2(data_to_transform,
              names(data_to_transform),
              tidy_metrics,
              all_strains_annotations$sample.id,
              variant_ids) %>%
    purrr::reduce(dplyr::left_join))

# Now we're going to create our analysis data -- in this case, we're going to
# join the depth df to a temporary dataframe with the chromosome, position, etc
# information. Then we join the annotations dataframe and filter for positions
# which have RealDepth (actual reads aligning over a given position,
# not counting reads which aligned with a gap over that variant position) of
# at least 10
all_strains_annotated_df = depth_metrics_df %>%
  left_join(
  tibble(variant = variant_ids,
         chr = all_strains_annotations$chromosome,
         pos = all_strains_annotations$position,
         ref = all_strains_annotations$`$ref`,
         alt = all_strains_annotations$`$alt`)) %>%
  left_join(variant_annotation_df %>%
              select(-annotation, -alt),
            by = c('variant'),
            relationship = 'many-to-many') %>%
  filter(!is.na(RealDepth) & RealDepth >= 10) %>%
  # exclude any marker sequences
  filter(!chr %in% c("NAT", "G418")) %>%
  # factor the `impact` column -- note that I ran everything up to the
  # `filter` step above, and then looked at the unique values in `impact`
  # to figure out the levels. The order of the labels determines how
  # this column will sort, with "HIGH" being highest impact and "MODIFIER"
  # the lowest
  mutate(impact = factor(impact,levels = c('HIGH', 'MODERATE',
                                           'LOW', 'MODIFIER'))) %>%
  # `HIGH` on top
  arrange(impact)

# Now we're going to extract just the tdy1993 data and reform it into a
# dataframe with columns `variant` and `tdy1993_genotype`
tdy1993_genotype_df = all_strains_annotated_df %>%
  filter(sample == 'TDY1993') %>%
  select(variant, genotype, RealDepth, Reference, Alternative1) %>%
  dplyr::rename(tdy1993_genotype = genotype,
                tdy1993_RealDepth = RealDepth,
                tdy1993_Reference = Reference,
                tdy1993_Alternative1 = Alternative1)

# read in gene info (parsed from the gtf from fungidb)
kn99_genes = read_csv(here("inst/kn99_genes.csv")) %>%
  select(-c(chr, source)) %>%
  dplyr::rename(gene_start = start,
                gene_end = end,
                gene_strand = strand,
                gene_description = description)

# And finally, we remove the tdy1993 data from the all_strains data, and then join
# the ysp2_genotype_df so that we now have an additional column called
# `ysp_2_8_genotype`
#
# We filter for "complete cases" (or, only keep rows with no NAs in any column)
# so that we're only looking at variants for which we have reliable data over
# both a given sample, and tdy1993
analysis_df = all_strains_annotated_df %>%
  filter(sample == 'KN99a') %>%
  left_join(tdy1993_genotype_df,
            by = 'variant',
            relationship = 'many-to-many') %>%
  filter(complete.cases(.)) %>%
  filter(genotype != 0 | tdy1993_genotype != 0) %>%
  dplyr::rename(kn99a_genotype = genotype,
                kn99a_RealDepth = RealDepth,
                kn99a_Reference = Reference,
                kn99a_Alternative1 = Alternative1) %>%
  left_join(kn99_genes, by = c('gene' = 'ID')) %>%
  dplyr::select(variant, chr, pos, kn99a_genotype, 
                tdy1993_genotype,ref,alt,impact,effect,aa,gene, 
                protein_coding_effect, gene_start, gene_end, gene_strand, 
                gene_description, kn99a_RealDepth,kn99a_Reference, 
                kn99a_Alternative1, tdy1993_RealDepth, tdy1993_Reference, 
                tdy1993_Alternative1) %>%
  distinct()

nstrains_per_variant = all_strains_annotated_df %>%
  filter(!sample %in% c('KN99a', 'TDY1993')) %>%
  group_by(variant) %>%
  tally()
# write results
analysis_df %>% 
  group_by(gene, impact) %>% 
  summarize(tally=n(), 
            effect_all=paste(unique(effect),collapse=","), 
            aa_all=paste(unique(aa),collapse = ','),
            gene_description = unique(gene_description),
            gene_start = unique(gene_start),
            gene_end = unique(gene_end),
            gene_strand = unique(gene_strand)) %>%
  select(gene,gene_description,gene_start,gene_end,gene_strand,
         impact,tally,effect_all,aa_all) %>%
  write_csv(here('results/gene_overview_kn99a_vs_c8_20230726.csv'))

write_csv(analysis_df, here('results/kn99a_vs_c8_20230725.csv'))