# Give your self a nice description of what your script does -- this will be
# helpful to you-in-the-future
#
# This script serves as an example of how to extract data from a VCF file into
# a more usable format. In this case, it turns a VCF file into a GDS file,
# and then uses SeqArray and SeqVarTools to extract the variant info (chr, pos,
# depth, annotations, ...) and parse that data into a dataframe. The final
# product of this script is a dataframe that can be used to filter variants
# in the DK samples based on how their genotypes compare to the YSP2-8 genotype
# at the same location.
library(tidyverse)
library(SeqArray)
library(SeqVarTools)
library(here)
#' @title Tidy data from gds file
#' @description Tidy data to long format with column sample, variant, and the
#' metric name. internal use only
#'
#' @param data_mat a data matrix in sample x variant format
#' @param values_cname name of the metric -- eg RealDepth
#' @param rnames vector to name the columns, eg sample.ids
#' @param cnames vector to name the rows, eg variant_ids
#'
#' @return a tidy dataframe in long format with columns sample, variant, metric
#'
#' @importFrom dplyr as_tibble
#' @importFrom tidyr pivot_longer
tidy_metrics = function(data_mat, values_cname, rnames, cnames){
errors = list(
dimension_error = paste0("vcf_to_qtlseqr_table error: rownames do not ",
"match data matrix. see tidy_metrics() function")
)
if(length(rnames) != dim(data_mat)[1] | length(cnames) != dim(data_mat)[2]){
stop(errors$dimension_error)
}
data_mat %>%
`rownames<-`(rnames) %>%
`colnames<-`(cnames) %>%
dplyr::as_tibble(rownames = "sample") %>%
tidyr::pivot_longer(-sample, names_to = "variant", values_to = values_cname)
}
# this is already done -- just load the allClinical.ann.gds
# tryCatch(
# SeqArray::seqVCF2GDS(
# here("allClinical.ann.vcf"),
# here("inst/allClinical.ann.gds")),
# error = function(e){
# message(e)
# })
# After creating the GDS file, we now 'open' it. Note that this does not
# open the entire file, but rather creates a file handle. The difference is
# that we are not reading the data into RAM. This means that we can handle
# enormous VCF files, which is a common use case with this type of data
all_strains_gds = SeqArray::seqOpen(here("inst/allClinical.ann.gds"))
# set a filter such that we only look at loci with at most 1 alternative allele
# In the all_strains data, there is only a single position with more than 1 alt
message("Setting single allele loci filter...")
single_allele_fltr =
SeqArray::seqGetData(all_strains_gds,
'variant.id')[SeqVarTools::nAlleles(all_strains_gds) == 2]
message(sprintf("...total variants: %s",
length(SeqVarTools::nAlleles(all_strains_gds))))
message("...FALSE refers to the number of multi allelic loci")
print(table(SeqVarTools::nAlleles(all_strains_gds) == 2))
SeqArray::seqSetFilter(all_strains_gds,
variant.id=single_allele_fltr)
# select only KN99a and TDY1993 (C8)
# SeqArray::seqSetFilter(
# all_strains_gds,
# sample.sel = SeqArray::seqGetData(
# all_strains_gds, 'sample.id') %in% c('KN99a', 'TDY1993'))
# This is the set of 'fields' we're going to extract from the GDS file. Note
# that the return type of each of these 'fields' will not be the same -- eg,
# genotype returns a 3 dimensional matrix, chromosome returns a vector, ...
data_to_extract = c("chromosome", "position", "$ref", "$alt",
"annotation/qual", "genotype", "annotation/info/ANN",
"annotation/info/DP", "annotation/format/DP",
"annotation/format/RO", "annotation/format/AO",
"sample.id")
# `seqGetData` is one of the methods we can use to extract data from the
# GDS file. Using `seqGetData` extracts the data from the file and returns it
# as a list in memory. In the example below, we now have the annotations data
# in memory and can start to parse it
all_strains_annotations = SeqArray::seqGetData(
all_strains_gds, data_to_extract)
# create unique IDs for the variants
variant_ids = paste0("var_",
seq(1,dim(all_strains_annotations$`annotation/format/DP`)[2]))
# The annotations are returned in a list of two items, `length` and `data`.
# The `length` list looks something like 3, 1, 6, 2, ... where the number
# represents the number of times the variant at that index is annotated to a
# different spnEFF annotation. These annotations are stored in the list `data`.
# So, variant 1 has 3 different annotations in `data`, variant 2 has 1
# annotation, and so on.
#
# The goal of the code below is to extract a "key" column from the
# annotations to the variants. If we consider the variant index to be the key,
# what we want to know is that the first 3 entries of the `data` map to the
# variant at index 1
variant_index = map2(
seq(length(all_strains_annotations$`annotation/info/ANN`$length)),
all_strains_annotations$`annotation/info/ANN`$length,
~rep(.x,.y)) %>%
unlist()
# create a dataframe where the first column is the variant name, the second
# column is the full annotation string, and the `alt`, `effect`, `impact`,
# and `gene` columns are created by parsing out the relevant information from
# the annotation string
variant_annotation_df = tibble(
variant = paste('var', variant_index, sep="_"),
annotation = all_strains_annotations$`annotation/info/ANN`$data) %>%
separate(annotation, into = c('alt', 'effect', 'impact', 'col4', 'gene',
'col6', 'col7', 'protein_coding_effect',
'col9', 'col10', 'aa', 'col12', 'col13',
'col14', 'col15'),
sep = "\\|", fill = "right",remove = FALSE) %>%
select(variant, annotation, alt, effect, impact, gene,
protein_coding_effect, aa) %>%
mutate(protein_coding_effect = ifelse(
protein_coding_effect == 'protein_coding', TRUE, FALSE))
# this is part of a function from the BSA package which was set up as a
# convenience to parse some of this data. It is reused here -- see
# R/tidy_metrics.R
data_to_transform = list(
genotype = all_strains_annotations$genotype[1,,],
RealDepth = all_strains_annotations$`annotation/format/DP`,
Reference = all_strains_annotations$`annotation/format/RO`,
Alternative1 = all_strains_annotations$`annotation/format/AO`$data)
# result is a long data frame with columns sample, variant, RealDepth,
# Reference (which is the reference allele depth) and Alternative1, which is
# the Alternative depth. Note that if single_allele_loci_only is not set,
# then this is not actually Alternative1, but just Alternative
depth_metrics_df = suppressMessages(
purrr::map2(data_to_transform,
names(data_to_transform),
tidy_metrics,
all_strains_annotations$sample.id,
variant_ids) %>%
purrr::reduce(dplyr::left_join))
# Now we're going to create our analysis data -- in this case, we're going to
# join the depth df to a temporary dataframe with the chromosome, position, etc
# information. Then we join the annotations dataframe and filter for positions
# which have RealDepth (actual reads aligning over a given position,
# not counting reads which aligned with a gap over that variant position) of
# at least 10
all_strains_annotated_df = depth_metrics_df %>%
left_join(
tibble(variant = variant_ids,
chr = all_strains_annotations$chromosome,
pos = all_strains_annotations$position,
ref = all_strains_annotations$`$ref`,
alt = all_strains_annotations$`$alt`)) %>%
left_join(variant_annotation_df %>%
select(-annotation, -alt),
by = c('variant'),
relationship = 'many-to-many') %>%
filter(!is.na(RealDepth) & RealDepth >= 10) %>%
# exclude any marker sequences
filter(!chr %in% c("NAT", "G418")) %>%
# factor the `impact` column -- note that I ran everything up to the
# `filter` step above, and then looked at the unique values in `impact`
# to figure out the levels. The order of the labels determines how
# this column will sort, with "HIGH" being highest impact and "MODIFIER"
# the lowest
mutate(impact = factor(impact,levels = c('HIGH', 'MODERATE',
'LOW', 'MODIFIER'))) %>%
# `HIGH` on top
arrange(impact)
# Now we're going to extract just the tdy1993 data and reform it into a
# dataframe with columns `variant` and `tdy1993_genotype`
tdy1993_genotype_df = all_strains_annotated_df %>%
filter(sample == 'TDY1993') %>%
select(variant, genotype, RealDepth, Reference, Alternative1) %>%
dplyr::rename(tdy1993_genotype = genotype,
tdy1993_RealDepth = RealDepth,
tdy1993_Reference = Reference,
tdy1993_Alternative1 = Alternative1)
# read in gene info (parsed from the gtf from fungidb)
kn99_genes = read_csv(here("inst/kn99_genes.csv")) %>%
select(-c(chr, source)) %>%
dplyr::rename(gene_start = start,
gene_end = end,
gene_strand = strand,
gene_description = description)
# And finally, we remove the tdy1993 data from the all_strains data, and then join
# the ysp2_genotype_df so that we now have an additional column called
# `ysp_2_8_genotype`
#
# We filter for "complete cases" (or, only keep rows with no NAs in any column)
# so that we're only looking at variants for which we have reliable data over
# both a given sample, and tdy1993
analysis_df = all_strains_annotated_df %>%
filter(sample == 'KN99a') %>%
left_join(tdy1993_genotype_df,
by = 'variant',
relationship = 'many-to-many') %>%
filter(complete.cases(.)) %>%
filter(genotype != 0 | tdy1993_genotype != 0) %>%
dplyr::rename(kn99a_genotype = genotype,
kn99a_RealDepth = RealDepth,
kn99a_Reference = Reference,
kn99a_Alternative1 = Alternative1) %>%
left_join(kn99_genes, by = c('gene' = 'ID')) %>%
dplyr::select(variant, chr, pos, kn99a_genotype,
tdy1993_genotype,ref,alt,impact,effect,aa,gene,
protein_coding_effect, gene_start, gene_end, gene_strand,
gene_description, kn99a_RealDepth,kn99a_Reference,
kn99a_Alternative1, tdy1993_RealDepth, tdy1993_Reference,
tdy1993_Alternative1) %>%
distinct()
nstrains_per_variant = all_strains_annotated_df %>%
filter(!sample %in% c('KN99a', 'TDY1993')) %>%
group_by(variant) %>%
tally()