holstege_deleteome_nps
holstege_deleteome_nps.RmdRaw Data
As of now, the raw data originally published in Kemmeren et al. (2014) is available through the lab’s website. In this example, I am using deleteome_all_mutants_svd_transformed.txt which is the originally published data with an effort made to remove a slow-growth factor of unwanted variance O’Duibhir et al. (2014).
Parsing the kemmeren data
parse_kemmeren_data = function(kemmeren_svd){
# read kemmeren_svd
kemmeren_data_mutants = read_tsv(kemmeren_svd)
# add gene_id as the column name of the first column
colnames(kemmeren_data_mutants)[1] = 'gene_id'
# remove any 'matA' samples
kemmeren_data_mutants = kemmeren_data_mutants %>%
select(-all_of(
colnames(kemmeren_data_mutants)[
str_detect(colnames(kemmeren_data_mutants), "matA")]))
# create a table with columns deleted_locus, which is extracted from the colnames
# of the kemmeren data, and another column with the corresponding column names
kemmeren_col_data = tibble(
deleted_locus = str_extract(
colnames(kemmeren_data_mutants)[3:ncol(kemmeren_data_mutants)], ".+(?=\\.del)"),
colnames = colnames(kemmeren_data_mutants)[3:ncol(kemmeren_data_mutants)]
)
# use gprofilr2 to get the corresponding scerevisiae systematic IDs from the
# gene names in the kemmeren column names
kemmeren_cols_convert = gconvert(kemmeren_col_data$deleted_locus,
organism = "scerevisiae",
mthreshold = 1,
filter_na = FALSE)
# create a dataframe which will act as the colData of the NetProphetDataSet
coldata = kemmeren_col_data %>% left_join(
select(kemmeren_cols_convert, -all_of(c('input_number', 'target_number'))),
by = c("deleted_locus" = "input")
) %>%
filter(!is.na(target)) %>%
select(-deleted_locus) %>%
dplyr::rename(deleted_locus = target, name = name)
# return a list with slots coldata and expr. The expr matrix is the kemmeren
# data, but filtered for those columns in the coldata
list(
coldata = coldata,
expr = kemmeren_data_mutants[,c('gene_id', 'commonName', coldata$colnames)]
)
}Read in s. cerevisiae annotations
Yeast Genome files may be found here
YEAST_GFF = "~/Desktop/rnaseq_pipeline/rnaseq_pipeline/genome_files/S288C_R64/S288C_R64.gff"
# list of top level gene features. Note that the kemmeren data does say that it
# is at the transcript level. However, yeast genes have few exons and these seem
# to be mostly equivalent to genes. That said, I would suggest doing some investigation
# on your own
GFF_GENE_FEATURES = c('gene', 'pseudogene', 'tRNA_gene',
'rRNA_gene', 'ncRNA_gene', 'snoRNA_gene', 'snRNA_gene')
yeast_gff = import(YEAST_GFF)
yeast_genes = yeast_gff[yeast_gff$type %in% GFF_GENE_FEATURES]Parse Uniprot data
To get the data I am using, and more, go to the Uniprot website and enter the following into the search:
taxonomy:"Saccharomyces cerevisiae (strain ATCC 204508 / S288c) (Baker's yeast) [559292]" AND reviewed:yes
Note that you can select a different set of columns if you click the ‘column’ button
UNIPROT_DATA = "data/uniprot_yeast_20211102.tab"
# lots of parsing -- provided as an example, but you will likely need to look
# at the file and extract what you'd like. This will be required if you go to
# the website and select/download a different set of columns
uniprot = read_tsv(UNIPROT_DATA) %>%
dplyr::rename(ID = `Gene names (ordered locus )`,
dna_binding = `DNA binding`) %>%
mutate(dna_binding_interval = str_extract(dna_binding, "\\d+\\.\\.\\d+"),
dna_binding_domain = str_extract(dna_binding, '(?<=note=)".+"'),
dna_binding_domain_evidence = str_extract(dna_binding, '(?<=evidence=)".+"')) %>%
separate(dna_binding_interval, sep="\\.\\.", into=c("dna_binding_start", "dna_binding_end")) %>%
mutate(dna_binding_start = as.integer(dna_binding_start),
dna_binding_end = as.integer(dna_binding_end),
dna_binding_domain = trimws(str_remove_all(dna_binding_domain, ';|"|/evidence.+')),
dna_binding_domain_evidence = str_remove_all(dna_binding_domain_evidence, ';|"'),
dna_binding_domain = str_replace(dna_binding_domain,"A\\.T hook 1 DNA_BIND 1502\\.\\.1513 /note=A.T hook 2 DNA_BIND 1516\\.\\.1526 /note=A\\.T hook 3", "A.T hook"))
colnames(uniprot) = trimws(str_remove_all(colnames(uniprot), "Cross-reference|\\(|\\)|"), which = "both")Read in the list of regulators used in the NP2 paper
ROWNAMES_GENE_CONVERT = here("data/gProfiler_scerevisiae_11-1-2021_5-31-10.csv")
regulators_df = read_csv(ROWNAMES_GENE_CONVERT)create kemmeren granges
regulators_uniprot = uniprot %>%
filter(ID %in% regulators_df$initial_alias)
cant_find_id = setdiff(kemmeren_data$expr$gene_id, yeast_gff$ID)
# deleted if the annotation has since been removed or merged from the annotations
replace_wrong_ids = c("YKL047W", "YLR003C", "YAR061W", "DELETED", "DELETED", "DELETED", "DELETED", "DELETED")
names(replace_wrong_ids) = c("ANR2", "CMS1", "YAR062W", "YDL038C", "YGR272C", "YIL080W", "YIL168W", "YIR044C")
kemmeren_data$expr[which(kemmeren_data$expr$gene_id %in% names(replace_wrong_ids)), "gene_id"] = replace_wrong_ids
kemmeren_data$expr = kemmeren_data$expr %>%
filter(gene_id != "DELETED")
setdiff(kemmeren_data$expr$gene_id, yeast_gff$ID[!is.na(yeast_gff$ID)])
kemmeren_granges = yeast_gff[yeast_gff$ID %in% kemmeren_data$expr$gene_id]
kemmeren_granges = kemmeren_granges[order(match(kemmeren_granges$ID,kemmeren_data$expr$gene_id))]
stopifnot(identical(kemmeren_data$expr$gene_id, kemmeren_granges$ID))add gene-wise data to the granges object
kemmeren_granges[which(is.na(kemmeren_granges$gene))]$gene =
kemmeren_data$expr[which(is.na(kemmeren_granges$gene)), 'commonName']
kemmeren_granges$regulator = ifelse(kemmeren_granges$ID %in% regulators_df$initial_alias,
TRUE,
FALSE)
uniprot_cols_for_granges = as_tibble(kemmeren_granges) %>%
left_join(uniprot)
elementMetadata(kemmeren_granges)[colnames(uniprot)[3:ncol(uniprot)]] =
uniprot_cols_for_granges[,colnames(uniprot)[3:ncol(uniprot)]]
kem_tfs = kemmeren_granges[!is.na(kemmeren_granges$dna_binding_domain)]
expr = as.matrix(select(kemmeren_data$expr, -gene_id, -commonName))
rownames(expr) = kemmeren_data$expr$gene_id
del_loci_no_gene = setdiff(kemmeren_data$coldata$deleted_locus,
kemmeren_granges[kemmeren_granges$ID %in% kemmeren_data$coldata$deleted_locus]$ID)
fltr_coldata = kemmeren_data$coldata[!kemmeren_data$coldata$deleted_locus %in% del_loci_no_gene, ]
fltr_expr = expr[,fltr_coldata$colnames]create the regulation matrix
regulators = read_tsv("data/YEAST_DATA/YEAST/OUTPUT/regulators", col_names = FALSE)$X1
regulators = regulators[regulators %in% fltr_coldata$deleted_locus]
tf_fltr = kemmeren_granges$ID %in% regulators
regulation_matrix = matrix(TRUE, nrow(fltr_expr), nrow(fltr_expr),
dimnames = list(rownames(fltr_expr), rownames(fltr_expr)))
regulation_matrix = regulation_matrix[,rownames(fltr_expr) %in% regulators]
self_reg = intersect(rownames(regulation_matrix), colnames(regulation_matrix))
regulation_matrix[cbind(self_reg,self_reg)] = FALSEcreate the NetProphetDataSet
kem_np = NetProphetDataSet(
expr = fltr_expr,
regMatrix = regulation_matrix,
rowRanges = kemmeren_granges,
colData = fltr_coldata
)Result
Kemmeren, P., K. Sameith, L. A. van de Pasch, J. J. Benschop, T. L. Lenstra, T. Margaritis, E. O’Duibhir, et al. 2014. “Large-scale genetic perturbations reveal regulatory networks and an abundance of gene-specific repressors.” Cell 157 (3): 740–52.
O’Duibhir, Eoghan, Philip Lijnzaad, Joris J Benschop, Tineke L Lenstra, Dik van Leenen, Marian JA Groot Koerkamp, Thanasis Margaritis, Mariel O Brok, Patrick Kemmeren, and Frank CP Holstege. 2014. “Cell Cycle Population Effects in Perturbation Studies.” Molecular Systems Biology 10 (6): 732. https://doi.org/https://doi.org/10.15252/msb.20145172.