Create nf-core/rnaseq samplesheet

Following a request we made in 2021, MGI includes a TSV version of the demux samplesheet (`Demux_Summary_Table.txt`) in each RNA-seq batch they return to us. This script uses that table to create the samplesheet input for nf-core/rnaseq.

library(tidyverse)
library(here)

# control output ---------------------------------------------------------------
# WRITE_OUT gates the write_csv() call in the write-out section; leave it FALSE
# to build and inspect the samplesheet without writing anything to disk.
WRITE_OUT <- FALSE
# NOTE(review): LOCAL is not referenced in the code visible here -- confirm
# whether it is still needed.
LOCAL <- FALSE

# set variables ----------------------------------------------------------------
# BATCH is stamped on every samplesheet row and names the output csv.
BATCH <- "20231016"

# Name of this batch's fastq directory under the data prefixes; it must contain
# BATCH (asserted before the metadata is read).
FASTQ_DIR <- "20231016"

# read in mgi metadata ---------------------------------------------------------
# Sanity check: the fastq directory name has to contain the batch label.
stopifnot(str_detect(FASTQ_DIR, BATCH))

# NOTE(review): DEMUX_PREFIX is not used in the code visible here -- confirm
# whether anything else still depends on it.
DEMUX_PREFIX <- "demux_table_"

# Directory the batch is read from while this script runs.
LOCAL_FASTQ_PREFIX <- file.path(
  "/mnt/scratch/llfs_rna_pipeline/data", FASTQ_DIR
)

# Directory prefix written into the samplesheet's fastq_1 / fastq_2 paths.
OUT_FASTQ_PREFIX <- file.path(
  "/scratch/mblab/chasem/llfs_rna_pipeline/data", FASTQ_DIR
)

# The demux summary table sits next to the fastq files of the batch.
demux_table_path <- file.path(LOCAL_FASTQ_PREFIX, "Demux_Summary_Table.txt")

# Stop early if the table is missing rather than failing inside read_tsv().
stopifnot(file.exists(demux_table_path))

demux_table <- read_tsv(demux_table_path)

# create fastq lookup ----------------------------------------------------------
# One row per path in the local batch directory whose file name contains
# "fastq".
fastq_lookup <- LOCAL_FASTQ_PREFIX %>%
  file.path("*fastq*") %>%
  Sys.glob() %>%
  tibble(fastq_path = .)

# munge ------------------------------------------------------------------------

# Rebuild the demux table's File.Basename as "<Sample.Name>.<File.Basename>" so
# it can serve as the join key against the fastq file names below.
# NOTE(review): assumes the fastq files are named
# "<Sample.Name>.<File.Basename>_R<n>.fastq.gz" -- confirm against a batch.
demux_table_mutate <- demux_table %>%
  mutate(File.Basename = paste(Sample.Name, File.Basename, sep = "."))

# Split every fastq file name into the join key (everything before the
# "_R<n>.fastq.gz" suffix) and the read label ("R1", "R2", ...).
# Both patterns are anchored to the end of the name with the dots escaped, so
#   * an "R<digit>" inside the sample/library name is never taken for the read,
#   * sidecar files picked up by the "*fastq*" glob (e.g. ".fastq.gz.md5") get
#     NA for both fields instead of colliding with the real fastq in the join.
fastq_lookup_mutate <- fastq_lookup %>%
  mutate(
    fastq_basename = basename(fastq_path),
    read = str_extract(fastq_basename, "(?<=_)R\\d(?=\\.fastq\\.gz$)"),
    File.Basename = str_extract(fastq_basename, ".+(?=_R\\d\\.fastq\\.gz$)")
  ) %>%
  dplyr::select(File.Basename, fastq_basename, read)

# create nf sample sheet -------------------------------------------------------
# Attach the fastq file names to each demux row, spread them into one column
# per read (R1, R2), and keep the nf-core/rnaseq columns (sample, fastq_1,
# fastq_2, strandedness) followed by the batch label and the MGI QC metadata.
nf_samplesheet <- demux_table_mutate %>%
  # Explicit key: no "Joining with `by = ...`" message and no accidental join
  # on any other column the two tables might come to share.
  left_join(fastq_lookup_mutate, by = "File.Basename") %>%
  pivot_wider(names_from = read, values_from = fastq_basename) %>%
  mutate(
    strandedness = "unstranded",
    batch = BATCH,
    # file.path() would turn a missing file name into the literal ".../NA";
    # keep it NA so a sample without a fastq stays detectable.
    fastq_1 = if_else(is.na(R1),
                      NA_character_,
                      file.path(OUT_FASTQ_PREFIX, R1)),
    fastq_2 = if_else(is.na(R2),
                      NA_character_,
                      file.path(OUT_FASTQ_PREFIX, R2))
  ) %>%
  # select() renames and orders in one step; every other column is dropped.
  dplyr::select(
    sample = Sample.Name,
    fastq_1,
    fastq_2,
    strandedness,
    batch,
    index_sequence = Index.Sequence,
    flow_cell_id = Flow.Cell.ID,
    lane = Lane.Number,
    library_name = Library.Name,
    estimated_total_bases_per_kb = Estimated.Total.Bases.per.KB,
    estimated_total_reads = Estimated.Total.Read.Pairs,
    average_q_score = Average.Q.Score,
    percent_above_q30 = Percent.Above.Q30,
    percent_phix_error = Percent.PhiX.Error
  )

# Surface demux rows that did not find both reads instead of silently writing
# an unusable samplesheet.
if (anyNA(nf_samplesheet$fastq_1) || anyNA(nf_samplesheet$fastq_2)) {
  warning(
    "some rows of the demux table have no matching R1/R2 fastq in ",
    LOCAL_FASTQ_PREFIX,
    call. = FALSE
  )
}

# write out --------------------------------------------------------------------
# Only writes when WRITE_OUT is TRUE; the csv is named after the batch and goes
# to the pipeline's sample_sheets directory.
if (WRITE_OUT) {
  nf_samplesheet %>%
    write_csv(
      file.path(
        "/mnt/scratch/llfs_rna_pipeline",
        "sample_sheets",
        paste0(BATCH, ".csv")
      )
    )
}