# Standard lib imports
import os
from datetime import date
from collections import *
from glob import glob, iglob
from shutil import rmtree
import itertools
from pprint import pprint as pp
# Generic third party imports
from pycltools.pycltools import *
import pysam
import pyfaidx
from pyBioTools import Fastq
from pycoQC.pycoQC import pycoQC
from pycoQC.Barcode_split import Barcode_split
# Ploting lib imports
import matplotlib.pyplot as pl
import seaborn as sns
%matplotlib inline
# Data wrangling lib imports
import pandas as pd
import numpy as np
import scipy as sp
pd.options.display.max_colwidth = 200
pd.options.display.max_columns = 200
pd.options.display.max_rows = 50
pd.options.display.min_rows = 50
shutil.rmtree("fastq", ignore_errors=True)
mkdir("fastq")
outdir = "./fastq/guppy_4.0.14"
mkdir(outdir)
for fast5_dir in glob.glob("fast5/*"):
group_name = os.path.split(fast5_dir)[-1]
print (group_name)
group_outdir = f"{outdir}/{group_name}"
mkdir(group_outdir)
stdout_fp = f"{group_outdir}/bsub.err"
stderr_fp = f"{group_outdir}/bsub.out"
# Base command
cmd = f"ont_guppy_4.0.14 -i {fast5_dir} -s {group_outdir} --calib_detect -c dna_r9.4.1_450bps_hac_prom.cfg --num_callers 16 --recursive --device 'auto' --records_per_fastq 0 --disable_pings --compress_fastq --barcode_kits EXP-NBD104 --trim_barcodes"
gpu = "gpu-010"
bsub(cmd=cmd, job_name=f"Guppy_{group_name}", conda="ont_guppy", stdout_fp=stdout_fp, stderr_fp=stderr_fp, other_options=f"-P gpu -gpu - -m {gpu}", mem=20000, dry=False)
for group in ("DN467851H_A3_F3_G3_H3", "DN467851H_B2_C2_E2_F2", "DN467851H_C4_F4_H4_A5"):
stdout_print(f"Process {group}")
work_dir = f"fastq/guppy_4.0.14/{group}/"
summary_file = f"{work_dir}/sequencing_summary.txt"
bsub(conda="pycoQC", cmd=f"Barcode_split -f {summary_file} --output_dir {work_dir}", dry=False, mem=30000, job_name=f"Barcode_split_{group}")
lt = namedtuple("lt", ["sample_id","barcode","run_group"])
sample_info_list = [
lt('117-2_C4', "barcode01", "DN467851H_C4_F4_H4_A5"),
lt('131-1_F4', "barcode02", "DN467851H_C4_F4_H4_A5"),
lt('134-1_H4', "barcode03", "DN467851H_C4_F4_H4_A5"),
lt('134-2_A5', "barcode04", "DN467851H_C4_F4_H4_A5"),
lt('4-1_B2', "barcode05", "DN467851H_B2_C2_E2_F2"),
lt('4-2_C2', "barcode06", "DN467851H_B2_C2_E2_F2"),
lt('7-1_E2', "barcode07", "DN467851H_B2_C2_E2_F2"),
lt('7-2_F2', "barcode08", "DN467851H_B2_C2_E2_F2"),
lt('11-1_A3', "barcode09", "DN467851H_A3_F3_G3_H3"),
lt('69-1_F3', "barcode10", "DN467851H_A3_F3_G3_H3"),
lt('79-2_G3', "barcode11", "DN467851H_A3_F3_G3_H3"),
lt('80-1_H3', "barcode12", "DN467851H_A3_F3_G3_H3")]
fn = "sample_sheet.tsv"
with open (fn, "w") as fp:
header = ["sample_id","fastq","fast5","seq_summary","barcode","run_group"]
fp.write("\t".join(header)+"\n")
for sample_info in sample_info_list:
fast5 = os.path.abspath(f"./fast5/{sample_info.run_group}/")
fastq = os.path.abspath(f"./fastq/guppy_4.0.14/{sample_info.run_group}/{sample_info.barcode}/")
seq_summary = os.path.abspath(f"./fastq/guppy_4.0.14/{sample_info.run_group}/sequencing_summary_{sample_info.barcode}.txt")
assert (os.access(fast5, os.R_OK))
assert (os.access(fastq, os.R_OK))
assert (os.access(seq_summary, os.R_OK))
info = [sample_info.sample_id, fastq, fast5, seq_summary, sample_info.barcode, sample_info.run_group]
fp.write("\t".join(info)+"\n")
head(fn, max_char_col=25, max_char_line=500)