Basecalling

Imports

In [1]:
# Standard lib imports
import os
from datetime import date
from collections import *
from glob import glob, iglob
from shutil import rmtree
import itertools
from pprint import pprint as pp

# Generic third party imports
from pycltools.pycltools import *
import pysam
import pyfaidx
from pyBioTools import Fastq
from pycoQC.pycoQC import pycoQC
from pycoQC.Barcode_split import Barcode_split

# Ploting lib imports
import matplotlib.pyplot as pl
import seaborn as sns
%matplotlib inline

# Data wrangling lib imports
import pandas as pd
import numpy as np
import scipy as sp
pd.options.display.max_colwidth = 200
pd.options.display.max_columns = 200
pd.options.display.max_rows = 50
pd.options.display.min_rows = 50
Bad key "text.kerning_factor" on line 4 in
/nfs/software/birney/adrien/miniconda3/envs/Python3.7/lib/python3.7/site-packages/matplotlib/mpl-data/stylelib/_classic_test_patch.mplstyle.
You probably need to get an updated matplotlibrc file from
https://github.com/matplotlib/matplotlib/blob/v3.1.3/matplotlibrc.template
or from the matplotlib source distribution

Guppy Basecalling

In [2]:
shutil.rmtree("fastq", ignore_errors=True)
mkdir("fastq")
Creating /hps/research1/birney/users/adrien/datasets/medaka_DNA_promethion/brain_run2/fastq
In [13]:
outdir = "./fastq/guppy_4.0.14"
mkdir(outdir)

for fast5_dir in glob.glob("fast5/*"):
    group_name = os.path.split(fast5_dir)[-1]
    print (group_name)
    
    group_outdir = f"{outdir}/{group_name}"
    mkdir(group_outdir)
    
    stdout_fp = f"{group_outdir}/bsub.err"
    stderr_fp = f"{group_outdir}/bsub.out"
    
    # Base command
    cmd = f"ont_guppy_4.0.14 -i {fast5_dir} -s {group_outdir} --calib_detect -c dna_r9.4.1_450bps_hac_prom.cfg --num_callers 16 --recursive --device 'auto' --records_per_fastq 0 --disable_pings --compress_fastq --barcode_kits EXP-NBD104 --trim_barcodes"
    
    gpu = "gpu-010"
    bsub(cmd=cmd, job_name=f"Guppy_{group_name}", conda="ont_guppy", stdout_fp=stdout_fp, stderr_fp=stderr_fp, other_options=f"-P gpu -gpu - -m {gpu}", mem=20000, dry=False)
DN467851H_C4_F4_H4_A5
bsub -J Guppy_DN467851H_C4_F4_H4_A5 -M 20000 -R 'rusage[mem=20000]' -oo ./fastq/guppy_4.0.14/DN467851H_C4_F4_H4_A5/bsub.err -eo ./fastq/guppy_4.0.14/DN467851H_C4_F4_H4_A5/bsub.out -P gpu -gpu - -m gpu-010  "ont_guppy_4.0.14 -i fast5/DN467851H_C4_F4_H4_A5 -s ./fastq/guppy_4.0.14/DN467851H_C4_F4_H4_A5 --calib_detect -c dna_r9.4.1_450bps_hac_prom.cfg --num_callers 16 --recursive --device 'auto' --records_per_fastq 0 --disable_pings --compress_fastq --barcode_kits EXP-NBD104 --trim_barcodes"
Job <9874305> is submitted to default queue <research-rh74>.
DN467851H_B2_C2_E2_F2
bsub -J Guppy_DN467851H_B2_C2_E2_F2 -M 20000 -R 'rusage[mem=20000]' -oo ./fastq/guppy_4.0.14/DN467851H_B2_C2_E2_F2/bsub.err -eo ./fastq/guppy_4.0.14/DN467851H_B2_C2_E2_F2/bsub.out -P gpu -gpu - -m gpu-010  "ont_guppy_4.0.14 -i fast5/DN467851H_B2_C2_E2_F2 -s ./fastq/guppy_4.0.14/DN467851H_B2_C2_E2_F2 --calib_detect -c dna_r9.4.1_450bps_hac_prom.cfg --num_callers 16 --recursive --device 'auto' --records_per_fastq 0 --disable_pings --compress_fastq --barcode_kits EXP-NBD104 --trim_barcodes"
Job <9874314> is submitted to default queue <research-rh74>.
DN467851H_A3_F3_G3_H3
bsub -J Guppy_DN467851H_A3_F3_G3_H3 -M 20000 -R 'rusage[mem=20000]' -oo ./fastq/guppy_4.0.14/DN467851H_A3_F3_G3_H3/bsub.err -eo ./fastq/guppy_4.0.14/DN467851H_A3_F3_G3_H3/bsub.out -P gpu -gpu - -m gpu-010  "ont_guppy_4.0.14 -i fast5/DN467851H_A3_F3_G3_H3 -s ./fastq/guppy_4.0.14/DN467851H_A3_F3_G3_H3 --calib_detect -c dna_r9.4.1_450bps_hac_prom.cfg --num_callers 16 --recursive --device 'auto' --records_per_fastq 0 --disable_pings --compress_fastq --barcode_kits EXP-NBD104 --trim_barcodes"
Job <9874321> is submitted to default queue <research-rh74>.

pycoQC Barcode_split

In [7]:
for group in ("DN467851H_A3_F3_G3_H3", "DN467851H_B2_C2_E2_F2", "DN467851H_C4_F4_H4_A5"):
    stdout_print(f"Process {group}")
    work_dir = f"fastq/guppy_4.0.14/{group}/"
    summary_file = f"{work_dir}/sequencing_summary.txt"
    bsub(conda="pycoQC", cmd=f"Barcode_split -f {summary_file} --output_dir {work_dir}", dry=False, mem=30000, job_name=f"Barcode_split_{group}")
Process DN467851H_A3_F3_G3_H3bsub -J Barcode_split_DN467851H_A3_F3_G3_H3 -M 30000 -R 'rusage[mem=30000]'  "Barcode_split -f fastq/guppy_4.0.14/DN467851H_A3_F3_G3_H3//sequencing_summary.txt --output_dir fastq/guppy_4.0.14/DN467851H_A3_F3_G3_H3/"
Job <2523722> is submitted to default queue <research-rh74>.
Process DN467851H_B2_C2_E2_F2bsub -J Barcode_split_DN467851H_B2_C2_E2_F2 -M 30000 -R 'rusage[mem=30000]'  "Barcode_split -f fastq/guppy_4.0.14/DN467851H_B2_C2_E2_F2//sequencing_summary.txt --output_dir fastq/guppy_4.0.14/DN467851H_B2_C2_E2_F2/"
Job <2523723> is submitted to default queue <research-rh74>.
Process DN467851H_C4_F4_H4_A5bsub -J Barcode_split_DN467851H_C4_F4_H4_A5 -M 30000 -R 'rusage[mem=30000]'  "Barcode_split -f fastq/guppy_4.0.14/DN467851H_C4_F4_H4_A5//sequencing_summary.txt --output_dir fastq/guppy_4.0.14/DN467851H_C4_F4_H4_A5/"
Job <2523725> is submitted to default queue <research-rh74>.

make sample sheet

In [36]:
lt = namedtuple("lt", ["sample_id","barcode","run_group"])
sample_info_list = [
    lt('117-2_C4', "barcode01", "DN467851H_C4_F4_H4_A5"),
    lt('131-1_F4', "barcode02", "DN467851H_C4_F4_H4_A5"),
    lt('134-1_H4', "barcode03", "DN467851H_C4_F4_H4_A5"),
    lt('134-2_A5', "barcode04", "DN467851H_C4_F4_H4_A5"),
    lt('4-1_B2', "barcode05", "DN467851H_B2_C2_E2_F2"),
    lt('4-2_C2', "barcode06", "DN467851H_B2_C2_E2_F2"),
    lt('7-1_E2', "barcode07", "DN467851H_B2_C2_E2_F2"),
    lt('7-2_F2', "barcode08", "DN467851H_B2_C2_E2_F2"),
    lt('11-1_A3', "barcode09", "DN467851H_A3_F3_G3_H3"),
    lt('69-1_F3', "barcode10", "DN467851H_A3_F3_G3_H3"),
    lt('79-2_G3', "barcode11", "DN467851H_A3_F3_G3_H3"),
    lt('80-1_H3', "barcode12", "DN467851H_A3_F3_G3_H3")]

fn = "sample_sheet.tsv"
with open (fn, "w") as fp:
    header = ["sample_id","fastq","fast5","seq_summary","barcode","run_group"]
    fp.write("\t".join(header)+"\n")
    for sample_info in sample_info_list:
        fast5 = os.path.abspath(f"./fast5/{sample_info.run_group}/")
        fastq = os.path.abspath(f"./fastq/guppy_4.0.14/{sample_info.run_group}/{sample_info.barcode}/")
        seq_summary = os.path.abspath(f"./fastq/guppy_4.0.14/{sample_info.run_group}/sequencing_summary_{sample_info.barcode}.txt")
        
        assert (os.access(fast5, os.R_OK))
        assert (os.access(fastq, os.R_OK))
        assert (os.access(seq_summary, os.R_OK))
        
        info = [sample_info.sample_id, fastq, fast5, seq_summary, sample_info.barcode, sample_info.run_group]
        fp.write("\t".join(info)+"\n")
        
head(fn, max_char_col=25, max_char_line=500)
sample_id fastq                     fast5                     seq_summary               barcode   run_group             
117-2_C4  /hps/research1/birney/use /hps/research1/birney/use /hps/research1/birney/use barcode01 DN467851H_C4_F4_H4_A5 
131-1_F4  /hps/research1/birney/use /hps/research1/birney/use /hps/research1/birney/use barcode02 DN467851H_C4_F4_H4_A5 
134-1_H4  /hps/research1/birney/use /hps/research1/birney/use /hps/research1/birney/use barcode03 DN467851H_C4_F4_H4_A5 
134-2_A5  /hps/research1/birney/use /hps/research1/birney/use /hps/research1/birney/use barcode04 DN467851H_C4_F4_H4_A5 
4-1_B2    /hps/research1/birney/use /hps/research1/birney/use /hps/research1/birney/use barcode05 DN467851H_B2_C2_E2_F2 
4-2_C2    /hps/research1/birney/use /hps/research1/birney/use /hps/research1/birney/use barcode06 DN467851H_B2_C2_E2_F2 
7-1_E2    /hps/research1/birney/use /hps/research1/birney/use /hps/research1/birney/use barcode07 DN467851H_B2_C2_E2_F2 
7-2_F2    /hps/research1/birney/use /hps/research1/birney/use /hps/research1/birney/use barcode08 DN467851H_B2_C2_E2_F2 
11-1_A3   /hps/research1/birney/use /hps/research1/birney/use /hps/research1/birney/use barcode09 DN467851H_A3_F3_G3_H3