from datetime import date
from pycltools.pycltools import jprint
jprint('Adrien Leger / EMBL EBI', bold=True, size=150)
jprint('Starting date : 2020_06_04', bold=True, italic=True, size=125)
jprint('Last modification date : {}_{:02}_{:02}'.format(date.today().year, date.today().month, date.today().day), bold=True, italic=True, size=125)
# Standard lib imports
import os
from datetime import date
from collections import *
import glob
import shutil
import itertools
from pprint import pprint as pp
from tqdm import tqdm
import csv
import re
# Generic third party imports
from pycltools.pycltools import *
import pysam
import pyfaidx
import pybedtools as pbt
import pyranges as pr
import mappy as mp
# Ploting lib imports
import matplotlib.pyplot as pl
from matplotlib.colors import rgb2hex
import seaborn as sns
%matplotlib inline
# Data wrangling lib imports
import pandas as pd
import numpy as np
import scipy as sp
pd.options.display.max_colwidth = 200
pd.options.display.max_columns = 200
pd.options.display.max_rows = 100
pd.options.display.min_rows = 100
outdir = "individual_assemblies"
shutil.rmtree(outdir, ignore_errors=True)
mkdir(outdir)
for sample_id in ["69-1_F3", "131-1_F4", "117-2_C4", "4-2_B2", "7-2_F2", "4-1_B2", "134-2_A5", "79-2_G3", "134-1_H4"]:
fasta = f"/nfs/research1/birney/projects/medaka/inbred_panel/medaka-alignments-release-94/nanopore_assemblies/assembled_fa/{sample_id}.ctg.lay.gz.fa"
print (f"Copying sample {sample_id}")
copyFile(fasta, outdir)
for sample_id in ["80-1_H3", "7-1_E2", "11-1_A3"]:
fasta = f"/nfs/research1/birney/projects/medaka/inbred_panel/medaka-alignments-release-94/nanopore_assemblies/assembled_fa/repeat_runs/{sample_id}.ctg.lay.gz.fa"
print (f"Copying sample {sample_id}")
copyFile(fasta, outdir)
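# Not in the original notebook: optional sanity check that all 12 assemblies were copied into outdir
copied = sorted(glob.glob(f"{outdir}/*.ctg.lay.gz.fa"))
print(f"{len(copied)} assembly files found in {outdir}")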
with open("individual_assemblies/11-1_A3.ctg.lay.gz.fa") as fa_in, open ("individual_assemblies/11-1_A3.ctg.fixed.fa", "w") as fa_out:
for line in fa_in:
line = line.strip('\x00').strip()
fa_out.write(line+"\n")
shutil.move("individual_assemblies/11-1_A3.ctg.fixed.fa", "individual_assemblies/11-1_A3.ctg.lay.gz.fa")
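# Not in the original: optional check that the rewritten file no longer contains NUL bytes
with open("individual_assemblies/11-1_A3.ctg.lay.gz.fa", "rb") as fh:
    assert b"\x00" not in fh.read(), "NUL characters still present"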
for fn in glob.glob("./individual_assemblies/*.ctg.lay.gz.fa"):
n=1
sample_id = os.path.basename(fn).split(".")[0]
out_fn = f"./individual_assemblies/{sample_id}_clean.fa"
stdout_print(f"Parsing sample {sample_id}\n")
with pyfaidx.Fasta(fn) as fa_in, open(out_fn, "w") as fa_out:
for seq in fa_in:
fa_out.write(f">MIKK_{sample_id}_{n}\n{str(seq)}\n")
n+=1
with pyfaidx.Fasta(out_fn) as fa_out:
print(f"\tNumber of bases {len(fa_out)}")
print(f"\tNumber of contigs {len(fa_out.keys())}")
outdir = "./individual_assemblies/summary"
shutil.rmtree(outdir, ignore_errors=True)
mkdir(outdir)
from pandas.plotting import table
from scipy.ndimage import gaussian_filter1d
def compute_N50(data):
    """Return the N50 of a list of contig lengths"""
    data.sort()
    half_sum = sum(data) / 2
    cum_sum = 0
    for v in data:
        cum_sum += v
        if cum_sum >= half_sum:
            return int(v)
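# Not in the original: small worked check of compute_N50 with made-up lengths
# 2+3+4+5+10 = 24, half-sum = 12, and the cumulative sum first reaches 12 at contig length 5
assert compute_N50([2, 3, 4, 5, 10]) == 5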
n_base_d = Counter()
n_contig_d = Counter()
len_contig_d = defaultdict(list)
for fn in glob.glob("./individual_assemblies/*_clean.fa"):
    sample_id = os.path.basename(fn).split("_")[0]
    with pyfaidx.Fasta(fn) as fa_in:
        for seq in fa_in:
            n_base_d[sample_id] += len(seq)
            n_contig_d[sample_id] += 1
            len_contig_d[sample_id].append(len(seq))
n_base_df = pd.DataFrame.from_dict(n_base_d, orient="index", columns=["Cumulative length"])
n_contig_df = pd.DataFrame.from_dict(n_contig_d, orient="index", columns=["Contig number"])
# Process data for summary dataframe
percentile_val = (0, 10, 25, 50, 75, 90, 100)
percentile_lab = ("min", "D1", "Q1", "median", "Q3", "D9", "max")
summary_len_d = defaultdict(OrderedDict)
for sample_id, sample_len_list in len_contig_d.items():
    percentile = np.int32(np.percentile(sample_len_list, percentile_val))
    for l, p in zip(percentile_lab, percentile):
        summary_len_d[sample_id][l] = p
    summary_len_d[sample_id]["N50"] = compute_N50(sample_len_list)
summary_len_df = pd.DataFrame.from_dict(summary_len_d, orient="index")
# Sort index of DF
n_base_df = n_base_df.sort_index()
n_contig_df = n_contig_df.sort_index()
summary_len_df = summary_len_df.sort_index()
# Process data for plot
bins = list(np.logspace(3,7,100))
count_d=OrderedDict()
count_d["length bin"] = bins
for sample_id, sample_len_list in len_contig_d.items():
    count_d[sample_id] = np.histogram(sample_len_list, bins=bins + [1e10])[0]
count_df = pd.DataFrame.from_dict(count_d)
count_df = count_df.set_index("length bin")
count_df = count_df / count_df.sum(axis=0)
for col in count_df.columns:
    count_df[col] = gaussian_filter1d(count_df[col], sigma=2)
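# Note (added): per-sample histograms are normalized then smoothed (sigma = 2 bins) to give
# readable density-like curves in the plot below; count_df is only used for plotting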
with pl.style.context('ggplot'):
    fig, (ax1, ax2, ax3) = pl.subplots(3, 1, figsize=(15, 15))

    # Total assembly length per sample
    n_base_df.plot.bar(ax=ax1, color="tomato", legend=False)
    table(ax1, n_base_df, colWidths=[0.10], colLoc="center", cellLoc="center", fontsize=14, bbox=(1.15, 0, 0.2, 0.7))
    ax1.set_title('Total assembly length per sample')
    ax1.set_ylabel("Total length (bp)")
    ax1.set_xticklabels(ax1.get_xticklabels(), rotation=45, ha='right')

    # Number of contigs per sample
    n_contig_df.plot.bar(ax=ax2, color="steelblue", legend=False)
    table(ax2, n_contig_df, colWidths=[0.10], colLoc="center", cellLoc="center", fontsize=14, bbox=(1.15, 0, 0.2, 0.7))
    ax2.set_title('Number of contigs per sample')
    ax2.set_ylabel("Number of contigs")
    ax2.set_xticklabels(ax2.get_xticklabels(), rotation=45, ha='right')

    # Smoothed contig length distributions with N50 markers
    labels = ['69-1', '131-1', '117-2', '4-2', '7-2', '4-1', '134-2', '79-2', '134-1', '80-1', '7-1', '11-1']
    color_list = ["#e31a1cff", "#1f78b4ff", "#33a02cff", "#6a3d9aff", "#ff7f00ff", "#800000ff", "#a6cee3ff", "#b2df8aff", "#333300ff", "#cccc00ff", "#000080ff", "#008080ff"]
    color_dict = {lab: col for lab, col in zip(labels, color_list)}
    for label, data in count_df.items():
        ax3.plot(data.index, data.values, color=color_dict[label], label=label)
    for label, n50 in summary_len_df["N50"].items():
        ax3.axvline(n50, color=color_dict[label], ls=":", alpha=0.5)
    ax3.legend(bbox_to_anchor=(1, 1), loc=2, facecolor='white', frameon=False)
    table(ax3, summary_len_df, colWidths=[0.05]*8, colLoc="center", cellLoc="center", fontsize=14, bbox=(1.15, 0, 0.5, 0.8))
    ax3.set_xscale("log")
    ax3.set_xlabel("Length (log scale)")
    ax3.set_ylabel("Density")
    ax3.set_title("Contig lengths distribution")
    ax3.set_xlim(1e3, 1e7)
    ax3.set_ylim(0, None)

    fig.tight_layout()
    out_fn = os.path.join(outdir, "assembly_stats.svg")
    fig.savefig(out_fn)
all_df = pd.concat([n_base_df, n_contig_df, summary_len_df], axis=1)
all_df.to_csv(f"{outdir}/assembly_stats.tsv", sep="\t")
display(all_df)
fn_list = glob.glob("./individual_assemblies/*clean.fa")
lab_list = [os.path.basename(fn).split("_")[0] for fn in fn_list]
reference = "./references/Oryzias_latipes_HDRR_clean.fa"
outdir = "./individual_assemblies/quast/"
annotation = "./references/Oryzias_latipes_HDRR_sorted.gff3"
threads = 40
mem = 60000
fn_str = " ".join(fn_list)
lab_str = ",".join(lab_list)
cmd = f"quast-lg --silent -o {outdir} -r {reference} -g gene:{annotation} --threads {threads} --memory-efficient --plots-format svg -l {lab_str} {fn_str}"
bsub(cmd, conda="Quast", threads=threads, mem=mem, dry=False, job_name="Quast")
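# Not in the original: bsub() from pycltools wraps an LSF job submission here. Without an LSF
# scheduler, the same command could presumably be run locally in the "Quast" conda environment, e.g.:
# import subprocess
# subprocess.run(f"conda run -n Quast {cmd}", shell=True, check=True)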
pd.read_csv("./individual_assemblies/quast/transposed_report.tsv", sep="\t", index_col=0).sort_index()
from IPython.display import IFrame
IFrame("./individual_assemblies/quast/report.pdf", width=1600, height=800)