from datetime import date
from pycltools.pycltools import jprint
jprint('Adrien Leger / EMBL EBI', bold=True, size=150)
jprint('Starting date : 2020_06_04', bold=True, italic=True, size=125)
jprint('Last modification date : {}_{:02}_{:02}'.format(date.today().year, date.today().month, date.today().day), bold=True, italic=True, size=125)
# Standard lib imports
import os
from datetime import date
from collections import namedtuple, OrderedDict, defaultdict, Counter
from glob import glob, iglob
import shutil
import itertools
from pprint import pprint as pp
from tqdm import tqdm
import csv
import re
# Generic third party imports
from pycltools.pycltools import *
import pysam
import pyfaidx
import pybedtools as pbt
import pyranges as pr
import mappy as mp
# Plotting lib imports
import matplotlib.pyplot as pl
from matplotlib.colors import rgb2hex
import seaborn as sns
%matplotlib inline
# Data wrangling lib imports
import pandas as pd
import numpy as np
import scipy as sp
pd.options.display.max_colwidth = 200
pd.options.display.max_columns = 200
pd.options.display.max_rows = 100
pd.options.display.min_rows = 100
outdir = "./references/"
shutil.rmtree(outdir, ignore_errors=True)
mkdir(outdir)
outdir = "./references/"
for label, link in (
        ("HDRR", "ftp://ftp.ensembl.org/pub/release-100/fasta/oryzias_latipes/dna/Oryzias_latipes.ASM223467v1.dna.toplevel.fa.gz"),
        ("HDRR+", "http://utgenome.org/medaka_v2/Medaka-Hd-rR-PacBio_unanchored_contigs.fasta.gz"),
        ("HNI", "ftp://ftp.ensembl.org/pub/release-100/fasta/oryzias_latipes_hni/dna/Oryzias_latipes_hni.ASM223471v1.dna.toplevel.fa.gz"),
        ("HNI+", "http://utgenome.org/medaka_v2/Medaka-HNI-PacBio_unanchored_contigs.fasta.gz"),
        ("HSOK", "ftp://ftp.ensembl.org/pub/release-100/fasta/oryzias_latipes_hsok/dna/Oryzias_latipes_hsok.ASM223469v1.dna.toplevel.fa.gz"),
        ("HSOK+", "http://utgenome.org/medaka_v2/Medaka-HSOK-PacBio_unanchored_contigs.fasta.gz")):
    print(f"Downloading and cleaning {label}")
    zipped_fn = os.path.join(outdir, f"Oryzias_latipes_{label}.fa.gz")
    unziped_fn = os.path.join(outdir, f"Oryzias_latipes_{label}.fa")
    cleaned_fn = os.path.join(outdir, f"Oryzias_latipes_{label}_clean.fa")
    # Download
    wget(link, zipped_fn)
    # Extract Gzip
    gunzip_file(zipped_fn, unziped_fn)
    # Clean up fasta names (prefix with assembly label)
    with open(cleaned_fn, "w") as fa_out, pyfaidx.Fasta(unziped_fn) as fa_in:
        for seq in fa_in:
            fa_out.write(f">{label}_{seq.name}\n{str(seq)}\n")
    # Index file and report the number of sequences
    with pyfaidx.Fasta(cleaned_fn) as fa:
        print(len(fa))
    # Remove originals
    for fn in [zipped_fn, unziped_fn, unziped_fn+".fai"]:
        os.remove(fn)
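# Optional sanity check (a minimal sketch, not part of the original pipeline):
# confirm that the cleaned fasta headers now carry the "<label>_" prefix
with pyfaidx.Fasta("./references/Oryzias_latipes_HDRR_clean.fa") as fa:
    for i, seq in enumerate(fa):
        print(seq.name, len(seq))
        if i >= 2:
            break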
outdir = "./references/"
for label, link in (
        ("HDRR", "ftp://ftp.ensembl.org/pub/release-100/gff3/oryzias_latipes/Oryzias_latipes.ASM223467v1.100.gff3.gz"),
        ("HNI", "ftp://ftp.ensembl.org/pub/release-100/gff3/oryzias_latipes_hni/Oryzias_latipes_hni.ASM223471v1.100.gff3.gz"),
        ("HSOK", "ftp://ftp.ensembl.org/pub/release-100/gff3/oryzias_latipes_hsok/Oryzias_latipes_hsok.ASM223469v1.100.gff3.gz")):
    print(f"Downloading and cleaning {label}")
    zipped_fn = os.path.join(outdir, f"Oryzias_latipes_{label}.gff3.gz")
    unziped_fn = os.path.join(outdir, f"Oryzias_latipes_{label}.gff3")
    cleaned_fn = os.path.join(outdir, f"Oryzias_latipes_{label}_clean.gff3")
    sorted_fn = os.path.join(outdir, f"Oryzias_latipes_{label}_sorted.gff3")
    # Download
    wget(link, zipped_fn)
    # Extract Gzip
    gunzip_file(zipped_fn, unziped_fn)
    # Clean up sequence names (prefix seqids with the assembly label)
    with open(unziped_fn, "r") as gff_in, open(cleaned_fn, "w") as gff_out:
        for line in gff_in:
            line = line.strip()
            if line.startswith("##sequence-region"):
                fields = line.split()
                fields[1] = "{}_{}".format(label, fields[1])
                line = " ".join(fields)
            elif line and not line.startswith("#"):
                # Split on tabs only: the GFF3 attribute column can contain spaces
                fields = line.split("\t")
                fields[0] = "{}_{}".format(label, fields[0])
                line = "\t".join(fields)
            gff_out.write(line+"\n")
    # Sort and index for IGV
    bash(f"igvtools sort {cleaned_fn} {sorted_fn}", conda="IGV")
    bash(f"igvtools index {sorted_fn}", conda="IGV")
    # Remove originals
    os.remove(zipped_fn)
    os.remove(unziped_fn)
    os.remove(cleaned_fn)
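# Optional consistency check (sketch): IGV can only pair the annotation with the
# genome if the GFF3 seqids match the renamed fasta headers
with pyfaidx.Fasta("./references/Oryzias_latipes_HDRR_clean.fa") as fa:
    fa_names = set(fa.keys())
gff_seqids = set()
with open("./references/Oryzias_latipes_HDRR_sorted.gff3") as gff:
    for line in gff:
        if line.strip() and not line.startswith("#"):
            gff_seqids.add(line.split("\t")[0])
print("GFF seqids missing from fasta:", gff_seqids - fa_names)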
references_list = [
'./references/Oryzias_latipes_HDRR_clean.fa',
'./references/Oryzias_latipes_HDRR+_clean.fa',
'./references/Oryzias_latipes_HNI_clean.fa',
'./references/Oryzias_latipes_HNI+_clean.fa',
'./references/Oryzias_latipes_HSOK_clean.fa',
'./references/Oryzias_latipes_HSOK+_clean.fa']
alternative_list = glob("./individual_assemblies/*_clean.fa")
all_ref = references_list+alternative_list
out_fn = "references/Oryzias_latipes_all_assemblies_contig.fa"
with open(out_fn, "w") as fa_out:
    for fasta_fn in all_ref:
        print(f"Copy {fasta_fn}")
        with open(fasta_fn, "r") as fa_in:
            shutil.copyfileobj(fa_in, fa_out)
# Index concatenated fasta and report the number of sequences
with pyfaidx.Fasta(out_fn) as fa:
    print(len(fa))
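# The label prefixes added earlier keep names unique across assemblies; a quick
# per-prefix count (sketch) makes the composition of the merged fasta visible
with pyfaidx.Fasta(out_fn) as fa:
    prefix_counts = Counter(name.split("_")[0] for name in fa.keys())
print(prefix_counts)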
gff_list = [
'./references/Oryzias_latipes_HDRR_sorted.gff3',
'./references/Oryzias_latipes_HNI_sorted.gff3',
'./references/Oryzias_latipes_HSOK_sorted.gff3']
out_fn = "references/Oryzias_latipes_all_assemblies.gff3"
out_fn_sorted = "references/Oryzias_latipes_all_assemblies_sorted.gff3"
with open(out_fn, "w") as gff_out:
    gff_out.write("##gff-version 3\n")
    print("Copy headers")
    for gff_fn in gff_list:
        with open(gff_fn) as gff_fp:
            for line in gff_fp:
                line = line.strip()
                if line.startswith("##sequence-region"):
                    gff_out.write(f"{line}\n")
                elif not line.startswith("#"):
                    break
    print("Copy annotations")
    for gff_fn in gff_list:
        with open(gff_fn) as gff_fp:
            for line in gff_fp:
                line = line.strip()
                if not line.startswith("#"):
                    gff_out.write(f"{line}\n")
# Sort and index for IGV
print(f"Sort and index with igvtools")
bash (f"igvtools sort {out_fn} {out_fn_sorted}", conda="IGV")
bash (f"igvtools index {out_fn_sorted}", conda="IGV")
# Remove originals
os.remove(out_fn)
outdir = "./graph_assembly"
#shutil.rmtree(outdir, ignore_errors=True)
mkdir(outdir)
mem = 60000
threads = 20
dry = False
references_list = [
'./references/Oryzias_latipes_HDRR_clean.fa',
'./references/Oryzias_latipes_HDRR+_clean.fa',
'./references/Oryzias_latipes_HNI_clean.fa',
'./references/Oryzias_latipes_HNI+_clean.fa',
'./references/Oryzias_latipes_HSOK_clean.fa',
'./references/Oryzias_latipes_HSOK+_clean.fa']
alternative_list = glob("./individual_assemblies/*_clean.fa")
all_ref = " ".join(references_list+alternative_list)
gfa = f"{outdir}/all_ref_graph.gfa"
cmd = f"minigraph -x ggs -L 100 -t {threads} {all_ref} > {gfa}"
bsub(cmd, conda="minigraph", threads=threads, mem=mem)
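# Once the cluster job submitted above has finished, a one-pass count of GFA
# record types (sketch) gives a first idea of graph size: S = segments, L = links
rec_counts = Counter()
with open(gfa) as fp:
    for line in fp:
        rec_counts[line[0]] += 1
print(rec_counts)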
graph_fn = "./graph_assembly/all_ref_graph.gfa"
seg_line = namedtuple("S", ["seg_id", "sequence", "length", "chromosome", "start", "rank"])
seg_line_list = []
link_line = namedtuple("L", ["seg1_id", "seg1_orient", "seg2_id", "seg2_orient", "cigar", "rank", "seg1_length", "seg2_length"])
link_line_list = []
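# For reference, minigraph emits rGFA-flavoured records roughly like the
# following (LN = length, SN = source sequence name, SO = offset, SR = rank),
# which is why the parsing below takes the value after the last ":" of each tag:
# S  s1  ACGT...  LN:i:223467  SN:Z:HDRR_1  SO:i:0  SR:i:0
# L  s1  +  s2  +  0M  SR:i:1  L1:i:223467  L2:i:4589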
with open(graph_fn) as fp:
    for l in tqdm(fp):
        l = l.strip().split("\t")
        if l[0] == "S":
            sl = seg_line(l[1], l[2], int(l[3].split(":")[-1]), l[4].split(":")[-1], int(l[5].split(":")[-1]), int(l[6].split(":")[-1]))
            seg_line_list.append(sl)
        if l[0] == "L":
            ll = link_line(l[1], l[2], l[3], l[4], l[5], int(l[6].split(":")[-1]), int(l[7].split(":")[-1]), int(l[8].split(":")[-1]))
            link_line_list.append(ll)
seg_df = pd.DataFrame(seg_line_list)
link_df = pd.DataFrame(link_line_list)
with pd.option_context("display.max_colwidth", 50):
    display(seg_df.sample(10))
    display(link_df.sample(10))
Coord = namedtuple("coord", ["chromosome", "start", "end", "start_seg_id", "end_seg_id"])
def find_connected_ref (seg_id, seg_info_d, path=None, reference_type="HDRR"):
    seg_info = seg_info_d[seg_id]
    # If leaf is of reference type, return its coordinates. Start and end are
    # deliberately swapped so that the min/max logic below converges on the
    # inner gap between the flanking reference segments
    if seg_info["type"] == reference_type:
        return Coord(seg_info["chromosome"], seg_info["end"], seg_info["start"], seg_id, seg_id)
    # Add any visited node to the path
    if not path:
        path = []
    path.append(seg_id)
    best_chromosome = best_start = best_end = best_start_seg_id = best_end_seg_id = None
    for s in seg_info["connected"]:
        # Avoid circular paths
        if s not in path:
            # Recursively search for connected segments, propagating the reference type
            coord = find_connected_ref(seg_id=s, seg_info_d=seg_info_d, path=path, reference_type=reference_type)
            # Explicit None checks: position 0 is a valid coordinate
            if coord.chromosome is not None and coord.start is not None and coord.end is not None:
                if not best_chromosome:
                    best_chromosome = coord.chromosome
                else:
                    assert best_chromosome == coord.chromosome
                if best_end is None or coord.end > best_end:
                    best_end = coord.end
                    best_end_seg_id = coord.end_seg_id
                if best_start is None or coord.start < best_start:
                    best_start = coord.start
                    best_start_seg_id = coord.start_seg_id
    return Coord(best_chromosome, best_start, best_end, best_start_seg_id, best_end_seg_id)
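# Toy example (hypothetical ids and coordinates): a MIKK segment flanked by two
# HDRR segments gets anchored to the gap between its reference flanks
toy_seg_info_d = {
    "s1": {"type":"HDRR", "chromosome":"HDRR_1", "start":0, "end":1000, "connected":["s2"]},
    "s2": {"type":"MIKK", "chromosome":"MIKK_ctg1", "start":0, "end":500, "connected":["s1", "s3"]},
    "s3": {"type":"HDRR", "chromosome":"HDRR_1", "start":1200, "end":2000, "connected":["s2"]}}
print(find_connected_ref("s2", toy_seg_info_d))
# -> coord(chromosome='HDRR_1', start=1000, end=1200, start_seg_id='s1', end_seg_id='s3')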
def sort_seg (s1, s2, s1_orient, s2_orient):
    """Order a link by numeric segment id, flipping both orientations if the pair is swapped"""
    s1_num = int(s1[1:])
    s2_num = int(s2[1:])
    if s1_num > s2_num:
        inv_dict = {"+":"-", "-":"+"}
        s1, s2 = s2, s1
        s1_orient, s2_orient = inv_dict[s2_orient], inv_dict[s1_orient]
    return (s1, s2, s1_orient, s2_orient)
def get_link_info(s1_orient, s2_orient, s1_info, s2_info):
    """Classify the junction between two segments and estimate the gap it spans"""
    s1_chromosome = s1_info["chromosome"]
    s1_pos = s1_info["end"] if s1_orient == "+" else s1_info["start"]
    s1_type = s1_info["type"]
    s2_chromosome = s2_info["chromosome"]
    s2_pos = s2_info["start"] if s2_orient == "+" else s2_info["end"]
    s2_type = s2_info["type"]
    if s1_chromosome == s2_chromosome:
        connect_type = "intra_{}".format(s1_type)
        # Calculate length of break point
        connect_gap = s2_pos-s1_pos if s1_orient == "+" else s1_pos-s2_pos
        # Define type of jump depending on break length
        if s1_orient == s2_orient:
            if connect_gap == 0:
                SV_type = "continuous"
            elif connect_gap > 0:
                SV_type = "deletion"
            else:
                SV_type = "duplication"
        else:
            SV_type = "inversion"
    elif s1_type == s2_type:
        connect_type = "inter_{}".format(s1_type)
        connect_gap = None
        SV_type = "insertion"
    else:
        connect_type = "{}_{}".format(s1_type, s2_type)
        connect_gap = None
        SV_type = "insertion"
    d = OrderedDict(
        s1_chromosome = s1_chromosome,
        s1_pos = s1_pos,
        s1_type = s1_type,
        s2_chromosome = s2_chromosome,
        s2_pos = s2_pos,
        s2_type = s2_type,
        connect_type = connect_type,
        connect_gap = connect_gap,
        SV_type = SV_type)
    return d
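# Toy example (hypothetical coordinates): a forward-forward junction between
# two HDRR segments that skips 150 bp of reference is classified as a deletion
s1_info = {"chromosome":"HDRR_1", "start":0, "end":1000, "type":"HDRR"}
s2_info = {"chromosome":"HDRR_1", "start":1150, "end":2000, "type":"HDRR"}
pp(get_link_info(s1_orient="+", s2_orient="+", s1_info=s1_info, s2_info=s2_info))
# -> connect_type='intra_HDRR', connect_gap=150, SV_type='deletion'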
graph_fn = "./graph_assembly/all_ref_graph.gfa"
main_ref_fasta = "./references/Oryzias_latipes_HDRR_clean.fa"
seg_info_out_fn = "./graph_assembly/all_ref_graph_seg_info.pkl"
link_info_out_fn = "./graph_assembly/all_ref_graph_link_info.pkl"
graph_segments_fasta = "./graph_assembly/Oryzias_latipes_graph_segments.fa"
link_info_d = OrderedDict()
seg_info_d = OrderedDict()
max_comp = 0
min_len_align = 200
ref_label = "HDRR"
# Align remaining segments to reference to define identity
stdout_print("Make minimap db\n")
minimap_db = mp.Aligner(main_ref_fasta, best_n=1, scoring=(2,4,4,2,24,0), k=15, n_threads=10)
stdout_print("Annotate segments and links\n")
with open(graph_fn) as graph_fp, open(graph_segments_fasta, "w") as fasta_fp:
    for l in tqdm(graph_fp):
        l = l.strip().split("\t")
        # Parse segment records
        if l[0] == "S":
            seg_id = l[1]
            seq = l[2]
            chromosome = l[4].split(":")[-1]
            length = int(l[3].split(":")[-1])
            start = int(l[5].split(":")[-1])
            end = start+length
            seg_type = chromosome.split("_")[0]
            # Write segment sequences to fasta
            fasta_fp.write(f">{seg_id}\n{seq}\n")
            # Try to align to ref if long enough
            if seg_type != ref_label and length >= min_len_align:
                hits = minimap_db.map(seq=seq)
                try:
                    hit = next(hits)
                    # Identity = matching bases / (aligned bases + soft-clipped ends)
                    soft_clip = hit.q_st+(length-hit.q_en)
                    identity = (hit.blen-hit.NM)/(hit.blen+soft_clip)
                    hit_chromosome, hit_start, hit_end = hit.ctg, hit.r_st, hit.r_en
                except StopIteration:
                    identity = 0
                    hit_chromosome = hit_start = hit_end = None
                finally:
                    del hits
            else:
                identity = None
                hit_chromosome = hit_start = hit_end = None
            seg_info_d[seg_id] = OrderedDict(
                chromosome = chromosome,
                start = start,
                end = end,
                length = length,
                type = seg_type,
                identity = identity,
                hit_chromosome = hit_chromosome,
                hit_start = hit_start,
                hit_end = hit_end,
                component = 0,
                connected = [])
        # Record all possible connections between segments
        if l[0] == "L":
            # Order the pair by segment id and flip orientations accordingly
            s1, s2, s1_orient, s2_orient = sort_seg(s1=l[1], s2=l[3], s1_orient=l[2], s2_orient=l[4])
            s1_info = seg_info_d[s1]
            s2_info = seg_info_d[s2]
            # Characterise junction type
            link_info = get_link_info(s1_orient=s1_orient, s2_orient=s2_orient, s1_info=s1_info, s2_info=s2_info)
            link_id = "{}{}_{}{}".format(s1, s1_orient, s2, s2_orient)
            link_info_d[link_id] = link_info
            # Update list of connected segments
            seg_info_d[s2]["connected"].append(s1)
            seg_info_d[s1]["connected"].append(s2)
            # Determine the connected component the pair belongs to
            # (assumes links never bridge two previously assigned components, as checked by the assert)
            s1_comp = s1_info["component"]
            s2_comp = s2_info["component"]
            if not s1_comp and not s2_comp:
                max_comp += 1
                seg_info_d[s1]["component"] = max_comp
                seg_info_d[s2]["component"] = max_comp
            elif not s1_comp:
                seg_info_d[s1]["component"] = s2_comp
            elif not s2_comp:
                seg_info_d[s2]["component"] = s1_comp
            else:
                assert s1_comp == s2_comp
del minimap_db
# Find connected reference coordinates
stdout_print("Traverse graph to find reference anchor points\n")
for seg_id, seg_info in tqdm(seg_info_d.items()):
    if seg_info["type"] != ref_label:
        coord = find_connected_ref(seg_id=seg_id, seg_info_d=seg_info_d, reference_type=ref_label)
        seg_info_d[seg_id]["connected_chromosome"] = coord.chromosome
        seg_info_d[seg_id]["connected_start"] = coord.start
        seg_info_d[seg_id]["connected_end"] = coord.end
        seg_info_d[seg_id]["connected_seg_start"] = coord.start_seg_id
        seg_info_d[seg_id]["connected_seg_end"] = coord.end_seg_id
seg_info_df = pd.DataFrame.from_dict(seg_info_d, orient="index")
seg_info_df.index.name = "seg_id"
seg_info_df.to_pickle(seg_info_out_fn)
display(seg_info_df.head())
link_info_df = pd.DataFrame.from_dict(link_info_d, orient="index")
link_info_df.index.name = "link_id"
link_info_df.to_pickle(link_info_out_fn)
display(link_info_df.head())
from pandas.plotting import table
from scipy.ndimage import gaussian_filter1d
def compute_N50 (data):
    """Return the N50 of a collection of lengths"""
    data = data.dropna().values
    data.sort()
    half_sum = data.sum()/2
    cum_sum = 0
    for v in data:
        cum_sum += v
        if cum_sum >= half_sum:
            return int(v)
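# Quick sanity check (sketch): total length is 45, and the longest segments
# covering half of it are 20 and 10, so the expected N50 is 10
assert compute_N50(pd.Series([1, 2, 5, 7, 10, 20])) == 10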
graph_info_fn="./graph_assembly/all_ref_graph_seg_info.pkl"
outdir="./graph_assembly/"
seg_type_info = defaultdict(OrderedDict)
seg_type_identity_list = OrderedDict()
seg_type_len_list = OrderedDict()
df = pd.read_pickle(graph_info_fn)
df = df[["length","type","identity"]]
# Aggregate unanchored contigs ("+" suffix) with their parent assembly
df["type"] = df["type"].str.rstrip("+")
for seg_type, sdf in df.groupby("type"):
    seg_type_info[seg_type]["Total length (bp)"] = sdf["length"].sum()
    seg_type_info[seg_type]["Segments number"] = len(sdf)
    seg_type_info[seg_type]["Median segment length"] = sdf["length"].dropna().median()
    seg_type_info[seg_type]["Longest segment"] = sdf["length"].dropna().max()
    seg_type_info[seg_type]["N50"] = compute_N50(sdf["length"])
    seg_type_info[seg_type]["Median identity"] = sdf["identity"].dropna().median()
    seg_type_len_list[seg_type] = sdf["length"]
    seg_type_identity_list[seg_type] = sdf["identity"]
# Add all types together
seg_type_info["All"]["Total length (bp)"] = df["length"].sum()
seg_type_info["All"]["Segments number"] = len(df)
seg_type_info["All"]["Median segment length"] = df["length"].dropna().median()
seg_type_info["All"]["longest segment"] = df["length"].dropna().max()
seg_type_info["All"]["N50"] = compute_N50 (df["length"])
seg_type_info["All"]["Median identity"] = df["identity"].dropna().median()
# seg_type_len_list["All"] = df["length"]
# seg_type_identity_list["All"] = df["identity"]
seg_type_info_df = pd.DataFrame.from_dict(seg_type_info, orient="index")
out_fn = os.path.join(outdir, "segment_type_stats.tsv")
seg_type_info_df.to_csv(out_fn, sep="\t")
display(seg_type_info_df)
# Process data for plot
lower_pow = 1
upper_pow = 5
bins = list(np.logspace(lower_pow,upper_pow,100))
count_d=OrderedDict()
count_d["length bin"] = bins
for sample_id, sample_len_list in seg_type_len_list.items():
    count_d[sample_id] = np.histogram(sample_len_list, bins=bins+[pow(10,upper_pow+1)])[0]
seg_df = pd.DataFrame.from_dict(count_d)
seg_df = seg_df.set_index("length bin")
# Normalise counts to frequencies
seg_df = seg_df/seg_df.sum(axis=0)
# Smooth distributions for plotting
for col in seg_df.columns:
    seg_df[col] = gaussian_filter1d(seg_df[col], sigma=2)
# Process data for plot
bins = list(np.linspace(0,1,100))
count_d=OrderedDict()
count_d["identity bin"] = bins
for sample_id, sample_identity_list in seg_type_identity_list.items():
    count_d[sample_id] = np.histogram(sample_identity_list, bins=bins+[1.1])[0]
id_df = pd.DataFrame.from_dict(count_d)
id_df = id_df.set_index("identity bin")
# Normalise counts to frequencies
id_df = id_df/id_df.sum(axis=0)
# Smooth distributions for plotting
for col in id_df.columns:
    id_df[col] = gaussian_filter1d(id_df[col], sigma=1)
with pl.style.context('ggplot'):
    fig, (ax1, ax2, ax3, ax4) = pl.subplots(4, 1, figsize=(10,20))
    seg_type_info_df["Total length (bp)"].plot.bar(ax=ax1, color="tomato", legend=False)
    table(ax1, seg_type_info_df["Total length (bp)"], colWidths=[0.10], colLoc="center", cellLoc="center", fontsize=14, bbox=(1.15,0,0.2,0.7))
    ax1.set_title('Total length of segments per type')
    ax1.set_ylabel("Total length")
    #ax1.set_xticklabels(ax1.get_xticklabels(), rotation=45, ha='right')
    seg_type_info_df["Segments number"].plot.bar(ax=ax2, color="steelblue", legend=False)
    table(ax2, seg_type_info_df["Segments number"], colWidths=[0.10], colLoc="center", cellLoc="center", fontsize=14, bbox=(1.15,0,0.2,0.7))
    ax2.set_title('Number of segments per type')
    ax2.set_ylabel("Number of segments")
    #ax2.set_xticklabels(ax2.get_xticklabels(), rotation=45, ha='right')
    # Colors for the 4 aggregated segment types
    colors = sns.color_palette("tab10", 4)
    seg_df.plot(ax=ax3, color=colors)
    ax3.set_xscale("log")
    #ax3.set_yscale("log")
    ax3.set_xlabel("Length (log scale)")
    ax3.set_ylabel("Density")
    ax3.set_title("Segment length distribution")
    #ax3.set_xticks(np.logspace(lower_pow, upper_pow, upper_pow+1))
    ax3.set_xlim(pow(10,lower_pow), pow(10,upper_pow))
    ax3.set_ylim(0, None)
    id_df.plot(ax=ax4, color=colors)
    ax4.set_xlabel("Identity")
    ax4.set_ylabel("Density")
    ax4.set_title("Identity distribution")
    ax4.set_xlim(0, 1)
    ax4.set_ylim(0, None)
    fig.tight_layout()
    out_fn = os.path.join(outdir, "segment_type_stats.svg")
    fig.savefig(out_fn)
from scipy.ndimage import gaussian_filter1d
link_info_fn = "./graph_assembly/all_ref_graph_link_info.pkl"
df = pd.read_pickle(link_info_fn)
df = df[df["connect_type"].isin(["HDRR_MIKK", "HDRR_HNI", "HDRR_HSOK", "HNI_HSOK", "HNI_MIKK", "HSOK_MIKK", "intra_HDRR", "intra_MIKK", "intra_HNI", "intra_HSOK", "inter_HDRR", "inter_MIKK", "inter_HNI", "inter_HSOK"])]
display(df.head())
with pl.style.context('ggplot'):
    fig, (ax1, ax2, ax3) = pl.subplots(1, 3, figsize=(30,7))
    ins_df = df[df["SV_type"]=="insertion"]
    s = ins_df["connect_type"].value_counts()
    s.plot.bar(ax=ax1)
    ax1.set_title('Insertion/alternative path supporting junctions')
    del_df = df[df["SV_type"]=="deletion"]
    s = del_df["connect_type"].value_counts()
    s.plot.bar(ax=ax2)
    ax2.set_title('Deletion supporting junctions')
    # Keep only deletion junctions within the HDRR reference
    hdrr_del = df[(df["connect_type"]=="intra_HDRR") & (df["SV_type"]=="deletion")]
    # Process data for plot
    x = list(np.logspace(2,5,100))
    y = np.histogram(hdrr_del["connect_gap"], bins=x+[10e6])[0]
    y = gaussian_filter1d(y, sigma=2)
    ax3.fill_betweenx(y=y, x1=x)
    ax3.set_xlim(1e2, 1e5)
    ax3.set_ylim(0, None)
    ax3.set_xscale('log')
    ax3.set_title('Deletion length relative to HDRR reference')
    fig.tight_layout()
graph_info_fn = "./graph_assembly/all_ref_graph_seg_info.pkl"
svg_out_fn = "./graph_assembly/identity_len_stats.svg"
min_long = 2000
min_divergent = 0.5
graph_info_df = pd.read_pickle(graph_info_fn)
graph_info_df = graph_info_df[["type", "length", "identity"]]
graph_info_df["type"] = graph_info_df["type"].str.rstrip("+")
graph_info_df = graph_info_df.dropna()
# graph_info_df = graph_info_df.sample(20000)  # subsample for quicker test runs
graph_info_df["log10(length)"] = np.log10(graph_info_df["length"])
graph_info_df["divergence"] = 1-graph_info_df["identity"]
d = defaultdict(Counter)
for line in graph_info_df.itertuples():
    # if line.divergence == 1:
    #     d[line.type]["Fully divergent"] += 1
    if line.divergence >= min_divergent and line.length >= min_long:
        d[line.type]["Long high divergence"] += 1
    elif line.divergence >= min_divergent and line.length < min_long:
        d[line.type]["Short high divergence"] += 1
    elif line.divergence < min_divergent and line.length >= min_long:
        d[line.type]["Long low divergence"] += 1
    elif line.divergence < min_divergent and line.length < min_long:
        d[line.type]["Short low divergence"] += 1
print("Counts by category")
df = pd.DataFrame(d)
df = df.fillna(0)
display(df)
print("% by category")
df = df/df.sum(axis=0)*100
display(df)
with pl.style.context('ggplot'):
    cmap = sns.cubehelix_palette(start=0.9, rot=0.1, dark=0.01, light=0.99, hue=2.75, gamma=0.75, as_cmap=True)
    g = sns.FacetGrid(graph_info_df, col="type", col_wrap=2, height=5, aspect=1.25)
    g = g.map(sns.kdeplot, "log10(length)", "divergence", shade=True, shade_lowest=False, cmap=cmap, n_levels=20)
    for ax in g.axes:
        ax.set_ylim(0, 1)
        ax.set_xlim(2, 4.5)
    g.savefig(svg_out_fn)
def segment_type_stats (graph_info_fn, canonical_fa_fn, outdir):
    graph_info_df = pd.read_pickle(graph_info_fn)
    # Group segments by connected component
    components_dict = defaultdict(Component)
    for seg_id, seg_info in tqdm(zip(graph_info_df.index, graph_info_df.itertuples()), total=len(graph_info_df)):
        components_dict[seg_info.component].add(seg_id=seg_id, seg_info=seg_info)
    # Collect canonical chromosome lengths
    canonical_len_d = OrderedDict()
    with pyfaidx.Fasta(canonical_fa_fn) as fa:
        for seq in fa:
            canonical_len_d[seq.name] = len(seq)
    comp_len_d = OrderedDict()
    comp_ratio_d = OrderedDict()
    comp_num_d = OrderedDict()
    for comp_id, comp in components_dict.items():
        chrom = comp.main_chromosome
        comp_len_d[chrom] = {"Canonical chromosomes":canonical_len_d[chrom], "Graph segments":comp.length}
        comp_ratio_d[chrom] = comp.length/canonical_len_d[chrom]
        comp_num_d[chrom] = len(comp.segments)
    comp_len_df = pd.DataFrame.from_dict(comp_len_d, orient="index")
    comp_ratio_df = pd.DataFrame.from_dict(comp_ratio_d, orient="index", columns=["Ratio canonical/graph"])
    comp_num_df = pd.DataFrame.from_dict(comp_num_d, orient="index", columns=["Number of segments"])
    with pl.style.context('ggplot'):
        fig, (ax1, ax2, ax3) = pl.subplots(3, 1, figsize=(20,15))
        comp_len_df.plot.bar(ax=ax1, color=("steelblue","tomato"), legend=True)
        table(ax1, comp_len_df, colWidths=[0.10]*2, colLoc="center", cellLoc="center", fontsize=14, bbox=(1.15,0,0.2,1))
        ax1.set_title('Length of canonical chromosomes vs graph segments')
        ax1.set_ylabel("Length")
        ax1.set_xticklabels(ax1.get_xticklabels(), rotation=45, ha='right')
        comp_ratio_df.plot.bar(ax=ax2, color="tomato", legend=False)
        table(ax2, comp_ratio_df, colWidths=[0.10], colLoc="center", cellLoc="center", fontsize=14, bbox=(1.15,0,0.2,1))
        ax2.set_title('Ratio of graph segment length to canonical chromosome length')
        ax2.set_ylabel("Length ratio")
        ax2.set_xticklabels(ax2.get_xticklabels(), rotation=45, ha='right')
        comp_num_df.plot.bar(ax=ax3, color="steelblue", legend=False)
        table(ax3, comp_num_df, colWidths=[0.10], colLoc="center", cellLoc="center", fontsize=14, bbox=(1.15,0,0.2,1))
        ax3.set_title('Number of graph segments per chromosome')
        ax3.set_ylabel("Number of segments")
        ax3.set_xticklabels(ax3.get_xticklabels(), rotation=45, ha='right')
        fig.tight_layout()
        out_fn = os.path.join(outdir, "component_stats.svg")
        fig.savefig(out_fn)
# Helper class to store component information
class Component():
    def __init__ (self):
        self.segments = list()
        self.chromosomes = Counter()
        self.length = 0
    def add (self, seg_id, seg_info):
        self.segments.append(seg_id)
        self.chromosomes[seg_info.chromosome] += 1
        self.length += seg_info.length
    def __repr__ (self):
        return f"Segments: {len(self.segments)}, chromosomes: {len(self.chromosomes)}, Length: {self.length}, Main chromosome: {self.main_chromosome}"
    @property
    def main_chromosome(self):
        # Most frequent source chromosome among member segments
        return self.chromosomes.most_common(1)[0][0]
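# Minimal usage sketch with hypothetical segments (any object exposing
# .chromosome and .length works, e.g. the seg_line namedtuple defined earlier)
comp = Component()
comp.add("s1", seg_line("s1", "", 1000, "HDRR_1", 0, 0))
comp.add("s2", seg_line("s2", "", 500, "HDRR_1", 1000, 0))
print(comp)
# -> Segments: 2, chromosomes: 1, Length: 1500, Main chromosome: HDRR_1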
segment_type_stats("./graph_assembly/all_ref_graph_seg_info.pkl", "./references/Oryzias_latipes_HDRR_clean.fa", outdir="./graph_assembly/")
graph_info_fn = "./graph_assembly/all_ref_graph_seg_info.pkl"
bandage_info_type = "./graph_assembly/all_ref_graph_type_bandage.csv"
bandage_info_identity = "./graph_assembly/all_ref_graph_identity_bandage.csv"
ref_type = "HDRR"
graph_info_df = pd.read_pickle(graph_info_fn)
graph_info_df.index.name="Name"
graph_info_df = graph_info_df.drop(columns=["start", "end", "hit_start", "hit_end", "component", "connected", "connected_start", "connected_end", "connected_seg_start", "connected_seg_end"])
#Define colormap
type_list = ['HDRR', 'HDRR+', 'HNI', 'HNI+', 'HSOK', 'HSOK+', 'MIKK']
type_color_dict = OrderedDict()
for col, lab in zip(sns.color_palette('tab20', len(type_list)), type_list):
    type_color_dict[lab] = rgb2hex(col)
#Define colormap
no_data_color = "#535353"
identity_thresholds = [0, 0.25, 0.5, 0.75, 0.8, 0.85, 0.9, 0.95, 1]
identity_color_dict = OrderedDict()
for col, lab in zip(sns.color_palette('afmhot', len(identity_thresholds)), identity_thresholds):
    identity_color_dict[lab] = rgb2hex(col)
type_color_list = []
identity_color_list = []
for line in tqdm(graph_info_df.itertuples(), total=len(graph_info_df)):
    type_color_list.append(type_color_dict[line.type])
    if pd.isna(line.identity):
        identity_color_list.append(no_data_color)
    else:
        # Pick the color of the first threshold bin containing the identity value
        for threshold, color in identity_color_dict.items():
            if line.identity <= threshold:
                identity_color_list.append(color)
                break
graph_info_type_df = graph_info_df.copy()
graph_info_type_df["Color"] = type_color_list
graph_info_type_df.to_csv(bandage_info_type, sep=",")
display(graph_info_type_df.sample(5))
graph_info_identity_df = graph_info_df.copy()
graph_info_identity_df["Color"] = identity_color_list
graph_info_identity_df.to_csv(bandage_info_identity, sep=",")
display(graph_info_identity_df.sample(5))
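# Note: Bandage matches CSV rows to graph nodes via the first column ("Name")
# and, to my understanding, colours nodes from a column named "Color", which is
# why the segment id index and the Color column are written above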
graph_info_fn = "./graph_assembly/all_ref_graph_seg_info.pkl"
bed_out = "./graph_assembly/all_ref_graph_segments.bed"
bed_out_sorted = "./graph_assembly/all_ref_graph_segments_sorted.bed"
graph_info_df = pd.read_pickle(graph_info_fn)
graph_info_df = graph_info_df.reset_index()
graph_info_df = graph_info_df[["chromosome", "start", "end", "seg_id"]]
graph_info_df.to_csv(bed_out, sep="\t", index=False, header=False)
print(f"Sort and index with igvtools")
bash (f"igvtools sort {bed_out} {bed_out_sorted}", conda="IGV")
bash (f"igvtools index {bed_out_sorted}", conda="IGV")
# Remove originals
os.remove(bed_out)