from datetime import date
from pycltools.pycltools import jprint
jprint('Adrien Leger / EMBL EBI', bold=True, size=150)
jprint('Starting date : 2020_06_04', bold=True, italic=True, size=125)
jprint('Last modification date : {}_{:02}_{:02}'.format(date.today().year, date.today().month, date.today().day), bold=True, italic=True, size=125)
# Standard lib imports
import os
from datetime import date
from collections import *
import glob
import shutil
import itertools
from pprint import pprint as pp
from tqdm import tqdm
import csv
import re
# Generic third party imports
from pycltools.pycltools import *
import pysam
import pyfaidx
import pybedtools as pbt
import pyranges as pr
import mappy as mp
from IPython.display import SVG, display, Image
from pyBioTools.Annotation import Annotation
from scipy.ndimage import gaussian_filter1d
# Plotting lib imports
import matplotlib as mpl
import matplotlib.pyplot as pl
import seaborn as sns
%matplotlib inline
# Data wrangling lib imports
import pandas as pd
import numpy as np
import scipy as sp
from pandas.plotting import table
pd.options.display.max_colwidth = 1000
pd.options.display.max_columns = 200
pd.options.display.max_rows = 100
pd.options.display.min_rows = 100
outdir = "alignments"
shutil.rmtree(outdir, ignore_errors=True)
mkdir(outdir)
outdir = "alignments/raw_reads"
shutil.rmtree(outdir, ignore_errors=True)
mkdir(outdir)
threads = 20
mem = 30000
dry = False
graph = "./graph_assembly/all_ref_graph.gfa"
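# By default minigraph writes GAF paths in stable reference coordinates; with
# --vc it reports paths as graph segment ("unstable") coordinates instead,
# which is what the segment and link usage analyses below expect.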
for fasta in glob.glob("/hps/research1/birney/users/adrien/indigene/analyses/indigene_nanopore_DNA/brain_run2/DNA_analysis/results/input/merged_fastq/*.fastq"):
sample_id = os.path.basename(fasta).split(".")[0]
print (f"Aligning sample {sample_id}")
# Normal mode
gaf = f"{outdir}/{sample_id}_stable.gaf"
stdout = f"{outdir}/{sample_id}_stable.out"
stderr = f"{outdir}/{sample_id}_stable.err"
bsub(f"minigraph -x lr -t {threads} {graph} {fasta} -o {gaf}", conda="minigraph", dry=dry, mem=mem, threads=threads, stdout_fp=stdout, stderr_fp=stderr)
# Unstable ref mapping mode
gaf = f"{outdir}/{sample_id}_unstable.gaf"
stdout = f"{outdir}/{sample_id}_unstable.out"
stderr = f"{outdir}/{sample_id}_unstable.err"
bsub(f"minigraph -x lr --vc -t {threads} {graph} {fasta} -o {gaf}", conda="minigraph", dry=dry, mem=mem, threads=threads, stdout_fp=stdout, stderr_fp=stderr)
outdir = "alignments/assemblies"
shutil.rmtree(outdir, ignore_errors=True)
mkdir(outdir)
threads = 20
mem = 30000
dry = False
graph = "./graph_assembly/all_ref_graph.gfa"
for fasta in glob.glob("./individual_assemblies/*_clean.fa"):
    sample_id = os.path.basename(fasta).split(".")[0].replace("_clean", "")  # str.rstrip would strip a character set, not the "_clean" suffix
print (f"Aligning sample {sample_id}")
# Normal mode
gaf = f"{outdir}/{sample_id}_stable.gaf"
stdout = f"{outdir}/{sample_id}_stable.out"
stderr = f"{outdir}/{sample_id}_stable.err"
bsub(f"minigraph -x asm -t 10 {graph} {fasta} -o {gaf}", conda="minigraph", dry=dry, mem=mem, threads=threads, stdout_fp=stdout, stderr_fp=stderr)
# Unstable ref mapping mode
gaf = f"{outdir}/{sample_id}_unstable.gaf"
stdout = f"{outdir}/{sample_id}_unstable.out"
stderr = f"{outdir}/{sample_id}_unstable.err"
bsub(f"minigraph -x asm --vc -t 10 {graph} {fasta} -o {gaf}", conda="minigraph", dry=dry, mem=mem, threads=threads, stdout_fp=stdout, stderr_fp=stderr)
outdir = "./rna_seq_data"
shutil.rmtree(outdir, ignore_errors=True)
mkdir(outdir)
info_fn = "/hps/research1/birney/users/adrien/analyses/medaka_RNA_illumina/Indigene_pilot/Analysis_low_high/candidates/liver_candidate_rep1.tsv"
sample_d = OrderedDict()
with open(info_fn) as csvfile:
reader = csv.DictReader(csvfile, delimiter='\t')
for row in reader:
sample_d[row["sanger_id"]] = row["sample_id"].replace("_", "-")
sample_d
for sanger_id, sample_id in sample_d.items():
R1 = f"/hps/research1/birney/users/adrien/analyses/medaka_RNA_illumina/Indigene_pilot/RNA_pipeline2/results/input/fastp/high_{sanger_id}_1.fastq.gz"
R2 = f"/hps/research1/birney/users/adrien/analyses/medaka_RNA_illumina/Indigene_pilot/RNA_pipeline2/results/input/fastp/high_{sanger_id}_2.fastq.gz"
if os.path.isfile(R1) and os.path.isfile(R2):
        stdout = f"{outdir}/{sample_id}_flash.out"
        stderr = f"{outdir}/{sample_id}_flash.err"
cmd = f"flash {R1} {R2} -r 150 -f 220 --output-prefix {sample_id}_merged --output-directory {outdir} --compress --threads 20"
bsub(cmd, conda="flash", dry=False, mem=5000, threads=20, stdout_fp=stdout, stderr_fp=stderr)
outdir = "alignments/rna_seq"
shutil.rmtree(outdir, ignore_errors=True)
mkdir(outdir)
threads = 20
mem = 15000
dry = False
graph = "./graph_assembly/all_ref_graph.gfa"
for fasta in glob.glob("rna_seq_data/*_merged.extendedFrags.fastq.gz"):
sample_id = os.path.basename(fasta).split("_")[0]
print (f"Aligning sample {sample_id}")
# Normal mode
gaf = f"{outdir}/{sample_id}_stable.gaf"
stdout = f"{outdir}/{sample_id}_stable.out"
stderr = f"{outdir}/{sample_id}_stable.err"
bsub(f"minigraph -x sr -t 10 {graph} {fasta} -o {gaf}", conda="minigraph", dry=dry, mem=mem, threads=threads, stdout_fp=stdout, stderr_fp=stderr)
# Unstable ref mapping mode
gaf = f"{outdir}/{sample_id}_unstable.gaf"
stdout = f"{outdir}/{sample_id}_unstable.out"
stderr = f"{outdir}/{sample_id}_unstable.err"
bsub(f"minigraph -x sr --vc -t 10 {graph} {fasta} -o {gaf}", conda="minigraph", dry=dry, mem=mem, threads=threads, stdout_fp=stdout, stderr_fp=stderr)
outdir = "coverage"
shutil.rmtree(outdir, ignore_errors=True)
mkdir (outdir)
def split_path (path):
l = []
s = ""
for c in path:
if s and c in ["<", ">"]:
l.append(s)
s = c
else:
s+=c
if s:
l.append(s)
return l
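# A minimal sanity check, assuming minigraph stable-coordinate path syntax;
# a bare stable ID (e.g. "chr1") comes back unchanged as a single-element list.
assert split_path(">chr1:0-100<chr2:50-80") == [">chr1:0-100", "<chr2:50-80"]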
def parse_stable_gaf (gaf_fn, min_align_len=100, min_mapq=10, progress=True):
with open (gaf_fn) as gaf_fp:
c = Counter()
l = []
coord_tuple = namedtuple("coord_tuple", ("chromosome", "start", "end"))
for line in tqdm(gaf_fp, disable=not progress):
line = line.strip().split("\t")
align_len = int(line[10])
mapq = int(line[11])
align_type = line[12].split(":")[-1]
if align_len < min_align_len:
c["Short length alignments"]+=1
continue
if mapq < min_mapq:
c["Low mapq alignments"]+=1
continue
if align_type != "P":
c["Non primary alignments"]+=1
continue
            # Decompose the path field into its stable-coordinate segments
c["Valid reads"]+=1
for p in split_path(line[5]):
if p[0] in ["<", ">"]:
chromosome = p[1:].split(":")[0]
start = int(p.split(":")[1].split("-")[0])
end = int(p.split(":")[1].split("-")[1])
                    #strand = "+" if p[0] == ">" else "-"
else:
chromosome = p
start = int(line[7])
end = int(line[8])
#strand = line[4]
c["Valid segments"]+=1
l.append(coord_tuple(chromosome, start, end))
for i, j in c.items():
stdout_print(f"\t\t{i}: {j}\n")
df = pd.DataFrame(l)
df.sort_values(by=["chromosome", "start", "end"], inplace=True)
df.reset_index(inplace=True, drop=True)
return df
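# Usage sketch (hypothetical file name, following the alignment step above):
# bed_df = parse_stable_gaf("alignments/raw_reads/4-1_stable.gaf", min_align_len=100, min_mapq=10)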
def coverage_from_gaf (gaf_src, all_fasta_ref, outdir, min_align_len=100, min_mapq=1):
stdout_print("Define chromosome length for pybedtools\n")
chrom_len = OrderedDict()
with pyfaidx.Fasta(all_fasta_ref) as fa:
for seq in fa:
chrom_len[str(seq.name)]=(0, len(seq))
stdout_print("Parsing gaf files\n")
for gaf_fn in glob.glob(gaf_src):
sample_id = gaf_fn.split("/")[-1].rpartition("_")[0]
stdout_print(f"Analyse sample {sample_id}\n")
stdout_print("\tCollect coordinates from paths\n")
bed_df = parse_stable_gaf (gaf_fn, min_align_len=min_align_len, min_mapq=min_mapq, progress=False)
bed_fn = os.path.join(outdir, f"{sample_id}.bed")
bed_df.to_csv(bed_fn, index=False, header=False, sep="\t")
stdout_print("\tCompute coverage with bedtools\n")
bedgraph_fn = os.path.join(outdir, f"{sample_id}.bedgraph")
bed = pbt.BedTool(bed_fn)
bed.set_chromsizes(chrom_len)
bg = bed.genomecov(bg=True, split=True, trackopts=f'type=bedGraph name={sample_id}')
bg.saveas(bedgraph_fn)
outdir = "coverage/assemblies"
shutil.rmtree(outdir, ignore_errors=True)
mkdir (outdir)
coverage_from_gaf(
gaf_src = "./alignments/assemblies/*_stable.gaf",
all_fasta_ref = "./references/Oryzias_latipes_all_assemblies_contig.fa",
outdir = outdir,
min_align_len=100,
min_mapq=1)
outdir = "coverage/raw_reads"
shutil.rmtree(outdir, ignore_errors=True)
mkdir (outdir)
coverage_from_gaf(
gaf_src = "./alignments/raw_reads/*_stable.gaf",
all_fasta_ref = "./references/Oryzias_latipes_all_assemblies_contig.fa",
outdir = outdir)
outdir = "coverage/rna_seq"
shutil.rmtree(outdir, ignore_errors=True)
mkdir (outdir)
coverage_from_gaf(
gaf_src = "./alignments/rna_seq/*_stable.gaf",
all_fasta_ref = "./references/Oryzias_latipes_all_assemblies_contig.fa",
outdir = outdir)
outdir = "segment_usage"
shutil.rmtree(outdir, ignore_errors=True)
mkdir (outdir)
def iter_idx_tuples (df):
for i, j in zip(df.index, df.itertuples(index=False, name=None)):
yield (i, j)
def iter_line (line, names):
for i, j in zip(names, line):
yield (i, j)
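# These two generators pair row indices with row tuples and column names with
# values, so the per-segment counting loop below can stream over the coverage
# matrix without building intermediate DataFrames.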
def gaf_seg_coverage (gaf_fn, graph_info_dict, min_align_len=100, min_mapq=10, progress=True):
seg_cov_dict = Counter()
info_counter = Counter()
with open (gaf_fn) as gaf_fp:
for line in tqdm(gaf_fp, disable=not progress):
line = line.strip().split("\t")
path_str = line[5]
seg_list = re.split('<|>',path_str[1:])
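            # e.g. a path ">s1>s2<s3" becomes ["s1", "s2", "s3"] (orientations discarded)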
start = int(line[7])
end = int(line[8])
align_len = int(line[10])
mapq = int(line[11])
align_type = line[12].split(":")[-1]
if align_len < min_align_len:
info_counter["short length alignments"]+=1
elif mapq < min_mapq:
info_counter["low mapq alignments"]+=1
elif align_type != "P":
info_counter["non primary alignments"]+=1
elif len(seg_list) == 1:
info_counter["Single segment alignments"]+=1
seg_len = end-start
seg = seg_list[0]
seg_cov_dict[seg] += seg_len
else:
info_counter["Multi segment alignments"]+=1
all_seg_len = 0
# First segment exception
seg = seg_list[0]
seg_len = graph_info_dict[seg]["length"]
all_seg_len+=seg_len
seg_cov_dict[seg] += seg_len-start
# Internal segments
for seg in seg_list[1:-1]:
seg_len = graph_info_dict[seg]["length"]
all_seg_len+=seg_len
seg_cov_dict[seg] += seg_len
# Last segment exception
seg = seg_list[-1]
seg_len = graph_info_dict[seg]["length"]
all_seg_len+=seg_len
seg_cov_dict[seg] += seg_len-(all_seg_len-end)
return seg_cov_dict, info_counter
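# Worked example of the attribution above (hypothetical path): segments of
# lengths 100, 50 and 200 with path start=20 and end=330 credit 100-20=80 bases
# to the first segment, 50 to the middle one and 200-(350-330)=180 to the last,
# which sums to end-start=310.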
def segment_usage_stats (gaf_src, graph_info_pkl, fasta_canonical_ref, outdir, n_sample_range=[12,10,8,6,4,2,1], min_align_len=100, min_mapq=10, min_normed_cov=0.1):
stdout_print("Get list of reference chromosome\n")
ref_chrom_l=[]
with pyfaidx.Fasta(fasta_canonical_ref) as fa:
ref_chrom_l = [s.name for s in fa]
stdout_print("Parsing reference graph\n")
    graph_info_df = pd.read_pickle(graph_info_pkl)
    graph_info_df = graph_info_df[["chromosome","length","start","end","type","identity"]]  # identity is kept so usage_annotation can read it back from the saved TSV
graph_info_dict = graph_info_df.to_dict(orient="index")
all_cov = OrderedDict()
all_info = OrderedDict()
stdout_print("Parsing gaf files\n")
for gaf_fn in glob.glob(gaf_src):
sample_id = gaf_fn.split("/")[-1].split("_")[0]
stdout_print(f"\tParsing data for sample {sample_id}\n")
seg_cov_dict, info_counter = gaf_seg_coverage (gaf_fn=gaf_fn, graph_info_dict=graph_info_dict, min_align_len=min_align_len, min_mapq=min_mapq, progress=False)
all_cov[sample_id] = seg_cov_dict
all_info[sample_id] = info_counter
info_df = pd.DataFrame(all_info)
display(info_df)
stdout_print("Cast coverage data to DataFrame\n")
cov_df = pd.DataFrame(all_cov)
cov_df = cov_df.fillna(0)
cov_df = cov_df.astype(np.int64)
stdout_print("Merge cov df and info from graph\n")
data_col = cov_df.columns
cov_df = pd.merge(left=graph_info_df, right=cov_df, left_index=True, right_index=True, how="left")
cov_df[data_col] = cov_df[data_col].fillna(0)
cov_df.sort_values(["chromosome","start","end"], inplace=True)
display(cov_df.head())
out_fn = os.path.join(outdir, "segment_usage_raw.tsv")
cov_df.to_csv(out_fn, sep="\t")
stdout_print("Normalise by length only\n")
len_norm_cov_df = cov_df.copy()
for col in data_col:
len_norm_cov_df[col] = len_norm_cov_df[col]/len_norm_cov_df["length"]
display(len_norm_cov_df.head())
out_fn = os.path.join(outdir, "segment_usage_len_norm.tsv")
len_norm_cov_df.to_csv(out_fn, sep="\t")
stdout_print("Normalise by length and total coverage over canonical chromosomes\n")
canonical_df = cov_df[cov_df["chromosome"].isin(ref_chrom_l)]
full_norm_cov_df = cov_df.copy()
for col in data_col:
full_norm_cov_df[col] = (full_norm_cov_df[col]/full_norm_cov_df["length"])/(canonical_df[col].sum()/canonical_df["length"].sum())
display(full_norm_cov_df.head())
out_fn = os.path.join(outdir, "segment_usage_len_cov_norm.tsv")
full_norm_cov_df.to_csv(out_fn, sep="\t")
stdout_print("Count segment_coverage by category\n")
ref_alt_c = defaultdict(Counter)
ref_chrom_c = defaultdict(Counter)
alt_contig_c = defaultdict(Counter)
    norm_cov_data = full_norm_cov_df.drop(columns=["chromosome","length", "start", "end", "type", "identity"])
for segment, line in tqdm(iter_idx_tuples(norm_cov_data), total=len(cov_df)):
seg_info = graph_info_dict[segment]
sample_type = seg_info["type"]
# Canonical reference K
if seg_info["chromosome"] in ref_chrom_l:
for sample_id, cov in iter_line(line, norm_cov_data.columns):
ref_alt_c[sample_id][sample_type]+=cov
ref_chrom_c[sample_id][seg_info["chromosome"]]+=cov
# Alternative contig
else:
# Count values
n_samples = 0
for v in line:
if v>min_normed_cov:
n_samples+=1
for n in n_sample_range:
if n_samples >= n:
cat = f"{n} sample contig"
for sample_id, cov in iter_line(line, norm_cov_data.columns):
ref_alt_c[sample_id][sample_type]+=cov
alt_contig_c[sample_id][cat]+=cov
break
stdout_print("Cast to Dataframe and plot\n")
ref_alt_df = pd.DataFrame(ref_alt_c).T
ref_alt_df = ref_alt_df.reindex(columns=["HDRR","HNI","HSOK","MIKK","HDRR+","HNI+","HSOK+"], index=["4-1","4-2","7-1","7-2","11-1","69-1","79-2","80-1","117-2","131-1","134-1","134-2"])
out_fn = os.path.join(outdir, "chromosome_alternative_coverage.tsv")
ref_alt_df.to_csv(out_fn, sep="\t")
display(ref_alt_df)
ref_chrom_df = pd.DataFrame(ref_chrom_c).T
ref_chrom_df = ref_chrom_df.reindex(columns=ref_chrom_l, index=["4-1","4-2","7-1","7-2","11-1","69-1","79-2","80-1","117-2","131-1","134-1","134-2"])
out_fn = os.path.join(outdir, "reference_chromosomes_coverage.tsv")
ref_chrom_df.to_csv(out_fn, sep="\t")
display(ref_chrom_df)
alt_contig_df = pd.DataFrame(alt_contig_c).T
alt_contig_df = alt_contig_df.reindex(columns=[f"{i} sample contig" for i in n_sample_range], index=["4-1","4-2","7-1","7-2","11-1","69-1","79-2","80-1","117-2","131-1","134-1","134-2"])
out_fn = os.path.join(outdir, "alternative_count_coverage.tsv")
alt_contig_df.to_csv(out_fn, sep="\t")
display(alt_contig_df)
with pl.style.context("ggplot"):
fig, axes = pl.subplots(3, 1, figsize=(15,15))
for i, (df, label, palette) in enumerate((
(ref_alt_df, "reference chromosomes and alternative contigs", sns.color_palette("tab10", n_colors=len(ref_alt_df.columns))),
(ref_chrom_df, "reference chromosomes details", sns.cubehelix_palette(len(ref_chrom_df.columns), start=0.9, rot=0.1, dark=0, light=0.90, hue=2.75, gamma=0.75, reverse=True)),
(alt_contig_df, "alternative contigs count", sns.cubehelix_palette(len(alt_contig_df.columns), start=0.9, rot=0.1, dark=0.1, light=0.80, hue=2.75, gamma=0.75, reverse=True)))):
ax = axes[i]
df = df.divide(df.sum(axis=1), axis=0)*100
df.plot.bar(stacked=True, ax=ax, color=palette)
ax.set_title(f"Percentage of graph segment coverage for {label}")
ncol = 2 if len(df.columns)>10 else 1
ax.legend(bbox_to_anchor=(1, 1), loc=2, facecolor="white", frameon=False, ncol=ncol)
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right')
fig.tight_layout()
out_fn = os.path.join(outdir, "graph_segment_coverage.svg")
fig.savefig(out_fn)
stdout_print("Write out as bed file per sample\n")
for col in data_col:
sdf = full_norm_cov_df[["chromosome", "start", "end", col]]
sdf = sdf[sdf[col]>0]
outfile = f"./{outdir}/{col}_segments.bedgraph"
with open (outfile, "w") as fp:
            fp.write(f"track type=bedGraph name={col}\n")
sdf.to_csv(outfile, mode="a", header=False, index=False, sep="\t")
def overall_segment_usage (segment_usage_fn, outdir, min_coverage_dna=0.05, min_samples_dna=2):
    df = pd.read_csv(segment_usage_fn, sep="\t")
    info_field = ["seg_id","chromosome","length","start","end","type","identity"]
info_df = df[info_field].copy()
data_df = df.drop(columns=info_field).copy()
info_df["pass_samples"] = data_df.ge(min_coverage_dna).sum(axis=1)
info_df["type"] = info_df["type"].str.rstrip("+")
len_d = defaultdict(Counter)
seg_d = defaultdict(Counter)
for line in info_df.itertuples():
if line.pass_samples>=min_samples_dna:
len_d[line.type]["MIKK bases"]+=line.length
seg_d[line.type]["MIKK segments"]+=1
len_d["All"]["MIKK bases"]+=line.length
seg_d["All"]["MIKK segments"]+=1
else:
len_d[line.type]["Non-MIKK bases"]+=line.length
seg_d[line.type]["Non-MIKK segments"]+=1
len_d["All"]["Non-MIKK bases"]+=line.length
seg_d["All"]["Non-MIKK segments"]+=1
len_df = pd.DataFrame.from_dict(len_d, orient="index")
seg_df = pd.DataFrame.from_dict(seg_d, orient="index")
len_df = len_df.reindex(["HDRR", "HNI", "HSOK", "MIKK", "All"])
seg_df = seg_df.reindex(["HDRR", "HNI", "HSOK", "MIKK", "All"])
    merged_df = pd.concat([len_df, seg_df], axis=1)
out_fn = os.path.join(outdir, "segment_usage_type_stats.tsv")
merged_df.to_csv(out_fn, sep="\t")
display(merged_df)
with pl.style.context('ggplot'):
fig, (ax1, ax2) = pl.subplots(2, 1, figsize=(10,10))
len_df.plot.bar(ax=ax1, stacked=True)
table(ax1, len_df, colWidths=[0.10, 0.10], colLoc="center", cellLoc="center", fontsize=14, bbox=(1.15,0,0.2,0.7))
ax1.set_title('Total length of segment per type')
ax1.set_ylabel("Total length")
seg_df.plot.bar(ax=ax2, stacked=True)
table(ax2, seg_df, colWidths=[0.10, 0.10], colLoc="center", cellLoc="center", fontsize=14, bbox=(1.15,0,0.2,0.7))
ax2.set_title('Number of segments per type')
ax2.set_ylabel("Number of segments")
fig.tight_layout()
out_fn = os.path.join(outdir, "segment_usage_type_stats.svg")
fig.savefig(out_fn)
outdir = "segment_usage/assemblies"
shutil.rmtree(outdir, ignore_errors=True)
mkdir (outdir)
segment_usage_stats (
gaf_src="./alignments/assemblies/*_unstable.gaf",
graph_info_pkl="./graph_assembly/all_ref_graph_seg_info.pkl",
fasta_canonical_ref="./references/Oryzias_latipes_HDRR_clean.fa",
outdir="segment_usage/assemblies",
n_sample_range=[12,10,8,6,4,2,1],
min_align_len=100,
min_mapq=1,
min_normed_cov=0)
overall_segment_usage (
segment_usage_fn="segment_usage/assemblies/segment_usage_len_cov_norm.tsv",
outdir="segment_usage/assemblies",
min_coverage_dna=0.05,
min_samples_dna=2)
outdir = "segment_usage/raw_reads"
shutil.rmtree(outdir, ignore_errors=True)
mkdir (outdir)
segment_usage_stats (
gaf_src="./alignments/raw_reads/*_unstable.gaf",
graph_info_pkl="./graph_assembly/all_ref_graph_seg_info.pkl",
fasta_canonical_ref="./references/Oryzias_latipes_HDRR_clean.fa",
outdir="segment_usage/raw_reads",
n_sample_range=[12,10,8,6,4,2,1],
min_align_len=100,
min_mapq=1,
min_normed_cov=0.05)
overall_segment_usage (
segment_usage_fn="segment_usage/raw_reads/segment_usage_len_cov_norm.tsv",
outdir="segment_usage/raw_reads",
min_coverage_dna=0.05,
min_samples_dna=2)
min_coverage_dna=0.05
min_samples_dna=2
segment_usage_fn="segment_usage/raw_reads/segment_usage_len_cov_norm.tsv"
outdir="segment_usage/raw_reads"
data_df = pd.read_csv(segment_usage_fn, sep="\t", index_col=0)
data_df = data_df.drop(columns=["chromosome","length","start","end","type","identity"])
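# Count, for each segment, how many samples reach min_coverage_dna, then flag
# segments supported by at least min_samples_dna samples.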
data_df["pass"] = data_df.ge(min_coverage_dna).sum(axis=1)
data_df["valid"] = data_df["pass"]>=2
data_df = data_df[["pass", "valid"]]
graph_info_fn = "./graph_assembly/all_ref_graph_seg_info.pkl"
info_df = pd.read_pickle(graph_info_fn)
info_df = info_df[["component", "length", "type"]]
all_df = pd.merge(info_df, data_df, left_index=True, right_index=True, how="left", validate="1:1")
all_res = OrderedDict()
for chrom, chrom_df in all_df.groupby("component"):
c = Counter()
for i in chrom_df.itertuples():
if i.valid:
seg_type = i.type.rstrip("+")
c[seg_type] += i.length
if c:
all_res[chrom] = c
res_df = pd.DataFrame.from_dict(all_res, orient="index")
res_df_normed = res_df.divide(res_df.sum(axis=1), axis=0)*100
with pl.style.context('ggplot'):
fig, (ax1, ax2) = pl.subplots(2, 1, figsize=(15,10))
res_df.plot.bar(ax=ax1, stacked=True)
ax1.legend(bbox_to_anchor=(1, 1), loc=2, facecolor="white", frameon=False)
ax1.set_title("Total length of segments supported by at least 2 samples")
ax1.set_xlabel("Chromosomes")
ax1.set_ylabel("Total length")
res_df_normed.plot.bar(ax=ax2, stacked=True)
ax2.legend(bbox_to_anchor=(1, 1), loc=2, facecolor="white", frameon=False)
ax2.set_title("Percentage of length of segments supported by at least 2 samples")
ax2.set_xlabel("Chromosomes")
ax2.set_ylabel("% of length")
fig.tight_layout()
out_fn = os.path.join(outdir, "segment_usage_chrom_type_stats.svg")
fig.savefig(out_fn)
outdir = "usage_karyotypes"
shutil.rmtree(outdir, ignore_errors=True)
mkdir (outdir)
min_cov = 0.25
n_bins = 250
n_colors = 255
max_count = 150
ref_name = "HDRR"
palette_name = "rocket_r"
segment_usage_fn="segment_usage/raw_reads/segment_usage_len_cov_norm.tsv"
print("Load segments info data")
all_seg_df = pd.read_csv(segment_usage_fn, sep="\t")
for ref_name in ["HDRR", "HNI", "HSOK"]:
print("Load reference info")
fasta = f"./references/Oryzias_latipes_{ref_name}_clean.fa"
chr_len = OrderedDict()
with pyfaidx.Fasta(fasta) as fa:
chr_len = {i.name:len(i) for i in fa}
longest = max(chr_len.values())
chr_len_norm = OrderedDict()
for cname, clen in chr_len.items():
norm_len = clen/longest
if norm_len >= 0.1:
chr_len_norm[cname] = norm_len
valid_chr = list(chr_len_norm.keys())
print("Cleanup data")
seg_df = all_seg_df[all_seg_df["chromosome"].isin(valid_chr)].copy()
data_df = seg_df[["80-1","134-2","131-1","7-2","7-1","4-1","69-1","79-2","134-1","117-2","11-1","4-2"]]
seg_df["cov"] = data_df.apply(np.median, axis=1)
sdf = seg_df[["chromosome", "length", "start", "end", "cov"]].copy()
sdf = sdf[sdf["cov"]>=min_cov]
sdf["pos"] = sdf["start"] + (sdf["end"]-sdf["start"])//2
sdf["index"] = np.digitize(sdf["pos"], np.linspace(0, longest, n_bins))-1
print("Compute coverage by genomic bin")
all_count_d = defaultdict(Counter)
for i in sdf.itertuples():
all_count_d[i.chromosome][i.index]+=i.cov
print("Plot results")
count_bins = np.linspace(0, max_count, n_colors)
pal = sns.color_palette(palette_name, n_colors=n_colors)
with pl.style.context('seaborn-white'):
        fig, axes = pl.subplots(1, len(chr_len_norm), figsize=(40, 10), sharey=True)
for ax, cname in zip(axes, valid_chr):
clen = chr_len_norm[cname]
ax.set_title(cname, y=-0.05)
ax.axis("off")
ax.add_patch(mpl.patches.Rectangle((0.25, 0), 0.5, clen, fc="0.9"))
counts = all_count_d[cname]
y_list = np.divide(list(counts.keys()), n_bins)
v_list = gaussian_filter1d(list(counts.values()), sigma=0.5)
c_list = [pal[i-1] for i in np.digitize(v_list, count_bins)]
for y, c in zip(y_list, c_list):
ax.add_patch(mpl.patches.Rectangle((0.25, y), 0.5, 1/n_bins, fc=c))
cb_ax = fig.add_axes([0.93, 0.22, 0.01, 0.5])
cb = mpl.colorbar.ColorbarBase(ax=cb_ax, cmap=mpl.cm.get_cmap(palette_name), norm=mpl.colors.Normalize(vmin=0, vmax=max_count), orientation='vertical', ticklocation="left", label="Normalised coverage")
fig.suptitle(f"Karyotype of median normalised coverage for reference {ref_name}", fontsize=20)
fig.savefig(os.path.join(outdir, f"Karyotype_{ref_name}_all_median.svg"), bbox_inches='tight')
min_cov = 0.25
n_bins = 250
n_colors = 255
max_count = 150
ref_name = "HDRR"
palette_name = "rocket_r"
segment_usage_fn="segment_usage/raw_reads/segment_usage_len_cov_norm.tsv"
print("Load segments info data")
all_seg_df = pd.read_csv(segment_usage_fn, sep="\t")
for ref_name in ["HDRR", "HNI", "HSOK"]:
print (f"Processing {ref_name}")
print("Load reference info")
fasta = f"./references/Oryzias_latipes_{ref_name}_clean.fa"
chr_len = OrderedDict()
with pyfaidx.Fasta(fasta) as fa:
chr_len = {i.name:len(i) for i in fa}
longest = max(chr_len.values())
chr_len_norm = OrderedDict()
for cname, clen in chr_len.items():
norm_len = clen/longest
if norm_len >= 0.1:
chr_len_norm[cname] = norm_len
valid_chr = list(chr_len_norm.keys())
print("Cleanup data")
seg_df = all_seg_df[all_seg_df["chromosome"].isin(valid_chr)].copy()
for sample_id in ["80-1","134-2","131-1","7-2","7-1","4-1","69-1","79-2","134-1","117-2","11-1","4-2"]:
print (f"Processing sample {sample_id}")
sdf = seg_df[["chromosome", "length", "start", "end", sample_id]].copy()
sdf = sdf.rename(columns={sample_id:"cov"})
sdf = sdf[sdf["cov"]>=min_cov]
sdf["pos"] = sdf["start"] + (sdf["end"]-sdf["start"])//2
bins = np.linspace(0, longest, n_bins)
sdf["index"] = np.digitize(sdf["pos"], bins)-1
print("Compute coverage by genomic bin")
all_count_d = defaultdict(Counter)
for i in sdf.itertuples():
all_count_d[i.chromosome][i.index]+=i.cov
print("Plot results")
count_bins = np.linspace(0, max_count, n_colors)
pal = sns.color_palette(palette_name, n_colors=n_colors)
with pl.style.context('seaborn-white'):
            fig, axes = pl.subplots(1, len(chr_len_norm), figsize=(40, 10), sharey=True)
for ax, cname in zip(axes, valid_chr):
clen = chr_len_norm[cname]
ax.set_title(cname, y=-0.05)
ax.axis("off")
ax.add_patch(mpl.patches.Rectangle((0.25, 0), 0.5, clen, fc="0.9"))
counts = all_count_d[cname]
y_list = np.divide(list(counts.keys()), n_bins)
v_list = gaussian_filter1d(list(counts.values()), sigma=0.5)
c_list = [pal[i-1] for i in np.digitize(v_list, count_bins)]
for y, c in zip(y_list, c_list):
ax.add_patch(mpl.patches.Rectangle((0.25, y), 0.5, 1/n_bins, fc=c))
cb_ax = fig.add_axes([0.93, 0.22, 0.01, 0.5])
cb = mpl.colorbar.ColorbarBase(ax=cb_ax, cmap=mpl.cm.get_cmap(palette_name), norm=mpl.colors.Normalize(vmin=0, vmax=max_count), orientation='vertical', ticklocation="left", label="Normalised coverage")
fig.suptitle(f"Karyotype of normalised coverage for reference {ref_name} with line {sample_id}", fontsize=20)
fig.savefig(os.path.join(outdir, f"Karyotype_{ref_name}_{sample_id}.svg"), bbox_inches='tight')
pl.show()
outdir = "segment_usage/rna_seq"
shutil.rmtree(outdir, ignore_errors=True)
mkdir (outdir)
segment_usage_stats (
gaf_src="./alignments/rna_seq/*_unstable.gaf",
    graph_info_pkl="./graph_assembly/all_ref_graph_seg_info.pkl",
fasta_canonical_ref="./references/Oryzias_latipes_HDRR_clean.fa",
outdir="segment_usage/rna_seq",
n_sample_range=[50,45,40,35,30,25,20,15,10,5,1],
min_align_len=50,
min_mapq=1,
min_normed_cov=0.05)
overall_segment_usage (
segment_usage_fn="segment_usage/rna_seq/segment_usage_len_cov_norm.tsv",
outdir="segment_usage/rna_seq",
min_coverage_dna=0.05,
min_samples_dna=2)
def usage_annotation(segment_usage_fn, outdir, thresholds = [90, 75, 50, 25, 10]):
segment_usage_df = pd.read_csv(segment_usage_fn, sep="\t", index_col=0)
segment_usage_df_data = segment_usage_df.drop(columns=["chromosome","length","start","end","type","identity"])
coord = segment_usage_df["chromosome"] + ":" + segment_usage_df["start"].astype(str) + "-" + segment_usage_df["end"].astype(str)
segment_usage_df["coord"] = coord
for threshold in thresholds:
segment_usage_df[threshold] = segment_usage_df_data.ge(threshold/100).sum(axis=1)
display(segment_usage_df.head())
# Define colormap for bandage
    colors = [mpl.colors.rgb2hex(i) for i in sns.color_palette('afmhot_r', len(segment_usage_df_data.columns)+1)]
for threshold in thresholds:
print(f"Write file for threshold {threshold}")
        # Formatting for bedgraph
bedgraph_df = segment_usage_df[["chromosome","start","end", threshold]].copy()
bedgraph_out_fn = f"{outdir}/seg_usage_{threshold}.bedgraph"
with open (bedgraph_out_fn, "w") as fp:
fp.write("track type=bedGraph name=samples_pass\n")
bedgraph_df.to_csv(bedgraph_out_fn, sep="\t", mode="a", index=False, header=False)
        # Formatting for bandage
bandage_df = segment_usage_df[[threshold,"coord","length","type","identity"]].copy()
bandage_df.index.name = "name"
bandage_df = bandage_df.rename(columns={threshold:"sample_pass"})
seg_col = []
for seg_id, n in bandage_df["sample_pass"].items():
seg_col.append(colors[n])
bandage_df["color"] = seg_col
bandage_out_fn = f"{outdir}/seg_usage_{threshold}.csv"
bandage_df.to_csv(bandage_out_fn, sep=",")
outdir = "segment_usage/raw_reads_annot/"
shutil.rmtree(outdir, ignore_errors=True)
mkdir (outdir)
usage_annotation (
segment_usage_fn = "segment_usage/raw_reads/segment_usage_len_cov_norm.tsv",
outdir = outdir,
thresholds = [50, 33, 25, 10])
outdir = "segment_usage/rna_seq_annot/"
shutil.rmtree(outdir, ignore_errors=True)
mkdir (outdir)
usage_annotation (
segment_usage_fn = "segment_usage/rna_seq/segment_usage_len_cov_norm.tsv",
outdir = outdir,
thresholds = [50, 33, 25, 10])
outdir = "link_usage"
shutil.rmtree(outdir, ignore_errors=True)
mkdir (outdir)
def sort_seg (s1, s1_orient, s2, s2_orient):
""""""
s1_num = int(s1[1:])
s2_num = int(s2[1:])
if s1_num > s2_num:
inv_dict = {"+":"-", "-":"+"}
s1, s2 = s2, s1
s1_orient, s2_orient = inv_dict[s2_orient], inv_dict[s1_orient]
link_id = "{}{}_{}{}".format(s1, s1_orient, s2, s2_orient)
return link_id
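# A minimal sanity check: the lower-numbered segment comes first and both
# orientations are inverted when the pair is swapped.
assert sort_seg("s12", "+", "s3", "-") == "s3+_s12-"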
def split_path (path):
l = []
seg_id = ""
strand = ""
for c in path:
if c in ["<", ">"]:
if seg_id:
l.append((seg_id, strand))
seg_id = ""
strand = "+" if c == ">" else "-"
else:
seg_id +=c
l.append((seg_id, strand))
return l
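# A minimal sanity check, assuming minigraph unstable path syntax:
assert split_path(">s1<s2") == [("s1", "+"), ("s2", "-")]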
def gaf_link_coverage (gaf_fn, min_align_len=100, min_mapq=1, progress=True):
link_usage_dict = Counter()
info_counter = Counter()
with open (gaf_fn) as gaf_fp:
for line in tqdm(gaf_fp, disable=not progress):
line = line.strip().split("\t")
seg_list = split_path(line[5])
align_len = int(line[10])
mapq = int(line[11])
align_type = line[12].split(":")[-1]
if align_len < min_align_len:
info_counter["short length alignments"]+=1
elif mapq < min_mapq:
info_counter["low mapq alignments"]+=1
elif align_type != "P":
info_counter["non primary alignments"]+=1
elif len(seg_list) == 1:
info_counter["single segment path"]+=1
else:
info_counter["multi segment path"]+=1
prev_seg = seg_list[0]
for seg in seg_list[1:]:
info_counter["valid links"]+=1
link_id = sort_seg(s1=prev_seg[0], s1_orient=prev_seg[1], s2=seg[0], s2_orient=seg[1])
link_usage_dict[link_id]+=1
prev_seg=seg
return link_usage_dict, info_counter
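# Worked example: a path [("s1","+"), ("s2","-"), ("s3","+")] yields two links,
# sort_seg("s1","+","s2","-") and sort_seg("s2","-","s3","+"), each counted
# once for this alignment.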
def link_usage_stats (gaf_src, graph_link_info_fn, outdir, min_align_len=100, min_mapq=1):
data_col = ["4-1","4-2","7-1","7-2","11-1","69-1","79-2","80-1","117-2","131-1","134-1","134-2"]
all_link = OrderedDict()
all_info = OrderedDict()
stdout_print("Parsing gaf files\n")
for gaf_fn in glob.glob(gaf_src):
sample_id = gaf_fn.split("/")[-1].split("_")[0]
stdout_print(f"\tParsing data for sample {sample_id}\n")
link_dict, info_counter = gaf_link_coverage (gaf_fn=gaf_fn, min_align_len=min_align_len, min_mapq=min_mapq, progress=False)
all_link[sample_id] = link_dict
all_info[sample_id] = info_counter
info_df = pd.DataFrame(all_info)
info_df = info_df.reindex(columns=data_col)
display(info_df)
stdout_print("Cast link usage data to DataFrame\n")
link_data_df = pd.DataFrame(all_link)
link_data_df = link_data_df.reindex(columns=data_col)
stdout_print("Loading graph link info file\n")
link_info_df = pd.read_pickle(graph_link_info_fn)
#link_info_df = link_info_df[["connect_type","connect_gap","SV_type"]]
stdout_print("Merge data and info\n")
link_df = pd.merge(left=link_info_df, right=link_data_df, left_index=True, right_index=True, how="left")
link_df[data_col] = link_df[data_col].fillna(0).astype(np.int64)
display(link_df.head())
out_fn = os.path.join(outdir, "link_cov.tsv")
link_df.to_csv(out_fn, sep="\t")
stdout_print("Normalise by HDRR continuous links\n")
canonical_link_df = link_df[(link_df["connect_type"]=="intra_HDRR") & (link_df["SV_type"]=="continuous")]
norm_link_df = link_df.copy()
for col in data_col:
norm_link_df[col] = norm_link_df[col]/canonical_link_df[col].median()
display(norm_link_df.head())
out_fn = os.path.join(outdir, "norm_link_cov.tsv")
norm_link_df.to_csv(out_fn, sep="\t")
outdir = "link_usage/assemblies/"
shutil.rmtree(outdir, ignore_errors=True)
mkdir (outdir)
link_usage_stats(
gaf_src="./alignments/assemblies/*_unstable.gaf",
graph_link_info_fn="./graph_assembly/all_ref_graph_link_info.pkl",
outdir=outdir,
min_align_len=100,
min_mapq=1)
outdir = "link_usage/raw_reads/"
shutil.rmtree(outdir, ignore_errors=True)
mkdir (outdir)
link_usage_stats(
gaf_src="./alignments/raw_reads/*_unstable.gaf",
graph_link_info_fn="./graph_assembly/all_ref_graph_link_info.pkl",
outdir=outdir,
min_align_len=100,
min_mapq=1)