from datetime import date
from pycltools.pycltools import jprint
jprint('Adrien Leger / EMBL EBI', bold=True, size=150)
jprint('Starting date : 2020_06_04', bold=True, italic=True, size=125)
jprint('Last modification date : {}_{:02}_{:02}'.format(date.today().year, date.today().month, date.today().day), bold=True, italic=True, size=125)
# Standard lib imports
import os
from datetime import date
from collections import *
from glob import glob, iglob
from shutil import rmtree
import itertools
from pprint import pprint as pp
from tqdm import tqdm
import csv
import re
# Generic third party imports
from pycltools.pycltools import *
import pysam
import pyfaidx
import pybedtools as pbt
import pyranges as pr
import mappy as mp
from IPython.display import SVG, display, Image
from pyBioTools.Annotation import Annotation
from scipy.ndimage.filters import gaussian_filter1d
# Ploting lib imports
import matplotlib as mpl
import matplotlib.pyplot as pl
import seaborn as sns
%matplotlib inline
# Data wrangling lib imports
import pandas as pd
import numpy as np
import scipy as sp
from pandas.plotting import table
# Pandas display settings for comfortable interactive inspection of wide tables
pd.options.display.max_colwidth = 1000
pd.options.display.max_columns = 200
pd.options.display.max_rows = 100
pd.options.display.min_rows = 100
# Output directory for this analysis section (not wiped, so previous results persist)
outdir = "diverging_insertions"
#shutil.rmtree(outdir, ignore_errors=True)
mkdir (outdir)  # pycltools helper: create the directory if it does not exist
# input files
graph_info_fn = "./graph_assembly/all_ref_graph_seg_info.pkl"  # pickled per-segment graph info DataFrame
dna_usage_fn = "segment_usage/raw_reads/segment_usage_len_cov_norm.tsv"  # normalised DNA coverage per segment/sample
ref_fasta = "references/Oryzias_latipes_HDRR_clean.fa"  # HDRR reference genome
outdir = "diverging_insertions"
# Options
ref_type = "HDRR"          # reference assembly label excluded from "alternative" segments
min_alt_seg_len = 200      # minimum individual segment length (bp)
min_alt_path_len = 10000   # minimum cumulative alternative path length (bp)
max_identity = 0.5         # max identity to reference for a segment to count as divergent
min_coverage_dna = 0.5     # min normalised DNA coverage for a sample to support a segment
min_samples_dna = 2        # min number of supporting samples
no_Ns_extra = 10000        # flanking window (bp) checked for Ns around each interval
# Load and filter out segments
graph_info_df = pd.read_pickle(graph_info_fn)
graph_info_df = graph_info_df.drop(columns=["component","connected"])
graph_info_df = graph_info_df[(graph_info_df["type"] != ref_type) & (graph_info_df["length"]>=min_alt_seg_len) & (graph_info_df["identity"]<=max_identity)]
# Load and filter out segments
dna_usage_df = pd.read_csv(dna_usage_fn, sep="\t", index_col=0)
dna_usage_df = dna_usage_df[(dna_usage_df["type"] != ref_type) & (dna_usage_df["length"]>=min_alt_seg_len)]
# Count number of samples with sufficient coverage for each segments
dna_usage_df = dna_usage_df.drop(columns=["chromosome","length","start","end","type"])
dna_usage_df["samples_pass_dna"] = dna_usage_df.ge(min_coverage_dna).sum(axis=1)
dna_usage_df = dna_usage_df[dna_usage_df["samples_pass_dna"]>=min_samples_dna]
all_df = pd.merge(graph_info_df, dna_usage_df[["samples_pass_dna"]], left_index=True, right_index=True, how="inner")
all_df = all_df.sort_values(by="length", ascending=False)
print (f"Number of segments found: {len(all_df)}")
out_fn = os.path.join(outdir, "DNA_alternative_segments.tsv")
all_df.to_csv(out_fn, sep="\t")
# Group candidate segments by the reference interval they branch from and
# keep groups whose cumulative alternative length is long enough.
l = []
with pyfaidx.Fasta(ref_fasta) as ref_fa_fp:
    for (connected_chromosome, connected_start, connected_end), branch_df in all_df.groupby(
            ["connected_chromosome", "connected_start","connected_end"]):
        if branch_df["length"].sum() > min_alt_path_len:
            d = OrderedDict()
            d["connected_coord"] = f"{connected_chromosome}:{int(connected_start)}-{int(connected_end)}"
            d["Valid segments"] = len(branch_df)
            d["MIKK segments"] = len(branch_df[branch_df["type"].isin(["MIKK"])])
            d["HNI segments"] = len(branch_df[branch_df["type"].isin(["HNI", "HNI+"])])
            d["HSOK segments"] = len(branch_df[branch_df["type"].isin(["HSOK", "HSOK+"])])
            d["segment_lengths"] = branch_df["length"].sum()
            d["Median identity"] = np.median(branch_df["identity"])
            d["Median samples_pass_dna"] = np.median(branch_df["samples_pass_dna"])
            # Test presence of Ns in the reference around the branching interval
            seq = str(ref_fa_fp[connected_chromosome][int(connected_start)-no_Ns_extra:int(connected_end)+no_Ns_extra])
            d["Ns in ref"] = seq.count("N")
            # Finally add segment list
            d["segment_ids"] = ",".join(branch_df.index)
            l.append(d)
path_df = pd.DataFrame(l)
print (f"Number of alternative diverging paths found including Ns: {len(path_df)}")
# Discard paths whose flanking reference contains Ns (probable misassemblies)
path_df = path_df[path_df["Ns in ref"] == 0]
path_df = path_df.drop(columns=["Ns in ref"])
path_df = path_df.sort_values(by="segment_lengths", ascending=False)
print (f"Number of alternative diverging paths found: {len(path_df)}")
display(path_df.head(50))
out_fn = os.path.join(outdir, "DNA_alternative_paths.tsv")
# BUG FIX: the original wrote `all_df` here, duplicating the segment table
# under the paths filename; write the path table that was just computed.
path_df.to_csv(out_fn, sep="\t")
# Display the Bandage assembly-graph plot generated for each DNA candidate locus.
# NOTE(review): `glob` is imported as a function above (`from glob import glob`)
# yet used as a module here (`glob.glob`); this only works if the star-import
# from pycltools re-bound `glob` to the module — verify.
for fn in glob.glob("./diverging_insertions/bandage_plots/DNA_*.png"):
    coord = os.path.split(fn)[-1].split(".")[0]  # file basename without extension = locus coord
    jprint(coord, bold=True, size=100)
    display(Image(fn, width=1000))
def gff3_exon_parser(fn):
    """Parse a GFF3-formatted annotation file and return a DataFrame of exons.

    Collects "gene", "mRNA" and "exon" records, then links each exon to its
    parent transcript and, through it, to the parent gene.

    Args:
        fn (str): path to a GFF3-formatted annotation file.

    Returns:
        pd.DataFrame indexed by exon_id with columns chrom, start, end,
        transcript_id and gene_id.
    """
    exon_features = OrderedDict()
    transcript_features = OrderedDict()
    gene_features = OrderedDict()  # collected for counting only, never returned
    c = Counter()
    with open(fn) as fp:
        for line in tqdm(fp):
            if not line.startswith("#"):
                line = line.strip().split("\t")
                # Guard against blank/truncated lines: a full GFF3 record has 9 fields
                if len(line) < 9:
                    c["Invalid feature line"] += 1
                elif line[2] in ["gene", "mRNA", "exon"]:
                    try:
                        d = OrderedDict(chrom=line[0], start=int(line[3]), end=int(line[4]))
                        # GFF3 attribute column: semicolon-separated key=value pairs
                        attrs = {i.split("=")[0]: i.split("=")[1] for i in line[8].split(";")}
                        if line[2] == "gene":
                            ID = attrs.get("ID")
                            if ID:
                                gene_features[ID] = d
                                c["Valid gene"] += 1
                        elif line[2] == "mRNA":
                            d["gene_id"] = attrs.get("Parent")
                            ID = attrs.get("ID")
                            if ID:
                                transcript_features[ID] = d
                                c["Valid transcript"] += 1
                        elif line[2] == "exon":
                            d["transcript_id"] = attrs.get("Parent")
                            ID = attrs.get("exon_id")
                            if ID:
                                exon_features[ID] = d
                                c["Valid exon"] += 1
                    except (IndexError, ValueError):
                        c["Invalid feature line"] += 1
    # Report parsing counters
    for i, j in c.items():
        print(f"{i}:{j}")
    # Attach gene_id to each exon via its parent transcript
    for i, j in exon_features.items():
        tx_id = j.get("transcript_id")
        try:
            tx_info = transcript_features.get(tx_id)
            gene_id = tx_info.get("gene_id")
            exon_features[i]["gene_id"] = gene_id
        except AttributeError:
            # BUG FIX: was a bare `except:` which hid real errors. The only
            # expected failure is tx_info being None (unknown parent); fall
            # back to the raw transcript id so grouping still has a key.
            exon_features[i]["gene_id"] = tx_id
    exon_df = pd.DataFrame.from_dict(exon_features, orient="index")
    exon_df.index.name = "exon_id"
    return exon_df
# input files
graph_info_fn = "./graph_assembly/all_ref_graph_seg_info.pkl"
dna_usage_fn = "segment_usage/raw_reads/segment_usage_len_cov_norm.tsv"
rna_usage_fn = "segment_usage/rna_seq/segment_usage_len_cov_norm.tsv"
ref_fasta = "references/Oryzias_latipes_HDRR_clean.fa"
gff3_fn = "./references/Oryzias_latipes_all_assemblies_sorted.gff3"
outdir = "diverging_insertions"
# Options (same DNA thresholds as the previous section, plus RNA support thresholds)
ref_type = "HDRR"
min_alt_seg_len = 200
min_alt_path_len = 10000
max_identity = 0.5
min_coverage_dna = 0.5
min_samples_dna = 2
min_coverage_rna = 0.1  # lower than the DNA threshold: expression coverage is sparser
min_samples_rna = 8
no_Ns_extra = 50000
c = Counter()  # counters reported at the end of the selection loop below
# Load and filter out segments: non-reference, long and divergent
graph_info_df = pd.read_pickle(graph_info_fn)
graph_info_df = graph_info_df.drop(columns=["component","connected"])
graph_info_df = graph_info_df[(graph_info_df["type"] != ref_type) & (graph_info_df["length"]>=min_alt_seg_len) & (graph_info_df["identity"]<=max_identity)]
# Load and filter DNA coverage table
dna_usage_df = pd.read_csv(dna_usage_fn, sep="\t", index_col=0)
dna_usage_df = dna_usage_df[(dna_usage_df["type"] != ref_type) & (dna_usage_df["length"]>=min_alt_seg_len)]
# Count number of samples with sufficient DNA coverage for each segment
# (after the drop, only per-sample coverage columns remain)
dna_usage_df = dna_usage_df.drop(columns=["chromosome","length","start","end","type"])
dna_usage_df["samples_pass_dna"] = dna_usage_df.ge(min_coverage_dna).sum(axis=1)
dna_usage_df = dna_usage_df[dna_usage_df["samples_pass_dna"]>=min_samples_dna]
# Load and filter RNA coverage table
rna_usage_df = pd.read_csv(rna_usage_fn, sep="\t", index_col=0)
rna_usage_df = rna_usage_df[(rna_usage_df["type"] != ref_type) & (rna_usage_df["length"]>=min_alt_seg_len)]
# Count number of samples with sufficient RNA coverage for each segment
rna_usage_df = rna_usage_df.drop(columns=["chromosome","length","start","end","type"])
rna_usage_df["samples_pass_rna"] = rna_usage_df.ge(min_coverage_rna).sum(axis=1)
rna_usage_df = rna_usage_df[rna_usage_df["samples_pass_rna"]>=min_samples_rna]
# Keep only segments supported by BOTH DNA and RNA data (inner joins)
usage_df = pd.merge(dna_usage_df[["samples_pass_dna"]], rna_usage_df[["samples_pass_rna"]], left_index=True, right_index=True, how="inner")
all_df = pd.merge(graph_info_df, usage_df, left_index=True, right_index=True, how="inner")
all_df = all_df.sort_values(by="length", ascending=False)
stdout_print (f"Number of segments found: {len(all_df)}\n")
out_fn = os.path.join(outdir, "DNA_RNA_alternative_segments.tsv")
all_df.to_csv(out_fn, sep="\t")
stdout_print (f"Parse annotation file\n")
exon_df = gff3_exon_parser(fn=gff3_fn)
# For each reference interval with enough alternative sequence, keep it only
# when the surrounding reference is N-free AND it overlaps annotated exons.
l = []
with pyfaidx.Fasta(ref_fasta) as ref_fa_fp:
    for (connected_chromosome, connected_start, connected_end), branch_df in all_df.groupby(
            ["connected_chromosome", "connected_start","connected_end"]):
        branch_len = branch_df["length"].sum()
        if branch_len > min_alt_path_len:
            # Check number of Ns in the extended reference window
            seq = str(ref_fa_fp[connected_chromosome][int(connected_start)-no_Ns_extra:int(connected_end)+no_Ns_extra])
            Ns_count = seq.count("N")
            if Ns_count:
                # BUG FIX: counter labels were swapped in the original
                # (this branch incremented "No overlaping exons")
                c["Ns in reference sequence"]+=1
                continue
            # Find overlaping exons
            overlapping_exon_info = exon_df[(exon_df["chrom"]==connected_chromosome)&(exon_df["start"]<=connected_end)&(exon_df["end"]>=connected_start)]
            if overlapping_exon_info.empty:
                # BUG FIX: was "Ns in reference sequence" in the original
                c["No overlaping exons"]+=1
                continue
            # Collect the set of overlapped gene ids (strip the "gene:" prefix)
            gene_ids = set()
            for g in overlapping_exon_info.itertuples():
                gid = g.gene_id.split(":")[1]
                gene_ids.add(gid)
            d = OrderedDict()
            d["connected_coord"] = f"{connected_chromosome}:{int(connected_start)}-{int(connected_end)}"
            d["connected_seg_start"] = branch_df.iloc[0]["connected_seg_start"]
            d["connected_seg_end"] = branch_df.iloc[0]["connected_seg_end"]
            d["Valid segments"] = len(branch_df)
            d["MIKK segments"] = len(branch_df[branch_df["type"].isin(["MIKK"])])
            d["HNI segments"] = len(branch_df[branch_df["type"].isin(["HNI", "HNI+"])])
            d["HSOK segments"] = len(branch_df[branch_df["type"].isin(["HSOK", "HSOK+"])])
            d["segment_lengths"] = branch_df["length"].sum()
            d["Median identity"] = np.median(branch_df["identity"])
            d["Samples_pass_dna"] = branch_df["samples_pass_dna"].max()
            d["Samples_pass_rna"] = branch_df["samples_pass_rna"].max()
            d["n_exons_overlapped"] = len(overlapping_exon_info)
            d["gene_ids_overlapped"] = ",".join(gene_ids)
            d["segment_ids"] = ",".join(branch_df.index)
            l.append(d)
            c["Valid interval"]+=1
# Report counters then build, display and save the final path table
for i, j in c.items():
    stdout_print(f"{i}:{j}\n")
stdout_print ("Cast to dataframe and cleanup\n")
path_df = pd.DataFrame(l)
path_df = path_df.sort_values(by="segment_lengths", ascending=False)
stdout_print (f"Number of alternative diverging paths found: {len(path_df)}\n")
display(path_df)
out_fn = os.path.join(outdir, "DNA_RNA_alternative_paths.tsv")
path_df.to_csv(out_fn, sep="\t")
# Fresh output directory for the extracted reference/alternative sequences
outdir = "diverging_insertions/sequences"
# NOTE(review): only `rmtree` is imported explicitly (`from shutil import rmtree`);
# `shutil` itself must come from the pycltools star import — verify it is in scope.
shutil.rmtree(outdir, ignore_errors=True)
mkdir (outdir)
# Record type describing one oriented graph segment and its sequence
seg_info = namedtuple("seg_info", ["seg_id", "orient", "length", "sequence"])
def get_seq_from_segments (segment_fasta, seg_list):
    """Fetch the sequence of each oriented segment from an indexed FASTA file.

    Args:
        segment_fasta: path to the FASTA file of graph segment sequences.
        seg_list: iterable of segment names carrying a trailing orientation
            sign, e.g. "s123+" or "s456-".

    Returns:
        list of seg_info namedtuples (seg_id, orient, length, sequence);
        "-" segments are reverse-complemented.
    """
    collected = []
    with pyfaidx.Fasta(segment_fasta) as fasta:
        for oriented_id in seg_list:
            name, sign = oriented_id[:-1], oriented_id[-1]
            record = fasta[name][:]
            sequence = str(record) if sign == "+" else str(record.reverse.complement)
            collected.append(seg_info(seg_id=name, orient=sign, length=len(sequence), sequence=sequence))
    return collected
def get_flank_seq (seg_id, segment_fasta, flank_seq_len=10000, reverse=False):
    """Collect flanking reference sequence starting from one segment.

    Walks segment ids numerically (s<N> -> s<N±1>) until `flank_seq_len`
    bases are accumulated, truncating the last segment as needed.

    Args:
        seg_id: starting segment id, e.g. "s609578".
        segment_fasta: path to the indexed FASTA file of graph segments.
        flank_seq_len: number of flanking bases to collect.
        reverse: if True walk towards lower segment numbers (left flank);
            otherwise towards higher numbers (right flank).

    Returns:
        list of seg_info namedtuples ordered 5'->3' along the reference.

    NOTE(review): assumes reference segments are numbered consecutively
    along the chromosome and are all used in "+" orientation — verify
    against the graph construction.
    """
    seg_num = int(seg_id[1:])  # numeric part of the segment id
    seg_info_list = []
    tot_len = 0
    with pyfaidx.Fasta(segment_fasta) as fa:
        while tot_len<flank_seq_len:
            if reverse:
                seg_seq = str(fa[seg_id])
                len_seg = len(seg_seq)
                # Truncate the furthest segment so the total is exactly
                # flank_seq_len, keeping the 3' end (closest to the locus)
                if tot_len+len_seg > flank_seq_len:
                    len_seg = flank_seq_len-tot_len
                    seg_seq = seg_seq[-len_seg:]
                # Prepend so the list stays in genomic order
                seg_info_list.insert(0, seg_info(seg_id=seg_id, orient="+", length=len_seg, sequence=seg_seq))
                tot_len += len_seg
                seg_num-=1
                seg_id = f"s{seg_num}"
            else:
                seg_seq = str(fa[seg_id])
                len_seg = len(seg_seq)
                # Truncate keeping the 5' end (closest to the locus)
                if tot_len+len_seg > flank_seq_len:
                    len_seg = flank_seq_len-tot_len
                    seg_seq = seg_seq[:len_seg]
                seg_info_list.append(seg_info(seg_id=seg_id, orient="+", length=len_seg, sequence=seg_seq))
                tot_len += len_seg
                seg_num+=1
                seg_id = f"s{seg_num}"
    return seg_info_list
def get_ref_alt_seq (left_ref_seg, right_ref_seg, alt_seg_str, flank_seq_len, segment_fasta, outdir, locus):
    """Write FASTA + BED files for the reference and alternative paths of a locus.

    Builds two sequences sharing the same flanks: the reference path (all
    consecutive segments between `left_ref_seg` and `right_ref_seg`) and the
    alternative path (segments listed in `alt_seg_str`), then writes
    `<outdir>/<locus>.fa` (both sequences) and `<outdir>/<locus>.bed`
    (per-segment annotation of each sequence).

    Args:
        left_ref_seg: last reference segment before the branching, e.g. "s609578".
        right_ref_seg: first reference segment after the branching.
        alt_seg_str: comma-separated oriented alternative segments, e.g. "s1+, s2-".
        flank_seq_len: length of flanking reference sequence added on each side.
        segment_fasta: indexed FASTA file with all graph segment sequences.
        outdir: output directory (must exist).
        locus: locus name used in file names and FASTA/BED sequence names.
    """
    print("Extract left flank info")
    left_seq_info = get_flank_seq (seg_id=left_ref_seg, segment_fasta=segment_fasta, flank_seq_len=flank_seq_len, reverse=True)
    print("Extract right flank info")
    right_seq_info = get_flank_seq (seg_id=right_ref_seg, segment_fasta=segment_fasta, flank_seq_len=flank_seq_len, reverse=False)
    print("Extract reference path info")
    # Reference path = consecutive segments strictly between the two anchors
    internal_ref_seg_list = [f"s{i}+" for i in range(int(left_ref_seg[1:])+1, int(right_ref_seg[1:]))]
    internal_ref_seq_info = get_seq_from_segments (segment_fasta=segment_fasta, seg_list=internal_ref_seg_list)
    internal_ref_seq_len = sum([i.length for i in internal_ref_seq_info])
    print(f"\tLength of reference path {internal_ref_seq_len}")
    print("Extract alternative path info")
    internal_alt_seg_list = [i.strip() for i in alt_seg_str.split(",")]
    internal_alt_seq_info = get_seq_from_segments (segment_fasta=segment_fasta, seg_list=internal_alt_seg_list)
    internal_alt_seq_len = sum([i.length for i in internal_alt_seq_info])
    print(f"\tLength of alternative path {internal_alt_seq_len}")
    # Both paths share the exact same flanking segments
    ref_seg_info = left_seq_info+internal_ref_seq_info+right_seq_info
    alt_seg_info = left_seq_info+internal_alt_seq_info+right_seq_info
    # FASTA output
    print("Write FASTA output")
    out_fn = f"{outdir}/{locus}.fa"
    ref_seq = "".join([i.sequence for i in ref_seg_info])
    alt_seq = "".join([i.sequence for i in alt_seg_info])
    with open(out_fn, "w") as out_fa:
        out_fa.write(f">{locus}_reference\n{ref_seq}\n")
        out_fa.write(f">{locus}_alternative\n{alt_seq}\n")
    # BED output: one interval per segment on each synthetic sequence
    # NOTE: the loop variable `seg_info` shadows the module-level namedtuple
    print("Write BED output")
    out_fn = f"{outdir}/{locus}.bed"
    with open(out_fn, "w") as out_bed:
        start=0
        for seg_info in ref_seg_info:
            out_bed.write(f"{locus}_reference\t{start}\t{start+seg_info.length}\t{seg_info.seg_id}{seg_info.orient}\n")
            start+=seg_info.length
        start=0
        for seg_info in alt_seg_info:
            out_bed.write(f"{locus}_alternative\t{start}\t{start+seg_info.length}\t{seg_info.seg_id}{seg_info.orient}\n")
            start+=seg_info.length
Genes overlapped by reference
# Candidate locus on chromosome 23: show its table row, IGV and Bandage plots,
# then extract the reference and alternative haplotype sequences.
locus = "HDRR_23:4344994-4363759"
chrom, coords = locus.split(":")
locus_start, locus_end = (int(i) for i in coords.split("-"))
print("Extended coordinates: {}:{}-{}".format(chrom, locus_start-50000, locus_end+50000))
path_df = pd.read_csv("./diverging_insertions/DNA_RNA_alternative_paths.tsv", sep="\t", index_col=0)
display(path_df[path_df["connected_coord"]==locus])
display(Image(f"./diverging_insertions/igv_plots/{locus}.png", width=1000))
display(Image(f"./diverging_insertions/bandage_plots/RNA_{locus}.png", width=1000))
get_ref_alt_seq(
    left_ref_seg="s609578",
    right_ref_seg="s609581",
    alt_seg_str="s1073563+, s1013541-",
    segment_fasta="./graph_assembly/Oryzias_latipes_graph_segments.fa",
    flank_seq_len=50000,
    outdir="diverging_insertions/sequences",
    locus="HDRR_23:4344994-4363759")
Genes overlapped by reference
# Candidate locus on chromosome 18: show its table row, IGV and Bandage plots,
# then extract the reference and alternative haplotype sequences.
# NOTE(review): alt_seg_str is identical to the HDRR_23 locus — confirm this
# is not a copy-paste slip.
locus = "HDRR_18:24441731-24455814"
chrom, coords = locus.split(":")
locus_start, locus_end = (int(i) for i in coords.split("-"))
print("Extended coordinates: {}:{}-{}".format(chrom, locus_start-50000, locus_end+50000))
path_df = pd.read_csv("./diverging_insertions/DNA_RNA_alternative_paths.tsv", sep="\t", index_col=0)
display(path_df[path_df["connected_coord"]==locus])
display(Image(f"./diverging_insertions/igv_plots/{locus}.png", width=1000))
display(Image(f"./diverging_insertions/bandage_plots/RNA_{locus}.png", width=1000))
get_ref_alt_seq(
    left_ref_seg="s501739",
    right_ref_seg="s501759",
    alt_seg_str="s1073563+, s1013541-",
    segment_fasta="./graph_assembly/Oryzias_latipes_graph_segments.fa",
    flank_seq_len=50000,
    outdir="diverging_insertions/sequences",
    locus="HDRR_18:24441731-24455814")
Genes overlapped by reference
# Candidate locus on chromosome 21: show its table row, IGV and Bandage plots,
# then extract the reference and alternative haplotype sequences.
locus = "HDRR_21:27476540-27507741"
chrom, coords = locus.split(":")
locus_start, locus_end = (int(i) for i in coords.split("-"))
print("Extended coordinates: {}:{}-{}".format(chrom, locus_start-50000, locus_end+50000))
path_df = pd.read_csv("./diverging_insertions/DNA_RNA_alternative_paths.tsv", sep="\t", index_col=0)
display(path_df[path_df["connected_coord"]==locus])
display(Image(f"./diverging_insertions/igv_plots/{locus}.png", width=1000))
display(Image(f"./diverging_insertions/bandage_plots/RNA_{locus}.png", width=1000))
get_ref_alt_seq(
    left_ref_seg="s575078",
    right_ref_seg="s575094",
    alt_seg_str="s776384+, s776385+, s933617+, s776386+, s776387+, s933618+, s776390+, s776391+, s933619+, s933620+, s933621+, s933622+, s933624+, s933625+, s933626+, s933627+, s933628+, s933629+, s933630+, s933631+, s933632+, s933633+, s933634+, s933635+, s933636+",
    segment_fasta="./graph_assembly/Oryzias_latipes_graph_segments.fa",
    flank_seq_len=50000,
    outdir="diverging_insertions/sequences",
    locus="HDRR_21:27476540-27507741")
Genes overlapped by reference
# Candidate locus on chromosome 2: show its table row, IGV and Bandage plots,
# then extract the reference and alternative haplotype sequences.
locus = "HDRR_2:1761679-1776422"
chrom, coords = locus.split(":")
locus_start, locus_end = (int(i) for i in coords.split("-"))
print("Extended coordinates: {}:{}-{}".format(chrom, locus_start-50000, locus_end+50000))
path_df = pd.read_csv("./diverging_insertions/DNA_RNA_alternative_paths.tsv", sep="\t", index_col=0)
display(path_df[path_df["connected_coord"]==locus])
display(Image(f"./diverging_insertions/igv_plots/{locus}.png", width=1000))
display(Image(f"./diverging_insertions/bandage_plots/RNA_{locus}.png", width=1000))
get_ref_alt_seq(
    left_ref_seg="s37795",
    right_ref_seg="s37806",
    alt_seg_str="s658979+, s658980+, s658981+, s979176-, s658983+",
    segment_fasta="./graph_assembly/Oryzias_latipes_graph_segments.fa",
    flank_seq_len=50000,
    outdir="diverging_insertions/sequences",
    locus="HDRR_2:1761679-1776422")
Genes overlapped by reference
# Second candidate locus on chromosome 18: show its table row, IGV and Bandage
# plots, then extract the reference and alternative haplotype sequences.
locus = "HDRR_18:23094225-23131585"
chrom, coords = locus.split(":")
locus_start, locus_end = (int(i) for i in coords.split("-"))
print("Extended coordinates: {}:{}-{}".format(chrom, locus_start-50000, locus_end+50000))
path_df = pd.read_csv("./diverging_insertions/DNA_RNA_alternative_paths.tsv", sep="\t", index_col=0)
display(path_df[path_df["connected_coord"]==locus])
display(Image(f"./diverging_insertions/igv_plots/{locus}.png", width=1000))
display(Image(f"./diverging_insertions/bandage_plots/RNA_{locus}.png", width=1000))
get_ref_alt_seq(
    left_ref_seg="s501121",
    right_ref_seg="s501134",
    alt_seg_str="s948397-, s948396-, s948395-, s974668-, s948393-, s948392-, s948391-, s948390-, s948389-, s974666-",
    segment_fasta="./graph_assembly/Oryzias_latipes_graph_segments.fa",
    flank_seq_len=50000,
    outdir="diverging_insertions/sequences",
    locus="HDRR_18:23094225-23131585")
Genes overlapped by reference
Extended window to include whole intron
# Locus HDRR_2:8837517-8853181 — window manually extended to cover the whole intron
display(Image("./diverging_insertions/igv_plots/HDRR_2:8837517-8853181.png", width=1000))
display(Image("./diverging_insertions/bandage_plots/HDRR_2:8837517-8853181.png", width=1000))
# Reference path
ref_path = [f"s{i}+" for i in range(42910, 42946)]
# NOTE(review): `seq_from_segments` is not defined in this file. The helper
# defined above is `get_seq_from_segments`, which returns a list of seg_info
# tuples rather than a sequence string; a string-returning `seq_from_segments`
# presumably existed in a removed notebook cell — verify before rerunning.
ref_seq = seq_from_segments (segment_fasta="./references/Oryzias_latipes_graph_segments.fa", seg_list=ref_path)
print(",".join(ref_path))
print(f"Length of reference path: {len(ref_seq)}" )
# Alternative path
alt_path_str = "s42910+, s42911+, s976704-, s42913+, s976703-, s42915+, s931080-, s42916+, s931079-, s931078-, s931077-, s931076-, s42918+, s42919+, s42920+, s42921+, s42922+, s42923+, s42924+, s42925+, s976700-, s976699-, s1041532+, s996971-, s42928+, s42929+, s42930+, s42931+, s42932+, s42933+, s42934+, s42935+, s42936+, s931165-, s42938+, s42940+, s931164-, s976697-, s931162-, s976696-, s976695-, s976694-, s42944+"
alt_path = [i.strip() for i in alt_path_str.split(",")]
alt_seq = seq_from_segments (segment_fasta="./references/Oryzias_latipes_graph_segments.fa", seg_list=alt_path)
print(",".join(alt_path))
print(f"Length of alternative path: {len(alt_seq)}" )
# Write both sequences to a per-locus FASTA file
insert_id="HDRR_2:8837517-8853181"
out_fn = f"diverging_insertions/sequences/{insert_id}.fa"
with open(out_fn, "w") as out_fa:
    out_fa.write(f">{insert_id}_reference\n{ref_seq}\n")
    out_fa.write(f">{insert_id}_alternative\n{alt_seq}\n")
Genes overlapped by reference
Extended window to include genes
# Locus HDRR_5:5075836-5101956 — window manually extended to include flanking genes
display(Image("./diverging_insertions/igv_plots/HDRR_5:5075836-5101956.png", width=1000))
display(Image("./diverging_insertions/bandage_plots/HDRR_5:5075836-5101956.png", width=1000))
# Reference path
ref_path = [f"s{i}+" for i in range(126150, 126192)]
# NOTE(review): `seq_from_segments` is not defined in this file (see note on
# the previous locus cell) — verify before rerunning.
ref_seq = seq_from_segments (segment_fasta="./references/Oryzias_latipes_graph_segments.fa", seg_list=ref_path)
print(",".join(ref_path))
print(f"Length of reference path: {len(ref_seq)}" )
# Alternative path
left = [f"s{i}+" for i in range(126150, 126152)]
right = [f"s{i}+" for i in range(126153, 126192)]
# NOTE(review): `left` ends with s126151+ and alt_path_str also starts with
# s126151+; likewise s126153+ appears both at the end of alt_path_str and at
# the start of `right` — the boundary segments look duplicated; verify.
alt_path_str = "s126151+, s675762+, s675763+, s675764+, s675765+, s1029010+, s675767+, s1029011+, s675769+, s1050018-, s675772+, s1062580+, s1062581+, s1062582+, s675775+, s675776+, s1062583+, s675778+, s1062584+, s675780+, s675781+, s1062585+, s675783+, s1062586+, s675785+, s1096510-, s675787+, s1062587+, s675789+, s1062588+, s675791+, s1062589+, s675793+, s675794+, s675795+, s675796+, s675797+, s1062590+, s1062591+, s1096508-, s675802+, s126153+"
alt_path = [i.strip() for i in alt_path_str.split(",")]
alt_path = left+alt_path+right
alt_seq = seq_from_segments (segment_fasta="./references/Oryzias_latipes_graph_segments.fa", seg_list=alt_path)
print(",".join(alt_path))
print(f"Length of alternative path: {len(alt_seq)}" )
# Write both sequences to a per-locus FASTA file
insert_id="HDRR_5:5075836-5101956"
out_fn = f"diverging_insertions/sequences/{insert_id}.fa"
with open(out_fn, "w") as out_fa:
    out_fa.write(f">{insert_id}_reference\n{ref_seq}\n")
    out_fa.write(f">{insert_id}_alternative\n{alt_seq}\n")
# Concatenate all per-locus FASTA files into a single merged reference
fn_out = "diverging_insertions/sequences/merged.fa"
with open (fn_out, "w") as fa_out:
    # NOTE(review): `glob` is imported as a function above (`from glob import
    # glob`) but used as a module here — relies on the pycltools star import
    # re-binding `glob` to the module; verify.
    for fn_in in glob.glob("./diverging_insertions/sequences/*.fa"):
        # BUG FIX: the output file itself matches the *.fa pattern (it is
        # created before the glob runs) and would be concatenated into its
        # own content — skip it.
        if os.path.abspath(fn_in) == os.path.abspath(fn_out):
            continue
        with open(fn_in) as fa_in:
            fa_out.write(fa_in.read())
with pyfaidx.Fasta(fn_out) as fa:
    tot_len = len(fa)  # NOTE(review): number of records, not total bases
# Concatenate all per-locus BED files describing segment positions
fn_out = "diverging_insertions/sequences/merged_segments_annot.bed"
with open (fn_out, "w") as bed_out:
    for fn_in in glob.glob("./diverging_insertions/sequences/HDRR_*.bed"):
        with open(fn_in) as bed_in:
            bed_out.write(bed_in.read())
# Write a BED marking the insert region of each merged sequence
# (the part between the two 50 kb flanks)
bed_fn = "diverging_insertions/sequences/merged_annot.bed"
fa_fn = "diverging_insertions/sequences/merged.fa"
extend = 50000
with open(bed_fn, "w") as bed_fp:
    with pyfaidx.Fasta(fa_fn) as fa_fp:
        for seq in fa_fp:
            start = extend
            end = len(seq)-extend
            bed_fp.write(f"{seq.name}\t{start}\t{end}\n")
cat(bed_fn)  # pycltools helper: print the file content
def gff3_parser(fn, feature_type=[]):
    """Parse a GFF3 formatted file into a DataFrame of features.

    Args:
        fn: path to the GFF3 file.
        feature_type: optional list of feature types to keep; when empty,
            every feature is retained.

    Returns:
        pd.DataFrame with one row per valid feature (refid, source, ftype,
        start, end, score, strand, frame, attributes).
    """
    records = []
    counters = Counter()
    with open(fn) as handle:
        for raw in handle:
            # Skip comment/directive lines
            if raw.startswith("#"):
                continue
            fields = raw.strip().split("\t")
            if feature_type and fields[2] not in feature_type:
                counters["Invalid feature type"] += 1
                continue
            try:
                records.append(OrderedDict(
                    refid=fields[0],
                    source=fields[1],
                    ftype=fields[2],
                    start=int(fields[3]),
                    end=int(fields[4]),
                    score=fields[5],
                    strand=fields[6],
                    frame=fields[7],
                    attributes=fields[8]))
                counters["Valid feature type"] += 1
            except (IndexError, ValueError):
                # Truncated line or non-integer coordinates
                counters["Invalid feature line"] += 1
    # Report parsing counters
    for label, count in counters.items():
        print(f"{label}:{count}")
    return pd.DataFrame(records)
# Loci to export annotations for (original HDRR coordinates)
coord_list = (
    "HDRR_2:1761679-1776422",
    "HDRR_23:4344994-4363759",
    "HDRR_18:23094225-23131585",
    "HDRR_21:27476540-27507741",
    "HDRR_18:24441731-24455814")
gff3_in_fn = "./references/Oryzias_latipes_HDRR_sorted.gff3"
gff3_out_fn = "diverging_insertions/sequences/merged_annot.gff3"
extend = 50000  # must match the flank length used to build the merged sequences
print ("Inport Gff3 annotatons")
df = gff3_parser(fn=gff3_in_fn)
print ("Extract corresponding annotations")
coord_df_list = []
for coord in coord_list:
    # supersplit (pycltools) splits on any of the given separators
    chrom, start, end = supersplit(coord, ["-", ":"])
    ext_start = int(start)-extend
    ext_end = int(end)+extend
    # Keep features fully contained in the extended window, then shift them
    # to coordinates relative to the extracted sequence
    coord_df = df[(df["refid"]==chrom)&(df["start"]>=ext_start)&(df["end"]<=ext_end)].copy()
    refid = f"{chrom}:{start}-{end}_reference"
    coord_df["refid"] = refid
    coord_df["start"] = coord_df["start"]-ext_start
    coord_df["end"] = coord_df["end"]-ext_start
    coord_df_list.append(coord_df)
all_coord_df = pd.concat(coord_df_list)
all_coord_df = all_coord_df.sort_values(["refid", "start", "end"])
print ("Write new annotations")
with open (gff3_out_fn, "w") as fp:
    # write gff header
    fp.write("##gff-version 3\n")
    for coord in coord_list:
        chrom, start, end = supersplit(coord, ["-", ":"])
        ext_end = int(end)+extend
        # NOTE(review): the declared region end is the extended end in
        # original chromosome coordinates, not the extracted sequence
        # length — verify against consumers of this header.
        refid = f"{chrom}:{start}-{end}_reference"
        fp.write(f"##sequence-region\t{refid}\t1\t{ext_end}\n")
    for l in all_coord_df.itertuples(index=False):
        fp.write("\t".join([str(i) for i in l])+"\n")
head(gff3_out_fn, max_char_col=100, max_char_line=200)  # pycltools helper: preview file
# Build a STAR index over the merged reference/alternative sequences
outdir = "diverging_insertions/star_index"
shutil.rmtree(outdir, ignore_errors=True)
mkdir (outdir)
# Re-create the merged FASTA (same as the merge cell above)
fn_out = "diverging_insertions/sequences/merged.fa"
with open (fn_out, "w") as fa_out:
    for fn_in in glob.glob("./diverging_insertions/sequences/*.fa"):
        # BUG FIX: the output file itself matches the *.fa pattern — skip it
        # so it is not concatenated into its own content.
        if os.path.abspath(fn_in) == os.path.abspath(fn_out):
            continue
        with open(fn_in) as fa_in:
            fa_out.write(fa_in.read())
with pyfaidx.Fasta(fn_out) as fa:
    tot_len = len(fa)
print (f"Merge reference Total Length: {tot_len}")
# BUG FIX: the original referenced an undefined name `total_len` (NameError);
# use `tot_len` computed above.
# NOTE(review): len(pyfaidx.Fasta) is the number of records, not the number
# of bases — confirm which quantity the STAR genomeSAindexNbases formula
# (min(14, log2(genome length)/2 - 1)) expects.
len_idx_str = min(14, np.log2(tot_len)/2 - 1)
print (f"Length of STAR index str: {len_idx_str}")
# Submit the indexing job via bsub (pycltools LSF wrapper)
outdir = "diverging_insertions/star_index"
fasta_files = "diverging_insertions/sequences/merged.fa"
mem = 10000
threads = 20
dry = False
stdout = f"{outdir}/star_index.out"
stderr = f"{outdir}/star_index.err"
# NOTE: --genomeSAindexNbases is hard-coded to 8 rather than using len_idx_str
cmd = f"STAR --runMode genomeGenerate --runThreadN {threads} --genomeDir {outdir} --genomeFastaFiles {fasta_files} --genomeSAindexNbases 8"
bsub(cmd = cmd, conda="Star_v2.7.5", dry=dry, mem=mem, threads=threads, stdout_fp=stdout, stderr_fp=stderr)
# Align RNA-seq reads from each sample against the merged ref/alt sequences
outdir = "diverging_insertions/star_align"
shutil.rmtree(outdir, ignore_errors=True)
mkdir (outdir)
mem = 20000
threads = 25
dry = False
index_dir = "diverging_insertions/star_index"
outdir = "diverging_insertions/star_align"
info_fn = "/hps/research1/birney/users/adrien/analyses/medaka_RNA_illumina/Indigene_pilot/Analysis_low_high/candidates/liver_candidate_rep1.tsv"
# Map sanger ids to sample ids (underscores replaced by dashes)
sample_d = OrderedDict()
with open(info_fn) as csvfile:
    reader = csv.DictReader(csvfile, delimiter='\t')
    for row in reader:
        sample_d[row["sanger_id"]] = row["sample_id"].replace("_", "-")
# Submit one STAR alignment job per sample via bsub (pycltools LSF wrapper)
for sanger_id, sample_id in sample_d.items():
    R1 = f"/hps/research1/birney/users/adrien/analyses/medaka_RNA_illumina/Indigene_pilot/RNA_pipeline2/results/input/fastp/high_{sanger_id}_1.fastq.gz"
    R2 = f"/hps/research1/birney/users/adrien/analyses/medaka_RNA_illumina/Indigene_pilot/RNA_pipeline2/results/input/fastp/high_{sanger_id}_2.fastq.gz"
    prefix = os.path.join(outdir, f"{sanger_id}_{sample_id}")
    stdout = f"{prefix}_star_align.out"
    stderr = f"{prefix}_star_align.err"
    # Stringent splice-junction filtering options
    opt = "--outFilterType BySJout --outSJfilterCountTotalMin -1 10 10 10 --outSJfilterOverhangMin -1 15 15 15 --alignSJoverhangMin 10 --alignSJDBoverhangMin 3"
    cmd = f"STAR {opt} --readFilesCommand 'gunzip -c' --runMode alignReads --outSAMtype BAM SortedByCoordinate --limitBAMsortRAM 1000000000 --runThreadN {threads} --genomeDir {index_dir} --readFilesIn {R1} {R2} --outFileNamePrefix {prefix}"
    bsub(cmd = cmd, conda="Star_v2.7.5", dry=dry, mem=mem, threads=threads, stdout_fp=stdout, stderr_fp=stderr)
# Index the sorted BAMs and compute IGV coverage tracks
mem = 10000
threads = 10
fai = "./diverging_insertions/sequences/merged.fa.fai"
dry = False
for bam in glob.glob("./diverging_insertions/star_align/*.out.bam"):
    bsub (cmd=f"samtools index {bam}", conda="samtools", dry=dry, mem=mem, threads=threads)
    bsub (cmd=f"igvtools count -z 1 -w 10 -f median {bam} {bam}.tdf {fai}", conda="IGV", dry=dry, mem=mem, threads=threads)
    # bsub (cmd=f"igvtools count --minMapQuality 10 -z 1 -w 10 -f median {bam} {bam}_mapq10.tdf {fai}", conda="IGV", dry=dry, mem=mem, threads=threads)
    # bsub (cmd=f"igvtools count --minMapQuality 30 -z 1 -w 10 -f median {bam} {bam}_mapq30.tdf {fai}", conda="IGV", dry=dry, mem=mem, threads=threads)
# Fresh output directory for the large-deletion analysis
outdir = "deletions"
shutil.rmtree(outdir, ignore_errors=True)
mkdir (outdir)
def gff3_exon_parser(fn):
    """Parse a GFF3-formatted annotation file and return a DataFrame of exons.

    Collects "gene", "mRNA" and "exon" records, then links each exon to its
    parent transcript and, through it, to the parent gene.

    Args:
        fn (str): path to a GFF3-formatted annotation file.

    Returns:
        pd.DataFrame indexed by exon_id with columns chrom, start, end,
        transcript_id and gene_id.
    """
    exon_features = OrderedDict()
    transcript_features = OrderedDict()
    gene_features = OrderedDict()  # collected for counting only, never returned
    c = Counter()
    with open(fn) as fp:
        for line in tqdm(fp):
            if not line.startswith("#"):
                line = line.strip().split("\t")
                # Guard against blank/truncated lines: a full GFF3 record has 9 fields
                if len(line) < 9:
                    c["Invalid feature line"] += 1
                elif line[2] in ["gene", "mRNA", "exon"]:
                    try:
                        d = OrderedDict(chrom=line[0], start=int(line[3]), end=int(line[4]))
                        # GFF3 attribute column: semicolon-separated key=value pairs
                        attrs = {i.split("=")[0]: i.split("=")[1] for i in line[8].split(";")}
                        if line[2] == "gene":
                            ID = attrs.get("ID")
                            if ID:
                                gene_features[ID] = d
                                c["Valid gene"] += 1
                        elif line[2] == "mRNA":
                            d["gene_id"] = attrs.get("Parent")
                            ID = attrs.get("ID")
                            if ID:
                                transcript_features[ID] = d
                                c["Valid transcript"] += 1
                        elif line[2] == "exon":
                            d["transcript_id"] = attrs.get("Parent")
                            ID = attrs.get("exon_id")
                            if ID:
                                exon_features[ID] = d
                                c["Valid exon"] += 1
                    except (IndexError, ValueError):
                        c["Invalid feature line"] += 1
    # Report parsing counters
    for i, j in c.items():
        print(f"{i}:{j}")
    # Attach gene_id to each exon via its parent transcript
    for i, j in exon_features.items():
        tx_id = j.get("transcript_id")
        try:
            tx_info = transcript_features.get(tx_id)
            gene_id = tx_info.get("gene_id")
            exon_features[i]["gene_id"] = gene_id
        except AttributeError:
            # BUG FIX: was a bare `except:` which hid real errors. The only
            # expected failure is tx_info being None (unknown parent); fall
            # back to the raw transcript id so grouping still has a key.
            exon_features[i]["gene_id"] = tx_id
    exon_df = pd.DataFrame.from_dict(exon_features, orient="index")
    exon_df.index.name = "exon_id"
    return exon_df
# input files and options for the large-deletion screen
link_fn = "./link_usage/raw_reads/norm_link_cov.tsv"
ref_fasta = "references/Oryzias_latipes_HDRR_clean.fa"
gff3_fn = "./references/Oryzias_latipes_all_assemblies_sorted.gff3"
outdir = "deletions"
min_link_cov = 0.5   # minimum normalised coverage over a deletion-spanning link
min_samples = 2      # minimum number of samples supporting the link
min_del_len = 10000  # minimum deletion (gap) length in bp
no_Ns_extra = 50000  # flanking window (bp) checked for Ns
c = Counter()
print("Import data and filter")
link_df = pd.read_csv (link_fn, sep="\t", index_col=0)
# Select intra HDRR deletions longer than min_del_len
link_df = link_df[(link_df["SV_type"]=="deletion") & (link_df["connect_type"]=="intra_HDRR") & (link_df["connect_gap"]>=min_del_len)]
c["All deletions"]=len(link_df)
# Calculate number of samples with sufficient coverage over deletion link
# (explicit list of the per-sample coverage columns in the table)
data_col = ["4-1","4-2","7-1","7-2","11-1","69-1","79-2","80-1","117-2","131-1","134-1","134-2"]
link_df["samples_pass"] = link_df[data_col].ge(min_link_cov).sum(axis=1)
link_df = link_df[link_df["samples_pass"]>=min_samples]
c["Deletion with sufficient coverage"]=len(link_df)
display(link_df.head())
out_fn = os.path.join(outdir, "large_deletions.tsv")
link_df.to_csv(out_fn, sep="\t", index=False)
# Filter out deletions whose flanking reference contains Ns (probable
# misassemblies) and keep only deletions overlapping at least one exon.
print("Filter out sequence containing Ns and non overlaping exon")
l = []
exon_df = gff3_exon_parser(fn=gff3_fn)
with pyfaidx.Fasta(ref_fasta) as ref_fa_fp:
    for s in tqdm(link_df.itertuples()):
        chrom = s.s1_chromosome
        # Deletion interval = span between the two link anchor positions
        start = min(int(s.s1_pos), int(s.s2_pos))
        end = max(int(s.s1_pos), int(s.s2_pos))
        # Check number of Ns in the extended window
        seq = str(ref_fa_fp[chrom][start-no_Ns_extra:end+no_Ns_extra])
        num_Ns = seq.count("N")
        if num_Ns:
            c["Ns in reference sequence"]+=1
            continue
        # Find overlaping exons
        # BUG FIX: removed a stray `c["Deletion with sufficient coverage"]=len(link_df)`
        # line that re-assigned an unrelated counter on every iteration.
        overlapping_exon_info = exon_df[(exon_df["chrom"]==chrom)&(exon_df["start"]<=end)&(exon_df["end"]>=start)]
        if overlapping_exon_info.empty:
            c["No overlaping exons"]+=1
            continue
        # Collect the set of overlapped gene ids (strip the "gene:" prefix)
        gene_ids = set()
        for g in overlapping_exon_info.itertuples():
            gid = g.gene_id.split(":")[1]
            gene_ids.add(gid)
        d = OrderedDict()
        d["gap_coord"] = f"{chrom}:{start}-{end}"
        d["start_segment"] = s.Index.split("_")[0]
        d["end_segment"] = s.Index.split("_")[1]
        d["gap_length"] = int(s.connect_gap)
        d["supporting_samples"] = s.samples_pass
        d["n_exons_overlapped"] = len(overlapping_exon_info)
        d["gene_ids_overlapped"] = ",".join(gene_ids)
        l.append(d)
        c["Valid interval"]+=1
select_link_df = pd.DataFrame(l)
select_link_df = select_link_df.sort_values(by="gap_length", ascending=False)
select_link_df = select_link_df.reset_index(drop=True)
# Report counters, display and save the selected deletions
for i, j in c.items():
    print(f"{i}:{j}")
display(select_link_df)
out_fn = os.path.join(outdir, "large_deletions_exon_noNs.tsv")
select_link_df.to_csv(out_fn, sep="\t", index=False)
# Export the selected deletion intervals as a 3-column BED file
out_fn = os.path.join(outdir, "large_deletions_exon_noNs.bed")
with open (out_fn, "w") as out_fp:
    for row in select_link_df.itertuples():
        chrom, _, coords = row.gap_coord.partition(":")
        start_str, _, end_str = coords.partition("-")
        out_fp.write(f"{chrom}\t{int(start_str)}\t{int(end_str)}\n")