Graph assembly and alignment

In [1]:
from datetime import date
from pycltools.pycltools import jprint

jprint('Adrien Leger / EMBL EBI', bold=True, size=150)
jprint('Starting date : 2020_06_04', bold=True, italic=True, size=125)
jprint('Last modification date : {}_{:02}_{:02}'.format(date.today().year, date.today().month, date.today().day), bold=True, italic=True, size=125)
Bad key "text.kerning_factor" on line 4 in
/nfs/software/birney/adrien/miniconda3/envs/Python3.7/lib/python3.7/site-packages/matplotlib/mpl-data/stylelib/_classic_test_patch.mplstyle.
You probably need to get an updated matplotlibrc file from
https://github.com/matplotlib/matplotlib/blob/v3.1.3/matplotlibrc.template
or from the matplotlib source distribution

Adrien Leger / EMBL EBI

Starting date : 2020_06_04

Last modification date : 2021_02_15

Imports

In [2]:
# Standard lib imports
import os
from datetime import date
from collections import *
from glob import glob, iglob
from shutil import rmtree
import itertools
from pprint import pprint as pp
from tqdm import tqdm
import csv
import re

# Generic third party imports
from pycltools.pycltools import *
import pysam
import pyfaidx
import pybedtools as pbt
import pyranges as pr
import mappy as mp

# Ploting lib imports
import matplotlib.pyplot as pl
from matplotlib.colors import rgb2hex
import seaborn as sns
%matplotlib inline

# Data wrangling lib imports
import pandas as pd
import numpy as np
import scipy as sp
pd.options.display.max_colwidth = 200
pd.options.display.max_columns = 200
pd.options.display.max_rows = 100
pd.options.display.min_rows = 100

Get and cleanup data

Download reference assembly

In [25]:
outdir = "./references/"
shutil.rmtree(outdir, ignore_errors=True)
mkdir(outdir)
Creating /hps/research1/birney/users/adrien/analyses/medaka_assembly_graph/graph_explore/references2
In [29]:
outdir = "./references/"
for label, link in (
    ("HDRR", "ftp://ftp.ensembl.org/pub/release-100/fasta/oryzias_latipes/dna/Oryzias_latipes.ASM223467v1.dna.toplevel.fa.gz"),
    ("HDRR+", "http://utgenome.org/medaka_v2/Medaka-Hd-rR-PacBio_unanchored_contigs.fasta.gz"),
    ("HNI", "ftp://ftp.ensembl.org/pub/release-100/fasta/oryzias_latipes_hni/dna/Oryzias_latipes_hni.ASM223471v1.dna.toplevel.fa.gz"),
    ("HNI+", "http://utgenome.org/medaka_v2/Medaka-HNI-PacBio_unanchored_contigs.fasta.gz"),
    ("HSOK", "ftp://ftp.ensembl.org/pub/release-100/fasta/oryzias_latipes_hsok/dna/Oryzias_latipes_hsok.ASM223469v1.dna.toplevel.fa.gz"),
    ("HSOK+", "http://utgenome.org/medaka_v2/Medaka-HSOK-PacBio_unanchored_contigs.fasta.gz")):
    
    print(f"Downloading and cleaning {label}")   
    zipped_fn = f"./{outdir}/Oryzias_latipes_{label}.fa.gz"
    unziped_fn = f"./{outdir}/Oryzias_latipes_{label}.fa"
    cleaned_fn = f"./{outdir}/Oryzias_latipes_{label}_clean.fa"
    
    # Download
    wget(link, zipped_fn)
    
    # Extract Gzip
    gunzip_file(zipped_fn, unziped_fn)

    # Clean up fasta names
    with open(cleaned_fn, "w") as fa_out, pyfaidx.Fasta(unziped_fn) as fa_in:
        for seq in fa_in:
            fa_out.write(f">{label}_{seq.name}\n{str(seq)}\n")

    # Index file
    with pyfaidx.Fasta(cleaned_fn) as fa:
        print(len(fa))

    # Remove originals
    for fn in [zipped_fn, unziped_fn, unziped_fn+".fai"]:
        os.remove(fn)
Downloading and cleaning HDRR
Downloading: ftp://ftp.ensembl.org/pub/release-100/fasta/oryzias_latipes/dna/Oryzias_latipes.ASM223467v1.dna.toplevel.fa.gz	Bytes: 223527279
100.0 MB Downloaded	[44.74 %]
200.0 MB Downloaded	[89.47 %]
223.5 MB Downloaded	[100 %]
Uncompressing ././references2//Oryzias_latipes_HDRR.fa.gz
734057086
Downloading and cleaning HDRR+
Downloading: http://utgenome.org/medaka_v2/Medaka-Hd-rR-PacBio_unanchored_contigs.fasta.gz	Bytes: 15437627
15.4 MB Downloaded	[100 %]
Uncompressing ././references2//Oryzias_latipes_HDRR+.fa.gz
55662681
Downloading and cleaning HNI
Downloading: ftp://ftp.ensembl.org/pub/release-100/fasta/oryzias_latipes_hni/dna/Oryzias_latipes_hni.ASM223471v1.dna.toplevel.fa.gz	Bytes: 207712664
100.0 MB Downloaded	[48.14 %]
200.0 MB Downloaded	[96.29 %]
207.7 MB Downloaded	[100 %]
Uncompressing ././references2//Oryzias_latipes_HNI.fa.gz
677633405
Downloading and cleaning HNI+
Downloading: http://utgenome.org/medaka_v2/Medaka-HNI-PacBio_unanchored_contigs.fasta.gz	Bytes: 20844315
20.8 MB Downloaded	[100 %]
Uncompressing ././references2//Oryzias_latipes_HNI+.fa.gz
71426658
Downloading and cleaning HSOK
Downloading: ftp://ftp.ensembl.org/pub/release-100/fasta/oryzias_latipes_hsok/dna/Oryzias_latipes_hsok.ASM223469v1.dna.toplevel.fa.gz	Bytes: 226363349
100.0 MB Downloaded	[44.18 %]
200.0 MB Downloaded	[88.35 %]
226.4 MB Downloaded	[100 %]
Uncompressing ././references2//Oryzias_latipes_HSOK.fa.gz
744414398
Downloading and cleaning HSOK+
Downloading: http://utgenome.org/medaka_v2/Medaka-HSOK-PacBio_unanchored_contigs.fasta.gz	Bytes: 16051409
16.1 MB Downloaded	[100 %]
Uncompressing ././references2//Oryzias_latipes_HSOK+.fa.gz
64546884
In [100]:
outdir = "./references/"
for label, link in (
    ("HDRR", "ftp://ftp.ensembl.org/pub/release-100/gff3/oryzias_latipes/Oryzias_latipes.ASM223467v1.100.gff3.gz"),
    ("HNI", "ftp://ftp.ensembl.org/pub/release-100/gff3/oryzias_latipes_hni/Oryzias_latipes_hni.ASM223471v1.100.gff3.gz"),
    ("HSOK", "ftp://ftp.ensembl.org/pub/release-100/gff3/oryzias_latipes_hsok/Oryzias_latipes_hsok.ASM223469v1.100.gff3.gz")):
    
    print(f"Downloading and cleaning {label}")   
    zipped_fn = f"./{outdir}/Oryzias_latipes_{label}.gff3.gz"
    unziped_fn = f"./{outdir}/Oryzias_latipes_{label}.gff3"
    cleaned_fn = f"./{outdir}/Oryzias_latipes_{label}_clean.gff3"
    sorted_fn = f"./{outdir}/Oryzias_latipes_{label}_sorted.gff3"
    
    # Download
    wget(link, zipped_fn)
    # Extract Gzip
    gunzip_file(zipped_fn, unziped_fn)
    
    # Clean fasta names
    with open(unziped_fn, "r") as gff_in, open(cleaned_fn, "w") as gff_out:
        for line in gff_in:
            line = line.strip()
            
            if line.startswith("##sequence-region"):
                line = line.split()
                line[1] = "{}_{}".format(label, line[1])
                line = "\t".join(line)
            if line[0] != "#":
                line = line.split()
                line[0] = "{}_{}".format(label, line[0])
                line = "\t".join(line)
            gff_out.write(line+"\n")
        
    # Sort and index for IGV
    bash (f"igvtools sort {cleaned_fn} {sorted_fn}", conda="IGV")
    bash (f"igvtools index {sorted_fn}", conda="IGV")

    # Remove originals
    os.remove(zipped_fn)
    os.remove(unziped_fn)
    os.remove(cleaned_fn)
Downloading and cleaning HDRR
Downloading: ftp://ftp.ensembl.org/pub/release-100/gff3/oryzias_latipes/Oryzias_latipes.ASM223467v1.100.gff3.gz	Bytes: 12175696
12.2 MB Downloaded	[100 %]
Uncompressing ././references2//Oryzias_latipes_HDRR.gff3.gz
Using system JDK.
WARN [2020-07-08T12:21:07,324]  [Globals.java:138] [main]  Development mode is enabled
Sorting ././references2//Oryzias_latipes_HDRR_clean.gff3  -> ././references2//Oryzias_latipes_HDRR_sorted.gff3
Done
Using system JDK.
WARN [2020-07-08T12:21:17,035]  [Globals.java:138] [main]  Development mode is enabled
Done
Downloading and cleaning HNI
Downloading: ftp://ftp.ensembl.org/pub/release-100/gff3/oryzias_latipes_hni/Oryzias_latipes_hni.ASM223471v1.100.gff3.gz	Bytes: 11436936
11.4 MB Downloaded	[100 %]
Uncompressing ././references2//Oryzias_latipes_HNI.gff3.gz
Using system JDK.
WARN [2020-07-08T12:21:28,959]  [Globals.java:138] [main]  Development mode is enabled
Sorting ././references2//Oryzias_latipes_HNI_clean.gff3  -> ././references2//Oryzias_latipes_HNI_sorted.gff3
Done
Using system JDK.
WARN [2020-07-08T12:21:37,156]  [Globals.java:138] [main]  Development mode is enabled
Done
Downloading and cleaning HSOK
Downloading: ftp://ftp.ensembl.org/pub/release-100/gff3/oryzias_latipes_hsok/Oryzias_latipes_hsok.ASM223469v1.100.gff3.gz	Bytes: 11639342
11.6 MB Downloaded	[100 %]
Uncompressing ././references2//Oryzias_latipes_HSOK.gff3.gz
Using system JDK.
WARN [2020-07-08T12:21:49,025]  [Globals.java:138] [main]  Development mode is enabled
Sorting ././references2//Oryzias_latipes_HSOK_clean.gff3  -> ././references2//Oryzias_latipes_HSOK_sorted.gff3
Done
Using system JDK.
WARN [2020-07-08T12:21:57,915]  [Globals.java:138] [main]  Development mode is enabled
Done

Merge all fasta ref together

In [43]:
references_list = [
    './references/Oryzias_latipes_HDRR_clean.fa',
    './references/Oryzias_latipes_HDRR+_clean.fa',
    './references/Oryzias_latipes_HNI_clean.fa',
    './references/Oryzias_latipes_HNI+_clean.fa',
    './references/Oryzias_latipes_HSOK_clean.fa',
    './references/Oryzias_latipes_HSOK+_clean.fa']
alternative_list = glob.glob("./individual_assemblies/*_clean.fa")
all_ref = references_list+alternative_list

out_fn = "references/Oryzias_latipes_all_assemblies_contig.fa"

with open (out_fn, "w") as fa_out:
    for fasta_fn in all_ref:
        print(f"Copy {fasta_fn}")
        with open(fasta_fn,'r') as fa_in:
            shutil.copyfileobj(fa_in, fa_out)

with pyfaidx.Fasta(out_fn) as fa:
    print(len(fa))
Copy ./references2/Oryzias_latipes_HDRR_clean.fa
Copy ./references2/Oryzias_latipes_HDRR+_clean.fa
Copy ./references2/Oryzias_latipes_HNI_clean.fa
Copy ./references2/Oryzias_latipes_HNI+_clean.fa
Copy ./references2/Oryzias_latipes_HSOK_clean.fa
Copy ./references2/Oryzias_latipes_HSOK+_clean.fa
Copy ./individual_assemblies/69-1_F3_clean.fa
Copy ./individual_assemblies/131-1_F4_clean.fa
Copy ./individual_assemblies/117-2_C4_clean.fa
Copy ./individual_assemblies/4-2_B2_clean.fa
Copy ./individual_assemblies/7-2_F2_clean.fa
Copy ./individual_assemblies/4-1_B2_clean.fa
Copy ./individual_assemblies/134-2_A5_clean.fa
Copy ./individual_assemblies/79-2_G3_clean.fa
Copy ./individual_assemblies/134-1_H4_clean.fa
Copy ./individual_assemblies/80-1_H3_clean.fa
Copy ./individual_assemblies/7-1_E2_clean.fa
Copy ./individual_assemblies/11-1_A3_clean.fa
11123476255

Merge all annotations together

In [102]:
gff_list = [
    './references/Oryzias_latipes_HDRR_sorted.gff3',
    './references/Oryzias_latipes_HNI_sorted.gff3',
    './references/Oryzias_latipes_HSOK_sorted.gff3']

out_fn = "references/Oryzias_latipes_all_assemblies.gff3"
out_fn_sorted = "references/Oryzias_latipes_all_assemblies_sorted.gff3"

with open (out_fn, "w") as gff_out:
    gff_out.write("##gff-version 3\n")
    
    print(f"Copy headers")
    for gff_fn in gff_list:
        with open (gff_fn) as gff_fp: 
            for line in gff_fp:
                line = line.strip()
                if line.startswith("##sequence-region"):
                    gff_out.write(f"{line}\n")
                elif not line.startswith("#"):
                    break
    print(f"Copy annotations")
    for gff_fn in gff_list:
        with open (gff_fn) as gff_fp: 
            for line in gff_fp:
                line = line.strip()
                if not line.startswith("#"):
                    gff_out.write(f"{line}\n")

# Sort and index for IGV
print(f"Sort and index with igvtools")
bash (f"igvtools sort {out_fn} {out_fn_sorted}", conda="IGV")
bash (f"igvtools index {out_fn_sorted}", conda="IGV")

# Remove originals
os.remove(out_fn)
Copy headers
Copy annotations
Sort and index with igvtools
Using system JDK.
WARN [2020-07-08T12:22:18,467]  [Globals.java:138] [main]  Development mode is enabled
Sorting references2/Oryzias_latipes_all_assemblies.gff3  -> references2/Oryzias_latipes_all_assemblies_sorted.gff3
Done
Using system JDK.
WARN [2020-07-08T12:22:40,152]  [Globals.java:138] [main]  Development mode is enabled
Done

Graph Assembly

Generate graph

In [45]:
outdir = "./graph_assembly"
#shutil.rmtree(outdir, ignore_errors=True)
mkdir(outdir)

mem = 60000
threads = 20
dry = False

references_list = [
    './references/Oryzias_latipes_HDRR_clean.fa',
    './references/Oryzias_latipes_HDRR+_clean.fa',
    './references/Oryzias_latipes_HNI_clean.fa',
    './references/Oryzias_latipes_HNI+_clean.fa',
    './references/Oryzias_latipes_HSOK_clean.fa',
    './references/Oryzias_latipes_HSOK+_clean.fa']

alternative_list = glob.glob("./individual_assemblies/*_clean.fa")
all_ref = " ".join(references_list+alternative_list)

gfa = f"{outdir}/all_ref_graph.gfa"

cmd = f"minigraph -x ggs -L 100 -t {threads} {all_ref} > {gfa}"
bsub(cmd, conda="minigraph", threads=threads, mem=mem)
bsub -M 60000 -R 'rusage[mem=60000]' -n 20 -oo ./graph_assembly2/all_ref_graph.err -eo ./graph_assembly2/all_ref_graph.out -P bigmem  "minigraph -x ggs -L 100 -t 20 ./references2/Oryzias_latipes_HDRR_clean.fa ./references2/Oryzias_latipes_HDRR+_clean.fa ./references2/Oryzias_latipes_HNI_clean.fa ./references2/Oryzias_latipes_HNI+_clean.fa ./references2/Oryzias_latipes_HSOK_clean.fa ./references2/Oryzias_latipes_HSOK+_clean.fa ./individual_assemblies/69-1_F3_clean.fa ./individual_assemblies/131-1_F4_clean.fa ./individual_assemblies/117-2_C4_clean.fa ./individual_assemblies/4-2_B2_clean.fa ./individual_assemblies/7-2_F2_clean.fa ./individual_assemblies/4-1_B2_clean.fa ./individual_assemblies/134-2_A5_clean.fa ./individual_assemblies/79-2_G3_clean.fa ./individual_assemblies/134-1_H4_clean.fa ./individual_assemblies/80-1_H3_clean.fa ./individual_assemblies/7-1_E2_clean.fa ./individual_assemblies/11-1_A3_clean.fa > ./graph_assembly2/all_ref_graph.gfa"
Job <3595722> is submitted to default queue <research-rh74>.
Out[45]:
'3595722'

Display L and S lines

In [55]:
graph_fn = "./graph_assembly/all_ref_graph.gfa"
seg_line = namedtuple("S", ["seg_id", "sequence", "length", "chromosome", "start", "rank"])
seg_line_list = []
link_line = namedtuple("L", ["seg1_id", "seg1_orient", "seg2_id", "seg2_orient", "cigar", "rank", "seg1_length", "seg2_length"])
link_line_list = []

with open (graph_fn) as fp:
    for l in tqdm(fp):
        l = l.strip().split("\t")
        
        if l[0] == "S":
            sl = seg_line(l[1], l[2], int(l[3].split(":")[-1]), l[4].split(":")[-1], int(l[5].split(":")[-1]), int(l[6].split(":")[-1]))
            seg_line_list.append(sl)
            
        if l[0] == "L":
            ll = link_line(l[1], l[2], l[3], l[4], l[5], int(l[6].split(":")[-1]), int(l[7].split(":")[-1]), int(l[8].split(":")[-1]))
            link_line_list.append(ll)

seg_df = pd.DataFrame(seg_line_list)
link_df = pd.DataFrame(link_line_list)

with pd.option_context("display.max_colwidth", 50):
    display(seg_df.sample(10))
    display(link_df.sample(10))
2632099it [00:19, 132149.83it/s]
seg_id sequence length chromosome start rank
821800 s821801 GCAATGGTCTTCTTAAATTAAATATAAAAAATGCTTCTGTTAATGT... 130 HSOK_6 30064625 4
251827 s251828 GGTGTTTGAACTGAAACCAATAAACAAACACAGAAGAGCATAGCCC... 393 HDRR_9 26206200 0
298378 s298379 TTTTTTACATTTAAAAAGATCCCTTTCTTCCTTTGTTTTTCTTTAC... 1897 HDRR_11 10968366 0
892662 s892663 TGTATCATTGACTTTTATTGTTTTTAATTTAATTCACTGCTGGATG... 789 HSOK_23 17815495 4
1002558 s1002559 TATATTGAATATTACACAATTATGAACCATCGTTTTTTATGAGGTA... 1507 MIKK_4-2_B2_31 2625342 9
104816 s104817 ATTTTTCTTGTAATCCAGTGTTTCGCATGCTGAGCAACAAGTTAGT... 8528 HDRR_4 15077549 0
589473 s589474 TTTTGTAACTCCAATGAGGAAACGCTGGAACATGT 35 HDRR_22 11141139 0
671705 s671706 GAGCTGATCT 10 HNI_4 20085113 2
839896 s839897 ATATTGTTCATAGTAAACAGTTGTGATGCCACAATATTTTTTTGTT... 150 HSOK_10 30662802 4
55664 s55665 AAACAAGGTTGGGAGGCAAG 20 HDRR_2 22077791 0
seg1_id seg1_orient seg2_id seg2_orient cigar rank seg1_length seg2_length
871255 s551323 + s551324 + 0M 0 281 20
244930 s154994 + s154995 + 0M 0 1348 333
235909 s149318 + s681559 + 0M 2 265 104
940601 s595221 + s889201 + 0M 4 412 361
1284299 s892636 + s622135 + 0M 4 1823 2278
476208 s301259 + s301260 + 0M 0 4115 367
343863 s217531 + s217532 + 0M 0 101 8382
1174003 s785555 + s785556 + 0M 2 191 95
315979 s199891 + s199892 + 0M 0 2814 17
106632 s67543 + s992867 + 0M 8 2165 710

Parse graph segments

Define helper functions

In [24]:
Coord = namedtuple("coord", ["chromosome", "start", "end", "start_seg_id", "end_seg_id"])

def find_connected_ref (seg_id, seg_info_d, path=None, reference_type="HDRR"):
    seg_info = seg_info_d[seg_id]
    
    # if leaf is ref type return coordinates
    if seg_info["type"] == reference_type:
        return Coord(seg_info["chromosome"], seg_info["end"], seg_info["start"], seg_id, seg_id)
    
    # Add any visited node to the path
    if not path:
        path = []
    path.append(seg_id)
     
    best_chromosome = best_start = best_end = best_start_seg_id = best_end_seg_id = None
    for s in seg_info["connected"]:
        
        # Avoid circular paths error
        if not s in path:
            
            # Recursively search for connected segments 
            coord = find_connected_ref(seg_id=s, seg_info_d=seg_info_d, path=path)
            
            if coord.chromosome and coord.start and coord.end:
                if not best_chromosome:
                    best_chromosome = coord.chromosome
                else:
                    assert best_chromosome == coord.chromosome
                if not best_end or coord.end > best_end:
                        best_end = coord.end
                        best_end_seg_id = coord.end_seg_id
                if not best_start or coord.start < best_start:
                        best_start = coord.start
                        best_start_seg_id = coord.start_seg_id
                        
    return Coord(best_chromosome, best_start, best_end, best_start_seg_id, best_end_seg_id)

def sort_seg (s1, s2, s1_orient, s2_orient):
    """"""
    s1_num = int(s1[1:])
    s2_num = int(s2[1:])
    if s1_num > s2_num:
        inv_dict = {"+":"-", "-":"+"}
        s1, s2 = s2, s1
        s1_orient, s2_orient = inv_dict[s2_orient], inv_dict[s1_orient]
        
    return (s1, s2, s1_orient, s2_orient)

def get_link_info(s1_orient, s2_orient, s1_info, s2_info):
    """"""
    s1_chromosome = s1_info["chromosome"]
    s1_pos = s1_info["end"] if s1_orient == "+" else s1_info["start"]
    s1_type = s1_info["type"]
    
    s2_chromosome = s2_info["chromosome"]
    s2_pos = s2_info["start"] if s2_orient == "+" else s2_info["end"]
    s2_type = s2_info["type"]
    
    if s1_chromosome == s2_chromosome:
        connect_type = "intra_{}".format(s1_type)
        
        # Calculate length of break point
        connect_gap = s2_pos-s1_pos if s1_orient == "+" else s1_pos-s2_pos
        
        # Define type of jump depending on break length 
        if s1_orient == s2_orient:
            if connect_gap==0:
                SV_type = "continuous"
            elif connect_gap>0:
                SV_type = "deletion"
            else:
                SV_type = "duplication"
        else:
            SV_type = "invertion"
    elif s1_type == s2_type:
        connect_type = "inter_{}".format(s1_type)
        connect_gap = None
        SV_type = "insertion"
    else:
        connect_type = "{}_{}".format(s1_type, s2_type)
        connect_gap = None
        SV_type = "insertion"
    
    d = OrderedDict(
        s1_chromosome = s1_chromosome,
        s1_pos = s1_pos,
        s2_chromosome = s2_chromosome,
        s2_pos = s2_pos,
        s2_type = s2_type,
        connect_type = connect_type,
        connect_gap = connect_gap,
        SV_type = SV_type)
    return d

Parse graph

In [26]:
graph_fn = "./graph_assembly/all_ref_graph.gfa"
main_ref_fasta = "./references/Oryzias_latipes_HDRR_clean.fa"
seg_info_out_fn = "./graph_assembly/all_ref_graph_seg_info.pkl"
link_info_out_fn = "./graph_assembly/all_ref_graph_link_info.pkl"
graph_segments_fasta = "./graph_assembly/Oryzias_latipes_graph_segments.fa"

link_info_d = OrderedDict()
seg_info_d = OrderedDict()
max_comp = 0
min_len_align = 200
ref_label = "HDRR"

# Align remaining segments to reference to define identity
stdout_print("Make minimap db\n")
minimap_db = mp.Aligner(main_ref_fasta, best_n=1, scoring=(2,4,4,2,24,0), k=15, n_threads=10)

stdout_print("Annotate segments and links\n")
with open(graph_fn) as graph_fp, open(graph_segments_fasta, "w") as fasta_fp:
    for l in tqdm(graph_fp):
        l = l.strip().split("\t")
        
        if l[0] == "S":
            seg_id = l[1]
            seq =l[2]
            chromosome = l[4].split(":")[-1]
            length = int(l[3].split(":")[-1])
            start = int(l[5].split(":")[-1])
            end = start+length
            seg_type = chromosome.split("_")[0]
            
            # Write segment sequences to fasta
            fasta_fp.write(f">{seg_id}\n{seq}\n")

            # Try to align to ref if long enough
            if seg_type != ref_label and length >= min_len_align:
                seq = l[2]
                hits = minimap_db.map(seq=seq)
                try:
                    hit = next(hits)
                    soft_clip = hit.q_st+(length-hit.q_en)
                    identity = (hit.blen-hit.NM)/(hit.blen+soft_clip)
                    hit_chromosome, hit_start, hit_end = hit.ctg, hit.r_st, hit.r_en
                except StopIteration:
                    identity = 0
                    hit_chromosome = hit_start = hit_end = None
                finally:
                    del hits
                
            else:
                identity = None
                hit_chromosome = hit_start = hit_end = None
                
            seg_info_d[seg_id] = OrderedDict(
                chromosome = chromosome,
                start = start,
                end = end,
                length = length,
                type = seg_type,
                identity = identity,
                hit_chromosome = hit_chromosome,
                hit_start = hit_start,
                hit_end = hit_end,
                component = 0,
                connected = [])
        
        # Record all possible connections between segments
        if l[0] == "L":
            
            # Sort and flip segments if 
            s1, s2, s1_orient, s2_orient = sort_seg (s1=l[1], s2=l[3], s1_orient=l[2], s2_orient=l[4]) 
            s1_info=seg_info_d[s1]
            s2_info=seg_info_d[s2]
            
            # Characterise junctions type
            link_info = get_link_info(s1_orient=s1_orient, s2_orient=s2_orient, s1_info=s1_info, s2_info=s2_info)
            link_id = "{}{}_{}{}".format(s1, s1_orient, s2, s2_orient)
            link_info_d[link_id] = link_info
            
            # Update list of connected segments
            seg_info_d[s2]["connected"].append(s1)
            seg_info_d[s1]["connected"].append(s2)
            
            # Determine component the pair belongs to
            s1_comp = s1_info["component"]
            s2_comp = s2_info["component"]
            if not s1_comp and not s2_comp:
                max_comp+=1
                seg_info_d[s1]["component"] = max_comp
                seg_info_d[s2]["component"] = max_comp
            elif not s1_comp:
                seg_info_d[s1]["component"] = s2_comp
            elif not s2_comp:
                seg_info_d[s2]["component"] = s1_comp
            else:
                assert s1_comp == s2_comp

del minimap_db                

# Find connected reference coordinate 
stdout_print("Traverse graph to find reference anchor points\n")
for seg_id, seg_info in tqdm(seg_info_d.items()):
    if seg_info["type"] != ref_label:
        coord = find_connected_ref(seg_id=seg_id, seg_info_d=seg_info_d, reference_type=ref_label)
        seg_info_d[seg_id]["connected_chromosome"] = coord.chromosome
        seg_info_d[seg_id]["connected_start"] = coord.start
        seg_info_d[seg_id]["connected_end"] = coord.end
        seg_info_d[seg_id]["connected_seg_start"] = coord.start_seg_id
        seg_info_d[seg_id]["connected_seg_end"] = coord.end_seg_id

seg_info_df = pd.DataFrame.from_dict(seg_info_d, orient="index")
seg_info_df.index.name = "seg_id"
seg_info_df.to_pickle(seg_info_out_fn)
display(seg_info_df.head())

link_info_df = pd.DataFrame.from_dict(link_info_d, orient="index")
link_info_df.index.name = "link_id"
link_info_df.to_pickle(link_info_out_fn)
display(link_info_df.head())
Make minimap db
Annotate segments and links
2632099it [1:06:48, 656.59it/s]   
Traverse graph to find reference anchor points
100%|██████████| 1109492/1109492 [00:12<00:00, 87690.43it/s] 
chromosome start end length type identity hit_chromosome hit_start hit_end component connected connected_chromosome connected_start connected_end connected_seg_start connected_seg_end
seg_id
s1 HDRR_1 0 37816 37816 HDRR NaN None NaN NaN 1 [s2, s1044544] NaN NaN NaN NaN NaN
s2 HDRR_1 37816 37928 112 HDRR NaN None NaN NaN 1 [s1, s3, s999575] NaN NaN NaN NaN NaN
s3 HDRR_1 37928 40172 2244 HDRR NaN None NaN NaN 1 [s2, s4] NaN NaN NaN NaN NaN
s4 HDRR_1 40172 41595 1423 HDRR NaN None NaN NaN 1 [s3, s5, s999575] NaN NaN NaN NaN NaN
s5 HDRR_1 41595 50707 9112 HDRR NaN None NaN NaN 1 [s4, s6, s1044544, s999574] NaN NaN NaN NaN NaN
s1_chromosome s1_pos s2_chromosome s2_pos s2_type connect_type connect_gap SV_type
link_id
s1+_s2+ HDRR_1 37816 HDRR_1 37816 HDRR intra_HDRR 0.0 continuous
s2+_s3+ HDRR_1 37928 HDRR_1 37928 HDRR intra_HDRR 0.0 continuous
s3+_s4+ HDRR_1 40172 HDRR_1 40172 HDRR intra_HDRR 0.0 continuous
s4+_s5+ HDRR_1 41595 HDRR_1 41595 HDRR intra_HDRR 0.0 continuous
s4-_s999575+ HDRR_1 40172 MIKK_117-2_C4_959 161190 MIKK HDRR_MIKK NaN insertion

Extract stats from segments

In [303]:
from pandas.plotting import table
from scipy.ndimage import gaussian_filter1d

def compute_N50 (data):
    data = data.dropna().values
    data.sort()
    half_sum = data.sum()/2
    cum_sum = 0
    for v in data:
        cum_sum += v
        if cum_sum >= half_sum:
            return int(v)

graph_info_fn="./graph_assembly/all_ref_graph_seg_info.pkl"
outdir="./graph_assembly/"

seg_type_info = defaultdict(OrderedDict)
seg_type_identity_list = OrderedDict()
seg_type_len_list = OrderedDict()

df = pd.read_pickle(graph_info_fn)
df = df[["length","type","identity"]]

# Aggregate unanchore contigs
df["type"] = df["type"].str.rstrip("+")

for seg_type, sdf in df.groupby("type"):
    seg_type_info[seg_type]["Total length (bp)"] = sdf["length"].sum()
    seg_type_info[seg_type]["Segments number"] = len(sdf)
    seg_type_info[seg_type]["Median segment length"] = sdf["length"].dropna().median()
    seg_type_info[seg_type]["longest segment"] = sdf["length"].dropna().max()
    seg_type_info[seg_type]["N50"] = compute_N50 (sdf["length"])
    seg_type_info[seg_type]["Median identity"] = sdf["identity"].dropna().median()

    seg_type_len_list[seg_type] = sdf["length"]
    seg_type_identity_list[seg_type] = sdf["identity"]

# Add all types together
seg_type_info["All"]["Total length (bp)"] = df["length"].sum()
seg_type_info["All"]["Segments number"] = len(df)
seg_type_info["All"]["Median segment length"] = df["length"].dropna().median()
seg_type_info["All"]["longest segment"] = df["length"].dropna().max()
seg_type_info["All"]["N50"] = compute_N50 (df["length"])
seg_type_info["All"]["Median identity"] = df["identity"].dropna().median()

# seg_type_len_list["All"] = df["length"]
# seg_type_identity_list["All"] = df["identity"]

seg_type_info_df = pd.DataFrame.from_dict(seg_type_info, orient="index")
out_fn = os.path.join(outdir, "segment_type_stats.tsv")
seg_type_info_df.to_csv(out_fn, sep="\t")
display(seg_type_info_df)

# Process data for plot
lower_pow = 1
upper_pow = 5
bins = list(np.logspace(lower_pow,upper_pow,100))
count_d=OrderedDict()
count_d["length bin"] = bins
for sample_id, sample_len_list in seg_type_len_list.items():
    count_d[sample_id] = np.histogram((sample_len_list), bins=bins+[pow(10,upper_pow+1)])[0]
seg_df= pd.DataFrame.from_dict(count_d)
seg_df = seg_df.set_index("length bin")
# normalise
seg_df = seg_df/seg_df.sum(axis=0)
for col in seg_df.columns:
    seg_df[col] = gaussian_filter1d (seg_df[col], sigma=2)
    
# Process data for plot
bins = list(np.linspace(0,1,100))
count_d=OrderedDict()
count_d["identity bin"] = bins
for sample_id, sample_len_list in seg_type_identity_list.items():
    count_d[sample_id] = np.histogram((sample_len_list), bins=bins+[1.1])[0]
id_df= pd.DataFrame.from_dict(count_d)
id_df = id_df.set_index("identity bin")
# normalise
id_df = id_df/id_df.sum(axis=0)
for col in id_df.columns:
    id_df[col] = gaussian_filter1d (id_df[col], sigma=1)   

with pl.style.context('ggplot'):
    fig, (ax1, ax2, ax3, ax4) = pl.subplots(4, 1, figsize=(10,20))

    seg_type_info_df["Total length (bp)"].plot.bar(ax=ax1, color="tomato", legend=False)
    table(ax1, seg_type_info_df["Total length (bp)"], colWidths=[0.10], colLoc="center", cellLoc="center", fontsize=14, bbox=(1.15,0,0.2,0.7))
    ax1.set_title('Total length of segment per type')
    ax1.set_ylabel("Total length")
    #ax1.set_xticklabels(ax1.get_xticklabels(), rotation=45, ha='right')

    seg_type_info_df["Segments number"].plot.bar(ax=ax2, color="steelblue", legend=False)
    table(ax2, seg_type_info_df["Segments number"], colWidths=[0.10], colLoc="center", cellLoc="center", fontsize=14, bbox=(1.15,0,0.2,0.7))
    ax2.set_title('Number of segments per type')
    ax2.set_ylabel("Number of segments")
    #ax2.set_xticklabels(ax2.get_xticklabels(), rotation=45, ha='right')

    # Colors for types and All
    colors = sns.color_palette("tab10", 4)

    seg_df.plot(ax=ax3, color=colors)
    ax3.set_xscale("log")
    #ax3.set_yscale("log")
    ax3.set_xlabel("Length (log scale)")
    ax3.set_ylabel("Density")
    ax3.set_title("Segment lengths distribution")
    #ax3.set_xticks(np.logspace(lower_pow, upper_pow,upper_pow+1))
    ax3.set_xlim(pow(10,lower_pow),pow(10,upper_pow))
    ax3.set_ylim(0,None)

    id_df.plot(ax=ax4, color=colors)
    ax4.set_xlabel("Identity")
    ax4.set_ylabel("Density")
    ax4.set_title("Identity distribution")
    ax4.set_xlim(0,1)
    ax4.set_ylim(0,None)
    
    fig.tight_layout()
    out_fn = os.path.join(outdir, "segment_type_stats.svg")
    fig.savefig(out_fn)
    
Total length (bp) Segments number Median segment length longest segment N50 Median identity
HDRR 734100826 648692 401.0 675459 3000 0.960224
HNI 103507879 148689 239.0 175667 2003 0.629009
HSOK 152043332 100275 371.0 236527 5803 0.604474
MIKK 318174656 211836 559.0 89792 3998 0.736830
All 1307826693 1109492 389.0 675459 3342 0.683020
In [39]:
from scipy.ndimage import gaussian_filter1d

link_info_fn = "./graph_assembly/all_ref_graph_link_info.pkl"
df = pd.read_pickle(link_info_fn)
df = df[df["connect_type"].isin(["HDRR_MIKK", "HDRR_HNI", "HDRR_HSOK", "HNI_HSOK", "HNI_MIKK", "HSOK_MIKK", "intra_HDRR", "intra_MIKK", "intra_HNI", "intra_HSOK", "inter_HDRR", "inter_MIKK", "inter_HNI", "inter_HSOK"])]
display(df.head())

with pl.style.context('ggplot'):

    fig, (ax1, ax2, ax3) = pl.subplots(1, 3, figsize=(30 ,7))

    ins_df = df[df["SV_type"]=="insertion"]
    s = ins_df["connect_type"].value_counts()
    s.plot.bar(ax=ax1)
    ax1.set_title('Insertion/alternative path supporting junctions')

    del_df = df[df["SV_type"]=="deletion"]
    s = del_df["connect_type"].value_counts()
    s.plot.bar(ax=ax2)
    ax2.set_title('Deletion supporting junctions')

    hdrr_del = df[df["connect_type"]=="intra_HDRR"]
    
    # Process data for plot
    x = list(np.logspace(2,5,100))
    y = np.histogram((hdrr_del["connect_gap"]), bins=x+[10e6])[0]
    y = gaussian_filter1d (y, sigma=2)
    ax3.fill_betweenx(y=y, x1=x)
    ax3.set_xlim(1e2,1e5)
    ax3.set_ylim(0, None)
    ax3.set_xscale('log')
    ax3.set_title('Deletion length relative to HDRR reference')
    
    fig.tight_layout()
s1_chromosome s1_pos s2_chromosome s2_pos s2_type connect_type connect_gap SV_type
link_id
s1+_s2+ HDRR_1 37816 HDRR_1 37816 HDRR intra_HDRR 0.0 continuous
s2+_s3+ HDRR_1 37928 HDRR_1 37928 HDRR intra_HDRR 0.0 continuous
s3+_s4+ HDRR_1 40172 HDRR_1 40172 HDRR intra_HDRR 0.0 continuous
s4+_s5+ HDRR_1 41595 HDRR_1 41595 HDRR intra_HDRR 0.0 continuous
s4-_s999575+ HDRR_1 40172 MIKK_117-2_C4_959 161190 MIKK HDRR_MIKK NaN insertion

Identity stats

In [335]:
graph_info_fn = "./graph_assembly/all_ref_graph_seg_info.pkl"
svg_out_fn =  "./graph_assembly/identity_len_stats.svg"
min_long = 2000
min_divergent = 0.5

graph_info_df = pd.read_pickle(graph_info_fn)
graph_info_df = graph_info_df[["type", "length", "identity"]]
graph_info_df["type"] = graph_info_df["type"].str.rstrip("+")

graph_info_df = graph_info_df.dropna ()
#graph_info_df = graph_info_df.sample(20000) ###############################

graph_info_df["log10(length)"] = np.log10(graph_info_df["length"])
graph_info_df["divergence"] = 1-graph_info_df["identity"] 

d = defaultdict(Counter)
for line in graph_info_df.itertuples():
#     if line.divergence == 1:
#         d[line.type]["Fully divergent"] += 1
    if line.divergence >= min_divergent and line.length >= min_long:
        d[line.type]["Long high divergence"] += 1
    elif line.divergence >= min_divergent and line.length < min_long:
        d[line.type]["Short high divergence"] += 1
    elif line.divergence < min_divergent and line.length >= min_long:
        d[line.type]["Long low divergence"] += 1
    elif line.divergence < min_divergent and line.length < min_long:
        d[line.type]["Short low divergence"] += 1

print("Counts by category")
df = pd.DataFrame(d)
df = df.fillna(0)
display(df)

print("% by category")
df = df/df.sum(axis=0)*100
display(df)

with pl.style.context('ggplot'):
    cmap = sns.cubehelix_palette(start=0.9, rot=0.1, dark=0.01, light=0.99, hue=2.75, gamma=0.75, as_cmap=True)
    g = sns.FacetGrid(graph_info_df, col="type", col_wrap=2, height=5, aspect=1.25)
    g = g.map(sns.kdeplot, "log10(length)", "divergence", shade=True, shade_lowest=False, cmap=cmap, n_levels=20)
    
    for ax in g.axes:
        ax.set_ylim(0, 1)
        ax.set_xlim(2, 4.5)
        
    g.savefig(svg_out_fn)
Counts by category
HDRR HNI HSOK MIKK
Short low divergence 45.0 44305 35018 81069
Long low divergence 3.0 6935 8179 39013
Short high divergence 8.0 26900 14084 24517
Long high divergence 0.0 4184 9239 10457
% by category
HDRR HNI HSOK MIKK
Short low divergence 80.357143 53.817842 52.642814 52.283691
Long low divergence 5.357143 8.424032 12.295550 25.160587
Short high divergence 14.285714 32.675769 21.172580 15.811707
Long high divergence 0.000000 5.082358 13.889056 6.744015

Get component stats

In [22]:
 def segment_type_stats (graph_info_fn, canonical_fa_fn, outdir):
    
    graph_info_df = pd.read_pickle(graph_info_fn)

    # Group segments by component
    components_dict = defaultdict(Component)
    for seg_id, seg_info in tqdm(zip(graph_info_df.index, graph_info_df.itertuples()), total=len(graph_info_df)):
        components_dict[seg_info.component].add(seg_id=seg_id, seg_info=seg_info)
    
    canonical_len_d = OrderedDict()
    with pyfaidx.Fasta(canonical_fa_fn) as fa:
        for seq in fa:
            canonical_len_d[seq.name] = len(seq)
    
    comp_len_d = OrderedDict()
    comp_ratio_d = OrderedDict()
    comp_num_d = OrderedDict()
    for i, j in components_dict.items():
        chrom = j.main_chromosome
        comp_len_d[chrom] = {"Canonical chromosomes":canonical_len_d[chrom], "Graph segments":j.length}
        comp_ratio_d[chrom] = j.length/canonical_len_d[chrom]
        comp_num_d[chrom]=len(j.segments)
        
    comp_len_df = pd.DataFrame.from_dict(comp_len_d, orient="index")
    comp_ratio_df = pd.DataFrame.from_dict(comp_ratio_d, orient="index", columns=["Ratio canonical/graph"])
    comp_num_df = pd.DataFrame.from_dict(comp_num_d, orient="index", columns=["Number of segments"])
    
    with pl.style.context('ggplot'):
        fig, (ax1, ax2, ax3) = pl.subplots(3, 1, figsize=(20,15))

        comp_len_df.plot.bar(ax=ax1, color=("steelblue","tomato"), legend=True)
        table(ax1, comp_len_df, colWidths=[0.10]*2, colLoc="center", cellLoc="center", fontsize=14, bbox=(1.15,0,0.2,1))
        ax1.set_title('Length of canonical chromosomes')
        ax1.set_ylabel("Length")
        ax1.set_xticklabels(ax1.get_xticklabels(), rotation=45, ha='right')
        
        comp_ratio_df.plot.bar(ax=ax2, color="tomato", legend=False)
        table(ax2, comp_ratio_df, colWidths=[0.10], colLoc="center", cellLoc="center", fontsize=14, bbox=(1.15,0,0.2,1))
        ax2.set_title('Ratio of Graph segment length by canonical chromosome length')
        ax2.set_ylabel("Length ratio")
        ax2.set_xticklabels(ax2.get_xticklabels(), rotation=45, ha='right')

        comp_num_df.plot.bar(ax=ax3, color="steelblue", legend=False)
        table(ax3, comp_num_df, colWidths=[0.10], colLoc="center", cellLoc="center", fontsize=14, bbox=(1.15,0,0.2,1))
        ax3.set_title('Number of graph segments per chromosome')
        ax3.set_ylabel("Number of segments")
        ax3.set_xticklabels(ax3.get_xticklabels(), rotation=45, ha='right')
        
        fig.tight_layout()
        out_fn = os.path.join(outdir, "component_stats.svg")
        fig.savefig(out_fn)
        
# Helper class to store component information
class Component():

    def __init__ (self):
        self.segments = list()
        self.chromosomes = Counter()
        self.length = 0
    
    def add (self, seg_id, seg_info):
        self.segments.append(seg_id)
        self.chromosomes[seg_info.chromosome]+=1
        self.length += seg_info.length
    
    def __repr__ (self):
        return f"Segments: {len(self.segments)}, chromomomes: {len(self.chromosomes)}, Length: {self.length}, Main chromosome: {self.main_chromosome}"
    
    @property
    def main_chromosome(self):
        return self.chromosomes.most_common(1)[0][0]
In [24]:
segment_type_stats("./graph_assembly/all_ref_graph_seg_info.pkl", "./references/Oryzias_latipes_HDRR_clean.fa", outdir="./graph_assembly/")
100%|██████████| 1109492/1109492 [00:07<00:00, 149790.83it/s]

Make graph info file for Bandage

In [31]:
graph_info_fn = "./graph_assembly/all_ref_graph_info.pkl"
bandage_info_type = "./graph_assembly/all_ref_graph_type_bandage.csv"
bandage_info_identity = "./graph_assembly/all_ref_graph_identity_bandage.csv"
ref_type = "HDRR"

graph_info_df = pd.read_pickle(graph_info_fn) 
graph_info_df.index.name="Name"
graph_info_df = graph_info_df.drop(columns=["start", "end", "hit_start", "hit_end", "component", "connected_forward", "connected_reverse"])

#Define colormap
type_list = ['HDRR', 'HDRR+', 'HNI', 'HNI+', 'HSOK', 'HSOK+', 'MIKK']
type_color_dict = OrderedDict()
for col, lab in zip(sns.color_palette('tab20', len(type_list)), type_list):
    type_color_dict[lab] = rgb2hex(col)

#Define colormap
no_data_color = "#535353"
identity_thresholds = [0, 0.25, 0.5, 0.75, 0.8, 0.85, 0.9, 0.95, 1]
identity_color_dict = OrderedDict()
for col, lab in zip(sns.color_palette('afmhot', len(identity_thresholds)), identity_thresholds):
    identity_color_dict[lab] = rgb2hex(col)
    
type_color_list = []
identity_color_list = []

for line in tqdm(graph_info_df.itertuples(), total=len(graph_info_df)):
    type_color_list.append(type_color_dict[line.type])
    if pd.isna(line.identity):
          identity_color_list.append(no_data_color)
    else:
        for threshold, color in identity_color_dict.items():
            if line.identity<=threshold:
                identity_color_list.append(color)
                break

graph_info_type_df = graph_info_df.copy()
graph_info_type_df["Color"] = type_color_list
graph_info_type_df.to_csv(bandage_info_type, sep=",")
display(graph_info_type_df.sample(5))

graph_info_identity_df = graph_info_df.copy()
graph_info_identity_df["Color"] = identity_color_list
graph_info_identity_df.to_csv(bandage_info_identity, sep=",")
display(graph_info_identity_df.sample(5))
100%|██████████| 1109492/1109492 [00:02<00:00, 448054.18it/s]
chromosome length type identity hit_chromosome Color
Name
s1059479 MIKK_79-2_G3_275 201 MIKK NaN None #d62728
s275553 HDRR_10 1660 HDRR NaN None #1f77b4
s947382 MIKK_69-1_F3_989 1132 MIKK 0.936752 HDRR_13 #d62728
s76794 HDRR_3 255 HDRR NaN None #1f77b4
s105291 HDRR_4 29 HDRR NaN None #1f77b4
chromosome length type identity hit_chromosome Color
Name
s38779 HDRR_2 118 HDRR NaN None #535353
s203698 HDRR_7 58 HDRR NaN None #535353
s416004 HDRR_15 1431 HDRR NaN None #535353
s830100 HSOK_8 3926 HSOK 0.89781 HDRR_9 #ffe667
s645664 HDRR_24 226 HDRR NaN None #535353
In [ ]:
 

Make all segment bed track for IGV

In [98]:
graph_info_fn = "./graph_assembly/all_ref_graph_info.pkl"
bed_out = "./graph_assembly/all_ref_graph_segments.bed"
bed_out_sorted = "./graph_assembly/all_ref_graph_segments_sorted.bed"

graph_info_df = pd.read_pickle(graph_info_fn) 
graph_info_df = graph_info_df.reset_index()
graph_info_df = graph_info_df[["chromosome", "start", "end", "seg_id"]]
graph_info_df.to_csv(bed_out, sep="\t", index=False, header=False)

print(f"Sort and index with igvtools")
bash (f"igvtools sort {bed_out} {bed_out_sorted}", conda="IGV")
bash (f"igvtools index {bed_out_sorted}", conda="IGV")

# Remove originals
os.remove(bed_out)
Sort and index with igvtools
Using system JDK.
WARN [2020-07-08T10:34:01,104]  [Globals.java:138] [main]  Development mode is enabled
Sorting ./graph_assembly2/all_ref_graph_segments.bed  -> ./graph_assembly2/all_ref_graph_segments_sorted.bed
Done
Using system JDK.
WARN [2020-07-08T10:35:15,988]  [Globals.java:138] [main]  Development mode is enabled
Done