Graph usage analysis

In [3]:
from datetime import date
from pycltools.pycltools import jprint

jprint('Adrien Leger / EMBL EBI', bold=True, size=150)
jprint('Starting date : 2020_06_04', bold=True, italic=True, size=125)
jprint('Last modification date : {}_{:02}_{:02}'.format(date.today().year, date.today().month, date.today().day), bold=True, italic=True, size=125)

Adrien Leger / EMBL EBI

Starting date : 2020_06_04

Last modification date : 2021_02_16

Imports

In [4]:
# Standard lib imports
import os
from datetime import date
from collections import *
from glob import glob, iglob
from shutil import rmtree
import itertools
from pprint import pprint as pp
from tqdm import tqdm
import csv
import re

# Generic third party imports
from pycltools.pycltools import *
import pysam
import pyfaidx
import pybedtools as pbt
import pyranges as pr
import mappy as mp
from IPython.display import SVG, display, Image
from pyBioTools.Annotation import Annotation 
from scipy.ndimage.filters import gaussian_filter1d

# Ploting lib imports
import matplotlib as mpl
import matplotlib.pyplot as pl
import seaborn as sns
%matplotlib inline

# Data wrangling lib imports
import pandas as pd
import numpy as np
import scipy as sp
from pandas.plotting import table
pd.options.display.max_colwidth = 1000
pd.options.display.max_columns = 200
pd.options.display.max_rows = 100
pd.options.display.min_rows = 100

Align to graph

In [6]:
outdir = "alignments"
shutil.rmtree(outdir, ignore_errors=True)
mkdir(outdir)
Creating /hps/research1/birney/users/adrien/indigene/analyses/indigene_nanopore_DNA/graph_genome_analysis/alignments

Align raw reads to graph

In [8]:
outdir = "alignments/raw_reads"
shutil.rmtree(outdir, ignore_errors=True)
mkdir(outdir)

threads = 20
mem=30000
dry=False
graph = "./graph_assembly/all_ref_graph.gfa"

for fasta in glob.glob("/hps/research1/birney/users/adrien/indigene/analyses/indigene_nanopore_DNA/brain_run2/DNA_analysis/results/input/merged_fastq/*.fastq"):
    sample_id = os.path.basename(fasta).split(".")[0]
    print (f"Aligning sample {sample_id}")
    
    # Normal mode
    gaf = f"{outdir}/{sample_id}_stable.gaf"
    stdout = f"{outdir}/{sample_id}_stable.out"
    stderr = f"{outdir}/{sample_id}_stable.err"
    bsub(f"minigraph -x lr -t {threads} {graph} {fasta} -o {gaf}", conda="minigraph", dry=dry, mem=mem, threads=threads, stdout_fp=stdout, stderr_fp=stderr)
    
    # Unstable ref mapping mode
    gaf = f"{outdir}/{sample_id}_unstable.gaf"
    stdout = f"{outdir}/{sample_id}_unstable.out"
    stderr = f"{outdir}/{sample_id}_unstable.err"
    bsub(f"minigraph -x lr --vc -t {threads} {graph} {fasta} -o {gaf}", conda="minigraph", dry=dry, mem=mem, threads=threads, stdout_fp=stdout, stderr_fp=stderr)
Creating /hps/research1/birney/users/adrien/indigene/analyses/indigene_nanopore_DNA/graph_genome_analysis/alignments/raw_reads
Aligning sample 134-1_H4
bsub -M 30000 -R 'rusage[mem=30000]' -n 20 -oo alignments/raw_reads/134-1_H4_stable.out -eo alignments/raw_reads/134-1_H4_stable.err  "minigraph -x lr -t 20 ./graph_assembly/all_ref_graph.gfa /hps/research1/birney/users/adrien/indigene/analyses/indigene_nanopore_DNA/brain_run2/DNA_analysis/results/input/merged_fastq/134-1_H4.fastq -o alignments/raw_reads/134-1_H4_stable.gaf"
Job <6043020> is submitted to default queue <research-rh74>.
bsub -M 30000 -R 'rusage[mem=30000]' -n 20 -oo alignments/raw_reads/134-1_H4_unstable.out -eo alignments/raw_reads/134-1_H4_unstable.err  "minigraph -x lr --vc -t 20 ./graph_assembly/all_ref_graph.gfa /hps/research1/birney/users/adrien/indigene/analyses/indigene_nanopore_DNA/brain_run2/DNA_analysis/results/input/merged_fastq/134-1_H4.fastq -o alignments/raw_reads/134-1_H4_unstable.gaf"
Job <6043026> is submitted to default queue <research-rh74>.
Aligning sample 7-2_F2
bsub -M 30000 -R 'rusage[mem=30000]' -n 20 -oo alignments/raw_reads/7-2_F2_stable.out -eo alignments/raw_reads/7-2_F2_stable.err  "minigraph -x lr -t 20 ./graph_assembly/all_ref_graph.gfa /hps/research1/birney/users/adrien/indigene/analyses/indigene_nanopore_DNA/brain_run2/DNA_analysis/results/input/merged_fastq/7-2_F2.fastq -o alignments/raw_reads/7-2_F2_stable.gaf"
Job <6043028> is submitted to default queue <research-rh74>.
bsub -M 30000 -R 'rusage[mem=30000]' -n 20 -oo alignments/raw_reads/7-2_F2_unstable.out -eo alignments/raw_reads/7-2_F2_unstable.err  "minigraph -x lr --vc -t 20 ./graph_assembly/all_ref_graph.gfa /hps/research1/birney/users/adrien/indigene/analyses/indigene_nanopore_DNA/brain_run2/DNA_analysis/results/input/merged_fastq/7-2_F2.fastq -o alignments/raw_reads/7-2_F2_unstable.gaf"
Job <6043032> is submitted to default queue <research-rh74>.
Aligning sample 69-1_F3
bsub -M 30000 -R 'rusage[mem=30000]' -n 20 -oo alignments/raw_reads/69-1_F3_stable.out -eo alignments/raw_reads/69-1_F3_stable.err  "minigraph -x lr -t 20 ./graph_assembly/all_ref_graph.gfa /hps/research1/birney/users/adrien/indigene/analyses/indigene_nanopore_DNA/brain_run2/DNA_analysis/results/input/merged_fastq/69-1_F3.fastq -o alignments/raw_reads/69-1_F3_stable.gaf"
Job <6043035> is submitted to default queue <research-rh74>.
bsub -M 30000 -R 'rusage[mem=30000]' -n 20 -oo alignments/raw_reads/69-1_F3_unstable.out -eo alignments/raw_reads/69-1_F3_unstable.err  "minigraph -x lr --vc -t 20 ./graph_assembly/all_ref_graph.gfa /hps/research1/birney/users/adrien/indigene/analyses/indigene_nanopore_DNA/brain_run2/DNA_analysis/results/input/merged_fastq/69-1_F3.fastq -o alignments/raw_reads/69-1_F3_unstable.gaf"
Job <6043038> is submitted to default queue <research-rh74>.
Aligning sample 117-2_C4
bsub -M 30000 -R 'rusage[mem=30000]' -n 20 -oo alignments/raw_reads/117-2_C4_stable.out -eo alignments/raw_reads/117-2_C4_stable.err  "minigraph -x lr -t 20 ./graph_assembly/all_ref_graph.gfa /hps/research1/birney/users/adrien/indigene/analyses/indigene_nanopore_DNA/brain_run2/DNA_analysis/results/input/merged_fastq/117-2_C4.fastq -o alignments/raw_reads/117-2_C4_stable.gaf"
Job <6043041> is submitted to default queue <research-rh74>.
bsub -M 30000 -R 'rusage[mem=30000]' -n 20 -oo alignments/raw_reads/117-2_C4_unstable.out -eo alignments/raw_reads/117-2_C4_unstable.err  "minigraph -x lr --vc -t 20 ./graph_assembly/all_ref_graph.gfa /hps/research1/birney/users/adrien/indigene/analyses/indigene_nanopore_DNA/brain_run2/DNA_analysis/results/input/merged_fastq/117-2_C4.fastq -o alignments/raw_reads/117-2_C4_unstable.gaf"
Job <6043049> is submitted to default queue <research-rh74>.
Aligning sample 80-1_H3
bsub -M 30000 -R 'rusage[mem=30000]' -n 20 -oo alignments/raw_reads/80-1_H3_stable.out -eo alignments/raw_reads/80-1_H3_stable.err  "minigraph -x lr -t 20 ./graph_assembly/all_ref_graph.gfa /hps/research1/birney/users/adrien/indigene/analyses/indigene_nanopore_DNA/brain_run2/DNA_analysis/results/input/merged_fastq/80-1_H3.fastq -o alignments/raw_reads/80-1_H3_stable.gaf"
Job <6043055> is submitted to default queue <research-rh74>.
bsub -M 30000 -R 'rusage[mem=30000]' -n 20 -oo alignments/raw_reads/80-1_H3_unstable.out -eo alignments/raw_reads/80-1_H3_unstable.err  "minigraph -x lr --vc -t 20 ./graph_assembly/all_ref_graph.gfa /hps/research1/birney/users/adrien/indigene/analyses/indigene_nanopore_DNA/brain_run2/DNA_analysis/results/input/merged_fastq/80-1_H3.fastq -o alignments/raw_reads/80-1_H3_unstable.gaf"
Job <6043060> is submitted to default queue <research-rh74>.
Aligning sample 11-1_A3
bsub -M 30000 -R 'rusage[mem=30000]' -n 20 -oo alignments/raw_reads/11-1_A3_stable.out -eo alignments/raw_reads/11-1_A3_stable.err  "minigraph -x lr -t 20 ./graph_assembly/all_ref_graph.gfa /hps/research1/birney/users/adrien/indigene/analyses/indigene_nanopore_DNA/brain_run2/DNA_analysis/results/input/merged_fastq/11-1_A3.fastq -o alignments/raw_reads/11-1_A3_stable.gaf"
Job <6043063> is submitted to default queue <research-rh74>.
bsub -M 30000 -R 'rusage[mem=30000]' -n 20 -oo alignments/raw_reads/11-1_A3_unstable.out -eo alignments/raw_reads/11-1_A3_unstable.err  "minigraph -x lr --vc -t 20 ./graph_assembly/all_ref_graph.gfa /hps/research1/birney/users/adrien/indigene/analyses/indigene_nanopore_DNA/brain_run2/DNA_analysis/results/input/merged_fastq/11-1_A3.fastq -o alignments/raw_reads/11-1_A3_unstable.gaf"
Job <6043072> is submitted to default queue <research-rh74>.
Aligning sample 4-2_C2
bsub -M 30000 -R 'rusage[mem=30000]' -n 20 -oo alignments/raw_reads/4-2_C2_stable.out -eo alignments/raw_reads/4-2_C2_stable.err  "minigraph -x lr -t 20 ./graph_assembly/all_ref_graph.gfa /hps/research1/birney/users/adrien/indigene/analyses/indigene_nanopore_DNA/brain_run2/DNA_analysis/results/input/merged_fastq/4-2_C2.fastq -o alignments/raw_reads/4-2_C2_stable.gaf"
Job <6043103> is submitted to default queue <research-rh74>.
bsub -M 30000 -R 'rusage[mem=30000]' -n 20 -oo alignments/raw_reads/4-2_C2_unstable.out -eo alignments/raw_reads/4-2_C2_unstable.err  "minigraph -x lr --vc -t 20 ./graph_assembly/all_ref_graph.gfa /hps/research1/birney/users/adrien/indigene/analyses/indigene_nanopore_DNA/brain_run2/DNA_analysis/results/input/merged_fastq/4-2_C2.fastq -o alignments/raw_reads/4-2_C2_unstable.gaf"
Job <6043125> is submitted to default queue <research-rh74>.
Aligning sample 79-2_G3
bsub -M 30000 -R 'rusage[mem=30000]' -n 20 -oo alignments/raw_reads/79-2_G3_stable.out -eo alignments/raw_reads/79-2_G3_stable.err  "minigraph -x lr -t 20 ./graph_assembly/all_ref_graph.gfa /hps/research1/birney/users/adrien/indigene/analyses/indigene_nanopore_DNA/brain_run2/DNA_analysis/results/input/merged_fastq/79-2_G3.fastq -o alignments/raw_reads/79-2_G3_stable.gaf"
Job <6043153> is submitted to default queue <research-rh74>.
bsub -M 30000 -R 'rusage[mem=30000]' -n 20 -oo alignments/raw_reads/79-2_G3_unstable.out -eo alignments/raw_reads/79-2_G3_unstable.err  "minigraph -x lr --vc -t 20 ./graph_assembly/all_ref_graph.gfa /hps/research1/birney/users/adrien/indigene/analyses/indigene_nanopore_DNA/brain_run2/DNA_analysis/results/input/merged_fastq/79-2_G3.fastq -o alignments/raw_reads/79-2_G3_unstable.gaf"
Job <6043177> is submitted to default queue <research-rh74>.
Aligning sample 131-1_F4
bsub -M 30000 -R 'rusage[mem=30000]' -n 20 -oo alignments/raw_reads/131-1_F4_stable.out -eo alignments/raw_reads/131-1_F4_stable.err  "minigraph -x lr -t 20 ./graph_assembly/all_ref_graph.gfa /hps/research1/birney/users/adrien/indigene/analyses/indigene_nanopore_DNA/brain_run2/DNA_analysis/results/input/merged_fastq/131-1_F4.fastq -o alignments/raw_reads/131-1_F4_stable.gaf"
Job <6043208> is submitted to default queue <research-rh74>.
bsub -M 30000 -R 'rusage[mem=30000]' -n 20 -oo alignments/raw_reads/131-1_F4_unstable.out -eo alignments/raw_reads/131-1_F4_unstable.err  "minigraph -x lr --vc -t 20 ./graph_assembly/all_ref_graph.gfa /hps/research1/birney/users/adrien/indigene/analyses/indigene_nanopore_DNA/brain_run2/DNA_analysis/results/input/merged_fastq/131-1_F4.fastq -o alignments/raw_reads/131-1_F4_unstable.gaf"
Job <6043234> is submitted to default queue <research-rh74>.
Aligning sample 4-1_B2
bsub -M 30000 -R 'rusage[mem=30000]' -n 20 -oo alignments/raw_reads/4-1_B2_stable.out -eo alignments/raw_reads/4-1_B2_stable.err  "minigraph -x lr -t 20 ./graph_assembly/all_ref_graph.gfa /hps/research1/birney/users/adrien/indigene/analyses/indigene_nanopore_DNA/brain_run2/DNA_analysis/results/input/merged_fastq/4-1_B2.fastq -o alignments/raw_reads/4-1_B2_stable.gaf"
Job <6043262> is submitted to default queue <research-rh74>.
bsub -M 30000 -R 'rusage[mem=30000]' -n 20 -oo alignments/raw_reads/4-1_B2_unstable.out -eo alignments/raw_reads/4-1_B2_unstable.err  "minigraph -x lr --vc -t 20 ./graph_assembly/all_ref_graph.gfa /hps/research1/birney/users/adrien/indigene/analyses/indigene_nanopore_DNA/brain_run2/DNA_analysis/results/input/merged_fastq/4-1_B2.fastq -o alignments/raw_reads/4-1_B2_unstable.gaf"
Job <6043286> is submitted to default queue <research-rh74>.
Aligning sample 7-1_E2
bsub -M 30000 -R 'rusage[mem=30000]' -n 20 -oo alignments/raw_reads/7-1_E2_stable.out -eo alignments/raw_reads/7-1_E2_stable.err  "minigraph -x lr -t 20 ./graph_assembly/all_ref_graph.gfa /hps/research1/birney/users/adrien/indigene/analyses/indigene_nanopore_DNA/brain_run2/DNA_analysis/results/input/merged_fastq/7-1_E2.fastq -o alignments/raw_reads/7-1_E2_stable.gaf"
Job <6043302> is submitted to default queue <research-rh74>.
bsub -M 30000 -R 'rusage[mem=30000]' -n 20 -oo alignments/raw_reads/7-1_E2_unstable.out -eo alignments/raw_reads/7-1_E2_unstable.err  "minigraph -x lr --vc -t 20 ./graph_assembly/all_ref_graph.gfa /hps/research1/birney/users/adrien/indigene/analyses/indigene_nanopore_DNA/brain_run2/DNA_analysis/results/input/merged_fastq/7-1_E2.fastq -o alignments/raw_reads/7-1_E2_unstable.gaf"
Job <6043335> is submitted to default queue <research-rh74>.
Aligning sample 134-2_A5
bsub -M 30000 -R 'rusage[mem=30000]' -n 20 -oo alignments/raw_reads/134-2_A5_stable.out -eo alignments/raw_reads/134-2_A5_stable.err  "minigraph -x lr -t 20 ./graph_assembly/all_ref_graph.gfa /hps/research1/birney/users/adrien/indigene/analyses/indigene_nanopore_DNA/brain_run2/DNA_analysis/results/input/merged_fastq/134-2_A5.fastq -o alignments/raw_reads/134-2_A5_stable.gaf"
Job <6043351> is submitted to default queue <research-rh74>.
bsub -M 30000 -R 'rusage[mem=30000]' -n 20 -oo alignments/raw_reads/134-2_A5_unstable.out -eo alignments/raw_reads/134-2_A5_unstable.err  "minigraph -x lr --vc -t 20 ./graph_assembly/all_ref_graph.gfa /hps/research1/birney/users/adrien/indigene/analyses/indigene_nanopore_DNA/brain_run2/DNA_analysis/results/input/merged_fastq/134-2_A5.fastq -o alignments/raw_reads/134-2_A5_unstable.gaf"
Job <6043352> is submitted to default queue <research-rh74>.

Align individual assembly to graph

In [9]:
outdir = "alignments/assemblies"
shutil.rmtree(outdir, ignore_errors=True)
mkdir(outdir)

threads = 20
mem = 30000
dry = False
graph = "./graph_assembly/all_ref_graph.gfa"

for fasta in glob.glob("./individual_assemblies/*_clean.fa"):
    sample_id = os.path.basename(fasta).split(".")[0].rstrip("_clean")
    print (f"Aligning sample {sample_id}")
    
    # Normal mode
    gaf = f"{outdir}/{sample_id}_stable.gaf"
    stdout = f"{outdir}/{sample_id}_stable.out"
    stderr = f"{outdir}/{sample_id}_stable.err"
    bsub(f"minigraph -x asm -t 10 {graph} {fasta} -o {gaf}", conda="minigraph", dry=dry, mem=mem, threads=threads, stdout_fp=stdout, stderr_fp=stderr)
    
    # Unstable ref mapping mode
    gaf = f"{outdir}/{sample_id}_unstable.gaf"
    stdout = f"{outdir}/{sample_id}_unstable.out"
    stderr = f"{outdir}/{sample_id}_unstable.err"
    bsub(f"minigraph -x asm --vc -t 10 {graph} {fasta} -o {gaf}", conda="minigraph", dry=dry, mem=mem, threads=threads, stdout_fp=stdout, stderr_fp=stderr)
Creating /hps/research1/birney/users/adrien/indigene/analyses/indigene_nanopore_DNA/graph_genome_analysis/alignments/assemblies
Aligning sample 69-1_F3
bsub -M 30000 -R 'rusage[mem=30000]' -n 20 -oo alignments/assemblies/69-1_F3_stable.out -eo alignments/assemblies/69-1_F3_stable.err  "minigraph -x asm -t 10 ./graph_assembly/all_ref_graph.gfa ./individual_assemblies/69-1_F3_clean.fa -o alignments/assemblies/69-1_F3_stable.gaf"
Job <6043435> is submitted to default queue <research-rh74>.
bsub -M 30000 -R 'rusage[mem=30000]' -n 20 -oo alignments/assemblies/69-1_F3_unstable.out -eo alignments/assemblies/69-1_F3_unstable.err  "minigraph -x asm --vc -t 10 ./graph_assembly/all_ref_graph.gfa ./individual_assemblies/69-1_F3_clean.fa -o alignments/assemblies/69-1_F3_unstable.gaf"
Job <6043436> is submitted to default queue <research-rh74>.
Aligning sample 131-1_F4
bsub -M 30000 -R 'rusage[mem=30000]' -n 20 -oo alignments/assemblies/131-1_F4_stable.out -eo alignments/assemblies/131-1_F4_stable.err  "minigraph -x asm -t 10 ./graph_assembly/all_ref_graph.gfa ./individual_assemblies/131-1_F4_clean.fa -o alignments/assemblies/131-1_F4_stable.gaf"
Job <6043440> is submitted to default queue <research-rh74>.
bsub -M 30000 -R 'rusage[mem=30000]' -n 20 -oo alignments/assemblies/131-1_F4_unstable.out -eo alignments/assemblies/131-1_F4_unstable.err  "minigraph -x asm --vc -t 10 ./graph_assembly/all_ref_graph.gfa ./individual_assemblies/131-1_F4_clean.fa -o alignments/assemblies/131-1_F4_unstable.gaf"
Job <6043441> is submitted to default queue <research-rh74>.
Aligning sample 117-2_C4
bsub -M 30000 -R 'rusage[mem=30000]' -n 20 -oo alignments/assemblies/117-2_C4_stable.out -eo alignments/assemblies/117-2_C4_stable.err  "minigraph -x asm -t 10 ./graph_assembly/all_ref_graph.gfa ./individual_assemblies/117-2_C4_clean.fa -o alignments/assemblies/117-2_C4_stable.gaf"
Job <6043443> is submitted to default queue <research-rh74>.
bsub -M 30000 -R 'rusage[mem=30000]' -n 20 -oo alignments/assemblies/117-2_C4_unstable.out -eo alignments/assemblies/117-2_C4_unstable.err  "minigraph -x asm --vc -t 10 ./graph_assembly/all_ref_graph.gfa ./individual_assemblies/117-2_C4_clean.fa -o alignments/assemblies/117-2_C4_unstable.gaf"
Job <6043445> is submitted to default queue <research-rh74>.
Aligning sample 4-2_B2
bsub -M 30000 -R 'rusage[mem=30000]' -n 20 -oo alignments/assemblies/4-2_B2_stable.out -eo alignments/assemblies/4-2_B2_stable.err  "minigraph -x asm -t 10 ./graph_assembly/all_ref_graph.gfa ./individual_assemblies/4-2_B2_clean.fa -o alignments/assemblies/4-2_B2_stable.gaf"
Job <6043448> is submitted to default queue <research-rh74>.
bsub -M 30000 -R 'rusage[mem=30000]' -n 20 -oo alignments/assemblies/4-2_B2_unstable.out -eo alignments/assemblies/4-2_B2_unstable.err  "minigraph -x asm --vc -t 10 ./graph_assembly/all_ref_graph.gfa ./individual_assemblies/4-2_B2_clean.fa -o alignments/assemblies/4-2_B2_unstable.gaf"
Job <6043449> is submitted to default queue <research-rh74>.
Aligning sample 7-2_F2
bsub -M 30000 -R 'rusage[mem=30000]' -n 20 -oo alignments/assemblies/7-2_F2_stable.out -eo alignments/assemblies/7-2_F2_stable.err  "minigraph -x asm -t 10 ./graph_assembly/all_ref_graph.gfa ./individual_assemblies/7-2_F2_clean.fa -o alignments/assemblies/7-2_F2_stable.gaf"
Job <6043450> is submitted to default queue <research-rh74>.
bsub -M 30000 -R 'rusage[mem=30000]' -n 20 -oo alignments/assemblies/7-2_F2_unstable.out -eo alignments/assemblies/7-2_F2_unstable.err  "minigraph -x asm --vc -t 10 ./graph_assembly/all_ref_graph.gfa ./individual_assemblies/7-2_F2_clean.fa -o alignments/assemblies/7-2_F2_unstable.gaf"
Job <6043452> is submitted to default queue <research-rh74>.
Aligning sample 4-1_B2
bsub -M 30000 -R 'rusage[mem=30000]' -n 20 -oo alignments/assemblies/4-1_B2_stable.out -eo alignments/assemblies/4-1_B2_stable.err  "minigraph -x asm -t 10 ./graph_assembly/all_ref_graph.gfa ./individual_assemblies/4-1_B2_clean.fa -o alignments/assemblies/4-1_B2_stable.gaf"
Job <6043455> is submitted to default queue <research-rh74>.
bsub -M 30000 -R 'rusage[mem=30000]' -n 20 -oo alignments/assemblies/4-1_B2_unstable.out -eo alignments/assemblies/4-1_B2_unstable.err  "minigraph -x asm --vc -t 10 ./graph_assembly/all_ref_graph.gfa ./individual_assemblies/4-1_B2_clean.fa -o alignments/assemblies/4-1_B2_unstable.gaf"
Job <6043457> is submitted to default queue <research-rh74>.
Aligning sample 134-2_A5
bsub -M 30000 -R 'rusage[mem=30000]' -n 20 -oo alignments/assemblies/134-2_A5_stable.out -eo alignments/assemblies/134-2_A5_stable.err  "minigraph -x asm -t 10 ./graph_assembly/all_ref_graph.gfa ./individual_assemblies/134-2_A5_clean.fa -o alignments/assemblies/134-2_A5_stable.gaf"
Job <6043459> is submitted to default queue <research-rh74>.
bsub -M 30000 -R 'rusage[mem=30000]' -n 20 -oo alignments/assemblies/134-2_A5_unstable.out -eo alignments/assemblies/134-2_A5_unstable.err  "minigraph -x asm --vc -t 10 ./graph_assembly/all_ref_graph.gfa ./individual_assemblies/134-2_A5_clean.fa -o alignments/assemblies/134-2_A5_unstable.gaf"
Job <6043460> is submitted to default queue <research-rh74>.
Aligning sample 79-2_G3
bsub -M 30000 -R 'rusage[mem=30000]' -n 20 -oo alignments/assemblies/79-2_G3_stable.out -eo alignments/assemblies/79-2_G3_stable.err  "minigraph -x asm -t 10 ./graph_assembly/all_ref_graph.gfa ./individual_assemblies/79-2_G3_clean.fa -o alignments/assemblies/79-2_G3_stable.gaf"
Job <6043461> is submitted to default queue <research-rh74>.
bsub -M 30000 -R 'rusage[mem=30000]' -n 20 -oo alignments/assemblies/79-2_G3_unstable.out -eo alignments/assemblies/79-2_G3_unstable.err  "minigraph -x asm --vc -t 10 ./graph_assembly/all_ref_graph.gfa ./individual_assemblies/79-2_G3_clean.fa -o alignments/assemblies/79-2_G3_unstable.gaf"
Job <6043463> is submitted to default queue <research-rh74>.
Aligning sample 134-1_H4
bsub -M 30000 -R 'rusage[mem=30000]' -n 20 -oo alignments/assemblies/134-1_H4_stable.out -eo alignments/assemblies/134-1_H4_stable.err  "minigraph -x asm -t 10 ./graph_assembly/all_ref_graph.gfa ./individual_assemblies/134-1_H4_clean.fa -o alignments/assemblies/134-1_H4_stable.gaf"
Job <6043465> is submitted to default queue <research-rh74>.
bsub -M 30000 -R 'rusage[mem=30000]' -n 20 -oo alignments/assemblies/134-1_H4_unstable.out -eo alignments/assemblies/134-1_H4_unstable.err  "minigraph -x asm --vc -t 10 ./graph_assembly/all_ref_graph.gfa ./individual_assemblies/134-1_H4_clean.fa -o alignments/assemblies/134-1_H4_unstable.gaf"
Job <6043467> is submitted to default queue <research-rh74>.
Aligning sample 80-1_H3
bsub -M 30000 -R 'rusage[mem=30000]' -n 20 -oo alignments/assemblies/80-1_H3_stable.out -eo alignments/assemblies/80-1_H3_stable.err  "minigraph -x asm -t 10 ./graph_assembly/all_ref_graph.gfa ./individual_assemblies/80-1_H3_clean.fa -o alignments/assemblies/80-1_H3_stable.gaf"
Job <6043470> is submitted to default queue <research-rh74>.
bsub -M 30000 -R 'rusage[mem=30000]' -n 20 -oo alignments/assemblies/80-1_H3_unstable.out -eo alignments/assemblies/80-1_H3_unstable.err  "minigraph -x asm --vc -t 10 ./graph_assembly/all_ref_graph.gfa ./individual_assemblies/80-1_H3_clean.fa -o alignments/assemblies/80-1_H3_unstable.gaf"
Job <6043480> is submitted to default queue <research-rh74>.
Aligning sample 7-1_E2
bsub -M 30000 -R 'rusage[mem=30000]' -n 20 -oo alignments/assemblies/7-1_E2_stable.out -eo alignments/assemblies/7-1_E2_stable.err  "minigraph -x asm -t 10 ./graph_assembly/all_ref_graph.gfa ./individual_assemblies/7-1_E2_clean.fa -o alignments/assemblies/7-1_E2_stable.gaf"
Job <6043491> is submitted to default queue <research-rh74>.
bsub -M 30000 -R 'rusage[mem=30000]' -n 20 -oo alignments/assemblies/7-1_E2_unstable.out -eo alignments/assemblies/7-1_E2_unstable.err  "minigraph -x asm --vc -t 10 ./graph_assembly/all_ref_graph.gfa ./individual_assemblies/7-1_E2_clean.fa -o alignments/assemblies/7-1_E2_unstable.gaf"
Job <6043508> is submitted to default queue <research-rh74>.
Aligning sample 11-1_A3
bsub -M 30000 -R 'rusage[mem=30000]' -n 20 -oo alignments/assemblies/11-1_A3_stable.out -eo alignments/assemblies/11-1_A3_stable.err  "minigraph -x asm -t 10 ./graph_assembly/all_ref_graph.gfa ./individual_assemblies/11-1_A3_clean.fa -o alignments/assemblies/11-1_A3_stable.gaf"
Job <6043517> is submitted to default queue <research-rh74>.
bsub -M 30000 -R 'rusage[mem=30000]' -n 20 -oo alignments/assemblies/11-1_A3_unstable.out -eo alignments/assemblies/11-1_A3_unstable.err  "minigraph -x asm --vc -t 10 ./graph_assembly/all_ref_graph.gfa ./individual_assemblies/11-1_A3_clean.fa -o alignments/assemblies/11-1_A3_unstable.gaf"
Job <6043528> is submitted to default queue <research-rh74>.

Align Illumina RNA-Seq datasets to graph

Merge RNAseq Paired end

In [ ]:
outdir = "./rna_seq_data"
shutil.rmtree(outdir, ignore_errors=True)
mkdir(outdir)

info_fn = "/hps/research1/birney/users/adrien/analyses/medaka_RNA_illumina/Indigene_pilot/Analysis_low_high/candidates/liver_candidate_rep1.tsv"

sample_d = OrderedDict()
with open(info_fn) as csvfile:
    reader = csv.DictReader(csvfile, delimiter='\t')
    for row in reader:
        sample_d[row["sanger_id"]] = row["sample_id"].replace("_", "-")
sample_d

for sanger_id, sample_id in sample_d.items():
    R1 = f"/hps/research1/birney/users/adrien/analyses/medaka_RNA_illumina/Indigene_pilot/RNA_pipeline2/results/input/fastp/high_{sanger_id}_1.fastq.gz"
    R2 = f"/hps/research1/birney/users/adrien/analyses/medaka_RNA_illumina/Indigene_pilot/RNA_pipeline2/results/input/fastp/high_{sanger_id}_2.fastq.gz"
    
    if os.path.isfile(R1) and os.path.isfile(R2):
        stdout = f"{outdir}/{sample_id}_flash.err"
        stderr = f"{outdir}/{sample_id}_flash.out"
    
        cmd = f"flash {R1} {R2} -r 150 -f 220 --output-prefix {sample_id}_merged --output-directory {outdir} --compress --threads 20"
        bsub(cmd, conda="flash", dry=False, mem=5000, threads=20, stdout_fp=stdout, stderr_fp=stderr)

Align data

In [10]:
outdir = "alignments/rna_seq"
shutil.rmtree(outdir, ignore_errors=True)
mkdir(outdir)

threads = 20
mem = 15000
dry = False
graph = "./graph_assembly/all_ref_graph.gfa"

for fasta in glob.glob("rna_seq_data/*_merged.extendedFrags.fastq.gz"):
    sample_id = os.path.basename(fasta).split("_")[0]
    print (f"Aligning sample {sample_id}")
    
    # Normal mode
    gaf = f"{outdir}/{sample_id}_stable.gaf"
    stdout = f"{outdir}/{sample_id}_stable.out"
    stderr = f"{outdir}/{sample_id}_stable.err"
    bsub(f"minigraph -x sr -t 10 {graph} {fasta} -o {gaf}", conda="minigraph", dry=dry, mem=mem, threads=threads, stdout_fp=stdout, stderr_fp=stderr)

    # Unstable ref mapping mode
    gaf = f"{outdir}/{sample_id}_unstable.gaf"
    stdout = f"{outdir}/{sample_id}_unstable.out"
    stderr = f"{outdir}/{sample_id}_unstable.err"
    bsub(f"minigraph -x sr --vc -t 10 {graph} {fasta} -o {gaf}", conda="minigraph", dry=dry, mem=mem, threads=threads, stdout_fp=stdout, stderr_fp=stderr)
Creating /hps/research1/birney/users/adrien/indigene/analyses/indigene_nanopore_DNA/graph_genome_analysis/alignments/rna_seq
Aligning sample 95-1
bsub -M 15000 -R 'rusage[mem=15000]' -n 20 -oo alignments/rna_seq/95-1_stable.out -eo alignments/rna_seq/95-1_stable.err  "minigraph -x sr -t 10 ./graph_assembly/all_ref_graph.gfa rna_seq_data/95-1_merged.extendedFrags.fastq.gz -o alignments/rna_seq/95-1_stable.gaf"
Job <6043749> is submitted to default queue <research-rh74>.
bsub -M 15000 -R 'rusage[mem=15000]' -n 20 -oo alignments/rna_seq/95-1_unstable.out -eo alignments/rna_seq/95-1_unstable.err  "minigraph -x sr --vc -t 10 ./graph_assembly/all_ref_graph.gfa rna_seq_data/95-1_merged.extendedFrags.fastq.gz -o alignments/rna_seq/95-1_unstable.gaf"
Job <6043753> is submitted to default queue <research-rh74>.
Aligning sample 94-1
bsub -M 15000 -R 'rusage[mem=15000]' -n 20 -oo alignments/rna_seq/94-1_stable.out -eo alignments/rna_seq/94-1_stable.err  "minigraph -x sr -t 10 ./graph_assembly/all_ref_graph.gfa rna_seq_data/94-1_merged.extendedFrags.fastq.gz -o alignments/rna_seq/94-1_stable.gaf"
Job <6043763> is submitted to default queue <research-rh74>.
bsub -M 15000 -R 'rusage[mem=15000]' -n 20 -oo alignments/rna_seq/94-1_unstable.out -eo alignments/rna_seq/94-1_unstable.err  "minigraph -x sr --vc -t 10 ./graph_assembly/all_ref_graph.gfa rna_seq_data/94-1_merged.extendedFrags.fastq.gz -o alignments/rna_seq/94-1_unstable.gaf"
Job <6043768> is submitted to default queue <research-rh74>.
Aligning sample 91-1
bsub -M 15000 -R 'rusage[mem=15000]' -n 20 -oo alignments/rna_seq/91-1_stable.out -eo alignments/rna_seq/91-1_stable.err  "minigraph -x sr -t 10 ./graph_assembly/all_ref_graph.gfa rna_seq_data/91-1_merged.extendedFrags.fastq.gz -o alignments/rna_seq/91-1_stable.gaf"
Job <6043771> is submitted to default queue <research-rh74>.
bsub -M 15000 -R 'rusage[mem=15000]' -n 20 -oo alignments/rna_seq/91-1_unstable.out -eo alignments/rna_seq/91-1_unstable.err  "minigraph -x sr --vc -t 10 ./graph_assembly/all_ref_graph.gfa rna_seq_data/91-1_merged.extendedFrags.fastq.gz -o alignments/rna_seq/91-1_unstable.gaf"
Job <6043775> is submitted to default queue <research-rh74>.
Aligning sample 8-2
bsub -M 15000 -R 'rusage[mem=15000]' -n 20 -oo alignments/rna_seq/8-2_stable.out -eo alignments/rna_seq/8-2_stable.err  "minigraph -x sr -t 10 ./graph_assembly/all_ref_graph.gfa rna_seq_data/8-2_merged.extendedFrags.fastq.gz -o alignments/rna_seq/8-2_stable.gaf"
Job <6043797> is submitted to default queue <research-rh74>.
bsub -M 15000 -R 'rusage[mem=15000]' -n 20 -oo alignments/rna_seq/8-2_unstable.out -eo alignments/rna_seq/8-2_unstable.err  "minigraph -x sr --vc -t 10 ./graph_assembly/all_ref_graph.gfa rna_seq_data/8-2_merged.extendedFrags.fastq.gz -o alignments/rna_seq/8-2_unstable.gaf"
Job <6043800> is submitted to default queue <research-rh74>.
Aligning sample 84-2
bsub -M 15000 -R 'rusage[mem=15000]' -n 20 -oo alignments/rna_seq/84-2_stable.out -eo alignments/rna_seq/84-2_stable.err  "minigraph -x sr -t 10 ./graph_assembly/all_ref_graph.gfa rna_seq_data/84-2_merged.extendedFrags.fastq.gz -o alignments/rna_seq/84-2_stable.gaf"
Job <6043802> is submitted to default queue <research-rh74>.
bsub -M 15000 -R 'rusage[mem=15000]' -n 20 -oo alignments/rna_seq/84-2_unstable.out -eo alignments/rna_seq/84-2_unstable.err  "minigraph -x sr --vc -t 10 ./graph_assembly/all_ref_graph.gfa rna_seq_data/84-2_merged.extendedFrags.fastq.gz -o alignments/rna_seq/84-2_unstable.gaf"
Job <6043810> is submitted to default queue <research-rh74>.
Aligning sample 80-1
bsub -M 15000 -R 'rusage[mem=15000]' -n 20 -oo alignments/rna_seq/80-1_stable.out -eo alignments/rna_seq/80-1_stable.err  "minigraph -x sr -t 10 ./graph_assembly/all_ref_graph.gfa rna_seq_data/80-1_merged.extendedFrags.fastq.gz -o alignments/rna_seq/80-1_stable.gaf"
Job <6043812> is submitted to default queue <research-rh74>.
bsub -M 15000 -R 'rusage[mem=15000]' -n 20 -oo alignments/rna_seq/80-1_unstable.out -eo alignments/rna_seq/80-1_unstable.err  "minigraph -x sr --vc -t 10 ./graph_assembly/all_ref_graph.gfa rna_seq_data/80-1_merged.extendedFrags.fastq.gz -o alignments/rna_seq/80-1_unstable.gaf"
Job <6043816> is submitted to default queue <research-rh74>.
Aligning sample 79-2
bsub -M 15000 -R 'rusage[mem=15000]' -n 20 -oo alignments/rna_seq/79-2_stable.out -eo alignments/rna_seq/79-2_stable.err  "minigraph -x sr -t 10 ./graph_assembly/all_ref_graph.gfa rna_seq_data/79-2_merged.extendedFrags.fastq.gz -o alignments/rna_seq/79-2_stable.gaf"
Job <6043818> is submitted to default queue <research-rh74>.
bsub -M 15000 -R 'rusage[mem=15000]' -n 20 -oo alignments/rna_seq/79-2_unstable.out -eo alignments/rna_seq/79-2_unstable.err  "minigraph -x sr --vc -t 10 ./graph_assembly/all_ref_graph.gfa rna_seq_data/79-2_merged.extendedFrags.fastq.gz -o alignments/rna_seq/79-2_unstable.gaf"
Job <6043821> is submitted to default queue <research-rh74>.
Aligning sample 72-2
bsub -M 15000 -R 'rusage[mem=15000]' -n 20 -oo alignments/rna_seq/72-2_stable.out -eo alignments/rna_seq/72-2_stable.err  "minigraph -x sr -t 10 ./graph_assembly/all_ref_graph.gfa rna_seq_data/72-2_merged.extendedFrags.fastq.gz -o alignments/rna_seq/72-2_stable.gaf"
Job <6043829> is submitted to default queue <research-rh74>.
bsub -M 15000 -R 'rusage[mem=15000]' -n 20 -oo alignments/rna_seq/72-2_unstable.out -eo alignments/rna_seq/72-2_unstable.err  "minigraph -x sr --vc -t 10 ./graph_assembly/all_ref_graph.gfa rna_seq_data/72-2_merged.extendedFrags.fastq.gz -o alignments/rna_seq/72-2_unstable.gaf"
Job <6043833> is submitted to default queue <research-rh74>.
Aligning sample 71-1
bsub -M 15000 -R 'rusage[mem=15000]' -n 20 -oo alignments/rna_seq/71-1_stable.out -eo alignments/rna_seq/71-1_stable.err  "minigraph -x sr -t 10 ./graph_assembly/all_ref_graph.gfa rna_seq_data/71-1_merged.extendedFrags.fastq.gz -o alignments/rna_seq/71-1_stable.gaf"
Job <6043835> is submitted to default queue <research-rh74>.
bsub -M 15000 -R 'rusage[mem=15000]' -n 20 -oo alignments/rna_seq/71-1_unstable.out -eo alignments/rna_seq/71-1_unstable.err  "minigraph -x sr --vc -t 10 ./graph_assembly/all_ref_graph.gfa rna_seq_data/71-1_merged.extendedFrags.fastq.gz -o alignments/rna_seq/71-1_unstable.gaf"
Job <6043838> is submitted to default queue <research-rh74>.
Aligning sample 69-1
bsub -M 15000 -R 'rusage[mem=15000]' -n 20 -oo alignments/rna_seq/69-1_stable.out -eo alignments/rna_seq/69-1_stable.err  "minigraph -x sr -t 10 ./graph_assembly/all_ref_graph.gfa rna_seq_data/69-1_merged.extendedFrags.fastq.gz -o alignments/rna_seq/69-1_stable.gaf"
Job <6043840> is submitted to default queue <research-rh74>.
bsub -M 15000 -R 'rusage[mem=15000]' -n 20 -oo alignments/rna_seq/69-1_unstable.out -eo alignments/rna_seq/69-1_unstable.err  "minigraph -x sr --vc -t 10 ./graph_assembly/all_ref_graph.gfa rna_seq_data/69-1_merged.extendedFrags.fastq.gz -o alignments/rna_seq/69-1_unstable.gaf"
Job <6043847> is submitted to default queue <research-rh74>.
Aligning sample 62-2
bsub -M 15000 -R 'rusage[mem=15000]' -n 20 -oo alignments/rna_seq/62-2_stable.out -eo alignments/rna_seq/62-2_stable.err  "minigraph -x sr -t 10 ./graph_assembly/all_ref_graph.gfa rna_seq_data/62-2_merged.extendedFrags.fastq.gz -o alignments/rna_seq/62-2_stable.gaf"
Job <6043850> is submitted to default queue <research-rh74>.
bsub -M 15000 -R 'rusage[mem=15000]' -n 20 -oo alignments/rna_seq/62-2_unstable.out -eo alignments/rna_seq/62-2_unstable.err  "minigraph -x sr --vc -t 10 ./graph_assembly/all_ref_graph.gfa rna_seq_data/62-2_merged.extendedFrags.fastq.gz -o alignments/rna_seq/62-2_unstable.gaf"
Job <6043852> is submitted to default queue <research-rh74>.
Aligning sample 61-1
bsub -M 15000 -R 'rusage[mem=15000]' -n 20 -oo alignments/rna_seq/61-1_stable.out -eo alignments/rna_seq/61-1_stable.err  "minigraph -x sr -t 10 ./graph_assembly/all_ref_graph.gfa rna_seq_data/61-1_merged.extendedFrags.fastq.gz -o alignments/rna_seq/61-1_stable.gaf"
Job <6043854> is submitted to default queue <research-rh74>.
bsub -M 15000 -R 'rusage[mem=15000]' -n 20 -oo alignments/rna_seq/61-1_unstable.out -eo alignments/rna_seq/61-1_unstable.err  "minigraph -x sr --vc -t 10 ./graph_assembly/all_ref_graph.gfa rna_seq_data/61-1_merged.extendedFrags.fastq.gz -o alignments/rna_seq/61-1_unstable.gaf"
Job <6043856> is submitted to default queue <research-rh74>.
Aligning sample 59-2
bsub -M 15000 -R 'rusage[mem=15000]' -n 20 -oo alignments/rna_seq/59-2_stable.out -eo alignments/rna_seq/59-2_stable.err  "minigraph -x sr -t 10 ./graph_assembly/all_ref_graph.gfa rna_seq_data/59-2_merged.extendedFrags.fastq.gz -o alignments/rna_seq/59-2_stable.gaf"
Job <6043859> is submitted to default queue <research-rh74>.
bsub -M 15000 -R 'rusage[mem=15000]' -n 20 -oo alignments/rna_seq/59-2_unstable.out -eo alignments/rna_seq/59-2_unstable.err  "minigraph -x sr --vc -t 10 ./graph_assembly/all_ref_graph.gfa rna_seq_data/59-2_merged.extendedFrags.fastq.gz -o alignments/rna_seq/59-2_unstable.gaf"
Job <6043861> is submitted to default queue <research-rh74>.
Aligning sample 59-1
bsub -M 15000 -R 'rusage[mem=15000]' -n 20 -oo alignments/rna_seq/59-1_stable.out -eo alignments/rna_seq/59-1_stable.err  "minigraph -x sr -t 10 ./graph_assembly/all_ref_graph.gfa rna_seq_data/59-1_merged.extendedFrags.fastq.gz -o alignments/rna_seq/59-1_stable.gaf"
Job <6043865> is submitted to default queue <research-rh74>.
bsub -M 15000 -R 'rusage[mem=15000]' -n 20 -oo alignments/rna_seq/59-1_unstable.out -eo alignments/rna_seq/59-1_unstable.err  "minigraph -x sr --vc -t 10 ./graph_assembly/all_ref_graph.gfa rna_seq_data/59-1_merged.extendedFrags.fastq.gz -o alignments/rna_seq/59-1_unstable.gaf"
Job <6043867> is submitted to default queue <research-rh74>.
Aligning sample 55-2
bsub -M 15000 -R 'rusage[mem=15000]' -n 20 -oo alignments/rna_seq/55-2_stable.out -eo alignments/rna_seq/55-2_stable.err  "minigraph -x sr -t 10 ./graph_assembly/all_ref_graph.gfa rna_seq_data/55-2_merged.extendedFrags.fastq.gz -o alignments/rna_seq/55-2_stable.gaf"
Job <6043870> is submitted to default queue <research-rh74>.
bsub -M 15000 -R 'rusage[mem=15000]' -n 20 -oo alignments/rna_seq/55-2_unstable.out -eo alignments/rna_seq/55-2_unstable.err  "minigraph -x sr --vc -t 10 ./graph_assembly/all_ref_graph.gfa rna_seq_data/55-2_merged.extendedFrags.fastq.gz -o alignments/rna_seq/55-2_unstable.gaf"
Job <6043878> is submitted to default queue <research-rh74>.
Aligning sample 50-2
bsub -M 15000 -R 'rusage[mem=15000]' -n 20 -oo alignments/rna_seq/50-2_stable.out -eo alignments/rna_seq/50-2_stable.err  "minigraph -x sr -t 10 ./graph_assembly/all_ref_graph.gfa rna_seq_data/50-2_merged.extendedFrags.fastq.gz -o alignments/rna_seq/50-2_stable.gaf"
Job <6043881> is submitted to default queue <research-rh74>.
bsub -M 15000 -R 'rusage[mem=15000]' -n 20 -oo alignments/rna_seq/50-2_unstable.out -eo alignments/rna_seq/50-2_unstable.err  "minigraph -x sr --vc -t 10 ./graph_assembly/all_ref_graph.gfa rna_seq_data/50-2_merged.extendedFrags.fastq.gz -o alignments/rna_seq/50-2_unstable.gaf"
Job <6043887> is submitted to default queue <research-rh74>.
Aligning sample 4-2
bsub -M 15000 -R 'rusage[mem=15000]' -n 20 -oo alignments/rna_seq/4-2_stable.out -eo alignments/rna_seq/4-2_stable.err  "minigraph -x sr -t 10 ./graph_assembly/all_ref_graph.gfa rna_seq_data/4-2_merged.extendedFrags.fastq.gz -o alignments/rna_seq/4-2_stable.gaf"
Job <6043889> is submitted to default queue <research-rh74>.
bsub -M 15000 -R 'rusage[mem=15000]' -n 20 -oo alignments/rna_seq/4-2_unstable.out -eo alignments/rna_seq/4-2_unstable.err  "minigraph -x sr --vc -t 10 ./graph_assembly/all_ref_graph.gfa rna_seq_data/4-2_merged.extendedFrags.fastq.gz -o alignments/rna_seq/4-2_unstable.gaf"
Job <6043891> is submitted to default queue <research-rh74>.
Aligning sample 4-1
bsub -M 15000 -R 'rusage[mem=15000]' -n 20 -oo alignments/rna_seq/4-1_stable.out -eo alignments/rna_seq/4-1_stable.err  "minigraph -x sr -t 10 ./graph_assembly/all_ref_graph.gfa rna_seq_data/4-1_merged.extendedFrags.fastq.gz -o alignments/rna_seq/4-1_stable.gaf"
Job <6043892> is submitted to default queue <research-rh74>.
bsub -M 15000 -R 'rusage[mem=15000]' -n 20 -oo alignments/rna_seq/4-1_unstable.out -eo alignments/rna_seq/4-1_unstable.err  "minigraph -x sr --vc -t 10 ./graph_assembly/all_ref_graph.gfa rna_seq_data/4-1_merged.extendedFrags.fastq.gz -o alignments/rna_seq/4-1_unstable.gaf"
Job <6043893> is submitted to default queue <research-rh74>.
Aligning sample 49-1
bsub -M 15000 -R 'rusage[mem=15000]' -n 20 -oo alignments/rna_seq/49-1_stable.out -eo alignments/rna_seq/49-1_stable.err  "minigraph -x sr -t 10 ./graph_assembly/all_ref_graph.gfa rna_seq_data/49-1_merged.extendedFrags.fastq.gz -o alignments/rna_seq/49-1_stable.gaf"
Job <6043899> is submitted to default queue <research-rh74>.
bsub -M 15000 -R 'rusage[mem=15000]' -n 20 -oo alignments/rna_seq/49-1_unstable.out -eo alignments/rna_seq/49-1_unstable.err  "minigraph -x sr --vc -t 10 ./graph_assembly/all_ref_graph.gfa rna_seq_data/49-1_merged.extendedFrags.fastq.gz -o alignments/rna_seq/49-1_unstable.gaf"
Job <6043902> is submitted to default queue <research-rh74>.
Aligning sample 40-2
bsub -M 15000 -R 'rusage[mem=15000]' -n 20 -oo alignments/rna_seq/40-2_stable.out -eo alignments/rna_seq/40-2_stable.err  "minigraph -x sr -t 10 ./graph_assembly/all_ref_graph.gfa rna_seq_data/40-2_merged.extendedFrags.fastq.gz -o alignments/rna_seq/40-2_stable.gaf"
Job <6043904> is submitted to default queue <research-rh74>.
bsub -M 15000 -R 'rusage[mem=15000]' -n 20 -oo alignments/rna_seq/40-2_unstable.out -eo alignments/rna_seq/40-2_unstable.err  "minigraph -x sr --vc -t 10 ./graph_assembly/all_ref_graph.gfa rna_seq_data/40-2_merged.extendedFrags.fastq.gz -o alignments/rna_seq/40-2_unstable.gaf"
Job <6043909> is submitted to default queue <research-rh74>.
Aligning sample 40-1
bsub -M 15000 -R 'rusage[mem=15000]' -n 20 -oo alignments/rna_seq/40-1_stable.out -eo alignments/rna_seq/40-1_stable.err  "minigraph -x sr -t 10 ./graph_assembly/all_ref_graph.gfa rna_seq_data/40-1_merged.extendedFrags.fastq.gz -o alignments/rna_seq/40-1_stable.gaf"
Job <6043910> is submitted to default queue <research-rh74>.
bsub -M 15000 -R 'rusage[mem=15000]' -n 20 -oo alignments/rna_seq/40-1_unstable.out -eo alignments/rna_seq/40-1_unstable.err  "minigraph -x sr --vc -t 10 ./graph_assembly/all_ref_graph.gfa rna_seq_data/40-1_merged.extendedFrags.fastq.gz -o alignments/rna_seq/40-1_unstable.gaf"
Job <6043913> is submitted to default queue <research-rh74>.
Aligning sample 39-1
bsub -M 15000 -R 'rusage[mem=15000]' -n 20 -oo alignments/rna_seq/39-1_stable.out -eo alignments/rna_seq/39-1_stable.err  "minigraph -x sr -t 10 ./graph_assembly/all_ref_graph.gfa rna_seq_data/39-1_merged.extendedFrags.fastq.gz -o alignments/rna_seq/39-1_stable.gaf"
Job <6043914> is submitted to default queue <research-rh74>.
bsub -M 15000 -R 'rusage[mem=15000]' -n 20 -oo alignments/rna_seq/39-1_unstable.out -eo alignments/rna_seq/39-1_unstable.err  "minigraph -x sr --vc -t 10 ./graph_assembly/all_ref_graph.gfa rna_seq_data/39-1_merged.extendedFrags.fastq.gz -o alignments/rna_seq/39-1_unstable.gaf"
Job <6043916> is submitted to default queue <research-rh74>.
Aligning sample 32-2
bsub -M 15000 -R 'rusage[mem=15000]' -n 20 -oo alignments/rna_seq/32-2_stable.out -eo alignments/rna_seq/32-2_stable.err  "minigraph -x sr -t 10 ./graph_assembly/all_ref_graph.gfa rna_seq_data/32-2_merged.extendedFrags.fastq.gz -o alignments/rna_seq/32-2_stable.gaf"
Job <6043917> is submitted to default queue <research-rh74>.
bsub -M 15000 -R 'rusage[mem=15000]' -n 20 -oo alignments/rna_seq/32-2_unstable.out -eo alignments/rna_seq/32-2_unstable.err  "minigraph -x sr --vc -t 10 ./graph_assembly/all_ref_graph.gfa rna_seq_data/32-2_merged.extendedFrags.fastq.gz -o alignments/rna_seq/32-2_unstable.gaf"
Job <6043918> is submitted to default queue <research-rh74>.
Aligning sample 30-1
bsub -M 15000 -R 'rusage[mem=15000]' -n 20 -oo alignments/rna_seq/30-1_stable.out -eo alignments/rna_seq/30-1_stable.err  "minigraph -x sr -t 10 ./graph_assembly/all_ref_graph.gfa rna_seq_data/30-1_merged.extendedFrags.fastq.gz -o alignments/rna_seq/30-1_stable.gaf"
Job <6043923> is submitted to default queue <research-rh74>.
bsub -M 15000 -R 'rusage[mem=15000]' -n 20 -oo alignments/rna_seq/30-1_unstable.out -eo alignments/rna_seq/30-1_unstable.err  "minigraph -x sr --vc -t 10 ./graph_assembly/all_ref_graph.gfa rna_seq_data/30-1_merged.extendedFrags.fastq.gz -o alignments/rna_seq/30-1_unstable.gaf"
Job <6043927> is submitted to default queue <research-rh74>.
Aligning sample 23-1
bsub -M 15000 -R 'rusage[mem=15000]' -n 20 -oo alignments/rna_seq/23-1_stable.out -eo alignments/rna_seq/23-1_stable.err  "minigraph -x sr -t 10 ./graph_assembly/all_ref_graph.gfa rna_seq_data/23-1_merged.extendedFrags.fastq.gz -o alignments/rna_seq/23-1_stable.gaf"
Job <6043929> is submitted to default queue <research-rh74>.
bsub -M 15000 -R 'rusage[mem=15000]' -n 20 -oo alignments/rna_seq/23-1_unstable.out -eo alignments/rna_seq/23-1_unstable.err  "minigraph -x sr --vc -t 10 ./graph_assembly/all_ref_graph.gfa rna_seq_data/23-1_merged.extendedFrags.fastq.gz -o alignments/rna_seq/23-1_unstable.gaf"
Job <6043932> is submitted to default queue <research-rh74>.
Aligning sample 21-2
bsub -M 15000 -R 'rusage[mem=15000]' -n 20 -oo alignments/rna_seq/21-2_stable.out -eo alignments/rna_seq/21-2_stable.err  "minigraph -x sr -t 10 ./graph_assembly/all_ref_graph.gfa rna_seq_data/21-2_merged.extendedFrags.fastq.gz -o alignments/rna_seq/21-2_stable.gaf"
Job <6043936> is submitted to default queue <research-rh74>.
bsub -M 15000 -R 'rusage[mem=15000]' -n 20 -oo alignments/rna_seq/21-2_unstable.out -eo alignments/rna_seq/21-2_unstable.err  "minigraph -x sr --vc -t 10 ./graph_assembly/all_ref_graph.gfa rna_seq_data/21-2_merged.extendedFrags.fastq.gz -o alignments/rna_seq/21-2_unstable.gaf"
Job <6043937> is submitted to default queue <research-rh74>.
Aligning sample 20-1
bsub -M 15000 -R 'rusage[mem=15000]' -n 20 -oo alignments/rna_seq/20-1_stable.out -eo alignments/rna_seq/20-1_stable.err  "minigraph -x sr -t 10 ./graph_assembly/all_ref_graph.gfa rna_seq_data/20-1_merged.extendedFrags.fastq.gz -o alignments/rna_seq/20-1_stable.gaf"
Job <6043939> is submitted to default queue <research-rh74>.
bsub -M 15000 -R 'rusage[mem=15000]' -n 20 -oo alignments/rna_seq/20-1_unstable.out -eo alignments/rna_seq/20-1_unstable.err  "minigraph -x sr --vc -t 10 ./graph_assembly/all_ref_graph.gfa rna_seq_data/20-1_merged.extendedFrags.fastq.gz -o alignments/rna_seq/20-1_unstable.gaf"
Job <6043941> is submitted to default queue <research-rh74>.
Aligning sample 17-1
bsub -M 15000 -R 'rusage[mem=15000]' -n 20 -oo alignments/rna_seq/17-1_stable.out -eo alignments/rna_seq/17-1_stable.err  "minigraph -x sr -t 10 ./graph_assembly/all_ref_graph.gfa rna_seq_data/17-1_merged.extendedFrags.fastq.gz -o alignments/rna_seq/17-1_stable.gaf"
Job <6043944> is submitted to default queue <research-rh74>.
bsub -M 15000 -R 'rusage[mem=15000]' -n 20 -oo alignments/rna_seq/17-1_unstable.out -eo alignments/rna_seq/17-1_unstable.err  "minigraph -x sr --vc -t 10 ./graph_assembly/all_ref_graph.gfa rna_seq_data/17-1_merged.extendedFrags.fastq.gz -o alignments/rna_seq/17-1_unstable.gaf"
Job <6043948> is submitted to default queue <research-rh74>.
Aligning sample 15-1
bsub -M 15000 -R 'rusage[mem=15000]' -n 20 -oo alignments/rna_seq/15-1_stable.out -eo alignments/rna_seq/15-1_stable.err  "minigraph -x sr -t 10 ./graph_assembly/all_ref_graph.gfa rna_seq_data/15-1_merged.extendedFrags.fastq.gz -o alignments/rna_seq/15-1_stable.gaf"
Job <6043952> is submitted to default queue <research-rh74>.
bsub -M 15000 -R 'rusage[mem=15000]' -n 20 -oo alignments/rna_seq/15-1_unstable.out -eo alignments/rna_seq/15-1_unstable.err  "minigraph -x sr --vc -t 10 ./graph_assembly/all_ref_graph.gfa rna_seq_data/15-1_merged.extendedFrags.fastq.gz -o alignments/rna_seq/15-1_unstable.gaf"
Job <6043956> is submitted to default queue <research-rh74>.
Aligning sample 14-1
bsub -M 15000 -R 'rusage[mem=15000]' -n 20 -oo alignments/rna_seq/14-1_stable.out -eo alignments/rna_seq/14-1_stable.err  "minigraph -x sr -t 10 ./graph_assembly/all_ref_graph.gfa rna_seq_data/14-1_merged.extendedFrags.fastq.gz -o alignments/rna_seq/14-1_stable.gaf"
Job <6043958> is submitted to default queue <research-rh74>.
bsub -M 15000 -R 'rusage[mem=15000]' -n 20 -oo alignments/rna_seq/14-1_unstable.out -eo alignments/rna_seq/14-1_unstable.err  "minigraph -x sr --vc -t 10 ./graph_assembly/all_ref_graph.gfa rna_seq_data/14-1_merged.extendedFrags.fastq.gz -o alignments/rna_seq/14-1_unstable.gaf"
Job <6043961> is submitted to default queue <research-rh74>.
Aligning sample 141-3
bsub -M 15000 -R 'rusage[mem=15000]' -n 20 -oo alignments/rna_seq/141-3_stable.out -eo alignments/rna_seq/141-3_stable.err  "minigraph -x sr -t 10 ./graph_assembly/all_ref_graph.gfa rna_seq_data/141-3_merged.extendedFrags.fastq.gz -o alignments/rna_seq/141-3_stable.gaf"
Job <6043963> is submitted to default queue <research-rh74>.
bsub -M 15000 -R 'rusage[mem=15000]' -n 20 -oo alignments/rna_seq/141-3_unstable.out -eo alignments/rna_seq/141-3_unstable.err  "minigraph -x sr --vc -t 10 ./graph_assembly/all_ref_graph.gfa rna_seq_data/141-3_merged.extendedFrags.fastq.gz -o alignments/rna_seq/141-3_unstable.gaf"
Job <6043964> is submitted to default queue <research-rh74>.
Aligning sample 140-3
bsub -M 15000 -R 'rusage[mem=15000]' -n 20 -oo alignments/rna_seq/140-3_stable.out -eo alignments/rna_seq/140-3_stable.err  "minigraph -x sr -t 10 ./graph_assembly/all_ref_graph.gfa rna_seq_data/140-3_merged.extendedFrags.fastq.gz -o alignments/rna_seq/140-3_stable.gaf"
Job <6043966> is submitted to default queue <research-rh74>.
bsub -M 15000 -R 'rusage[mem=15000]' -n 20 -oo alignments/rna_seq/140-3_unstable.out -eo alignments/rna_seq/140-3_unstable.err  "minigraph -x sr --vc -t 10 ./graph_assembly/all_ref_graph.gfa rna_seq_data/140-3_merged.extendedFrags.fastq.gz -o alignments/rna_seq/140-3_unstable.gaf"
Job <6043970> is submitted to default queue <research-rh74>.
Aligning sample 140-1
bsub -M 15000 -R 'rusage[mem=15000]' -n 20 -oo alignments/rna_seq/140-1_stable.out -eo alignments/rna_seq/140-1_stable.err  "minigraph -x sr -t 10 ./graph_assembly/all_ref_graph.gfa rna_seq_data/140-1_merged.extendedFrags.fastq.gz -o alignments/rna_seq/140-1_stable.gaf"
Job <6043974> is submitted to default queue <research-rh74>.
bsub -M 15000 -R 'rusage[mem=15000]' -n 20 -oo alignments/rna_seq/140-1_unstable.out -eo alignments/rna_seq/140-1_unstable.err  "minigraph -x sr --vc -t 10 ./graph_assembly/all_ref_graph.gfa rna_seq_data/140-1_merged.extendedFrags.fastq.gz -o alignments/rna_seq/140-1_unstable.gaf"
Job <6043977> is submitted to default queue <research-rh74>.
Aligning sample 13-2
bsub -M 15000 -R 'rusage[mem=15000]' -n 20 -oo alignments/rna_seq/13-2_stable.out -eo alignments/rna_seq/13-2_stable.err  "minigraph -x sr -t 10 ./graph_assembly/all_ref_graph.gfa rna_seq_data/13-2_merged.extendedFrags.fastq.gz -o alignments/rna_seq/13-2_stable.gaf"
Job <6043980> is submitted to default queue <research-rh74>.
bsub -M 15000 -R 'rusage[mem=15000]' -n 20 -oo alignments/rna_seq/13-2_unstable.out -eo alignments/rna_seq/13-2_unstable.err  "minigraph -x sr --vc -t 10 ./graph_assembly/all_ref_graph.gfa rna_seq_data/13-2_merged.extendedFrags.fastq.gz -o alignments/rna_seq/13-2_unstable.gaf"
Job <6043982> is submitted to default queue <research-rh74>.
Aligning sample 139-4
bsub -M 15000 -R 'rusage[mem=15000]' -n 20 -oo alignments/rna_seq/139-4_stable.out -eo alignments/rna_seq/139-4_stable.err  "minigraph -x sr -t 10 ./graph_assembly/all_ref_graph.gfa rna_seq_data/139-4_merged.extendedFrags.fastq.gz -o alignments/rna_seq/139-4_stable.gaf"
Job <6043983> is submitted to default queue <research-rh74>.
bsub -M 15000 -R 'rusage[mem=15000]' -n 20 -oo alignments/rna_seq/139-4_unstable.out -eo alignments/rna_seq/139-4_unstable.err  "minigraph -x sr --vc -t 10 ./graph_assembly/all_ref_graph.gfa rna_seq_data/139-4_merged.extendedFrags.fastq.gz -o alignments/rna_seq/139-4_unstable.gaf"
Job <6043985> is submitted to default queue <research-rh74>.
Aligning sample 137-4
bsub -M 15000 -R 'rusage[mem=15000]' -n 20 -oo alignments/rna_seq/137-4_stable.out -eo alignments/rna_seq/137-4_stable.err  "minigraph -x sr -t 10 ./graph_assembly/all_ref_graph.gfa rna_seq_data/137-4_merged.extendedFrags.fastq.gz -o alignments/rna_seq/137-4_stable.gaf"
Job <6043992> is submitted to default queue <research-rh74>.
bsub -M 15000 -R 'rusage[mem=15000]' -n 20 -oo alignments/rna_seq/137-4_unstable.out -eo alignments/rna_seq/137-4_unstable.err  "minigraph -x sr --vc -t 10 ./graph_assembly/all_ref_graph.gfa rna_seq_data/137-4_merged.extendedFrags.fastq.gz -o alignments/rna_seq/137-4_unstable.gaf"
Job <6043995> is submitted to default queue <research-rh74>.
Aligning sample 135-2
bsub -M 15000 -R 'rusage[mem=15000]' -n 20 -oo alignments/rna_seq/135-2_stable.out -eo alignments/rna_seq/135-2_stable.err  "minigraph -x sr -t 10 ./graph_assembly/all_ref_graph.gfa rna_seq_data/135-2_merged.extendedFrags.fastq.gz -o alignments/rna_seq/135-2_stable.gaf"
Job <6043999> is submitted to default queue <research-rh74>.
bsub -M 15000 -R 'rusage[mem=15000]' -n 20 -oo alignments/rna_seq/135-2_unstable.out -eo alignments/rna_seq/135-2_unstable.err  "minigraph -x sr --vc -t 10 ./graph_assembly/all_ref_graph.gfa rna_seq_data/135-2_merged.extendedFrags.fastq.gz -o alignments/rna_seq/135-2_unstable.gaf"
Job <6044000> is submitted to default queue <research-rh74>.
Aligning sample 134-1
bsub -M 15000 -R 'rusage[mem=15000]' -n 20 -oo alignments/rna_seq/134-1_stable.out -eo alignments/rna_seq/134-1_stable.err  "minigraph -x sr -t 10 ./graph_assembly/all_ref_graph.gfa rna_seq_data/134-1_merged.extendedFrags.fastq.gz -o alignments/rna_seq/134-1_stable.gaf"
Job <6044002> is submitted to default queue <research-rh74>.
bsub -M 15000 -R 'rusage[mem=15000]' -n 20 -oo alignments/rna_seq/134-1_unstable.out -eo alignments/rna_seq/134-1_unstable.err  "minigraph -x sr --vc -t 10 ./graph_assembly/all_ref_graph.gfa rna_seq_data/134-1_merged.extendedFrags.fastq.gz -o alignments/rna_seq/134-1_unstable.gaf"
Job <6044004> is submitted to default queue <research-rh74>.
Aligning sample 133-2
bsub -M 15000 -R 'rusage[mem=15000]' -n 20 -oo alignments/rna_seq/133-2_stable.out -eo alignments/rna_seq/133-2_stable.err  "minigraph -x sr -t 10 ./graph_assembly/all_ref_graph.gfa rna_seq_data/133-2_merged.extendedFrags.fastq.gz -o alignments/rna_seq/133-2_stable.gaf"
Job <6044006> is submitted to default queue <research-rh74>.
bsub -M 15000 -R 'rusage[mem=15000]' -n 20 -oo alignments/rna_seq/133-2_unstable.out -eo alignments/rna_seq/133-2_unstable.err  "minigraph -x sr --vc -t 10 ./graph_assembly/all_ref_graph.gfa rna_seq_data/133-2_merged.extendedFrags.fastq.gz -o alignments/rna_seq/133-2_unstable.gaf"
Job <6044008> is submitted to default queue <research-rh74>.
Aligning sample 132-5
bsub -M 15000 -R 'rusage[mem=15000]' -n 20 -oo alignments/rna_seq/132-5_stable.out -eo alignments/rna_seq/132-5_stable.err  "minigraph -x sr -t 10 ./graph_assembly/all_ref_graph.gfa rna_seq_data/132-5_merged.extendedFrags.fastq.gz -o alignments/rna_seq/132-5_stable.gaf"
Job <6044010> is submitted to default queue <research-rh74>.
bsub -M 15000 -R 'rusage[mem=15000]' -n 20 -oo alignments/rna_seq/132-5_unstable.out -eo alignments/rna_seq/132-5_unstable.err  "minigraph -x sr --vc -t 10 ./graph_assembly/all_ref_graph.gfa rna_seq_data/132-5_merged.extendedFrags.fastq.gz -o alignments/rna_seq/132-5_unstable.gaf"
Job <6044011> is submitted to default queue <research-rh74>.
Aligning sample 132-4-1
bsub -M 15000 -R 'rusage[mem=15000]' -n 20 -oo alignments/rna_seq/132-4-1_stable.out -eo alignments/rna_seq/132-4-1_stable.err  "minigraph -x sr -t 10 ./graph_assembly/all_ref_graph.gfa rna_seq_data/132-4-1_merged.extendedFrags.fastq.gz -o alignments/rna_seq/132-4-1_stable.gaf"
Job <6044013> is submitted to default queue <research-rh74>.
bsub -M 15000 -R 'rusage[mem=15000]' -n 20 -oo alignments/rna_seq/132-4-1_unstable.out -eo alignments/rna_seq/132-4-1_unstable.err  "minigraph -x sr --vc -t 10 ./graph_assembly/all_ref_graph.gfa rna_seq_data/132-4-1_merged.extendedFrags.fastq.gz -o alignments/rna_seq/132-4-1_unstable.gaf"
Job <6044015> is submitted to default queue <research-rh74>.
Aligning sample 129-1
bsub -M 15000 -R 'rusage[mem=15000]' -n 20 -oo alignments/rna_seq/129-1_stable.out -eo alignments/rna_seq/129-1_stable.err  "minigraph -x sr -t 10 ./graph_assembly/all_ref_graph.gfa rna_seq_data/129-1_merged.extendedFrags.fastq.gz -o alignments/rna_seq/129-1_stable.gaf"
Job <6044018> is submitted to default queue <research-rh74>.
bsub -M 15000 -R 'rusage[mem=15000]' -n 20 -oo alignments/rna_seq/129-1_unstable.out -eo alignments/rna_seq/129-1_unstable.err  "minigraph -x sr --vc -t 10 ./graph_assembly/all_ref_graph.gfa rna_seq_data/129-1_merged.extendedFrags.fastq.gz -o alignments/rna_seq/129-1_unstable.gaf"
Job <6044021> is submitted to default queue <research-rh74>.
Aligning sample 125-1
bsub -M 15000 -R 'rusage[mem=15000]' -n 20 -oo alignments/rna_seq/125-1_stable.out -eo alignments/rna_seq/125-1_stable.err  "minigraph -x sr -t 10 ./graph_assembly/all_ref_graph.gfa rna_seq_data/125-1_merged.extendedFrags.fastq.gz -o alignments/rna_seq/125-1_stable.gaf"
Job <6044023> is submitted to default queue <research-rh74>.
bsub -M 15000 -R 'rusage[mem=15000]' -n 20 -oo alignments/rna_seq/125-1_unstable.out -eo alignments/rna_seq/125-1_unstable.err  "minigraph -x sr --vc -t 10 ./graph_assembly/all_ref_graph.gfa rna_seq_data/125-1_merged.extendedFrags.fastq.gz -o alignments/rna_seq/125-1_unstable.gaf"
Job <6044025> is submitted to default queue <research-rh74>.
Aligning sample 11-1
bsub -M 15000 -R 'rusage[mem=15000]' -n 20 -oo alignments/rna_seq/11-1_stable.out -eo alignments/rna_seq/11-1_stable.err  "minigraph -x sr -t 10 ./graph_assembly/all_ref_graph.gfa rna_seq_data/11-1_merged.extendedFrags.fastq.gz -o alignments/rna_seq/11-1_stable.gaf"
Job <6044026> is submitted to default queue <research-rh74>.
bsub -M 15000 -R 'rusage[mem=15000]' -n 20 -oo alignments/rna_seq/11-1_unstable.out -eo alignments/rna_seq/11-1_unstable.err  "minigraph -x sr --vc -t 10 ./graph_assembly/all_ref_graph.gfa rna_seq_data/11-1_merged.extendedFrags.fastq.gz -o alignments/rna_seq/11-1_unstable.gaf"
Job <6044029> is submitted to default queue <research-rh74>.
Aligning sample 11-2
bsub -M 15000 -R 'rusage[mem=15000]' -n 20 -oo alignments/rna_seq/11-2_stable.out -eo alignments/rna_seq/11-2_stable.err  "minigraph -x sr -t 10 ./graph_assembly/all_ref_graph.gfa rna_seq_data/11-2_merged.extendedFrags.fastq.gz -o alignments/rna_seq/11-2_stable.gaf"
Job <6044030> is submitted to default queue <research-rh74>.
bsub -M 15000 -R 'rusage[mem=15000]' -n 20 -oo alignments/rna_seq/11-2_unstable.out -eo alignments/rna_seq/11-2_unstable.err  "minigraph -x sr --vc -t 10 ./graph_assembly/all_ref_graph.gfa rna_seq_data/11-2_merged.extendedFrags.fastq.gz -o alignments/rna_seq/11-2_unstable.gaf"
Job <6044032> is submitted to default queue <research-rh74>.
Aligning sample 117-2
bsub -M 15000 -R 'rusage[mem=15000]' -n 20 -oo alignments/rna_seq/117-2_stable.out -eo alignments/rna_seq/117-2_stable.err  "minigraph -x sr -t 10 ./graph_assembly/all_ref_graph.gfa rna_seq_data/117-2_merged.extendedFrags.fastq.gz -o alignments/rna_seq/117-2_stable.gaf"
Job <6044033> is submitted to default queue <research-rh74>.
bsub -M 15000 -R 'rusage[mem=15000]' -n 20 -oo alignments/rna_seq/117-2_unstable.out -eo alignments/rna_seq/117-2_unstable.err  "minigraph -x sr --vc -t 10 ./graph_assembly/all_ref_graph.gfa rna_seq_data/117-2_merged.extendedFrags.fastq.gz -o alignments/rna_seq/117-2_unstable.gaf"
Job <6044036> is submitted to default queue <research-rh74>.
Aligning sample 10-1
bsub -M 15000 -R 'rusage[mem=15000]' -n 20 -oo alignments/rna_seq/10-1_stable.out -eo alignments/rna_seq/10-1_stable.err  "minigraph -x sr -t 10 ./graph_assembly/all_ref_graph.gfa rna_seq_data/10-1_merged.extendedFrags.fastq.gz -o alignments/rna_seq/10-1_stable.gaf"
Job <6044039> is submitted to default queue <research-rh74>.
bsub -M 15000 -R 'rusage[mem=15000]' -n 20 -oo alignments/rna_seq/10-1_unstable.out -eo alignments/rna_seq/10-1_unstable.err  "minigraph -x sr --vc -t 10 ./graph_assembly/all_ref_graph.gfa rna_seq_data/10-1_merged.extendedFrags.fastq.gz -o alignments/rna_seq/10-1_unstable.gaf"
Job <6044041> is submitted to default queue <research-rh74>.
Aligning sample 106-2
bsub -M 15000 -R 'rusage[mem=15000]' -n 20 -oo alignments/rna_seq/106-2_stable.out -eo alignments/rna_seq/106-2_stable.err  "minigraph -x sr -t 10 ./graph_assembly/all_ref_graph.gfa rna_seq_data/106-2_merged.extendedFrags.fastq.gz -o alignments/rna_seq/106-2_stable.gaf"
Job <6044042> is submitted to default queue <research-rh74>.
bsub -M 15000 -R 'rusage[mem=15000]' -n 20 -oo alignments/rna_seq/106-2_unstable.out -eo alignments/rna_seq/106-2_unstable.err  "minigraph -x sr --vc -t 10 ./graph_assembly/all_ref_graph.gfa rna_seq_data/106-2_merged.extendedFrags.fastq.gz -o alignments/rna_seq/106-2_unstable.gaf"
Job <6044043> is submitted to default queue <research-rh74>.
Aligning sample 106-1
bsub -M 15000 -R 'rusage[mem=15000]' -n 20 -oo alignments/rna_seq/106-1_stable.out -eo alignments/rna_seq/106-1_stable.err  "minigraph -x sr -t 10 ./graph_assembly/all_ref_graph.gfa rna_seq_data/106-1_merged.extendedFrags.fastq.gz -o alignments/rna_seq/106-1_stable.gaf"
Job <6044044> is submitted to default queue <research-rh74>.
bsub -M 15000 -R 'rusage[mem=15000]' -n 20 -oo alignments/rna_seq/106-1_unstable.out -eo alignments/rna_seq/106-1_unstable.err  "minigraph -x sr --vc -t 10 ./graph_assembly/all_ref_graph.gfa rna_seq_data/106-1_merged.extendedFrags.fastq.gz -o alignments/rna_seq/106-1_unstable.gaf"
Job <6044045> is submitted to default queue <research-rh74>.
Aligning sample 104-1
bsub -M 15000 -R 'rusage[mem=15000]' -n 20 -oo alignments/rna_seq/104-1_stable.out -eo alignments/rna_seq/104-1_stable.err  "minigraph -x sr -t 10 ./graph_assembly/all_ref_graph.gfa rna_seq_data/104-1_merged.extendedFrags.fastq.gz -o alignments/rna_seq/104-1_stable.gaf"
Job <6044047> is submitted to default queue <research-rh74>.
bsub -M 15000 -R 'rusage[mem=15000]' -n 20 -oo alignments/rna_seq/104-1_unstable.out -eo alignments/rna_seq/104-1_unstable.err  "minigraph -x sr --vc -t 10 ./graph_assembly/all_ref_graph.gfa rna_seq_data/104-1_merged.extendedFrags.fastq.gz -o alignments/rna_seq/104-1_unstable.gaf"
Job <6044048> is submitted to default queue <research-rh74>.

Generate coverage BED

In [ ]:
outdir = "coverage"
shutil.rmtree(outdir, ignore_errors=True)
mkdir (outdir)

Define parsing function

In [ ]:
def split_path (path):
    l = []
    s = ""
    for c in path:
        if s and c in ["<", ">"]:
            l.append(s)
            s = c
        else:
            s+=c
    if s: 
        l.append(s)
    return l

def parse_stable_gaf (gaf_fn, min_align_len=100, min_mapq=10, progress=True):
    
    with open (gaf_fn) as gaf_fp:
        c = Counter()
        l = []
        
        coord_tuple = namedtuple("coord_tuple", ("chromosome", "start", "end"))
        for line in tqdm(gaf_fp, disable=not progress):
            line = line.strip().split("\t")
            align_len = int(line[10])
            mapq = int(line[11])
            align_type = line[12].split(":")[-1]

            if align_len < min_align_len:
                c["Short length alignments"]+=1
                continue
            if mapq < min_mapq:
                c["Low mapq alignments"]+=1
                continue
            if align_type != "P":
                c["Non primary alignments"]+=1
                continue
            
            # Decompose path from
            c["Valid reads"]+=1
            for p in split_path(line[5]):
                
                if p[0] in ["<", ">"]:
                    chromosome = p[1:].split(":")[0]
                    start = int(p.split(":")[1].split("-")[0])
                    end = int(p.split(":")[1].split("-")[1])
                    #strand = "+" if node[0] == ">" else "-"
                                        
                else:
                    chromosome = p
                    start = int(line[7])
                    end = int(line[8])
                    #strand = line[4]
                
                c["Valid segments"]+=1
                l.append(coord_tuple(chromosome, start, end))
    
    for i, j in c.items():
        stdout_print(f"\t\t{i}: {j}\n")
    
    df = pd.DataFrame(l)
    df.sort_values(by=["chromosome", "start", "end"], inplace=True)
    df.reset_index(inplace=True, drop=True)
        
    return df

def coverage_from_gaf (gaf_src, all_fasta_ref, outdir, min_align_len=100, min_mapq=1):
    
    stdout_print("Define chromosome length for pybedtools\n")
    chrom_len = OrderedDict()
    with pyfaidx.Fasta(all_fasta_ref) as fa:
        for seq in fa:
            chrom_len[str(seq.name)]=(0, len(seq))
    
    stdout_print("Parsing gaf files\n")
    for gaf_fn in glob.glob(gaf_src):
        sample_id = gaf_fn.split("/")[-1].rpartition("_")[0]

        stdout_print(f"Analyse sample {sample_id}\n")
        
        stdout_print("\tCollect coordinates from paths\n")
        bed_df = parse_stable_gaf (gaf_fn, min_align_len=min_align_len, min_mapq=min_mapq, progress=False)
        bed_fn = os.path.join(outdir, f"{sample_id}.bed")
        bed_df.to_csv(bed_fn, index=False, header=False, sep="\t")

        stdout_print("\tCompute coverage with bedtools\n")
        bedgraph_fn = os.path.join(outdir, f"{sample_id}.bedgraph")
        bed = pbt.BedTool(bed_fn)
        bed.set_chromsizes(chrom_len)
        bg = bed.genomecov(bg=True, split=True, trackopts=f'type=bedGraph name={sample_id}')
        bg.saveas(bedgraph_fn)

Assemblies

In [ ]:
outdir = "coverage/assemblies"
shutil.rmtree(outdir, ignore_errors=True)
mkdir (outdir)

coverage_from_gaf(
    gaf_src = "./alignments/assemblies/*_stable.gaf", 
    all_fasta_ref = "./references/Oryzias_latipes_all_assemblies_contig.fa",
    outdir = outdir,
    min_align_len=100,
    min_mapq=1)

Raw reads

In [ ]:
outdir = "coverage/raw_reads"
shutil.rmtree(outdir, ignore_errors=True)
mkdir (outdir)

coverage_from_gaf(
    gaf_src = "./alignments/raw_reads/*_stable.gaf", 
    all_fasta_ref = "./references/Oryzias_latipes_all_assemblies_contig.fa",
    outdir = outdir)

RNA-Seq

In [ ]:
outdir = "coverage/rna_seq"
shutil.rmtree(outdir, ignore_errors=True)
mkdir (outdir)

coverage_from_gaf(
    gaf_src = "./alignments/rna_seq/*_stable.gaf", 
    all_fasta_ref = "./references/Oryzias_latipes_all_assemblies_contig.fa",
    outdir = outdir)

Segment Usage analysis

In [88]:
outdir = "segment_usage"
shutil.rmtree(outdir, ignore_errors=True)
mkdir (outdir)
Creating /hps/research1/birney/users/adrien/analyses/medaka_assembly_graph/graph_explore/segment_usage2

Parse segments

Define parsing function

In [14]:
def iter_idx_tuples (df):
    for i, j in zip(df.index, df.itertuples(index=False, name=None)):
        yield (i, j)
        
def iter_line (line, names):
    for i, j in zip(names, line):
        yield (i, j)
        
def gaf_seg_coverage (gaf_fn, graph_info_dict, min_align_len=100, min_mapq=10, progress=True):
    seg_cov_dict = Counter()
    info_counter = Counter()
    
    with open (gaf_fn) as gaf_fp:
        for line in tqdm(gaf_fp, disable=not progress):
            line = line.strip().split("\t")
            path_str = line[5]
            seg_list = re.split('<|>',path_str[1:])
            start = int(line[7])
            end = int(line[8])
            align_len = int(line[10])
            mapq = int(line[11])
            align_type = line[12].split(":")[-1]
            
            if align_len < min_align_len:
                info_counter["short length alignments"]+=1
            elif mapq < min_mapq:
                info_counter["low mapq alignments"]+=1
            elif align_type != "P":
                info_counter["non primary alignments"]+=1
            
            elif len(seg_list) == 1:
                info_counter["Single segment alignments"]+=1
                seg_len = end-start
                seg = seg_list[0]
                seg_cov_dict[seg] += seg_len
            
            else:
                info_counter["Multi segment alignments"]+=1
                all_seg_len = 0
                
                # First segment exception
                seg = seg_list[0]
                seg_len = graph_info_dict[seg]["length"]
                all_seg_len+=seg_len
                seg_cov_dict[seg] += seg_len-start

                # Internal segments
                for seg in seg_list[1:-1]:
                    seg_len = graph_info_dict[seg]["length"]
                    all_seg_len+=seg_len
                    seg_cov_dict[seg] += seg_len

                # Last segment exception
                seg = seg_list[-1]
                seg_len = graph_info_dict[seg]["length"]
                all_seg_len+=seg_len
                seg_cov_dict[seg] += seg_len-(all_seg_len-end)
    
    return seg_cov_dict, info_counter

def segment_usage_stats (gaf_src, graph_info_pkl, fasta_canonical_ref, outdir, n_sample_range=[12,10,8,6,4,2,1], min_align_len=100, min_mapq=10, min_normed_cov=0.1):
    
    stdout_print("Get list of reference chromosome\n")
    ref_chrom_l=[]
    with pyfaidx.Fasta(fasta_canonical_ref) as fa:
        ref_chrom_l = [s.name for s in fa]
    
    stdout_print("Parsing reference graph\n")
    graph_info_df = pd.read_pickle(graph_info_fn)
    graph_info_df = graph_info_df[["chromosome","length","start","end","type"]]
    graph_info_dict = graph_info_df.to_dict(orient="index")

    all_cov = OrderedDict()
    all_info = OrderedDict()
    
    stdout_print("Parsing gaf files\n")
    for gaf_fn in glob.glob(gaf_src):
        
        sample_id = gaf_fn.split("/")[-1].split("_")[0]
        stdout_print(f"\tParsing data for sample {sample_id}\n")
        seg_cov_dict, info_counter = gaf_seg_coverage (gaf_fn=gaf_fn, graph_info_dict=graph_info_dict, min_align_len=min_align_len, min_mapq=min_mapq, progress=False)
        all_cov[sample_id] = seg_cov_dict
        all_info[sample_id] = info_counter
        
    info_df = pd.DataFrame(all_info)
    display(info_df)
    
    stdout_print("Cast coverage data to DataFrame\n")
    cov_df = pd.DataFrame(all_cov)
    cov_df = cov_df.fillna(0)
    cov_df = cov_df.astype(np.int64)
        
    stdout_print("Merge cov df and info from graph\n")
    data_col = cov_df.columns
    cov_df = pd.merge(left=graph_info_df, right=cov_df, left_index=True, right_index=True, how="left")
    cov_df[data_col] = cov_df[data_col].fillna(0)
    cov_df.sort_values(["chromosome","start","end"], inplace=True)
    display(cov_df.head())
    out_fn = os.path.join(outdir, "segment_usage_raw.tsv")
    cov_df.to_csv(out_fn, sep="\t")
    
    stdout_print("Normalise by length only\n")
    len_norm_cov_df = cov_df.copy()
    for col in data_col:
        len_norm_cov_df[col] = len_norm_cov_df[col]/len_norm_cov_df["length"]
    display(len_norm_cov_df.head())
    out_fn = os.path.join(outdir, "segment_usage_len_norm.tsv")
    len_norm_cov_df.to_csv(out_fn, sep="\t")
    
    stdout_print("Normalise by length and total coverage over canonical chromosomes\n")
    canonical_df = cov_df[cov_df["chromosome"].isin(ref_chrom_l)]
    full_norm_cov_df = cov_df.copy()
    for col in data_col:
        full_norm_cov_df[col] = (full_norm_cov_df[col]/full_norm_cov_df["length"])/(canonical_df[col].sum()/canonical_df["length"].sum())
    display(full_norm_cov_df.head())
    out_fn = os.path.join(outdir, "segment_usage_len_cov_norm.tsv")
    full_norm_cov_df.to_csv(out_fn, sep="\t")
    
    stdout_print("Count segment_coverage by category\n")
    ref_alt_c = defaultdict(Counter)
    ref_chrom_c = defaultdict(Counter)
    alt_contig_c = defaultdict(Counter)
    
    norm_cov_data = full_norm_cov_df.drop(columns=["chromosome","length", "start", "end", "type"])
    
    for segment, line in tqdm(iter_idx_tuples(norm_cov_data), total=len(cov_df)):
        seg_info = graph_info_dict[segment]
        sample_type = seg_info["type"]
        
        # Canonical reference K
        if seg_info["chromosome"] in ref_chrom_l:
            for sample_id, cov in iter_line(line, norm_cov_data.columns):
                ref_alt_c[sample_id][sample_type]+=cov
                ref_chrom_c[sample_id][seg_info["chromosome"]]+=cov
                
        # Alternative contig
        else:
            # Count values 
            n_samples = 0
            for v in line:
                if v>min_normed_cov:
                    n_samples+=1
            for n in n_sample_range:
                if n_samples >= n:
                    cat = f"{n} sample contig"
                    for sample_id, cov in iter_line(line, norm_cov_data.columns):
                        ref_alt_c[sample_id][sample_type]+=cov
                        alt_contig_c[sample_id][cat]+=cov
                    break
    
    stdout_print("Cast to Dataframe and plot\n")
    ref_alt_df = pd.DataFrame(ref_alt_c).T
    ref_alt_df = ref_alt_df.reindex(columns=["HDRR","HNI","HSOK","MIKK","HDRR+","HNI+","HSOK+"], index=["4-1","4-2","7-1","7-2","11-1","69-1","79-2","80-1","117-2","131-1","134-1","134-2"])
    out_fn = os.path.join(outdir, "chromosome_alternative_coverage.tsv")
    ref_alt_df.to_csv(out_fn, sep="\t")
    display(ref_alt_df)
    
    ref_chrom_df = pd.DataFrame(ref_chrom_c).T
    ref_chrom_df = ref_chrom_df.reindex(columns=ref_chrom_l, index=["4-1","4-2","7-1","7-2","11-1","69-1","79-2","80-1","117-2","131-1","134-1","134-2"])
    out_fn = os.path.join(outdir, "reference_chromosomes_coverage.tsv")
    ref_chrom_df.to_csv(out_fn, sep="\t")
    display(ref_chrom_df)
    
    alt_contig_df = pd.DataFrame(alt_contig_c).T
    alt_contig_df = alt_contig_df.reindex(columns=[f"{i} sample contig" for i in n_sample_range], index=["4-1","4-2","7-1","7-2","11-1","69-1","79-2","80-1","117-2","131-1","134-1","134-2"])
    out_fn = os.path.join(outdir, "alternative_count_coverage.tsv")
    alt_contig_df.to_csv(out_fn, sep="\t")
    display(alt_contig_df)
    
    with pl.style.context("ggplot"):
        fig, axes = pl.subplots(3, 1, figsize=(15,15))
        
        for i, (df, label, palette) in enumerate((
            (ref_alt_df, "reference chromosomes and alternative contigs", sns.color_palette("tab10", n_colors=len(ref_alt_df.columns))),
            (ref_chrom_df, "reference chromosomes details", sns.cubehelix_palette(len(ref_chrom_df.columns), start=0.9, rot=0.1, dark=0, light=0.90, hue=2.75, gamma=0.75, reverse=True)),
            (alt_contig_df, "alternative contigs count", sns.cubehelix_palette(len(alt_contig_df.columns), start=0.9, rot=0.1, dark=0.1, light=0.80, hue=2.75, gamma=0.75, reverse=True)))):

            ax = axes[i]
            df = df.divide(df.sum(axis=1), axis=0)*100
            df.plot.bar(stacked=True, ax=ax, color=palette)
            ax.set_title(f"Percentage of graph segment coverage for {label}")
            ncol = 2 if len(df.columns)>10 else 1
            ax.legend(bbox_to_anchor=(1, 1), loc=2, facecolor="white", frameon=False, ncol=ncol)
            ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right')

        fig.tight_layout()
        out_fn = os.path.join(outdir, "graph_segment_coverage.svg")
        fig.savefig(out_fn)
        
    stdout_print("Write out as bed file per sample\n")
    for col in data_col:
        sdf = full_norm_cov_df[["chromosome", "start", "end", col]]
        sdf = sdf[sdf[col]>0]
        outfile = f"./{outdir}/{col}_segments.bedgraph"
        with open (outfile, "w") as fp:
            fp.write(f"track type=bedGraph type=bedGraph name={col}\n")
        sdf.to_csv(outfile, mode="a", header=False, index=False, sep="\t")

def overall_segment_usage (segment_usage_fn, outdir, min_coverage_dna=0.05, min_samples_dna=2):
    
    df = pd.read_csv(fn, sep="\t")

    info_field = ["seg_id","chromosome","length","start","end","type"]
    info_df = df[info_field].copy()
    data_df = df.drop(columns=info_field).copy()

    info_df["pass_samples"] = data_df.ge(min_coverage_dna).sum(axis=1)
    info_df["type"] = info_df["type"].str.rstrip("+")
    len_d = defaultdict(Counter)
    seg_d = defaultdict(Counter)

    for line in info_df.itertuples():
        if line.pass_samples>=min_samples_dna:
            len_d[line.type]["MIKK bases"]+=line.length
            seg_d[line.type]["MIKK segments"]+=1
            len_d["All"]["MIKK bases"]+=line.length
            seg_d["All"]["MIKK segments"]+=1
        else:
            len_d[line.type]["Non-MIKK bases"]+=line.length
            seg_d[line.type]["Non-MIKK segments"]+=1
            len_d["All"]["Non-MIKK bases"]+=line.length
            seg_d["All"]["Non-MIKK segments"]+=1

    len_df = pd.DataFrame.from_dict(len_d, orient="index")
    seg_df = pd.DataFrame.from_dict(seg_d, orient="index")
    len_df = len_df.reindex(["HDRR", "HNI", "HSOK", "MIKK", "All"])
    seg_df = seg_df.reindex(["HDRR", "HNI", "HSOK", "MIKK", "All"])

    merged_df = df = pd.concat([len_df, seg_df], axis=1)
    out_fn = os.path.join(outdir, "segment_usage_type_stats.tsv")
    merged_df.to_csv(out_fn, sep="\t")
    display(merged_df)

    with pl.style.context('ggplot'):
        fig, (ax1, ax2) = pl.subplots(2, 1, figsize=(10,10))

        len_df.plot.bar(ax=ax1, stacked=True)
        table(ax1, len_df, colWidths=[0.10, 0.10], colLoc="center", cellLoc="center", fontsize=14, bbox=(1.15,0,0.2,0.7))
        ax1.set_title('Total length of segment per type')
        ax1.set_ylabel("Total length")

        seg_df.plot.bar(ax=ax2, stacked=True)
        table(ax2, seg_df, colWidths=[0.10, 0.10], colLoc="center", cellLoc="center", fontsize=14, bbox=(1.15,0,0.2,0.7))
        ax2.set_title('Number of segments per type')
        ax2.set_ylabel("Number of segments")

        fig.tight_layout()
        out_fn = os.path.join(outdir, "segment_usage_type_stats.svg")
        fig.savefig(out_fn)

Assemblies

In [ ]:
outdir = "segment_usage/assemblies"
shutil.rmtree(outdir, ignore_errors=True)
mkdir (outdir)
In [397]:
segment_usage_stats (
    gaf_src="./alignments/assemblies/*_unstable.gaf",
    graph_info_pkl="./graph_assembly/all_ref_graph_seg_info.pkl",
    fasta_canonical_ref="./references/Oryzias_latipes_HDRR_clean.fa",
    outdir="segment_usage/assemblies",
    n_sample_range=[12,10,8,6,4,2,1],
    min_align_len=100,
    min_mapq=1,
    min_normed_cov=0)
Creating /hps/research1/birney/users/adrien/analyses/medaka_assembly_graph/graph_explore/segment_usage2/assemblies
Get list of reference chromosome
Parsing reference graph
Parsing gaf files
	Parsing data for sample 11-1
	Parsing data for sample 7-1
	Parsing data for sample 80-1
	Parsing data for sample 134-1
	Parsing data for sample 79-2
	Parsing data for sample 134-2
	Parsing data for sample 4-1
	Parsing data for sample 7-2
	Parsing data for sample 4-2
	Parsing data for sample 117-2
	Parsing data for sample 131-1
	Parsing data for sample 69-1
11-1 7-1 80-1 134-1 79-2 134-2 4-1 7-2 4-2 117-2 131-1 69-1
Multi segment alignments 3028 2636 3934 3027 2815 3886 2729 2620 2474 2667 3059 2714
Single segment alignments 1709 2028 1478 1963 1791 2085 1883 1934 1950 2091 2231 1667
low mapq alignments 30 48 27 41 43 50 49 46 55 58 47 30
Cast coverage data to DataFrame
Merge cov df and info from graph
chromosome length start end type 11-1 7-1 80-1 134-1 79-2 134-2 4-1 7-2 4-2 117-2 131-1 69-1
seg_id
s648599 HDRR+_000617F 624 44100 44724 HDRR+ 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
s648600 HDRR+_000617F 286 46919 47205 HDRR+ 0.0 0.0 0.0 0.0 0.0 0.0 0.0 286.0 0.0 0.0 0.0 0.0
s648601 HDRR+_000617F 597 50034 50631 HDRR+ 0.0 0.0 0.0 0.0 597.0 584.0 0.0 0.0 0.0 597.0 0.0 0.0
s648602 HDRR+_000617F 284 52947 53231 HDRR+ 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
s648603 HDRR+_000617F 4559 127768 132327 HDRR+ 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
Normalise by length only
chromosome length start end type 11-1 7-1 80-1 134-1 79-2 134-2 4-1 7-2 4-2 117-2 131-1 69-1
seg_id
s648599 HDRR+_000617F 624 44100 44724 HDRR+ 0.0 0.0 0.0 0.0 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 0.0
s648600 HDRR+_000617F 286 46919 47205 HDRR+ 0.0 0.0 0.0 0.0 0.0 0.000000 0.0 1.0 0.0 0.0 0.0 0.0
s648601 HDRR+_000617F 597 50034 50631 HDRR+ 0.0 0.0 0.0 0.0 1.0 0.978224 0.0 0.0 0.0 1.0 0.0 0.0
s648602 HDRR+_000617F 284 52947 53231 HDRR+ 0.0 0.0 0.0 0.0 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 0.0
s648603 HDRR+_000617F 4559 127768 132327 HDRR+ 0.0 0.0 0.0 0.0 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 0.0
Normalise by length and total coverage over canonical chromosomes
chromosome length start end type 11-1 7-1 80-1 134-1 79-2 134-2 4-1 7-2 4-2 117-2 131-1 69-1
seg_id
s648599 HDRR+_000617F 624 44100 44724 HDRR+ 0.0 0.0 0.0 0.0 0.000000 0.00000 0.0 0.000000 0.0 0.000000 0.0 0.0
s648600 HDRR+_000617F 286 46919 47205 HDRR+ 0.0 0.0 0.0 0.0 0.000000 0.00000 0.0 1.170095 0.0 0.000000 0.0 0.0
s648601 HDRR+_000617F 597 50034 50631 HDRR+ 0.0 0.0 0.0 0.0 1.176123 1.14787 0.0 0.000000 0.0 1.168944 0.0 0.0
s648602 HDRR+_000617F 284 52947 53231 HDRR+ 0.0 0.0 0.0 0.0 0.000000 0.00000 0.0 0.000000 0.0 0.000000 0.0 0.0
s648603 HDRR+_000617F 4559 127768 132327 HDRR+ 0.0 0.0 0.0 0.0 0.000000 0.00000 0.0 0.000000 0.0 0.000000 0.0 0.0
Count segment_coverage by category
100%|██████████| 1109492/1109492 [00:12<00:00, 90815.74it/s]
Cast to Dataframe and plot

HDRR HNI HSOK MIKK HDRR+ HNI+ HSOK+
4-1 609663.162592 20040.168962 5736.836096 79913.350712 39.889065 423.956564 81.508456
4-2 610073.103835 19991.825389 5732.076753 80892.810382 9.349842 431.864216 91.536551
7-1 609353.303767 19866.077308 5615.397733 79685.465794 42.077514 411.864495 86.922931
7-2 612258.415504 19909.144692 5665.971965 79473.104067 44.463598 402.802118 86.748426
11-1 604560.953525 19859.270634 5616.566668 80149.363212 22.508425 422.201635 84.835861
69-1 616057.305559 20027.264295 5780.228834 78901.396100 9.408810 421.641315 82.010793
79-2 609868.894742 19946.486979 5606.039471 78514.011541 31.755308 425.372120 89.296403
80-1 600773.101228 19794.692676 5594.466031 82384.840755 19.265728 424.759800 81.645032
117-2 614147.913644 19912.952634 5765.798169 79320.386110 12.858386 404.320662 89.739488
131-1 595329.888907 47360.641117 5795.387205 74486.756652 10.734707 883.527770 105.134817
134-1 609197.848215 19895.481656 5726.545094 78920.475158 48.078306 417.592867 78.471917
134-2 611554.636139 19891.553753 5760.406823 78389.137447 33.326420 425.909713 84.022790
HDRR_1 HDRR_2 HDRR_3 HDRR_4 HDRR_5 HDRR_6 HDRR_7 HDRR_8 HDRR_9 HDRR_10 HDRR_11 HDRR_12 HDRR_13 HDRR_14 HDRR_15 HDRR_16 HDRR_17 HDRR_18 HDRR_19 HDRR_20 HDRR_21 HDRR_22 HDRR_23 HDRR_24 HDRR_MT
4-1 32813.536762 21326.865539 30034.938134 28091.192012 29673.415335 26154.378380 27158.044430 20317.151923 26923.640415 26882.385927 25117.091584 26934.799105 28054.580298 26144.331885 24570.583824 26044.381099 26013.312317 25052.218088 20185.377185 20025.686450 26768.240079 23610.178388 23220.360559 18546.472872 0.0
4-2 33145.968268 21262.912717 29696.786013 28315.329608 30008.804762 26246.803549 27488.912800 20398.764564 26757.133572 26849.701965 24986.190930 26981.939700 27754.750651 26380.663605 24482.170178 25797.321124 26021.986974 24992.121737 20280.703189 20132.703856 26526.319507 23736.649888 23411.443192 18417.021477 0.0
7-1 33345.832407 20983.124675 30171.925502 28057.712661 29428.252857 25775.922885 27374.691528 20035.263538 27102.995617 27139.371110 24820.518583 27097.226302 27668.422895 25897.342878 24566.167621 25691.204020 26129.747455 24980.462448 20316.253897 20212.259247 26791.478797 23792.944700 23400.782626 18573.399517 0.0
7-2 33298.953969 21191.189940 30429.001636 28340.714860 29677.352143 26203.628247 27386.930418 20331.493078 27017.598734 27350.615743 25126.972337 27235.722481 28003.720248 26018.981198 24658.949553 25779.076499 26242.760666 25055.634086 20269.643865 20061.537826 26877.422358 23834.255829 23362.588104 18503.671689 0.0
11-1 32913.276812 21133.060357 29786.855543 28155.395041 29323.242244 25721.601021 26771.889988 20127.081355 26806.618484 26879.580817 24902.043285 26449.648576 27375.469306 25659.235735 24554.185672 25950.419539 25866.250995 24564.069975 20473.856777 19907.220513 26790.075282 23525.974215 22843.624832 18080.277155 0.0
69-1 33822.480665 21459.234148 30269.616735 28313.554002 29964.844454 26254.683689 27555.709498 20360.078913 27217.437285 27322.066872 25257.846556 27348.464659 28208.027193 26270.236090 24801.432046 26129.130006 26486.787444 24847.950520 20772.658345 20235.714345 27103.916092 23975.369896 23471.395765 18608.670341 0.0
79-2 33111.511631 20835.986606 30101.367678 27927.805749 29823.530026 26149.532127 27353.354423 20507.996179 26921.511257 27053.227234 25018.096638 27058.643543 27791.940031 25957.579803 24749.205393 25687.077624 26257.012907 24684.574639 20330.816309 20210.041995 26507.499030 23974.218704 23269.144305 18587.220902 0.0
80-1 32815.328225 20846.583971 29537.132729 27683.028732 28882.190037 25836.552228 27090.002654 19910.154927 26348.947136 26377.694145 24675.988915 26587.673024 27752.420314 25432.306113 24515.548837 25712.118150 25978.196166 24284.216595 19920.145792 19786.314712 26353.816591 23291.909056 23018.715570 18136.116607 0.0
117-2 33251.993639 21496.959356 29728.764575 28209.559165 29821.928294 26130.088199 27542.974656 20411.873219 27194.396123 27431.384311 25085.227358 27307.860869 28315.737126 26230.754214 24618.070646 26009.967885 26339.110676 25135.173183 20314.529718 20284.635392 27260.884926 24003.749635 23368.069218 18654.221254 0.0
131-1 31146.185074 19628.732941 30332.969029 24972.170909 30686.976068 26826.328555 28191.647234 19819.276545 27931.506063 23955.076033 25418.180791 26955.730016 28729.948312 25903.037328 22792.138904 25540.681332 26046.725282 24114.968601 19968.336735 20681.825875 22897.086828 24693.892621 20366.087799 17730.380024 0.0
134-1 33182.669641 21230.636493 30275.276925 28364.337876 29776.667577 26339.500781 27312.641488 19832.951039 26906.082464 27074.575849 24847.328166 26980.585843 27970.912660 26070.111105 24763.526039 25866.832244 26148.893988 24926.255223 19419.840269 20106.914669 26437.115639 23697.320001 23102.197298 18564.674942 0.0
134-2 33313.335651 21314.526569 29832.021705 28525.173182 30089.165315 26260.472029 27413.326175 19829.346401 26859.820720 27116.197349 24815.427297 27065.495544 27610.867622 26076.101085 24828.957370 25512.018530 26521.202488 24995.652376 20372.003279 20093.124032 27244.839468 23846.969671 23339.331849 18679.260437 0.0
12 sample contig 10 sample contig 8 sample contig 6 sample contig 4 sample contig 2 sample contig 1 sample contig
4-1 27918.098900 23109.661127 13073.209986 11420.824811 11000.480087 11708.787515 8004.647431
4-2 27812.863058 23328.518408 13514.329174 11749.208901 11074.855350 11540.432452 8129.255791
7-1 27837.010911 23512.277487 13312.164744 11582.867052 11053.076426 11599.551272 6810.857882
7-2 27844.843174 23504.237928 13366.848413 11605.735311 10958.356916 11661.065471 6641.147652
11-1 28157.745023 22689.409514 12672.797591 10849.867210 10089.471270 10298.871230 11396.584596
69-1 27948.943010 22661.576528 12688.014820 10903.020064 10266.501372 11036.484028 9717.410325
79-2 27976.333258 22892.969762 12953.544472 11039.718834 10460.726517 10719.938317 8569.730663
80-1 28640.438998 21658.733408 12245.114141 10583.126416 9765.428322 10250.379903 15156.448833
117-2 27836.504273 23142.663818 13061.669118 11337.332535 10641.202654 11017.123570 8469.559480
131-1 28390.211482 18460.134125 11158.237673 9958.986987 9505.701415 10808.792125 40360.118461
134-1 27929.438388 22825.747352 13077.034821 11223.008274 10600.933218 10716.516480 8713.966464
134-2 27920.139520 23367.422111 13171.013597 11639.164216 10972.276994 11047.126081 6467.214427
Write out as bed file per sample
In [15]:
overall_segment_usage (
    segment_usage_fn="segment_usage/assemblies/segment_usage_len_cov_norm.tsv",
    outdir="segment_usage/assemblies",
    min_coverage_dna=0.05,
    min_samples_dna=2)
MIKK bases Non-MIKK bases MIKK segments Non-MIKK segments
HDRR 713609808 20491018 615564 33128
HNI 43204187 60303692 47854 100835
HSOK 49055247 102988085 23881 76394
MIKK 203539620 114635036 161533 50303
All 1009408862 298417831 848832 260660

Raw reads

In [ ]:
outdir = "segment_usage/raw_reads"
shutil.rmtree(outdir, ignore_errors=True)
mkdir (outdir)
In [398]:
segment_usage_stats (
    gaf_src="./alignments/raw_reads/*_unstable.gaf",
    graph_info_pkl="./graph_assembly/all_ref_graph_seg_info.pkl",
    fasta_canonical_ref="./references/Oryzias_latipes_HDRR_clean.fa",
    outdir="segment_usage/raw_reads",
    n_sample_range=[12,10,8,6,4,2,1], 
    min_align_len=100,
    min_mapq=1,
    min_normed_cov=0.05)
Get list of reference chromosome
Parsing reference graph
Parsing gaf files
	Parsing data for sample 80-1
	Parsing data for sample 134-2
	Parsing data for sample 131-1
	Parsing data for sample 7-2
	Parsing data for sample 7-1
	Parsing data for sample 4-1
	Parsing data for sample 69-1
	Parsing data for sample 79-2
	Parsing data for sample 134-1
	Parsing data for sample 117-2
	Parsing data for sample 11-1
	Parsing data for sample 4-2
80-1 134-2 131-1 7-2 7-1 4-1 69-1 79-2 134-1 117-2 11-1 4-2
Multi segment alignments 1713927 3949014 3568223 2650564 2315352 2428416 2241619 2474125 2326952 2887799 1791070 2828853
Single segment alignments 1203364 2828320 2526375 1958093 1643457 1736458 1592651 1798324 1642742 2027456 1234231 2076573
low mapq alignments 83469 194759 171111 127989 111761 119741 107222 122570 114040 143142 83982 139776
short length alignments 29408 81989 70168 51098 47634 49020 39933 44784 45235 57506 33323 56746
Cast coverage data to DataFrame
Merge cov df and info from graph
chromosome length start end type 80-1 134-2 131-1 7-2 7-1 4-1 69-1 79-2 134-1 117-2 11-1 4-2
seg_id
s648599 HDRR+_000617F 624 44100 44724 HDRR+ 1417.0 3164.0 71.0 2021.0 624.0 4290.0 364.0 862.0 1943.0 1322.0 624.0 1248.0
s648600 HDRR+_000617F 286 46919 47205 HDRR+ 391.0 2755.0 0.0 1698.0 1144.0 858.0 2574.0 0.0 277.0 1426.0 2574.0 1535.0
s648601 HDRR+_000617F 597 50034 50631 HDRR+ 2336.0 4968.0 850.0 5306.0 3920.0 2347.0 3819.0 5838.0 3040.0 6548.0 1508.0 5144.0
s648602 HDRR+_000617F 284 52947 53231 HDRR+ 435.0 649.0 123.0 1695.0 1212.0 0.0 1199.0 0.0 0.0 1127.0 0.0 743.0
s648603 HDRR+_000617F 4559 127768 132327 HDRR+ 0.0 0.0 5152.0 435.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
Normalise by length only
chromosome length start end type 80-1 134-2 131-1 7-2 7-1 4-1 69-1 79-2 134-1 117-2 11-1 4-2
seg_id
s648599 HDRR+_000617F 624 44100 44724 HDRR+ 2.270833 5.070513 0.113782 3.238782 1.000000 6.875000 0.583333 1.381410 3.113782 2.118590 1.000000 2.000000
s648600 HDRR+_000617F 286 46919 47205 HDRR+ 1.367133 9.632867 0.000000 5.937063 4.000000 3.000000 9.000000 0.000000 0.968531 4.986014 9.000000 5.367133
s648601 HDRR+_000617F 597 50034 50631 HDRR+ 3.912898 8.321608 1.423786 8.887772 6.566164 3.931323 6.396985 9.778894 5.092127 10.968174 2.525963 8.616415
s648602 HDRR+_000617F 284 52947 53231 HDRR+ 1.531690 2.285211 0.433099 5.968310 4.267606 0.000000 4.221831 0.000000 0.000000 3.968310 0.000000 2.616197
s648603 HDRR+_000617F 4559 127768 132327 HDRR+ 0.000000 0.000000 1.130072 0.095416 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
Normalise by length and total coverage over canonical chromosomes
chromosome length start end type 80-1 134-2 131-1 7-2 7-1 4-1 69-1 79-2 134-1 117-2 11-1 4-2
seg_id
s648599 HDRR+_000617F 624 44100 44724 HDRR+ 0.153746 0.148558 0.003689 0.139817 0.046945 0.318755 0.029822 0.062995 0.146057 0.080340 0.061490 0.076256
s648600 HDRR+_000617F 286 46919 47205 HDRR+ 0.092561 0.282229 0.000000 0.256300 0.187779 0.139093 0.460111 0.000000 0.045431 0.189078 0.553410 0.204637
s648601 HDRR+_000617F 597 50034 50631 HDRR+ 0.264922 0.243811 0.046163 0.383681 0.308246 0.182274 0.327036 0.445939 0.238855 0.415930 0.155322 0.328525
s648602 HDRR+_000617F 284 52947 53231 HDRR+ 0.103703 0.066953 0.014042 0.257649 0.200341 0.000000 0.215834 0.000000 0.000000 0.150485 0.000000 0.099750
s648603 HDRR+_000617F 4559 127768 132327 HDRR+ 0.000000 0.000000 0.036640 0.004119 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
Count segment_coverage by category
100%|██████████| 1109492/1109492 [00:12<00:00, 89119.57it/s]
Cast to Dataframe and plot

HDRR HNI HSOK MIKK HDRR+ HNI+ HSOK+
4-1 560525.985899 19024.345969 6141.524616 73295.278862 257.823662 455.508481 114.032990
4-2 558336.493311 18930.738109 5976.847434 73384.525015 50.559359 457.645561 116.391460
7-1 561589.256943 18978.370973 6050.612122 73361.744794 202.212259 440.541807 107.476702
7-2 558451.922936 18879.533903 5994.690120 73099.266212 181.320316 446.001205 107.274133
11-1 559017.124317 18923.345813 5948.867608 72381.710904 29.871640 453.900862 110.479277
69-1 559605.507096 18863.621798 6088.761037 73672.960079 46.563665 458.011417 128.786389
79-2 555905.609123 18858.330327 5976.993742 71982.711317 71.103713 462.760614 115.740673
80-1 557249.451718 18924.173531 5963.821456 71903.648533 21.105055 442.218302 115.718082
117-2 562917.616875 18879.125612 5957.329726 72681.552542 43.986126 446.250362 108.163285
131-1 550696.499178 44150.383053 6214.525116 64120.572260 53.686188 872.513889 110.396043
134-1 560846.924343 19024.324319 6024.091360 71533.757057 212.420978 464.813031 109.325154
134-2 559395.128680 18943.906816 6080.775393 71696.054601 174.249543 470.310039 120.381414
HDRR_1 HDRR_2 HDRR_3 HDRR_4 HDRR_5 HDRR_6 HDRR_7 HDRR_8 HDRR_9 HDRR_10 HDRR_11 HDRR_12 HDRR_13 HDRR_14 HDRR_15 HDRR_16 HDRR_17 HDRR_18 HDRR_19 HDRR_20 HDRR_21 HDRR_22 HDRR_23 HDRR_24 HDRR_MT
4-1 31451.769018 20514.094459 27057.410983 25806.354602 26340.012077 23787.902270 24518.053345 18249.238161 24779.495995 24753.903542 22643.535234 24684.887681 25258.702474 26825.729942 21958.028483 23782.401227 23549.121632 22740.575960 18688.062093 19019.771560 24053.703077 21484.695058 21898.703496 16654.492659 25.340871
4-2 30395.004627 20482.630346 27096.975813 25467.273297 26373.300716 23535.477455 24833.286546 18374.424017 24971.935036 24386.423333 22576.511051 24494.083189 25338.661213 26825.381607 21784.445503 23455.746365 23208.092667 22804.166301 18727.172993 18799.842086 24125.052604 21747.982131 21758.657351 16752.225378 21.741683
7-1 31146.064803 20455.933407 27405.270277 26006.607203 26575.857406 23640.679181 24868.210293 18491.602414 24776.325337 24629.376171 22805.896846 24714.812551 25469.223327 26245.538699 21933.753770 23769.388550 23671.817545 23016.810275 18375.335420 19158.939335 24204.561597 21313.419820 22062.131608 16828.480317 23.220790
7-2 30481.789603 20829.070892 27471.629182 25634.656550 26305.844886 23550.685024 24352.847365 18383.261055 24610.047300 24652.645890 22543.417493 24701.402536 25362.324312 26244.124669 21812.407875 23715.136096 23711.043952 22712.731683 18408.524454 18679.086181 24138.357565 21360.402056 21896.342868 16871.993727 22.149719
11-1 30313.597382 20628.152967 26996.282435 25534.512226 26451.090269 23479.625110 24522.047648 18456.937042 24497.441377 24578.290678 22436.515618 24493.323534 25288.250028 27398.637833 21936.913368 23458.164565 23623.911554 22829.271538 18621.453296 18963.053504 24339.129356 21527.843779 21784.576179 16834.068922 24.034112
69-1 30193.570430 21035.769961 26946.749602 26124.254657 26466.376872 23579.400829 24352.359810 18425.437097 24733.703752 24402.157573 22619.385359 24394.144889 25241.371281 26608.930326 22067.255593 23573.420043 23487.026041 22920.625460 18950.681797 18922.545502 24371.761201 21786.495989 21784.051062 16595.102721 22.929250
79-2 30185.674829 20438.175671 27003.202399 25673.653642 26206.005844 23340.040816 24268.423256 18343.871844 24703.687956 24282.444622 22405.470497 24785.213928 25288.013322 26488.517533 21990.668798 23357.091393 23380.399822 22507.759979 18657.444528 18898.569171 24286.763413 21394.772824 21293.083619 16702.895587 23.763830
80-1 30291.731694 20550.332529 26952.350359 25971.236184 26386.921474 23501.073532 24535.501815 18317.070247 24584.399769 24363.713515 22497.345035 24501.157682 25123.362344 26871.885204 21757.580104 23118.021116 23599.719193 22691.692038 18474.358830 19266.909098 24386.313151 21355.532845 21582.916071 16547.896351 20.431536
117-2 30499.443648 21000.126237 27358.878004 25780.428922 26572.416478 23836.861160 24704.543081 18639.987438 24811.960703 24673.597395 22708.378708 24745.802758 25638.511681 26896.313684 22163.702196 23731.205596 23674.746825 22869.826448 18816.773545 19245.628482 24543.332679 21889.835670 21401.736937 16697.040812 16.537789
131-1 29401.006673 19843.779626 27258.734860 23453.637831 27171.247802 24406.917250 25087.679491 17715.760969 25534.114891 22332.803866 23111.576837 24715.518046 25932.957456 26401.421543 20154.997270 23765.920304 23567.102464 22087.715800 18853.571545 19543.880761 21273.663798 22395.700554 20191.449632 16471.250022 24.089889
134-1 31232.079658 20294.001345 27504.525495 26080.076660 26386.396781 23673.434693 24369.558831 18544.582804 24858.646431 24421.589658 22432.927312 24508.384878 25418.065511 26568.778372 22119.168307 23584.373328 23528.862735 23000.063770 18744.439494 19078.047515 24207.558998 21655.529885 21979.787185 16632.861273 23.183423
134-2 31046.160922 20798.545114 27197.060251 26101.596925 26203.378994 23687.495434 24321.736291 18388.772000 24572.578398 24308.248967 22403.895451 24383.618289 25201.843979 26942.149921 22032.919237 23628.682940 23434.733819 22729.533473 18819.994042 18995.837365 24072.715156 21451.118835 22016.121731 16633.367326 23.023819
12 sample contig 10 sample contig 8 sample contig 6 sample contig 4 sample contig 2 sample contig 1 sample contig
4-1 54276.520752 19615.649069 9377.055114 6446.212951 4723.875833 3755.439178 1093.761684
4-2 53601.301182 19911.325355 9479.507001 6384.346506 4695.181525 3634.113791 1210.931577
7-1 54184.135557 20043.965228 9218.921087 6282.930924 4732.076136 3603.471618 1075.458106
7-2 53880.648282 20005.312227 9263.930888 6249.842582 4659.674514 3577.650504 1071.026891
11-1 53320.003055 19457.184328 9153.895561 6302.260728 4621.733274 3606.570026 1386.529132
69-1 53829.956831 19673.590454 9326.108656 6510.776399 4739.753195 3802.217200 1376.301649
79-2 53448.662268 19040.418879 9062.996201 6297.926353 4577.885000 3674.845338 1364.906348
80-1 53162.792991 19052.607078 9078.472218 6279.821273 4690.278683 3743.690055 1363.022662
117-2 53587.686479 19587.082816 9230.104387 6285.988426 4541.805046 3490.526227 1393.214272
131-1 50712.847675 16598.501270 8698.715734 6497.111532 5538.115337 8228.242472 19248.542528
134-1 53396.930939 19294.659654 9127.023414 6238.051549 4471.693974 3421.135905 1419.236462
134-2 53420.783130 19705.887916 9152.499393 6294.321945 4490.082858 3308.332024 1113.770540
Write out as bed file per sample
Out[398]:
2
In [16]:
overall_segment_usage (
    segment_usage_fn="segment_usage/raw_reads/segment_usage_len_cov_norm.tsv",
    outdir="segment_usage/raw_reads",
    min_coverage_dna=0.05,
    min_samples_dna=2)
MIKK bases Non-MIKK bases MIKK segments Non-MIKK segments
HDRR 713609808 20491018 615564 33128
HNI 43204187 60303692 47854 100835
HSOK 49055247 102988085 23881 76394
MIKK 203539620 114635036 161533 50303
All 1009408862 298417831 848832 260660
In [3]:
min_coverage_dna=0.05
min_samples_dna=2
segment_usage_fn="segment_usage/raw_reads/segment_usage_len_cov_norm.tsv"
outdir="segment_usage/raw_reads"

data_df = pd.read_csv(segment_usage_fn, sep="\t", index_col=0)
data_df = data_df.drop(columns=["chromosome","length","start","end","type"])
data_df["pass"] = data_df.ge(min_coverage_dna).sum(axis=1)
data_df["valid"] = data_df["pass"]>=2
data_df = data_df[["pass", "valid"]]

graph_info_fn = "./graph_assembly/all_ref_graph_seg_info.pkl"
info_df = pd.read_pickle(graph_info_fn)
info_df = info_df[["component", "length", "type"]]

all_df = pd.merge(info_df, data_df, left_index=True, right_index=True, how="left", validate="1:1")

all_res = OrderedDict()
for chrom, chrom_df in all_df.groupby("component"):
    c = Counter()
    
    for i in chrom_df.itertuples():
        if i.valid:
            seg_type = i.type.rstrip("+")
            c[seg_type] += i.length
    if c:
        all_res[chrom] = c
    
res_df = pd.DataFrame.from_dict(all_res, orient="index")
res_df_normed = res_df.divide(res_df.sum(axis=1), axis=0)*100

with pl.style.context('ggplot'):
    fig, (ax1, ax2) = pl.subplots(2, 1, figsize=(15,10))
    res_df.plot.bar(ax=ax1, stacked=True)
    ax1.legend(bbox_to_anchor=(1, 1), loc=2, facecolor="white", frameon=False)
    ax1.set_title("Total length of segments supported by at least 2 samples")
    ax1.set_xlabel("Chromosomes")
    ax1.set_ylabel("Total length")
    res_df_normed.plot.bar(ax=ax2, stacked=True)
    ax2.legend(bbox_to_anchor=(1, 1), loc=2, facecolor="white", frameon=False)
    ax2.set_title("Percentage of length of segments supported by at least 2 samples")
    ax2.set_xlabel("Chromosomes")
    ax2.set_ylabel("% of length")
    fig.tight_layout()
    out_fn = os.path.join(outdir, "segment_usage_chrom_type_stats.svg")
    fig.savefig(out_fn)

Karyotype view

In [8]:
outdir = "usage_karyotypes"
shutil.rmtree(outdir, ignore_errors=True)
mkdir (outdir)
Creating /hps/research1/birney/users/adrien/indigene/analyses/indigene_nanopore_DNA/graph_genome_analysis/usage_karyotypes

Median for all lines

In [11]:
min_cov = 0.25
n_bins = 250
n_colors = 255
max_count = 150
ref_name = "HDRR"
palette_name = "rocket_r"
segment_usage_fn="segment_usage/raw_reads/segment_usage_len_cov_norm.tsv"

print("Load segments info data")
all_seg_df = pd.read_csv(segment_usage_fn, sep="\t")

for ref_name in ["HDRR", "HNI", "HSOK"]:
    print("Load reference info")
    fasta = f"./references/Oryzias_latipes_{ref_name}_clean.fa"
    chr_len = OrderedDict()
    with pyfaidx.Fasta(fasta) as fa:
        chr_len = {i.name:len(i) for i in fa}
    longest = max(chr_len.values())

    chr_len_norm = OrderedDict()
    for cname, clen in chr_len.items():
        norm_len = clen/longest
        if norm_len >= 0.1:
            chr_len_norm[cname] = norm_len
    valid_chr = list(chr_len_norm.keys())

    print("Cleanup data")
    seg_df = all_seg_df[all_seg_df["chromosome"].isin(valid_chr)].copy()
    data_df = seg_df[["80-1","134-2","131-1","7-2","7-1","4-1","69-1","79-2","134-1","117-2","11-1","4-2"]]
    seg_df["cov"] = data_df.apply(np.median, axis=1)
    sdf = seg_df[["chromosome", "length", "start", "end", "cov"]].copy()
    sdf = sdf[sdf["cov"]>=min_cov]
    sdf["pos"] = sdf["start"] + (sdf["end"]-sdf["start"])//2
    sdf["index"] = np.digitize(sdf["pos"], np.linspace(0, longest, n_bins))-1

    print("Compute coverage by genomic bin")
    all_count_d = defaultdict(Counter)
    for i in sdf.itertuples():
        all_count_d[i.chromosome][i.index]+=i.cov

    print("Plot results")
    count_bins = np.linspace(0, max_count, n_colors)
    pal = sns.color_palette(palette_name, n_colors=n_colors)
    with pl.style.context('seaborn-white'):
        fig, axes = pl.subplots(1, len(chr_len_norm), figsize=(40 ,10), sharey=True)
        for ax, cname in zip(axes, valid_chr):
            clen = chr_len_norm[cname]
            ax.set_title(cname, y=-0.05)
            ax.axis("off")
            ax.add_patch(mpl.patches.Rectangle((0.25, 0), 0.5, clen, fc="0.9"))
            counts = all_count_d[cname]
            y_list = np.divide(list(counts.keys()), n_bins)
            v_list = gaussian_filter1d(list(counts.values()), sigma=0.5)
            c_list = [pal[i-1] for i in np.digitize(v_list, count_bins)]
            for y, c in zip(y_list, c_list):
                ax.add_patch(mpl.patches.Rectangle((0.25, y), 0.5, 1/n_bins, fc=c))

        cb_ax = fig.add_axes([0.93, 0.22, 0.01, 0.5])
        cb = mpl.colorbar.ColorbarBase(ax=cb_ax, cmap=mpl.cm.get_cmap(palette_name), norm=mpl.colors.Normalize(vmin=0, vmax=max_count), orientation='vertical', ticklocation="left", label="Normalised coverage")

        fig.suptitle(f"Karyotype of median normalised coverage for reference {ref_name}", fontsize=20)
        fig.savefig(os.path.join(outdir, f"Karyotype_{ref_name}_all_median.svg"), bbox_inches='tight')
Load segments info data
Load reference info
Cleanup data
Compute coverage by genomic bin
Plot results
Load reference info
Cleanup data
Compute coverage by genomic bin
Plot results
Load reference info
Cleanup data
Compute coverage by genomic bin
Plot results

Individual lines

In [14]:
min_cov = 0.25
n_bins = 250
n_colors = 255
max_count = 150
ref_name = "HDRR"
palette_name = "rocket_r"
segment_usage_fn="segment_usage/raw_reads/segment_usage_len_cov_norm.tsv"

print("Load segments info data")
all_seg_df = pd.read_csv(segment_usage_fn, sep="\t")

for ref_name in ["HDRR", "HNI", "HSOK"]:
    print (f"Processing {ref_name}")
    print("Load reference info")
    fasta = f"./references/Oryzias_latipes_{ref_name}_clean.fa"
    chr_len = OrderedDict()
    with pyfaidx.Fasta(fasta) as fa:
        chr_len = {i.name:len(i) for i in fa}
    longest = max(chr_len.values())

    chr_len_norm = OrderedDict()
    for cname, clen in chr_len.items():
        norm_len = clen/longest
        if norm_len >= 0.1:
            chr_len_norm[cname] = norm_len
    valid_chr = list(chr_len_norm.keys())

    print("Cleanup data")
    seg_df = all_seg_df[all_seg_df["chromosome"].isin(valid_chr)].copy()

    for sample_id in ["80-1","134-2","131-1","7-2","7-1","4-1","69-1","79-2","134-1","117-2","11-1","4-2"]:
        print (f"Processing sample {sample_id}")
        sdf = seg_df[["chromosome", "length", "start", "end", sample_id]].copy()
        sdf = sdf.rename(columns={sample_id:"cov"})
        sdf = sdf[sdf["cov"]>=min_cov]    
        sdf["pos"] = sdf["start"] + (sdf["end"]-sdf["start"])//2
        bins = np.linspace(0, longest, n_bins)
        sdf["index"] = np.digitize(sdf["pos"], bins)-1

        print("Compute coverage by genomic bin")
        all_count_d = defaultdict(Counter)
        for i in sdf.itertuples():
            all_count_d[i.chromosome][i.index]+=i.cov

        print("Plot results")
        count_bins = np.linspace(0, max_count, n_colors)
        pal = sns.color_palette(palette_name, n_colors=n_colors)
        with pl.style.context('seaborn-white'):
            fig, axes = pl.subplots(1, len(chr_len_norm), figsize=(40 ,10), sharey=True)
            for ax, cname in zip(axes, valid_chr):
                clen = chr_len_norm[cname]
                ax.set_title(cname, y=-0.05)
                ax.axis("off")
                ax.add_patch(mpl.patches.Rectangle((0.25, 0), 0.5, clen, fc="0.9"))
                counts = all_count_d[cname]
                y_list = np.divide(list(counts.keys()), n_bins)
                v_list = gaussian_filter1d(list(counts.values()), sigma=0.5)
                c_list = [pal[i-1] for i in np.digitize(v_list, count_bins)]
                for y, c in zip(y_list, c_list):
                    ax.add_patch(mpl.patches.Rectangle((0.25, y), 0.5, 1/n_bins, fc=c))

            cb_ax = fig.add_axes([0.93, 0.22, 0.01, 0.5])
            cb = mpl.colorbar.ColorbarBase(ax=cb_ax, cmap=mpl.cm.get_cmap(palette_name), norm=mpl.colors.Normalize(vmin=0, vmax=max_count), orientation='vertical', ticklocation="left", label="Normalised coverage")

            fig.suptitle(f"Karyotype of normalised coverage for reference {ref_name} with line {sample_id}", fontsize=20)
            fig.savefig(os.path.join(outdir, f"Karyotype_{ref_name}_{sample_id}.svg"), bbox_inches='tight')
            pl.show()
Load segments info data
Processing HDRR
Load reference info
Cleanup data
Processing sample 80-1
Compute coverage by genomic bin
Plot results
Processing sample 134-2
Compute coverage by genomic bin
Plot results
Processing sample 131-1
Compute coverage by genomic bin
Plot results
Processing sample 7-2
Compute coverage by genomic bin
Plot results
Processing sample 7-1
Compute coverage by genomic bin
Plot results
Processing sample 4-1
Compute coverage by genomic bin
Plot results
Processing sample 69-1
Compute coverage by genomic bin
Plot results
Processing sample 79-2
Compute coverage by genomic bin
Plot results
Processing sample 134-1
Compute coverage by genomic bin
Plot results
Processing sample 117-2
Compute coverage by genomic bin
Plot results
Processing sample 11-1
Compute coverage by genomic bin
Plot results
Processing sample 4-2
Compute coverage by genomic bin
Plot results
Processing HNI
Load reference info
Cleanup data
Processing sample 80-1
Compute coverage by genomic bin
Plot results
Processing sample 134-2
Compute coverage by genomic bin
Plot results
Processing sample 131-1
Compute coverage by genomic bin
Plot results
Processing sample 7-2
Compute coverage by genomic bin
Plot results
Processing sample 7-1
Compute coverage by genomic bin
Plot results
Processing sample 4-1
Compute coverage by genomic bin
Plot results
Processing sample 69-1
Compute coverage by genomic bin
Plot results
Processing sample 79-2
Compute coverage by genomic bin
Plot results
Processing sample 134-1
Compute coverage by genomic bin
Plot results
Processing sample 117-2
Compute coverage by genomic bin
Plot results
Processing sample 11-1
Compute coverage by genomic bin
Plot results
Processing sample 4-2
Compute coverage by genomic bin
Plot results
Processing HSOK
Load reference info
Cleanup data
Processing sample 80-1
Compute coverage by genomic bin
Plot results
Processing sample 134-2
Compute coverage by genomic bin
Plot results
Processing sample 131-1
Compute coverage by genomic bin
Plot results
Processing sample 7-2
Compute coverage by genomic bin
Plot results
Processing sample 7-1
Compute coverage by genomic bin
Plot results
Processing sample 4-1
Compute coverage by genomic bin
Plot results
Processing sample 69-1
Compute coverage by genomic bin
Plot results
Processing sample 79-2
Compute coverage by genomic bin
Plot results
Processing sample 134-1
Compute coverage by genomic bin
Plot results
Processing sample 117-2
Compute coverage by genomic bin
Plot results
Processing sample 11-1
Compute coverage by genomic bin
Plot results
Processing sample 4-2
Compute coverage by genomic bin
Plot results

RNA-Seq

In [ ]:
outdir = "segment_usage/rna_seq"
shutil.rmtree(outdir, ignore_errors=True)
mkdir (outdir)
In [399]:
segment_usage_stats (
    gaf_src="./alignments/rna_seq/*_unstable.gaf",
    graph_info_pkl="./graph_assembly/all_ref_graph_info.pkl",
    fasta_canonical_ref="./references/Oryzias_latipes_HDRR_clean.fa",
    outdir="segment_usage/rna_seq",
    n_sample_range=[50,45,40,35,30,25,20,15,10,5,1],
    min_align_len=50,
    min_mapq=1,
    min_normed_cov=0.05)
Creating /hps/research1/birney/users/adrien/analyses/medaka_assembly_graph/graph_explore/segment_usage2/rna_seq
Get list of reference chromosome
Parsing reference graph
Parsing gaf files
	Parsing data for sample 104-1
	Parsing data for sample 106-1
	Parsing data for sample 106-2
	Parsing data for sample 10-1
	Parsing data for sample 117-2
	Parsing data for sample 11-2
	Parsing data for sample 11-1
	Parsing data for sample 125-1
	Parsing data for sample 129-1
	Parsing data for sample 132-4-1
	Parsing data for sample 132-5
	Parsing data for sample 133-2
	Parsing data for sample 134-1
	Parsing data for sample 135-2
	Parsing data for sample 137-4
	Parsing data for sample 139-4
	Parsing data for sample 13-2
	Parsing data for sample 140-1
	Parsing data for sample 140-3
	Parsing data for sample 141-3
	Parsing data for sample 14-1
	Parsing data for sample 15-1
	Parsing data for sample 17-1
	Parsing data for sample 20-1
	Parsing data for sample 21-2
	Parsing data for sample 23-1
	Parsing data for sample 30-1
	Parsing data for sample 32-2
	Parsing data for sample 39-1
	Parsing data for sample 40-1
	Parsing data for sample 40-2
	Parsing data for sample 49-1
	Parsing data for sample 4-1
	Parsing data for sample 4-2
	Parsing data for sample 50-2
	Parsing data for sample 55-2
	Parsing data for sample 59-1
	Parsing data for sample 59-2
	Parsing data for sample 61-1
	Parsing data for sample 62-2
	Parsing data for sample 69-1
	Parsing data for sample 71-1
	Parsing data for sample 72-2
	Parsing data for sample 79-2
	Parsing data for sample 80-1
	Parsing data for sample 84-2
	Parsing data for sample 8-2
	Parsing data for sample 91-1
	Parsing data for sample 94-1
	Parsing data for sample 95-1
104-1 106-1 106-2 10-1 117-2 11-2 11-1 125-1 129-1 132-4-1 132-5 133-2 134-1 135-2 137-4 139-4 13-2 140-1 140-3 141-3 14-1 15-1 17-1 20-1 21-2 23-1 30-1 32-2 39-1 40-1 40-2 49-1 4-1 4-2 50-2 55-2 59-1 59-2 61-1 62-2 69-1 71-1 72-2 79-2 80-1 84-2 8-2 91-1 94-1 95-1
Single segment alignments 16344392 18679954 17499403 15708910 17658338 18569648 18432769 16066082 17358795 16526814 20972865 20502791 19641356 20898045 18422811 16647817 20624785 17804875 19456368 21451404 17785974 17222061 21935636 19179613 20929828 21049259 21422542 19228327 17523100 18036807 23175863 20077568 16643678 18449604 25996647 16284867 19512755 21994407 21384676 17703094 20066505 17011665 15738179 17454445 18684958 19228297 18634433 18297821 21894394 22246833
short length alignments 809784 936924 879277 795111 867879 938645 932425 814047 852912 896575 1117169 1131041 1013802 1091001 928212 841063 1050361 1002903 983939 1141176 907084 923779 1063188 1023535 1103144 1052532 1253858 1038738 948128 991648 1294328 1002074 812739 909527 1368062 845814 957274 1107550 1203246 885376 1113152 852984 799755 868263 927807 925763 955482 998850 1054348 1118542
low mapq alignments 922715 1016640 1016341 853791 994533 997878 1053867 783143 854500 585329 981229 788149 1058399 919315 1006955 906874 955007 914461 1079495 1334313 952040 988049 1197369 1009852 1029802 1025398 1161770 1097060 969304 761070 1021840 1196008 920902 1007340 1427275 772390 972554 1227343 937026 908972 932389 896219 858898 1021912 1063006 977745 936903 719898 1017361 1191968
Multi segment alignments 746626 813170 881190 679245 819843 835662 851603 679744 687793 599431 769167 687788 850784 764241 820646 698514 699301 637280 865127 904663 703005 726791 979991 829006 852913 798617 799400 869673 743973 598433 780089 953401 747010 833737 959764 701650 807915 951447 698750 722345 663873 739870 670705 883675 864746 773674 741071 599416 869934 1003868
Cast coverage data to DataFrame
Merge cov df and info from graph
chromosome length start end type 104-1 106-1 106-2 10-1 117-2 11-2 11-1 125-1 129-1 132-4-1 132-5 133-2 134-1 135-2 137-4 139-4 13-2 140-1 140-3 141-3 14-1 15-1 17-1 20-1 21-2 23-1 30-1 32-2 39-1 40-1 40-2 49-1 4-1 4-2 50-2 55-2 59-1 59-2 61-1 62-2 69-1 71-1 72-2 79-2 80-1 84-2 8-2 91-1 94-1 95-1
seg_id
s648599 HDRR+_000617F 624 44100 44724 HDRR+ 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 142.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 167.0 186.0 0.0 0.0 0.0 167.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 196.0 0.0 0.0 0.0 0.0 0.0 0.0
s648600 HDRR+_000617F 286 46919 47205 HDRR+ 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 342.0 0.0 86.0 108.0 120.0 0.0 0.0 0.0 120.0 108.0 0.0 0.0 0.0 0.0 0.0 228.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
s648601 HDRR+_000617F 597 50034 50631 HDRR+ 176.0 0.0 88.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 256.0 0.0 0.0 88.0 0.0 70.0 188.0 0.0 0.0 0.0 0.0 188.0 0.0 0.0 0.0 340.0 0.0 0.0 0.0 0.0 0.0 125.0 0.0 138.0 0.0 0.0 78.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
s648602 HDRR+_000617F 284 52947 53231 HDRR+ 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
s648603 HDRR+_000617F 4559 127768 132327 HDRR+ 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
Normalise by length only
chromosome length start end type 104-1 106-1 106-2 10-1 117-2 11-2 11-1 125-1 129-1 132-4-1 132-5 133-2 134-1 135-2 137-4 139-4 13-2 140-1 140-3 141-3 14-1 15-1 17-1 20-1 21-2 23-1 30-1 32-2 39-1 40-1 40-2 49-1 4-1 4-2 50-2 55-2 59-1 59-2 61-1 62-2 69-1 71-1 72-2 79-2 80-1 84-2 8-2 91-1 94-1 95-1
seg_id
s648599 HDRR+_000617F 624 44100 44724 HDRR+ 0.000000 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 0.0 0.0 0.227564 0.0 0.000000 0.000000 0.00000 0.0 0.0 0.0 0.000000 0.000000 0.267628 0.298077 0.000000 0.0 0.0 0.267628 0.0 0.0 0.00000 0.0 0.000000 0.0 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.314103 0.0 0.0 0.0 0.0 0.0 0.0
s648600 HDRR+_000617F 286 46919 47205 HDRR+ 0.000000 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 0.0 0.0 1.195804 0.0 0.300699 0.377622 0.41958 0.0 0.0 0.0 0.419580 0.377622 0.000000 0.000000 0.000000 0.0 0.0 0.797203 0.0 0.0 0.00000 0.0 0.000000 0.0 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 0.0
s648601 HDRR+_000617F 597 50034 50631 HDRR+ 0.294807 0.0 0.147404 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.428811 0.0 0.0 0.147404 0.0 0.117253 0.314908 0.00000 0.0 0.0 0.0 0.314908 0.000000 0.000000 0.000000 0.569514 0.0 0.0 0.000000 0.0 0.0 0.20938 0.0 0.231156 0.0 0.0 0.130653 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 0.0
s648602 HDRR+_000617F 284 52947 53231 HDRR+ 0.000000 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 0.0 0.0 0.000000 0.0 0.000000 0.000000 0.00000 0.0 0.0 0.0 0.000000 0.000000 0.000000 0.000000 0.000000 0.0 0.0 0.000000 0.0 0.0 0.00000 0.0 0.000000 0.0 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 0.0
s648603 HDRR+_000617F 4559 127768 132327 HDRR+ 0.000000 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 0.0 0.0 0.000000 0.0 0.000000 0.000000 0.00000 0.0 0.0 0.0 0.000000 0.000000 0.000000 0.000000 0.000000 0.0 0.0 0.000000 0.0 0.0 0.00000 0.0 0.000000 0.0 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 0.0
Normalise by length and total coverage over canonical chromosomes
chromosome length start end type 104-1 106-1 106-2 10-1 117-2 11-2 11-1 125-1 129-1 132-4-1 132-5 133-2 134-1 135-2 137-4 139-4 13-2 140-1 140-3 141-3 14-1 15-1 17-1 20-1 21-2 23-1 30-1 32-2 39-1 40-1 40-2 49-1 4-1 4-2 50-2 55-2 59-1 59-2 61-1 62-2 69-1 71-1 72-2 79-2 80-1 84-2 8-2 91-1 94-1 95-1
seg_id
s648599 HDRR+_000617F 624 44100 44724 HDRR+ 0.000000 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 0.0 0.0 0.044461 0.0 0.000000 0.000000 0.000000 0.0 0.0 0.0 0.000000 0.000000 0.055107 0.05831 0.000000 0.0 0.0 0.061401 0.0 0.0 0.000000 0.0 0.000000 0.0 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.066301 0.0 0.0 0.0 0.0 0.0 0.0
s648600 HDRR+_000617F 286 46919 47205 HDRR+ 0.000000 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 0.0 0.0 0.233633 0.0 0.073147 0.073488 0.099385 0.0 0.0 0.0 0.100714 0.066434 0.000000 0.00000 0.000000 0.0 0.0 0.182898 0.0 0.0 0.000000 0.0 0.000000 0.0 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 0.0
s648601 HDRR+_000617F 597 50034 50631 HDRR+ 0.072603 0.0 0.031143 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.087737 0.0 0.0 0.028799 0.0 0.028523 0.061283 0.000000 0.0 0.0 0.0 0.075589 0.000000 0.000000 0.00000 0.112892 0.0 0.0 0.000000 0.0 0.0 0.039038 0.0 0.050868 0.0 0.0 0.027411 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 0.0
s648602 HDRR+_000617F 284 52947 53231 HDRR+ 0.000000 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 0.0 0.0 0.000000 0.0 0.000000 0.000000 0.000000 0.0 0.0 0.0 0.000000 0.000000 0.000000 0.00000 0.000000 0.0 0.0 0.000000 0.0 0.0 0.000000 0.0 0.000000 0.0 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 0.0
s648603 HDRR+_000617F 4559 127768 132327 HDRR+ 0.000000 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 0.0 0.0 0.000000 0.0 0.000000 0.000000 0.000000 0.0 0.0 0.0 0.000000 0.000000 0.000000 0.00000 0.000000 0.0 0.0 0.000000 0.0 0.0 0.000000 0.0 0.000000 0.0 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 0.0
Count segment_coverage by category
100%|██████████| 1109492/1109492 [00:31<00:00, 34955.53it/s]
Cast to Dataframe and plot

HDRR HNI HSOK MIKK HDRR+ HNI+ HSOK+
4-1 350757.347347 13679.540073 5114.985132 8521.628494 0.519762 33.722235 4.197783
4-2 334822.616949 13059.157617 5633.643348 24447.519360 39.097602 33.081872 4.892076
7-1 NaN NaN NaN NaN NaN NaN NaN
7-2 NaN NaN NaN NaN NaN NaN NaN
11-1 339588.035448 12561.071282 8493.107702 26193.498595 21.132744 32.964758 5.049522
69-1 300248.690122 8624.324560 1766.442827 15143.416778 1.789487 44.713096 5.263329
79-2 359901.483493 13693.292837 916.349531 6137.611568 6.258409 38.034037 3.163344
80-1 337180.546476 12341.162629 1312.415072 13597.891975 0.884612 24.743665 3.176274
117-2 340388.846813 10756.270400 6765.587918 6031.666022 1.295919 33.411074 5.012776
131-1 NaN NaN NaN NaN NaN NaN NaN
134-1 334686.921743 12666.399135 3944.880036 16716.113217 8.000796 43.601183 3.993611
134-2 NaN NaN NaN NaN NaN NaN NaN
HDRR_1 HDRR_2 HDRR_3 HDRR_4 HDRR_5 HDRR_6 HDRR_7 HDRR_8 HDRR_9 HDRR_10 HDRR_11 HDRR_12 HDRR_13 HDRR_14 HDRR_15 HDRR_16 HDRR_17 HDRR_18 HDRR_19 HDRR_20 HDRR_21 HDRR_22 HDRR_23 HDRR_24 HDRR_MT
4-1 9425.947992 2968.600962 4466.096803 151590.139478 7401.959610 60780.859671 6444.729268 6691.537240 8378.966511 9670.549815 6184.253568 6314.725109 8677.232265 5101.835293 4407.076676 6625.843355 10941.889005 4304.648589 4401.583054 5293.372817 3684.912561 6339.396737 4801.819279 5518.794873 340.576816
4-2 8781.631798 3463.120934 5170.732592 126870.100631 8743.605351 65576.852207 7052.675375 8641.107758 8684.191823 9234.793505 6289.022737 6811.680857 8627.636566 5213.824511 4594.214365 5903.020280 8006.694556 5165.013299 4689.864050 5287.739296 4001.724951 6974.121882 5366.049648 5286.875589 386.322390
7-1 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
7-2 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
11-1 9485.125491 3071.630751 4522.663459 150662.205431 7496.335192 55692.459149 6892.307832 6632.503770 7148.302567 7272.905761 5932.203987 5924.247691 8455.682502 5096.061065 4169.405423 5975.418820 10695.146196 5393.746388 4452.477901 4191.709977 3986.669179 5963.292521 4658.679714 5550.912100 265.942581
69-1 19295.145791 5568.995773 6552.140549 37070.507089 9190.716587 14571.146314 8166.267265 9045.324241 12633.660261 12566.594173 11594.450209 9213.744217 16302.059617 8946.487997 8045.158829 13485.778092 31973.350189 10948.628935 7858.560938 5441.573615 9718.119552 9984.055273 7083.144928 14812.853045 180.226643
79-2 7976.888115 2755.508279 4107.254899 174728.123233 7271.496433 66592.982985 5243.180298 6530.048662 6419.539884 7655.731268 5777.389372 4908.002101 6739.851440 4572.465786 3677.283488 5103.725629 8087.248694 4649.562157 4117.412303 3973.422960 3938.871521 5470.665675 4250.148521 4966.030372 388.649419
80-1 7023.782282 2487.678175 3593.902123 158710.008466 6136.463611 68806.019698 5471.904237 5993.891413 6331.201751 6775.942352 4948.436261 5129.384296 7042.889788 4107.483957 3641.028456 5052.739504 7009.532495 4136.950640 3964.658825 4100.760646 3169.822670 4895.227107 4049.912012 4355.739523 245.186190
117-2 8114.210577 2833.789648 4250.313498 168053.376868 6636.547693 52855.809594 6677.508155 5649.522960 7078.831037 7407.752109 5129.173951 5604.222034 7732.534105 4449.851061 3506.234340 6017.976709 7907.532311 4101.980968 3906.637407 4624.666222 3163.465889 5413.712687 4045.405356 4933.256557 294.535076
131-1 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
134-1 12312.476473 3985.157303 5439.973528 107677.516130 8093.860578 44713.369943 10257.296676 7949.554700 9712.869465 8774.301383 8218.057422 6635.763691 10001.749076 7213.531563 7450.656224 9323.898300 17989.355105 6531.676934 6128.127434 5120.263136 5746.805834 8172.227960 5144.171619 11466.361759 627.899504
134-2 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
50 sample contig 45 sample contig 40 sample contig 35 sample contig 30 sample contig 25 sample contig 20 sample contig 15 sample contig 10 sample contig 5 sample contig 1 sample contig
4-1 21214.409678 3744.593952 460.881130 363.018513 249.200774 226.619629 239.323416 219.527645 188.140514 210.022322 238.855906
4-2 28464.108375 10013.622677 1613.027329 547.154141 407.690189 394.216686 311.924974 316.434677 339.437433 372.505008 437.270386
7-1 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
7-2 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
11-1 33066.443643 10316.787425 1764.519531 333.641008 279.966957 246.098242 263.002869 224.909473 225.356410 277.599696 308.499349
69-1 15730.098397 5115.199766 1077.796611 520.108621 625.422009 558.900197 336.267908 479.763794 297.191948 387.498076 457.702750
79-2 16343.850702 1986.486641 475.170177 359.865497 301.227331 183.952726 193.291325 238.888771 225.013016 220.127356 266.836185
80-1 20334.713858 4569.951475 891.653088 218.197789 221.297341 166.857303 170.705055 143.065141 162.716164 196.032428 205.084585
117-2 19884.787698 1437.086773 338.903977 313.401102 207.173768 243.756142 185.236995 175.745512 288.358217 223.005135 295.788790
131-1 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
134-1 22646.425832 5760.473287 1111.035766 481.218131 481.122386 440.164041 324.374137 364.499675 342.621927 333.637708 1097.415089
134-2 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
Write out as bed file per sample
In [17]:
overall_segment_usage (
    segment_usage_fn="segment_usage/rna_seq/segment_usage_len_cov_norm.tsv",
    outdir="segment_usage/rna_seq",
    min_coverage_dna=0.05,
    min_samples_dna=2)
MIKK bases Non-MIKK bases MIKK segments Non-MIKK segments
HDRR 713609808 20491018 615564 33128
HNI 43204187 60303692 47854 100835
HSOK 49055247 102988085 23881 76394
MIKK 203539620 114635036 161533 50303
All 1009408862 298417831 848832 260660

Generate label CSV file for bandage and bedgraph file for IGV

Define parsing function

In [124]:
def usage_annotation(segment_usage_fn, outdir, thresholds = [90, 75, 50, 25, 10]):

    segment_usage_df = pd.read_csv(segment_usage_fn, sep="\t", index_col=0)
    segment_usage_df_data = segment_usage_df.drop(columns=["chromosome","length","start","end","type","identity"])

    coord = segment_usage_df["chromosome"] + ":" + segment_usage_df["start"].astype(str) + "-" + segment_usage_df["end"].astype(str)
    segment_usage_df["coord"] = coord

    for threshold in thresholds:
        segment_usage_df[threshold] = segment_usage_df_data.ge(threshold/100).sum(axis=1)
    display(segment_usage_df.head())

    # Define colormap for bandage
    colors = [rgb2hex(i) for i in sns.color_palette('afmhot_r', len(segment_usage_df_data.columns)+1)]

    for threshold in thresholds:
        print(f"Write file for threshold {threshold}")

        # Formating for bedgraph
        bedgraph_df = segment_usage_df[["chromosome","start","end", threshold]].copy()
        bedgraph_out_fn = f"{outdir}/seg_usage_{threshold}.bedgraph"
        with open (bedgraph_out_fn, "w") as fp:
            fp.write("track type=bedGraph name=samples_pass\n")
        bedgraph_df.to_csv(bedgraph_out_fn, sep="\t", mode="a", index=False, header=False)

        # Formating for bandage
        bandage_df = segment_usage_df[[threshold,"coord","length","type","identity"]].copy()
        bandage_df.index.name = "name"
        bandage_df = bandage_df.rename(columns={threshold:"sample_pass"})
        seg_col = []
        for seg_id, n in bandage_df["sample_pass"].items():
            seg_col.append(colors[n])
        bandage_df["color"] = seg_col
        bandage_out_fn = f"{outdir}/seg_usage_{threshold}.csv"
        bandage_df.to_csv(bandage_out_fn, sep=",")

Raw reads

In [127]:
outdir = "segment_usage/raw_reads_annot/"
shutil.rmtree(outdir, ignore_errors=True)
mkdir (outdir)

usage_annotation (
    segment_usage_fn = "segment_usage/raw_reads/segment_usage_len_cov_norm.tsv",
    outdir = outdir,
    thresholds = [50, 33, 25, 10])
Creating /hps/research1/birney/users/adrien/analyses/medaka_assembly_graph/graph_explore/segment_usage2/raw_reads_annot
chromosome length start end type identity 80-1_H3 134-2_A5 131-1_F4 7-2_F2 7-1_E2 4-1_B2 69-1_F3 79-2_G3 134-1_H4 117-2_C4 11-1_A3 4-2_B2 coord 50 33 25 10
seg_id
s648599 HDRR+_000617F 624 44100 44724 HDRR+ 0.949126 0.153746 0.148558 0.003689 0.139817 0.046945 0.318755 0.029822 0.062995 0.146057 0.080340 0.061490 0.076256 HDRR+_000617F:44100-44724 0 0 1 5
s648600 HDRR+_000617F 286 46919 47205 HDRR+ NaN 0.092561 0.282229 0.000000 0.256300 0.187779 0.139093 0.460111 0.000000 0.045431 0.189078 0.553410 0.204637 HDRR+_000617F:46919-47205 1 2 4 8
s648601 HDRR+_000617F 597 50034 50631 HDRR+ 0.973289 0.264922 0.243811 0.046163 0.383681 0.308246 0.182274 0.327036 0.445939 0.238855 0.415930 0.155322 0.328525 HDRR+_000617F:50034-50631 0 3 7 11
s648602 HDRR+_000617F 284 52947 53231 HDRR+ NaN 0.103703 0.066953 0.014042 0.257649 0.200341 0.000000 0.215834 0.000000 0.000000 0.150485 0.000000 0.099750 HDRR+_000617F:52947-53231 0 0 1 5
s648603 HDRR+_000617F 4559 127768 132327 HDRR+ 0.997368 0.000000 0.000000 0.036640 0.004119 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 HDRR+_000617F:127768-132327 0 0 0 0
Write file for threshold 50
Write file for threshold 33
Write file for threshold 25
Write file for threshold 10

RNA-Seq

In [128]:
outdir = "segment_usage/rna_seq_annot/"
shutil.rmtree(outdir, ignore_errors=True)
mkdir (outdir)

usage_annotation (
    segment_usage_fn = "segment_usage/rna_seq/segment_usage_len_cov_norm.tsv",
    outdir = outdir,
    thresholds = [50, 33, 25, 10])
Creating /hps/research1/birney/users/adrien/analyses/medaka_assembly_graph/graph_explore/segment_usage2/rna_seq_annot
chromosome length start end type identity 104-1 106-1 106-2 10-1 117-2 11-2 11-1 125-1 129-1 132-4-1 132-5 133-2 134-1 135-2 137-4 139-4 13-2 140-1 140-3 141-3 14-1 15-1 17-1 20-1 21-2 23-1 30-1 32-2 39-1 40-1 40-2 49-1 4-1 4-2 50-2 55-2 59-1 59-2 61-1 62-2 69-1 71-1 72-2 79-2 80-1 84-2 8-2 91-1 94-1 95-1 coord 50 33 25 10
seg_id
s648599 HDRR+_000617F 624 44100 44724 HDRR+ 0.949126 0.000000 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 0.0 0.0 0.044461 0.0 0.000000 0.000000 0.000000 0.0 0.0 0.0 0.000000 0.000000 0.055107 0.05831 0.000000 0.0 0.0 0.061401 0.0 0.0 0.000000 0.0 0.000000 0.0 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.066301 0.0 0.0 0.0 0.0 0.0 0.0 HDRR+_000617F:44100-44724 0 0 0 0
s648600 HDRR+_000617F 286 46919 47205 HDRR+ 1.000000 0.000000 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 0.0 0.0 0.233633 0.0 0.073147 0.073488 0.099385 0.0 0.0 0.0 0.100714 0.066434 0.000000 0.00000 0.000000 0.0 0.0 0.182898 0.0 0.0 0.000000 0.0 0.000000 0.0 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 HDRR+_000617F:46919-47205 0 0 0 3
s648601 HDRR+_000617F 597 50034 50631 HDRR+ 0.973289 0.072603 0.0 0.031143 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.087737 0.0 0.0 0.028799 0.0 0.028523 0.061283 0.000000 0.0 0.0 0.0 0.075589 0.000000 0.000000 0.00000 0.112892 0.0 0.0 0.000000 0.0 0.0 0.039038 0.0 0.050868 0.0 0.0 0.027411 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 HDRR+_000617F:50034-50631 0 0 0 1
s648602 HDRR+_000617F 284 52947 53231 HDRR+ 0.993007 0.000000 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 0.0 0.0 0.000000 0.0 0.000000 0.000000 0.000000 0.0 0.0 0.0 0.000000 0.000000 0.000000 0.00000 0.000000 0.0 0.0 0.000000 0.0 0.0 0.000000 0.0 0.000000 0.0 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 HDRR+_000617F:52947-53231 0 0 0 0
s648603 HDRR+_000617F 4559 127768 132327 HDRR+ 0.997368 0.000000 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 0.0 0.0 0.000000 0.0 0.000000 0.000000 0.000000 0.0 0.0 0.0 0.000000 0.000000 0.000000 0.00000 0.000000 0.0 0.0 0.000000 0.0 0.0 0.000000 0.0 0.000000 0.0 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 HDRR+_000617F:127768-132327 0 0 0 0
Write file for threshold 50
Write file for threshold 33
Write file for threshold 25
Write file for threshold 10

Link Usage analysis

In [ ]:
outdir = "link_usage"
shutil.rmtree(outdir, ignore_errors=True)
mkdir (outdir)

Define parsing function

In [102]:
def sort_seg (s1, s1_orient, s2, s2_orient):
    """"""
    s1_num = int(s1[1:])
    s2_num = int(s2[1:])
    if s1_num > s2_num:
        inv_dict = {"+":"-", "-":"+"}
        s1, s2 = s2, s1
        s1_orient, s2_orient = inv_dict[s2_orient], inv_dict[s1_orient]
    
    link_id = "{}{}_{}{}".format(s1, s1_orient, s2, s2_orient)
    return link_id

def split_path (path):
    l = []
    seg_id = ""
    strand = ""
    for c in path:
        if c in ["<", ">"]:
            if seg_id:
                l.append((seg_id, strand))
            seg_id = ""
            strand = "+" if c == ">" else "-"
        else:
            seg_id +=c

    l.append((seg_id, strand))
    return l

def gaf_link_coverage (gaf_fn, min_align_len=100, min_mapq=1, progress=True):
    link_usage_dict = Counter()
    info_counter = Counter()
    
    with open (gaf_fn) as gaf_fp:
        for line in tqdm(gaf_fp, disable=not progress):
            line = line.strip().split("\t")
            seg_list = split_path(line[5])
            align_len = int(line[10])
            mapq = int(line[11])
            align_type = line[12].split(":")[-1]
            
            if align_len < min_align_len:
                info_counter["short length alignments"]+=1
            elif mapq < min_mapq:
                info_counter["low mapq alignments"]+=1
            elif align_type != "P":
                info_counter["non primary alignments"]+=1
            elif len(seg_list) == 1:
                info_counter["single segment path"]+=1
            else:
                info_counter["multi segment path"]+=1
                prev_seg = seg_list[0]
                for seg in seg_list[1:]:
                    info_counter["valid links"]+=1
                    link_id = sort_seg(s1=prev_seg[0], s1_orient=prev_seg[1], s2=seg[0], s2_orient=seg[1])
                    link_usage_dict[link_id]+=1
                    prev_seg=seg
    
    return link_usage_dict, info_counter

def link_usage_stats (gaf_src, graph_link_info_fn, outdir, min_align_len=100, min_mapq=1):
    data_col = ["4-1","4-2","7-1","7-2","11-1","69-1","79-2","80-1","117-2","131-1","134-1","134-2"]
    
    all_link = OrderedDict()
    all_info = OrderedDict()
    
    stdout_print("Parsing gaf files\n")
    for gaf_fn in glob.glob(gaf_src):
        sample_id = gaf_fn.split("/")[-1].split("_")[0]
        stdout_print(f"\tParsing data for sample {sample_id}\n")
        link_dict, info_counter = gaf_link_coverage (gaf_fn=gaf_fn, min_align_len=min_align_len, min_mapq=min_mapq, progress=False)
        all_link[sample_id] = link_dict
        all_info[sample_id] = info_counter
        
    info_df = pd.DataFrame(all_info)
    info_df = info_df.reindex(columns=data_col)
    display(info_df)

    stdout_print("Cast link usage data to DataFrame\n")
    link_data_df = pd.DataFrame(all_link)
    link_data_df = link_data_df.reindex(columns=data_col)
    
    stdout_print("Loading graph link info file\n")
    link_info_df = pd.read_pickle(graph_link_info_fn)
    #link_info_df = link_info_df[["connect_type","connect_gap","SV_type"]]
    
    stdout_print("Merge data and info\n")
    link_df = pd.merge(left=link_info_df, right=link_data_df, left_index=True, right_index=True, how="left")
    link_df[data_col] = link_df[data_col].fillna(0).astype(np.int64)
    display(link_df.head())
    out_fn = os.path.join(outdir, "link_cov.tsv")
    link_df.to_csv(out_fn, sep="\t")
    
    stdout_print("Normalise by HDRR continuous links\n")
    canonical_link_df = link_df[(link_df["connect_type"]=="intra_HDRR") & (link_df["SV_type"]=="continuous")]
    norm_link_df = link_df.copy()
    for col in data_col:
        norm_link_df[col] = norm_link_df[col]/canonical_link_df[col].median()
    display(norm_link_df.head())
    out_fn = os.path.join(outdir, "norm_link_cov.tsv")
    norm_link_df.to_csv(out_fn, sep="\t")

Assemblies

In [91]:
outdir = "link_usage/assemblies/"
shutil.rmtree(outdir, ignore_errors=True)
mkdir (outdir)

link_usage_stats(
    gaf_src="./alignments/assemblies/*_unstable.gaf",
    graph_link_info_fn="./graph_assembly/all_ref_graph_link_info.pkl",
    outdir=outdir,
    min_align_len=100,
    min_mapq=1)
Creating /hps/research1/birney/users/adrien/analyses/medaka_assembly_graph/link_usage/assemblies
Parsing gaf files
	Parsing data for sample 11-1
	Parsing data for sample 7-1
	Parsing data for sample 80-1
	Parsing data for sample 134-1
	Parsing data for sample 79-2
	Parsing data for sample 134-2
	Parsing data for sample 4-1
	Parsing data for sample 7-2
	Parsing data for sample 4-2
	Parsing data for sample 117-2
	Parsing data for sample 131-1
	Parsing data for sample 69-1
4-1 4-2 7-1 7-2 11-1 69-1 79-2 80-1 117-2 131-1 134-1 134-2
multi segment path 2729 2474 2636 2620 3028 2714 2815 3934 2667 3059 3027 3886
valid links 609468 612977 611008 612742 599242 612712 606858 588209 614870 606116 607994 609453
single segment path 1883 1950 2028 1934 1709 1667 1791 1478 2091 2231 1963 2085
low mapq alignments 49 55 48 46 30 30 43 27 58 47 41 50
Cast link usage data to DataFrame
Loading graph link info file
Merge data and info
connect_type connect_gap SV_type 4-1 4-2 7-1 7-2 11-1 69-1 79-2 80-1 117-2 131-1 134-1 134-2
link_id
s1+_s2+ intra_HDRR 0.0 continuous 0 1 1 1 1 1 1 1 1 1 1 1
s2+_s3+ intra_HDRR 0.0 continuous 0 0 0 1 0 1 1 0 0 0 0 0
s3+_s4+ intra_HDRR 0.0 continuous 0 0 0 0 0 1 0 0 0 0 0 0
s4+_s5+ intra_HDRR 0.0 continuous 0 1 1 1 1 1 1 1 1 1 1 1
s4-_s999575+ HDRR_MIKK NaN insertion 0 1 1 1 1 0 1 1 1 1 1 1
Normalise by HDRR continuous links
connect_type connect_gap SV_type 4-1 4-2 7-1 7-2 11-1 69-1 79-2 80-1 117-2 131-1 134-1 134-2
link_id
s1+_s2+ intra_HDRR 0.0 continuous 0.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0
s2+_s3+ intra_HDRR 0.0 continuous 0.0 0.0 0.0 1.0 0.0 1.0 1.0 0.0 0.0 0.0 0.0 0.0
s3+_s4+ intra_HDRR 0.0 continuous 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0
s4+_s5+ intra_HDRR 0.0 continuous 0.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0
s4-_s999575+ HDRR_MIKK NaN insertion 0.0 1.0 1.0 1.0 1.0 0.0 1.0 1.0 1.0 1.0 1.0 1.0

Raw reads

In [103]:
outdir = "link_usage/raw_reads/"
shutil.rmtree(outdir, ignore_errors=True)
mkdir (outdir)

link_usage_stats(
    gaf_src="./alignments/raw_reads/*_unstable.gaf",
    graph_link_info_fn="./graph_assembly/all_ref_graph_link_info.pkl",
    outdir=outdir,
    min_align_len=100,
    min_mapq=1)
Creating /hps/research1/birney/users/adrien/analyses/medaka_assembly_graph/link_usage/raw_reads
Parsing gaf files
	Parsing data for sample 80-1
	Parsing data for sample 134-2
	Parsing data for sample 131-1
	Parsing data for sample 7-2
	Parsing data for sample 7-1
	Parsing data for sample 4-1
	Parsing data for sample 69-1
	Parsing data for sample 79-2
	Parsing data for sample 134-1
	Parsing data for sample 117-2
	Parsing data for sample 11-1
	Parsing data for sample 4-2
4-1 4-2 7-1 7-2 11-1 69-1 79-2 80-1 117-2 131-1 134-1 134-2
multi segment path 2428416 2828853 2315352 2650564 1791070 2241619 2474125 1713927 2887799 3568223 2326952 3949014
valid links 13741360 16651984 13616743 14699040 10325444 12430098 13819171 9316208 16865508 19837086 13566908 21643190
single segment path 1736458 2076573 1643457 1958093 1234231 1592651 1798324 1203364 2027456 2526375 1642742 2828320
low mapq alignments 119741 139776 111761 127989 83982 107222 122570 83469 143142 171111 114040 194759
short length alignments 49020 56746 47634 51098 33323 39933 44784 29408 57506 70168 45235 81989
Cast link usage data to DataFrame
Loading graph link info file
Merge data and info
s1_chromosome s1_pos s2_chromosome s2_pos s2_type connect_type connect_gap SV_type 4-1 4-2 7-1 7-2 11-1 69-1 79-2 80-1 117-2 131-1 134-1 134-2
link_id
s1+_s2+ HDRR_1 37816 HDRR_1 37816 HDRR intra_HDRR 0.0 continuous 15 25 16 22 8 13 12 6 14 23 12 40
s2+_s3+ HDRR_1 37928 HDRR_1 37928 HDRR intra_HDRR 0.0 continuous 0 1 1 1 0 1 0 0 1 2 0 5
s3+_s4+ HDRR_1 40172 HDRR_1 40172 HDRR intra_HDRR 0.0 continuous 0 0 0 0 0 0 0 0 0 0 0 0
s4+_s5+ HDRR_1 41595 HDRR_1 41595 HDRR intra_HDRR 0.0 continuous 27 20 18 17 11 17 20 10 35 26 25 39
s4-_s999575+ HDRR_1 40172 MIKK_117-2_C4_959 161190 MIKK HDRR_MIKK NaN insertion 22 21 24 18 11 14 16 12 27 25 22 43
Normalise by HDRR continuous links
s1_chromosome s1_pos s2_chromosome s2_pos s2_type connect_type connect_gap SV_type 4-1 4-2 7-1 7-2 11-1 69-1 79-2 80-1 117-2 131-1 134-1 134-2
link_id
s1+_s2+ HDRR_1 37816 HDRR_1 37816 HDRR intra_HDRR 0.0 continuous 0.789474 1.086957 0.888889 1.10 0.571429 0.764706 0.631579 0.500000 0.608696 0.884615 0.666667 1.333333
s2+_s3+ HDRR_1 37928 HDRR_1 37928 HDRR intra_HDRR 0.0 continuous 0.000000 0.043478 0.055556 0.05 0.000000 0.058824 0.000000 0.000000 0.043478 0.076923 0.000000 0.166667
s3+_s4+ HDRR_1 40172 HDRR_1 40172 HDRR intra_HDRR 0.0 continuous 0.000000 0.000000 0.000000 0.00 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
s4+_s5+ HDRR_1 41595 HDRR_1 41595 HDRR intra_HDRR 0.0 continuous 1.421053 0.869565 1.000000 0.85 0.785714 1.000000 1.052632 0.833333 1.521739 1.000000 1.388889 1.300000
s4-_s999575+ HDRR_1 40172 MIKK_117-2_C4_959 161190 MIKK HDRR_MIKK NaN insertion 1.157895 0.913043 1.333333 0.90 0.785714 0.823529 0.842105 1.000000 1.173913 0.961538 1.222222 1.433333