from datetime import date
from pycltools.pycltools import jprint
jprint('Adrien Leger / EMBL EBI', bold=True, size=150)
jprint('Starting date : 2021_02_17', bold=True, italic=True, size=125)
jprint('Last modification date : {}_{:02}_{:02}'.format(date.today().year, date.today().month, date.today().day), bold=True, italic=True, size=125)
# Standard lib imports
import os
from os import listdir as ls
from datetime import date
from collections import *
from glob import glob, iglob
import shutil
from shutil import rmtree
import itertools
from pprint import pprint as pp
import string
# Generic third party imports
from pycltools.pycltools import *
from tqdm import tqdm
import pysam
import pyfaidx
# Plotting lib imports
import matplotlib.pyplot as pl
import seaborn as sns
from scipy.ndimage import gaussian_filter1d
%matplotlib inline
# Data wrangling lib imports
import pandas as pd
import numpy as np
import scipy as sp
import xarray as xr
#import xarray_extras as xre
# xr.set_options(display_style="html", display_width=500)
pd.options.display.max_colwidth = 200
pd.options.display.max_columns = 20
pd.options.display.max_rows = 200
# Limix
import limix
import pandas_plink
outdir = "tx_expression"
rmtree(outdir, ignore_errors=True)
mkdir(outdir)
import scipy.stats as stats
def samples_select (df, samples_id_dict):
    """
    Rename sequencing sample ids to MIKK line ids and keep only the samples selected upstream (liver, protocol "high")
    """
    col_dict = {}
    for sample_id, col_val in df.items():
        # Strip the trailing "_counts" suffix from the sample id
        if sample_id.endswith("_counts"):
            sample_id = sample_id[:-len("_counts")]
        if sample_id in samples_id_dict:
            line_id = samples_id_dict[sample_id]
            col_dict[line_id] = col_val
    # Cast to df and sort columns alphabetically
    df = pd.DataFrame.from_dict(col_dict)
    df = df[df.columns.sort_values()]
    df.index.name="transcripts"
    return df
def filter_low_expression (counts_df, tpm_df, sample_frac_threshold=0.2, count_threshold=6, tpm_threshold=1):
    """
    Keep transcripts passing both expression rules:
    TPM >= tpm_threshold in at least sample_frac_threshold of samples
    read counts >= count_threshold in at least sample_frac_threshold of samples
    """
n_samples = int(len(tpm_df.columns)*sample_frac_threshold)
# Expression thresholds
mask = ((np.sum(tpm_df>=tpm_threshold,axis=1)>=n_samples) & (np.sum(counts_df>=count_threshold,axis=1)>=n_samples)).values
return tpm_df[mask]
print("Load data")
samples_id_df = pd.read_csv("../RNA_pipeline2/all_sample_sheet.tsv", index_col=0, sep="\t")
samples_id_df = samples_id_df[(samples_id_df.protocol=="high")&((samples_id_df.organ=="liver"))]
samples_id_dict = samples_id_df["line"]
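# samples_id_dict is a pandas Series mapping sequencing sample ids to MIKK line ids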
tpm_df = pd.read_csv("../RNA_pipeline2/results/counts/salmon_count_merge/tpm.tsv", index_col=0, sep="\t").fillna(0)
counts_df = pd.read_csv("../RNA_pipeline2/results/counts/salmon_count_merge/counts.tsv", index_col=0, sep="\t").fillna(0)
print(f"\tInitial number of samples: {len(tpm_df.columns)}")
print(f"\tInitial number of transcripts: {len(tpm_df.index)}")
print("Switch ids to line ids and select samples")
tpm_df = samples_select(tpm_df, samples_id_dict)
counts_df = samples_select(counts_df, samples_id_dict)
print(f"\tSamples remaining after sample selection: {len(tpm_df.columns)}")
print("Filter low expression transcripts")
tpm_df = filter_low_expression (counts_df, tpm_df, sample_frac_threshold=0.5, count_threshold=5, tpm_threshold=1)
print(f"\tTranscripts remaining after expression filtering: {len(tpm_df)}")
# VISUALIZATION
tpm_df2 = tpm_df.copy()
tpm_df2.clip(1, inplace=True)
tpm_df2.fillna(1, inplace=True)
# Define a color per MIKK line with sibling replicates (all remaining samples are liver)
c = Counter()
for col in tpm_df2.columns:
c[col.rpartition("_")[0]]+=1
sib_lines = [i for i,j in c.items() if j>1]
pal = sns.mpl_palette("Set1", len(sib_lines))
color_dict={i:j for i,j in zip(sib_lines, pal)}
color_map = [color_dict.get(line.rpartition("_")[0], "grey") for line in tpm_df2.columns]
print("Apply log 2 transformation")
log_tpm_df = np.log2(tpm_df2.fillna(1))
with pl.style.context("ggplot"):
g = sns.clustermap(log_tpm_df, figsize=(30, 10), row_cluster=True, cmap="magma_r", yticklabels=False, dendrogram_ratio=0.2, robust=True, col_colors=color_map, cbar_pos=(1.01,0.2,0.02,0.5))
g.ax_heatmap.set_ylabel("")
g.ax_heatmap.tick_params(axis='x', labelsize=20)
g.ax_row_dendrogram.remove()
g.ax_cbar.set_ylabel("Log2 (TPM)", fontsize=20, labelpad=10)
g.ax_cbar.tick_params(axis='y', left=True, right=False, labelleft=True, labelright=False, labelsize=15)
g.fig.suptitle("Clustered heatmap of normalised transcript expression", fontsize=25, y=1.05, x=0.6)
g.fig.savefig("tx_expression/heatmap.png", bbox_inches='tight')
print("Add transcript information")
tx_info_df = pd.read_csv("../RNA_pipeline2/results/input/transcriptome/transcriptome.tsv",sep="\t", index_col=0, low_memory=False)
tx_info_df = tx_info_df[["chrom_id","start","end"]]
tpm_info_df = pd.merge(tpm_df, tx_info_df, left_index=True, right_index=True, how="left")
tpm_info_df = tpm_info_df.sort_values(["chrom_id","start","end"])
tpm_info_df = tpm_info_df.set_index(["chrom_id","start","end", tpm_info_df.index])
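# The index is now (chrom_id, start, end, transcript_id); the eQTL scan below unpacks these four levels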
print("Save filtered TPM")
tpm_info_df.to_csv("tx_expression/filtered_tpm.tsv", sep="\t")
display(tpm_info_df.head())
print("Apply quantile normalization")
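# Quantile gaussianization: expression values are mapped onto a standard normal distribution (rank-based inverse normal transform) before LMM fitting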
tpm_info_df = limix.qc.quantile_gaussianize(tpm_info_df, axis=0)
print("Save normalised TPM")
tpm_info_df.to_csv("tx_expression/filtered_norm_tpm.tsv", sep="\t")
display(tpm_info_df.head())
n_genes = 10
df =pd.read_csv("tx_expression/filtered_tpm.tsv", sep="\t", index_col=[0,1,2,3])
df = df.head(n_genes).T.melt()
with pl.style.context("ggplot"):
fig, ax = pl.subplots(figsize=(20,n_genes))
ax.set_title("Filtered transcripts TPM")
ax.set_xscale("log")
sns.swarmplot(y="transcripts", x="value", data=df, ax=ax)
df =pd.read_csv("tx_expression/filtered_norm_tpm.tsv", sep="\t", index_col=[0,1,2,3])
df = df.head(n_genes).T.melt()
with pl.style.context("ggplot"):
fig, ax = pl.subplots(figsize=(20,n_genes))
ax.set_title("Filtered and quantile normalized transcripts TPM")
sns.swarmplot(y="transcripts", x="value", data=df, ax=ax)
src_vcf = "/nfs/research1/birney/projects/indigene/datafreeze/14-03-2019/vcf/medaka_inbred_panel_ensembl_new_reference_release_94.vcf.gz"
head(src_vcf, ignore_comment_line=True, comment_char="##")
src_vcf = "/nfs/research1/birney/projects/indigene/datafreeze/14-03-2019/vcf/medaka_inbred_panel_ensembl_new_reference_release_94.vcf.gz"
raw_outdir = "plink_raw"
rmtree(raw_outdir, ignore_errors=True)
mkdir(raw_outdir)
mem = 10000
threads = 20
queue = "research-rh74"
dry=False
stderr_fp=f"{raw_outdir}/bsub_stderr.txt"
stdout_fp=f"{raw_outdir}/bsub_stdout.txt"
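# PLINK flags: --snps-only keeps SNPs (drops indels), --chr-set 24 no-xy declares 24 medaka chromosomes and disables the XY code, --make-bed writes binary bed/bim/fam files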
jobid = bsub (
cmd = f"plink --vcf {src_vcf} --make-bed --snps-only --chr-set 24 no-xy --out {raw_outdir}/medaka --memory {mem} --threads {threads}",
conda="plink", threads=threads, mem=mem, queue=queue, dry=dry, stderr_fp=stderr_fp, stdout_fp=stdout_fp)
fixed_outdir = "plink_metadata_fixed"
rmtree(fixed_outdir, ignore_errors=True)
mkdir(fixed_outdir)
print("Redefine sex, FAM ID and IID in FAM file")
df = pd.read_csv(f"{raw_outdir}/medaka.fam", sep=" ", names=["FID", "IID", "F_IID", "M_IID", "Sex", "Phenotype"])
sample_id_df = pd.read_csv("internal_id.tsv", sep="\t", index_col=0)
for i, line in df.iterrows():
internal_id = f"{line.FID}_{line.IID}"
sample_id = sample_id_df.loc[internal_id]
df.loc[i, "FID"] = sample_id["FID"]
df.loc[i, "IID"] = sample_id["IID"]
df.loc[i, "Sex"] = 1
df.to_csv(f"{fixed_outdir}/medaka.fam", sep=" ", header=False, index=False)
display(df.head())
print("Redefine snp_id in BIM file")
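# Variant ids are replaced with sequential rs-style ids (rs_0, rs_1, ...) so every SNP has a unique identifier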
df = pd.read_csv(f"{raw_outdir}/medaka.bim", sep="\t", names=["CHR", "VID", "POS_M", "POS_B", "AL1", "AL2"])
df["VID"] = [f"rs_{i}" for i in range(len(df))]
df.to_csv(f"{fixed_outdir}/medaka.bim", sep="\t", header=False, index=False)
display(df.head())
print("Copy bed file")
shutil.copy2(f"{raw_outdir}/medaka.bed", fixed_outdir)
panel_outdir = "plink_panel_only"
rmtree(panel_outdir, ignore_errors=True)
mkdir(panel_outdir)
print ("Load data")
df = pd.read_csv("/nfs/research1/birney/projects/indigene/raw_data/sanger_pipeline/G2035/G2035_metadata_03-14-19.txt", sep="\t")
tpm_line_ids = list(pd.read_csv("./tx_expression/filtered_tpm.tsv", sep="\t", index_col=0).columns)
internal_id_dict = {}
line_id_counter = Counter()
print("Get correspondence between internal ids and line ids")
for idx, line in df.iterrows():
line_id = line["supplier_name"].replace(" ", "_").replace("--", "_").replace("-", "_")
fam_id = line_id.split("_")[0]
if fam_id.isdigit():
fam_id="MIKK"
line_id_counter[line_id]+=1
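    # Supplier names seen more than once get a .B/.C... suffix so sibling replicates of the same line stay distinguishable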
if line_id_counter[line_id]>1:
sample_rep = string.ascii_uppercase[line_id_counter[line_id]-1]
line_id = f"{line_id}.{sample_rep}"
rna_seq = line_id in tpm_line_ids
internal_id = line["cram_file_names"].split(".")[0]
internal_id_dict[internal_id] = (fam_id, line_id, rna_seq)
df = pd.DataFrame.from_dict(internal_id_dict, orient="index", columns=["FID", "IID", "RNAseq"])
df = df.sort_values(["FID", "IID"])
display(df)
df.to_csv(f"{panel_outdir}/internal_id.tsv", sep="\t")
# Extract IDs corresponding to panel members with data in the transcriptome dataset
sample_id_df = pd.read_csv(f"{panel_outdir}/internal_id.tsv", sep="\t", index_col=0)
sample_id_df = sample_id_df[sample_id_df["RNAseq"]==True]
id_file = f"{panel_outdir}/fam_id.txt"
with open (id_file, "w") as fp:
for i, line in sample_id_df.iterrows():
fp.write(f"{line.FID}\t{line.IID}\n")
print("Select MIKK panel samples only")
mem = 10000
threads = 20
queue = "research-rh74"
dry=False
stderr_fp=f"{panel_outdir}/bsub_stderr.txt"
stdout_fp=f"{panel_outdir}/bsub_stdout.txt"
jobid = bsub (
cmd = f"plink --keep {id_file} --make-bed --chr-set 24 no-xy --bfile {fixed_outdir}/medaka --out {panel_outdir}/medaka --memory {mem} --threads {threads}",
conda="plink", threads=threads, mem=mem, queue=queue, dry=dry, stderr_fp=stderr_fp, stdout_fp=stdout_fp)
panel_outdir = "plink_panel_only"
pruned_outdir = "plink_pruned"
rmtree(pruned_outdir, ignore_errors=True)
mkdir(pruned_outdir)
print("Variant filtering and LD pruning")
mem = 10000
threads = 25
queue = "research-rh74"
dry=False
stderr_fp=f"{pruned_outdir}/bsub_stderr_prune.txt"
stdout_fp=f"{pruned_outdir}/bsub_stdout_prune.txt"
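# --geno 0 removes variants with any missing genotype, --maf 0.001 drops near-monomorphic variants, --indep 50 5 5 performs LD pruning (50-SNP window, 5-SNP step, VIF threshold 5)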
jobid1 = bsub (
cmd=f"plink --geno 0 --maf 0.001 --indep 50 5 5 --chr-set 24 no-xy --bfile {panel_outdir}/medaka --out {pruned_outdir}/medaka --memory {mem} --threads {threads}",
conda="plink", threads=threads, mem=mem, queue=queue, dry=dry, stderr_fp=stderr_fp, stdout_fp=stdout_fp)
stderr_fp=f"{pruned_outdir}/bsub_stderr_extract.txt"
stdout_fp=f"{pruned_outdir}/bsub_stdout_extract.txt"
jobid2 = bsub (
cmd=f"plink --extract {pruned_outdir}/medaka.prune.in --chr-set 24 no-xy --make-bed --bfile {panel_outdir}/medaka --out {pruned_outdir}/medaka --memory {mem} --threads {threads}",
conda="plink", threads=threads, mem=mem, queue=queue, dry=dry, stderr_fp=stderr_fp, stdout_fp=stdout_fp, wait_jobid=jobid1)
bjobs_update(cmd="plink*")
pruned_outdir = "plink_pruned"
l = []
with open (f"{pruned_outdir}/bsub_stdout_prune.txt") as fp:
for line in fp:
if line.startswith("Pruned"):
            fields = supersplit(line, [" ", ",", "."])
            chrom=f"chr{fields[5]}"
            init_var = int(fields[1])
            retained_var = int(fields[8])
discarded_var = init_var-retained_var
l.append((chrom, discarded_var, retained_var))
df = pd.DataFrame(l, columns=["Chromosomes", "Discarded Variants", "Retained Variants"])
df = df.set_index("Chromosomes")
with pl.style.context("ggplot"):
fig, ax = pl.subplots(figsize=(20,7))
df.plot.bar(stacked=True, ax=ax, rot=45)
_ = ax.legend(bbox_to_anchor=(1, 1), loc=2, facecolor="white", frameon=False)
pl.show()
df.loc["all"] = df.sum()
df["% retained"] = (df["Retained Variants"]/(df["Retained Variants"]+df["Discarded Variants"])*100).round(2)
display(df.T)
outdir = "limix"
rmtree(outdir, ignore_errors=True)
mkdir(outdir)
genotypes = pandas_plink.read_plink1_bin (
bed=f"plink_pruned/medaka.bed",
bim=f"plink_pruned/medaka.bim",
fam=f"plink_pruned/medaka.fam",
verbose=True)
genotypes = genotypes.sortby("sample")
print (f"Variants imported {len(genotypes.variant)}")
display(genotypes)
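# Estimate the kinship (genetic relatedness) matrix from the LD-pruned genotypes; limix uses it as the random-effect covariance in the linear mixed model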
kinship = limix.stats.linear_kinship(genotypes, verbose=True)
kinship = pd.DataFrame(kinship, index=genotypes.sample.values, columns=genotypes.sample.values)
kinship.to_csv("./limix/kinship.tsv", sep="\t")
k = pd.read_csv("./limix/kinship.tsv", sep="\t", index_col=0)
cmin, cmax = np.percentile(k.values, [2,98])
k = k.clip(cmin, cmax)
k = (k-cmin)/(cmax-cmin)
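# Clipping to the 2nd-98th percentile and rescaling to [0, 1] is for visualisation only; the raw kinship matrix is what limix uses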
line_count = Counter()
for line in k.index:
line = line.rpartition("_")[0]
line_count[line]+=1
sib_lines = [i for i,j in line_count.items() if j>=2]
pal = sns.color_palette("Set1", n_colors=len(sib_lines))
col_dict = {i:j for i, j in zip(sib_lines, pal)}
col_cat = []
for line in k.index:
line = line.rpartition("_")[0]
col_cat.append(col_dict.get(line, "grey"))
with pl.style.context("ggplot"):
g = sns.clustermap(k, cmap="magma_r", figsize=(20,20), row_colors=col_cat, col_colors=col_cat, dendrogram_ratio=0.15, robust=True, cbar_pos=(1.03,0.2,0.03,0.5))
g.ax_heatmap.set_xlabel(None)
g.ax_heatmap.set_ylabel(None)
g.ax_cbar.set_ylabel("genetic relatedness", fontsize=20, labelpad=10)
g.ax_cbar.tick_params(axis='y', left=True, right=False, labelleft=True, labelright=False, labelsize=15)
g.ax_heatmap.tick_params(labelsize=15)
g.fig.suptitle("Kinship matrix heatmap", fontsize=25, y=1.05, x=0.6)
g.fig.savefig("limix/kinship_heatmap.png", bbox_inches='tight')
from statsmodels.stats.multitest import multipletests
import csv
import warnings
warnings.simplefilter('ignore')
stdout_print("Import data\n")
K = pd.read_csv("./limix/kinship.tsv", sep="\t", index_col=0)
Y = pd.read_csv(f"tx_expression/filtered_norm_tpm.tsv", sep="\t", index_col=[0,1,2,3])
G = pandas_plink.read_plink1_bin (bed=f"plink_pruned/medaka.bed", bim=f"plink_pruned/medaka.bim", fam=f"plink_pruned/medaka.fam", verbose=False)
G = G.sortby("sample")
stdout_print("Run eQTL analysis\n")
extend = 500000
min_pval = np.nextafter(np.float64(0), np.float64(1))
max_pval = 1.0
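# cis-eQTL scan: for each transcript, test every SNP within +/- 500 kb of the transcript boundaries; min_pval/max_pval are used to clip degenerate p-values (0 or NaN)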
with open("./limix/eQTL_pval_500k.tsv", "w") as fp:
writer = csv.DictWriter(fp, fieldnames=["chrom","transcript_id", "pos", "snp_id","pval"], delimiter='\t')
writer.writeheader()
for (chrom, start, end, tx), y in tqdm(Y.iterrows(), total=len(Y), ncols=200, mininterval=10):
if chrom == "MT":
continue
try:
g = G.where(((G.chrom == chrom) & (G.pos>=start-extend) & (G.pos<=end+extend)), drop=True)
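            # Fit a linear mixed model per transcript: expression ~ SNP genotypes, with the kinship matrix K as random effect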
qtl = limix.qtl.scan(G=g, Y=y, K=K, verbose=False)
# Get pValue and correct invalid value
pval_list = qtl.stats.pv20.values
pval_list = np.nan_to_num(pval_list, nan=1.0)
pval_list = np.clip(pval_list, min_pval, max_pval)
for pos, snp_id, pval in zip(g.pos.values, g.snp.values, pval_list):
writer.writerow({"chrom":chrom,"transcript_id":tx,"pos":pos,"snp_id":snp_id,"pval":pval})
except Exception as E:
print (f"Exception for transcript {tx} ({chrom}:{start}-{end})")
print (E)
continue
# Adjust p-values for multiple testing (Benjamini-Hochberg FDR) and save the adjusted table
print ("Load eQTL results")
df = pd.read_csv("./limix/eQTL_pval_500k.tsv", sep="\t", dtype={"chrom":str})
pval_adj_list = multipletests(df["pval"], alpha=0.1, method='fdr_bh')[1]
pval_adj_list = np.nan_to_num(pval_adj_list, nan=1.0)
pval_adj_list = np.clip(pval_adj_list, min_pval, max_pval)
df["pval"] = pval_adj_list
print ("Write results")
df.to_csv("./limix/eQTL_pval_adj_500k.tsv", index=False, sep="\t")
print ("Load transcripts info")
tx_info_df = pd.read_csv("../pycoSnake_pipeline/results/input/transcriptome/transcriptome.tsv",sep="\t", dtype={"chrom_id":str})
tx_info_df = tx_info_df[["transcript_id", "gene", "start","end","strand","length", "gene_biotype", "transcript_biotype"]]
tx_info_df = tx_info_df.rename(columns={"gene":"gene_id", "start":"tx_start","end":"tx_end","strand":"tx_strand","length":"tx_length"})
for lab, fn, pval_threshold in [("pval", "./limix/eQTL_pval_500k.tsv", 1e-2), ("pval_adj", "./limix/eQTL_pval_adj_500k.tsv", 1e-2)]:
print (f"Load eQTL results for {lab}")
df = pd.read_csv(fn, sep="\t", dtype={"chrom":str})
df = df[["chrom", "transcript_id", "pos", "snp_id", "pval"]]
sig_df = df[df["pval"]<=pval_threshold]
sig_df = pd.merge(sig_df, tx_info_df, on="transcript_id", how="left")
n_hits = len(sig_df)
n_tx = sig_df["transcript_id"].nunique()
n_genes = sig_df["gene_id"].nunique()
print(f"num hits:{n_hits} / num transcripts:{n_tx} / num genes:{n_genes}")
    print ("Save results to file")
display (sig_df.head())
sig_df.to_csv(f"./limix/eQTL_sig_{lab}_500k.tsv", index=False, sep="\t")
outdir = "results_plots"
rmtree(outdir, ignore_errors=True)
mkdir(outdir)
def plot_copenhagen (data_fn, fasta_fn, tx_info_fn, out_fn, pval_threshold=1e-3, pval_min=1e-12, pval_type="pval"):
print ("Load eQTL results")
df = pd.read_csv(data_fn, sep="\t")
# Fix invalid pvalues
df[pval_type] = df[pval_type].fillna(1.0)
df[pval_type] = df[pval_type].clip(np.nextafter(np.float64(0), np.float64(1)), 1.0)
print ("Load chromosome length")
chr_len = OrderedDict()
with pyfaidx.Fasta(fasta_fn) as fa:
for c in fa:
chr_len[c.name]=len(c)
longest = max(list(chr_len.values()))
filt_chr_len = OrderedDict()
for cid, clen in chr_len.items():
if clen >= longest/10:
filt_chr_len[cid] = clen
print ("Load transcripts info")
tx_info_df = pd.read_csv(tx_info_fn, sep="\t", index_col=0, dtype={"chrom_id":str})
tx_info_df = tx_info_df[["chrom_id","start","end"]]
print ("Compute variables")
# Define custom palette
pal = sns.color_palette(desat=0.8)
pal[7] = (0.25, 0.25, 0.25)
pal = itertools.cycle(pal)
figsize = (20, 80)
frac_plot_area = 0.5
ml_pval_threshold = -np.log10(pval_threshold)
ml_pval_min = -np.log10(pval_min)
n_chr = len(filt_chr_len)
total_h = 1/n_chr
plot_h = total_h*frac_plot_area
print ("Generate plots")
with pl.style.context("ggplot"):
fig = pl.figure(figsize=figsize)
b = 1-total_h
# Define chromosomes
chr_ax = OrderedDict()
for cid, clen in filt_chr_len.items():
ax = pl.axes([0, b, clen/longest, plot_h])
ax.set_xlim(0, clen)
ax.set_xlabel("Genomic positions")
ax.set_ylim(0, ml_pval_min+1)
ax.set_ylabel("p-values")
ax.set_title(f"chr {cid}", y=1.2, loc='left')
ax.axhline(ml_pval_threshold, color="0.25", ls=":")
chr_ax[cid] = ax
b-=total_h
for chrom, chr_df in df.groupby("chrom"):
print(f"Processing chromosome {chrom}")
ax = chr_ax.get(str(chrom), None)
if ax is not None:
for tx, tx_df in chr_df.groupby("transcript_id"):
pvals = pd.Series(index=tx_df["pos"], data=-np.log10(tx_df[pval_type].values))
if pvals.max() >= ml_pval_threshold:
color = next(pal)
# Plot tx position
tx_info = tx_info_df.loc[tx]
ax.axvspan(tx_info.start, tx_info.end, color=color, alpha=0.5)
ax.text(tx_info.start, ml_pval_min+1.1, tx, rotation=45, fontsize=8, color=color)
# Plot pvalues
p = pvals[pvals>=ml_pval_min]
if not p.empty:
ax.scatter(p.index, ml_pval_min, s=20, marker="^", c=[color], zorder=1)
p = pvals[(pvals>=ml_pval_threshold)&(pvals<ml_pval_min)]
if not p.empty:
ax.scatter(p.index, p.values, s=20, marker=".", c=[color], zorder=1)
p = pvals[pvals<ml_pval_threshold]
if not p.empty:
ax.scatter(p.index, p.values, s=10, marker=".", c=[color], alpha=0.5, zorder=1)
else:
ax.scatter(pvals.index, pvals.values, s=10, marker=".", c="0.6", alpha=0.5, zorder=0)
pl.show()
fig.savefig(out_fn, bbox_inches='tight')
plot_copenhagen (
data_fn = "./limix/eQTL_pval_500k.tsv",
fasta_fn = "../pycoSnake_pipeline/results/input/genone/genome.fa",
tx_info_fn = "../pycoSnake_pipeline/results/input/transcriptome/transcriptome.tsv",
out_fn = "results_plots/copenhagen_pval.png",
pval_threshold=1e-6, pval_min=1e-16, pval_type="pval")
plot_copenhagen (
data_fn = "./limix/eQTL_pval_adj_500k.tsv",
fasta_fn = "../pycoSnake_pipeline/results/input/genone/genome.fa",
tx_info_fn = "../pycoSnake_pipeline/results/input/transcriptome/transcriptome.tsv",
out_fn = "results_plots/copenhagen_pval_adj.png",
pval_threshold=1e-2, pval_min=1e-10, pval_type="pval")
def plot_manhattan (data_fn, fasta_fn, out_fn, n_best=3, max_pval=15, sig_pval=6, pval_type="pval"):
print ("Load chromosome length")
chr_len_df = pd.DataFrame(columns=["start", "end"])
start = 0
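    # Build cumulative start/end coordinates so all chromosomes can be laid out side by side on a single x axis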
with pyfaidx.Fasta(fasta_fn) as fa:
for c in fa:
if c.name != "MT":
chr_len_df.loc[c.name]=[start,start+len(c)-1]
start+=len(c)
print ("Load eQTL results")
df = pd.read_csv(data_fn, sep="\t")
df[pval_type] = df[pval_type].fillna(1.0)
df[pval_type] = df[pval_type].clip(np.nextafter(np.float64(0), np.float64(1)), 1.0)
print ("Select best pvalues")
best_pval_res = []
for tx, tx_df in df.groupby("transcript_id"):
sorted_pval = tx_df.sort_values(pval_type)
best_pval = sorted_pval.iloc[0:n_best]
best_pval_res.append(best_pval)
best_pval_df = pd.concat(best_pval_res)
best_pval_df = best_pval_df.sort_values(["chrom", "pos"])
print ("Plot data")
    # Define palettes for significant and non-significant hits
pal = itertools.cycle(sns.color_palette("muted", desat=0.8))
chr_col = {chr_id:next(pal) for chr_id in chr_len_df.index}
pal_sig = itertools.cycle(sns.color_palette("dark", desat=0.8))
chr_col_sig = {chr_id:next(pal_sig) for chr_id in chr_len_df.index}
xmax = chr_len_df.end.max()
with pl.style.context("ggplot"):
fig = pl.figure(figsize=(40, 10))
# plot data
ax = pl.subplot2grid((10,1), (0,0), rowspan=9)
ax.set_ylim(0,max_pval+0.5)
ax.set_xlim(0,xmax)
ax.set_xticks(list(chr_len_df.start))
ax.tick_params(axis="x", labelbottom=False, bottom=False)
ax.tick_params(axis="y", labelsize=14)
for idx, line in best_pval_df.iterrows():
log_pval = -np.log10(line[pval_type])
x = line.pos+chr_len_df.loc[str(line.chrom), "start"]
y = log_pval if log_pval<max_pval else max_pval
color = chr_col[str(line.chrom)] if log_pval<sig_pval else chr_col_sig[str(line.chrom)]
s = 20 if log_pval<sig_pval else 50
marker = "." if log_pval<max_pval else "^"
ax.scatter(x, y, color=color, marker=marker, s=s)
# Plot sig limit
ax.axhline(sig_pval, color="0.25", ls=":")
#plot chromosomes
ax = pl.subplot2grid((10,1), (9,0), colspan=1)
ax.grid(False)
ax.axis(False)
ax.set_xlim(0,xmax)
for chr_id, line in chr_len_df.iterrows():
ax.axvspan(line.start, line.end, color=chr_col_sig[chr_id])
ax.text(line.start+(line.end-line.start)/2, y=0.5, s=f"chr{chr_id}", horizontalalignment='center', verticalalignment='center', color="white", weight=550, fontsize=14)
pl.show()
fig.savefig(out_fn, bbox_inches='tight')
plot_manhattan (
data_fn="./limix/eQTL_pval_500k.tsv",
fasta_fn="../pycoSnake_pipeline/results/input/genone/genome.fa",
out_fn="results_plots/manhattan_pval.png",
n_best=5, max_pval=18, sig_pval=6, pval_type="pval")
plot_manhattan (
data_fn="./limix/eQTL_pval_adj_500k.tsv",
fasta_fn="../pycoSnake_pipeline/results/input/genone/genome.fa",
out_fn="results_plots/manhattan_pval_adj.png",
n_best=5, max_pval=10, sig_pval=2, pval_type="pval")
def plot_expr_geno (data_df, G, Y, out_fn_prefix, top=8, tx_id=None, chrom=None, start=None, end=None, sort_val="pval", ncols=4, max_pval=1e-6):
    # Filter by p-value, then optionally by region / transcript, then sort and keep the top hits
    data_df = data_df[data_df["pval"]<=max_pval]
if chrom:
data_df = data_df[data_df["chrom"]==chrom]
if start:
data_df = data_df[data_df["pos"]>=start]
if end:
data_df = data_df[data_df["pos"]<=end]
if tx_id:
data_df = data_df[data_df["transcript_id"]==tx_id]
data_df = data_df.sort_values(sort_val)
if top:
data_df = data_df.head(top)
n_snp = len(data_df)
print(f"Found {n_snp} SNPs")
data_df = data_df.reset_index(drop=True)
    # Compute plotting grid
    row_height = 4
    col_width = 8
nrows = n_snp//ncols
if n_snp%ncols:
nrows+=1
all_genotype_df = pd.DataFrame()
#all_expression_df = pd.DataFrame()
with pl.style.context("ggplot"):
        fig, axes = pl.subplots(nrows=nrows, ncols=ncols, figsize=(col_width*ncols, row_height*nrows), sharex=True)
try:
axes = list(itertools.chain.from_iterable(axes))
except TypeError:
pass
snp_ids = []
for idx, line in tqdm(data_df.iterrows()):
snp_id = line.snp_id
snp_chrom = line.chrom
snp_pos = line.pos
snp_tx = line.transcript_id
snp_pval = line.pval
# Select genotypes
g = G.where((G.snp == snp_id), drop=True)
genotypes = []
for i in g.values:
if i[0] == 0:
genotypes.append("AA")
elif i[0] == 1:
genotypes.append("AB")
elif i[0] == 2:
genotypes.append("BB")
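            # Note: assumes no missing calls (variants were filtered with --geno 0); a missing genotype would be silently skipped and desynchronise genotypes and expression below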
# Select phenotype
y = Y.xs(snp_tx, level=3, drop_level=False).iloc[0]
expression = y.values
tx_chrom = y.name[0]
tx_start = y.name[1]
tx_end = y.name[2]
# Get line ids
geno_line_id = list(g.sample.values)
pheno_line_id = list(y.index.values)
assert geno_line_id==pheno_line_id
res_df = pd.DataFrame({"line_id":geno_line_id, "genotype":genotypes, "expression":expression})
sns.stripplot(x="genotype", y="expression", data=res_df, order=["AA", "AB", "BB"], ax=axes[idx])
sns.boxenplot(x="genotype", y="expression", data=res_df, order=["AA", "AB", "BB"], ax=axes[idx], color="0.7", showfliers=False)
axes[idx].set_xlabel("")
axes[idx].set_title ("Transcript {} (chr-{}:{:,}-{:,}) \n SNP {} (chr-{}:{:,}) \n pvalue:{:.3e}".format(
snp_tx, tx_chrom, tx_start, tx_end, snp_id, snp_chrom, snp_pos, snp_pval), fontsize=10)
snp_ids.append(snp_id)
genotype_df = res_df.set_index("line_id", drop=True)
genotype_df = genotype_df.rename(columns={"genotype":snp_id})
genotype_df = genotype_df[[snp_id]]
if all_genotype_df.empty:
all_genotype_df = genotype_df
else:
all_genotype_df = pd.merge(all_genotype_df, genotype_df, left_index=True, right_index=True)
pl.subplots_adjust(hspace=0.5)
pl.show()
fig.savefig(out_fn_prefix+".svg", bbox_inches='tight')
#display(data_df)
data_df.to_csv(out_fn_prefix+"_eqtl.tsv", sep="\t", index=False)
all_genotype_df = all_genotype_df.sort_values(snp_ids)
#display(all_genotype_df)
all_genotype_df.to_csv(out_fn_prefix+"_genotypes.tsv", sep="\t", index=True)
print("Load phenotypes")
Y_raw = pd.read_csv(f"tx_expression/filtered_tpm.tsv", sep="\t", index_col=[0,1,2,3])
Y_norm = pd.read_csv(f"tx_expression/filtered_norm_tpm.tsv", sep="\t", index_col=[0,1,2,3])
print("Load genotypes")
G = pandas_plink.read_plink1_bin (bed=f"plink_pruned/medaka.bed", bim=f"plink_pruned/medaka.bim", fam=f"plink_pruned/medaka.fam", verbose=False)
G = G.sortby("sample")
print ("Load eQTL results")
df = pd.read_csv("./limix/eQTL_sig_pval_500k.tsv", sep="\t")
plot_expr_geno(df, G=G, Y=Y_raw, out_fn_prefix="./results_plots/expression_tpm_raw_top_eQTL_chr1", top=None, chrom=1, start=2.7e7, end=3e7, sort_val="pos", ncols=3, max_pval=1e-5)
plot_expr_geno(df, G=G, Y=Y_raw, out_fn_prefix="./results_plots/expression_tpm_raw_top_eQTL_chr18", top=None, chrom=18, start=400000, end=700000, sort_val="pos", ncols=3)
plot_expr_geno(df, G=G, Y=Y_raw, out_fn_prefix="./results_plots/expression_tpm_raw_top_eQTL_ENSORLT00000002786.2", top=None, tx_id="ENSORLT00000002786.2", sort_val="pos", ncols=3)
plot_expr_geno(df, G=G, Y=Y_raw, out_fn_prefix="./results_plots/expression_tpm_raw_top_eQTL", top=80)
plot_expr_geno(df, G=G, Y=Y_norm, out_fn_prefix="./results_plots/expression_tpm_norm_top_eQTL", top=80)
eqtl_df = pd.read_csv("./results_plots/expression_tpm_raw_top_eQTL_ENSORLT00000002786.2_eqtl.tsv", sep="\t")
with open("./results_plots/eQTL_ENSORLT00000002786.2.bed", "w") as fp:
for i in eqtl_df.itertuples():
fp.write(f"{i.chrom}\t{i.pos}\t{i.pos+1}\t{i.snp_id}\n")
head("./results_plots/eQTL_ENSORLT00000002786.2.bed")
geno_df = pd.read_csv("./results_plots/expression_tpm_raw_top_eQTL_ENSORLT00000002786.2_genotypes.tsv", sep="\t", index_col=0)
DNA_sample_df = pd.read_csv("/nfs/research1/birney/projects/medaka/inbred_panel/medaka-alignments-release-94/sample_link/cram_file_to_line_ids.txt", sep="\t", index_col=1)
DNA_sample_df.head()
for g in ("AA", "AB", "BB"):
s = (geno_df==g).all(axis=1)
    s = s[s].index
print(f"Genotype: {g}")
s = DNA_sample_df.loc[s].sort_values("cram_file")
display(s)
# pval_threshold = 0.01
# print ("Load transcripts info")
# tx_info_df = pd.read_csv("../pycoSnake_pipeline/results/input/transcriptome/transcriptome.tsv",sep="\t", dtype={"chrom_id":str})
# tx_info_df = tx_info_df[["transcript_id", "start","end","strand"]]
# tx_info_df = tx_info_df.rename(columns={"start":"tx_start","end":"tx_end","strand":"tx_strand"})
# print ("Load eQTL results")
# eqtl_df = pd.read_csv("./limix/eQTL_pval_adj_500k.tsv", sep="\t", dtype={"chrom":str})
# eqtl_df = eqtl_df[["transcript_id", "pos","pval"]]
# eqtl_df = eqtl_df[eqtl_df["pval"]<=pval_threshold]
# print ("Merge df")
# df = pd.merge(eqtl_df, tx_info_df, on="transcript_id", how="left")
# print ("Retain best pvalue per transcript")
# l = []
# for tx, tx_df in df.groupby("transcript_id"):
# tx_df = tx_df.sort_values("pval")
# l.append(tx_df.iloc[0])
# df = pd.DataFrame(l)
# print(len(df))
# display(df.head(10))
df = pd.read_csv("./limix/eQTL_sig_pval_500k.tsv", sep="\t", dtype={"chrom":str})
external_n_bins = 100
internal_n_bins = 30
extend = 500000
#res_d = defaultdict(Counter)
res_d=OrderedDict()
res_d["upstream"] = [0 for _ in range(external_n_bins)]
res_d["downstream"] = [0 for _ in range(external_n_bins)]
res_d["internal"] = [0 for _ in range(internal_n_bins)]
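# Metagene profile of eQTL hits: the 500 kb upstream and downstream of each transcript are split into 100 fixed-width bins each, the transcript body is rescaled into 30 bins, and each hit adds 1000/bin_width so profiles are expressed in hits per kb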
tx_len_list = []
for tx, tx_df in df.groupby("transcript_id"):
tx_start = tx_df.iloc[0].tx_start
tx_end = tx_df.iloc[0].tx_end
tx_strand = tx_df.iloc[0].tx_strand
tx_len_list.append(tx_end-tx_start)
vmin = tx_start-extend
vmax = tx_start
upstream_len_bin = (vmax-vmin)//external_n_bins
upstream_bins = np.linspace(vmin, vmax-upstream_len_bin, external_n_bins, dtype=int)
vmin = tx_end
vmax = tx_end+extend
downstream_len_bin = (vmax-vmin)//external_n_bins
downstream_bins = np.linspace(vmin, vmax-downstream_len_bin, external_n_bins, dtype=int)
vmin = tx_start
vmax = tx_end
internal_len_bin = (vmax-vmin)//internal_n_bins
internal_bins = np.linspace(vmin, vmax-internal_len_bin, internal_n_bins, dtype=int)
for row in tx_df.itertuples():
if row.pos <= tx_start:
index = np.digitize(row.pos, upstream_bins, right=False)-1
if tx_strand == "+":
res_d["upstream"][index]+=1000/upstream_len_bin
elif tx_strand == "-":
res_d["downstream"][external_n_bins-index-1]+=1000/downstream_len_bin
elif row.pos >= tx_end:
index = np.digitize(row.pos, downstream_bins, right=False)-1
if tx_strand == "+":
res_d["downstream"][index]+=1000/downstream_len_bin
elif tx_strand == "-":
res_d["upstream"][external_n_bins-index-1]+=1000/upstream_len_bin
else:
index = np.digitize(row.pos, internal_bins, right=False)-1
if tx_strand == "+":
res_d["internal"][index]+=1000/internal_len_bin
elif tx_strand == "-":
res_d["internal"][internal_n_bins-index-1]+=1000/internal_len_bin
external_xticks_bins = int((external_n_bins/10)+1)
internal_xticks_bins = int((internal_n_bins/5)+1)
ymax = 0
p_res_d=OrderedDict()
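# Lightly smooth each profile (Gaussian kernel, sigma = 0.8 bins) for display and track the global maximum so the three panels share the same y scale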
for lab, y in res_d.items():
y = gaussian_filter1d(y, sigma=0.8)
p_res_d[lab] = y
if max(y)>ymax:
ymax = max(y)
tot_bin_len = 2*external_n_bins+internal_n_bins
spacer_len = tot_bin_len/100
tot_bin_len+=spacer_len*2
with pl.style.context("ggplot"):
fig = pl.figure(figsize=(25, 7))
ax = pl.axes([0,0,external_n_bins/tot_bin_len,1])
ax.fill_between(np.arange(0, external_n_bins), p_res_d["upstream"], color="0.25")
ax.set_xticks(np.linspace(0, external_n_bins, external_xticks_bins))
ax.set_xticklabels(np.linspace(-extend, 0, external_xticks_bins, dtype=int), rotation=50)
ax.set_xlim(0, external_n_bins-1)
ax.set_ylim(0, ymax)
    ax.set_ylabel("Length normalised number of eQTL hits", labelpad=20)
    ax.set_title("Upstream", pad=20, size=16)
    ax.set_xlabel("Distance to TSS", labelpad=20)
ax = pl.axes([(external_n_bins+spacer_len)/tot_bin_len,0,internal_n_bins/tot_bin_len,1])
ax.fill_between(np.arange(0, internal_n_bins), p_res_d["internal"], color="seagreen")
ax.set_xticks(np.linspace(0, internal_n_bins, internal_xticks_bins))
ax.set_xticklabels(np.round(np.linspace(0, 1, internal_xticks_bins),1), rotation=50)
ax.set_yticklabels([])
ax.set_xlim(0, internal_n_bins-1)
ax.set_ylim(0, ymax)
ax.set_title("Metagene body", pad=30, size=16)
ax.set_xlabel("Normalised gene distance", labelpad=20)
ax = pl.axes([(external_n_bins+spacer_len*2+internal_n_bins)/tot_bin_len,0,external_n_bins/tot_bin_len,1])
ax.fill_between(np.arange(0, external_n_bins), p_res_d["downstream"], color="0.25")
ax.set_xticks(np.linspace(0, external_n_bins, external_xticks_bins))
ax.set_xticklabels(np.linspace(0, extend, external_xticks_bins, dtype=int), rotation=50)
ax.set_yticklabels([])
ax.set_xlim(0, external_n_bins-1)
ax.set_ylim(0, ymax)
ax.set_title("Downstream", pad=20, size=16)
ax.set_xlabel("Distance from TES", labelpad=20)
pl.show()
fig.savefig("./results_plots/metagene_pval_adj.svg", bbox_inches='tight')