HEAD ======= >>>>>>> 1f72b2c38317bbf9e22e15671a199f851eaded2e
library(here)
source(here::here("code/scripts/repeats/source.R"))
Working directory on EBI cluster: /hps/research1/birney/users/ian/mikk_paper
Jack’s directory here: /nfs/leia/research/enright/jack/10_medaka_fish/02_genome_repeats/repeatmasker_filtered/medaka_hdrr
# Create directory for repeats work
mkdir repeats
# Pull data over from Jack's repo via local
scp brettell@yoda:/nfs/leia/research/enright/jack/10_medaka_fish/02_genome_repeats/repeatmasker_filtered/medaka_hdrr/processed/medaka_hdrr_repeats.fixed.gff ~/Documents/Repositories/mikk_genome/data
scp ~/Desktop/medaka_hdrr_repeats.fixed.gff brettell@ebi:/hps/research1/birney/users/ian/mikk_paper/repeats
# Read in data
= read.table(file.path(lts_dir, "medaka_hdrr_repeats.fixed.gff"),
hdrr_reps header = F, sep = "\t", skip = 3, comment.char = "", quote = "", as.is = T) %>%
# Remove empty V8 column
::select(-V8) %>%
dplyr# Get class of repeat from third column
::mutate(class = stringr::str_split(V3, pattern = "#", simplify = T)[, 1]) %>%
dplyr# Rename columns
::rename(chr = V1, tool = V2, class_full = V3, start = V4, end = V5, percent = V6, strand = V7, info = V9)
dplyr
::datatable(head(hdrr_reps, 100)) DT
# Find types of class other than "(GATCCA)n" types
= unique(hdrr_reps$class[grep(")n", hdrr_reps$class, invert = T)])
class_types class_types
[1] “DNA” “Unknown” “SINE” “LINE” “LTR” “Simple_repeat” “Satellite” “RC” “tRNA”
[10] “rRNA” “Retroposon” “snRNA” "" “buffer” “ARTEFACT” “scRNA”
# Find types of class other than "(GATCCA)n" types
= unique(hdrr_reps$class[grep(")n", hdrr_reps$class, invert = T)])
class_types class_types
[1] “DNA” “Unknown” “SINE” “LINE” “LTR” “Simple_repeat” “Satellite”
[8] “RC” “tRNA” “rRNA” “Retroposon” “snRNA” "" “buffer”
[15] “ARTEFACT” “scRNA”
# How many in the blank class?
length(which(hdrr_reps$class == ""))
[1] 8
# Recode class
= hdrr_reps %>%
hdrr_reps # NA for blanks
::mutate(class = dplyr::na_if(class, "")) %>%
dplyr# "misc" for others in "(GATCCA)n" type classes
::mutate(class = dplyr::if_else(!class %in% class_types, "Misc.", class)) %>%
dplyr# rename "Simple_repeat"
::mutate(class = dplyr::recode(class, "Simple_repeat" = "Simple repeat"))
dplyr
::datatable(head(hdrr_reps, 1000)) DT
# How many repeats in each category?
%>%
hdrr_reps count(class) %>%
::datatable(., options = list(pageLength = nrow(.))) DT
# Exclude classes with very few counts
= c("ARTEFACT", "buffer", "scRNA", "snRNA")
excluded_classes
= hdrr_reps %>%
plot_list ::mutate(chr = paste("chr", chr, sep = ""),
dplyrchr = factor(chr, levels = chr_order),
strand = factor(strand, levels = strand_order)) %>%
::filter(!class %in% excluded_classes) %>%
dplyr# dplyr::filter(strand == "+") %>%
# dplyr::slice_sample(n = 20000) %>%
# sort
::arrange(class, strand, chr, start) %>%
dplyr# get length and log length of repeats
::mutate(length = end - start + 1,
dplyrlog_length = log(length)) %>%
::select(chr, start, end, strand, class, length, log_length) %>%
dplyr# remove the couple of rows with NA in `chr` column
na.omit(.)
# Get max lengths for each class (rounded up to the nearest 1000)
= plot_list %>%
max_lengths_df ::group_by(class) %>%
dplyr::summarise(max = round.choose(max(length), 1e3, dir = 1)) %>%
dplyr::arrange(desc(max))
dplyr
= max_lengths_df %>%
max_lengths ::pull(max)
dplyr
= max_lengths_df %>%
class_order ::pull(class)
dplyr
# Change order for best
= c("Misc.",
class_order "LINE",
"DNA",
"LTR",
"Unknown",
"RC",
"Satellite",
"rRNA",
"Simple repeat",
"SINE",
"tRNA",
"Retroposon")
# Order by longest repeats
= plot_list %>%
plot_list ::mutate(class = factor(class, levels = class_order))
dplyr
# Split by class and strand
= plot_list %>%
plot_list split(., .$class) %>%
lapply(., function(class){
= split(class, class$strand)
out
return(out)
})
= lapply(plot_list, function(class){
plot_list lapply(class, function(strand){
%>%
strand ::mutate(length = ifelse(strand == "-",
dplyr* -1,
length %>%
length)) # choose target values
::select(chr, start, end, length)
dplyr
})
})
# True
= plot_list test
= here::here("docs/plots/repeats/20210302_hdrr_repeats_true.png")
out_plot png(out_plot,
width = 20,
height = 20,
units = "cm",
res = 400)
# Choose palette
= grDevices::colorRampPalette(pal_electroangler)(length(test))
pal
# Set parameters
## Decrease cell padding from default c(0.02, 1.00, 0.02, 1.00)
circos.par(cell.padding = c(0, 0, 0, 0),
track.margin = c(0, 0),
gap.degree = c(rep(1, nrow(chroms) - 1), 11))
# Initialize plot
circos.initializeWithIdeogram(chroms,
plotType = c("axis", "labels"),
major.by = 1e7,
axis.labels.cex = 0.25*par("cex"))
= 0
counter_class lapply(test, function(class){
# set counter_class
<<- counter_class + 1
counter_class # set max ylim
= max_lengths[counter_class]
max_length = 0
counter lapply(class, function(strand){
# Set counter
<<- counter + 1
counter # Set y-limits for both strands
if (names(class)[counter] == "+"){
= c(0, max_length)
ylim
}else if (names(class)[counter] == "-"){
= c(-max_length, 0)
ylim
}
# Add track
circos.genomicTrack(strand,
panel.fun = function(region, value, ...) {
circos.genomicLines(region,
value,type = "h",
col = pal[counter_class],
cex = 0.05,
baseline = 0)
},track.height = 0.03,
ylim = ylim,
bg.border = NA)
# Add labels
if (names(class)[counter] == "+"){
# y-axis
circos.yaxis(side = "right",
at = ylim,
labels.cex = 0.2*par("cex"),
tick.length = 2
)
# strand
circos.text(0, max_length / 2,
labels = "+",
sector.index = "chr1",
cex = 0.4*par("cex"))
else if (names(class)[counter] == "-"){
} # strand
circos.text(0, -(max_length / 2),
labels = "-",
sector.index = "chr1",
cex = 0.4*par("cex"))
# repeat class
circos.text(0, 0,
labels = names(test)[counter_class],
facing = "clockwise",
adj = c(0.5, -0.5),
sector.index = "chr1",
cex = 0.25*par("cex"))
}
})
})
circos.clear()
dev.off()
::include_graphics(here::here("docs/plots/repeats/20210302_hdrr_repeats_true.png")) knitr
# Get sequence of start to end for each chr
= hdrr_reps %>%
cov_list ::filter(chr != "MT") %>%
dplyr::mutate(chr = factor(chr, levels = 1:24)) %>%
dplyrsplit(., f = .$chr)
#
= lapply(cov_list, function(chr){
tmp apply(chr, 1, function(x){
seq(x[["start"]], x[["end"]])
%>%
}) unlist(.) %>%
sort(.) %>%
unique(.) %>%
length(.)
})
= data.frame(CHR = names(tmp),
repeat_cov REP_COV = unlist(tmp))
= repeat_cov %>%
repeat_cov ::filter(is.na(CHR) == F) %>%
dplyr::mutate(CHR = paste("chr", CHR, sep = "")) %>%
dplyr::full_join(y = chroms, by = c("CHR" = "chr")) %>%
dplyr::mutate(CHR = str_replace(CHR, "chr", ""),
dplyrCHR = factor(CHR, levels = 1:24)) %>%
::select(CHR, START = start, END = end, REP_COV) %>%
dplyr::mutate(REP_PROP = REP_COV/END) dplyr
Plot
= repeat_cov %>%
repeat_cov_plot ggplot() +
geom_col(aes(CHR, REP_PROP, fill = CHR)) +
theme_cowplot(font_size = 10) +
guides(fill = F) +
xlab("Chromosome") +
ylab("Proportion of chromosome\ncovered by repeats")
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> = "none")` instead.
repeat_cov_plot
= grDevices::colorRampPalette(pal_electroangler)(length(plot_list))
pal
= lapply(plot_list, function(class){
hist_df ::bind_rows(class, .id = "strand")
dplyr%>%
}) ::bind_rows(.id = "class") %>%
dplyr::mutate(class = factor(class, levels = class_order),
dplyrlength = ifelse(length < 0, length * -1, length))
= hist_df %>%
hist_plot ggplot(aes(x = log10(length),
# remove values under 0 created by the log10
y = ifelse(log10(..count..) < 0,
0,
log10(..count..)),
fill = class,
colour = class)) +
geom_area(stat = "bin",
bins = 20) +
# geom_freqpoly(aes(colour = class)) +
facet_wrap(~class, nrow = 3, ncol = 4) +
scale_fill_manual(values = pal) +
scale_colour_manual(values = karyoploteR::darker(pal)) +
guides(fill = F) +
guides(colour = F) +
xlab(expression(log[10](length))) +
ylab(expression(log[10](count))) +
theme_cowplot(font_size = 10) +
theme(strip.background = element_blank())
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> = "none")` instead.
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> = "none")` instead.
hist_plot
= here::here("plots", "repeats", "20210302_hdrr_repeats_true.png")
circos_repeats_path
= ggdraw() +
final_svtype draw_image(circos_repeats_path,
x = 0, y = 0, width = 1, height = .75, scale = 1.12) +
draw_plot(repeat_cov_plot,
x = 0, y = .75, width = .4, height = .25) +
draw_plot(hist_plot,
x = .4, y = .75, width = .6, height = .25) +
draw_plot_label(label = c("A", "B", "C"), size = 25,
x = c(0, .39, 0), y = c(1, 1, .75),
hjust = c(-.25, 0, -.25),
color = "#4f0943")
final_svtype
ggsave(here::here("plots", "repeats", "20210401_hdrr_repeats_final.png"),
device = "png",
dpi = 400,
units = "cm",
width = 30,
height = 42)