Commit 02d8b73b authored by Afif Elghraoui's avatar Afif Elghraoui

New upstream version 0~20170131+ds

parents
user_data/*
!user_data/.htaccess
user_uploads/*
!user_uploads/.htaccess
*.delta
python_env
reproducibility
The MIT License (MIT)
Copyright (c) 2016 Maria Nattestad
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
# Assemblytics (full web application)
This repo contains the full Assemblytics web application.
The web app is running on assemblytics.com, and the command-line version is available at https://github.com/marianattestad/assemblytics
This repository exists so you can install it on your own servers or locally on your computer if need be.
The code here includes the interactive dot plot (which you can't use from the command-line version).
Dependencies:
- R
- ggplot
- plyr
- Python
- argparse
- numpy
Notes for installation:
- Make sure to open up permissions in user_uploads and user_data so the webserver can read and write there.
- It does not contain the examples as some of these are huge files.
<!DOCTYPE html>
<html>
<!-- NAVIGATION BAR-->
<?php include "header.html";?>
<link rel="stylesheet" type="text/css" media="screen" href="http://cdnjs.cloudflare.com/ajax/libs/fancybox/1.3.4/jquery.fancybox-1.3.4.css" />
<!-- <div class="row"> -->
<!--LEFT-->
<!-- <div class="col-lg-8"> -->
<!-- ////////////////////////////////////////////////// -->
<!-- //////////////// RESULTS //////////////// -->
<!-- ////////////////////////////////////////////////// -->
<!-- HEADER -->
<div class="thumbnail frame">
<div class = "caption" style="text-align: center"><h3 id="nickname_header"></h3></div>
</div>
<div id="results">
<!-- All plots -->
<div class="thumbnail plot_frame frame">
<div id="container_for_all_plots">
<!-- ALL PLOTS GO HERE -->
</div>
<div style="display:block">
<!-- <p id="missing_plots"><p> -->
<br>
<p>
All plots are available as both .png and .pdf files through the Download button below and can be used in publications.<a href="http://www.ncbi.nlm.nih.gov/pubmed/27318204"> Cite Assemblytics.</a>
</p>
</div>
</div>
<!-- Assembly statistics N50 etc. -->
<div class="thumbnail frame plot_frame">
<div class="caption">
<h4>Assembly statistics</h4>
<p id="landing_for_assembly_stats"></p>
</div>
</div>
<!-- Variant statistics -->
<div class="thumbnail frame plot_frame">
<div class="caption">
<h4>Variant summary statistics</h4>
<p id="landing_for_summary_statistics"></p>
</div>
</div>
<div class="thumbnail frame plot_frame">
<div class="caption">
<h4>Variant file preview</h4>
<p id="landing_for_variant_file_preview"></p>
</div>
</div>
<!-- Download button -->
<div class="thumbnail frame plot_frame">
<div class="caption">
<h4>Download all data</h4>
<p><a href="" download class="btn btn-primary" class="download_btn" id="download_zip" role="button">Download zip file of all results</a>
<!-- <a href="" download class="btn btn-default" class="download_btn" id="down_txt_1" role="button">Download all variants (.bed file)</a> -->
</p>
</div>
</div>
</div>
<!-- </div> -->
<!-- RIGHT-->
<!-- <div class="col-lg-4"> -->
<!-- </div> -->
<!-- </div> -->
<!-- </div> end of centered middle of body -->
<!--View analysis later-->
<div id="codepanel" >
<div class="panel panel-info">
<div class="panel-heading">
<h3 class="panel-title">View analysis later</h3>
</div>
<div id="code" class="panel-body">
<?php
$code=$_GET["code"];
$url="http://assemblytics.com/analysis.php?code=$code";
echo "Return to view your results at any time: <input type=\"text\" class=\"form-control\" value=\"$url\"></input>";
?>
</div>
</div>
</div>
<!-- ////////////////////////////////////////////////// -->
<!-- ///////////// Progress info ///////////// -->
<!-- ////////////////////////////////////////////////// -->
<div class="panel panel-info center" id="progress_panel">
<div class="panel-heading">
<h3 class="panel-title">Progress</h3>
</div>
<div class="panel-body">
<div id="plot_info">
Checking progress...
</div>
</div>
</div> <!-- End of progress info -->
<!-- ////////////////////////////////////////////////// -->
<!-- jquery must be first because bootstrap depends on it -->
<script src="js/jquery.min.js"></script>
<script src="js/bootstrap.min.js"></script>
<script type="text/javascript" src="https://www.google.com/jsapi"></script>
<script type='text/javascript' src="js/analysis_page_script.js?rndstr="<?php rand(100000,999999) ?> ></script>
<script type="text/javascript" src="http://code.jquery.com/jquery-migrate-1.2.1.min.js"></script>
<script type="text/javascript" src="http://cdnjs.cloudflare.com/ajax/libs/fancybox/1.3.4/jquery.fancybox-1.3.4.pack.min.js"></script>
</body>
</html>
# Author: Maria Nattestad
# Email: mnattest@cshl.edu
# This script is part of Assemblytics, a program to detect and analyze structural variants from an assembly aligned to a reference genome using MUMmer.
library(ggplot2)
library(scales)
args<-commandArgs(TRUE)
prefix <- args[1]
filename_ref <- paste(prefix, ".coords.ref.genome", sep="")
filename_query <- paste(prefix, ".coords.query.genome", sep="")
ref.data <- read.csv(filename_ref, sep="\t", quote='',header=FALSE)
query.data <- read.csv(filename_query, sep="\t", quote='',header=FALSE)
names(ref.data) <- c("name","length")
names(query.data) <- c("name","length")
ref.data$length <- as.numeric(ref.data$length)
query.data$length <- as.numeric(query.data$length)
genome.length <- max(sum(ref.data$length),sum(query.data$length))
ref.cumsum <- data.frame(NG=cumsum(ref.data$length/genome.length*100),contig.length=ref.data$length,contig.source="Reference")
query.cumsum <- data.frame(NG=cumsum(query.data$length/genome.length*100),contig.length=query.data$length,contig.source="Query")
both.plot <- rbind(ref.cumsum,query.cumsum)
ref.cumsum.0 <- rbind(data.frame(NG=c(0),contig.length=max(ref.cumsum$contig.length),contig.source="Reference"),ref.cumsum)
query.cumsum.0 <- rbind(data.frame(NG=c(0),contig.length=max(query.cumsum$contig.length),contig.source="Query"),query.cumsum)
with.zeros <- rbind(ref.cumsum.0,query.cumsum.0)
bp_format<-function(num) {
if (num > 1000000000) {
paste(formatC(num/1000000000,format="f",digits=3,big.mark=",",drop0trailing = TRUE)," Gbp",sep="")
}
else if (num > 1000000) {
paste(formatC(num/1000000,format="f",digits=3,big.mark=",",drop0trailing = TRUE)," Mbp",sep="")
}
else {
paste(formatC(num,format="f",big.mark=",",drop0trailing = TRUE), " bp", sep="")
}
}
theme_set(theme_bw(base_size = 12) + theme(panel.grid.minor = element_line(colour = NA)))
colors <- c("blue","limegreen")
for (to_png in c(TRUE,FALSE)) {
if (to_png) {
png(file=paste(prefix,".Assemblytics.Nchart.png",sep=""),width=1000,height=1000,res=200)
} else {
pdf(paste(prefix,".Assemblytics.Nchart.pdf",sep=""))
}
if (nrow(with.zeros) > 2) {
print(
ggplot(with.zeros, aes(x = NG, y = contig.length, color=contig.source)) +
xlim(0,100) +
scale_y_log10(breaks = trans_breaks("log10", function(x) 10^x), labels = trans_format("log10", math_format(10^.x)), limits=c(1,genome.length)) +
geom_path(size=1.5,alpha=0.5) +
geom_point(data=both.plot,size=2,alpha=0.5) +
labs(x = paste("NG(x)% where 100% = ",bp_format(genome.length), sep=""),y="Sequence length",colour="Assembly",title="Cumulative sequence length") +
scale_color_manual(values=colors) +
annotation_logticks(sides="lr")
)
} else {
# To make bacterial genomes at least show a dot instead of an error because
# they only have 1 contig
print(
ggplot(both.plot, aes(x = NG, y = contig.length, color=contig.source)) +
xlim(0,100) +
scale_y_log10(breaks = trans_breaks("log10", function(x) 10^x), labels = trans_format("log10", math_format(10^.x)), limits=c(1,genome.length)) +
# geom_path(size=1.5,alpha=0.5) +
geom_point(size=4,alpha=0.5) +
labs(x = paste("NG(x)% where 100% = ",bp_format(genome.length), sep=""),y="Sequence length",colour="Assembly",title="Cumulative sequence length") +
scale_color_manual(values=colors) +
annotation_logticks(sides="lr")
)
}
dev.off()
}
This diff is collapsed.
library(ggplot2)
args<-commandArgs(TRUE)
prefix <- args[1]
filename <- paste(prefix,".oriented_coords.csv",sep="")
plot.output.filename <- paste(prefix,".Assemblytics.Dotplot_filtered",sep="")
plot.title <- "Dot plot of Assemblytics filtered alignments"
ref.pos <- function(chrom,pos,chr.lengths) {
chrom.index <- which(names(chr.lengths)==chrom)-1
offset.based.on.previous.chromosomes <- 0
if (chrom.index != 0) {
offset.based.on.previous.chromosomes <- sum(as.numeric(chr.lengths[c(1:chrom.index)]))
}
loc <- offset.based.on.previous.chromosomes + pos
loc
}
coords <- read.csv(filename,sep=",",header=TRUE)
if (nrow(coords)>100000) {
coords <- coords[1:100000,]
}
# names(coords) <- c("ref_start", "ref_end","query_start","query_end","ref_length","query_length","ref","query")
coords$ref <- as.character(coords$ref)
coords$query <- as.character(coords$query)
ordered_common_chromosome_names <- c(seq(1,100),paste("chr",seq(1,100),sep=""),paste("Chr",seq(1,100),sep=""),c("X","Y","M","MT","Chr0","chr0","0"))
all_chromosomes_some_ordered <- c(intersect(ordered_common_chromosome_names,unique(coords$ref)),setdiff(unique(coords$ref),ordered_common_chromosome_names))
coords$ref <- factor(coords$ref,levels=all_chromosomes_some_ordered)
chromosomes <- levels(coords$ref)
chr.lengths <- sapply(chromosomes,function(chr){max(coords[coords$ref==chr,"ref_length"])})
names(chr.lengths) <- chromosomes
coords <- cbind(coords, alignment.length=abs(coords$query_start-coords$query_end))
coords <- cbind(coords, ref.loc.start=mapply(FUN=ref.pos,coords$ref,coords$ref_start,MoreArgs=list(chr.lengths)),
ref.loc.stop=mapply(FUN=ref.pos,coords$ref,coords$ref_end,MoreArgs=list(chr.lengths)))
# pick longest alignment. then pick the ref.loc.start of that
query.group <- split(coords,factor(coords$query))
ref.loc.of.longest.alignment.by.query <- unlist(sapply(query.group, function(coords.for.each.query) {coords.for.each.query$ref.loc.start[coords.for.each.query$alignment.length==max(coords.for.each.query$alignment.length)][1]}),recursive=FALSE)
# decide optimal-ish ordering of the queries
ordered.query.names <- names(ref.loc.of.longest.alignment.by.query)[order(ref.loc.of.longest.alignment.by.query)]
# construct a query.lengths list
query.lengths <- sapply(ordered.query.names,function(each.query){
max(coords[coords$query==each.query,"query_length"])
})
# use the query.lengths to give offset positions to each query, adding a query.loc.start column and a query.loc.stop column to each entry in filtered.coords
coords$query.loc.start <- mapply(FUN=ref.pos,coords$query,coords$query_start,MoreArgs=list(query.lengths))
coords$query.loc.stop <- mapply(FUN=ref.pos,coords$query,coords$query_end,MoreArgs=list(query.lengths))
# Hide labels for chromosomes accounting for less than 2% of the reference
chr.labels <- names(chr.lengths)
chr.labels[chr.lengths < 0.02*sum(as.numeric(chr.lengths))] <- ""
query.labels <- names(query.lengths)
query.labels[query.lengths < 0.02*sum(as.numeric(query.lengths))] <- ""
theme_set(theme_bw(base_size = 24))
colors <- c("black","red")
coords$tag <- factor(coords$tag,levels=c("unique","repetitive"))
# CREATE PNG
png(file=paste(plot.output.filename, ".png",sep=""),width=1000,height=1000)
print(ggplot(coords, aes(x=ref.loc.start,xend=ref.loc.stop,y=query.loc.start,yend=query.loc.stop,color=tag)) + geom_segment(lineend="butt",size=1.5) + labs(x="Reference",y="Query",title=plot.title) + scale_y_continuous(breaks = cumsum(as.numeric(query.lengths)),labels=query.labels,expand=c(0,0), limits = c(0,sum(as.numeric(query.lengths)))) + scale_x_continuous(breaks = cumsum(as.numeric(chr.lengths)),labels=chr.labels,expand=c(0,0),limits=c(0,sum(as.numeric(chr.lengths)))) + scale_color_manual(values=colors,name="Filter") + theme(
axis.ticks.y=element_line(size=0),
axis.text.x = element_text(angle = 90, hjust = 1,vjust=-0.5),
axis.text.y = element_text(size=12,vjust=1.1),
plot.title = element_text(vjust=3),
panel.grid.major.x = element_line(colour = "black",size=0.1),
panel.grid.major.y = element_line(colour = "black",size=0.1),
panel.grid.minor = element_line(NA)
))
dev.off()
# CREATE PDF
# pdf(file=paste(plot.output.filename, ".pdf",sep=""),width=100,height=100,res=100)
# print(ggplot(coords, aes(x=ref.loc.start,xend=ref.loc.stop,y=query.loc.start,yend=query.loc.stop)) + geom_segment(lineend="butt",size=1.5) + labs(x="Reference",y="Query",title=plot.title) + scale_y_continuous(breaks = cumsum(as.numeric(query.lengths)),labels=query.labels,expand=c(0,0), limits = c(0,sum(as.numeric(query.lengths)))) + scale_color_manual(values=colors) + scale_x_continuous(breaks = cumsum(as.numeric(chr.lengths)),labels=chr.labels,expand=c(0,0),limits=c(0,sum(as.numeric(chr.lengths)))) + theme(
# axis.ticks.y=element_line(size=0),
# axis.text.x = element_text(angle = 90, hjust = 1,vjust=-0.5),
# axis.text.y = element_text(size=12,vjust=1.1),
# plot.title = element_text(vjust=3),
# panel.grid.major.x = element_line(colour = "black",size=0.1),
# panel.grid.major.y = element_line(colour = "black",size=0.1),
# panel.grid.minor = element_line(NA)
# ))
#
# dev.off()
#! /usr/bin/env python
# Author: Maria Nattestad
# Email: mnattest@cshl.edu
# This script is part of Assemblytics, a program to detect and analyze structural variants from an assembly aligned to a reference genome using MUMmer.
import argparse
import numpy as np
import re
import operator
def run(args):
coords = args.coords
output_prefix = args.out
f = open(coords)
f.readline() # ignore header
fields_by_query = {}
existing_query_names = set()
existing_reference_names = set()
reference_lengths = []
query_lengths = {}
for line in f:
fields = line.strip().split(",")
query_name = fields[7]
query_lengths[query_name] = int(fields[5])
if not query_name in existing_query_names:
fields_by_query[query_name] = []
existing_query_names.add(query_name)
fields_by_query[query_name].append(fields)
ref_name = fields[6]
ref_length = int(fields[4])
if not ref_name in existing_reference_names:
existing_reference_names.add(ref_name)
reference_lengths.append((ref_name,ref_length))
f.close()
# Find the order of the reference chromosomes
reference_lengths.sort(key=lambda x: natural_key(x[0]))
# Find the cumulative sums
cumulative_sum = 0
ref_chrom_offsets = {}
queries_by_reference = {}
for ref,ref_length in reference_lengths:
ref_chrom_offsets[ref] = cumulative_sum
cumulative_sum += ref_length
queries_by_reference[ref] = set()
# Calculate relative positions of each alignment in this cumulative length, and take the median of these for each query, then sort the queries by those scores
flip_by_query = {}
references_by_query = {} # for index
relative_ref_position_by_query = [] # for ordering
for query_name in fields_by_query:
lines = fields_by_query[query_name]
sum_forward = 0
sum_reverse = 0
amount_of_reference = {}
ref_position_scores = []
references_by_query[query_name] = set()
for ref,ref_length in reference_lengths:
amount_of_reference[ref] = 0
for fields in lines:
tag = fields[8]
if tag == "unique":
query_stop = int(fields[3])
query_start = int(fields[2])
ref_start = int(fields[0])
ref_stop = int(fields[1])
alignment_length = abs(int(fields[3])-int(fields[2]))
ref = fields[6]
# for index:
references_by_query[query_name].add(ref)
queries_by_reference[ref].add(query_name)
# amount_of_reference[ref] += alignment_length
# for ordering:
ref_position_scores.append(ref_chrom_offsets[ref] + (ref_start+ref_stop)/2)
# for orientation:
if query_stop < query_start:
sum_reverse += alignment_length
else:
sum_forward += alignment_length
# orientation:
flip_by_query[query_name] = sum_reverse > sum_forward
# for ref in amount_of_reference:
# if amount_of_reference[ref] > 0:
# references_by_query[query_name].add(ref)
# queries_by_reference[ref].add(query_name)
# ordering
if len(ref_position_scores) > 0:
relative_ref_position_by_query.append((query_name,np.median(ref_position_scores)))
else:
relative_ref_position_by_query.append((query_name,0))
relative_ref_position_by_query.sort(key=lambda x: x[1])
fout_ref_index = open(output_prefix + ".ref.index",'w')
fout_ref_index.write("ref,ref_length,matching_queries\n")
# reference_lengths is sorted by the reference chromosome name
for ref,ref_length in reference_lengths:
fout_ref_index.write("%s,%d,%s\n" % (ref,ref_length,"~".join(queries_by_reference[ref])))
fout_ref_index.close()
fout_query_index = open(output_prefix + ".query.index",'w')
fout_query_index.write("query,query_length,matching_refs\n")
# relative_ref_position_by_query is sorted by rel_pos
for query,rel_pos in relative_ref_position_by_query:
fout_query_index.write("%s,%d,%s\n" % (query,query_lengths[query],"~".join(references_by_query[query])))
fout_query_index.close()
f = open(coords)
fout = open(output_prefix + ".oriented_coords.csv",'w')
header = f.readline().strip()
fout.write(header+",alignment_length\n") # copy the header
alignment_length_column = len(header.split(","))
# sorted_by_alignment_length = []
uniques = []
repetitives = []
for line in f:
fields = line.strip().split(",")
query_name = fields[7]
if flip_by_query[query_name] == True:
fields[2] = int(fields[5]) - int(fields[2])
fields[3] = int(fields[5]) - int(fields[3])
alignment_length = abs(int(fields[2])-int(fields[1]))
fields.append(alignment_length)
if fields[8] == "unique":
uniques.append(fields)
else:
repetitives.append(fields)
f.close()
uniques.sort(key=lambda x: x[alignment_length_column],reverse=True)
repetitives.sort(key=lambda x: x[alignment_length_column],reverse=True)
fout_info = open(output_prefix + ".info.csv",'w')
fout_info.write("key,value\n")
fout_info.write("unique alignments,%d\n" % len(uniques))
fout_info.write("repetitive alignments,%d\n" % len(repetitives))
for fields in uniques:
fout.write(",".join(map(str,fields)) + "\n")
if len(repetitives) < 100000:
for fields in repetitives:
fout.write(",".join(map(str,fields)) + "\n")
fout_info.write("showing repetitive alignments,True\n")
else:
fout_repeats = open(output_prefix + ".oriented_coords.repetitive.csv",'w')
fout_repeats.write(header+",alignment_length\n") # copy the header
for fields in repetitives:
fout_repeats.write(",".join(map(str,fields)) + "\n")
fout_repeats.close()
fout_info.write("showing repetitive alignments,False: Too many\n")
fout.close()
fout_info.close()
def natural_key(string_):
"""See http://www.codinghorror.com/blog/archives/001018.html"""
return [int(s) if s.isdigit() else s for s in re.split(r'(\d+)', string_)]
def main():
parser=argparse.ArgumentParser(description="Index and orient a coordinate file for dotplots.")
parser.add_argument("-coords",help="coords.csv file from Assemblytics_uniq_anchor.py" ,dest="coords", type=str, required=True)
parser.add_argument("-out",help="output prefix for indices and oriented coordinates file" ,dest="out", type=str, required=True)
parser.set_defaults(func=run)
args=parser.parse_args()
args.func(args)
if __name__=="__main__":
main()
\ No newline at end of file
#!/usr/bin/env python
# Author: Maria Nattestad
# Email: mnattest@cshl.edu
# This script is part of Assemblytics, a program to detect and analyze structural variants from an assembly aligned to a reference genome using MUMmer.
import argparse
import numpy as np
def SVtable(args):
filename = args.file
minimum_variant_size = args.minimum_variant_size
maximum_variant_size = args.maximum_variant_size
simplify_types = False
f=open(filename)
typeList = []
sizeList = []
rawTypes = []
linecounter = 0
for line in f:
fields = line.strip().split()
if not fields[4].isdigit():
continue
svType = fields[6]
rawTypes.append(svType)
if simplify_types == True:
if svType == "Insertion" or svType == "Expansion":
typeList.append("Insertion/Expansion")
elif svType == "Deletion" or svType == "Contraction":
typeList.append("Deletion/Contraction")
else:
typeList.append(svType)
else:
typeList.append(svType)
sizeList.append(int(fields[4]))
linecounter += 1
f.close()
size_thresholds = [10,50,500,10000,50000,100000,500000,1000000]
sizeArray = np.array(sizeList)
typeArray = np.array(typeList)
svTypes = ["Insertion","Deletion","Tandem_expansion","Tandem_contraction","Repeat_expansion","Repeat_contraction"]
if simplify_types == True:
svTypes = ["Insertion/Expansion","Deletion/Contraction"]
overall_total = 0
overall_total_bases = 0
overall_total_SVs = 0
overall_total_SV_bases = 0
SV_size = 50
all_SV_types = svTypes + list(set(rawTypes)-set(svTypes))
f_output_csv = open(filename[0:-4]+".summary.csv",'w')
if linecounter > 0:
for svType in all_SV_types:
sizes = sizeArray[typeArray==svType]
overall_total += len(sizes)
overall_total_bases += sum(sizes)
overall_total_SVs += len(sizes[sizes>=SV_size])
overall_total_SV_bases += sum(sizes[sizes>=SV_size])
print svType
f_output_csv.write(svType + "\n")
format = "%20s%10s%15s"
print format % ("", "Count","Total bp")
f_output_csv.write("Size range,Count,Total bp\n")
previous_size = minimum_variant_size
for threshold in size_thresholds:
if threshold <= minimum_variant_size or previous_size >= maximum_variant_size:
continue
subset = sizes[np.logical_and(sizes>=previous_size,sizes<threshold)];
print format % ("%s-%s bp: " % (intWithCommas(previous_size),intWithCommas(threshold)), str(len(subset)), str(sum(subset)))
f_output_csv.write("%s,%s,%s\n" % ("%s-%s bp" % (previous_size,threshold), str(len(subset)), str(sum(subset))))
previous_size = threshold
if previous_size < maximum_variant_size:
subset = sizes[sizes>=previous_size];
print format % ("> %s bp: " % (intWithCommas(previous_size)), str(len(subset)), str(sum(subset)))
f_output_csv.write("%s,%s,%s\n" % (