Skip to content
Commits on Source (8)
......@@ -428,6 +428,8 @@ SPAdes assembly:
--depth_filter DEPTH_FILTER Filter out contigs lower than this fraction of the chromosomal
depth, if doing so does not result in graph dead ends (default:
0.25)
--largest_component Only keep the largest connected component of the assembly graph
(default: keep all connected components)
--spades_tmp_dir SPADES_TMP_DIR
Specify SPAdes temporary directory using the SPAdes --tmp-dir
option (default: make a temporary directory in the output
......
unicycler (0.4.7+dfsg-3) UNRELEASED; urgency=medium
unicycler (0.4.8+dfsg-1) UNRELEASED; urgency=medium
* Team upload.
[ Michael R. Crusoe ]
* Inherit and use LDFLAGS and CPPFLAGS
* Mark unicycler-data as Multi-Arch: foreign, as recommended by the
Multiarch hinter.
* Standards-Version: 4.3.0, no changes needed
-- Michael R. Crusoe <michael.crusoe@gmail.com> Fri, 18 Jan 2019 02:34:45 -0800
[ Andreas Tille ]
* New upstream version
* debhelper-compat 12
* Standards-Version: 4.4.1
* Set upstream metadata fields: Repository.
* (Build-)Depends: bcftools
-- Andreas Tille <tille@debian.org> Tue, 12 Nov 2019 16:01:56 +0100
unicycler (0.4.7+dfsg-2) unstable; urgency=medium
......
......@@ -4,11 +4,12 @@ Uploaders: Andreas Tille <tille@debian.org>,
Liubov Chuprikova <chuprikovalv@gmail.com>
Section: science
Priority: optional
Build-Depends: debhelper (>= 11~),
Build-Depends: debhelper-compat (= 12),
dh-python,
python3-all,
python3-setuptools,
default-jdk,
bcftools,
bowtie2,
ncbi-blast+,
pilon,
......@@ -17,7 +18,7 @@ Build-Depends: debhelper (>= 11~),
spades,
libseqan2-dev,
zlib1g-dev
Standards-Version: 4.3.0
Standards-Version: 4.4.1
Vcs-Browser: https://salsa.debian.org/med-team/unicycler
Vcs-Git: https://salsa.debian.org/med-team/unicycler.git
Homepage: https://github.com/rrwick/Unicycler
......@@ -29,6 +30,7 @@ Depends: ${python3:Depends},
${misc:Depends},
python3-setuptools,
default-jre,
bcftools,
bowtie2,
ncbi-blast+,
pilon,
......
Reference:
- Author: >
- Author: >
Ryan R. Wick and Louise M. Judd and Claire L. Gorrie and Kathryn
E. Holt
Title: >
Title: >
Unicycler: Resolving bacterial genome assemblies from short and long
sequencing reads
Journal: PLOS Computational Biology
Year: 2017
Volume: 13
Number: 6
Pages: e1005595
DOI: 10.1371/journal.pcbi.1005595
PMID: 28594827
URL: "http://journals.plos.org/ploscompbiol/article?id=\
10.1371/journal.pcbi.1005595"
eprint: "http://journals.plos.org/ploscompbiol/article/\
file?id=10.1371/journal.pcbi.1005595&type=printable"
- Author: >
Journal: PLOS Computational Biology
Year: 2017
Volume: 13
Number: 6
Pages: e1005595
DOI: 10.1371/journal.pcbi.1005595
PMID: 28594827
URL: "http://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1005595"
eprint: "http://journals.plos.org/ploscompbiol/article/file?id=10.1371/journal.pcbi.1005595&type=printable"
- Author: >
Ryan R. Wick and Louise M. Judd and Claire L. Gorrie and Kathryn E. Holt
Title: >
Title: >
Completing bacterial genome assemblies with multiplex MinION sequencing
Journal: Microbial Genomics
Year: 2017
Volume: 3
Number: 10
Pages: e000132
DOI: 10.1099/mgen.0.000132
PMID: 29177090
URL: "http://mgen.microbiologyresearch.org/content/journal/\
mgen/10.1099/mgen.0.000132"
eprint: "http://mgen.microbiologyresearch.org/deliver/fulltext/\
mgen/3/10/mgen000132.pdf?itemId=/content/journal/mgen/10.1099/\
mgen.0.000132&mimeType=pdf&isFastTrackArticle="
Journal: Microbial Genomics
Year: 2017
Volume: 3
Number: 10
Pages: e000132
DOI: 10.1099/mgen.0.000132
PMID: 29177090
URL: "http://mgen.microbiologyresearch.org/content/journal/mgen/10.1099/mgen.0.000132"
eprint: "http://mgen.microbiologyresearch.org/deliver/fulltext/mgen/3/10/mgen000132.pdf?itemId=/content/journal/mgen/10.1099/mgen.0.000132&mimeType=pdf&isFastTrackArticle="
Registry:
- Name: OMICtools
Entry: OMICS_14591
- Name: bio.tools
Entry: unicycler
- Name: conda:bioconda
Entry: unicycler
- Name: SciCrunch
Entry: NA
- Name: OMICtools
Entry: OMICS_14591
- Name: bio.tools
Entry: unicycler
- Name: conda:bioconda
Entry: unicycler
- Name: SciCrunch
Entry: NA
Repository: https://github.com/rrwick/Unicycler.git
......@@ -423,6 +423,7 @@ class AssemblyGraph(object):
3) deleting the segment would not create any dead ends
"""
segment_nums_to_remove = []
total_length_removed = 0
ten_longest_contigs = sorted(self.segments.values(), reverse=True,
key=lambda x: x.get_length())[:10]
whole_graph_cutoff = self.get_median_read_depth(ten_longest_contigs) * relative_depth_cutoff
......@@ -437,7 +438,9 @@ class AssemblyGraph(object):
self.all_segments_below_depth(component, whole_graph_cutoff) or \
self.dead_end_change_if_deleted(seg_num) <= 0:
segment_nums_to_remove.append(seg_num)
total_length_removed += segment.get_length()
self.remove_segments(segment_nums_to_remove)
return len(segment_nums_to_remove), total_length_removed
def filter_homopolymer_loops(self):
"""
......@@ -455,6 +458,28 @@ class AssemblyGraph(object):
log.log('Removed homopolymer loops:', 3)
log.log_number_list(segment_nums_to_remove, 3)
def choose_largest_component(self):
"""
Special logic: throw out all of the graph's connected components except for the largest one.
"""
largest_component_length = None
connected_components = self.get_connected_components()
for component_nums in connected_components:
component_segments = [self.segments[x] for x in component_nums]
component_length = sum(x.get_length() for x in component_segments)
if largest_component_length is None or component_length > largest_component_length:
largest_component_length = component_length
segment_nums_to_remove = []
for component_nums in connected_components:
component_segments = [self.segments[x] for x in component_nums]
component_length = sum(x.get_length() for x in component_segments)
if component_length < largest_component_length:
segment_nums_to_remove += component_nums
self.remove_segments(segment_nums_to_remove)
if segment_nums_to_remove:
log.log('\nRemoved not-largest components:', 3)
log.log_number_list(segment_nums_to_remove, 3)
def remove_segments(self, nums_to_remove):
"""
This function deletes all segments in the nums_to_remove list, along with their links. It
......@@ -923,7 +948,7 @@ class AssemblyGraph(object):
dead_ends += 1
return potential_dead_ends - dead_ends
def clean(self, read_depth_filter):
def clean(self, read_depth_filter, largest_component):
"""
This function does various graph repairs, filters and normalisations to make it a bit
nicer.
......@@ -931,9 +956,12 @@ class AssemblyGraph(object):
log.log('Repair multi way junctions ' + get_dim_timestamp(), 3)
self.repair_multi_way_junctions()
log.log('Filter by read depth ' + get_dim_timestamp(), 3)
self.filter_by_read_depth(read_depth_filter)
removed_count, removed_length = self.filter_by_read_depth(read_depth_filter)
log.log('Filter homopolymer loops ' + get_dim_timestamp(), 3)
self.filter_homopolymer_loops()
if largest_component:
log.log('Keep largest component ' + get_dim_timestamp(), 3)
self.choose_largest_component()
log.log('Merge all possible ' + get_dim_timestamp(), 3)
self.merge_all_possible(None, 2)
log.log('Normalise read depths ' + get_dim_timestamp(), 3)
......@@ -943,6 +971,7 @@ class AssemblyGraph(object):
log.log('Sort link order ' + get_dim_timestamp(), 3)
self.sort_link_order()
log.log('Graph cleaning finished ' + get_dim_timestamp(), 3)
return removed_count, removed_length
def final_clean(self):
"""
......
......@@ -490,7 +490,7 @@ def get_read_loop_vote(start, end, middle, repeat, strand, minimap_alignments, r
best_score = test_seq_score
best_count = loop_count
# Break when we've hit the max loop count. But if the max isn't our best, then we keep
# Break when we've hit the max loop count. But if the max is our best, then we keep
# trying higher.
if loop_count >= max_tested_loop_count and loop_count != best_count:
break
......
......@@ -30,8 +30,8 @@ SIMPLE_REPEAT_BRIDGING_BAND_SIZE = 50
CONTIG_READ_QSCORE = 40
# This is the maximum number of times an assembly will be Racon polished
RACON_POLISH_LOOP_COUNT_HYBRID = 5
RACON_POLISH_LOOP_COUNT_LONG_ONLY = 10
RACON_POLISH_LOOP_COUNT_HYBRID = 2
RACON_POLISH_LOOP_COUNT_LONG_ONLY = 4
# This is the number of times assembly graph contigs are included in the Racon polish reads. E.g.
......
......@@ -30,7 +30,8 @@ class BadFastq(Exception):
def get_best_spades_graph(short1, short2, short_unpaired, out_dir, read_depth_filter, verbosity,
spades_path, threads, keep, kmer_count, min_k_frac, max_k_frac, kmers,
no_spades_correct, expected_linear_seqs, spades_tmp_dir):
no_spades_correct, expected_linear_seqs, spades_tmp_dir,
largest_component):
"""
This function tries a SPAdes assembly at different k-mers and returns the best.
'The best' is defined as the smallest dead-end count after low-depth filtering. If multiple
......@@ -126,7 +127,7 @@ def get_best_spades_graph(short1, short2, short_unpaired, out_dir, read_depth_fi
continue
log.log('\nCleaning k{} graph'.format(kmer), 2)
assembly_graph.clean(read_depth_filter)
assembly_graph.clean(read_depth_filter, largest_component)
clean_graph_filename = os.path.join(spades_dir, ('k%03d' % kmer) + '_assembly_graph.gfa')
assembly_graph.save_to_gfa(clean_graph_filename, verbosity=2)
......@@ -183,7 +184,7 @@ def get_best_spades_graph(short1, short2, short_unpaired, out_dir, read_depth_fi
assembly_graph = AssemblyGraph(best_graph_filename, best_kmer, paths_file=paths_file,
insert_size_mean=insert_size_mean,
insert_size_deviation=insert_size_deviation)
assembly_graph.clean(read_depth_filter)
removed_count, removed_length = assembly_graph.clean(read_depth_filter, largest_component)
clean_graph_filename = os.path.join(spades_dir, 'k' + str(best_kmer) + '_assembly_graph.gfa')
assembly_graph.save_to_gfa(clean_graph_filename, verbosity=2)
......@@ -197,9 +198,14 @@ def get_best_spades_graph(short1, short2, short_unpaired, out_dir, read_depth_fi
row_colour={best_kmer_row: 'green'},
row_extra_text={best_kmer_row: ' ' + get_left_arrow() + 'best'})
# Report on the results of the read depth filter (can help with identifying levels of
# contamination).
log.log('\nRead depth filter: removed {} contigs totalling {} bp'.format(removed_count,
removed_length))
# Clean up.
if keep < 3 and os.path.isdir(spades_dir):
log.log('\nDeleting ' + spades_dir + '/')
log.log('Deleting ' + spades_dir + '/')
shutil.rmtree(spades_dir, ignore_errors=True)
if keep < 3 and spades_tmp_dir is not None and os.path.isdir(spades_tmp_dir):
log.log('Deleting ' + spades_tmp_dir + '/')
......
......@@ -8,6 +8,7 @@
#include <limits>
#pragma GCC diagnostic ignored "-Wpragmas"
#pragma GCC diagnostic ignored "-Wunknown-warning-option"
#pragma GCC diagnostic ignored "-Wvla"
#pragma GCC diagnostic ignored "-Wvla-extension"
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
......
......@@ -85,7 +85,7 @@ def main():
args.spades_path, args.threads, args.keep,
args.kmer_count, args.min_kmer_frac, args.max_kmer_frac,
args.kmers, args.no_correct, args.linear_seqs,
args.spades_tmp_dir)
args.spades_tmp_dir, args.largest_component)
determine_copy_depth(graph)
if args.keep > 0 and not os.path.isfile(best_spades_graph):
graph.save_to_gfa(best_spades_graph, save_copy_depth_info=True, newline=True,
......@@ -346,6 +346,10 @@ def get_arguments():
help='Filter out contigs lower than this fraction of the chromosomal '
'depth, if doing so does not result in graph dead ends'
if show_all_args else argparse.SUPPRESS)
spades_group.add_argument('--largest_component', action='store_true',
help='Only keep the largest connected component of the assembly '
'graph (default: keep all connected components)'
if show_all_args else argparse.SUPPRESS)
spades_group.add_argument('--spades_tmp_dir', type=str, default=None,
help="Specify SPAdes temporary directory using the SPAdes --tmp-dir "
"option (default: make a temporary directory in the output "
......@@ -912,10 +916,10 @@ def rotate_completed_replicons(graph, args, counter):
log.log_section_header('Rotating completed replicons')
log.log_explanation('Any completed circular contigs (i.e. single contigs which have one '
'link connecting end to start) can have their start position changed '
'with altering the sequence. For consistency, Unicycler now searches '
'for a starting gene (dnaA or repA) in each such contig, and if one '
'is found, the contig is rotated to start with that gene on the '
'forward strand.')
'without altering the sequence. For consistency, Unicycler now '
'searches for a starting gene (dnaA or repA) in each such contig, and '
'if one is found, the contig is rotated to start with that gene on '
'the forward strand.')
rotation_result_table = [['Segment', 'Length', 'Depth', 'Starting gene', 'Position',
'Strand', 'Identity', 'Coverage']]
......
......@@ -13,4 +13,4 @@ details. You should have received a copy of the GNU General Public License along
not, see <http://www.gnu.org/licenses/>.
"""
__version__ = '0.4.7'
__version__ = '0.4.8'