Skip to content
Commits on Source (6)
# Change Log
[v2.14.3](https://github.com/sanger-pathogens/ariba/tree/v2.14.3) (2019-08-23)
[Full Changelog](https://github.com/sanger-pathogens/ariba/compare/v2.14.2...v2.14.3)
**Fixed bugs:**
- Version 3.0.3 of CARD breaks prepareref [\#278](https://github.com/sanger-pathogens/ariba/issues/278)
- RT 667288: Change the Dockerfile to copy the local Ariba source instead of cloning it from git
[v2.14.2](https://github.com/sanger-pathogens/ariba/tree/v2.14.2) (2019-06-18)
[Full Changelog](https://github.com/sanger-pathogens/ariba/compare/v2.14.1...v2.14.2)
......
......@@ -7,7 +7,10 @@ MAINTAINER ariba-help@sanger.ac.uk
# Software version numbers
ARG BOWTIE2_VERSION=2.2.9
ARG SPADES_VERSION=3.13.1
ARG ARIBA_VERSION=2.14.2
ARG ARIBA_TAG=master
ARG ARIBA_BUILD_DIR=/ariba
ARG LOCALE_COUNTRY=en_GB
RUN apt-get -qq update && \
apt-get install --no-install-recommends -y \
......@@ -27,10 +30,18 @@ RUN apt-get -qq update && \
wget \
zlib1g-dev
# Install locales
RUN apt-get update && apt-get install -y locales-all && rm -rf /var/lib/apt/lists/*
# Set a default locale.
ENV LANG=${LOCALE_COUNTRY}.UTF-8 \
LANGUAGE=${LOCALE_COUNTRY}:en
# Install bowtie
RUN wget -q http://downloads.sourceforge.net/project/bowtie-bio/bowtie2/${BOWTIE2_VERSION}/bowtie2-${BOWTIE2_VERSION}-linux-x86_64.zip \
&& unzip bowtie2-${BOWTIE2_VERSION}-linux-x86_64.zip \
&& rm -f bowtie2-${BOWTIE2_VERSION}-linux-x86_64.zip
# Install SPAdes
RUN wget -q https://github.com/ablab/spades/releases/download/v${SPADES_VERSION}/SPAdes-${SPADES_VERSION}-Linux.tar.gz \
&& tar -zxf SPAdes-${SPADES_VERSION}-Linux.tar.gz \
&& rm -f SPAdes-${SPADES_VERSION}-Linux.tar.gz
......@@ -40,13 +51,15 @@ RUN wget -q https://github.com/ablab/spades/releases/download/v${SPADES_VERSION}
ENV ARIBA_BOWTIE2=$PWD/bowtie2-${BOWTIE2_VERSION}/bowtie2 ARIBA_CDHIT=cdhit-est MPLBACKEND="agg"
ENV PATH=$PATH:$PWD/SPAdes-${SPADES_VERSION}-Linux/bin
RUN cd /usr/local/bin && ln -s /usr/bin/python3 python && cd
RUN ln -s -f /usr/bin/python3 /usr/local/bin/python
RUN git clone https://github.com/sanger-pathogens/ariba.git \
&& cd ariba \
&& git checkout v${ARIBA_VERSION} \
&& rm -rf .git \
# Install Ariba
RUN mkdir -p $ARIBA_BUILD_DIR
COPY . $ARIBA_BUILD_DIR
RUN cd $ARIBA_BUILD_DIR \
&& python3 setup.py clean --all \
&& python3 setup.py test \
&& python3 setup.py install
&& python3 setup.py install \
&& rm -rf $ARIBA_BUILD_DIR
CMD ariba
......@@ -16,6 +16,8 @@ class RefPreparer:
version_report_lines=None,
min_gene_length=6,
max_gene_length=10000,
min_noncoding_length=6,
max_noncoding_length=20000,
genetic_code=11,
cdhit_min_id=0.9,
cdhit_min_length=0.0,
......@@ -38,6 +40,8 @@ class RefPreparer:
self.all_coding = all_coding
self.min_gene_length = min_gene_length
self.max_gene_length = max_gene_length
self.min_noncoding_length = min_noncoding_length
self.max_noncoding_length = max_noncoding_length
self.genetic_code = genetic_code
self.cdhit_min_id = cdhit_min_id
self.cdhit_min_length = cdhit_min_length
......@@ -177,6 +181,8 @@ class RefPreparer:
self.metadata_tsv_files,
min_gene_length=self.min_gene_length,
max_gene_length=self.max_gene_length,
min_noncoding_length = self.min_noncoding_length,
max_noncoding_length = self.max_noncoding_length,
genetic_code=self.genetic_code,
)
......@@ -213,8 +219,9 @@ class RefPreparer:
pickle.dump(clusters, f)
if number_of_removed_seqs > 0:
print('WARNING.', number_of_removed_seqs, 'sequence(s) excluded. Please see the log file 01.filter.check_genes.log for details. This will show them:', file=sys.stderr)
print('WARNING.', number_of_removed_seqs, 'sequence(s) excluded. Please see the 01.filter.check_genes.log and 01.filter.check_noncoding.log for details. This will show them:', file=sys.stderr)
print(' grep REMOVE', os.path.join(outdir, '01.filter.check_genes.log'), file=sys.stderr)
print(' cat', os.path.join(outdir, '01.filter.check_noncoding.log'), file=sys.stderr)
if number_of_bad_variants_logged > 0:
print('WARNING. Problem with at least one variant. Problem variants are removed. Please see the file', os.path.join(outdir, '01.filter.check_metadata.log'), 'for details.', file=sys.stderr)
......@@ -19,6 +19,8 @@ class ReferenceData:
rename_file=None,
min_gene_length=6,
max_gene_length=10000,
min_noncoding_length=6,
max_noncoding_length=20000,
genetic_code=11,
parameters_file=None,
):
......@@ -26,6 +28,8 @@ class ReferenceData:
self.seq_dicts = {}
self.min_gene_length = min_gene_length
self.max_gene_length = max_gene_length
self.min_noncoding_length = min_noncoding_length
self.max_noncoding_length = max_noncoding_length
self.sequences, self.metadata = ReferenceData._load_input_files_and_check_seq_names(fasta_files, metadata_tsv_files)
if len(self.sequences) == 0:
......@@ -208,7 +212,7 @@ class ReferenceData:
for sequence_name, metadata_dict in sorted(all_metadata.items()):
if sequence_name in removed_sequences:
print(sequence_name, 'was removed because does not look like a gene, so removing its metadata', file=log_fh)
print(sequence_name, 'was removed because it failed filtering checks, so removing its metadata', file=log_fh)
log_lines += 1
del all_metadata[sequence_name]
continue
......@@ -278,6 +282,16 @@ class ReferenceData:
return got[0], 'KEEP\tMade into gene. strand=' + got[1] + ', frame=' + str(got[2])
@classmethod
def _check_noncoding_seq(cls, seq, min_length, max_length):
if len(seq) < min_length:
return False, 'REMOVE\tToo short. Length: ' + str(len(seq))
elif len(seq) > max_length:
return False, 'REMOVE\tToo long. Length: ' + str(len(seq))
else:
return True, None
@classmethod
def _remove_bad_genes(cls, sequences, metadata, log_file, min_gene_length, max_gene_length):
to_remove = set()
......@@ -308,11 +322,46 @@ class ReferenceData:
return to_remove
@classmethod
def _remove_bad_noncoding_seqs(cls, sequences, metadata, log_file, min_noncoding_length, max_noncoding_length):
    '''Removes noncoding sequences whose length is outside the allowed range.

    Only names whose metadata entry has 'seq_type' equal to 'n' are checked.
    Each sequence that fails the length check is removed from sequences
    (modified in place) and one tab-separated line (name, reason) is
    written to log_file.

    Returns the set of names that were removed. If sequences is empty, an
    empty set is returned and log_file is not opened.'''
    to_remove = set()

    if not sequences:
        return to_remove

    log_fh = pyfastaq.utils.open_file_write(log_file)

    try:
        for name in sorted(sequences):
            if metadata[name]['seq_type'] != 'n':
                continue

            valid, message = ReferenceData._check_noncoding_seq(sequences[name], min_noncoding_length, max_noncoding_length)

            if not valid:
                to_remove.add(name)

            if message is not None:
                print(name, message, sep='\t', file=log_fh)
    finally:
        # Always release the log handle, even if a metadata lookup or the
        # length check raises part way through the loop.
        pyfastaq.utils.close(log_fh)

    # Removal is deferred until after the loop so that the mapping is not
    # modified while its names are being walked.
    for name in to_remove:
        sequences.pop(name)

    return to_remove
def sanity_check(self, outprefix):
    '''Filters the loaded reference sequences and their variant metadata.

    Runs the gene check (logged to outprefix + '.check_genes.log') and
    then the noncoding length check (logged to
    outprefix + '.check_noncoding.log'). The names removed by either check
    are passed to _filter_bad_variant_data so that their metadata is
    dropped as well.

    Returns a tuple: (total number of removed sequences, the value
    returned by _filter_bad_variant_data).'''
    # The superseded gene-only implementation used to return before the
    # noncoding check could run; only the combined implementation is kept.
    removed_gene_seqs = self._remove_bad_genes(
        self.sequences,
        self.metadata,
        outprefix + '.check_genes.log',
        self.min_gene_length,
        self.max_gene_length,
    )

    removed_noncoding_seqs = self._remove_bad_noncoding_seqs(
        self.sequences,
        self.metadata,
        outprefix + '.check_noncoding.log',
        self.min_noncoding_length,
        self.max_noncoding_length,
    )

    all_removed_seqs = removed_gene_seqs.union(removed_noncoding_seqs)
    log_lines = ReferenceData._filter_bad_variant_data(self.sequences, self.metadata, outprefix, all_removed_seqs)
    return len(all_removed_seqs), log_lines
@classmethod
def _new_seq_name(cls, name):
......
......@@ -18,6 +18,8 @@ def run(options):
version_report_lines=version_report_lines,
min_gene_length=options.min_gene_length,
max_gene_length=options.max_gene_length,
min_noncoding_length=options.min_noncoding_length,
max_noncoding_length=options.max_noncoding_length,
genetic_code=options.genetic_code,
cdhit_min_id=options.cdhit_min_id,
cdhit_min_length=options.cdhit_min_length,
......
>noncoding1-toolong
CTACTGATCATCTACTATCTGCATCGATGCCTGATCTA
>noncoding2
CTACTGAT
>cannot_make_into_a_gene
AAAAAAAAAAAAAAAA
>noncoding3-tooshort
C
>gene1
ATGGATCGTGAAGCGATGACCCATGAAGCGACCGAACGCTAA
>noncoding4-toolong
CTACTGATCATCTACTATCTG
>noncoding5-tooshort
CTCTC
>gene2
ATGGATCGCGAAGCGATGACCCATGAAGCGACCGAACGCTAA
cannot_make_into_a_gene 1 0 . . .
noncoding1-toolong 0 0 . . .
noncoding2 0 0 C4T . .
noncoding3-tooshort 0 0 C4T . .
noncoding4-toolong 0 0 C4T . .
noncoding5-tooshort 0 0 C4T . .
gene1 1 0 . . .
gene2 1 0 . . .
\ No newline at end of file
cannot_make_into_a_gene was removed because does not look like a gene, so removing its metadata
cannot_make_into_a_gene was removed because it failed filtering checks, so removing its metadata
input fasta file: /Users/kp11/workspace/applications/Ariba/ariba/ariba/tests/data/ref_preparer_test_run.in.4.fa
input tsv file: /Users/kp11/workspace/applications/Ariba/ariba/ariba/tests/data/ref_preparer_test_run.in.4.tsv
genetic_code 1
noncoding1-toolong noncoding1_toolong
noncoding3-tooshort noncoding3_tooshort
noncoding4-toolong noncoding4_toolong
noncoding5-tooshort noncoding5_tooshort
ARIBA run with this command:
setup.py prepareref test
from this directory: /Users/kp11/workspace/applications/Ariba/ariba
cannot_make_into_a_gene REMOVE Does not look like a gene (tried both strands and all reading frames) AAAAAAAAAAAAAAAA
gene1 KEEP Made into gene. strand=+, frame=0
gene2 KEEP Made into gene. strand=+, frame=0
cannot_make_into_a_gene was removed because it failed filtering checks, so removing its metadata
noncoding1_toolong was removed because it failed filtering checks, so removing its metadata
noncoding3_tooshort was removed because it failed filtering checks, so removing its metadata
noncoding4_toolong was removed because it failed filtering checks, so removing its metadata
noncoding5_tooshort was removed because it failed filtering checks, so removing its metadata
noncoding1_toolong REMOVE Too long. Length: 38
noncoding3_tooshort REMOVE Too short. Length: 1
noncoding4_toolong REMOVE Too long. Length: 21
noncoding5_tooshort REMOVE Too short. Length: 5
>gene1
ATGGATCGTGAAGCGATGACCCATGAAGCGACCGAACGCTAA
>gene2
ATGGATCGCGAAGCGATGACCCATGAAGCGACCGAACGCTAA
>noncoding2
CTACTGAT