Skip to content
Commits on Source (7)
[![DOI](https://zenodo.org/badge/97020646.svg)](https://zenodo.org/badge/latestdoi/97020646)
# SalmID
Rapid tool to check taxonomic ID of single isolate samples. Currently only IDs Salmonella species and subspecies, and some common contaminants (Listeria, Escherichia).
......@@ -5,26 +7,34 @@ Rapid tool to check taxonomic ID of single isolate samples. Currently only IDs S
Python 3
## Installation:
Clone git to your machine:
The easy way with homebrew ([Linux](http://linuxbrew.sh/) or [MacOS](https://brew.sh/)):
```
git clone --recursive https://github.com/hcdenbakker/SalmID.git
brew install brewsci/bio/salmid
```
Big thanks to [Torsten Seemann](https://tseemann.github.io/) for including this in homebrew!
Make SalmID executable:
```
cd SalmID
```
Alternatively download from GitHub:
```bash
git clone https://github.com/hcdenbakker/SalmID.git
```
chmod +x SalmID.py
build a wheel using [poetry](https://poetry.eustace.io/):
```bash
cd SalmID
poetry build
```
and install using `pip`
Add the SalmID folder to your path
```bash
pip install dist/salmid*.whl
```
To execute:
```
SalmID.py
SalmID.py -e .fastq.gz
```
This will perform a SalmID run on all fastq.gz files in the current directory.
```
......@@ -38,4 +48,4 @@ This will perform a SalmID run on all files in directory 'directory_with_data' w
## Todo's and thoughts for future releases:
- Provide coverage estimates for genomes in sample based on kmer frequencies
- Write code to use SalmID on long read (minion, pacbio) platforms
\ No newline at end of file
- Write code to use SalmID on long read (minion, pacbio) platforms
salmid (0.12-1) UNRELEASED; urgency=medium
salmid (0.1.23-1) UNRELEASED; urgency=medium
* Initial release (Closes: #<bug>)
-- Andreas Tille <tille@debian.org> Mon, 03 Sep 2018 21:23:47 +0200
-- Andreas Tille <tille@debian.org> Fri, 21 Dec 2018 17:09:43 +0100
version=4
https://github.com/hcdenbakker/SalmID/releases .*/archive/@ANY_VERSION@@ARCHIVE_EXT@
# Upstream sometimes leaves out the separating '.'
opts="uversionmangle=s/0.1([^.])/0.1.$1/" \
https://github.com/hcdenbakker/SalmID/releases .*/archive/@ANY_VERSION@@ARCHIVE_EXT@
[tool.poetry]
name = "salmid"
version = "0.1.23"
description = "Rapid tool to check taxonomic ID of single isolate samples. Currently only IDs Salmonella species and subspecies, and some common contaminants (Listeria, Escherichia)."
authors = ["Henk den Bakker <hcd82599@uga.edu>"]
license = "MIT"
include = [ 'salmid/invA_mers_dict', 'salmid/rpoB_mers_dict' ]
[tool.poetry.dependencies]
python = "^3.5"
[tool.poetry.dev-dependencies]
[tool.poetry.scripts]
'SalmID.py' = 'salmid.core:main'
[build-system]
requires = ["poetry>=0.12"]
build-backend = "poetry.masonry.api"
......@@ -9,12 +9,13 @@ import sys
from argparse import ArgumentParser
try:
from version import SalmID_version
except:
from .version import SalmID_version
except ImportError:
SalmID_version = "version unknown"
def reverse_complement(sequence):
    """Return the reverse complement of a nucleotide sequence.

    Supports the uppercase IUPAC ambiguous nucleotide codes; any other
    character raises KeyError.
    """
    pairs = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C', 'N': 'N',
             'M': 'K', 'K': 'M', 'R': 'Y', 'Y': 'R', 'W': 'W',
             'S': 'S', 'V': 'B', 'B': 'V', 'H': 'D', 'D': 'H'}
    # Complement first, then reverse — equivalent to complementing the
    # reversed sequence.
    return ''.join(pairs[nt] for nt in sequence)[::-1]
......@@ -24,63 +25,74 @@ def parse_args():
"Parse the input arguments, use '-h' for help."
parser = ArgumentParser(description='SalmID - rapid Kmer based Salmonella identifier from sequence data')
# inputs
parser.add_argument('-v','--version', action='version', version='%(prog)s ' + SalmID_version)
parser.add_argument('-v', '--version', action='version', version='%(prog)s ' + SalmID_version)
parser.add_argument(
'-i','--input_file', type=str, required=False, default= 'None', metavar = 'your_fastqgz',
'-i', '--input_file', type=str, required=False, default='None', metavar='your_fastqgz',
help='Single fastq.gz file input, include path to file if file is not in same directory ')
parser.add_argument(
'-e', '--extension', type=str, required=False, default= '.fastq.gz', metavar = 'file_extension',
'-e', '--extension', type=str, required=False, default='.fastq.gz', metavar='file_extension',
help='File extension, if specified without "--input_dir", SalmID will attempt to ID all files\n' +
' with this extension in current directory, otherwise files in input directory')
parser.add_argument(
'-d','--input_dir', type=str, required=False, default='.', metavar = 'directory',
'-d', '--input_dir', type=str, required=False, default='.', metavar='directory',
help='Directory which contains data for identification, when not specified files in current directory will be analyzed.')
parser.add_argument(
'-r', '--report', type=str, required=False, default='percentage', metavar = 'percentage, coverage or taxonomy',
'-r', '--report', type=str, required=False, default='percentage', metavar='percentage, coverage or taxonomy',
help='Report either percentage ("percentage") of clade specific kmers recovered, average kmer-coverage ("cov"), or '
'taxonomy (taxonomic species ID, plus observed mean k-mer coverages and expected coverage).')
parser.add_argument(
'-m', '--mode', type=str, required=False, default='quick', metavar = 'quick or thorough',
'-m', '--mode', type=str, required=False, default='quick', metavar='quick or thorough',
help='Quick [quick] or thorough [thorough] mode')
if len(sys.argv)==1:
if len(sys.argv) == 1:
parser.print_help(sys.stderr)
sys.exit(1)
return parser.parse_args()
def get_av_read_length(file):
    """Sample up to the first 100 reads of a fastq(.gz) file and return
    the average read length.

    Args:
        file(str): path to a fastq file; a '.gz' suffix triggers gzip reading.

    Returns:
        float: mean length of the sampled reads, 0.0 for an empty file.
    """
    n_reads = 0
    total_length = 0
    # Text mode for both paths keeps line handling uniform (the original
    # mixed bytes and str between the gz and plain branches).
    if file.endswith(".gz"):
        handle = gzip.open(file, "rt")
    else:
        handle = open(file, "r")
    with handle as file_content:
        for i, line in enumerate(file_content, start=1):
            # Line 2 of every 4-line fastq record holds the sequence.
            if i % 4 == 2:
                total_length += len(line.strip())
                n_reads += 1
            if n_reads == 100:
                break
    # Divide by the number of reads actually sampled rather than a fixed
    # 100, so files with fewer than 100 reads report a correct average.
    return total_length / n_reads if n_reads else 0.0
def createKmerDict_reads(list_of_strings, kmer):
    """Count occurrences of K-mers in a list of strings.

    Every window of length *kmer* is counted together with its reverse
    complement, so a palindromic K-mer is counted twice per occurrence.

    Args:
        list_of_strings(list of str): nucleotide sequences as a list of strings
        kmer(int): length of the K-mer to count

    Returns:
        dict: dictionary with uppercase kmers as keys, counts as values"""
    kmer_table = {}
    for string in list_of_strings:
        sequence = string.strip('\n')
        # range() is empty when len(sequence) < kmer, so no explicit guard
        # is needed.
        for i in range(len(sequence) - kmer + 1):
            # Uppercase BEFORE the lookup: the original tested membership
            # with the raw slice but stored uppercase keys, which reset
            # counts for lowercase input (and reverse_complement only
            # accepts uppercase codes).
            new_mer = sequence[i:i + kmer].upper()
            new_mer_rc = reverse_complement(new_mer)
            kmer_table[new_mer] = kmer_table.get(new_mer, 0) + 1
            kmer_table[new_mer_rc] = kmer_table.get(new_mer_rc, 0) + 1
    return kmer_table
......@@ -96,16 +108,16 @@ def target_read_kmerizer_multi(file, k, kmerDict_1, kmerDict_2, mode):
reads_2 = []
total_reads = 0
if file.endswith(".gz"):
file_content=io.BufferedReader(gzip.open(file))
file_content = io.BufferedReader(gzip.open(file))
else:
file_content=open(file,"r").readlines()
file_content = open(file, "r").readlines()
for line in file_content:
start = int((len(line) - k) // 2)
if i % 4 == 2:
total_reads += 1
if file.endswith(".gz"):
s1 = line[start:k + start].decode()
line=line.decode()
line = line.decode()
else:
s1 = line[start:k + start]
if s1 in kmerDict_1:
......@@ -126,15 +138,16 @@ def target_read_kmerizer_multi(file, k, kmerDict_1, kmerDict_2, mode):
else:
kmer_Dict1 = createKmerDict_reads(reads_1, k)
mers_1 = set([key for key in kmer_Dict1])
mean_1 = sum([kmer_Dict1[key] for key in kmer_Dict1])/len(mers_1)
mean_1 = sum([kmer_Dict1[key] for key in kmer_Dict1]) / len(mers_1)
if len(reads_2) == 0:
kmer_Dict2 = {}
else:
kmer_Dict2 = createKmerDict_reads(reads_2, k)
mers_2 = set([key for key in kmer_Dict2])
mean_2 = sum([kmer_Dict2[key] for key in kmer_Dict2])/len(mers_2)
mean_2 = sum([kmer_Dict2[key] for key in kmer_Dict2]) / len(mers_2)
return kmer_Dict1, kmer_Dict2, mean_1, mean_2, total_reads
def mean_cov_selected_kmers(iterable, kmer_dict, clade_specific_kmers):
    """Return the mean coverage for the kmers in *iterable*.

    Args:
        iterable: kmers to look up (list, set or dictionary)
        kmer_dict(dict): kmer -> observed count in the sample
        clade_specific_kmers: full collection of clade-specific kmers; the
            summed counts are averaged over this set, not over *iterable*,
            so kmers missing from the sample drag the mean down.

    Returns:
        float: sum of counts for *iterable* divided by
        len(clade_specific_kmers); 0 when *iterable* is empty.
    """
    if not iterable:
        return 0
    return sum(kmer_dict[value] for value in iterable) / len(clade_specific_kmers)
def kmer_lists(query_fastq_gz, k,
allmers,allmers_rpoB,
allmers, allmers_rpoB,
uniqmers_bongori,
uniqmers_I,
uniqmers_IIa,
......@@ -165,8 +179,8 @@ def kmer_lists(query_fastq_gz, k,
uniqmers_Listeria_ss_rpoB,
uniqmers_Lmono_rpoB,
mode):
dict_invA, dict_rpoB, mean_invA, mean_rpoB , total_reads = target_read_kmerizer_multi(query_fastq_gz, k, allmers,
allmers_rpoB, mode)
dict_invA, dict_rpoB, mean_invA, mean_rpoB, total_reads = target_read_kmerizer_multi(query_fastq_gz, k, allmers,
allmers_rpoB, mode)
target_mers_invA = set([key for key in dict_invA])
target_mers_rpoB = set([key for key in dict_rpoB])
if target_mers_invA == 0:
......@@ -211,17 +225,18 @@ def kmer_lists(query_fastq_gz, k,
S_enterica_rpoB_cov, bongori_invA_cov, I_invA_cov, IIa_invA_cov, IIb_invA_cov,
IIIa_invA_cov, IIIb_invA_cov, IV_invA_cov, VI_invA_cov, VII_invA_cov, VIII_invA_cov]
locus_scores = [p_Listeria_ss, p_Lmono, p_Escherichia, p_bongori_rpoB, p_Senterica, p_bongori,
p_I, p_IIa,p_IIb, p_IIIa, p_IIIb, p_IV, p_VI, p_VII, p_VIII]
p_I, p_IIa, p_IIb, p_IIIa, p_IIIb, p_IV, p_VI, p_VII, p_VIII]
return locus_scores, coverages, total_reads
def report_taxon(locus_covs, average_read_length, number_of_reads):
list_taxa = [ 'Listeria ss', 'Listeria monocytogenes', 'Escherichia sp.',
list_taxa = [ 'Listeria ss', 'Listeria monocytogenes', 'Escherichia sp.', # noqa: E201
'Salmonella bongori (rpoB)', 'Salmonella enterica (rpoB)',
'Salmonella bongori (invA)', 'S. enterica subsp. enterica (invA)',
'S. enterica subsp. salamae (invA: clade a)','S. enterica subsp. salamae (invA: clade b)',
'S. enterica subsp. salamae (invA: clade a)', 'S. enterica subsp. salamae (invA: clade b)',
'S. enterica subsp. arizonae (invA)', 'S. enterica subsp. diarizonae (invA)',
'S. enterica subsp. houtenae (invA)', 'S. enterica subsp. indica (invA)',
'S. enterica subsp. VII (invA)', 'S. enterica subsp. salamae (invA: clade VIII)']
'S. enterica subsp. VII (invA)', 'S. enterica subsp. salamae (invA: clade VIII)' ] # noqa: E202
if sum(locus_covs) < 1:
rpoB = ('No rpoB matches!', 0)
invA = ('No invA matches!', 0)
......@@ -229,18 +244,18 @@ def report_taxon(locus_covs, average_read_length, number_of_reads):
else:
# given list of scores get taxon
if sum(locus_covs[0:5]) > 0:
best_rpoB = max(range(len(locus_covs[1:5])), key=lambda x: locus_covs[1:5][x])+1
best_rpoB = max(range(len(locus_covs[1:5])), key=lambda x: locus_covs[1:5][x]) + 1
all_rpoB = max(range(len(locus_covs[0:5])), key=lambda x: locus_covs[0:5][x])
if (locus_covs[best_rpoB] != 0) & (all_rpoB == 0):
rpoB = (list_taxa[best_rpoB], locus_covs[best_rpoB])
elif (all_rpoB == 0) & (round(sum(locus_covs[1:5]),1) < 1):
elif (all_rpoB == 0) & (round(sum(locus_covs[1:5]), 1) < 1):
rpoB = (list_taxa[0], locus_covs[0])
else:
rpoB = (list_taxa[best_rpoB], locus_covs[best_rpoB])
else:
rpoB = ('No rpoB matches!', 0)
if sum(locus_covs[5:]) > 0:
best_invA = max(range(len(locus_covs[5:])), key=lambda x: locus_covs[5:][x])+5
best_invA = max(range(len(locus_covs[5:])), key=lambda x: locus_covs[5:][x]) + 5
invA = (list_taxa[best_invA], locus_covs[best_invA])
else:
invA = ('No invA matches!', 0)
......@@ -250,7 +265,6 @@ def report_taxon(locus_covs, average_read_length, number_of_reads):
return rpoB, invA, (average_read_length * number_of_reads) / 5000000
def main():
ex_dir = os.path.dirname(os.path.realpath(__file__))
args = parse_args()
......@@ -260,7 +274,7 @@ def main():
else:
extension = args.extension
inputdir = args.input_dir
files = [inputdir + '/'+ f for f in os.listdir(inputdir) if f.endswith(extension)]
files = [inputdir + '/' + f for f in os.listdir(inputdir) if f.endswith(extension)]
report = args.report
mode = args.mode
f_invA = open(ex_dir + "/invA_mers_dict", "rb")
......@@ -288,7 +302,7 @@ def main():
uniqmers_Escherichia_rpoB = sets_dict['uniqmers_Escherichia']
uniqmers_Listeria_ss_rpoB = sets_dict['uniqmers_Listeria_ss']
uniqmers_Lmono_rpoB = sets_dict['uniqmers_L_mono']
#todo: run kmer_lists() once, create list of tuples containing data to be used fro different reports
# todo: run kmer_lists() once, create list of tuples containing data to be used for different reports
if report == 'taxonomy':
print('file\trpoB\tinvA\texpected coverage')
for f in files:
......@@ -317,55 +331,56 @@ def main():
'\t' + str(round(report[2], 1)))
else:
print(
'file\tListeria sensu stricto (rpoB)\tL. monocytogenes (rpoB)\tEscherichia spp. (rpoB)\tS. bongori (rpoB)\tS. enterica' +
'(rpoB)\tS. bongori (invA)\tsubsp. I (invA)\tsubsp. II (clade a: invA)\tsubsp. II' +
' (clade b: invA)\tsubsp. IIIa (invA)\tsubsp. IIIb (invA)\tsubsp.IV (invA)\tsubsp. VI (invA)\tsubsp. VII (invA)' +
'file\tListeria sensu stricto (rpoB)\tL. monocytogenes (rpoB)\tEscherichia spp. (rpoB)\tS. bongori (rpoB)\tS. enterica' + # noqa: E122
'(rpoB)\tS. bongori (invA)\tsubsp. I (invA)\tsubsp. II (clade a: invA)\tsubsp. II' + # noqa: E122
' (clade b: invA)\tsubsp. IIIa (invA)\tsubsp. IIIb (invA)\tsubsp.IV (invA)\tsubsp. VI (invA)\tsubsp. VII (invA)' + # noqa: E122
'\tsubsp. II (clade VIII : invA)')
if report == 'percentage':
for f in files:
locus_scores, coverages , reads = kmer_lists( f, 27,
allmers,allmers_rpoB,
uniqmers_bongori,
uniqmers_I,
uniqmers_IIa,
uniqmers_IIb,
uniqmers_IIIa,
uniqmers_IIIb,
uniqmers_IV,
uniqmers_VI,
uniqmers_VII,
uniqmers_VIII,
uniqmers_bongori_rpoB,
uniqmers_S_enterica_rpoB,
uniqmers_Escherichia_rpoB,
uniqmers_Listeria_ss_rpoB,
uniqmers_Lmono_rpoB,
mode)
locus_scores, coverages, reads = kmer_lists(f, 27,
allmers, allmers_rpoB,
uniqmers_bongori,
uniqmers_I,
uniqmers_IIa,
uniqmers_IIb,
uniqmers_IIIa,
uniqmers_IIIb,
uniqmers_IV,
uniqmers_VI,
uniqmers_VII,
uniqmers_VIII,
uniqmers_bongori_rpoB,
uniqmers_S_enterica_rpoB,
uniqmers_Escherichia_rpoB,
uniqmers_Listeria_ss_rpoB,
uniqmers_Lmono_rpoB,
mode)
pretty_scores = [str(round(score)) for score in locus_scores]
print(f.split('/')[-1] +'\t' + '\t'.join(pretty_scores))
print(f.split('/')[-1] + '\t' + '\t'.join(pretty_scores))
else:
for f in files:
locus_scores, coverages , reads = kmer_lists( f, 27,
allmers,allmers_rpoB,
uniqmers_bongori,
uniqmers_I,
uniqmers_IIa,
uniqmers_IIb,
uniqmers_IIIa,
uniqmers_IIIb,
uniqmers_IV,
uniqmers_VI,
uniqmers_VII,
uniqmers_VIII,
uniqmers_bongori_rpoB,
uniqmers_S_enterica_rpoB,
uniqmers_Escherichia_rpoB,
uniqmers_Listeria_ss_rpoB,
uniqmers_Lmono_rpoB,
mode)
locus_scores, coverages, reads = kmer_lists(f, 27,
allmers, allmers_rpoB,
uniqmers_bongori,
uniqmers_I,
uniqmers_IIa,
uniqmers_IIb,
uniqmers_IIIa,
uniqmers_IIIb,
uniqmers_IV,
uniqmers_VI,
uniqmers_VII,
uniqmers_VIII,
uniqmers_bongori_rpoB,
uniqmers_S_enterica_rpoB,
uniqmers_Escherichia_rpoB,
uniqmers_Listeria_ss_rpoB,
uniqmers_Lmono_rpoB,
mode)
pretty_covs = [str(round(cov, 1)) for cov in coverages]
print(f.split('/')[-1] + '\t' + '\t'.join(pretty_covs))
if __name__ == '__main__':
main()
# Single source of the tool version; keep in sync with [tool.poetry]
# version in pyproject.toml (the span previously held two conflicting
# assignments left over from a diff).
SalmID_version = '0.1.23'