Skip to content
Commits on Source (2)
[![DOI](https://zenodo.org/badge/97020646.svg)](https://zenodo.org/badge/latestdoi/97020646)
# SalmID
Rapid tool to check the taxonomic ID of single-isolate samples. Currently it only identifies Salmonella species and subspecies, and some common contaminants (Listeria, Escherichia).
......@@ -5,26 +7,34 @@ Rapid tool to check taxonomic ID of single isolate samples. Currently only IDs S
Python 3
## Installation:
Clone git to your machine:
The easy way with homebrew ([Linux](http://linuxbrew.sh/) or [MacOS](https://brew.sh/)):
```
git clone --recursive https://github.com/hcdenbakker/SalmID.git
brew install brewsci/bio/salmid
```
Big thanks to [Torsten Seemann](https://tseemann.github.io/) for including this in homebrew!
Make SalmID executable:
```
cd SalmID
```
Alternatively download from GitHub:
```bash
git clone https://github.com/hcdenbakker/SalmID.git
```
chmod +x SalmID.py
Build a wheel using [poetry](https://poetry.eustace.io/):
```bash
cd SalmID
poetry build
```
and install it using `pip`:
Add the SalmID folder to your path
```bash
pip install dist/salmid*.whl
```
To execute:
```
SalmID.py
SalmID.py -e .fastq.gz
```
This will perform a SalmID run on all fastq.gz files in the current directory.
```
......
[tool.poetry]
name = "salmid"
version = "0.1.23"
description = "Rapid tool to check taxonomic ID of single isolate samples. Currently only IDs Salmonella species and subspecies, and some common contaminants (Listeria, Escherichia)."
authors = ["Henk den Bakker <hcd82599@uga.edu>"]
license = "MIT"
include = [ 'salmid/invA_mers_dict', 'salmid/rpoB_mers_dict' ]
[tool.poetry.dependencies]
python = "^3.5"
[tool.poetry.dev-dependencies]
[tool.poetry.scripts]
'SalmID.py' = 'salmid.core:main'
[build-system]
requires = ["poetry>=0.12"]
build-backend = "poetry.masonry.api"
......@@ -9,12 +9,13 @@ import sys
from argparse import ArgumentParser
try:
from version import SalmID_version
except:
from .version import SalmID_version
except ImportError:
SalmID_version = "version unknown"
def reverse_complement(sequence):
    """Return the reverse complement of a nucleotide sequence.

    Upper-case A, C, G, T, N and the IUPAC ambiguity codes
    (M, R, W, S, Y, K, V, H, D, B) are supported. Any other symbol
    raises ``KeyError``.
    """
    # Position i of the first string pairs with position i of the second.
    pairing = dict(zip("ACGTNMRWSYKVHDB", "TGCANKYWSRMBDHV"))
    flipped = []
    # Walk the input back to front so the output is already reversed.
    for symbol in reversed(sequence):
        flipped.append(pairing[symbol])
    return "".join(flipped)
......@@ -48,7 +49,9 @@ def parse_args():
sys.exit(1)
return parser.parse_args()
def get_av_read_length(file):
"""Samples the first 100 reads from a fastq file and return the average read length."""
i = 1
n_reads = 0
total_length = 0
......@@ -67,9 +70,18 @@ def get_av_read_length(file):
def createKmerDict_reads(list_of_strings, kmer):
"""Count occurrences of K-mers in a list of strings
Args:
list_of_strings(list of str): nucleotide sequences as a list of strings
kmer(int): length of the K-mer to count
Returns:
dict: dictionary with kmers as keys, counts for each kmer as values"""
kmer_table = {}
for string in list_of_strings:
sequence = string.strip('\n')
if len(sequence) >= kmer:
for i in range(len(sequence) - kmer + 1):
new_mer = sequence[i:i + kmer]
new_mer_rc = reverse_complement(new_mer)
......@@ -135,6 +147,7 @@ def target_read_kmerizer_multi(file, k, kmerDict_1, kmerDict_2, mode):
mean_2 = sum([kmer_Dict2[key] for key in kmer_Dict2]) / len(mers_2)
return kmer_Dict1, kmer_Dict2, mean_1, mean_2, total_reads
def mean_cov_selected_kmers(iterable, kmer_dict, clade_specific_kmers):
'''
Given an iterable (list, set, dictionary), returns the mean coverage for the kmers in the iterable
......@@ -147,6 +160,7 @@ def mean_cov_selected_kmers(iterable, kmer_dict, clade_specific_kmers):
return 0
return sum([kmer_dict[value] for value in iterable]) / len(clade_specific_kmers)
def kmer_lists(query_fastq_gz, k,
allmers, allmers_rpoB,
uniqmers_bongori,
......@@ -214,14 +228,15 @@ def kmer_lists(query_fastq_gz, k,
p_I, p_IIa, p_IIb, p_IIIa, p_IIIb, p_IV, p_VI, p_VII, p_VIII]
return locus_scores, coverages, total_reads
def report_taxon(locus_covs, average_read_length, number_of_reads):
list_taxa = [ 'Listeria ss', 'Listeria monocytogenes', 'Escherichia sp.',
list_taxa = [ 'Listeria ss', 'Listeria monocytogenes', 'Escherichia sp.', # noqa: E201
'Salmonella bongori (rpoB)', 'Salmonella enterica (rpoB)',
'Salmonella bongori (invA)', 'S. enterica subsp. enterica (invA)',
'S. enterica subsp. salamae (invA: clade a)', 'S. enterica subsp. salamae (invA: clade b)',
'S. enterica subsp. arizonae (invA)', 'S. enterica subsp. diarizonae (invA)',
'S. enterica subsp. houtenae (invA)', 'S. enterica subsp. indica (invA)',
'S. enterica subsp. VII (invA)', 'S. enterica subsp. salamae (invA: clade VIII)']
'S. enterica subsp. VII (invA)', 'S. enterica subsp. salamae (invA: clade VIII)' ] # noqa: E202
if sum(locus_covs) < 1:
rpoB = ('No rpoB matches!', 0)
invA = ('No invA matches!', 0)
......@@ -250,7 +265,6 @@ def report_taxon(locus_covs, average_read_length, number_of_reads):
return rpoB, invA, (average_read_length * number_of_reads) / 5000000
def main():
ex_dir = os.path.dirname(os.path.realpath(__file__))
args = parse_args()
......@@ -317,9 +331,9 @@ def main():
'\t' + str(round(report[2], 1)))
else:
print(
'file\tListeria sensu stricto (rpoB)\tL. monocytogenes (rpoB)\tEscherichia spp. (rpoB)\tS. bongori (rpoB)\tS. enterica' +
'(rpoB)\tS. bongori (invA)\tsubsp. I (invA)\tsubsp. II (clade a: invA)\tsubsp. II' +
' (clade b: invA)\tsubsp. IIIa (invA)\tsubsp. IIIb (invA)\tsubsp.IV (invA)\tsubsp. VI (invA)\tsubsp. VII (invA)' +
'file\tListeria sensu stricto (rpoB)\tL. monocytogenes (rpoB)\tEscherichia spp. (rpoB)\tS. bongori (rpoB)\tS. enterica' + # noqa: E122
'(rpoB)\tS. bongori (invA)\tsubsp. I (invA)\tsubsp. II (clade a: invA)\tsubsp. II' + # noqa: E122
' (clade b: invA)\tsubsp. IIIa (invA)\tsubsp. IIIb (invA)\tsubsp.IV (invA)\tsubsp. VI (invA)\tsubsp. VII (invA)' + # noqa: E122
'\tsubsp. II (clade VIII : invA)')
if report == 'percentage':
for f in files:
......@@ -366,6 +380,7 @@ def main():
pretty_covs = [str(round(cov, 1)) for cov in coverages]
print(f.split('/')[-1] + '\t' + '\t'.join(pretty_covs))
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    main()
SalmID_version = '0.1.23'
SalmID_version = '0.12'