Skip to content
Commits on Source (8)
Metadata-Version: 1.0
Name: cgecore
Version: 1.5.0
Version: 1.5.2
Summary: Center for Genomic Epidemiology Core Module
Home-page: https://bitbucket.org/genomicepidemiology/cge_core_module
Author: Center for Genomic Epidemiology
......
Metadata-Version: 1.0
Name: cgecore
Version: 1.5.0
Version: 1.5.2
Summary: Center for Genomic Epidemiology Core Module
Home-page: https://bitbucket.org/genomicepidemiology/cge_core_module
Author: Center for Genomic Epidemiology
......
PKG-INFO
README.md
setup.cfg
setup.py
cgecore/__init__.py
cgecore/alignment.py
......@@ -18,3 +20,10 @@ cgecore/organisminfo/gram_neg.txt
cgecore/organisminfo/gram_pos.txt
cgecore/organisminfo/gramstain.py
cgecore/organisminfo/species.py
debian/changelog
debian/control
debian/copyright
debian/rules
debian/watch
debian/source/format
debian/upstream/metadata
\ No newline at end of file
......@@ -13,7 +13,7 @@ from .argumentparsing import (check_file_type, get_arguments, get_string,
)
#####################
__version__ = "1.5.0"
__version__ = "1.5.2"
__all__ = [
"argumentparsing",
"cmdline",
......
......@@ -98,7 +98,6 @@ class Blaster():
# Parsing over the hits and only keeping the best
for blast_record in blast_records:
# OLD CODE TO BE REMOVED
# query = blast_record.query
# blast_record.alignments.sort(key=lambda align: (
......@@ -113,7 +112,6 @@ class Blaster():
key=lambda align: (max((int(hsp.identities)
for hsp in align.hsps))),
reverse=True)
query = blast_record.query
for alignment in blast_record.alignments:
......@@ -121,13 +119,13 @@ class Blaster():
# HSP fragment
best_e_value = 1
best_bit = 0
start_hsp = 0
end_hsp = 0
for hsp in alignment.hsps:
if hsp.expect < best_e_value or hsp.bits > best_bit:
best_e_value = hsp.expect
best_bit = hsp.bits
# best_e_value = hsp.expect
# best_bit = hsp.bits
tmp = alignment.title.split(" ")
sbjct_header = tmp[1]
# DEBUG
......@@ -158,7 +156,6 @@ class Blaster():
hit_id = "%s:%s..%s:%s:%f" % (
contig_name, query_start, query_end,
sbjct_header, cal_score)
# If the hit is on the other strand
if sbjct_start > sbjct_end:
tmp = sbjct_start
......@@ -198,6 +195,8 @@ class Blaster():
'perc_coverage': perc_coverage
}
# Saving the result if any
if best_hsp:
save = 1
......@@ -206,7 +205,6 @@ class Blaster():
if gene_results:
tmp_gene_split = gene_split
tmp_results = gene_results
# Compare the hit results
save, gene_split, gene_results = (
self.compare_results(save, best_hsp,
......@@ -326,15 +324,23 @@ class Blaster():
old_HSP = hit_data['HSP_length']
remove_old = 0
# If they align to the same gene in the database they are
# compared
if new_db_hit == old_db_hit:
# If the hit provids additional sequence it is kept and
#If the hit comes from different contig
if old_contig != new_contig:
# Save a split if the new hit still creats one
if(new_db_hit in tmp_gene_split
and hit_id not in tmp_gene_split[new_db_hit]):
tmp_gene_split[new_db_hit][hit_id] = 1
# If the hit provides additional sequence it is kept and
# the new coverage is saved otherwise the one with the
# highest score is kept
if(new_start_sbjct < old_start_sbjct
elif(new_start_sbjct < old_start_sbjct
or new_end_sbjct > old_end_sbjct):
# Save the hits as split
......@@ -342,28 +348,29 @@ class Blaster():
if hit not in tmp_gene_split[old_db_hit]:
tmp_gene_split[old_db_hit][hit] = 1
else:
if new_score > old_score:
# else:
# if new_score > old_score:
# Set to remove old hit
remove_old = 1
# remove_old = 1
# Save a split if the new hit still creats one
if(new_db_hit in tmp_gene_split
and hit_id not in tmp_gene_split[new_db_hit]):
# if(new_db_hit in tmp_gene_split
# and hit_id not in tmp_gene_split[new_db_hit]):
tmp_gene_split[new_db_hit][hit_id] = 1
else:
save = 0
# tmp_gene_split[new_db_hit][hit_id] = 1
# else:
# save = 0
# If the old and new hit is not identical the
# possible saved gene split for the new hit is
# removed
if hit_id != hit:
if(new_db_hit in tmp_gene_split
and hit_id in tmp_gene_split[new_db_hit]):
# if hit_id != hit:
# if(new_db_hit in tmp_gene_split
# and hit_id in tmp_gene_split[new_db_hit]):
del tmp_gene_split[new_db_hit][hit_id]
break
# del tmp_gene_split[new_db_hit][hit_id]
# break
# If the hits comes form the same part of the contig
# sequnce but match different genes only the best hit is
......
......@@ -36,9 +36,28 @@ class CGEFinder():
kma_apm=None, kma_memmode=False, kma_nanopore=False, debug=False,
kma_add_args=None, kma_cge=False, kma_1t1=False):
"""
I expect that there will only be one hit pr gene, but if there are
TODO: Result storage - Too complex. Not effective.
Before changing code: Check downstream dependencies of results
dicts.
Currently the code stores results in four different dicts. This can
be reduced to just one.
The main dict that stores all results currently stores one result pr
hit, and distiguishes hits to the same gene by adding an increasing
integer (obtained by getting the length of the internal gene dict).
The main dict also stores an internal dict for each 'gene'
containing an empty dict for each hit.
Solution: Create a main<dict> -> gene<list> -> hit<dict> design, and
remove all the references main -> hit. This reduces redundancy and
the need for manipulating gene names with an incremental integer.
TODO: Method too many responsibilities
Solution: Create KMA class, create additional functions.
Original comment:
"I expect that there will only be one hit pr gene, but if there are
more, I assume that the sequence of the hits are the same in the res
file and the aln file.
file and the aln file."
Not sure if this holds for the current code.
"""
threshold = threshold * 100
min_cov = min_cov * 100
......@@ -117,16 +136,24 @@ class CGEFinder():
"\n{}\n{}".format(out.decode("utf-8"),
err.decode("utf-8")))
gene_res_count = {}
for line in res_file:
if kma_results[db] == 'No hit found':
kma_results[db] = dict()
# kma_results[db]["excluded"] = dict()
# continue
data = [data.strip() for data in line.split("\t")]
gene = data[0]
gene_count = gene_res_count.get(gene, 0)
gene_count = gene_count + 1
gene_res_count[gene] = gene_count
if(gene_count == 1):
hit = gene
else:
hit = "{}_{}".format(gene, gene_count)
sbjct_len = int(data[3])
sbjct_ident = float(data[4])
coverage = float(data[5])
......@@ -134,11 +161,6 @@ class CGEFinder():
q_value = float(data[-2])
p_value = float(data[-1])
if gene not in kma_results[db]:
hit = gene
else:
hit = gene + "_" + str(len(kma_results[db][gene]) + 1)
exclude_reasons = []
if(coverage < min_cov or sbjct_ident < threshold):
......@@ -146,7 +168,6 @@ class CGEFinder():
exclude_reasons.append(sbjct_ident)
if(exclude_reasons):
# kma_results[db]["excluded"][hit] = exclude_reasons
kma_results["excluded"][hit] = exclude_reasons
kma_results[db][hit] = dict()
......@@ -171,7 +192,7 @@ class CGEFinder():
# Open align file
with open(align_filename, "r") as align_file:
hit_no = dict()
gene_aln_count = {}
gene = ""
# Parse through alignments
for line in align_file:
......@@ -183,17 +204,13 @@ class CGEFinder():
if line.startswith("#"):
gene = line[1:].strip()
if gene not in hit_no:
hit_no[gene] = str(1)
else:
hit_no[gene] += str(int(hit_no[gene]) + 1)
gene_count = gene_aln_count.get(gene, 0)
gene_aln_count[gene] = gene_count + 1
else:
# Check if gene one of the user specified genes
if hit_no[gene] == '1':
if(gene_aln_count[gene] == 1):
hit = gene
else:
hit = gene + "_" + hit_no[gene]
hit = "{}_{}".format(gene, gene_aln_count[gene])
if hit in kma_results[db]:
line_data = line.split("\t")[-1].strip()
......@@ -209,7 +226,7 @@ class CGEFinder():
else:
print(hit + " not in results: ", kma_results)
# concatinate all sequences lists and find subject start
# concatinate all sequence lists and find subject start
# and subject end
gene_align_sbjct[db] = {}
......@@ -217,8 +234,6 @@ class CGEFinder():
gene_align_homo[db] = {}
for hit in kma_results[db]:
# if(hit == "excluded"):
# continue
align_sbjct = "".join(kma_results[db][hit]['sbjct_string'])
align_query = "".join(kma_results[db][hit]['query_string'])
align_homo = "".join(kma_results[db][hit]['homo_string'])
......
python-cgecore (1.5.2-1) unstable; urgency=medium
* Team upload.
* New upstream version
* Standards-Version: 4.4.1
* Set upstream metadata fields: Repository.
* Remove obsolete field Name from debian/upstream/metadata (already
present in machine-readable debian/copyright).
* Cleaning egg-info directory
-- Steffen Moeller <moeller@debian.org> Mon, 06 Jan 2020 17:33:07 +0100
python-cgecore (1.5.0-1) unstable; urgency=medium
* New upstream version
......
......@@ -9,7 +9,7 @@ Build-Depends: debhelper-compat (= 12),
python3,
python3-setuptools,
python3-biopython
Standards-Version: 4.4.0
Standards-Version: 4.4.1
Vcs-Browser: https://salsa.debian.org/med-team/python-cgecore
Vcs-Git: https://salsa.debian.org/med-team/python-cgecore.git
Homepage: https://bitbucket.org/genomicepidemiology/cge_core_module/
......
......@@ -4,3 +4,7 @@ export PYBUILD_NAME=cgecore
%:
dh $@ --with python3 --buildsystem=pybuild
override_dh_auto_clean:
dh_auto_clean
rm -rf cgecore.egg-info
Name: cgecore
Registry:
- Name: conda:bioconda
Entry: cgecore
Repository: https://bitbucket.org/genomicepidemiology/cge_core_module