Skip to content
Commits on Source (7)
......@@ -17,20 +17,17 @@ gcc-latest-alloptions:
- tar xzf diamond-linux64.tar.gz
- mkdir ~/bin
- cp diamond ~/bin
- perl proteinortho*pl -project=testasd -cpus=1 -ram=100 -verbose=2 -selfblast -silent -force -desc -checkfasta -cleanblast -debug -binpath=~/bin -tmp='~/' -e=0.000001 -sim=0.9 -identity=20 -cov=30 -subparaBlast='--more-sensitive' -synteny -dups=1 -cs=4 -alpha=0.4 -conn=0.01 -purity=0.00001 -minspecies=2 -subparaCluster='-cpus 1 -seed 1' -nograph -singles -xml -exactstep3 test/*faa >/dev/null 2>&1 && rm testasd*info* && export LC_NUMERIC="C" && export LC_ALL="C" && for f in testasd.*; do sort $f >$f.testasd; done; sha256sum -b *testasd | tr -d '\n' | awk '{if($0 == "eb88ba29afd4f2dba16d3dbf97a5b0d2ab7686654a854f8502f0e778628e7f56 *testasd.descriptions.testasd120f22094e2d6a75fb650523c7b5c2763a316aa7f8884dff0cbe3ccd002c9e1e *testasd.ffadj-graph.testasd9ad470e29be4937c6f4996f80221ede51670824bb2e4bb4a50946062a130ffd7 *testasd.poff.html.testasd4f8263bb4b2738e528635f3e121c659407119a1aecafb5340c9d28f5bd66cdaf *testasd.poff.tsv.testasd26d7f5d7b87dd7b71b4920753dc65e7c303e89cdfa56d3aaf00033c7918e6d10 *testasd.poff.tsv.xml.testasdf80df4c1a951bfb55b02300a273f6395694f01e8ae908e296d9c14a847d432ac *testasd.proteinortho.html.testasdfa18e9a0530f5a5754f045cfe97deaf818bdb5eb725619952633f1da0641cf7b *testasd.proteinortho.tsv.testasdc598b8c43e48e06614ec19e2f6b870e2737a7117a50ab2b1613880764d0884b2 *testasd.proteinortho.tsv.xml.testasd"){print $0." -> OK"; exit 0}else{print $0." -> failed"; exit 1}}'
- perl proteinortho*pl -project=testasd -cpus=1 -ram=100 -verbose=2 -selfblast -silent -force -desc -checkfasta -cleanblast -debug -binpath=~/bin -tmp='~/' -e=0.000001 -sim=0.9 -identity=20 -cov=30 -subparaBlast='--more-sensitive' -synteny -dups=1 -cs=4 -alpha=0.4 -conn=0.01 -purity=0.00001 -minspecies=2 -subparaCluster='-cpus 1 -seed 1' -nograph -singles -xml -exactstep3 test/*faa >/dev/null 2>&1 && rm testasd*poff* && rm testasd*fadj* && rm testasd*info* && export LC_NUMERIC="C" && export LC_ALL="C" && for f in testasd.*; do sort $f >$f.testasd; done; sha256sum -b *testasd | tr -d '\n' | awk '{if($0 == "eb88ba29afd4f2dba16d3dbf97a5b0d2ab7686654a854f8502f0e778628e7f56 *testasd.descriptions.testasdf80df4c1a951bfb55b02300a273f6395694f01e8ae908e296d9c14a847d432ac *testasd.proteinortho.html.testasdfa18e9a0530f5a5754f045cfe97deaf818bdb5eb725619952633f1da0641cf7b *testasd.proteinortho.tsv.testasdc598b8c43e48e06614ec19e2f6b870e2737a7117a50ab2b1613880764d0884b2 *testasd.proteinortho.tsv.xml.testasd"){print $0." -> OK"; exit 0}else{print $0." -> failed"; exit 1}}'
gcc-latest-all-p:
image: gcc
stage: recompile-and-test
script:
#- apt-get -y install libboost-all-dev
- export CWD=$(pwd)
- echo "installing last"
- wget http://last.cbrc.jp/last-982.zip && unzip last*zip 2>/dev/null && cd last*/ && make && cp src/last* $HOME
- cd $CWD && echo "installing usearch"
- curl https://drive5.com/cgi-bin/upload3.py?license=2019070410321731111 --output $HOME/usearch && chmod +x $HOME/usearch
#- echo "installing rapsearch"
#- git clone https://github.com/zhaoyanswill/RAPSearch2 && cd RAP*/Src && make && mv *rapsearch* $HOME && cd ../../
- cd $CWD && echo "installing mmseqs2"
- git clone https://github.com/soedinglab/MMseqs2 && cd MMs* && cmake . && make && cp src/mmseqs $HOME && cd ..
- cd $CWD && echo "installing blat"
......
......@@ -221,3 +221,6 @@
updated shebang of ffadj such that python2.7 is used directly (ffadj fails if called with higher version of python)
-p=blastp is now alias of blastp+ and legacy blast is now -p=blastp_legacy (blastn is equivalent)
Makefile: static now includes -lquadmath
25. Sept (uid: 3899)
synteny update to python3 (but the code looks fishy; the -synteny option now emits a deprecation warning)
proteinortho now only prints html for <10 files automatically and otherwise only gives the option
......@@ -12,14 +12,14 @@
# Run 'make install' for installing the compiled files to /usr/local/bin
# Run 'make install PREFIX=/home/paul/bin/' for local installation
############ FLAGS: ##########################################
############ OPTIONS: ##########################################
## STATIC=TRUE : enable static compiling (default:FALSE)
## CXX=g++ : the g++ compiler
## CXXFLAGS = compiler flags passed to g++
## CXXLIBRARY = the path to the libs like lapack,... (don't forget the -L)
## CXXINCLUDE = include path (^) (don't forget the -I)
## PREFIX = the installation prefix (only for make install)
##############################################################
################################################################
###########################
## environment variables ##
......@@ -32,12 +32,24 @@ USELAPACK=TRUE
# compile statically
STATIC=FALSE
ifdef static
STATIC=$(static)
endif
ifdef PREFIX
INSTALLDIR=$(PREFIX)
endif
ifdef prefix
INSTALLDIR=$(prefix)
endif
ifdef installdir
INSTALLDIR=$(installdir)
endif
ifdef LAPACK
USELAPACK=$(LAPACK)
endif
ifeq ($(STATIC),true)
STATIC=TRUE
endif
USEPRECOMPILEDLAPACK=TRUE
......@@ -132,7 +144,7 @@ ifeq ($(USELAPACK),TRUE)
ifeq ($(USEPRECOMPILEDLAPACK),TRUE)
ifeq ($(STATIC),TRUE)
@echo "[ 20%] Building **proteinortho_clustering** with LAPACK (static linking)";
@$(CXX) $(CXXFLAGS) $(CXXFLAGS_PO) -fopenmp -o $@ $< $(LDFLAGS) $(LDLIBS) -static -Wl,--allow-multiple-definition -llapack -lblas -lgfortran -lquadmath -pthread -Wl,--whole-archive -lpthread -Wl,--no-whole-archive && ([ $$? -eq 0 ] ) || ( \
@$(CXX) $(CXXFLAGS) $(CXXFLAGS_PO) -fopenmp -o $@ $< $(LDFLAGS) $(LDLIBS) -static -Wl,--allow-multiple-definition -llapack -lblas -lgfortran -pthread -Wl,--whole-archive -lpthread -Wl,--no-whole-archive -lquadmath && ([ $$? -eq 0 ] ) || ( \
echo "......$(ORANGE)static linking failed, now I try dynamic linking.$(NC)"; \
$(CXX) $(CXXFLAGS) $(CXXFLAGS_PO) -fopenmp -o $@ $< $(LDFLAGS) $(LDLIBS) -llapack -lblas -pthread -Wl,--whole-archive -lpthread -Wl,--no-whole-archive && ([ $$? -eq 0 ] && echo "......OK dynamic linking was successful for proteinortho_clustering!";) || ( \
echo "......$(ORANGE)dynamic linking failed too, now I try dynamic linking without -WL,-whole-archive (this should now work for OSX).$(NC)"; \
......@@ -219,7 +231,7 @@ install: proteinortho6.pl proteinortho $(BUILDDIR)/proteinortho_extract_from_gra
@echo "If needed you can add $(INSTALLDIR) to \$$PATH with 'export PATH=\$$PATH:$(INSTALLDIR)'."
.PHONY: test
test: proteinortho6.pl test_step2 test_step3 test_clean
test: proteinortho6.pl test_clean test_step2 test_step3 test_clean2
@echo "[TEST] All tests $(GREEN)passed$(NC)"
.PHONY: test_step2
......@@ -234,15 +246,15 @@ test_step2: proteinortho6.pl
echo "$(GREEN)passed$(NC)"; \
fi
@echo -n " [2/12] -p=blastp+ synteny (PoFF) test: "
@if [ "$(shell which blastp)" = "" ]; then\
echo "$(ORANGE)blastp missing, skipping...$(NC)"; \
else \
./proteinortho6.pl -silent -force -project=test_synteny -synteny -singles -p=blastp+ test/*.faa; \
set -e ; ./src/chk_test.pl test_synteny.proteinortho.tsv; \
set -e ; ./src/chk_test.pl test_synteny.poff.tsv; \
echo "$(GREEN)passed$(NC)"; \
fi
# @echo -n " [2/12] -p=blastp+ synteny (PoFF) test: "
# @if [ "$(shell which blastp)" = "" ]; then\
# echo "$(ORANGE)blastp missing, skipping...$(NC)"; \
# else \
# ./proteinortho6.pl -silent -force -project=test_synteny -synteny -singles -p=blastp+ test/*.faa; \
# set -e ; ./src/chk_test.pl test_synteny.proteinortho.tsv; \
# set -e ; ./src/chk_test.pl test_synteny.poff.tsv; \
# echo "$(GREEN)passed$(NC)"; \
# fi
@echo -n " [3/12] -p=diamond test: "
@if [ "$(shell which diamond)" = "" ]; then\
......@@ -353,6 +365,11 @@ test_clean:
@echo "[TEST] Clean up all test files..."; \
rm -rf proteinortho_cache_test_* test.* test_* test/C.faa.* test/E.faa.* test/C2.faa.* test/L.faa.* test/M.faa.*> /dev/null 2>&1;
.PHONY: test_clean2
test_clean2:
@echo "[TEST] Clean up all test files..."; \
rm -rf proteinortho_cache_test_* test.* test_* test/C.faa.* test/E.faa.* test/C2.faa.* test/L.faa.* test/M.faa.*> /dev/null 2>&1;
.PHONY: clean
clean:
rm -rf src/BUILD test/C.faa.* test/E.faa.* test/C2.faa.* test/L.faa.* test/M.faa.*
......
# Proteinortho
Proteinortho is a tool to detect orthologous genes within different species. For doing so, it compares similarities of given gene sequences and clusters them to find significant groups. The algorithm was designed to handle large-scale data and can be applied to hundreds of species at once. Details can be found in <a href="https://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-12-124">Lechner et al., BMC Bioinformatics. 2011 Apr 28;12:124.</a>
To enhance the prediction accuracy, the relative order of genes (synteny) can be used as additional feature for the discrimination of orthologs. The corresponding extension, namely PoFF (manuscript in preparation), is already build in Proteinortho. The general workflow of proteinortho is depicted [![here](https://www.dropbox.com/s/7ubl1ginn3fmf8k/proteinortho_workflow.jpg?dl=0)].
To enhance the prediction accuracy, the relative order of genes (synteny) can be used as additional feature for the discrimination of orthologs. The corresponding extension, namely PoFF (doi:10.1371/journal.pone.0105015), is already built into Proteinortho. The general workflow of proteinortho is depicted [![here](https://www.dropbox.com/s/7ubl1ginn3fmf8k/proteinortho_workflow.jpg?dl=0)].
# New Features of Proteinortho Version 6!
......@@ -108,7 +108,7 @@ Proteinortho uses standard software which is often installed already or is part
- blat (http://hgdownload.soe.ucsc.edu/admin/)
- mmseqs2 (conda install mmseqs2, https://github.com/soedinglab/MMseqs2)
- Perl v5.08 or higher (to test this, type perl -v in the command line)
- Python v2.6.0 or higher to include synteny analysis (to test this, type 'python -V' in the command line)
- (optional) Python v3.0 or higher to include synteny analysis (to test this, type 'python3 -V' in the command line)
- Perl standard modules (these should come with Perl): Thread::Queue, File::Basename, Pod::Usage, threads (if you miss one just install with `cpan install ...` )
</details>
......@@ -228,8 +228,7 @@ If you have problems compiling/running the program go to [Troubleshooting (prote
one. Details can be found in Lechner et al., BMC Bioinformatics. 2011 Apr
28;12:124. To enhance the prediction accuracy, the relative order of genes
(synteny) can be used as additional feature for the discrimination of
orthologs. The corresponding extension, namely PoFF (manuscript in
preparation), is already build in Proteinortho.
orthologs. The corresponding extension, namely PoFF (doi:10.1371/journal.pone.0105015), is already built into Proteinortho.
Proteinortho assumes, that you have all your gene sequences in FASTA
format either represented as amino acids or as nucleotides. The source
......@@ -344,6 +343,7 @@ Open `proteinorthoHelper.html` in your favorite browser or visit [lechnerlab.de/
<br>
**Synteny options (optional, step 2)**
(This option is deprecated)
(output: <myproject>.ffadj-graph, <myproject>.poff.tsv (tab separated file)-graph)
<details>
......@@ -692,5 +692,3 @@ Since the fiedler vector would result in a good split, the kmere heuristic is th
Lechner, M., Findeisz, S., Steiner, L., Marz, M., Stadler, P. F., &
Prohaska, S. J. (2011). Proteinortho: detection of (co-) orthologs in
large-scale analysis. BMC bioinformatics, 12(1), 124.
proteinortho (6.0.7+dfsg-1) UNRELEASED; urgency=medium
proteinortho (6.0.8+dfsg-1) unstable; urgency=medium
* New upstream version
* New upstream version ported to Python3
* debhelper-compat 12
* Use 2to3 to port to Python3
* Fix perl interpreter path
-- Andreas Tille <tille@debian.org> Mon, 16 Sep 2019 12:41:53 +0200
-- Andreas Tille <tille@debian.org> Wed, 25 Sep 2019 16:39:22 +0200
proteinortho (6.0.6+dfsg-1) unstable; urgency=medium
......
......@@ -19,7 +19,7 @@ Depends: ${shlibs:Depends},
ncbi-blast+,
diamond-aligner,
liblapack3,
python
python3
Description: Detection of (Co-)orthologs in large-scale protein analysis
Proteinortho is a stand-alone tool that is geared towards large datasets
and makes use of distributed computing techniques when run on multi-core
......
Description: Use 2to3 to port to Python3
Author: Andreas Tille <tille@debian.org>
Last-Update: Mon, 16 Sep 2019 12:41:53 +0200
--- a/src/proteinortho_ffadj_mcs.py
+++ b/src/proteinortho_ffadj_mcs.py
@@ -1,9 +1,9 @@
-#!/usr/bin/env python2.7
+#!/usr/bin/python3
-from sys import stdout, stderr, exit, argv, maxint
+from sys import stdout, stderr, exit, argv, maxsize
from copy import deepcopy
from bisect import bisect
-from itertools import izip, product
+from itertools import product
from os.path import basename, dirname
from random import randint
from math import ceil
@@ -46,7 +46,7 @@ class Run:
adjTerm = 0
if len(self.weight) > 1:
adjTerm = sum([self.weight[i] * self.weight[i+1] for i in
- xrange(len(self.weight)-1)])
+ range(len(self.weight)-1)])
edgeTerm = sum([w **2 for w in self.weight])
# edgeTerm = max(self.weight)**2
return alpha * adjTerm + (1-alpha) * edgeTerm
@@ -101,9 +101,9 @@ def readDistsAndOrder(data, edgeThreshol
if edgeWeight < edgeThreshold:
continue
- if not g1_chromosomes.has_key(chr1):
+ if chr1 not in g1_chromosomes:
g1_chromosomes[chr1] = set()
- if not g2_chromosomes.has_key(chr2):
+ if chr2 not in g2_chromosomes:
g2_chromosomes[chr2] = set()
g1_chromosomes[chr1].add(g1)
@@ -124,19 +124,19 @@ def readDistsAndOrder(data, edgeThreshol
# add telomeres
for t1, t2 in product(tel1, tel2):
- if not res.has_key(t1):
+ if t1 not in res:
res[t1] = dict()
res[t1][t2] = (DIRECTION_BOTH_STRANDS, 1)
-# res[maxint] = dict([
-# (maxint, (DIRECTION_WATSON_STRAND, 1)),
+# res[maxsize] = dict([
+# (maxsize, (DIRECTION_WATSON_STRAND, 1)),
# (0, (DIRECTION_WATSON_STRAND, 1)),
-# (maxint, (DIRECTION_CRICK_STRAND, 1)),
+# (maxsize, (DIRECTION_CRICK_STRAND, 1)),
# (0, (DIRECTION_CRICK_STRAND, 1))])
-# res[maxint] = dict([
-# (maxint, (DIRECTION_WATSON_STRAND, 1)),
+# res[maxsize] = dict([
+# (maxsize, (DIRECTION_WATSON_STRAND, 1)),
# (0, (DIRECTION_WATSON_STRAND, 1)),
-# (maxint, (DIRECTION_CRICK_STRAND, 1)),
+# (maxsize, (DIRECTION_CRICK_STRAND, 1)),
# (0, (DIRECTION_CRICK_STRAND, 1))])
return hasMultipleChromosomes, g1, g2, res
@@ -148,20 +148,20 @@ def establish_linear_genome_order(chromo
g.append((k, -1))
telomeres.add((k, -1))
g.extend([(k, i) for i in sorted(chromosomes[k])])
- g.append((k, maxint))
- telomeres.add((k, maxint))
+ g.append((k, maxsize))
+ telomeres.add((k, maxsize))
return telomeres, g
def insertIntoRunList(runs, runList):
- keys = map(lambda x: x.getWeight(alpha), runList)
+ keys = [x.getWeight(alpha) for x in runList]
for run in runs:
i = bisect(keys, run.getWeight(alpha))
keys.insert(i, run.getWeight(alpha))
runList.insert(i, run)
def checkMatching(g1, g2, g1_runs, g2_runs, runs, dist):
- g1pos = dict(izip(g1, xrange(len(g1))))
- g2pos = dict(izip(g2, xrange(len(g2))))
+ g1pos = dict(zip(g1, range(len(g1))))
+ g2pos = dict(zip(g2, range(len(g2))))
if len(g1) != len(g2):
@@ -177,7 +177,7 @@ def checkMatching(g1, g2, g1_runs, g2_ru
r_counter = 0
prev_run = None
c_adj = 0
- for i in xrange(len(g1)):
+ for i in range(len(g1)):
if not g1_runs[i]:
logging.error('Gene %s is not included in any run' %g1[i])
continue
@@ -213,7 +213,7 @@ def checkMatching(g1, g2, g1_runs, g2_ru
missing_runs = all_included.symmetric_difference(runs)
if missing_runs:
logging.error(('Additional runs in runslist that are not part in the' + \
- ' matching: %s') %(map(str, missing_runs)))
+ ' matching: %s') %(list(map(str, missing_runs))))
logging.info('Number of adjacencies is %s in matching of size %s.' %(c_adj,
len(g1)))
@@ -222,7 +222,7 @@ def checkMatching(g1, g2, g1_runs, g2_ru
logging.error(('Sum of run lengths does not equal matching size! Sum ' + \
'of run lengths: %s, matching size: %s') % (r_counter, len(g1)))
- for j in xrange(len(g2)):
+ for j in range(len(g2)):
if not g2_runs[j]:
logging.error('Gene %s is not included in any run' %g2[j])
if len(g2_runs[j]) > 1:
@@ -262,8 +262,8 @@ def checkMatching(g1, g2, g1_runs, g2_ru
'Weights: %s, run length: %s, run: %s') %(len(r.weight),
g1pos[r.endG1] - g1pos[r.startG1], r))
- g1_chromosomes = set(map(lambda x: x[0], g1[g1pos[r.startG1]:g1pos[r.endG1]+1]))
- g2_chromosomes = set(map(lambda x: x[0], g2[g2pos[r.startG2]:g2pos[r.endG2]+1]))
+ g1_chromosomes = set([x[0] for x in g1[g1pos[r.startG1]:g1pos[r.endG1]+1]])
+ g2_chromosomes = set([x[0] for x in g2[g2pos[r.startG2]:g2pos[r.endG2]+1]])
if len(g1_chromosomes) != 1 and len(g2_chromosomes) != 1:
logging.error(('Number of chromosomes on G1 (#chrs: %s) or G2 ' + \
'(#chrs: %s) in run %s is not 1 (Meaning that possibly' + \
@@ -281,7 +281,7 @@ def checkMatching(g1, g2, g1_runs, g2_ru
run_ends[r.startG1] = (r.direction, r.endG2)
run_ends[r.endG1] = (r.direction, r.startG2)
- for i in xrange(len(g1)-1):
+ for i in range(len(g1)-1):
g1i = g1[i]
g1i2 = g1[i+1]
if g1i in run_ends and g1i2 in run_ends and run_ends[g1i][0] == \
@@ -290,13 +290,13 @@ def checkMatching(g1, g2, g1_runs, g2_ru
g2i = run_ends[g1i][1]
g2i2 = run_ends[g1i2][1]
if direction == DIRECTION_CRICK_STRAND and g2pos[g2i] == g2pos[g2i2]-1:
- logging.error('Runs %s and %s could be merged, but are not!' % (map(str, g1_runs[i])[0], map(str, g1_runs[i+1])[0]))
+ logging.error('Runs %s and %s could be merged, but are not!' % (list(map(str, g1_runs[i]))[0], list(map(str, g1_runs[i+1]))[0]))
elif direction == DIRECTION_WATSON_STRAND and g2pos[g2i] == g2pos[g2i2]+1:
- logging.error('Runs %s and %s could be merged, but are not!' % (map(str, g1_runs[i])[0], map(str, g1_runs[i+1])[0]))
+ logging.error('Runs %s and %s could be merged, but are not!' % (list(map(str, g1_runs[i]))[0], list(map(str, g1_runs[i+1]))[0]))
def getAllRuns(g1, g2, d):
- g2pos = dict(izip(g2, xrange(len(g2))))
+ g2pos = dict(zip(g2, range(len(g2))))
g1_runs = [set() for _ in g1]
g2_runs = [set() for _ in g2]
@@ -305,7 +305,7 @@ def getAllRuns(g1, g2, d):
reportedRuns= list()
- for i in xrange(len(g1)):
+ for i in range(len(g1)):
curPos = g1[i]
@@ -355,7 +355,7 @@ def getAllRuns(g1, g2, d):
# if no edge exists, nothing has to be done...
if e:
- for (g2_gene, (direction, weight)) in d[curPos].items():
+ for (g2_gene, (direction, weight)) in list(d[curPos].items()):
if (direction, g2_gene) not in forbiddenRunStarts:
j = g2pos[g2_gene]
if isinstance(direction, BothStrands):
@@ -391,12 +391,12 @@ def replaceByNew(g1_runs, g2_runs, i, j,
break
def doMatching(g1, g2, g1_runs, g2_runs, m, runList):
- g1pos = dict(izip(g1, xrange(len(g1))))
- g2pos = dict(izip(g2, xrange(len(g2))))
+ g1pos = dict(zip(g1, range(len(g1))))
+ g2pos = dict(zip(g2, range(len(g2))))
newRuns = set()
- for k in xrange(g1pos[m.endG1] - g1pos[m.startG1] + 1):
+ for k in range(g1pos[m.endG1] - g1pos[m.startG1] + 1):
i = g1pos[m.startG1] + k
j = g2pos[m.startG2] + k
@@ -516,13 +516,13 @@ def doMatching(g1, g2, g1_runs, g2_runs,
insertIntoRunList(newRuns, runList)
def mergeRuns(mod_g1, g1, g2, g1_runs, g2_runs, runList, alreadyMatched):
- g1pos = dict(izip(g1, xrange(len(g1))))
- g2pos = dict(izip(g2, xrange(len(g2))))
+ g1pos = dict(zip(g1, range(len(g1))))
+ g2pos = dict(zip(g2, range(len(g2))))
newRuns = set()
wSrt = lambda x: x.getWeight(alpha)
mod_g1 = list(mod_g1)
- for x in xrange(len(mod_g1)):
+ for x in range(len(mod_g1)):
g1i = mod_g1[x]
i = g1pos[g1i]
if len(g1) < i+2:
@@ -601,18 +601,18 @@ def removeSingleGenes(genome, genome_run
def findRandomRunSequence(g1, g2, dists, topXperCent):
g2dists = dict()
- for g1i, x in dists.items():
- for g2j, d in x.items():
+ for g1i, x in list(dists.items()):
+ for g2j, d in list(x.items()):
if g2j not in g2dists:
g2dists[g2j] = dict()
g2dists[g2j][g1i] = d
# copy g1, g2 and dists map, because we'll modify it. Also remove all genes
# that do not contain edges.
- g1 = [x for x in g1 if dists.has_key(x) and len(dists[x])]
- g2 = [x for x in g2 if g2dists.has_key(x) and len(g2dists[x])]
+ g1 = [x for x in g1 if x in dists and len(dists[x])]
+ g2 = [x for x in g2 if x in g2dists and len(g2dists[x])]
- g1pos = dict(izip(g1, xrange(len(g1))))
+ g1pos = dict(zip(g1, range(len(g1))))
g1_runs, g2_runs, runs = getAllRuns(g1, g2, dists)
logging.info('Found %s runs.' %len(runs))
@@ -621,7 +621,7 @@ def findRandomRunSequence(g1, g2, dists,
res = set()
while runList:
- noOfAdjacencies = len(filter(lambda x: x.getWeight(alpha) and x.getWeight(alpha) or 0, runList))
+ noOfAdjacencies = len([x for x in runList if x.getWeight(alpha) and x.getWeight(alpha) or 0])
if noOfAdjacencies:
randPos = randint(1, ceil(noOfAdjacencies * topXperCent))
else:
@@ -645,7 +645,7 @@ def findRandomRunSequence(g1, g2, dists,
for g in del_g1.intersection(mod_g1):
mod_g1.remove(g)
- g1pos = dict(izip(g1, xrange(len(g1))))
+ g1pos = dict(zip(g1, range(len(g1))))
# add new modification points
mod_g1.update(new_mod_g1)
@@ -653,7 +653,7 @@ def findRandomRunSequence(g1, g2, dists,
if del_g2:
logging.info('Zombie genes removed from G2: %s' %', '.join(map(str, del_g2)))
for g2j in mod_g2:
- for g1i, (d, _) in g2dists[g2j].items():
+ for g1i, (d, _) in list(g2dists[g2j].items()):
if g1i in g1:
if d == DIRECTION_CRICK_STRAND:
mod_g1.add(g1i)
@@ -665,8 +665,8 @@ def findRandomRunSequence(g1, g2, dists,
runList, res)
if res:
- logging.info('Matching finished. Longest run size is %s.' %(max(map(len,
- res))))
+ logging.info('Matching finished. Longest run size is %s.' %(max(list(map(len,
+ res)))))
else:
logging.info('Matching finished, but no runs found. Empty input?')
@@ -681,19 +681,19 @@ def repeatMatching(g1, g2, g1_mod, g2_mo
g2_runs_res = g2_runs
selectedRuns_res = list()
- g1pos = dict(izip(g1_mod, xrange(len(g1_mod))))
- g2pos = dict(izip(g2_mod, xrange(len(g2_mod))))
+ g1pos = dict(zip(g1_mod, range(len(g1_mod))))
+ g2pos = dict(zip(g2_mod, range(len(g2_mod))))
noReps = repMatching
while repMatching:
- for i in xrange(len(g1_runs)):
+ for i in range(len(g1_runs)):
run_set = g1_runs[i]
if len(run_set) != 1:
logging.error(('Expected run, set length of 1, but was told' + \
' different: %s.') %(', '.join(map(str, run_set))))
- run = run_set.__iter__().next()
+ run = next(run_set.__iter__())
g1i = g1_mod[i]
@@ -720,11 +720,11 @@ def repeatMatching(g1, g2, g1_mod, g2_mo
len(g1_mod), noReps-repMatching+2))
# remove runs that fall below min length of minCsSize
- ff = lambda x: len(x.__iter__().next()) >= minCsSize
- g1_mod = [g1_mod[i] for i in xrange(len(g1_mod)) if ff(g1_runs[i])]
- g2_mod = [g2_mod[i] for i in xrange(len(g2_mod)) if ff(g2_runs[i])]
- g1_runs = filter(ff, g1_runs)
- g2_runs = filter(ff, g2_runs)
+ ff = lambda x: len(next(x.__iter__())) >= minCsSize
+ g1_mod = [g1_mod[i] for i in range(len(g1_mod)) if ff(g1_runs[i])]
+ g2_mod = [g2_mod[i] for i in range(len(g2_mod)) if ff(g2_runs[i])]
+ g1_runs = list(filter(ff, g1_runs))
+ g2_runs = list(filter(ff, g2_runs))
selectedRuns = set([s for s in selectedRuns if len(s) >= minCsSize])
# stop if no runs were found matching the criteria
@@ -736,10 +736,10 @@ def repeatMatching(g1, g2, g1_mod, g2_mo
logging.info('%s feasible runs retained.' %len(selectedRuns))
# reconciliate with result data
- g2pos = dict(izip(g2_mod, xrange(len(g2_mod))))
- g1pos = dict(izip(g1_mod, xrange(len(g1_mod))))
- g2pos_res = dict(izip(g2_mod_res, xrange(len(g2_mod_res))))
- g1pos_res = dict(izip(g1_mod_res, xrange(len(g1_mod_res))))
+ g2pos = dict(zip(g2_mod, range(len(g2_mod))))
+ g1pos = dict(zip(g1_mod, range(len(g1_mod))))
+ g2pos_res = dict(zip(g2_mod_res, range(len(g2_mod_res))))
+ g1pos_res = dict(zip(g1_mod_res, range(len(g1_mod_res))))
chr_srt = lambda x, y: x[0] == y[0] and (x[1] < y[1] and -1 or 1) or (x[0] < y[0] and -1 or 1)
g1_mod_new = sorted(set(g1_mod_res + g1_mod), cmp=chr_srt)
@@ -749,17 +749,17 @@ def repeatMatching(g1, g2, g1_mod, g2_mo
for g1i in g1_mod_new:
x = set()
- if g1pos_res.has_key(g1i):
+ if g1i in g1pos_res:
x.update(g1_runs_res[g1pos_res[g1i]])
- if g1pos.has_key(g1i):
+ if g1i in g1pos:
x.update(g1_runs[g1pos[g1i]])
g1_runs_new.append(x)
for g2j in g2_mod_new:
x = set()
- if g2pos_res.has_key(g2j):
+ if g2j in g2pos_res:
x.update(g2_runs_res[g2pos_res[g2j]])
- if g2pos.has_key(g2j):
+ if g2j in g2pos:
x.update(g2_runs[g2pos[g2j]])
g2_runs_new.append(x)
@@ -776,21 +776,21 @@ def repeatMatching(g1, g2, g1_mod, g2_mo
def printMatching(g1, g2, g1_runs, hasMultipleChromosomes, out):
if hasMultipleChromosomes:
- print >> f, 'Chr(G1)\tG1\tChr(G2)\tG2\tdirection\tedge weight'
+ print('Chr(G1)\tG1\tChr(G2)\tG2\tdirection\tedge weight', file=f)
else:
- print >> f, 'G1\tG2\tdirection\tedge weight'
+ print('G1\tG2\tdirection\tedge weight', file=f)
- g2pos = dict(izip(g2, xrange(len(g2))))
- g1pos = dict(izip(g1, xrange(len(g1))))
+ g2pos = dict(zip(g2, range(len(g2))))
+ g1pos = dict(zip(g1, range(len(g1))))
cur_index = dict()
- for i in xrange(len(g1_runs)):
+ for i in range(len(g1_runs)):
run_set = g1_runs[i]
for run in run_set:
g1i = g1[i]
j = 0
- if cur_index.has_key(run):
+ if run in cur_index:
j = cur_index[run]
if run.direction == DIRECTION_CRICK_STRAND:
g2j = g2[g2pos[run.startG2] + j]
@@ -800,22 +800,22 @@ def printMatching(g1, g2, g1_runs, hasMu
direction = run.direction == DIRECTION_CRICK_STRAND and '1' or '-1'
g1i1 = g1i[1] == -1 and 'TELOMERE_START' or g1i[1]
- g1i1 = g1i[1] == maxint and 'TELOMERE_END' or g1i1
+ g1i1 = g1i[1] == maxsize and 'TELOMERE_END' or g1i1
g2j1 = g2j[1] == -1 and 'TELOMERE_START' or g2j[1]
- g2j1 = g2j[1] == maxint and 'TELOMERE_END' or g2j1
+ g2j1 = g2j[1] == maxsize and 'TELOMERE_END' or g2j1
if hasMultipleChromosomes:
- print >> f, '%s\t%s\t%s\t%s\t%s\t%s' %(g1i[0], g1i1, g2j[0],
- g2j1, direction, run.weight[j])
+ print('%s\t%s\t%s\t%s\t%s\t%s' %(g1i[0], g1i1, g2j[0],
+ g2j1, direction, run.weight[j]), file=f)
else:
- print >> f, '%s\t%s\t%s\t%s' %(g1i1, g2j1, direction,
- run.weight[j])
+ print('%s\t%s\t%s\t%s' %(g1i1, g2j1, direction,
+ run.weight[j]), file=f)
cur_index[run] = j+1
if __name__ == '__main__':
if len(argv) < 3 or len(argv) > 8:
- print '\tusage: %s <DIST FILE> <ALPHA> [ <EDGE WEIGHT THRESHOLD> --repeat-matching (-R) <NUMBER >= 2> --min-cs-size (-M) <NUMBER >= 1> ]' %argv[0]
+ print('\tusage: %s <DIST FILE> <ALPHA> [ <EDGE WEIGHT THRESHOLD> --repeat-matching (-R) <NUMBER >= 2> --min-cs-size (-M) <NUMBER >= 1> ]' %argv[0])
exit(1)
repMatching= '--repeat-matching' in argv or '-R' in argv
@@ -826,8 +826,8 @@ if __name__ == '__main__':
minCsSize = int(argv[pos+1])
argv = argv[:pos] + argv[pos+2:]
if not repMatching:
- print >> stderr, ('Argument --min-cs-size (-M) only valid in ' + \
- 'combination with --repeat-matching (-R)')
+ print(('Argument --min-cs-size (-M) only valid in ' + \
+ 'combination with --repeat-matching (-R)'), file=stderr)
exit(1)
else:
minCsSize = 1
@@ -869,7 +869,7 @@ if __name__ == '__main__':
# sum of weights of adjacencies
wAdj = sum([r.getWeight(1) for r in selectedRuns])
# sum of weights of all edges of the matching
- wEdg = sum([sum(map(lambda x: x**2, r.weight)) for r in selectedRuns])
+ wEdg = sum([sum([x**2 for x in r.weight]) for r in selectedRuns])
edg = sum(map(len, selectedRuns))
@@ -892,6 +892,6 @@ if __name__ == '__main__':
' is %s with #edg = %s, adj(M) = %.3f and edg(M) = %.3f') %(bkp, edg,
wAdj, wEdg))
- print '#bkp\t#edg\tadj\tedg'
- print '%s\t%s\t%.6f\t%.6f' %(bkp, edg, wAdj, wEdg)
+ print('#bkp\t#edg\tadj\tedg')
+ print('%s\t%s\t%.6f\t%.6f' %(bkp, edg, wAdj, wEdg))
......@@ -15,3 +15,9 @@ override_dh_auto_install:
override_dh_fixperms:
dh_fixperms
find debian \( -name "*.faa" -o -name "*.gff" \) -exec chmod -x \{\} \;
override_dh_install:
dh_install
for pl in `grep -Rl '#!/usr/bin/env[[:space:]]\+perl' debian/*/usr/*` ; do \
sed -i '1s?^#!/usr/bin/env[[:space:]]\+perl?#!/usr/bin/perl?' $${pl} ; \
done
......@@ -341,6 +341,8 @@ By default, Proteinortho splits each group into two more dense subgroups when th
=head2 POFF (-synteny)
-->> This option is deprecated <<---
The synteny based graph files (myproject.ffadj-graph and myproject.poff-graph) have two additional columns:
same_strand and simscore. The first one indicates if two genes from a match are located at the same strands (1) or not (-1).
The second one is an internal score which can be interpreted as a normalized weight ranging from 0 to 1 based on the respective e-values.
......@@ -453,7 +455,7 @@ use POSIX;
##########################################################################################
# Variables
##########################################################################################
our $version = "6.0.6";
our $version = "6.0.8";
our $step = 0; # 0/1/2/3 -> do all / only apply step 1 / only apply step 2 / only apply step 3
our $verbose = 1; # 0/1 -> don't / be verbose
our $debug = 0; # 0/1 -> don't / show debug data
......@@ -593,9 +595,9 @@ foreach my $option (@ARGV) {
elsif ($option =~ m/^--?selfblast=(0|1)$/) { $selfblast = $1; }
elsif ($option =~ m/^--?singles$/) { $singles = 1; }
elsif ($option =~ m/^--?singles=(0|1)$/) { $singles = $1; }
elsif ($option =~ m/^--?poff$/) { $synteny = 1; }
elsif ($option =~ m/^--?synteny$/) { $synteny = 1; }
elsif ($option =~ m/^--?synteny=(0|1)$/) { $synteny = $1; }
elsif ($option =~ m/^--?poff$/) { $synteny = 1; print STDERR "$ORANGE"."[WARNING]$NC -->> This option is deprecated <<---"; }
elsif ($option =~ m/^--?synteny$/) { $synteny = 1; print STDERR "$ORANGE"."[WARNING]$NC -->> This option is deprecated <<---"; }
elsif ($option =~ m/^--?synteny=(0|1)$/) { $synteny = $1; print STDERR "$ORANGE"."[WARNING]$NC -->> This option is deprecated <<---"; }
elsif ($option =~ m/^--?dups=0$/) { $duplication = 0; }
elsif ($option =~ m/^--?dups=([1-8])$/) { $duplication = $1+1;}
elsif ($option =~ m/^--?neighbourjoin$/) { $neighbourjoin = 1; }
......@@ -616,8 +618,8 @@ foreach my $option (@ARGV) {
elsif ($option =~ m/^--?subparaCluster=(.*)$/i) { $clusterOptions = $1;}
elsif ($option =~ m/^--?v(ersion)?$/i) { print $version."\n"; exit 0;}
elsif ($option !~ /^-/) { if(!exists($files_map{$option})){$files_map{$option}=1;push(@files,$option);}else{print STDERR "$ORANGE"."[WARNING]$NC The input $option was is skipped, since it was allready given as input.$NC\nPress 'strg+c' to prevent me from proceeding or wait 10 seconds to continue...\n";sleep 10;print STDERR "Well then, proceeding...\n"} }
elsif ($option =~ m/^--?man$/i){ pod2usage(-exitstatus => 0, -verbose => 2) }
else {&print_usage(); &reset_locale();die "Invalid command line option: \'$option\'!\n"; }
elsif ($option =~ m/^--?man$/i){ my $bperldoc=(1-(`perldoc -l Pod::Usage 2>&1`=~m/need to install/)); if( !$bperldoc ){ print STDERR "You need to install the perl-doc package to use this man program.\n"; }else{ pod2usage(-exitstatus => 0, -verbose => 2) } exit 0; }
else {&print_usage(); &reset_locale();die "$RED"."[Error]$NC $ORANGE Invalid command line option: \'$option\'! $NC\n\n"; }
}
if($selfblast){$checkblast=1;}
......@@ -889,7 +891,10 @@ sub cluster {
system("mv mcl.proteinortho-graph $csimgraph");
}
if($verbose){print STDERR "[OUTPUT] -> Orthologous groups are written to $simtable\n You can extract the fasta files of each orthology group with 'proteinortho_grab_proteins.pl -tofiles $simtable ".join(" ",@files)."'\n (Careful: This will generate a file foreach line in the file $simtable).\n";}
if($verbose){print STDERR "[OUTPUT] -> Orthologous groups are written to $simtable\n";}
if(scalar @files < 10){
if($verbose){print STDERR "You can extract the fasta files of each orthology group with 'proteinortho_grab_proteins.pl -tofiles $simtable ".join(" ",@files)."'\n (Careful: This will generate a file foreach line in the file $simtable).\n";}
}
if ($singles) {
if($verbose){print STDERR "Adding singles...\n";}
......@@ -905,8 +910,12 @@ sub cluster {
if($verbose){print STDERR "[OUTPUT] -> Orthologous pairs are written to $csimgraph\n";}
}
if(scalar @files < 10){
system("perl $po_path/proteinortho2html.pl $simtable ".join(" ",@files)." >$simtablehtml");
if($verbose){print STDERR "[OUTPUT] -> Orthologous groups are written to $simtablehtml\n";}
}else{
if($verbose){print STDERR "[OUTPUT] -> You can extract a html version of the output using :\nproteinortho2html.pl $simtable [PLACE FASTA FILES HERE] >$simtablehtml\n\n";}
}
if ($doxml) {
system("perl $po_path/proteinortho2xml.pl $simtable >$simtable.xml");
......@@ -1002,7 +1011,7 @@ Options:
-e= E-value for blast [default: 1e-05]
[Synteny options]
-synteny activate PoFF extension to separate similar sequences
-synteny activate PoFF extension to separate similar sequences print
by contextual adjacencies (requires .gff for each .fasta)
-dups= PoFF: number of reiterations for adjacencies heuristic,
to determine duplicated regions (default: 0)
......@@ -1117,6 +1126,8 @@ sub workerthread {
my $thread_id = threads->tid();
my $temp_file = "$tmp_path$project-$run_id-$thread_id";
$temp_file=~s/^\.\///;
# Clean up, just to be safe
unlink("$temp_file.tmp");
unlink("$temp_file.log");
......@@ -1189,10 +1200,9 @@ sub workerthread {
open(PREGRAPH,">>$temp_file.tmp") || &Error("Could not open temp file '$temp_file.tmp': $!");
print PREGRAPH $ordered_matches;
close(PREGRAPH);
my $cmd = "$po_path/proteinortho_ffadj_mcs.py '$temp_file.tmp' $alpha";
if ($duplication) {
$cmd .= " --repeat-matching $duplication --min-cs-size $cs";
}
my $ffadj_param = "-a $alpha";
if ($duplication) { $ffadj_param .= " -R $duplication -M $cs";}
my $cmd = "$po_path/proteinortho_ffadj_mcs.py $ffadj_param '$temp_file.tmp'";
if ($debug) {print STDERR "$cmd\n";}
my $synt_stats = qx($cmd);
$synt_stats=~s/[\r\n]+$//;
......@@ -1218,11 +1228,13 @@ sub workerthread {
my %close = %{$close_copies_pointer};
# Generate hash for synteny hits
my %synteny;
unless (-s "$temp_file.matching") {
print STDERR "$RED [Error] Failed to run $po_path/proteinortho_ffadj_mcs.py for\n$file_i vs $file_j\nMoving source to $temp_file.err for debugging\nI will continue, but results may be insufficient.$NC \n";
print STDERR "\n$RED [Error] Failed to run $po_path/proteinortho_ffadj_mcs.py for\n$file_i vs $file_j\nMoving source to $temp_file.err for debugging\nI will continue, but results may be insufficient.$NC \n\n";
system("mv $temp_file.tmp $temp_file.err");
next;
}
open(OSYNGRAPH,"<$temp_file.matching") || &Error("Could not open temp file $temp_file.matching: $!'");
while(<OSYNGRAPH>) {
$_=~s/[\r\n]+$//;
......@@ -2422,12 +2434,11 @@ sub read_details {
sub Error {
$debug=1;
print STDERR "\n";
print STDERR &get_parameter;
if($_[0] ne "I need at least two files to compare something!"){print STDERR "\n";print STDERR &get_parameter;}
print STDERR "\n\n$RED"."[Error]$NC $ORANGE ".$_[0]." $NC \n\n";
print STDERR "(If you cannot solve this error, please send a report to incoming+paulklemm-phd-proteinortho-7278443-issue-\@incoming.gitlab.com including the parameter-vector above or visit https://gitlab.com/paulklemm_PHD/proteinortho/wikis/Error%20Codes for more help. Further more all mails to lechner\@staff.uni-marburg.de are welcome)\n\n\n";
if($_[0] ne "I need at least two files to compare something!"){print STDERR "(If you cannot solve this error, please send a report to incoming+paulklemm-phd-proteinortho-7278443-issue-\@incoming.gitlab.com including the parameter-vector above or visit https://gitlab.com/paulklemm_PHD/proteinortho/wikis/Error%20Codes for more help. Further more all mails to lechner\@staff.uni-marburg.de are welcome)\n\n\n";}
&reset_locale();
if (!$keep && $tmp_path =~ m/\/proteinortho_cache_[^\/]+\d*\/$/ && $step!=1 ){system("rm -r $tmp_path >/dev/null 2>&1");}
......@@ -2504,7 +2515,7 @@ sub get_po_path {
# &Error("cannot find proteinortho2tree.pl in $tmppath[1].\nPlease do one of the following:\n A. recompile proteinortho (with 'make clean', 'make' and 'make install' or 'make install PREFIX=...') or consider a installation with conda/brew (see the README for more informations)\n B. execute from within the downloaded directory, there are precompiled binaries for Linux_x86_64\n C. specify the path to the binaries with -binpath=...\n";
# exit 0;
# }
if(!-x $tmppath[1]."/proteinortho_ffadj_mcs.py"){
if(!-x $tmppath[1]."/proteinortho_ffadj_mcs.py" && $synteny){
&Error("cannot find proteinortho_ffadj_mcs.py$NC in: the current directory '.', ./src/, ./src/BUILD/$uname, /usr/bin, /usr/local/bin, -binpath=$binpath.\nPlease do one of the following:\n A. recompile proteinortho (with 'make clean', 'make' and 'make install' or 'make install PREFIX=...') or consider a installation with conda/brew (see the README for more informations)\n B. execute from within the downloaded directory, there are precompiled binaries for Linux_x86_64\n C. specify the path to the binaries with -binpath=...\n");
exit 1;
}
......
This diff is collapsed.