Commits on Source (7)
myproject.*
.Rhistory
old/
remove.graph
test/*.dmnd
test/*.pin
test/*.phr
test/*.psq
test.*
*_old*
test*
# Recycle Bin used on file shares
$RECYCLE.BIN/
# =========================
# Operating System Files
# =========================
# OSX
# =========================
.DS_Store
.AppleDouble
.LSOverride
# Thumbnails
._*
# Files that might appear on external disk
.Spotlight-V100
.Trashes
# Directories potentially created on remote AFP share
.AppleDB
.AppleDesktop
Network Trash Folder
Temporary Items
.apdisk
# CMake files
CMakeCache.txt
CMakeFiles
cmake_install.cmake
install_manifest.txt
variables:
PROJECT_NAME: "Proteinortho"
before_script:
- echo "starting yml for Proteinortho"
- apt-get update && apt-get -y install cmake diffutils wget ncbi-blast+ time git
stages:
- codequality
- test-precompiled-bins
- recompile-and-test
gcc-latest-alloptions:
image: gcc
stage: test-precompiled-bins
script:
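# install diamond, then run proteinortho with (nearly) every option on the test data; each output file is sorted and its sha256 checksum compared against the expected values hard-coded below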
- echo "installing diamond"
- wget http://github.com/bbuchfink/diamond/releases/download/v0.9.24/diamond-linux64.tar.gz
- tar xzf diamond-linux64.tar.gz
- mkdir ~/bin
- cp diamond ~/bin
- perl proteinortho*pl -project=testasd -cpus=1 -ram=100 -verbose=2 -selfblast -silent -force -desc -checkfasta -cleanblast -debug -binpath=~/bin -tmp='~/' -e=0.000001 -sim=0.9 -identity=20 -cov=30 -subparaBlast='--more-sensitive' -synteny -dups=1 -cs=4 -alpha=0.4 -conn=0.01 -purity=0.00001 -minspecies=2 -subparaCluster='-cpus 1 -seed 1' -nograph -singles -xml -exactstep3 test/*faa >/dev/null 2>&1 && rm testasd*info* && export LC_NUMERIC="C" && export LC_ALL="C" && for f in testasd.*; do sort $f >$f.testasd; done; sha256sum -b *testasd | tr -d '\n' | awk '{if($0 == "eb88ba29afd4f2dba16d3dbf97a5b0d2ab7686654a854f8502f0e778628e7f56 *testasd.descriptions.testasd120f22094e2d6a75fb650523c7b5c2763a316aa7f8884dff0cbe3ccd002c9e1e *testasd.ffadj-graph.testasd9ad470e29be4937c6f4996f80221ede51670824bb2e4bb4a50946062a130ffd7 *testasd.poff.html.testasd4f8263bb4b2738e528635f3e121c659407119a1aecafb5340c9d28f5bd66cdaf *testasd.poff.tsv.testasd26d7f5d7b87dd7b71b4920753dc65e7c303e89cdfa56d3aaf00033c7918e6d10 *testasd.poff.tsv.xml.testasdf80df4c1a951bfb55b02300a273f6395694f01e8ae908e296d9c14a847d432ac *testasd.proteinortho.html.testasdfa18e9a0530f5a5754f045cfe97deaf818bdb5eb725619952633f1da0641cf7b *testasd.proteinortho.tsv.testasdc598b8c43e48e06614ec19e2f6b870e2737a7117a50ab2b1613880764d0884b2 *testasd.proteinortho.tsv.xml.testasd"){print $0." -> OK"; exit 0}else{print $0." -> failed"; exit 1}}'
gcc-latest-all-p:
image: gcc
stage: recompile-and-test
script:
#- apt-get -y install libboost-all-dev
- export CWD=$(pwd)
- echo "installing last"
- wget http://last.cbrc.jp/last-982.zip && unzip last*zip 2>/dev/null && cd last*/ && make && cp src/last* $HOME
- cd $CWD && echo "installing usearch"
- curl https://drive5.com/cgi-bin/upload3.py?license=2019070410321731111 --output $HOME/usearch && chmod +x $HOME/usearch
#- echo "installing rapsearch"
#- git clone https://github.com/zhaoyanswill/RAPSearch2 && cd RAP*/Src && make && mv *rapsearch* $HOME && cd ../../
- cd $CWD && echo "installing mmseqs2"
- git clone https://github.com/soedinglab/MMseqs2 && cd MMs* && cmake . && make && cp src/mmseqs $HOME && cd ..
- cd $CWD && echo "installing blat"
- wget http://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64.v369/blat/blat && cp blat $HOME && chmod +x $HOME/blat
- cd $CWD && echo "installing topaz"
- git clone https://github.com/ajm/topaz && cd topaz/src && make && cp topaz $HOME && cd ../..
- cd $CWD && echo "installing diamond"
- wget http://github.com/bbuchfink/diamond/releases/download/v0.9.24/diamond-linux64.tar.gz && tar xzf diamond-linux64.tar.gz && cp diamond $HOME
- export PATH="$PATH:$HOME"
- echo "start proteinortho tests"
- gcc --version
- make clean
- make all
- make test
gcc-latest-diamond:
image: gcc
stage: test-precompiled-bins
script:
- echo "installing diamond"
- wget http://github.com/bbuchfink/diamond/releases/download/v0.9.24/diamond-linux64.tar.gz
- tar xzf diamond-linux64.tar.gz
- cp diamond $HOME
- export PATH="$PATH:$HOME"
- echo "start proteinortho tests"
- gcc --version
- make test
nolapack-gcc-latest:
image: gcc
stage: recompile-and-test
script:
- echo "installing diamond"
- wget http://github.com/bbuchfink/diamond/releases/download/v0.9.24/diamond-linux64.tar.gz
- tar xzf diamond-linux64.tar.gz
- cp diamond $HOME
- export PATH="$PATH:$HOME"
- echo "start proteinortho tests"
- gcc --version
- make clean
- make LAPACK=FALSE
- make test
gcc5:
image: gcc:5
stage: recompile-and-test
script:
- echo "installing topaz"
- git clone https://github.com/ajm/topaz
- cd topaz/src
- make
- cp topaz $HOME
- echo "installing diamond"
- wget http://github.com/bbuchfink/diamond/releases/download/v0.9.24/diamond-linux64.tar.gz
- tar xzf diamond-linux64.tar.gz
- cp diamond $HOME
- export PATH="$PATH:$HOME"
- cd ../..
- echo "start proteinortho tests"
- gcc --version
- make clean
- make all
- make test
ubuntu-latest0:
image: ubuntu
stage: test-precompiled-bins
script:
- apt-get -y update && apt-get -y install gcc && apt-get -y install gfortran && apt-get -y install build-essential g++
- echo "installing topaz"
- git clone https://github.com/ajm/topaz
- cd topaz/src
- make
- cp topaz $HOME
- cd ../..
- echo "installing diamond"
- wget http://github.com/bbuchfink/diamond/releases/download/v0.9.24/diamond-linux64.tar.gz
- tar xzf diamond-linux64.tar.gz
- cp diamond $HOME
- export PATH="$PATH:$HOME"
- echo "start proteinortho tests"
- make test
ubuntu-latest:
image: ubuntu
stage: recompile-and-test
script:
- apt-get -y update && apt-get -y install gcc && apt-get -y install gfortran && apt-get -y install build-essential g++
- echo "installing topaz"
- git clone https://github.com/ajm/topaz
- cd topaz/src
- make
- cp topaz $HOME
- cd ../..
- echo "installing diamond"
- wget http://github.com/bbuchfink/diamond/releases/download/v0.9.24/diamond-linux64.tar.gz
- tar xzf diamond-linux64.tar.gz
- cp diamond $HOME
- export PATH="$PATH:$HOME"
- echo "start proteinortho tests"
- make clean
- make all
- make test
debian-latest:
image: debian
stage: recompile-and-test
script:
- apt-get -y update && apt-get -y install gcc && apt-get -y install gfortran && apt-get -y install build-essential g++
- echo "installing topaz"
- git clone https://github.com/ajm/topaz
- cd topaz/src
- make
- cp topaz $HOME
- echo "installing diamond"
- wget http://github.com/bbuchfink/diamond/releases/download/v0.9.24/diamond-linux64.tar.gz
- tar xzf diamond-linux64.tar.gz
- cp diamond $HOME
- export PATH="$PATH:$HOME"
- cd ../..
- echo "start proteinortho tests"
- make clean
- make all
- make test
fedora-latest:
image: fedora
stage: test-precompiled-bins
script:
- yum -y groupinstall "Development Tools"
- yum -y install gcc-c++
- yum -y install cmake
- yum -y install make
- yum -y install tar
- yum -y install which
- yum -y install wget
- yum -y install libstdc++-static
- yum -y install lapack-static
- yum -y install cpan
- yum -y install python
- yum -y install ncbi-blast+
- cpan Thread::Queue
- wget ftp://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/LATEST/ncbi-blast*-x64-linux.tar.gz
- tar -xzvf ncbi-blast*-x64-linux.tar.gz
- cp ncbi-blast*/bin/blastp $HOME
- cp ncbi-blast*/bin/makeblastdb $HOME
- echo "installing diamond"
- wget http://github.com/bbuchfink/diamond/releases/download/v0.9.24/diamond-linux64.tar.gz
- tar xzf diamond-linux64.tar.gz
- cp diamond $HOME
- export PATH="$PATH:$HOME"
- echo "start proteinortho tests"
- make test
centos-latest:
image: centos
stage: recompile-and-test
script:
- yum -y groupinstall "Development Tools"
- yum -y install gcc-c++
- yum -y install cmake
- yum -y install make
- yum -y install tar
- yum -y install which
- yum -y install wget
- wget ftp://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/LATEST/ncbi-blast*-x64-linux.tar.gz
- tar -xzvf ncbi-blast*-x64-linux.tar.gz
- cp ncbi-blast*/bin/blastp $HOME
- cp ncbi-blast*/bin/makeblastdb $HOME
- echo "installing diamond"
- wget http://github.com/bbuchfink/diamond/releases/download/v0.9.24/diamond-linux64.tar.gz
- tar xzf diamond-linux64.tar.gz
- cp diamond $HOME
- export PATH="$PATH:$HOME"
- echo "start proteinortho tests"
- make clean
- make
- make test
code_quality:
image: docker:stable
stage: codequality
variables:
DOCKER_DRIVER: overlay2
allow_failure: true
services:
- docker:stable-dind
script:
- export SP_VERSION=$(echo "$CI_SERVER_VERSION" | sed 's/^\([0-9]*\)\.\([0-9]*\).*/\1-\2-stable/')
- docker run
--env SOURCE_CODE="$PWD"
--volume "$PWD":/code
--volume /var/run/docker.sock:/var/run/docker.sock
"registry.gitlab.com/gitlab-org/security-products/codequality:$SP_VERSION" /code
- cat *.json
# artifacts:
# reports:
# codequality: gl-code-quality-report.json
2010
12-17: Proteinortho V4.18 - Source code
2011
01-12: Proteinortho V4.20 - Source code
Support for NCBI blast+
minor bugfixes
08-16: Proteinortho V4.22 - Source code
Added option to output the edge list for reciprocal blast alignments
Added script to output the remaining edge list after clustering
Added test+ option for make to run a test using blastp+ rather than blast+
Relaxed criteria for compilation test to deal with different versions of blast
2012
05-01: Proteinortho V4.25 - Source code
Compatibility with newer blast+ v2.2.25
Compatibility with newer versions of gcc
Reduced default I/O-threads limit to 3
Some details for better looking output
06-05: Proteinortho V4.26 - Source code
Added -singles option; it allows reporting single and paralogous genes found in one species only
2013
12-17: Proteinortho V5.0 - Source code
PoFF extension added, which allows incorporating conserved synteny data (-synteny, requires .gff files for gene positions)
Default E-value changed from 1e-10 to 1e-05
Partially reimplemented, with clearer variable names and a three-step model (check/prepare, blast, cluster)
Changed parameter names (run without options to see manual)
Pairs will always be reported
Tree-like structures in the orthology graph are not pruned anymore
2014
01-27: Proteinortho V5.02 - Source code
Added -selfblast option to improve prediction of paralogs
01-31: Proteinortho V5.03 - Source code (BETA)
Added -singles option to return singleton genes (orphans without any matches)
Improved multithreading: If more CPUs are present than required for blast jobs, blast's internal subthreads will be invoked
Improved output: When existing blast output is found, a note is shown to give feedback to the user
02-12: Proteinortho V5.04 - Source code
Fixed bugs in the selfblast implementation: Selfblast results obtained using V5.02 or V5.03 (BETA) should be reverified with this version!
-singles option will add data on singleton genes directly into the results matrix rather than to a separate file
Added tool to compare graph files (comp_bla.pl)
03-05: Proteinortho V5.05 - Source code
Fixed stalling issues for system calls; these could have prevented Proteinortho from finishing an analysis at all
Added -blastParameters option to define specific blast parameters other than E-Value
Added -clean switch, it removes temporary files automatically
Fixed some typos
Eased thread locking and terminating system
Added presort of blast results to speed up filtering
Proteinortho now also parses options when set via --
04-01: Proteinortho V5.06 - Source code
Made graph output optional: now it needs to be requested using the -graph switch
Added a new output file: XXX.descriptions containing ID DESC from FASTA files
Fixed some typos and description flaws
Tweaked Makefile
Special thanks for this update goes to Torsten Seemann, Victorian Bioinformatics Consortium at Monash University, Clayton
07-01: Proteinortho V5.07 - Source code
Added a more detailed manual
Added example data for test
Minor bugfixes in output data
07-26: Proteinortho V5.10 - Source code
sped up graph processing (a lot)
improved make test and example files
fixed minor bugs in tool and manual
added some bugtracking output data to ease use
09-23: Proteinortho V5.11 - Source code
fixed bug when using the -singles option with files in subfolders
2016
03-17: Proteinortho V5.12b - Source code
fixed Makefile (version b)
fixed code issue in tree builder that prevented it from compiling (version b)
fixed issue where clustering could take very long or even get stuck
improved clustering accuracy for small graphs
added feature to use user-defined temporary paths (-temp=[PATH])
adapted and re-added UPGMA-tree builder tool for protein presence/absence from the version 4 branch (po2tree)
04-26: Proteinortho V5.13 - Source code
fixed issue in graph clustering that sometimes led to random artefacts
thanks to David Kraus (MPI Marburg) and Andrey Rozenberg (University of Bochum)
added hardening modifications for Makefile and added tree builder as install target
thanks to Andreas Tille
Known issues: edges in the cleaned graph file (proteinortho-graph) are not reliable at the moment (they do not reflect the in-program graph)
08-26: Proteinortho V5.15 - Source code with precompiled binaries (Linux/x64) / Proteinortho V5.15 - Source code only
output table is ordered by species names and gene names which largely increases readability and comparability
increased arithmetic accuracy of graph clustering
added warning before existing outputs are overwritten
added support for tblastx+ and tblastx legacy
thanks to Clemens Thölken
2018 Proteinortho6
June 20 - July 4
openmp support (max_of_diag,get_new_x,makeOrthogonal,normalize,getY)
bitscore integration in the convergence (weighted algebraic connectivity)
protein output is now sorted in descending degree-order (sort with comparator_pairDoubleUInt)
getConnectivity: special case checks now if the induced subgraph is complete (K_n)
added various test functions
July 5
added kmere heuristic for splitting groups in proteinortho_clustering. After the calculation of a Fiedler vector, the kmere heuristic splits the graph not only into the positive and negative entries of the vector but into k clusters; k=2 corresponds to the original split (without the purity).
July 16
added LAPACK support for CC with less than 2^15 nodes (since it uses quadratic space -> (2^15)^2=2^30) for the calculation of the algebraic connectivity.
added all other proteinortho files to this repository.
graphMinusRemoveGraph.cpp implements proteinortho5_clean_edges2.pl in C++
July 23
OpenMP support for the laplacian declaration (for lapack).
'make test' clean up.
jackhmmer, phmmer, diamond, usearch support.
July 24
last integration.
phmmer+jackhmmer fix/workaround (there is no local identity in the output -> disabled).
proteinortho.pl : set cluster algorithm to weighted-mode as default.
July 30
rapsearch integration.
proteinortho_clustering.cpp : -ramLapack is now -ram and is the threshold for laplace matrix + graph struct.
added dynamic memory management (proteinortho.pl + clustering.cpp) using the free -m command (if exists)
July 31
rapsearch fix (wrong order of db and q)
purity is back, now 0.1 (and fallback function, if all nodes are below purity threshold -> remove purity for this connected component)
more options for proteinortho.pl -p=diamond-moresensitive|usearch-ublast
9. Aug
topaz integration.
all DBs, as well as the tmp files generated by the blast algorithms, now carry the blast mode in their name (to avoid colliding names).
10. Aug
Orthology XML integration. Added the option -noxml for not generating the orthology XML format.
13. Aug
bugfix usearch/ublast: removed the description from the gene name (formatU.pl). bugfix rapsearch: forced to create an output .m8 file if there are no hits found.
allowedAlphabet check in read_details in check_files. E.g. diamond expects amino acid characters -> found a gene with only nucleotide characters -> WARNING. E.g. blastn+ expects nucleotide characters -> found non-nucleotide characters -> ERROR.
22. Aug
removed phmmer and jackhmmer.
removed the -p=diamondmoresensitive option, since it is equivalent to -p=diamond -subpara='--moresensitive'.
# redesigned the multithreading system:
# -cpus=x -> spawn round(sqrt(x)) worker threads, each with ceil(sqrt(x)) cores for blast (different for the last worker thread ...); e.g. -cpus=9 gives 3 worker threads with 3 blast cores each.
# removed the threads_per_process function.
8. Sep
proteinortho_clustering: introduced multithreading in partition_graph() -> generate k CC and compute the lapack dsyevx in parallel (1. Memory check 2. if a large CC is found -> 1 power iteration with all cores 3. else do k lapack.). New const variable lapack_power_threshold_n for determining large CC for the power iteration.
Oct 4
improvement in the memory calculations.
BUGfix in the DFS calculation (recursion in C++ failed with a segmentation fault if the recursion was too deep) -> now iterative (memory inefficient) with a queue
Oct 10
DFS -> BFS since recursive calls can only be so deep.
Oct 18
purity is now 1e-7, evalue 1e-8 (http://people.sc.fsu.edu/~jburkardt/c_src/power_method/power_method_prb.c)
kmere heuristic minNodes = 2^20 (~1e+6), kmere now checks if the "normal" split would result in a good partition.
Oct 30
- removed the memory manager for proteinortho_clustering, instead a simple n threshold manages the power/lapack algorithms
now all CC are calculated for power/lapack (no frequent restart), dynamic for loop for lapack
7. Nov
dsyevr instead of dsyevx (rrr algorithm now)
remove graph bugfix (each thread is now assigned an own ofstream (shared_ptr needed -> c++11 needed))
13. Nov
OMP_PROC_BIND=close for multi-socket systems (change the cpu affinity of openmp to close -> each new thread spawn next to the last one, instead of randomly)
15. Nov
Blat support (step=2), the evalues cannot be preset as a parameter but appear if -out=blast8 is set.
16. Nov
proteinortho 6.0 alpha release
28. Nov
Makefile update (lapack zipped,...)
Dec 5
MCL integration (-mcl option in proteinortho.pl)
XML bugfix (species with . in the name confused the xml parser)
Dec 6
MCL pre/postprocessing (src/do_mcl.pl)
double -> float in proteinortho_clustering.cpp
weights are unsigned shorts again (only the last commit was unsigned int) proteinortho_clustering.cpp
Dec 10
gff4fasta update: Test for additional naming schemes
Dec 20-21
src/do_mcl.pl performance increase, dev/blastgraph2CCblastgraphs.cpp improvement
orthoXML improvement (still not accepted by the ortho benchmark project)
Dec 25
no lapack version (src/proteinortho_clustering_nolapack.cpp)
no cmake version (make all_nocmake, needs lapack installed)
2019
9. Jan
pow_n replaced with powLapD (graph density threshold instead of number of nodes)
11. Jan
mmseqs2 integration (proteinortho.pl) -p=mmseqsp or mmseqsn
22. Jan (uid:296)
Added CHANGEUID for a commit-specific id. (update_CHANGEUID.sh can be found in snippets, use 'find . -maxdepth 2 | perl -lne '{if($_=~m/^.*(\.pl|\.cpp|\.c|\.h|\.md|\.txt|Makefile|CHANGELOG)$/){print $_;}}' | entr bash update_CHANGEUID.sh')
BUGfix: weird sort behaviour dependent on locale (LC_All,LC_NUMERIC). Fix: $ENV{'LC_All'}='C';
23. Jan (uid:492) v6.0a
small fix for get_po_path
4. Feb (uid:724)
-ram is back for memory control of proteinortho_clustering (restricts the memory usage of LAPACK and the input graph), works also for proteinortho.pl -ram
14. Mar (uid: 1034)
-tmp is now working better (tmp_dir now generates a directory with all temporary files inside)
read_details now checks if the input files are faa or fna type based on the -p algorithm (diamond only uses faa files etc.) IF -checkfasta
20. Mar (uid: 1174)
static versions (Linux/x64) of all binaries are now included in the repository
Makefile now compiles first against /usr/lib/liblapack statically then it tries to recompile src/lapack automatically with 'make'
26. Apr (uid: 2239)
now supports -minspecies fully (proteinortho.pl argument)
fix no_lapack_proteinortho_clustering
po2html integration
28. Apr (uid:2349)
Makefile now builds in src/BUILDS/$uname depending on the system (Linux/Darwin). Now I can include precompiled binaries for mac and linux at the same time.
May 1 (uid:2488) v6.0b
proteinortho is now part of the bioconda repository
grab_proteins.pl makeover for brew integration
May 6 (uid:2821) v6.0
finally proteinortho2xml.pl is working correctly.
clean up proteinortho_do_mcl.pl
renamed all files, such that every program starts with proteinortho !
May 11 (uid:3023) v6.0.1
html improvement (now you can display alternative names from the fasta files)
new minspecies default (1)
May 19 (uid:3063)
proteinorthoHelper.html
May 21 (uid:3080)
minspecies 1 fix (previously 1 => disabled minspecies calculations)
May 22 (uid:3140)
-selfblast generates duplicated hits -> automatically calls cleanupblastgraph
June 3 (uid:3142)
refined error messages on duplicated inputs, <2 inputs
June 27 (uid:3492)
proteinortho6.pl now writes databases (-step=1) into the -tmp directory if the system call failed.
fixed small issue where tmp directories were created inside each other.
better stderr outputs e.g. if blast fails -> try -check ...
July 1 (uid:3511)
fixed the -ram issue (used free memory, now total memory) in case there is a swap using up all free memory (also proteinortho_clustering now throws a warning, not an error)
July 10 (uid:3697)
fixed proteinortho_grab_proteins.pl: -tofiles option now escapes if -exact, replaced chomp with s/[\r\n]+$//
proteinortho_grab_proteins.pl speedup for -exact and a given proteinortho file
proteinortho6.pl replaced chomp with s/[\r\n]+$//
proteinortho_clustering.cpp: fixed a bug where lapack was only used if -pld was set, regardless of the value.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
\ No newline at end of file
# This makefile is part of Proteinortho.
##########################################
# Run 'make' for compiling everything in the current directory (using the installed version of lapack in e.g. /usr/lib/, you can install lapack with e.g. apt-get install libatlas3-base or liblapack3)
# Run 'make STATIC=TRUE' for a static version
# Run 'make USELAPACK=FALSE' for a version without(!) LAPACK (only power iteration is used)
# Run 'make USEPRECOMPILEDLAPACK=FALSE' for directly recompiling the provided lapack version 3.8.0 and linking dynamically
# Run 'make CXX=g++-7' for using the g++-7 compiler. See Flags below for more information
# Run 'make CXX=clang++' for using the clang compiler
# Run 'make install' for installing the compiled files to /usr/local/bin
# Run 'make install PREFIX=/home/paul/bin/' for local installation
############ FLAGS: ##########################################
## STATIC=TRUE : enable static compiling (default:FALSE)
## CXX=g++ : the g++ compiler
## CXXFLAGS = compiler flags passed to g++
## CXXLIBRARY = the path to the libs like lapack,... (don't forget the -L)
## CXXINCLUDE = include path (^) (don't forget the -I)
## PREFIX = the installation prefix (only for make install)
##############################################################
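## Example (a sketch only; pick the compiler and installation prefix that fit your system):
##   make CXX=g++-7 STATIC=TRUE
##   make install PREFIX=$HOME/bin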
##########################
## environment variables ##
##########################
# ALIAS for PREFIX
INSTALLDIR=/usr/local/bin
# ALIAS for LAPACK
USELAPACK=TRUE
# compile statically
STATIC=FALSE
ifdef PREFIX
INSTALLDIR=$(PREFIX)
endif
ifdef LAPACK
USELAPACK=$(LAPACK)
endif
USEPRECOMPILEDLAPACK=TRUE
UNAME_S=$(shell uname -s)_$(shell uname -m)
# output dir of make (make install moves these to PREFIX)
BUILDDIR=src/BUILD/$(UNAME_S)
CC=cc
CXX=g++
CXXFLAGS_PO=-Wall -O3 -std=c++11 -Wno-unused-result
IS_COLOR_COMPATIBLE:=$(shell tput color 2>/dev/null)
ifdef IS_COLOR_COMPATIBLE
RED=\033[1;31m
GREEN=\033[1;32m
ORANGE=\033[1;33m
NC=\033[0m
endif
##############
# MAKEFILE : #
##############
dir_guard=@if [ ! -d $(BUILDDIR) ]; then echo "Creating build directory ..."; mkdir -p $(BUILDDIR); fi
.PHONY: all
all:$(BUILDDIR)/proteinortho_extract_from_graph.pl $(BUILDDIR)/proteinortho_compareProteinorthoGraphs.pl $(BUILDDIR)/proteinortho_grab_proteins.pl $(BUILDDIR)/proteinortho_formatUsearch.pl $(BUILDDIR)/proteinortho_do_mcl.pl $(BUILDDIR)/proteinortho2tree.pl $(BUILDDIR)/proteinortho2html.pl $(BUILDDIR)/proteinortho2xml.pl $(BUILDDIR)/proteinortho_singletons.pl $(BUILDDIR)/proteinortho_ffadj_mcs.py $(BUILDDIR)/proteinortho_clustering $(BUILDDIR)/proteinortho_graphMinusRemovegraph $(BUILDDIR)/proteinortho_cleanupblastgraph $(BUILDDIR)/proteinortho_treeBuilderCore
@echo "[100%] $(GREEN)Everything is compiled with no errors.$(NC)"
$(BUILDDIR)/proteinortho_extract_from_graph.pl: src/proteinortho_extract_from_graph.pl
$(dir_guard)
@cp $< $@
$(BUILDDIR)/proteinortho_compareProteinorthoGraphs.pl: src/proteinortho_compareProteinorthoGraphs.pl
$(dir_guard)
@cp $< $@
$(BUILDDIR)/proteinortho_grab_proteins.pl: src/proteinortho_grab_proteins.pl
$(dir_guard)
@cp $< $@
$(BUILDDIR)/proteinortho_do_mcl.pl: src/proteinortho_do_mcl.pl
$(dir_guard)
@cp $< $@
$(BUILDDIR)/proteinortho_formatUsearch.pl: src/proteinortho_formatUsearch.pl
$(dir_guard)
@cp $< $@
$(BUILDDIR)/proteinortho2html.pl: src/proteinortho2html.pl
$(dir_guard)
@cp $< $@
$(BUILDDIR)/proteinortho2xml.pl: src/proteinortho2xml.pl
$(dir_guard)
@cp $< $@
$(BUILDDIR)/proteinortho_singletons.pl: src/proteinortho_singletons.pl
$(dir_guard)
@cp $< $@
$(BUILDDIR)/proteinortho2tree.pl: src/proteinortho2tree.pl
$(dir_guard)
@cp $< $@
$(BUILDDIR)/proteinortho_ffadj_mcs.py: src/proteinortho_ffadj_mcs.py
$(dir_guard)
@cp $< $@
echoENV:
@echo -n "CC = "
@echo $(CC)
@echo -n "CCFLAGS = "
@echo $(CCFLAGS)
@echo -n "CXX = "
@echo $(CXX)
@echo -n "CXXFLAGS = "
@echo $(CXXFLAGS)
@echo -n "LDFLAGS = "
@echo $(LDFLAGS)
@echo -n "LDLIBS = "
@echo $(LDLIBS)
# 1. Try to compile statically with LAPACK
# 2. try to compile dynamically with the given lapack lib in src/
$(BUILDDIR)/proteinortho_clustering: src/proteinortho_clustering.cpp
$(dir_guard)
@echo "[ 10%] Prepare proteinortho_clustering ..."
ifeq ($(USELAPACK),TRUE)
ifeq ($(USEPRECOMPILEDLAPACK),TRUE)
ifeq ($(STATIC),TRUE)
@echo "[ 20%] Building **proteinortho_clustering** with LAPACK (static linking)";
@$(CXX) $(CXXFLAGS) $(CXXFLAGS_PO) -fopenmp -o $@ $< $(LDFLAGS) $(LDLIBS) -static -Wl,--allow-multiple-definition -llapack -lblas -lgfortran -pthread -Wl,--whole-archive -lpthread -Wl,--no-whole-archive && ([ $$? -eq 0 ] ) || ( \
echo "......$(ORANGE)static linking failed, now I try dynamic linking.$(NC)"; \
$(CXX) $(CXXFLAGS) $(CXXFLAGS_PO) -fopenmp -o $@ $< $(LDFLAGS) $(LDLIBS) -llapack -lblas -pthread -Wl,--whole-archive -lpthread -Wl,--no-whole-archive && ([ $$? -eq 0 ] && echo "......OK dynamic linking was successful for proteinortho_clustering!";) || ( \
echo "......$(ORANGE)dynamic linking failed too, now I try dynamic linking without -WL,-whole-archive (this should now work for OSX).$(NC)"; \
$(CXX) $(CXXFLAGS) $(CXXFLAGS_PO) -fopenmp -o $@ $< $(LDFLAGS) $(LDLIBS) -llapack -lblas -pthread -lpthread && ([ $$? -eq 0 ] && echo "......OK dynamic linking was successful for proteinortho_clustering!";) || ( \
echo "......$(ORANGE)dynamic linking failed (without -WL,-whole-archive) too too, now I try to openblas.$(NC)"; \
$(CXX) $(CXXFLAGS) $(CXXFLAGS_PO) -fopenmp -o $@ $< $(LDFLAGS) $(LDLIBS) -lopenblas -pthread -lpthread && ([ $$? -eq 0 ] && echo "......OK dynamic linking was successful for proteinortho_clustering!";) || ( \
echo "......$(ORANGE)last linking failed too too too, now I try to recompile lapack (v.3.8.0) and then compile proteinortho_clustering with dynamic linking.$(NC)"; \
echo "......[ 33%] Extracting the LAPACK library"; \
if [ ! -d src/lapack-3.8.0 ]; then cd src; tar -xzvf lapack-3.8.0.tar.gz > /dev/null 2>&1; cd ..; fi; \
echo "......[ 66%] Compiling the LAPACK library (using cmake + make)"; \
if [ ! -f src/lapack-3.8.0/build/lib/liblapack.a ]; then mkdir src/lapack-3.8.0/build 2> /dev/null; cd src/lapack-3.8.0/build; cmake .. -DCMAKE_CXX_COMPILER=$(CXX) > /dev/null; make all -j4 > /dev/null 2>&1; cd ../../.. ; fi; \
echo "......[ 99%] Building **proteinortho_clustering** with LAPACK (dynamic linking)"; \
$(CXX) $(CXXFLAGS) $(CXXFLAGS_PO) -fopenmp -o $@ $< -Isrc/lapack-3.8.0/build/include/ -Lsrc/lapack-3.8.0/build/lib/ -llapack -lblas $(LDFLAGS) $(LDLIBS) -lgfortran && echo "......OK dynamic linking was successful for proteinortho_clustering!" || ( echo "" ) ; ) ) ) )
else
@echo "[ 20%] Building **proteinortho_clustering** with LAPACK (dynamic linking)";
@$(CXX) $(CXXFLAGS) $(CXXFLAGS_PO) -fopenmp -o $@ $< $(LDFLAGS) $(LDLIBS) -llapack -lblas -pthread -Wl,--whole-archive -lpthread -Wl,--no-whole-archive && ([ $$? -eq 0 ] && echo "......OK dynamic linking was successful for proteinortho_clustering!";) || ( \
echo "......$(ORANGE)dynamic linking failed too, now I try dynamic linking without -WL,-whole-archive (this should now work for OSX).$(NC)"; \
$(CXX) $(CXXFLAGS) $(CXXFLAGS_PO) -fopenmp -o $@ $< $(LDFLAGS) $(LDLIBS) -llapack -lblas -pthread -lpthread && ([ $$? -eq 0 ] && echo "......OK dynamic linking was successful for proteinortho_clustering!";) || ( \
echo "......$(ORANGE)dynamic linking failed (without -WL,-whole-archive) too too, now I try to openblas.$(NC)"; \
$(CXX) $(CXXFLAGS) $(CXXFLAGS_PO) -fopenmp -o $@ $< $(LDFLAGS) $(LDLIBS) -lopenblas -pthread -lpthread && ([ $$? -eq 0 ] && echo "......OK dynamic linking was successful for proteinortho_clustering!";) || ( \
echo "......$(ORANGE)last linking failed too too too, now I try to recompile lapack (v.3.8.0) and then compile proteinortho_clustering with dynamic linking.$(NC)"; \
echo "......[ 33%] Extracting the LAPACK library"; \
if [ ! -d src/lapack-3.8.0 ]; then cd src; tar -xzvf lapack-3.8.0.tar.gz > /dev/null 2>&1; cd ..; fi; \
echo "......[ 66%] Compiling the LAPACK library (using cmake + make)"; \
if [ ! -f src/lapack-3.8.0/build/lib/liblapack.a ]; then mkdir src/lapack-3.8.0/build 2> /dev/null; cd src/lapack-3.8.0/build; cmake .. -DCMAKE_CXX_COMPILER=$(CXX) > /dev/null; make all -j4 > /dev/null 2>&1; cd ../../.. ; fi; \
echo "......[ 99%] Building **proteinortho_clustering** with LAPACK (dynamic linking)"; \
$(CXX) $(CXXFLAGS) $(CXXFLAGS_PO) -fopenmp -o $@ $< -Isrc/lapack-3.8.0/build/include/ -Lsrc/lapack-3.8.0/build/lib/ -llapack -lblas $(LDFLAGS) $(LDLIBS) -lgfortran && echo "......OK dynamic linking was successful for proteinortho_clustering!" || ( echo "" ) ; ) ) )
endif
@if [ ! -e $(BUILDDIR)/proteinortho_clustering ]; then echo "proteinortho_clustering compilation failed. Please visit https://gitlab.com/paulklemm_PHD/proteinortho/wikis/Error%20Codes"; false; fi
else
@echo "[ 15%] Extracting the LAPACK library";
@if [ ! -d src/lapack-3.8.0 ]; then cd src; tar -xzvf lapack-3.8.0.tar.gz > /dev/null 2>&1; cd ..; fi;
@echo "[ 18%] Compiling the LAPACK library (using cmake + make)";
@if [ ! -f src/lapack-3.8.0/build/lib/liblapack.a ]; then mkdir src/lapack-3.8.0/build; cd src/lapack-3.8.0/build; cmake .. -DCMAKE_CXX_COMPILER=$(CXX) > /dev/null 2>&1; make all -j4 > /dev/null 2>&1; cd ../../.. ; fi;
@echo "[ 20%] Building **proteinortho_clustering** with LAPACK (dynamic linking)";
@$(CXX) $(CXXFLAGS) $(CXXFLAGS_PO) -fopenmp -o $@ $< -Isrc/lapack-3.8.0/build/include/ -Lsrc/lapack-3.8.0/build/lib/ -llapack -lblas $(LDFLAGS) $(LDLIBS) -lgfortran;
endif
endif
ifeq ($(USELAPACK),FALSE)
@echo "[ 20%] Building **proteinortho_clustering** WITHOUT(!) LAPACK";
@$(CXX) $(CXXFLAGS) $(CXXFLAGS_PO) -fopenmp -o $@ src/proteinortho_clustering_nolapack.cpp $(LDFLAGS) $(LDLIBS) -static && ([ $$? -eq 0 ] ) || ( \
echo "......$(ORANGE)static linking failed of proteinortho_clustering_nolapack, now i switch to dynamic linking.$(NC)"; \
$(CXX) $(CXXFLAGS) $(CXXFLAGS_PO) -fopenmp -o $@ src/proteinortho_clustering_nolapack.cpp $(LDFLAGS) $(LDLIBS) && echo "......OK dynamic linking was successful for proteinortho_clustering_nolapack!"; )
endif
$(BUILDDIR)/proteinortho_cleanupblastgraph: src/cleanupblastgraph.cpp
$(dir_guard)
@echo "[ 50%] Building **cleanupblastgraph**"
ifeq ($(STATIC),TRUE)
@$(CXX) -std=c++11 $(CXXFLAGS) $(CXXFLAGS_PO) -o $@ $< $(LDFLAGS) $(LDLIBS) -static && ([ $$? -eq 0 ] ) || ( \
echo "......$(ORANGE)static linking failed of cleanupblastgraph, now i switch to dynamic linking.$(NC)"; \
$(CXX) $(CXXFLAGS) $(CXXFLAGS_PO) -o $@ $< && echo "......OK dynamic linking was successful of cleanupblastgraph!"; )
else
@$(CXX) $(CXXFLAGS) $(CXXFLAGS_PO) -o $@ $< $(LDFLAGS) $(LDLIBS)
endif
$(BUILDDIR)/proteinortho_graphMinusRemovegraph: src/graphMinusRemovegraph.cpp
$(dir_guard)
@echo "[ 25%] Building **graphMinusRemovegraph**"
ifeq ($(STATIC),TRUE)
@$(CXX) $(CXXFLAGS) $(CXXFLAGS_PO) -o $@ $< $(LDFLAGS) $(LDLIBS) -static && ([ $$? -eq 0 ] ) || ( \
echo "......$(ORANGE)static linking failed of graphMinusRemovegraph, now i switch to dynamic linking.$(NC)"; \
$(CXX) $(CXXFLAGS) $(CXXFLAGS_PO) -o $@ $< && echo "......OK dynamic linking was successful of graphMinusRemovegraph!"; )
else
@$(CXX) $(CXXFLAGS) $(CXXFLAGS_PO) -o $@ $< $(LDFLAGS) $(LDLIBS)
endif
$(BUILDDIR)/proteinortho_treeBuilderCore: src/po_tree.c
$(dir_guard)
@echo "[ 75%] Building **po_tree**"
ifeq ($(STATIC),TRUE)
@$(CC) $(CCFLAGS) -o $@ $< $(LDFLAGS) $(LDLIBS) -static && ([ $$? -eq 0 ] ) || ( \
echo "......$(ORANGE)static linking failed of po_tree, now i switch to dynamic linking.$(NC)"; \
$(CC) $(CCFLAGS) -o $@ $< && echo "......OK dynamic linking was successful of po_tree!"; )
else
@$(CC) $(CCFLAGS) -o $@ $< $(LDFLAGS) $(LDLIBS)
endif
.PHONY: install
install: proteinortho6.pl proteinortho $(BUILDDIR)/proteinortho_extract_from_graph.pl $(BUILDDIR)/proteinortho_formatUsearch.pl $(BUILDDIR)/proteinortho_compareProteinorthoGraphs.pl $(BUILDDIR)/proteinortho_do_mcl.pl $(BUILDDIR)/proteinortho2html.pl $(BUILDDIR)/proteinortho2xml.pl $(BUILDDIR)/proteinortho_clustering $(BUILDDIR)/proteinortho_singletons.pl $(BUILDDIR)/proteinortho_ffadj_mcs.py $(BUILDDIR)/proteinortho2tree.pl $(BUILDDIR)/proteinortho_cleanupblastgraph $(BUILDDIR)/proteinortho_graphMinusRemovegraph $(BUILDDIR)/proteinortho_treeBuilderCore $(BUILDDIR)/proteinortho_grab_proteins.pl
@echo "INSTALLING everything to $(INSTALLDIR)"
@install -v $^ $(INSTALLDIR);
@echo "$(GREEN)Everything installed successfully to $(INSTALLDIR).$(NC)"
@echo "If needed you can add $(INSTALLDIR) to \$$PATH with 'export PATH=\$$PATH:$(INSTALLDIR)'."
.PHONY: test
test: proteinortho6.pl test_step2 test_step3 test_clean
@echo "[TEST] All tests $(GREEN)passed$(NC)"
.PHONY: test_step2
test_step2: proteinortho6.pl
@echo "[TEST] 1. basic proteinortho6.pl -step=2 test. (algorithms that are not present are skipped)"
@echo -n " [1/12] -p=blastp+ test: "
@if [ "$(shell which blastp)" = "" ]; then\
echo "$(ORANGE)blastp missing, skipping...$(NC)"; \
else \
./proteinortho6.pl -silent -force -project=test_blastp -p=blastp+ test/*.faa; \
set -e ; ./src/chk_test.pl test_blastp.proteinortho.tsv; \
echo "$(GREEN)passed$(NC)"; \
fi
@echo -n " [2/12] -p=blastp+ synteny (PoFF) test: "
@if [ "$(shell which blastp)" = "" ]; then\
echo "$(ORANGE)blastp missing, skipping...$(NC)"; \
else \
./proteinortho6.pl -silent -force -project=test_synteny -synteny -singles -p=blastp+ test/*.faa; \
set -e ; ./src/chk_test.pl test_synteny.proteinortho.tsv; \
set -e ; ./src/chk_test.pl test_synteny.poff.tsv; \
echo "$(GREEN)passed$(NC)"; \
fi
@echo -n " [3/12] -p=diamond test: "
@if [ "$(shell which diamond)" = "" ]; then\
echo "$(ORANGE)diamond missing, skipping...$(NC)"; \
else \
./proteinortho6.pl -silent -force -project=test_diamond -p=diamond test/*.faa; \
set -e ; ./src/chk_test.pl test_diamond.proteinortho.tsv; \
echo "$(GREEN)passed$(NC)"; \
fi
@echo -n " [4/12] -p=diamond (--moresensitive) test (subparaBlast): "
@if [ "$(shell which diamond)" = "" ]; then\
echo "$(ORANGE)diamond missing, skipping...$(NC)"; \
else \
./proteinortho6.pl -silent -force -project=test_diamondmoresensitive -p=diamond -subparaBlast="--more-sensitive" test/*.faa; \
set -e ; ./src/chk_test.pl test_diamondmoresensitive.proteinortho.tsv; \
echo "$(GREEN)passed$(NC)"; \
fi
@echo -n " [5/12] -p=lastp (lastal) test: "
@if [ "$(shell which lastal)" = "" ]; then\
echo "$(ORANGE)lastal missing, skipping...$(NC)"; \
else \
./proteinortho6.pl -silent -force -project=test_lastp -p=lastp test/*.faa; \
set -e ; ./src/chk_test.pl test_lastp.proteinortho.tsv ; \
echo "$(GREEN)passed$(NC)"; \
fi
@echo -n " [6/12] -p=topaz test: "
@if [ "$(shell which topaz)" = "" ]; then\
echo "$(ORANGE)topaz missing, skipping...$(NC)"; \
else \
./proteinortho6.pl -silent -force -project=test_topaz -p=topaz test/*.faa; \
set -e ; ./src/chk_test.pl test_topaz.proteinortho.tsv; \
echo "$(GREEN)passed$(NC)"; \
fi
@echo -n " [7/12] -p=usearch test: "
@if [ "$(shell which usearch)" = "" ]; then\
echo "$(ORANGE)usearch missing, skipping...$(NC)"; \
else \
./proteinortho6.pl -silent -force -project=test_usearch -p=usearch test/*.faa; \
set -e ; ./src/chk_test.pl test_usearch.proteinortho.tsv; \
echo "$(GREEN)passed$(NC)"; \
fi
@echo -n " [8/12] -p=ublast test: "
@if [ "$(shell which usearch)" = "" ]; then\
echo "$(ORANGE)usearch missing, skipping...$(NC)"; \
else \
./proteinortho6.pl -silent -force -project=test_ublast -p=ublast test/*.faa; \
set -e ; ./src/chk_test.pl test_ublast.proteinortho.tsv; \
echo "$(GREEN)passed$(NC)"; \
fi
@echo -n " [9/12] -p=rapsearch test: "
@if [ "$(shell which rapsearch)" = "" ]; then\
echo "$(ORANGE)rapsearch missing, skipping...$(NC)"; \
else \
./proteinortho6.pl -silent -force -project=test_rapsearch -p=rapsearch test/*.faa; \
set -e ; ./src/chk_test.pl test_rapsearch.proteinortho.tsv; \
echo "$(GREEN)passed$(NC)"; \
fi
CPP = g++
CPPFLAGS += -Wall -O2 -std=gnu++11 -Wno-unused-result
@echo -n " [10/12] -p=blatp (blat) test: "
@if [ "$(shell which blat)" = "" ]; then\
echo "$(ORANGE)blat missing, skipping...$(NC)"; \
else \
./proteinortho6.pl -silent -force -project=test_blatp -p=blatp test/*.faa; \
set -e ; ./src/chk_test.pl test_blatp.proteinortho.tsv; \
echo "$(GREEN)passed$(NC)"; \
fi
all: proteinortho5_clustering proteinortho5_tree
@echo -n " [11/12] -p=mmseqsp (mmseqs) test: "
@if [ "$(shell which mmseqs)" = "" ]; then\
echo "$(ORANGE)mmseqs missing, skipping...$(NC)"; \
else \
./proteinortho6.pl -silent -force -project=test_mmseqsp -p=mmseqsp test/*.faa; \
set -e ; ./src/chk_test.pl test_mmseqsp.proteinortho.tsv; \
echo "$(GREEN)passed$(NC)"; \
fi
proteinortho5_clustering: proteinortho5_clustering.cpp
$(CPP) $(CPPFLAGS) $(LDFLAGS) -o $@ $<
@echo -n " [12/12] -p=blastp (legacy blast) test: "
@if [ "$(shell which blastall)" = "" ]; then\
echo "$(ORANGE)blastall missing, skipping...$(NC)"; \
else \
./proteinortho6.pl -silent -force -project=test_blastall -p=blastp test/*.faa; \
set -e ; ./src/chk_test.pl test_blastall.proteinortho.tsv; \
echo "$(GREEN)passed$(NC)"; \
fi
proteinortho5_tree: po_tree.c
$(CPP) $(CPPFLAGS) $(LDFLAGS) -o $@ $<
.PHONY: test_step3
test_step3: proteinortho6.pl test_step2
@echo "[TEST] 2. -step=3 tests (proteinortho_clustering) "
@echo -n " [1/2] various test functions of proteinortho_clustering (-test): "; \
$(BUILDDIR)/proteinortho_clustering -test> /dev/null 2>&1
@echo "$(GREEN)passed$(NC)"
@echo -n " [2/2] Compare results of 'with lapack' and 'without lapack': "; \
$(BUILDDIR)/proteinortho_clustering -epsilon 0 test_blastp.blast-graph> /dev/null 2>&1; \
sort remove.graph -o test.A> /dev/null 2>&1; \
$(BUILDDIR)/proteinortho_clustering -epsilon 0 -lapack 0 test_blastp.blast-graph> /dev/null 2>&1; \
sort remove.graph -o test.B> /dev/null 2>&1; \
set -e ; diff test.A test.B;
@echo "$(GREEN)passed$(NC)"
install: proteinortho5.pl proteinortho5_clustering proteinortho5_singletons.pl proteinortho5_clean_edges2.pl ffadj_mcs.py po2tree.pl proteinortho5_tree
install -v $^ $(INSTALLDIR)
.PHONY: test_clean
test_clean:
@echo "[TEST] Clean up all test files..."; \
rm -rf proteinortho_cache_test_* test.* test_* test/C.faa.* test/E.faa.* test/C2.faa.* test/L.faa.* test/M.faa.*> /dev/null 2>&1;
test: proteinortho5.pl proteinortho5_clustering
./proteinortho5.pl -project=test -synteny -singles test/*.faa
@./chk_test.pl test.proteinortho
@./chk_test.pl test.poff
rm test.*
@echo "Test okay"
.PHONY: clean
clean:
rm -rf src/BUILD test/C.faa.* test/E.faa.* test/C2.faa.* test/L.faa.* test/M.faa.*
rm -rf src/lapack-3.8.0/
#!/bin/sh
/usr/lib/proteinortho/proteinortho5.pl $@
#!/bin/sh
/usr/lib/proteinortho/proteinortho6.pl "$@"
proteinortho (6.0.6+dfsg-1) UNRELEASED; urgency=medium
[ Paul Klemm ]
* Team upload.
* New upstream version
[ Andreas Tille ]
* Fix watch file and Files-Excluded
* debhelper 12
* Standards-Version: 4.4.0
* Build-Depends: s/liblapack3/liblapack-dev|libatlas-base-dev|liblapack.so/
-- Paul Klemm <klemmp@staff.uni-marburg.de> Mon, 15 Jul 2019 13:17:13 +0200
proteinortho (5.16.b+dfsg-1) unstable; urgency=medium
* d/watch: Also catch letters in version number
test/*.phr
test/*.pin
test/*.psq
proteinortho5_clean_edges
proteinortho5_clustering
proteinortho_clustering
proteinortho_cleanupblastgraph
proteinortho_graphMinusRemovegraph
proteinortho_treeBuilderCore
test.*
\ No newline at end of file
@@ -3,18 +3,22 @@ Maintainer: Debian Med Packaging Team <debian-med-packaging@lists.alioth.debian.
Uploaders: Andreas Tille <tille@debian.org>
Section: science
Priority: optional
Build-Depends: debhelper (>= 11~),
ncbi-blast+
Standards-Version: 4.3.0
Build-Depends: debhelper (>= 12~),
ncbi-blast+,
liblapack-dev | libatlas-base-dev | liblapack.so,
diamond-aligner
Standards-Version: 4.4.0
Vcs-Browser: https://salsa.debian.org/med-team/proteinortho
Vcs-Git: https://salsa.debian.org/med-team/proteinortho.git
Homepage: http://www.bioinf.uni-leipzig.de/Software/proteinortho/
Homepage: https://gitlab.com/paulklemm_PHD/proteinortho
Package: proteinortho
Architecture: any
Depends: ${shlibs:Depends},
${misc:Depends},
ncbi-blast+,
diamond-aligner,
liblapack3,
python
Description: Detection of (Co-)orthologs in large-scale protein analysis
Proteinortho is a stand-alone tool that is geared towards large datasets
@@ -2,10 +2,8 @@ Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
Upstream-Name: Proteinortho
Upstream-Contact: Marcus Lechner <lechner@staff.uni-marburg.de>
Source: https://www.bioinf.uni-leipzig.de/Software/proteinortho/
Files-Excluded: proteinortho5_clean_edges
proteinortho5_clustering
po_tree
proteinortho5_tree
Files-Excluded: */BUILD
*/lapack-*.tar.gz
Files: *
Copyright: 2009-2014 Marcus Lechner <lechner@staff.uni-marburg.de>
.TH PROTEINORTHO5 "1" "November 2015" "proteinortho5 5.11" "User Commands"
.TH PROTEINORTHO6 "1" "November 2015" "proteinortho6 6.0.6" "User Commands"
.SH NAME
proteinortho5 \- orthology detection tool
proteinortho6 \- orthology detection tool
.SH SYNOPSIS
.B proteinortho5
.B proteinortho6
[\fI\,OPTIONS\/\fR] \fI\,FASTA1 FASTA2 \/\fR[\fI\,FASTA\/\fR...]
.SH DESCRIPTION
Proteinortho is a stand-alone tool that is geared towards large datasets
@@ -18,8 +18,8 @@ proteins present in 99% of all bacterial proteomes.
E\-value for blast [default: 1e\-05]
.TP
\fB\-p=\fR
blast program {blastn|blastp|blastn+|blastp+}
[default: blastp+]
blast program {blastp+|blastn+|tblastx+|diamond|usearch|ublast|lastp|lastn|rapsearch|topaz|blatp|blatn|mmseqsp|mmseqsn}
[default: diamond]
.TP
\fB\-project=\fR
prefix for all result file names [default: myproject]
@@ -76,8 +76,8 @@ min. similarity for additional hits (0..1) [default: 0.95]
3 \-> clustering
0 \-> all (default)
.TP
\fB\-blastpath=\fR
path to your local blast (if not installed globally)
\fB\-binpath=\fR
path to your local blast/diamond/... (if not installed globally)
.TP
\fB\-verbose\fR
keeps you informed about the progress
@@ -85,18 +85,13 @@ keeps you informed about the progress
\fB\-clean\fR
remove all unnecessary files after processing
.TP
\fB\-graph\fR
generate .graph files (pairwise orthology relations)
.TP
\fB\-debug\fR
gives detailed information for bug tracking
.PP
More specific blast parameters can be defined by
.TP
\fB\-blastParameters=\fR'[parameters]' (e.g. \fB\-blastParameters=\fR'\-seg no')
\fB\-subparaBLAST=\fR'[parameters]' (e.g. \fB\-subparaBLAST=\fR'\-seg no')
.PP
In case jobs should be distributed onto several machines, use
.TP
\fB\-startat=\fR File number to start with (default: 0)
.TP
\fB\-stopat=\fR File number to end with (default: \fB\-1\fR)
\fB\-jobs=M/N\fR If you want to involve multiple machines or separate a Proteinortho run into smaller chunks, use the -jobs=M/N option. First, run 'proteinortho6.pl -steps=1 ...' to generate the indices. Then you can run 'proteinortho6.pl -steps=2 -jobs=M/N ...' to run small chunks separately. In place of M and N, numbers must be set, representing the number of chunks the run is divided into and the particular chunk to be performed by this process.
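As an illustration, splitting a run into four chunks could look like the following sketch (the bundled test FASTA files and the step numbering listed above are assumed):
proteinortho6.pl -steps=1 test/*.faa
proteinortho6.pl -steps=2 -jobs=1/4 test/*.faa
proteinortho6.pl -steps=2 -jobs=2/4 test/*.faa
proteinortho6.pl -steps=2 -jobs=3/4 test/*.faa
proteinortho6.pl -steps=2 -jobs=4/4 test/*.faa
proteinortho6.pl -steps=3 test/*.faa
The four -steps=2 calls can run on different machines; the final -steps=3 call then performs the clustering step on the combined results.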
@@ -10,4 +10,4 @@ export DEB_BUILD_MAINT_OPTIONS = hardening=+all
dh $@
override_dh_auto_install:
dh_auto_install --buildsystem=makefile -- install INSTALLDIR=$(CURDIR)/debian/$(DEB_SOURCE)/usr/lib/proteinortho
dh_auto_install --buildsystem=makefile -- install PREFIX=$(CURDIR)/debian/$(DEB_SOURCE)/usr/lib/proteinortho
@@ -9,9 +9,11 @@ mkdir test
cp -a /usr/share/doc/${pkg}/examples/* test
find . -type f -name "*.gz" -exec gunzip \{\} \;
mv test/chk_test.pl .
proteinortho5 -project=test -synteny -singles test/*.faa
./chk_test.pl test.proteinortho
./chk_test.pl test.poff
proteinortho -project=test -synteny -singles test/*.faa
./src/chk_test.pl test.proteinortho
./src/chk_test.pl test.poff
echo "Test okay"
make test
rm -fr $ADTTMP/*
version=4
opts="repacksuffix=+dfsg,dversionmangle=s/\+dfsg//g,repack,compression=xz,uversionmangle=s/([\d.]+)([a-z])/$1.$2/" \
http://www.bioinf.uni-leipzig.de/Software/proteinortho/ proteinortho_v@ANY_VERSION@@ARCHIVE_EXT@
opts="repacksuffix=+dfsg,dversionmangle=auto,repack,compression=xz" \
https://gitlab.com/paulklemm_PHD/proteinortho/tags?sort=updated_desc .*/archive/.*/proteinortho-v@ANY_VERSION@\.tar\.gz
cmake_minimum_required(VERSION 2.8)
file(GLOB devproject_SRC
"*.cpp"
)
find_package(OpenMP REQUIRED)
if (OPENMP_FOUND)
set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
else(OPENMP_FOUND)
message("ERROR: OpenMP could not be found.")
endif()
set(CMAKE_CXX_FLAGS_RELEASE "-O3 -fopenmp -std=c++11")
set (CMAKE_CXX_STANDARD 11)
foreach(project ${devproject_SRC})
get_filename_component(barename ${project} NAME_WE)
add_executable(${barename} ${project})
IF(${barename} MATCHES "test_getConnectivity")
link_directories(${CMAKE_SOURCE_DIR}/../src/lapack/build/lib)
target_link_libraries(${barename} lapack)
ENDIF()
install(TARGETS ${barename}
RUNTIME DESTINATION ~/bin)
endforeach()
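A minimal build sketch for these development helpers (assuming this CMakeLists.txt lives in a dev/ directory next to src/, as the link_directories() path above suggests; test_getConnectivity additionally expects a compiled lapack under ../src/lapack/build/lib):
cd dev
cmake . && make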
/*
compile: g++ -std=c++11 -O3 blastgraph2CCblastgraphs.cpp -o blastgraph2CCblastgraphs
Usage: blastgraph2CCblastgraphs (options) BLASTGRAPH (BLASTGRAPH2,BLASTGRAPH3,...)
Detail: partitions the input blastgraph(s) into all connected components and produces a file in the blast-graph style for each of them. The output files are named 'BLASTGRAPH_x.cc' where x is the current CC index. The output files are
generated in the current directory (the files get tar-ed to reduce I/O load). The size of the current CC is printed to STDOUT in the form n,m with n=number of nodes and m=number of edges.
Example Output:
| 1295527.5 Enterococcus_faecium 1158601.5 Enterococcus_malodoratus 0 187 0 187
| 1295527.5 Enterococcus_faecium 1330525.4 Enterococcus_raffinosus 0 186 0 186
| ...
(1295527.5 is a protein of the species Enterococcus_faecium)
Options:
-s , --statonly : print just the statistics in stdout (number of nodes, edges of each CC)
-id , -idonly : each node (protein/gene) is replaced with an id (saves a lot of memory)!
| 0 1 0 187 0 187
| 0 2 0 186 0 186
| ...
(the ID 0,1,... are unique for each protein+species)
-notar : Do not tar the results!
-species : print the species header separately (this will result in a lot of comment lines)
| # Enterococcus_faecium Enterococcus_malodoratus
| # 300 400
| 1295527.5 1158601.5 0 187 0 187
| # Enterococcus_faecium Enterococcus_raffinosus
| # 300 999
| 1295527.5 1330525.4 0 186 0 186
| ...
the 300, 400 and 999 are the species specific median bitscores
*/
//pk
#include <iostream>
#include <fstream>
#include <vector>
#include <sstream>
#include <string>
#include <map>
#include <cstdlib>
#include <algorithm>
#include <cmath>
#include <list>
#include <unordered_set>
#include <cstring>
#include <string.h>
using namespace std;
struct wedge {unsigned int edge;};
struct protein {vector<wedge> edges;};
// Globals
unsigned int species_counter = 0; // Species
unsigned int protein_counter = 0; // Proteins
vector<string> species; // Number -> Name
vector<protein> graph; // Graph containing all protein data
// TMP Globals
map<string,unsigned int> species2id; // Name -> Number
map<string,unsigned int> protein2id; // Name -> Number
bool fullinformation=true;
bool printspecies=false;
bool notar=false;
///////////////////////////////////////////////////////////
// Misc functions
///////////////////////////////////////////////////////////
// Convert string to double
double string2double(string str) {
istringstream buffer(str);
double value;
buffer >> value;
return value;
}
// Convert string to float
float string2float(string str) {
istringstream buffer(str);
float value;
buffer >> value;
return value;
}
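// Return the two protein ids as an ordered pair (smaller id first) so an undirected edge has a unique key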
pair<unsigned int, unsigned int> ordPair(unsigned int a,unsigned int b){if(a<b){return make_pair(a,b);}else{return make_pair(b,a);}}
// Split a string at a certain delim
void tokenize(const string& str, vector<string>& tokens, const string& delimiters = "\t") {
// Skip delimiters at beginning.
string::size_type lastPos = str.find_first_not_of(delimiters, 0);
// Find first "non-delimiter".
string::size_type pos = str.find_first_of(delimiters, lastPos);
while (string::npos != pos || string::npos != lastPos) {
// Found a token, add it to the vector.
tokens.push_back(str.substr(lastPos, pos - lastPos));
// Skip delimiters. Note the "not_of"
lastPos = str.find_first_not_of(delimiters, pos);
// Find next "non-delimiter"
pos = str.find_first_of(delimiters, lastPos);
}
}
struct lineinfo {unsigned int bitscore; float eval; map<string,unsigned int>::iterator a_name_it;map<string,unsigned int>::iterator b_name_it; unsigned int a_species_name_it;unsigned int b_species_name_it; };
map<pair<unsigned int, unsigned int>, lineinfo > edge2line;
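// Hash functor so unordered_set can store protein-id pairs (combines the two ids as first*31+second)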
struct pair_hash {
inline std::size_t operator()(const std::pair<int,int> & v) const {
return v.first*31+v.second;
}
};
unordered_set<pair<unsigned int,unsigned int> , pair_hash> edge_is_done;
map<pair<unsigned int,unsigned int>,string> speciesPair2Median;
///////////////////////////////////////////////////////////
// File parser
///////////////////////////////////////////////////////////
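// Parse one blast-graph file: two-column '# file_a file_b' headers define the current species pair,
// four-column '#' lines store the species-pair median bitscore line, and data lines add a reciprocal
// edge between the two proteins (protein names are made unique by appending the species name).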
void parse_file(string file) {
string line;
ifstream graph_file(file.c_str());
if (graph_file.is_open()) {
// For each line
string file_a = ""; unsigned int file_a_id = 0;
string file_b = ""; unsigned int file_b_id = 0;
while (!graph_file.eof()) {
getline(graph_file, line);
vector<string> fields;
tokenize(line, fields, "\t");
// Header line
if (fields.size() == 2 && fields[0].substr(0, 1) == "#") {
file_a = fields[0].substr(2, fields[0].size()-2);
file_b = fields[1];
if (file_a == "file_a" && file_b == "file_b") continue; // Init Header
// Map species a
if (species2id.find(file_a) == species2id.end()) {
species.push_back(file_a);
species2id[file_a] = species_counter++;
}
// Map species b
if (species2id.find(file_b) == species2id.end()) {
species.push_back(file_b);
species2id[file_b] = species_counter++;
}
file_a_id = species2id[file_a];
file_b_id = species2id[file_b];
}else if (fields.size() == 4 && fields[0].substr(0, 1) == "#") {
speciesPair2Median[make_pair(file_a_id,file_b_id)]=line;
}
// Data line
else if ((fields.size() >1) && fields[0].substr(0, 1) != "#") {
// a b e1 b1 e2 b2 score
if(fields.size() < 6){
fullinformation=false;
}
// 5.16 deal with duplicated IDs by adding file ID to protein ID
string ida = fields[0];
string idb = fields[1];
fields[0] += " "; fields[0] += file_a;
fields[1] += " "; fields[1] += file_b;
// 5.16 do not point to yourself
if (!fields[0].compare(fields[1])) {continue;}
// A new protein
map<string,unsigned int>::iterator a_name_it=protein2id.find(fields[0]);
map<string,unsigned int>::iterator b_name_it=protein2id.find(fields[1]);
if (a_name_it == protein2id.end()) {
protein a;
protein2id[fields[0]] = protein_counter++;
graph.push_back(a);
a_name_it=protein2id.find(fields[0]);
}
if (b_name_it == protein2id.end()) {
protein b;
protein2id[fields[1]] = protein_counter++;
graph.push_back(b);
b_name_it=protein2id.find(fields[1]);
}
// Add link to graph (reciprocal)
unsigned int a_id = protein2id[fields[0]];
unsigned int b_id = protein2id[fields[1]];
unsigned short bitscore_avg=0;
float eval_avg=0;
if(fields.size() > 5){
float bit_a = string2float(fields[3]);
float bit_b = string2float(fields[5]);
bitscore_avg = (bit_a+bit_b)/2;
float eval_a = string2float(fields[2]);
float eval_b = string2float(fields[4]);
eval_avg = (eval_a+eval_b)/2;
}
// 5.17, add weight
wedge w;
w.edge=b_id;
graph[a_id].edges.push_back(w);
w.edge=a_id;
graph[b_id].edges.push_back(w);
edge2line[ordPair(a_id,b_id)].bitscore = bitscore_avg;
edge2line[ordPair(a_id,b_id)].eval = eval_avg;
if(fullinformation && fields.size() > 5){
edge2line[ordPair(a_id,b_id)].a_species_name_it=file_a_id;
edge2line[ordPair(a_id,b_id)].b_species_name_it=file_b_id;
edge2line[ordPair(a_id,b_id)].a_name_it = a_name_it;
edge2line[ordPair(a_id,b_id)].b_name_it = b_name_it;
//edge2line[ordPair(a_id,b_id)]=fields[0]+"\t"+fields[1]+"\t"+fields[2]+"\t"+fields[3]+"\t"+fields[4]+"\t"+fields[5];
}
edge_is_done.insert(ordPair(b_id,a_id));
}
}
graph_file.close();
}
else {
throw string("Could not open file " + file);
}
}
bool statonly=false;
ofstream out;
unsigned int n=0;
string last_species_header="";
string last_median_header="";
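// Iterative breadth-first search from cur_node: marks every reachable node in *done, counts new nodes
// in the global n, writes the edges of this connected component to 'out' (unless --statonly) and erases
// them from edge_is_done so the caller can report the number of edges per component.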
void BFS(vector<bool> * done, unsigned int cur_node ){
list<unsigned int> q;
q.push_back(cur_node);
(*done)[cur_node]=true;
while(q.size()>0){
list<unsigned int> q_new;
for(list<unsigned int>::iterator it = q.begin() ; it != q.end() ; ++it){
cur_node = *it;
for (unsigned int i = 0; i < graph[cur_node].edges.size(); i++) {
unsigned int adjacency_node = graph[cur_node].edges[i].edge;
if(adjacency_node > graph.size()){
cerr << string("[ERROR] : Input graph is invalid. The node is reporting an edge/adjacent node, that is not present in the graph.").c_str() << endl;throw;
}
if(adjacency_node > (*done).size()){
cerr << string("[ERROR] : Input graph is invalid. The node is not present in done vector.").c_str() << endl;throw;
}
if( !(*done)[adjacency_node] ){
n++;
(*done)[adjacency_node] = true;
q_new.push_back(adjacency_node);
}
if(edge_is_done.count( ordPair(cur_node,adjacency_node) ) ){
//cerr << edge_is_done[ordPair(cur_node,adjacency_node)] << endl;
if(!statonly){
lineinfo li = edge2line[ordPair(cur_node,adjacency_node)];
if(fullinformation){
if( printspecies ){
vector<string> fieldsA;
tokenize(li.a_name_it->first, fieldsA, " ");
vector<string> fieldsB;
tokenize(li.b_name_it->first, fieldsB, " ");
if ( last_species_header=="" || last_species_header != "# "+fieldsA[1] + "\t" + fieldsB[1] ){
out << "# "+fieldsA[1] + "\t" + fieldsB[1]<< endl;
last_species_header="# "+fieldsA[1] + "\t" + fieldsB[1];
if(speciesPair2Median.count(make_pair(li.a_species_name_it,li.b_species_name_it))){
out << speciesPair2Median[make_pair(li.a_species_name_it,li.b_species_name_it)]<< endl;
}
}else{
}
out << fieldsA[0] << "\t" << fieldsB[0] << "\t" ;
}else{
out << li.a_name_it->first << "\t" << li.b_name_it->first << "\t" ;
}
}else{
out << cur_node << "\t" << adjacency_node << "\t";
}
out << li.eval << "\t" << li.bitscore << "\t"<< li.eval << "\t" << li.bitscore << endl ;
}
unordered_set<pair<unsigned int, unsigned int >,pair_hash>::iterator iter = edge_is_done.find(ordPair(cur_node,adjacency_node) ) ;
if( iter != edge_is_done.end() )
edge_is_done.erase( iter ); // removes this edge from edge_is_done -> (for counting the number of edges in the CC)
}
}
}
q=q_new;
}
}
int main(int argc, char *argv[]) {
// check for an argument
if(argc < 2 || strcmp(argv[1],"-h")==0 || strcmp(argv[1],"--h")==0 || strcmp(argv[1],"-help")==0 || strcmp(argv[1],"--help")==0 || strcmp(argv[1],"help")==0 || strcmp(argv[1],"h")==0){ // string literals need strcmp, not pointer comparison
PRINTHELP:
cerr << endl << "Usage: " << argv[0] << " (options) BLASTGRAPH (BLASTGRAPH2,BLASTGRAPH3,...)" << endl << endl
<< "Detail: paritions the input blastgraph(s) into all the connected components and produces a file in the blast-graph style for each of them. The output files are named 'BLASTGRAPH_x.cc' where x is the current CC index. The output files are generated in the current directory (the files getting tar-ed to fight io overload). The size of the current CC is printed to STDOUT in the form n,m with n=number of nodes and m=number of edges."<< endl
<< " Example Output:" << endl
<< " | 1295527.5 Enterococcus_faecium 1158601.5 Enterococcus_malodoratus 0 187 0 187" << endl
<< " | 1295527.5 Enterococcus_faecium 1330525.4 Enterococcus_raffinosus 0 186 0 186" << endl
<< " | ..." << endl << " (1295527.5 is a protein of the species Enterococcus_faecium)" << endl << "Options:"<< endl
<< "-s , --statonly : print just the statistics in stdout (number of nodes, edges of each CC)" << endl
<< "-id , -idonly : each node (protein/gene) is replaced with an id (saves alot of memory)!" << endl
<< " | 0 1 0 187 0 187" << endl
<< " | 0 2 0 186 0 186" << endl
<< " | ..." << endl
<< " (the ID 0,1,... are unique for each protein+species)" << endl
<< "-notar : Do not tar the results!" << endl
<< "-species : print the species header separately (this will result in alot of comment lines)" << endl
<< " | # Enterococcus_faecium Enterococcus_malodoratus" << endl
<< " | 1295527.5 1158601.5 0 187 0 187" << endl
<< " | # Enterococcus_faecium Enterococcus_raffinosus" << endl
<< " | 1295527.5 1330525.4 0 186 0 186" << endl
<< " | ..." << endl;
return -1;
}
try{
// read in a text file that contains a real matrix stored in column major format
// but read it into row major format
string name="";
for(unsigned int argi=1;argi<argc;argi++){
if(strcmp(argv[argi],"-s")==0 || strcmp(argv[argi],"--statonly")==0 || strcmp(argv[argi],"--onlystat")==0 ){
statonly=true;
}else if(strcmp(argv[argi],"--idonly")==0 || strcmp(argv[argi],"--onlyid")==0 || strcmp(argv[argi],"-id")==0 ){
fullinformation=false;
}else if(strcmp(argv[argi],"--species")==0 || strcmp(argv[argi],"-species")==0 ){
printspecies=true;
}else if(strcmp(argv[argi],"--notar")==0 || strcmp(argv[argi],"-notar")==0 ){
notar=true;
}else{
if(argv[argi][0] == '-'){
cout << "[ERROR] : unknown option '" << argv[argi] << "'"<< endl;
goto PRINTHELP;
}
cerr << "[STDERR] Parsing "<< argv[argi] << endl;
name=argv[argi];
parse_file(argv[argi]);
}
}
cerr << "[STDERR] I know " << graph.size() << " proteins with " << edge_is_done.size() << " connections of " << species_counter << " species."<< endl;
std::size_t found = string(name).find_last_of("/");
name=string(name).substr(found+1);
species2id.clear();
cerr << "[STDERR] Parsing done ..." << endl;
unsigned int cur_CC_indx=0; // index of the current CC (e.g. 5 => all edge of the 5. CC have the index 5)
vector<bool> done = vector<bool> (graph.size(), false); // Keep track on what was done (for each node)
bool allNodesAreDone = false;
unsigned int last_protein_id=0;
unsigned int tictoc=0;
string cur_arg="";
unsigned int total_num_edges=edge_is_done.size();
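// Enumerate connected components: start a BFS from every not-yet-visited protein, write the component
// to NAME_<index>.cc, print its size to STDOUT as 'nodes,edges' and (unless -notar) move every 1000
// finished .cc files into NAME_CC.tar.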
while( !allNodesAreDone ){
allNodesAreDone=true;
for (unsigned int protein_id = last_protein_id ; protein_id < graph.size() ; protein_id++) {
//last_protein_id=protein_id;
if (done[protein_id]){continue;}// We were here already
done[protein_id]=true; // mark this node
cur_CC_indx++;
n=1;
stringstream ss;
ss<<name;
ss<<"_"<<cur_CC_indx;
ss<<".cc";
if(!statonly){
out.open(ss.str().c_str());
if(fullinformation){
out << "# file_a file_b"<< endl<< "# a b evalue_ab bitscore_ab evalue_ba bitscore_ba"<< endl;
}
}
size_t prev_s = edge_is_done.size();
BFS(&done,protein_id); // get the CC of the current node (protein_id);
last_species_header="";
size_t new_s = edge_is_done.size();
if(!statonly){
out.close();
cur_arg+=ss.str()+" ";
tictoc++;
if(tictoc==1000 && !notar){
cerr << "[STDERR] Tar-ing 1000 connected components. " << edge_is_done.size() << " edges left (=" << (1.0-(double)edge_is_done.size()/(double)total_num_edges)*100.0 << "\% done)"<< endl;
tictoc=0;
int i = system(("tar -rf "+name+"_CC.tar "+cur_arg).c_str());
i=system(("rm "+cur_arg).c_str());
cur_arg="";
}
}
cout << n << "," << (prev_s-new_s) << endl; //prev_s = the number of edges before running BFS (removes all edges of the given CC)
last_protein_id=protein_id;
allNodesAreDone=false;
}
}
if(!statonly && !notar){
std::size_t found = string(argv[1]).find_last_of("/");
cerr << "[STDERR] Tar-ing last connected components into "<< name << "_CC.tar. " << edge_is_done.size() << " edges left."<< endl;
int i = system(("tar -rf "+name+"_CC.tar "+cur_arg).c_str());
i=system(("rm "+cur_arg).c_str());
}
}catch(string& error) {
cerr << "[STDERR] [ERROR] catched " << error << endl;
goto PRINTHELP;
return EXIT_FAILURE;
}
}