[submodule "util/regression"]
path = util/regression
url = https://github.com/soedinglab/MMseqs2-Regression.git
......@@ -24,6 +24,7 @@ matrix:
- zlib1g-dev
- libbz2-dev
- vim-common
- shellcheck
env: CC=clang-3.6 CXX=clang++-3.6
- os: linux
dist: trusty
......@@ -41,6 +42,7 @@ matrix:
- zlib1g-dev
- libbz2-dev
- vim-common
- shellcheck
env: CC=clang-7 CXX=clang++-7
- os: linux
dist: trusty
......@@ -54,6 +56,7 @@ matrix:
- zlib1g-dev
- libbz2-dev
- vim-common
- shellcheck
env: CC=gcc-4.8 CXX=g++-4.8
- os: linux
dist: trusty
......@@ -69,6 +72,7 @@ matrix:
- zlib1g-dev
- libbz2-dev
- vim-common
- shellcheck
env: CC=gcc-8 CXX=g++-8
- os: linux
dist: trusty
......@@ -85,6 +89,7 @@ matrix:
- libbz2-dev
- vim-common
- libopenmpi-dev
- shellcheck
env: MPI=1 CC=gcc-8 CXX=g++-8
- os: osx
osx_image: xcode10.1
......@@ -96,6 +101,7 @@ matrix:
- gcc@8
- zlib
- bzip2
- shellcheck
env: CC=gcc-8 CXX=g++-8
allow_failures:
- env: QEMU_ARM=1
......@@ -116,7 +122,7 @@ script:
elif [[ "$TRAVIS_OS_NAME" == "linux" ]]; then \
if [[ -n "$MPI" ]]; then MPI=1; else MPI=0; fi; \
mkdir build; cd build; \
cmake -G Ninja -DENABLE_WERROR=1 -DHAVE_MPI="$MPI" -DHAVE_SSE4_1=1 .. \
cmake -G Ninja -DENABLE_WERROR=1 -DHAVE_MPI="$MPI" -DHAVE_SSE4_1=1 -DHAVE_TESTS=1 -DREQUIRE_OPENMP=0 .. \
|| exit 1; ninja || exit 1; \
elif [[ "$TRAVIS_OS_NAME" == "osx" ]]; then \
./util/build_osx.sh . build || exit 1; \
......@@ -124,18 +130,3 @@ script:
exit 1; \
fi
after_success:
- |
if [[ "$TRAVIS_PULL_REQUEST" != "false" ]] || [[ "$TRAVIS_OS_NAME" != "osx" ]]; then \
exit 0; \
fi; \
if [[ "$encrypted_4188a201d0b5_key" == "" ]] || [[ "$encrypted_4188a201d0b5_iv" == "" ]]; then \
exit 0; \
fi; \
openssl aes-256-cbc -K "$encrypted_4188a201d0b5_key" -iv "$encrypted_4188a201d0b5_iv" -in ./util/.travis.enc -out "$HOME/.ssh/id_rsa" -d; \
chmod 400 "$HOME/.ssh/id_rsa"; \
ssh -o StrictHostKeyChecking=no codeship@uniclust.mmseqs.com \
"mkdir -p /home/mirdita/repositories/mmseqs-webserver/archive/${TRAVIS_COMMIT}"; \
cd build; \
scp -o StrictHostKeyChecking=no mmseqs-osx-static_sse41.tar.gz mmseqs-osx-static_avx2.tar.gz \
codeship@uniclust.mmseqs.com:/home/mirdita/repositories/mmseqs-webserver/archive/${TRAVIS_COMMIT};
......@@ -6,6 +6,7 @@ set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
set(FRAMEWORK_ONLY 0 CACHE BOOL "Framework Mode")
set(HAVE_SANITIZER 0 CACHE BOOL "Have Sanitizers")
set(INSTALL_UTIL 1 CACHE BOOL "Install util scripts")
set(VERSION_OVERRIDE "" CACHE STRING "Override version string in help and usage messages")
#Sanitizers
......@@ -21,7 +22,7 @@ if (NOT CMAKE_BUILD_TYPE)
endif (NOT CMAKE_BUILD_TYPE)
# find compiler
if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
# using Clang
message("-- Compiler is clang(++)")
set(CMAKE_COMPILER_IS_CLANG 1)
......@@ -58,26 +59,31 @@ if (APPLE)
set(MMSEQS_CXX_FLAGS "${MMSEQS_CXX_FLAGS} -D__APPLE_API_STRICT_CONFORMANCE")
endif ()
if(CMAKE_SYSTEM_NAME STREQUAL "FreeBSD")
set(MMSEQS_CXX_FLAGS "${MMSEQS_CXX_FLAGS} -D_WITH_GETLINE")
endif()
# zstd
SET(ZSTD_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/lib/zstd")
SET(CMAKE_INSTALL_LIBDIR bin)
# We use ZSTD_findDecompressedSize which is only available with ZSTD_STATIC_LINKING_ONLY
# Thus we cannot use a system provided libzstd
set(ZSTD_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/lib/zstd")
set(CMAKE_INSTALL_LIBDIR bin)
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/lib/zstd/build/cmake/CMakeModules")
OPTION(ZSTD_LEGACY_SUPPORT "LEGACY SUPPORT" OFF)
OPTION(ZSTD_BUILD_STATIC "BUILD STATIC LIBRARIES" ON)
OPTION(ZSTD_BUILD_SHARED "BUILD SHARED LIBRARIES" OFF)
OPTION(ZSTD_MULTITHREAD_SUPPORT "MULTITHREADING SUPPORT" OFF)
OPTION(ZSTD_BUILD_PROGRAMS "BUILD PROGRAMS" OFF)
OPTION(ZSTD_BUILD_CONTRIB "BUILD CONTRIB" OFF)
OPTION(ZSTD_BUILD_TESTS "BUILD TESTS" OFF)
option(ZSTD_LEGACY_SUPPORT "LEGACY SUPPORT" OFF)
option(ZSTD_BUILD_STATIC "BUILD STATIC LIBRARIES" ON)
option(ZSTD_BUILD_SHARED "BUILD SHARED LIBRARIES" OFF)
option(ZSTD_MULTITHREAD_SUPPORT "MULTITHREADING SUPPORT" OFF)
option(ZSTD_BUILD_PROGRAMS "BUILD PROGRAMS" OFF)
option(ZSTD_BUILD_CONTRIB "BUILD CONTRIB" OFF)
option(ZSTD_BUILD_TESTS "BUILD TESTS" OFF)
include_directories(lib/zstd/lib)
add_subdirectory(lib/zstd/build/cmake/lib EXCLUDE_FROM_ALL)
# tinyexpr
OPTION(TE_NAT_LOG "Define the log function as natural logarithm." ON)
include_directories(lib/tinyexpr)
add_subdirectory(lib/tinyexpr EXCLUDE_FROM_ALL)
include_directories(lib)
include_directories(lib/kseq)
include_directories(lib/simd)
include_directories(lib/gzstream)
include_directories(lib/alp)
......@@ -88,6 +94,6 @@ add_subdirectory(lib/alp)
add_subdirectory(lib/ksw2)
add_subdirectory(data)
add_subdirectory(src)
if (NOT FRAMEWORK_ONLY)
if (NOT FRAMEWORK_ONLY AND INSTALL_UTIL)
add_subdirectory(util)
endif ()
# MMseqs2: ultra fast and sensitive protein search and clustering suite
MMseqs2 (Many-against-Many sequence searching) is a software suite to search and cluster huge proteins/nucleotide sequence sets. MMseqs2 is open source GPL-licensed software implemented in C++ for Linux, MacOS, and (as beta version, via cygwin) Windows. The software is designed to run on multiple cores and servers and exhibits very good scalability. MMseqs2 can run 10000 times faster than BLAST. At 100 times its speed it achieves almost the same sensitivity. It can perform profile searches with the same sensitivity as PSI-BLAST at over 400 times its speed.
MMseqs2 (Many-against-Many sequence searching) is a software suite to search and cluster huge protein and nucleotide sequence sets. MMseqs2 is open source GPL-licensed software implemented in C++ for Linux, MacOS, and (as beta version, via cygwin) Windows. The software is designed to run on multiple cores and servers and exhibits very good scalability. MMseqs2 can run 10000 times faster than BLAST. At 100 times its speed it achieves almost the same sensitivity. It can perform profile searches with the same sensitivity as PSI-BLAST at over 400 times its speed.
## Publications
......@@ -7,42 +7,36 @@ MMseqs2 (Many-against-Many sequence searching) is a software suite to search and
[Steinegger M and Soeding J. Clustering huge protein sequence sets in linear time. Nature Communications, doi: 10.1038/s41467-018-04964-5 (2018)](https://www.nature.com/articles/s41467-018-04964-5).
[Mirdita M, Steinegger M and Soeding J. MMseqs2 desktop and local web server app for fast, interactive sequence searches. Bioinformatics, doi: 10.1093/bioinformatics/bty1057 (2019)](https://academic.oup.com/bioinformatics/article/35/16/2856/5280135)
[![BioConda Install](https://img.shields.io/conda/dn/bioconda/mmseqs2.svg?style=flag&label=BioConda%20install)](https://anaconda.org/bioconda/mmseqs2)
[![Github All Releases](https://img.shields.io/github/downloads/soedinglab/mmseqs2/total.svg)](https://github.com/soedinglab/mmseqs2/releases/latest)
[![Docker Pulls](https://img.shields.io/docker/pulls/soedinglab/mmseqs2.svg)](https://hub.docker.com/r/soedinglab/mmseqs2)
[![Build Status](https://dev.azure.com/themartinsteinegger/mmseqs2/_apis/build/status/soedinglab.MMseqs2?branchName=master)](https://dev.azure.com/themartinsteinegger/mmseqs2/_build/latest?definitionId=2&branchName=master)
![AppVeyor CI](https://ci.appveyor.com/api/projects/status/lq8nxeb0j8v38d1a?svg=true)
[![Travis CI](https://travis-ci.org/soedinglab/MMseqs2.svg?branch=master)](https://travis-ci.org/soedinglab/MMseqs2)
[![Zenodo DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.840208.svg)](https://zenodo.org/record/1718312)
<p align="center"><img src="https://raw.githubusercontent.com/soedinglab/mmseqs2/master/.github/mmseqs2_logo.png" height="256" /></p>
## Documentation
## Documentation
The MMseqs2 user guide is available in our [GitHub Wiki](https://github.com/soedinglab/mmseqs2/wiki) or as a [PDF file](https://mmseqs.com/latest/userguide.pdf) (Thanks to [pandoc](https://github.com/jgm/pandoc)!). We provide a tutorial of MMseqs2 [here](https://github.com/soedinglab/metaG-ECCB18-partII).
## News
Keep posted about MMseqs2/Linclust updates by following Martin on [Twitter](https://twitter.com/thesteinegger).
08/10/2018 ECCB18 tutorial of MMseqs2 is available [here](https://github.com/soedinglab/metaG-ECCB18-partII).
07/07/2018 Linclust has just been published at [Nature Communications](https://www.nature.com/articles/s41467-018-04964-5).
17/10/2017 MMseqs2 has just been published at [Nature Biotechnology](https://www.nature.com/nbt/journal/vaop/ncurrent/full/nbt.3988.html).
## Installation
MMseqs2 can be used by compiling from source, downloading a statically compiled version, using [Homebrew](https://github.com/Homebrew/brew), [conda](https://github.com/conda/conda) or [Docker](https://github.com/moby/moby). MMseqs2 requires a 64-bit system (check with `uname -a | grep x86_64`) with at least the SSE4.1 instruction set (check by executing `cat /proc/cpuinfo | grep sse4_1` on Linux or `sysctl -a | grep machdep.cpu.features | grep SSE4.1` on MacOS).
# install by brew
brew install mmseqs2
# install via conda
conda install -c bioconda mmseqs2
conda install -c bioconda mmseqs2
# install docker
docker pull soedinglab/mmseqs2
# static build sse4.1
wget https://mmseqs.com/latest/mmseqs-static_sse41.tar.gz; tar xvfz mmseqs-static_sse41.tar.gz; export PATH=$(pwd)/mmseqs2/bin/:$PATH
# static build AVX2
wget https://mmseqs.com/latest/mmseqs-static_avx2.tar.gz; tar xvfz mmseqs-static_avx2.tar.gz; export PATH=$(pwd)/mmseqs2/bin/:$PATH
# static build with SSE4.1
wget https://mmseqs.com/latest/mmseqs-linux-sse41.tar.gz; tar xvfz mmseqs-linux-sse41.tar.gz; export PATH=$(pwd)/mmseqs/bin/:$PATH
# static build with AVX2
wget https://mmseqs.com/latest/mmseqs-linux-avx2.tar.gz; tar xvfz mmseqs-linux-avx2.tar.gz; export PATH=$(pwd)/mmseqs/bin/:$PATH
The AVX2 version is faster than the SSE4.1 version. Check if AVX2 is supported by executing `cat /proc/cpuinfo | grep avx2` on Linux or `sysctl -a | grep machdep.cpu.leaf7_features | grep AVX2` on MacOS.
We also provide static binaries for MacOS and Windows at [mmseqs.com/latest](https://mmseqs.com/latest).
......@@ -55,73 +49,72 @@ MMseqs2 comes with a bash command and parameter auto completion, which can be ac
fi
</pre>
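A minimal sketch of what such an activation snippet in `~/.bashrc` could look like (the install path is a placeholder; adjust it to wherever `util/bash-completion.sh` was installed):
<pre>
if [ -f /path/to/mmseqs/util/bash-completion.sh ]; then
    . /path/to/mmseqs/util/bash-completion.sh
fi
</pre>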
### Compile from source
Compiling MMseqs2 from source has the advantage that it will be optimized to the specific system, which should improve its performance. To compile MMseqs2 `git`, `g++` (4.6 or higher) and `cmake` (3.0 or higher) are needed. Afterwards, the MMseqs2 binary will be located in the `build/bin/` directory.
### Compilation from source
Compiling MMseqs2 from source has the advantage that it will be optimized to the specific system, which should improve its performance. To compile MMseqs2, `git`, `g++` (4.8 or later) and `cmake` (2.8.12 or later) are needed. Afterwards, the MMseqs2 binary will be located in the `build/bin/` directory.
git clone https://github.com/soedinglab/MMseqs2.git
cd MMseqs2
mkdir build
cd build
cmake -DCMAKE_BUILD_TYPE=RELEASE -DCMAKE_INSTALL_PREFIX=. ..
make
make install
make -j 4
make install
export PATH=$(pwd)/bin/:$PATH
:exclamation: To compile MMseqs2 on MacOS, first install the `gcc` compiler from Homebrew. The default MacOS `clang` compiler does not support OpenMP and MMseqs2 will only be able to use a single thread. Then use the following cmake call:
:exclamation: To compile MMseqs2 on MacOS, first install the `gcc` compiler from Homebrew. The default MacOS `clang` compiler does not support OpenMP and MMseqs2 will only be able to use a single thread. Then use the following `cmake` call:
CXX="$(brew --prefix)/bin/g++-8" cmake -DCMAKE_BUILD_TYPE=RELEASE -DCMAKE_INSTALL_PREFIX=. ..
CC="$(brew --prefix)/bin/gcc-9" CXX="$(brew --prefix)/bin/g++-9" cmake -DCMAKE_BUILD_TYPE=RELEASE -DCMAKE_INSTALL_PREFIX=. ..
## Easy workflows
We provide `easy` workflows to search and cluster. The `easy-search` workflow searches directly with a FASTA/FASTQ file against either another FASTA/FASTQ file or an already existing MMseqs2 target database.
mmseqs createdb examples/DB.fasta targetDB
mmseqs easy-search examples/QUERY.fasta targetDB alnRes tmp
## Getting started
We provide `easy` workflows to cluster, search and assign taxonomy. These `easy` workflows are a shorthand to deal directly with FASTA/FASTQ files as input and output. MMseqs2 provides many modules to transform, filter, execute external programs and search. However, these modules use the MMseqs2 database formats, instead of the FASTA/FASTQ format. For maximum flexibility, we recommend using MMseqs2 workflows and modules directly. Please read more about this in the [documentation](https://github.com/soedinglab/mmseqs2/wiki).
### Cluster
For clustering, MMseqs2 `easy-cluster` and `easy-linclust` are available.
`easy-cluster` by default clusters the entries of a FASTA/FASTQ file using a cascaded clustering algorithm.
mmseqs easy-cluster examples/DB.fasta clusterRes tmp
mmseqs easy-cluster examples/DB.fasta clusterRes tmp --min-seq-id 0.5 -c 0.8 --cov-mode 1
`easy-linclust` clusters the entries of a FASTA/FASTQ file. The runtime scales linearly with input size. This mode is recommended for huge datasets.
mmseqs easy-linclust examples/DB.fasta clusterRes tmp
These `easy` workflows are a shorthand to deal directly with FASTA/FASTQ files as input and output. MMseqs2 provides many modules to transform, filter, execute external programs and search. However, these modules use the MMseqs2 database formats, instead of the FASTA/FASTQ format. For optimal efficiency, we recommend using MMseqs2 workflows and modules directly.
## How to search
You can use the query database "QUERY.fasta" and target database "DB.fasta" in the examples folder to test the search workflow. First, you need to convert the FASTA files into the MMseqs2 database format.
Sequence identity is by default [estimated](https://github.com/soedinglab/MMseqs2/wiki#how-does-mmseqs2-compute-the-sequence-identity); to output the real sequence identity, use `--alignment-mode 3`.
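For example, to report the real sequence identity for the clustering shown above (a minimal sketch; `--alignment-mode 3` is the only addition to the earlier call):
mmseqs easy-cluster examples/DB.fasta clusterRes tmp --alignment-mode 3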
Read more about the [clustering format](https://github.com/soedinglab/mmseqs2/wiki#clustering-format) in our user guide.
Please adjust the [clustering criteria](https://github.com/soedinglab/MMseqs2/wiki#clustering-criteria) and check whether the temporary directory provides enough free space. For disk space requirements, see the user guide.
mmseqs createdb examples/QUERY.fasta queryDB
mmseqs createdb examples/DB.fasta targetDB
### Search
The `easy-search` workflow searches directly with a FASTA/FASTQ file against either another FASTA/FASTQ file or an already existing MMseqs2 database.
If the target database will be used several times, we recommend precomputing an index of `targetDB`, as this saves overhead computations. The index should be created on a computer that has at least the same amount of memory as the computer that performs the search.
mmseqs easy-search examples/QUERY.fasta DB.fasta alnRes tmp
It is also possible to pre-compute the index for the target database:
mmseqs createdb examples/DB.fasta targetDB
mmseqs createindex targetDB tmp
mmseqs easy-search examples/QUERY.fasta targetDB alnRes tmp
The speed and sensitivity of the `search` can be adjusted with the `-s` parameter and should be adapted based on your use case (see [setting sensitivity -s parameter](https://github.com/soedinglab/mmseqs2/wiki#set-sensitivity--s-parameter)). A very fast search would use a sensitivity of `-s 1.0`, while a very sensitive search would use a sensitivity of up to `-s 7.0`. A detailed guide on how to speed up searches is available [here](https://github.com/soedinglab/MMseqs2/wiki#how-to-control-the-speed-of-the-search).
MMseqs2 stores intermediate results in `tmp`. Using a fast local drive can reduce load on a shared filesystem and increase speed.
To run the search, execute:
mmseqs search queryDB targetDB resultDB tmp
The sensitivity of the `search` can be adjusted with the `-s` parameter and should be adapted based on your use case (see [setting sensitivity -s parameter](https://github.com/soedinglab/mmseqs2/wiki#set-sensitivity--s-parameter)).
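For example, using the query and target databases created above (the sensitivity values are the extremes mentioned above):
mmseqs search queryDB targetDB resultDB tmp -s 1.0   # very fast, less sensitive
mmseqs search queryDB targetDB resultDB tmp -s 7.0   # very sensitive, slower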
If you require the exact alignment information (sequence identity, alignment string, ...) in later steps, add the option `-a`; without this parameter, MMseqs2 automatically decides whether it needs to compute the exact alignment, to save computation time.
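For example, to keep the full alignment information for later conversion with `convertalis`:
mmseqs search queryDB targetDB resultDB tmp -a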
Please ensure that, in case of large input databases, the `tmp` directory provides enough free space.
Our user guide provides more information about [disk space requirements](https://github.com/soedinglab/mmseqs2/wiki#prefiltering-module).
The output can be customized with the `--format-output` option, e.g. `--format-output "query,target,qaln,taln"` returns the query and target accession and the pairwise alignments in tab-separated format. You can choose many different [output columns](https://github.com/soedinglab/mmseqs2/wiki#custom-alignment-format-with-convertalis).
Then convert the result database into a BLAST-tab formatted database (format: qId, tId, seqIdentity, alnLen, mismatchCnt, gapOpenCnt, qStart, qEnd, tStart, tEnd, eVal, bitScore).
### Taxonomy
The `easy-taxonomy` workflow can be used to assign taxonomic labels to sequences. It performs a search against a target sequence database and computes the lowest common ancestor of all equal-scoring top hits (default). Other assignment options are available through `--lca-mode`.
mmseqs convertalis queryDB targetDB resultDB resultDB.m8
mmseqs createdb examples/DB.fasta targetDB
mmseqs createtaxdb targetDB tmp
mmseqs createindex targetDB tmp
mmseqs easy-taxonomy examples/QUERY.fasta targetDB alnRes tmp
The output can be customized with the `--format-output` option, e.g. `--format-output "query,target,qaln,taln"` returns the query and target accession and the pairwise alignments in tab-separated format. You can choose many different [output columns](https://github.com/soedinglab/mmseqs2/wiki#custom-alignment-format-with-convertalis) in the `convertalis` module. Make sure that you used the option `-a` during the search (`mmseqs search ... -a`).
By default, `createtaxdb` assigns every sequence with a UniProt accession a taxonomic identifier and downloads the NCBI taxonomy. We also support [BLAST](https://github.com/soedinglab/MMseqs2/wiki#create-a-sequence-database-with-taxonomic-information-from-an-existing-blast-database), [SILVA](https://github.com/soedinglab/MMseqs2/wiki#create-a-sequence-database-with-taxonomic-information-for-silva) or [custom taxonomic](https://github.com/soedinglab/MMseqs2/wiki#manually-annotate-a-sequence-database-with-taxonomic-information) databases.
mmseqs convertalis queryDB targetDB resultDB resultDB.pair --format-output "query,target,qaln,taln"
Read more about the [taxonomy format](https://github.com/soedinglab/MMseqs2/wiki#taxonomy-format) and the [classification](https://github.com/soedinglab/MMseqs2/wiki#taxonomy-assignment-using-mmseqs-taxonomy) in our user guide.
### Other search modes
### Supported search modes
MMseqs2 provides many additional search modes:
* Iterative sequence-profile searches (like PSI-BLAST) with the `--num-iterations` parameter (see the sketch after this list)
......@@ -129,58 +122,25 @@ MMseqs2 provides many additional search modes:
* [Iterative increasing sensitivity searches](https://github.com/soedinglab/MMseqs2/wiki#how-to-find-the-best-hit-the-fastest-way) to find only the best hits faster
* [Taxonomic assignment](https://github.com/soedinglab/MMseqs2/wiki#taxonomy-assignment-using-mmseqs-taxonomy) using 2bLCA or LCA
* Fast ungapped alignment searches to find [very similar sequence matches](https://github.com/soedinglab/MMseqs2/wiki#mapping-very-similar-sequences-using-mmseqs-map)
* Very fast and sensitive Searches against [profile databases such as the PFAM](https://github.com/soedinglab/MMseqs2/wiki#how-to-create-a-target-profile-database-from-pfam)
* Very fast and sensitive searches against [profile databases such as the PFAM](https://github.com/soedinglab/MMseqs2/wiki#how-to-create-a-target-profile-database-from-pfam)
* [Reciprocal best hits search](https://github.com/soedinglab/MMseqs2/wiki#reciprocal-best-hit-using-mmseqs-rbh)
* [Web search API and user interface](https://github.com/soedinglab/MMseqs2-App)
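A minimal sketch of the iterative profile search mentioned above (the number of iterations is illustrative; the databases are the ones created in the search tutorial):
mmseqs search queryDB targetDB resultDB tmp --num-iterations 3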
Many modes can also be combined. You can, for example, do a translated nucleotide against protein profile search.
## How to cluster
Before clustering, convert your database into the MMseqs2 database format:
mmseqs createdb examples/DB.fasta DB
Then execute the clustering:
mmseqs cluster DB clu tmp
or linear-time clustering (faster but less sensitive):
mmseqs linclust DB clu tmp
Please adjust the [clustering criteria](https://github.com/soedinglab/MMseqs2/wiki#clustering-criteria) and check whether the temporary directory provides enough free space. For disk space requirements, see the user guide.
To generate a FASTA-style formatted output file from the ffindex output file, type:
mmseqs createseqfiledb DB clu clu_seq
mmseqs result2flat DB DB clu_seq clu_seq.fasta
To generate a TSV-style formatted output file from the ffindex output file, type:
mmseqs createtsv DB DB clu clu.tsv
To extract the representative sequences from the clustering result call:
mmseqs result2repseq DB clu DB_clu_rep
mmseqs result2flat DB DB DB_clu_rep DB_clu_rep.fasta --use-fasta-header
Read more about the format [here](https://github.com/soedinglab/mmseqs2/wiki#clustering-format).
### Memory Requirements
MMseqs2 checks the available memory of the computer and automatically divides the target database into parts that fit into memory. Splitting the database will increase the runtime slightly.
The memory consumption grows linearly with the number of residues in the database. The following formula can be used to estimate the index size.
M = (7 × N × L) byte + (8 × a^k) byte
Where `L` is the average sequence length, `N` is the number of sequences in the database, `a` is the alphabet size and `k` is the k-mer size.
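For example, for a database of N = 1,000,000 protein sequences with an average length of L = 350, an alphabet size of a = 21 and a k-mer size of k = 7 (the last two values are assumptions for illustration), this gives roughly:
M = (7 × 1,000,000 × 350) byte + (8 × 21^7) byte ≈ 2.5 GB + 14.4 GB ≈ 17 GB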
The minimum memory requirement of MMseqs2 for `cluster` or `linclust` is 1 byte per sequence residue; `search` needs 1 byte per target residue. Sequence databases can be compressed using the `--compress` flag; DNA sequences can be reduced in size by a factor of `~3.5` and proteins by `~1.7`.
MMseqs2 checks the available system memory and automatically divides the target database in parts that fit into memory. Splitting the database will increase the runtime slightly. It is possible to control the memory usage using `--split-memory-limit`.
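For example, to cap the amount of the target database held in memory at roughly 30 GB (the value is illustrative):
mmseqs search queryDB targetDB resultDB tmp --split-memory-limit 30G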
### How to run MMseqs2 on multiple servers using MPI
MMseqs2 can run on multiple cores and servers using OpenMP and Message Passing Interface (MPI).
MPI assigns database splits to each compute node, which are then computed with multiple cores (OpenMP).
Make sure that MMseqs2 was compiled with MPI by using the `-DHAVE_MPI=1` flag (`cmake -DHAVE_MPI=1 -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=. ..`). Our precompiled static version of MMseqs2 can not use MPI. The version string of MMseqs2 will have a `-MPI` suffix, if it was build successfully with MPI support.
Make sure that MMseqs2 was compiled with MPI by using the `-DHAVE_MPI=1` flag (`cmake -DHAVE_MPI=1 -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=. ..`). Our precompiled static version of MMseqs2 cannot use MPI. The version string of MMseqs2 will have a `-MPI` suffix, if it was built successfully with MPI support.
To search with multiple servers, call the `search` or `cluster` workflow with the MPI command exported in the RUNNER environment variable. The databases and temporary folder have to be shared between all nodes (e.g. through NFS):
RUNNER="mpirun -pernode -np 42" mmseqs search queryDB targetDB resultDB tmp
To search with multiple servers call the `search` or `cluster` workflow with the MPI command exported in the RUNNER environment variable. The databases and temporary folder have to be shared between all nodes (e.g. through NFS):
RUNNER="mpirun -np 42" mmseqs search queryDB targetDB resultDB tmp
build: off
init:
- git config --global core.autocrlf input
environment:
CYG_MIRROR: http://cygwin.mirror.constant.com
CYG_PACKAGES: bash,xxd,cmake,make,gcc-core,gcc-g++,zlib-devel,libbz2-devel,busybox-standalone,git,binutils
CYG_ROOT: C:\cygwin64
CYG_CACHE: C:\cygwin64\var\cache\setup
CYG_SETUP: C:\cygwin64\setup-x86_64.exe
BASH: C:\cygwin64\bin\bash
PKEY:
secure: qcbqWes9F8y+mankxX5zRTMBN6Q6WD5x7CZKP2bIe+D1LrK/kDE9OSAs8515pR1+hI365q/3o0nANRtckJOfz77/SHEM1pr/Tvkdz0Z0J6VI+RaDeMDicNBTS9dFP60jXnZQM6ihcARxeS7KKGuO3SNf26GwzwkFcu3gdr9eAoBqHXMx0mj9Pa/Zee04/FuAeaeGUgjsjf/PtPtqbj4uf/v0mpYnZsq6h9UsACruue+i0mdkYKvwrJpEPFLnXM5JYmMx7rLobFXRsUW+zKwhLPURwHlPvzosod4Bs0Tnoi1ia/8Up+EmutriEmt3mRXbvca2sjzRTs+3Grdtw38kRalCY6A047QIp3scZgyhOOErGjQooI29pStOLquZnepYTO2EmUD0UXdNLveTuUed7uB6fl+IHfyJC4DF1FD//gOArYyz0BjZdvjfcNCqjXLj49nDrtr4DMiOsMbtNuvpYEATyF7hXPt2M62s+3ccj8/DwTscCnfjQGo/CeRvM9QqOBUzDgDypjr3yrXBcS82LO167gojdI1epY3dndxG0js6Uc2viPbbbLpnl0/4ZKnPoX9ZSWksLAWnY8v5DDWt1rDHirv+GLbTXs9fKlmduY0b1a49ntyA+L28q2qWUZz9WIMHJtoh8r3BDFg+Qqyk3m16MywlUpXOTTB4GyMlQys0R8DH8FYZL6cUttSRSiS94X8XFYDH/6yJC0P+TK9GgWH78Ca+I/6HkUo5oNb38i9Hw7zXu6cvvfTbNi1eTSSvXpvwjQBM9PU3NFrH3gqgAdGUi7kVoABc6mAWGl9vfpDbAHHBVBenmCQU84GcltgawE4W53Ayi86ImgRC0IPmmC2Zli+4zcIQGNqf1C4Y2/R8qWvVK84ehJe/HlY02h3UYBAk6R3Pr9bEm1V+Riivo8S9bawRp3i8Ljcr3AF9UIP2xY235MWvrQB7aCwxNiSrxVzesM7OlKb+NnEMZJCbdIuY7WIgKeWeXg9f1sn8yZ89TY+Q0oNWfmPFVcsQuagXpfQEP8EGJm5WiMpd0gukFkFDsJvuI9kbLll+Kwoz9YHH2eLaYhMp/ZXVNsvRG8Jb0D2TlggnInBsBUVpqsRJfEwGXHithCleXYj4i9TmSDPweGgrAqhdJYLbZWnL94ndxhWAE2sN5IYEJBavAOG5FDf9z/zmBXhHpJKnnqLT0ahnpf7P7+fKduEhh1iNXV/oweuE3RDvddLDmP0kfSSg2uIfiarEuQx8lghDKkgl37Bow+KMYFxWB188G3Z8JjQpxLOV0fZ8eENx1zpF1HplhvxtE9dhbJ8wGzpCS1Mp9RFzkTUMdZue/uJgNGjQ8/nPq1zBztxQPX0q4+AsOjBBTH0c2Cnaj7DTL7WteOTSBimbdvh3yNKj98e3Idnum0mwTIzUqMYIBubNl62kL9vG6NR2xMcuqRXnRHcvCKo38CoM3iuL3e0KoTwozTnzVdKFmiRn4W8GKzFHDpKZLMU+/zzsuy93+27jB0T585EAfZdO5oxu4Ki5LuDR9R5t40ZOSBMuA/ccVdMs0pmidhuQrbtQoPC2JNboZEYrHaYXEk132bQZevn08quq3gxoV95mkg4SggDdET/4Sj3vhOaHWKg+ZE/49OzC12ZF0kwRkwB7yayOSVfYxF2tTvhFqGXfo9Zbc0CPqZw4m3qgogcgtTwtwIAIhjXU2ur22+l/S74MjaiK9mBzGksNFCGmgzJgEx31iZqlNFQCbPUlfUBW57RVqsSwOCaeTxAHJ2Jozb9lmgKzPcFCz/NvFCIfaRmGJRFF7kbRek2Fpiwtt80ZXIrumRNOo+/fQMZ7THSvK64DJLWafqYjVlt991VKh9gw1uK68MSuPoLpwTuiTg1QAPPV9xqA246W5+OlgWuTVZGGmVyuUYkF1r2CLkdNY8NjmJ/yJm35XsX8rdU0pjK6LTfHSuAE1XcJtB+0vR2IHsUNoa7Dxs+E2SMMXhJfbLD9MOOOzY86PVc9/iboJuBI02485LBLamH4J9EOmEFxSLMYHVwMvyrurDA27x3ni1HHV7KZl/hpFjm6bxtEMvY4IglOoa6T28qED5DM4ACI+AtivGMPJ8BTGZaKQzgNVghvCIkH/fKX2qTCpApRK6+qI48gvKutSmB5mDFeOv8BQgU=
cache:
- C:\cygwin64\var\cache\setup
install:
- if defined CYG_ROOT (%CYG_SETUP% --quiet-mode --no-shortcuts --only-site --root "%CYG_ROOT%" --site "%CYG_MIRROR%" --local-package-dir "%CYG_CACHE%" --packages "%CYG_PACKAGES%" --upgrade-also)
build_script:
- if defined BASH (%BASH% -lc "${APPVEYOR_BUILD_FOLDER}/util/build_windows.sh $(cygpath ${APPVEYOR_BUILD_FOLDER}) $(cygpath ${APPVEYOR_BUILD_FOLDER})/build")
- cd %APPVEYOR_BUILD_FOLDER%\build && 7z a %APPVEYOR_BUILD_FOLDER%\mmseqs-win64.zip mmseqs
on_success:
- ps: >-
if (Test-Path "C:\Users\appveyor\.ssh\id_rsa") {
Remove-Item "C:\Users\appveyor\.ssh\id_rsa"
}
if(($env:appveyor_repo_branch -eq 'master') -and (-not (Test-Path env:APPVEYOR_PULL_REQUEST_NUMBER))) {
$fileContent = "-----BEGIN RSA PRIVATE KEY-----`n"
$fileContent += $env:PKEY.Replace(' ', "`n")
$fileContent += "`n-----END RSA PRIVATE KEY-----`n"
Set-Content "C:\Users\appveyor\.ssh\id_rsa" $fileContent
}
- if exist C:\Users\appveyor\.ssh\id_rsa ( ssh -o StrictHostKeyChecking=no codeship@uniclust.mmseqs.com "mkdir -p /home/mirdita/repositories/mmseqs-webserver/archive/%APPVEYOR_REPO_COMMIT%" )
- if exist C:\Users\appveyor\.ssh\id_rsa ( cd %APPVEYOR_BUILD_FOLDER% && scp -o StrictHostKeyChecking=no mmseqs-win64.zip codeship@uniclust.mmseqs.com:/home/mirdita/repositories/mmseqs-webserver/archive/%APPVEYOR_REPO_COMMIT% )
# Starter pipeline
# Start with a minimal pipeline that you can customize to build and deploy your code.
# Add steps that build, run tests, deploy, and more:
# https://aka.ms/yaml
trigger:
- master
- master
pool:
vmImage: 'Ubuntu-16.04'
strategy:
matrix:
avx2:
SIMD: 'AVX2'
FILENAME: 'mmseqs-static_avx2.tar.gz'
STATIC: 1
MPI: 0
sse:
SIMD: 'SSE4.1'
FILENAME: 'mmseqs-static_sse41.tar.gz'
STATIC: 1
MPI: 0
avx2_mpi:
SIMD: 'AVX2'
STATIC: 0
FILENAME: ''
MPI: 1
variables:
regression: 1
steps:
- script: |
sudo apt-get update
sudo apt-get -y install pandoc mpi-default-dev mpi-default-bin texlive-latex-recommended texlive-fonts-extra
displayName: 'Install dependencies'
jobs:
- job: build_ubuntu_1604_userguide
displayName: Ubuntu 1604 Userguide
pool:
vmImage: 'Ubuntu-16.04'
steps:
- checkout: "none"
- script: |
sudo apt-get update
sudo apt-get -y install pandoc texlive-latex-recommended texlive-fonts-extra
displayName: Install Dependencies
- script: |
cd ${SYSTEM_DEFAULTWORKINGDIRECTORY}
git clone https://github.com/soedinglab/MMseqs2.wiki.git .
.pandoc/make-pdf.sh
displayName: Build Userguide
- task: PublishPipelineArtifact@0
inputs:
targetPath: $(System.DefaultWorkingDirectory)/userguide.pdf
artifactName: userguide
- script: |
mkdir build
cd build
if [ "${STATIC}" -eq "1" ]; then
cmake -DCMAKE_BUILD_TYPE=RelWithDebInfo -DBUILD_SHARED_LIBS=OFF \
-DCMAKE_EXE_LINKER_FLAGS="-static -static-libgcc \
-static-libstdc++" -DCMAKE_FIND_LIBRARY_SUFFIXES=".a" \
-DENABLE_WERROR=1 -DHAVE_${SIMD}=1 -DHAVE_MPI=${MPI} ..
else
cmake -DCMAKE_BUILD_TYPE=RelWithDebInfo \
-DENABLE_WERROR=1 -DHAVE_${SIMD}=1 -DHAVE_MPI=${MPI} ..
fi
- job: build_macos_1014
displayName: macOS 1014
pool:
vmImage: 'macOS-10.14'
steps:
- checkout: self
submodules: true
- script: |
brew install cmake gcc@9 zlib bzip2 coreutils
displayName: Install Dependencies
- script: |
cd ${BUILD_SOURCESDIRECTORY}
CC=gcc-9 CXX=g++-9 ./util/build_osx.sh . build
displayName: Build MMseqs2
- script: |
${BUILD_SOURCESDIRECTORY}/util/regression/run_regression.sh ${BUILD_SOURCESDIRECTORY}/build/build_sse41/src/mmseqs ${BUILD_SOURCESDIRECTORY}/regression
displayName: Run Regression Suite
condition: eq(variables['regression'], 1)
- task: PublishPipelineArtifact@0
inputs:
targetPath: $(Build.SourcesDirectory)/build/build_sse41/src/mmseqs
artifactName: mmseqs-darwin-sse41
- task: PublishPipelineArtifact@0
inputs:
targetPath: $(Build.SourcesDirectory)/build/build_avx2/src/mmseqs
artifactName: mmseqs-darwin-avx2
make -j $(nproc --all)
displayName: 'Run build'
- job: build_ubuntu_1604
displayName: Ubuntu 1604 MMseqs2
pool:
vmImage: 'Ubuntu-16.04'
timeoutInMinutes: 120
strategy:
matrix:
avx2:
SIMD: 'AVX2'
STATIC: 1
MPI: 0
BUILD_TYPE: RelWithDebInfo
sse:
SIMD: 'SSE4_1'
STATIC: 1
MPI: 0
BUILD_TYPE: RelWithDebInfo
avx2_mpi:
SIMD: 'AVX2'
STATIC: 0
MPI: 1
BUILD_TYPE: RelWithDebInfo
asan:
SIMD: 'AVX2'
STATIC: 0
MPI: 0
BUILD_TYPE: ASan
- script: |
mkdir ~/regression && cd ~/regression
git clone https://bitbucket.org/martin_steinegger/mmseqs-benchmark.git
export TTY=0
export MMSEQS_NUM_THREADS=8
export PATH="$(pwd)/mmseqs-benchmark/:$PATH"
./mmseqs-benchmark/run_regression.sh "${BUILD_SOURCESDIRECTORY}/build/src/mmseqs" ~/regression/results/
displayName: 'Run regression test'
steps:
- checkout: self
submodules: true
- script: |
sudo apt-get update
sudo apt-get -y install mpi-default-dev mpi-default-bin
displayName: Install Dependencies
condition: eq(variables['MPI'], 1)
- script: |
mkdir build && cd build
if [ "${STATIC}" -eq "1" ]; then
cmake -DHAVE_SANITIZER=1 -DCMAKE_BUILD_TYPE=${BUILD_TYPE} -DHAVE_TESTS=1 \
-DBUILD_SHARED_LIBS=OFF \
-DCMAKE_EXE_LINKER_FLAGS="-static -static-libgcc \
-static-libstdc++" -DCMAKE_FIND_LIBRARY_SUFFIXES=".a" \
-DENABLE_WERROR=1 -DHAVE_${SIMD}=1 -DHAVE_MPI=${MPI} ..
else
cmake -DHAVE_SANITIZER=1 -DCMAKE_BUILD_TYPE=${BUILD_TYPE} -DHAVE_TESTS=1 \
-DENABLE_WERROR=1 -DHAVE_${SIMD}=1 -DHAVE_MPI=${MPI} ..
fi
- task: DownloadSecureFile@1
inputs:
secureFile: secretKeyPleaseDontSteal
displayName: 'Get the deploy key'
make -j $(nproc --all)
displayName: Build MMseqs2
- script: |
export TTY=0
if [ "${BUILD_TYPE}" = "ASan" ]; then
echo "leak:libgomp1" > ${BUILD_SOURCESDIRECTORY}/ASan.supp
export ASAN_OPTIONS=suppressions=${BUILD_SOURCESDIRECTORY}/ASan.supp
fi
${BUILD_SOURCESDIRECTORY}/util/regression/run_regression.sh ${BUILD_SOURCESDIRECTORY}/build/src/mmseqs ${BUILD_SOURCESDIRECTORY}/regression
displayName: Run Regression Suite
condition: eq(variables['regression'], 1)
- task: PublishPipelineArtifact@0
condition: eq(variables['STATIC'], 1)
inputs:
targetPath: $(Build.SourcesDirectory)/build/src/mmseqs
artifactName: mmseqs-linux-$(SIMD)
- script: |
[ ! -z "${FILENAME}" ] || exit 0
mkdir ~/.ssh && mv $DOWNLOADSECUREFILE_SECUREFILEPATH ~/.ssh/id_rsa
chmod 700 ~/.ssh && chmod 600 ~/.ssh/id_rsa
ssh-keyscan -t rsa uniclust.mmseqs.com >> ~/.ssh/known_hosts
ssh codeship@uniclust.mmseqs.com "mkdir -p \"/home/mirdita/repositories/mmseqs-webserver/archive/${BUILD_SOURCEVERSION}\""
mkdir -p ~/mmseqs2.wiki
cd ~/mmseqs2.wiki
git clone https://github.com/soedinglab/MMseqs2.wiki.git .
.pandoc/make-pdf.sh
scp userguide.pdf codeship@uniclust.mmseqs.com:"/home/mirdita/repositories/mmseqs-webserver/archive/${BUILD_SOURCEVERSION}"
cd ${BUILD_SOURCESDIRECTORY}/build
CURR_BUILD="mmseqs2"
mkdir -p ${CURR_BUILD}/bin
mkdir -p ${CURR_BUILD}/util
mkdir -p ${CURR_BUILD}
cp src/mmseqs ${CURR_BUILD}/bin
chmod +x ${CURR_BUILD}/bin/mmseqs
cp ../util/bash-completion.sh ${CURR_BUILD}/util
chmod +x ${CURR_BUILD}/util/bash-completion.sh
cp -r ../LICENCE.md ../README.md ~/mmseqs2.wiki/userguide.pdf ../examples ${CURR_BUILD}
chmod -R g-w,o-w ${CURR_BUILD}
tar czvf ${FILENAME} ${CURR_BUILD}
scp ${FILENAME} codeship@uniclust.mmseqs.com:"/home/mirdita/repositories/mmseqs-webserver/archive/${BUILD_SOURCEVERSION}"
ssh codeship@uniclust.mmseqs.com "update-latest-mmseqs.sh \"${BUILD_SOURCEVERSION}\""
displayName: 'Upload build'
- job: build_windows_2019
displayName: Windows 2019
pool:
vmImage: 'windows-2019'
variables:
CYGWIN_ROOT: $(System.Workfolder)\cygwin
CYGWIN_MIRROR: http://cygwin.mirror.constant.com
timeoutInMinutes: 120
steps:
- powershell: git config --global core.autocrlf false
displayName: core.autocrlf false
- checkout: self
submodules: true
- script: |
rmdir /s /q C:\Strawberry
displayName: Remove Strawberry Perl (Conflict with Cygwin)
- script: |
choco install cygwin --params="/InstallDir:%CYGWIN_ROOT%"
displayName: Install Cygwin
- script: |
%CYGWIN_ROOT%\cygwinsetup.exe -qnNdO -R "%CYGWIN_ROOT%" -s "%CYGWIN_MIRROR%" -g -P ^
bash,^
xxd,^
cmake,^
make,^
gcc-core,^
gcc-g++,^
zlib-devel,^
libbz2-devel,^
busybox-standalone,^
git,^
binutils,^
wget
displayName: Install Dependencies
- script: |
%CYGWIN_ROOT%\bin\bash.exe -cl "${BUILD_SOURCESDIRECTORY}/util/build_windows.sh $(cygpath ${BUILD_SOURCESDIRECTORY}) $(cygpath ${BUILD_SOURCESDIRECTORY}/build)"
displayName: Build MMseqs2
- task: "ArchiveFiles@2"
inputs:
rootFolderOrFile: $(Build.SourcesDirectory)\build\mmseqs
archiveFile: $(Build.SourcesDirectory)\mmseqs-win64.zip
includeRootFolder: true
archiveType: zip
- task: PublishPipelineArtifact@0
inputs:
targetPath: $(Build.SourcesDirectory)\mmseqs-win64.zip
artifactName: mmseqs-win64
- script: |
%BUILD_SOURCESDIRECTORY%\build\mmseqs\mmseqs.bat version
displayName: Setup Busybox
- script: |
%CYGWIN_ROOT%\bin\bash.exe -cl "${BUILD_SOURCESDIRECTORY}/util/regression/run_regression.sh $(cygpath ${BUILD_SOURCESDIRECTORY}/build/mmseqs/bin/mmseqs.exe) $(cygpath ${BUILD_SOURCESDIRECTORY}/regression)"
displayName: Run Regression Suite
condition: eq(variables['regression'], 1)
- job: upload_artifacts
displayName: Upload Artifacts
condition: and(succeeded(), ne(variables['Build.Reason'], 'PullRequest'))
pool:
vmImage: 'Ubuntu-16.04'
dependsOn:
- build_ubuntu_1604_userguide
- build_macos_1014
- build_ubuntu_1604
- build_windows_2019
steps:
- script: |
cd "${BUILD_SOURCESDIRECTORY}"
mkdir mmseqs
cp -f README.md LICENCE.md mmseqs
cp -r examples mmseqs
mkdir mmseqs/matrices
cp -f data/*.out mmseqs/matrices
mkdir mmseqs/util
cp -f util/bash-completion.sh mmseqs/util
mkdir mmseqs/bin
- task: DownloadPipelineArtifact@1
inputs:
artifactName: userguide
targetPath: $(Build.SourcesDirectory)/mmseqs
- task: DownloadPipelineArtifact@1
inputs:
artifactName: mmseqs-darwin-sse41
targetPath: $(Build.SourcesDirectory)/mmseqs/bin
- script:
chmod +x "${BUILD_SOURCESDIRECTORY}/mmseqs/bin/mmseqs"
- task: ArchiveFiles@2
inputs:
rootFolderOrFile: $(Build.SourcesDirectory)/mmseqs
archiveFile: $(Build.SourcesDirectory)/mmseqs-osx-sse41.tar.gz
includeRootFolder: true
archiveType: tar
- task: DownloadPipelineArtifact@1
inputs:
artifactName: mmseqs-darwin-avx2
targetPath: $(Build.SourcesDirectory)/mmseqs/bin
- script:
chmod +x "${BUILD_SOURCESDIRECTORY}/mmseqs/bin/mmseqs"
- task: ArchiveFiles@2
inputs:
rootFolderOrFile: $(Build.SourcesDirectory)/mmseqs
archiveFile: $(Build.SourcesDirectory)/mmseqs-osx-avx2.tar.gz
includeRootFolder: true
archiveType: tar
- task: DownloadPipelineArtifact@1
inputs:
artifactName: mmseqs-linux-SSE4_1
targetPath: $(Build.SourcesDirectory)/mmseqs/bin
- script:
chmod +x "${BUILD_SOURCESDIRECTORY}/mmseqs/bin/mmseqs"
- task: ArchiveFiles@2
inputs:
rootFolderOrFile: $(Build.SourcesDirectory)/mmseqs
archiveFile: $(Build.SourcesDirectory)/mmseqs-linux-sse41.tar.gz
includeRootFolder: true
archiveType: tar
- task: DownloadPipelineArtifact@1
inputs:
artifactName: mmseqs-linux-AVX2
targetPath: $(Build.SourcesDirectory)/mmseqs/bin
- script:
chmod +x "${BUILD_SOURCESDIRECTORY}/mmseqs/bin/mmseqs"
- task: ArchiveFiles@2
inputs:
rootFolderOrFile: $(Build.SourcesDirectory)/mmseqs
archiveFile: $(Build.SourcesDirectory)/mmseqs-linux-avx2.tar.gz
includeRootFolder: true
archiveType: tar
- script:
rm "${BUILD_SOURCESDIRECTORY}/mmseqs/bin/mmseqs"
- task: DownloadPipelineArtifact@1
inputs:
artifactName: mmseqs-win64
targetPath: $(Build.SourcesDirectory)
- script: |
unzip "${BUILD_SOURCESDIRECTORY}/mmseqs-win64.zip"
chmod +x mmseqs/mmseqs.bat mmseqs/bin/*
- task: ArchiveFiles@2
inputs:
rootFolderOrFile: $(Build.SourcesDirectory)/mmseqs
archiveFile: $(Build.SourcesDirectory)/mmseqs-win64.zip
includeRootFolder: true
archiveType: zip
- task: DownloadSecureFile@1
inputs:
secureFile: secretKeyPleaseDontSteal
displayName: Get Deployment Key
- script: |
mkdir ~/.ssh && mv $DOWNLOADSECUREFILE_SECUREFILEPATH ~/.ssh/id_rsa
chmod 700 ~/.ssh && chmod 600 ~/.ssh/id_rsa
ssh-keyscan -t rsa uniclust.mmseqs.com >> ~/.ssh/known_hosts
cd "${BUILD_SOURCESDIRECTORY}"
ssh codeship@uniclust.mmseqs.com "mkdir -p \"/home/mirdita/repositories/mmseqs-webserver/archive/${BUILD_SOURCEVERSION}\""
scp mmseqs/userguide.pdf mmseqs-osx-sse41.tar.gz mmseqs-osx-avx2.tar.gz mmseqs-linux-sse41.tar.gz mmseqs-linux-avx2.tar.gz mmseqs-win64.zip codeship@uniclust.mmseqs.com:"/home/mirdita/repositories/mmseqs-webserver/archive/${BUILD_SOURCEVERSION}"
ssh codeship@uniclust.mmseqs.com "update-latest-mmseqs.sh \"${BUILD_SOURCEVERSION}\""
# Distributed under the OSI-approved BSD 3-Clause License. See accompanying
# file Copyright.txt or https://cmake.org/licensing for details.
macro(CHECK_SYMBOL_EXISTS SYMBOL FILES VARIABLE)
if(CMAKE_C_COMPILER_LOADED)
__CHECK_SYMBOL_EXISTS_IMPL("${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp/CheckSymbolExists.c" "${SYMBOL}" "${FILES}" "${VARIABLE}" )
elseif(CMAKE_CXX_COMPILER_LOADED)
__CHECK_SYMBOL_EXISTS_IMPL("${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp/CheckSymbolExists.cxx" "${SYMBOL}" "${FILES}" "${VARIABLE}" )
else()
message(FATAL_ERROR "CHECK_SYMBOL_EXISTS needs either C or CXX language enabled")
endif()
endmacro()
macro(__CHECK_SYMBOL_EXISTS_IMPL SOURCEFILE SYMBOL FILES VARIABLE)
if(NOT DEFINED "${VARIABLE}" OR "x${${VARIABLE}}" STREQUAL "x${VARIABLE}")
set(CMAKE_CONFIGURABLE_FILE_CONTENT "/* */\n")
set(MACRO_CHECK_SYMBOL_EXISTS_FLAGS ${CMAKE_REQUIRED_FLAGS})
if(CMAKE_REQUIRED_LIBRARIES)
set(CHECK_SYMBOL_EXISTS_LIBS
LINK_LIBRARIES ${CMAKE_REQUIRED_LIBRARIES})
else()
set(CHECK_SYMBOL_EXISTS_LIBS)
endif()
if(CMAKE_REQUIRED_INCLUDES)
set(CMAKE_SYMBOL_EXISTS_INCLUDES
"-DINCLUDE_DIRECTORIES:STRING=${CMAKE_REQUIRED_INCLUDES}")
else()
set(CMAKE_SYMBOL_EXISTS_INCLUDES)
endif()
foreach(FILE ${FILES})
set(CMAKE_CONFIGURABLE_FILE_CONTENT ${CMAKE_CONFIGURABLE_FILE_CONTENT} "#include <${FILE}>\n")
endforeach()
set(CMAKE_CONFIGURABLE_FILE_CONTENT
${CMAKE_CONFIGURABLE_FILE_CONTENT} "\nint main(int argc, char** argv)\n{\n (void)argv;\n#ifndef ${SYMBOL}\n return ((int*)(&${SYMBOL}))[argc];\n#else\n (void)argc;\n return 0;\n#endif\n}\n")
configure_file("${CMAKE_ROOT}/Modules/CMakeConfigurableFile.in"
"${SOURCEFILE}" @ONLY)
if(NOT CMAKE_REQUIRED_QUIET)
message(STATUS "Looking for ${SYMBOL}")
endif()
try_compile(${VARIABLE}
${CMAKE_BINARY_DIR}
"${SOURCEFILE}"
COMPILE_DEFINITIONS ${CMAKE_REQUIRED_DEFINITIONS}
${CHECK_SYMBOL_EXISTS_LIBS}
CMAKE_FLAGS
-DCOMPILE_DEFINITIONS:STRING=${MACRO_CHECK_SYMBOL_EXISTS_FLAGS}
"${CMAKE_SYMBOL_EXISTS_INCLUDES}"
OUTPUT_VARIABLE OUTPUT)
if(${VARIABLE})
if(NOT CMAKE_REQUIRED_QUIET)
message(STATUS "Looking for ${SYMBOL} - found")
endif()
set(${VARIABLE} 1 CACHE INTERNAL "Have symbol ${SYMBOL}")
file(APPEND ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeOutput.log
"Determining if the ${SYMBOL} "
"exist passed with the following output:\n"
"${OUTPUT}\nFile ${SOURCEFILE}:\n"
"${CMAKE_CONFIGURABLE_FILE_CONTENT}\n")
else()
if(NOT CMAKE_REQUIRED_QUIET)
message(STATUS "Looking for ${SYMBOL} - not found")
endif()
set(${VARIABLE} "" CACHE INTERNAL "Have symbol ${SYMBOL}")
file(APPEND ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeError.log
"Determining if the ${SYMBOL} "
"exist failed with the following output:\n"
"${OUTPUT}\nFile ${SOURCEFILE}:\n"
"${CMAKE_CONFIGURABLE_FILE_CONTENT}\n")
endif()
endif()
endmacro()
......@@ -34,7 +34,6 @@ include(CheckCXXCompilerFlag)
# Set -Werror to catch "argument unused during compilation" warnings
set(CMAKE_REQUIRED_FLAGS "-Werror -fsanitize=address") # Also needs to be a link flag for test to pass
check_cxx_compiler_flag("-fsanitize=address" HAVE_FLAG_SANITIZE_ADDRESS)
unset(CMAKE_REQUIRED_FLAGS)
if(HAVE_FLAG_SANITIZE_ADDRESS)
......@@ -48,12 +47,19 @@ else(NOT ADDRESS_SANITIZER_FLAG)
set(HAVE_ADDRESS_SANITIZER FALSE)
endif()
check_cxx_compiler_flag("-Og" HAVE_OPTIMIZE_DEBUG)
if(HAVE_OPTIMIZE_DEBUG)
set(OPTIMIZE_DEBUG_FLAG "-Og")
else()
set(OPTIMIZE_DEBUG_FLAG "-O0")
endif()
set(HAVE_ADDRESS_SANITIZER TRUE)
set(CMAKE_C_FLAGS_ASAN "-O0 -g ${ADDRESS_SANITIZER_FLAG} -fno-omit-frame-pointer -fno-optimize-sibling-calls"
set(CMAKE_C_FLAGS_ASAN "${OPTIMIZE_DEBUG_FLAG} -g ${ADDRESS_SANITIZER_FLAG} -fno-omit-frame-pointer -fno-optimize-sibling-calls"
CACHE STRING "Flags used by the C compiler during ASan builds."
FORCE)
set(CMAKE_CXX_FLAGS_ASAN "-O0 -g ${ADDRESS_SANITIZER_FLAG} -fno-omit-frame-pointer -fno-optimize-sibling-calls"
set(CMAKE_CXX_FLAGS_ASAN "${OPTIMIZE_DEBUG_FLAG} -g ${ADDRESS_SANITIZER_FLAG} -fno-omit-frame-pointer -fno-optimize-sibling-calls"
CACHE STRING "Flags used by the C++ compiler during ASan builds."
FORCE)
set(CMAKE_EXE_LINKER_FLAGS_ASAN "${ADDRESS_SANITIZER_FLAG}"
......
# Distributed under the OSI-approved BSD 3-Clause License. See accompanying
# file Copyright.txt or https://cmake.org/licensing for details.
#.rst:
# FindBZip2
# ---------
#
# Try to find BZip2
#
# IMPORTED Targets
# ^^^^^^^^^^^^^^^^
#
# This module defines :prop_tgt:`IMPORTED` target ``BZip2::BZip2``, if
# BZip2 has been found.
#
# Result Variables
# ^^^^^^^^^^^^^^^^
#
# This module defines the following variables:
#
# ::
#
# BZIP2_FOUND - system has BZip2
# BZIP2_INCLUDE_DIR - the BZip2 include directory
# BZIP2_LIBRARIES - Link these to use BZip2
# BZIP2_NEED_PREFIX - this is set if the functions are prefixed with BZ2_
# BZIP2_VERSION_STRING - the version of BZip2 found (since CMake 2.8.8)
set(_BZIP2_PATHS PATHS
"[HKEY_LOCAL_MACHINE\\SOFTWARE\\GnuWin32\\Bzip2;InstallPath]"
)
find_path(BZIP2_INCLUDE_DIR bzlib.h ${_BZIP2_PATHS} PATH_SUFFIXES include)
if (NOT BZIP2_LIBRARIES)
find_library(BZIP2_LIBRARY_RELEASE NAMES bz2 bzip2 ${_BZIP2_PATHS} PATH_SUFFIXES lib)
find_library(BZIP2_LIBRARY_DEBUG NAMES bz2d bzip2d ${_BZIP2_PATHS} PATH_SUFFIXES lib)
include(SelectLibraryConfigurations)
SELECT_LIBRARY_CONFIGURATIONS(BZIP2)
endif ()
if (BZIP2_INCLUDE_DIR AND EXISTS "${BZIP2_INCLUDE_DIR}/bzlib.h")
file(STRINGS "${BZIP2_INCLUDE_DIR}/bzlib.h" BZLIB_H REGEX "bzip2/libbzip2 version [0-9]+\\.[^ ]+ of [0-9]+ ")
string(REGEX REPLACE ".* bzip2/libbzip2 version ([0-9]+\\.[^ ]+) of [0-9]+ .*" "\\1" BZIP2_VERSION_STRING "${BZLIB_H}")
endif ()
include(FindPackageHandleStandardArgs)
FIND_PACKAGE_HANDLE_STANDARD_ARGS(BZip2
REQUIRED_VARS BZIP2_LIBRARIES BZIP2_INCLUDE_DIR
VERSION_VAR BZIP2_VERSION_STRING)
if (BZIP2_FOUND)
include(CheckSymbolExists)
include(CMakePushCheckState)
cmake_push_check_state()
set(CMAKE_REQUIRED_QUIET ${BZip2_FIND_QUIETLY})
set(CMAKE_REQUIRED_INCLUDES ${BZIP2_INCLUDE_DIR})
set(CMAKE_REQUIRED_LIBRARIES ${BZIP2_LIBRARIES})
CHECK_SYMBOL_EXISTS(BZ2_bzCompressInit "bzlib.h" BZIP2_NEED_PREFIX)
cmake_pop_check_state()
if(NOT TARGET BZip2::BZip2)
add_library(BZip2::BZip2 UNKNOWN IMPORTED)
set_target_properties(BZip2::BZip2 PROPERTIES
INTERFACE_INCLUDE_DIRECTORIES "${BZIP2_INCLUDE_DIRS}")
if(BZIP2_LIBRARY_RELEASE)
set_property(TARGET BZip2::BZip2 APPEND PROPERTY
IMPORTED_CONFIGURATIONS RELEASE)
set_target_properties(BZip2::BZip2 PROPERTIES
IMPORTED_LOCATION_RELEASE "${BZIP2_LIBRARY_RELEASE}")
endif()
if(BZIP2_LIBRARY_DEBUG)
set_property(TARGET BZip2::BZip2 APPEND PROPERTY
IMPORTED_CONFIGURATIONS DEBUG)
set_target_properties(BZip2::BZip2 PROPERTIES
IMPORTED_LOCATION_DEBUG "${BZIP2_LIBRARY_DEBUG}")
endif()
if(NOT BZIP2_LIBRARY_RELEASE AND NOT BZIP2_LIBRARY_DEBUG)
set_property(TARGET BZip2::BZip2 APPEND PROPERTY
IMPORTED_LOCATION "${BZIP2_LIBRARY}")
endif()
endif()
endif ()
mark_as_advanced(BZIP2_INCLUDE_DIR)
......@@ -26,8 +26,8 @@ set(COMPILED_RESOURCES
nucleotide.out
blosum62.out
PAM30.out
CovSeqidQscPercMinDiag.out
CovSeqidQscPercMinDiagTargetCov.out
CovSeqidQscPercMinDiag.lib
CovSeqidQscPercMinDiagTargetCov.lib
ExpOpt3_8_polished.cs32.lib
Library255_may17.lib
libPure_blosum62_255.lib
......
......@@ -11,12 +11,12 @@ notExists() {
#pre processing
[ -z "$MMSEQS" ] && echo "Please set the environment variable \$MMSEQS to your MMSEQS binary." && exit 1;
# check amount of input variables
# check number of input variables
[ "$#" -ne 4 ] && echo "Please provide <queryDB> <targetDB> <outDB> <tmp>" && exit 1;
# check if files exists
[ ! -f "$1" ] && echo "$1 not found!" && exit 1;
[ ! -f "$2" ] && echo "$2 not found!" && exit 1;
[ -f "$3.dbtype" ] && echo "$3 exists already!" && exit 1;
# check if files exist
[ ! -f "$1.dbtype" ] && echo "$1.dbtype not found!" && exit 1;
[ ! -f "$2.dbtype" ] && echo "$2.dbtype not found!" && exit 1;
[ -f "$3.dbtype" ] && echo "$3.dbtype exists already!" && exit 1;
[ ! -d "$4" ] && echo "tmp directory $4 not found!" && mkdir -p "$4";
......
......@@ -9,24 +9,20 @@ notExists() {
[ ! -f "$1" ]
}
#pre processing
[ -z "$MMSEQS" ] && echo "Please set the environment variable \$MMSEQS to your MMSEQS binary." && exit 1;
# check amount of input variables
# check number of input variables
[ "$#" -ne 4 ] && echo "Please provide <queryDB> <targetDB> <outDB> <tmp>" && exit 1;
# check if files exists
[ ! -f "$1" ] && echo "$1 not found!" && exit 1;
[ ! -f "$2" ] && echo "$2 not found!" && exit 1;
[ -f "$3.dbtype" ] && echo "$3 exists already!" && exit 1;
# check if files exist
[ ! -f "$1.dbtype" ] && echo "$1.dbtype not found!" && exit 1;
[ ! -f "$2.dbtype" ] && echo "$2.dbtype not found!" && exit 1;
[ -f "$3.dbtype" ] && echo "$3.dbtype exists already!" && exit 1;
[ ! -d "$4" ] && echo "tmp directory $4 not found!" && mkdir -p "$4";
INPUT="$1"
TARGET="$2"
TMP_PATH="$4"
STEP=0
STEPS=${STEPS:-1}
STEPS="${STEPS:-1}"
ALN_RES_MERGE="$TMP_PATH/aln_0"
while [ "$STEP" -lt "$STEPS" ]; do
SENS_PARAM=SENSE_${STEP}
......@@ -45,6 +41,7 @@ while [ "$STEP" -lt "$STEPS" ]; do
$RUNNER "$MMSEQS" "${ALIGN_MODULE}" "$INPUT" "$TARGET${ALIGNMENT_DB_EXT}" "$TMP_PATH/pref_$STEP" "$3" $ALIGNMENT_PAR \
|| fail "Alignment died"
fi
break
else
if notExists "$TMP_PATH/aln_$STEP.dbtype"; then
# shellcheck disable=SC2086
......@@ -57,29 +54,43 @@ while [ "$STEP" -lt "$STEPS" ]; do
if [ "$STEP" -gt 0 ]; then
if notExists "$TMP_PATH/aln_${SENS}.hasmerged"; then
if [ "$STEP" -lt $((STEPS-1)) ]; then
"$MMSEQS" mergedbs "$1" "$TMP_PATH/aln_merge" "$ALN_RES_MERGE" "$TMP_PATH/aln_$STEP" \
|| fail "Mergedbs died"
ALN_RES_MERGE="$TMP_PATH/aln_merge"
# shellcheck disable=SC2086
"$MMSEQS" mergedbs "$1" "$TMP_PATH/aln_merge_new" "$ALN_RES_MERGE" "$TMP_PATH/aln_$STEP" ${VERB_COMP_PAR} \
|| fail "Mergedbs died"
# shellcheck disable=SC2086
"$MMSEQS" rmdb "$TMP_PATH/aln_merge" ${VERBOSITY}
# shellcheck disable=SC2086
"$MMSEQS" mvdb "$TMP_PATH/aln_merge_new" "$TMP_PATH/aln_merge" ${VERBOSITY}
else
"$MMSEQS" mergedbs "$1" "$3" "$ALN_RES_MERGE" "$TMP_PATH/aln_$STEP" \
|| fail "Mergedbs died"
# shellcheck disable=SC2086
"$MMSEQS" mergedbs "$1" "$3" "$ALN_RES_MERGE" "$TMP_PATH/aln_$STEP" ${VERB_COMP_PAR} \
|| fail "Mergedbs died"
break
fi
touch "$TMP_PATH/aln_${STEP}.hasmerged"
fi
fi
if [ "$STEP" -gt 0 ]; then
ALN_RES_MERGE="$TMP_PATH/aln_merge"
fi
NEXTINPUT="$TMP_PATH/input_step$STEP"
NEXTINPUT="$TMP_PATH/input_$STEP"
#do not create subdb at last step
if [ "$STEP" -lt "$((STEPS-1))" ]; then
if notExists "$TMP_PATH/order_step$STEP.dbtype"; then
awk '$3 < 2 { print $1 }' "$TMP_PATH/aln_$STEP.index" > "$TMP_PATH/order_step$STEP" \
if notExists "$TMP_PATH/order_$STEP.dbtype"; then
awk '$3 < 2 { print $1 }' "$TMP_PATH/aln_$STEP.index" > "$TMP_PATH/order_$STEP" \
|| fail "Awk step $STEP died"
fi
if [ ! -s "$TMP_PATH/order_step$STEP" ]; then break; fi
if [ ! -s "$TMP_PATH/order_$STEP" ]; then
# shellcheck disable=SC2086
"$MMSEQS" mvdb "$ALN_RES_MERGE" "$3" ${VERBOSITY}
break
fi
if notExists "$NEXTINPUT.dbtype"; then
"$MMSEQS" createsubdb "$TMP_PATH/order_step$STEP" "$INPUT" "$NEXTINPUT" \
# shellcheck disable=SC2086
"$MMSEQS" createsubdb "$TMP_PATH/order_$STEP" "$INPUT" "$NEXTINPUT" ${VERBOSITY} --subdb-mode 1 \
|| fail "Order step $STEP died"
fi
fi
......@@ -87,17 +98,21 @@ while [ "$STEP" -lt "$STEPS" ]; do
STEP="$((STEP+1))"
done
if [ -n "$REMOVE_TMP" ]; then
echo "Remove temporary files"
STEP=0
while [ "$STEP" -lt "$STEPS" ]; do
"$MMSEQS" rmdb "${TMP_PATH}/pref_$STEP"
"$MMSEQS" rmdb "${TMP_PATH}/aln_$STEP"
"$MMSEQS" rmdb "${TMP_PATH}/input_step$STEP"
#NEXTINPUT="$TMP_PATH/input_step$STEP" # this line is unused
# shellcheck disable=SC2086
"$MMSEQS" rmdb "${TMP_PATH}/pref_$STEP" ${VERBOSITY}
# shellcheck disable=SC2086
"$MMSEQS" rmdb "${TMP_PATH}/aln_$STEP" ${VERBOSITY}
# shellcheck disable=SC2086
"$MMSEQS" rmdb "${TMP_PATH}/input_$STEP" ${VERBOSITY}
rm -f "${TMP_PATH}/order_$STEP"
STEP="$((STEP+1))"
done
# shellcheck disable=SC2086
"$MMSEQS" rmdb "${TMP_PATH}/aln_merge" ${VERBOSITY}
rm -f "$TMP_PATH/blastp.sh"
fi
......
......@@ -11,13 +11,13 @@ notExists() {
#pre processing
[ -z "$MMSEQS" ] && echo "Please set the environment variable \$MMSEQS to your MMSEQS binary." && exit 1;
# check amount of input variables
# check number of input variables
[ "$#" -ne 4 ] && echo "Please provide <queryDB> <targetDB> <outDB> <tmp>" && exit 1;
# check if files exists
[ ! -f "$1" ] && echo "$1 not found!" && exit 1;
[ ! -f "$2" ] && echo "$2 not found!" && exit 1;
[ -f "$3.dbtype" ] && echo "$3 exists already!" && exit 1;
[ ! -d "$4" ] && echo "tmp directory $4 not found!" && mkdir -p "$4";
# check if files exist
[ ! -f "$1.dbtype" ] && echo "$1.dbtype not found!" && exit 1;
[ ! -f "$2.dbtype" ] && echo "$2.dbtype not found!" && exit 1;
[ -f "$3.dbtype" ] && echo "$3.dbtype exists already!" && exit 1;
[ ! -d "$4" ] && echo "tmp directory $4 not found!" && mkdir -p "$4";
QUERYDB="$1"
TMP_PATH="$4"
......
......@@ -8,12 +8,27 @@ notExists() {
[ ! -f "$1" ]
}
# check amount of input variables
abspath() {
if [ -d "$1" ]; then
(cd "$1"; pwd)
elif [ -f "$1" ]; then
if [ -z "${1##*/*}" ]; then
echo "$(cd "${1%/*}"; pwd)/${1##*/}"
else
echo "$(pwd)/$1"
fi
elif [ -d "$(dirname "$1")" ]; then
echo "$(cd "$(dirname "$1")"; pwd)/$(basename "$1")"
fi
}
# check number of input variables
[ "$#" -ne 3 ] && echo "Please provide <sequenceDB> <outDB> <tmp>" && exit 1;
# check if files exists
[ ! -f "$1" ] && echo "$1 not found!" && exit 1;
[ -f "$2.dbtype" ] && echo "$2 exists already!" && exit 1;
[ ! -d "$3" ] && echo "tmp directory $3 not found!" && mkdir -p "$3";
# check if files exist
[ ! -f "$1.dbtype" ] && echo "$1.dbtype not found!" && exit 1;
[ -f "$2.dbtype" ] && echo "$2.dbtype exists already!" && exit 1;
[ ! -d "$3" ] && echo "tmp directory $3 not found!" && mkdir -p "$3";
INPUT="$1"
TMP_PATH="$3"
......@@ -27,7 +42,8 @@ if notExists "${TMP_PATH}/clu_redundancy.dbtype"; then
fi
if notExists "${TMP_PATH}/input_step_redundancy.dbtype"; then
"$MMSEQS" createsubdb "${TMP_PATH}/clu_redundancy" "$INPUT" "${TMP_PATH}/input_step_redundancy" \
# shellcheck disable=SC2086
"$MMSEQS" createsubdb "${TMP_PATH}/clu_redundancy" "$INPUT" "${TMP_PATH}/input_step_redundancy" ${VERBOSITY} --subdb-mode 1 \
|| fail "createsubdb died"
fi
......@@ -75,7 +91,8 @@ while [ "$STEP" -lt "$STEPS" ]; do
fi
else
if notExists "$NEXTINPUT.dbtype"; then
"$MMSEQS" createsubdb "${TMP_PATH}/clu_step$STEP" "$INPUT" "$NEXTINPUT" \
# shellcheck disable=SC2086
"$MMSEQS" createsubdb "${TMP_PATH}/clu_step$STEP" "$INPUT" "$NEXTINPUT" ${VERBOSITY} --subdb-mode 1 \
|| fail "Order step $STEP died"
fi
fi
......@@ -88,59 +105,145 @@ if [ -n "$REASSIGN" ]; then
STEP=$((STEP-1))
PARAM=ALIGNMENT${STEP}_PAR
eval ALIGNMENT_PAR="\$$PARAM"
# shellcheck disable=SC2086
$RUNNER "$MMSEQS" "${ALIGN_MODULE}" "$SOURCE" "$SOURCE" "${TMP_PATH}/clu" "${TMP_PATH}/aln" ${ALIGNMENT_REASSIGN_PAR} \
|| fail "align1 reassign died"
"$MMSEQS" subtractdbs "${TMP_PATH}/clu" "${TMP_PATH}/aln" "${TMP_PATH}/clu_not_accepted" --e-profile 100000 \
|| fail "subtractdbs1 reassign died"
"$MMSEQS" subtractdbs "${TMP_PATH}/clu" "${TMP_PATH}/clu_not_accepted" "${TMP_PATH}/clu_accepted" --e-profile 100000 \
|| fail "subtractdbs2 reassign died"
"$MMSEQS" swapdb "${TMP_PATH}/clu_not_accepted" "${TMP_PATH}/clu_not_accepted_swap" \
|| fail "swapdb1 reassign died"
"$MMSEQS" createsubdb "${TMP_PATH}/clu_not_accepted_swap" "$SOURCE" "${TMP_PATH}/seq_wrong_assigned" \
|| fail "createsubdb1 reassign died"
"$MMSEQS" createsubdb "${TMP_PATH}/clu" "$SOURCE" "${TMP_PATH}/seq_seeds" \
|| fail "createsubdb2 reassign died"
# align to cluster sequences
if notExists "${TMP_PATH}/aln.dbtype"; then
# shellcheck disable=SC2086
$RUNNER "$MMSEQS" "${ALIGN_MODULE}" "$SOURCE" "$SOURCE" "${TMP_PATH}/clu" "${TMP_PATH}/aln" ${ALIGNMENT_REASSIGN_PAR} \
|| fail "align1 reassign died"
fi
# create file of cluster that do not align based on given criteria
if notExists "${TMP_PATH}/clu_not_accepted.dbtype"; then
# shellcheck disable=SC2086
"$MMSEQS" subtractdbs "${TMP_PATH}/clu" "${TMP_PATH}/aln" "${TMP_PATH}/clu_not_accepted" --e-profile 100000000 -e 100000000 ${THREADSANDCOMPRESS} \
|| fail "subtractdbs1 reassign died"
fi
# create file of cluster that do align based on given criteria
if notExists "${TMP_PATH}/clu_accepted.dbtype"; then
# shellcheck disable=SC2086
"$MMSEQS" subtractdbs "${TMP_PATH}/clu" "${TMP_PATH}/clu_not_accepted" "${TMP_PATH}/clu_accepted" --e-profile 100000000 -e 100000000 ${THREADSANDCOMPRESS} \
|| fail "subtractdbs2 reassign died"
fi
if notExists "${TMP_PATH}/clu_not_accepted_swap.dbtype"; then
# shellcheck disable=SC2086
"$MMSEQS" swapdb "${TMP_PATH}/clu_not_accepted" "${TMP_PATH}/clu_not_accepted_swap" ${THREADSANDCOMPRESS} \
|| fail "swapdb1 reassign died"
fi
# create sequences database that were wrong assigned
if notExists "${TMP_PATH}/seq_wrong_assigned.dbtype"; then
# shellcheck disable=SC2086
"$MMSEQS" createsubdb "${TMP_PATH}/clu_not_accepted_swap" "$SOURCE" "${TMP_PATH}/seq_wrong_assigned" ${VERBOSITY} \
|| fail "createsubdb1 reassign died"
fi
# build seed sequences
if notExists "${TMP_PATH}/seq_seeds.dbtype"; then
# shellcheck disable=SC2086
"$MMSEQS" createsubdb "${TMP_PATH}/clu" "$SOURCE" "${TMP_PATH}/seq_seeds" ${VERBOSITY} \
|| fail "createsubdb2 reassign died"
fi
PARAM=PREFILTER${STEP}_PAR
eval PREFILTER_PAR="\$$PARAM"
# shellcheck disable=SC2086
$RUNNER "$MMSEQS" prefilter "${TMP_PATH}/seq_wrong_assigned" "${TMP_PATH}/seq_seeds" "${TMP_PATH}/seq_wrong_assigned_pref" ${PREFILTER_PAR} \
|| fail "Prefilter reassign died"
"$MMSEQS" swapdb "${TMP_PATH}/seq_wrong_assigned_pref" "${TMP_PATH}/seq_wrong_assigned_pref_swaped" \
|| fail "swapdb2 reassign died"
# shellcheck disable=SC2086
$RUNNER "$MMSEQS" "${ALIGN_MODULE}" "${TMP_PATH}/seq_seeds" "${TMP_PATH}/seq_wrong_assigned" \
"${TMP_PATH}/seq_wrong_assigned_pref_swaped" "${TMP_PATH}/seq_wrong_assigned_pref_swaped_aln" ${ALIGNMENT_REASSIGN_PAR} \
|| fail "align2 reassign died"
"$MMSEQS" swapdb "${TMP_PATH}/seq_wrong_assigned_pref_swaped_aln" "${TMP_PATH}/seq_wrong_assigned_pref_swaped_aln_swaped" \
|| fail "swapdb3 reassign died"
"$MMSEQS" filterdb "${TMP_PATH}/seq_wrong_assigned_pref_swaped_aln_swaped" "${TMP_PATH}/seq_wrong_assigned_pref_swaped_aln_swaped_top1" --extract-lines 1 \
|| fail "filterdb1 reassign died"
"$MMSEQS" filterdb "${TMP_PATH}/seq_wrong_assigned_pref_swaped_aln_swaped_top1" "${TMP_PATH}/seq_wrong_assigned_pref_swaped_aln_swaped_top1_ocol" --trim-to-one-column \
# try to find the best matching centroid sequences for the previously wrongly assigned sequences
if notExists "${TMP_PATH}/seq_wrong_assigned_pref.dbtype"; then
# combine seq dbs
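# the merged DB is assembled without copying sequence data: MAXOFFSET is the total
# data size of seq_seeds (offset + length of its last index entry), the offsets of the
# second index are shifted by it, and both data files are exposed as numbered parts
# (.0 and .1) of the merged DB through symlinks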
MAXOFFSET=$(awk '($2+$3) > max{max=$2+$3}END{print max}' "${TMP_PATH}/seq_seeds.index")
awk -v OFFSET="${MAXOFFSET}" 'FNR==NR{print $0; next}{print $1"\t"$2+OFFSET"\t"$3}' "${TMP_PATH}/seq_seeds.index" \
"${TMP_PATH}/seq_wrong_assigned.index" > "${TMP_PATH}/seq_seeds.merged.index"
ln -s "$(abspath "${TMP_PATH}/seq_seeds")" "${TMP_PATH}/seq_seeds.merged.0"
ln -s "$(abspath "${TMP_PATH}/seq_wrong_assigned")" "${TMP_PATH}/seq_seeds.merged.1"
cp "${TMP_PATH}/seq_seeds.dbtype" "${TMP_PATH}/seq_seeds.merged.dbtype"
# shellcheck disable=SC2086
$RUNNER "$MMSEQS" prefilter "${TMP_PATH}/seq_wrong_assigned" "${TMP_PATH}/seq_seeds.merged" "${TMP_PATH}/seq_wrong_assigned_pref" ${PREFILTER_REASSIGN_PAR} \
|| fail "Prefilter reassign died"
fi
if notExists "${TMP_PATH}/seq_wrong_assigned_pref_swaped.dbtype"; then
# shellcheck disable=SC2086
"$MMSEQS" swapdb "${TMP_PATH}/seq_wrong_assigned_pref" "${TMP_PATH}/seq_wrong_assigned_pref_swaped" ${THREADSANDCOMPRESS} \
|| fail "swapdb2 reassign died"
fi
if notExists "${TMP_PATH}/seq_wrong_assigned_pref_swaped_aln.dbtype"; then
# shellcheck disable=SC2086
$RUNNER "$MMSEQS" "${ALIGN_MODULE}" "${TMP_PATH}/seq_seeds.merged" "${TMP_PATH}/seq_wrong_assigned" \
"${TMP_PATH}/seq_wrong_assigned_pref_swaped" "${TMP_PATH}/seq_wrong_assigned_pref_swaped_aln" ${ALIGNMENT_REASSIGN_PAR} \
|| fail "align2 reassign died"
fi
if notExists "${TMP_PATH}/seq_wrong_assigned_pref_swaped_aln_ocol.dbtype"; then
# shellcheck disable=SC2086
"$MMSEQS" filterdb "${TMP_PATH}/seq_wrong_assigned_pref_swaped_aln" "${TMP_PATH}/seq_wrong_assigned_pref_swaped_aln_ocol" --trim-to-one-column ${THREADSANDCOMPRESS} \
|| fail "filterdb2 reassign died"
"$MMSEQS" swapdb "${TMP_PATH}/seq_wrong_assigned_pref_swaped_aln_swaped_top1_ocol" "${TMP_PATH}/seq_wrong_assigned_pref_swaped_aln_swaped_top1_ocol_swaped" \
|| fail "swapdb2 reassign died"
"$MMSEQS" mergedbs "$SOURCE" "$2" "${TMP_PATH}/clu_accepted" "${TMP_PATH}/seq_wrong_assigned_pref_swaped_aln_swaped_top1_ocol_swaped" \
fi
if notExists "${TMP_PATH}/clu_accepted_plus_wrong.dbtype"; then
# combine clusters
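# mergedbs concatenates, for each key of the reference DB (seq_seeds.merged), the
# entries found under that key in the listed result DBs into a single cluster DB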
# shellcheck disable=SC2086
"$MMSEQS" mergedbs "${TMP_PATH}/seq_seeds.merged" "${TMP_PATH}/clu_accepted_plus_wrong" "${TMP_PATH}/clu_accepted" \
"${TMP_PATH}/seq_wrong_assigned_pref_swaped_aln_ocol" \
|| fail "mergedbs reassign died"
fi
if notExists "${TMP_PATH}/missing.single.seqs.db.dbtype"; then
awk 'FNR==NR{if($3 > 1){ f[$1]=1; }next} !($1 in f){print $1"\t"$1}' "${TMP_PATH}/clu_accepted_plus_wrong.index" "${SOURCE}.index" > "${TMP_PATH}/missing.single.seqs"
# shellcheck disable=SC2086
"$MMSEQS" tsv2db "${TMP_PATH}/missing.single.seqs" "${TMP_PATH}/missing.single.seqs.db" --output-dbtype 6 ${VERBCOMPRESS} \
|| fail "tsv2db reassign died"
fi
if notExists "${TMP_PATH}/clu_accepted_plus_wrong_plus_single.dbtype"; then
# combine clusters
# shellcheck disable=SC2086
"$MMSEQS" mergedbs "${SOURCE}" "${TMP_PATH}/clu_accepted_plus_wrong_plus_single" "${TMP_PATH}/clu_accepted_plus_wrong" \
"${TMP_PATH}/missing.single.seqs.db" \
|| fail "mergedbs2 reassign died"
fi
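# resolve the per-step clustering parameters (CLUSTER${STEP}_PAR) via indirect expansion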
PARAM=CLUSTER${STEP}_PAR
eval TMP="\$$PARAM"
# shellcheck disable=SC2086
"$MMSEQS" clust "${SOURCE}" "${TMP_PATH}/clu_accepted_plus_wrong_plus_single" "${2}" ${TMP} \
|| fail "Clustering step $STEP died"
if [ -n "$REMOVE_TMP" ]; then
echo "Remove temporary files"
"$MMSEQS" rmdb "${TMP_PATH}/aln"
"$MMSEQS" rmdb "${TMP_PATH}/clu_not_accepted"
"$MMSEQS" rmdb "${TMP_PATH}/clu_accepted"
"$MMSEQS" rmdb "${TMP_PATH}/clu_not_accepted_swap"
"$MMSEQS" rmdb "${TMP_PATH}/seq_wrong_assigned"
"$MMSEQS" rmdb "${TMP_PATH}/seq_seeds"
"$MMSEQS" rmdb "${TMP_PATH}/seq_seeds.merged"
"$MMSEQS" rmdb "${TMP_PATH}/seq_wrong_assigned_pref"
"$MMSEQS" rmdb "${TMP_PATH}/seq_wrong_assigned_pref_swaped"
"$MMSEQS" rmdb "${TMP_PATH}/seq_wrong_assigned_pref_swaped_aln"
"$MMSEQS" rmdb "${TMP_PATH}/seq_wrong_assigned_pref_swaped_aln_ocol"
rm -f "${TMP_PATH}/missing.single.seqs"
rm -f "${TMP_PATH}/clu_accepted_plus_wrong.tsv"
"$MMSEQS" rmdb "${TMP_PATH}/missing.single.seqs.db"
"$MMSEQS" rmdb "${TMP_PATH}/clu_accepted_plus_wrong"
"$MMSEQS" rmdb "${TMP_PATH}/clu_accepted_plus_wrong_plus_single"
fi
fi
if [ -n "$REMOVE_TMP" ]; then
echo "Remove temporary files"
rm -f "${TMP_PATH}/order_redundancy"
"$MMSEQS" rmdb "${TMP_PATH}/clu_redundancy"
"$MMSEQS" rmdb "${TMP_PATH}/aln_redundancy"
"$MMSEQS" rmdb "${TMP_PATH}/input_step_redundancy"
STEP=0
while [ "$STEP" -lt "$STEPS" ]; do
"$MMSEQS" rmdb "${TMP_PATH}/pref_step$STEP"
"$MMSEQS" rmdb "${TMP_PATH}/aln_step$STEP"
"$MMSEQS" rmdb "${TMP_PATH}/clu_step$STEP"
STEP=$((STEP+1))
done
STEP=1
while [ "$STEP" -lt "$STEPS" ]; do
"$MMSEQS" rmdb "${TMP_PATH}/input_step$STEP"
rm -f "${TMP_PATH}/order_step$STEP"
STEP=$((STEP+1))
done
rm -f "${TMP_PATH}/cascaded_clustering.sh"
fi
......@@ -9,12 +9,12 @@ notExists() {
[ ! -f "$1" ]
}
# check amount of input variables
# check number of input variables
[ "$#" -ne 3 ] && echo "Please provide <sequenceDB> <outDB> <tmp>" && exit 1;
# check if files exists
[ ! -f "$1" ] && echo "$1 not found!" && exit 1;
[ -f "$2.dbtype" ] && echo "$2 exists already!" && exit 1;
[ ! -d "$3" ] && echo "tmp directory $3 not found!" && mkdir -p "$3";
# check if files exist
[ ! -f "$1.dbtype" ] && echo "$1.dbtype not found!" && exit 1;
[ -f "$2.dbtype" ] && echo "$2.dbtype exists already!" && exit 1;
[ ! -d "$3" ] && echo "tmp directory $3 not found!" && mkdir -p "$3";
INPUT="$1"
TMP_PATH="$3"
......@@ -32,7 +32,8 @@ if notExists "${TMP_PATH}/clu_redundancy.dbtype"; then
fi
if notExists "${TMP_PATH}/input_step_redundancy.dbtype"; then
"$MMSEQS" createsubdb "${TMP_PATH}/clu_redundancy" "$INPUT" "${TMP_PATH}/input_step_redundancy" \
# shellcheck disable=SC2086
"$MMSEQS" createsubdb "${TMP_PATH}/clu_redundancy" "$INPUT" "${TMP_PATH}/input_step_redundancy" ${VERBOSITY} --subdb-mode 1 \
|| fail "MMseqs order step $STEP died"
fi
......
......@@ -8,25 +8,19 @@ notExists() {
[ ! -f "$1" ]
}
# check amount of input variables
# check number of input variables
[ "$#" -ne 2 ] && echo "Please provide <sequenceDB> <tmp>" && exit 1;
# check if files exists
[ ! -f "$1" ] && echo "$1 not found!" && exit 1;
[ ! -d "$2" ] && echo "tmp directory $2 not found!" && mkdir -p "$2";
# check if files exist
[ ! -f "$1.dbtype" ] && echo "$1.dbtype not found!" && exit 1;
[ ! -d "$2" ] && echo "tmp directory $2 not found!" && mkdir -p "$2";
INPUT="$1"
if [ -n "$TRANSLATED" ]; then
# 1. extract orf
if notExists "$2/orfs.dbtype"; then
# shellcheck disable=SC2086
"$MMSEQS" extractorfs "$INPUT" "$2/orfs" $ORF_PAR \
|| fail "extractorfs died"
fi
if notExists "$2/orfs_aa.dbtype"; then
# shellcheck disable=SC2086
"$MMSEQS" translatenucs "$2/orfs" "$2/orfs_aa" $TRANSLATE_PAR \
|| fail "translatenucs died"
"$MMSEQS" extractorfs "$INPUT" "$2/orfs_aa" $ORF_PAR \
|| fail "extractorfs died"
fi
# shellcheck disable=SC2086
......@@ -35,7 +29,6 @@ if [ -n "$TRANSLATED" ]; then
if [ -n "$REMOVE_TMP" ]; then
echo "Remove temporary files"
"$MMSEQS" rmdb "$2/orfs"
"$MMSEQS" rmdb "$2/orfs_aa"
rm -f "$2/createindex.sh"
fi
......
......@@ -17,11 +17,9 @@ hasCommand touch
hasCommand tar
TAXDBNAME="$1"
MAPPINGFILE=$2
NCBITAXINFO="$3"
TMP_PATH="${4:-$2}"
TMP_PATH="$2"
if [ "$DOWNLOAD_DATA" -eq "1" ]; then
if [ "$DOWNLOAD_NCBITAXDUMP" -eq "1" ]; then
# Download NCBI taxon information
if notExists "$4/ncbi_download.complete"; then
echo "Download taxdump.tar.gz"
......@@ -30,11 +28,12 @@ if [ "$DOWNLOAD_DATA" -eq "1" ]; then
touch "${TMP_PATH}/ncbi_download.complete"
fi
NCBITAXINFO="${TMP_PATH}"
fi
if [ "$DOWNLOAD_MAPPING" -eq "1" ]; then
# Download the latest UniProt ID mapping to extract taxon identifiers
if notExists "${TMP_PATH}/mapping_download.complete"; then
echo "Download idmapping.dat.gz"
URL="ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/idmapping.dat.gz"
URL="ftp://ftp.expasy.org/databases/uniprot/current_release/knowledgebase/idmapping/idmapping.dat.gz"
wget -nv -O - "$URL" | zcat | awk '$2 == "NCBI_TaxID" {print $1"\t"$3 }' > "${TMP_PATH}/taxidmapping"
touch "${TMP_PATH}/mapping_download.complete"
fi
......
......@@ -8,19 +8,10 @@ notExists() {
[ ! -f "$1" ]
}
# check number of input variables
[ "$#" -ne 3 ] && echo "Please provide <sequenceFASTA> <outFile> <tmp>" && exit 1;
[ ! -f "$1" ] && echo "$1 not found!" && exit 1;
[ -f "$2" ] && echo "$2 exists already!" && exit 1;
[ ! -d "$3" ] && echo "tmp directory $3 not found!" && mkdir -p "$3";
INPUT="$1"
RESULTS="$2"
TMP_PATH="$3"
if notExists "${TMP_PATH}/input.dbtype"; then
# shellcheck disable=SC2086
"$MMSEQS" createdb "${INPUT}" "${TMP_PATH}/input" ${CREATEDB_PAR} \
"$MMSEQS" createdb "$@" "${TMP_PATH}/input" ${CREATEDB_PAR} \
|| fail "query createdb died"
fi
......@@ -30,7 +21,6 @@ if notExists "${TMP_PATH}/clu.dbtype"; then
|| fail "Search died"
fi
if notExists "${TMP_PATH}/cluster.tsv"; then
# shellcheck disable=SC2086
"$MMSEQS" createtsv "${TMP_PATH}/input" "${TMP_PATH}/input" "${TMP_PATH}/clu" "${TMP_PATH}/cluster.tsv" ${THREADS_PAR} \
......@@ -64,6 +54,7 @@ mv "${TMP_PATH}/cluster.tsv" "${RESULTS}_cluster.tsv"
if [ -n "${REMOVE_TMP}" ]; then
echo "Removing temporary files"
"$MMSEQS" rmdb "${TMP_PATH}/input"
"$MMSEQS" rmdb "${TMP_PATH}/input_h"
"$MMSEQS" rmdb "${TMP_PATH}/clu_seqs"
"$MMSEQS" rmdb "${TMP_PATH}/clu_rep"
"$MMSEQS" rmdb "${TMP_PATH}/clu"
......