Skip to content
Commits on Source (5)
......@@ -37,7 +37,10 @@ exec_program(
ARGS "rev-parse --short HEAD"
OUTPUT_VARIABLE VERSION_SHA1 )
if ( VERSION_SHA1 MATCHES "^[0-9a-f]+$") # might return "git commit fatal: not a git repository"
message("Git version: ${VERSION_SHA1}")
add_definitions( -DGIT_SHA1="${VERSION_SHA1}" )
endif()
################################################################################
# Define cmake modules directory
......@@ -94,6 +97,12 @@ add_executable("merci" ${MerciFiles})
target_link_libraries("merci" ${gatb-core-libraries})
# that file will hold the version information as a #define
configure_file (
${PROJECT_SOURCE_DIR}/src/build_info.hpp.in
${PROJECT_SOURCE_DIR}/src/build_info.hpp
)
################################################################################
# DOC
################################################################################
......
......@@ -24,5 +24,5 @@ cd ..
# run tests
echo "Running simple test..."
cd test
. ./simple_test.sh
./simple_test.sh
cd ..
......@@ -32,12 +32,7 @@ C++11 compiler; (g++ version>=4.7 (Linux), clang version>=4.3 (Mac OSX))
Type `minia` without any arguments for usage instructions.
A more complete manual:
cd doc
pdflatex manual.tex
If you cannot compile it: http://minia.genouest.org/files/minia.pdf
A more complete manual is here: https://github.com/GATB/minia/raw/master/doc/manual.pdf
# What is new ? (2018)
......
minia (3.2.1-1) UNRELEASED; urgency=medium
* Team upload.
* New upstream version 3.2.1
-- Andrius Merkys <merkys@debian.org> Fri, 05 Jul 2019 01:35:17 -0400
minia (3.2.0-2) unstable; urgency=medium
* Team upload.
......
......@@ -4,7 +4,7 @@ Description: Link against Debian packaged gatbcore
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -72,15 +72,17 @@ set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMA
@@ -72,15 +72,17 @@
include_directories (${PROGRAM_SOURCE_DIR})
file (GLOB_RECURSE ProjectFiles ${PROGRAM_SOURCE_DIR}/*)
......@@ -23,4 +23,4 @@ Description: Link against Debian packaged gatbcore
+target_link_libraries("merci" gatbcore hdf5 )
################################################################################
# that file will hold the version information as a #define
......@@ -4,7 +4,7 @@ Description: Failed to fix these install dirs, deactivate completely and use dh_
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -121,9 +121,9 @@ SET (CPACK_SOURCE_IGNORE_FILES
@@ -127,9 +127,9 @@
)
# We copy the project binary to the 'bin' directory
......
Author: Andrius Merkys <merkys@debian.org>
Description: There is no need to build GatbCore.
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -52,7 +52,7 @@
SET (GATB_CORE_EXCLUDE_EXAMPLES 1)
# GATB CORE
-include (GatbCore)
+# include (GatbCore)
################################################################################
# TOOL
......@@ -3,4 +3,3 @@ link_libraries.patch
no_install_to_wrong_dir.patch
skip-rpath.patch
fix_path_in_tests.patch
remove_gatb-core_cmake.patch
Author: Andreas Tille <tille@debian.org>
Last-Update: Mon, 21 Jan 2019 09:01:19 +0100
Description: Use cmake input file of Debian packaged gatb-core
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -28,22 +28,11 @@ IF (DEFINED JENKINS_TAG)
@@ -28,20 +28,6 @@
SET (gatb-tool-version ${JENKINS_TAG})
ENDIF()
......@@ -17,15 +16,20 @@ Description: Use cmake input file of Debian packaged gatb-core
- ARGS "rev-parse --short HEAD"
- OUTPUT_VARIABLE VERSION_SHA1 )
-
-if ( VERSION_SHA1 MATCHES "^[0-9a-f]+$") # might return "git commit fatal: not a git repository"
- message("Git version: ${VERSION_SHA1}")
- add_definitions( -DGIT_SHA1="${VERSION_SHA1}" )
-endif()
-
################################################################################
# Define cmake modules directory
################################################################################
-SET (GATB_CORE_HOME ${PROJECT_SOURCE_DIR}/thirdparty/gatb-core/gatb-core)
-SET (CMAKE_MODULE_PATH ${GATB_CORE_HOME}/cmake)
+include(GNUInstallDirs)
+SET (CMAKE_MODULE_PATH /usr/${CMAKE_INSTALL_LIBDIR}/cmake)
@@ -66,7 +52,7 @@
SET (GATB_CORE_EXCLUDE_EXAMPLES 1)
# GATB CORE
-include (GatbCore)
+# include (GatbCore)
################################################################################
# SUPPORTED KMER SIZES
# TOOL
No preview for this file type
\documentclass[a4paper]{article}
\usepackage{fancyvrb}
\usepackage{pdfpages}
\usepackage{hyperref}
\usepackage{url}
\begin{document}
......@@ -65,7 +66,7 @@ The main parameters are:
\begin{description}
\vitem+kmer-size+
The $k$-mer length is the length of the nodes in the de Bruijn graph. It strongly depends on the input dataset. For proper assembly, we recommend that you use the Minia-pipeline that runs Minia multiple times, with an iterative multi-k algorithm. That way, you won't need to choose k. If you insist on running with a single k value, the KmerGenie software can automatically find the best $k$ for your dataset.
The $k$-mer length is the length of the nodes in the de Bruijn graph. It strongly depends on the input dataset. For proper assembly, we recommend that you use the Minia-pipeline (\url{https://github.com/GATB/gatb-minia-pipeline}) that runs Minia multiple times, using an iterative multi-k algorithm (similar to IDBA, Megahit). That way, you won't need to choose k. If you insist on running with a single k value, the KmerGenie software can automatically find the best $k$ for your dataset.
\vitem+abundance-min+
The \verb+abundance-min+ is a hard cut-off to remove likely erroneous, low-abundance $k$-mers. In Minia 3, it is recommended to set it to 2 unless you have a good reason not to. Minia 3 also implements other ways to remove erroneous k-mers, using a more adequate relative cut-off, in the graph simplifications step.
......@@ -113,11 +114,17 @@ Minia can direclty read files compressed with gzip. Compressed files should end
\section{Output}
The output of Minia is a set of contigs in the FASTA format, in the file \verb+[prefix].contigs.fa+.
\subsection{Fasta}
\subsection*{Creating unitigs}
The main output of Minia is a set of contigs in the FASTA format, in the file \verb+[prefix].contigs.fa+.
Minia will also output unitigs in the FASTA format. Those are non-branching paths in the de Bruijn graph prior to any graph simplification. File: \verb+[prefix].unitigs.fa+.
\subsection{Assembly graph}
The contigs FASTA file can be converted to GFA using \href{https://github.com/GATB/bcalm/blob/master/scripts/convertToGFA.py}{BCALM's convertToGFA.py script}.
\subsection*{Unitigs}
Minia will also output unitigs, in the FASTA format. They correspond to non-branching paths in the de Bruijn graph prior to any graph simplification. File: \verb+[prefix].unitigs.fa+.
\section{Selection of graph simplification steps}
......
......@@ -93,8 +93,7 @@ void index_assembly(string assembly, int k, int nb_threads, bool verbose, assemb
bool rc = modelCanon.toString(kmmerBegin.value()) != kmerBegin; // could be optimized
ExtremityInfo e (current_seq, rc, UNITIG_BEGIN); /* actually we dont know if its false/UNITIG_BEGIN or reverse/UNITIG_END but should think about it, for now, putting naive*/
index[kmmerBegin.value()] = e.pack();
if (debug)
std::cout << "index left kmer " << modelCanon.toString(kmmerBegin.value()) << " of contig " << current_seq << std::endl;
if (debug) std::cout << "index left kmer " << modelCanon.toString(kmmerBegin.value()) << " of contig " << current_seq << std::endl;
}
if (no_link_right)
......@@ -104,8 +103,7 @@ void index_assembly(string assembly, int k, int nb_threads, bool verbose, assemb
bool rc = modelCanon.toString(kmmerEnd.value()) != kmerEnd; // could be optimized
ExtremityInfo e (current_seq, rc, UNITIG_END);
index[kmmerEnd.value()] = e.pack();
if (debug)
std::cout << "index right kmer " << modelCanon.toString(kmmerEnd.value()) << " of contig " << current_seq << std::endl;
if (debug) std::cout << "index right kmer " << modelCanon.toString(kmmerEnd.value()) << " of contig " << current_seq << std::endl;
}
current_seq++;
}
......
cd test-simple;
../../buildk32/merci -kmer-size 11 -reads reads.fa -assembly assembly.fa -verbose 1 ;
cd ..
echo "wc, should have 4 lines:"
wc -l test-simple/assembly.fa
echo "wc, should have 2 lines:"
wc -l test-simple/assembly.fa.merci
#!/usr/bin/env python
'''****************************************************************************
* Program - Convert To GFA format
* Author - Mayank Pahadia
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
****************************************************************************'''
'''****************************************************************************
* To run the program, pass three arguments with the python script on command line.
* For example - python convertToGFA.py inputFileName outputFileName kmerSize
* Logic - It reads through the fasta file with all the unitigs information
* and link information and outputs it in the GFA format.
****************************************************************************'''
import sys
import argparse
def write_segment(name,segment,optional,g,links):
add = ""
add += "S\t" #for segment
add += name #id of segment
add += "\t"
add += segment #segment itself
add += "\t"
for i in optional: #optional tags
add+=i
add+="\t"
#adding Segment to the file
g.write(add.strip()+"\n")
for j in links: #adding all the links of the current segment to the GFA file
g.write(j)
def main():
parser = argparse.ArgumentParser(description="Convert a bcalm-generated FASTA to a GFA.",
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('inputFilename', help='Input FASTA file')
parser.add_argument('outputFilename', help='Output GFA file')
parser.add_argument('kmerSize', type=int, help='k-mer length')
parser.add_argument('-s', '--single-directed', action='store_true',
help='Avoid outputting the whole skew-simmetric graph and output only one edge between two nodes',
dest='single_directed')
args = parser.parse_args()
with open(args.inputFilename) as f:
#name stores the id of the unitig
#optional is a list which stores all the optional tags of a segment
#links stores all the link information about a segment
name = ""
optional=[]
links=[]
g = open(args.outputFilename,'w')
#adding Header to the file
k = int(args.kmerSize)
g.write('H\tVN:Z:1.0\tks:i:%d\n' %k) # includes the k-mer size
print("GFA file open")
#firstLine is for implemetation purpose so that we don't add some garbage value to the output file.
firstLine = 0
#segment stores the segment till present, in a fasta file, segment can be on many lines, hence we need to get the whole segment from all the lines
segment = ""
for line in f:
line = line.replace("\n","")
if(line[0]!=">"):
#segment might be in more than one line, hence we get the whole segment first, and then put it in the GFA file.
segment += line
if(line[0]==">"):
if(firstLine!=0):#if it's not the firstline in the input file, we store the input in GFA format in the output file
write_segment(name,segment,optional,g,links)
segment = ""
firstLine = 1
#once the previous segment and it's information has been stored, we start the next segment and it's information
a = line.split(" ")
name=a[0][1:] #get the id
optional=[]
links = []
#we skip the first value because the first value is ">ID"
for i in range(1,len(a)):
#we need this because the line can end with a space, hence we get one extra value in our list.
if(a[i]==""):
continue
if(a[i][0:2] == "MA"): #previous bcalm2 versions had "MA=[xxx]" optional tag as well, kept it just for compatibility, and reformated
optional.append(a[i][0:2]+":f:"+a[i][2:])
elif(a[i][0:2] == "L:"): #for links
b = a[i].split(":")
k1 = int(args.kmerSize)-1
if args.single_directed:
if name < b[2]:
links.append("L\t"+name+"\t"+b[1]+"\t"+b[2]+"\t"+b[3]+"\t"+str(k1)+"M\n")
elif name == b[2] and not (b[1] == b[3] == '-'): # manage links between the same unitig
links.append("L\t"+name+"\t"+b[1]+"\t"+b[2]+"\t"+b[3]+"\t"+str(k1)+"M\n")
else:
links.append("L\t"+name+"\t"+b[1]+"\t"+b[2]+"\t"+b[3]+"\t"+str(k1)+"M\n")
else: #all the other optional tags
optional.append(a[i])
#we will miss the last one, because it won't go into the if condition - if(line[0]==">") and hence won't add the segment to the file.
write_segment(name,segment,optional,g,links)
print("done")
g.close()
if __name__ == "__main__":
main()
// comments added a posteriori, 1 year later:
// script that keeps reads if they match at least one kmer from a contig file
// probably this was done for multi-k optimization?
// to compile it is a bit tricky: go to dsk repository, put that script in dsk/utils folder and use that CMakeLists file: https://github.com/GATB/dsk/blob/12de3e1f49e04f372b80773e7c4bc829c28e5657/utils/CMakeLists.txt
#include <gatb/gatb_core.hpp>
#include <fstream>
#include <set>
// We use the required packages
using namespace std;
static /* probably important that it's static here too */
char revcomp (char s) {
if (s == 'A') return 'T';
else if (s == 'C') return 'G';
else if (s == 'G') return 'C';
else if (s == 'T') return 'A';
else if (s == 'a') return 't';
else if (s == 'c') return 'g';
else if (s == 'g') return 'c';
else if (s == 't') return 'a';
return 'X';
}
static string revcomp (const string &s) {
string rc;
for (signed int i = s.length() - 1; i >= 0; i--) {rc += revcomp(s[i]);}
return rc;
}
/********************************************************************************/
class FilterByPreviousContigs : public Tool
{
public:
/** */
FilterByPreviousContigs () : Tool ("filter_by_previous_contigs")
{
getParser()->push_front (new OptionOneParam ("-previous-contigs", "previous contigs" , true));
getParser()->push_front (new OptionOneParam ("-reads", "reads" , true));
getParser()->push_front (new OptionOneParam (STR_KMER_SIZE, "k-mer size used to generate previous contigs" , true));
getParser()->push_front (new OptionOneParam ("-out", "output file that will contain filtered reads", true));
}
/** */
void execute ()
{
size_t kmerSize = getInput()->getInt (STR_KMER_SIZE);
Integer::apply<Functor,Parameter> (kmerSize, Parameter(*this, kmerSize));
}
struct Parameter
{
Parameter (FilterByPreviousContigs& tool, size_t kmerSize) : tool(tool), kmerSize(kmerSize) {}
FilterByPreviousContigs& tool;
size_t kmerSize;
};
template<size_t span> struct Functor { void operator () (Parameter parameter)
{
FilterByPreviousContigs& tool = parameter.tool;
size_t kmerSize = parameter.kmerSize;
unsigned int k = kmerSize;
typedef typename Kmer<span>::Count Count;
typename Kmer<span>::ModelCanonical model (kmerSize);
BankFasta bank(tool.getInput()->getStr("-previous-contigs"));
IBank* bankReads = Bank::open(tool.getInput()->getStr("-reads"));
BankFasta bankOut(tool.getInput()->getStr("-out"));
BankFasta::Iterator itCtg (bank);
set<string> kmers;
std::cout << "indexing previous contigs" << std::endl;
for (itCtg.first(); !itCtg.isDone(); itCtg.next())
{
string seq = itCtg->toString();
string kmerBegin = seq.substr(0, k );
string kmerEnd = seq.substr(seq.size() - k , k );
kmers.insert(kmerBegin);
kmers.insert(kmerEnd);
kmers.insert(revcomp(kmerBegin));
kmers.insert(revcomp(kmerEnd));
}
std::cout << "filtering reads" << std::endl;
Iterator<Sequence>* itReads = bankReads->iterator();
uint64_t nb_reads = 0, nb_kept = 0;
for (itReads->first(); !itReads->isDone(); itReads->next())
{
bool contains = false;
string seq = (*itReads)->toString();
nb_reads++;
int found_pos = 0;
for (int i = 0; i < seq.size() - k; i++)
{
string kmer = seq.substr(i, k );
contains = (kmers.find(kmer) != kmers.end());
if (contains)
{
found_pos = i;
break;
}
}
if (contains)
{
Sequence s (Data::ASCII);
s.getData().setRef ((char*)seq.c_str(), seq.size());
s._comment = (*itReads)->getComment() + " contains previous-k kmer at pos " + to_string(found_pos);
bankOut.insert(s);
nb_kept++;
}
}
std::cout << "done filtering reads, kept " << nb_kept << "/" << nb_reads << std::endl;
/** We gather some statistics. */
tool.getInfo()->add (1, "stats");
tool.getInfo()->add (2, "kmer_size", "%ld", kmerSize);
}};
};
/********************************************************************************/
/* Dump solid kmers in ASCII format */
/********************************************************************************/
int main (int argc, char* argv[])
{
/* try
{*/
FilterByPreviousContigs().run (argc, argv);
/* }
catch (Exception& e)
{
std::cout << "EXCEPTION: " << e.getMessage() << std::endl;
return EXIT_FAILURE;
}*/
// gdb likes having clean exceptions
}
......@@ -51,7 +51,6 @@ static const char* progressFormat0 = "Minia : assembly";
*********************************************************************/
Minia::Minia () : Tool ("minia")
{
#ifdef GIT_SHA1
std::cout << "Minia 3, git commit " << GIT_SHA1 << std::endl;
#endif
......
/*****************************************************************************
* GATB : Genome Assembly Tool Box
* Copyright (C) 2014 INRIA
* Authors: R.Chikhi, G.Rizk, E.Drezen
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*****************************************************************************/
/* WARNING:
* Every source file including this header will be recompiled unconditionally
*/
#define STR_MINIA_VERSION "${gatb-tool-version}"
#define STR_MINIA_COMPILATION_FLAGS "${gatb-core-flags}"
#define STR_MINIA_COMPILER "${CMAKE_C_COMPILER} (${CMAKE_CXX_COMPILER_VERSION})"
#define STR_MINIA_OPERATING_SYSTEM "${CMAKE_SYSTEM}"
......@@ -20,6 +20,7 @@
/********************************************************************************/
#include <Minia.hpp>
#include "build_info.hpp"
using namespace std;
......@@ -27,6 +28,19 @@ using namespace std;
int main (int argc, char* argv[])
{
if(argc > 1 && ( strcmp(argv[1],"--version")==0 || strcmp(argv[1],"-v")==0 || strcmp(argv[1],"-version")==0 ) ){
std::cout << "Minia version " << STR_MINIA_VERSION << std::endl;
#ifdef GIT_SHA1
std::cout << "git commit " << GIT_SHA1 << std::endl;
#endif
std::cout << "Using gatb-core version "<< System::info().getVersion() << std::endl;
std::cout << "OS: " << STR_MINIA_OPERATING_SYSTEM << std::endl;
std::cout << "compiler: " << STR_MINIA_COMPILER << std::endl;
return EXIT_SUCCESS;
}
// We define a try/catch block in case some method fails (bad filename for instance)
try
{
......
......@@ -19,6 +19,7 @@ fi
# if wget is not installed, you may use "curl -O ..."
DATA_SAMPLE="ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR039/ERR039477/ERR039477.fastq.gz"
WGET_PATH=`which wget`
rm -f ERR039477.fastq.gz #needed otherwise wget will not overwrite and write to ERR039477.fastq.gz.1 instead
echo ">>> Retrieving data sample: ${DATA_SAMPLE}"
if [ ! -z "$WGET_PATH" ] ; then
echo " using '$WGET_PATH'..."
......@@ -42,6 +43,10 @@ else
fi
fi
echo "size and MD5 of ERR039477.fastq.gz :"
ls -l ERR039477.fastq.gz
md5sum ERR039477.fastq.gz
################################################################################
# we launch minia; note that we use only one thread (no real time issues with
# potential different results)
......