Skip to content
Commits on Source (2)
......@@ -19,7 +19,6 @@
<fileset dir="../../jloda/jars/batik-1.8" includes="*.jar"/>
<fileset dir="../../megan-ce/jars" includes="*.jar"/>
<fileset dir="../../megan-ce/jars/megan6server" includes="*.jar"/>
<fileset dir="../../malt/jars/" includes="*.jar"/>
</path>
<!-- init -->
......@@ -62,7 +61,7 @@
<!-- Builds the archive ${jar} from the class files under ${classDir}; runs after the "compile" target. -->
<!-- NOTE(review): the jar start tag below carries two includes attributes (with and without rusch/**).
     Well-formed XML allows only one; this looks like both sides of a diff. Confirm which one applies. -->
<target name="jar" depends="compile">
<jar jarfile="${jar}"
basedir="${classDir}"
includes="jloda/** megan/** log4j.properties malt/** rusch/**">
includes="jloda/** megan/** log4j.properties malt/**">
</jar>
</target>
......
malt (0.0+20170502-1) UNRELEASED; urgency=medium
malt (0.0+git20180524.5984e06-1) UNRELEASED; urgency=medium
* Initial release (Closes: #<bug>)
* debian/upstream/metadata: Added reference to
OMICtools (Steffen Moeller)
-- Andreas Tille <tille@debian.org> Tue, 11 Oct 2016 20:08:16 +0200
version=4
https://github.com/danielhuson/malt/releases .*/archive/.*(\d[\d.-]+)\.(?:tar(?:\.gz|\.bz2)?|tgz)
opts="mode=git,pretty=0.0+git%cd.%h" \
https://github.com/danielhuson/malt.git HEAD
# https://github.com/danielhuson/malt/issues/1
<?xml version="1.0" encoding="UTF-8"?>
<install4j version="6.1.3" transformSequenceNumber="5">
<install4j version="6.1.6" transformSequenceNumber="5">
<directoryPresets config="../../megan6/jars/data.jar" />
<application name="MALT" distributionSourceDir="" applicationId="3229-5251-7410-5330" mediaDir="../../../builds" mediaFilePattern="${compiler:sys.shortName}_${compiler:sys.platform}_${compiler:sys.version}" compression="6" lzmaCompression="false" pack200Compression="false" excludeSignedFromPacking="true" commonExternalFiles="false" createMd5Sums="true" shrinkRuntime="true" shortName="MALT" publisher="Daniel Huson's Lab, University of Tuebingen" publisherWeb="www-ab.informatik.uni-tuebingen.de" version="0" allPathsRelative="true" backupOnSave="true" autoSave="false" convertDotsToUnderscores="true" macSignature="????" macVolumeId="970a61501b2fa775" javaMinVersion="1.8" javaMaxVersion="" allowBetaVM="true" jdkMode="runtimeJre" jdkName="">
<languages skipLanguageSelection="false" languageSelectionInPrincipalLanguage="false">
......@@ -15,7 +15,7 @@
<variable name="variable" value="" description="" category="" />
</variables>
<mergedProjects />
<codeSigning macEnabled="true" macPkcs12File="../../../../etc/Certificates.p12" windowsEnabled="false" windowsKeySource="pvkAndSpc" windowsPvkFile="" windowsSpcFile="" windowsPkcs12File="" />
<codeSigning macEnabled="true" macPkcs12File="../../../../etc/application.p12" windowsEnabled="false" windowsKeySource="pvkAndSpc" windowsPvkFile="" windowsSpcFile="" windowsPkcs12File="" />
</application>
<files keepModificationTimes="false" missingFilesStrategy="warn" globalExcludeSuffixes=".svn,.CVS,*.psd,*.java" defaultOverwriteMode="4" defaultUninstallMode="0" launcherOverwriteMode="3" defaultFileMode="644" defaultDirMode="755">
<filesets />
......@@ -1461,7 +1461,7 @@ wizardContext.setCancelButtonVisible(false);</preActivation>
<content />
</installerScript>
</unixInstaller>
<macosFolder name="Mac OS X Folder" id="1691242149" customizedId="" mediaFileName="" installDir="MALT" overridePrincipalLanguage="true" jreBitType="all" runPostProcessor="false" postProcessor="" failOnPostProcessorError="false" useLegacyMediaFileIds="false" legacyMediaFileIds="" downloadURL="" includeAllDownloadableComponents="false" includedJRE="macosx-amd64-1.8.0_112" manualJREEntry="false" bundleType="1" jreURL="" jreShared="false" directDownload="false" installOnlyIfNecessary="false" appleJre="false" requiredVmIdPrefix="" customInstallBaseDir="" contentFilesType="2" installerName="${i18n:InstallerName(${compiler:sys.fullName})}" volumeName="${compiler:sys.shortName}" compressDmg="false" signLaunchers="false">
<macosFolder name="Mac OS X Folder" id="1691242149" customizedId="" mediaFileName="" installDir="MALT" overridePrincipalLanguage="true" jreBitType="all" runPostProcessor="false" postProcessor="" failOnPostProcessorError="false" useLegacyMediaFileIds="false" legacyMediaFileIds="" downloadURL="" includeAllDownloadableComponents="false" includedJRE="macosx-amd64-1.8.0_162" manualJREEntry="false" bundleType="1" jreURL="" jreShared="false" directDownload="false" installOnlyIfNecessary="false" appleJre="false" requiredVmIdPrefix="" customInstallBaseDir="" contentFilesType="1" installerName="${i18n:InstallerName(${compiler:sys.fullName})}" volumeName="${compiler:sys.shortName}" compressDmg="false" signLaunchers="false">
<excludedComponents />
<includedDownloadableComponents />
<excludedLaunchers />
......@@ -1475,7 +1475,7 @@ wizardContext.setCancelButtonVisible(false);</preActivation>
</autoUpdate>
<topLevelFiles />
</macosFolder>
<windows name="Windows" id="1691242407" customizedId="" mediaFileName="${compiler:sys.shortName}_${compiler:sys.platform}_${compiler:sys.version}" installDir="Malt" overridePrincipalLanguage="true" jreBitType="64" runPostProcessor="false" postProcessor="" failOnPostProcessorError="false" useLegacyMediaFileIds="false" legacyMediaFileIds="" downloadURL="" includeAllDownloadableComponents="false" includedJRE="windows-amd64-1.8.0_112" manualJREEntry="false" bundleType="1" jreURL="" jreShared="false" directDownload="false" installOnlyIfNecessary="true" customInstallBaseDir="" contentFilesType="1" verifyIntegrity="true">
<windows name="Windows" id="1691242407" customizedId="" mediaFileName="${compiler:sys.shortName}_${compiler:sys.platform}_${compiler:sys.version}" installDir="Malt" overridePrincipalLanguage="true" jreBitType="64" runPostProcessor="false" postProcessor="" failOnPostProcessorError="false" useLegacyMediaFileIds="false" legacyMediaFileIds="" downloadURL="" includeAllDownloadableComponents="false" includedJRE="windows-amd64-1.8.0_162" manualJREEntry="false" bundleType="1" jreURL="" jreShared="false" directDownload="false" installOnlyIfNecessary="true" customInstallBaseDir="" contentFilesType="1" verifyIntegrity="true">
<excludedComponents />
<includedDownloadableComponents />
<excludedLaunchers />
......
/**
* AlignmentEngine.java
* Copyright (C) 2015 Daniel H. Huson
*
* Copyright (C) 2018 Daniel H. Huson
* <p>
* (Some files contain contributions from other authors, who are then mentioned separately.)
*
* <p>
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* <p>
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* <p>
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
......@@ -22,16 +22,14 @@ package malt;
import jloda.util.Basic;
import malt.align.AlignerOptions;
import malt.align.BandedAligner;
import malt.analysis.OrganismsProfile;
import malt.data.*;
import malt.genes.GeneTableAccess;
import malt.io.*;
import malt.mapping.MappingManager;
import malt.util.FixedSizePriorityQueue;
import malt.util.Utilities;
import megan.parsers.blast.BlastMode;
import java.io.IOException;
import java.io.OutputStream;
import java.util.Arrays;
import java.util.BitSet;
import java.util.HashMap;
......@@ -56,12 +54,13 @@ public class AlignmentEngine {
// io:
private final FastAReader fastAReader;
private final MaltOptions.MatchOutputFormat matchOutputFormat;
private final OutputStream organismsOutStream;
private final FileWriterRanked matchesWriter;
private final FileWriterRanked alignedReadsWriter;
private final FileWriterRanked unalignedReadsWriter;
private final RMA6Writer rmaWriter;
private final GeneTableAccess geneTableAccess;
// parameters
private final double minRawScore;
private final double minBitScore;
......@@ -75,14 +74,12 @@ public class AlignmentEngine {
// keep track of all used references:
private final BitSet alignedReferenceIds;
private final OrganismsProfile organismsProfile;
// used for stats:
long countSequencesProcessed;
long countSequencesWithAlignments;
long countSeedMatches;
long countHashSeedMismatches;
long countAlignments;
private long countSequencesProcessed;
private long countSequencesWithAlignments;
private long countSeedMatches;
private long countHashSeedMismatches;
private long countAlignments;
// used in inner loop:
private final FixedSizePriorityQueue<ReadMatch> matchesQueue;
......@@ -99,21 +96,11 @@ public class AlignmentEngine {
/**
* construct an instance of the alignment engine. Each instance is run in a separate thread
*
* @param maltOptions
* @param alignerOptions
* @param referencesDB
* @param tables
* @param fastAReader
* @param matchesWriter
* @param alignedReadsWriter
* @param unalignedReadsWriter
* @throws IOException
*/
public AlignmentEngine(final int threadNumber, final MaltOptions maltOptions, AlignerOptions alignerOptions, final ReferencesDBAccess referencesDB,
AlignmentEngine(final int threadNumber, final MaltOptions maltOptions, AlignerOptions alignerOptions, final ReferencesDBAccess referencesDB,
final ReferencesHashTableAccess[] tables, final FastAReader fastAReader,
final FileWriterRanked matchesWriter, final RMA6Writer rmaWriter, final OutputStream organismsOutStream,
final FileWriterRanked alignedReadsWriter, final FileWriterRanked unalignedReadsWriter) throws IOException {
final FileWriterRanked matchesWriter, final RMA6Writer rmaWriter,
final FileWriterRanked alignedReadsWriter, final FileWriterRanked unalignedReadsWriter, final GeneTableAccess geneTableAccess) throws IOException {
this.threadNumber = threadNumber;
this.maltOptions = maltOptions;
this.referencesDB = referencesDB;
......@@ -122,9 +109,9 @@ public class AlignmentEngine {
this.matchOutputFormat = maltOptions.getMatchOutputFormat();
this.matchesWriter = matchesWriter;
this.rmaWriter = rmaWriter;
this.organismsOutStream = organismsOutStream;
this.alignedReadsWriter = alignedReadsWriter;
this.unalignedReadsWriter = unalignedReadsWriter;
this.geneTableAccess = geneTableAccess;
this.shift = maltOptions.getShift();
......@@ -147,7 +134,6 @@ public class AlignmentEngine {
xDrop = alignerOptions.getUngappedXDrop(maltOptions.getMode());
minUngappedRawScore = alignerOptions.getUngappedMinRawScore(maltOptions.getMode());
// data structures used in inner loop:
matchesQueue = new FixedSizePriorityQueue<>(maltOptions.getMaxAlignmentsPerQuery(), ReadMatch.createComparator());
recycledMatchesArray = new ReadMatch[maltOptions.getMaxAlignmentsPerQuery()];
......@@ -156,19 +142,13 @@ public class AlignmentEngine {
for (int i = 0; i < readMatchesForRefIndex.length; i++)
readMatchesForRefIndex[i] = new ReadMatch();
if (organismsOutStream != null) {
organismsProfile = new OrganismsProfile(MappingManager.getTaxonomyMapping());
organismsProfile.setTopPercent(maltOptions.getTopPercentLCA());
} else
organismsProfile = null;
seedArrays = resizeAndConstructEntries(new SeedMatchArray[0], 1000, maltOptions.getMaxSeedsPerReference());
}
/**
* The main outer loop. Grabs the next input read and determines all possible seed matches. Then calls the inner loop
*/
public void runOuterLoop() {
void runOuterLoop() {
try {
final int maxFramesPerQuery = Utilities.getMaxFramesPerQuery(maltOptions.getMode(), maltOptions.isDoForward(), maltOptions.isDoReverse());
......@@ -220,13 +200,8 @@ public class AlignmentEngine {
/**
* run the inner loop. This tries to extend all found seed matches. If caching is used, first tries to find alignments in cache
*
* @param query
* @param totalSize
* @param dataForInnerLoop
* @throws IOException
*/
public void runInnerLoop(final FastARecord query, final int totalSize, final DataForInnerLoop dataForInnerLoop) throws IOException {
private void runInnerLoop(final FastARecord query, final int totalSize, final DataForInnerLoop dataForInnerLoop) throws IOException {
countSequencesProcessed++;
// if cache active and query found, use the cached matches:
......@@ -370,8 +345,22 @@ public class AlignmentEngine {
}
if (foundPlaceToKeepThisMatch) {
final byte[] referenceHeader;
if (geneTableAccess == null)
referenceHeader = referencesDB.getHeader(refIndex);
else {
int start = aligner.getStartReference();
if (start == -1) {
aligner.computeAlignmentByTraceBack();
start = aligner.getStartReference();
}
int end = aligner.getEndReference();
referenceHeader = geneTableAccess.annotateRefString(Basic.toString(referencesDB.getHeader(refIndex)), refIndex, start, end).getBytes();
// System.err.println(Basic.toString(referenceHeader));
}
byte[] text = null;
byte[] rma3Text = null;
byte[] rma6Text = null;
if (matchesWriter != null) {
switch (matchOutputFormat) {
default:
......@@ -380,21 +369,21 @@ public class AlignmentEngine {
break;
}
case Tab: {
text = aligner.getAlignmentTab(dataForInnerLoop, null, referencesDB.getHeader(refIndex), seedMatch.getRank()); // don't pass queryHeader, it is added below
text = aligner.getAlignmentTab(dataForInnerLoop, null, referenceHeader, seedMatch.getRank()); // don't pass queryHeader, it is added below
break;
}
case SAM: {
rma3Text = text = aligner.getAlignmentSAM(dataForInnerLoop, null, query.getSequence(), referencesDB.getHeader(refIndex), seedMatch.getRank()); // don't pass queryHeader, it is added below
rma6Text = text = aligner.getAlignmentSAM(dataForInnerLoop, null, query.getSequence(), referenceHeader, seedMatch.getRank()); // don't pass queryHeader, it is added below
break;
}
}
}
if (rmaWriter != null && rma3Text == null) {
rma3Text = aligner.getAlignmentSAM(dataForInnerLoop, null, query.getSequence(), referencesDB.getHeader(refIndex), seedMatch.getRank()); // don't pass queryHeader, it is added below
if (rmaWriter != null && rma6Text == null) {
rma6Text = aligner.getAlignmentSAM(dataForInnerLoop, null, query.getSequence(), referenceHeader, seedMatch.getRank()); // don't pass queryHeader, it is added below
}
if (percentIdentity > 0) // need to filter by percent identity. Can't do this earlier because number of matches not known until alignment has been computed
{
if (text == null && rma3Text == null) // haven't computed alignment, so number of matches not yet computed
if (text == null && rma6Text == null) // haven't computed alignment, so number of matches not yet computed
aligner.computeAlignmentByTraceBack(); // compute number of matches
if (aligner.getIdentities() < percentIdentity * aligner.getAlignmentLength()) { // too few identities
if (incrementedNumberOfReadMatchesForRefIndex)
......@@ -402,7 +391,7 @@ public class AlignmentEngine {
continue;
}
}
readMatch.set(aligner.getBitScore(), refIndex, text, rma3Text, aligner.getStartReference(), aligner.getEndReference());
readMatch.set(aligner.getBitScore(), refIndex, text, rma6Text, aligner.getStartReference(), aligner.getEndReference());
}
previous = seedMatch;
}
......@@ -477,10 +466,6 @@ public class AlignmentEngine {
}
}
if (organismsOutStream != null) {
organismsProfile.addRead(Utilities.getFirstWordSkipLeadingGreaterSign(query.getHeader()), numberOfMatches, matchesArray);
}
if (alignedReadsWriter != null) {
alignedReadsWriter.writeByRank(threadNumber, query.getId(), Utilities.getFirstWordEnsureLeadingGreaterSign(query.getHeader()), Utilities.copy0Terminated(query.getSequence()));
}
......@@ -502,10 +487,6 @@ public class AlignmentEngine {
if (rmaWriter != null && maltOptions.isSaveUnalignedToRMA()) {
rmaWriter.processMatches(query.getHeaderString(), query.getSequenceString(), matchesArray, 0);
}
if (organismsOutStream != null) {
organismsProfile.addNoHitsRead();
}
if (alignedReadsWriter != null) {
alignedReadsWriter.skipByRank(threadNumber, query.getId());
}
......@@ -519,18 +500,12 @@ public class AlignmentEngine {
* finish up after outer loop completed
*/
public void finish() {
// organismsProfile is only constructed when an organisms output stream was supplied
// (it is set to null otherwise), so the same null check guards its finalization here
if (organismsOutStream != null) {
organismsProfile.finishAnalysis();
}
}
/**
* compute total sequences processed
*
* @param alignmentEngines
* @return total
*/
public static long getTotalSequencesProcessed(final AlignmentEngine[] alignmentEngines) {
static long getTotalSequencesProcessed(final AlignmentEngine[] alignmentEngines) {
long total = 0;
for (AlignmentEngine alignmentEngine : alignmentEngines) {
total += alignmentEngine.countSequencesProcessed;
......@@ -540,11 +515,8 @@ public class AlignmentEngine {
/**
* compute total with alignments
*
* @param alignmentEngines
* @return total
*/
public static long getTotalSequencesWithAlignments(final AlignmentEngine[] alignmentEngines) {
static long getTotalSequencesWithAlignments(final AlignmentEngine[] alignmentEngines) {
long total = 0;
for (AlignmentEngine alignmentEngine : alignmentEngines) {
total += alignmentEngine.countSequencesWithAlignments;
......@@ -554,11 +526,8 @@ public class AlignmentEngine {
/**
* compute total number of alignments
*
* @param alignmentEngines
* @return total
*/
public static long getTotalAlignments(final AlignmentEngine[] alignmentEngines) {
static long getTotalAlignments(final AlignmentEngine[] alignmentEngines) {
long total = 0;
for (AlignmentEngine alignmentEngine : alignmentEngines) {
total += alignmentEngine.countAlignments;
......@@ -566,22 +535,14 @@ public class AlignmentEngine {
return total;
}
/**
* get the organisms profile collected by this engine
*
* @return the organismsProfile field; this is null when the engine was constructed
* without an organisms output stream
*/
public OrganismsProfile getOrganismsProfile() {
return organismsProfile;
}
/**
* get the ids of all references used by this engine
*
* @return the alignedReferenceIds bit set itself (no copy is made)
*/
// NOTE(review): both a public and a package-private signature line follow; this text looks
// like a flattened diff - confirm which of the two is current
public BitSet getAlignedReferenceIds() {
BitSet getAlignedReferenceIds() {
return alignedReferenceIds;
}
/**
* resize the array of seed match arrays
*
* @param array
* @param newSize
* @return new array
*/
public SeedMatchArray[] resizeAndConstructEntries(SeedMatchArray[] array, int newSize, int maxLength) {
private SeedMatchArray[] resizeAndConstructEntries(SeedMatchArray[] array, int newSize, int maxLength) {
SeedMatchArray[] result = new SeedMatchArray[newSize];
for (int i = array.length; i < newSize; i++)
result[i] = new SeedMatchArray(maxLength);
......@@ -592,7 +553,7 @@ public class AlignmentEngine {
/**
* activate the replicate query cache (query sequence to matches): installs a new
* QuerySequence2MatchesCache in the static querySequence2MatchesCache field and
* reports the cache size on stderr
*
* @param bits passed to the cache constructor; the reported cache size is 1 << bits
*/
// NOTE(review): both a public and a package-private signature line follow; this text looks
// like a flattened diff - confirm which of the two is current
public static void activateReplicateQueryCaching(int bits) {
static void activateReplicateQueryCaching(int bits) {
System.err.println("Using replicate query cache (cache size=" + (1 << bits) + ")");
querySequence2MatchesCache = new QuerySequence2MatchesCache(bits);
}
......@@ -600,7 +561,7 @@ public class AlignmentEngine {
/**
* report on cache usage, if any: delegates to querySequence2MatchesCache.reportStats()
* and does nothing while that field is still null
*/
// NOTE(review): both a public and a package-private signature line follow; this text looks
// like a flattened diff - confirm which of the two is current
public static void reportStats() {
static void reportStats() {
if (querySequence2MatchesCache != null)
querySequence2MatchesCache.reportStats();
}
......@@ -634,8 +595,8 @@ public class AlignmentEngine {
return matches[i];
}
/**
* record a seed match in the next free slot: overwrites the entry at matches[size]
* with the given values and then increments size
*
* @param queryOffset     passed through to the entry's set method
* @param referenceOffset passed through to the entry's set method
* @param rank            passed through to the entry's set method
* @param seedLength      passed through to the entry's set method
*/
// NOTE(review): no capacity check is visible here - presumably callers ensure that
// size < matches.length; confirm
// NOTE(review): two variants follow (one returning the SeedMatch, one void) with a single
// closing brace; this text looks like a flattened diff - confirm which variant is current
public SeedMatch setNext(int queryOffset, int referenceOffset, int rank, int seedLength) {
return matches[size++].set(queryOffset, referenceOffset, rank, seedLength);
void setNext(int queryOffset, int referenceOffset, int rank, int seedLength) {
matches[size++].set(queryOffset, referenceOffset, rank, seedLength);
}
public void clear() {
......
/**
* DataForInnerLoop.java
* Copyright (C) 2015 Daniel H. Huson
*
* Copyright (C) 2018 Daniel H. Huson
* <p>
* (Some files contain contributions from other authors, who are then mentioned separately.)
*
* <p>
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* <p>
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* <p>
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
......
/**
* ITextProducer.java
* Copyright (C) 2015 Daniel H. Huson
*
* Copyright (C) 2018 Daniel H. Huson
* <p>
* (Some files contain contributions from other authors, who are then mentioned separately.)
*
* <p>
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* <p>
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* <p>
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
......
/**
* MaltBuild.java
* Copyright (C) 2015 Daniel H. Huson
*
* Copyright (C) 2018 Daniel H. Huson
* <p>
* (Some files contain contributions from other authors, who are then mentioned separately.)
*
* <p>
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* <p>
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* <p>
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
......@@ -31,8 +31,7 @@ import megan.classification.IdParser;
import java.io.File;
import java.io.IOException;
import java.util.LinkedList;
import java.util.List;
import java.util.*;
/**
* build MALT index
......@@ -82,25 +81,27 @@ public class MaltBuild {
*/
public void run(String[] args) throws Exception {
// parse commandline options:
final ArgsOptions options = new ArgsOptions(args, ProgramProperties.getProgramName(), "MaltBuild", "Builds an index for MALT (MEGAN alignment tool)");
final ArgsOptions options = new ArgsOptions(args, this, "Builds an index for MALT (MEGAN alignment tool)");
options.setAuthors("Daniel H. Huson");
options.setVersion(ProgramProperties.getProgramVersion());
options.setLicense("Copyright (C) 2017 Daniel H. Huson. This program comes with ABSOLUTELY NO WARRANTY.");
options.setLicense("Copyright (C) 2018 Daniel H. Huson. This program comes with ABSOLUTELY NO WARRANTY.");
options.comment("Input:");
final List<String> inputFiles = options.getOptionMandatory("i", "input", "Input reference file(s)", new LinkedList<String>());
final List<String> inputFiles = options.getOptionMandatory("i", "input", "Input reference files in FastA format (or specify a single directory)", new LinkedList<String>());
final SequenceType sequenceType = SequenceType.valueOfIgnoreCase(options.getOptionMandatory("s", "sequenceType", "Sequence type", SequenceType.values(), SequenceType.Protein.toString()));
final List<String> gffFiles = options.getOption("-igff", "inputGFF", "Files that provide CDS annotations of DNA input files in GFF format (or specify a single directory)", new LinkedList<String>());
options.comment("Output:");
final String indexDirectoryName = options.getOptionMandatory("d", "index", "Name of index directory", "");
final String indexDirectoryName = options.getOptionMandatory("-d", "index", "Name of index directory", "");
options.comment("Performance:");
final int numberOfThreads = options.getOption("t", "threads", "Number of worker threads", Runtime.getRuntime().availableProcessors());
final int stepSize = options.getOption("st", "step", "Step size used to advance seed, values greater than 1 reduce index size and sensitivity", 1, 1, 100);
final int numberOfThreads = options.getOption("-t", "threads", "Number of worker threads", Runtime.getRuntime().availableProcessors());
final int stepSize = options.getOption("-st", "step", "Step size used to advance seed; a value greater than 1 will reduce index size, but also sensitivity", 1, 1, 100);
options.comment("Seed:");
String[] shapes = options.getOption("ss", "shapes", "Seed shape(s)", new String[]{"default"});
int maxHitsPerSeed = options.getOption("mh", "maxHitsPerSeed", "Maximum number of hits per seed", 1000);
String[] shapes = options.getOption("-ss", "shapes", "Seed shape(s)", new String[]{"default"});
int maxHitsPerSeed = options.getOption("-mh", "maxHitsPerSeed", "Maximum number of hits per seed", 1000);
final String proteinReduction;
if (sequenceType == SequenceType.Protein || options.isDoHelp())
proteinReduction = options.getOption("-pr", "proteinReduct", "Name or definition of protein alphabet reduction ("
......@@ -108,34 +109,28 @@ public class MaltBuild {
else
proteinReduction = "";
final String[] availableFNames = ClassificationManager.getAllSupportedClassifications().toArray(new String[ClassificationManager.getAllSupportedClassifications().size()]);
options.comment("Classification:");
String[] cNames = options.getOption("-c", "classify", "Classifications (any of " + Basic.toString(availableFNames, " ") + ")", new String[]{Classification.Taxonomy});
for (String cName : cNames) {
if (!ClassificationManager.getAllSupportedClassifications().contains(cName))
throw new UsageException("--classify: Unknown classification: " + cName);
}
final boolean parseTaxonNames = true;
if (options.isDoHelp())
cNames = availableFNames;
final Map<String, String> cName2GIFileName = new HashMap<>();
final Map<String, String> cName2AcessionFileName = new HashMap<>();
final Map<String, String> cName2SynonymsFileName = new HashMap<>();
final boolean parseTaxonNames = true;
final Set<String> classificationsToUse = new TreeSet<>();
final String[] gi2FNames = new String[cNames.length];
final String[] acc2FNames = new String[cNames.length];
final String[] synonyms2FNames = new String[cNames.length];
for (String cName : ClassificationManager.getAllSupportedClassifications()) {
cName2GIFileName.put(cName, options.getOption("-g2" + cName.toLowerCase(), "gi2" + cName.toLowerCase(), "GI-to-" + cName + " mapping file (deprecated)", ""));
cName2AcessionFileName.put(cName, options.getOption("-a2" + cName.toLowerCase(), "acc2" + cName.toLowerCase(), "Accession-to-" + cName + " mapping file", ""));
cName2SynonymsFileName.put(cName, options.getOption("-s2" + cName.toLowerCase(), "syn2" + cName.toLowerCase(), "Synonyms-to-" + cName + " mapping file", ""));
for (int i1 = 0; i1 < cNames.length; i1++) {
String cName = cNames[i1];
gi2FNames[i1] = options.getOption("-g2" + cName.toLowerCase(), "gi2" + cName.toLowerCase(), "GI-to-" + cName + " mapping file", "");
acc2FNames[i1] = options.getOption("-a2" + cName.toLowerCase(), "acc2" + cName.toLowerCase(), "Accession-to-" + cName + " mapping file", "");
synonyms2FNames[i1] = options.getOption("-s2" + cName.toLowerCase(), "syn2" + cName.toLowerCase(), "Synonyms-to-" + cName + " mapping file", "");
if (cName2GIFileName.get(cName).length() > 0 || cName2AcessionFileName.get(cName).length() > 0 || cName2SynonymsFileName.get(cName).length() > 0)
classificationsToUse.add(cName);
if (cName.equalsIgnoreCase(Classification.Taxonomy))
options.getOption("-tn", "parseTaxonNames", "Parse taxon names", true);
}
final String geneTableFile = options.getOption("-gif", "-geneInfoFile", "File containing gene information", "");
final boolean functionalClassification = !options.getOption("-nf", "noFun", "Turn off functional classifications for provided mapping files (set this when using GFF files for DNA references)", false);
options.comment(ArgsOptions.OTHER);
ProgramProperties.put(IdParser.PROPERTIES_FIRST_WORD_IS_ACCESSION, options.getOption("-fwa", "firstWordIsAccession", "First word in reference header is accession number", ProgramProperties.get(IdParser.PROPERTIES_FIRST_WORD_IS_ACCESSION, true)));
......@@ -154,6 +149,36 @@ public class MaltBuild {
if (sequenceType == null)
throw new IOException("Sequence type undefined");
if (inputFiles.size() == 1) {
final File file = new File(inputFiles.get(0));
if (file.isDirectory()) {
System.err.println("Looking for FastA files in directory: " + file);
inputFiles.clear();
for (File aFile : Basic.getAllFilesInDirectory(file, new FastaFileFilter(), true)) {
inputFiles.add(aFile.getPath());
}
if (inputFiles.size() == 0)
throw new IOException("No files found in directory: " + file);
else
System.err.println(String.format("Found: %,d", inputFiles.size()));
}
}
if (gffFiles.size() == 1) {
final File file = new File(gffFiles.get(0));
if (file.isDirectory()) {
System.err.println("Looking for GFF files in directory: " + file);
gffFiles.clear();
for (File aFile : Basic.getAllFilesInDirectory(file, new GFF3FileFilter(), true)) {
gffFiles.add(aFile.getPath());
}
if (gffFiles.size() == 0)
throw new IOException("No GFF files found in directory: " + file);
else
System.err.println(String.format("Found: %,d", gffFiles.size()));
}
}
System.err.println("Reference sequence type set to: " + sequenceType.toString());
final IAlphabet referenceAlphabet;
final IAlphabet seedAlphabet;
......@@ -197,9 +222,10 @@ public class MaltBuild {
// load the reference file:
final ReferencesDBBuilder referencesDB = new ReferencesDBBuilder();
System.err.println(String.format("Number input files: %,12d", inputFiles.size()));
referencesDB.loadFastAFiles(inputFiles, referenceAlphabet);
System.err.println(String.format("Number of sequences:%12d", referencesDB.getNumberOfSequences()));
System.err.println(String.format("Number of letters: %12d", referencesDB.getNumberOfLetters()));
System.err.println(String.format("Number of sequences:%,12d", referencesDB.getNumberOfSequences()));
System.err.println(String.format("Number of letters:%,14d", referencesDB.getNumberOfLetters()));
// generate hash table for each seed shape
if (doBuildTables) {
......@@ -215,32 +241,38 @@ public class MaltBuild {
}
// setup classification support
for (int i = 0; i < cNames.length; i++) {
final String cName = cNames[i];
for (String cName : classificationsToUse) {
final String cNameLowerCase = cName.toLowerCase();
final String sourceName = (cName.equals(Classification.Taxonomy) ? "ncbi" : cNameLowerCase);
ClassificationManager.ensureTreeIsLoaded(cName);
// need these present for MaltRun to know what to classify
Basic.writeStreamToFile(ResourceManager.getFileAsStream(sourceName + ".tre"), new File(indexDirectory, cNameLowerCase + ".tre"));
Basic.writeStreamToFile(ResourceManager.getFileAsStream(sourceName + ".map"), new File(indexDirectory, cNameLowerCase + ".map"));
Utilities.loadMapping(synonyms2FNames[i], IdMapper.MapType.Synonyms, cName);
Utilities.loadMapping(acc2FNames[i], IdMapper.MapType.Accession, cName);
Utilities.loadMapping(gi2FNames[i], IdMapper.MapType.GI, cName);
if (cName2SynonymsFileName.get(cName).length() > 0)
Utilities.loadMapping(cName2SynonymsFileName.get(cName), IdMapper.MapType.Synonyms, cName);
if (cName2AcessionFileName.get(cName).length() > 0)
Utilities.loadMapping(cName2AcessionFileName.get(cName), IdMapper.MapType.Accession, cName);
if (cName2GIFileName.get(cName).length() > 0)
Utilities.loadMapping(cName2GIFileName.get(cName), IdMapper.MapType.GI, cName);
final IdParser idParser = ClassificationManager.get(cName, true).getIdMapper().createIdParser();
if (cName.equals(Classification.Taxonomy))
idParser.setUseTextParsing(parseTaxonNames);
if (functionalClassification || cName.equals(Classification.Taxonomy)) {
final Mapping mapping = Mapping.create(cName, referencesDB, idParser, new ProgressPercentage("Building " + cName + "-mapping..."));
mapping.save(new File(indexDirectory, cNameLowerCase + ".idx"));
}
}
if (doBuildTables) // don't write until after running classification mappers, as they add tags to reference sequences
referencesDB.save(new File(indexDirectory, "ref.idx"), new File(indexDirectory, "ref.db"), new File(indexDirectory, "ref.inf"), saveFirstWordOfReferenceHeaderOnly);
if (geneTableFile.length() > 0) {
if (gffFiles.size() > 0) {
GeneTableBuilder geneTableBuilder = new GeneTableBuilder();
geneTableBuilder.buildAndSaveGeneTable(referencesDB, geneTableFile, new File(indexDirectory, "gene-table.idx"), numberOfThreads);
geneTableBuilder.buildAndSaveAnnotations(referencesDB, gffFiles, new File(indexDirectory, "annotation.idx"), new File(indexDirectory, "annotation.db"), numberOfThreads);
}
}
}
/**
* MaltOptions.java
* Copyright (C) 2015 Daniel H. Huson
*
* Copyright (C) 2018 Daniel H. Huson
* <p>
* (Some files contain contributions from other authors, who are then mentioned separately.)
*
* <p>
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* <p>
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* <p>
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
......@@ -66,7 +66,7 @@ public class MaltOptions {
private boolean gzipUnalignedReads = true;
private boolean useWeightedLCA = false;
private float weightedLCAPercent = 80.0f;
private float lcaCoveragePercent = 80.0f;
private float topPercentLCA = 10;
private float minSupportPercentLCA = 0.001f;
......@@ -87,6 +87,11 @@ public class MaltOptions {
private boolean pairedReads = false;
private String contaminantsFile = "";
private boolean parseHeaders;
/**
* get seed shift step
*
......@@ -311,12 +316,12 @@ public class MaltOptions {
this.useWeightedLCA = useWeightedLCA;
}
public float getWeightedLCAPercent() {
return weightedLCAPercent;
public float getLcaCoveragePercent() {
return lcaCoveragePercent;
}
public void setWeightedLCAPercent(float weightedLCAPercent) {
this.weightedLCAPercent = weightedLCAPercent;
public void setLcaCoveragePercent(float lcaCoveragePercent) {
this.lcaCoveragePercent = lcaCoveragePercent;
}
public boolean isPairedReads() {
......@@ -364,4 +369,20 @@ public class MaltOptions {
return commandLine;
}
/**
 * get the contaminants file
 *
 * @return the stored contaminants file name (the field is initialized to the empty string)
 */
public String getContaminantsFile() {
return contaminantsFile;
}
/**
 * set the contaminants file
 *
 * @param contaminantsFile the contaminants file name to store; stored as given, no validation is done here
 */
public void setContaminantsFile(String contaminantsFile) {
this.contaminantsFile = contaminantsFile;
}
/**
 * get the parse-headers flag
 *
 * @return current value of the parseHeaders flag
 */
public boolean isParseHeaders() {
return parseHeaders;
}
/**
 * set the parse-headers flag
 *
 * @param parseHeaders new value of the parseHeaders flag
 */
public void setParseHeaders(boolean parseHeaders) {
this.parseHeaders = parseHeaders;
}
}
/**
/*
* MaltRun.java
* Copyright (C) 2015 Daniel H. Huson
* Copyright (C) 2018 Daniel H. Huson
*
* (Some files contain contributions from other authors, who are then mentioned separately.)
*
......@@ -24,21 +24,21 @@ import malt.align.AlignerOptions;
import malt.align.BlastStatisticsHelper;
import malt.align.DNAScoringMatrix;
import malt.align.ProteinScoringMatrix;
import malt.analysis.OrganismsProfileMerger;
import malt.data.*;
import malt.genes.GeneTableAccess;
import malt.io.*;
import malt.mapping.MappingManager;
import malt.util.ProfileUtilities;
import malt.util.Utilities;
import megan.classification.Classification;
import megan.classification.ClassificationManager;
import megan.core.Document;
import megan.parsers.blast.BlastMode;
import megan.util.ReadMagnitudeParser;
import javax.xml.bind.JAXBException;
import java.io.*;
import java.io.BufferedWriter;
import java.io.File;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.security.NoSuchAlgorithmException;
import java.security.NoSuchProviderException;
import java.security.spec.InvalidKeySpecException;
......@@ -61,11 +61,7 @@ public class MaltRun {
private long totalAlignments = 0;
/**
* run the MALT program
*
* @param args
* @throws jloda.util.UsageException
* @throws java.io.IOException
* launch the MALT program
*/
public static void main(String[] args) {
try {
......@@ -96,10 +92,6 @@ public class MaltRun {
/**
* run the program
*
* @param args
* @throws jloda.util.UsageException
* @throws java.io.IOException
*/
public void run(final String[] args) throws UsageException, IOException, CanceledException, JAXBException, InvalidKeySpecException, NoSuchAlgorithmException, NoSuchProviderException {
version = Basic.getVersion(this.getClass());
......@@ -107,10 +99,10 @@ public class MaltRun {
final AlignerOptions alignerOptions = new AlignerOptions();
// parse commandline options:
final ArgsOptions options = new ArgsOptions(args, this, ProgramProperties.getProgramName(), "Aligns sequences using MALT (MEGAN alignment tool)");
final ArgsOptions options = new ArgsOptions(args, this, "Aligns sequences using MALT (MEGAN alignment tool)");
options.setAuthors("Daniel H. Huson");
options.setVersion(ProgramProperties.getProgramVersion());
options.setLicense("Copyright (C) 2017 Daniel H. Huson. This program comes with ABSOLUTELY NO WARRANTY.");
options.setLicense("Copyright (C) 2018 Daniel H. Huson. This program comes with ABSOLUTELY NO WARRANTY.");
options.comment("Mode:");
maltOptions.setMode(BlastMode.valueOfIgnoreCase(options.getOptionMandatory("m", "mode", "Program mode", BlastMode.values(), BlastMode.BlastX.toString())));
......@@ -139,15 +131,6 @@ public class MaltRun {
if (maltOptions.getMatchOutputFormat() == MaltOptions.MatchOutputFormat.SAM || options.isDoHelp()) {
maltOptions.setSparseSAM(options.getOption("sps", "sparseSAM", "Produce sparse SAM format (smaller, faster, but only suitable for MEGAN)", maltOptions.isSparseSAM()));
}
final List<String> outputOrganismFileNames;
if (true) // do not allow organisms output
outputOrganismFileNames = new LinkedList<>();
else {
outputOrganismFileNames = options.getOption("oo", "outOrganism", "Organism profile XML output file(s) or directory or STDOUT", new LinkedList<String>());
if (outputOrganismFileNames.size() > 0 || options.isDoHelp()) {
maltOptions.setGzipOrganisms(options.getOption("zo", "gzipOrganism", "Compress organism output using gzip", maltOptions.isGzipOrganisms()));
}
}
final List<String> outputAlignedFileNames = options.getOption("oa", "outAligned", "Aligned reads output file(s) or directory or STDOUT", new LinkedList<String>());
if (outputAlignedFileNames.size() > 0 || options.isDoHelp()) {
maltOptions.setGzipAlignedReads(options.getOption("zal", "gzipAligned", "Compress aligned reads output using gzip", maltOptions.isGzipAlignedReads()));
......@@ -193,13 +176,6 @@ public class MaltRun {
options.comment("LCA parameters:");
final String[] cNames = (options.isDoHelp() ? ClassificationManager.getAllSupportedClassifications().toArray(new String[ClassificationManager.getAllSupportedClassifications().size()]) : MappingManager.determineAvailableMappings(indexDirectory));
if (false) {
for (String cName : cNames) {
final boolean useLCA = options.getOption("-l_" + cName.toLowerCase(), "lca_" + cName.toLowerCase(), "Use LCA for assigning to '" + cName + "' (otherwise 'best-hit')", ProgramProperties.get(cName + "UseLCA", cName.equals(Classification.Taxonomy)));
ProgramProperties.put(cName + "UseLCA", useLCA);
}
}
maltOptions.setTopPercentLCA(options.getOption("top", "topPercent", "Top percent value for LCA algorithm", maltOptions.getTopPercentLCA()));
maltOptions.setMinSupportPercentLCA(options.getOption("supp", "minSupportPercent", "Min support value for LCA algorithm as a percent of assigned reads (0==off)", maltOptions.getMinSupportPercentLCA()));
maltOptions.setMinSupportLCA(options.getOption("sup", "minSupport", "Min support value for LCA algorithm (overrides --minSupportPercent)", 0));
......@@ -216,10 +192,12 @@ public class MaltRun {
maltOptions.setUseWeightedLCA(options.getOption("-wlca", "weightedLCA", "Use the weighted LCA for taxonomic assignment", false));
if (options.isDoHelp() || maltOptions.isUseWeightedLCA())
maltOptions.setWeightedLCAPercent(options.getOption("-wlp", "weightedLCAPercent", "Set the percent weight to cover", Document.DEFAULT_WEIGHTED_LCA_PERCENT));
maltOptions.setLcaCoveragePercent(options.getOption("-lcp", "lcaCoveragePercent", "Set the percent for the LCA to cover", Document.DEFAULT_LCA_COVERAGE_PERCENT));
ReadMagnitudeParser.setEnabled(options.getOption("mag", "magnitudes", "Reads have magnitudes (to be used in taxonomic or functional analysis)", false));
maltOptions.setContaminantsFile(ProgramProperties.getIfEnabled("enable-contaminants", options.getOption("-cf", "conFile", "File of contaminant taxa (one Id or name per line)", "")));
options.comment("Heuristics:");
maltOptions.setMaxSeedsPerOffsetPerFrame(options.getOption("spf", "maxSeedsPerFrame", "Maximum number of seed matches per offset per read frame", maltOptions.getMaxSeedsPerOffsetPerFrame()));
maltOptions.setMaxSeedsPerReference(options.getOption("spr", "maxSeedsPerRef", "Maximum number of seed matches per read and reference", maltOptions.getMaxSeedsPerReference()));
......@@ -273,19 +251,20 @@ public class MaltRun {
throw new UsageException("You must specify at least one input file");
Utilities.checkFileExists(new File(inputFileNames.iterator().next()));
for (String aName : outputRMAFileNames) {
if (outputAlignedFileNames.contains(aName))
throw new UsageException("-a and -o options: Illegal for both to contain the same file name: " + aName);
}
for (String aName : outputAlignedFileNames) {
if (outputRMAFileNames.contains(aName))
throw new UsageException("-a and -o options: Illegal for both to contain the same file name: " + aName);
}
if (!maltOptions.isDoForward() && !maltOptions.isDoReverse())
throw new UsageException("Illegal to specify both --forwardOnly and --reverseOnly");
Utilities.checkFileExists(new File(indexDirectory));
if (outputOrganismFileNames.size() > 0) {
try {
Utilities.checkFileExists(new File(indexDirectory, "gene-table.idx"));
} catch (IOException ex) {
throw new IOException("Specified index does not support '--outOrganisms': " + ex);
}
}
try {
ReferencesHashTableAccess.checkFilesExist(indexDirectory, 0);
} catch (IOException ex) {
......@@ -316,14 +295,15 @@ public class MaltRun {
// table.show();
// load mapping files, if we are going to generate RMA
if ((outputRMAFileNames.size() > 0)) {
if (outputRMAFileNames.size() > 0) {
MappingManager.loadMappings(cNames, indexDirectory);
}
final GeneTableAccess geneTableAccess;
if (outputOrganismFileNames.size() > 0 && (new File(indexDirectory, "gene-table.idx")).exists())
geneTableAccess = new GeneTableAccess(new File(indexDirectory, "gene-table.idx"));
else
if ((new File(indexDirectory, "annotation.idx")).exists()) {
geneTableAccess = new GeneTableAccess(new File(indexDirectory, "annotation.idx"), new File(indexDirectory, "annotation.db"));
maltOptions.setParseHeaders(true);
} else
geneTableAccess = null;
// run alignment for each input file:
......@@ -332,16 +312,15 @@ public class MaltRun {
if (maltOptions.isUseReplicateQueryCaching())
AlignmentEngine.activateReplicateQueryCaching(replicateQueryCacheBits);
for (String inFile : inputFileNames) {
try {
if ((new File(inFile).exists())) {
String rmaOutputFile = getOutputFileName(fileNumber, inputFileNames, outputRMAFileNames, ".rma6", false);
String matchesOutputFile = getOutputFileName(fileNumber, inputFileNames, outputMatchesFileNames, maltOptions.getMatchesOutputSuffix(), maltOptions.isGzipMatches());
String organismProfileOutputFile = getOutputFileName(fileNumber, inputFileNames, outputOrganismFileNames, "-organisms.xml", maltOptions.isGzipOrganisms());
String alignedReadsOutputFile = getOutputFileName(fileNumber, inputFileNames, outputAlignedFileNames, "-aligned.fna", maltOptions.isGzipAlignedReads());
String unalignedReadsOutputFile = getOutputFileName(fileNumber, inputFileNames, outputUnAlignedFileNames, "-unaligned.fna", maltOptions.isGzipUnalignedReads());
launchAlignmentThreads(alignerOptions, maltOptions, inFile, rmaOutputFile, matchesOutputFile, organismProfileOutputFile,
launchAlignmentThreads(alignerOptions, maltOptions, inFile, rmaOutputFile, matchesOutputFile,
alignedReadsOutputFile, unalignedReadsOutputFile, referencesDB, hashTables, geneTableAccess);
} else {
System.err.println("File not found: '" + inFile + "', skipped");
......@@ -372,15 +351,9 @@ public class MaltRun {
/**
* run search on file of input sequences
*
* @param maltOptions
* @param infile
* @param tables
* @throws jloda.util.CanceledException
* @throws java.io.IOException
*/
public void launchAlignmentThreads(final AlignerOptions alignerOptions, final MaltOptions maltOptions, final String infile, final String rmaOutputFile,
final String matchesOutputFile, final String organismProfileOutputFile,
private void launchAlignmentThreads(final AlignerOptions alignerOptions, final MaltOptions maltOptions, final String infile, final String rmaOutputFile,
final String matchesOutputFile,
final String alignedReadsOutputFile, final String unalignedReadsOutputFile,
final ReferencesDBAccess referencesDB, final ReferencesHashTableAccess[] tables,
final GeneTableAccess geneTableAccess) throws IOException, JAXBException {
......@@ -405,8 +378,6 @@ public class MaltRun {
final FileWriterRanked alignedReadsWriter = (alignedReadsOutputFile != null ? new FileWriterRanked(alignedReadsOutputFile, maltOptions.getNumberOfThreads(), 1) : null);
final FileWriterRanked unalignedReadsWriter = (unalignedReadsOutputFile != null ? new FileWriterRanked(unalignedReadsOutputFile, maltOptions.getNumberOfThreads(), 1) : null);
final OutputStream organismOutStream = (organismProfileOutputFile != null ? new BufferedOutputStream(new FileOutputStream(organismProfileOutputFile)) : null);
if (matchesWriter == null && rmaWriter == null && alignedReadsWriter == null && unalignedReadsWriter == null)
System.err.println("Warning: no output specified");
......@@ -427,7 +398,7 @@ public class MaltRun {
public void run() {
try {
alignmentEngines[threadNumber] = new AlignmentEngine(threadNumber, maltOptions, alignerOptions, referencesDB, tables, fastAReader,
matchesWriter, rmaWriter, organismOutStream, alignedReadsWriter, unalignedReadsWriter);
matchesWriter, rmaWriter, alignedReadsWriter, unalignedReadsWriter, geneTableAccess);
alignmentEngines[threadNumber].runOuterLoop();
alignmentEngines[threadNumber].finish();
} catch (Exception ex) {
......@@ -456,7 +427,7 @@ public class MaltRun {
System.err.println("Alignments written to file: " + matchesOutputFileUsed);
}
if (rmaWriter != null) {
rmaWriter.close();
rmaWriter.close(maltOptions.getContaminantsFile());
System.err.println("Analysis written to file: " + rmaOutputFile);
}
......@@ -502,13 +473,6 @@ public class MaltRun {
System.err.println("Deleted temporary file: " + matchesOutputFileUsed);
}
if (organismOutStream != null) {
OrganismsProfileMerger organismsProfileMerger = new OrganismsProfileMerger(MappingManager.getTaxonomyMapping(), geneTableAccess);
organismsProfileMerger.setName(Basic.getFileBaseName(Basic.getFileNameWithoutPath(infile)));
organismsProfileMerger.mergeAndCompute(ProfileUtilities.getOrganismsProfiles(alignmentEngines));
organismsProfileMerger.write(organismOutStream);
organismOutStream.close();
}
if (alignedReadsWriter != null) {
// merge all thread-specific taxon profiles. This can be quite major computation...
alignedReadsWriter.close();
......@@ -534,13 +498,6 @@ public class MaltRun {
/**
* creates the output file name
*
* @param fileNumber
* @param inFiles
* @param outFiles
* @param suffix
* @return
* @throws IOException
*/
private String getOutputFileName(final int fileNumber, final List<String> inFiles, final List<String> outFiles, final String suffix, final boolean gzip) throws IOException {
if (outFiles.size() == 0)
......
/**
* TestIO.java
* Copyright (C) 2015 Daniel H. Huson
*
* Copyright (C) 2018 Daniel H. Huson
* <p>
* (Some files contain contributions from other authors, who are then mentioned separately.)
*
* <p>
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* <p>
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* <p>
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
......
/**
* Version.java
* Copyright (C) 2015 Daniel H. Huson
*
* Copyright (C) 2018 Daniel H. Huson
* <p>
* (Some files contain contributions from other authors, who are then mentioned separately.)
*
* <p>
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* <p>
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* <p>
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
......@@ -25,5 +25,5 @@ package malt;
*/
public class Version {
public static final String NAME = "MALT";
public static final String SHORT_DESCRIPTION = "MALT (version 0.3.8, built 2 Jun 2016)";
public static final String SHORT_DESCRIPTION = "MALT (version 0.4.0, built 6 Sep 2017)";
}
/**
* AlignerOptions.java
* Copyright (C) 2015 Daniel H. Huson
*
* Copyright (C) 2018 Daniel H. Huson
* <p>
* (Some files contain contributions from other authors, who are then mentioned separately.)
*
* <p>
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* <p>
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* <p>
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
......
/**
* BandedAligner.java
* Copyright (C) 2015 Daniel H. Huson
*
* Copyright (C) 2018 Daniel H. Huson
* <p>
* (Some files contain contributions from other authors, who are then mentioned separately.)
*
* <p>
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* <p>
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* <p>
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
......@@ -1315,8 +1315,10 @@ public class BandedAligner {
alignmentBuffer.write(queryTrack, 0, length);
alignmentBuffer.write('\t');
}
int length = Utilities.getFirstWordSkipLeadingGreaterSign(referenceHeader, queryTrack);
{
int length = Utilities.getFirstWordSkipLeadingGreaterSign(referenceHeader, queryTrack); // misuse query track
alignmentBuffer.write(queryTrack, 0, length);
}
alignmentBuffer.write('\t');
if (getExpected() == 0)
alignmentBuffer.writeAsAscii(String.format("%.1f\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t0.0\t%d", ((100.0 * getIdentities()) / getAlignmentLength()), getAlignmentLength(),
......
/**
* BlastStatisticsHelper.java
* Copyright (C) 2015 Daniel H. Huson
*
* Copyright (C) 2018 Daniel H. Huson
* <p>
* (Some files contain contributions from other authors, who are then mentioned separately.)
*
* <p>
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* <p>
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* <p>
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
......@@ -25,7 +25,7 @@ import jloda.util.Pair;
import java.io.IOException;
/**
* DESCRIPTION
* blast statistics helper
* Daniel Huson, 8.2014
*/
public class BlastStatisticsHelper {
......
/**
* DNAScoringMatrix.java
* Copyright (C) 2015 Daniel H. Huson
*
* Copyright (C) 2018 Daniel H. Huson
* <p>
* (Some files contain contributions from other authors, who are then mentioned separately.)
*
* <p>
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* <p>
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* <p>
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
......
/**
* IScoringMatrix.java
* Copyright (C) 2015 Daniel H. Huson
*
* Copyright (C) 2018 Daniel H. Huson
* <p>
* (Some files contain contributions from other authors, who are then mentioned separately.)
*
* <p>
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* <p>
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* <p>
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
......
/**
* ProteinScoringMatrix.java
* Copyright (C) 2015 Daniel H. Huson
*
* Copyright (C) 2018 Daniel H. Huson
* <p>
* (Some files contain contributions from other authors, who are then mentioned separately.)
*
* <p>
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* <p>
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* <p>
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
......
/*
* Copyright (C) 2015 Daniel H. Huson
* Copyright (C) 2018 Daniel H. Huson
*
* (Some files contain contributions from other authors, who are then mentioned separately.)
*
......
/**
* OrganismsProfile.java
* Copyright (C) 2015 Daniel H. Huson
*
* (Some files contain contributions from other authors, who are then mentioned separately.)
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package malt.analysis;
import malt.data.ReadMatch;
import malt.mapping.Mapping;
import malt.util.Utilities;
import megan.algorithms.LCAAlgorithm;
import java.util.*;
/**
 * An organism profile reports organisms, contained genes and contained reads.
 * The weighted LCA is used to determine organisms. For each read, genes are ranked by reference weight.
 * Daniel Huson, 8.2014
 */
public class OrganismsProfile {
    protected final Mapping taxonMapping;
    protected final Map<Integer, Integer> refIndex2weight = new HashMap<>(100000);
    protected final QueryItem head = new QueryItem(); // head of query item list (sentinel, carries no read)
    protected QueryItem tail = head; // tail of query item list
    protected String name;
    protected int totalReads;

    private final Set<Integer> refIdAlreadySeenInAddRead = new HashSet<>(2000, 0.9f);
    private final LCAAlgorithm lcaAlgorithm = new LCAAlgorithm();

    // scratch buffers reused by every call of addRead; both are always resized together
    protected int[] taxIds = new int[1000];
    private int[] refIds = new int[1000];

    protected double topPercentFactor = 0.9;

    /**
     * constructor
     *
     * @param taxonMapping mapping used to look up the taxon id of a reference id
     */
    public OrganismsProfile(final Mapping taxonMapping) {
        this.taxonMapping = taxonMapping;
    }

    /**
     * add a read to the organism profile
     *
     * @param queryHeader     header of the read; the value returned by
     *                        Utilities.getFirstWordSkipLeadingGreaterSign for it is stored as the query name
     * @param numberOfMatches number of entries of readMatches to consider
     * @param readMatches     matches of the read; expected in descending order of bit score,
     *                        the first entry is taken as the top score
     */
    public void addRead(final byte[] queryHeader, final int numberOfMatches, final ReadMatch[] readMatches) {
        final byte[] queryName = Utilities.getFirstWordSkipLeadingGreaterSign(queryHeader);

        // increment reference weights using naive LCA algorithm
        if (numberOfMatches == 0) { // no hits
            addNoHitsRead();
            return;
        }

        if (numberOfMatches == 1) { // exactly one hit, will use this
            incrementReferenceWeight(readMatches[0].getReferenceId());
            appendQueryItem(queryName, numberOfMatches, readMatches);
            return;
        }

        // more than one hit.
        // For each read, we store the set of references matched and after processing all reads in this way we
        // then apply the weighted LCA to all such sets of references
        if (refIds.length < numberOfMatches) { // resize if necessary (old contents are scratch data, not copied)
            final int newSize = Math.max(2 * refIds.length, numberOfMatches);
            refIds = new int[newSize];
            taxIds = new int[newSize];
        }

        final double topScore = readMatches[0].getBitScore();
        final double minScore = Math.min(topScore, topPercentFactor * topScore);

        int numberOfMatchesToUse = 0;
        refIdAlreadySeenInAddRead.clear();

        for (int i = 0; i < numberOfMatches; i++) { // consider all matches in descending order of bit score
            final ReadMatch match = readMatches[i];
            if (match.getBitScore() < minScore)
                break;
            final int refId = match.getReferenceId();
            final int taxId = taxonMapping.get(refId);
            // Set.add returns false for a reference already seen: don't use more than one match to the same reference
            if (taxId > 0 && numberOfMatchesToUse < refIds.length && refIdAlreadySeenInAddRead.add(refId)) {
                taxIds[numberOfMatchesToUse] = taxId;
                refIds[numberOfMatchesToUse++] = refId;
            }
        }

        // NOTE(review): in the branches below the QueryItem receives the full readMatches array together with
        // numberOfMatchesToUse; when matches were skipped above, the used matches are not necessarily the
        // first numberOfMatchesToUse entries -- confirm that this is what QueryItem expects
        if (numberOfMatchesToUse == 0) {
            addNoHitsRead(); // should never happen...
        } else if (numberOfMatchesToUse == 1) { // only has one good match, increment reference weight
            incrementReferenceWeight(refIds[0]);
            appendQueryItem(queryName, numberOfMatchesToUse, readMatches);
        } else { // compute naive LCA. Then increment weight for any reference whose taxon matches the LCA
            final int lca = lcaAlgorithm.computeNaiveLCA(taxIds, numberOfMatchesToUse);
            if (lca > 0) {
                for (int i = 0; i < numberOfMatchesToUse; i++) {
                    if (taxIds[i] == lca)
                        incrementReferenceWeight(refIds[i]);
                }
                appendQueryItem(queryName, numberOfMatchesToUse, readMatches);
            } else
                addNoHitsRead();
        }
    }

    /**
     * increments the weight of a reference sequence by one, starting at one for an unseen reference
     *
     * @param refId reference id
     */
    private void incrementReferenceWeight(final int refId) {
        final Integer weight = refIndex2weight.get(refId);
        refIndex2weight.put(refId, weight == null ? 1 : weight + 1);
    }

    /**
     * counts the read and appends a new query item to the end of the query item list
     *
     * @param queryName       name of the read
     * @param numberOfMatches number of matches passed on to the query item
     * @param readMatches     matches passed on to the query item
     */
    private void appendQueryItem(final byte[] queryName, final int numberOfMatches, final ReadMatch[] readMatches) {
        totalReads++;
        tail.next = new QueryItem(queryName, numberOfMatches, readMatches);
        tail = tail.next;
    }

    /**
     * skip a read: it is counted in the total number of reads, but no query item is stored for it
     */
    public void addNoHitsRead() {
        totalReads++;
    }

    /**
     * returns an iterator over all query items, in the order in which they were added
     *
     * @return query item iterator
     */
    public Iterator<QueryItem> iterator() {
        return new Iterator<QueryItem>() {
            private QueryItem item = head.next; // head itself is a sentinel and is skipped

            public boolean hasNext() {
                return item != null;
            }

            public QueryItem next() {
                // honor the Iterator contract instead of failing with a NullPointerException
                if (item == null)
                    throw new NoSuchElementException();
                final QueryItem result = item;
                item = item.next;
                return result;
            }

            public void remove() {
                // removal is not supported; silently ignored so that existing callers are unaffected
            }
        };
    }

    /**
     * get the top percent value
     *
     * @return top percent, derived from the stored factor as 100 * (1 - factor)
     */
    public double getTopPercent() {
        return 100 * (1.0 - topPercentFactor);
    }

    /**
     * set the top percent value; it is stored as the factor 1 - topPercent / 100
     *
     * @param topPercent top percent
     */
    public void setTopPercent(double topPercent) {
        this.topPercentFactor = 1.0 - topPercent / 100.0;
    }

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    /**
     * get the total number of reads added, including reads added as no-hits
     *
     * @return total number of reads
     */
    public long getTotalReads() {
        return totalReads;
    }

    protected LCAAlgorithm getLcaAlgorithm() {
        return lcaAlgorithm;
    }

    /**
     * finish the analysis. Does nothing in this class
     */
    public void finishAnalysis() {
    }
}