Skip to content
Commits on Source (8)
malt (0.0+git20180524.5984e06-1) UNRELEASED; urgency=medium
malt (0.0+git20180707.42b5cfd-1) UNRELEASED; urgency=medium
* Initial release (Closes: #<bug>)
......
......@@ -3,14 +3,16 @@ Maintainer: Debian Med Packaging Team <debian-med-packaging@lists.alioth.debian.
Uploaders: Andreas Tille <tille@debian.org>
Section: science
Priority: optional
Build-Depends: debhelper (>= 10),
Build-Depends: debhelper (>= 11~),
javahelper,
default-jdk,
ant,
libjloda-java (>= 0.0+20170502)
Standards-Version: 3.9.8
Vcs-Browser: https://anonscm.debian.org/cgit/debian-med/malt.git
Vcs-Git: https://anonscm.debian.org/git/debian-med/malt.git
libjloda-java (>= 0.0+git20180523),
libopenjfx-java,
libcontrolsfx-java
Standards-Version: 4.1.4
Vcs-Browser: https://salsa.debian.org/med-team/malt
Vcs-Git: https://salsa.debian.org/med-team/malt.git
Homepage: https://github.com/danielhuson/malt
Package: malt
......
#!/bin/sh -e
COMPRESS=xz
NAME=`dpkg-parsechangelog | awk '/^Source/ { print $2 }'`
MVERSION=`dpkg-parsechangelog | awk '/^Version:/ { print $2 }' | sed 's/^\([0-9\.]\+\)[+~][-0-9]\+$/\1/'`
mkdir -p ../tarballs
cd ../tarballs
# need to clean up the tarballs dir first because upstream tarball might
# contain a directory with unpredictable name
rm -rf *
git clone --quiet https://github.com/danielhuson/malt
cd $NAME
VERSION=${MVERSION}+`date -d @$(git show --format="%at" | head -n1) +%Y%m%d`
# for esthetical reasons set file timestamps (if git-restore-mtime is installed)
git restore-mtime || true
cd ..
TARDIR=${NAME}-${VERSION}
mv ${NAME} ${TARDIR}
rm -rf ${TARDIR}/.git
rm -f ${TARDIR}/jars/*
GZIP="--best --no-name" tar --owner=root --group=root --mode=a+rX -caf "$NAME"_"$VERSION".orig.tar.${COMPRESS} "${TARDIR}"
rm -rf ${TARDIR}
......@@ -13,11 +13,3 @@ Description: MALT source is just here ...
<property name="srcDir" value="src"/>
<property name="classDir" value="classes"/>
<property name="jar" value="${project_name}.jar"/>
@@ -14,7 +14,6 @@
<path id="build.classpath">
<fileset dir="/usr/share/java/" includes="jloda.jar"/>
- <fileset dir="../../malt/jars/" includes="*.jar"/>
</path>
<!-- init -->
......@@ -22,10 +22,10 @@ Description: No idea whether megan-ce is needed to build MALT - assume
<fileset dir="/usr/share/java/" includes="jloda.jar"/>
- <fileset dir="../../megan-ce/jars" includes="*.jar"/>
- <fileset dir="../../megan-ce/jars/megan6server" includes="*.jar"/>
<fileset dir="../../malt/jars/" includes="*.jar"/>
</path>
@@ -37,7 +34,6 @@
<!-- init -->
@@ -36,7 +33,6 @@
<!-- copy sources -->
<target name="copy_sources" depends="copy_resources">
<copy todir="${srcDir}">
......
......@@ -22,8 +22,8 @@ Description: There is no point in copying jloda source since the packaged
+ <fileset dir="/usr/share/java/" includes="jloda.jar"/>
<fileset dir="../../megan-ce/jars" includes="*.jar"/>
<fileset dir="../../megan-ce/jars/megan6server" includes="*.jar"/>
<fileset dir="../../malt/jars/" includes="*.jar"/>
@@ -39,7 +37,6 @@
</path>
@@ -38,7 +36,6 @@
<!-- copy sources -->
<target name="copy_sources" depends="copy_resources">
<copy todir="${srcDir}">
......
......@@ -24,7 +24,6 @@ override_dh_clean:
rm -rf antbuild/classes antbuild/src
override_dh_auto_build:
# cp -a debian/megan src
ant -buildfile antbuild/build.xml jar
get-orig-source:
. debian/get-orig-source
# rm -rf src/megan
......@@ -23,7 +23,7 @@ import jloda.util.Basic;
import malt.align.AlignerOptions;
import malt.align.BandedAligner;
import malt.data.*;
import malt.genes.GeneTableAccess;
import malt.genes.GeneItemAccessor;
import malt.io.*;
import malt.util.FixedSizePriorityQueue;
import malt.util.Utilities;
......@@ -59,7 +59,7 @@ public class AlignmentEngine {
private final FileWriterRanked unalignedReadsWriter;
private final RMA6Writer rmaWriter;
private final GeneTableAccess geneTableAccess;
private final GeneItemAccessor geneTableAccess;
// parameters
private final double minRawScore;
......@@ -100,7 +100,7 @@ public class AlignmentEngine {
AlignmentEngine(final int threadNumber, final MaltOptions maltOptions, AlignerOptions alignerOptions, final ReferencesDBAccess referencesDB,
final ReferencesHashTableAccess[] tables, final FastAReader fastAReader,
final FileWriterRanked matchesWriter, final RMA6Writer rmaWriter,
final FileWriterRanked alignedReadsWriter, final FileWriterRanked unalignedReadsWriter, final GeneTableAccess geneTableAccess) throws IOException {
final FileWriterRanked alignedReadsWriter, final FileWriterRanked unalignedReadsWriter, final GeneItemAccessor geneTableAccess) throws IOException {
this.threadNumber = threadNumber;
this.maltOptions = maltOptions;
this.referencesDB = referencesDB;
......
......@@ -21,13 +21,16 @@ package malt;
import jloda.util.*;
import malt.data.*;
import malt.genes.GeneTableBuilder;
import malt.genes.GeneItem;
import malt.genes.GeneItemCreator;
import malt.mapping.Mapping;
import malt.util.Utilities;
import megan.classification.Classification;
import megan.classification.ClassificationManager;
import megan.classification.IdMapper;
import megan.classification.IdParser;
import megan.tools.AAdderBuild;
import megan.util.interval.Interval;
import java.io.File;
import java.io.IOException;
......@@ -38,6 +41,8 @@ import java.util.*;
* Daniel Huson, 8.2014
*/
public class MaltBuild {
final public static String INDEX_CREATOR = "MALT";
/**
* run the program
*
......@@ -110,24 +115,30 @@ public class MaltBuild {
proteinReduction = "";
options.comment("Classification:");
final boolean parseTaxonNames = true;
final boolean parseTaxonNames = options.getOption("-tn", "parseTaxonNames", "Parse taxon names", true);
final String gi2TaxaFile = options.getOption("-g2t", "gi2taxa", "GI-to-Taxonomy mapping file", "");
final String acc2TaxaFile = options.getOption("-a2t", "acc2taxa", "Accession-to-Taxonomy mapping file", "");
final String synonyms2TaxaFile = options.getOption("-s2t", "syn2taxa", "Synonyms-to-Taxonomy mapping file", "");
final Map<String, String> cName2GIFileName = new HashMap<>();
final Map<String, String> cName2AcessionFileName = new HashMap<>();
final Map<String, String> cName2SynonymsFileName = new HashMap<>();
final HashMap<String, String> class2GIFile = new HashMap<>();
final HashMap<String, String> class2AccessionFile = new HashMap<>();
final HashMap<String, String> class2SynonymsFile = new HashMap<>();
final Set<String> classificationsToUse = new TreeSet<>();
for (String cName : ClassificationManager.getAllSupportedClassifications()) {
cName2GIFileName.put(cName, options.getOption("-g2" + cName.toLowerCase(), "gi2" + cName.toLowerCase(), "GI-to-" + cName + " mapping file (deprecated)", ""));
cName2AcessionFileName.put(cName, options.getOption("-a2" + cName.toLowerCase(), "acc2" + cName.toLowerCase(), "Accession-to-" + cName + " mapping file", ""));
cName2SynonymsFileName.put(cName, options.getOption("-s2" + cName.toLowerCase(), "syn2" + cName.toLowerCase(), "Synonyms-to-" + cName + " mapping file", ""));
if (cName2GIFileName.get(cName).length() > 0 || cName2AcessionFileName.get(cName).length() > 0 || cName2SynonymsFileName.get(cName).length() > 0)
for (String cName : ClassificationManager.getAllSupportedClassificationsExcludingNCBITaxonomy()) {
class2GIFile.put(cName, options.getOption("-g2" + cName.toLowerCase(), "gi2" + cName.toLowerCase(), "GI-to-" + cName + " mapping file", ""));
class2AccessionFile.put(cName, options.getOption("-a2" + cName.toLowerCase(), "acc2" + cName.toLowerCase(), "Accession-to-" + cName + " mapping file", ""));
class2SynonymsFile.put(cName, options.getOption("-s2" + cName.toLowerCase(), "syn2" + cName.toLowerCase(), "Synonyms-to-" + cName + " mapping file", ""));
final String tags = options.getOption("-t4" + cName.toLowerCase(), "tags4" + cName.toLowerCase(), "Tags for " + cName + " id parsing (must set to activate id parsing)", "").trim();
if (tags.length() > 0)
ProgramProperties.put(cName + "Tags", tags);
ProgramProperties.put(cName + "ParseIds", tags.length() > 0);
// final boolean useLCA = options.getOption("-l_" + cName.toLowerCase(), "lca" + cName.toLowerCase(), "Use LCA for assigning to '" + cName + "', alternative: best hit", ProgramProperties.get(cName + "UseLCA", cName.equals(Classification.Taxonomy)));
// ProgramProperties.put(cName + "UseLCA", useLCA);
if (class2GIFile.get(cName).length() > 0 || class2AccessionFile.get(cName).length() > 0 || class2AccessionFile.get(cName).length() > 0)
classificationsToUse.add(cName);
if (cName.equalsIgnoreCase(Classification.Taxonomy))
options.getOption("-tn", "parseTaxonNames", "Parse taxon names", true);
}
final boolean functionalClassification = !options.getOption("-nf", "noFun", "Turn off functional classifications for provided mapping files (set this when using GFF files for DNA references)", false);
......@@ -139,10 +150,11 @@ public class MaltBuild {
final boolean saveFirstWordOfReferenceHeaderOnly = options.getOption("-fwo", "firstWordOnly", "Save only first word of reference header", false);
final int randomSeed = options.getOption("rns", "random", "Random number generator seed", 666);
final float hashTableLoadFactor = options.getOption("hsf", "hashScaleFactor", "Hash table scale factor", 0.9f, 0.1f, 1.0f);
//final boolean buildTableInMemory = options.getOption("btm", "buildTableInMemory", "Build the hash table in memory and then save (more memory, much faster)", true);
final boolean buildTableInMemory = true; // don't make this an option because it is really slow...
final boolean buildTableInMemory = options.getOption("btm", "buildTableInMemory", "Build the hash table in memory and then save (uses more memory, is much faster)", true);
final boolean doBuildTables = !options.getOption("!xX", "xSkipTable", "Don't recompute index and tables, just compute profile support", false);
final boolean lookInside = options.getOption("-ex", "extraStrict", "When given an input directory, look inside every GFF file to check that it is indeed in GFF3 format", false);
options.done();
Basic.setDebugMode(options.isVerbose());
......@@ -164,20 +176,7 @@ public class MaltBuild {
}
}
if (gffFiles.size() == 1) {
final File file = new File(gffFiles.get(0));
if (file.isDirectory()) {
System.err.println("Looking for GFF files in directory: " + file);
gffFiles.clear();
for (File aFile : Basic.getAllFilesInDirectory(file, new GFF3FileFilter(), true)) {
gffFiles.add(aFile.getPath());
}
if (gffFiles.size() == 0)
throw new IOException("No GFF files found in directory: " + file);
else
System.err.println(String.format("Found: %,d", gffFiles.size()));
}
}
AAdderBuild.setupGFFFiles(gffFiles, lookInside);
System.err.println("Reference sequence type set to: " + sequenceType.toString());
final IAlphabet referenceAlphabet;
......@@ -240,7 +239,9 @@ public class MaltBuild {
}
}
// setup classification support
if (gi2TaxaFile.length() > 0 || acc2TaxaFile.length() > 0 || synonyms2TaxaFile.length() > 0)
classificationsToUse.add(Classification.Taxonomy);
for (String cName : classificationsToUse) {
final String cNameLowerCase = cName.toLowerCase();
final String sourceName = (cName.equals(Classification.Taxonomy) ? "ncbi" : cNameLowerCase);
......@@ -250,12 +251,12 @@ public class MaltBuild {
Basic.writeStreamToFile(ResourceManager.getFileAsStream(sourceName + ".tre"), new File(indexDirectory, cNameLowerCase + ".tre"));
Basic.writeStreamToFile(ResourceManager.getFileAsStream(sourceName + ".map"), new File(indexDirectory, cNameLowerCase + ".map"));
if (cName2SynonymsFileName.get(cName).length() > 0)
Utilities.loadMapping(cName2SynonymsFileName.get(cName), IdMapper.MapType.Synonyms, cName);
if (cName2AcessionFileName.get(cName).length() > 0)
Utilities.loadMapping(cName2AcessionFileName.get(cName), IdMapper.MapType.Accession, cName);
if (cName2GIFileName.get(cName).length() > 0)
Utilities.loadMapping(cName2GIFileName.get(cName), IdMapper.MapType.GI, cName);
if (class2SynonymsFile.get(cName) != null)
Utilities.loadMapping(class2SynonymsFile.get(cName), IdMapper.MapType.Synonyms, cName);
if (class2AccessionFile.get(cName) != null)
Utilities.loadMapping(class2AccessionFile.get(cName), IdMapper.MapType.Accession, cName);
if (class2GIFile.get(cName) != null)
Utilities.loadMapping(class2GIFile.get(cName), IdMapper.MapType.GI, cName);
final IdParser idParser = ClassificationManager.get(cName, true).getIdMapper().createIdParser();
if (cName.equals(Classification.Taxonomy))
......@@ -271,8 +272,13 @@ public class MaltBuild {
referencesDB.save(new File(indexDirectory, "ref.idx"), new File(indexDirectory, "ref.db"), new File(indexDirectory, "ref.inf"), saveFirstWordOfReferenceHeaderOnly);
if (gffFiles.size() > 0) {
GeneTableBuilder geneTableBuilder = new GeneTableBuilder();
geneTableBuilder.buildAndSaveAnnotations(referencesDB, gffFiles, new File(indexDirectory, "annotation.idx"), new File(indexDirectory, "annotation.db"), numberOfThreads);
// setup gene item creator, in particular accession mapping
final GeneItemCreator creator = AAdderBuild.setupCreator(null, class2AccessionFile);
// obtains the gene annotations:
Map<String, ArrayList<Interval<GeneItem>>> dnaId2list = AAdderBuild.computeAnnotations(creator, gffFiles);
AAdderBuild.saveIndex(INDEX_CREATOR, creator, indexDirectory.getPath(), dnaId2list, referencesDB.refNames());
}
}
}
......@@ -25,7 +25,7 @@ import malt.align.BlastStatisticsHelper;
import malt.align.DNAScoringMatrix;
import malt.align.ProteinScoringMatrix;
import malt.data.*;
import malt.genes.GeneTableAccess;
import malt.genes.GeneItemAccessor;
import malt.io.*;
import malt.mapping.MappingManager;
import malt.util.Utilities;
......@@ -196,7 +196,7 @@ public class MaltRun {
ReadMagnitudeParser.setEnabled(options.getOption("mag", "magnitudes", "Reads have magnitudes (to be used in taxonomic or functional analysis)", false));
maltOptions.setContaminantsFile(ProgramProperties.getIfEnabled("enable-contaminants", options.getOption("-cf", "conFile", "File of contaminant taxa (one Id or name per line)", "")));
maltOptions.setContaminantsFile(options.getOption("-cf", "conFile", "File of contaminant taxa (one Id or name per line)", ""));
options.comment("Heuristics:");
maltOptions.setMaxSeedsPerOffsetPerFrame(options.getOption("spf", "maxSeedsPerFrame", "Maximum number of seed matches per offset per read frame", maltOptions.getMaxSeedsPerOffsetPerFrame()));
......@@ -299,9 +299,9 @@ public class MaltRun {
MappingManager.loadMappings(cNames, indexDirectory);
}
final GeneTableAccess geneTableAccess;
if ((new File(indexDirectory, "annotation.idx")).exists()) {
geneTableAccess = new GeneTableAccess(new File(indexDirectory, "annotation.idx"), new File(indexDirectory, "annotation.db"));
final GeneItemAccessor geneTableAccess;
if ((new File(indexDirectory, "aadd.idx")).exists()) {
geneTableAccess = new GeneItemAccessor(new File(indexDirectory, "aadd.idx"), new File(indexDirectory, "aadd.dbx"));
maltOptions.setParseHeaders(true);
} else
geneTableAccess = null;
......@@ -356,7 +356,7 @@ public class MaltRun {
final String matchesOutputFile,
final String alignedReadsOutputFile, final String unalignedReadsOutputFile,
final ReferencesDBAccess referencesDB, final ReferencesHashTableAccess[] tables,
final GeneTableAccess geneTableAccess) throws IOException, JAXBException {
final GeneItemAccessor geneTableAccess) throws IOException {
final ExecutorService executor = Executors.newFixedThreadPool(maltOptions.getNumberOfThreads());
final CountDownLatch countDownLatch = new CountDownLatch(maltOptions.getNumberOfThreads());
......
......@@ -25,5 +25,5 @@ package malt;
*/
public class Version {
public static final String NAME = "MALT";
public static final String SHORT_DESCRIPTION = "MALT (version 0.4.0, built 6 Sep 2017)";
public static final String SHORT_DESCRIPTION = "MALT (version 0.4.1, built 29 June 2018)";
}
......@@ -19,12 +19,14 @@
*/
package malt.data;
import jloda.util.Basic;
import jloda.util.CanceledException;
import jloda.util.ProgressPercentage;
import malt.io.FastAFileIteratorBytes;
import megan.io.OutputWriter;
import java.io.*;
import java.util.Iterator;
import java.util.List;
/**
......@@ -104,6 +106,37 @@ public class ReferencesDBBuilder implements ISequenceAccessor {
return headers[index];
}
/**
* gets an iterable over all ref names as strings
*
* @return iterable
*/
public Iterable<String> refNames() {
return new Iterable<String>() {
@Override
public Iterator<String> iterator() {
return new Iterator<String>() {
private int i = 0;
@Override
public boolean hasNext() {
return i < numberOfSequences;
}
@Override
public String next() {
return Basic.getAccessionWord(headers[i++]);
}
@Override
public void remove() {
}
};
}
};
}
/**
* Get sequence. Index starts at 0
*
......@@ -259,9 +292,9 @@ public class ReferencesDBBuilder implements ISequenceAccessor {
pos++;
byte[] add;
if (header[pos - 1] == '|')
add = String.format("%s%d|", tag, id).getBytes();
add = String.format("%s%d", tag, id).getBytes();
else
add = String.format("|%s%d|", tag, id).getBytes();
add = String.format("|%s%d", tag, id).getBytes();
byte[] newHeader = new byte[header.length + add.length];
System.arraycopy(header, 0, newHeader, 0, pos);
......
/**
/*
* GeneItem.java
* Copyright (C) 2018 Daniel H. Huson
* <p>
......@@ -22,6 +22,7 @@ package malt.genes;
import jloda.util.Basic;
import megan.io.InputReader;
import megan.io.OutputWriter;
import megan.util.interval.Interval;
import java.io.IOException;
import java.io.RandomAccessFile;
......@@ -33,53 +34,39 @@ import java.io.RandomAccessFile;
*/
public class GeneItem {
private byte[] proteinId;
private int keggId;
private int cogId;
private int seedId;
private int interproId;
private final GeneItemCreator creator;
private boolean reverse;
public GeneItem() {
private final int[] ids;
GeneItem(GeneItemCreator creator) {
this.creator = creator;
ids = new int[creator.numberOfClassifications()];
}
public byte[] getProteinId() {
return proteinId;
}
public void setProteinId(byte[] proteinId) {
public void setProteinId(byte[] proteinId) throws IOException {
this.proteinId = proteinId;
creator.map(Basic.toString(proteinId), ids);
}
public int getKeggId() {
return keggId;
}
public void setKeggId(int keggId) {
this.keggId = keggId;
}
public int getCogId() {
return cogId;
}
public void setCogId(int cogId) {
this.cogId = cogId;
}
public int getSeedId() {
return seedId;
public int getId(String classificationName) {
return getId(creator.rank(classificationName));
}
public void setSeedId(int seedId) {
this.seedId = seedId;
public int getId(Integer rank) {
return rank == null ? 0 : ids[rank];
}
public int getInterproId() {
return interproId;
public void setId(String classificationName, int id) {
ids[creator.rank(classificationName)] = id;
}
public void setInterproId(int interproId) {
this.interproId = interproId;
public void setId(int rank, int id) {
ids[rank] = id;
}
public boolean isReverse() {
......@@ -91,13 +78,13 @@ public class GeneItem {
}
public String toString() {
return "proteinId=" + (proteinId == null ? "null" : Basic.toString(proteinId))
+ ", keggId=" + keggId
+ ", cogId=" + cogId
+ ", seedId=" + seedId
+ ", interProId=" + interproId
+ ", reverse=" + reverse;
final StringBuilder buf = new StringBuilder("proteinId=" + (proteinId == null ? "null" : Basic.toString(proteinId)));
for (int i = 0; i < creator.numberOfClassifications(); i++) {
buf.append(", ").append(creator.classification(i)).append("=").append(ids[i]);
}
buf.append(", reverse=").append(reverse);
return buf.toString();
}
/**
......@@ -113,10 +100,9 @@ public class GeneItem {
outs.writeInt(proteinId.length);
outs.write(proteinId);
}
outs.writeInt(keggId);
outs.writeInt(cogId);
outs.writeInt(seedId);
outs.writeInt(interproId);
for (int i = 0; i < creator.numberOfClassifications(); i++) {
outs.writeInt(ids[i]);
}
outs.write(reverse ? 1 : 0);
}
......@@ -135,10 +121,9 @@ public class GeneItem {
if (ins.read(proteinId, 0, length) != length)
throw new IOException("read failed");
}
keggId = ins.readInt();
cogId = ins.readInt();
seedId = ins.readInt();
interproId = ins.readInt();
for (int i = 0; i < creator.numberOfClassifications(); i++) {
ids[i] = ins.readInt();
}
reverse = (ins.read() == 1);
}
......@@ -157,10 +142,26 @@ public class GeneItem {
if (ins.read(proteinId, 0, length) != length)
throw new IOException("read failed");
}
keggId = ins.readInt();
cogId = ins.readInt();
seedId = ins.readInt();
interproId = ins.readInt();
for (int i = 0; i < creator.numberOfClassifications(); i++) {
ids[i] = ins.readInt();
}
reverse = (ins.read() == 1);
}
/**
* get the annotation string
*
* @param refInterval
* @return annotation string
*/
public String getAnnotation(Interval<GeneItem> refInterval) {
final StringBuilder buf = new StringBuilder();
buf.append("pos|").append(isReverse() ? refInterval.getEnd() + ".." + refInterval.getStart() : refInterval.getStart() + ".." + refInterval.getEnd());
buf.append("|ref|").append(Basic.toString(getProteinId()));
for (int i = 0; i < creator.numberOfClassifications(); i++) {
if (getId(i) > 0)
buf.append("|").append(creator.getShortTag(i)).append(getId(i));
}
return buf.toString();
}
}
/**
* GeneTableAccess.java
/*
* GeneItemAccessor.java
* Copyright (C) 2018 Daniel H. Huson
* <p>
* (Some files contain contributions from other authors, who are then mentioned separately.)
......@@ -20,21 +20,29 @@
package malt.genes;
import jloda.util.*;
import malt.MaltBuild;
import megan.classification.IdMapper;
import megan.io.InputReader;
import megan.tools.AAdderBuild;
import megan.tools.AAdderRun;
import megan.util.interval.Interval;
import megan.util.interval.IntervalTree;
import java.io.*;
/**
* class used to access gene table
* Daniel Huson, 8.2014
* class used to access gene items
* Daniel Huson, 6.2018
*/
public class GeneTableAccess {
public class GeneItemAccessor {
private final int size;
private final long[] refIndex2FilePos;
private final IntervalTree<GeneItem>[] refIndex2Intervals;
private final String[] index2ref;
private final RandomAccessFile dbRaf;
private final GeneItemCreator creator;
private final int syncBits = 1023;
private final Object[] syncObjects = new Object[syncBits + 1]; // use lots of objects to synchronize on so that threads don't in each others way
......@@ -45,28 +53,38 @@ public class GeneTableAccess {
* @param dbFile
* @throws IOException
*/
public GeneTableAccess(File indexFile, File dbFile) throws IOException {
public GeneItemAccessor(File indexFile, File dbFile) throws IOException {
// create the synchronization objects
for (int i = 0; i < (syncBits + 1); i++) {
syncObjects[i] = new Object();
}
try (DataInputStream ins = new DataInputStream(new BufferedInputStream(new FileInputStream(indexFile))); ProgressPercentage progress = new ProgressPercentage("Reading file: " + indexFile)) {
Basic.readAndVerifyMagicNumber(ins, GeneTableBuilder.MAGIC_NUMBER_IDX);
try (InputReader ins = new InputReader(indexFile); ProgressPercentage progress = new ProgressPercentage("Reading file: " + indexFile)) {
AAdderRun.readAndVerifyMagicNumber(ins, AAdderBuild.MAGIC_NUMBER_IDX);
final String creator = ins.readString();
if (!creator.equals(MaltBuild.INDEX_CREATOR))
throw new IOException("Gene Item index not created by MALT");
size = ins.readInt();
progress.setMaximum(size);
refIndex2FilePos = new long[size];
index2ref = new String[size];
for (int i = 0; i < size; i++) {
refIndex2FilePos[i] = ins.readLong();
index2ref[i] = ins.readString();
final long pos = ins.readLong();
refIndex2FilePos[i] = pos;
progress.incrementProgress();
}
}
refIndex2Intervals = (IntervalTree<GeneItem>[]) new IntervalTree[size];
try (DataInputStream ins = new DataInputStream(new BufferedInputStream(new FileInputStream(dbFile)))) {
Basic.readAndVerifyMagicNumber(ins, GeneTableBuilder.MAGIC_NUMBER_DB);
if (ins.readInt() != size)
throw new IOException("Sizes differ: " + indexFile + " vs " + dbFile);
try (InputReader dbxIns = new InputReader(dbFile)) {
AAdderRun.readAndVerifyMagicNumber(dbxIns, AAdderBuild.MAGIC_NUMBER_DBX);
final String[] cNames = new String[dbxIns.readInt()];
for (int i = 0; i < cNames.length; i++) {
cNames[i] = dbxIns.readString();
System.err.println(cNames[i]);
}
creator = new GeneItemCreator(cNames, new IdMapper[0]);
}
dbRaf = new RandomAccessFile(dbFile, "r");
}
......@@ -85,14 +103,15 @@ public class GeneTableAccess {
if (refIndex2Intervals[refIndex] == null && refIndex2FilePos[refIndex] != 0) {
synchronized (dbRaf) {
try {
dbRaf.seek(refIndex2FilePos[refIndex]);
final long pos = refIndex2FilePos[refIndex];
dbRaf.seek(pos);
int intervalsLength = dbRaf.readInt();
if (intervalsLength > 0) {
IntervalTree<GeneItem> intervals = new IntervalTree<>();
for (int i = 0; i < intervalsLength; i++) {
int start = dbRaf.readInt();
int end = dbRaf.readInt();
GeneItem geneItem = new GeneItem();
GeneItem geneItem = new GeneItem(creator);
geneItem.read(dbRaf);
intervals.add(start, end, geneItem);
//System.err.println(refIndex+"("+start+"-"+end+") -> "+geneItem);
......@@ -133,36 +152,16 @@ public class GeneTableAccess {
if (refInterval != null) {
final GeneItem geneItem = refInterval.getData();
final StringBuilder buf = new StringBuilder();
buf.append("|ref|").append(Basic.toString(geneItem.getProteinId()));
if (geneItem.getKeggId() != 0)
buf.append("|kegg|").append(geneItem.getKeggId());
if (geneItem.getCogId() != 0)
buf.append("|cog|").append(geneItem.getCogId());
if (geneItem.getSeedId() != 0)
buf.append("|seed|").append(geneItem.getSeedId());
if (geneItem.getInterproId() != 0)
buf.append("|ipr|").append(geneItem.getInterproId());
if (buf.length() > 0) {
String header = referenceHeader;
final String remainder;
final int len = header.indexOf(' ');
if (len >= 0 && len < header.length()) {
remainder = header.substring(len); // keep space...
header = header.substring(0, len);
} else
remainder = "";
return (header + (header.endsWith("|") ? "" : "|") + "pos|"
+ (geneItem.isReverse() ? refInterval.getEnd() + ".." + refInterval.getStart()
: refInterval.getStart() + ".." + refInterval.getEnd())
+ buf.toString() + remainder);
}
return Basic.swallowLeadingGreaterSign(Basic.getFirstWord(referenceHeader)) + "|" + geneItem.getAnnotation(refInterval);
}
}
return referenceHeader;
}
public String getIndex2ref(int i) {
return index2ref[i];
}
public int size() {
return size;
}
......@@ -173,24 +172,25 @@ public class GeneTableAccess {
* @param args
*/
public static void main(String[] args) throws IOException, UsageException, CanceledException {
args = new String[]{"-i", "/Users/huson/data/malt/genes2/index/annotation.idx"};
args = new String[]{"-i", "/Users/huson/data/malt/gff/index/aadd.idx"};
final ArgsOptions options = new ArgsOptions(args, null, "GeneTableDump", "Dump gene table");
final String idxFile = options.getOptionMandatory("i", "idxFile", "Input annotation.idx file", "index/annotation.idx");
final String dbFile = options.getOption("d", "dbFile", "Input annotation.db file", Basic.replaceFileSuffix(idxFile, ".db"));
final String idxFile = options.getOptionMandatory("i", "idxFile", "Input aadd.idx file", "index/aadd.idx");
final String dbFile = options.getOption("d", "dbFile", "Input aadd.dbx file", Basic.replaceFileSuffix(idxFile, ".dbx"));
final String outputFile = options.getOption("o", "output", "Output file (or stdout)", "stdout");
options.done();
final GeneTableAccess geneTableAccess = new GeneTableAccess(new File(idxFile), new File(dbFile));
final GeneItemAccessor geneTableAccess = new GeneItemAccessor(new File(idxFile), new File(dbFile));
try (Writer w = new BufferedWriter(outputFile.equals("stdout") ? new OutputStreamWriter(System.out) : new FileWriter(outputFile))) {
for (int idx = 0; idx < geneTableAccess.size(); idx++) {
final IntervalTree<GeneItem> tree = geneTableAccess.getIntervals(idx);
for (int i = 0; i < geneTableAccess.size(); i++) {
System.err.println("ref[" + i + "]=" + geneTableAccess.getIndex2ref(i) + ":");
final IntervalTree<GeneItem> tree = geneTableAccess.getIntervals(i);
if (tree != null) {
if (true) {
System.err.println("Tree[" + idxFile + "]: " + Basic.abbreviateDotDotDot(tree.toString(), 1000));
} else {
w.write("RefIndex=" + idx + "\n");
w.write("RefIndex=" + i + "\n");
for (Interval<GeneItem> interval : tree) {
w.write(interval.getStart() + " " + interval.getEnd() + ": " + interval.getData() + "\n");
......
/*
* Copyright (C) 2015 Daniel H. Huson
*
* (Some files contain contributions from other authors, who are then mentioned separately.)
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package malt.genes;
import megan.classification.Classification;
import megan.classification.IdMapper;
import java.io.IOException;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
/**
* class for creating gene items
* Daniel Huson, 6.2018
*/
public class GeneItemCreator {
private final String[] cNames;
private final Map<String, Integer> rank;
private final IdMapper[] idMappers;
private final String[] shortTags;
/**
* constructor
*
* @param cNames
*/
public GeneItemCreator(String[] cNames, IdMapper[] idMappers) {
this.cNames = cNames.clone();
this.idMappers = idMappers;
rank = new HashMap<>();
for (int i = 0; i < this.cNames.length; i++) {
rank.put(this.cNames[i], i);
}
this.shortTags = new String[this.cNames.length];
for (int i = 0; i < cNames.length; i++) {
shortTags[i] = Classification.createShortTag(cNames[i]);
}
}
public GeneItem createGeneItem() {
return new GeneItem(this);
}
public Integer rank(String classificationName) {
return rank.get(classificationName);
}
public String classification(int rank) {
return cNames[rank];
}
public int numberOfClassifications() {
return cNames.length;
}
public Iterable<String> cNames() {
return new Iterable<String>() {
@Override
public Iterator<String> iterator() {
return Arrays.asList(cNames).iterator();
}
};
}
/**
* map the given accession to ids using the id mappers
*
* @param accession
* @param ids
* @throws IOException
*/
public void map(String accession, int[] ids) throws IOException {
for (int i = 0; i < idMappers.length; i++) {
ids[i] = idMappers[i].getAccessionMap().get(accession);
}
}
public String getShortTag(int i) {
return shortTags[i];
}
}
/*
* Copyright (C) 2018 Daniel H. Huson
*
* (Some files contain contributions from other authors, who are then mentioned separately.)
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package malt.genes;
import jloda.util.Basic;
import jloda.util.CanceledException;
import jloda.util.ProgressPercentage;
import malt.data.ReferencesDBBuilder;
import megan.classification.ClassificationManager;
import megan.classification.IdMapper;
import megan.classification.util.TaggedValueIterator;
import megan.io.OutputWriter;
import megan.util.interval.Interval;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
/**
* Builds a table mapping reference indices and positions to genes
* Daniel Huson, 8.2014, 11.2017
*/
public class GeneTableBuilder {
final static byte[] MAGIC_NUMBER_IDX = "MAAnnoIdxV0.1.".getBytes();
final static byte[] MAGIC_NUMBER_DB = "MAAnnoDbV0.1.".getBytes();
public static final String[] ACCESSION_TAGS = new String[]{"gb|", "ref|"};
private final IdMapper keggMapper;
private final IdMapper cogMapper;
private final IdMapper seedMapper;
private final IdMapper interproMapper;
private final int syncBits = 1023;
private final Object[] syncObjects = new Object[syncBits + 1]; // use lots of objects to synchronize on so that threads don't in each others way
/**
* constructor
*
* @throws IOException
*/
public GeneTableBuilder() throws IOException {
// create the synchronization objects
for (int i = 0; i < (syncBits + 1); i++) {
syncObjects[i] = new Object();
}
if (ClassificationManager.get("KEGG", false).getIdMapper().isActiveMap(IdMapper.MapType.Accession))
keggMapper = ClassificationManager.get("KEGG", false).getIdMapper();
else
keggMapper = null;
if (ClassificationManager.get("EGGNOG", false).getIdMapper().isActiveMap(IdMapper.MapType.Accession))
cogMapper = ClassificationManager.get("EGGNOG", false).getIdMapper();
else
cogMapper = null;
if (ClassificationManager.get("SEED", false).getIdMapper().isActiveMap(IdMapper.MapType.Accession))
seedMapper = ClassificationManager.get("SEED", false).getIdMapper();
else
seedMapper = null;
if (ClassificationManager.get("INTERPRO2GO", false).getIdMapper().isActiveMap(IdMapper.MapType.Accession))
interproMapper = ClassificationManager.get("INTERPRO2GO", false).getIdMapper();
else
interproMapper = null;
}
/**
* build and then save the gene table
*
* @param referencesDB
* @param gffFiles
* @param indexFile
* @param numberOfThreads
* @throws IOException
*/
public void buildAndSaveAnnotations(final ReferencesDBBuilder referencesDB, final Collection<String> gffFiles, final File indexFile, final File dbFile, final int numberOfThreads) throws IOException, CanceledException {
System.err.println("Annotating reference sequences...");
final Map<String, Integer> refAccession2IndexMap = computeRefAccession2IndexMap(referencesDB, numberOfThreads);
final Collection<CDS> annotations = CDS.parseGFFforCDS(gffFiles, new ProgressPercentage());
final ArrayList<Interval<GeneItem>>[] table = computeRefIndex2Intervals(referencesDB, refAccession2IndexMap, annotations, numberOfThreads);
refAccession2IndexMap.clear();
writeTable(indexFile, dbFile, table);
}
/**
* Compute the reference accessions to reference index mapping
*
* @param referencesDB
* @param numberOfThreads
* @return accession to reference index mapping
*/
private Map<String, Integer> computeRefAccession2IndexMap(final ReferencesDBBuilder referencesDB, final int numberOfThreads) {
final Map<String, Integer> accession2refIndex = new HashMap<>(referencesDB.getNumberOfSequences(), 1f);
final ExecutorService executor = Executors.newFixedThreadPool(numberOfThreads);
final CountDownLatch countDownLatch = new CountDownLatch(numberOfThreads);
final ProgressPercentage progress = new ProgressPercentage("Mapping accessions to references...", referencesDB.getNumberOfSequences());
// launch the worker threads
for (int thread = 0; thread < numberOfThreads; thread++) {
final int threadNumber = thread;
executor.execute(new Runnable() {
public void run() {
try {
final TaggedValueIterator it = new TaggedValueIterator(true, true, ACCESSION_TAGS);
for (int refIndex = threadNumber; refIndex < referencesDB.getNumberOfSequences(); refIndex += numberOfThreads) {
it.restart(Basic.toString(referencesDB.getHeader(refIndex)));
while (it.hasNext()) {
synchronized (accession2refIndex) {
accession2refIndex.put(it.next(), refIndex);
}
}
progress.incrementProgress();
}
} catch (Exception ex) {
Basic.caught(ex);
System.exit(1); // just die...
} finally {
countDownLatch.countDown();
}
}
});
}
try {
countDownLatch.await(); // await completion of alignment threads
} catch (InterruptedException e) {
Basic.caught(e);
} finally {
// shut down threads:
executor.shutdownNow();
}
progress.close();
return accession2refIndex;
}
/**
* compute the reference Id to intervals mapping
*
* @param referencesDB
* @param refAccession2IdMap
* @param cdsList
* @param numberOfThreads
* @return
* @throws FileNotFoundException
*/
private ArrayList<Interval<GeneItem>>[] computeRefIndex2Intervals(final ReferencesDBBuilder referencesDB, final Map<String, Integer> refAccession2IdMap, final Collection<CDS> cdsList, int numberOfThreads) throws IOException {
final ArrayList<Interval<GeneItem>>[] refIndex2Intervals = new ArrayList[referencesDB.getNumberOfSequences()];
final ExecutorService executor = Executors.newFixedThreadPool(numberOfThreads);
final CountDownLatch countDownLatch = new CountDownLatch(numberOfThreads);
final ArrayBlockingQueue<CDS> queue = new ArrayBlockingQueue<>(10 * numberOfThreads);
final CDS sentinel = new CDS();
final int[] counts = new int[numberOfThreads];
// launch the worker threads
for (int thread = 0; thread < numberOfThreads; thread++) {
final int threadNumber = thread;
executor.execute(new Runnable() {
public void run() {
try {
while (true) {
final CDS cds = queue.take();
if (cds == sentinel)
break;
final Integer refIndex = refAccession2IdMap.get(cds.getDnaId());
if (refIndex != null) {
synchronized (syncObjects[refIndex & syncBits]) {
ArrayList<Interval<GeneItem>> list = refIndex2Intervals[refIndex];
if (list == null)
list = refIndex2Intervals[refIndex] = new ArrayList<>();
final GeneItem geneItem = new GeneItem();
final String accession = cds.getProteinId();
geneItem.setProteinId(accession.getBytes());
if (keggMapper != null) {
final Integer id = keggMapper.getIdFromAccession(accession);
if (id != null && id != 0) {
geneItem.setKeggId(id);
}
}
// set cog:
if (cogMapper != null) {
final Integer id = cogMapper.getIdFromAccession(accession);
if (id != null && id != 0) {
geneItem.setCogId(id);
}
}
// set seed:
if (seedMapper != null) {
final Integer id = seedMapper.getIdFromAccession(accession);
if (id != null && id != 0) {
geneItem.setSeedId(id);
}
}
// set interpro:
if (interproMapper != null) {
final Integer id = interproMapper.getIdFromAccession(accession);
if (id != null && id != 0) {
geneItem.setInterproId(id);
}
}
geneItem.setReverse(cds.isReverse());
list.add(new Interval<>(cds.getStart(), cds.getEnd(), geneItem));
counts[threadNumber]++;
}
}
}
} catch (Exception ex) {
Basic.caught(ex);
System.exit(1); // just die...
} finally {
countDownLatch.countDown();
}
}
});
}
try (ProgressPercentage progress = new ProgressPercentage("Annotating references", cdsList.size() + numberOfThreads)) {
for (CDS cds : cdsList) {
queue.put(cds);
progress.incrementProgress();
}
for (int i = 0; i < numberOfThreads; i++) {
queue.put(sentinel);
progress.incrementProgress();
}
countDownLatch.await();
} catch (InterruptedException ex) {
Basic.caught(ex);
} finally {
executor.shutdownNow();
}
System.err.println(String.format("Count:%,14d", Basic.getSum(counts)));
return refIndex2Intervals;
}
/**
* save the annotations
*
* @param indexFile
* @param refIndex2Intervals
* @throws IOException
*/
private static void writeTable(File indexFile, File dbFile, final ArrayList<Interval<GeneItem>>[] refIndex2Intervals) throws IOException {
final long[] refIndex2FilePos = new long[refIndex2Intervals.length];
int totalRefWithAGene = 0;
try (OutputWriter outs = new OutputWriter(dbFile); ProgressPercentage progress = new ProgressPercentage("Writing file: " + dbFile, refIndex2Intervals.length)) {
outs.write(GeneTableBuilder.MAGIC_NUMBER_DB);
outs.writeInt(refIndex2Intervals.length);
for (int i = 0; i < refIndex2Intervals.length; i++) {
final ArrayList<Interval<GeneItem>> list = refIndex2Intervals[i];
if (list == null) {
refIndex2FilePos[i] = 0;
outs.writeInt(0);
} else {
refIndex2FilePos[i] = outs.length();
outs.writeInt(list.size());
for (Interval<GeneItem> interval : Basic.randomize(list, 666)) { // need to save in random order
outs.writeInt(interval.getStart());
outs.writeInt(interval.getEnd());
interval.getData().write(outs);
}
totalRefWithAGene++;
}
progress.incrementProgress();
}
}
try (OutputWriter outs = new OutputWriter(indexFile); ProgressPercentage progress = new ProgressPercentage("Writing file: " + indexFile, refIndex2FilePos.length)) {
outs.write(GeneTableBuilder.MAGIC_NUMBER_IDX);
outs.writeInt(refIndex2FilePos.length);
for (long filePos : refIndex2FilePos) {
outs.writeLong(filePos);
progress.incrementProgress();
}
}
System.err.println(String.format("Reference sequences with at least one annotation: %,d of %,d", totalRefWithAGene, refIndex2Intervals.length));
}
}
/*
* Copyright (C) 2015 Daniel H. Huson
*
* (Some files contain contributions from other authors, who are then mentioned separately.)
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package malt.tools;
import jloda.util.*;
import malt.genes.CDS;
import malt.genes.GeneItem;
import megan.io.OutputWriter;
import megan.util.interval.Interval;
import java.io.File;
import java.io.IOException;
import java.util.*;
/**
* build the aadd index
* Daniel Huson, 5.2018
*/
public class AAddBuild {
final static byte[] MAGIC_NUMBER_IDX = "AAddIdxV0.1.".getBytes();
final static byte[] MAGIC_NUMBER_DBX = "AAddDbxV0.1.".getBytes();
/**
* add functional annotations to DNA alignments
*/
public static void main(String[] args) {
try {
ProgramProperties.setProgramName("AAddBuild");
ProgramProperties.setProgramVersion(megan.main.Version.SHORT_DESCRIPTION);
PeakMemoryUsageMonitor.start();
(new AAddBuild()).run(args);
System.err.println("Total time: " + PeakMemoryUsageMonitor.getSecondsSinceStartString());
System.err.println("Peak memory: " + PeakMemoryUsageMonitor.getPeakUsageString());
System.exit(0);
} catch (Exception ex) {
Basic.caught(ex);
System.exit(1);
}
}
/**
* run the program
*/
public void run(String[] args) throws CanceledException, IOException, UsageException {
final ArgsOptions options = new ArgsOptions(args, this, "Build the index for AAdd");
options.setVersion(ProgramProperties.getProgramVersion());
options.setLicense("Copyright (C) 2018 Daniel H. Huson. This program comes with ABSOLUTELY NO WARRANTY.");
options.setAuthors("Daniel H. Huson");
options.comment("Input Output");
final List<String> gffFiles = options.getOptionMandatory("-igff", "inputGFF", "Input GFF3 files or directory (.gz ok)", new LinkedList<String>());
final String indexDirectory = options.getOptionMandatory("-d", "index", "Index directory", "");
options.comment(ArgsOptions.OTHER);
final boolean lookInside = options.getOption("-ex", "extraStrict", "When given an input directory, look inside every input file to check that it is indeed in GFF3 format", false);
options.done();
if (gffFiles.size() == 1) {
final File file = new File(gffFiles.get(0));
if (file.isDirectory()) {
System.err.println("Collecting all GFF3 files in directory: " + file);
gffFiles.clear();
for (File aFile : Basic.getAllFilesInDirectory(file, new GFF3FileFilter(true, lookInside), true)) {
gffFiles.add(aFile.getPath());
}
if (gffFiles.size() == 0)
throw new IOException("No GFF files found in directory: " + file);
else
System.err.println(String.format("Found: %,d", gffFiles.size()));
}
}
// obtains the gene annotations:
Map<String, ArrayList<Interval<GeneItem>>> dnaId2list = new HashMap<>();
{
final Collection<CDS> annotations = CDS.parseGFFforCDS(gffFiles, new ProgressPercentage("Processing GFF files"));
try (ProgressListener progress = new ProgressPercentage("Building annotation list", annotations.size())) {
for (CDS cds : annotations) {
ArrayList<Interval<GeneItem>> list = dnaId2list.get(cds.getDnaId());
if (list == null) {
list = new ArrayList<>();
dnaId2list.put(cds.getDnaId(), list);
}
final GeneItem geneItem = new GeneItem();
final String accession = cds.getProteinId();
geneItem.setProteinId(accession.getBytes());
geneItem.setReverse(cds.isReverse());
list.add(new Interval<>(cds.getStart(), cds.getEnd(), geneItem));
progress.incrementProgress();
}
}
}
// writes the index file:
long totalRefWithAGene = 0;
final File indexFile = new File(indexDirectory, "aadd.idx");
final File dbFile = new File(indexDirectory, "aadd.dbx");
try (OutputWriter idxWriter = new OutputWriter(indexFile); OutputWriter dbxWriter = new OutputWriter(dbFile);
ProgressPercentage progress = new ProgressPercentage("Writing files: " + indexFile + "\n " + dbFile, dnaId2list.size())) {
idxWriter.write(MAGIC_NUMBER_IDX);
dbxWriter.write(MAGIC_NUMBER_DBX);
idxWriter.writeInt(dnaId2list.size());
for (String dnaId : dnaId2list.keySet()) {
idxWriter.writeString(dnaId);
final ArrayList<Interval<GeneItem>> list = dnaId2list.get(dnaId);
if (list == null) {
idxWriter.writeLong(0); // no intervals
} else {
idxWriter.writeLong(dbxWriter.getPosition()); // position of intervals in DB file
dbxWriter.writeInt(list.size());
for (Interval<GeneItem> interval : Basic.randomize(list, 666)) { // need to save in random order
dbxWriter.writeInt(interval.getStart());
dbxWriter.writeInt(interval.getEnd());
interval.getData().write(dbxWriter);
}
totalRefWithAGene++;
}
progress.incrementProgress();
}
}
System.err.println(String.format("Reference sequences with at least one annotation: %,d of %,d", totalRefWithAGene, dnaId2list.size()));
}
}
/*
* Copyright (C) 2015 Daniel H. Huson
*
* (Some files contain contributions from other authors, who are then mentioned separately.)
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package malt.tools;
import jloda.util.*;
import malt.genes.GeneItem;
import megan.io.FileRandomAccessReadOnlyAdapter;
import megan.io.InputReader;
import megan.util.interval.Interval;
import megan.util.interval.IntervalTree;
import java.io.*;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.zip.GZIPOutputStream;
/**
* add functional annotations to DNA alignments
* Daniel Huson, 5.2018
*/
public class AAddRun {
/**
* add functional annotations to DNA alignments
*/
public static void main(String[] args) {
try {
ProgramProperties.setProgramName("AAddRun");
ProgramProperties.setProgramVersion(megan.main.Version.SHORT_DESCRIPTION);
PeakMemoryUsageMonitor.start();
(new AAddRun()).run(args);
System.err.println("Total time: " + PeakMemoryUsageMonitor.getSecondsSinceStartString());
System.err.println("Peak memory: " + PeakMemoryUsageMonitor.getPeakUsageString());
System.exit(0);
} catch (Exception ex) {
Basic.caught(ex);
System.exit(1);
}
}
/**
* run the program
*/
public void run(String[] args) throws CanceledException, IOException, UsageException {
final ArgsOptions options = new ArgsOptions(args, this, "Adds functional accessions to DNA alignments");
options.setVersion(ProgramProperties.getProgramVersion());
options.setLicense("Copyright (C) 2018 Daniel H. Huson. This program comes with ABSOLUTELY NO WARRANTY.");
options.setAuthors("Daniel H. Huson");
options.comment("Input Output");
final String[] inputFiles = options.getOptionMandatory("-i", "input", "Input SAM file(s) (.gz ok)", new String[0]);
final String indexDirectory = options.getOptionMandatory("-d", "index", "AAdd index directory", "");
final String[] outputFiles = options.getOptionMandatory("-o", "output", "Output file(s) (.gz ok) or directory", new String[0]);
options.comment(ArgsOptions.OTHER);
final boolean reportUnmappedAccessions = options.getOption("-rnf", "reportNotFound", "Report the names of DNA reference for which no functional accession is available", false);
options.done();
final File outputDir;
if (outputFiles.length == 1 && ((new File(outputFiles[0])).isDirectory())) {
outputDir = new File(outputFiles[0]);
} else {
outputDir = null;
if (inputFiles.length != outputFiles.length)
throw new UsageException("Number of output files doesn't match number of input files");
}
final Map<String, Pair<Long, IntervalTree<GeneItem>>> ref2PosAndTree;
final File indexFile = new File(indexDirectory, "aadd.idx");
try (InputReader ins = new InputReader(indexFile); ProgressPercentage progress = new ProgressPercentage("Reading file: " + indexFile)) {
readAndVerifyMagicNumber(ins, AAddBuild.MAGIC_NUMBER_IDX);
final int entries = ins.readInt();
progress.setMaximum(entries);
ref2PosAndTree = new HashMap<>(2 * entries);
for (int t = 0; t < entries; t++) {
final String dnaId = ins.readString();
final long pos = ins.readLong();
ref2PosAndTree.put(dnaId, new Pair<Long, IntervalTree<GeneItem>>(pos, null));
progress.incrementProgress();
}
}
final IntervalTree<GeneItem> emptyTree = new IntervalTree<>();
final File dbFile = new File(indexDirectory, "aadd.dbx");
try (FileRandomAccessReadOnlyAdapter dbxIns = new FileRandomAccessReadOnlyAdapter(dbFile)) {
System.err.println("Opening file: " + dbFile);
for (int i = 0; i < inputFiles.length; i++) {
File inputFile = new File(inputFiles[i]);
final File outputFile;
if (outputDir != null) {
outputFile = new File(outputDir, inputFile.getName() + ".out");
} else
outputFile = new File(outputFiles[i]);
if (inputFile.equals(outputFile))
throw new IOException("Input file equals output file: " + inputFile);
final boolean gzipOutput = outputFile.getName().toLowerCase().endsWith(".gz");
long countLines = 0;
long countAlignments = 0;
long countAnnotated = 0;
long countReferencesLoaded = 0;
final Set<String> refNotFound = new HashSet<>();
try (final FileInputIterator it = new FileInputIterator(inputFile, true);
final BufferedWriter w = new BufferedWriter(new OutputStreamWriter(gzipOutput ? new GZIPOutputStream(new FileOutputStream(outputFile)) : new FileOutputStream(outputFile)))) {
System.err.println("Writing file: " + outputFile);
while (it.hasNext()) {
final String aLine = it.next();
if (aLine.startsWith("@"))
w.write(aLine);
else {
final String[] tokens = Basic.split(aLine, '\t');
if (tokens.length < 2 || tokens[2].equals("*")) {
w.write(aLine);
} else {
final IntervalTree<GeneItem> tree;
{
final int pos = tokens[2].indexOf(".");
final String ref = (pos > 0 ? tokens[2].substring(0, pos) : tokens[2]);
final Pair<Long, IntervalTree<GeneItem>> pair = ref2PosAndTree.get(ref);
if (pair != null) {
if (pair.getSecond() == null && pair.getFirst() != 0) {
dbxIns.seek(pair.getFirst());
int intervalsLength = dbxIns.readInt();
if (intervalsLength > 0) {
tree = new IntervalTree<>();
for (int t = 0; t < intervalsLength; t++) {
int start = dbxIns.readInt();
int end = dbxIns.readInt();
GeneItem geneItem = new GeneItem();
geneItem.read(dbxIns);
tree.add(start, end, geneItem);
//System.err.println(refIndex+"("+start+"-"+end+") -> "+geneItem);
}
countReferencesLoaded++;
} else {
tree = emptyTree;
}
pair.setSecond(tree);
} else {
tree = pair.getSecond();
}
} else {
if (!refNotFound.contains(ref)) {
refNotFound.add(ref);
if (reportUnmappedAccessions)
System.err.println("Reference not found: " + ref);
}
continue;
}
}
final int startSubject = Basic.parseInt(tokens[3]);
final int endSubject = startSubject + getRefLength(tokens[5]) - 1;
final Interval<GeneItem> refInterval = tree.getBestInterval(new Interval<GeneItem>(startSubject, endSubject, null), 0.9);
String annotatedRef = tokens[2];
if (refInterval != null) {
final GeneItem geneItem = refInterval.getData();
final String remainder;
final int len = annotatedRef.indexOf(' ');
if (len >= 0 && len < annotatedRef.length()) {
remainder = annotatedRef.substring(len); // keep space...
annotatedRef = annotatedRef.substring(0, len);
} else
remainder = "";
annotatedRef += (annotatedRef.endsWith("|") ? "pos|" : "|pos|") + (geneItem.isReverse() ? refInterval.getEnd() + ".." + refInterval.getStart()
: refInterval.getStart() + ".." + refInterval.getEnd()) + "|ref|" + Basic.toString(geneItem.getProteinId()) + remainder;
}
for (int t = 0; t < tokens.length; t++) {
if (t > 0)
w.write('\t');
if (t == 2 && !annotatedRef.equals(tokens[2])) {
w.write(annotatedRef);
countAnnotated++;
} else
w.write(tokens[t]);
}
}
countAlignments++;
}
w.write("\n");
countLines++;
}
}
System.err.println(String.format("Lines: %,11d", countLines));
System.err.println(String.format("Alignments:%,11d", countAlignments));
System.err.println(String.format("Annotated: %,11d", countAnnotated));
System.err.println(String.format("(Loaded refs:%,9d)", countReferencesLoaded));
if (refNotFound.size() > 0)
System.err.println(String.format("(Missing refs:%,8d)", refNotFound.size()));
}
}
}
private static Pattern pattern = Pattern.compile("[0-9]+[MDN]+");
public static int getRefLength(String cigar) {
final Matcher matcher = pattern.matcher(cigar);
final ArrayList<String> pairs = new ArrayList<>();
while (matcher.find())
pairs.add(matcher.group());
int length = 0;
for (String p : pairs) {
int num = Integer.parseInt(p.substring(0, p.length() - 1));
length += num;
}
return length;
}
/**
* read and verify a magic number from a stream
*
* @param ins
* @param expectedMagicNumber
* @throws java.io.IOException
*/
public static void readAndVerifyMagicNumber(InputReader ins, byte[] expectedMagicNumber) throws IOException {
byte[] magicNumber = new byte[expectedMagicNumber.length];
if (ins.read(magicNumber, 0, magicNumber.length) != expectedMagicNumber.length || !Basic.equal(magicNumber, expectedMagicNumber)) {
System.err.println("Expected: " + Basic.toString(expectedMagicNumber));
System.err.println("Got: " + Basic.toString(magicNumber));
throw new IOException("Index is too old or incorrect file (wrong magic number). Please recompute index.");
}
}
}