Commits on Source (3)
......@@ -23,7 +23,6 @@
<path id="build.classpath">
<fileset dir="../../jloda/jars" includes="*.jar"/>
<fileset dir="../../malt/jars" includes="*.jar"/>
<fileset dir="../../jloda/jars/batik-1.8" includes="*.jar"/>
<fileset dir="../../megan-ce/jars/megan6server" includes="*.jar"/>
......@@ -57,8 +56,15 @@
</copy>
</target>
<!-- copy fxml -->
<target name="copy_fx" depends="copy_sources">
<copy todir="${classDir}">
<fileset dir="${meganSrcDir}" includes="**/*.fxml"/>
</copy>
</target>
<!-- compile MEGAN -->
<target name="compile" depends="copy_sources">
<target name="compile" depends="copy_fx">
<javac includeantruntime="false"
srcdir="${srcDir}"
destdir="${classDir}"
......
megan-ce (0.0+20170515-1) UNRELEASED; urgency=medium
megan-ce (0.0+git20180524.24e70bf-1) UNRELEASED; urgency=medium
* Initial release (Closes: #<bug>)
......
version=4
opts=dversionmangle=s/.*/0.No-Release/ \
https://people.debian.org/~eriberto/ FakeWatchNoUpstreamReleaseForThisPackage-(\d\S+)\.gz
opts="mode=git,pretty=0.0+git%cd.%h" \
https://github.com/danielhuson/megan-ce.git HEAD
# https://github.com/danielhuson/megan-ce/releases .*/archive/.*(\d[\d.-]+)\.(?:tar(?:\.gz|\.bz2)?|tgz)
/*
* Copyright (C) 2017 Daniel H. Huson
* Copyright (C) 2018 Daniel H. Huson
*
* (Some files contain contributions from other authors, who are then mentioned separately.)
*
......
/*
* Copyright (C) 2017 Daniel H. Huson
* Copyright (C) 2018 Daniel H. Huson
*
* (Some files contain contributions from other authors, who are then mentioned separately.)
*
......
......@@ -475,7 +475,7 @@
478 COG0478 Serine Threonine protein kinase
479 COG0479 succinate dehydrogenase
480 COG0480 Catalyzes the GTP-dependent ribosomal translocation step during translation elongation. During this step, the ribosome changes from the pre-translocational (PRE) to the post- translocational (POST) state as the newly formed A-site-bound peptidyl-tRNA and P-site-bound deacylated tRNA move to the P and E sites, respectively. Catalyzes the coordinated movement of the two tRNA molecules, the mRNA and conformational changes in the ribosome (By similarity)
481 COG0481 Required for accurate and efficient protein synthesis under certain stress conditions. May act as a fidelity factor of the translation reaction, by catalyzing a one-codon backward translocation of tRNAs on improperly translocated ribosomes. Back- translocation proceeds from a post-translocation (POST) complex to a pre-translocation (PRE) complex, thus giving elongation factor G a second chance to translocate the tRNAs correctly. Binds to ribosomes in a GTP-dependent manner (By similarity)
481 COG0481 Required for accurate and efficient protein synthesis under certain stress conditions. May act as a fidelity factor of the translation reaction, by catalyzing a one-codon reverse translocation of tRNAs on improperly translocated ribosomes. Back- translocation proceeds from a post-translocation (POST) complex to a pre-translocation (PRE) complex, thus giving elongation factor G a second chance to translocate the tRNAs correctly. Binds to ribosomes in a GTP-dependent manner (By similarity)
482 COG0482 Catalyzes the 2-thiolation of uridine at the wobble position (U34) of tRNA, leading to the formation of s(2)U34 (By similarity)
483 COG0483 inositol monophosphatase
484 COG0484 ATP binding to DnaK triggers the release of the substrate protein, thus completing the reaction cycle. Several rounds of ATP-dependent interactions between DnaJ, DnaK and GrpE are required for fully efficient folding. Also involved, together with DnaK and GrpE, in the DNA replication of plasmids through activation of initiation proteins (By similarity)
created: Tue Feb 21 10:53:57 CET 2017
created: Thu Mar 22 13:23:29 SGT 2018
Cite: Benson et al (2005) NAR 33 D34–38.
\ No newline at end of file
resources/images/megan6.png: binary image changed (208 KiB → 217 KiB)
#
# Copyright (C) 2017 Daniel H. Huson
# Copyright (C) 2018 Daniel H. Huson
#
# (Some files contain contributions from other authors, who are then mentioned separately.)
#
......
#
# Copyright (C) 2017 Daniel H. Huson
# Copyright (C) 2018 Daniel H. Huson
#
# (Some files contain contributions from other authors, who are then mentioned separately.)
#
......
/*
* Copyright (C) 2017 Daniel H. Huson
* Copyright (C) 2018 Daniel H. Huson
*
* (Some files contain contributions from other authors, who are then mentioned separately.)
*
......@@ -20,6 +20,7 @@ package megan.algorithms;
import megan.data.IMatchBlock;
import megan.data.IReadBlock;
import megan.viewer.TaxonomyData;
import java.io.IOException;
import java.util.BitSet;
......@@ -45,9 +46,9 @@ public class ActiveMatches {
// the set of matches that we will consider:
for (int i = 0; i < readBlock.getNumberOfAvailableMatchBlocks(); i++) {
final IMatchBlock matchBlock = readBlock.getMatchBlock(i);
if (!matchBlock.isIgnore() && matchBlock.getBitScore() >= minScore && matchBlock.getExpected() <= maxExpected &&
if (!matchBlock.isIgnore() && !TaxonomyData.isTaxonDisabled(matchBlock.getTaxonId()) && matchBlock.getBitScore() >= minScore && matchBlock.getExpected() <= maxExpected &&
(minPercentIdentity == 0 || matchBlock.getPercentIdentity() >= minPercentIdentity)) {
if (matchBlock.getId(classificationName) > 0)
if (classificationName == null || matchBlock.getId(classificationName) > 0)
activeMatchesForClassification.set(i);
}
}
......
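The two changed lines above tighten the active-match filter: matches whose taxon has been disabled in TaxonomyData are now skipped, and a null classificationName no longer prevents a match from being marked active. A minimal stand-alone sketch of the resulting predicate, using simplified stand-ins (a SimpleMatch record and a plain Set of disabled taxon ids) instead of MEGAN's IMatchBlock and TaxonomyData, which are not reproduced here:

import java.util.BitSet;
import java.util.List;
import java.util.Set;

public class ActiveMatchesSketch {
    // minimal stand-in for the fields of IMatchBlock that the filter reads
    record SimpleMatch(boolean ignore, int taxonId, float bitScore, float expected,
                       float percentIdentity, int classificationId) {}

    /**
     * Mirrors the filter shape in ActiveMatches.compute(): a match is active if it is not ignored,
     * its taxon is not disabled, it passes the score/expected/identity thresholds and,
     * when a classification id is required, that id is positive.
     */
    static BitSet computeActive(List<SimpleMatch> matches, Set<Integer> disabledTaxa,
                                float minScore, float maxExpected, float minPercentIdentity,
                                boolean requireClassificationId) {
        final BitSet active = new BitSet();
        for (int i = 0; i < matches.size(); i++) {
            final SimpleMatch m = matches.get(i);
            if (!m.ignore() && !disabledTaxa.contains(m.taxonId())
                    && m.bitScore() >= minScore && m.expected() <= maxExpected
                    && (minPercentIdentity == 0 || m.percentIdentity() >= minPercentIdentity)
                    && (!requireClassificationId || m.classificationId() > 0))
                active.set(i);
        }
        return active;
    }

    public static void main(String[] args) {
        final List<SimpleMatch> matches = List.of(
                new SimpleMatch(false, 562, 80f, 1e-10f, 95f, 562),    // passes all filters
                new SimpleMatch(false, 9606, 80f, 1e-10f, 95f, 9606),  // taxon disabled below
                new SimpleMatch(false, 562, 20f, 1e-10f, 95f, 562));   // below minScore
        System.out.println(computeActive(matches, Set.of(9606), 50f, 0.01f, 0f, true)); // prints {0}
    }
}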
/*
* Copyright (C) 2017 Daniel H. Huson
* Copyright (C) 2018 Daniel H. Huson
*
* (Some files contain contributions from other authors, who are then mentioned separately.)
*
......@@ -52,6 +52,7 @@ public class AssignmentUsingBestHit implements IAssignmentAlgorithm {
// System.err.println("Using 'best hit' assignment on " + cName);
}
/**
* computes the id for a read from its matches
......@@ -68,8 +69,11 @@ public class AssignmentUsingBestHit implements IAssignmentAlgorithm {
return id;
}
if (activeMatches.cardinality() == 0)
if (readBlock.getNumberOfMatches() == 0)
return IdMapper.NOHITS_ID;
if (activeMatches.cardinality() == 0)
return IdMapper.UNASSIGNED_ID;
for (int i = activeMatches.nextSetBit(0); i != -1; i = activeMatches.nextSetBit(i + 1)) {
IMatchBlock match = readBlock.getMatchBlock(i);
int id = match.getId(cName);
......
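The reordered checks above separate two cases that previously fell together: a read with no matches at all is reported as NOHITS_ID, while a read whose matches were all filtered out becomes UNASSIGNED_ID. A minimal sketch of that decision order, with placeholder constants standing in for IdMapper's actual id values:

public class BestHitSpecialCases {
    static final int NOHITS_ID = -1;     // assumed stand-in for IdMapper.NOHITS_ID
    static final int UNASSIGNED_ID = -2; // assumed stand-in for IdMapper.UNASSIGNED_ID

    /** numberOfMatches: all matches for the read; activeCount: matches that survived filtering */
    static int specialCase(int numberOfMatches, int activeCount) {
        if (numberOfMatches == 0)
            return NOHITS_ID;      // the read had no hits at all
        if (activeCount == 0)
            return UNASSIGNED_ID;  // hits exist, but none passed the active-match filter
        return 0;                  // 0 = no special case, proceed to the best-hit lookup
    }

    public static void main(String[] args) {
        System.out.println(specialCase(0, 0));  // -1: no hits
        System.out.println(specialCase(5, 0));  // -2: unassigned
        System.out.println(specialCase(5, 3));  //  0: continue
    }
}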
/*
* Copyright (C) 2017 Daniel H. Huson
* Copyright (C) 2018 Daniel H. Huson
*
* (Some files contain contributions from other authors, who are then mentioned separately.)
*
......@@ -24,7 +24,8 @@ package megan.algorithms;
* Daniel Huson, 3.2016
*/
public class AssignmentUsingBestHitCreator implements IAssignmentAlgorithmCreator {
private final AssignmentUsingBestHit assignmentUsingBestHit;
private final String cName;
private final String fileName;
/**
* constructor
......@@ -32,7 +33,8 @@ public class AssignmentUsingBestHitCreator implements IAssignmentAlgorithmCreato
* @param cName
*/
public AssignmentUsingBestHitCreator(String cName, String fileName) {
assignmentUsingBestHit = new AssignmentUsingBestHit(cName, fileName);
this.cName = cName;
this.fileName = fileName;
System.err.println("Using Best-Hit algorithm for binning: " + cName);
}
......@@ -43,6 +45,6 @@ public class AssignmentUsingBestHitCreator implements IAssignmentAlgorithmCreato
*/
@Override
public IAssignmentAlgorithm createAssignmentAlgorithm() {
return assignmentUsingBestHit;
return new AssignmentUsingBestHit(cName, fileName);
}
}
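Instead of building one AssignmentUsingBestHit in the constructor and handing that same object to every caller, the creator now constructs a fresh instance on each createAssignmentAlgorithm() call. The likely motivation is to keep per-instance state out of shared use, for example when several worker threads bin reads in parallel; that motivation is an assumption, not stated in the diff. A minimal sketch of the pattern with hypothetical stand-in types:

import java.util.function.Supplier;

public class PerCallCreatorSketch {
    interface Algorithm { int computeId(int[] matchIds); }

    // hypothetical stand-in for AssignmentUsingBestHit
    static class BestHit implements Algorithm {
        public int computeId(int[] matchIds) { return matchIds.length == 0 ? -1 : matchIds[0]; }
    }

    static Supplier<Algorithm> creator() {
        return BestHit::new; // a new instance per createAssignmentAlgorithm()-style call
    }

    public static void main(String[] args) {
        Supplier<Algorithm> c = creator();
        System.out.println(c.get() != c.get()); // true: distinct instances, no shared instance state
    }
}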
/*
* Copyright (C) 2018 Daniel H. Huson
*
* (Some files contain contributions from other authors, who are then mentioned separately.)
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package megan.algorithms;
import jloda.graph.Edge;
import jloda.graph.Node;
import megan.classification.Classification;
import megan.classification.ClassificationManager;
import megan.classification.IdMapper;
import megan.classification.data.ClassificationFullTree;
import megan.core.Document;
import megan.data.IMatchBlock;
import megan.data.IReadBlock;
import megan.viewer.TaxonomyData;
import java.util.*;
/**
* performs taxonId assignment using a union-based algorithm
* Created by huson on 4/12/17.
*/
public class AssignmentUsingIntervalUnionLCA implements IAssignmentAlgorithm {
private final float weightedPercentFactor;
private final float topPercent;
private final ClassificationFullTree fullTree;
// all these are used during computation:
private final HashSet<Node> allNodes = new HashSet<>();
private final HashMap<Integer, IntervalList> taxa2intervals = new HashMap<>();
private final Map<Node, Integer> node2covered = new HashMap<>();
private StartStopEvent[] events = new StartStopEvent[10000]; // not final because may get resized...
private final Comparator<StartStopEvent> comparator;
/**
* constructor
*/
public AssignmentUsingIntervalUnionLCA(Document doc) {
this.weightedPercentFactor = Math.min(1f, doc.getLcaCoveragePercent() / 100.0f);
this.topPercent = doc.getTopPercent();
this.fullTree = ClassificationManager.get(Classification.Taxonomy, true).getFullTree();
comparator = createComparator();
}
/**
* compute taxonId id
*
* @param activeMatches
* @param readBlock
* @return taxonId id
*/
public int computeId(BitSet activeMatches, IReadBlock readBlock) {
if (readBlock.getNumberOfMatches() == 0)
return IdMapper.NOHITS_ID;
if (activeMatches.cardinality() == 0)
return IdMapper.UNASSIGNED_ID;
taxa2intervals.clear();
computeTaxaToSegmentsMap(activeMatches, readBlock, taxa2intervals);
if (taxa2intervals.size() == 0)
return IdMapper.UNASSIGNED_ID;
if (taxa2intervals.size() == 1)
return taxa2intervals.keySet().iterator().next();
allNodes.clear();
final Node root = computeInducedTree(taxa2intervals, allNodes);
node2covered.clear();
computeCoveredBasesRec(root, allNodes, taxa2intervals, node2covered);
final double threshold = weightedPercentFactor * node2covered.get(root);
return getLCA(root, allNodes, node2covered, threshold);
}
/**
* computes the taxon to segments map. On each segment, we apply the top-percent filter
*
* @param activeMatches
* @param readBlock
* @param taxa2intervals
*/
private void computeTaxaToSegmentsMap(BitSet activeMatches, IReadBlock readBlock, HashMap<Integer, IntervalList> taxa2intervals) {
// determine all start and stop events:
int numberOfEvents = 0;
for (int m = activeMatches.nextSetBit(0); m != -1; m = activeMatches.nextSetBit(m + 1)) {
final IMatchBlock matchBlock = readBlock.getMatchBlock(m);
int taxonId = matchBlock.getTaxonId();
if (taxonId > 0 && !TaxonomyData.isTaxonDisabled(taxonId)) {
if (numberOfEvents + 1 >= events.length) { // need enough to add two new events
StartStopEvent[] tmp = new StartStopEvent[2 * events.length];
System.arraycopy(events, 0, tmp, 0, numberOfEvents);
events = tmp;
}
if (events[numberOfEvents] == null)
events[numberOfEvents] = new StartStopEvent();
events[numberOfEvents++].set(true, Math.min(matchBlock.getAlignedQueryStart(), matchBlock.getAlignedQueryEnd()), m);
if (events[numberOfEvents] == null)
events[numberOfEvents] = new StartStopEvent();
events[numberOfEvents++].set(false, Math.max(matchBlock.getAlignedQueryStart(), matchBlock.getAlignedQueryEnd()), m);
}
}
Arrays.sort(events, 0, numberOfEvents, comparator);
final BitSet currentMatches = new BitSet(); // set of matches currently active
final Map<Integer, Float> taxon2BestScore = new HashMap<>();
StartStopEvent previousEvent = null;
for (int c = 0; c < numberOfEvents; c++) {
final StartStopEvent currentEvent = events[c];
if (previousEvent == null) {
if (!currentEvent.isStart())
throw new RuntimeException("Taxon end before begin: " + currentEvent);
currentMatches.set(currentEvent.getMatchId());
} else {
if (currentEvent.getPos() > previousEvent.getPos()) {
final int segmentLength = (currentEvent.getPos() - previousEvent.getPos() + 1); // length of segment
if (segmentLength > 0) {
taxon2BestScore.clear();
for (int m = currentMatches.nextSetBit(0); m != -1; m = currentMatches.nextSetBit(m + 1)) {
final IMatchBlock matchBlock = readBlock.getMatchBlock(m);
final int taxonId = matchBlock.getTaxonId(); // store the best score for each taxon
if (taxonId > 0 && !TaxonomyData.isTaxonDisabled(taxonId)) {
Float bestScore = taxon2BestScore.get(taxonId);
if (bestScore == null)
taxon2BestScore.put(taxonId, matchBlock.getBitScore());
else
taxon2BestScore.put(taxonId, Math.max(bestScore, matchBlock.getBitScore()));
}
}
// determine the top-percent threshold on the current segment:
float topPercentThreshold = 0;
for (Float value : taxon2BestScore.values()) {
topPercentThreshold = Math.max(topPercentThreshold, value);
}
topPercentThreshold = (100.0f - topPercent) / 100.0f * topPercentThreshold;
// add the segments for all taxa whose best match exceeds the threshold:
for (Integer taxonId : taxon2BestScore.keySet()) {
if (taxon2BestScore.get(taxonId) >= topPercentThreshold) {
IntervalList intervals = taxa2intervals.get(taxonId);
if (intervals == null) {
intervals = new IntervalList();
taxa2intervals.put(taxonId, intervals);
}
intervals.add(previousEvent.getPos(), currentEvent.getPos());
}
}
}
}
// update the set of current matches:
if (currentEvent.isStart()) {
currentMatches.set(currentEvent.getMatchId());
} else { // is end event
currentMatches.clear(currentEvent.getMatchId());
}
}
previousEvent = currentEvent;
}
for (IntervalList list : taxa2intervals.values()) {
list.setIsSorted(true); // initially, lists are sorted by construction
}
}
/**
* computes the set of all nodes that lie between the given taxa and their LCA
*
* @param taxa2intervals
* @param allNodes
* @return root node
*/
private Node computeInducedTree(HashMap<Integer, IntervalList> taxa2intervals, Set<Node> allNodes) {
// compute the local root node:
final Node rootOfAllNodes;
{
final ArrayList<String> addresses = new ArrayList<>(taxa2intervals.size());
for (Integer taxId : taxa2intervals.keySet()) {
addresses.add(fullTree.getAddress(taxId));
}
final int rootId = fullTree.getAddress2Id(LCAAddressing.getCommonPrefix(addresses, false));
rootOfAllNodes = fullTree.getANode(rootId);
}
allNodes.add(rootOfAllNodes);
// add all nodes between that taxa and the root:
for (Integer taxId : taxa2intervals.keySet()) {
Node v = fullTree.getANode(taxId);
if (v != null) {
while (!allNodes.contains(v)) {
allNodes.add(v);
if (v.getInDegree() > 0)
v = v.getFirstInEdge().getSource();
else
break; // must be v==fullTree.getRoot()
}
}
}
return rootOfAllNodes;
}
/**
* computes the number of bases that each taxon is covered by. Side effect is to change all taxa2intervals intervals.
*
* @param v
* @param allNodes
* @param taxa2intervals
*/
private IntervalList computeCoveredBasesRec(final Node v, final HashSet<Node> allNodes, final HashMap<Integer, IntervalList> taxa2intervals, final Map<Node, Integer> node2covered) {
final int taxId = (Integer) v.getInfo();
final IntervalList intervals;
if (taxa2intervals.get(taxId) != null)
intervals = taxa2intervals.get(taxId);
else {
intervals = new IntervalList();
taxa2intervals.put(taxId, intervals);
}
// get intervals of children:
for (Edge e = v.getFirstOutEdge(); e != null; e = v.getNextOutEdge(e)) {
final Node w = e.getTarget();
if (allNodes.contains(w)) {
final IntervalList intervalsW = computeCoveredBasesRec(w, allNodes, taxa2intervals, node2covered);
intervals.addAll(intervalsW.getAll()); // this will trigger recomputation of amount covered
}
}
node2covered.put(v, intervals.getCovered());
return intervals;
}
/**
* computes the node that is above all nodes whose coverage meets the threshold
*
* @param v
* @param allNodes
* @param node2covered
* @param threshold
* @return LCA of all nodes that meet the threshold
*/
private int getLCA(Node v, HashSet<Node> allNodes, Map<Node, Integer> node2covered, double threshold) {
while (true) {
Node bestChild = null;
for (Edge e = v.getFirstOutEdge(); e != null; e = v.getNextOutEdge(e)) {
final Node w = e.getTarget();
if (allNodes.contains(w)) {
if (node2covered.get(w) >= threshold) {
if (bestChild == null)
bestChild = w;
else { // has at least two best children, return v
return (Integer) v.getInfo();
}
}
}
}
if (bestChild != null)
v = bestChild; // has exactly one child that beats threshold, move down to it
else
return (Integer) v.getInfo(); // no best child, return v
}
}
@Override
public int getLCA(int id1, int id2) {
if (id1 == 0)
return id2;
else if (id2 == 0)
return id1;
else
return fullTree.getAddress2Id(LCAAddressing.getCommonPrefix(new String[]{fullTree.getAddress(id1), fullTree.getAddress(id2)}, 2, false));
}
private Comparator<StartStopEvent> createComparator() {
return new Comparator<StartStopEvent>() {
@Override
public int compare(StartStopEvent a, StartStopEvent b) {
if (a.getPos() < b.getPos())
return -1;
else if (a.getPos() > b.getPos())
return 1;
else if (a.isStart() && b.isEnd())
return -1;
else if (a.isEnd() && b.isStart())
return 1;
else
return 0;
}
};
}
private class StartStopEvent {
private boolean start;
private int pos;
private int matchId;
public void set(boolean start, int pos, int matchId) {
this.start = start;
this.pos = pos;
this.matchId = matchId;
}
public boolean isStart() {
return start;
}
public boolean isEnd() {
return !start;
}
public int getPos() {
return pos;
}
public int getMatchId() {
return matchId;
}
}
}
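The class above rests on two ingredients: per-segment top-percent filtering of matches, and per-taxon interval unions whose covered-base counts are summed up the induced tree and compared against weightedPercentFactor * node2covered.get(root). The sketch below illustrates only the interval-union step with a stand-in Interval record and a coveredBases() helper; MEGAN's IntervalList is assumed to behave like this (inclusive endpoints, as in the segmentLength computation above), but the code is illustrative, not the actual implementation:

import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;

public class IntervalUnionSketch {
    record Interval(int start, int end) {}

    /** merges overlapping/adjacent [start, end] segments and counts the covered bases (inclusive) */
    static int coveredBases(List<Interval> intervals) {
        final List<Interval> sorted = new ArrayList<>(intervals);
        sorted.sort(Comparator.comparingInt(Interval::start));
        int covered = 0, curStart = -1, curEnd = -1;
        for (Interval iv : sorted) {
            if (curEnd < 0 || iv.start() > curEnd + 1) {            // disjoint: flush previous run
                if (curEnd >= 0) covered += curEnd - curStart + 1;
                curStart = iv.start();
                curEnd = iv.end();
            } else                                                   // overlapping/adjacent: extend
                curEnd = Math.max(curEnd, iv.end());
        }
        if (curEnd >= 0) covered += curEnd - curStart + 1;
        return covered;
    }

    public static void main(String[] args) {
        // segments 1-50 and 40-100 overlap, 120-130 is separate: union covers 100 + 11 = 111 bases
        System.out.println(coveredBases(List.of(new Interval(1, 50), new Interval(40, 100),
                new Interval(120, 130))));
    }
}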
/*
* Copyright (C) 2017 Daniel H. Huson
* Copyright (C) 2018 Daniel H. Huson
*
* (Some files contain contributions from other authors, who are then mentioned separately.)
*
......@@ -19,6 +19,7 @@
package megan.algorithms;
import jloda.util.ProgramProperties;
import megan.classification.Classification;
import megan.core.Document;
......@@ -26,19 +27,22 @@ import megan.core.Document;
* create a coverage-based LCA assignment algorithm
* Daniel Huson, 4.2017
*/
public class AssignmentUsingCoverageBasedLCACreator implements IAssignmentAlgorithmCreator {
public class AssignmentUsingIntervalUnionLCACreator implements IAssignmentAlgorithmCreator {
private final Document document;
private final float topPercent;
/**
* constructor
*
* @param document
*/
public AssignmentUsingCoverageBasedLCACreator(Document document, float topPercent) {
public AssignmentUsingIntervalUnionLCACreator(Document document) {
this.document = document;
this.topPercent = topPercent;
System.err.println("Using coverage-based LCA algorithm for binning: " + Classification.Taxonomy);
if (ProgramProperties.get("use-segment-lca", false))
System.err.println("Using 'segment-LCA' algorithm for binning: " + Classification.Taxonomy);
else {
System.err.println(String.format("Using 'Interval-Union-LCA' algorithm (%.1f %%) for binning: %s", document.getLcaCoveragePercent(), Classification.Taxonomy));
//System.err.println("(setprop CheckForChimeras=true; to turn on experimental chimeric read identification)");
}
}
/**
......@@ -47,7 +51,7 @@ public class AssignmentUsingCoverageBasedLCACreator implements IAssignmentAlgori
* @return assignment algorithm
*/
@Override
public AssignmentUsingCoverageBasedLCA createAssignmentAlgorithm() {
return new AssignmentUsingCoverageBasedLCA(document, topPercent);
public IAssignmentAlgorithm createAssignmentAlgorithm() {
return new AssignmentUsingIntervalUnionLCA(document);
}
}
/*
* Copyright (C) 2017 Daniel H. Huson
* Copyright (C) 2018 Daniel H. Huson
*
* (Some files contain contributions from other authors, who are then mentioned separately.)
*
......@@ -33,8 +33,6 @@ import java.util.BitSet;
public class AssignmentUsingLCA implements IAssignmentAlgorithm {
private String[] addresses;
private final BitSet toRemove;
private final String cName;
private final ClassificationFullTree fullTree;
......@@ -45,7 +43,6 @@ public class AssignmentUsingLCA implements IAssignmentAlgorithm {
this.cName = cName;
fullTree = ClassificationManager.get(cName, true).getFullTree();
addresses = new String[1000];
toRemove = new BitSet();
}
/**
......@@ -58,10 +55,11 @@ public class AssignmentUsingLCA implements IAssignmentAlgorithm {
public int computeId(BitSet activeMatches, IReadBlock readBlock) {
if (readBlock.getNumberOfMatches() == 0)
return IdMapper.NOHITS_ID;
if (activeMatches.cardinality() == 0)
return IdMapper.UNASSIGNED_ID;
// compute addresses of all hit taxa:
if (activeMatches.cardinality() > 0) {
boolean hasDisabledMatches = false;
// collect the addresses of all non-disabled taxa:
......@@ -105,12 +103,11 @@ public class AssignmentUsingLCA implements IAssignmentAlgorithm {
// compute LCA using addresses:
if (numberOfAddresses > 0) {
final String address = LCAAddressing.getCommonPrefix(addresses, numberOfAddresses, true);
final Integer id = fullTree.getAddress2Id(address);
if (id != null && id > 0) {
final int id = fullTree.getAddress2Id(address);
if (id > 0)
return id;
}
}
}
// although we had some hits, couldn't make an assignment
return IdMapper.UNASSIGNED_ID;
}
......
/*
* Copyright (C) 2017 Daniel H. Huson
* Copyright (C) 2018 Daniel H. Huson
*
* (Some files contain contributions from other authors, who are then mentioned separately.)
*
......
/*
* Copyright (C) 2017 Daniel H. Huson
* Copyright (C) 2018 Daniel H. Huson
*
* (Some files contain contributions from other authors, who are then mentioned separately.)
*
......@@ -28,6 +28,8 @@ import megan.data.IReadBlock;
import java.util.BitSet;
import java.util.Collection;
import java.util.HashMap;
import java.util.Map;
/**
* computes the taxon assignment for a read, using the LCA algorithm
......@@ -36,22 +38,49 @@ import java.util.Collection;
*/
public class AssignmentUsingLCAForTaxonomy implements IAssignmentAlgorithm {
protected String[] addresses;
private final BitSet activeSet;
private final Map<Character, Integer> ch2weight;
protected final boolean useIdentityFilter;
protected float proportionToCover = 1;
protected final ClassificationFullTree fullTree;
protected final IdMapper idMapper;
protected final Name2IdMap name2idMap;
protected final boolean ignoreAncestralTaxa;
/**
* constructor
*
* @param cName
* @param useIdentityFilter
* @param percentToCover
*/
public AssignmentUsingLCAForTaxonomy(String cName, boolean useIdentityFilter) {
public AssignmentUsingLCAForTaxonomy(String cName, boolean useIdentityFilter, float percentToCover) {
this(cName, useIdentityFilter, percentToCover, true);
}
/**
* constructor
*
* @param cName
* @param useIdentityFilter
* @param percentToCover
* @param ignoreAncestralTaxa
*/
public AssignmentUsingLCAForTaxonomy(String cName, boolean useIdentityFilter, float percentToCover, boolean ignoreAncestralTaxa) {
fullTree = ClassificationManager.get(cName, false).getFullTree();
idMapper = ClassificationManager.get(cName, true).getIdMapper();
name2idMap = ClassificationManager.get(cName, false).getIdMapper().getName2IdMap();
addresses = new String[1000];
activeSet = new BitSet();
ch2weight = new HashMap<>(Character.MAX_VALUE, 1f);
this.useIdentityFilter = useIdentityFilter;
this.proportionToCover = percentToCover / 100f;
this.ignoreAncestralTaxa = ignoreAncestralTaxa;
}
/**
......@@ -64,6 +93,8 @@ public class AssignmentUsingLCAForTaxonomy implements IAssignmentAlgorithm {
public int computeId(BitSet activeMatches, IReadBlock readBlock) {
if (readBlock.getNumberOfMatches() == 0)
return IdMapper.NOHITS_ID;
if (activeMatches.cardinality() == 0)
return IdMapper.UNASSIGNED_ID;
// compute addresses of all hit taxa:
if (activeMatches.cardinality() > 0) {
......@@ -112,14 +143,17 @@ public class AssignmentUsingLCAForTaxonomy implements IAssignmentAlgorithm {
// compute LCA using addresses:
if (numberOfAddresses > 0) {
final String address = LCAAddressing.getCommonPrefix(addresses, numberOfAddresses, true);
int taxId = fullTree.getAddress2Id(address);
if (taxId > 0) {
if (useIdentityFilter) {
taxId = adjustByPercentIdentity(taxId, activeMatches, readBlock, fullTree, name2idMap);
}
return taxId;
final int id;
if (proportionToCover == 1) {
final String address = LCAAddressing.getCommonPrefix(addresses, numberOfAddresses, ignoreAncestralTaxa);
id = fullTree.getAddress2Id(address);
} else {
final int weightToCover = (int) Math.min(numberOfAddresses, Math.ceil(proportionToCover * numberOfAddresses));
final String address = getPrefixCoveringWeight(weightToCover, addresses, numberOfAddresses);
id = fullTree.getAddress2Id(address);
}
if (id > 0)
return id;
}
}
......@@ -157,7 +191,7 @@ public class AssignmentUsingLCAForTaxonomy implements IAssignmentAlgorithm {
// compute LCA using addresses:
if (numberOfAddresses > 0) {
final String address = LCAAddressing.getCommonPrefix(addresses, numberOfAddresses, true);
final String address = LCAAddressing.getCommonPrefix(addresses, numberOfAddresses, ignoreAncestralTaxa);
return fullTree.getAddress2Id(address);
}
return IdMapper.UNASSIGNED_ID;
......@@ -177,7 +211,7 @@ public class AssignmentUsingLCAForTaxonomy implements IAssignmentAlgorithm {
else if (id2 == 0)
return id1;
else
return fullTree.getAddress2Id(LCAAddressing.getCommonPrefix(new String[]{fullTree.getAddress(id1), fullTree.getAddress(id2)}, 2, false));
return fullTree.getAddress2Id(LCAAddressing.getCommonPrefix(new String[]{fullTree.getAddress(id1), fullTree.getAddress(id2)}, 2, ignoreAncestralTaxa));
}
/**
......@@ -247,5 +281,72 @@ public class AssignmentUsingLCAForTaxonomy implements IAssignmentAlgorithm {
} while (changed);
return taxId;
}
/**
* given a set of addresses, returns the longest prefix that equals or exceeds the given weight threshold
*
* @param addresses
* @return prefix
*/
private String getPrefixCoveringWeight(int weightToCover, String[] addresses, int length) {
activeSet.clear();
ch2weight.clear();
for (int i = 0; i < length; i++) {
activeSet.set(i);
}
final StringBuilder buf = new StringBuilder();
for (int pos = 0; ; pos++) {
for (int i = activeSet.nextSetBit(0); i != -1; i = activeSet.nextSetBit(i + 1)) {
if (pos == addresses[i].length()) {
activeSet.set(i, false); // run out of symbols
// weightToCover -= 1; // this node lies on route to best node, so it is covered and its weight can be removed from weightToCover
} else {
char ch = addresses[i].charAt(pos);
Integer count = ch2weight.get(ch);
if (count == null)
ch2weight.put(ch, 1);
else
ch2weight.put(ch, count + 1);
}
}
if (activeSet.cardinality() == 0)
break;
// determine the heaviest character
Character bestCh = null;
int bestCount = 0;
for (Character ch : ch2weight.keySet()) {
Integer weight = ch2weight.get(ch);
if (weight != null && weight > bestCount) {
bestCh = ch;
bestCount = weight;
}
}
if (bestCount >= weightToCover && bestCh != null)
buf.append(bestCh);
else
break;
for (int i = activeSet.nextSetBit(0); i != -1; i = activeSet.nextSetBit(i + 1)) {
if (addresses[i].charAt(pos) != bestCh) // no length problem here, if address too short then it will not be active
activeSet.set(i, false); // not on best path, remove from active nodes
}
if (activeSet.cardinality() == 0)
break;
ch2weight.clear();
}
String result = buf.toString();
if (result.length() > 0) {
return result;
} else
return "";
}
}
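getPrefixCoveringWeight() generalizes the strict LCA: addresses encode root-to-taxon paths one character per level, and instead of requiring every address to share the prefix, the method follows the most frequent character at each position for as long as that character's count still reaches weightToCover (derived from percentToCover). The stand-alone example below, with made-up addresses, illustrates the idea; tie-breaking and the handling of addresses that run out of characters are simplified relative to the method above:

public class WeightedPrefixExample {
    public static void main(String[] args) {
        // four hit taxa; the "AB"-prefixed addresses share the path root -> A -> B
        String[] addresses = {"ABC", "ABD", "ABD", "AXE"};
        // covering all 4 stops after "A"; covering 3 of 4 reaches "AB";
        // covering 2 of 4 reaches "ABD", i.e. a more specific taxon
        System.out.println(prefixCoveringWeight(4, addresses)); // "A"
        System.out.println(prefixCoveringWeight(3, addresses)); // "AB"
        System.out.println(prefixCoveringWeight(2, addresses)); // "ABD"
    }

    /** follows, at each position, the most frequent character while its count >= weightToCover */
    static String prefixCoveringWeight(int weightToCover, String[] addresses) {
        final StringBuilder buf = new StringBuilder();
        final java.util.List<String> active = new java.util.ArrayList<>(java.util.List.of(addresses));
        for (int pos = 0; ; pos++) {
            final java.util.Map<Character, Integer> counts = new java.util.HashMap<>();
            for (String a : active)
                if (pos < a.length())
                    counts.merge(a.charAt(pos), 1, Integer::sum);
            char bestCh = 0;
            int bestCount = 0;
            for (var e : counts.entrySet())
                if (e.getValue() > bestCount) { bestCh = e.getKey(); bestCount = e.getValue(); }
            if (bestCount < weightToCover)
                break;                       // heaviest branch no longer carries enough weight
            buf.append(bestCh);
            final char keep = bestCh;
            active.removeIf(a -> pos >= a.length() || a.charAt(pos) != keep); // stay on best path
            if (active.isEmpty())
                break;
        }
        return buf.toString();
    }
}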