Commits on Source (7)
picard-tools (2.18.14+dfsg-2) UNRELEASED; urgency=medium
picard-tools (2.18.17+dfsg-1) UNRELEASED; urgency=medium
[ Emmanuel Bourg ]
* Team upload.
* Fixed the build failure with Java 11
-- Emmanuel Bourg <ebourg@apache.org> Fri, 30 Nov 2018 13:28:52 +0100
[ Andreas Tille ]
* New upstream version
* Secure URI in copyright format
* Remove trailing whitespace in debian/changelog
* Remove trailing whitespace in debian/control
* Remove trailing whitespace in debian/rules
-- Andreas Tille <tille@debian.org> Sat, 01 Dec 2018 08:03:17 +0100
picard-tools (2.18.14+dfsg-1) unstable; urgency=medium
......
Format: http://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
Source: https://github.com/broadinstitute/picard/releases
Comment: Convenience binary jar files are removed.
Files-Excluded: gradle/wrapper/gradle-wrapper.jar
......
......@@ -244,6 +244,11 @@ public class TheoreticalSensitivity {
* @return Theoretical sensitivity for the given arguments at a constant depth.
*/
public static double sensitivityAtConstantDepth(final int depth, final Histogram<Integer> qualityHistogram, final double logOddsThreshold, final int sampleSize, final double alleleFraction, final long randomSeed) {
// If the depth is 0 at a particular locus, the sensitivity is trivially 0.0.
if (depth == 0) {
return 0.0;
}
final RouletteWheel qualityRW = new RouletteWheel(trimDistribution(normalizeHistogram(qualityHistogram)));
final Random randomNumberGenerator = new Random(randomSeed);
final RandomGenerator rg = new Well19937c(randomSeed);
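A minimal TestNG-style sketch of the new zero-depth guard (test name and values are hypothetical; Histogram is htsjdk.samtools.util.Histogram):
@Test
public void testZeroDepthGivesZeroSensitivity() {
    final Histogram<Integer> qualityHistogram = new Histogram<>();
    qualityHistogram.increment(30, 100.0); // 100 bases at Q30
    final double s = TheoreticalSensitivity.sensitivityAtConstantDepth(
            0, qualityHistogram, 3.0, 1000, 0.5, 42L);
    Assert.assertEquals(s, 0.0, 1e-10); // trivially zero when depth == 0
}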
......@@ -350,13 +355,22 @@ public class TheoreticalSensitivity {
/**
* Removes trailing zeros in a distribution. The purpose of this function is to prevent other
* functions from evaluating in regions where the distribution has zero probability.
* If the array consists of only zeros or is empty, an empty array is returned.
*
* @param distribution Distribution of base qualities
* @return Distribution of base qualities with any trailing zeros removed
*/
static double[] trimDistribution(final double[] distribution) {
int endOfDistribution = distribution.length - 1;
while(distribution[endOfDistribution] == 0) {
int endOfDistribution = distribution.length;
if (distribution.length == 0) {
return distribution;
}
// Locate last element in array that is non-zero if it exists.
// If there are no non-zero elements, endOfDistribution should be 0.
while (distribution[endOfDistribution - 1] == 0) {
endOfDistribution--;
if (endOfDistribution == 0) break;
}
// Remove trailing zeros and return.
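For reference, a behaviorally equivalent sketch of the full fixed method; the final copy step is truncated in the hunk above and is assumed to use java.util.Arrays.copyOfRange:
static double[] trimDistribution(final double[] distribution) {
    if (distribution.length == 0) {
        return distribution;
    }
    int endOfDistribution = distribution.length;
    // Walk back from the end until a non-zero element is found;
    // if every element is zero, endOfDistribution reaches 0.
    while (endOfDistribution > 0 && distribution[endOfDistribution - 1] == 0) {
        endOfDistribution--;
    }
    // e.g. {0.2, 0.8, 0.0, 0.0} -> {0.2, 0.8}; {0.0, 0.0} -> {}
    return Arrays.copyOfRange(distribution, 0, endOfDistribution);
}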
......
......@@ -224,7 +224,9 @@ public class CheckFingerprint extends CommandLineProgram {
outputDetailMetricsFile = DETAIL_OUTPUT;
outputSummaryMetricsFile = SUMMARY_OUTPUT;
} else {
if (!OUTPUT.endsWith(".")) OUTPUT = OUTPUT + ".";
if (!OUTPUT.endsWith(".")) {
OUTPUT += ".";
}
outputDetailMetricsFile = new File(OUTPUT + FINGERPRINT_DETAIL_FILE_SUFFIX);
outputSummaryMetricsFile = new File(OUTPUT + FINGERPRINT_SUMMARY_FILE_SUFFIX);
}
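Illustration of the base-name handling above (the suffix values shown are assumptions about CheckFingerprint's constants):
String output = "sample1";            // user-supplied OUTPUT base
if (!output.endsWith(".")) {
    output += ".";
}
// => new File(output + FINGERPRINT_DETAIL_FILE_SUFFIX), e.g.
//    "sample1.fingerprinting_detail_metrics" (assumed suffix value)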
......@@ -245,8 +247,7 @@ public class CheckFingerprint extends CommandLineProgram {
List<FingerprintResults> results;
String observedSampleAlias = null;
final boolean isBamOrSamFile = isBamOrSam(inputPath);
if (isBamOrSamFile) {
if (isBamOrSam(inputPath)) {
SequenceUtil.assertSequenceDictionariesEqual(SAMSequenceDictionaryExtractor.extractDictionary(inputPath), SAMSequenceDictionaryExtractor.extractDictionary(genotypesPath), true);
SequenceUtil.assertSequenceDictionariesEqual(SAMSequenceDictionaryExtractor.extractDictionary(inputPath), checker.getHeader().getSequenceDictionary(), true);
......@@ -311,6 +312,7 @@ public class CheckFingerprint extends CommandLineProgram {
final MetricsFile<FingerprintingSummaryMetrics, ?> summaryFile = getMetricsFile();
final MetricsFile<FingerprintingDetailMetrics, ?> detailsFile = getMetricsFile();
boolean allZeroLod = true;
for (final FingerprintResults fpr : results) {
final MatchResults mr = fpr.getMatchResults().first();
......@@ -365,11 +367,20 @@ public class CheckFingerprint extends CommandLineProgram {
summaryFile.addMetric(metrics);
log.info("Read Group: " + metrics.READ_GROUP + " / " + observedSampleAlias + " vs. " + metrics.SAMPLE + ": LOD = " + metrics.LOD_EXPECTED_SAMPLE);
allZeroLod &= metrics.LOD_EXPECTED_SAMPLE == 0;
}
summaryFile.write(outputSummaryMetricsFile);
detailsFile.write(outputDetailMetricsFile);
if (allZeroLod) {
log.error("No non-zero results found. This is likely an error. " +
"Probable cause: EXPECTED_SAMPLE (if provided) or the sample name from INPUT (if EXPECTED_SAMPLE isn't provided)" +
"isn't a sample in GENOTYPES file.");
return 1;
}
return 0;
}
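The &= accumulator above stays true only if every read group's LOD is exactly zero; a minimal standalone illustration:
boolean allZeroLod = true;
for (final double lod : new double[]{0.0, 3.2, 0.0}) {
    allZeroLod &= (lod == 0);
}
// allZeroLod == false here, so doWork would return 0 rather than 1.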
......@@ -389,10 +400,6 @@ public class CheckFingerprint extends CommandLineProgram {
return super.customCommandLineValidation();
}
static boolean isBamOrSam(final File f) {
return isBamOrSam(f.toPath());
}
static boolean isBamOrSam(final Path p) {
return (p.toUri().getRawPath().endsWith(BamFileIoUtils.BAM_FILE_EXTENSION) || p.toUri().getRawPath().endsWith(IOUtil.SAM_FILE_EXTENSION));
}
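Usage sketch (the htsjdk extension constants resolve to ".bam" and ".sam"):
isBamOrSam(new File("reads.bam"));      // true, via the File overload
isBamOrSam(Paths.get("reads.sam"));     // true
isBamOrSam(Paths.get("calls.vcf"));     // false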
......
......@@ -281,7 +281,7 @@ public class IlluminaBasecallsToFastq extends CommandLineProgram {
FIRST_TILE, TILE_LIMIT, queryNameComparator,
new FastqRecordsForClusterCodec(readStructure.templates.length(),
readStructure.sampleBarcodes.length(), readStructure.molecularBarcode.length()),
FastqRecordsForCluster.class, bclQualityEvaluationStrategy, IGNORE_UNEXPECTED_BARCODES);
FastqRecordsForCluster.class, bclQualityEvaluationStrategy, INCLUDE_NON_PF_READS, IGNORE_UNEXPECTED_BARCODES);
} else {
basecallsConverter = new IlluminaBasecallsConverter<>(BASECALLS_DIR, BARCODES_DIR, LANE, readStructure,
sampleBarcodeFastqWriterMap, demultiplex, Math.max(1, MAX_READS_IN_RAM_PER_TILE / readsPerCluster), TMP_DIR, NUM_PROCESSORS,
......
......@@ -312,7 +312,7 @@ public class IlluminaBasecallsToSam extends CommandLineProgram {
TMP_DIR, NUM_PROCESSORS,
FIRST_TILE, TILE_LIMIT, new QueryNameComparator(),
new Codec(numOutputRecords),
SAMRecordsForCluster.class, bclQualityEvaluationStrategy, IGNORE_UNEXPECTED_BARCODES);
SAMRecordsForCluster.class, bclQualityEvaluationStrategy, INCLUDE_NON_PF_READS, IGNORE_UNEXPECTED_BARCODES);
} else {
basecallsConverter = new IlluminaBasecallsConverter<>(BASECALLS_DIR, BARCODES_DIR, LANE, readStructure,
barcodeSamWriterMap, true, MAX_READS_IN_RAM_PER_TILE / numOutputRecords, TMP_DIR, NUM_PROCESSORS, FORCE_GC,
......
......@@ -14,11 +14,9 @@ import picard.illumina.parser.ReadStructure;
import picard.illumina.parser.readers.AbstractIlluminaPositionFileReader;
import picard.illumina.parser.readers.BclQualityEvaluationStrategy;
import picard.illumina.parser.readers.LocsFileReader;
import picard.util.ThreadPoolExecutorUtil;
import picard.util.ThreadPoolExecutorWithExceptions;
import java.io.File;
import java.time.Duration;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
......@@ -26,7 +24,9 @@ import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
......@@ -36,10 +36,12 @@ public class NewIlluminaBasecallsConverter<CLUSTER_OUTPUT_RECORD> extends Baseca
private final List<AbstractIlluminaPositionFileReader.PositionInfo> locs = new ArrayList<>();
private final File[] filterFiles;
private final Map<String, ThreadPoolExecutorWithExceptions> barcodeWriterThreads = new HashMap<>();
private final Map<Integer, List<RecordWriter>> completedWork = Collections.synchronizedMap(new HashMap<>());
private final Map<Integer, File> barcodesFiles = new HashMap<>();
private final boolean includeNonPfReads;
/**
* Constructor setting includeNonPfReads to true by default.
*
* @param basecallsDir Where to read basecalls from.
* @param barcodesDir Where to read barcodes from (optional; use basecallsDir if not specified).
* @param lane What lane to process.
......@@ -71,11 +73,52 @@ public class NewIlluminaBasecallsConverter<CLUSTER_OUTPUT_RECORD> extends Baseca
final Class<CLUSTER_OUTPUT_RECORD> outputRecordClass,
final BclQualityEvaluationStrategy bclQualityEvaluationStrategy,
final boolean ignoreUnexpectedBarcodes) {
this(basecallsDir, barcodesDir, lane, readStructure, barcodeRecordWriterMap,
demultiplex, maxReadsInRamPerTile, tmpDirs, numProcessors, firstTile,
tileLimit, outputRecordComparator, codecPrototype, outputRecordClass,
bclQualityEvaluationStrategy, true, ignoreUnexpectedBarcodes);
}
/**
* @param basecallsDir Where to read basecalls from.
* @param barcodesDir Where to read barcodes from (optional; use basecallsDir if not specified).
* @param lane What lane to process.
* @param readStructure How to interpret each cluster.
* @param barcodeRecordWriterMap Map from barcode to CLUSTER_OUTPUT_RECORD writer. If demultiplex is false, must contain
* one writer stored with key=null.
* @param demultiplex If true, output is split by barcode, otherwise all are written to the same output stream.
* @param maxReadsInRamPerTile Configures number of reads each tile will store in RAM before spilling to disk.
* @param tmpDirs For SortingCollection spilling.
* @param numProcessors Controls number of threads. If <= 0, the number of threads allocated is
* available cores - numProcessors.
* @param firstTile (For debugging) If non-null, start processing at this tile.
* @param tileLimit (For debugging) If non-null, process no more than this many tiles.
* @param outputRecordComparator For sorting output records within a single tile.
* @param codecPrototype For spilling output records to disk.
* @param outputRecordClass Inconveniently needed to create SortingCollections.
* @param includeNonPfReads If true, will include ALL reads (including those which do not have PF set)
* @param ignoreUnexpectedBarcodes If true, will ignore reads whose called barcode is not found in barcodeRecordWriterMap.
*/
public NewIlluminaBasecallsConverter(final File basecallsDir, final File barcodesDir, final int lane,
final ReadStructure readStructure,
final Map<String, ? extends ConvertedClusterDataWriter<CLUSTER_OUTPUT_RECORD>> barcodeRecordWriterMap,
final boolean demultiplex,
final int maxReadsInRamPerTile,
final List<File> tmpDirs, final int numProcessors,
final Integer firstTile,
final Integer tileLimit,
final Comparator<CLUSTER_OUTPUT_RECORD> outputRecordComparator,
final SortingCollection.Codec<CLUSTER_OUTPUT_RECORD> codecPrototype,
final Class<CLUSTER_OUTPUT_RECORD> outputRecordClass,
final BclQualityEvaluationStrategy bclQualityEvaluationStrategy,
final boolean includeNonPfReads,
final boolean ignoreUnexpectedBarcodes) {
super(barcodeRecordWriterMap, maxReadsInRamPerTile, tmpDirs, codecPrototype, ignoreUnexpectedBarcodes,
demultiplex, outputRecordComparator, bclQualityEvaluationStrategy,
outputRecordClass, numProcessors, new IlluminaDataProviderFactory(basecallsDir,
barcodesDir, lane, readStructure, bclQualityEvaluationStrategy));
this.includeNonPfReads = includeNonPfReads;
this.tiles = new ArrayList<>();
barcodeRecordWriterMap.keySet().forEach(barcode -> barcodeWriterThreads.put(barcode, new ThreadPoolExecutorWithExceptions(1)));
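Each barcode gets a one-thread executor, so tasks submitted for a given barcode run strictly in submission order; a hypothetical illustration with the standard library:
final ExecutorService oneWriterThread = Executors.newSingleThreadExecutor();
oneWriterThread.submit(() -> System.out.println("write tile 1101"));
oneWriterThread.submit(() -> System.out.println("write tile 1102")); // always second
oneWriterThread.shutdown();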
......@@ -153,26 +196,31 @@ public class NewIlluminaBasecallsConverter<CLUSTER_OUTPUT_RECORD> extends Baseca
completedWorkExecutor.shutdown();
// Thread by surface tile.
final ThreadPoolExecutorWithExceptions tileProcessingExecutor = new ThreadPoolExecutorWithExceptions(numThreads);
final ThreadPoolExecutor tileProcessingExecutor = new ThreadPoolExecutorWithExceptions(numThreads);
workChecker.setTileProcessingExecutor(tileProcessingExecutor);
for (final Integer tile : tiles) {
tileProcessingExecutor.submit(new TileProcessor(tile, barcodesFiles.get(tile)));
tileProcessingExecutor.submit(new TileProcessor(tile, barcodesFiles.get(tile), workChecker));
}
tileProcessingExecutor.shutdown();
ThreadPoolExecutorUtil.awaitThreadPoolTermination("Reading executor", tileProcessingExecutor, Duration.ofMinutes(5));
awaitThreadPoolTermination("Reading executor", tileProcessingExecutor);
awaitThreadPoolTermination("Tile completion executor", completedWorkExecutor);
// If there was an exception while reading, initiate an immediate shutdown.
if (tileProcessingExecutor.exception != null) {
int tasksStillRunning = completedWorkExecutor.shutdownNow().size();
tasksStillRunning += barcodeWriterThreads.values().stream().mapToLong(executor -> executor.shutdownNow().size()).sum();
throw new PicardException("Reading executor had exceptions. There were " + tasksStillRunning
+ " tasks were still running or queued and have been cancelled.", tileProcessingExecutor.exception);
} else {
ThreadPoolExecutorUtil.awaitThreadPoolTermination("Tile completion executor", completedWorkExecutor, Duration.ofMinutes(5));
barcodeWriterThreads.values().forEach(ThreadPoolExecutor::shutdown);
barcodeWriterThreads.forEach((barcode, executor) -> ThreadPoolExecutorUtil.awaitThreadPoolTermination(barcode + " writer", executor, Duration.ofMinutes(5)));
barcodeWriterThreads.forEach((barcode, executor) -> awaitThreadPoolTermination(barcode + " writer", executor));
}
private void awaitThreadPoolTermination(final String executorName, final ThreadPoolExecutor executorService) {
try {
while (!executorService.awaitTermination(1, TimeUnit.SECONDS)) {
log.info(String.format("%s waiting for job completion. Finished jobs - %d : Running jobs - %d : Queued jobs - %d",
executorName, executorService.getCompletedTaskCount(), executorService.getActiveCount(),
executorService.getQueue().size()));
}
} catch (final InterruptedException e) {
e.printStackTrace();
}
}
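Note that the catch block above swallows the interrupt; a more defensive variant (an alternative sketch, not what this patch does) would restore the flag:
} catch (final InterruptedException e) {
    Thread.currentThread().interrupt(); // preserve interrupt status for callers
    throw new PicardException("Interrupted awaiting executor termination", e);
}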
......@@ -201,30 +249,16 @@ public class NewIlluminaBasecallsConverter<CLUSTER_OUTPUT_RECORD> extends Baseca
}
}
private class Closer implements Runnable {
private final ConvertedClusterDataWriter<CLUSTER_OUTPUT_RECORD> writer;
private final String barcode;
private Closer(final ConvertedClusterDataWriter<CLUSTER_OUTPUT_RECORD> writer, final String barcode) {
this.writer = writer;
this.barcode = barcode;
}
@Override
public void run() {
log.debug("Closing writer for barcode " + barcode);
this.writer.close();
}
}
private class TileProcessor implements Runnable {
private final int tileNum;
private final Map<String, SortingCollection<CLUSTER_OUTPUT_RECORD>> barcodeToRecordCollection = new HashMap<>();
private final File barcodeFile;
private final CompletedWorkChecker workChecker;
TileProcessor(final int tileNum, final File barcodeFile) {
TileProcessor(final int tileNum, final File barcodeFile, CompletedWorkChecker workChecker) {
this.tileNum = tileNum;
this.barcodeFile = barcodeFile;
this.workChecker = workChecker;
}
@Override
......@@ -234,9 +268,13 @@ public class NewIlluminaBasecallsConverter<CLUSTER_OUTPUT_RECORD> extends Baseca
while (dataProvider.hasNext()) {
final ClusterData cluster = dataProvider.next();
readProgressLogger.record(null, 0);
// If this cluster passed filter (PF), or we also want to emit non-PF reads,
// then add it to the output.
if (cluster.isPf() || includeNonPfReads) {
final String barcode = (demultiplex ? cluster.getMatchedBarcode() : null);
addRecord(barcode, converter.convertClusterToOutputRecord(cluster));
}
}
dataProvider.close();
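The guard reduces to a pure function; an illustrative truth table:
static boolean emitCluster(final boolean isPf, final boolean includeNonPfReads) {
    return isPf || includeNonPfReads;
}
// emitCluster(true, false)  == true   (PF reads are always emitted)
// emitCluster(false, true)  == true   (non-PF kept when requested)
// emitCluster(false, false) == false  (non-PF dropped otherwise)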
......@@ -247,7 +285,8 @@ public class NewIlluminaBasecallsConverter<CLUSTER_OUTPUT_RECORD> extends Baseca
writerList.add(new RecordWriter(writer, value, barcode));
});
completedWork.put(tileNum, writerList);
workChecker.registerCompletedWork(tileNum, writerList);
log.info("Finished processing tile " + tileNum);
}
......@@ -275,19 +314,21 @@ public class NewIlluminaBasecallsConverter<CLUSTER_OUTPUT_RECORD> extends Baseca
final int maxRecordsInRam =
Math.max(1, maxReadsInRamPerTile /
barcodeRecordWriterMap.size());
return SortingCollection.newInstanceFromPaths(
return SortingCollection.newInstance(
outputRecordClass,
codecPrototype.clone(),
outputRecordComparator,
maxRecordsInRam,
IOUtil.filesToPaths(tmpDirs));
tmpDirs);
}
}
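Worked example of the RAM split above (numbers hypothetical):
// With maxReadsInRamPerTile = 1_000_000 and 96 barcode writers:
final int maxRecordsInRam = Math.max(1, 1_000_000 / 96); // ~10,416 records per collection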
private class CompletedWorkChecker implements Runnable {
private final Map<Integer, List<RecordWriter>> completedWork = new ConcurrentHashMap<>();
private int currentTileIndex = 0;
private ThreadPoolExecutor tileProcessingExecutor;
@Override
public void run() {
......@@ -296,19 +337,30 @@ public class NewIlluminaBasecallsConverter<CLUSTER_OUTPUT_RECORD> extends Baseca
if (completedWork.containsKey(currentTile)) {
log.info("Writing out tile " + currentTile);
completedWork.get(currentTile).forEach(writer -> barcodeWriterThreads.get(writer.getBarcode()).submit(writer));
completedWork.remove(currentTile);
currentTileIndex++;
} else {
try {
Thread.sleep(5000);
} catch (final InterruptedException e) {
throw new PicardException(e.getMessage(), e);
}
if (tileProcessingExecutor.isTerminated() && completedWork.isEmpty()) {
break;
}
}
// We are all done scheduling work; now shut down each barcode writer thread and close its writer.
barcodeRecordWriterMap.forEach((barcode, writer) -> barcodeWriterThreads.get(barcode).submit(new Closer(writer, barcode)));
barcodeRecordWriterMap.forEach((barcode, writer) -> {
ThreadPoolExecutorWithExceptions executorService = barcodeWriterThreads.get(barcode);
executorService.shutdown();
awaitThreadPoolTermination( "Barcode writer (" + barcode + ")", executorService);
log.debug("Closing writer for barcode " + barcode);
writer.close();
});
}
void registerCompletedWork(int tileNum, List<RecordWriter> writerList) {
completedWork.put(tileNum, writerList);
}
void setTileProcessingExecutor(ThreadPoolExecutor tileProcessingExecutor) {
this.tileProcessingExecutor = tileProcessingExecutor;
}
}
}
......@@ -42,7 +42,7 @@ public abstract class BaseIlluminaDataProvider implements Iterator<ClusterData>,
}
protected void addData(final ClusterData clusterData, final PfData pfData) {
clusterData.setPf(pfData.isPf());
clusterData.setPf(pfData.isPfRead());
}
protected void addData(final ClusterData clusterData, final BarcodeData barcodeData) {
......@@ -72,9 +72,9 @@ public abstract class BaseIlluminaDataProvider implements Iterator<ClusterData>,
for (int i = 0; i < numReads; i++) {
clusterData.getRead(i).setQualities(qualities[i]);
}
clusterData.setPf(cbclData.isPf());
clusterData.setX(cbclData.getPositionInfo().xQseqCoord);
clusterData.setY(cbclData.getPositionInfo().yQseqCoord);
clusterData.setPf(cbclData.isPfRead());
clusterData.setX(cbclData.getXCoordinate());
clusterData.setY(cbclData.getYCoordinate());
}
abstract void seekToTile(int seekAfterFirstRead);
......
......@@ -9,30 +9,27 @@ import picard.illumina.parser.readers.AbstractIlluminaPositionFileReader;
public class CbclData extends BclData implements PfData, PositionalData {
private final int tile;
private AbstractIlluminaPositionFileReader.PositionInfo positionInfo;
private boolean pfRead;
public CbclData(int[] outputLengths, int tile) {
super(outputLengths);
this.tile = tile;
}
//CBCLs currently only contain PF reads.
@Override
public boolean isPf() {
return true;
}
public int getTile() {
return tile;
}
public AbstractIlluminaPositionFileReader.PositionInfo getPositionInfo() {
return positionInfo;
}
public void setPositionInfo(AbstractIlluminaPositionFileReader.PositionInfo positionInfo) {
this.positionInfo = positionInfo;
}
public void setPfRead(boolean pfRead) {
this.pfRead = pfRead;
}
@Override
public int getXCoordinate() {
return this.positionInfo.xQseqCoord;
......@@ -42,4 +39,9 @@ public class CbclData extends BclData implements PfData, PositionalData {
public int getYCoordinate() {
return this.positionInfo.yQseqCoord;
}
@Override
public boolean isPfRead() {
return pfRead;
}
}
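Hypothetical use of the new field: the PF flag now comes from the tile's filter file instead of being hard-coded:
final CbclData data = new CbclData(new int[]{151, 151}, 1101);
data.setPfRead(false);   // value read from the tile's .filter file
data.isPfRead();         // false; the old isPf() always returned true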
......@@ -65,7 +65,7 @@ class FilterParser extends PerTileParser<PfData> {
public PfData next() {
final boolean nextValue = reader.next();
return new PfData() {
public boolean isPf() {
public boolean isPfRead() {
return nextValue;
}
};
......
......@@ -59,7 +59,7 @@ interface RawIntensityData extends IlluminaData{
}
interface PfData extends IlluminaData {
public boolean isPf();
public boolean isPfRead();
}
interface BarcodeData extends IlluminaData {
......
......@@ -46,7 +46,7 @@ public class MultiTileFilterParser extends MultiTileParser<PfData> {
final boolean nextVal = reader.next();
return new PfData() {
@Override
public boolean isPf() {
public boolean isPfRead() {
return nextVal;
}
};
......
......@@ -200,13 +200,13 @@ public class CbclReader extends BaseBclReader implements CloseableIterator<CbclD
for (final int outputLength : outputLengths) {
for (int cycle = 0; cycle < outputLength; cycle++) {
final CycleData currentCycleData = cycleData[totalCycleCount];
final TileData currentCycleTileInfo = cycleData[totalCycleCount].tileInfo;
try {
if (cachedTile[totalCycleCount] == null) {
if (!cachedFilter.containsKey(cycleData[totalCycleCount].tileInfo.tileNum)) {
cacheFilterAndLocs(cycleData[totalCycleCount].tileInfo, locs);
if (!cachedFilter.containsKey(currentCycleTileInfo.tileNum)) {
cacheFilterAndLocs(currentCycleTileInfo, locs);
}
cacheTile(totalCycleCount, cycleData[totalCycleCount].tileInfo, currentCycleData);
cacheTile(totalCycleCount, currentCycleTileInfo);
}
} catch (final IOException e) {
// when logging the error, increment cycle by 1, since totalCycleCount is zero-indexed but Illumina directories are 1-indexed.
......@@ -269,7 +269,9 @@ public class CbclReader extends BaseBclReader implements CloseableIterator<CbclD
private void advance() {
int totalCycleCount = 0;
final CbclData data = new CbclData(outputLengths, cycleData[totalCycleCount].tileInfo.tileNum);
int tileNum = cycleData[totalCycleCount].tileInfo.tileNum;
final CbclData data = new CbclData(outputLengths, tileNum);
int tilePosition = cachedTilePosition[totalCycleCount];
for (int read = 0; read < outputLengths.length; read++) {
for (int cycle = 0; cycle < outputLengths[read]; cycle++) {
......@@ -288,31 +290,25 @@ public class CbclReader extends BaseBclReader implements CloseableIterator<CbclD
totalCycleCount++;
}
}
data.setPositionInfo(positionInfoIterator.next());
data.setPfRead(cachedFilter.get(tileNum).get(tilePosition));
this.queue = data;
}
private void cacheFilterAndLocs(final TileData currentTileData, final List<AbstractIlluminaPositionFileReader.PositionInfo> locs) {
final List<Boolean> filterValues = new ArrayList<>();
final FilterFileReader reader = new FilterFileReader(filterFileMap.get(currentTileData.tileNum));
final Iterator<AbstractIlluminaPositionFileReader.PositionInfo> positionInfoIterator = locs.iterator();
while (reader.hasNext()) {
filterValues.add(reader.next());
}
final List<AbstractIlluminaPositionFileReader.PositionInfo> positions = new ArrayList<>();
for (final boolean filterValue : filterValues) {
final AbstractIlluminaPositionFileReader.PositionInfo info = positionInfoIterator.next();
if (filterValue) {
positions.add(info);
}
}
this.positionInfoIterator = positions.iterator();
this.positionInfoIterator = locs.iterator();
cachedFilter.put(currentTileData.tileNum, filterValues);
}
private void cacheTile(final int totalCycleCount, final TileData tileData, final CycleData currentCycleData) throws IOException {
private void cacheTile(final int totalCycleCount, final TileData tileData) throws IOException {
final byte[] tileByteArray = new byte[tileData.compressedBlockSize];
// Read the whole compressed block into a buffer, then sanity check the length
......@@ -333,38 +329,11 @@ public class CbclReader extends BaseBclReader implements CloseableIterator<CbclD
byte[] decompressedByteArray = decompressTile(totalCycleCount, tileData, byteInputStream);
// Read uncompressed data from the buffer and expand each nibble into a full byte for ease of use
byte[] unNibbledByteArray = promoteNibblesToBytes(decompressedByteArray);
cachedTile[totalCycleCount] = filterNonPfReads(tileData, currentCycleData, unNibbledByteArray);
cachedTile[totalCycleCount] = promoteNibblesToBytes(decompressedByteArray);
cachedTilePosition[totalCycleCount] = 0;
}
private byte[] filterNonPfReads(TileData tileData, CycleData currentCycleData, byte[] unNibbledByteArray) {
// Write buffer contents to cached tile array
// if nonPF reads are included we need to strip them out
if (!currentCycleData.pfExcluded) {
final List<Boolean> filterDatas = cachedFilter.get(tileData.tileNum);
int sum = 0;
for (final boolean b : filterDatas) {
sum += b ? 1 : 0;
}
final byte[] filteredByteArray = new byte[sum];
int filterIndex = 0;
int basecallIndex = 0;
for (final boolean filterData : filterDatas) {
final byte readByte = unNibbledByteArray[filterIndex];
if (filterData) {
filteredByteArray[basecallIndex] = readByte;
basecallIndex++;
}
filterIndex++;
}
return filteredByteArray;
} else {
return unNibbledByteArray;
}
}
private byte[] promoteNibblesToBytes(byte[] decompressedByteArray) {
// Explode the nibbles into bytes for ease of downstream use.
final byte[] unNibbledByteArray = new byte[decompressedByteArray.length * 2];
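A sketch of the expansion loop that presumably follows (low-nibble-first ordering is an assumption about the CBCL two-clusters-per-byte layout):
for (int i = 0; i < decompressedByteArray.length; i++) {
    unNibbledByteArray[2 * i]     = (byte) (decompressedByteArray[i] & 0x0F);
    unNibbledByteArray[2 * i + 1] = (byte) ((decompressedByteArray[i] >> 4) & 0x0F);
}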
......@@ -407,10 +376,6 @@ public class CbclReader extends BaseBclReader implements CloseableIterator<CbclD
return decompressedByteArray;
}
public CycleData[] getCycleData() {
return cycleData;
}
public int getHeaderSize() {
return headerSize;
}
......
......@@ -181,7 +181,7 @@ public class RevertSam extends CommandLineProgram {
@Argument(shortName = StandardOptionDefinitions.USE_ORIGINAL_QUALITIES_SHORT_NAME, doc = "True to restore original qualities from the OQ field to the QUAL field if available.")
public boolean RESTORE_ORIGINAL_QUALITIES = true;
@Argument(doc = "Remove duplicate read flags from all reads. Note that if this is true and REMOVE_ALIGNMENT_INFORMATION==false, " +
@Argument(doc = "Remove duplicate read flags from all reads. Note that if this is false and REMOVE_ALIGNMENT_INFORMATION==true, " +
" the output may have the unusual but sometimes desirable trait of having unmapped reads that are marked as duplicates.")
public boolean REMOVE_DUPLICATE_INFORMATION = true;
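For context, a hedged example invocation (paths hypothetical) that exercises the case the corrected doc describes:
java -jar picard.jar RevertSam \
    I=mapped.bam \
    O=reverted.bam \
    REMOVE_ALIGNMENT_INFORMATION=true \
    REMOVE_DUPLICATE_INFORMATION=false
With this combination, the output may contain unmapped reads that are still flagged as duplicates.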
......
......@@ -38,11 +38,11 @@ import htsjdk.samtools.DuplicateSet;
import htsjdk.samtools.DuplicateSetIterator;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.util.CloseableIterator;
import htsjdk.samtools.util.StringUtil;
import picard.PicardException;
import java.util.*;
import static htsjdk.samtools.util.StringUtil.hammingDistance;
/**
* UmiAwareDuplicateSetIterator is an iterator that wraps a duplicate set iterator
......@@ -55,10 +55,11 @@ class UmiAwareDuplicateSetIterator implements CloseableIterator<DuplicateSet> {
private Iterator<DuplicateSet> nextSetsIterator;
private final int maxEditDistanceToJoin;
private final String umiTag;
private final String inferredUmiTag;
private final String molecularIdentifierTag;
private final boolean allowMissingUmis;
private boolean isOpen = false;
private Map<String, UmiMetrics> umiMetricsMap;
private final boolean duplexUmi;
private final Map<String, UmiMetrics> umiMetricsMap;
private boolean haveWeSeenFirstRead = false;
private long observedUmiBases = 0;
......@@ -69,19 +70,20 @@ class UmiAwareDuplicateSetIterator implements CloseableIterator<DuplicateSet> {
* @param wrappedIterator Iterator of DuplicatesSets to use and break-up by UMI.
* @param maxEditDistanceToJoin The edit distance between UMIs that will be used to union UMIs into groups
* @param umiTag The tag used in the bam file that designates the UMI
* @param molecularIdentifierTag The tag in the bam file that designates the molecular identifier
* @param allowMissingUmis Allow for SAM Records that do not have UMIs
* @param duplexUmi Whether to treat the UMIs as duplex (top and bottom strand)
* @param umiMetricsMap Map of UMI Metrics indexed by library name
*/
UmiAwareDuplicateSetIterator(final DuplicateSetIterator wrappedIterator, final int maxEditDistanceToJoin,
final String umiTag, final String assignedUmiTag, final boolean allowMissingUmis,
final String umiTag, final String molecularIdentifierTag,
final boolean allowMissingUmis, final boolean duplexUmi,
final Map<String, UmiMetrics> umiMetricsMap) {
this.wrappedIterator = wrappedIterator;
this.maxEditDistanceToJoin = maxEditDistanceToJoin;
this.umiTag = umiTag;
this.inferredUmiTag = assignedUmiTag;
this.molecularIdentifierTag = molecularIdentifierTag;
this.allowMissingUmis = allowMissingUmis;
this.umiMetricsMap = umiMetricsMap;
this.duplexUmi = duplexUmi;
isOpen = true;
nextSetsIterator = Collections.emptyIterator();
}
......@@ -132,11 +134,11 @@ class UmiAwareDuplicateSetIterator implements CloseableIterator<DuplicateSet> {
throw new PicardException("nextSetsIterator is expected to be empty, but already contains data.");
}
final UmiGraph umiGraph = new UmiGraph(set, umiTag, inferredUmiTag, allowMissingUmis);
final UmiGraph umiGraph = new UmiGraph(set, umiTag, molecularIdentifierTag, allowMissingUmis, duplexUmi);
// Get the UMI metrics for the library of this duplicate set, creating a new one if necessary.
final String library = set.getRepresentative().getReadGroup().getLibrary();
UmiMetrics metrics = umiMetricsMap.computeIfAbsent(library, UmiMetrics::new);
final UmiMetrics metrics = umiMetricsMap.computeIfAbsent(library, UmiMetrics::new);
final List<DuplicateSet> duplicateSets = umiGraph.joinUmisIntoDuplicateSets(maxEditDistanceToJoin);
......@@ -144,11 +146,9 @@ class UmiAwareDuplicateSetIterator implements CloseableIterator<DuplicateSet> {
// and total numbers of observed and inferred UMIs
for (final DuplicateSet ds : duplicateSets) {
final List<SAMRecord> records = ds.getRecords();
final SAMRecord representativeRead = ds.getRepresentative();
final String inferredUmi = representativeRead.getStringAttribute(inferredUmiTag);
for (final SAMRecord rec : records) {
final String currentUmi = UmiUtil.getSanitizedUMI(rec, umiTag);
final String currentUmi = UmiUtil.getTopStrandNormalizedUmi(rec, umiTag, duplexUmi);
if (currentUmi != null) {
// All UMIs should be the same length; the code presently does not support variable-length UMIs.
......@@ -157,19 +157,21 @@ class UmiAwareDuplicateSetIterator implements CloseableIterator<DuplicateSet> {
if (currentUmi.contains("N")) {
metrics.addUmiObservationN();
} else {
final int umiLength = UmiUtil.getUmiLength(currentUmi);
if (!haveWeSeenFirstRead) {
metrics.MEAN_UMI_LENGTH = currentUmi.length();
metrics.MEAN_UMI_LENGTH = umiLength;
haveWeSeenFirstRead = true;
} else {
if (metrics.MEAN_UMI_LENGTH != currentUmi.length()) {
if (metrics.MEAN_UMI_LENGTH != umiLength) {
throw new PicardException("UMIs of differing lengths were found.");
}
}
// Update UMI metrics associated with each record
// The hammingDistance between N and a base is a distance of 1. Comparing N to N is 0 distance.
metrics.OBSERVED_BASE_ERRORS += hammingDistance(currentUmi, inferredUmi);
observedUmiBases += currentUmi.length();
final String inferredUmi = (String) rec.getTransientAttribute(UmiUtil.INFERRED_UMI_TRANSIENT_TAG);
metrics.OBSERVED_BASE_ERRORS += StringUtil.hammingDistance(currentUmi, inferredUmi);
observedUmiBases += umiLength;
metrics.addUmiObservation(currentUmi, inferredUmi);
}
}
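htsjdk's StringUtil.hammingDistance compares equal-length strings character by character, which gives the N semantics described above:
StringUtil.hammingDistance("ACGT", "ACGA"); // 1
StringUtil.hammingDistance("NCGT", "ACGT"); // 1: N vs. a base counts as 1
StringUtil.hammingDistance("NNNN", "NNNN"); // 0: N vs. N is 0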
......
......@@ -120,9 +120,6 @@ public class UmiAwareMarkDuplicatesWithMateCigar extends SimpleMarkDuplicatesWit
@Argument(shortName = "UMI_TAG_NAME", doc = "Tag name to use for UMI", optional = true)
public String UMI_TAG_NAME = "RX";
@Argument(shortName = "ASSIGNED_UMI_TAG", doc = "Tag name to use for assigned UMI", optional = true)
public String ASSIGNED_UMI_TAG = "MI";
// Since we inherit from SimpleMarkDuplicatesWithMateCigar, it is useful for us to also inherit the tests
// which do not contain UMIs. By default, we don't allow for missing UMIs, but for the inherited tests
// we allow for missing UMIs.
......@@ -156,6 +153,6 @@ public class UmiAwareMarkDuplicatesWithMateCigar extends SimpleMarkDuplicatesWit
new DuplicateSetIterator(headerAndIterator.iterator,
headerAndIterator.header,
false,
comparator), MAX_EDIT_DISTANCE_TO_JOIN, UMI_TAG_NAME, ASSIGNED_UMI_TAG, ALLOW_MISSING_UMIS, metrics);
comparator), MAX_EDIT_DISTANCE_TO_JOIN, UMI_TAG_NAME, MOLECULAR_IDENTIFIER_TAG, ALLOW_MISSING_UMIS, DUPLEX_UMI, metrics);
}
}
......@@ -36,6 +36,7 @@ import java.util.stream.Collectors;
import java.util.stream.IntStream;
import static java.util.stream.Collectors.counting;
/**
* UmiGraph is used to identify UMIs that come from the same original source molecule. The assumption
* is that UMIs with small edit distances are likely to be read errors on the sequencer rather than
......@@ -51,34 +52,44 @@ import static java.util.stream.Collectors.counting;
* @author fleharty
*/
public class UmiGraph {
private final List<SAMRecord> records; // SAMRecords from the original duplicate set considered to break up by UMI
private final Map<String, Long> umiCounts; // Map of UMI sequences and how many times they have been observed
private final int[] duplicateSetID; // ID of the duplicate set that the UMI belongs to, the index is the UMI ID
private final String[] umi; // Sequence of actual UMI, the index is the UMI ID
private final int numUmis; // Number of observed UMIs
private final String umiTag; // UMI tag used in the SAM/BAM/CRAM file ie. RX
private final String assignedUmiTag; // Assigned UMI tag used in the SAM/BAM/CRAM file ie. MI
private final boolean allowMissingUmis; // Allow for missing UMIs
public UmiGraph(DuplicateSet set, String umiTag, String assignedUmiTag, boolean allowMissingUmis) {
private final List<SAMRecord> records;
private final Map<String, Long> umiCounts;
private final int[] duplicateSetID;
private final String[] umi;
private final int numUmis;
private final String umiTag;
private final String molecularIdentifierTag;
private final boolean allowMissingUmis;
private final boolean duplexUmis;
/**
* Creates a UmiGraph object
* @param set Set of reads that have the same start-stop positions, these will be broken up by UMI
* @param umiTag UMI tag used in the SAM/BAM/CRAM file, e.g. RX
* @param molecularIdentifierTag Molecular identifier tag used in the SAM/BAM/CRAM file, e.g. MI
* @param allowMissingUmis Allow for missing UMIs
* @param duplexUmis Whether to use duplex or single-strand UMIs
*/
UmiGraph(final DuplicateSet set, final String umiTag, final String molecularIdentifierTag,
final boolean allowMissingUmis, final boolean duplexUmis) {
this.umiTag = umiTag;
this.assignedUmiTag = assignedUmiTag;
this.molecularIdentifierTag = molecularIdentifierTag;
this.allowMissingUmis = allowMissingUmis;
this.duplexUmis = duplexUmis;
records = set.getRecords();
// First ensure that all the reads have a UMI; if any read is missing one, throw an exception unless allowMissingUmis is true.
for (SAMRecord rec : records) {
if (UmiUtil.getSanitizedUMI(rec, umiTag) == null) {
if (allowMissingUmis) {
rec.setAttribute(umiTag, "");
} else {
for (final SAMRecord rec : records) {
if (rec.getStringAttribute(umiTag) == null) {
if (!allowMissingUmis) {
throw new PicardException("Read " + rec.getReadName() + " does not contain a UMI with the " + umiTag + " attribute.");
}
rec.setAttribute(umiTag, "");
}
}
// Count the number of times each UMI occurs
umiCounts = records.stream().collect(Collectors.groupingBy(p -> UmiUtil.getSanitizedUMI(p, umiTag), counting()));
umiCounts = records.stream().collect(Collectors.groupingBy(p -> UmiUtil.getTopStrandNormalizedUmi(p, umiTag, duplexUmis), counting()));
// At first we consider every UMI as if it were its own duplicate set
numUmis = umiCounts.size();
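A minimal, self-contained sketch of the counting idiom above:
final List<String> umis = Arrays.asList("ACG", "ACG", "ACT");
final Map<String, Long> counts = umis.stream()
        .collect(Collectors.groupingBy(u -> u, Collectors.counting()));
// counts => {ACG=2, ACT=1}; numUmis would be 2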
......@@ -86,7 +97,7 @@ public class UmiGraph {
duplicateSetID = IntStream.rangeClosed(0, numUmis-1).toArray();
int i = 0;
for (String key : umiCounts.keySet()) {
for (final String key : umiCounts.keySet()) {
umi[i] = key;
i++;
}
......@@ -119,8 +130,9 @@ public class UmiGraph {
// Assign UMIs to duplicateSets
final Map<String, Integer> duplicateSetsFromUmis = getDuplicateSetsFromUmis();
for (SAMRecord rec : records) {
final String umi = UmiUtil.getSanitizedUMI(rec, umiTag);
for (final SAMRecord rec : records) {
final String umi = UmiUtil.getTopStrandNormalizedUmi(rec, umiTag, duplexUmis);
final Integer duplicateSetIndex = duplicateSetsFromUmis.get(umi);
if (duplicateSets.containsKey(duplicateSetIndex)) {
......@@ -147,8 +159,8 @@ public class UmiGraph {
String fewestNUmi = null;
long nCount = 0;
for (SAMRecord rec : recordList) {
final String umi = UmiUtil.getSanitizedUMI(rec, umiTag);
for (final SAMRecord rec : recordList) {
final String umi = UmiUtil.getTopStrandNormalizedUmi(rec, umiTag, duplexUmis);
// If there is another choice, we don't want to choose a UMI containing an N
// as the assignedUmi.
......@@ -173,14 +185,15 @@ public class UmiGraph {
// then choose the one with the fewest Ns
if (assignedUmi == null) { assignedUmi = fewestNUmi; }
// Set the records to contain the assigned UMI
// Set the records to contain the inferred UMI and the Unique Molecular Identifier
for (final SAMRecord rec : recordList) {
if (allowMissingUmis && rec.getStringAttribute(umiTag).isEmpty()) {
// The SAM spec doesn't support empty tags, so we set it to null if it is empty.
rec.setAttribute(umiTag, null);
} else {
rec.setAttribute(assignedUmiTag, assignedUmi);
UmiUtil.setMolecularIdentifier(rec, assignedUmi, molecularIdentifierTag, duplexUmis);
}
rec.setTransientAttribute(UmiUtil.INFERRED_UMI_TRANSIENT_TAG, assignedUmi);
}
duplicateSetList.add(ds);
......@@ -189,7 +202,10 @@ public class UmiGraph {
return duplicateSetList;
}
// Create a map that maps a umi to the duplicateSetID
/**
* @return a map from each UMI to its duplicateSetID
*/
private Map<String, Integer> getDuplicateSetsFromUmis() {
final Map<String, Integer> duplicateSetsFromUmis = new HashMap<>();
for (int i = 0; i < duplicateSetID.length; i++) {
......