Commits on Source (7)
htsjdk (2.18.0+dfsg-1) UNRELEASED; urgency=medium
htsjdk (2.18.2+dfsg-1) UNRELEASED; urgency=medium
* New upstream version
TODO: Fix or drop failing tests
* New upstream version
* debhelper 12
* Standards-Version: 4.3.0
* Remove trailing whitespace in debian/changelog
-- Andreas Tille <tille@debian.org> Sun, 02 Dec 2018 11:18:29 +0100
-- Andreas Tille <tille@debian.org> Thu, 24 Jan 2019 17:31:54 +0100
htsjdk (2.16.1+dfsg-3) unstable; urgency=medium
......
......@@ -10,7 +10,7 @@ Build-Depends: default-jdk (>= 2:1.9),
javahelper,
gradle-debian-helper,
maven-repo-helper,
debhelper (>= 11~),
debhelper (>= 12~),
libcommons-jexl2-java,
libcommons-logging-java,
libjaxb-api-java,
......@@ -26,7 +26,7 @@ Build-Depends: default-jdk (>= 2:1.9),
junit4,
libjimfs-java,
scala-library
Standards-Version: 4.2.1
Standards-Version: 4.3.0
Vcs-Browser: https://salsa.debian.org/med-team/htsjdk
Vcs-Git: https://salsa.debian.org/med-team/htsjdk.git
Homepage: http://samtools.github.io/htsjdk/
......
......@@ -337,6 +337,15 @@ public class BAMFileReader extends SamReader.ReaderImplementation {
return offset;
}
/**
* Reads through the header and sequence records to find the virtual file offset of the first record in the BAM file.
* The caller is responsible for closing the stream.
*/
static long findVirtualOffsetOfFirstRecord(final SeekableStream seekableStream) throws IOException {
final BAMFileReader reader = new BAMFileReader(seekableStream, (SeekableStream) null, false, false, ValidationStringency.SILENT, new DefaultSAMRecordFactory());
return reader.mFirstRecordPointer;
}
/**
* If true, writes the source of every read into the source SAMRecords.
* @param enabled true to write source information into each SAMRecord.
......@@ -944,6 +953,14 @@ public class BAMFileReader extends SamReader.ReaderImplementation {
return new BAMQueryFilteringIterator(iterator, new BAMQueryMultipleIntervalsIteratorFilter(intervals, contained));
}
/**
* @return a virtual file pointer for the underlying compressed stream.
* @see BlockCompressedInputStream#getFilePointer()
*/
public long getVirtualFilePointer() {
return mCompressedInputStream.getFilePointer();
}
/**
* Iterate over the SAMRecords defined by the sections of the file described in the ctor argument.
*/
......
/*
* The MIT License
*
* Copyright (c) 2018 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package htsjdk.samtools;
import htsjdk.samtools.cram.io.InputStreamUtils;
import htsjdk.samtools.seekablestream.SeekablePathStream;
import htsjdk.samtools.seekablestream.SeekableStream;
import htsjdk.samtools.util.BlockCompressedInputStream;
import htsjdk.samtools.util.IOUtil;
import java.io.EOFException;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.file.Files;
import java.nio.file.Path;
/**
* Writes SBI files for BAM files, as understood by {@link SBIIndex}.
*/
public final class BAMSBIIndexer {
/**
* Perform indexing on the given BAM file, at the granularity level specified.
*
* @param bamFile the path to the BAM file
* @param granularity write the offset of every n-th alignment to the index
* @throws IOException as per java IO contract
*/
public static void createIndex(final Path bamFile, final long granularity) throws IOException {
final Path splittingBaiFile = IOUtil.addExtension(bamFile, SBIIndex.FILE_EXTENSION);
try (SeekableStream in = new SeekablePathStream(bamFile); OutputStream out = Files.newOutputStream(splittingBaiFile)) {
createIndex(in, out, granularity);
}
}
/**
* Perform indexing on the given BAM file, at the granularity level specified.
*
* @param in a seekable stream for reading the BAM file from
* @param out the stream to write the index to
* @param granularity write the offset of every n-th alignment to the index
* @throws IOException as per java IO contract
*/
public static void createIndex(final SeekableStream in, final OutputStream out, final long granularity) throws IOException {
long recordStart = SAMUtils.findVirtualOffsetOfFirstRecordInBam(in);
try (BlockCompressedInputStream blockIn = new BlockCompressedInputStream(in)) {
blockIn.seek(recordStart);
// Create a buffer for reading the BAM record lengths. BAM is little-endian.
final ByteBuffer byteBuffer = ByteBuffer.allocate(4).order(ByteOrder.LITTLE_ENDIAN);
final SBIIndexWriter indexWriter = new SBIIndexWriter(out, granularity);
while (true) {
try {
recordStart = blockIn.getFilePointer();
// Read the length of the remainder of the BAM record (`block_size` in the SAM spec)
InputStreamUtils.readFully(blockIn, byteBuffer.array(), 0, 4);
final int blockSize = byteBuffer.getInt(0);
// Process the record start position, then skip to the start of the next BAM record
indexWriter.processRecord(recordStart);
InputStreamUtils.skipFully(blockIn, blockSize);
} catch (EOFException e) {
break;
}
}
indexWriter.finish(recordStart, in.length());
}
}
}
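A minimal usage sketch of the indexer added above; the BAM path is an illustrative placeholder, and the granularity reuses the writer's default. Since createIndex(Path, long) appends SBIIndex.FILE_EXTENSION, the index lands next to the BAM as example.bam.sbi.

import htsjdk.samtools.BAMSBIIndexer;
import htsjdk.samtools.SBIIndexWriter;

import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;

public class CreateSbiExample {
    public static void main(final String[] args) throws IOException {
        // Hypothetical input BAM; replace with a real path.
        final Path bam = Paths.get("example.bam");
        // Writes example.bam.sbi, recording the virtual offset of every
        // 4096th alignment (the writer's default granularity).
        BAMSBIIndexer.createIndex(bam, SBIIndexWriter.DEFAULT_GRANULARITY);
    }
}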
......@@ -41,18 +41,18 @@ import htsjdk.samtools.cram.CRAMException;
public class CRAMIterator implements SAMRecordIterator {
private static final Log log = Log.getInstance(CRAMIterator.class);
private final CountingInputStream countingInputStream;
private CramHeader cramHeader;
private ArrayList<SAMRecord> records;
private final CramHeader cramHeader;
private final ArrayList<SAMRecord> records;
private SAMRecord nextRecord = null;
private CramNormalizer normalizer;
private final CramNormalizer normalizer;
private byte[] refs;
private int prevSeqId = SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX;
public Container container;
private SamReader mReader;
long firstContainerOffset = 0;
private Iterator<Container> containerIterator;
private final Iterator<Container> containerIterator;
private ContainerParser parser;
private final ContainerParser parser;
private final CRAMReferenceSource referenceSource;
private Iterator<SAMRecord> iterator = Collections.<SAMRecord>emptyList().iterator();
......@@ -68,6 +68,10 @@ public class CRAMIterator implements SAMRecordIterator {
this.validationStringency = validationStringency;
}
/**
* `samRecordIndex` is only used when validation is not `SILENT`,
* so the validator can report which records are invalid.
*/
private long samRecordIndex;
private ArrayList<CramCompressionRecord> cramRecords;
......@@ -84,7 +88,7 @@ public class CRAMIterator implements SAMRecordIterator {
this.containerIterator = containerIterator;
firstContainerOffset = this.countingInputStream.getCount();
records = new ArrayList<SAMRecord>(10000);
records = new ArrayList<SAMRecord>(CRAMContainerStreamWriter.DEFAULT_RECORDS_PER_SLICE);
normalizer = new CramNormalizer(cramHeader.getSamFileHeader(),
referenceSource);
parser = new ContainerParser(cramHeader.getSamFileHeader());
......@@ -103,7 +107,7 @@ public class CRAMIterator implements SAMRecordIterator {
this.containerIterator = containerIterator;
firstContainerOffset = containerIterator.getFirstContainerOffset();
records = new ArrayList<SAMRecord>(10000);
records = new ArrayList<SAMRecord>(CRAMContainerStreamWriter.DEFAULT_RECORDS_PER_SLICE);
normalizer = new CramNormalizer(cramHeader.getSamFileHeader(),
referenceSource);
parser = new ContainerParser(cramHeader.getSamFileHeader());
......@@ -143,9 +147,6 @@ public class CRAMIterator implements SAMRecordIterator {
}
}
if (records == null)
records = new ArrayList<SAMRecord>(container.nofRecords);
else
records.clear();
if (cramRecords == null)
cramRecords = new ArrayList<CramCompressionRecord>(container.nofRecords);
......@@ -172,8 +173,10 @@ public class CRAMIterator implements SAMRecordIterator {
for (int i = 0; i < container.slices.length; i++) {
final Slice slice = container.slices[i];
if (slice.sequenceId < 0)
continue;
if (!slice.validateRefMD5(refs)) {
final String msg = String.format(
"Reference sequence MD5 mismatch for slice: sequence id %d, start %d, span %d, expected MD5 %s",
......@@ -201,12 +204,6 @@ public class CRAMIterator implements SAMRecordIterator {
samRecord.setValidationStringency(validationStringency);
if (validationStringency != ValidationStringency.SILENT) {
final List<SAMValidationError> validationErrors = samRecord.isValid();
SAMUtils.processValidationErrors(validationErrors,
samRecordIndex, validationStringency);
}
if (mReader != null) {
final long chunkStart = (container.offset << 16) | cramRecord.sliceIndex;
final long chunkEnd = ((container.offset << 16) | cramRecord.sliceIndex) + 1;
......@@ -215,7 +212,6 @@ public class CRAMIterator implements SAMRecordIterator {
}
records.add(samRecord);
samRecordIndex++;
}
cramRecords.clear();
iterator = records.iterator();
......@@ -267,7 +263,15 @@ public class CRAMIterator implements SAMRecordIterator {
@Override
public SAMRecord next() {
if (hasNext()) {
return iterator.next();
SAMRecord samRecord = iterator.next();
if (validationStringency != ValidationStringency.SILENT) {
SAMUtils.processValidationErrors(samRecord.isValid(), samRecordIndex++, validationStringency);
}
return samRecord;
} else {
throw new NoSuchElementException();
}
......
......@@ -24,6 +24,8 @@
package htsjdk.samtools;
import htsjdk.variant.variantcontext.VariantContext;
import java.math.BigInteger;
import java.net.URI;
import java.net.URISyntaxException;
......@@ -57,23 +59,27 @@ public class SAMSequenceRecord extends AbstractSAMHeaderRecord implements Clonea
/**
* This is not a valid sequence name, because it is reserved in the MRNM field of SAM text format
* This is not a valid sequence name, because it is reserved in the RNEXT field of SAM text format
* to mean "same reference as RNAME field."
*/
public static final String RESERVED_MRNM_SEQUENCE_NAME = "=";
public static final String RESERVED_RNEXT_SEQUENCE_NAME = "=";
/* use RESERVED_RNEXT_SEQUENCE_NAME instead. */
@Deprecated
public static final String RESERVED_MRNM_SEQUENCE_NAME = RESERVED_RNEXT_SEQUENCE_NAME;
/**
* The standard tags are stored in the text header without type information, because the type of these tags is known.
*/
public static final Set<String> STANDARD_TAGS =
new HashSet<String>(Arrays.asList(SEQUENCE_NAME_TAG, SEQUENCE_LENGTH_TAG, ASSEMBLY_TAG, MD5_TAG, URI_TAG,
SPECIES_TAG));
new HashSet<>(Arrays.asList(SEQUENCE_NAME_TAG, SEQUENCE_LENGTH_TAG, ASSEMBLY_TAG, MD5_TAG, URI_TAG, SPECIES_TAG));
// Split on any whitespace
private static final Pattern SEQUENCE_NAME_SPLITTER = Pattern.compile("\\s");
// These are the chars matched by \\s.
private static final char[] WHITESPACE_CHARS = {' ', '\t', '\n', '\013', '\f', '\r'}; // \013 is vertical tab
private static final Pattern LEGAL_RNAME_PATTERN = Pattern.compile("[0-9A-Za-z!#$%&+./:;?@^_|~-][0-9A-Za-z!#$%&*+./:;=?@^_|~-]*");
/**
* @deprecated Use {@link #SAMSequenceRecord(String, int)} instead.
* sequenceLength is required for the object to be considered valid.
......@@ -85,9 +91,6 @@ public class SAMSequenceRecord extends AbstractSAMHeaderRecord implements Clonea
public SAMSequenceRecord(final String name, final int sequenceLength) {
if (name != null) {
if (SEQUENCE_NAME_SPLITTER.matcher(name).find()) {
throw new SAMException("Sequence name contains invalid character: " + name);
}
validateSequenceName(name);
mSequenceName = name.intern();
} else {
......@@ -188,8 +191,8 @@ public class SAMSequenceRecord extends AbstractSAMHeaderRecord implements Clonea
public static String truncateSequenceName(final String sequenceName) {
/*
* Instead of using regex split, do it manually for better performance.
return SEQUENCE_NAME_SPLITTER.split(sequenceName, 2)[0];
*/
int truncateAt = sequenceName.length();
for (final char c : WHITESPACE_CHARS) {
int index = sequenceName.indexOf(c);
......@@ -204,8 +207,8 @@ public class SAMSequenceRecord extends AbstractSAMHeaderRecord implements Clonea
* Throw an exception if the sequence name is not valid.
*/
public static void validateSequenceName(final String name) {
if (RESERVED_MRNM_SEQUENCE_NAME.equals(name)) {
throw new SAMException("'" + RESERVED_MRNM_SEQUENCE_NAME + "' is not a valid sequence name");
if (!LEGAL_RNAME_PATTERN.matcher(name).useAnchoringBounds(true).matches()) {
throw new SAMException(String.format("Sequence name '%s' doesn't match regex: '%s' ", name, LEGAL_RNAME_PATTERN));
}
}
......
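A short sketch of the stricter validation introduced above, using illustrative sequence names: any name that fails the SAM RNAME pattern now throws a SAMException, not just the reserved "=" name.

import htsjdk.samtools.SAMException;
import htsjdk.samtools.SAMSequenceRecord;

public class ValidateNameExample {
    public static void main(final String[] args) {
        // Accepted: matches LEGAL_RNAME_PATTERN.
        SAMSequenceRecord.validateSequenceName("chr1");

        // Rejected: '=' is reserved for RNEXT, '*' may not start a name,
        // and whitespace is not permitted anywhere in a name.
        for (final String bad : new String[]{"=", "*chr1", "chr 1"}) {
            try {
                SAMSequenceRecord.validateSequenceName(bad);
            } catch (final SAMException e) {
                System.out.println("Rejected: " + bad);
            }
        }
    }
}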
......@@ -23,6 +23,7 @@
*/
package htsjdk.samtools;
import htsjdk.samtools.seekablestream.SeekableStream;
import htsjdk.samtools.util.BinaryCodec;
import htsjdk.samtools.util.CigarUtil;
import htsjdk.samtools.util.CloserUtil;
......@@ -685,6 +686,18 @@ public final class SAMUtils {
}
}
/**
* Returns the virtual file offset of the first record in a BAM file - i.e. the virtual file
* offset after skipping over the text header and the sequence records.
*/
public static long findVirtualOffsetOfFirstRecordInBam(final SeekableStream seekableStream) {
try {
return BAMFileReader.findVirtualOffsetOfFirstRecord(seekableStream);
} catch (final IOException ioe) {
throw new RuntimeEOFException(ioe);
}
}
/**
* Given a Cigar, Returns blocks of the sequence that have been aligned directly to the
* reference sequence. Note that clipped portions, and inserted and deleted bases (vs. the reference)
......
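A minimal sketch of the new SAMUtils helper, assuming an illustrative local BAM path; note the returned value is a BGZF virtual file pointer, not a plain byte offset.

import htsjdk.samtools.SAMUtils;
import htsjdk.samtools.seekablestream.SeekablePathStream;
import htsjdk.samtools.seekablestream.SeekableStream;
import htsjdk.samtools.util.BlockCompressedFilePointerUtil;

import java.io.IOException;
import java.nio.file.Paths;

public class FirstRecordOffsetExample {
    public static void main(final String[] args) throws IOException {
        try (SeekableStream in = new SeekablePathStream(Paths.get("example.bam"))) {
            // Virtual offset of the first alignment, i.e. the position after
            // the text header and sequence records have been skipped.
            final long virtualOffset = SAMUtils.findVirtualOffsetOfFirstRecordInBam(in);
            System.out.println(BlockCompressedFilePointerUtil.asString(virtualOffset));
        }
    }
}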
/*
* The MIT License
*
* Copyright (c) 2018 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package htsjdk.samtools;
import htsjdk.samtools.util.BinaryCodec;
import htsjdk.samtools.util.BlockCompressedFilePointerUtil;
import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.Serializable;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.NavigableSet;
import java.util.Objects;
import java.util.TreeSet;
import java.util.stream.Collectors;
/**
* SBI is an index into BGZF-compressed data files, which has an entry for the file position of the start of every
* <i>n</i>th record. Reads files that were created by {@link BAMSBIIndexer}.
*/
public final class SBIIndex implements Serializable {
public static class Header implements Serializable {
private final long fileLength;
private final byte[] md5;
private final byte[] uuid;
private final long totalNumberOfRecords;
private final long granularity;
public Header(long fileLength, byte[] md5, byte[] uuid, long totalNumberOfRecords, long granularity) {
this.fileLength = fileLength;
this.md5 = md5;
this.uuid = uuid;
this.totalNumberOfRecords = totalNumberOfRecords;
this.granularity = granularity;
}
public long getFileLength() {
return fileLength;
}
public byte[] getMd5() {
return md5;
}
public byte[] getUuid() {
return uuid;
}
public long getTotalNumberOfRecords() {
return totalNumberOfRecords;
}
public long getGranularity() {
return granularity;
}
@Override
public boolean equals(Object o) {
if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false;
final Header header = (Header) o;
return fileLength == header.fileLength &&
totalNumberOfRecords == header.totalNumberOfRecords &&
granularity == header.granularity &&
Arrays.equals(md5, header.md5) &&
Arrays.equals(uuid, header.uuid);
}
@Override
public int hashCode() {
int result = Objects.hash(fileLength, totalNumberOfRecords, granularity);
result = 31 * result + Arrays.hashCode(md5);
result = 31 * result + Arrays.hashCode(uuid);
return result;
}
@Override
public String toString() {
return "Header{" +
"fileLength=" + fileLength +
", md5=" + Arrays.toString(md5) +
", uuid=" + Arrays.toString(uuid) +
", totalNumberOfRecords=" + totalNumberOfRecords +
", granularity=" + granularity +
'}';
}
}
public static final String FILE_EXTENSION = ".sbi";
/**
* SBI magic number.
*/
static final byte[] SBI_MAGIC = "SBI\1".getBytes();
private final Header header;
private final long[] virtualOffsets;
/**
* Create an in-memory SBI with the given virtual offsets.
* @param virtualOffsets the offsets in the index
*/
public SBIIndex(final Header header, final long[] virtualOffsets) {
this.header = header;
this.virtualOffsets = virtualOffsets;
if (this.virtualOffsets.length == 0) {
throw new RuntimeException("Invalid SBI format: should contain at least one offset");
}
}
/**
* Load an SBI into memory from a path.
* @param path the path to the SBI file
* @throws IOException as per java IO contract
*/
public static SBIIndex load(final Path path) throws IOException {
try (InputStream in = new BufferedInputStream(Files.newInputStream(path))) {
return readIndex(in);
}
}
/**
* Load an SBI into memory from a stream.
* @param in the stream to read the SBI from
*/
public static SBIIndex load(final InputStream in) {
return readIndex(in);
}
private static SBIIndex readIndex(final InputStream in) {
final BinaryCodec binaryCodec = new BinaryCodec(in);
final Header header = readHeader(binaryCodec);
final long numOffsetsLong = binaryCodec.readLong();
if (numOffsetsLong > Integer.MAX_VALUE) {
throw new RuntimeException(String.format("Cannot read SBI with more than %s offsets.", Integer.MAX_VALUE));
}
final int numOffsets = (int) numOffsetsLong;
final long[] virtualOffsets = new long[numOffsets];
long prev = -1;
for (int i = 0; i < numOffsets; i++) {
final long cur = binaryCodec.readLong();
if (prev > cur) {
throw new RuntimeException(String.format(
"Invalid SBI; offsets not in order: %#x > %#x",
prev, cur));
}
virtualOffsets[i] = cur;
prev = cur;
}
return new SBIIndex(header, virtualOffsets);
}
private static Header readHeader(final BinaryCodec binaryCodec) {
final byte[] buffer = new byte[SBI_MAGIC.length];
binaryCodec.readBytes(buffer);
if (!Arrays.equals(buffer, SBI_MAGIC)) {
throw new RuntimeException("Invalid file header in SBI: " + new String(buffer) + " (" + Arrays.toString(buffer) + ")");
}
final long fileLength = binaryCodec.readLong();
final byte[] md5 = new byte[16];
binaryCodec.readBytes(md5);
final byte[] uuid = new byte[16];
binaryCodec.readBytes(uuid);
final long totalNumberOfRecords = binaryCodec.readLong();
final long granularity = binaryCodec.readLong();
return new Header(fileLength, md5, uuid, totalNumberOfRecords, granularity);
}
/**
* Returns the index header.
*
* @return the header
*/
public Header getHeader() {
return header;
}
/**
* Returns the granularity of the index, that is the number of alignments between subsequent entries in the index,
* or zero if not specified.
* @return the granularity of the index
*/
public long getGranularity() {
return header.getGranularity();
}
/**
* Returns the entries in the index.
*
* @return an array of file pointers for all the alignment offsets in the index, in ascending order. The last
* virtual file pointer is the position at which the next record would start if it were added to the file.
*/
public long[] getVirtualOffsets() {
return virtualOffsets;
}
/**
* Returns number of entries in the index.
*
* @return the number of virtual offsets in the index
*/
public long size() {
return virtualOffsets.length;
}
/**
* Returns the length of the data file in bytes.
*
* @return the length of the data file in bytes
*/
public long dataFileLength() {
return header.getFileLength();
}
/**
* Split the data file for this index into non-overlapping chunks of roughly the given size that cover the whole
* file and that can be read independently of one another.
*
* @param splitSize the rough size of each split in bytes
* @return a list of contiguous, non-overlapping, sorted chunks that cover the whole data file
* @see #getChunk(long, long)
*/
public List<Chunk> split(final long splitSize) {
if (splitSize <= 0) {
throw new IllegalArgumentException(String.format("Split size must be positive: %s", splitSize));
}
final long fileSize = dataFileLength();
final List<Chunk> chunks = new ArrayList<>();
for (long splitStart = 0; splitStart < fileSize; splitStart += splitSize) {
final Chunk chunk = getChunk(splitStart, splitStart + splitSize);
if (chunk != null) {
chunks.add(chunk);
}
}
return chunks;
}
/**
* Return a chunk that corresponds to the given range in the data file. Note that the chunk does not necessarily
* completely cover the given range, however this method will map a set of contiguous, non-overlapping file ranges
* that cover the whole data file to a set of contiguous, non-overlapping chunks that cover the whole data file.
*
* @param splitStart the start of the file range (inclusive)
* @param splitEnd the end of the file range (exclusive)
* @return a chunk whose virtual start is at the first alignment start position that is greater than or equal to the
* given split start position, and whose virtual end is at the first alignment start position that is greater than
* or equal to the given split end position, or null if the chunk would be empty.
* @see #split(long)
*/
public Chunk getChunk(final long splitStart, final long splitEnd) {
if (splitStart >= splitEnd) {
throw new IllegalArgumentException(String.format("Split start (%s) must be less than end (%s)", splitStart, splitEnd));
}
final long lastVirtualOffset = virtualOffsets[virtualOffsets.length - 1];
final long maxEnd = BlockCompressedFilePointerUtil.getBlockAddress(lastVirtualOffset);
final long actualSplitStart = Math.min(splitStart, maxEnd);
final long actualSplitEnd = Math.min(splitEnd, maxEnd);
final long virtualSplitStart = BlockCompressedFilePointerUtil.makeFilePointer(actualSplitStart);
final long virtualSplitEnd = BlockCompressedFilePointerUtil.makeFilePointer(actualSplitEnd);
final long virtualSplitStartAlignment = ceiling(virtualSplitStart);
final long virtualSplitEndAlignment = ceiling(virtualSplitEnd);
if (virtualSplitStartAlignment == virtualSplitEndAlignment) {
return null;
}
return new Chunk(virtualSplitStartAlignment, virtualSplitEndAlignment);
}
private long ceiling(final long virtualOffset) {
int index = Arrays.binarySearch(virtualOffsets, virtualOffset);
if (index < 0) {
index = -index - 1;
if (index == virtualOffsets.length) {
long lastVirtualOffset = virtualOffsets[virtualOffsets.length - 1];
throw new IllegalArgumentException(String.format("No virtual offset found for virtual file pointer %s, last virtual offset %s",
BlockCompressedFilePointerUtil.asString(virtualOffset), BlockCompressedFilePointerUtil.asString(lastVirtualOffset)));
}
}
return virtualOffsets[index];
}
@Override
public boolean equals(Object o) {
if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false;
final SBIIndex sbiIndex = (SBIIndex) o;
return Objects.equals(header, sbiIndex.header) &&
Arrays.equals(virtualOffsets, sbiIndex.virtualOffsets);
}
@Override
public int hashCode() {
int result = Objects.hash(header);
result = 31 * result + Arrays.hashCode(virtualOffsets);
return result;
}
@Override
public String toString() {
String virtualOffsetsString;
if (virtualOffsets.length > 30) {
virtualOffsetsString = Arrays.toString(Arrays.copyOfRange(virtualOffsets, 0, 30)).replace("]", ", ...]");
} else {
virtualOffsetsString = Arrays.toString(virtualOffsets);
}
return "SBIIndex{" +
"header=" + header +
", numVirtualOffsets=" + virtualOffsets.length +
", virtualOffsets=" + virtualOffsetsString +
'}';
}
}
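A sketch of reading the index back and splitting the data file; the .sbi path and split size are illustrative. Each returned chunk starts and ends on record boundaries taken from the index, so the splits can be read independently of one another.

import htsjdk.samtools.Chunk;
import htsjdk.samtools.SBIIndex;
import htsjdk.samtools.util.BlockCompressedFilePointerUtil;

import java.io.IOException;
import java.nio.file.Paths;
import java.util.List;

public class SplitBamExample {
    public static void main(final String[] args) throws IOException {
        // Hypothetical index created earlier by BAMSBIIndexer.createIndex.
        final SBIIndex index = SBIIndex.load(Paths.get("example.bam.sbi"));

        // Carve the data file into ~128 MiB splits.
        final List<Chunk> chunks = index.split(128 * 1024 * 1024);
        for (final Chunk chunk : chunks) {
            System.out.println(BlockCompressedFilePointerUtil.asString(chunk.getChunkStart())
                    + " - " + BlockCompressedFilePointerUtil.asString(chunk.getChunkEnd()));
        }
    }
}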
/*
* The MIT License
*
* Copyright (c) 2018 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package htsjdk.samtools;
import htsjdk.samtools.util.BinaryCodec;
import htsjdk.samtools.util.IOUtil;
import htsjdk.samtools.util.RuntimeIOException;
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.file.Files;
import java.nio.file.Path;
/**
* Writes SBI files as understood by {@link SBIIndex}.
* <p>
* To use this class, first construct an instance from an output stream, and a desired granularity. Then for each
* record in the file being indexed, pass the virtual file offset of the record to the {@link #processRecord} method.
* The indexer keeps a count of the records passed in and writes the offset of every <i>n</i>th record to the index.
* When there are no records left, call {@link #finish} to complete writing the index.
*/
public final class SBIIndexWriter {
// Default to a granularity level of 4096. This is generally sufficient
// for very large BAM files, relative to a maximum heap size in the
// gigabyte range.
public static final long DEFAULT_GRANULARITY = 4096;
static final byte[] EMPTY_MD5 = new byte[16];
static final byte[] EMPTY_UUID = new byte[16];
private final OutputStream out;
private final long granularity;
private final Path tempOffsetsFile;
private final BinaryCodec tempOffsetsCodec;
private long prev = -1;
private long recordCount;
private long virtualOffsetCount;
/**
* Prepare to write an SBI index with the default granularity.
*
* @param out the stream to write the index to
*/
public SBIIndexWriter(final OutputStream out) {
this(out, SBIIndexWriter.DEFAULT_GRANULARITY);
}
/**
* Prepare to write an SBI index.
*
* @param out the stream to write the index to
* @param granularity write the offset of every <i>n</i>th record to the index
*/
public SBIIndexWriter(final OutputStream out, final long granularity) {
this.out = out;
this.granularity = granularity;
try {
// Write the offsets to a temporary file, then write the entire file contents to the output stream at
// the end, once we know the number of offsets. This is more efficient than using a List<Long> for very
// large numbers of offsets (e.g. 10^8, which is possible for low granularity), since the list resizing
// operation is slow.
this.tempOffsetsFile = Files.createTempFile("offsets-", ".headerless.sbi");
this.tempOffsetsCodec = new BinaryCodec(new BufferedOutputStream(Files.newOutputStream(tempOffsetsFile)));
} catch (IOException e) {
throw new RuntimeIOException(e);
}
}
/**
* Process a record for the index: the offset of every <i>n</i>th record will be written to the index.
*
* @param virtualOffset virtual file pointer of the record
*/
public void processRecord(final long virtualOffset) {
if (recordCount++ % granularity == 0) {
writeVirtualOffset(virtualOffset);
}
}
void writeVirtualOffset(final long virtualOffset) {
if (prev > virtualOffset) {
throw new IllegalArgumentException(String.format(
"Offsets not in order: %#x > %#x",
prev, virtualOffset));
}
tempOffsetsCodec.writeLong(virtualOffset);
virtualOffsetCount++;
prev = virtualOffset;
}
/**
* Complete the index, and close the output stream.
*
* @param finalVirtualOffset the virtual offset at which the next record would start if it were added to the file
* @param dataFileLength the length of the data file in bytes
*/
public void finish(final long finalVirtualOffset, final long dataFileLength) {
finish(finalVirtualOffset, dataFileLength, null, null);
}
/**
* Complete the index, and close the output stream.
*
* @param finalVirtualOffset the virtual offset at which the next record would start if it were added to the file
* @param dataFileLength the length of the data file in bytes
* @param md5 the MD5 hash of the data file, or null if not specified
* @param uuid the UUID for the data file, or null if not specified
*/
public void finish(final long finalVirtualOffset, final long dataFileLength, final byte[] md5, final byte[] uuid) {
if (md5 != null && md5.length != 16) {
throw new IllegalArgumentException("Invalid MD5 length: " + md5.length);
}
if (uuid != null && uuid.length != 16) {
throw new IllegalArgumentException("Invalid UUID length: " + uuid.length);
}
final SBIIndex.Header header = new SBIIndex.Header(dataFileLength, md5 == null ? EMPTY_MD5 : md5, uuid == null ? EMPTY_UUID : uuid, recordCount, granularity);
finish(header, finalVirtualOffset);
}
void finish(final SBIIndex.Header header, final long finalVirtualOffset) {
// complete writing the temp offsets file
writeVirtualOffset(finalVirtualOffset);
tempOffsetsCodec.close();
try (BinaryCodec binaryCodec = new BinaryCodec(out);
InputStream tempOffsets = new BufferedInputStream(Files.newInputStream(tempOffsetsFile))) {
// header
binaryCodec.writeBytes(SBIIndex.SBI_MAGIC);
binaryCodec.writeLong(header.getFileLength());
binaryCodec.writeBytes(header.getMd5());
binaryCodec.writeBytes(header.getUuid());
binaryCodec.writeLong(header.getTotalNumberOfRecords());
binaryCodec.writeLong(header.getGranularity());
binaryCodec.writeLong(virtualOffsetCount);
// offsets
IOUtil.copyStream(tempOffsets, out);
} catch (IOException e) {
throw new RuntimeIOException(e);
} finally {
try {
Files.delete(tempOffsetsFile);
} catch (IOException e) {
throw new RuntimeIOException(e);
}
}
}
}
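A sketch of the writer protocol described in the class comment: construct with an output stream and granularity, call processRecord once per record, then finish. The offsets below are synthetic, increasing placeholders built with BlockCompressedFilePointerUtil, not real BAM record positions.

import htsjdk.samtools.SBIIndexWriter;
import htsjdk.samtools.util.BlockCompressedFilePointerUtil;

import java.io.IOException;
import java.io.OutputStream;
import java.nio.file.Files;
import java.nio.file.Paths;

public class WriteSbiExample {
    public static void main(final String[] args) throws IOException {
        try (OutputStream out = Files.newOutputStream(Paths.get("example.bam.sbi"))) {
            // Index every 100th record instead of the default 4096.
            final SBIIndexWriter writer = new SBIIndexWriter(out, 100);

            // In real use these would be the BGZF virtual file pointers of
            // successive BAM records; here they are synthetic and increasing.
            for (int record = 0; record < 1000; record++) {
                writer.processRecord(BlockCompressedFilePointerUtil.makeFilePointer(record * 64L));
            }

            // finalVirtualOffset is where the next record would start;
            // dataFileLength is the data file size in bytes (placeholders here).
            writer.finish(BlockCompressedFilePointerUtil.makeFilePointer(1000 * 64L), 1000 * 64L);
        }
    }
}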
......@@ -399,7 +399,7 @@ public abstract class SamReaderFactory {
}
} else if (BlockCompressedInputStream.isValidFile(bufferedStream)) {
primitiveSamReader = new SAMTextReader(new BlockCompressedInputStream(bufferedStream), validationStringency, this.samRecordFactory);
} else if (SamStreams.isGzippedSAMFile(bufferedStream)) {
} else if (IOUtil.isGZIPInputStream(bufferedStream)) {
primitiveSamReader = new SAMTextReader(new GZIPInputStream(bufferedStream), validationStringency, this.samRecordFactory);
} else if (SamStreams.isCRAMFile(bufferedStream)) {
if (referenceSource == null) {
......
......@@ -4,6 +4,7 @@ import htsjdk.samtools.cram.structure.CramHeader;
import htsjdk.samtools.seekablestream.SeekableStream;
import htsjdk.samtools.util.BlockCompressedInputStream;
import htsjdk.samtools.util.BlockCompressedStreamConstants;
import htsjdk.samtools.util.IOUtil;
import java.io.ByteArrayInputStream;
import java.io.IOException;
......@@ -11,7 +12,6 @@ import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Arrays;
import java.util.zip.GZIPInputStream;
/**
* Utilities related to processing of {@link java.io.InputStream}s encoding SAM data
......@@ -66,26 +66,11 @@ public class SamStreams {
/**
* Checks whether the file is a gzipped sam file. Returns true if it
* is and false otherwise.
* @deprecated use {@link IOUtil#isGZIPInputStream(InputStream)} instead
*/
@Deprecated
public static boolean isGzippedSAMFile(final InputStream stream) {
if (!stream.markSupported()) {
throw new IllegalArgumentException("Cannot test a stream that doesn't support marking.");
}
stream.mark(8000);
try {
final GZIPInputStream gunzip = new GZIPInputStream(stream);
final int ch = gunzip.read();
return true;
} catch (final IOException ioe) {
return false;
} finally {
try {
stream.reset();
} catch (final IOException ioe) {
throw new IllegalStateException("Could not reset stream.");
}
}
return IOUtil.isGZIPInputStream(stream);
}
// It's too expensive to examine the remote file to determine type.
......
......@@ -29,7 +29,9 @@ import java.nio.channels.FileChannel;
/**
* @author alecw@broadinstitute.org
* @deprecated This is deprecated with no replacement. 1/19
*/
@Deprecated
public class TimeChannel {
public static void main(String[] args) throws Exception {
long fileSize = new File(args[0]).length();
......
......@@ -28,7 +28,9 @@ import java.io.RandomAccessFile;
/**
* @author alecw@broadinstitute.org
* @deprecated This is deprecated with no replacement. 1/19
*/
@Deprecated
public class TimeRandomAccessFile {
public static void main(String[] args) throws Exception {
RandomAccessFile raf = new RandomAccessFile(new File(args[0]), "r");
......
......@@ -18,13 +18,14 @@
package htsjdk.samtools.cram.build;
import htsjdk.samtools.cram.common.MutableInt;
import htsjdk.samtools.cram.encoding.ByteArrayLenEncoding;
import htsjdk.samtools.cram.encoding.ByteArrayStopEncoding;
import htsjdk.samtools.cram.encoding.ExternalByteEncoding;
import htsjdk.samtools.cram.encoding.ExternalCompressor;
import htsjdk.samtools.cram.encoding.ExternalIntegerEncoding;
import htsjdk.samtools.cram.encoding.huffman.codec.CanonicalHuffmanIntegerEncoding;
import htsjdk.samtools.cram.encoding.rans.RANS;
import htsjdk.samtools.cram.compression.ExternalCompressor;
import htsjdk.samtools.cram.encoding.*;
import htsjdk.samtools.cram.encoding.core.CanonicalHuffmanIntegerEncoding;
import htsjdk.samtools.cram.encoding.external.ByteArrayStopEncoding;
import htsjdk.samtools.cram.encoding.external.ExternalByteArrayEncoding;
import htsjdk.samtools.cram.encoding.external.ExternalByteEncoding;
import htsjdk.samtools.cram.encoding.external.ExternalIntegerEncoding;
import htsjdk.samtools.cram.compression.rans.RANS;
import htsjdk.samtools.cram.encoding.readfeatures.ReadFeature;
import htsjdk.samtools.cram.encoding.readfeatures.Substitution;
import htsjdk.samtools.cram.structure.CompressionHeader;
......@@ -52,15 +53,13 @@ import java.util.TreeMap;
* This particular version relies heavily on GZIP and RANS for better compression.
*/
public class CompressionHeaderFactory {
private static final int TAG_VALUE_BUFFER_SIZE = 1024 * 1024;
public static final int BYTE_SPACE_SIZE = 256;
public static final int ALL_BYTES_USED = -1;
private final Map<Integer, EncodingDetails> bestEncodings = new HashMap<>();
private final ByteArrayOutputStream baosForTagValues;
public CompressionHeaderFactory() {
baosForTagValues = new ByteArrayOutputStream(TAG_VALUE_BUFFER_SIZE);
}
// a parameter for Huffman encoding, so we don't have to re-construct on each call
private static final int[] singleZero = new int[] { 0 };
private final Map<Integer, EncodingDetails> bestEncodings = new HashMap<>();
private final ByteArrayOutputStream baosForTagValues = new ByteArrayOutputStream(1024 * 1024);
/**
* Decides on compression methods to use for the given records.
......@@ -110,7 +109,7 @@ public class CompressionHeaderFactory {
builder.addExternalIntegerGzipEncoding(DataSeries.TC_TagCount);
builder.addExternalIntegerEncoding(DataSeries.TL_TagIdList, ExternalCompressor.createGZIP());
builder.addExternalIntegerGzipEncoding(DataSeries.TN_TagNameAndType);
builder.addExternalIntegerRansOrderOneEncoding(DataSeries.TS_InsetSize);
builder.addExternalIntegerRansOrderOneEncoding(DataSeries.TS_InsertSize);
builder.setTagIdDictionary(buildTagIdDictionary(records));
......@@ -348,7 +347,7 @@ public class CompressionHeaderFactory {
return baosForTagValues.toByteArray();
}
static ByteSizeRange geByteSizeRangeOfTagValues(final List<CramCompressionRecord> records, final int tagID) {
static ByteSizeRange getByteSizeRangeOfTagValues(final List<CramCompressionRecord> records, final int tagID) {
final byte type = getTagType(tagID);
final ByteSizeRange stats = new ByteSizeRange();
for (final CramCompressionRecord record : records) {
......@@ -401,13 +400,25 @@ public class CompressionHeaderFactory {
EncodingParams params;
}
/**
* Used by buildEncodingForTag to create a ByteArrayLenEncoding with CanonicalHuffmanIntegerEncoding and
* ExternalByteArrayEncoding sub-encodings
*
* @param tagValueSize the size of the tag value, to be Huffman encoded
* @param tagID the ID of the tag
* @return EncodingParams describing the resulting encoding
*/
private EncodingParams buildTagEncodingForSize(final int tagValueSize, final int tagID) {
return new ByteArrayLenEncoding(
new CanonicalHuffmanIntegerEncoding(new int[] { tagValueSize }, singleZero),
new ExternalByteArrayEncoding(tagID)).toParam();
}
/**
* Build an encoding for a specific tag for given records.
*
* @param records
* CRAM records holding the tags
* @param tagID
* an integer id of the tag
* @param records CRAM records holding the tags
* @param tagID an integer id of the tag
* @return an encoding for the tag
*/
private EncodingDetails buildEncodingForTag(final List<CramCompressionRecord> records, final int tagID) {
......@@ -421,37 +432,31 @@ public class CompressionHeaderFactory {
case 'A':
case 'c':
case 'C':
details.params = ByteArrayLenEncoding.toParam(
CanonicalHuffmanIntegerEncoding.toParam(new int[] { 1 }, new int[] { 0 }),
ExternalByteEncoding.toParam(tagID));
details.params = buildTagEncodingForSize(1, tagID);
return details;
case 'I':
case 'i':
case 'f':
details.params = ByteArrayLenEncoding.toParam(
CanonicalHuffmanIntegerEncoding.toParam(new int[] { 4 }, new int[] { 0 }),
ExternalByteEncoding.toParam(tagID));
details.params = buildTagEncodingForSize(4, tagID);
return details;
case 's':
case 'S':
details.params = ByteArrayLenEncoding.toParam(
CanonicalHuffmanIntegerEncoding.toParam(new int[] { 2 }, new int[] { 0 }),
ExternalByteEncoding.toParam(tagID));
details.params = buildTagEncodingForSize(2, tagID);
return details;
case 'Z':
case 'B':
final ByteSizeRange stats = geByteSizeRangeOfTagValues(records, tagID);
final ByteSizeRange stats = getByteSizeRangeOfTagValues(records, tagID);
final boolean singleSize = stats.min == stats.max;
if (singleSize) {
details.params = ByteArrayLenEncoding.toParam(
CanonicalHuffmanIntegerEncoding.toParam(new int[] { stats.min }, new int[] { 0 }),
ExternalByteEncoding.toParam(tagID));
details.params = buildTagEncodingForSize(stats.min, tagID);
return details;
}
if (type == 'Z') {
details.params = ByteArrayStopEncoding.toParam((byte) '\t', tagID);
details.params = new ByteArrayStopEncoding((byte) '\t', tagID).toParam();
return details;
}
......@@ -459,13 +464,14 @@ public class CompressionHeaderFactory {
if (stats.min > minSize_threshold_ForByteArrayStopEncoding) {
final int unusedByte = getUnusedByte(data);
if (unusedByte > ALL_BYTES_USED) {
details.params = ByteArrayStopEncoding.toParam((byte) unusedByte, tagID);
details.params = new ByteArrayStopEncoding((byte) unusedByte, tagID).toParam();
return details;
}
}
details.params = ByteArrayLenEncoding.toParam(ExternalIntegerEncoding.toParam(tagID),
ExternalByteEncoding.toParam(tagID));
details.params = new ByteArrayLenEncoding(
new ExternalIntegerEncoding(tagID),
new ExternalByteArrayEncoding(tagID)).toParam();
return details;
default:
throw new IllegalArgumentException("Unknown tag type: " + (char) type);
......@@ -502,31 +508,31 @@ public class CompressionHeaderFactory {
private void addExternalByteArrayStopTabGzipEncoding(final DataSeries dataSeries) {
addExternalEncoding(dataSeries,
ByteArrayStopEncoding.toParam((byte) '\t', dataSeries.getExternalBlockContentId()),
new ByteArrayStopEncoding((byte) '\t', dataSeries.getExternalBlockContentId()).toParam(),
ExternalCompressor.createGZIP());
}
private void addExternalIntegerEncoding(final DataSeries dataSeries, final ExternalCompressor compressor) {
addExternalEncoding(dataSeries,
ExternalIntegerEncoding.toParam(dataSeries.getExternalBlockContentId()),
new ExternalIntegerEncoding(dataSeries.getExternalBlockContentId()).toParam(),
compressor);
}
private void addExternalIntegerGzipEncoding(final DataSeries dataSeries) {
addExternalEncoding(dataSeries,
ExternalIntegerEncoding.toParam(dataSeries.getExternalBlockContentId()),
new ExternalIntegerEncoding(dataSeries.getExternalBlockContentId()).toParam(),
ExternalCompressor.createGZIP());
}
private void addExternalByteGzipEncoding(final DataSeries dataSeries) {
addExternalEncoding(dataSeries,
ExternalByteEncoding.toParam(dataSeries.getExternalBlockContentId()),
new ExternalByteEncoding(dataSeries.getExternalBlockContentId()).toParam(),
ExternalCompressor.createGZIP());
}
private void addExternalByteRansOrderOneEncoding(final DataSeries dataSeries) {
addExternalEncoding(dataSeries,
ExternalByteEncoding.toParam(dataSeries.getExternalBlockContentId()),
new ExternalByteEncoding(dataSeries.getExternalBlockContentId()).toParam(),
ExternalCompressor.createRANS(RANS.ORDER.ONE));
}
......
......@@ -19,19 +19,22 @@ package htsjdk.samtools.cram.build;
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.cram.CRAMException;
import htsjdk.samtools.cram.digest.ContentDigests;
import htsjdk.samtools.cram.encoding.ExternalCompressor;
import htsjdk.samtools.cram.compression.ExternalCompressor;
import htsjdk.samtools.cram.encoding.writer.CramRecordWriter;
import htsjdk.samtools.cram.io.DefaultBitOutputStream;
import htsjdk.samtools.cram.io.ExposedByteArrayOutputStream;
import htsjdk.samtools.cram.structure.Block;
import htsjdk.samtools.cram.structure.BlockContentType;
import htsjdk.samtools.cram.structure.block.ExternalBlock;
import htsjdk.samtools.cram.structure.block.Block;
import htsjdk.samtools.cram.structure.CompressionHeader;
import htsjdk.samtools.cram.structure.Container;
import htsjdk.samtools.cram.structure.CramCompressionRecord;
import htsjdk.samtools.cram.structure.Slice;
import htsjdk.samtools.cram.structure.SubstitutionMatrix;
import htsjdk.samtools.util.RuntimeIOException;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
......@@ -49,16 +52,12 @@ public class ContainerFactory {
this.recordsPerSlice = recordsPerSlice;
}
public Container buildContainer(final List<CramCompressionRecord> records)
throws IllegalArgumentException, IllegalAccessException,
IOException {
public Container buildContainer(final List<CramCompressionRecord> records) {
return buildContainer(records, null);
}
Container buildContainer(final List<CramCompressionRecord> records,
final SubstitutionMatrix substitutionMatrix)
throws IllegalArgumentException, IllegalAccessException,
IOException {
final SubstitutionMatrix substitutionMatrix) {
// sets header APDelta
final boolean coordinateSorted = samFileHeader.getSortOrder() == SAMFileHeader.SortOrder.coordinate;
......@@ -114,17 +113,12 @@ public class ContainerFactory {
}
private static Slice buildSlice(final List<CramCompressionRecord> records,
final CompressionHeader header)
throws IllegalArgumentException, IllegalAccessException,
IOException {
final Map<Integer, ExposedByteArrayOutputStream> map = new HashMap<Integer, ExposedByteArrayOutputStream>();
final CompressionHeader header) {
final Map<Integer, ByteArrayOutputStream> externalBlockMap = new HashMap<>();
for (final int id : header.externalIds) {
map.put(id, new ExposedByteArrayOutputStream());
externalBlockMap.put(id, new ByteArrayOutputStream());
}
final ExposedByteArrayOutputStream bitBAOS = new ExposedByteArrayOutputStream();
final DefaultBitOutputStream bitOutputStream = new DefaultBitOutputStream(bitBAOS);
final Slice slice = new Slice();
slice.nofRecords = records.size();
......@@ -165,26 +159,32 @@ public class ContainerFactory {
slice.alignmentSpan = maxAlEnd - minAlStart + 1;
}
final CramRecordWriter writer = new CramRecordWriter(bitOutputStream, map, header, slice.sequenceId);
try (final ByteArrayOutputStream bitBAOS = new ByteArrayOutputStream();
final DefaultBitOutputStream bitOutputStream = new DefaultBitOutputStream(bitBAOS)) {
final CramRecordWriter writer = new CramRecordWriter(bitOutputStream, externalBlockMap, header, slice.sequenceId);
writer.writeCramCompressionRecords(records, slice.alignmentStart);
bitOutputStream.close();
slice.coreBlock = Block.buildNewCore(bitBAOS.toByteArray());
slice.external = new HashMap<Integer, Block>();
for (final Integer key : map.keySet()) {
final ExposedByteArrayOutputStream os = map.get(key);
final Block externalBlock = new Block();
externalBlock.setContentId(key);
externalBlock.setContentType(BlockContentType.EXTERNAL);
final ExternalCompressor compressor = header.externalCompressors.get(key);
final byte[] rawData = os.toByteArray();
final byte[] compressed = compressor.compress(rawData);
externalBlock.setContent(rawData, compressed);
externalBlock.setMethod(compressor.getMethod());
slice.external.put(key, externalBlock);
slice.coreBlock = Block.createRawCoreDataBlock(bitBAOS.toByteArray());
}
catch (final IOException e) {
throw new RuntimeIOException(e);
}
slice.external = new HashMap<>();
for (final Integer contentId : externalBlockMap.keySet()) {
// remove after https://github.com/samtools/htsjdk/issues/1232
if (contentId == Block.NO_CONTENT_ID) {
throw new CRAMException("Valid Content ID required. Given: " + contentId);
}
final ExternalCompressor compressor = header.externalCompressors.get(contentId);
final byte[] rawContent = externalBlockMap.get(contentId).toByteArray();
final ExternalBlock externalBlock = new ExternalBlock(compressor.getMethod(), contentId,
compressor.compress(rawContent), rawContent.length);
slice.external.put(contentId, externalBlock);
}
return slice;
......
......@@ -24,7 +24,7 @@ import htsjdk.samtools.cram.common.Version;
import htsjdk.samtools.cram.io.CountingInputStream;
import htsjdk.samtools.cram.io.ExposedByteArrayOutputStream;
import htsjdk.samtools.cram.io.InputStreamUtils;
import htsjdk.samtools.cram.structure.Block;
import htsjdk.samtools.cram.structure.block.Block;
import htsjdk.samtools.cram.structure.Container;
import htsjdk.samtools.cram.structure.ContainerIO;
import htsjdk.samtools.cram.structure.CramHeader;
......@@ -248,7 +248,7 @@ public class CramIO {
final int length = Math.max(1024, data.length + data.length / 2);
final byte[] blockContent = new byte[length];
System.arraycopy(data, 0, blockContent, 0, Math.min(data.length, length));
final Block block = Block.buildNewFileHeaderBlock(blockContent);
final Block block = Block.createRawFileHeaderBlock(blockContent);
final Container container = new Container();
container.blockCount = 1;
......@@ -279,18 +279,18 @@ public class CramIO {
if (version.compatibleWith(CramVersions.CRAM_v3)) {
final byte[] bytes = new byte[container.containerByteSize];
InputStreamUtils.readFully(inputStream, bytes, 0, bytes.length);
block = Block.readFromInputStream(version.major, new ByteArrayInputStream(bytes));
block = Block.read(version.major, new ByteArrayInputStream(bytes));
// ignore the rest of the container
} else {
/*
* pending issue: container.containerByteSize is 2 bytes shorter
* than needed in the v21 test cram files.
*/
block = Block.readFromInputStream(version.major, inputStream);
block = Block.read(version.major, inputStream);
}
}
inputStream = new ByteArrayInputStream(block.getRawContent());
inputStream = new ByteArrayInputStream(block.getUncompressedContent());
final ByteBuffer buffer = ByteBuffer.allocate(4);
buffer.order(ByteOrder.LITTLE_ENDIAN);
......@@ -327,7 +327,7 @@ public class CramIO {
final long pos = countingInputStream.getCount();
countingInputStream.close();
final Block block = Block.buildNewFileHeaderBlock(toByteArray(newHeader.getSamFileHeader()));
final Block block = Block.createRawFileHeaderBlock(toByteArray(newHeader.getSamFileHeader()));
final ExposedByteArrayOutputStream byteArrayOutputStream = new ExposedByteArrayOutputStream();
block.write(newHeader.getVersion().major, byteArrayOutputStream);
if (byteArrayOutputStream.size() > c.containerByteSize) {
......
package htsjdk.samtools.cram.io;
package htsjdk.samtools.cram.compression;
import htsjdk.samtools.cram.encoding.rans.RANS;
import htsjdk.samtools.cram.compression.rans.RANS;
import htsjdk.samtools.cram.io.InputStreamUtils;
import htsjdk.samtools.cram.structure.block.BlockCompressionMethod;
import htsjdk.samtools.util.IOUtil;
import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream;
import org.apache.commons.compress.compressors.bzip2.BZip2CompressorOutputStream;
......@@ -149,4 +151,25 @@ public class ExternalCompression {
buffer.get(bytes);
return bytes;
}
public static byte[] uncompress(final BlockCompressionMethod method, final byte[] compressedContent) {
try {
switch (method) {
case RAW:
return compressedContent;
case GZIP:
return gunzip(compressedContent);
case BZIP2:
return unbzip2(compressedContent);
case LZMA:
return unxz(compressedContent);
case RANS:
return unrans(compressedContent);
default:
throw new RuntimeException("Unknown block compression method: " + method.name());
}
} catch (final IOException e) {
throw new RuntimeException(e);
}
}
}
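A tiny sketch of the new dispatch method added above; only the RAW case is exercised, which returns the content unchanged, while the other methods route to the matching decompressor.

import htsjdk.samtools.cram.compression.ExternalCompression;
import htsjdk.samtools.cram.structure.block.BlockCompressionMethod;

import java.nio.charset.StandardCharsets;

public class UncompressExample {
    public static void main(final String[] args) {
        final byte[] raw = "ACGT".getBytes(StandardCharsets.UTF_8);

        // RAW content is passed through untouched; GZIP, BZIP2, LZMA and RANS
        // would be routed to gunzip, unbzip2, unxz and unrans respectively.
        final byte[] result = ExternalCompression.uncompress(BlockCompressionMethod.RAW, raw);
        System.out.println(new String(result, StandardCharsets.UTF_8));
    }
}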
package htsjdk.samtools.cram.encoding;
package htsjdk.samtools.cram.compression;
import htsjdk.samtools.cram.encoding.rans.RANS.ORDER;
import htsjdk.samtools.cram.io.ExternalCompression;
import htsjdk.samtools.cram.structure.BlockCompressionMethod;
import htsjdk.samtools.cram.structure.block.BlockCompressionMethod;
import htsjdk.samtools.cram.compression.rans.RANS.ORDER;
import java.io.IOException;
......
package htsjdk.samtools.cram.encoding.rans;
package htsjdk.samtools.cram.compression.rans;
class Constants {
static final int TF_SHIFT = 12;
......