CoverageAlgorithm.h 3.42 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114
#ifndef ASSEMBLY_COVERAGEALGORITHM_H
#define ASSEMBLY_COVERAGEALGORITHM_H 1

#include "Common/Histogram.h"
#include "Common/IOUtil.h"
#include "Common/Options.h" // for opt::rank
#include <fstream>

namespace AssemblyAlgorithms {

/** Return the k-mer coverage histogram. */
static inline
Histogram coverageHistogram(const SequenceCollectionHash& c)
{
	typedef SequenceCollectionHash Graph;

	Histogram h;
	for (Graph::const_iterator it = c.begin();
			it != c.end(); ++it) {
		if (it->second.deleted())
			continue;
		h.insert(it->second.getMultiplicity());
	}
	return h;
}

/** Calculate a k-mer coverage threshold from the given k-mer coverage
 * histogram. */
static inline
float calculateCoverageThreshold(const Histogram& h)
{
	float cov = h.firstLocalMinimum();
	if (opt::rank <= 0) {
		if (cov == 0)
			std::cout << "Unable to determine minimum k-mer coverage\n";
		else
			std::cout << "Minimum k-mer coverage is " << cov << std::endl;
	}

	for (unsigned iteration = 0; iteration < 100; iteration++) {
		Histogram trimmed = h.trimLow((unsigned)roundf(cov));
		if (opt::rank <= 0)
			logger(1) << "Coverage: " << cov << "\t"
				"Reconstruction: " << trimmed.size() << std::endl;

		unsigned median = trimmed.median();
		float cov1 = sqrt(median);
		if (cov1 == cov) {
			// The coverage threshold has converged.
			if (opt::rank <= 0)
				std::cout << "Using a coverage threshold of "
					<< (unsigned)roundf(cov) << "...\n"
					"The median k-mer coverage is " << median << "\n"
					"The reconstruction is " << trimmed.size()
					<< std::endl;
			if (!opt::db.empty()) {
				addToDb("coverageThreshold", (unsigned)roundf(cov));
				addToDb("medianKcoverage", median);
				addToDb("restruction", trimmed.size());
			}
			return cov;
		}
		cov = cov1;
	}
	if (opt::rank <= 0)
		std::cerr << "warning: coverage threshold did not converge"
			<< std::endl;
	return 0;
}

/** Set the coverage-related parameters e and c from the given k-mer
 * coverage histogram. */
static inline
void setCoverageParameters(const Histogram& h)
{
	if (!opt::coverageHistPath.empty() && opt::rank <= 0) {
		std::ofstream histFile(opt::coverageHistPath.c_str());
		assert_good(histFile, opt::coverageHistPath);
		histFile << h;
		assert(histFile.good());
	}

	float minCov = calculateCoverageThreshold(h);
	if (opt::rank <= 0) {
		if (minCov == 0)
			std::cout << "Unable to determine the "
				"k-mer coverage threshold" << std::endl;
		else
			std::cout << "The k-mer coverage threshold is " << minCov
				<< std::endl;
	}
	if (minCov < 2)
		minCov = 2;

	if ((int)opt::erode < 0) {
		opt::erode = (unsigned)roundf(minCov);
		if (opt::rank <= 0)
			std::cout << "Setting parameter e (erode) to "
				<< opt::erode << std::endl;
	}
	if ((int)opt::erodeStrand < 0) {
		opt::erodeStrand = minCov <= 2 ? 0 : 1;
		if (opt::rank <= 0)
			std::cout << "Setting parameter E (erodeStrand) to "
				<< opt::erodeStrand << std::endl;
	}
	if (opt::coverage < 0) {
		opt::coverage = minCov;
		if (opt::rank <= 0)
			std::cout << "Setting parameter c (coverage) to "
				<< opt::coverage << std::endl;
	}
}

115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130
/** Remove all k-mers with multiplicity lower than the given threshold */
static inline
size_t applyKmerCoverageThreshold(SequenceCollectionHash& c, unsigned kc)
{
	if (kc == 0)
		return 0;

	for (SequenceCollectionHash::iterator it = c.begin();
		it != c.end(); ++it) {
		if (it->second.getMultiplicity() < kc)
			it->second.setFlag(SF_DELETE);
	}

	return c.cleanup();
}

131 132 133
} // namespace AssemblyAlgorithms

#endif