Commit 85eeccb3 authored by Michael R. Crusoe's avatar Michael R. Crusoe

Imported Upstream version 1.5.1

parents
cmake_minimum_required (VERSION 2.6)
project(express)
set(${PROJECT_NAME}_VERSION_MAJOR 1)
set(${PROJECT_NAME}_VERSION_MINOR 5)
set(${PROJECT_NAME}_VERSION_PATCH 1)
set(CMAKE_CXX_FLAGS "-Wall")
set(CMAKE_CXX_FLAGS_DEBUG "-g ${CMAKE_CXX_FLAGS}")
set(CMAKE_CXX_FLAGS_RHDEBINFO "-O3 -g ${CMAKE_CXX_FLAGS}")
set(CMAKE_CXX_FLAGS_MINSIZEREL "-Os ${CMAKE_CXX_FLAGS}")
set(CMAKE_BUILD_TYPE Release)
set(Boost_USE_STATIC_LIBS ON)
find_package(Boost 1.39
COMPONENTS
thread
system
filesystem
program_options
date_time
REQUIRED)
find_library(GPERFTOOLS_TCMALLOC_LIB tcmalloc)
if (GPERFTOOLS_TCMALLOC_LIB)
message(STATUS "Found GPERFTOOLS_TCMALLOC: ${GPERFTOOLS_TCMALLOC_LIB}")
else (GPERFTOOLS_TCMALLOC_LIB)
message(STATUS "Could NOT find GPERFTOOLS_TCMALLOC: Install to improve speed.")
endif(GPERFTOOLS_TCMALLOC_LIB)
find_package(Protobuf)
if (PROTOBUF_FOUND)
include_directories(${Boost_INCLUDE_DIRS} ${PROTOBUF_INCLUDE_DIR} "${CMAKE_CURRENT_SOURCE_DIR}/bamtools/include")
set(PROTO_INT 1)
else (PROTOBUF_FOUND)
include_directories(${Boost_INCLUDE_DIRS} "${CMAKE_CURRENT_SOURCE_DIR}/bamtools/include")
set(PROTO_INT 0)
endif(PROTOBUF_FOUND)
if(WIN32)
set(CMAKE_CXX_FLAGS "/EHsc")
set(WIN32_INT 1)
else(WIN32)
find_package(ZLIB REQUIRED)
set(WIN32_INT 0)
endif(WIN32)
configure_file (
"${PROJECT_SOURCE_DIR}/src/config.h.in"
"${PROJECT_SOURCE_DIR}/src/config.h"
)
add_subdirectory(src)
Artistic License 2.0
Copyright (c) 2011, Adam Roberts and Lior Pachter.
Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed.
Preamble
This license establishes the terms under which a given free software Package may be copied, modified, distributed, and/or redistributed. The intent is that the Copyright Holder maintains some artistic control over the development of that Package while still keeping the Package available as open source and free software.
You are always permitted to make arrangements wholly outside of this license directly with the Copyright Holder of a given Package. If the terms of this license do not permit the full use that you propose to make of the Package, you should contact the Copyright Holder and seek a different licensing arrangement.
Definitions
"Copyright Holder" means the individual(s) or organization(s) named in the copyright notice for the entire Package.
"Contributor" means any party that has contributed code or other material to the Package, in accordance with the Copyright Holder's procedures.
"You" and "your" means any person who would like to copy, distribute, or modify the Package.
"Package" means the collection of files distributed by the Copyright Holder, and derivatives of that collection and/or of those files. A given Package may consist of either the Standard Version, or a Modified Version.
"Distribute" means providing a copy of the Package or making it accessible to anyone else, or in the case of a company or organization, to others outside of your company or organization.
"Distributor Fee" means any fee that you charge for Distributing this Package or providing support for this Package to another party. It does not mean licensing fees.
"Standard Version" refers to the Package if it has not been modified, or has been modified only in ways explicitly requested by the Copyright Holder.
"Modified Version" means the Package, if it has been changed, and such changes were not explicitly requested by the Copyright Holder.
"Original License" means this Artistic License as Distributed with the Standard Version of the Package, in its current version or as it may be modified by The Perl Foundation in the future.
"Source" form means the source code, documentation source, and configuration files for the Package.
"Compiled" form means the compiled bytecode, object code, binary, or any other form resulting from mechanical transformation or translation of the Source form.
Permission for Use and Modification Without Distribution
(1) You are permitted to use the Standard Version and create and use Modified Versions for any purpose without restriction, provided that you do not Distribute the Modified Version.
Permissions for Redistribution of the Standard Version
(2) You may Distribute verbatim copies of the Source form of the Standard Version of this Package in any medium without restriction, either gratis or for a Distributor Fee, provided that you duplicate all of the original copyright notices and associated disclaimers. At your discretion, such verbatim copies may or may not include a Compiled form of the Package.
(3) You may apply any bug fixes, portability changes, and other modifications made available from the Copyright Holder. The resulting Package will still be considered the Standard Version, and as such will be subject to the Original License.
Distribution of Modified Versions of the Package as Source
(4) You may Distribute your Modified Version as Source (either gratis or for a Distributor Fee, and with or without a Compiled form of the Modified Version) provided that you clearly document how it differs from the Standard Version, including, but not limited to, documenting any non-standard features, executables, or modules, and provided that you do at least ONE of the following:
(a) make the Modified Version available to the Copyright Holder of the Standard Version, under the Original License, so that the Copyright Holder may include your modifications in the Standard Version.
(b) ensure that installation of your Modified Version does not prevent the user installing or running the Standard Version. In addition, the Modified Version must bear a name that is different from the name of the Standard Version.
(c) allow anyone who receives a copy of the Modified Version to make the Source form of the Modified Version available to others under
(i) the Original License or
(ii) a license that permits the licensee to freely copy, modify and redistribute the Modified Version using the same licensing terms that apply to the copy that the licensee received, and requires that the Source form of the Modified Version, and of any works derived from it, be made freely available in that license fees are prohibited but Distributor Fees are allowed.
Distribution of Compiled Forms of the Standard Version or Modified Versions without the Source
(5) You may Distribute Compiled forms of the Standard Version without the Source, provided that you include complete instructions on how to get the Source of the Standard Version. Such instructions must be valid at the time of your distribution. If these instructions, at any time while you are carrying out such distribution, become invalid, you must provide new instructions on demand or cease further distribution. If you provide valid instructions or cease distribution within thirty days after you become aware that the instructions are invalid, then you do not forfeit any of your rights under this license.
(6) You may Distribute a Modified Version in Compiled form without the Source, provided that you comply with Section 4 with respect to the Source of the Modified Version.
Aggregating or Linking the Package
(7) You may aggregate the Package (either the Standard Version or Modified Version) with other packages and Distribute the resulting aggregation provided that you do not charge a licensing fee for the Package. Distributor Fees are permitted, and licensing fees for other components in the aggregation are permitted. The terms of this license apply to the use and Distribution of the Standard or Modified Versions as included in the aggregation.
(8) You are permitted to link Modified and Standard Versions with other works, to embed the Package in a larger work of your own, or to build stand-alone binary or bytecode versions of applications that include the Package, and Distribute the result without restriction, provided the result does not expose a direct interface to the Package.
Items That are Not Considered Part of a Modified Version
(9) Works (including, but not limited to, modules and scripts) that merely extend or make use of the Package, do not, by themselves, cause the Package to be a Modified Version. In addition, such works are not considered parts of the Package itself, and are not subject to the terms of this license.
General Provisions
(10) Any use, modification, and distribution of the Standard or Modified Versions is governed by this Artistic License. By using, modifying or distributing the Package, you accept this license. Do not use, modify, or distribute the Package, if you do not accept this license.
(11) If your Modified Version has been derived from a Modified Version made by someone other than you, you are nevertheless required to ensure that your Modified Version complies with the requirements of this license.
(12) This license does not grant you the right to use any trademark, service mark, tradename, or logo of the Copyright Holder.
(13) This license includes the non-exclusive, worldwide, free-of-charge patent license to make, have made, use, offer to sell, sell, import and otherwise transfer the Package with respect to any patent claims licensable by the Copyright Holder that are necessarily infringed by the Package. If you institute patent litigation (including a cross-claim or counterclaim) against any party alleging that the Package constitutes direct or contributory patent infringement, then this Artistic License to you shall terminate on the date that such litigation is filed.
(14) Disclaimer of Warranty: THE PACKAGE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS "AS IS' AND WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES. THE IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT ARE DISCLAIMED TO THE EXTENT PERMITTED BY YOUR LOCAL LAW. UNLESS REQUIRED BY LAW, NO COPYRIGHT HOLDER OR CONTRIBUTOR WILL BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING IN ANY WAY OUT OF THE USE OF THE PACKAGE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
\ No newline at end of file
--------------------------------------------------------------------------------
README : EXPRESS
--------------------------------------------------------------------------------
eXpress is a streaming DNA/RNA sequence quantification tool. It has initially
been tested for RNA-Seq transcriptome quantification but can be used in any
application where abundances of target sequences need to be estimated from short
reads sequenced from them.
More details, installation instructions, and the manual can be found at
http://bio.math.berkeley.edu/eXpress/
--------------------------------------------------------------------------------
I. Authors:
--------------------------------------------------------------------------------
The methods and algorithms used in eXpress were developed by Adam Roberts and
Lior Pachter. Adam Roberts designed and wrote eXpress.
--------------------------------------------------------------------------------
II. Requirements:
--------------------------------------------------------------------------------
eXpress is a standalone tool that requires gcc 4.0 or greater, and runs on Linux
Macintosh OS X, and Windows.
It depends on Boost (http://www.boost.org),
BamTools (https://github.com/pezmaster31/bamtools),
ZLIB (http://zlib.net/),
and the Microsoft Visual C++ Runtime Library (Windows only)
(http://www.microsoft.com/download/en/details.aspx?id=5555).
--------------------------------------------------------------------------------
III. Learn More (Installation, Manual, etc.):
--------------------------------------------------------------------------------
Installation steps, tutorial, manual, documentation, etc. are all available
through the eXpress website:
http://bio.math.berkeley.edu/eXpress/
Join the user group and mailing list to stay informed of updates and announcements:
http://groups.google.com/group/express-users
--------------------------------------------------------------------------------
IV. License:
--------------------------------------------------------------------------------
eXpress is distributed under the Artistic License 2.0.
Copyright (c) 2011-2013 Adam Roberts and Lior Pachter.
See included file LICENSE for details.
--------------------------------------------------------------------------------
V. Acknowledgements:
--------------------------------------------------------------------------------
* Cole Trapnell, primary developer of Cufflinks
* Derek Barnett, author of the BamTools API
--------------------------------------------------------------------------------
VI. References:
--------------------------------------------------------------------------------
Roberts A (2013). Thesis: Ambiguous fragment assignment for high-throughput
sequencing experiments. EECS Department, University of California, Berkeley.
Roberts A and Pachter L (2012). Streaming fragment assignment for real-time
analysis of sequencing experiments. Nature Methods.
eXpress builds upon many ideas, including some proposed in the following papers:
Trapnell C, Williams BA, Pertea G, Mortazavi AM, Kwan G, van Baren MJ,
Salzberg SL, Wold B and Pachter L (2010). Transcript assembly and quantification
by RNA-Seq reveals unannotated transcripts and isoform switching during cell
differentiation. Nature Biotechnology.
Cappé O and Moulines E (2009). On-line expectation–maximization algorithm for
latent data models. Journal of the Royal Statistical Society.
Pachter, L (2011). Models for transcript quantification from RNA-Seq.
arXiv:1104.3889v2.
Roberts A, Trapnell C, Donaghey J, Rinn JL and Pachter L (2011). Improving
RNA-Seq expression estimates by correcting for fragment bias. Genome Biology.
Li B and Dewey C (2011). RSEM: accurate transcript quantification from RNA-Seq
data with or without a reference genome. BMC Bioinformatics.
--------------------------------------------------------------------------------
VII. CONTACT:
--------------------------------------------------------------------------------
Please send all questions, comments, and suggestions to ask.xprs@gmail.com.
\ No newline at end of file
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
file(GLOB sources *.cpp)
if (PROTOBUF_FOUND)
file(GLOB ProtoFiles "${CMAKE_CURRENT_SOURCE_DIR}/*.proto")
PROTOBUF_GENERATE_CPP(PROTO_SOURCES PROTO_HEADERS ${ProtoFiles})
endif(PROTOBUF_FOUND)
add_executable(express ${sources} ${headers} ${PROTO_SOURCES} ${PROTO_HEADERS})
set(LIBRARIES ${Boost_LIBRARIES} ${ZLIB_LIBRARIES})
if (GPERFTOOLS_TCMALLOC)
set(LIBRARIES ${LIBRARIES} "libtcmalloc_minimal.a")
endif (GPERFTOOLS_TCMALLOC)
if(WIN32)
set(LIBRARIES ${LIBRARIES} "${CMAKE_CURRENT_SOURCE_DIR}/../bamtools/lib/libbamtools.lib" "${CMAKE_CURRENT_SOURCE_DIR}/../win_build/zlibd.lib")
else(WIN32)
set(LIBRARIES ${LIBRARIES} "${CMAKE_CURRENT_SOURCE_DIR}/../bamtools/lib/libbamtools.a" "pthread")
endif(WIN32)
if (PROTOBUF_FOUND)
set(LIBRARIES ${LIBRARIES} "libprotobuf.a")
endif(PROTOBUF_FOUND)
target_link_libraries(express ${LIBRARIES})
install(TARGETS express DESTINATION bin)
if (PROTOBUF_FOUND)
file(GLOB sources *.cpp proto/*.cc)
file(GLOB headers *.h proto/*.h)
else (PROTOBUF_FOUND)
file(GLOB sources *.cpp)
endif (PROTOBUF_FOUND)
add_executable(express ${sources} ${headers})
set(LIBRARIES ${Boost_LIBRARIES} ${ZLIB_LIBRARIES})
if (GPERFTOOLS_TCMALLOC)
set(LIBRARIES ${LIBRARIES} "libtcmalloc_minimal.a")
endif (GPERFTOOLS_TCMALLOC)
if(WIN32)
set(LIBRARIES ${LIBRARIES} "${CMAKE_CURRENT_SOURCE_DIR}/../bamtools/lib/libbamtools.lib" "${CMAKE_CURRENT_SOURCE_DIR}/../win_build/zlibd.lib")
else(WIN32)
set(LIBRARIES ${LIBRARIES} "${CMAKE_CURRENT_SOURCE_DIR}/../bamtools/lib/libbamtools.a" "pthread")
endif(WIN32)
if (PROTOBUF_FOUND)
set(LIBRARIES ${LIBRARIES} ${PROTOBUF_LITE_LIBRARIES})
endif(PROTOBUF_FOUND)
target_link_libraries(express ${LIBRARIES})
install(TARGETS express DESTINATION bin)
package proto;
// A ReadaAlignment represents the alignment of one end of a fragment.
message ReadAlignment {
// Specifies if this end was sequenced first according to the SAM file.
// Always true if the sequencing was single-end.
required bool first = 1;
// The 0-based left endpoint of the alignment of the read to the reference.
optional uint32 left_pos = 2;
// The 0-based right endpoint of the alignment of the read to the reference.
optional uint32 right_pos = 3;
// The positions in the read that differ from the reference.
optional bytes mismatch_indices = 4;
// The nucleotides of the read at mismatched positions, represented as 2 bits
// (A=00, C=01, G=10, T=11) packed into bytes. Bit pairs are stored so that later
// pairs in the sequence are at more significant positions in the byte.
optional bytes mismatch_nucs = 5;
}
// The FragmentAlignment stores data for a single alignment of a fragment. If
// a ReadAlignment from the first FragmentAlignment of a Fragment is identical
// to a later one, it is left out in the later one. For example, if read_l in
// the third alignment is identical to read_l in the first alignment, it is
// left empty in the third alignment.
message FragmentAlignment {
// The ID of the target aligned to (index in SAM header order).
required uint32 target_id = 1;
// The length of the fragment according to this alignment. 0 if single-end.
optional uint32 length = 2 [deprecated=true];
// The alignment of the 5' (left) read, if it exists and is not identical
// to the first alignment.
optional ReadAlignment read_l = 3;
// The alignment of the 3' (right) read, if it exists and is not identical
// to the first alignment.
optional ReadAlignment read_r = 4;
}
// The Fragment stores the alignments of a single fragment (paired or
// single-end read). If a ReadAlignment from the first FragmentAlignment of a
// Fragment is identical to a later one, it is left out in the later one. For
// example, if read_l in the third alignment is identical to read_l in the
// first alignment, it is left empty in the third alignment.
message Fragment {
// The unique "query" name of the read (pair) according to the SAM file.
optional string name = 1;
// True iff the fragment was sequenced as a pair.
required bool paired = 2;
// A collection of alignments for the fragment.
repeated FragmentAlignment alignments = 3;
}
\ No newline at end of file
/**
* biascorrection.h
* express
*
* Created by Adam Roberts on 4/5/11.
* Copyright 2011 Adam Roberts. All rights reserved.
*
*/
#include <algorithm>
#include <boost/assign.hpp>
#include <cassert>
#include "biascorrection.h"
#include "fragments.h"
#include "frequencymatrix.h"
#include "main.h"
#include "sequence.h"
#include "targets.h"
using namespace std;
// size of bias window
const int WINDOW = 21;
// center position in window
const int CENTER = 11;
// number of bases on either side of center
const int SURROUND = 10;
SeqWeightTable::SeqWeightTable(size_t window_size, size_t order, double alpha)
: _order(order),
_observed(order, window_size, window_size, alpha),
_expected(order, window_size, order+1, EPSILON) {
}
SeqWeightTable::SeqWeightTable(size_t window_size, size_t order,
string param_file_name, string identifier)
: _order(order),
_observed(order, window_size, window_size, 0),
_expected(order, window_size, order+1, 0) {
//TODO: Allow for orders and window sizes to be read from param file.
ifstream infile (param_file_name.c_str());
const size_t BUFF_SIZE = 99999;
char line_buff[BUFF_SIZE];
if (!infile.is_open()) {
logger.severe("Unable to open parameter file '%s'.",
param_file_name.c_str());
}
do {
infile.getline (line_buff, BUFF_SIZE, '\n');
} while (strncmp(line_buff, identifier.c_str(), identifier.size()));
infile.getline (line_buff, BUFF_SIZE, '\n');
do {
infile.getline (line_buff, BUFF_SIZE, '\n');
} while (strcmp(line_buff, "\tObserved Conditional Probabilities"));
infile.getline (line_buff, BUFF_SIZE, '\n');
//FG
for (size_t k = 0; k < (size_t)WINDOW; ++k) {
infile.getline (line_buff, BUFF_SIZE, '\n');
char *p = strtok(line_buff, "\t");
for (size_t i=0; i < pow(4.0, (double)min(order, k)); ++i) {
for (size_t j=0; j < NUM_NUCS; ++j) {
p = strtok(NULL, "\t");
_observed.update(k, i, j, log(strtod(p,NULL)));
}
}
}
infile.getline (line_buff, BUFF_SIZE, '\n');
infile.getline (line_buff, BUFF_SIZE, '\n');
//BG
for (size_t k = 0; k < order+1; ++k) {
infile.getline (line_buff, BUFF_SIZE, '\n');
char *p = strtok(line_buff, "\t");
for (size_t i=0; i < pow(4.0, (double)min(order, k)); ++i) {
for (size_t j=0; j < NUM_NUCS; ++j) {
p = strtok(NULL, "\t");
_expected.update(k, i, j, log(strtod(p,NULL)));
}
}
}
}
void SeqWeightTable::copy_observed(const SeqWeightTable& other) {
_observed = other._observed;
}
void SeqWeightTable::copy_expected(const SeqWeightTable& other) {
_expected = other._expected;
}
void SeqWeightTable::increment_expected(const Sequence& seq, double mass,
const vector<double>& fl_cdf) {
_expected.fast_learn(seq, mass, fl_cdf);
}
void SeqWeightTable::normalize_expected() {
_expected.calc_marginals();
}
void SeqWeightTable::increment_observed(const Sequence& seq, size_t i,
double mass) {
int left = (int)i - SURROUND;
_observed.update(seq, left, mass);
}
double SeqWeightTable::get_weight(const Sequence& seq, size_t i) const {
int left = (int)i - SURROUND;
return _observed.seq_prob(seq, left) - _expected.seq_prob(seq, left);
}
void SeqWeightTable::append_output(ofstream& outfile) const {
char buff[200];
string header = "";
for (int i = 0; i < WINDOW; i++) {
sprintf(buff, "\t%d", i-CENTER+1);
header += buff;
}
header += '\n';
outfile << "\tObserved Marginal Distribution\n" << header;
for (size_t j = 0; j < NUM_NUCS; j++) {
outfile << NUCS[j] << ":\t";
for (int i = 0; i < WINDOW; i++) {
outfile << scientific << sexp(_observed.marginal_prob(i,j)) << "\t";
}
outfile<<endl;
}
outfile << "\tObserved Conditional Probabilities\nPosition\t";
for (size_t j = 0; j < pow((double)NUM_NUCS, (double)_order+1); j++) {
string s = "->";
s += NUCS[j & 3];
size_t cond = j >> 2;
for (size_t k = 0; k < _order; ++k) {
s = NUCS[cond & 3] + s;
cond = cond >> 2;
}
outfile << s << '\t';
}
outfile << endl;
for (int i = 0; i < WINDOW; i++) {
outfile << i-SURROUND << ":\t";
for (size_t j = 0; j < pow((double)NUM_NUCS, (double)_order+1); j++) {
outfile << scientific << sexp(_observed.transition_prob(i,j>>2,j&3))
<< "\t";
}
outfile<<endl;
}
outfile << "\tBackground Conditional Probabilities\nPosition\t";
for (size_t j = 0; j < pow((double)NUM_NUCS, (double)_order+1); j++) {
string s = "->";
s += NUCS[j & 3];
size_t cond = j >> 2;
for (size_t k = 0; k < _order; ++k) {
s = NUCS[cond & 3] + s;
cond = cond >> 2;
}
outfile << s << '\t';
}
outfile << endl;
for (size_t i = 0; i < _order+1; i++) {
outfile << i+1 << ":\t";
for (size_t j = 0; j < pow((double)NUM_NUCS, (double)_order+1); j++) {
outfile << scientific << sexp(_expected.transition_prob(i,j>>2,j&3))
<< "\t";
}
outfile<<endl;
}
}
BiasBoss::BiasBoss(size_t order, double alpha)
: _order(order),
_5_seq_bias(WINDOW, order, alpha),
_3_seq_bias(WINDOW, order, alpha){
}
BiasBoss::BiasBoss(size_t order, string param_file_name)
: _order(order),
_5_seq_bias(WINDOW, order, param_file_name, ">5"),
_3_seq_bias(WINDOW, order, param_file_name, ">3") {
}
void BiasBoss::copy_observations(const BiasBoss& other) {
_5_seq_bias.copy_observed(other._5_seq_bias);
_3_seq_bias.copy_observed(other._3_seq_bias);
}
void BiasBoss::copy_expectations(const BiasBoss& other) {
_5_seq_bias.copy_expected(other._5_seq_bias);
_3_seq_bias.copy_expected(other._3_seq_bias);
}
void BiasBoss::update_expectations(const Target& targ, double mass,
const vector<double>& fl_cdf) {
if (mass == LOG_0) {
return;
}
if (direction != R) {
const Sequence& seq_fwd = targ.seq(0);
_5_seq_bias.increment_expected(seq_fwd, mass, fl_cdf);
}
if (direction != F) {
const Sequence& seq_rev = targ.seq(1);
_3_seq_bias.increment_expected(seq_rev, mass, fl_cdf);
}
}
void BiasBoss::normalize_expectations() {
_5_seq_bias.normalize_expected();
_3_seq_bias.normalize_expected();
}
void BiasBoss::update_observed(const FragHit& hit, double normalized_mass)
{
assert (hit.pair_status() != PAIRED || (int)hit.length() > WINDOW);
const Sequence& t_seq_fwd = hit.target()->seq(0);
const Sequence& t_seq_rev = hit.target()->seq(1);
if (hit.pair_status() != RIGHT_ONLY) {
_5_seq_bias.increment_observed(t_seq_fwd, hit.left(), normalized_mass);
}
if (hit.pair_status() != LEFT_ONLY) {
_3_seq_bias.increment_observed(t_seq_rev, t_seq_rev.length()-hit.right(),
normalized_mass);
}
}
double BiasBoss::get_target_bias(std::vector<float>& start_bias,
std::vector<float>& end_bias,
const Target& targ) const {
double tot_start = LOG_0;
double tot_end = LOG_0;
const Sequence& t_seq_fwd = targ.seq(0);
const Sequence& t_seq_rev = targ.seq(1);
for (size_t i = 0; i < targ.length(); ++i) {
start_bias[i] = _5_seq_bias.get_weight(t_seq_fwd, i);
end_bias[targ.length()-i-1] = _3_seq_bias.get_weight(t_seq_rev, i);
tot_start = log_add(tot_start, start_bias[i]);
tot_end = log_add(tot_end, end_bias[i]);
}
double avg_bias = (tot_start + tot_end) - (2*log((double)targ.length()));
assert(!isnan(avg_bias));
return avg_bias;
}
void BiasBoss::append_output(ofstream& outfile) const {
outfile << ">5' Sequence-Specific Bias\n";
_5_seq_bias.append_output(outfile);
outfile << ">3' Sequence-Specific Bias\n";
_3_seq_bias.append_output(outfile);
}
/**
* biascorrection.h
* express
*
* Created by Adam Roberts on 4/5/11.
* Copyright 2011 Adam Roberts. All rights reserved.
*
*/
#ifndef BIASCORRECTION_H
#define BIASCORRECTION_H
#include <string>
#include <vector>
#include "lengthdistribution.h"
#include "frequencymatrix.h"
#include "markovmodel.h"
class BiasBoss;
class FragHit;
class Target;
/**
* The SeqWeightTable class keeps track of sequence-specific bias parameters.
* Allows for the bias associated with a given sequence to be calculated, and
* for the bias parameters to be updated based on additional observations. All
* values are stored in log space.
* @author Adam Roberts
* @date 2011
* @copyright Artistic License 2.0
**/
class SeqWeightTable {
/**
* A private size_t specifying the order of the Markov chains used to model
* the sequences.
*/
size_t _order;
/**
* A private MarkovModel that stores the observed conditional nucleotide
* frequencies (logged) in the bias window surrounding the fragment end.
*/
MarkovModel _observed;
/**
* A private MarkovModel that stores the expected condtional nucleotide
* frequencies (logged) based on an assumption of equal abundance of all
* targets.
*/
MarkovModel _expected;
public:
/**
* SeqWeightTable Constructor.
* @param window_size an unsigned integer specifying the size of the bias
* window surrounding fragment ends.
* @param order a size_t specifying the order to use for the Markov chains
* modelling the sequence.
* @param alpha a double specifying the strength of the uniform prior
* (logged pseudo-counts for each parameter).
*/
SeqWeightTable(size_t window_size, size_t order, double alpha);
/**
* A second constructor that loads the distribution from a parameter file.
* Note that the values should not be modified after using this constructor.
* @param window_size an unsigned integer specifying the size of the bias
* window surrounding fragment ends. Must match file.
* @param order a size_t specifying the order to use for the Markov chains
* modelling the sequence. Must match file.
* @param param_file_name a string specifying the path to the parameter file.
* @param identifier a string specifying the header for these parameters in
* the file.
*/
SeqWeightTable(size_t window_size, size_t order, std::string param_file_name,
std::string identifier);
/**
* A member function that overwrites the "observed" parameters with those from
* another SeqWeightTable.
* @param other another SeqWeightTable from which to copy the parameters.
*/
void copy_observed(const SeqWeightTable& other);
/**
* A member function that overwrites the "expected" parameters with those from
* another SeqWeightTable.
* @param other another SeqWeightTable from which to copy the parameters.
*/
void copy_expected(const SeqWeightTable& other);
/**
* A member function that increments the expected counts for a sliding window
* through the given target sequence by some mass.
* @param seq the target sequence.
* @param mass the amount to increment by in the parameter table.
* @param fl_cdf the fragment length CDF.
*/
void increment_expected(const Sequence& seq, double mass,
const std::vector<double>& fl_cdf);
/**
* A member function that normalizes the expected counts and fills in the
* lower-ordered marginals.
*/
void normalize_expected();
/**
* A member function that increments the observed counts for the given
* fragment sequence by some (logged) mass.
* @param seq the target sequence (possibly reverse complemented) to which the
* fragment end maps.
* @param i the index into the sequence at which to center the bias window, ie
* where the fragment starts/ends.
* @param mass the amount to increment by (logged)
*/
void increment_observed(const Sequence& seq, size_t i, double mass);
/**