Skip to content
Commits on Source (15)
This diff is collapsed.
......@@ -28,7 +28,7 @@
if (WIN32)
cmake_minimum_required(VERSION 3.1.0)
else ()
cmake_minimum_required(VERSION 2.8.0)
cmake_minimum_required(VERSION 2.8.12)
endif ()
message (STATUS "==== Initializing project cmake configuration ====")
......@@ -73,7 +73,7 @@ endif ()
#
set(CMAKE_CONFIGURATION_TYPES_TMP Debug Release RelWithDebInfo)
if (NOT WIN32)
set(CMAKE_CONFIGURATION_TYPES_TMP ${CMAKE_CONFIGURATION_TYPES_TMP} ASan)
set(CMAKE_CONFIGURATION_TYPES_TMP ${CMAKE_CONFIGURATION_TYPES_TMP} ASan GCov)
endif ()
set(CMAKE_CONFIGURATION_TYPES ${CMAKE_CONFIGURATION_TYPES_TMP} CACHE STRING "" FORCE)
......@@ -125,21 +125,31 @@ set(THIS_PROJECT_NAME "manta")
project (${THIS_PROJECT_NAME})
# find interpreters
find_package(PythonInterp)
# Find python2 interpreter
find_package(PythonInterp 2 QUIET)
if (NOT PYTHONINTERP_FOUND)
message (WARNING "No python interpreter found, disabling optional python build and installation components. Installed workflow requires python interpreter to run")
message (WARNING "No python2 interpreter found, disabling optional python build and installation components. Installed workflow requires python2 interpreter to run.")
else()
set (MINIMUM_PYTHON_VERSION "2.6")
if (${PYTHON_VERSION_STRING} VERSION_LESS ${MINIMUM_PYTHON_VERSION})
message (WARNING
"Python2 interpretor must be at least version (${MINIMUM_PYTHON_VERSION}). Found version ${PYTHON_VERSION_STRING} at '${PYTHON_EXECUTABLE}'."
" Disabling optional python build and installation components. Installed workflow requires python ${MINIMUM_PYTHON_VERSION} or higher to run.")
set(PYTHONINTERP_FOUND false)
endif ()
endif()
set (THIS_ARCH ${CMAKE_SYSTEM_PROCESSOR})
if (NOT WIN32)
set (TARGET_ARCHITECTURE ${CMAKE_SYSTEM_PROCESSOR})
if (APPLE)
# On OS X, CMAKE_SYSTEM_PROCESSOR is set to 'i386' regardless of the actual processor, so
# architecture is inferred from pointer size instead:
if (CMAKE_SIZEOF_VOID_P MATCHES 8)
set (THIS_ARCH "x86_64")
set (TARGET_ARCHITECTURE "x86_64")
else ()
set (THIS_ARCH "x86")
set (TARGET_ARCHITECTURE "x86")
endif ()
endif ()
message (STATUS "TARGET_ARCHITECTURE: " ${THIS_ARCH} )
message (STATUS "TARGET_ARCHITECTURE: " ${TARGET_ARCHITECTURE} )
# Create package versioning target - version derived from git describe except for
......
Manta - Structural Variant and Indel Caller
Copyright (c) 2013-2017 Illumina, Inc.
Copyright (c) 2013-2018 Illumina, Inc.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
......@@ -26,7 +26,7 @@ library:
******************************************************************
htslib 1.2.1-204-g8197cfd
htslib 1.7-6-g6d2bfb7
[Files in this distribution outwith the cram/ subdirectory are distributed
according to the terms of the following MIT/Expat license.]
......@@ -103,7 +103,46 @@ to a notice that reads "Copyright (C) 2005, 2006, 2007, 2008, 2009, 2010,
******************************************************************
Boost 1.56.0
samtools 1.6
The MIT/Expat License
Copyright (C) 2008-2014 Genome Research Ltd.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.
[The use of a range of years within a copyright notice in this distribution
should be interpreted as being equivalent to a list of years including the
first and last year specified and all consecutive years between them.
For example, a copyright notice that reads "Copyright (C) 2005, 2007-2009,
2011-2012" should be interpreted as being identical to a notice that reads
"Copyright (C) 2005, 2007, 2008, 2009, 2011, 2012" and a copyright notice
that reads "Copyright (C) 2005-2012" should be interpreted as being identical
to a notice that reads "Copyright (C) 2005, 2006, 2007, 2008, 2009, 2010,
2011, 2012".]
******************************************************************
Boost 1.58.0
Boost Software License - Version 1.0 - August 17th, 2003
......@@ -172,11 +211,11 @@ corresponding subdirectories.
******************************************************************
pyFlow 1.1.14
pyFlow 1.1.20
pyFlow - a lightweight parallel task engine
Copyright (c) 2012-2015 Illumina, Inc.
Copyright (c) 2012-2017 Illumina, Inc.
All rights reserved.
Redistribution and use in source and binary forms, with or without
......@@ -217,26 +256,3 @@ Distributed under the Boost Software License, Version 1.0.
(See accompanying file `LICENSE_1_0.txt` or copy at
<http://www.boost.org/LICENSE_1_0.txt>)
******************************************************************
zlib 1.2.8
(C) 1995-2013 Jean-loup Gailly and Mark Adler
This software is provided 'as-is', without any express or implied
warranty. In no event will the authors be held liable for any damages
arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it
freely, subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not
claim that you wrote the original software. If you use this software
in a product, an acknowledgment in the product documentation would be
appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be
misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
......@@ -18,7 +18,7 @@ variants in small sets of diploid samples and somatic variants in
matched tumor/normal sample pairs. There is experimental support for
analysis of unmatched tumor samples as well. Manta accepts input read
mappings from BAM or CRAM files and reports all SV and indel inferences
in VCF 4.1 format. See the [user guide] [UserGuide] for a full
in VCF 4.1 format. See the [user guide][UserGuide] for a full
description of capabilities and limitations.
[UserGuide]:docs/userGuide/README.md
......@@ -38,9 +38,9 @@ indels for germline and cancer sequencing applications. *Bioinformatics*,
License
-------
Manta source code is provided under the [GPLv3 license] (LICENSE.txt).
Manta source code is provided under the [GPLv3 license](LICENSE.txt).
Manta includes several third party packages provided under other
open source licenses, please see [COPYRIGHT.txt] (COPYRIGHT.txt)
open source licenses, please see [COPYRIGHT.txt](COPYRIGHT.txt)
for additional details.
......@@ -48,7 +48,7 @@ Getting Started
---------------
For linux users, it is recommended to start from the most recent
[binary distribution on the Manta releases page] [releases], this
[binary distribution on the Manta releases page][releases], this
distribution can be unpacked, moved to any convenient directory and
tested by [running a small demo](docs/userGuide/installation.md#demo)
included with the release distribution. Manta can also be installed
......@@ -61,7 +61,7 @@ for full build and installation details of all supported cases.
Data Analysis and Interpretation
--------------------------------
After completing installation, see the [Manta user guide] [UserGuide]
After completing installation, see the [Manta user guide][UserGuide]
for instructions on how to run Manta, interpret results and estimate
hardware requirements/compute cost, in addition to a high-level methods
overview.
......@@ -71,7 +71,7 @@ Manta Code Development
----------------------
For manta code development and debugging details, see the
[Manta developer guide] [DeveloperGuide]. This includes details
[Manta developer guide][DeveloperGuide]. This includes details
on Manta's development protocols, special build instructions,
recommended workflows for investigating
calls, and internal documentation details.
......
......@@ -49,11 +49,12 @@ Configuration:
allow version control within eclipse
--build-type=TYPE specify the build type for CMake (affects compiler
options). Allowed values are "Debug", "Release",
"RelWithDebInfo", "ASan" [$build_type]
"RelWithDebInfo", "ASan", "GCov" [$build_type]
Debug: No optimization and all debug symbols
Release: All portable optimization
RelWithDebInfo: Most optimizations, try to keep stack trace info
ASan: Light optimization with google addresss sanitizer on
ASan: Light optimization with google address sanitizer on
GCov: Debug mode with code coverage options to enable gcov
Directory and file names:
--prefix=PREFIX install files in tree rooted at PREFIX
......
manta (1.1.0+dfsg-1) UNRELEASED; urgency=medium
manta (1.4.0+dfsg-1) UNRELEASED; urgency=medium
* Initial release (Closes: #861664)
TODO: Check why runMantaWorkflowDemo is failing
--> https://github.com/Illumina/manta/issues/77
* debian/upstream/metadata: added ref to OMICtools
* New upstream version
* debhelper 11
* Point Vcs fields to salsa.debian.org
* Standards-Version: 4.1.4
-- Andreas Tille <tille@debian.org> Tue, 15 Nov 2016 14:29:08 +0100
-- Andreas Tille <tille@debian.org> Thu, 03 May 2018 15:27:38 +0200
......@@ -3,7 +3,7 @@ Maintainer: Debian Med Packaging Team <debian-med-packaging@lists.alioth.debian.
Uploaders: Andreas Tille <tille@debian.org>
Section: science
Priority: optional
Build-Depends: debhelper (>= 9),
Build-Depends: debhelper (>= 11~),
cmake,
dh-python,
libboost-date-time-dev,
......@@ -17,12 +17,12 @@ Build-Depends: debhelper (>= 9),
libboost-test-dev,
zlib1g-dev,
python-all-dev,
python-pyflow,
libhts-dev,
python-pyflow (>= 1.1.20),
libhts-dev (>= 1.7),
samtools
Standards-Version: 4.0.1
Vcs-Browser: https://anonscm.debian.org/cgit/debian-med/manta.git
Vcs-Git: https://anonscm.debian.org/git/debian-med/manta.git
Standards-Version: 4.1.4
Vcs-Browser: https://salsa.debian.org/med-team/manta
Vcs-Git: https://salsa.debian.org/med-team/manta.git
Homepage: https://github.com/Illumina/manta
Package: manta
......@@ -32,7 +32,7 @@ Depends: ${shlibs:Depends},
${python:Depends},
tabix (>= 1.4-2),
samtools,
python-pyflow (>= 1.1.14)
python-pyflow
Description: structural variant and indel caller for mapped sequencing data
Manta calls structural variants (SVs) and indels from mapped paired-end
sequencing reads. It is optimized for analysis of germline variation in
......
......@@ -4,7 +4,7 @@ Description: Move config to /etc
--- a/src/python/lib/configureOptions.py
+++ b/src/python/lib/configureOptions.py
@@ -124,7 +124,7 @@ class ConfigureWorkflowOptions(object) :
@@ -113,7 +113,7 @@ class ConfigureWorkflowOptions(object) :
configFileName=cmdlineScriptName+".ini"
cmdlineScriptDir=os.path.abspath(os.path.dirname(realArg0))
......
......@@ -5,7 +5,7 @@ Description: libexec seems Redhat specific, in Debian the according files can
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -181,7 +181,7 @@ check_prefix()
@@ -191,7 +191,7 @@ check_prefix()
set(THIS_EXEC_PREFIX "${CMAKE_INSTALL_PREFIX}")
set(THIS_BINDIR "${THIS_EXEC_PREFIX}/bin")
set(THIS_LIBDIR "${THIS_EXEC_PREFIX}/lib")
......@@ -16,9 +16,9 @@ Description: libexec seems Redhat specific, in Debian the according files can
set(THIS_PYTHON_LIBDIR "${CMAKE_INSTALL_PREFIX}/lib/python")
--- a/src/python/lib/mantaOptions.py
+++ b/src/python/lib/mantaOptions.py
@@ -79,13 +79,14 @@ class MantaWorkflowOptionsBase(Configure
@@ -90,13 +90,14 @@ class MantaWorkflowOptionsBase(Configure
alignerMode = "isaac"
configCommandLine=sys.argv
- libexecDir=os.path.abspath(os.path.join(scriptDir,"@THIS_RELATIVE_LIBEXECDIR@"))
+ libexecDir='/usr/lib/manta' # no idea why this is expanded to /usr/share/manta in cmake build
......@@ -49,7 +49,7 @@ Description: libexec seems Redhat specific, in Debian the according files can
--- a/src/python/bin/configManta.py
+++ b/src/python/bin/configManta.py
@@ -26,7 +26,7 @@ import os,sys
@@ -35,7 +35,7 @@ if sys.version_info < (2,6):
scriptDir=os.path.abspath(os.path.dirname(__file__))
scriptName=os.path.basename(__file__)
......
use_debian_packaged_pyflow.patch
use_debian_packaged_zlib.patch
use_debian_packaged_htslib.patch
use_debian_packaged_samtools.patch
# fix_comparison.patch
......
......@@ -4,47 +4,14 @@ Description: Check for Debian packaged htslib
--- a/redist/CMakeLists.txt
+++ b/redist/CMakeLists.txt
@@ -58,42 +58,7 @@ endif()
@@ -66,9 +66,7 @@ superset(THIS_ADDITIONAL_LIB "${THIS_ADD
#
# htslib
#
-set(HTSLIB_PREFIX "htslib-1.2.1-204-g8197cfd")
-set(HTSLIB_PREFIX "htslib-1.7-6-g6d2bfb7")
-superset(HTSLIB_DIR "${CMAKE_CURRENT_BINARY_DIR}/${HTSLIB_PREFIX}")
-superset(HTSLIB_LIBRARY "${HTSLIB_DIR}/libhts.a")
-
-add_custom_command(
- OUTPUT ${HTSLIB_DIR}
- COMMAND ${CMAKE_COMMAND} -E remove_directory "${HTSLIB_DIR}"
- COMMAND ${CMAKE_COMMAND} -E tar xjf "${THIS_REDIST_DIR}/${HTSLIB_PREFIX}.tar.bz2"
- DEPENDS ${THIS_ZLIB}
- COMMENT "Unpacking htslib library")
-
-set (HTSLIB_FINAL_TASK ${HTSLIB_DIR})
-
-if (NOT WIN32)
- # note that htslib ./configure CFLAGS="custom" will overwrite the standard "-g -O2 -Wall" CFLAGS, so
- # we need to restore at least the -O2 for reasonable library performance:
- add_custom_command(
- OUTPUT ${HTSLIB_LIBRARY}
- COMMAND ./configure CC="${CMAKE_C_COMPILER}" CFLAGS='-O2 -I"${ZLIB_DIR}"' LIBS="${ZLIB_LIBRARY}" >htslib.config.log
- COMMAND $(MAKE) lib-static bgzip htsfile tabix >htslib.build.log 2>htslib.build.error.log
- WORKING_DIRECTORY ${HTSLIB_DIR}
- DEPENDS ${HTSLIB_DIR}
- COMMENT "Building htslib library")
-
- set (HTSLIB_FINAL_TASK ${HTSLIB_LIBRARY})
-endif ()
-
-set(THIS_HTSLIB "${THIS_PROJECT_NAME}_htslib")
-add_custom_target(${THIS_HTSLIB} DEPENDS "${HTSLIB_FINAL_TASK}")
-
-
-if (NOT WIN32)
- install(PROGRAMS "${HTSLIB_DIR}/bgzip" DESTINATION "${THIS_LIBEXECDIR}")
- install(PROGRAMS "${HTSLIB_DIR}/htsfile" DESTINATION "${THIS_LIBEXECDIR}")
- install(PROGRAMS "${HTSLIB_DIR}/tabix" DESTINATION "${THIS_LIBEXECDIR}")
-endif ()
+superset(HTSLIB_LIBRARY "-lhts")
#
# samtools
## It is occasionally useful to extend debug/asan build options from manta all the way down through htslib,
## the flags below can be uncommented to do so.
......@@ -4,11 +4,11 @@ Description: Check for Debian packaged python-pyflow
--- a/redist/CMakeLists.txt
+++ b/redist/CMakeLists.txt
@@ -183,9 +183,7 @@ endif ()
@@ -151,9 +151,7 @@ endif ()
# pyflow
#
-set(PYFLOW_PREFIX "pyflow-1.1.14")
-set(PYFLOW_PREFIX "pyflow-1.1.20")
-set(PYFLOW_DIR "${CMAKE_CURRENT_BINARY_DIR}/${PYFLOW_PREFIX}")
-set(PYFLOW_SCRIPT "${PYFLOW_DIR}/src/pyflow.py")
+set(PYFLOW_SCRIPT "/usr/lib/python2.7/dist-packages/pyflow/pyflow.py")
......
......@@ -4,39 +4,27 @@ Description: Check for Debian packaged samtools
--- a/redist/CMakeLists.txt
+++ b/redist/CMakeLists.txt
@@ -63,34 +63,11 @@ superset(HTSLIB_LIBRARY "${CMAKE_INSTALL
@@ -110,11 +110,12 @@ endif ()
#
# samtools
#
-set(SAMTOOLS_PREFIX "samtools-1.2")
-set(SAMTOOLS_PREFIX "samtools-1.6")
-set(SAMTOOLS_DIR "${CMAKE_CURRENT_BINARY_DIR}/${SAMTOOLS_PREFIX}")
-set(SAMTOOLS_LIBRARY "${SAMTOOLS_DIR}/libbam.a")
+set(SAMTOOLS_DIR "/usr/bin")
+set(SAMTOOLS_LIBRARY "-lbam")
superset(SAMTOOLS_PROG "${SAMTOOLS_DIR}/samtools")
-# final directory copy below would ideally be a soft-link, copy is for windows build
-add_custom_command(
- OUTPUT ${SAMTOOLS_DIR}
- COMMAND ${CMAKE_COMMAND} -E remove_directory "${SAMTOOLS_DIR}"
- COMMAND ${CMAKE_COMMAND} -E tar xjf "${THIS_REDIST_DIR}/${SAMTOOLS_PREFIX}.tar.bz2"
- COMMAND ${CMAKE_COMMAND} -E copy_directory "${HTSLIB_DIR}" "${SAMTOOLS_DIR}/${HTSLIB_PREFIX}"
- DEPENDS ${HTSLIB_FINAL_TASK}
- COMMENT "Unpacking samtools package")
-
set (SAMTOOLS_FINAL_TASK ${SAMTOOLS_DIR})
+# fake if to keep diff small
+if (WIN32)
# final directory copy below would ideally be a soft-link, copy is for windows build
add_custom_command(
OUTPUT ${SAMTOOLS_DIR}
@@ -137,6 +138,7 @@ if (NOT WIN32)
set (SAMTOOLS_FINAL_TASK ${SAMTOOLS_PROG})
endif ()
+endif ()
-if (NOT WIN32)
- add_custom_command(
- OUTPUT ${SAMTOOLS_PROG}
- COMMAND $(MAKE) HTSDIR=${HTSLIB_PREFIX} all 2>| samtools.build.log
- DEPENDS ${HTSLIB_LIBRARY}
- DEPENDS ${SAMTOOLS_DIR}
- WORKING_DIRECTORY ${SAMTOOLS_DIR}
- COMMENT "Building samtools package")
-
- set (SAMTOOLS_FINAL_TASK ${SAMTOOLS_PROG})
-endif ()
-
set(THIS_SAMTOOLS "${THIS_PROJECT_NAME}_samtools")
add_custom_target(${THIS_SAMTOOLS} DEPENDS "${SAMTOOLS_FINAL_TASK}")
Author: Andreas Tille <tille@debian.org>
Last-Update: Tue, 15 Nov 2016 14:29:08 +0100
Description: Check for Debian packaged zlib
--- a/redist/CMakeLists.txt
+++ b/redist/CMakeLists.txt
@@ -50,56 +50,10 @@ endif ()
#
# zlib
#
-set(ZLIB_PREFIX "zlib-1.2.8")
-superset(ZLIB_DIR "${CMAKE_CURRENT_BINARY_DIR}/${ZLIB_PREFIX}")
-if (WIN32)
- superset(ZLIB_LIBRARY "${ZLIB_DIR}/${CMAKE_BUILD_TYPE}/zlibstatic.lib")
-else ()
- superset(ZLIB_LIBRARY "${ZLIB_DIR}/libz.a")
-endif ()
-
-add_custom_command(
- OUTPUT ${ZLIB_DIR}
- COMMAND ${CMAKE_COMMAND} -E remove_directory "${ZLIB_DIR}"
- COMMAND ${CMAKE_COMMAND} -E tar xjf "${THIS_REDIST_DIR}/${ZLIB_PREFIX}.tar.bz2"
- COMMENT "Unpacking zlib package")
-
-if (NOT WIN32)
- set (ASM_MAKE_OPTION LOC=-DASMV OBJA=match.o)
- if (THIS_ARCH MATCHES "^.*86$")
- set(ZLIB_ARCH "")
- set(ASM_FILE_COPY cp contrib/asm686/match.S match.S)
- elseif (THIS_ARCH MATCHES "^x86_64$")
- set(ZLIB_ARCH "--64")
- set(ASM_FILE_COPY cp contrib/amd64/amd64-match.S match.S)
- else ()
- set (ZLIB_ARCH "")
- set (ASM_FILE_COPY "")
- set (ASM_MAKE_OPTION "")
- endif ()
-
- add_custom_command(
- OUTPUT ${ZLIB_LIBRARY}
- COMMAND CC=${CMAKE_C_COMPILER} ${ZLIB_DIR}/configure --prefix="${ZLIB_DIR}" --static ${ZLIB_ARCH} >zlib.config.log
- COMMAND ${ASM_FILE_COPY}
- COMMAND $(MAKE) ${ASM_MAKE_OPTION} >zlib.build.log 2>zlib.build.error.log
- WORKING_DIRECTORY ${ZLIB_DIR}
- DEPENDS ${ZLIB_DIR}
- COMMENT "Building zlib package")
-else ()
- add_custom_command(
- OUTPUT ${ZLIB_LIBRARY}
- COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" ${THIS_CMAKE_PLATFORM} -DCMAKE_C_COMPILER="${CMAKE_C_COMPILER}" ${ZLIB_DIR} >zlib.config.log
- COMMAND ${CMAKE_MAKE_PROGRAM} /p:Configuration=${CMAKE_BUILD_TYPE} "${ZLIB_DIR}/zlib.sln" >zlib.build.log
- WORKING_DIRECTORY ${ZLIB_DIR}
- DEPENDS ${ZLIB_DIR}
- COMMENT "Building zlib package")
-endif ()
-
-
-set(THIS_ZLIB "${THIS_PROJECT_NAME}_zlib")
-add_custom_target(${THIS_ZLIB} DEPENDS "${ZLIB_LIBRARY}")
-
+find_package(ZLIB REQUIRED)
+if (ZLIB_FOUND)
+ set(HAVE_ZLIB 1)
+endif()
#
# htslib
......@@ -2,7 +2,7 @@
This directory aggregates all project documentation. The documentation is divided into the following sections:
* [User Guide](userGuide/README.md) - This is the primary documentation resource for all Manta users.
* [Developer Guide](developerGuide/README.md) - This provides guidelines for anyone contributing to the Manta source.
* [User Guide](userGuide/README.md) - This is the primary documentation resource for all users.
* [Developer Guide](developerGuide/README.md) - This provides guidelines for anyone contributing to methods development.
* [Methods](methods/README.md) - This directory aggregates detailed method and model descriptions.
......@@ -7,22 +7,31 @@ Manta Developer Guide
* [Scope](#scope)
* [Developer Build Notes](#developer-build-notes)
* [Building from source repository vs. versioned code distribution:](#building-from-source-repository-vs-versioned-code-distribution)
* [Building from source repository vs. versioned code distribution](#building-from-source-repository-vs-versioned-code-distribution)
* [Static analysis](#static-analysis)
* [Source auto-documentation](#source-auto-documentation)
* [Improving build time](#improving-build-time)
* [ccache](#ccache)
* [Bundled dependencies](#bundled-dependencies)
* [General Debugging Notes](#general-debugging-notes)
* [General Debugging: Address Sanitizer](#general-debugging-address-sanitizer)
* [General Debugging: Inspecting temporary files](#general-debugging-inspecting-temporary-files)
* [Windows development support](#windows-development-support)
* [Automating Portable Binary Builds](#automating-portable-binary-builds)
* [Automating Portable Binary Builds for Linux](#automating-portable-binary-builds-for-linux)
* [Coding Guidelines](#coding-guidelines)
* [Source formatting](#source-formatting)
* [Git conventions](#git-conventions)
* [Commit messages](#commit-messages)
* [Commit consolidation](#commit-consolidation)
* [Changelog conventions](#changelog-conventions)
* [Branching and release tagging guidelines](#branching-and-release-tagging-guidelines)
* [Error handling](#error-handling)
* [General Policies](#general-policies)
* [Exception Details](#exception-details)
* [Logging](#logging)
* [Unit tests](#unit-tests)
* [IDE support](#ide-support)
* [Clion](#clion)
* [Special Topic Guides](#special-topic-guides)
[//]: # (END automated TOC section, any edits will be overwritten on next source refresh)
......@@ -48,17 +57,27 @@ The following section provides a supplement to the standard build
instructions including additional details of interest to methods
developers.
### Building from source repository vs. versioned code distribution:
### Building from source repository vs. versioned code distribution
When the source repository is cloned from github, it is configured for development
rather than user distribution. In this configuration all builds are strict
such that:
* all warnings are treated as errors
* if cppcheck is found any detected cppcheck issue is converted to a build error
* if cppcheck is found any detected cppcheck issue is converted to a build error (see details below)
Note that all unit tests are always run and required to pass for the build
procedure to complete.
### Static analysis
When the build is configured for development, static analysis is run on all project c++ source using cppcheck, so long
as an appropriate cppcheck version can be found in the user's path. This static analysis step is configured as follows:
* cppcheck will be used if it can be found in the user's path and the version found is at least 1.69
* The script which runs and interprets cppcheck is [run_cppcheck.py](../../src/srcqc/run_cppcheck.py).
* Any cppcheck warning will be treated as an error. A few warnings are suppressed, depending on the cppcheck version.
* All cppcheck warnings are reformatted to follow standard gcc/clang error message style to work correctly in most IDEs.
* All project c++ code will be analyzed, but third-party/redistributed packages are ignored.
### Source auto-documentation
If doxygen is found in the path (and optionally dot as well) during
......@@ -70,7 +89,7 @@ additional "doc" target for the makefile:
There is no installation for the documentation outside of the build
directory, the root doxygen page after completing this target will be:
${MANTA_BUILD_PATH}/c++/doxygen/html/index.html
${MANTA_BUILD_PATH}/src/c++/doxygen/html/index.html
### Improving build time
......@@ -84,21 +103,23 @@ found in the path
Note that during the configuration step, the following dependencies will be
built from source if they are not found:
* cmake 2.8.0+
* boost 1.56.0+
* cmake 2.8.12+
* boost 1.58.0+
To avoid the extra time associated with this step, ensure that (1)
cmake 2.8.0+ is in your PATH and (2) BOOST\_ROOT is defined to point
to boost 1.56.0 or newer.
cmake 2.8.12+ is in your PATH and (2) BOOST\_ROOT is defined to point
to boost 1.58.0 or newer.
## General Debugging Notes
### General Debugging: Address Sanitizer
The build system offers first-class support for google address sanitizer
when a supporting compiler is detected. To use this mode, start a fresh
installation process with the additional configure option `--build-type=ASan`,
extending from the configuration example in the above build instructions, use:
for example:
../manta-A.B.C.release_src/src/configure --jobs=4 --prefix=/path/to/install --build-type=ASan
../configure --jobs=4 --prefix=/path/to/install --build-type=ASan
### General Debugging: Inspecting temporary files
......@@ -120,25 +141,29 @@ the library level. Note that unit test codes are compiled to
libraries but cannot be run.
C++11 features in use require at least VS2013. A Windows
installation of cmake is also required to configure and compile.
Note that the minimum cmake version is 3.1.0 for Windows.
installation of cmake and zlib are also required to configure and compile.
Note that the minimum cmake version is 3.1.0 for Windows. Windows zlib is provided by the [gnuwin32 package][gnuwin32] among others.
### Automating Portable Binary Builds
[gnuwin32]:http://gnuwin32.sourceforge.net/packages/zlib.htm
A script is provided to enable a dockerized build process which
issues Centos5+ or Centos6+ binary tarballs. To do so, ensure you
### Automating Portable Binary Builds for Linux
A script is provided to enable a dockerized build process which issues
binary tarballs for a variety of OS/compiler configurations, with the
goal of creating a reasonably portable linux binary build by using a
relatively old OS/glibc version. To use this script, ensure you
have permission to `docker run` on the current system and execute the
following script:
```
${MANTA_ROOT_PATH}/scratch/docker/deployment/dockerBuildBinaryTarball.bash ${MANTA_ROOT_PATH2} ${BINARY_BUILD_PREFIX}
${SOURCE_PATH}/scratch/docker/deployment/dockerBuildBinaryTarball.bash ${SOURCE_PATH2} ${BINARY_BUILD_PREFIX}
```
The term `${MANTA_ROOT_PATH2}` can point to the current git repo (ie. `${MANTA_ROOT_PATH}`),
or to an extracted Manta source tarball previously created using the script:
The term `${SOURCE_PATH2}` can point to the current git repository (ie. `${SOURCE_PATH}`),
or to an extracted source release tarball previously created using the script:
```
${MANTA_ROOT_PATH}/scratch/make_release_tarball.bash
${SOURCE_PATH}/scratch/make_release_tarball.bash
```
The choice of virtualized build environment is hard-coded in the deploy script for the time being,
......@@ -146,6 +171,8 @@ see the `builderImage` variable.
## Coding Guidelines
Supported project languages are C++11 for core methods development and python2 (2.6+) for workflow and scripting support.
### Source formatting
* Basic formatting restrictions on c++ code:
......@@ -153,8 +180,77 @@ see the `builderImage` variable.
* 4-space indents
* "ANSI" bracket style
* Note the above restrictions are enforced by an astyle script which is occasionally run on the master branch (see [run_cxx_formatter.bash](../../scratch/source_check_and_format/run_cxx_formatter.bash))
* Name formatting for all newly introduced code:
* Make variable/type names self-documenting wherever this is practical, e.g. sampleCount, breakpointRegion, etc.
* Lowercase camelCase variable names
* Uppercase CamelCase type names
* Private class members start with a leading underscore, e.g. `_sampleName`.
* Otherwise, follow local code conventions
### Git conventions
#### Commit messages
All git commit messages should be prepended with either the associated JIRA or github issue id. For example:
```
MANTA-123 Improve insertion genotype accuracy
Improve assembly and realignment of large insertions to reduce hom->het undercall.
```
Very minor updates may be made without an associated ticket. In these cases providing a category prefix for
the minor change is still encouraged to provide context, for instance:
```
docs: Fix paths in user guide examples
```
```
build: Close new warnings from clang-4.0.0
```
All git commit messages should conform to practices outlined here:
http://chris.beams.io/posts/git-commit/
#### Commit consolidation
On any single-developer research branch, history editing is encouraged within the branch to collapse bugs and
build a clearer feature-by-feature story for other developers to follow.
In all other situations history editing is discouraged and a conventional merge-based workflow is preferred.
### Changelog conventions
The primary function of the changelog is to help end-users weigh the benefits and risks of updating to a newer version.
To this end:
- Changelog entries should be made for any major branch merge, bug fix, or generally something that would change
a user's interaction at the configuration step, change the format/interpretation of the output or change
the variant calling performance.
- The project changelog follows many of the conventions suggested in [keepachangelog](http://keepachangelog.com/en/1.0.0/).
Consistent with this formatting, changelog entries should be more descriptive and end-user focused than git commit
entries. JIRA or github ticket IDs should be listed at the end of the Changelog description to help developers link
issues, without making this the emphasis of each changelog entry.
- Each major branch with an end-user impact should add a changelog entry to the Unreleased section of the changelog
prior to merging the branch.
- For consistency with the git commit log, try to follow a similar summary style even though descriptions can be much
longer, for instance by starting all major bullet points with an imperative verb.
## Branching and release tagging guidelines
All features and bugfixes are developed on separate branches. Branch names should contain the corresponding JIRA ticket
id or contain the key "github${issueNumber}" to refer to the corresponding issue on github.com. After code
review and testing, all branches intended for release are merged to the 'develop' branch. Releases are tagged
on the development branch, and the corresponding release tag can be pushed to the 'master' branch. Thus the master
branch always represents the most recent stable release. As required version specific development branches are
created to support continued bugfix iterations on older releases. These branches replace the patch number in the
release version with an "x", for instance the "v2.4.x" branch would be used to provide bugfix updates "v2.4.2",
"v2.4.3", etc.
### Error handling
#### General Policies
......@@ -169,7 +265,7 @@ see the `builderImage` variable.
#### Exception Details
* Preferred exception pattern is to use an internal class derived from `boost::exception`:
* Preferred exception pattern is to use an internal class `GeneralException` derived from `boost::exception`:
```c++
......@@ -183,28 +279,109 @@ foo(const char* name)
using namespace illumina::common;
std::ostringstream oss;
oss << "ERROR: unrecognized variant scoring model name: '" << name << "'\n";
BOOST_THROW_EXCEPTION(LogicException(oss.str()));
oss << "Unrecognized variant scoring model name: '" << name << "'";
BOOST_THROW_EXCEPTION(GeneralException(oss.str()));
}
```
* Best practice for exception messages designed to avoid redundant boilerplate at the throw site:
* Avoid adding a standard "ERROR" or "EXCEPTION" prefix
* Avoid ending the message with a newline. For multi-line exception messages the ending newline may make sense on a case-by-case basis.
* Context at the original throw site is often supplemented by a 'catch and release' block to add
information at a few critical points on the stack. Typically this is information which
is unavailable at the throw site. Example code is:
information at a few critical points on the stack as the stack is unwound. Typically this is information which
is unavailable at the throw site.
* The preferred method to implement this is to use `boost::error_info`. This only works for exceptions derived from `illumina::common::ExceptionData`, such as the above noted `GeneralException` class.
* The first template argument to `boost::error_info` (`edge_error_info` in the example below) is an arbitrary empty struct, the name of which will be printed with the metadata preceding the given string (this seems to be some kind of tag dispatch mechanism in the boost exception library). The coding convention is to create an informative category
name for each instance where this exception decoration pattern occurs.
* An example of using this pattern in code follows:
```c++
try
catch (illumina::common::ExceptionData& e)
{
realign_and_score_read(_opt,_dopt,sif.sample_opt,_ref,realign_buffer_range,rseg,sif.indel_sync());
// decorate an in-flight exception:
std::ostringstream oss;
oss << "Can't find return edge to node index: " << _index << ":" << fromIndex << " in remote node index: " << _index << ":" << fromNodeEdgeIter.first << "\n"
<< "\tlocal_node: " << fromNode
<< "\tremote_node: " << remoteNode;
// Note that the struct 'edge_error_info' is just an arbitrary tag name applied to this string, any name can be
// used for this purpose at each exception decoration site:
e << boost::error_info<struct edge_error_info,std::string>(oss.str());
throw;
}
```
* More details on `boost::error_info` usage:
```c++
catch (illumina::common::ExceptionData& e)
{
// use a boost error_info type to supplement the current exception message
e << boost::error_info<struct extra_exception_message,std::string>("FOO");
// to further supplement the exception message, change the tag struct to
// create another boost error_info type
e << boost::error_info<struct current_candidate_info,std::string>("BAR");
// Note that repeating any type will result in only the last message being
// printed to standard error, eg:
//
// e << boost::error_info<struct special_message, std::string>("BAR1");
// e << boost::error_info<struct special_message, std::string>("BAR2");
//
// ...would result in only the second message, "BAR2", being rendered in the final exception message.
throw;
}
```
* A more general backup to the above method that works for all exception types is a simple stderr print at the catch
site. In this case the information will be a bit out of order, but this will get the job done:
```c++
catch (...)
{
log_os << "Exception caught in align_pos() while realigning segment: "
       << static_cast<int>(r.second) << " of read: " << (*r.first) << "\n";
throw;
}
```
##### Exception example
The following example shows an exception message with two messages added during stack unwinding using the
boost::error_info mechanism described above (Additional cmdline/version details are added at the end of the
message by the lowest-level catch site for each binary).
```
FATAL_ERROR: 2018-Jan-02 16:27:53 /src/c++/lib/applications/GenerateSVCandidates/SVCandidateAssemblyRefiner.cpp(985): Throw in function void processLargeInsertion(const SVCandidate&, pos_t, pos_t, const GlobalAligner<int>&, const std::vector<unsigned int>&, const std::set<int>&, SVCandidateAssemblyData&, const GSCOptions&)
Dynamic exception type: boost::exception_detail::clone_impl<illumina::common::GeneralException>
std::exception::what: Large insertion alignment procedure produced invalid zero-length alignment target
[runGSC(GSCOptions const&, char const*, char const*)::current_edge_info*] = Exception caught while processing graph edge: edgeinfo locus:node1:node2: 3360:0:0
node1:LocusNode: GenomeInterval: 19:[44215749,44215797) n_edges: 1 out_count: 18 evidence: [44215637,44215909)
EdgeTo: 0 out_count: 18
node1:EndUserGenomeInterval: chr20:44215750-44215797
[SVCandidateProcessor::evaluateCandidate(EdgeInfo const&, SVMultiJunctionCandidate const&, SVCandidateSetData const&, bool, SupportSamples&)::assembly_candidate_info*] = Exception caught while attempting to assemble SVCandidate:
isImprecise?: 1
forwardTranscriptStrandReadCount: 0 ; reverseTranscriptStrandReadCount: 0
index candidate:assemblyAlign:assemblySegment: 0:0:0
Breakend: GenomeInterval: 19:[44215749,44215797) COMPLEX
SVBreakendLowResEvidence: pair: 0 local_pair: 0 cigar: 0 softclip: 0 semialign: 6 shadow: 0 split_align: 0 unknown: 0
Breakend: GenomeInterval: 19:[44215749,44215797) UNKNOWN
SVBreakendLowResEvidence: pair: 0 local_pair: 0 cigar: 0 softclip: 0 semialign: 0 shadow: 0 split_align: 0 unknown: 0
cmdline: /install/libexec/GenerateSVCandidates --align-stats /MantaWorkflow/workspace/alignmentStats.xml --graph-file /MantaWorkflow/workspace/svLocusGraph.bin --bin-index 197 --bin-count 256 --max-edge-count 10 --min-candidate-sv-size 8 --min-candidate-spanning-count 3 --min-scored-sv-size 50 --ref /Homo_sapiens/NCBI/GRCh38Decoy/Sequence/WholeGenomeFasta/genome.fa --candidate-output-file /MantaWorkflow/workspace/svHyGen/candidateSV.0197.vcf --diploid-output-file /MantaWorkflow/workspace/svHyGen/diploidSV.0197.vcf --min-qual-score 10 --min-pass-qual-score 20 --min-pass-gt-score 15 --edge-runtime-log /MantaWorkflow/workspace/svHyGen/edgeRuntimeLog.0197.txt --edge-stats-log /MantaWorkflow/workspace/svHyGen/edgeStats.0197.xml --align-file /alignedSamples/NA12878/PCRfree/NA12878-PCRFree_S1.bam
version: 1.2.2-15-g34b1b79-dirty
buildTime: 2018-01-03T00:25:46.835456Z
compiler: g++-5.4.0
```
#### Logging
* At the workflow (python) layer, please write all logging messages through pyflow's logging interface as follows:
......@@ -222,6 +399,19 @@ self.flowLog("Initiating Starling workflow version: %s" % (__version__)
* Unit tests are already enabled for every library "test" subdirectory, additional tests in these directories will be automatically detected
* Example [svgraph unit tests directory](../../src/c++/lib/svgraph/test)
## IDE support
Little support for any specific IDE is provided, except as made available by cmake generators. IDE-specific configuration files maintained in the project are described below.
### Clion
A subset of code formatting settings which can be imported into Clion are available in the configuration file
`${STRELKA_REPO_PATH}/scratch/ideConfiguration/CLion/referenceCodeStyleSettings.xml`
Note that the automated `astyle` formatting settings still define the project defaults; the above configuration simply provides a starting point for CLion which is closer to the project's formatting norms.
## Special Topic Guides
The following items provide more in-depth details on a subsection of the methods/debugging protocol, etc.
......
#!/usr/bin/env bash
# Treat references to unset variables as an error.
set -o nounset
# Exit immediately if any command returns a non-zero status.
set -o errexit
rel2abs() {
......@@ -14,12 +15,16 @@ mkdir -p $builddir
mname=methods
# Run a single latex pass on the given file, failing fast on any latex
# error and suppressing interactive prompts.
#
# $1 - latex input file (base name or .tex path)
latexCmd() {
    # Quote "$1" so file names containing spaces are passed as one argument.
    latex -halt-on-error -interaction=nonstopmode "$1"
}
# Execute the full document build cycle for one latex file:
# latex -> bibtex -> latex -> latex -> dvipdf.
#
# $1 - latex input file (base name)
#
# NOTE: the extra latex passes are required to resolve bibtex citations
# and cross-references before producing the final pdf.
do_latex_cmds() {
    file="$1"
    latexCmd "$file"
    bibtex "$file"
    latexCmd "$file"
    latexCmd "$file"
    dvipdf "$file"
}
......
......@@ -51,3 +51,16 @@
}
% 24307552
@Article{tigra2014,
Author="Chen, K. and Fan, X. and Wallis, J. and Ding, L. and Weinstock, G. ",
Title="{{T}IGRA: a targeted iterative graph routing assembler for breakpoint assembly}",
Journal="Genome Res.",
Year="2014",
Volume="24",
Number="2",
Pages="310--317",
Month="Feb"
}
......@@ -8,7 +8,8 @@
% for scalebox,...
\usepackage{graphics}
% for automated hyperlinks on sections and \ref's:
% - hide hyperref links with white pdfborder (this is more portable than the hidelinks option)
\usepackage[pdfborder={0 0 0}]{hyperref}
% for pseudocode
......@@ -16,7 +17,7 @@
\usepackage[noend]{algpseudocode}
\title{Methods for Manta Structural Variant and Indel Caller}
% simple scientific notation:
......@@ -64,7 +65,7 @@ For each chromosome, depth is estimated using a modified median calculation. As
The depth estimation procedure repeatedly cycles through all segments. Within each segment, at least 40,000 reads are scanned before moving to the next segment (additional reads are scanned until the mapping position changes.) After the target number of reads have been scanned from every segment in the chromosome, the procedure returns to the first position and repeats this procedure starting from the last unscanned position. The process repeats until all reads in all segments are scanned or the depth estimate converges.
Each scanned read is filtered if it is unmapped. Otherwise the read alignment is ignored and the read is applied to a simple depth pileup assuming a complete and ungapped alignment starting from the mapping position. Every 1M reads triggers a convergence check, but only after every chromosome segment has been sampled at least once.
Each scanned read is removed from consideration if it is marked as filtered, PCR duplicate, secondary, supplementary, or if the read is unmapped. Otherwise the read alignment is ignored and the read is applied to a simple depth pileup assuming a complete and ungapped alignment starting from the mapping position. Every 1M reads triggers a convergence check, but only after every chromosome segment has been sampled at least once.
Depth is estimated from the resulting pileup, using a simple median over all depth observations from the pileup structure excluding zero counts. Convergence is checked between the depth estimate of the last convergence check and the current one. An absolute change of less than 0.05 is treated as converged (or given the median case, the integer median estimates must be an exact match).
......@@ -265,6 +266,11 @@ Following annotation of nodes for merge eligibility, the procedure to execute me
In practice, the evidence weight signal threshold $e_s$ defaults to 9 and the evidence weight applied for an non-penalized split or paired read observation (as described in the previous section) is 3. Thus the above described breakend graph merging procedure effectively requires 3 split or paired read observations supporting the same region association before merging this evidence in the breakend graph.
% TODO The graph complexity filtration procedure in the paragraph below seems to have been left in an inaccurate state for some time and should be rewritten from the implementation. A TDR ticket (MANTA-1355) has been created for this update.
% Specific problems are:
% 1. There is no reference to the 'locus' concept from source code
% 2. The abort is actually applied to a whole locus instead of just the candidate merge node.
% 3. The complexity check has now been turned off for loci containing more than two nodes.
The breakend graph merge procedure can be limited under conditions of high graph complexity. If the search for all nodes intersecting a candidate merge node (as described in the IntersectingNodes procedure in Method \ref{method:annotate}) results in 500 or more nodes or greater than 0.5 nodes per base in the search region, the merge of the associated candidate edge is stopped. This step is designed to limit graph complexity in repetitive and pericentromeric regions.
As the target genome segment is scanned, the region breakend graph is periodically de-noised. The procedure is only performed for edges where the input alignment files have been scanned over the genomic regions of the nodes incident on the edge (excluding small buffer zones at the edges of the scanned genome segments). For any edge $e = (v,w)$ which is eligible for de-noising, if both evidence[$(v,w)$] and evidence[$(w,v)$] are less than $e_s$, the edge is removed from the graph, together with any nodes isolated by the edge removal.
......@@ -340,7 +346,9 @@ Candidate assembly reads are gathered from all expanded breakend regions. The in
The selected reads are assembled using a simple algorithm implicitly based on the popular de Bruijn graph approach originated by Pevzner \citep{pevzner2001}. We note here that the orientation of each unmapped assembly read is determined by the orientation of its mapped partner, avoiding the need to deduce this during the assembly. Assembly is attempted over a series of word sizes, starting from a minimum value which is increased until repeat filtration criteria are met or the maximum word size is reached. The method currently uses a minimum word size of 41 and maximum size 76, the word size is incremented by 5.
For a given word size $k$, a list is made of all $k$-mers present in the input reads. The most frequent $k$-mer is chosen as a seed for the resulting contig assembly if it is observed in at least 3 reads. The contig is extended from this seed in each direction by choosing $k$-mers from this list having a $k-1$ overlap with the contig end. To avoid haplotype switching, reads that support the contig are tracked at each extension point. When there are multiple potential extensions, a $k$-mer is selected so that the current extension has the highest consistency of supporting reads with the previous extensions. The extension ends when there is no overlapping $k$-mer from a previous supporting read. If a $k$-mer is repeated during contig extension (i.e. if a loop is created in the assembly graph), the word size is incremented and the contig assembly procedure is restarted. After a contig is accepted, all reads used to assemble the contig are removed from the assembly read pool and the assembly procedure recurses into the reduced pool to search for additional contigs from the remaining read evidence.
For a given word size $k$, a word list is made of all $k$-mers present in the input reads. A seed list, as a subset of the word list, is also made by excluding the $k$-mers that form a cycle of size less than 50 in the graph. The most frequent $k$-mer in the seed list, given it has the most supporting reads, is selected as a seed for the resulting contig assembly. The contig is extended from that seed in each direction by choosing $k$-mers from the word list having a $k-1$ overlap with the contig end. To avoid haplotype switching, reads that support the contig are tracked at each extension point. When there are multiple potential extensions, a $k$-mer is selected so that the current extension has the highest consistency of supporting reads with the previous extensions. The extension ends when there is no overlapping $k$-mer from a previous supporting read, or when the selected $k$-mer has already occurred in the contig (i.e. being repetitive). The contig is then collected, and each $k$-mer used to construct the contig is removed from the seed list if contained. The contig construction procedure is repeated to generate multiple contigs until all seeds in the list are exhausted. If any of the returned contigs ends at a repetitive $k$-mer, the word size is incremented by one step, and the contig assembly procedure is restarted. All the contigs generated with the previous word size are considered as pseudo reads to roll in the next iteration of contig assembly, similar to the assembly method described in \citep{tigra2014}.
Finally, a greedy procedure is applied to select the constructed contigs in the order of the number of effective supporting reads and contig length. An effective supporting read cannot be a pseudo read, nor support any contigs that have been selected previously. The selection process is repeated until there are no more contigs available with the minimum number of effective supporting reads (defaults to 2), or the maximum number of assembled contigs (defaults to 10) is met.
\subsubsection{Contig alignment for large SVs} For large SV candidates spanning two distinct regions of the genome, the reference sequences are extracted from the two expected breakend regions, and the order and/or orientation of the references is adjusted such that if the candidate SV exists, the left-most segment of the SV contig should align to the first transformed reference region and the right-most contig segment should align to the second reference region. The contig is aligned across the two reference regions using a variant of Smith-Waterman-Gotoh alignment (\cite{smith1981,gotoh1982}) where a `jump' state is included which can only be entered from the match state for the first reference segment and only exits to the match or insert states of the second reference segment. The state transitions of this alignment scheme are shown in Figure \ref{fig:jumpstate}
......@@ -384,8 +392,7 @@ Candidates are filtered following SV hypothesis refinement as follows:
\paragraph{Candidate output}
Following the late filtration steps, the total set of refined candidates (or imprecise candidates in the case of assembly failure) are reported to a `candidate' VCF file. This file does not include scoring or quality filtration information but provides partial support for applications which are not represented by Manta's current scoring models (i.e. tumor without matched normal), as a method development aid, or as input to a small variant caller or another SV scoring/genotyping method. As an example of the latter case, in the author's workflows all Manta indel candidates of size 50 and smaller are forwarded to a small variant caller to improve support for indels which are larger than the alignment gap size supported by typical high-speed read mappers.
Following the late filtration steps, the total set of refined candidates (or imprecise candidates in the case of assembly failure) are reported to a `candidate' VCF file. This file does not include scoring or quality filtration information but may be useful in a number of contexts, such as: (1) applications which are not supported by Manta's current scoring models (2) as a method development aid (3) as input to a small variant caller or another SV scoring/genotyping method.
\subsection{Scoring}
......@@ -546,7 +553,7 @@ P ( \text{len}(d,a) \vert a) = P ( \text{len}(d,a) ) (1-P(c \vert a)) + P (c \v
In the diploid model the chimera probabilities are the same for both alleles $P(c \vert a) = 1\e{-3}$. In the somatic model these are $P(c \vert a) = 1\e{-4}$ by default; but for Tier 2 analysis, the alternate allele chimera probability is set to $P(c \vert x) = 5\e{-6}$ for the normal sample only.
For the split-read computation, each read is realigned across both breakends of the reference and variant alleles. Any read which crosses the breakend with at least 16 bases on both sides, has at least 75\% matches on each side and 90\% matches overall is classified as 'supporting' a breakend, and thus is allowed to contribute to the split-read evidence. The likelihood of the read for each of the two alleles, assuming the read is correctly mapped to the locus $m$, is
For the split-read computation, each read is realigned across both breakends of the reference and variant alleles. The likelihood of the read for each of the two alleles, assuming the read is correctly mapped to the locus $m$, is
\begin{equation*}
P (r \vert a,m) = \prod_{b_r \in r} P(b_r \vert b_a)
......@@ -565,6 +572,8 @@ e/3 & \mbox{ otherwise.}
\right.
\end{equation*}
Each candidate split read is evaluated to determine if it 'supports' a breakend. Only reads supporting a breakend are allowed to contribute to the split-read evidence. The first step of split read evaluation is to select the 'supported allele' for which the read has the highest likelihood. The read is then determined to be supporting the breakend if its alignment to the 'supported allele' (1) crosses the breakend with at least 16 bases on both sides, (2) has at least 75\% matches on each side of the breakend and (3) has at least 90\% matches overall.
A spurious read mapping $P(\neg m \vert \neg a)$ given that the sample actually supports an `other' allele type $\neg a$ at this locus is also accounted for:
\begin{equation*}
......@@ -585,7 +594,7 @@ The current filters are:
\begin{itemize}
\item \textit{High read depth} To remove calls in pericentromeric and other regions with collapsed reference representation, calls with very high depth relative to the mean depth of the chromosome are filtered out. Note for somatic calling only the depth of the normal sample is used for testing filtration.
\item \textit{High read depth} To remove calls in pericentromeric and other regions with collapsed reference representation, calls with very high depth relative to the expected depth of the chromosome are filtered out. Note for somatic calling only the depth of the normal sample is used for testing filtration.
The depth associated with the variant call is found from searching within 50 bases of each breakend region's center position. The position with the highest depth in the normal sample within these regions is treated as the variant depth. If the variant depth exceeds 3 times the average chromosome depth then the variant is filtered.
......