Skip to content
Commits on Source (5)
### Important
Users with versions dating prior to June 12 2013 please update. A nasty bug that caused the statistical part of mapDamage to use half of the data for estimation of the damage parameters, sorry for the inconvenience.
Users with versions dating prior to June 12 2013 please update. A nasty bug that caused the statistical part of `mapDamage` to use half of the data for estimation of the damage parameters, sorry for the inconvenience.
### Introduction
Complete documentation, instructions, examples, screenshots and FAQ are available at [this address](http://ginolhac.github.io/mapDamage/).
[mapDamage2](http://geogenetics.ku.dk/publications/mapdamage2.0/) is a computational framework written in Python and R, which tracks and quantifies DNA damage patterns
among ancient DNA sequencing reads generated by Next-Generation Sequencing platforms.
[mapDamage2](http://geogenetics.ku.dk/publications/mapdamage2.0/) is a computational framework written in Python and R, which tracks and quantifies DNA damage patterns
among ancient DNA sequencing reads generated by Next-Generation Sequencing platforms.
Mapdamage is developed at the [Centre for GeoGenetics](http://geogenetics.ku.dk/) by the [Orlando Group ](http://geogenetics.ku.dk/research/research_groups/palaeomix_group/).
`mapDamage` is developed at the [Centre for GeoGenetics](http://geogenetics.ku.dk/) by the [Orlando Group ](http://geogenetics.ku.dk/research/research_groups/palaeomix_group/).
### Web interface
Anna Kostikova from [insideDNA](https://insidedna.me) implemented a web interface for mapDamage.
Users can adjust the cloud ressources in terms of RAM/CPU to perform their analysis. She wrote a [tutorial](https://insidedna.me/tutorials/view/Analysis-ancient-DNA-samples-using-mapDamage) explaining how to use this [web interface (sign up required)](https://insidedna.me/app#/tools/100648/)
Users can adjust the cloud resources in terms of RAM/CPU to perform their analysis. She wrote a [tutorial](https://insidedna.me/tutorials/view/Analysis-ancient-DNA-samples-using-mapDamage) explaining how to use this [web interface (sign up required)](https://insidedna.me/app#/tools/100648/)
### Citation
......@@ -31,5 +31,4 @@ http://bioinformatics.oxfordjournals.org/content/27/15/2153](http://bioinformati
### Contact
Please report bugs and suggest possible improvements to Aurélien Ginolhac, Mikkel Schubert or Hákon Jónsson by email:
aginolhac at snm.ku.dk, MSchubert at snm.ku.dk or jonsson.hakon at gmail.com.
ginolhac at gmail.com, mikkelsch at gmail.com or jonsson.hakon at gmail.com.
#!/usr/bin/python
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from __future__ import print_function
import logging
import random
import time
......@@ -37,7 +35,7 @@ plot and quantify damage patterns from a SAM/BAM file
:Date: November 2012
:Type: tool
:Input: SAM/BAM
:Output: tabulated tables, pdf
:Output: tabulated tables, pdf
"""
# check if pysam if available
......@@ -45,7 +43,7 @@ MODULE = "pysam"
URL = "http://code.google.com/p/pysam/"
try:
__import__(MODULE)
except ImportError, e:
except ImportError as e:
sys.stderr.write("Error: Could not import required module '%s':\n\t- %s\n" % (MODULE, e))
sys.stderr.write(" If module is not installed, please download from '%s'.\n" % URL)
sys.stderr.write(" A local install may be performed using the following command:\n")
......@@ -98,7 +96,7 @@ def _downsample_to_fixed_number(bamfile, options):
if index >= downsample_to:
continue
sample[index] = record
return filter(None, sample)
return [_f for _f in sample if _f]
def _read_bamfile(bamfile, options):
......@@ -116,12 +114,12 @@ def _read_bamfile(bamfile, options):
def _check_mm_option():
"""
As the user can override the system wide mapdamage modules with
the --mapdamage-modules, it has to happen before option parsing
As the user can override the system wide mapdamage modules with
the --mapdamage-modules, it has to happen before option parsing
in mapdamage.parseoptions
"""
path_to_mm = None
for nr,arg in zip(xrange(len(sys.argv)),sys.argv):
for nr,arg in zip(list(range(len(sys.argv))),sys.argv):
if arg.startswith("--mapdamage-modules"):
try:
if "=" in arg:
......@@ -130,7 +128,7 @@ def _check_mm_option():
path_to_mm = arg_p[1]
else:
# the option is of the format --mapdamage-modules AAAA
path_to_mm = sys.argv[nr+1]
path_to_mm = sys.argv[nr+1]
break
except IndexError as e:
raise SystemExit("Must specify a path to --mapdamage-modules")
......@@ -201,7 +199,7 @@ def main():
mapdamage.composition.get_base_comp(options.ref,path_to_basecomp)
mapdamage.rscript.run_stats(options)
return 0
# fetch all references and associated lengths in nucleotides
try:
ref = pysam.Fastafile(options.ref)
......@@ -230,7 +228,7 @@ def main():
in_bam = pysam.Samfile(options.filename)
reflengths = dict(zip(in_bam.references, in_bam.lengths))
reflengths = dict(list(zip(in_bam.references, in_bam.lengths)))
# check if references in SAM/BAM are the same in the fasta reference file
fai_lengths = mapdamage.seq.read_fasta_index(options.ref + ".fai")
if not fai_lengths:
......@@ -322,7 +320,7 @@ def main():
# compute base composition for genomic regions
mapdamage.composition.count_ref_comp(read, chrom, before, after, dnacomp)
if options.fasta:
mapdamage.seq.write_fasta(read, chrom, seq, refseq, min(coordinate), max(coordinate), before, after, fhfasta)
......@@ -357,7 +355,7 @@ def main():
if mapdamage.tables.check_table_and_warn_if_dmg_freq_is_low(options.folder):
options.no_stats = True
# run the Bayesian estimation
if not options.no_stats:
# before running the Bayesian estimation get the base composition
......
mapdamage (2.0.9+dfsg-2) UNRELEASED; urgency=medium
mapdamage (2.1.0+dfsg-1) UNRELEASED; urgency=medium
* Use 2to3 to port to Python3
* New upstream version
Closes: #936989
* debhelper-compat 12
* Standards-Version: 4.4.0
......@@ -8,7 +8,7 @@ mapdamage (2.0.9+dfsg-2) UNRELEASED; urgency=medium
* buildsystem=pybuild
* python3-pysam <!nocheck>
-- Andreas Tille <tille@debian.org> Tue, 03 Sep 2019 11:02:34 +0200
-- Andreas Tille <tille@debian.org> Tue, 01 Oct 2019 17:13:12 +0200
mapdamage (2.0.9+dfsg-1) unstable; urgency=medium
......
Description: Use 2to3 to port to Python3
Bug-Debian: https://bugs.debian.org/936989
Author: Andreas Tille <tille@debian.org>
Last-Update: Tue, 03 Sep 2019 11:02:34 +0200
--- a/bin/mapDamage
+++ b/bin/mapDamage
@@ -1,7 +1,7 @@
-#!/usr/bin/python
+#!/usr/bin/python3
# -*- coding: utf-8 -*-
-from __future__ import print_function
+
import logging
import random
@@ -45,7 +45,7 @@ MODULE = "pysam"
URL = "http://code.google.com/p/pysam/"
try:
__import__(MODULE)
-except ImportError, e:
+except ImportError as e:
sys.stderr.write("Error: Could not import required module '%s':\n\t- %s\n" % (MODULE, e))
sys.stderr.write(" If module is not installed, please download from '%s'.\n" % URL)
sys.stderr.write(" A local install may be performed using the following command:\n")
@@ -98,7 +98,7 @@ def _downsample_to_fixed_number(bamfile,
if index >= downsample_to:
continue
sample[index] = record
- return filter(None, sample)
+ return [_f for _f in sample if _f]
def _read_bamfile(bamfile, options):
@@ -121,7 +121,7 @@ def _check_mm_option():
in mapdamage.parseoptions
"""
path_to_mm = None
- for nr,arg in zip(xrange(len(sys.argv)),sys.argv):
+ for nr,arg in zip(range(len(sys.argv)),sys.argv):
if arg.startswith("--mapdamage-modules"):
try:
if "=" in arg:
@@ -230,7 +230,7 @@ def main():
in_bam = pysam.Samfile(options.filename)
- reflengths = dict(zip(in_bam.references, in_bam.lengths))
+ reflengths = dict(list(zip(in_bam.references, in_bam.lengths)))
# check if references in SAM/BAM are the same in the fasta reference file
fai_lengths = mapdamage.seq.read_fasta_index(options.ref + ".fai")
if not fai_lengths:
--- a/mapdamage/__init__.py
+++ b/mapdamage/__init__.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/python3
import mapdamage.parseoptions
import mapdamage.seq
--- a/mapdamage/align.py
+++ b/mapdamage/align.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/python3
import itertools
import mapdamage
@@ -84,7 +84,7 @@ def get_mis(read, seq, refseq, ref, leng
std = '-' if read.is_reverse else '+'
subtable = tab[ref][end][std]
- for (i, nt_seq, nt_ref) in itertools.izip(xrange(length), seq, refseq):
+ for (i, nt_seq, nt_ref) in zip(range(length), seq, refseq):
if (nt_seq in "ACGT-") and (nt_ref in "ACGT-"):
if nt_ref != "-":
# record base composition in the reference, only A, C, G, T
--- a/mapdamage/composition.py
+++ b/mapdamage/composition.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/python3
import mapdamage
import itertools
@@ -10,8 +10,8 @@ def count_ref_comp(read, chrom, before,
""" record basae composition in external genomic regions """
std = '-' if read.is_reverse else '+'
- _update_table(comp[chrom]['5p'][std], before, xrange(-len(before), 0))
- _update_table(comp[chrom]['3p'][std], after, xrange(1, len(after) + 1))
+ _update_table(comp[chrom]['5p'][std], before, range(-len(before), 0))
+ _update_table(comp[chrom]['3p'][std], after, range(1, len(after) + 1))
def count_read_comp(read, chrom, length, comp):
@@ -20,12 +20,12 @@ def count_read_comp(read, chrom, length,
if read.is_reverse:
std, seq = '-', mapdamage.seq.revcomp(seq)
- _update_table(comp[chrom]['5p'][std], seq, xrange(1, length + 1))
- _update_table(comp[chrom]['3p'][std], reversed(seq), xrange(-1, - length - 1, -1))
+ _update_table(comp[chrom]['5p'][std], seq, range(1, length + 1))
+ _update_table(comp[chrom]['3p'][std], reversed(seq), range(-1, - length - 1, -1))
def _update_table(table, sequence, indices):
- for (index, nt) in itertools.izip(indices, sequence):
+ for (index, nt) in zip(indices, sequence):
if nt in table:
table[nt][index] += 1
@@ -47,7 +47,7 @@ def get_base_comp(filename,destination=F
bases["C"] = bases["C"] + int(tabs[3])
bases["G"] = bases["G"] + int(tabs[4])
bases["T"] = bases["T"] + int(tabs[5])
- except (OSError, ValueError), error:
+ except (OSError, ValueError) as error:
sys.stderr.write("Error: Seqtk failed: %s\n" % (error,))
sys.exit(1)
# get the base frequencies
--- a/mapdamage/parseoptions.py
+++ b/mapdamage/parseoptions.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/python3
from optparse import OptionParser, OptionGroup, SUPPRESS_HELP
import os
@@ -235,12 +235,12 @@ def options():
if os.path.isdir(options.folder):
if not options.quiet and not options.plot_only:
- print("Warning, %s already exists" % options.folder)
+ print(("Warning, %s already exists" % options.folder))
if options.plot_only:
if not file_exist(options.folder+"/dnacomp.txt") or not file_exist(options.folder+"/misincorporation.txt"):
parser.error('folder %s is not a valid result folder' % options.folder)
else:
- os.makedirs(options.folder, mode = 0750)
+ os.makedirs(options.folder, mode = 0o750)
if options.plot_only or options.stats_only or options.rescale_only:
sys.stderr.write("Error, %s does not exist while plot/stats/rescale only was used\n" % options.folder)
return None
--- a/mapdamage/rescale.py
+++ b/mapdamage/rescale.py
@@ -31,14 +31,14 @@ def get_corr_prob(folder, rescale_length
fi_handle = csv.DictReader(open(full_path))
corr_prob = {}
for line in fi_handle:
- if (corr_prob.has_key(line["Position"])):
+ if (line["Position"] in corr_prob):
sys.exit('This file has multiple position definitions %s, line %d: %s' % \
(folder, fi_handle.line_num, corr_prob[line["Position"]]))
else:
corr_prob[int(line["Position"])] = {'C.T':float(line["C.T"]), 'G.A':float(line["G.A"])}
# Exclude probabilities for positions outside of user-specified region
- for key in corr_prob.keys():
+ for key in list(corr_prob.keys()):
if key < -rescale_length_3p or key > rescale_length_5p:
corr_prob.pop(key)
@@ -75,13 +75,13 @@ def corr_this_base(corr_prob, nt_seq, nt
back_pos = pos-length-1
# position from 3' end
- if corr_prob.has_key(pos):
+ if pos in corr_prob:
p5_corr = corr_prob[pos][subs]
# correction from 5' end
else:
p5_corr = 0
- if corr_prob.has_key(back_pos):
+ if back_pos in corr_prob:
p3_corr = corr_prob[back_pos][subs]
# correction from 3' end
else:
@@ -104,7 +104,7 @@ def corr_this_base(corr_prob, nt_seq, nt
def initialize_subs():
"""Initialize a substitution table, to track the expected substitution counts"""
- per_qual = dict(zip(range(0,130),[0]*130))
+ per_qual = dict(list(zip(list(range(0,130)),[0]*130)))
subs = {"CT-before":per_qual.copy(),\
"TC-before":per_qual.copy(),\
"GA-before":per_qual.copy(),\
@@ -164,7 +164,7 @@ def qual_summary_subs(subs):
for qv in subs[i]:
if qv >= lv :
key = i+"-Q"+str(lv)
- if subs.has_key(key):
+ if key in subs:
subs[key] += subs[i][qv]
else:
subs[key] = subs[i][qv]
@@ -174,32 +174,32 @@ def print_subs(subs):
print("\tThe expected substition frequencies before and after scaling using the scaled qualities as probalities:")
if subs["C"]!=0:
# the special case of no substitutions
- print("\tCT\t"+str(subs["CT-pvals_before"]/subs["C"])+"\t\t"+str(subs["CT-pvals"]/subs["C"]))
+ print(("\tCT\t"+str(subs["CT-pvals_before"]/subs["C"])+"\t\t"+str(subs["CT-pvals"]/subs["C"])))
else:
print("\tCT\tNA\t\tNA")
if subs["T"]!=0:
- print("\tTC\t"+str(subs["TC-pvals"]/subs["T"])+"\t\t"+str(subs["TC-pvals"]/subs["T"]))
+ print(("\tTC\t"+str(subs["TC-pvals"]/subs["T"])+"\t\t"+str(subs["TC-pvals"]/subs["T"])))
else:
print("\tTC\tNA\t\tNA")
if subs["G"]!=0:
- print("\tGA\t"+str(subs["GA-pvals_before"]/subs["G"])+"\t\t"+str(subs["GA-pvals"]/subs["G"]))
+ print(("\tGA\t"+str(subs["GA-pvals_before"]/subs["G"])+"\t\t"+str(subs["GA-pvals"]/subs["G"])))
else:
print("\tGA\tNA\t\tNA")
if subs["A"]!=0:
- print("\tAG\t"+str(subs["AG-pvals"]/subs["A"])+"\t\t"+str(subs["AG-pvals"]/subs["A"]))
+ print(("\tAG\t"+str(subs["AG-pvals"]/subs["A"])+"\t\t"+str(subs["AG-pvals"]/subs["A"])))
else:
print("\tAG\tNA\t\tNA")
print("\tQuality metrics before and after scaling")
- print("\tCT-Q0 \t"+str(subs["CT-before-Q0"])+"\t\t"+str(subs["CT-after-Q0"]))
- print("\tCT-Q10 \t"+str(subs["CT-before-Q10"])+"\t\t"+str(subs["CT-after-Q10"]))
- print("\tCT-Q20 \t"+str(subs["CT-before-Q20"])+"\t\t"+str(subs["CT-after-Q20"]))
- print("\tCT-Q30 \t"+str(subs["CT-before-Q30"])+"\t\t"+str(subs["CT-after-Q30"]))
- print("\tCT-Q40 \t"+str(subs["CT-before-Q40"])+"\t\t"+str(subs["CT-after-Q40"]))
- print("\tGA-Q0 \t"+str(subs["GA-before-Q0"])+"\t\t"+str(subs["GA-after-Q0"]))
- print("\tGA-Q10 \t"+str(subs["GA-before-Q10"])+"\t\t"+str(subs["GA-after-Q10"]))
- print("\tGA-Q20 \t"+str(subs["GA-before-Q20"])+"\t\t"+str(subs["GA-after-Q20"]))
- print("\tGA-Q30 \t"+str(subs["GA-before-Q30"])+"\t\t"+str(subs["GA-after-Q30"]))
- print("\tGA-Q40 \t"+str(subs["GA-before-Q40"])+"\t\t"+str(subs["GA-after-Q40"]))
+ print(("\tCT-Q0 \t"+str(subs["CT-before-Q0"])+"\t\t"+str(subs["CT-after-Q0"])))
+ print(("\tCT-Q10 \t"+str(subs["CT-before-Q10"])+"\t\t"+str(subs["CT-after-Q10"])))
+ print(("\tCT-Q20 \t"+str(subs["CT-before-Q20"])+"\t\t"+str(subs["CT-after-Q20"])))
+ print(("\tCT-Q30 \t"+str(subs["CT-before-Q30"])+"\t\t"+str(subs["CT-after-Q30"])))
+ print(("\tCT-Q40 \t"+str(subs["CT-before-Q40"])+"\t\t"+str(subs["CT-after-Q40"])))
+ print(("\tGA-Q0 \t"+str(subs["GA-before-Q0"])+"\t\t"+str(subs["GA-after-Q0"])))
+ print(("\tGA-Q10 \t"+str(subs["GA-before-Q10"])+"\t\t"+str(subs["GA-after-Q10"])))
+ print(("\tGA-Q20 \t"+str(subs["GA-before-Q20"])+"\t\t"+str(subs["GA-after-Q20"])))
+ print(("\tGA-Q30 \t"+str(subs["GA-before-Q30"])+"\t\t"+str(subs["GA-after-Q30"])))
+ print(("\tGA-Q40 \t"+str(subs["GA-before-Q40"])+"\t\t"+str(subs["GA-after-Q40"])))
def rescale_qual_read(bam, read, ref, corr_prob,subs, debug = False,direction="both"):
@@ -237,7 +237,7 @@ def rescale_qual_read(bam, read, ref, co
new_qual = [-100]*length_read
pos_on_read = 0
number_of_rescaled_bases = 0.0
- for (i, nt_seq, nt_ref, nt_qual) in itertools.izip(xrange(length_align), seq, refseq, qual):
+ for (i, nt_seq, nt_ref, nt_qual) in zip(range(length_align), seq, refseq, qual):
# rescale the quality according to the triplet position,
# pair of the reference and the sequence
if ((nt_seq == "T" and nt_ref =="C") or (nt_seq == "A" and nt_ref =="G")):
--- a/mapdamage/rescale_test.py
+++ b/mapdamage/rescale_test.py
@@ -1,6 +1,9 @@
import unittest
import optparse
-import rescale
+try:
+ from . import rescale
+except:
+ from mapdamage import rescale
import pysam
import filecmp
--- a/mapdamage/rscript.py
+++ b/mapdamage/rscript.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/python3
import os
import subprocess
@@ -35,11 +35,11 @@ def plot(opt):
script = construct_path("mapDamage.R")
call = ["Rscript", script, fcomp, title, opt.refplot, fmut, opt.readplot, \
opt.ymax, opt.folder, opt.title, __version__]
- code = subprocess.call(map(str, call))
+ code = subprocess.call(list(map(str, call)))
if code == 0:
if not opt.quiet:
- print("pdf %s generated" % title)
+ print(("pdf %s generated" % title))
return 0
else:
print("Error: plotting with R failed")
@@ -58,7 +58,7 @@ def opt_plots(opt):
script = construct_path("lengths.R")
call = ["Rscript", script, flength, output, fmut, opt.length, \
opt.title, __version__, bool_to_int(opt.quiet)]
- code = subprocess.call(map(str, call))
+ code = subprocess.call(list(map(str, call)))
if code == 0:
return 0
else:
@@ -90,11 +90,11 @@ def check_R_lib():
if len(missing_lib) > 1:
# Grammar Nazi has arrived
last_ele = missing_lib.pop()
- print ("Missing the following R libraries '" + "', '".join(missing_lib) \
- + "' and '" + last_ele + "'")
+ print(("Missing the following R libraries '" + "', '".join(missing_lib) \
+ + "' and '" + last_ele + "'"))
return 1
elif len(missing_lib) == 1:
- print ("Missing the following R library "+missing_lib[0])
+ print(("Missing the following R library "+missing_lib[0]))
return 1
else :
# No missing libraries
--- a/mapdamage/seq.py
+++ b/mapdamage/seq.py
@@ -1,9 +1,9 @@
-#!/usr/bin/env python
+#!/usr/bin/python3
import sys
import string
# from Martin Kircher, to complement DNA
-TABLE = string.maketrans('TGCAMRWSYKVHDBtgcamrwsykvhdb', \
+TABLE = str.maketrans('TGCAMRWSYKVHDBtgcamrwsykvhdb', \
'ACGTKYWSRMBDHVacgtkywsrmbdhv')
LETTERS = ("A", "C", "G", "T")
--- a/mapdamage/tables.py
+++ b/mapdamage/tables.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/python3
import os
import collections
@@ -16,7 +16,7 @@ def initialize_mut(ref, length):
for std in ('+','-'):
tab_std = tab_end[std] = {}
for mut in mapdamage.seq.HEADER:
- tab_std[mut] = dict.fromkeys(xrange(length), 0)
+ tab_std[mut] = dict.fromkeys(range(length), 0)
return tab
@@ -26,8 +26,8 @@ def print_mut(mut, opt, out):
def initialize_comp(ref, around, length):
- keys = {"3p" : range(-length, 0) + range(1, around + 1),
- "5p" : range(-around, 0) + range(1, length + 1)}
+ keys = {"3p" : list(range(-length, 0)) + list(range(1, around + 1)),
+ "5p" : list(range(-around, 0)) + list(range(1, length + 1))}
tab = {}
for contig in ref:
@@ -74,8 +74,8 @@ def check_table_and_warn_if_dmg_freq_is_
total = 0.0
for filename in ("5pCtoT_freq.txt", "3pGtoA_freq.txt"):
if not os.path.exists(folder+"/"+filename):
- print("Error: Required table has not been created ('%s'), bayesian computation cannot be performed" \
- % filename)
+ print(("Error: Required table has not been created ('%s'), bayesian computation cannot be performed" \
+ % filename))
return True
with open(os.path.join(folder, filename)) as handle:
@@ -85,12 +85,12 @@ def check_table_and_warn_if_dmg_freq_is_
total += float(freq[1])
break
else:
- print("Error: Could not find pos = 1 in table '%s', bayesian computation cannot be performed" \
- % filename)
+ print(("Error: Could not find pos = 1 in table '%s', bayesian computation cannot be performed" \
+ % filename))
return True
if total < 0.01:
- print("Warning: DNA damage levels are too low, the Bayesian computation should not be performed (%f < 0.01)\n" % total)
+ print(("Warning: DNA damage levels are too low, the Bayesian computation should not be performed (%f < 0.01)\n" % total))
return False
@@ -103,9 +103,9 @@ def _print_freq_table(table, columns, op
out.write("# Chr: reference from sam/bam header, End: from which termini of DNA sequences, Std: strand of reads\n")
out.write("Chr\tEnd\tStd\tPos\t%s\n" % ("\t".join(columns)))
- for (reference, ends) in sorted(table.iteritems()):
- for (end, strands) in sorted(ends.iteritems()):
- for (strand, subtable) in sorted(strands.iteritems()):
+ for (reference, ends) in sorted(table.items()):
+ for (end, strands) in sorted(ends.items()):
+ for (strand, subtable) in sorted(strands.items()):
subtable["Total"] = {}
for index in sorted(subtable[columns[0]]):
subtable["Total"][index] = sum(subtable[letter][index] for letter in mapdamage.seq.LETTERS)
--- a/mapdamage/version.py
+++ b/mapdamage/version.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/python3
try:
from mapdamage._version import __version__
except ImportError:
--- a/setup.py
+++ b/setup.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/python3
# -*- coding: utf-8 -*-
from distutils.core import setup
@@ -14,9 +14,9 @@ def setup_version():
try:
version = subprocess.check_output(("git", "describe", "--always", "--tags", "--dirty"))
with open(os.path.join("mapdamage", "_version.py"), "w") as handle:
- handle.write("#!/usr/bin/env python\n")
+ handle.write("#!/usr/bin/python3\n")
handle.write("__version__ = %r\n" % (version.strip(),))
- except (subprocess.CalledProcessError, OSError), error:
+ except (subprocess.CalledProcessError, OSError) as error:
raise SystemExit("Could not determine mapDamage version: %s" % (error,))
......@@ -4,11 +4,9 @@ Description: change calcSampleStats such as any NA and Nan's are
Last-Update: 2017-03-28
Bug: https://github.com/ginolhac/mapDamage/issues/18
Index: mapdamage/mapdamage/Rscripts/stats/function.R
===================================================================
--- mapdamage.orig/mapdamage/Rscripts/stats/function.R 2017-03-29 00:02:46.538470957 -0400
+++ mapdamage/mapdamage/Rscripts/stats/function.R 2017-03-29 00:11:09.141025051 -0400
@@ -595,8 +595,8 @@
--- a/mapdamage/Rscripts/stats/function.R
+++ b/mapdamage/Rscripts/stats/function.R
@@ -595,8 +595,8 @@ calcSampleStats <- function(da,X){
pos=da[,"Pos"],
mea=apply(X,1,mean),
med=apply(X,1,median),
......
use_debian_packaged_seqtk.patch
fix_quantile_na_rm.patch
2to3.patch
......@@ -27,23 +27,24 @@ Description: Use Debian packaged seqtk
def setup_version():
if not os.path.exists(".git"):
# Release version, no .git folder
@@ -40,16 +25,7 @@ class compileInstall(DistutilsInstall):
@@ -40,17 +25,8 @@ class compileInstall(DistutilsInstall):
def run(self):
self.record=""
setup_version()
- compile_seqtk()
DistutilsInstall.run(self)
- # fixing the permission problem of seqtk
# fixing the permission problem of seqtk
- files = self.get_outputs()
- for fi in files:
- if fi.endswith("seqtk/seqtk"):
- os.chmod(fi,0755)
- os.chmod(fi,0o755)
-
-
-
-
setup(
cmdclass={'install': compileInstall},
@@ -59,7 +35,7 @@ setup(
author='Aurélien Ginolhac, Mikkel Schubert, Hákon Jónsson',
author_email='MSchubert@snm.ku.dk, jonsson.hakon@gmail.com',
......@@ -55,7 +56,7 @@ Description: Use Debian packaged seqtk
license='LICENSE.txt',
--- a/bin/mapDamage
+++ b/bin/mapDamage
@@ -151,7 +151,7 @@ def main():
@@ -149,7 +149,7 @@ def main():
sys.path.insert(0,path_to_mm)
import mapdamage
......@@ -66,12 +67,12 @@ Description: Use Debian packaged seqtk
"been intalled properly or current user does not\n"
--- a/mapdamage/composition.py
+++ b/mapdamage/composition.py
@@ -35,7 +35,7 @@ def get_base_comp(filename,destination=F
@@ -33,7 +33,7 @@ def get_base_comp(filename,destination=F
Gets the basecomposition of all the sequences in filename
and returns the value to destination if given.
"""
- path_to_seqtk = mapdamage.rscript.construct_path("seqtk",folder="seqtk")
+ path_to_seqtk = "/usr/bin/seqtk"
- path_to_seqtk = mapdamage.rscript.construct_path("seqtk",folder="seqtk")
+ path_to_seqtk = "/usr/bin/seqtk"
bases = {"A":0,"C":0,"G":0,"T":0}
alp = ["A","C","G","T"]
try:
......@@ -33,20 +33,20 @@ draw.open.rect <- function(xleft, ybottom, xright, ytop, padding = 0) {
} else {
xpoints <- c(xright, xleft - padding, xleft - padding, xright)
}
lines(xpoints, c(ytop, ytop, ybottom, ybottom), col = "darkgrey")
}
plot.base.composition <- function(tbl, base, color, around, ylabels.at = c(), xlabels = FALSE) {
xcoords <- c(-around:-1, 1:around)
plot.axis <- function(yaxis.at) {
axis(side = yaxis.at, labels = (yaxis.at == ylabels.at), line = 0, las = 2, cex.axis = 0.8)
if ((yaxis.at == 2) && (yaxis.at == ylabels.at)) {
mtext("Frequency", side = 2, line = 2.5, cex = 0.6)
}
if (xlabels) {
axis(side = 1, labels = xcoords, at = xcoords, las = 2, cex.axis = 0.6)
} else {
......@@ -64,7 +64,7 @@ plot.base.composition <- function(tbl, base, color, around, ylabels.at = c(), xl
ycoords <- NULL
for (i in xcoords) {
ycoords <- append(ycoords, mean(subtbl[(subtbl$Pos==i), base]/subtbl$Total[(subtbl$Pos==i)],na.rm=T))
ycoords <- append(ycoords, mean(subtbl[(subtbl$Pos==i), base]/subtbl$Total[(subtbl$Pos==i)],na.rm=T))
}
points(xcoords, ycoords, pch = 20, col = color, type = "b")
}
......@@ -80,7 +80,7 @@ plot.base.composition <- function(tbl, base, color, around, ylabels.at = c(), xl
plot.frequencies("3p")
plot.axis(4)
draw.open.rect(-around, 0, 0, 0.5)
par(mar = c(1, 2, 1, 2))
}
......@@ -103,8 +103,8 @@ calculate.mutation.table <- function(filename, length)
write.mutation.table <- function(tbl, end, mismatch, filename)
{
columns <- c("pos", sprintf("5p%s", mismatch))
tbl[is.na(tbl)] <- 0 # Python doesn't like NAs
columns <- c("pos", sprintf("%s%s", end, mismatch))
tbl[is.na(tbl)] <- 0 # Python doesn't like NAs
write.table(tbl[tbl$End == end, c("Pos", mismatch)],
row.names = FALSE, col.names = columns,
sep = "\t", quote = FALSE,
......@@ -117,12 +117,12 @@ plot.mutations <- function(end, axis.at, start.i, end.i, modifier) {
subtable <- tbl[tbl$End == end, c("Pos", mismatches)]
rates <- rowSums(subtable) - subtable$Pos
subtable <- aggregate(list(Rate = rates), list(Pos = subtable$Pos), sum)
lines(subtable$Pos * modifier, subtable$Rate,
xlim = c(1, OPT.LENGTH), ylim = c(0, OPT.YMAX),
col = color, lwd = width)
}
plot(NA, xlim = c(start.i, end.i), ylim=c(0, OPT.YMAX), col="grey", lwd = 1, type = "l", xlab = "", ylab = "", axes = FALSE)
axis(side = 1, labels = start.i:end.i, at = start.i:end.i, las = 2, cex.axis = 0.8)
axis(side = axis.at, labels = TRUE, las = 2, cex.axis = 0.8)
......@@ -130,11 +130,11 @@ plot.mutations <- function(end, axis.at, start.i, end.i, modifier) {
mtext("Frequency", side = 2, line = 2.5, cex = 0.6)
}
draw.open.rect(start.i, OPT.YMAX, end.i, -0.01, padding = 0.5)
for (mismatch in MISMATCHES) {
do.plot(mut, end, modifier, mismatch, "grey", 1)
}
do.plot(mut, end, modifier, CLIPPING, "orange", 1)
do.plot(mut, end, modifier, DELETIONS, "green", 1)
do.plot(mut, end, modifier, INSERTIONS, "purple", 1)
......
#Various useful functions
getPmat <- function(tmu,tv_ti_ratio,acgt){
#Returns the evolutionary substitution matrix
#Returns the evolutionary substitution matrix
if (sum(acgt>=1)!=0 || sum(acgt<=0)!=0){
write("The ACGT frequencies must be in the range 0 to 1",stderr())
stop()
stop()
}
if (all.equal(sum(acgt),1)!=TRUE){
write("The ACGT frequencies do not sum to 1",stderr())
stop()
stop()
}
if (tv_ti_ratio<=0){
write("The transversion and transtition ratio cannot go under 0",stderr())
stop()
stop()
}
#Returns the substitution probability matrix.
if (identical(tv_ti_ratio,1) && identical(acgt,c(.25,.25,.25,.25))){
......@@ -23,7 +23,7 @@ getPmat <- function(tmu,tv_ti_ratio,acgt){
E <- diag(exp(r$values))
# Q The eigen vector change of basis
# M -> M
# M -> M
#
# ^ ^
# |B^-1 |B
......@@ -32,8 +32,8 @@ getPmat <- function(tmu,tv_ti_ratio,acgt){
out <- solve(a=t(B),b= E %*% t(B))#Little trick to avoid numerical difficulties
rownames(out) <- c("A","C","G","T")
colnames(out) <- c("A","C","G","T")
return(out)
}
return(out)
}
}
jukesCantorPmat <- function(tmu){
......@@ -49,22 +49,22 @@ qmatHKY85 <- function(tmu,tv_ti,acgt){
# | pi_a*tv_ti pi_c pi_g * tv_ti sum_4 |
#returns this matrix
#This could be replaced with the analytical formulas for the substitutions probabilities.
Qmat <- matrix(rep(acgt,4),ncol=4,byrow=TRUE)
diag(Qmat) <- 0
#Adjusting the transversions versus transitions
#- * - *
#* - * -
#- * - *
#* - * -
#- * - *
#* - * -
#- * - *
#* - * -
Qmat[1,2] <- tv_ti*Qmat[1,2]
Qmat[1,4] <- tv_ti*Qmat[1,4]
Qmat[2,1] <- tv_ti*Qmat[2,1]
Qmat[2,3] <- tv_ti*Qmat[2,3]
Qmat[3,2] <- tv_ti*Qmat[3,2]
Qmat[3,4] <- tv_ti*Qmat[3,4]
......@@ -73,12 +73,12 @@ qmatHKY85 <- function(tmu,tv_ti,acgt){
diag(Qmat) <- -apply(Qmat,1,sum)
Qmat <- tmu * Qmat
Qmat <- tmu * Qmat
return(Qmat)
}
metroDesc <- function(lpr,lol){
# the logic in the Metropolis-Hastings step
# the logic in the Metropolis-Hastings step
stopifnot(!is.na(lpr))
stopifnot(!is.na(lol))
if (log(runif(1))<lpr-lol){
......@@ -89,7 +89,7 @@ metroDesc <- function(lpr,lol){
}
genOverHang <- function(la){
#Currently not used in the main code but could used to generate
#Currently not used in the main code but could used to generate
#overhangs like Philip
r <- runif(1)
i <- -1
......@@ -143,7 +143,7 @@ seqProbVecLambda <- function(lambda,lambda_disp,m,fo_only=NA,re_only=NA){
}
}
#The following is an MC simulation code to mimic the
#The following is an MC simulation code to mimic the
#nick frequency part in the model from Philip
seqProbVecNuWithLengths<- cxxfunction(methods::signature(
I_la="numeric",
......@@ -162,7 +162,7 @@ seqProbVecNuWithLengths<- cxxfunction(methods::signature(
double r = ((double) rand() / (RAND_MAX));
int i = -1;
double p = 0;
double term = -500;
double term = -500;
while (p<r){
if (i==-1){
term = 1;
......@@ -175,7 +175,7 @@ seqProbVecNuWithLengths<- cxxfunction(methods::signature(
return(i);
}
',body= '
srand(time(0));
srand(time(0));
Rcpp::NumericVector la(I_la);
Rcpp::NumericVector la_disp(I_la_disp);
Rcpp::NumericVector nu(I_nu);
......@@ -199,7 +199,7 @@ seqProbVecNuWithLengths<- cxxfunction(methods::signature(
double left_o_hang = genOverHang(la[0],la_disp[0]);
double right_o_hang = genOverHang(la[0],la_disp[0]);
double o_hang = left_o_hang+right_o_hang;
//
//
if (o_hang>=les(i)){
//Single stranded sequence
for (int j = 0; j <les(i);j++){
......@@ -211,8 +211,8 @@ seqProbVecNuWithLengths<- cxxfunction(methods::signature(
for (int j = 0; j <(les[i]-right_o_hang);j++){
output(j) = output(j)+1;
}
//The right overhang is always G>A for the double stranded but
//Here we will make the assumption that the pattern is symmetric
//The right overhang is always G>A for the double stranded but
//Here we will make the assumption that the pattern is symmetric
//for practical reasons We can\'t do that ....
}else {
Rcpp::NumericVector sa = floor(runif(1,0,les[i]-o_hang))+left_o_hang;
......@@ -258,7 +258,7 @@ pDam <- function(th,ded,des,la,nu,lin){
}
logLikFunOneBaseSlow <- function(Gen,S,Theta,deltad,deltas,laVec,nuVec,m,lin){
#Calculates the log likelihood using R, a C++ version is available
#Calculates the log likelihood using R, a C++ version is available
ll <- 0
for (i in 1:length(laVec)){
#Get the damage probabilities
......@@ -268,7 +268,7 @@ logLikFunOneBaseSlow <- function(Gen,S,Theta,deltad,deltas,laVec,nuVec,m,lin){
return(ll)
}
#The same logic as in logLikFunOneBaseSlow except using a compiled code
#The same logic as in logLikFunOneBaseSlow except using a compiled code
#to do the hard work
logLikFunOneBaseFast <- cxxfunction(methods::signature(
I_Gen="numeric",
......@@ -333,7 +333,7 @@ return(ret);
logLikAll <- function(dat,Theta,deltad,deltas,laVec,nuVec,m){
#Calculates the logLikelihood for all the bases by calling
#Calculates the logLikelihood for all the bases by calling
# logLikFunOneBaseFast for each base
if (deltad<0 || deltad>1 || deltas<0 || deltas>1 ){
return(-Inf)
......@@ -341,8 +341,8 @@ logLikAll <- function(dat,Theta,deltad,deltas,laVec,nuVec,m){
#A,C,G and T
deb <- 0
Asub <- dat[,"A.C"]+dat[,"A.G"]+dat[,"A.T"]
Asub <- dat[,"A.C"]+dat[,"A.G"]+dat[,"A.T"]
ALL <- logLikFunOneBaseFast(dat[,"A"],cbind(dat[,"A"]-Asub,dat[,"A.C"],dat[,"A.G"],dat[,"A.T"]),Theta,deltad,deltas,laVec,nuVec,m,1)
if (deb){
ALLSlow <- logLikFunOneBaseSlow(dat[,"A"],cbind(dat[,"A"]-Asub,dat[,"A.C"],dat[,"A.G"],dat[,"A.T"]),Theta,deltad,deltas,laVec,nuVec,m,1)
......@@ -355,7 +355,7 @@ logLikAll <- function(dat,Theta,deltad,deltas,laVec,nuVec,m){
CLLSlow <- logLikFunOneBaseSlow(dat[,"C"],cbind(dat[,"C.A"],dat[,"C"]-Csub,dat[,"C.G"],dat[,"C.T"]),Theta,deltad,deltas,laVec,nuVec,m,2)
stopifnot(all.equal(CLL,CLLSlow))
}
Gsub <- dat[,"G.A"]+dat[,"G.C"]+dat[,"G.T"]
GLL <- logLikFunOneBaseFast(dat[,"G"],cbind(dat[,"G.A"],dat[,"G.C"],dat[,"G"]-Gsub,dat[,"G.T"]),Theta,deltad,deltas,laVec,nuVec,m,3)
......@@ -363,7 +363,7 @@ logLikAll <- function(dat,Theta,deltad,deltas,laVec,nuVec,m){
GLLSlow <- logLikFunOneBaseSlow(dat[,"G"],cbind(dat[,"G.A"],dat[,"G.C"],dat[,"G"]-Gsub,dat[,"G.T"]),Theta,deltad,deltas,laVec,nuVec,m,3)
stopifnot(all.equal(CLL,CLLSlow))
}
Tsub <- dat[,"T.A"]+dat[,"T.C"]+dat[,"T.G"]
TLL <- logLikFunOneBaseFast(dat[,"T"],cbind(dat[,"T.A"],dat[,"T.C"],dat[,"T.G"],dat[,"T"]-Tsub),Theta,deltad,deltas,laVec,nuVec,m,4)
if (deb){
......@@ -375,7 +375,7 @@ logLikAll <- function(dat,Theta,deltad,deltas,laVec,nuVec,m){
getParams <- function(cp){
#Utility function nice to update the MCMC iterations matrix
#Utility function nice to update the MCMC iterations matrix
return(c(cp$Theta,cp$Rho,cp$DeltaD,cp$DeltaS,cp$Lambda,cp$LambdaRight,cp$LambdaDisp,cp$Nu))
}
......@@ -385,7 +385,7 @@ plotTrace<- function(dat,main,k=111){
}
plotEverything <- function(mcmcOut,hi=0,pl,thin=100){
#Plots the MCMC traceplot in the form of a running median and
#Plots the MCMC traceplot in the form of a running median and
#histogram of the MCMC iterations
if (sum(c(cu_pa$same_overhangs==FALSE,
cu_pa$fix_disp==FALSE,
......@@ -438,7 +438,7 @@ plotEverything <- function(mcmcOut,hi=0,pl,thin=100){
}
accRat <- function(da){
#A rough measure of the acceptance ratio
#A rough measure of the acceptance ratio
return(length(unique(da))/length(da))
}
......@@ -467,7 +467,7 @@ adjustPropVar <- function(mcmc,propVar){
}
runGibbs <- function(cu_pa,iter){
#Sampling over the conditional posterior distribution
#Sampling over the conditional posterior distribution
#for the parameters.
esti <- matrix(nrow=iter,ncol=9)
colnames(esti) <- c("Theta","Rho","DeltaD","DeltaS","Lambda","LambdaRight","LambdaDisp","Nu","LogLik")
......@@ -492,7 +492,7 @@ runGibbs <- function(cu_pa,iter){
#Update the nu parameter by via MC estimation
cu_pa<-updateNu(cu_pa)
}
esti[i,c(1:8)] <- getParams(cu_pa)
esti[i,c(1:8)] <- getParams(cu_pa)
esti[i,"LogLik"] <- logLikAll(cu_pa$dat,cu_pa$ThetaMat,cu_pa$DeltaD,cu_pa$DeltaS,cu_pa$laVec,cu_pa$nuVec,cu_pa$m)
if (! (i %% 1000) && cu_pa$verbose){
cat("MCMC-Iter\t",i,"\t",esti[i,"LogLik"],"\n")
......@@ -504,7 +504,7 @@ runGibbs <- function(cu_pa,iter){
simPredCheck <- function(da,output){
#Simulates one draw from the posterior predictive distribution
#and the probability of a C>T substitution because of a cytosine
#and the probability of a C>T substitution because of a cytosine
#demnation.
bases <- da[,c("A","C","G","T")]
#Constructing the lambda vector
......@@ -541,12 +541,12 @@ simPredCheck <- function(da,output){
output$cu_pa$mLe,
output$cu_pa$forward_only,
output$cu_pa$nuSamples,
output$cu_pa$ds_protocol)
output$cu_pa$ds_protocol)
nuVec <- c(nuVec,rev(1-nuVec))
}else {
nuVec <- output$cu_pa$nuVec
}
#Sample the other parameters
des <- sample(output$out[,"DeltaS"],1)
ded <- sample(output$out[,"DeltaD"],1)
......@@ -562,10 +562,10 @@ simPredCheck <- function(da,output){
subs <- matrix(NA,nrow=nrow(output$cu_pa$dat),ncol=4+length(coln))
colnames(subs) <- c("A","C","G","T",coln)
#
damProb <- rep(NA,nrow(output$cu_pa$dat))
damProbGA <- damProb
damProb <- rep(NA,nrow(output$cu_pa$dat))
damProbGA <- damProb
for (i in 1:nrow(output$cu_pa$dat)){
#Construct the site specific probabilities
#Construct the site specific probabilities
pct <- nuVec[i]*(laVec[i]*des+ded*(1-laVec[i]))
pga <- (1-nuVec[i])*(laVec[i]*des+ded*(1-laVec[i]))
pDamMat <- matrix(c(
......@@ -574,11 +574,11 @@ simPredCheck <- function(da,output){
pga,0,1-pga,0,
0,0,0,1
),nrow=4,byrow=TRUE)
ThetapDam <- pDamMat %*% pmat
ThetapDam <- pDamMat %*% pmat
#Calculate the probability C.T due to cytosine demanation
damProb[i] <- ptransCC*pct/(ptransCC*pct+ptransCT)
#Do not forget the reverse complement
damProbGA[i] <- ptransGG*pga/(ptransGG*pga+ptransGA)
damProb[i] <- ptransCC*pct/(ptransCC*pct+ptransCT)
#Do not forget the reverse complement
damProbGA[i] <- ptransGG*pga/(ptransGG*pga+ptransGA)
#Then draw from a multinomial distribution
subs[i,c("A.C","A.G","A.T")] <- t(rmultinom(1,output$cu_pa$dat[i,"A"],ThetapDam[1,]))[-1]/output$cu_pa$dat[i,"A"]
subs[i,c("C.A","C.G","C.T")] <- t(rmultinom(1,output$cu_pa$dat[i,"C"],ThetapDam[2,]))[-2]/output$cu_pa$dat[i,"C"]
......@@ -602,7 +602,7 @@ calcSampleStats <- function(da,X){
postPredCheck <- function(da,output,samples=10000){
#Plots the 95% posterior predictive intervals with the data as lines.
#Returns the site specific probability of a C>T or G>A substitution
#Returns the site specific probability of a C>T or G>A substitution
#because of a cytosine demnation.
CTs <- matrix(NA,nrow=nrow(da),ncol=samples)
GAs <- matrix(NA,nrow=nrow(da),ncol=samples)
......@@ -635,28 +635,28 @@ postPredCheck <- function(da,output,samples=10000){
}
CTsStats <- calcSampleStats(da,CTs)
GAsStats <- calcSampleStats(da,GAs)
REsStats <- calcSampleStats(da,REs)
REsStats <- calcSampleStats(da,REs)
#Plotting the posterior predictive intervals
p <- ggplot()+
geom_point(aes(x,mea,colour="C->T",aes_string="subs"),data=CTsStats)+
geom_point(aes(x,mea,colour="C->T"),data=CTsStats)+
geom_point(aes(x,mea,colour="G->A"),data=GAsStats)+
geom_point(aes(x,mea,colour="Others"),data=REsStats)+
geom_errorbar(aes(x=x,y=med,ymin=loCI,ymax=hiCI,color="C->T"),data=CTsStats)+
geom_errorbar(aes(x=x,y=med,ymin=loCI,ymax=hiCI,color="G->A"),data=GAsStats)+
geom_errorbar(aes(x=x,y=med,ymin=loCI,ymax=hiCI,color="Others"),data=REsStats)+
geom_errorbar(aes(x=x,ymin=loCI,ymax=hiCI,color="C->T"),data=CTsStats)+
geom_errorbar(aes(x=x,ymin=loCI,ymax=hiCI,color="G->A"),data=GAsStats)+
geom_errorbar(aes(x=x,ymin=loCI,ymax=hiCI,color="Others"),data=REsStats)+
geom_line(aes(oneBased,C.T/C),color="red",data=data.frame(da))+
geom_line(aes(oneBased,G.A/G),color="green",data=data.frame(da))+
geom_line(aes(oneBased,((A.C+A.G+A.T)/A+(C.A+C.G)/C+(G.C+G.T)/G+(T.A+T.C+T.G)/T)/10),color="blue",data=data.frame(da))+
ylab("Substitution rate")+
xlab("Relative position")+
scale_x_continuous(breaks=bres,labels=labs)+
labs(colour = "Subs. type")+
ggtitle("Posterior prediction intervals")
labs(y = "Substitution rate",
x = "Relative position",
colour = "Subs. type",
title = "Posterior prediction intervals")+
scale_x_continuous(breaks=bres,labels=labs)
if (output$cu_pa$use_bw_theme){
p <- p+theme_bw()
}
plot(p)
#The correcting probabilities
#The correcting probabilities
coProbs <- cbind(da[,"Pos"],apply(C2TProbs,1,mean),apply(G2AProbs,1,mean))
colnames(coProbs) <- c("Position","C.T","G.A")
return(coProbs)
......@@ -686,9 +686,8 @@ writeMCMC <- function(out,filename){
qua <- apply(out$out[,parameters],2,quantile,seq(from=0,to=1,by=.025))
acc <- apply(out$out[,parameters],2,accRat)
summStat <- rbind(mea,std,acc,qua)
rownames(summStat)[1] <- "Mean"
rownames(summStat)[2] <- "Std."
rownames(summStat)[3] <- "Acceptance ratio"
rownames(summStat)[1] <- "Mean"
rownames(summStat)[2] <- "Std."
rownames(summStat)[3] <- "Acceptance ratio"
write.csv(summStat,paste(filename,"_summ_stat.csv",sep=""))
}
#The main work flow of the package
#Load the libraries
suppressMessages(library(inline)) #Already checked the libraries
suppressMessages(library(ggplot2)) #thus ignoring any messages from them
suppressMessages(library(Rcpp))
suppressMessages(library(gam))
suppressMessages(library(RcppGSL))
suppressPackageStartupMessages(library(inline)) #Already checked the libraries
suppressPackageStartupMessages(library(ggplot2)) #thus ignoring any messages from them
suppressPackageStartupMessages(library(Rcpp))
suppressPackageStartupMessages(library(gam))
suppressPackageStartupMessages(library(RcppGSL))
#Miscellaneous functions
#Miscellaneous functions
source(paste(path_to_mapDamage_stats,"function.R",sep=""))
#Prior and proposal distributions for the parameters
......@@ -18,7 +18,7 @@ source(paste(path_to_mapDamage_stats,"postConditonal.R",sep=""))
#functions for the grid search
source(paste(path_to_mapDamage_stats,"start.R",sep=""))
#functions for loading the data
#functions for loading the data
source(paste(path_to_mapDamage_stats,"data.R",sep=""))
......@@ -39,14 +39,14 @@ if (forward_only && reverse_only){
stop()
}
fow_dat <-readMapDamData(path_to_dat)
fow_dat <-readMapDamData(path_to_dat)
rev_dat <-readMapDamData(path_to_dat,forward=0)
if (forward_only){
#Taking only the forward part
dat <- fow_dat[1:sub_length,]
dat <- fow_dat[1:sub_length,]
}else if (reverse_only){
#Taking only the reverse part
dat <- rev_dat[sub_length:1,]
dat <- rev_dat[sub_length:1,]
}else {
#Using both ends
dat <- joinFowAndRev(fow_dat,rev_dat,sub_length)
......@@ -107,7 +107,7 @@ cu_pa$ThetaMat <- getPmat(cu_pa$Theta,cu_pa$Rho,cu_pa$acgt)
#######################################################
#
# Setting the lambda vector
# Setting the lambda vector
#
#######################################################
......@@ -127,17 +127,17 @@ if (!cu_pa$same_overhangs){
#######################################################
#
# Setting the nu vector
# Setting the nu vector
#
#######################################################
if (!cu_pa$ds_protocol & cu_pa$nuSamples!=0){
#Using the single stranded protocol with nu samples which makes
#Using the single stranded protocol with nu samples which makes
#no sense
write("Silly to use the MC estimation for nu_i if using the single stranded protocol", stderr())
stop()
}else if (cu_pa$nuSamples!=0 & cu_pa$fix_nu) {
#Using the fixed nu parameter with nu samples which makes no sense
#Using the fixed nu parameter with nu samples which makes no sense
write("Silly to use the MC estimation for nu_i and use fix_nu ", stderr())
stop()
}else if (cu_pa$nuSamples!=0){
......@@ -164,7 +164,7 @@ if (!cu_pa$ds_protocol & cu_pa$nuSamples!=0){
cu_pa$nuVec <- c(rep(1,nrow(dat)/2),rep(0,nrow(dat)/2))
}
}else {
#This is for a non linear nick frequency, assumes the G>T and G>A are
#This is for a non linear nick frequency, assumes the G>T and G>A are
#mostly due to DNA damage patterns do not use for low damage datasets
te<-(dat[,"C.T"]/dat[,"C"])/(dat[,"G.A"]/dat[,"G"]+dat[,"C.T"]/dat[,"C"])
if (sum(is.na(te) )!=0 ){
......@@ -177,7 +177,7 @@ if (!cu_pa$ds_protocol & cu_pa$nuSamples!=0){
cu_pa$nuVec <- c(rep(1,nrow(dat)/2),rep(0,nrow(dat)/2))
}
} else {
#The substitutes seem to be okay estimate the nick frequency using GAM
#The substitutes seem to be okay estimate the nick frequency using GAM
if (cu_pa$forward_only || cu_pa$reverse_only){
if (cu_pa$use_raw_nick_freq){
#Use the frequency
......@@ -202,14 +202,14 @@ if (!cu_pa$ds_protocol & cu_pa$nuSamples!=0){
}
#######################################################
#
# Finding an "optimal" starting place
# Finding an "optimal" starting place
#
#######################################################
if (grid_iter!=0){
#Start at random places and optimize the likelihood function
cu_pa <- gridSearch(cu_pa,grid_iter)
}
cu_pa <- gridSearch(cu_pa,grid_iter)
}
#Calculate the log likelihood in the beginning
if (cu_pa$same_overhangs){
......@@ -267,11 +267,11 @@ if (out_file_base!=""){
pdf(paste(out_file_base,"_MCMC_hist.pdf",sep=""))
plotEverything(mcmcOut,hi=1)
dev.off()
#Posterior predictive plot
#Posterior predictive plot
pdf(paste(out_file_base,"_MCMC_post_pred.pdf",sep=""))
siteProb <- postPredCheck(dat,mcmcOut)
dev.off()
#Write correcting probabilities
#Write correcting probabilities
write.csv(siteProb,paste(out_file_base,"_MCMC_correct_prob.csv",sep=""))
} else {
cat("Plotting\n")
......
#!/usr/bin/env python
#!/usr/bin/env python3
import mapdamage.parseoptions
import mapdamage.seq
......
#!/usr/bin/env python
import itertools
import mapdamage
......@@ -16,11 +14,11 @@ import mapdamage
#X 8 sequence mismatch
def get_coordinates(read):
def get_coordinates(read):
""" return external coordinates of aligned read bases """
fivep = read.aend if read.is_reverse else read.pos
threep = read.pos if read.is_reverse else read.aend
return fivep, threep
......@@ -41,9 +39,9 @@ def get_around(coord, chrom, reflengths, length, ref):
def align(cigarlist, seq, ref):
""" insert gaps according to the cigar string
deletion: gaps to be inserted into read sequences,
insertions: gaps to be inserted into reference sequence """
""" insert gaps according to the cigar string
deletion: gaps to be inserted into read sequences,
insertions: gaps to be inserted into reference sequence """
lref = list(ref)
for nbr, idx in parse_cigar(cigarlist, 1):
lref[idx:idx] = ["-"] * nbr
......@@ -56,9 +54,9 @@ def align(cigarlist, seq, ref):
def align_with_qual(cigarlist, seq, qual, threshold, ref):
""" insert gaps according to the cigar string
deletion: gaps to be inserted into read sequences and qualities,
insertions: gaps to be inserted into reference sequence """
""" insert gaps according to the cigar string
deletion: gaps to be inserted into read sequences and qualities,
insertions: gaps to be inserted into reference sequence """
lref = list(ref)
for nbr, idx in parse_cigar(cigarlist, 1):
lref[idx:idx] = ["-"] * nbr
......@@ -84,7 +82,7 @@ def get_mis(read, seq, refseq, ref, length, tab, end):
std = '-' if read.is_reverse else '+'
subtable = tab[ref][end][std]
for (i, nt_seq, nt_ref) in itertools.izip(xrange(length), seq, refseq):
for (i, nt_seq, nt_ref) in zip(range(length), seq, refseq):
if (nt_seq in "ACGT-") and (nt_ref in "ACGT-"):
if nt_ref != "-":
# record base composition in the reference, only A, C, G, T
......@@ -106,7 +104,7 @@ def parse_cigar(cigarlist, ope):
for operation, length in cigarlist:
if operation == ope:
coordinate.append([length, tlength])
if operation in oplist:
if operation in oplist:
tlength += length
return coordinate
......@@ -127,8 +125,3 @@ def record_soft_clipping(read, tab, length):
end = '5p' if read.is_reverse else '3p'
update_table(end, strand, nbases)
#!/usr/bin/env python
import mapdamage
import itertools
import csv
......@@ -10,8 +8,8 @@ def count_ref_comp(read, chrom, before, after, comp):
""" record basae composition in external genomic regions """
std = '-' if read.is_reverse else '+'
_update_table(comp[chrom]['5p'][std], before, xrange(-len(before), 0))
_update_table(comp[chrom]['3p'][std], after, xrange(1, len(after) + 1))
_update_table(comp[chrom]['5p'][std], before, range(-len(before), 0))
_update_table(comp[chrom]['3p'][std], after, range(1, len(after) + 1))
def count_read_comp(read, chrom, length, comp):
......@@ -20,12 +18,12 @@ def count_read_comp(read, chrom, length, comp):
if read.is_reverse:
std, seq = '-', mapdamage.seq.revcomp(seq)
_update_table(comp[chrom]['5p'][std], seq, xrange(1, length + 1))
_update_table(comp[chrom]['3p'][std], reversed(seq), xrange(-1, - length - 1, -1))
_update_table(comp[chrom]['5p'][std], seq, range(1, length + 1))
_update_table(comp[chrom]['3p'][std], reversed(seq), range(-1, - length - 1, -1))
def _update_table(table, sequence, indices):
for (index, nt) in itertools.izip(indices, sequence):
for (index, nt) in zip(indices, sequence):
if nt in table:
table[nt][index] += 1
......@@ -35,7 +33,7 @@ def get_base_comp(filename,destination=False):
Gets the basecomposition of all the sequences in filename
and returns the value to destination if given.
"""
path_to_seqtk = mapdamage.rscript.construct_path("seqtk",folder="seqtk")
path_to_seqtk = mapdamage.rscript.construct_path("seqtk",folder="seqtk")
bases = {"A":0,"C":0,"G":0,"T":0}
alp = ["A","C","G","T"]
try:
......@@ -47,7 +45,7 @@ def get_base_comp(filename,destination=False):
bases["C"] = bases["C"] + int(tabs[3])
bases["G"] = bases["G"] + int(tabs[4])
bases["T"] = bases["T"] + int(tabs[5])
except (OSError, ValueError), error:
except (OSError, ValueError) as error:
sys.stderr.write("Error: Seqtk failed: %s\n" % (error,))
sys.exit(1)
# get the base frequencies
......@@ -75,7 +73,7 @@ def read_base_comp(filename):
lp = li.split()
if first_line:
header = lp
first_line = False
first_line = False
else:
body = lp
bases = {}
......
#!/usr/bin/env python
from optparse import OptionParser, OptionGroup, SUPPRESS_HELP
import os
import sys
......@@ -38,7 +36,7 @@ def check_py_version():
return None
def options():
def options():
parser = OptionParser("%prog [options] -i BAMfile -r reference.fasta\n\nUse option -h or --help for help", version=__version__, \
epilog="report bugs to aginolhac@snm.ku.dk, MSchubert@snm.ku.dk or jonsson.hakon@gmail.com")
......@@ -153,7 +151,7 @@ def options():
if not check_py_version():
return None
# if the user wants to check the R packages then do that before the option parsing
# if the user wants to check the R packages then do that before the option parsing
if options.check_R_packages:
if check_R_lib():
sys.exit(1)
......@@ -224,7 +222,7 @@ def options():
# check destination for rescaled bam
if not options.rescale_out and (options.rescale or options.rescale_only):
# if there are mulitiple bam files to rescale then pick first one as
# if there are mulitiple bam files to rescale then pick first one as
# the name of the rescaled file
if isinstance(options.filename,list):
basename = os.path.basename(options.filename[0])
......@@ -235,12 +233,12 @@ def options():
if os.path.isdir(options.folder):
if not options.quiet and not options.plot_only:
print("Warning, %s already exists" % options.folder)
print(("Warning, %s already exists" % options.folder))
if options.plot_only:
if not file_exist(options.folder+"/dnacomp.txt") or not file_exist(options.folder+"/misincorporation.txt"):
parser.error('folder %s is not a valid result folder' % options.folder)
else:
os.makedirs(options.folder, mode = 0750)
os.makedirs(options.folder, mode = 0o750)
if options.plot_only or options.stats_only or options.rescale_only:
sys.stderr.write("Error, %s does not exist while plot/stats/rescale only was used\n" % options.folder)
return None
......@@ -279,4 +277,3 @@ def options():
return options
......@@ -31,14 +31,14 @@ def get_corr_prob(folder, rescale_length_5p, rescale_length_3p):
fi_handle = csv.DictReader(open(full_path))
corr_prob = {}
for line in fi_handle:
if (corr_prob.has_key(line["Position"])):
if (line["Position"] in corr_prob):
sys.exit('This file has multiple position definitions %s, line %d: %s' % \
(folder, fi_handle.line_num, corr_prob[line["Position"]]))
else:
corr_prob[int(line["Position"])] = {'C.T':float(line["C.T"]), 'G.A':float(line["G.A"])}
# Exclude probabilities for positions outside of user-specified region
for key in corr_prob.keys():
for key in list(corr_prob.keys()):
if key < -rescale_length_3p or key > rescale_length_5p:
corr_prob.pop(key)
......@@ -75,13 +75,13 @@ def corr_this_base(corr_prob, nt_seq, nt_ref, pos, length,direction="both"):
back_pos = pos-length-1
# position from 3' end
if corr_prob.has_key(pos):
if pos in corr_prob:
p5_corr = corr_prob[pos][subs]
# correction from 5' end
else:
p5_corr = 0
if corr_prob.has_key(back_pos):
if back_pos in corr_prob:
p3_corr = corr_prob[back_pos][subs]
# correction from 3' end
else:
......@@ -104,7 +104,7 @@ def corr_this_base(corr_prob, nt_seq, nt_ref, pos, length,direction="both"):
def initialize_subs():
"""Initialize a substitution table, to track the expected substitution counts"""
per_qual = dict(zip(range(0,130),[0]*130))
per_qual = dict(list(zip(list(range(0,130)),[0]*130)))
subs = {"CT-before":per_qual.copy(),\
"TC-before":per_qual.copy(),\
"GA-before":per_qual.copy(),\
......@@ -164,7 +164,7 @@ def qual_summary_subs(subs):
for qv in subs[i]:
if qv >= lv :
key = i+"-Q"+str(lv)
if subs.has_key(key):
if key in subs:
subs[key] += subs[i][qv]
else:
subs[key] = subs[i][qv]
......@@ -174,32 +174,32 @@ def print_subs(subs):
print("\tThe expected substition frequencies before and after scaling using the scaled qualities as probalities:")
if subs["C"]!=0:
# the special case of no substitutions
print("\tCT\t"+str(subs["CT-pvals_before"]/subs["C"])+"\t\t"+str(subs["CT-pvals"]/subs["C"]))
print(("\tCT\t"+str(subs["CT-pvals_before"]/subs["C"])+"\t\t"+str(subs["CT-pvals"]/subs["C"])))
else:
print("\tCT\tNA\t\tNA")
if subs["T"]!=0:
print("\tTC\t"+str(subs["TC-pvals"]/subs["T"])+"\t\t"+str(subs["TC-pvals"]/subs["T"]))
print(("\tTC\t"+str(subs["TC-pvals"]/subs["T"])+"\t\t"+str(subs["TC-pvals"]/subs["T"])))
else:
print("\tTC\tNA\t\tNA")
if subs["G"]!=0:
print("\tGA\t"+str(subs["GA-pvals_before"]/subs["G"])+"\t\t"+str(subs["GA-pvals"]/subs["G"]))
print(("\tGA\t"+str(subs["GA-pvals_before"]/subs["G"])+"\t\t"+str(subs["GA-pvals"]/subs["G"])))
else:
print("\tGA\tNA\t\tNA")
if subs["A"]!=0:
print("\tAG\t"+str(subs["AG-pvals"]/subs["A"])+"\t\t"+str(subs["AG-pvals"]/subs["A"]))
print(("\tAG\t"+str(subs["AG-pvals"]/subs["A"])+"\t\t"+str(subs["AG-pvals"]/subs["A"])))
else:
print("\tAG\tNA\t\tNA")
print("\tQuality metrics before and after scaling")
print("\tCT-Q0 \t"+str(subs["CT-before-Q0"])+"\t\t"+str(subs["CT-after-Q0"]))
print("\tCT-Q10 \t"+str(subs["CT-before-Q10"])+"\t\t"+str(subs["CT-after-Q10"]))
print("\tCT-Q20 \t"+str(subs["CT-before-Q20"])+"\t\t"+str(subs["CT-after-Q20"]))
print("\tCT-Q30 \t"+str(subs["CT-before-Q30"])+"\t\t"+str(subs["CT-after-Q30"]))
print("\tCT-Q40 \t"+str(subs["CT-before-Q40"])+"\t\t"+str(subs["CT-after-Q40"]))
print("\tGA-Q0 \t"+str(subs["GA-before-Q0"])+"\t\t"+str(subs["GA-after-Q0"]))
print("\tGA-Q10 \t"+str(subs["GA-before-Q10"])+"\t\t"+str(subs["GA-after-Q10"]))
print("\tGA-Q20 \t"+str(subs["GA-before-Q20"])+"\t\t"+str(subs["GA-after-Q20"]))
print("\tGA-Q30 \t"+str(subs["GA-before-Q30"])+"\t\t"+str(subs["GA-after-Q30"]))
print("\tGA-Q40 \t"+str(subs["GA-before-Q40"])+"\t\t"+str(subs["GA-after-Q40"]))
print(("\tCT-Q0 \t"+str(subs["CT-before-Q0"])+"\t\t"+str(subs["CT-after-Q0"])))
print(("\tCT-Q10 \t"+str(subs["CT-before-Q10"])+"\t\t"+str(subs["CT-after-Q10"])))
print(("\tCT-Q20 \t"+str(subs["CT-before-Q20"])+"\t\t"+str(subs["CT-after-Q20"])))
print(("\tCT-Q30 \t"+str(subs["CT-before-Q30"])+"\t\t"+str(subs["CT-after-Q30"])))
print(("\tCT-Q40 \t"+str(subs["CT-before-Q40"])+"\t\t"+str(subs["CT-after-Q40"])))
print(("\tGA-Q0 \t"+str(subs["GA-before-Q0"])+"\t\t"+str(subs["GA-after-Q0"])))
print(("\tGA-Q10 \t"+str(subs["GA-before-Q10"])+"\t\t"+str(subs["GA-after-Q10"])))
print(("\tGA-Q20 \t"+str(subs["GA-before-Q20"])+"\t\t"+str(subs["GA-after-Q20"])))
print(("\tGA-Q30 \t"+str(subs["GA-before-Q30"])+"\t\t"+str(subs["GA-after-Q30"])))
print(("\tGA-Q40 \t"+str(subs["GA-before-Q40"])+"\t\t"+str(subs["GA-after-Q40"])))
def rescale_qual_read(bam, read, ref, corr_prob,subs, debug = False,direction="both"):
......@@ -237,7 +237,7 @@ def rescale_qual_read(bam, read, ref, corr_prob,subs, debug = False,direction="b
new_qual = [-100]*length_read
pos_on_read = 0
number_of_rescaled_bases = 0.0
for (i, nt_seq, nt_ref, nt_qual) in itertools.izip(xrange(length_align), seq, refseq, qual):
for (i, nt_seq, nt_ref, nt_qual) in zip(range(length_align), seq, refseq, qual):
# rescale the quality according to the triplet position,
# pair of the reference and the sequence
if ((nt_seq == "T" and nt_ref =="C") or (nt_seq == "A" and nt_ref =="G")):
......
import unittest
import optparse
import rescale
from . import rescale
import pysam
import filecmp
......
#!/usr/bin/env python
import os
import subprocess
from subprocess import CalledProcessError, check_call
......@@ -15,8 +13,8 @@ def construct_path(name,folder="Rscripts"):
def plot(opt):
"""
Calls the R script to make the plots, takes in the arguments
from parameter processing
Calls the R script to make the plots, takes in the arguments
from parameter processing
"""
# comp<-args[1]
# pdfout<-args[2]
......@@ -32,33 +30,33 @@ def plot(opt):
fcomp = opt.folder+"/"+"dnacomp.txt"
title = opt.folder+"/"+"Fragmisincorporation_plot.pdf"
script = construct_path("mapDamage.R")
script = construct_path("mapDamage.R")
call = ["Rscript", script, fcomp, title, opt.refplot, fmut, opt.readplot, \
opt.ymax, opt.folder, opt.title, __version__]
code = subprocess.call(map(str, call))
code = subprocess.call(list(map(str, call)))
if code == 0:
if not opt.quiet:
print("pdf %s generated" % title)
print(("pdf %s generated" % title))
return 0
else:
print("Error: plotting with R failed")
return 1
def opt_plots(opt):
def opt_plots(opt):
""" optional plots for length distribution
and cumulative C>T mutations, per strand """
fmut = opt.folder+"/"+"misincorporation.txt"
flength = opt.folder+"/"+"lgdistribution.txt"
output = opt.folder+"/"+"Length_plot.pdf"
script = construct_path("lengths.R")
script = construct_path("lengths.R")
call = ["Rscript", script, flength, output, fmut, opt.length, \
opt.title, __version__, bool_to_int(opt.quiet)]
code = subprocess.call(map(str, call))
code = subprocess.call(list(map(str, call)))
if code == 0:
return 0
else:
......@@ -77,7 +75,7 @@ def check_R_one_lib(name):
def check_R_lib():
"""
Checks if the necessary R libraries are here, signal
Checks if the necessary R libraries are here, signal
otherwise
"""
libs = ["inline", "ggplot2", "gam", "Rcpp", "RcppGSL"]
......@@ -90,11 +88,11 @@ def check_R_lib():
if len(missing_lib) > 1:
# Grammar Nazi has arrived
last_ele = missing_lib.pop()
print ("Missing the following R libraries '" + "', '".join(missing_lib) \
+ "' and '" + last_ele + "'")
print(("Missing the following R libraries '" + "', '".join(missing_lib) \
+ "' and '" + last_ele + "'"))
return 1
elif len(missing_lib) == 1:
print ("Missing the following R library "+missing_lib[0])
print(("Missing the following R library "+missing_lib[0]))
return 1
else :
# No missing libraries
......
#!/usr/bin/env python
import sys
import string
# from Martin Kircher, to complement DNA
TABLE = string.maketrans('TGCAMRWSYKVHDBtgcamrwsykvhdb', \
TABLE = str.maketrans('TGCAMRWSYKVHDBtgcamrwsykvhdb', \
'ACGTKYWSRMBDHVacgtkywsrmbdhv')
LETTERS = ("A", "C", "G", "T")
MUTATIONS = ('G>A', 'C>T', 'A>G', 'T>C', 'A>C', 'A>T', 'C>G', 'C>A', 'T>G',
'T>A', 'G>C', 'G>T', 'A>-', 'T>-', 'C>-', 'G>-', '->A', '->T',
'T>A', 'G>C', 'G>T', 'A>-', 'T>-', 'C>-', 'G>-', '->A', '->T',
'->C', '->G', 'S')
HEADER = LETTERS + ("Total", ) + MUTATIONS
......@@ -33,11 +32,11 @@ def record_lg(read, coordinate, tab):
""" record global length distribution
don't record paired reads as they are normally not used for aDNA """
std = '-' if read.is_reverse else '+'
length = (max(coordinate) - min(coordinate))
if not read.is_paired:
tab[std][length] = tab[std][length] + 1
return tab
......@@ -47,7 +46,7 @@ def read_fasta_index(filename):
sys.stderr.write("Error: %s\n" % msg)
sys.stderr.write(" Filename: %s\n" % filename)
sys.stderr.write(" Line: %s\n" % repr(line))
fai = {}
with open(filename, 'r') as handle:
for line in handle:
......
#!/usr/bin/env python
import os
import collections
......@@ -7,7 +5,7 @@ import mapdamage
from mapdamage.version import __version__
def initialize_mut(ref, length):
def initialize_mut(ref, length):
tab = {}
for contig in ref:
tab_contig = tab[contig] = {}
......@@ -16,8 +14,8 @@ def initialize_mut(ref, length):
for std in ('+','-'):
tab_std = tab_end[std] = {}
for mut in mapdamage.seq.HEADER:
tab_std[mut] = dict.fromkeys(xrange(length), 0)
tab_std[mut] = dict.fromkeys(range(length), 0)
return tab
......@@ -26,8 +24,8 @@ def print_mut(mut, opt, out):
def initialize_comp(ref, around, length):
keys = {"3p" : range(-length, 0) + range(1, around + 1),
"5p" : range(-around, 0) + range(1, length + 1)}
keys = {"3p" : list(range(-length, 0)) + list(range(1, around + 1)),
"5p" : list(range(-around, 0)) + list(range(1, length + 1))}
tab = {}
for contig in ref:
......@@ -52,7 +50,7 @@ def initialize_lg():
for std in ('+','-'):
tab[std] = collections.defaultdict(int)
return tab
return tab
def print_lg(tab, opt, out):
......@@ -74,8 +72,8 @@ def check_table_and_warn_if_dmg_freq_is_low(folder):
total = 0.0
for filename in ("5pCtoT_freq.txt", "3pGtoA_freq.txt"):
if not os.path.exists(folder+"/"+filename):
print("Error: Required table has not been created ('%s'), bayesian computation cannot be performed" \
% filename)
print(("Error: Required table has not been created ('%s'), bayesian computation cannot be performed" \
% filename))
return True
with open(os.path.join(folder, filename)) as handle:
......@@ -85,12 +83,12 @@ def check_table_and_warn_if_dmg_freq_is_low(folder):
total += float(freq[1])
break
else:
print("Error: Could not find pos = 1 in table '%s', bayesian computation cannot be performed" \
% filename)
print(("Error: Could not find pos = 1 in table '%s', bayesian computation cannot be performed" \
% filename))
return True
if total < 0.01:
print("Warning: DNA damage levels are too low, the Bayesian computation should not be performed (%f < 0.01)\n" % total)
print(("Warning: DNA damage levels are too low, the Bayesian computation should not be performed (%f < 0.01)\n" % total))
return False
......@@ -103,9 +101,9 @@ def _print_freq_table(table, columns, opt, out, offset = 0):
out.write("# Chr: reference from sam/bam header, End: from which termini of DNA sequences, Std: strand of reads\n")
out.write("Chr\tEnd\tStd\tPos\t%s\n" % ("\t".join(columns)))
for (reference, ends) in sorted(table.iteritems()):
for (end, strands) in sorted(ends.iteritems()):
for (strand, subtable) in sorted(strands.iteritems()):
for (reference, ends) in sorted(table.items()):
for (end, strands) in sorted(ends.items()):
for (strand, subtable) in sorted(strands.items()):
subtable["Total"] = {}
for index in sorted(subtable[columns[0]]):
subtable["Total"][index] = sum(subtable[letter][index] for letter in mapdamage.seq.LETTERS)
......
#!/usr/bin/env python
#!/usr/bin/env python3
try:
from mapdamage._version import __version__
except ImportError:
__version__ = "2.0.8"
__version__ = "2.1.0"