Skip to content
Commits on Source (5)
### Important
Users with versions dating prior to June 12 2013 please update. A nasty bug that caused the statistical part of mapDamage to use half of the data for estimation of the damage parameters, sorry for the inconvenience.
Users with versions dating prior to June 12 2013 please update. A nasty bug that caused the statistical part of `mapDamage` to use half of the data for estimation of the damage parameters, sorry for the inconvenience.
### Introduction
Complete documentation, instructions, examples, screenshots and FAQ are available at [this address](http://ginolhac.github.io/mapDamage/).
......@@ -7,12 +7,12 @@ Complete documentation, instructions, examples, screenshots and FAQ are availabl
[mapDamage2](http://geogenetics.ku.dk/publications/mapdamage2.0/) is a computational framework written in Python and R, which tracks and quantifies DNA damage patterns
among ancient DNA sequencing reads generated by Next-Generation Sequencing platforms.
Mapdamage is developed at the [Centre for GeoGenetics](http://geogenetics.ku.dk/) by the [Orlando Group ](http://geogenetics.ku.dk/research/research_groups/palaeomix_group/).
`mapDamage` is developed at the [Centre for GeoGenetics](http://geogenetics.ku.dk/) by the [Orlando Group ](http://geogenetics.ku.dk/research/research_groups/palaeomix_group/).
### Web interface
Anna Kostikova from [insideDNA](https://insidedna.me) implemented a web interface for mapDamage.
Users can adjust the cloud ressources in terms of RAM/CPU to perform their analysis. She wrote a [tutorial](https://insidedna.me/tutorials/view/Analysis-ancient-DNA-samples-using-mapDamage) explaining how to use this [web interface (sign up required)](https://insidedna.me/app#/tools/100648/)
Users can adjust the cloud resources in terms of RAM/CPU to perform their analysis. She wrote a [tutorial](https://insidedna.me/tutorials/view/Analysis-ancient-DNA-samples-using-mapDamage) explaining how to use this [web interface (sign up required)](https://insidedna.me/app#/tools/100648/)
### Citation
......@@ -31,5 +31,4 @@ http://bioinformatics.oxfordjournals.org/content/27/15/2153](http://bioinformati
### Contact
Please report bugs and suggest possible improvements to Aurélien Ginolhac, Mikkel Schubert or Hákon Jónsson by email:
aginolhac at snm.ku.dk, MSchubert at snm.ku.dk or jonsson.hakon at gmail.com.
ginolhac at gmail.com, mikkelsch at gmail.com or jonsson.hakon at gmail.com.
#!/usr/bin/python
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from __future__ import print_function
import logging
import random
import time
......@@ -45,7 +43,7 @@ MODULE = "pysam"
URL = "http://code.google.com/p/pysam/"
try:
__import__(MODULE)
except ImportError, e:
except ImportError as e:
sys.stderr.write("Error: Could not import required module '%s':\n\t- %s\n" % (MODULE, e))
sys.stderr.write(" If module is not installed, please download from '%s'.\n" % URL)
sys.stderr.write(" A local install may be performed using the following command:\n")
......@@ -98,7 +96,7 @@ def _downsample_to_fixed_number(bamfile, options):
if index >= downsample_to:
continue
sample[index] = record
return filter(None, sample)
return [_f for _f in sample if _f]
def _read_bamfile(bamfile, options):
......@@ -121,7 +119,7 @@ def _check_mm_option():
in mapdamage.parseoptions
"""
path_to_mm = None
for nr,arg in zip(xrange(len(sys.argv)),sys.argv):
for nr,arg in zip(list(range(len(sys.argv))),sys.argv):
if arg.startswith("--mapdamage-modules"):
try:
if "=" in arg:
......@@ -230,7 +228,7 @@ def main():
in_bam = pysam.Samfile(options.filename)
reflengths = dict(zip(in_bam.references, in_bam.lengths))
reflengths = dict(list(zip(in_bam.references, in_bam.lengths)))
# check if references in SAM/BAM are the same in the fasta reference file
fai_lengths = mapdamage.seq.read_fasta_index(options.ref + ".fai")
if not fai_lengths:
......
mapdamage (2.0.9+dfsg-2) UNRELEASED; urgency=medium
mapdamage (2.1.0+dfsg-1) UNRELEASED; urgency=medium
* Use 2to3 to port to Python3
* New upstream version
Closes: #936989
* debhelper-compat 12
* Standards-Version: 4.4.0
......@@ -8,7 +8,7 @@ mapdamage (2.0.9+dfsg-2) UNRELEASED; urgency=medium
* buildsystem=pybuild
* python3-pysam <!nocheck>
-- Andreas Tille <tille@debian.org> Tue, 03 Sep 2019 11:02:34 +0200
-- Andreas Tille <tille@debian.org> Tue, 01 Oct 2019 17:13:12 +0200
mapdamage (2.0.9+dfsg-1) unstable; urgency=medium
......
Description: Use 2to3 to port to Python3
Bug-Debian: https://bugs.debian.org/936989
Author: Andreas Tille <tille@debian.org>
Last-Update: Tue, 03 Sep 2019 11:02:34 +0200
--- a/bin/mapDamage
+++ b/bin/mapDamage
@@ -1,7 +1,7 @@
-#!/usr/bin/python
+#!/usr/bin/python3
# -*- coding: utf-8 -*-
-from __future__ import print_function
+
import logging
import random
@@ -45,7 +45,7 @@ MODULE = "pysam"
URL = "http://code.google.com/p/pysam/"
try:
__import__(MODULE)
-except ImportError, e:
+except ImportError as e:
sys.stderr.write("Error: Could not import required module '%s':\n\t- %s\n" % (MODULE, e))
sys.stderr.write(" If module is not installed, please download from '%s'.\n" % URL)
sys.stderr.write(" A local install may be performed using the following command:\n")
@@ -98,7 +98,7 @@ def _downsample_to_fixed_number(bamfile,
if index >= downsample_to:
continue
sample[index] = record
- return filter(None, sample)
+ return [_f for _f in sample if _f]
def _read_bamfile(bamfile, options):
@@ -121,7 +121,7 @@ def _check_mm_option():
in mapdamage.parseoptions
"""
path_to_mm = None
- for nr,arg in zip(xrange(len(sys.argv)),sys.argv):
+ for nr,arg in zip(range(len(sys.argv)),sys.argv):
if arg.startswith("--mapdamage-modules"):
try:
if "=" in arg:
@@ -230,7 +230,7 @@ def main():
in_bam = pysam.Samfile(options.filename)
- reflengths = dict(zip(in_bam.references, in_bam.lengths))
+ reflengths = dict(list(zip(in_bam.references, in_bam.lengths)))
# check if references in SAM/BAM are the same in the fasta reference file
fai_lengths = mapdamage.seq.read_fasta_index(options.ref + ".fai")
if not fai_lengths:
--- a/mapdamage/__init__.py
+++ b/mapdamage/__init__.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/python3
import mapdamage.parseoptions
import mapdamage.seq
--- a/mapdamage/align.py
+++ b/mapdamage/align.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/python3
import itertools
import mapdamage
@@ -84,7 +84,7 @@ def get_mis(read, seq, refseq, ref, leng
std = '-' if read.is_reverse else '+'
subtable = tab[ref][end][std]
- for (i, nt_seq, nt_ref) in itertools.izip(xrange(length), seq, refseq):
+ for (i, nt_seq, nt_ref) in zip(range(length), seq, refseq):
if (nt_seq in "ACGT-") and (nt_ref in "ACGT-"):
if nt_ref != "-":
# record base composition in the reference, only A, C, G, T
--- a/mapdamage/composition.py
+++ b/mapdamage/composition.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/python3
import mapdamage
import itertools
@@ -10,8 +10,8 @@ def count_ref_comp(read, chrom, before,
""" record basae composition in external genomic regions """
std = '-' if read.is_reverse else '+'
- _update_table(comp[chrom]['5p'][std], before, xrange(-len(before), 0))
- _update_table(comp[chrom]['3p'][std], after, xrange(1, len(after) + 1))
+ _update_table(comp[chrom]['5p'][std], before, range(-len(before), 0))
+ _update_table(comp[chrom]['3p'][std], after, range(1, len(after) + 1))
def count_read_comp(read, chrom, length, comp):
@@ -20,12 +20,12 @@ def count_read_comp(read, chrom, length,
if read.is_reverse:
std, seq = '-', mapdamage.seq.revcomp(seq)
- _update_table(comp[chrom]['5p'][std], seq, xrange(1, length + 1))
- _update_table(comp[chrom]['3p'][std], reversed(seq), xrange(-1, - length - 1, -1))
+ _update_table(comp[chrom]['5p'][std], seq, range(1, length + 1))
+ _update_table(comp[chrom]['3p'][std], reversed(seq), range(-1, - length - 1, -1))
def _update_table(table, sequence, indices):
- for (index, nt) in itertools.izip(indices, sequence):
+ for (index, nt) in zip(indices, sequence):
if nt in table:
table[nt][index] += 1
@@ -47,7 +47,7 @@ def get_base_comp(filename,destination=F
bases["C"] = bases["C"] + int(tabs[3])
bases["G"] = bases["G"] + int(tabs[4])
bases["T"] = bases["T"] + int(tabs[5])
- except (OSError, ValueError), error:
+ except (OSError, ValueError) as error:
sys.stderr.write("Error: Seqtk failed: %s\n" % (error,))
sys.exit(1)
# get the base frequencies
--- a/mapdamage/parseoptions.py
+++ b/mapdamage/parseoptions.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/python3
from optparse import OptionParser, OptionGroup, SUPPRESS_HELP
import os
@@ -235,12 +235,12 @@ def options():
if os.path.isdir(options.folder):
if not options.quiet and not options.plot_only:
- print("Warning, %s already exists" % options.folder)
+ print(("Warning, %s already exists" % options.folder))
if options.plot_only:
if not file_exist(options.folder+"/dnacomp.txt") or not file_exist(options.folder+"/misincorporation.txt"):
parser.error('folder %s is not a valid result folder' % options.folder)
else:
- os.makedirs(options.folder, mode = 0750)
+ os.makedirs(options.folder, mode = 0o750)
if options.plot_only or options.stats_only or options.rescale_only:
sys.stderr.write("Error, %s does not exist while plot/stats/rescale only was used\n" % options.folder)
return None
--- a/mapdamage/rescale.py
+++ b/mapdamage/rescale.py
@@ -31,14 +31,14 @@ def get_corr_prob(folder, rescale_length
fi_handle = csv.DictReader(open(full_path))
corr_prob = {}
for line in fi_handle:
- if (corr_prob.has_key(line["Position"])):
+ if (line["Position"] in corr_prob):
sys.exit('This file has multiple position definitions %s, line %d: %s' % \
(folder, fi_handle.line_num, corr_prob[line["Position"]]))
else:
corr_prob[int(line["Position"])] = {'C.T':float(line["C.T"]), 'G.A':float(line["G.A"])}
# Exclude probabilities for positions outside of user-specified region
- for key in corr_prob.keys():
+ for key in list(corr_prob.keys()):
if key < -rescale_length_3p or key > rescale_length_5p:
corr_prob.pop(key)
@@ -75,13 +75,13 @@ def corr_this_base(corr_prob, nt_seq, nt
back_pos = pos-length-1
# position from 3' end
- if corr_prob.has_key(pos):
+ if pos in corr_prob:
p5_corr = corr_prob[pos][subs]
# correction from 5' end
else:
p5_corr = 0
- if corr_prob.has_key(back_pos):
+ if back_pos in corr_prob:
p3_corr = corr_prob[back_pos][subs]
# correction from 3' end
else:
@@ -104,7 +104,7 @@ def corr_this_base(corr_prob, nt_seq, nt
def initialize_subs():
"""Initialize a substitution table, to track the expected substitution counts"""
- per_qual = dict(zip(range(0,130),[0]*130))
+ per_qual = dict(list(zip(list(range(0,130)),[0]*130)))
subs = {"CT-before":per_qual.copy(),\
"TC-before":per_qual.copy(),\
"GA-before":per_qual.copy(),\
@@ -164,7 +164,7 @@ def qual_summary_subs(subs):
for qv in subs[i]:
if qv >= lv :
key = i+"-Q"+str(lv)
- if subs.has_key(key):
+ if key in subs:
subs[key] += subs[i][qv]
else:
subs[key] = subs[i][qv]
@@ -174,32 +174,32 @@ def print_subs(subs):
print("\tThe expected substition frequencies before and after scaling using the scaled qualities as probalities:")
if subs["C"]!=0:
# the special case of no substitutions
- print("\tCT\t"+str(subs["CT-pvals_before"]/subs["C"])+"\t\t"+str(subs["CT-pvals"]/subs["C"]))
+ print(("\tCT\t"+str(subs["CT-pvals_before"]/subs["C"])+"\t\t"+str(subs["CT-pvals"]/subs["C"])))
else:
print("\tCT\tNA\t\tNA")
if subs["T"]!=0:
- print("\tTC\t"+str(subs["TC-pvals"]/subs["T"])+"\t\t"+str(subs["TC-pvals"]/subs["T"]))
+ print(("\tTC\t"+str(subs["TC-pvals"]/subs["T"])+"\t\t"+str(subs["TC-pvals"]/subs["T"])))
else:
print("\tTC\tNA\t\tNA")
if subs["G"]!=0:
- print("\tGA\t"+str(subs["GA-pvals_before"]/subs["G"])+"\t\t"+str(subs["GA-pvals"]/subs["G"]))
+ print(("\tGA\t"+str(subs["GA-pvals_before"]/subs["G"])+"\t\t"+str(subs["GA-pvals"]/subs["G"])))
else:
print("\tGA\tNA\t\tNA")
if subs["A"]!=0:
- print("\tAG\t"+str(subs["AG-pvals"]/subs["A"])+"\t\t"+str(subs["AG-pvals"]/subs["A"]))
+ print(("\tAG\t"+str(subs["AG-pvals"]/subs["A"])+"\t\t"+str(subs["AG-pvals"]/subs["A"])))
else:
print("\tAG\tNA\t\tNA")
print("\tQuality metrics before and after scaling")
- print("\tCT-Q0 \t"+str(subs["CT-before-Q0"])+"\t\t"+str(subs["CT-after-Q0"]))
- print("\tCT-Q10 \t"+str(subs["CT-before-Q10"])+"\t\t"+str(subs["CT-after-Q10"]))
- print("\tCT-Q20 \t"+str(subs["CT-before-Q20"])+"\t\t"+str(subs["CT-after-Q20"]))
- print("\tCT-Q30 \t"+str(subs["CT-before-Q30"])+"\t\t"+str(subs["CT-after-Q30"]))
- print("\tCT-Q40 \t"+str(subs["CT-before-Q40"])+"\t\t"+str(subs["CT-after-Q40"]))
- print("\tGA-Q0 \t"+str(subs["GA-before-Q0"])+"\t\t"+str(subs["GA-after-Q0"]))
- print("\tGA-Q10 \t"+str(subs["GA-before-Q10"])+"\t\t"+str(subs["GA-after-Q10"]))
- print("\tGA-Q20 \t"+str(subs["GA-before-Q20"])+"\t\t"+str(subs["GA-after-Q20"]))
- print("\tGA-Q30 \t"+str(subs["GA-before-Q30"])+"\t\t"+str(subs["GA-after-Q30"]))
- print("\tGA-Q40 \t"+str(subs["GA-before-Q40"])+"\t\t"+str(subs["GA-after-Q40"]))
+ print(("\tCT-Q0 \t"+str(subs["CT-before-Q0"])+"\t\t"+str(subs["CT-after-Q0"])))
+ print(("\tCT-Q10 \t"+str(subs["CT-before-Q10"])+"\t\t"+str(subs["CT-after-Q10"])))
+ print(("\tCT-Q20 \t"+str(subs["CT-before-Q20"])+"\t\t"+str(subs["CT-after-Q20"])))
+ print(("\tCT-Q30 \t"+str(subs["CT-before-Q30"])+"\t\t"+str(subs["CT-after-Q30"])))
+ print(("\tCT-Q40 \t"+str(subs["CT-before-Q40"])+"\t\t"+str(subs["CT-after-Q40"])))
+ print(("\tGA-Q0 \t"+str(subs["GA-before-Q0"])+"\t\t"+str(subs["GA-after-Q0"])))
+ print(("\tGA-Q10 \t"+str(subs["GA-before-Q10"])+"\t\t"+str(subs["GA-after-Q10"])))
+ print(("\tGA-Q20 \t"+str(subs["GA-before-Q20"])+"\t\t"+str(subs["GA-after-Q20"])))
+ print(("\tGA-Q30 \t"+str(subs["GA-before-Q30"])+"\t\t"+str(subs["GA-after-Q30"])))
+ print(("\tGA-Q40 \t"+str(subs["GA-before-Q40"])+"\t\t"+str(subs["GA-after-Q40"])))
def rescale_qual_read(bam, read, ref, corr_prob,subs, debug = False,direction="both"):
@@ -237,7 +237,7 @@ def rescale_qual_read(bam, read, ref, co
new_qual = [-100]*length_read
pos_on_read = 0
number_of_rescaled_bases = 0.0
- for (i, nt_seq, nt_ref, nt_qual) in itertools.izip(xrange(length_align), seq, refseq, qual):
+ for (i, nt_seq, nt_ref, nt_qual) in zip(range(length_align), seq, refseq, qual):
# rescale the quality according to the triplet position,
# pair of the reference and the sequence
if ((nt_seq == "T" and nt_ref =="C") or (nt_seq == "A" and nt_ref =="G")):
--- a/mapdamage/rescale_test.py
+++ b/mapdamage/rescale_test.py
@@ -1,6 +1,9 @@
import unittest
import optparse
-import rescale
+try:
+ from . import rescale
+except:
+ from mapdamage import rescale
import pysam
import filecmp
--- a/mapdamage/rscript.py
+++ b/mapdamage/rscript.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/python3
import os
import subprocess
@@ -35,11 +35,11 @@ def plot(opt):
script = construct_path("mapDamage.R")
call = ["Rscript", script, fcomp, title, opt.refplot, fmut, opt.readplot, \
opt.ymax, opt.folder, opt.title, __version__]
- code = subprocess.call(map(str, call))
+ code = subprocess.call(list(map(str, call)))
if code == 0:
if not opt.quiet:
- print("pdf %s generated" % title)
+ print(("pdf %s generated" % title))
return 0
else:
print("Error: plotting with R failed")
@@ -58,7 +58,7 @@ def opt_plots(opt):
script = construct_path("lengths.R")
call = ["Rscript", script, flength, output, fmut, opt.length, \
opt.title, __version__, bool_to_int(opt.quiet)]
- code = subprocess.call(map(str, call))
+ code = subprocess.call(list(map(str, call)))
if code == 0:
return 0
else:
@@ -90,11 +90,11 @@ def check_R_lib():
if len(missing_lib) > 1:
# Grammar Nazi has arrived
last_ele = missing_lib.pop()
- print ("Missing the following R libraries '" + "', '".join(missing_lib) \
- + "' and '" + last_ele + "'")
+ print(("Missing the following R libraries '" + "', '".join(missing_lib) \
+ + "' and '" + last_ele + "'"))
return 1
elif len(missing_lib) == 1:
- print ("Missing the following R library "+missing_lib[0])
+ print(("Missing the following R library "+missing_lib[0]))
return 1
else :
# No missing libraries
--- a/mapdamage/seq.py
+++ b/mapdamage/seq.py
@@ -1,9 +1,9 @@
-#!/usr/bin/env python
+#!/usr/bin/python3
import sys
import string
# from Martin Kircher, to complement DNA
-TABLE = string.maketrans('TGCAMRWSYKVHDBtgcamrwsykvhdb', \
+TABLE = str.maketrans('TGCAMRWSYKVHDBtgcamrwsykvhdb', \
'ACGTKYWSRMBDHVacgtkywsrmbdhv')
LETTERS = ("A", "C", "G", "T")
--- a/mapdamage/tables.py
+++ b/mapdamage/tables.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/python3
import os
import collections
@@ -16,7 +16,7 @@ def initialize_mut(ref, length):
for std in ('+','-'):
tab_std = tab_end[std] = {}
for mut in mapdamage.seq.HEADER:
- tab_std[mut] = dict.fromkeys(xrange(length), 0)
+ tab_std[mut] = dict.fromkeys(range(length), 0)
return tab
@@ -26,8 +26,8 @@ def print_mut(mut, opt, out):
def initialize_comp(ref, around, length):
- keys = {"3p" : range(-length, 0) + range(1, around + 1),
- "5p" : range(-around, 0) + range(1, length + 1)}
+ keys = {"3p" : list(range(-length, 0)) + list(range(1, around + 1)),
+ "5p" : list(range(-around, 0)) + list(range(1, length + 1))}
tab = {}
for contig in ref:
@@ -74,8 +74,8 @@ def check_table_and_warn_if_dmg_freq_is_
total = 0.0
for filename in ("5pCtoT_freq.txt", "3pGtoA_freq.txt"):
if not os.path.exists(folder+"/"+filename):
- print("Error: Required table has not been created ('%s'), bayesian computation cannot be performed" \
- % filename)
+ print(("Error: Required table has not been created ('%s'), bayesian computation cannot be performed" \
+ % filename))
return True
with open(os.path.join(folder, filename)) as handle:
@@ -85,12 +85,12 @@ def check_table_and_warn_if_dmg_freq_is_
total += float(freq[1])
break
else:
- print("Error: Could not find pos = 1 in table '%s', bayesian computation cannot be performed" \
- % filename)
+ print(("Error: Could not find pos = 1 in table '%s', bayesian computation cannot be performed" \
+ % filename))
return True
if total < 0.01:
- print("Warning: DNA damage levels are too low, the Bayesian computation should not be performed (%f < 0.01)\n" % total)
+ print(("Warning: DNA damage levels are too low, the Bayesian computation should not be performed (%f < 0.01)\n" % total))
return False
@@ -103,9 +103,9 @@ def _print_freq_table(table, columns, op
out.write("# Chr: reference from sam/bam header, End: from which termini of DNA sequences, Std: strand of reads\n")
out.write("Chr\tEnd\tStd\tPos\t%s\n" % ("\t".join(columns)))
- for (reference, ends) in sorted(table.iteritems()):
- for (end, strands) in sorted(ends.iteritems()):
- for (strand, subtable) in sorted(strands.iteritems()):
+ for (reference, ends) in sorted(table.items()):
+ for (end, strands) in sorted(ends.items()):
+ for (strand, subtable) in sorted(strands.items()):
subtable["Total"] = {}
for index in sorted(subtable[columns[0]]):
subtable["Total"][index] = sum(subtable[letter][index] for letter in mapdamage.seq.LETTERS)
--- a/mapdamage/version.py
+++ b/mapdamage/version.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/python3
try:
from mapdamage._version import __version__
except ImportError:
--- a/setup.py
+++ b/setup.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/python3
# -*- coding: utf-8 -*-
from distutils.core import setup
@@ -14,9 +14,9 @@ def setup_version():
try:
version = subprocess.check_output(("git", "describe", "--always", "--tags", "--dirty"))
with open(os.path.join("mapdamage", "_version.py"), "w") as handle:
- handle.write("#!/usr/bin/env python\n")
+ handle.write("#!/usr/bin/python3\n")
handle.write("__version__ = %r\n" % (version.strip(),))
- except (subprocess.CalledProcessError, OSError), error:
+ except (subprocess.CalledProcessError, OSError) as error:
raise SystemExit("Could not determine mapDamage version: %s" % (error,))
......@@ -4,11 +4,9 @@ Description: change calcSampleStats such as any NA and Nan's are
Last-Update: 2017-03-28
Bug: https://github.com/ginolhac/mapDamage/issues/18
Index: mapdamage/mapdamage/Rscripts/stats/function.R
===================================================================
--- mapdamage.orig/mapdamage/Rscripts/stats/function.R 2017-03-29 00:02:46.538470957 -0400
+++ mapdamage/mapdamage/Rscripts/stats/function.R 2017-03-29 00:11:09.141025051 -0400
@@ -595,8 +595,8 @@
--- a/mapdamage/Rscripts/stats/function.R
+++ b/mapdamage/Rscripts/stats/function.R
@@ -595,8 +595,8 @@ calcSampleStats <- function(da,X){
pos=da[,"Pos"],
mea=apply(X,1,mean),
med=apply(X,1,median),
......
use_debian_packaged_seqtk.patch
fix_quantile_na_rm.patch
2to3.patch
......@@ -27,23 +27,24 @@ Description: Use Debian packaged seqtk
def setup_version():
if not os.path.exists(".git"):
# Release version, no .git folder
@@ -40,16 +25,7 @@ class compileInstall(DistutilsInstall):
@@ -40,17 +25,8 @@ class compileInstall(DistutilsInstall):
def run(self):
self.record=""
setup_version()
- compile_seqtk()
DistutilsInstall.run(self)
- # fixing the permission problem of seqtk
# fixing the permission problem of seqtk
- files = self.get_outputs()
- for fi in files:
- if fi.endswith("seqtk/seqtk"):
- os.chmod(fi,0755)
- os.chmod(fi,0o755)
-
-
-
-
setup(
cmdclass={'install': compileInstall},
@@ -59,7 +35,7 @@ setup(
author='Aurélien Ginolhac, Mikkel Schubert, Hákon Jónsson',
author_email='MSchubert@snm.ku.dk, jonsson.hakon@gmail.com',
......@@ -55,7 +56,7 @@ Description: Use Debian packaged seqtk
license='LICENSE.txt',
--- a/bin/mapDamage
+++ b/bin/mapDamage
@@ -151,7 +151,7 @@ def main():
@@ -149,7 +149,7 @@ def main():
sys.path.insert(0,path_to_mm)
import mapdamage
......@@ -66,7 +67,7 @@ Description: Use Debian packaged seqtk
"been intalled properly or current user does not\n"
--- a/mapdamage/composition.py
+++ b/mapdamage/composition.py
@@ -35,7 +35,7 @@ def get_base_comp(filename,destination=F
@@ -33,7 +33,7 @@ def get_base_comp(filename,destination=F
Gets the basecomposition of all the sequences in filename
and returns the value to destination if given.
"""
......
......@@ -103,7 +103,7 @@ calculate.mutation.table <- function(filename, length)
write.mutation.table <- function(tbl, end, mismatch, filename)
{
columns <- c("pos", sprintf("5p%s", mismatch))
columns <- c("pos", sprintf("%s%s", end, mismatch))
tbl[is.na(tbl)] <- 0 # Python doesn't like NAs
write.table(tbl[tbl$End == end, c("Pos", mismatch)],
row.names = FALSE, col.names = columns,
......
......@@ -638,20 +638,20 @@ postPredCheck <- function(da,output,samples=10000){
REsStats <- calcSampleStats(da,REs)
#Plotting the posterior predictive intervals
p <- ggplot()+
geom_point(aes(x,mea,colour="C->T",aes_string="subs"),data=CTsStats)+
geom_point(aes(x,mea,colour="C->T"),data=CTsStats)+
geom_point(aes(x,mea,colour="G->A"),data=GAsStats)+
geom_point(aes(x,mea,colour="Others"),data=REsStats)+
geom_errorbar(aes(x=x,y=med,ymin=loCI,ymax=hiCI,color="C->T"),data=CTsStats)+
geom_errorbar(aes(x=x,y=med,ymin=loCI,ymax=hiCI,color="G->A"),data=GAsStats)+
geom_errorbar(aes(x=x,y=med,ymin=loCI,ymax=hiCI,color="Others"),data=REsStats)+
geom_errorbar(aes(x=x,ymin=loCI,ymax=hiCI,color="C->T"),data=CTsStats)+
geom_errorbar(aes(x=x,ymin=loCI,ymax=hiCI,color="G->A"),data=GAsStats)+
geom_errorbar(aes(x=x,ymin=loCI,ymax=hiCI,color="Others"),data=REsStats)+
geom_line(aes(oneBased,C.T/C),color="red",data=data.frame(da))+
geom_line(aes(oneBased,G.A/G),color="green",data=data.frame(da))+
geom_line(aes(oneBased,((A.C+A.G+A.T)/A+(C.A+C.G)/C+(G.C+G.T)/G+(T.A+T.C+T.G)/T)/10),color="blue",data=data.frame(da))+
ylab("Substitution rate")+
xlab("Relative position")+
scale_x_continuous(breaks=bres,labels=labs)+
labs(colour = "Subs. type")+
ggtitle("Posterior prediction intervals")
labs(y = "Substitution rate",
x = "Relative position",
colour = "Subs. type",
title = "Posterior prediction intervals")+
scale_x_continuous(breaks=bres,labels=labs)
if (output$cu_pa$use_bw_theme){
p <- p+theme_bw()
}
......@@ -691,4 +691,3 @@ writeMCMC <- function(out,filename){
rownames(summStat)[3] <- "Acceptance ratio"
write.csv(summStat,paste(filename,"_summ_stat.csv",sep=""))
}
#The main work flow of the package
#Load the libraries
suppressMessages(library(inline)) #Already checked the libraries
suppressMessages(library(ggplot2)) #thus ignoring any messages from them
suppressMessages(library(Rcpp))
suppressMessages(library(gam))
suppressMessages(library(RcppGSL))
suppressPackageStartupMessages(library(inline)) #Already checked the libraries
suppressPackageStartupMessages(library(ggplot2)) #thus ignoring any messages from them
suppressPackageStartupMessages(library(Rcpp))
suppressPackageStartupMessages(library(gam))
suppressPackageStartupMessages(library(RcppGSL))
#Miscellaneous functions
source(paste(path_to_mapDamage_stats,"function.R",sep=""))
......
#!/usr/bin/env python
#!/usr/bin/env python3
import mapdamage.parseoptions
import mapdamage.seq
......
#!/usr/bin/env python
import itertools
import mapdamage
......@@ -84,7 +82,7 @@ def get_mis(read, seq, refseq, ref, length, tab, end):
std = '-' if read.is_reverse else '+'
subtable = tab[ref][end][std]
for (i, nt_seq, nt_ref) in itertools.izip(xrange(length), seq, refseq):
for (i, nt_seq, nt_ref) in zip(range(length), seq, refseq):
if (nt_seq in "ACGT-") and (nt_ref in "ACGT-"):
if nt_ref != "-":
# record base composition in the reference, only A, C, G, T
......@@ -127,8 +125,3 @@ def record_soft_clipping(read, tab, length):
end = '5p' if read.is_reverse else '3p'
update_table(end, strand, nbases)
#!/usr/bin/env python
import mapdamage
import itertools
import csv
......@@ -10,8 +8,8 @@ def count_ref_comp(read, chrom, before, after, comp):
""" record basae composition in external genomic regions """
std = '-' if read.is_reverse else '+'
_update_table(comp[chrom]['5p'][std], before, xrange(-len(before), 0))
_update_table(comp[chrom]['3p'][std], after, xrange(1, len(after) + 1))
_update_table(comp[chrom]['5p'][std], before, range(-len(before), 0))
_update_table(comp[chrom]['3p'][std], after, range(1, len(after) + 1))
def count_read_comp(read, chrom, length, comp):
......@@ -20,12 +18,12 @@ def count_read_comp(read, chrom, length, comp):
if read.is_reverse:
std, seq = '-', mapdamage.seq.revcomp(seq)
_update_table(comp[chrom]['5p'][std], seq, xrange(1, length + 1))
_update_table(comp[chrom]['3p'][std], reversed(seq), xrange(-1, - length - 1, -1))
_update_table(comp[chrom]['5p'][std], seq, range(1, length + 1))
_update_table(comp[chrom]['3p'][std], reversed(seq), range(-1, - length - 1, -1))
def _update_table(table, sequence, indices):
for (index, nt) in itertools.izip(indices, sequence):
for (index, nt) in zip(indices, sequence):
if nt in table:
table[nt][index] += 1
......@@ -47,7 +45,7 @@ def get_base_comp(filename,destination=False):
bases["C"] = bases["C"] + int(tabs[3])
bases["G"] = bases["G"] + int(tabs[4])
bases["T"] = bases["T"] + int(tabs[5])
except (OSError, ValueError), error:
except (OSError, ValueError) as error:
sys.stderr.write("Error: Seqtk failed: %s\n" % (error,))
sys.exit(1)
# get the base frequencies
......
#!/usr/bin/env python
from optparse import OptionParser, OptionGroup, SUPPRESS_HELP
import os
import sys
......@@ -235,12 +233,12 @@ def options():
if os.path.isdir(options.folder):
if not options.quiet and not options.plot_only:
print("Warning, %s already exists" % options.folder)
print(("Warning, %s already exists" % options.folder))
if options.plot_only:
if not file_exist(options.folder+"/dnacomp.txt") or not file_exist(options.folder+"/misincorporation.txt"):
parser.error('folder %s is not a valid result folder' % options.folder)
else:
os.makedirs(options.folder, mode = 0750)
os.makedirs(options.folder, mode = 0o750)
if options.plot_only or options.stats_only or options.rescale_only:
sys.stderr.write("Error, %s does not exist while plot/stats/rescale only was used\n" % options.folder)
return None
......@@ -279,4 +277,3 @@ def options():
return options
......@@ -31,14 +31,14 @@ def get_corr_prob(folder, rescale_length_5p, rescale_length_3p):
fi_handle = csv.DictReader(open(full_path))
corr_prob = {}
for line in fi_handle:
if (corr_prob.has_key(line["Position"])):
if (line["Position"] in corr_prob):
sys.exit('This file has multiple position definitions %s, line %d: %s' % \
(folder, fi_handle.line_num, corr_prob[line["Position"]]))
else:
corr_prob[int(line["Position"])] = {'C.T':float(line["C.T"]), 'G.A':float(line["G.A"])}
# Exclude probabilities for positions outside of user-specified region
for key in corr_prob.keys():
for key in list(corr_prob.keys()):
if key < -rescale_length_3p or key > rescale_length_5p:
corr_prob.pop(key)
......@@ -75,13 +75,13 @@ def corr_this_base(corr_prob, nt_seq, nt_ref, pos, length,direction="both"):
back_pos = pos-length-1
# position from 3' end
if corr_prob.has_key(pos):
if pos in corr_prob:
p5_corr = corr_prob[pos][subs]
# correction from 5' end
else:
p5_corr = 0
if corr_prob.has_key(back_pos):
if back_pos in corr_prob:
p3_corr = corr_prob[back_pos][subs]
# correction from 3' end
else:
......@@ -104,7 +104,7 @@ def corr_this_base(corr_prob, nt_seq, nt_ref, pos, length,direction="both"):
def initialize_subs():
"""Initialize a substitution table, to track the expected substitution counts"""
per_qual = dict(zip(range(0,130),[0]*130))
per_qual = dict(list(zip(list(range(0,130)),[0]*130)))
subs = {"CT-before":per_qual.copy(),\
"TC-before":per_qual.copy(),\
"GA-before":per_qual.copy(),\
......@@ -164,7 +164,7 @@ def qual_summary_subs(subs):
for qv in subs[i]:
if qv >= lv :
key = i+"-Q"+str(lv)
if subs.has_key(key):
if key in subs:
subs[key] += subs[i][qv]
else:
subs[key] = subs[i][qv]
......@@ -174,32 +174,32 @@ def print_subs(subs):
print("\tThe expected substition frequencies before and after scaling using the scaled qualities as probalities:")
if subs["C"]!=0:
# the special case of no substitutions
print("\tCT\t"+str(subs["CT-pvals_before"]/subs["C"])+"\t\t"+str(subs["CT-pvals"]/subs["C"]))
print(("\tCT\t"+str(subs["CT-pvals_before"]/subs["C"])+"\t\t"+str(subs["CT-pvals"]/subs["C"])))
else:
print("\tCT\tNA\t\tNA")
if subs["T"]!=0:
print("\tTC\t"+str(subs["TC-pvals"]/subs["T"])+"\t\t"+str(subs["TC-pvals"]/subs["T"]))
print(("\tTC\t"+str(subs["TC-pvals"]/subs["T"])+"\t\t"+str(subs["TC-pvals"]/subs["T"])))
else:
print("\tTC\tNA\t\tNA")
if subs["G"]!=0:
print("\tGA\t"+str(subs["GA-pvals_before"]/subs["G"])+"\t\t"+str(subs["GA-pvals"]/subs["G"]))
print(("\tGA\t"+str(subs["GA-pvals_before"]/subs["G"])+"\t\t"+str(subs["GA-pvals"]/subs["G"])))
else:
print("\tGA\tNA\t\tNA")
if subs["A"]!=0:
print("\tAG\t"+str(subs["AG-pvals"]/subs["A"])+"\t\t"+str(subs["AG-pvals"]/subs["A"]))
print(("\tAG\t"+str(subs["AG-pvals"]/subs["A"])+"\t\t"+str(subs["AG-pvals"]/subs["A"])))
else:
print("\tAG\tNA\t\tNA")
print("\tQuality metrics before and after scaling")
print("\tCT-Q0 \t"+str(subs["CT-before-Q0"])+"\t\t"+str(subs["CT-after-Q0"]))
print("\tCT-Q10 \t"+str(subs["CT-before-Q10"])+"\t\t"+str(subs["CT-after-Q10"]))
print("\tCT-Q20 \t"+str(subs["CT-before-Q20"])+"\t\t"+str(subs["CT-after-Q20"]))
print("\tCT-Q30 \t"+str(subs["CT-before-Q30"])+"\t\t"+str(subs["CT-after-Q30"]))
print("\tCT-Q40 \t"+str(subs["CT-before-Q40"])+"\t\t"+str(subs["CT-after-Q40"]))
print("\tGA-Q0 \t"+str(subs["GA-before-Q0"])+"\t\t"+str(subs["GA-after-Q0"]))
print("\tGA-Q10 \t"+str(subs["GA-before-Q10"])+"\t\t"+str(subs["GA-after-Q10"]))
print("\tGA-Q20 \t"+str(subs["GA-before-Q20"])+"\t\t"+str(subs["GA-after-Q20"]))
print("\tGA-Q30 \t"+str(subs["GA-before-Q30"])+"\t\t"+str(subs["GA-after-Q30"]))
print("\tGA-Q40 \t"+str(subs["GA-before-Q40"])+"\t\t"+str(subs["GA-after-Q40"]))
print(("\tCT-Q0 \t"+str(subs["CT-before-Q0"])+"\t\t"+str(subs["CT-after-Q0"])))
print(("\tCT-Q10 \t"+str(subs["CT-before-Q10"])+"\t\t"+str(subs["CT-after-Q10"])))
print(("\tCT-Q20 \t"+str(subs["CT-before-Q20"])+"\t\t"+str(subs["CT-after-Q20"])))
print(("\tCT-Q30 \t"+str(subs["CT-before-Q30"])+"\t\t"+str(subs["CT-after-Q30"])))
print(("\tCT-Q40 \t"+str(subs["CT-before-Q40"])+"\t\t"+str(subs["CT-after-Q40"])))
print(("\tGA-Q0 \t"+str(subs["GA-before-Q0"])+"\t\t"+str(subs["GA-after-Q0"])))
print(("\tGA-Q10 \t"+str(subs["GA-before-Q10"])+"\t\t"+str(subs["GA-after-Q10"])))
print(("\tGA-Q20 \t"+str(subs["GA-before-Q20"])+"\t\t"+str(subs["GA-after-Q20"])))
print(("\tGA-Q30 \t"+str(subs["GA-before-Q30"])+"\t\t"+str(subs["GA-after-Q30"])))
print(("\tGA-Q40 \t"+str(subs["GA-before-Q40"])+"\t\t"+str(subs["GA-after-Q40"])))
def rescale_qual_read(bam, read, ref, corr_prob,subs, debug = False,direction="both"):
......@@ -237,7 +237,7 @@ def rescale_qual_read(bam, read, ref, corr_prob,subs, debug = False,direction="b
new_qual = [-100]*length_read
pos_on_read = 0
number_of_rescaled_bases = 0.0
for (i, nt_seq, nt_ref, nt_qual) in itertools.izip(xrange(length_align), seq, refseq, qual):
for (i, nt_seq, nt_ref, nt_qual) in zip(range(length_align), seq, refseq, qual):
# rescale the quality according to the triplet position,
# pair of the reference and the sequence
if ((nt_seq == "T" and nt_ref =="C") or (nt_seq == "A" and nt_ref =="G")):
......
import unittest
import optparse
import rescale
from . import rescale
import pysam
import filecmp
......
#!/usr/bin/env python
import os
import subprocess
from subprocess import CalledProcessError, check_call
......@@ -35,11 +33,11 @@ def plot(opt):
script = construct_path("mapDamage.R")
call = ["Rscript", script, fcomp, title, opt.refplot, fmut, opt.readplot, \
opt.ymax, opt.folder, opt.title, __version__]
code = subprocess.call(map(str, call))
code = subprocess.call(list(map(str, call)))
if code == 0:
if not opt.quiet:
print("pdf %s generated" % title)
print(("pdf %s generated" % title))
return 0
else:
print("Error: plotting with R failed")
......@@ -58,7 +56,7 @@ def opt_plots(opt):
script = construct_path("lengths.R")
call = ["Rscript", script, flength, output, fmut, opt.length, \
opt.title, __version__, bool_to_int(opt.quiet)]
code = subprocess.call(map(str, call))
code = subprocess.call(list(map(str, call)))
if code == 0:
return 0
else:
......@@ -90,11 +88,11 @@ def check_R_lib():
if len(missing_lib) > 1:
# Grammar Nazi has arrived
last_ele = missing_lib.pop()
print ("Missing the following R libraries '" + "', '".join(missing_lib) \
+ "' and '" + last_ele + "'")
print(("Missing the following R libraries '" + "', '".join(missing_lib) \
+ "' and '" + last_ele + "'"))
return 1
elif len(missing_lib) == 1:
print ("Missing the following R library "+missing_lib[0])
print(("Missing the following R library "+missing_lib[0]))
return 1
else :
# No missing libraries
......
#!/usr/bin/env python
import sys
import string
# from Martin Kircher, to complement DNA
TABLE = string.maketrans('TGCAMRWSYKVHDBtgcamrwsykvhdb', \
TABLE = str.maketrans('TGCAMRWSYKVHDBtgcamrwsykvhdb', \
'ACGTKYWSRMBDHVacgtkywsrmbdhv')
LETTERS = ("A", "C", "G", "T")
......
#!/usr/bin/env python
import os
import collections
......@@ -16,7 +14,7 @@ def initialize_mut(ref, length):
for std in ('+','-'):
tab_std = tab_end[std] = {}
for mut in mapdamage.seq.HEADER:
tab_std[mut] = dict.fromkeys(xrange(length), 0)
tab_std[mut] = dict.fromkeys(range(length), 0)
return tab
......@@ -26,8 +24,8 @@ def print_mut(mut, opt, out):
def initialize_comp(ref, around, length):
keys = {"3p" : range(-length, 0) + range(1, around + 1),
"5p" : range(-around, 0) + range(1, length + 1)}
keys = {"3p" : list(range(-length, 0)) + list(range(1, around + 1)),
"5p" : list(range(-around, 0)) + list(range(1, length + 1))}
tab = {}
for contig in ref:
......@@ -74,8 +72,8 @@ def check_table_and_warn_if_dmg_freq_is_low(folder):
total = 0.0
for filename in ("5pCtoT_freq.txt", "3pGtoA_freq.txt"):
if not os.path.exists(folder+"/"+filename):
print("Error: Required table has not been created ('%s'), bayesian computation cannot be performed" \
% filename)
print(("Error: Required table has not been created ('%s'), bayesian computation cannot be performed" \
% filename))
return True
with open(os.path.join(folder, filename)) as handle:
......@@ -85,12 +83,12 @@ def check_table_and_warn_if_dmg_freq_is_low(folder):
total += float(freq[1])
break
else:
print("Error: Could not find pos = 1 in table '%s', bayesian computation cannot be performed" \
% filename)
print(("Error: Could not find pos = 1 in table '%s', bayesian computation cannot be performed" \
% filename))
return True
if total < 0.01:
print("Warning: DNA damage levels are too low, the Bayesian computation should not be performed (%f < 0.01)\n" % total)
print(("Warning: DNA damage levels are too low, the Bayesian computation should not be performed (%f < 0.01)\n" % total))
return False
......@@ -103,9 +101,9 @@ def _print_freq_table(table, columns, opt, out, offset = 0):
out.write("# Chr: reference from sam/bam header, End: from which termini of DNA sequences, Std: strand of reads\n")
out.write("Chr\tEnd\tStd\tPos\t%s\n" % ("\t".join(columns)))
for (reference, ends) in sorted(table.iteritems()):
for (end, strands) in sorted(ends.iteritems()):
for (strand, subtable) in sorted(strands.iteritems()):
for (reference, ends) in sorted(table.items()):
for (end, strands) in sorted(ends.items()):
for (strand, subtable) in sorted(strands.items()):
subtable["Total"] = {}
for index in sorted(subtable[columns[0]]):
subtable["Total"][index] = sum(subtable[letter][index] for letter in mapdamage.seq.LETTERS)
......
#!/usr/bin/env python
#!/usr/bin/env python3
try:
from mapdamage._version import __version__
except ImportError:
__version__ = "2.0.8"
__version__ = "2.1.0"