Andreas Tille · Andreas Tille · 65e49359 · 65e49359 · 65e49359 · 65e49359
--- a/.hg_archival.txt
+++ b/.hg_archival.txt
 repo: 092c2fe2278cb7f0b18d81faeb4aab98b89dc096
-node: 9760413b180fc2c68b817c23602541d3a97528af
-branch: default
-tag: 2.7.8
+node: 89633b311684ece67f32a9461aa1567e32dd42f7
+branch: 2.9
+tag: 2.9.20
--- a/.hgignore
+++ b/.hgignore
@@ -6,3 +6,4 @@ tests/
 build/
 dist/
 *.egg-info/
+test/
--- a/.hgsub
+++ b/.hgsub
--- a/.hgsubstate
+++ b/.hgsubstate
-c168a100f37e23e2c110849a8d91fac8da49f5bd utils/export2graphlan
-69efddea43ae5e37d761cd138fcf083090371d1a utils/hclust2
+266347d7df0960de6e9771fdeb8899fefe9c80a7 utils/export2graphlan
+6d1023617944d9b7c69312bcb364886e7ef0a9f0 utils/hclust2
--- a/.hgtags
+++ b/.hgtags
@@ -19,3 +19,20 @@ cbd7880df400b453b8beb4e62b39e4a23b5523b6 2.7.6
 4d50b9ccd95234a436541db13bcb10741f99138b 2.7.62
 a71e7d6b9d50b89c4b80714af145e10d050be7cf 2.7.63
 3e9c612e0a922b73214c90db51c5e26be17bf8b6 2.7.7
+9760413b180fc2c68b817c23602541d3a97528af 2.7.8
+28653dd67045f037e26828520e3695c0a0696e70 2.9.0
+0000000000000000000000000000000000000000 2.9.0
+f3840f32febc53e5fc2a815a7c6f2f1d81dfe57c 2.9.1
+0000000000000000000000000000000000000000 2.9.1
+a6017724af42d0dc5b27fae26a42a87704cc72c7 2.9.13
+a6017724af42d0dc5b27fae26a42a87704cc72c7 2.9.13
+559c0f70fe06eb0efd758641a26fbb8902debe89 2.9.13
+fae12d288dc0245549ec2f6db824af939717b5eb 2.9.14
+ac64c6e70db20ea5b4080b924872f29728100325 2.9.15
+ac64c6e70db20ea5b4080b924872f29728100325 2.9.15
+c20357c9133be435919db0b2948f9aa164f19a0b 2.9.15
+c20357c9133be435919db0b2948f9aa164f19a0b 2.9.15
+8f88d3957418b667bafe8aa6d48ec2746853c956 2.9.15
+71f40111c849408b60e1bc7aeddfef84c9e55eea 2.9.17
+170242bd646540bfb5521fe7ff2e54bc8a97dd35 2.9.18
+7030c379d395c9eb463ca48943373d52121140d9 2.9.19
--- a/README.md
+++ b/README.md
--- a/__init__.py
+++ b/__init__.py
-from metaphlan2 import metaphlan2
+from q2_metaphlan2 import metaphlan2
 from ._metaphlan2 import profile_single_fastq
 from ._metaphlan2 import profile_paired_fastq


--- a/_metaphlan2.py
+++ b/_metaphlan2.py
@@ -23,6 +23,12 @@ def metaphlan2_helper(raw_data, nproc, input_type, output_file, verbose=True):

    sb.run(cmd, check=True)

+    print('\n\nIf you use MetaPhlAn2 in your work, please cite:\n\nTruong DT, '
+          'Franzosa EA, Tickle TL, Scholz M, Weingart G,\nPasolli E, Tett A, '
+          'Huttenhower C, Segata N.\nMetaPhlAn2 for enhanced metagenomic taxonomic '
+          'profiling\nNature Methods, 2015 Oct 1;12(10):902-3\n\nPMID: 26418763\n'
+          'doi: https://doi.org/10.1038/nmeth.3589', end='\n\n')
+

 def profile_single_fastq(raw_data: SingleLanePerSampleSingleEndFastqDirFmt,
                         nproc: int=1) -> biom.Table:

--- a/changeset.txt
+++ b/changeset.txt
+== Version 2.9
+- Automatic retrieval and installation of the latest MetaPhlAn2 database
+- New MetaPhlAn2 marker genes extracted with a newer version of ChocoPhlAn based on UniRef
+- Calculation of metagenome size for improved estimation of reads mapped to a given clade
+- Inclusion of NCBI taxonomy ID in the output file
+- CAMI (Taxonomic) Profiling Output Format included
 == Version 2.2.0
 - added option "marker_counts" (by Nicola)


--- a/license.txt
+++ b/license.txt
--- a/metaphlan2.py
+++ b/metaphlan2.py
--- a/plugin_setup.py
+++ b/plugin_setup.py
@@ -10,12 +10,12 @@ from q2_types.per_sample_sequences import SequencesWithQuality
 from q2_types.per_sample_sequences import PairedEndSequencesWithQuality
 from q2_types.feature_table import FeatureTable
 from q2_types.feature_table import Frequency
-import metaphlan2
+import q2_metaphlan2


 plugin = Plugin(
-    name='metaphlan2',
-    version='2.7.5',
+    name='q2-metaphlan2',
+    version='2.7.8',
    website='http://segatalab.cibio.unitn.it/tools/metaphlan2/',
    user_support_text='metaphlan-users@googlegroups.com',
    package='metaphlan2',
@@ -31,7 +31,7 @@ plugin = Plugin(
 )

 plugin.methods.register_function(
-    function=metaphlan2._metaphlan2.profile_single_fastq,
+    function=q2_metaphlan2._metaphlan2.profile_single_fastq,

    inputs={'raw_data': SampleData[SequencesWithQuality]},
    input_descriptions={'raw_data': ('metagenomic shotgun sequencing data')},
@@ -53,7 +53,7 @@ plugin.methods.register_function(
 )

 plugin.methods.register_function(
-    function=metaphlan2._metaphlan2.profile_paired_fastq,
+    function=q2_metaphlan2._metaphlan2.profile_paired_fastq,

    inputs={'raw_data': SampleData[PairedEndSequencesWithQuality]},
    input_descriptions={'raw_data': ('metagenomic shotgun sequencing data')},

--- a/strainphlan.py
+++ b/strainphlan.py
@@ -4,9 +4,11 @@

 __author__ = ('Duy Tin Truong (duytin.truong@unitn.it), '
              'Francesco Asnicar (f.asnicar@unitn.it), '
-              'Moreno Zolfo (moreno.zolfo@unitn.it)')
-__version__ = '1.2.0'
-__date__ = '31 May 2018'
+              'Moreno Zolfo (moreno.zolfo@unitn.it), '
+              'Francesco Beghini (francesco.beghini@unitn.it), '
+              'Aitor Blanco Miguez (aitor.blancomiguez@unitn.it)')
+__version__ = '1.2.2'
+__date__ = '10 Jul 2019'

 import sys
 import os
@@ -17,6 +19,8 @@ os.environ['PATH'] += ':' + MAIN_DIR
 os.environ['PATH'] += ':' + os.path.join(MAIN_DIR, 'strainphlan_src')
 sys.path.append(os.path.join(MAIN_DIR, 'strainphlan_src'))

+PYTHON_VERSION = float(sys.version_info[0])
+
 import which
 import argparse as ap
 try:
@@ -37,18 +41,15 @@ from Bio.Alphabet import IUPAC
 import pandas
 import logging
 import logging.config
-# import sample2markers
-# import copy
-# import threading
 import numpy
 import random
 import gc
-#import ipdb

 shared_variables = type('shared_variables', (object,), {})

 logging.basicConfig(level=logging.DEBUG, stream=sys.stderr,
-                    disable_existing_loggers=False,
+                    # ToDo: not a basicConfig() keyword (Python 3 raises ValueError); belongs to fileConfig()/dictConfig()
+                    #disable_existing_loggers=False,
                    format='%(asctime)s | %(levelname)s | %(name)s | %(funcName)s | %(lineno)d | %(message)s')
 logger = logging.getLogger(__name__)

@@ -486,11 +487,7 @@ def get_db_clades(db):
        tax_clades = tax.split('|')
        for i, clade in enumerate(tax_clades):
            if 't__' not in clade and 's__' not in clade:
-                if i < len(tax_clades)-1:
-                    if 't__' in tax_clades[-1]:
                clade2subclades[clade].add('|'.join(tax_clades[i+1:-1]))
-                    else:
-                        clade2subclades[clade].add('|'.join(tax_clades[i+1:]))
    sing_clades = [clade for clade in clade2subclades if
                             len(clade2subclades[clade]) == 1]

@@ -542,11 +539,10 @@ def clean_alignment(
        N_count,
        N_col):

-    length = len(sample2seq[sample2seq.keys()[0]])
+    length = len(sample2seq[list(sample2seq)[0]])
    logger.debug('marker length: %d', length)
-    aligned_samples = sample2seq.keys()
    for sample in samples:
-        if sample not in aligned_samples:
+        if sample not in sample2seq:
            sample2seq[sample] = ['-' for i in range(length)]
            sample2freq[sample] = [(0.0, 0.0, 0.0) for i in range(length)]

@@ -623,7 +619,7 @@ def add_ref_genomes(genome2marker, marker_records, ifn_ref_genomes, tmp_dir,

    # marker list
    if len(genome2marker) == 0:
-        unique_markers = set(marker_records.keys())
+        unique_markers = set(list(marker_records))
    else:
        unique_markers = set([])
        for sample in genome2marker:
@@ -635,14 +631,20 @@ def add_ref_genomes(genome2marker, marker_records, ifn_ref_genomes, tmp_dir,
    # add ifn_ref_genomes
    oosp = ooSubprocess.ooSubprocess(tmp_dir=tmp_dir)
    logger.debug('load genome contigs')
-    p1 = SpooledTemporaryFile(dir=tmp_dir)
+    p1 = NamedTemporaryFile(dir=tmp_dir)
    contigs = defaultdict(dict)
    for ifn_genome in ifn_ref_genomes:
        genome = ooSubprocess.splitext(ifn_genome)[0]
        if ifn_genome[-4:] == '.bz2':
+            if(PYTHON_VERSION < 3):
                ifile_genome = bz2.BZ2File(ifn_genome, 'r')
+            else:    
+                ifile_genome = bz2.open(ifn_genome, 'rt')
        elif ifn_genome[-3:] == '.gz':
+            if(PYTHON_VERSION < 3):
                ifile_genome = gzip.GzipFile(ifn_genome, 'r')
+            else: 
+                ifile_genome = gzip.open(ifn_genome, 'rt')
        elif ifn_genome[-4:] == '.fna':
            ifile_genome = open(ifn_genome, 'r')
        else:
@@ -651,7 +653,8 @@ def add_ref_genomes(genome2marker, marker_records, ifn_ref_genomes, tmp_dir,
            exit(1)

        # extract genome contigs
-        for rec in SeqIO.parse(ifile_genome, 'fasta'):
+        reference_sequences = list(SeqIO.parse(ifile_genome, 'fasta'))
+        for rec in reference_sequences:
            #rec.name = genome + '___' + rec.name
            if rec.name in contigs:
                logger.error(
@@ -661,9 +664,8 @@ def add_ref_genomes(genome2marker, marker_records, ifn_ref_genomes, tmp_dir,
                exit(1)
            contigs[rec.name]['seq'] = str(rec.seq)
            contigs[rec.name]['genome'] = genome
-            SeqIO.write(rec, p1, 'fasta')
-
        ifile_genome.close()
+        SeqIO.write(reference_sequences, p1.name, 'fasta')
    p1.seek(0)

    # build blastdb
@@ -681,10 +683,13 @@ def add_ref_genomes(genome2marker, marker_records, ifn_ref_genomes, tmp_dir,

    # blast markers against contigs
    logger.debug('blast markers against contigs')
-    p1 = SpooledTemporaryFile(dir=tmp_dir)
-
+    p1 = NamedTemporaryFile(dir=tmp_dir)
+    unique_marker_records = []
    for marker in unique_markers:
-        SeqIO.write(marker_records[marker], p1, 'fasta')
+        #SeqIO.write(marker_records[marker], p1.name, 'fasta')
+        unique_marker_records.append(marker_records[marker])
+    SeqIO.write(unique_marker_records, p1.name, 'fasta')
+

    p1.seek(0)
    blastn_args = ['-db', blastdb_prefix, '-outfmt', '6', '-evalue', '1e-10',
@@ -700,7 +705,7 @@ def add_ref_genomes(genome2marker, marker_records, ifn_ref_genomes, tmp_dir,
    for line in output:
        if line.strip() == '':
            break
-        line = line.strip().split()
+        line = line.decode("utf-8").strip().split()
        query = line[0]
        target = line[1]
        pstart = int(line[8])-1
@@ -748,17 +753,17 @@ def align_clean(args):
    marker_file = NamedTemporaryFile(dir=tmp_dir, delete=False)
    marker_fn = marker_file.name
    sample_count = 0
-    for sample in iter(sample2marker.keys()):
-        if marker in iter(sample2marker[sample].keys()):
+    list_sample_markers = []
+    for sample in iter(list(sample2marker)):
+        if marker in iter(list(sample2marker[sample])):
            sample_count += 1
-            SeqIO.write(
+            list_sample_markers.append(
                SeqRecord.SeqRecord(
                    id=sample,
                    description='',
-                    seq=Seq.Seq(sample2marker[sample][marker]['seq'])),
-                marker_file,
-                'fasta')
+                    seq=Seq.Seq(sample2marker[sample][marker]['seq'])))
    marker_file.close()
+    SeqIO.write(list_sample_markers, marker_fn, 'fasta')
    ratio = float(sample_count) / len(sample2marker)
    if  ratio < sample_in_marker:
        os.remove(marker_fn)
@@ -771,7 +776,8 @@ def align_clean(args):

    sample2seq = {}
    sample2freq = {}
-    for rec in SeqIO.parse(alignment_file, 'fasta'):
+
+    for rec in SeqIO.parse(alignment_file.name, 'fasta'):
        sample = rec.name
        sample2seq[sample] = list(str(rec.seq))
        sample2freq[sample] = list(sample2marker[sample][marker]['freq'])
@@ -794,7 +800,7 @@ def align_clean(args):
        exit(1)

    sample2seq, sample2freq = clean_alignment(
-                                    sample2marker.keys(),
+                                    list(sample2marker),
                                    sample2seq,
                                    sample2freq,
                                    gap_in_trailing_col,
@@ -944,10 +950,10 @@ def build_tree(

    # remove long gaps
    logger.debug('full sequence length before long_gap_length: %d'\
-                    %(len(sample2fullseq[sample2fullseq.keys()[0]])))
+                    %(len(sample2fullseq[list(sample2fullseq)[0]])))
    ofile_cladeinfo.write(
                    'full sequence length before long_gap_length: %d\n'\
-                    %(len(sample2fullseq[sample2fullseq.keys()[0]])))
+                    %(len(sample2fullseq[list(sample2fullseq)[0]])))

    df_seq = pandas.DataFrame.from_dict(sample2fullseq, orient='index')
    df_freq = pandas.DataFrame.from_dict(sample2fullfreq, orient='index')
@@ -982,10 +988,10 @@ def build_tree(
            sample2fullseq[sample] = df_seq.loc[sample].tolist()
            sample2fullfreq[sample] = df_freq.loc[sample].tolist()
        logger.debug('full sequence length after long_gap_length: %d'\
-                        %(len(sample2fullseq[sample2fullseq.keys()[0]])))
+                        %(len(sample2fullseq[list(sample2fullseq)[0]])))
        ofile_cladeinfo.write(
                        'full sequence length after long_gap_length: %d\n'\
-                        %(len(sample2fullseq[sample2fullseq.keys()[0]])))
+                        %(len(sample2fullseq[list(sample2fullseq)[0]])))

        for i in range(len(marker_pos)):
            num_del = 0
@@ -1151,7 +1157,7 @@ def load_sample(args):
    kept_markers = args['kept_markers']
    sample = ooSubprocess.splitext(ifn_sample)[0]
    with open(ifn_sample, 'rb') as ifile:
-        marker2seq = msgpack.load(ifile, use_list=False)
+        marker2seq = msgpack.unpack(ifile, use_list=False, encoding='utf-8')

    if kept_clade:
        if kept_clade == 'singleton':
@@ -1159,7 +1165,8 @@ def load_sample(args):
        else:
            # remove redundant clades and markers
            nmarkers = 0
-            for marker in marker2seq.keys():
+            for marker in list(marker2seq):
+                if marker in db['markers']:
                    clade = db['markers'][marker]['taxon'].split('|')[-1]
                    if kept_markers:
                        if marker in kept_markers and clade == kept_clade:
@@ -1203,6 +1210,7 @@ def load_sample(args):
        clade2n_markers = defaultdict(int)
        remove_clade = []
        for marker in marker2seq:
+            if marker in db['markers']:
                clade = db['markers'][marker]['taxon'].split('|')[-1]
                if 's__' in clade or clade in sing_clades:
                    clade2n_markers[clade] = clade2n_markers[clade] + 1
@@ -1211,7 +1219,7 @@ def load_sample(args):
        remove_clade += [clade for clade in clade2n_markers if
                         float(clade2n_markers[clade]) \
                         / float(clade2num_markers[clade]) < marker_in_clade]
-        remove_marker = [marker for marker in marker2seq if
+        remove_marker = [marker for marker in marker2seq if marker in db['markers'] if
                         db['markers'][marker]['taxon'].split('|')[-1] in
                         remove_clade]
        for marker in remove_marker:
@@ -1219,6 +1227,7 @@ def load_sample(args):

        sample_clades = set([])
        for marker in marker2seq:
+            if marker in db['markers']:
                clade = db['markers'][marker]['taxon'].split('|')[-1]
                sample_clades.add(clade)
        return sample_clades
@@ -1356,7 +1365,7 @@ def strainer(args):

        # reduce and convert to shared memory
        #logger.debug('converting db')
-        db['taxonomy'] = db['taxonomy'].keys()
+        db['taxonomy'] = list(db['taxonomy'])
        for m in db['markers']:
            del db['markers'][m]['clade']
            del db['markers'][m]['ext']
@@ -1398,11 +1407,12 @@ def strainer(args):
            repr_marker2seq = msgpack.load(ifile, use_list=False)
        if args['clades'] != ['all'] and args['clades'] != ['singleton']:
            for marker in repr_marker2seq:
+                if marker in db['markers']:
                    clade = db['markers'][marker]['taxon'].split('|')[-1]
                    if clade in args['clades']:
                        kept_markers.add(marker)
        else:
-            kept_markers = set(repr_marker2seq.keys())
+            kept_markers = set(list(repr_marker2seq))
        logger.debug('Number of markers in the representative '\
                     'sample: %d'%len(kept_markers))
        if not kept_markers:
@@ -1421,6 +1431,7 @@ def strainer(args):
            if c.startswith('s__'):
                print(c)
            else:
+                ##ToDo: Fix this on Python 3
                print(c, '(%s)'%(','.join(list(clade2subclades[c]))))
        return

@@ -1430,7 +1441,7 @@ def strainer(args):
    if (args['ifn_markers'] is not None) and (args['ifn_ref_genomes'] is not None):
        logger.info('Add reference genomes')
        marker_records = {}
-
+        # Gets the markers of the reference genome
        for rec in SeqIO.parse(open(args['ifn_markers'], 'r'), 'fasta'):
            if rec.id in kept_markers or (not kept_markers):
                marker_records[rec.id] = rec
@@ -1457,6 +1468,7 @@ def strainer(args):
        else:
            sample2order[ref] = 'first'

+ 
    # build tree for each clade
    for clade in args['clades']:
        logger.info('Build the tree for %s'%clade)
@@ -1488,12 +1500,13 @@ def strainer(args):
        # remove samples with percentage of markers less than marker_in_clade
        logger.debug('remove samples with percentage of markers '\
                     'less than marker_in_clade')
-        for sample in sample2marker.keys():
+        for sample in list(sample2marker):
            if len(sample2marker[sample]):
                if clade == 'singleton':
                    c = 'singleton'
                else:
-                    marker = sample2marker[sample].keys()[0]
+                    marker = list(sample2marker[sample])[0]
+                    if marker in db['markers']:
                        c = db['markers'][marker]['taxon'].split('|')[-1]
                if len(sample2marker[sample]) / \
                    float(clade2num_markers[c]) < args['marker_in_clade']:
@@ -1556,8 +1569,8 @@ def check_dependencies(args):

 def strainphlan():
    args = read_params()
-
    # fix db .pkl file
+    
    if '--mpa_pkl' not in sys.argv:
        if os.path.isfile(os.path.join(args['mpa_pkl'], "mpa_" + args['index'] + ".pkl")):
            args['mpa_pkl'] = os.path.join(args['mpa_pkl'], "mpa_" + args['index'] + ".pkl")

--- a/strainphlan_src/add_metadata_tree.py
+++ b/strainphlan_src/add_metadata_tree.py
 #!/usr/bin/env python
-#Author: Duy Tin Truong (duytin.truong@unitn.it)
+#Authors: Duy Tin Truong (duytin.truong@unitn.it)
 #        at CIBIO, University of Trento, Italy

-
-# import sys
-# import os
 import argparse as ap
 import pandas
-# import copy
-# import ConfigParser
 import dendropy
-# import numpy
-# import ipdb
+import numpy

+__author__ = ('Duy Tin Truong (duytin.truong@unitn.it), '
+              'Aitor Blanco Miguez (aitor.blancomiguez@unitn.it)')
+__version__ = '0.2'
+__date__    = '10 Jul 19'

 def read_params():
    p = ap.ArgumentParser()
@@ -45,7 +43,7 @@ def get_index_col(ifn):
 def main(args):
    add_fields = args['metadatas']
    for ifn_tree in args['ifn_trees']:
-        print 'Input:', ifn_tree
+        print ('Input:', ifn_tree)
        df_list = []
        samples = []
        for ifn in args['ifn_metadatas']:
@@ -53,7 +51,7 @@ def main(args):
            df = pandas.read_csv(
                ifn,
                sep='\t',
-                dtype=unicode,
+                dtype=numpy.unicode_,
                header=0,
                index_col=index_col)
            df = df.transpose()
@@ -63,14 +61,14 @@ def main(args):
                with open(ifn, 'r') as ifile:
                    add_fields = [f for f in ifile.readline().strip().split('\t') \
                                  if f.upper() != 'SAMPLEID']
-        print 'number of samples in metadata: %d'%len(samples)
+        print ('number of samples in metadata: %d'%len(samples))
        count = 0
        with open(ifn_tree, 'r') as ifile:
            line = ifile.readline()
        line = line.replace(args['string_to_remove'], '')
-        tree = dendropy.Tree(stream=open(ifn_tree, 'r'), schema='newick')
+        tree = dendropy.Tree.get(stream=open(ifn_tree, 'r'), schema='newick')
        for node in tree.leaf_nodes():
-            sample = node.get_node_str().strip("'")
+            sample = node.__getattribute__("taxon").__str__().strip("'")
            sample = sample.replace(' ', '_')
            sample = sample.replace(args['string_to_remove'], '')
            prefixes = [prefix for prefix in
@@ -101,8 +99,8 @@ def main(args):
            line = line.replace(sample + ':', metadata + ':')

        ofn_tree = ifn_tree + '.metadata'
-        print 'Number of samples in tree: %d'%count
-        print 'Output:', ofn_tree
+        print ('Number of samples in tree: %d'%count)
+        print ('Output:', ofn_tree)
        with open(ofn_tree, 'w') as ofile:
            ofile.write(line)


--- a/strainphlan_src/build_tree_single_strain.py
+++ b/strainphlan_src/build_tree_single_strain.py
@@ -2,9 +2,11 @@
 #Author: Duy Tin Truong (duytin.truong@unitn.it)
 #        at CIBIO, University of Trento, Italy

-__author__  = 'Duy Tin Truong (duytin.truong@unitn.it)'
-__version__ = '0.1'
-__date__    = '17 Sep 2015'
+__author__ = ('Duy Tin Truong (duytin.truong@unitn.it), '
+              'Aitor Blanco Miguez (aitor.blancomiguez@unitn.it)')
+__version__ = '0.2'
+__date__    = '10 Jul 19'
+

 # import sys
 import os
@@ -54,7 +56,7 @@ def read_params():


 def run(cmd):
-    print cmd
+    print(cmd)
    os.system(cmd)



--- a/strainphlan_src/compute_distance.py
+++ b/strainphlan_src/compute_distance.py
@@ -2,9 +2,10 @@
 #Author: Duy Tin Truong (duytin.truong@unitn.it)
 #        at CIBIO, University of Trento, Italy

-__author__  = 'Duy Tin Truong (duytin.truong@unitn.it)'
-__version__ = '0.1'
-__date__    = '1 Sep 2014'
+__author__ = ('Duy Tin Truong (duytin.truong@unitn.it), '
+              'Aitor Blanco Miguez (aitor.blancomiguez@unitn.it)')
+__version__ = '0.2'
+__date__    = '10 Jul 19'

 import sys
 import os
@@ -13,10 +14,11 @@ MAIN_DIR = os.path.dirname(ABS_PATH)
 os.environ['PATH'] += ':%s'%MAIN_DIR
 sys.path.append(MAIN_DIR)

+PYTHON_VERSION = float(sys.version_info[0])
+
 from mixed_utils import dist2file, statistics
 import argparse as ap
-from Bio import SeqIO#, Seq, SeqRecord
-# from collections import defaultdict
+from Bio import SeqIO
 import numpy
 from ooSubprocess import ooSubprocess

@@ -51,8 +53,12 @@ def read_params():

 def get_dist(seq1, seq2, ignore_gaps):
    if len(seq1) != len(seq2):        
+        if(PYTHON_VERSION < 3):
            print >> sys.stderr, 'Error: Two sequences have different lengths!'
            print >> sys.stderr, 'Cannot compute the distance!'
+        else:
+            print('Error: Two sequences have different lengths!', file=sys.stderr)
+            print('Cannot compute the distance!', file=sys.stderr)
        exit(1)

    abs_dist = 0.0
@@ -77,11 +83,10 @@ def get_dist(seq1, seq2, ignore_gaps):
 def compute_dist_matrix(ifn_alignment, ofn_prefix, ignore_gaps, overwrite):
    ofn_abs_dist = ofn_prefix + '.abs_dist'
    if (not overwrite) and os.path.isfile(ofn_abs_dist.replace('.abs_dist', '.rel_dist')):
-        print 'File %s exists, skip!'%ofn_abs_dist
+        print('File %s exists, skip!'%ofn_abs_dist)
        return
    else:
-        print 'Compute dist_matrix for %s'%ofn_abs_dist
-    #print 'Compute dist_matrix for %s'%ofn_abs_dist
+        print('Compute dist_matrix for %s'%ofn_abs_dist)

    recs = [rec for rec in SeqIO.parse(open((ifn_alignment), 'r'), 'fasta')]
    abs_dist = numpy.zeros((len(recs), len(recs)))

--- a/strainphlan_src/compute_distance_all.py
+++ b/strainphlan_src/compute_distance_all.py
@@ -2,9 +2,10 @@
 #Author: Duy Tin Truong (duytin.truong@unitn.it)
 #        at CIBIO, University of Trento, Italy

-__author__  = 'Duy Tin Truong (duytin.truong@unitn.it)'
-__version__ = '0.1'
-__date__    = '9 Feb 2015'
+__author__ = ('Duy Tin Truong (duytin.truong@unitn.it), '
+              'Aitor Blanco Miguez (aitor.blancomiguez@unitn.it)')
+__version__ = '0.2'
+__date__    = '10 Jul 19'

 import sys
 import os
@@ -13,9 +14,7 @@ MAIN_DIR = os.path.dirname(ABS_PATH)
 os.environ['PATH'] += ':%s'%MAIN_DIR
 sys.path.append(MAIN_DIR)
 import argparse as ap
-from Bio import SeqIO#, Seq, SeqRecord
-# from collections import defaultdict
-# import numpy
+from Bio import SeqIO
 from compute_distance import compute_dist_matrix
 from ooSubprocess import parallelize


--- a/strainphlan_src/dump_file.py
+++ b/strainphlan_src/dump_file.py
@@ -2,20 +2,19 @@
 #Author: Duy Tin Truong (duytin.truong@unitn.it)
 #        at CIBIO, University of Trento, Italy

+__author__ = ('Duy Tin Truong (duytin.truong@unitn.it), '
+              'Aitor Blanco Miguez (aitor.blancomiguez@unitn.it)')
+__version__ = '0.2'
+__date__    = '10 Jul 19'

 import sys
 import argparse as ap
 import bz2
 import gzip
 import tarfile
-#import logging.config
-#sys.path.append('../pyphlan')
-#sys.path.append('pyphlan')
 import ooSubprocess

-#logging.config.fileConfig('logging.ini', disable_existing_loggers=False)
-#logger = logging.getLogger(__name__)
-
+PYTHON_VERSION = float(sys.version_info[0])

 def read_params():
    p = ap.ArgumentParser()
@@ -33,10 +32,16 @@ def dump_file(ifn):
        ifile = tarfile.open(ifn, 'r:gz')
        file_ext = '.tar.gz'
    elif ifn.endswith('.bz2'):
+        if(PYTHON_VERSION < 3):
            ifile = bz2.BZ2File(ifn, 'r')
+        else:    
+            ifile = bz2.open(ifn, 'rt')
        file_ext = '.bz2'
    elif ifn.endswith('.gz'):
-        ifile = gzip.GzipFile(ifn, 'r')
+        if(PYTHON_VERSION < 3):
+            ifile = gzip.BZ2File(ifn, 'r')
+        else:    
+            ifile = gzip.open(ifn, 'rt')
        file_ext = '.gz'
    elif ifn.endswith('.fastq'):
        ifile = open(ifn, 'r')

--- a/strainphlan_src/extract_markers.py
+++ b/strainphlan_src/extract_markers.py
@@ -2,12 +2,11 @@
 #Author: Duy Tin Truong (duytin.truong@unitn.it)
 #        at CIBIO, University of Trento, Italy

-__author__  = 'Duy Tin Truong (duytin.truong@unitn.it)'
-__version__ = '0.1'
-__date__    = '1 Sep 2014'
+__author__ = ('Duy Tin Truong (duytin.truong@unitn.it), '
+              'Aitor Blanco Miguez (aitor.blancomiguez@unitn.it)')
+__version__ = '0.2'
+__date__    = '10 Jul 19'

-# import sys
-# import os
 import argparse as ap
 import pickle
 import bz2
@@ -29,7 +28,7 @@ def extract_markers(mpa_pkl, ifn_markers, clade, ofn_markers):
    for marker in db['markers']:
        if clade == db['markers'][marker]['taxon'].split('|')[-1]:
            markers.add(marker)
-    print 'number of markers', len(markers)
+    print('number of markers: {}'.format(len(markers)))
    with open(ofn_markers, 'w') as ofile:
        for rec in SeqIO.parse(open(ifn_markers, 'r'), 'fasta'):
            if rec.name in markers:

--- a/strainphlan_src/fastx_len_filter.py
+++ b/strainphlan_src/fastx_len_filter.py
 #!/usr/bin/env python
+#Author: Duy Tin Truong (duytin.truong@unitn.it)
+#        at CIBIO, University of Trento, Italy
+
+__author__ = ('Duy Tin Truong (duytin.truong@unitn.it), '
+              'Aitor Blanco Miguez (aitor.blancomiguez@unitn.it)')
+__version__ = '0.2'
+__date__    = '10 Jul 19'
+
+
 from Bio import SeqIO
 import argparse as ap
 import sys
@@ -12,6 +21,8 @@ if __name__ == '__main__':
 	args = read_params(sys.argv)
 	min_len = args['min_len']
 	with sys.stdout as outf:
+		list_r = []
 		for r in SeqIO.parse(sys.stdin, "fastq"):
 			if len(r) >= min_len:
-				SeqIO.write(r, outf, "fastq")
+				list_r.append(r)
+		SeqIO.write(list_r, outf, "fastq")