Skip to content
Commits on Source (2)
repo: 092c2fe2278cb7f0b18d81faeb4aab98b89dc096
node: 9760413b180fc2c68b817c23602541d3a97528af
branch: default
tag: 2.7.8
node: 89633b311684ece67f32a9461aa1567e32dd42f7
branch: 2.9
tag: 2.9.20
......@@ -6,3 +6,4 @@ tests/
build/
dist/
*.egg-info/
test/
File mode changed from 100644 to 100755
c168a100f37e23e2c110849a8d91fac8da49f5bd utils/export2graphlan
69efddea43ae5e37d761cd138fcf083090371d1a utils/hclust2
266347d7df0960de6e9771fdeb8899fefe9c80a7 utils/export2graphlan
6d1023617944d9b7c69312bcb364886e7ef0a9f0 utils/hclust2
......@@ -19,3 +19,20 @@ cbd7880df400b453b8beb4e62b39e4a23b5523b6 2.7.6
4d50b9ccd95234a436541db13bcb10741f99138b 2.7.62
a71e7d6b9d50b89c4b80714af145e10d050be7cf 2.7.63
3e9c612e0a922b73214c90db51c5e26be17bf8b6 2.7.7
9760413b180fc2c68b817c23602541d3a97528af 2.7.8
28653dd67045f037e26828520e3695c0a0696e70 2.9.0
0000000000000000000000000000000000000000 2.9.0
f3840f32febc53e5fc2a815a7c6f2f1d81dfe57c 2.9.1
0000000000000000000000000000000000000000 2.9.1
a6017724af42d0dc5b27fae26a42a87704cc72c7 2.9.13
a6017724af42d0dc5b27fae26a42a87704cc72c7 2.9.13
559c0f70fe06eb0efd758641a26fbb8902debe89 2.9.13
fae12d288dc0245549ec2f6db824af939717b5eb 2.9.14
ac64c6e70db20ea5b4080b924872f29728100325 2.9.15
ac64c6e70db20ea5b4080b924872f29728100325 2.9.15
c20357c9133be435919db0b2948f9aa164f19a0b 2.9.15
c20357c9133be435919db0b2948f9aa164f19a0b 2.9.15
8f88d3957418b667bafe8aa6d48ec2746853c956 2.9.15
71f40111c849408b60e1bc7aeddfef84c9e55eea 2.9.17
170242bd646540bfb5521fe7ff2e54bc8a97dd35 2.9.18
7030c379d395c9eb463ca48943373d52121140d9 2.9.19
This diff is collapsed.
from metaphlan2 import metaphlan2
from q2_metaphlan2 import metaphlan2
from ._metaphlan2 import profile_single_fastq
from ._metaphlan2 import profile_paired_fastq
......
......@@ -23,6 +23,12 @@ def metaphlan2_helper(raw_data, nproc, input_type, output_file, verbose=True):
sb.run(cmd, check=True)
print('\n\nIf you use MetaPhlAn2 in your work, please cite:\n\nTruong DT, '
'Franzosa EA, Tickle TL, Scholz M, Weingart G,\nPasolli E, Tett A, '
'Huttenhower C, Segata N.\nMetaPhlAn2 for enhanced metagenomic taxonomic '
'profiling\nNature Methods, 2015 Oct 1;12(10):902-3\n\nPMID: 26418763\n'
'doi: https://doi.org/10.1038/nmeth.3589', end='\n\n')
def profile_single_fastq(raw_data: SingleLanePerSampleSingleEndFastqDirFmt,
nproc: int=1) -> biom.Table:
......
== Version 2.9
- Automatic retrieval and installation of the latest MetaPhlAn2 database
- New MetaPhlAn2 marker genes extracted with a newer version of ChocoPhlAn based on UniRef
- Calculation of metagenome size for improved estimation of reads mapped to a given clade
- Inclusion of NCBI taxonomy ID in the output file
- CAMI (Taxonomic) Profiling Output Format included
== Version 2.2.0
- added option "marker_counts" (by Nicola)
......
File mode changed from 100644 to 100755
This diff is collapsed.
......@@ -10,12 +10,12 @@ from q2_types.per_sample_sequences import SequencesWithQuality
from q2_types.per_sample_sequences import PairedEndSequencesWithQuality
from q2_types.feature_table import FeatureTable
from q2_types.feature_table import Frequency
import metaphlan2
import q2_metaphlan2
plugin = Plugin(
name='metaphlan2',
version='2.7.5',
name='q2-metaphlan2',
version='2.7.8',
website='http://segatalab.cibio.unitn.it/tools/metaphlan2/',
user_support_text='metaphlan-users@googlegroups.com',
package='metaphlan2',
......@@ -31,7 +31,7 @@ plugin = Plugin(
)
plugin.methods.register_function(
function=metaphlan2._metaphlan2.profile_single_fastq,
function=q2_metaphlan2._metaphlan2.profile_single_fastq,
inputs={'raw_data': SampleData[SequencesWithQuality]},
input_descriptions={'raw_data': ('metagenomic shotgun sequencing data')},
......@@ -53,7 +53,7 @@ plugin.methods.register_function(
)
plugin.methods.register_function(
function=metaphlan2._metaphlan2.profile_paired_fastq,
function=q2_metaphlan2._metaphlan2.profile_paired_fastq,
inputs={'raw_data': SampleData[PairedEndSequencesWithQuality]},
input_descriptions={'raw_data': ('metagenomic shotgun sequencing data')},
......
......@@ -4,9 +4,11 @@
__author__ = ('Duy Tin Truong (duytin.truong@unitn.it), '
'Francesco Asnicar (f.asnicar@unitn.it), '
'Moreno Zolfo (moreno.zolfo@unitn.it)')
__version__ = '1.2.0'
__date__ = '31 May 2018'
'Moreno Zolfo (moreno.zolfo@unitn.it), '
'Francesco Beghini (francesco.beghini@unitn.it), '
'Aitor Blanco Miguez (aitor.blancomiguez@unitn.it)')
__version__ = '1.2.2'
__date__ = '10 Jul 2019'
import sys
import os
......@@ -17,6 +19,8 @@ os.environ['PATH'] += ':' + MAIN_DIR
os.environ['PATH'] += ':' + os.path.join(MAIN_DIR, 'strainphlan_src')
sys.path.append(os.path.join(MAIN_DIR, 'strainphlan_src'))
PYTHON_VERSION = float(sys.version_info[0])
import which
import argparse as ap
try:
......@@ -37,18 +41,15 @@ from Bio.Alphabet import IUPAC
import pandas
import logging
import logging.config
# import sample2markers
# import copy
# import threading
import numpy
import random
import gc
#import ipdb
shared_variables = type('shared_variables', (object,), {})
logging.basicConfig(level=logging.DEBUG, stream=sys.stderr,
disable_existing_loggers=False,
# ToDo: Fix this
#disable_existing_loggers=False,
format='%(asctime)s | %(levelname)s | %(name)s | %(funcName)s | %(lineno)d | %(message)s')
logger = logging.getLogger(__name__)
......@@ -486,11 +487,7 @@ def get_db_clades(db):
tax_clades = tax.split('|')
for i, clade in enumerate(tax_clades):
if 't__' not in clade and 's__' not in clade:
if i < len(tax_clades)-1:
if 't__' in tax_clades[-1]:
clade2subclades[clade].add('|'.join(tax_clades[i+1:-1]))
else:
clade2subclades[clade].add('|'.join(tax_clades[i+1:]))
sing_clades = [clade for clade in clade2subclades if
len(clade2subclades[clade]) == 1]
......@@ -542,11 +539,10 @@ def clean_alignment(
N_count,
N_col):
length = len(sample2seq[sample2seq.keys()[0]])
length = len(sample2seq[list(sample2seq)[0]])
logger.debug('marker length: %d', length)
aligned_samples = sample2seq.keys()
for sample in samples:
if sample not in aligned_samples:
if sample not in sample2seq:
sample2seq[sample] = ['-' for i in range(length)]
sample2freq[sample] = [(0.0, 0.0, 0.0) for i in range(length)]
......@@ -623,7 +619,7 @@ def add_ref_genomes(genome2marker, marker_records, ifn_ref_genomes, tmp_dir,
# marker list
if len(genome2marker) == 0:
unique_markers = set(marker_records.keys())
unique_markers = set(list(marker_records))
else:
unique_markers = set([])
for sample in genome2marker:
......@@ -635,14 +631,20 @@ def add_ref_genomes(genome2marker, marker_records, ifn_ref_genomes, tmp_dir,
# add ifn_ref_genomes
oosp = ooSubprocess.ooSubprocess(tmp_dir=tmp_dir)
logger.debug('load genome contigs')
p1 = SpooledTemporaryFile(dir=tmp_dir)
p1 = NamedTemporaryFile(dir=tmp_dir)
contigs = defaultdict(dict)
for ifn_genome in ifn_ref_genomes:
genome = ooSubprocess.splitext(ifn_genome)[0]
if ifn_genome[-4:] == '.bz2':
if(PYTHON_VERSION < 3):
ifile_genome = bz2.BZ2File(ifn_genome, 'r')
else:
ifile_genome = bz2.open(ifn_genome, 'rt')
elif ifn_genome[-3:] == '.gz':
if(PYTHON_VERSION < 3):
ifile_genome = gzip.GzipFile(ifn_genome, 'r')
else:
ifile_genome = gzip.open(ifn_genome, 'rt')
elif ifn_genome[-4:] == '.fna':
ifile_genome = open(ifn_genome, 'r')
else:
......@@ -651,7 +653,8 @@ def add_ref_genomes(genome2marker, marker_records, ifn_ref_genomes, tmp_dir,
exit(1)
# extract genome contigs
for rec in SeqIO.parse(ifile_genome, 'fasta'):
reference_sequences = list(SeqIO.parse(ifile_genome, 'fasta'))
for rec in reference_sequences:
#rec.name = genome + '___' + rec.name
if rec.name in contigs:
logger.error(
......@@ -661,9 +664,8 @@ def add_ref_genomes(genome2marker, marker_records, ifn_ref_genomes, tmp_dir,
exit(1)
contigs[rec.name]['seq'] = str(rec.seq)
contigs[rec.name]['genome'] = genome
SeqIO.write(rec, p1, 'fasta')
ifile_genome.close()
SeqIO.write(reference_sequences, p1.name, 'fasta')
p1.seek(0)
# build blastdb
......@@ -681,10 +683,13 @@ def add_ref_genomes(genome2marker, marker_records, ifn_ref_genomes, tmp_dir,
# blast markers against contigs
logger.debug('blast markers against contigs')
p1 = SpooledTemporaryFile(dir=tmp_dir)
p1 = NamedTemporaryFile(dir=tmp_dir)
unique_marker_records = []
for marker in unique_markers:
SeqIO.write(marker_records[marker], p1, 'fasta')
#SeqIO.write(marker_records[marker], p1.name, 'fasta')
unique_marker_records.append(marker_records[marker])
SeqIO.write(unique_marker_records, p1.name, 'fasta')
p1.seek(0)
blastn_args = ['-db', blastdb_prefix, '-outfmt', '6', '-evalue', '1e-10',
......@@ -700,7 +705,7 @@ def add_ref_genomes(genome2marker, marker_records, ifn_ref_genomes, tmp_dir,
for line in output:
if line.strip() == '':
break
line = line.strip().split()
line = line.decode("utf-8").strip().split()
query = line[0]
target = line[1]
pstart = int(line[8])-1
......@@ -748,17 +753,17 @@ def align_clean(args):
marker_file = NamedTemporaryFile(dir=tmp_dir, delete=False)
marker_fn = marker_file.name
sample_count = 0
for sample in iter(sample2marker.keys()):
if marker in iter(sample2marker[sample].keys()):
list_sample_markers = []
for sample in iter(list(sample2marker)):
if marker in iter(list(sample2marker[sample])):
sample_count += 1
SeqIO.write(
list_sample_markers.append(
SeqRecord.SeqRecord(
id=sample,
description='',
seq=Seq.Seq(sample2marker[sample][marker]['seq'])),
marker_file,
'fasta')
seq=Seq.Seq(sample2marker[sample][marker]['seq'])))
marker_file.close()
SeqIO.write(list_sample_markers, marker_fn, 'fasta')
ratio = float(sample_count) / len(sample2marker)
if ratio < sample_in_marker:
os.remove(marker_fn)
......@@ -771,7 +776,8 @@ def align_clean(args):
sample2seq = {}
sample2freq = {}
for rec in SeqIO.parse(alignment_file, 'fasta'):
for rec in SeqIO.parse(alignment_file.name, 'fasta'):
sample = rec.name
sample2seq[sample] = list(str(rec.seq))
sample2freq[sample] = list(sample2marker[sample][marker]['freq'])
......@@ -794,7 +800,7 @@ def align_clean(args):
exit(1)
sample2seq, sample2freq = clean_alignment(
sample2marker.keys(),
list(sample2marker),
sample2seq,
sample2freq,
gap_in_trailing_col,
......@@ -944,10 +950,10 @@ def build_tree(
# remove long gaps
logger.debug('full sequence length before long_gap_length: %d'\
%(len(sample2fullseq[sample2fullseq.keys()[0]])))
%(len(sample2fullseq[list(sample2fullseq)[0]])))
ofile_cladeinfo.write(
'full sequence length before long_gap_length: %d\n'\
%(len(sample2fullseq[sample2fullseq.keys()[0]])))
%(len(sample2fullseq[list(sample2fullseq)[0]])))
df_seq = pandas.DataFrame.from_dict(sample2fullseq, orient='index')
df_freq = pandas.DataFrame.from_dict(sample2fullfreq, orient='index')
......@@ -982,10 +988,10 @@ def build_tree(
sample2fullseq[sample] = df_seq.loc[sample].tolist()
sample2fullfreq[sample] = df_freq.loc[sample].tolist()
logger.debug('full sequence length after long_gap_length: %d'\
%(len(sample2fullseq[sample2fullseq.keys()[0]])))
%(len(sample2fullseq[list(sample2fullseq)[0]])))
ofile_cladeinfo.write(
'full sequence length after long_gap_length: %d\n'\
%(len(sample2fullseq[sample2fullseq.keys()[0]])))
%(len(sample2fullseq[list(sample2fullseq)[0]])))
for i in range(len(marker_pos)):
num_del = 0
......@@ -1151,7 +1157,7 @@ def load_sample(args):
kept_markers = args['kept_markers']
sample = ooSubprocess.splitext(ifn_sample)[0]
with open(ifn_sample, 'rb') as ifile:
marker2seq = msgpack.load(ifile, use_list=False)
marker2seq = msgpack.unpack(ifile, use_list=False, encoding='utf-8')
if kept_clade:
if kept_clade == 'singleton':
......@@ -1159,7 +1165,8 @@ def load_sample(args):
else:
# remove redundant clades and markers
nmarkers = 0
for marker in marker2seq.keys():
for marker in list(marker2seq):
if marker in db['markers']:
clade = db['markers'][marker]['taxon'].split('|')[-1]
if kept_markers:
if marker in kept_markers and clade == kept_clade:
......@@ -1203,6 +1210,7 @@ def load_sample(args):
clade2n_markers = defaultdict(int)
remove_clade = []
for marker in marker2seq:
if marker in db['markers']:
clade = db['markers'][marker]['taxon'].split('|')[-1]
if 's__' in clade or clade in sing_clades:
clade2n_markers[clade] = clade2n_markers[clade] + 1
......@@ -1211,7 +1219,7 @@ def load_sample(args):
remove_clade += [clade for clade in clade2n_markers if
float(clade2n_markers[clade]) \
/ float(clade2num_markers[clade]) < marker_in_clade]
remove_marker = [marker for marker in marker2seq if
remove_marker = [marker for marker in marker2seq if marker in db['markers'] if
db['markers'][marker]['taxon'].split('|')[-1] in
remove_clade]
for marker in remove_marker:
......@@ -1219,6 +1227,7 @@ def load_sample(args):
sample_clades = set([])
for marker in marker2seq:
if marker in db['markers']:
clade = db['markers'][marker]['taxon'].split('|')[-1]
sample_clades.add(clade)
return sample_clades
......@@ -1356,7 +1365,7 @@ def strainer(args):
# reduce and convert to shared memory
#logger.debug('converting db')
db['taxonomy'] = db['taxonomy'].keys()
db['taxonomy'] = list(db['taxonomy'])
for m in db['markers']:
del db['markers'][m]['clade']
del db['markers'][m]['ext']
......@@ -1398,11 +1407,12 @@ def strainer(args):
repr_marker2seq = msgpack.load(ifile, use_list=False)
if args['clades'] != ['all'] and args['clades'] != ['singleton']:
for marker in repr_marker2seq:
if marker in db['markers']:
clade = db['markers'][marker]['taxon'].split('|')[-1]
if clade in args['clades']:
kept_markers.add(marker)
else:
kept_markers = set(repr_marker2seq.keys())
kept_markers = set(list(repr_marker2seq))
logger.debug('Number of markers in the representative '\
'sample: %d'%len(kept_markers))
if not kept_markers:
......@@ -1421,6 +1431,7 @@ def strainer(args):
if c.startswith('s__'):
print(c)
else:
##ToDo: Fix this on Python 3
print(c, '(%s)'%(','.join(list(clade2subclades[c]))))
return
......@@ -1430,7 +1441,7 @@ def strainer(args):
if (args['ifn_markers'] is not None) and (args['ifn_ref_genomes'] is not None):
logger.info('Add reference genomes')
marker_records = {}
# Gets the markers of the reference genome
for rec in SeqIO.parse(open(args['ifn_markers'], 'r'), 'fasta'):
if rec.id in kept_markers or (not kept_markers):
marker_records[rec.id] = rec
......@@ -1457,6 +1468,7 @@ def strainer(args):
else:
sample2order[ref] = 'first'
# build tree for each clade
for clade in args['clades']:
logger.info('Build the tree for %s'%clade)
......@@ -1488,12 +1500,13 @@ def strainer(args):
# remove samples with percentage of markers less than marker_in_clade
logger.debug('remove samples with percentage of markers '\
'less than marker_in_clade')
for sample in sample2marker.keys():
for sample in list(sample2marker):
if len(sample2marker[sample]):
if clade == 'singleton':
c = 'singleton'
else:
marker = sample2marker[sample].keys()[0]
marker = list(sample2marker[sample])[0]
if marker in db['markers']:
c = db['markers'][marker]['taxon'].split('|')[-1]
if len(sample2marker[sample]) / \
float(clade2num_markers[c]) < args['marker_in_clade']:
......@@ -1556,8 +1569,8 @@ def check_dependencies(args):
def strainphlan():
args = read_params()
# fix db .pkl file
if '--mpa_pkl' not in sys.argv:
if os.path.isfile(os.path.join(args['mpa_pkl'], "mpa_" + args['index'] + ".pkl")):
args['mpa_pkl'] = os.path.join(args['mpa_pkl'], "mpa_" + args['index'] + ".pkl")
......
#!/usr/bin/env python
#Author: Duy Tin Truong (duytin.truong@unitn.it)
#Authors: Duy Tin Truong (duytin.truong@unitn.it)
# at CIBIO, University of Trento, Italy
# import sys
# import os
import argparse as ap
import pandas
# import copy
# import ConfigParser
import dendropy
# import numpy
# import ipdb
import numpy
__author__ = ('Duy Tin Truong (duytin.truong@unitn.it), '
'Aitor Blanco Miguez (aitor.blancomiguez@unitn.it)')
__version__ = '0.2'
__date__ = '10 Jul 19'
def read_params():
p = ap.ArgumentParser()
......@@ -45,7 +43,7 @@ def get_index_col(ifn):
def main(args):
add_fields = args['metadatas']
for ifn_tree in args['ifn_trees']:
print 'Input:', ifn_tree
print ('Input:', ifn_tree)
df_list = []
samples = []
for ifn in args['ifn_metadatas']:
......@@ -53,7 +51,7 @@ def main(args):
df = pandas.read_csv(
ifn,
sep='\t',
dtype=unicode,
dtype=numpy.unicode_,
header=0,
index_col=index_col)
df = df.transpose()
......@@ -63,14 +61,14 @@ def main(args):
with open(ifn, 'r') as ifile:
add_fields = [f for f in ifile.readline().strip().split('\t') \
if f.upper() != 'SAMPLEID']
print 'number of samples in metadata: %d'%len(samples)
print ('number of samples in metadata: %d'%len(samples))
count = 0
with open(ifn_tree, 'r') as ifile:
line = ifile.readline()
line = line.replace(args['string_to_remove'], '')
tree = dendropy.Tree(stream=open(ifn_tree, 'r'), schema='newick')
tree = dendropy.Tree.get(stream=open(ifn_tree, 'r'), schema='newick')
for node in tree.leaf_nodes():
sample = node.get_node_str().strip("'")
sample = node.__getattribute__("taxon").__str__().strip("'")
sample = sample.replace(' ', '_')
sample = sample.replace(args['string_to_remove'], '')
prefixes = [prefix for prefix in
......@@ -101,8 +99,8 @@ def main(args):
line = line.replace(sample + ':', metadata + ':')
ofn_tree = ifn_tree + '.metadata'
print 'Number of samples in tree: %d'%count
print 'Output:', ofn_tree
print ('Number of samples in tree: %d'%count)
print ('Output:', ofn_tree)
with open(ofn_tree, 'w') as ofile:
ofile.write(line)
......
......@@ -2,9 +2,11 @@
#Author: Duy Tin Truong (duytin.truong@unitn.it)
# at CIBIO, University of Trento, Italy
__author__ = 'Duy Tin Truong (duytin.truong@unitn.it)'
__version__ = '0.1'
__date__ = '17 Sep 2015'
__author__ = ('Duy Tin Truong (duytin.truong@unitn.it), '
'Aitor Blanco Miguez (aitor.blancomiguez@unitn.it)')
__version__ = '0.2'
__date__ = '10 Jul 19'
# import sys
import os
......@@ -54,7 +56,7 @@ def read_params():
def run(cmd):
print cmd
print(cmd)
os.system(cmd)
......
......@@ -2,9 +2,10 @@
#Author: Duy Tin Truong (duytin.truong@unitn.it)
# at CIBIO, University of Trento, Italy
__author__ = 'Duy Tin Truong (duytin.truong@unitn.it)'
__version__ = '0.1'
__date__ = '1 Sep 2014'
__author__ = ('Duy Tin Truong (duytin.truong@unitn.it), '
'Aitor Blanco Miguez (aitor.blancomiguez@unitn.it)')
__version__ = '0.2'
__date__ = '10 Jul 19'
import sys
import os
......@@ -13,10 +14,11 @@ MAIN_DIR = os.path.dirname(ABS_PATH)
os.environ['PATH'] += ':%s'%MAIN_DIR
sys.path.append(MAIN_DIR)
PYTHON_VERSION = float(sys.version_info[0])
from mixed_utils import dist2file, statistics
import argparse as ap
from Bio import SeqIO#, Seq, SeqRecord
# from collections import defaultdict
from Bio import SeqIO
import numpy
from ooSubprocess import ooSubprocess
......@@ -51,8 +53,12 @@ def read_params():
def get_dist(seq1, seq2, ignore_gaps):
if len(seq1) != len(seq2):
if(PYTHON_VERSION < 3):
print >> sys.stderr, 'Error: Two sequences have different lengths!'
print >> sys.stderr, 'Cannot compute the distance!'
else:
print('Error: Two sequences have different lengths!', file=sys.stderr)
print('Cannot compute the distance!', file=sys.stderr)
exit(1)
abs_dist = 0.0
......@@ -77,11 +83,10 @@ def get_dist(seq1, seq2, ignore_gaps):
def compute_dist_matrix(ifn_alignment, ofn_prefix, ignore_gaps, overwrite):
ofn_abs_dist = ofn_prefix + '.abs_dist'
if (not overwrite) and os.path.isfile(ofn_abs_dist.replace('.abs_dist', '.rel_dist')):
print 'File %s exists, skip!'%ofn_abs_dist
print('File %s exists, skip!'%ofn_abs_dist)
return
else:
print 'Compute dist_matrix for %s'%ofn_abs_dist
#print 'Compute dist_matrix for %s'%ofn_abs_dist
print('Compute dist_matrix for %s'%ofn_abs_dist)
recs = [rec for rec in SeqIO.parse(open((ifn_alignment), 'r'), 'fasta')]
abs_dist = numpy.zeros((len(recs), len(recs)))
......
......@@ -2,9 +2,10 @@
#Author: Duy Tin Truong (duytin.truong@unitn.it)
# at CIBIO, University of Trento, Italy
__author__ = 'Duy Tin Truong (duytin.truong@unitn.it)'
__version__ = '0.1'
__date__ = '9 Feb 2015'
__author__ = ('Duy Tin Truong (duytin.truong@unitn.it), '
'Aitor Blanco Miguez (aitor.blancomiguez@unitn.it)')
__version__ = '0.2'
__date__ = '10 Jul 19'
import sys
import os
......@@ -13,9 +14,7 @@ MAIN_DIR = os.path.dirname(ABS_PATH)
os.environ['PATH'] += ':%s'%MAIN_DIR
sys.path.append(MAIN_DIR)
import argparse as ap
from Bio import SeqIO#, Seq, SeqRecord
# from collections import defaultdict
# import numpy
from Bio import SeqIO
from compute_distance import compute_dist_matrix
from ooSubprocess import parallelize
......
......@@ -2,20 +2,19 @@
#Author: Duy Tin Truong (duytin.truong@unitn.it)
# at CIBIO, University of Trento, Italy
__author__ = ('Duy Tin Truong (duytin.truong@unitn.it), '
'Aitor Blanco Miguez (aitor.blancomiguez@unitn.it)')
__version__ = '0.2'
__date__ = '10 Jul 19'
import sys
import argparse as ap
import bz2
import gzip
import tarfile
#import logging.config
#sys.path.append('../pyphlan')
#sys.path.append('pyphlan')
import ooSubprocess
#logging.config.fileConfig('logging.ini', disable_existing_loggers=False)
#logger = logging.getLogger(__name__)
PYTHON_VERSION = float(sys.version_info[0])
def read_params():
p = ap.ArgumentParser()
......@@ -33,10 +32,16 @@ def dump_file(ifn):
ifile = tarfile.open(ifn, 'r:gz')
file_ext = '.tar.gz'
elif ifn.endswith('.bz2'):
if(PYTHON_VERSION < 3):
ifile = bz2.BZ2File(ifn, 'r')
else:
ifile = bz2.open(ifn, 'rt')
file_ext = '.bz2'
elif ifn.endswith('.gz'):
ifile = gzip.GzipFile(ifn, 'r')
# Open gzip-compressed input. Python 2 has no text-mode gzip.open, so it uses
# gzip.GzipFile; Python 3 opens in text mode ('rt') so lines are read as str.
# The gzip module has no BZ2File attribute (that class lives in bz2), so the
# Python 2 branch must use GzipFile.
if(PYTHON_VERSION < 3):
    ifile = gzip.GzipFile(ifn, 'r')
else:
    ifile = gzip.open(ifn, 'rt')
file_ext = '.gz'
elif ifn.endswith('.fastq'):
ifile = open(ifn, 'r')
......
......@@ -2,12 +2,11 @@
#Author: Duy Tin Truong (duytin.truong@unitn.it)
# at CIBIO, University of Trento, Italy
__author__ = 'Duy Tin Truong (duytin.truong@unitn.it)'
__version__ = '0.1'
__date__ = '1 Sep 2014'
__author__ = ('Duy Tin Truong (duytin.truong@unitn.it), '
'Aitor Blanco Miguez (aitor.blancomiguez@unitn.it)')
__version__ = '0.2'
__date__ = '10 Jul 19'
# import sys
# import os
import argparse as ap
import pickle
import bz2
......@@ -29,7 +28,7 @@ def extract_markers(mpa_pkl, ifn_markers, clade, ofn_markers):
for marker in db['markers']:
if clade == db['markers'][marker]['taxon'].split('|')[-1]:
markers.add(marker)
print 'number of markers', len(markers)
print('number of markers: {}'.format(len(markers)))
with open(ofn_markers, 'w') as ofile:
for rec in SeqIO.parse(open(ifn_markers, 'r'), 'fasta'):
if rec.name in markers:
......
#!/usr/bin/env python
#Author: Duy Tin Truong (duytin.truong@unitn.it)
# at CIBIO, University of Trento, Italy
__author__ = ('Duy Tin Truong (duytin.truong@unitn.it), '
'Aitor Blanco Miguez (aitor.blancomiguez@unitn.it)')
__version__ = '0.2'
__date__ = '10 Jul 19'
from Bio import SeqIO
import argparse as ap
import sys
......@@ -12,6 +21,8 @@ if __name__ == '__main__':
args = read_params(sys.argv)
min_len = args['min_len']
with sys.stdout as outf:
list_r = []
for r in SeqIO.parse(sys.stdin, "fastq"):
if len(r) >= min_len:
SeqIO.write(r, outf, "fastq")
list_r.append(r)
SeqIO.write(list_r, outf, "fastq")