Commits on Source (3)
......
@@ -25,6 +25,8 @@ requirements:
- biom-format >=2.1.5,<2.2.0
- ijson
- h5py
- matplotlib 3.1.0
- matplotlib-base 3.1.0
- qiime2 {{ release }}.*
test:
......
q2-types (2019.7.0-2) UNRELEASED; urgency=medium
q2-types (2019.10.0-1) UNRELEASED; urgency=medium
* Minor changes in autopkgtest
* New upstream version
-- Liubov Chuprikova <chuprikovalv@gmail.com> Tue, 17 Sep 2019 12:01:34 +0200
-- Liubov Chuprikova <chuprikovalv@gmail.com> Sun, 15 Dec 2019 13:44:07 +0100
q2-types (2019.7.0-1) unstable; urgency=medium
......
......
@@ -23,9 +23,9 @@ def get_keywords():
# setup.py/versioneer.py will grep for the variable names, so they must
# each be defined on a line of their own. _version.py will just call
# get_keywords().
git_refnames = " (tag: 2019.7.0)"
git_full = "9f31de0c81510fbe6be8b16f95e23b4c974ca002"
git_date = "2019-07-30 18:15:54 +0000"
git_refnames = " (tag: 2019.10.0)"
git_full = "b382dd345500fc2172858ff00638e6bca35760ed"
git_date = "2019-11-01 01:04:25 +0000"
keywords = {"refnames": git_refnames, "full": git_full, "date": git_date}
return keywords
......
......
@@ -54,9 +54,6 @@ class TaxonomyFormat(model.TextFileFormat):
elif line.lstrip(' ') == '\n':
# Blank line
continue
elif line.startswith('#'):
# Comment line
continue
else:
cells = line.split('\t')
if len(cells) < 2:
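For orientation, here is a standalone sketch of what TaxonomyFormat's sniff amounts to once the comment-line branch above is removed (the function name is illustrative, not part of q2-types): skip blank lines, require at least two tab-separated cells per line, and accept after inspecting up to ten data lines.

def sniff_taxonomy(lines, max_data_lines=10):
    # Headerless taxonomy sniffing: blanks are skipped, comments are no
    # longer special-cased, and every remaining line needs >= 2 TSV cells.
    data_lines = 0
    for line in lines:
        if data_lines >= max_data_lines:
            break
        if line.lstrip(' ') == '\n':
            continue  # blank line
        if len(line.split('\t')) < 2:
            return False
        data_lines += 1
    return data_lines > 0

sniff_taxonomy(['seq1\tk__Foo\n', 'seq2\tk__Bar\n'])  # -> True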
......
@@ -93,41 +90,53 @@ class TSVTaxonomyFormat(model.TextFileFormat):
Optionally followed by other arbitrary columns.
This format supports comment lines starting with #, and blank lines. The
expected header must be the first non-comment, non-blank line. In addition
to the header, there must be at least one line of data.
This format supports blank lines. The expected header must be the first
non-blank line. In addition to the header, there must be at least one line
of data.
"""
HEADER = ['Feature ID', 'Taxon']
def sniff(self):
def _check_n_records(self, n=None):
with self.open() as fh:
data_lines = 0
data_line_count = 0
header = None
while data_lines < 10:
line = fh.readline()
if line == '':
# EOF
break
elif line.lstrip(' ') == '\n':
file_ = enumerate(fh) if n is None else zip(range(n), fh)
for i, line in file_:
# Tracks line number for error reporting
i = i + 1
if line.lstrip(' ') == '\n':
# Blank line
continue
elif line.startswith('#'):
# Comment line
continue
cells = line.rstrip('\n').split('\t')
cells = line.strip('\n').split('\t')
if header is None:
if cells[:2] != self.HEADER:
return False
raise ValidationError(
'%s must be the first two header values. The '
'first two header values provided are: %s (on '
'line %s).' % (self.HEADER, cells[:2], i))
header = cells
else:
if len(cells) != len(header):
return False
data_lines += 1
raise ValidationError(
'Number of values on line %s is not the same as '
'number of header values. Found %s values '
'(%s), expected %s.' % (i, len(cells), cells,
len(self.HEADER)))
data_line_count += 1
return header is not None and data_lines > 0
if data_line_count == 0:
raise ValidationError('No taxonomy records found, only blank '
'lines and/or a header row.')
def _validate_(self, level):
self._check_n_records(n={'min': 10, 'max': None}[level])
TSVTaxonomyDirectoryFormat = model.SingleFileDirectoryFormat(
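For reference, a minimal standalone re-implementation of the validation strategy _check_n_records uses above (the names here are illustrative, and the error text is simplified relative to q2-types):

HEADER = ['Feature ID', 'Taxon']

class TaxonomyValidationError(Exception):
    pass

def validate_tsv_taxonomy(lines, n=None):
    # Check the first two header values, then require every data line to
    # have as many tab-separated values as the header row.
    header = None
    data_line_count = 0
    records = enumerate(lines, 1) if n is None else zip(range(1, n + 1), lines)
    for i, line in records:
        if line.lstrip(' ') == '\n':
            continue  # blank line
        cells = line.strip('\n').split('\t')
        if header is None:
            if cells[:2] != HEADER:
                raise TaxonomyValidationError(
                    '%s must be the first two header values; got %s on '
                    'line %d.' % (HEADER, cells[:2], i))
            header = cells
        else:
            if len(cells) != len(header):
                raise TaxonomyValidationError(
                    'Line %d has %d values; the header has %d.'
                    % (i, len(cells), len(header)))
            data_line_count += 1
    if data_line_count == 0:
        raise TaxonomyValidationError(
            'No taxonomy records found, only blank lines and/or a header row.')

validate_tsv_taxonomy(['Feature ID\tTaxon\tConfidence\n',
                       'seq1\tk__Foo; p__Bar\t-1.0\n'])  # passes silently

Min-level validation caps the scan at ten lines (n=10) while max reads the whole file (n=None), matching the {'min': 10, 'max': None} mapping in _validate_.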
......
@@ -138,6 +147,7 @@ class DNAFASTAFormat(model.TextFileFormat):
def _validate_lines(self, max_lines):
FASTADNAValidator = re.compile(r'[ACGTURYKMSWBDHVN]+\r?\n?')
last_line_was_ID = False
ids = {}
with open(str(self), 'rb') as fh:
try:
......
@@ -149,8 +159,8 @@ class DNAFASTAFormat(model.TextFileFormat):
return
if first[0] != ord(b'>'):
raise ValidationError("First line of file is not a valid "
"FASTA ID. FASTA IDs must start "
"with '>'")
"description. Descriptions must "
"start with '>'")
fh.seek(0)
for line_number, line in enumerate(fh, 1):
if line_number >= max_lines:
......
@@ -158,9 +168,24 @@ class DNAFASTAFormat(model.TextFileFormat):
line = line.decode('utf-8-sig')
if line.startswith('>'):
if last_line_was_ID:
raise ValidationError('Multiple consecutive IDs '
'starting on line '
f'{line_number-1!r}')
raise ValidationError('Multiple consecutive '
'descriptions starting on '
f'line {line_number-1!r}')
line = line.split()
if line[0] == '>':
if len(line) == 1:
raise ValidationError(
f'Description on line {line_number} is '
'missing an ID.')
else:
raise ValidationError(
f'ID on line {line_number} starts with a '
'space. IDs may not start with spaces')
if line[0] in ids:
raise ValidationError(
f'ID on line {line_number} is a duplicate of '
f'another ID on line {ids[line[0]]}.')
ids[line[0]] = line_number
last_line_was_ID = True
elif re.fullmatch(FASTADNAValidator, line):
last_line_was_ID = False
......
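A standalone sketch of the ID checks DNAFASTAFormat gains above (function and error names are illustrative only):

import re

_SEQ = re.compile(r'[ACGTURYKMSWBDHVN]+\r?\n?')

def check_fasta_ids(lines):
    # Enforce: descriptions start with '>', carry a non-empty ID that does
    # not begin with a space, are never consecutive, and IDs are unique.
    ids = {}
    last_line_was_description = False
    for line_number, line in enumerate(lines, 1):
        if line.startswith('>'):
            if last_line_was_description:
                raise ValueError('Multiple consecutive descriptions starting '
                                 'on line %d' % (line_number - 1))
            fields = line.split()
            if fields[0] == '>':
                # Either '>' alone (no ID) or '> id' (ID starts with a space)
                if len(fields) == 1:
                    raise ValueError('Description on line %d is missing an ID'
                                     % line_number)
                raise ValueError('ID on line %d starts with a space'
                                 % line_number)
            if fields[0] in ids:
                raise ValueError('ID on line %d is a duplicate of line %d'
                                 % (line_number, ids[fields[0]]))
            ids[fields[0]] = line_number
            last_line_was_description = True
        elif _SEQ.fullmatch(line):
            last_line_was_description = False
        # (the real validator also rejects lines matching neither branch)

check_fasta_ids(['>seq1\n', 'ACGT\n', '>seq2\n', 'ACGT\n'])  # passes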
......
@@ -47,7 +47,7 @@ def _taxonomy_formats_to_dataframe(filepath, has_header=None):
"""
# Using `dtype=object` and `set_index()` to avoid type casting/inference of
# any columns or the index.
df = pd.read_csv(filepath, sep='\t', comment='#', skip_blank_lines=True,
df = pd.read_csv(filepath, sep='\t', skip_blank_lines=True,
header=None, dtype=object)
if len(df.columns) < 2:
......
@@ -88,6 +88,7 @@ def _taxonomy_formats_to_dataframe(filepath, has_header=None):
"column names are duplicated: %s" %
', '.join(df.columns.get_duplicates()))
df['Taxon'] = df['Taxon'].str.strip()
return df
......
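The practical effect on the reader, as a small illustration (file contents are made up and the header handling is simplified; the real helper does considerably more):

import io
import pandas as pd

tsv = 'Feature ID\tTaxon\tConfidence\nseq1\t  k__Foo; p__Bar \t-1.0\n'
# comment='#' is gone, so a leading '#' would now be read as data, and
# dtype=object avoids any type inference.
df = pd.read_csv(io.StringIO(tsv), sep='\t', skip_blank_lines=True,
                 header=None, dtype=object)
df.columns = df.iloc[0]                 # promote the first row to a header
df = df.iloc[1:].set_index('Feature ID')
df['Taxon'] = df['Taxon'].str.strip()   # new: trim whitespace around Taxon
print(df['Taxon'].tolist())             # ['k__Foo; p__Bar']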
>SEQUENCE1
ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT
>SEQUENCE1
ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT
ACGTACGTACGTACGTACGTACGT
> this_id_starts_with_a_space
\ No newline at end of file
Feature ID Taxon
seq1 k__Bacteria; p__Proteobacteria -1.0
seq2 k__Bacteria 1.0
Feature ID Taxon Confidence Random
seq1 k__Foo; p__Bar -1.0
seq2 k__Foo; p__Baz -42.0
Feature ID Taxon Confidence
seq1 k__Foo; p__Bar -1.0
Feature ID Taxon Confidence
seq1 k__Foo; p__Bar -1.0
Feature ID Taxon Confidence
seq1 k__Foo; p__Bar -1.0
# There's some important whitespace in this file for testing, take care not to
# remove :)
# hello
# world #
......
@@ -15,20 +11,24 @@
# comment
Feature ID Taxon Extra Column
# hello, peanut
#
#
SEQUENCE1 k__Bar; p__Baz foo
# GWAR
seq2 some; taxonomy; for; ya bar baz
# FOOTER
......
@@ -37,7 +37,7 @@ class TestTaxonomyFormats(TestPluginBase):
format.validate()
def test_taxonomy_format_validate_negative(self):
filenames = ['empty', 'blanks-and-comments', '1-column.tsv']
filenames = ['empty', 'blanks', '1-column.tsv']
filepaths = [self.get_data_path(os.path.join('taxonomy', filename))
for filename in filenames]
......
@@ -78,7 +78,7 @@ class TestTaxonomyFormats(TestPluginBase):
format.validate()
def test_headerless_tsv_taxonomy_format_validate_negative(self):
filenames = ['empty', 'blanks-and-comments', '1-column.tsv']
filenames = ['empty', 'blanks', '1-column.tsv']
filepaths = [self.get_data_path(os.path.join('taxonomy', filename))
for filename in filenames]
......
@@ -113,7 +113,7 @@ class TestTaxonomyFormats(TestPluginBase):
format.validate()
def test_tsv_taxonomy_format_validate_negative(self):
filenames = ['empty', 'blanks-and-comments', '1-column.tsv',
filenames = ['empty', 'blanks', '1-column.tsv',
'headerless.tsv', 'header-only.tsv', 'jagged.tsv']
filepaths = [self.get_data_path(os.path.join('taxonomy', filename))
for filename in filenames]
......
@@ -134,6 +134,19 @@ class TestTaxonomyFormats(TestPluginBase):
format.validate()
def test_tsv_taxonomy_format_column_header_lengths(self):
filenames = ['greater-column-length.tsv', 'greater-header-length.tsv']
filepaths = [self.get_data_path(os.path.join('taxonomy', filename))
for filename in filenames]
for filepath in filepaths:
format = TSVTaxonomyFormat(filepath, mode='r')
with self.assertRaisesRegex(ValidationError,
'line 2.*3 values.*expected 2'):
format.validate()
class TestDNAFASTAFormats(TestPluginBase):
package = 'q2_types.feature_data.tests'
......
@@ -169,7 +182,8 @@ class TestDNAFASTAFormats(TestPluginBase):
filepath = self.get_data_path('dna-sequences-consecutive-ids.fasta')
format = DNAFASTAFormat(filepath, mode='r')
with self.assertRaisesRegex(ValidationError, 'consecutive IDs.*1'):
with self.assertRaisesRegex(
ValidationError, 'consecutive descriptions.*1'):
format.validate()
def test_dna_fasta_format_missing_initial_ID(self):
......
@@ -201,6 +215,28 @@ class TestDNAFASTAFormats(TestPluginBase):
format.validate()
def test_dna_fasta_format_duplicate_ids(self):
filepath = self.get_data_path('dna-sequences-duplicate-id.fasta')
format = DNAFASTAFormat(filepath, mode='r')
with self.assertRaisesRegex(ValidationError, '3.*duplicate.*1'):
format.validate()
def test_dna_fasta_format_no_id(self):
filepath = self.get_data_path('dna-sequences-no-id.fasta')
format = DNAFASTAFormat(filepath, mode='r')
with self.assertRaisesRegex(ValidationError, '1.*missing an ID'):
format.validate()
def test_dna_fasta_format_id_starts_with_space(self):
filepath = self.get_data_path(
'dna-sequences-id-starts-with-space.fasta')
format = DNAFASTAFormat(filepath, mode='r')
with self.assertRaisesRegex(ValidationError, '1 starts with a space'):
format.validate()
def test_paired_dna_sequences_directory_format(self):
filepath = self.get_data_path('dna-sequences.fasta')
temp_dir = self.temp_dir.name
......
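Exercising these checks from a script looks roughly like this (the file path is a placeholder; point it at an existing FASTA file):

from q2_types.feature_data import DNAFASTAFormat
from qiime2.plugin import ValidationError

fmt = DNAFASTAFormat('my-seqs.fasta', mode='r')  # placeholder path
try:
    fmt.validate()
except ValidationError as e:
    print(e)  # e.g. "ID on line 3 is a duplicate of another ID on line 1."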
......
@@ -264,6 +264,45 @@ class TestTaxonomyFormatTransformers(TestPluginBase):
self.assertEqual(exp, obs)
def test_tsv_taxonomy_to_metadata_trailing_whitespace_taxon(self):
_, obs = self.transform_format(TSVTaxonomyFormat, qiime2.Metadata,
os.path.join(
'taxonomy',
'trailing_space_taxon.tsv'))
index = pd.Index(['seq1'], name='Feature ID', dtype=object)
exp_df = pd.DataFrame([['k__Foo; p__Bar', '-1.0']], index=index,
columns=['Taxon', 'Confidence'], dtype=object)
exp = qiime2.Metadata(exp_df)
self.assertEqual(exp, obs)
def test_tsv_taxonomy_to_metadata_leading_whitespace_taxon(self):
_, obs = self.transform_format(TSVTaxonomyFormat, qiime2.Metadata,
os.path.join(
'taxonomy',
'leading_space_taxon.tsv'))
index = pd.Index(['seq1'], name='Feature ID', dtype=object)
exp_df = pd.DataFrame([['k__Foo; p__Bar', '-1.0']], index=index,
columns=['Taxon', 'Confidence'], dtype=object)
exp = qiime2.Metadata(exp_df)
self.assertEqual(exp, obs)
def test_tsv_taxonomy_to_metadata_trailing_leading_whitespace_taxon(self):
_, obs = self.transform_format(TSVTaxonomyFormat, qiime2.Metadata,
os.path.join(
'taxonomy',
'start_end_space_taxon.tsv'))
index = pd.Index(['seq1'], name='Feature ID', dtype=object)
exp_df = pd.DataFrame([['k__Foo; p__Bar', '-1.0']], index=index,
columns=['Taxon', 'Confidence'], dtype=object)
exp = qiime2.Metadata(exp_df)
self.assertEqual(exp, obs)
# In-depth testing of the `_taxonomy_formats_to_dataframe` helper function,
# which does the heavy lifting for the transformers.
......
@@ -275,11 +314,11 @@ class TestTaxonomyFormatsToDataFrame(TestPluginBase):
_taxonomy_formats_to_dataframe(
self.get_data_path(os.path.join('taxonomy', '1-column.tsv')))
def test_blanks_and_comments(self):
def test_blanks(self):
with self.assertRaises(pandas.io.common.EmptyDataError):
_taxonomy_formats_to_dataframe(
self.get_data_path(os.path.join('taxonomy',
'blanks-and-comments')))
'blanks')))
def test_empty(self):
with self.assertRaises(pandas.io.common.EmptyDataError):
......
......
@@ -22,7 +22,8 @@ class BIOMV100Format(model.TextFileFormat):
}
def sniff(self):
with self.open() as fh:
# Can't self.open(mode='rb'), so we defer to the backing pathlib object
with self.path.open(mode='rb') as fh:
try:
parser = ijson.parse(fh)
for prefix, event, value in parser:
......
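A minimal illustration of why the binary handle matters: ijson parses from a bytes stream, so the format now opens the backing path in 'rb' mode rather than going through self.open(). The example data is made up.

import io
import ijson

buf = io.BytesIO(b'{"format": "Biological Observation Matrix 1.0.0"}')
for prefix, event, value in ijson.parse(buf):
    if (prefix, event) == ('format', 'string'):
        print(value)  # Biological Observation Matrix 1.0.0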
......
@@ -26,7 +26,10 @@ class TestTransformers(TestPluginBase):
name='shannon', index=exp_index)
obs = transformer(exp)
obs = pd.Series.from_csv(str(obs), sep='\t', header=0)
# squeeze=True returns a Series instead of a DataFrame
obs = pd.read_csv(str(obs), sep='\t', header=0, index_col=0,
squeeze=True)
assert_series_equal(exp, obs)
......
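The replacement reads the CSV back into a Series; a self-contained sketch of that round trip (index and series names here are illustrative, and squeeze= works on the pandas versions current for this release but was removed in pandas 2.0):

import io
import pandas as pd
from pandas.testing import assert_series_equal

exp = pd.Series([1.5, 2.5], index=['s1', 's2'], name='shannon')
exp.index.name = 'id'  # illustrative index name

buf = io.StringIO()
exp.to_csv(buf, sep='\t', header=True)
buf.seek(0)

# squeeze=True collapses the single data column back into a Series
obs = pd.read_csv(buf, sep='\t', header=0, index_col=0, squeeze=True)
assert_series_equal(exp, obs)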