Should handle the filenames internally as bytes, not str
This bug was originally reported by Chris Lamb (lamby@debian.org) in Debian bug #898361:
This was first raised against trydiffoscope in
<https://github.com/lamby/trydiffoscope/issues/35>, but I think the bug is
in diffoscope itself.
So, given the following test:
import os
import pytest
import subprocess
def test_invalid_filename(capsys, tmpdir):
    base = str(tmpdir.mkdir('src')).encode('utf-8')
    a = os.path.join(base, b'\xf0\x28\x8c\x28')
    b = os.path.join(base, b'\xf0\x28\x8c\x29')
    with open(a, 'w'), open(b, 'w'):
        pass
    subprocess.check_call(('bin/diffoscope', a, b))
I get:
____________________________ test_invalid_filename _____________________________
capsys = <_pytest.capture.CaptureFixture object at 0x7f25bd267710>
tmpdir = local('/tmp/pytest-of-lamby/pytest-43/test_invalid_filename0')
def test_invalid_filename(capsys, tmpdir):
base = str(tmpdir.mkdir('src')).encode('utf-8')
a = os.path.join(base, b'\xf0\x28\x8c\x28')
b = os.path.join(base, b'\xf0\x28\x8c\x29')
with open(a, 'w'), open(b, 'w'):
pass
> subprocess.check_call(('bin/diffoscope', a, b))
tests/test_filenames.py:34:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
popenargs = (('bin/diffoscope', b'/tmp/pytest-of-lamby/pytest-43/test_invalid_filename0/src/\xf0(\x8c(', b'/tmp/pytest-of-lamby/pytest-43/test_invalid_filename0/src/\xf0(\x8c)'),)
kwargs = {}, retcode = 2
cmd = ('bin/diffoscope', b'/tmp/pytest-of-lamby/pytest-43/test_invalid_filename0/src/\xf0(\x8c(', b'/tmp/pytest-of-lamby/pytest-43/test_invalid_filename0/src/\xf0(\x8c)')
def check_call(*popenargs, **kwargs):
"""Run command with arguments. Wait for command to complete. If
the exit code was zero then return, otherwise raise
CalledProcessError. The CalledProcessError object will have the
return code in the returncode attribute.
The arguments are the same as for the call function. Example:
check_call(["ls", "-l"])
"""
retcode = call(*popenargs, **kwargs)
if retcode:
cmd = kwargs.get("args")
if cmd is None:
cmd = popenargs[0]
> raise CalledProcessError(retcode, cmd)
E subprocess.CalledProcessError: Command '('bin/diffoscope', b'/tmp/pytest-of-lamby/pytest-43/test_invalid_filename0/src/\xf0(\x8c(', b'/tmp/pytest-of-lamby/pytest-43/test_invalid_filename0/src/\xf0(\x8c)')' returned non-zero exit status 2.
/usr/lib/python3.6/subprocess.py:291: CalledProcessError
------------------------------ Captured log setup ------------------------------
locale.py 33 DEBUG Normalising locale, timezone, etc.
__init__.py 128 DEBUG Loaded 66 comparator classes
__init__.py 128 DEBUG Loaded 66 comparator classes
----------------------------- Captured stderr call -----------------------------
Traceback (most recent call last):
File "/home/lamby/git/debian/reproducible/diffoscope/diffoscope/main.py", line 448, in main
sys.exit(run_diffoscope(parsed_args))
File "/home/lamby/git/debian/reproducible/diffoscope/diffoscope/main.py", line 420, in run_diffoscope
difference = compare_root_paths(path1, path2)
File "/home/lamby/git/debian/reproducible/diffoscope/diffoscope/comparators/utils/compare.py", line 65, in compare_root_paths
file1 = specialize(FilesystemFile(path1, container=container1))
File "/home/lamby/git/debian/reproducible/diffoscope/diffoscope/comparators/utils/specialize.py", line 49, in specialize
if try_recognize(file, cls, cls.recognizes):
File "/home/lamby/git/debian/reproducible/diffoscope/diffoscope/comparators/utils/specialize.py", line 36, in try_recognize
if not recognizes(file):
File "/home/lamby/git/debian/reproducible/diffoscope/diffoscope/comparators/debian.py", line 169, in recognizes
if not super().recognizes(file):
File "/home/lamby/git/debian/reproducible/diffoscope/diffoscope/comparators/utils/file.py", line 141, in recognizes
lambda m, t: t.search(m), file.magic_file_type),
File "/home/lamby/git/debian/reproducible/diffoscope/diffoscope/comparators/utils/file.py", line 227, in magic_file_type
self._magic_file_type = File.guess_file_type(self.path)
File "/home/lamby/git/debian/reproducible/diffoscope/diffoscope/comparators/utils/file.py", line 71, in guess_file_type
return self._mimedb.file(path)
File "/usr/lib/python3/dist-packages/magic/compat.py", line 148, in file
return Magic.__tostr(_file(self._magic_t, Magic.__tobytes(filename)))
File "/usr/lib/python3/dist-packages/magic/compat.py", line 138, in __tobytes
return bytes(b, 'utf-8')
UnicodeEncodeError: 'utf-8' codec can't encode character '\udcf0' in position 58: surrogates not allowed
=========================== 1 failed in 0.75 seconds ===========================
However, I can't seem to reproduce this minimally using the `magic` bindings (from file) by themselves:
import magic
filename = b'\xf0\x28\x8c\x28'
with open(filename, 'w'):
    pass
m = magic.open(magic.NONE)
m.load()
m.file(filename)