Commit 4342fa36 authored by Jérémy Bobbio

Add --fuzzy-threshold option

This allows specifying the TLSH score used as the cut-off for fuzzy matching.
Specifying 0 will disable fuzzy-matching entirely.

Thanks Jakub Wilk for prompting me to implement this.

Closes: #797557
parent d8d18709
......@@ -58,11 +58,16 @@ def create_parser():
parser.add_argument('--max-diff-block-lines', dest='max_diff_block_lines', type=int,
help='maximum number of lines per diff block (default: %d)' %
Config.general.max_diff_block_lines,
default=Config.general.max_diff_input_lines)
default=Config.general.max_diff_block_lines)
parser.add_argument('--max-diff-input-lines', dest='max_diff_input_lines', type=int,
help='maximum number of lines fed to diff (default: %d)' %
Config.general.max_diff_input_lines,
default=Config.general.max_diff_input_lines)
parser.add_argument('--fuzzy-threshold', dest='fuzzy_threshold', type=int,
help='threshold for fuzzy-matching '
'(0 to disable, %d is default, 400 is high fuzziness)' %
(Config.general.fuzzy_threshold),
default=Config.general.fuzzy_threshold)
parser.add_argument('--css', metavar='url', dest='css_url',
help='link to an extra CSS for the HTML report')
parser.add_argument('file1', help='first file to compare')
......@@ -101,6 +106,7 @@ def run_diffoscope(parsed_args):
Config.general.max_diff_block_lines = parsed_args.max_diff_block_lines
Config.general.max_diff_input_lines = parsed_args.max_diff_input_lines
Config.general.max_report_size = parsed_args.max_report_size
Config.general.fuzzy_threshold = parsed_args.fuzzy_threshold
if parsed_args.debug:
logger.setLevel(logging.DEBUG)
set_locale()
......
......@@ -25,6 +25,7 @@ import re
import sys
import tlsh
from diffoscope import logger, tool_required
from diffoscope.config import Config
from diffoscope.difference import Difference
from diffoscope.comparators.binary import \
File, FilesystemFile, compare_binary_files
......@@ -126,10 +127,9 @@ def specialize(file):
return file
fuzzy_threshold = 60
def perform_fuzzy_matching(files1, files2):
if Config.general.fuzzy_threshold == 0:
return
files2 = set(files2)
already_compared = set()
for file1 in filter(lambda f: not f.is_directory(), files1):
......@@ -144,6 +144,6 @@ def perform_fuzzy_matching(files1, files2):
comparisons.sort(key=operator.itemgetter(0))
score, file2 = comparisons[0]
logger.debug('fuzzy top match %s %s: %d difference score', file1.name, file2.name, score)
if score < fuzzy_threshold:
if score < Config.general.fuzzy_threshold:
yield file1, file2, score
already_compared.add(file2)
......@@ -30,6 +30,7 @@ class Config(object):
self._max_diff_block_lines = 50
self._max_diff_input_lines = 100000 # GNU diff cannot process arbitrary large files :(
self._max_report_size = 2000 * 2 ** 10 # 2000 kB
self._fuzzy_threshold = 60
@classproperty
def general(cls):
......@@ -61,3 +62,10 @@ class Config(object):
def max_report_size(self, value):
self._max_report_size = value
@property
def fuzzy_threshold(self):
return self._fuzzy_threshold
@fuzzy_threshold.setter
def fuzzy_threshold(self, value):
self._fuzzy_threshold = value
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment