...
 
Commits (6)
......@@ -17,6 +17,7 @@
#
# Normal output and testing dirs
#
/build_date.h
/jdupes
/jdupes*.exe
/*.pkg.tar.xz
......
jdupes 1.11.1
- Disable build date embedding by default to make reproducible builds easier
jdupes 1.11
- Add new option -T for partial hash matches only (dangerous!)
- Fix '-P partial' printing
jdupes 1.10.4
- Fix a bug that caused -x/--xsize to fail randomly
......
......@@ -35,6 +35,7 @@ NO_HARDLINKS Disable hard linking options and code
NO_SYMLINKS Disable symbolic linking options and code
NO_USER_ORDER Disable -I/-O options and code
NO_UNICODE [Windows only] disable all Unicode support
EMBED_BUILD_DATE Hard-code the build date into the binary
The LOW_MEMORY option tweaks various knobs in the program to lower total
memory usage. It also disables some features to reduce the size of certain
......@@ -42,6 +43,13 @@ data structures. The improvements in memory usage are not very large, but if
you're running in a very RAM-limited environment or have a CPU with very small
caches it may be a good choice.
If you are building binaries that will have the same version number and still
need a way to differentiate the binaries, you can use EMBED_BUILD_DATE to add
the date of the build to the version info in 'jdupes -v'. Note that you must
have a build that will rebuild jdupes.c (such as after a 'make clean') for
the shown build date to actually update. This option also makes it impossible
to create reproducible builds if that's important to you.
A test directory is included so that you may familiarize yourself with the way
jdupes operates. You may test the program before installing it by issuing a
command such as "./jdupes testdir" or "./jdupes -r testdir", just to name a
......
......@@ -59,6 +59,13 @@ COMPILER_OPTIONS += -std=gnu99 -O2 -g -D_FILE_OFFSET_BITS=64 -fstrict-aliasing -
# no need to modify anything beyond this point #
#####################################################################
# Set built-on date for display in program version info screen.
# The quoting below is deliberately tricky: BD itself contains literal double
# quotes (from the \" escapes in the date format), and the extra \" pairs in
# the echo line rebalance the shell's quote regions so that build_date.h ends
# up with exactly one pair of quotes around the date string. Shell word
# splitting collapses the date fields to single spaces, which the format
# already uses, so the emitted line is unaffected.
ifdef EMBED_BUILD_DATE
# ':=' (simple expansion) runs the date command exactly once, at parse time,
# instead of deferring it to each reference of $(BD).
BD:=$(shell date +"\"%Y-%m-%d %H:%M:%S %z\"")
# Regenerates build_date.h on every parse while EMBED_BUILD_DATE is set; the
# new date only shows up once jdupes.c is actually recompiled (e.g. after a
# 'make clean'). -DBUILD_DATE presumably gates the #include of this header in
# the source — confirm against jdupes.c.
$(shell echo "#define BUILT_ON_DATE \"$(BD)\"" > build_date.h)
COMPILER_OPTIONS += -DBUILD_DATE
endif
# Debugging code inclusion
ifdef LOUD
DEBUG=1
......@@ -84,22 +91,13 @@ ifeq ($(OS), Windows_NT)
ifndef NO_UNICODE
UNICODE=1
COMPILER_OPTIONS += -municode
PROGRAM_SUFFIX=.exe
endif
COMPILER_OPTIONS += -D__USE_MINGW_ANSI_STDIO=1 -DON_WINDOWS=1
OBJS += win_stat.o winres.o
override undefine ENABLE_BTRFS
endif
# xxHash support
ifdef USE_XXHASH
COMPILER_OPTIONS += -DUSE_HASH_XXHASH64
OBJS += xxhash.o
OBJS_CLEAN += jody_hash.o
else
COMPILER_OPTIONS += -DUSE_HASH_JODYHASH
OBJS += jody_hash.o
OBJS_CLEAN += xxhash.o
endif
# New BTRFS support option
ifdef ENABLE_BTRFS
COMPILER_OPTIONS += -DENABLE_BTRFS
......@@ -109,7 +107,7 @@ OBJS_CLEAN += act_dedupefiles.o
endif
# Low memory mode
ifdef LOW_MEMORY
COMPILER_OPTIONS += -DLOW_MEMORY -DJODY_HASH_WIDTH=32 -DSMA_PAGE_SIZE=32768 -DCHUNK_SIZE=16384 -DNO_HARDLINKS -DNO_USER_ORDER
COMPILER_OPTIONS += -DLOW_MEMORY -DSMA_PAGE_SIZE=32768 -DCHUNK_SIZE=16384 -DNO_HARDLINKS -DNO_USER_ORDER
endif
CFLAGS += $(COMPILER_OPTIONS) $(CFLAGS_EXTRA)
......@@ -124,6 +122,7 @@ INSTALL_DATA = $(INSTALL) -m 0644
OBJS += jdupes.o jody_paths.o jody_sort.o jody_win_unicode.o string_malloc.o
OBJS += jody_cacheinfo.o
OBJS += act_deletefiles.o act_linkfiles.o act_printmatches.o act_summarize.o
OBJS += xxhash.o
OBJS += $(ADDITIONAL_OBJECTS)
all: $(PROGRAM_NAME)
......@@ -138,14 +137,18 @@ installdirs:
test -e $(DESTDIR)$(BIN_DIR) || $(MKDIR) $(DESTDIR)$(BIN_DIR)
test -e $(DESTDIR)$(MAN_DIR) || $(MKDIR) $(DESTDIR)$(MAN_DIR)
install: jdupes installdirs
install: $(PROGRAM_NAME) installdirs
$(INSTALL_PROGRAM) $(PROGRAM_NAME) $(DESTDIR)$(BIN_DIR)/$(PROGRAM_NAME)
$(INSTALL_DATA) $(PROGRAM_NAME).1 $(DESTDIR)$(MAN_DIR)/$(PROGRAM_NAME).$(MAN_EXT)
test:
./test.sh
stripped: $(PROGRAM_NAME)
strip $(PROGRAM_NAME)$(PROGRAM_SUFFIX)
clean:
$(RM) $(OBJS) $(OBJS_CLEAN) $(PROGRAM_NAME) $(PROGRAM_NAME).exe *~ *.gcno *.gcda *.gcov
$(RM) $(OBJS) $(OBJS_CLEAN) build_date.h $(PROGRAM_NAME) $(PROGRAM_NAME).exe *~ *.gcno *.gcda *.gcov
distclean: clean
$(RM) *.pkg.tar.xz
......
=== NOTE: This is archived material from 'fdupes' development ===
=== and from the pre-'jdupes' code work. DO NOT EDIT. ===
The following list, organized by fdupes version, documents changes
to fdupes. Every item on the list includes, inside square brackets,
a list of indentifiers referring to the people who contributed
that particular item. When more than one person is listed the person
who contributed the patch or idea appears first, followed by
those who've otherwise worked on that item. For a list of
contributors names and identifiers please see the CONTRIBUTORS file.
Changes from 2.1 to 2.2 [JLB]
- Changed fdupes-jody executable names and document texts to use the
full 'fdupes-jody' name instead of 'fdupes'. Moved copyrights and
contact information to reflect 'fdupes-jody' code responsibility.
This is primarily intended to keep fdupes-jody distinctly separate
from the fdupes by Adrian Lopez upon which it is based, and to make
certain that the correct person gets harassed if it breaks ;-)
- Added '-B/--dedupe' feature (not compiled in by default) which
sends file match lists directly to the kernel btrfs driver to do
block-level data de-duplication. Patch submitted by Sebastian
Schmidt <yath@yath.de>. Thanks!
- Remove and replace some string function calls. Performance increase
shows on benchmarks but not significant in most cases.
Changes from 2.0.2 to 2.1 [JLB]
- Minor performance improvements to hashing and memory allocation code
- Added an experimental tree rebalancing function. It is compiled out
by default because benchmarks indicated either no improvement or a
slight slowdown compared to an unbalanced tree. To compile it in,
try 'make CFLAGS_EXTRA=-DUSE_TREE_REBALANCE'
- Increased size of string_malloc pages from 64K to 256K since testing
shows a minor performance improvement with large file sets
- Made variable scope and type changes for a tiny performance boost
Changes from 2.0.1 to 2.0.2 [JLB]
- Removed redundant getfilestats() calls for a tiny speed boost
- Added a -D/--debug switch to show a set of statistic counters for
various parts of the fdupes algorithms. Can be used to determine what
fdupes is doing "under the hood" and give insight into why performance
may be slower or behave strangely on your data set. To enable it, use
DEBUG=1 with your make command, i.e. 'make DEBUG=1'
- Performance note: the fdupes algorithm uses a tree data structure and
becomes progressively slower as more files are processed and the tree
depth grows larger. As of this version, a rewrite of the core
algorithm is in progress which will remove this tree structure and
significantly improve performance on most large file sets.
Changes from 2.0 to 2.0.1 [JLB]
- Hard links were treated as identical in match checking but not in
match confirmation. This version fixes the problem, increasing speed
with file sets involving lots of hard links.
- A few minor efficiency improvements were performed
Changes from 1.51-jody5 to 2.0 [JLB]
- Bumped major version to 2.0 due to the number of behavioral changes
and improvements to the code, plus it looks less messy than the
hyphenated versioning
- Increased "chunk size" for better performance and to solve the disk
thrashing problem when checking two large files for a match
- When using the -H option, hard links now automatically match each
other without performing any file reads
- Changed primary memory allocator to string_alloc by Jody Bruchon
to improve performance over generic malloc/calloc
- Progress indicator now lists the number of duplicate pairs found in
addition to the usual file progress count and completion percentage
- Progress is updated more rapidly when full file comparisons happen
so users are less likely to think fdupes is "frozen"
- Floating point code was made optional and is removed by default
- A comparison script was added to check built fdupes behavior against
whatever fdupes is currently installed on the system
- Added "undocumented" -Q / --quick option which is not fully safe but can
be used to drastically reduce run time for large data sets if some risk
of data loss is acceptable to the user
- Added -O/--paramorder option to sort files by the order their parent
directory set was specified on the command line first. This makes it
possible to choose what directories' files get preserved over others
while using -d and -N together
- The file list loading progress indicator was revamped. Rather than a
simple "spinning pipe" indicator, it now shows the number of files and
directories scanned as well as which command line specified set the
scanning is currently happening in
- fdupes was enhanced to support more than 2 million files total by
changing from 'int' internal sizes to the maximum supported by the
platform being compiled for
- Hard link code was modified to be much safer; now any file is only
permanently deleted after a hard link succeeds
- Hard links on Windows (on supporting filesystems) are now supported
- Hashing code was optimized for a benchmarked 8.4% improvement in file
processing overhead (and much more under typical real-world conditions)
- Hard linking checks for more error conditions and output is much
clearer about what action was taken on any given file
Changes from 1.51-jody4-jkl1 to 1.51-jody5 [JLB]
- Less malloc()s so less memory usage and a slight speedup
- Change --order=name to an intelligent numerically correct sort
- Fixed bug where progress text was missing until first update
- Performance boost for small files (4KB or less) by not running
redundant hashes and comparisons
- Test files added for numerically correct sort ordering
Changes from 1.51-jody4 to 1.51-jody4-jkl1 [JKL]
- added `--xsize=SIZE' option: exclude files of size < SIZE
- updated Makefile: `PREFIX = /usr/local'
- updated README: Usage to reflect curent parameters
Changes from 1.51-jody2 to 1.51-jody4 [JLB]
- Add support for hard linking duplicates with -L switch
- Updated jody_hash algorithm with much lower collision rate
- Remove freopen() call that posed a portability problem
- Improved progress indicator behavior
- Many minor bug fixes
Changes from 1.51 to 1.51-jody2 [JLB]
- Switched to C99
- Replaced MD5 with Jody Bruchon's hash function
- Added a delay to progress indications for better performance
- Removed lots of unused code
- Ported fdupes to Microsoft Windows (with MinGW)
Changes from 1.50 to 1.51
- Added support for 64-bit file offsets on 32-bit systems.
- Using tty for interactive input instead of regular stdin. This is to
allow feeding filenames via stdin in future versions of fdupes without
breaking interactive deletion feature.
- Fixed some typos in --help.
- Turned C++ style comments into C style comments.
Changes from 1.40 to 1.50-PR2
- Fixed memory leak. [JB]
- Added "--summarize" option. [AL]
- Added "--recurse:" selective recursion option. [AL]
- Added "--noprompt" option for totally automated deletion of
duplicate files.
- Now sorts duplicates (old to new) for consistent order when
listing or deleteing duplicate files.
- Now tests for early matching of files, which should help speed up
the matching process when large files are involved.
- Added warning whenever a file cannot be deleted. [CHL, AL]
- Fixed bug where some files would not be closed after failure. [AL]
- Fixed bug where confirmmatch() function wouldn't always deal
properly with zero-length files. [AL]
- Fixed bug where progress indicator would not be cleared
when no files were found. [AL]
- Removed experimental red-black tree code (it was slower on
my system than the default code). [AL]
- Modified md5/md5.c to avoid compiler warning. [CHL]
- Changes to fdupes.c for compilation under platforms where
getopt_long is unavailable. [LR, AL]
- Changes to help text for clarity. [AL]
- Various changes and improvements to Makefile. [PB, AL]
Changes from 1.31 to 1.40
- Added option to omit the first file in each group
of matches. [LM, AL]
- Added escaping of filenames containing spaces when
sameline option is specified. [AL]
- Changed version indicator format from "fdupes version X.Y"
to the simpler "fdupes X.Y". [AL]
- Changed ordering of options appearing in the help
text (--help), manpage, and README file. [AL]
Changes from 1.30 to 1.31
- Added interactive option to preserve all files during
delete procedure (something similar was already in
place, but now it's official). [AL]
- Updated delete procedure prompt format. [AL]
- Cosmetic code changes. [AL]
Changes from 1.20 to 1.30
- Added size option to display size of duplicates. [LB, AL]
- Added missing typecast for proper compilation under g++. [LB]
- Better handling of errors occurring during retrieval
of a file's signature. [KK, AL]
- No longer displays an error message when specified
directories contain no files. [AL]
- Added red-black tree structure (experimental compile-time
option, disabled by default). [AL]
Changes from 1.12 to 1.20
- Fixed bug where program would crash when files being
scanned were named pipes or sockets. [FD]
- Fix against security risk resulting from the use of a
temporary file to store md5sum output. [FD, AL]
- Using an external md5sum program is now optional. Started
using L. Peter Deutsh's MD5 library instead. [FD, AL]
- Added hardlinks option to distinguish between hard links
and actual duplicate files. [FD, AL]
- Added noempty option to exclude zero-length files
from consideration [AL]
Changes from 1.11 to 1.12
- Improved handling of extremely long input on preserve
prompt (delete option). [SSD, AL]
Changes from 1.1 to 1.11
- Started checking file sizes before signatures
for better performance. [AB, AL]
- Added fdupes manpage. [AB, AL]
Changes from 1.0 to 1.1
- Added delete option for semi-automatic deletion
of duplicate files. [AL]
=== NOTE: This is archived material from 'fdupes' development ===
=== and from the pre-'jdupes' code work. DO NOT EDIT. ===
The following people have contributed in some way to the development
of fdupes. Please see the CHANGES file for detailed information
on their contributions. Names are listed in alphabetical order.
[AB] Adrian Bridgett (adrian.bridgett@iname.com)
[AL] Adrian Lopez (adrian2@caribe.net)
[CHL] Charles Longeau (chl@tuxfamily.org)
[FD] Frank DENIS, a.k.a.
Jedi/Sector One, a.k.a.
DJ Chrysalis (j@4u.net)
[JB] Jean-Baptiste ()
[JLB] Jody Lee Bruchon (jody@jodybruchon.com)
[JKL] Jan Klabacka (jan.klabacka@gmail.com)
[KK] Kresimir Kukulj (madmax@pc-hrvoje.srce.hr)
[LB] Laurent Bonnaud (Laurent.Bonnaud@iut2.upmf-grenoble.fr)
[LM] Luca Montecchiani (m.luca@iname.com)
[LR] Lukas Ruf (lukas@lpr.ch)
[PB] Peter Bray (Sydney, Australia)
[SSD] Steven S. Dick (ssd@nevets.oau.org)
Introduction
--------------------------------------------------------------------------
jdupes is a program for identifying and taking actions upon duplicate
files. This fork known as 'jdupes' is heavily modified from and improved
over the original. See CHANGES for details.
files.
A WORD OF WARNING: jdupes IS NOT a drop-in compatible replacement for
fdupes! Do not blindly replace fdupes with jdupes in scripts and expect
......@@ -12,12 +11,12 @@ between the two programs. For example, the -I switch in jdupes means
"immediately delete files during scanning without prompting the user."
Why use jdupes instead of the original fdupes or other forks?
Why use jdupes instead of the original fdupes or other duplicate finders?
--------------------------------------------------------------------------
The biggest reason is raw speed. In testing on various data sets, jdupes is
over 7 times faster than fdupes-1.51 on average.
jdupes is the only Windows port of fdupes. Most duplicate scanners built on
jdupes provides a native Windows port. Most duplicate scanners built on
Linux and other UNIX-like systems do not compile for Windows out-of-the-box
and even if they do, they don't support Unicode and other Windows-specific
quirks and features.
......@@ -38,10 +37,6 @@ not afraid of dropping features of low value; a prime example is the -1
switch which outputs all matches in a set on one line, a feature which was
found to be useless in real-world tests and therefore thrown out.
The downside is that jdupes development is never guaranteed to be bug-free!
If the program eats your dog or sets fire to your lawn, the authors cannot
be held responsible. If you notice a bug, please report it.
While jdupes maintains some degree of compatibility with fdupes from which
it was originally derived, there is no guarantee that it will continue to
maintain such compatibility in the future. However, compatibility will be
......@@ -49,6 +44,9 @@ retained between minor versions, i.e. jdupes-1.6 and jdupes-1.6.1 should
not have any significant differences in results with identical command
lines.
If the program eats your dog or sets fire to your lawn, the authors cannot
be held responsible. If you notice a bug, please report it.
What jdupes is not: a similar (but not identical) file finding tool
--------------------------------------------------------------------------
......@@ -64,11 +62,20 @@ Plenty of excellent tools already exist to "fuzzy match" specific file types
using knowledge of their file formats to help. There are no plans to add
this type of matching to jdupes.
There are some match options available in jdupes that enable dangerous file
matching based on partial or likely but not 100% certain matching. These
are considered expert options for special situations and are clearly and
loudly documented as being dangerous. The -Q and -T options are notable
examples, and the extreme danger of the -T option is safeguarded by a
requirement to specify it twice so it can't be used accidentally.
Usage
--------------------------------------------------------------------------
Usage: jdupes [options] DIRECTORY...
Duplicate file sets will be printed by default unless a different action
option is specified (delete, summarize, link, dedupe, etc.)
-@ --loud output annoying low-level debug info while running
-0 --printnull output nulls instead of CR/LF (like 'find -print0')
-1 --one-file-system do not match files on different filesystems/devices
......@@ -90,6 +97,7 @@ Usage: jdupes [options] DIRECTORY...
-I --isolate files in the same specified directory won't match
-l --linksoft make relative symlinks for duplicates w/o prompting
-L --linkhard hard link all duplicate files without prompting
Windows allows a maximum of 1023 hard links per file
-m --summarize summarize dupe information
-M --printwithsummary will print matches and --summarize at the end
-N --noprompt together with --delete, preserve the first file in
......@@ -101,15 +109,18 @@ Usage: jdupes [options] DIRECTORY...
-p --permissions don't consider files with different owner/group or
permission bits as duplicates
-P --print=type print extra info (partial, early, fullhash)
-Q --quick skip byte-for-byte confirmation for quick matching
WARNING: -Q can result in data loss! Be very careful!
-q --quiet hide progress indicator
-Q --quick skip byte-by-byte duplicate verification. WARNING:
this may delete non-duplicates! Read the manual first!
-r --recurse for every directory, process its subdirectories too
-R --recurse: for each directory given after this option follow
subdirectories encountered within (note the ':' at
the end of the option, manpage for more details)
-s --symlinks follow symlinks
-S --size show size of duplicate files
-q --quiet hide progress indicator
-T --partial-only match based on partial hashes only. WARNING:
EXTREMELY DANGEROUS paired with destructive actions!
-T must be specified twice to work. Read the manual!
-v --version display jdupes version and license information
-x --xsize=SIZE exclude files of size < SIZE bytes from consideration
--xsize=+SIZE '+' specified before SIZE, exclude size > SIZE
......@@ -141,6 +152,51 @@ than once. All files within that directory will be listed as their own
duplicates, leading to data loss should a user preserve a file without its
"duplicate" (the file itself!)
Using -1 or --one-file-system prevents matches that cross filesystems, but a
more relaxed form of this option may be added that allows cross-matching for
all filesystems that each parameter is present on.
-Z or --softabort used to be --hardabort in jdupes prior to v1.5 and had the
opposite behavior. Defaulting to taking action on abort is probably not what
most users would expect. The decision to invert rather than reassign to a
different option was made because this feature was still fairly new at the
time of the change.
The -O or --paramorder option allows the user greater control over what
appears in the first position of a match set, specifically for keeping the -N
option from deleting all but one file in a set in a seemingly random way. All
directories specified on the command line will be used as the sorting order
of result sets first, followed by the sorting algorithm set by the -o or
--order option. This means that the order of all match pairs for a single
directory specification will retain the old sorting behavior even if this
option is specified.
When used together with options -s or --symlinks, a user could accidentally
preserve a symlink while deleting the file it points to.
The -Q or --quick option only reads each file once, hashes it, and performs
comparisons based solely on the hashes. There is a small but significant risk
of a hash collision which is the purpose of the failsafe byte-for-byte
comparison that this option explicitly bypasses. Do not use it on ANY data
set for which any amount of data loss is unacceptable. You have been warned!
The -T or --partial-only option produces results based on a hash of the first
block of file data in each file, ignoring everything else in the file.
Partial hash checks have always been an important exclusion step in the
jdupes algorithm, usually hashing the first 4096 bytes of data and allowing
files that are different at the start to be rejected early. In certain
scenarios it may be a useful heuristic for a user to see that a set of files
has the same size and the same starting data, even if the remaining data does
not match; one example of this would be comparing files with data blocks that
are damaged or missing such as an incomplete file transfer or checking a data
recovery against known-good copies to see what damaged data can be deleted in
favor of restoring the known-good copy. This option is meant to be used with
informational actions and can result in EXTREME DATA LOSS if used with
options that delete files, create hard links, or perform other destructive
actions on data based on the matching output. Because of the potential for
massive data destruction, this option MUST BE SPECIFIED TWICE to take effect
and will error out if it is only specified once.
The -I/--isolate option attempts to block matches that are contained in
the same specified directory parameter on the command line. Due to the
underlying nature of the jdupes algorithm, a lot of matches will be
......@@ -354,3 +410,4 @@ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
jdupes (1.11.1-1) unstable; urgency=medium
* New upstream version 1.11.1
* debian/docs: removed no longer existent files OLD_CHANGES and
OLD_CONTRIBUTORS.
* debian/patches/10-disable_test.patch: removed in favor of an override add
to debian/rules.
* debian/rules: added override_dh_auto_test target as a substitute for the
patch 10-disable_test.patch.
* debian/tests/control: removed no longer needed redirection to stderr. The
upstream fixed the source code. Thanks!
-- Joao Eriberto Mota Filho <eriberto@debian.org> Sun, 18 Nov 2018 21:09:54 -0200
jdupes (1.10.4-1) unstable; urgency=medium
* New upstream release.
......
OLD_CHANGES
OLD_CONTRIBUTORS
README
Description: disable a wrong test.
Author: Joao Eriberto Mota Filho <eriberto@debian.org>
Last-Update: 2018-08-20
--- jdupes-1.10.2.orig/Makefile
+++ jdupes-1.10.2/Makefile
@@ -139,8 +139,8 @@ install: jdupes installdirs
$(INSTALL_PROGRAM) $(PROGRAM_NAME) $(DESTDIR)$(BIN_DIR)/$(PROGRAM_NAME)
$(INSTALL_DATA) $(PROGRAM_NAME).1 $(DESTDIR)$(MAN_DIR)/$(PROGRAM_NAME).$(MAN_EXT)
-test:
- ./test.sh
+#test:
+# ./test.sh
clean:
$(RM) $(OBJS) $(OBJS_CLEAN) $(PROGRAM_NAME) $(PROGRAM_NAME).exe *~ *.gcno *.gcda *.gcov
10-disable_test.patch
......@@ -12,3 +12,5 @@ endif
override_dh_auto_build:
dh_auto_build -- $(OPTS)
override_dh_auto_test:
Test-Command: jdupes debian/tests/files/ 2> /dev/null
Test-Command: jdupes debian/tests/files/
......@@ -8,13 +8,15 @@ jdupes \- finds and performs actions upon duplicate files
[
.I options
]
.I DIRECTORY
.I FILES and/or DIRECTORIES
\|.\|.\|.
.SH "DESCRIPTION"
Searches the given path(s) for duplicate files. Such files are found by
comparing file sizes, then partial and full file hashes, followed by a
byte-by-byte comparison.
byte-by-byte comparison. The default behavior with no other "action
options" specified (delete, summarize, link, dedupe, etc.) is to print
sets of matching files.
.SH OPTIONS
.TP
......@@ -132,6 +134,10 @@ show size of duplicate files
.B -s --symlinks
follow symlinked directories
.TP
.B -T --partial-only
.B [WARNING: EXTREME RISK OF DATA LOSS, SEE CAVEATS]
match based on hash of first block of file data, ignoring the rest
.TP
.B -v --version
display jdupes version and compilation feature flags
.TP
......@@ -218,7 +224,7 @@ or
.BR \-\-softabort
used to be --hardabort in jdupes prior to v1.5 and had the opposite behavior.
Defaulting to taking action on abort is probably not what most users would
expect. The decision to invert rather than reassign to a different switch
expect. The decision to invert rather than reassign to a different option
was made because this feature was still fairly new at the time of the change.
The
......@@ -241,11 +247,6 @@ or
a user could accidentally preserve a symlink while deleting the file it
points to.
Furthermore, when specifying a particular directory more than once, all
files within that directory will be listed as their own duplicates,
leading to data loss should a user preserve a file without its "duplicate"
(the file itself!).
The
.B \-Q
or
......@@ -258,6 +259,29 @@ set for which any amount of data loss is unacceptable. This option is not
included in the help text for the program due to its risky nature.
.B You have been warned!
The
.B \-T
or
.BR \-\-partial\-only
option produces results based on a hash of the first block of file data
in each file, ignoring everything else in the file. Partial hash checks
have always been an important exclusion step in the jdupes algorithm,
usually hashing the first 4096 bytes of data and allowing files that are
different at the start to be rejected early. In certain scenarios it may
be a useful heuristic for a user to see that a set of files has the same
size and the same starting data, even if the remaining data does not
match; one example of this would be comparing files with data blocks that
are damaged or missing such as an incomplete file transfer or checking a
data recovery against known-good copies to see what damaged data can be
deleted in favor of restoring the known-good copy. This option is meant
to be used with informational actions and
.B can result in EXTREME DATA LOSS
if used with options that delete files, create hard links, or perform
other destructive actions on data based on the matching output. Because
of the potential for massive data destruction,
.B this option MUST BE SPECIFIED TWICE
to take effect and will error out if it is only specified once.
Using the
.B \-C
or
......@@ -282,4 +306,3 @@ jdupes is a fork of 'fdupes' which is maintained by and contains
extra code copyrighted by Jody Bruchon <jody@jodybruchon.com>
Based on 'fdupes' created by Adrian Lopez <adrian2@caribe.net>
......@@ -41,14 +41,7 @@
#include <sys/time.h>
#include "jdupes.h"
#include "string_malloc.h"
#if defined USE_HASH_JODYHASH
#define JODY_HASH_NOCOMPAT
#include "jody_hash.h"
#elif defined USE_HASH_XXHASH64
#include "xxhash.h"
#else
#error No USE_HASH is defined
#endif
#include "xxhash.h"
#include "jody_sort.h"
#include "jody_win_unicode.h"
#include "jody_cacheinfo.h"
......@@ -61,7 +54,6 @@
#include "act_printmatches.h"
#include "act_summarize.h"
/* Detect Windows and modify as needed */
#if defined _WIN32 || defined __CYGWIN__
const char dir_sep = '\\';
......@@ -191,20 +183,6 @@ static const char *extensions[] = {
#ifdef SMA_PAGE_SIZE
"smapage",
#endif
#ifdef USE_JODY_HASH
#if JODY_HASH_WIDTH == 64
"jodyhash64",
#endif
#if JODY_HASH_WIDTH == 32
"jodyhash32",
#endif
#if JODY_HASH_WIDTH == 16
"jodyhash16",
#endif
#endif /* USE_JODY_HASH */
#ifdef USE_HASH_XXHASH64
"xxhash64",
#endif
#ifdef NO_PERMS
"noperm",
#endif
......@@ -320,7 +298,7 @@ extern void nullptr(const char * restrict func)
exit(EXIT_FAILURE);
}
/* Compare two jody_hashes like memcmp() */
/* Compare two hashes like memcmp() */
#define HASH_COMPARE(a,b) ((a > b) ? 1:((a == b) ? 0:-1))
......@@ -878,11 +856,11 @@ static void grokdir(const char * const restrict dir,
#ifdef UNICODE
/* Windows requires \* at the end of directory names */
strncpy(tempname, dir, PATHBUF_SIZE * 2);
strncpy(tempname, dir, PATHBUF_SIZE * 2 - 1);
dirlen = strlen(tempname) - 1;
p = tempname + dirlen;
if (*p == '/' || *p == '\\') *p = '\0';
strncat(tempname, "\\*", PATHBUF_SIZE * 2);
strncat(tempname, "\\*", PATHBUF_SIZE * 2 - 1);
if (!M2W(tempname, wname)) goto error_cd;
......@@ -1040,9 +1018,7 @@ static jdupes_hash_t *get_filehash(const file_t * const restrict checkfile,
static jdupes_hash_t *chunk = NULL;
FILE *file;
int check = 0;
#ifdef USE_HASH_XXHASH64
XXH64_state_t *xxhstate;
#endif
if (checkfile == NULL || checkfile->d_name == NULL) nullptr("get_filehash()");
LOUD(fprintf(stderr, "get_filehash('%s', %" PRIdMAX ")\n", checkfile->d_name, (intmax_t)max_read);)
......@@ -1102,11 +1078,9 @@ static jdupes_hash_t *get_filehash(const file_t * const restrict checkfile,
fsize -= PARTIAL_HASH_SIZE;
}
#ifdef USE_HASH_XXHASH64
xxhstate = XXH64_createState();
if (xxhstate == NULL) nullptr("xxhstate");
XXH64_reset(xxhstate, 0);
#endif
/* Read the file in CHUNK_SIZE chunks until we've read it all. */
while (fsize > 0) {
......@@ -1120,11 +1094,7 @@ static jdupes_hash_t *get_filehash(const file_t * const restrict checkfile,
return NULL;
}
#if defined USE_HASH_JODYHASH
*hash = jody_block_hash(chunk, *hash, bytes_to_read);
#elif defined USE_HASH_XXHASH64
XXH64_update(xxhstate, chunk, bytes_to_read);
#endif
if ((off_t)bytes_to_read > fsize) break;
else fsize -= (off_t)bytes_to_read;
......@@ -1140,10 +1110,9 @@ static jdupes_hash_t *get_filehash(const file_t * const restrict checkfile,
fclose(file);
#ifdef USE_HASH_XXHASH64
*hash = XXH64_digest(xxhstate);
XXH64_freeState(xxhstate);
#endif
LOUD(fprintf(stderr, "get_filehash: returning hash: 0x%016jx\n", (uintmax_t)*hash));
return hash;
}
......@@ -1252,8 +1221,13 @@ static file_t **checkmatch(filetree_t * restrict tree, file_t * const restrict f
LOUD(if (cmpresult) fprintf(stderr, "checkmatch: partial hashes do not match\n"));
DBG(partial_hash++;)
if (file->size <= PARTIAL_HASH_SIZE) {
LOUD(fprintf(stderr, "checkmatch: small file: copying partial hash to full hash\n"));
/* Print partial hash matching pairs if requested */
if (cmpresult == 0 && ISFLAG(p_flags, P_PARTIAL))
printf("Partial hashes match:\n %s\n %s\n\n", file->d_name, tree->file->d_name);
if (file->size <= PARTIAL_HASH_SIZE || ISFLAG(flags, F_PARTIALONLY)) {
if (ISFLAG(flags, F_PARTIALONLY)) LOUD(fprintf(stderr, "checkmatch: partial only mode: treating partial hash as full hash\n"));
else { LOUD(fprintf(stderr, "checkmatch: small file: copying partial hash to full hash\n")); }
/* filehash_partial = filehash if file is small enough */
if (!ISFLAG(file->flags, F_HASH_FULL)) {
file->filehash = file->filehash_partial;
......@@ -1283,9 +1257,6 @@ static file_t **checkmatch(filetree_t * restrict tree, file_t * const restrict f
SETFLAG(file->flags, F_HASH_FULL);
}
/* Print partial hash matching pairs if requested */
if (ISFLAG(p_flags, P_PARTIAL)) printf("Partial hashes match:\n %s\n %s\n\n", file->d_name, tree->file->d_name);
/* Full file hash comparison */
cmpresult = HASH_COMPARE(file->filehash, tree->file->filehash);
LOUD(if (!cmpresult) fprintf(stderr, "checkmatch: full hashes match\n"));
......@@ -1492,8 +1463,10 @@ static void registerpair(file_t **matchlist, file_t *newmatch,
static inline void help_text(void)
{
printf("Usage: jdupes [options] DIRECTORY...\n\n");
printf("Usage: jdupes [options] FILES and/or DIRECTORIES...\n\n");
printf("Duplicate file sets will be printed by default unless a different action\n");
printf("option is specified (delete, summarize, link, dedupe, etc.)\n");
#ifdef LOUD
printf(" -@ --loud \toutput annoying low-level debug info while running\n");
#endif
......@@ -1558,10 +1531,11 @@ static inline void help_text(void)
#endif
printf(" -S --size \tshow size of duplicate files\n");
printf(" -q --quiet \thide progress indicator\n");
/* This is undocumented in the quick help because it is a dangerous option. If you
* really want it, uncomment it here, and may your data rest in peace. */
/* printf(" -Q --quick \tskip byte-by-byte duplicate verification. WARNING:\n");
printf(" \tthis may delete non-duplicates! Read the manual first!\n"); */
printf(" -Q --quick \tskip byte-by-byte duplicate verification. WARNING:\n");
printf(" \tthis may delete non-duplicates! Read the manual first!\n");
printf(" -T --partial-only \tmatch based on partial hashes only. WARNING:\n");
printf(" \tEXTREMELY DANGEROUS paired with destructive actions!\n");
printf(" \t-T must be specified twice to work. Read the manual!\n");
printf(" -v --version \tdisplay jdupes version and license information\n");
printf(" -x --xsize=SIZE \texclude files of size < SIZE bytes from consideration\n");
printf(" --xsize=+SIZE \t'+' specified before SIZE, exclude size > SIZE\n");
......@@ -1590,6 +1564,7 @@ int main(int argc, char **argv)
static int firstrecurse;
static int opt;
static int pm = 1;
static int partialonly_spec = 0;
static ordertype_t ordertype = ORDER_NAME;
static long manual_chunk_size = 0;
#ifndef ON_WINDOWS
......@@ -1630,6 +1605,7 @@ int main(int argc, char **argv)
{ "recursive:", 0, 0, 'R' },
{ "symlinks", 0, 0, 's' },
{ "size", 0, 0, 'S' },
{ "partial-only", 0, 0, 'T' },
{ "version", 0, 0, 'v' },
{ "xsize", 1, 0, 'x' },
{ "exclude", 1, 0, 'X' },
......@@ -1687,7 +1663,7 @@ int main(int argc, char **argv)
oldargv = cloneargs(argc, argv);
while ((opt = GETOPT(argc, argv,
"@01ABC:dDfhHiIlLmMnNOpP:qQrRsSvzZo:x:X:"
"@01ABC:dDfhHiIlLmMnNOpP:qQrRsSTvVzZo:x:X:"
#ifndef OMIT_GETOPT_LONG
, long_options, NULL
#endif
......@@ -1787,6 +1763,14 @@ int main(int argc, char **argv)
case 'R':
SETFLAG(flags, F_RECURSEAFTER);
break;
case 'T':
if (partialonly_spec == 0)
partialonly_spec = 1;
else {
partialonly_spec = 2;
SETFLAG(flags, F_PARTIALONLY);
}
break;
#ifndef NO_SYMLINKS
case 'l':
SETFLAG(flags, F_MAKESYMLINKS);
......@@ -1828,6 +1812,7 @@ int main(int argc, char **argv)
#endif
break;
case 'v':
case 'V':
printf("jdupes %s (%s) ", VER, VERDATE);
/* Indicate bitness information */
......@@ -1840,6 +1825,11 @@ int main(int argc, char **argv)
} else printf("%u-bit i%u\n", (unsigned int)(sizeof(uintptr_t) * 8),
(unsigned int)(sizeof(long) * 8));
#ifdef BUILD_DATE
#include "build_date.h"
printf("Built on %s\n", BUILT_ON_DATE);
#endif
printf("Compile-time extensions:");
if (*extensions != NULL) {
int c = 0;
......@@ -1898,7 +1888,19 @@ int main(int argc, char **argv)
}
if (optind >= argc) {
fprintf(stderr, "no directories specified (use -h option for help)\n");
fprintf(stderr, "no files or directories specified (use -h option for help)\n");
string_malloc_destroy();
exit(EXIT_FAILURE);
}
if (partialonly_spec == 1) {
fprintf(stderr, "--partial-only specified only once (it's VERY DANGEROUS, read the manual!)\n");
string_malloc_destroy();
exit(EXIT_FAILURE);
}
if (ISFLAG(flags, F_PARTIALONLY) && ISFLAG(flags, F_QUICKCOMPARE)) {
fprintf(stderr, "--partial-only overrides --quick and is even more dangerous (read the manual!)\n");
string_malloc_destroy();
exit(EXIT_FAILURE);
}
......@@ -2001,15 +2003,15 @@ int main(int argc, char **argv)
/* Byte-for-byte check that a matched pair are actually matched */
if (match != NULL) {
/* Quick comparison mode will never run confirmmatch()
/* Quick or partial-only compare will never run confirmmatch()
* Also skip match confirmation for hard-linked files
* (This set of comparisons is ugly, but quite efficient) */
if (ISFLAG(flags, F_QUICKCOMPARE) ||
if (ISFLAG(flags, F_QUICKCOMPARE) || ISFLAG(flags, F_PARTIALONLY) ||
(ISFLAG(flags, F_CONSIDERHARDLINKS) &&
(curfile->inode == (*match)->inode) &&
(curfile->device == (*match)->device))
) {
LOUD(fprintf(stderr, "MAIN: notice: quick compare match (-Q)\n"));
LOUD(fprintf(stderr, "MAIN: notice: quick or partial-only match (-Q/-T)\n"));
registerpair(match, curfile,
(ordertype == ORDER_TIME) ? sort_pairs_by_mtime : sort_pairs_by_filename);
dupecount++;
......
......@@ -8,18 +8,6 @@
extern "C" {
#endif
/* Select hash algorithm */
//#define USE_HASH_JODYHASH /* jodyhash */
//#define USE_HASH_XXHASH64 /* xxHash64 */
/* Failsafes */
#if !defined USE_HASH_JODYHASH && !defined USE_HASH_XXHASH64
#define USE_HASH_JODYHASH
#endif
#if defined USE_HASH_JODYHASH && defined USE_HASH_XXHASH64
#error Multiple USE_HASH options
#endif
/* Detect Windows and modify as needed */
#if defined _WIN32 || defined __CYGWIN__
#ifndef ON_WINDOWS
......@@ -46,13 +34,7 @@ extern "C" {
#include "jody_sort.h"
#include "version.h"
/* Configure hash function based on choice above */
#if defined USE_HASH_JODYHASH
#define JODY_HASH_NOCOMPAT
#include "jody_hash.h"
#elif defined USE_HASH_XXHASH64
#include "xxhash.h"
#endif
#include "xxhash.h"
/* Optional btrfs support */
#ifdef ENABLE_BTRFS
......@@ -61,11 +43,7 @@ extern "C" {
#endif
/* Set hash type (change this if swapping in a different hash function) */
#if defined USE_HASH_JODYHASH
typedef jodyhash_t jdupes_hash_t;
#elif defined USE_HASH_XXHASH64
typedef XXH64_hash_t jdupes_hash_t;
#endif
/* Some types are different on Windows */
#ifdef ON_WINDOWS
......@@ -168,6 +146,7 @@ extern uint_fast32_t flags;
#define F_PRINTMATCHES 0x00400000U
#define F_ONEFS 0x00800000U
#define F_PRINTNULL 0x01000000U
#define F_PARTIALONLY 0x02000000U
#define F_LOUD 0x40000000U
#define F_DEBUG 0x80000000U
......
/* Jody Bruchon's fast hashing function
*
* This function was written to generate a fast hash that also has a
* fairly low collision rate. The collision rate is much higher than
* a secure hash algorithm, but the calculation is drastically simpler
* and faster.
*
* Copyright (C) 2014-2018 by Jody Bruchon <jody@jodybruchon.com>
* Released under The MIT License
*/
#include <stdio.h>
#include <stdlib.h>
#include "jody_hash.h"
/* DO NOT modify the shift unless you know what you're doing.
 * This shift was decided upon after lots of testing and
 * changing it will likely cause lots of hash collisions. */
#ifndef JODY_HASH_SHIFT
#define JODY_HASH_SHIFT 14
#endif

/* The salt value's purpose is to cause each byte in the
 * jodyhash_t word to have a positionally dependent variation.
 * It is injected into the calculation to prevent a string of
 * identical bytes from easily producing an identical hash. */

/* The tail mask table is used for block sizes that are
 * indivisible by the width of a jodyhash_t. It is ANDed with the
 * final jodyhash_t-sized element to zero out data in the buffer
 * that is not part of the data to be hashed.
 * tail_mask[n] keeps the low n bytes of a word; index 0 is the
 * all-zero mask and the last index keeps every byte. The table
 * length must therefore be sizeof(jodyhash_t) + 1 entries. */

/* Set hash parameters based on requested hash width.
 * JODY_HASH_WIDTH is chosen in jody_hash.h (default 64); each
 * width needs a matching constant and a matching mask table.
 * NOTE: the constant doubles as the "salt" described above. */
#if JODY_HASH_WIDTH == 64
#define JODY_HASH_CONSTANT 0x1f3d5b79U
static const jodyhash_t tail_mask[] = {
	0x0000000000000000,
	0x00000000000000ff,
	0x000000000000ffff,
	0x0000000000ffffff,
	0x00000000ffffffff,
	0x000000ffffffffff,
	0x0000ffffffffffff,
	0x00ffffffffffffff,
	0xffffffffffffffff
};
#endif /* JODY_HASH_WIDTH == 64 */
#if JODY_HASH_WIDTH == 32
#define JODY_HASH_CONSTANT 0x1f3d5b79U
static const jodyhash_t tail_mask[] = {
	0x00000000,
	0x000000ff,
	0x0000ffff,
	0x00ffffff,
	0xffffffff,
};
#endif /* JODY_HASH_WIDTH == 32 */
#if JODY_HASH_WIDTH == 16
#define JODY_HASH_CONSTANT 0x1f5bU
static const jodyhash_t tail_mask[] = {
	0x0000,
	0x00ff,
	0xffff,
};
#endif /* JODY_HASH_WIDTH == 16 */
/* Hash a block of arbitrary size; must be divisible by sizeof(jodyhash_t)
* The first block should pass a start_hash of zero.
* All blocks after the first should pass start_hash as the value
* returned by the last call to this function. This allows hashing
* of any amount of data. If data is not divisible by the size of
* jodyhash_t, it is MANDATORY that the caller provide a data buffer
* which is divisible by sizeof(jodyhash_t). */
extern jodyhash_t jody_block_hash(const jodyhash_t * restrict data,
const jodyhash_t start_hash, const size_t count)
{
jodyhash_t hash = start_hash;
jodyhash_t element;
jodyhash_t partial_salt;
size_t len;
/* Don't bother trying to hash a zero-length block */
if (count == 0) return hash;
len = count / sizeof(jodyhash_t);
for (; len > 0; len--) {
element = *data;
hash += element;
hash += JODY_HASH_CONSTANT;
hash = (hash << JODY_HASH_SHIFT) | hash >> (sizeof(jodyhash_t) * 8 - JODY_HASH_SHIFT); /* bit rotate left */
hash ^= element;
hash = (hash << JODY_HASH_SHIFT) | hash >> (sizeof(jodyhash_t) * 8 - JODY_HASH_SHIFT);
hash ^= JODY_HASH_CONSTANT;
hash += element;
data++;
}
/* Handle data tail (for blocks indivisible by sizeof(jodyhash_t)) */
len = count & (sizeof(jodyhash_t) - 1);
if (len) {
partial_salt = JODY_HASH_CONSTANT & tail_mask[len];
element = *data & tail_mask[len];
hash += element;
hash += partial_salt;
hash = (hash << JODY_HASH_SHIFT) | hash >> (sizeof(jodyhash_t) * 8 - JODY_HASH_SHIFT);
hash ^= element;
hash = (hash << JODY_HASH_SHIFT) | hash >> (sizeof(jodyhash_t) * 8 - JODY_HASH_SHIFT);
hash ^= partial_salt;
hash += element;
}
return hash;
}
/* Jody Bruchon's fast hashing function (headers)
 * See jody_hash.c for license information */

#ifndef JODY_HASH_H
#define JODY_HASH_H

#ifdef __cplusplus
extern "C" {
#endif

/* Required for uint64_t */
#include <stdint.h>

/* Width of a jody_hash. Changing this will also require
 * changing the width of tail masks to match.
 * Valid values: 16, 32, or 64 (see the typedefs below). */
#ifndef JODY_HASH_WIDTH
#define JODY_HASH_WIDTH 64
#endif
#if JODY_HASH_WIDTH == 64
typedef uint64_t jodyhash_t;
#endif
#if JODY_HASH_WIDTH == 32
typedef uint32_t jodyhash_t;
#endif
#if JODY_HASH_WIDTH == 16
typedef uint16_t jodyhash_t;
#endif

/* Legacy alias kept for older callers; define JODY_HASH_NOCOMPAT
 * before including this header to suppress it (avoids clashing
 * with other code that declares its own hash_t). */
#ifndef JODY_HASH_NOCOMPAT
typedef jodyhash_t hash_t;
#endif

/* Version increments when algorithm changes incompatibly */
#define JODY_HASH_VERSION 5

/* Hash 'count' bytes of 'data', chaining from 'start_hash'
 * (pass 0 to begin); see jody_hash.c for the full contract. */
extern jodyhash_t jody_block_hash(const jodyhash_t * restrict data,
		const jodyhash_t start_hash, const size_t count);

#ifdef __cplusplus
}
#endif

#endif /* JODY_HASH_H */
......@@ -89,7 +89,7 @@ extern int make_relative_link_name(const char * const src,
if (*src != '/' || *dest != '/') {
if (!getcwd(p1, PATHBUF_SIZE * 2)) goto error_getcwd;
*(p1 + (PATHBUF_SIZE * 2) - 1) = '\0';
strncat(p1, "/", PATHBUF_SIZE * 2);
strncat(p1, "/", PATHBUF_SIZE * 2 - 1);
strncpy(p2, p1, PATHBUF_SIZE * 2);
}
......
......@@ -4,7 +4,7 @@
#ifndef JDUPES_VERSION_H
#define JDUPES_VERSION_H
#define VER "1.10.4"
#define VERDATE "2018-09-09"
#define VER "1.11.1"
#define VERDATE "2018-11-09"
#endif /* JDUPES_VERSION_H */