Commit 4f905264 authored by Mark Fasheh

Add option to choose hash type used.

This adds an option to duperemove (and csum-test), '--hash=' which allows
the user to pick which algorithm is used for hashing blocks.

Internally we make a hash_module structure which holds function pointers to
the various hashing operations as well as pointers to friendly and internal
names. The init_hash() function is changed to take a string and will choose
the module structure whose hash name matches that string.

Aside from defining a module structure, the changes to each hash module's C
file are minimal: functions and variables are renamed so that they don't clash.
Signed-off-by: Mark Fasheh <mfasheh@suse.de>
parent 02c6d6d1
......@@ -5,46 +5,36 @@ CFLAGS = -Wall -ggdb
MANPAGES=duperemove.8 btrfs-extent-same.8 hashstats.8 show-shared-extents.8
HEADERS=csum.h hash-tree.h results-tree.h kernel.h list.h rbtree.h dedupe.h \
btrfs-ioctl.h filerec.h btrfs-util.h debug.h util.h serialize.h \
memstats.h file_scan.h find_dupes.h run_dedupe.h xxhash.h
CFILES=duperemove.c hash-tree.c results-tree.c rbtree.c dedupe.c filerec.c \
btrfs-util.c util.c serialize.c memstats.c file_scan.c find_dupes.c \
run_dedupe.c
hash_impl_CFILES=csum-gcrypt.c csum-xxhash.c xxhash.c csum-murmur3.c
run_dedupe.c csum.c
hash_CFILES=csum-gcrypt.c csum-xxhash.c xxhash.c csum-murmur3.c
hash_CFLAGS=$(shell libgcrypt-config --cflags)
hash_LIBS=$(shell libgcrypt-config --libs)
CFILES += $(hash_CFILES)
hashstats_CFILES=hashstats.c
btrfs_extent_same_CFILES=btrfs-extent-same.c
csum_test_CFILES=csum-test.c
DIST_CFILES:=$(CFILES) $(hashstats_CFILES) $(btrfs_extent_same_CFILES) \
$(csum_test_CFILES) $(hash_impl_CFILES)
HEADERS=csum.h hash-tree.h results-tree.h kernel.h list.h rbtree.h dedupe.h \
btrfs-ioctl.h filerec.h btrfs-util.h debug.h util.h serialize.h \
memstats.h file_scan.h find_dupes.h run_dedupe.h xxhash.h
$(csum_test_CFILES) $(hash_CFILES)
DIST_SOURCES:=$(DIST_CFILES) $(HEADERS) LICENSE LICENSE.xxhash Makefile \
rbtree.txt README.md TODO $(MANPAGES) SubmittingPatches FAQ.md
DIST=duperemove-$(RELEASE)
DIST_TARBALL=$(DIST).tar.gz
TEMP_INSTALL_DIR:=$(shell mktemp -du -p .)
crypt_CFILES=csum-gcrypt.c
crypt_CFLAGS=$(shell libgcrypt-config --cflags)
crypt_LIBS=$(shell libgcrypt-config --libs)
ifdef USE_XXHASH
crypt_CFILES=csum-xxhash.c xxhash.c
crypt_CFLAGS=-DUSE_XXHASH
crypt_LIBS=
endif
ifdef USE_MURMUR3
crypt_CFILES=csum-murmur3.o
crypt_CFLAGS=-DUSE_MURMUR3
crypt_LIBS=
endif
crypt_obj=$(crypt_CFILES:.c=.o)
CFILES += $(crypt_CFILES)
objects = $(CFILES:.c=.o)
hashstats_obj = $(crypt_obj) rbtree.o hash-tree.o filerec.o util.o serialize.o \
results-tree.o
hash_obj=$(hash_CFILES:.c=.o)
hashstats_obj = $(hash_obj) rbtree.o hash-tree.o filerec.o util.o serialize.o \
results-tree.o csum.o
show_shared_obj = rbtree.o util.o
csum_test_obj = $(crypt_obj) util.o
csum_test_obj = $(hash_obj) util.o csum.o
progs = duperemove hashstats btrfs-extent-same show-shared-extents csum-test
......@@ -52,8 +42,8 @@ glib_CFLAGS=$(shell pkg-config --cflags glib-2.0)
glib_LIBS=$(shell pkg-config --libs glib-2.0)
override CFLAGS += -D_FILE_OFFSET_BITS=64 -DVERSTRING=\"$(RELEASE)\" \
$(crypt_CFLAGS) $(glib_CFLAGS) -rdynamic
LIBRARY_FLAGS += $(crypt_LIBS) $(glib_LIBS)
$(hash_CFLAGS) $(glib_CFLAGS) -rdynamic
LIBRARY_FLAGS += $(hash_LIBS) $(glib_LIBS)
DESTDIR = /
PREFIX = /usr/local
......
......@@ -28,16 +28,15 @@
GCRY_THREAD_OPTION_PTHREAD_IMPL;
unsigned int digest_len = 0;
#define HASH_TYPE "SHA256 "
char hash_type[8];
#define HASH_TYPE_SHA256 "SHA256 "
void checksum_block(char *buf, int len, unsigned char *digest)
static void sha256_checksum_block(char *buf, int len, unsigned char *digest)
{
gcry_md_hash_buffer(HASH_FUNC, digest, buf, len);
}
int init_hash(void)
static int sha256_init_hash(unsigned int *ret_digest_len)
{
gcry_control (GCRYCTL_SET_THREAD_CBS, &gcry_threads_pthread);
......@@ -57,33 +56,23 @@ int init_hash(void)
if (gcry_md_test_algo(HASH_FUNC))
return 1;
digest_len = gcry_md_get_algo_dlen(HASH_FUNC);
if (!digest_len)
*ret_digest_len = gcry_md_get_algo_dlen(HASH_FUNC);
if (!(*ret_digest_len))
return 1;
strncpy(hash_type, HASH_TYPE, 8);
abort_on(digest_len == 0 || digest_len > DIGEST_LEN_MAX);
return 0;
}
void debug_print_digest(FILE *stream, unsigned char *digest)
{
uint32_t i;
for (i = 0; i < digest_len; i++)
fprintf(stream, "%.2x", digest[i]);
}
struct running_checksum {
struct sha256_running_checksum {
gcry_md_hd_t hd;
unsigned char digest[DIGEST_LEN_MAX];
};
DECLARE_RUNNING_CSUM_CAST_FUNCS(sha256_running_checksum);
struct running_checksum *start_running_checksum(void)
static struct running_checksum *sha256_start_running_checksum(void)
{
struct running_checksum *c = calloc(1, sizeof(struct running_checksum));
struct sha256_running_checksum *c =
calloc(1, sizeof(struct sha256_running_checksum));
if (c) {
if (gcry_md_open(&c->hd, HASH_FUNC, 0) != GPG_ERR_NO_ERROR) {
......@@ -92,17 +81,20 @@ struct running_checksum *start_running_checksum(void)
}
}
return c;
return priv_to_rc(c);
}
void add_to_running_checksum(struct running_checksum *c,
unsigned int len, unsigned char *buf)
static void sha256_add_to_running_checksum(struct running_checksum *_c,
unsigned int len, unsigned char *buf)
{
struct sha256_running_checksum *c = rc_to_priv(_c);
gcry_md_write(c->hd, buf, len);
}
void finish_running_checksum(struct running_checksum *c, unsigned char *digest)
static void sha256_finish_running_checksum(struct running_checksum *_c,
unsigned char *digest)
{
struct sha256_running_checksum *c = rc_to_priv(_c);
unsigned char *gcry_digest;
/* gcry_md_read() does this implicitly */
......@@ -114,3 +106,17 @@ void finish_running_checksum(struct running_checksum *c, unsigned char *digest)
free(c);
}
/*
 * Operation table for the SHA256 module: wires each generic checksum
 * operation to the sha256_* implementation defined in this file.
 */
struct csum_module_ops ops_sha256 = {
.init = sha256_init_hash,
.checksum_block = sha256_checksum_block,
.start_running_checksum = sha256_start_running_checksum,
.add_to_running_checksum = sha256_add_to_running_checksum,
.finish_running_checksum = sha256_finish_running_checksum,
};
/*
 * Module descriptor for SHA256. 'name' is the string compared against the
 * user's hash choice, 'hash_type' is the internal identifier for this hash,
 * and 'ops' points at the operation table above.
 */
struct csum_module csum_module_sha256 = {
.name = "SHA256",
.hash_type = HASH_TYPE_SHA256,
.ops = &ops_sha256,
};
......@@ -29,9 +29,7 @@
#include "util.h"
#include "debug.h"
#define HASH_TYPE "Murmur3 "
char hash_type[8];
unsigned int digest_len = 0;
#define HASH_TYPE_MURMUR3 "Murmur3 "
#ifdef __GNUC__
#define FORCE_INLINE __attribute__((always_inline)) inline
......@@ -78,41 +76,27 @@ static FORCE_INLINE uint64_t fmix64(uint64_t k)
return k;
}
int init_hash(void)
static int murmur3_init_hash(unsigned int *ret_digest_len)
{
strncpy(hash_type, HASH_TYPE, 8);
digest_len = 16;
abort_on(digest_len == 0 || digest_len > DIGEST_LEN_MAX);
*ret_digest_len = 16;
return 0;
}
void debug_print_digest(FILE *stream, unsigned char *digest)
{
uint32_t i;
for (i = 0; i < digest_len; i++)
fprintf(stream, "%.2x", digest[i]);
}
void checksum_block(char *buf, int len, unsigned char *digest)
{
struct running_checksum *csum = start_running_checksum();
add_to_running_checksum(csum, len, (unsigned char*)buf);
finish_running_checksum(csum, digest);
}
#define REM_BUFFER_LEN 15
struct running_checksum {
struct murmur3_running_checksum {
uint64_t h1;
uint64_t h2;
uint64_t len;
unsigned char rem_buffer[REM_BUFFER_LEN]; /* Holds partial block between calls */
unsigned int rem_len;
};
DECLARE_RUNNING_CSUM_CAST_FUNCS(murmur3_running_checksum);
struct running_checksum *start_running_checksum(void)
struct running_checksum *murmur3_start_running_checksum(void)
{
struct running_checksum *c = calloc(1, sizeof(struct running_checksum));
struct murmur3_running_checksum *c =
calloc(1, sizeof(struct murmur3_running_checksum));
if (c) {
/* Init h1 & h2 with the same seed */
......@@ -122,12 +106,14 @@ struct running_checksum *start_running_checksum(void)
c->rem_len = 0;
}
return c;
return priv_to_rc(c);
}
void add_to_running_checksum(struct running_checksum *c,
unsigned int len, unsigned char *buf)
static void murmur3_add_to_running_checksum(struct running_checksum *_c,
unsigned int len,
unsigned char *buf)
{
struct murmur3_running_checksum *c = rc_to_priv(_c);
unsigned char block[16];
const uint8_t * data = (const uint8_t*)buf;
int i;
......@@ -145,7 +131,7 @@ void add_to_running_checksum(struct running_checksum *c,
data = data + (16 - c->rem_len);
len -= (16 - c->rem_len);
c->rem_len = 0;
add_to_running_checksum(c, 16, block);
add_to_running_checksum(_c, 16, block);
}
/* We will now process 16-bytes blocks, as much as possible */
......@@ -185,7 +171,7 @@ void add_to_running_checksum(struct running_checksum *c,
c->rem_len += len;
}
void checksum_tailing_data(struct running_checksum *c)
static void checksum_tailing_data(struct murmur3_running_checksum *c)
{
uint64_t c1 = BIG_CONSTANT(0x87c37b91114253d5);
uint64_t c2 = BIG_CONSTANT(0x4cf5ad432745937f);
......@@ -240,8 +226,11 @@ void checksum_tailing_data(struct running_checksum *c)
}
void finish_running_checksum(struct running_checksum *c, unsigned char *digest)
static void murmur3_finish_running_checksum(struct running_checksum *_c,
unsigned char *digest)
{
struct murmur3_running_checksum *c = rc_to_priv(_c);
checksum_tailing_data(c);
uint64_t h1 = c->h1;
......@@ -267,3 +256,25 @@ void finish_running_checksum(struct running_checksum *c, unsigned char *digest)
free(c);
}
/*
 * One-shot murmur3 hash of 'len' bytes starting at 'buf'; the result is
 * written to 'digest'. Built on the running-checksum functions: a fresh
 * state is started, fed the whole buffer once, then finished.
 */
static void murmur3_checksum_block(char *buf, int len, unsigned char *digest)
{
	unsigned char *data = (unsigned char *)buf;
	struct running_checksum *rc = murmur3_start_running_checksum();

	murmur3_add_to_running_checksum(rc, len, data);
	murmur3_finish_running_checksum(rc, digest);
}
/*
 * Operation table for the murmur3 module: wires each generic checksum
 * operation to the murmur3_* implementation defined in this file.
 */
struct csum_module_ops ops_murmur3 = {
.init = murmur3_init_hash,
.checksum_block = murmur3_checksum_block,
.start_running_checksum = murmur3_start_running_checksum,
.add_to_running_checksum = murmur3_add_to_running_checksum,
.finish_running_checksum = murmur3_finish_running_checksum,
};
/*
 * Module descriptor for murmur3. 'name' is the string compared against the
 * user's hash choice, 'hash_type' is the internal identifier for this hash,
 * and 'ops' points at the operation table above.
 */
struct csum_module csum_module_murmur3 = {
.name = "murmur3",
.hash_type = HASH_TYPE_MURMUR3,
.ops = &ops_murmur3,
};
......@@ -36,7 +36,9 @@
#include <fcntl.h>
#include <stdlib.h>
#include <unistd.h>
#include <getopt.h>
#include <errno.h>
#include <limits.h>
#include <stdio.h>
......@@ -47,21 +49,32 @@ static unsigned int buf_len = 4096;
static unsigned char *buf = NULL;
static unsigned char digest[DIGEST_LEN_MAX] = { 0, };
static char *user_hash = DEFAULT_HASH_STR;
/*
 * Codes for options that only have a long form. They start above CHAR_MAX
 * so they stay distinct from single-character option codes such as 'b'.
 */
enum {
HASH_OPTION = CHAR_MAX + 1,
};
static int parse_opts(int argc, char **argv, char **fname)
{
int c;
static struct option long_ops[] = {
{ "hash", 1, 0, HASH_OPTION },
{ 0, 0, 0, 0}
};
if (argc < 2)
return 1;
while ((c = getopt(argc, argv, "b:"))
while ((c = getopt_long(argc, argv, "b:", long_ops, NULL))
!= -1) {
switch (c) {
case 'b':
buf_len = atoi(optarg);
printf("User provided buffer len: %u\n", buf_len);
break;
case HASH_OPTION:
user_hash = optarg;
break;
default:
return 1;
}
......@@ -80,14 +93,16 @@ int main(int argc, char **argv)
struct stat s;
struct running_checksum *csum;
init_hash();
ret = parse_opts(argc, argv, &fname);
if (ret) {
fprintf(stderr, "Usage: %s [-b buflen] filename\n", argv[0]);
fprintf(stderr, "Usage: %s [-b buflen] [--hash=hash_type] filename\n", argv[0]);
return 1;
}
ret = init_csum_module(user_hash);
if (ret)
return ret;
buf = malloc(buf_len);
if (buf == NULL)
return ENOMEM;
......
......@@ -24,25 +24,15 @@
#include "debug.h"
#include "xxhash.h"
#define HASH_TYPE "XXHASH "
char hash_type[8];
uint32_t digest_len = DIGEST_LEN_MAX;
#define HASH_TYPE_XXHASH "XXHASH "
int init_hash(void)
static int xxhash_init_hash(unsigned int *ret_digest_len)
{
strncpy(hash_type, HASH_TYPE, 8);
*ret_digest_len = DIGEST_LEN_MAX;
return 0;
}
void debug_print_digest(FILE *stream, unsigned char *digest)
{
uint32_t i;
for (i = 0; i < DIGEST_LEN_MAX; i++)
fprintf(stream, "%.2x", digest[i]);
}
void checksum_block(char *buf, int len, unsigned char *digest) {
static void xxhash_checksum_block(char *buf, int len, unsigned char *digest) {
unsigned long long *hash = (unsigned long long*)digest;
/*
* For xxhash one use only first 64 bit from 256 bit hash field
......@@ -52,26 +42,46 @@ void checksum_block(char *buf, int len, unsigned char *digest) {
*hash = XXH64(buf, len, 0);
}
struct running_checksum {
struct xxhash_running_checksum {
XXH64_state_t td64;
};
DECLARE_RUNNING_CSUM_CAST_FUNCS(xxhash_running_checksum);
struct running_checksum *start_running_checksum(void)
static struct running_checksum *xxhash_start_running_checksum(void)
{
struct running_checksum *c = calloc(1, sizeof(struct running_checksum));
struct xxhash_running_checksum *c =
calloc(1, sizeof(struct xxhash_running_checksum));
XXH64_reset(&c->td64, 0);
return c;
return priv_to_rc(c);
}
void add_to_running_checksum(struct running_checksum *c, unsigned int len, unsigned char *buf)
static void xxhash_add_to_running_checksum(struct running_checksum *_c,
unsigned int len, unsigned char *buf)
{
struct xxhash_running_checksum *c = rc_to_priv(_c);
XXH64_update(&c->td64, buf, len);
}
void finish_running_checksum(struct running_checksum *c, unsigned char *digest)
static void xxhash_finish_running_checksum(struct running_checksum *_c,
unsigned char *digest)
{
struct xxhash_running_checksum *c = rc_to_priv(_c);
unsigned long long *hash = (unsigned long long*)digest;
*hash = XXH64_digest(&c->td64);
free(c);
}
/*
 * Operation table for the xxhash module: wires each generic checksum
 * operation to the xxhash_* implementation defined in this file.
 */
struct csum_module_ops ops_xxhash = {
.init = xxhash_init_hash,
.checksum_block = xxhash_checksum_block,
.start_running_checksum = xxhash_start_running_checksum,
.add_to_running_checksum = xxhash_add_to_running_checksum,
.finish_running_checksum = xxhash_finish_running_checksum,
};
/*
 * Module descriptor for xxhash. 'name' is the string compared against the
 * user's hash choice, 'hash_type' is the internal identifier for this hash,
 * and 'ops' points at the operation table above.
 */
struct csum_module csum_module_xxhash = {
.name = "xxhash",
.hash_type = HASH_TYPE_XXHASH,
.ops = &ops_xxhash,
};
/*
* csum.c
*
* Copyright (C) 2014 SUSE. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License version 2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*/
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
#include <gcrypt.h>
#include <errno.h>
#include <pthread.h>
#include "csum.h"
#include "debug.h"
/* Digest size in bytes of the active hash; filled in by init_csum_module(). */
unsigned int digest_len = 0;
/* Internal identifier of the active hash; copied in by init_csum_module(). */
char hash_type[8];
/*
 * Registry of every available checksum module. The list is NULL-terminated
 * and is searched in order by init_csum_module().
 */
struct csum_module *modules[] = {
&csum_module_murmur3,
&csum_module_sha256,
&csum_module_xxhash,
NULL,
};
/* The module currently in use; NULL until init_csum_module() selects one. */
struct csum_module *csum_mod = NULL;
/*
 * Select and initialize the checksum module whose name matches 'type'.
 *
 * The comparison against each registered module's 'name' is
 * case-insensitive. On a match, csum_mod is pointed at the module, its
 * identifier is copied into hash_type, and the module's init operation is
 * asked to report its digest length into digest_len. A warning is printed
 * to stderr for every module other than SHA256.
 *
 * Returns 0 on success, EINVAL when 'type' is NULL or names no registered
 * module, or the non-zero value returned by the module's init operation.
 */
int init_csum_module(const char *type)
{
	int ret;
	int i;
	struct csum_module *m = NULL;

	/* strcasecmp() on a NULL pointer is undefined; report "no such hash" */
	if (!type)
		return EINVAL;

	/* modules[] is NULL-terminated */
	for (i = 0; modules[i]; i++) {
		if (strcasecmp(type, modules[i]->name) == 0) {
			m = modules[i];
			break;
		}
	}
	if (!m)
		return EINVAL;

	csum_mod = m;
	/*
	 * hash_type is a fixed-width field rather than a C string: strncpy()
	 * zero-pads short identifiers and writes no terminator when the
	 * identifier fills the whole array.
	 */
	strncpy(hash_type, csum_mod->hash_type, sizeof(hash_type));

	ret = csum_mod->ops->init(&digest_len);
	if (ret)
		return ret;

	abort_on(digest_len == 0 || digest_len > DIGEST_LEN_MAX);

	if (csum_mod != &csum_module_sha256)
		fprintf(stderr,
			"Warning: %s support is experimental!\n",
			csum_mod->name);

	return 0;
}
/*
 * Print the first digest_len bytes of 'digest' to 'stream' as lowercase
 * hex, two digits per byte, with no separators and no trailing newline.
 */
void debug_print_digest(FILE *stream, unsigned char *digest)
{
	unsigned int remaining = digest_len;

	while (remaining--)
		fprintf(stream, "%.2x", *digest++);
}
void checksum_block(char *buf, int len, unsigned char *digest)
{
csum_mod->ops->checksum_block(buf, len, digest);
}
struct running_checksum *start_running_checksum(void)
{
return csum_mod->ops->start_running_checksum();
}
void add_to_running_checksum(struct running_checksum *c,
unsigned int len, unsigned char *buf)
{
csum_mod->ops->add_to_running_checksum(c, len, buf);
}
void finish_running_checksum(struct running_checksum *c, unsigned char *digest)
{
csum_mod->ops->finish_running_checksum(c, digest);
}
......@@ -4,12 +4,13 @@
#include <stdio.h>
#define DIGEST_LEN_MAX 32
#define DEFAULT_HASH_STR "sha256"
extern unsigned int digest_len;
extern char hash_type[8];
/* Init / debug */
int init_hash(void);
int init_csum_module(const char *type);
void debug_print_digest(FILE *stream, unsigned char *digest);
/* Checksums a single block in one go. */
......@@ -22,4 +23,49 @@ void add_to_running_checksum(struct running_checksum *c,
unsigned int len, unsigned char *buf);
void finish_running_checksum(struct running_checksum *c, unsigned char *digest);
#endif /* __CSUM_H__ */
/* csum-module implementation details */
/*
 * Function table every hash module provides. The generic entry points
 * (checksum_block(), start_running_checksum(), ...) forward to the table
 * of the module currently in use.
 */
struct csum_module_ops {
/*
 * Prepare the module and store its digest length through ret_digest_len.
 * A non-zero return is treated as failure by init_csum_module().
 */
int (*init)(unsigned int *ret_digest_len);
/* Checksum 'len' bytes of 'buf' in one go, writing the result to 'digest'. */
void (*checksum_block)(char *buf, int len, unsigned char *digest);
/* Create the state for an incremental checksum. */
struct running_checksum *(*start_running_checksum)(void);
/* Feed 'len' more bytes from 'buf' into the incremental state 'c'. */
void (*add_to_running_checksum)(struct running_checksum *c,
unsigned int len, unsigned char *buf);
/* Write the final value of the incremental checksum 'c' to 'digest'. */
void (*finish_running_checksum)(struct running_checksum *c,
unsigned char *digest);
};
/*
 * Describes one selectable hash implementation: its user-visible name, its
 * internal identifier, and the table of functions that implement it.
 */
struct csum_module {
/*
 * Friendly name, suitable for printing to the user. We use
 * this also for option parsing.
 */
const char *name;
/*
 * Internally identifies this hash, is also what we write in
 * hashfiles. Must not exceed 8 characters.
 */
const char *hash_type;
/* Operations implementing this hash. */
struct csum_module_ops *ops;
};
/* Module descriptors, one per hash implementation file. */
extern struct csum_module csum_module_sha256;
extern struct csum_module csum_module_xxhash;
extern struct csum_module csum_module_murmur3;
extern struct csum_module *csum_mod; /* The module currently in use */
/*
 * Generates two static inline helpers for a hash module whose private
 * running-checksum state is 'struct _type':
 *   rc_to_priv() - cast a generic struct running_checksum pointer to the
 *                  module's private state type
 *   priv_to_rc() - cast the module's private state pointer back to the
 *                  generic type
 * Both are plain pointer casts; no checking or conversion is performed.
 * Each helper name is fixed, so the macro can be used once per source file.
 */
#define DECLARE_RUNNING_CSUM_CAST_FUNCS(_type) \
static inline struct _type * \
rc_to_priv(struct running_checksum *rc) \
{ \
return (struct _type *)rc; \
} \
static inline struct running_checksum * \
priv_to_rc(struct _type *priv) \
{ \
return (struct running_checksum *)priv; \
}
#endif /* csum.h */
......@@ -70,6 +70,8 @@ int do_lookup_extents = 1;
int fancy_status = 0;
static char *user_hash = DEFAULT_HASH_STR;
static void usage(const char *prog)
{
printf("duperemove %s\n", VERSTRING);
......@@ -114,6 +116,7 @@ enum {
HASH_THREADS_OPTION,
LOOKUP_EXTENTS_OPTION,
ONE_FILESYSTEM_OPTION,
HASH_OPTION,
};
/*
......@@ -131,6 +134,7 @@ static int parse_options(int argc, char **argv)
{ "hash-threads", 1, 0, HASH_THREADS_OPTION },
{ "lookup-extents", 1, 0, LOOKUP_EXTENTS_OPTION },
{ "one-file-system", 0, 0, ONE_FILESYSTEM_OPTION },
{ "hash", 1, 0, HASH_OPTION },
{ 0, 0, 0, 0}
};
......@@ -188,6 +192,9 @@ static int parse_options(int argc, char **argv)
case 'x':
one_file_system = 1;
break;
case HASH_OPTION:
user_hash = optarg;
break;
case HELP_OPTION:
case '?':
default:
......@@ -248,9 +255,6 @@ int main(int argc, char **argv)
struct results_tree res;
struct filerec *file;
if (init_hash())
return ENOMEM;
init_filerec();
init_hash_tree(&tree);
init_results_tree(&res);
......@@ -260,12 +264,14 @@ int main(int argc, char **argv)
return EINVAL;
}
#ifdef USE_XXHASH
printf("Warning: xxhash support is experimental and might change!\n");
#endif
#ifdef USE_MURMUR3
printf("Warning: murmur3 support is experimental and might change!\n");
#endif
ret = init_csum_module(user_hash);
if (ret) {
if (ret == EINVAL)
fprintf(stderr,
"Could not initialize hash module \"%s\"\n",
user_hash);
return ret;
}
if (isatty(STDOUT_FILENO))
fancy_status = 1;
......@@ -301,7 +307,7 @@ int main(int argc, char **argv)
}
printf("Using %uK blocks\n", blocksize/1024);
printf("Using hash: %.*s\n", 8, hash_type);
printf("Using hash: %s\n", csum_mod->name);
if (!read_hashes) {
ret = populate_hash_tree(&tree);
......
......@@ -199,6 +199,7 @@ static void usage(const char *prog)
enum {
HELP_OPTION = CHAR_MAX + 1,
VERSION_OPTION,
HASH_OPTION,
};
static int parse_options(int argc, char **argv)
......@@ -253,9 +254,6 @@ int main(int argc, char **argv)
struct hash_tree tree;
struct hash_file_header h;
if (init_hash())
return ENOMEM;
init_filerec();
init_hash_tree(&tree);
......@@ -264,6 +262,9 @@ int main(int argc, char **argv)
return EINVAL;
}
if (init_csum_module(DEFAULT_HASH_STR))
return ENOMEM;
ret = read_hash_tree(serialize_fname, &tree, &blocksize, &h, 0);
if (ret == FILE_VERSION_ERROR) {
fprintf(stderr,
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment