find_dupes.c 12.7 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
/*
 * find_dupes.c
 *
 * Implementation of duplicate extent search
 *
 * Copyright (C) 2014 SUSE.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License version 2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * Authors: Mark Fasheh <mfasheh@suse.de>
 */

#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include <string.h>
#include <stdint.h>

26
#include <inttypes.h>
27 28 29 30 31 32 33 34 35 36 37 38 39
#include <glib.h>

#include "csum.h"
#include "rbtree.h"
#include "list.h"
#include "filerec.h"
#include "hash-tree.h"
#include "results-tree.h"
#include "memstats.h"
#include "debug.h"

#include "find_dupes.h"

40
extern int block_dedupe;
Mark Fasheh's avatar
Mark Fasheh committed
41
extern int dedupe_same_file;
42
extern unsigned int cpu_threads;
43

44
/*
45 46
 * Extent search status globals. This could be an atomic but GCond
 * requires a mutex so we might as well use it...
47 48 49 50 51 52
 */
static unsigned long long	search_total;
static unsigned long long	search_processed;
static GMutex			progress_mutex;
static GCond			progress_updated;

53 54
/* XXX: This is allowed to be called *after* update_extent_search_status() */
static void set_extent_search_status_count(unsigned long long nr_items)
55 56 57 58 59 60 61 62 63 64 65
{
	search_total = nr_items;
}

static void print_extent_search_status(unsigned long long processed)
{
	static int last_pos = -1;
	int i, pos;
	int width = 40;
	float progress;

Mark Fasheh's avatar
Mark Fasheh committed
66
	if (!stdout_is_tty || verbose || debug || quiet)
67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96
		return;

	progress = (float) processed / search_total;
	pos = width * progress;

	/* Only update our status every width% */
	if (pos <= last_pos)
		return;
	last_pos = pos;

	printf("\r[");
	for(i = 0; i < width; i++) {
		if (i < pos)
			printf("#");
		else if (i == pos)
			printf("%%");
		else
			printf(" ");
	}
	printf("]");
	fflush(stdout);
}

/*
 * 'err' is impossible in the current code when this is called, but we
 * can keep the handling here in case that changes.
 */
static void clear_extent_search_status(unsigned long long processed,
				       int err)
{
Mark Fasheh's avatar
Mark Fasheh committed
97
	if (!stdout_is_tty || verbose || debug || quiet)
98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144
		return;

	if (err)
		printf("\nSearch exited (%llu processed) with error %d: "
		       "\"%s\"\n", processed, err, strerror(err));
	else
		printf("\nSearch completed with no errors.             \n");
	fflush(stdout);
}

static void update_extent_search_status(unsigned long long processed)
{
	g_mutex_lock(&progress_mutex);
	search_processed += processed;
	g_cond_signal(&progress_updated);
	g_mutex_unlock(&progress_mutex);
}

static void wait_update_extent_search_status(GThreadPool *pool)
{
	uint64_t end_time = g_get_monotonic_time () + 1 * G_TIME_SPAN_SECOND;
	unsigned long long tmp, last = 0;

	if (!stdout_is_tty || verbose || debug)
		return;

	/* Get the bar started */
	print_extent_search_status(0);

	g_mutex_lock(&progress_mutex);
	while (search_processed < search_total) {
		g_cond_wait_until(&progress_updated, &progress_mutex, end_time);
		tmp = search_processed;
		g_mutex_unlock(&progress_mutex);

		if (tmp != last)
			print_extent_search_status(tmp);
		last = tmp;

		g_mutex_lock(&progress_mutex);
	}
	g_mutex_unlock(&progress_mutex);

	print_extent_search_status(search_processed);
	clear_extent_search_status(search_processed, 0);
}

145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162
static void record_match(struct results_tree *res, unsigned char *digest,
			 struct filerec *orig, struct filerec *walk,
			 struct file_block **start, struct file_block **end)
{
	int ret;
	uint64_t soff[2], eoff[2];
	struct filerec *recs[2];
	uint64_t len;

	abort_on(start[0]->b_file != orig);
	abort_on(start[1]->b_file != walk);

	recs[0] = start[0]->b_file;
	recs[1] = start[1]->b_file;

	soff[0] = start[0]->b_loff;
	soff[1] = start[1]->b_loff;

163 164
	eoff[0] = block_len(end[0]) + end[0]->b_loff - 1;
	eoff[1] = block_len(end[1]) + end[1]->b_loff - 1;
165

166
	len = eoff[0] - soff[0] + 1;
167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186

	ret = insert_result(res, digest, recs, soff, eoff);
	if (ret) {
		abort_on(ret != ENOMEM); /* Only error possible here. */
		fprintf(stderr, "Out of memory while processing results\n");
		print_mem_stats();
		exit(ENOMEM);
	}

	dprintf("Duplicated extent of %llu blocks in files:\n%s\t\t%s\n",
		(unsigned long long)len / blocksize, orig->filename,
		walk->filename);

	dprintf("%llu-%llu\t\t%llu-%llu\n",
		(unsigned long long)soff[0] / blocksize,
		(unsigned long long)eoff[0] / blocksize,
		(unsigned long long)soff[1] / blocksize,
		(unsigned long long)eoff[1] / blocksize);
}

Mark Fasheh's avatar
Mark Fasheh committed
187 188
static inline void mark_block_seen(uint64_t *off, struct file_block *block)
{
189 190 191 192 193
	/*
	 * Add 1 so the check in block_seen triggers on block->b_loff. This
	 * also allows it to catch the case where *off is initialized to 0 and
	 * b_loff == 0 but we haven't actually seen the block yet.
	 */
Mark Fasheh's avatar
Mark Fasheh committed
194 195 196 197 198 199 200 201 202 203 204
	if ((*off) < block->b_loff)
		*off = block->b_loff + 1;
}

static inline int block_seen(uint64_t off, struct file_block *block)
{
	if (off > block->b_loff)
		return 1;
	return 0;
}

205 206
static int walk_dupe_block(struct filerec *orig_file,
			   struct file_block *orig_file_block,
Mark Fasheh's avatar
Mark Fasheh committed
207
			   uint64_t *orig_best_off,
208 209
			   struct filerec *walk_file,
			   struct file_block *walk_file_block,
Mark Fasheh's avatar
Mark Fasheh committed
210
			   uint64_t *walk_best_off,
211 212 213 214 215
			   struct results_tree *res)
{
	struct file_block *orig = orig_file_block;
	struct file_block *block = walk_file_block;
	struct file_block *start[2] = { orig, block };
216
	struct file_block *end[2] = { NULL, NULL };
217 218
	struct running_checksum *csum;
	unsigned char match_id[DIGEST_LEN_MAX] = {0, };
219
	uint64_t orig_blkno, walk_blkno;
220
	struct rb_node *node;
221

Mark Fasheh's avatar
Mark Fasheh committed
222 223
	if (block_seen(*walk_best_off, block) ||
	    block_seen(*orig_best_off, orig))
224 225 226 227 228 229 230
		goto out;

	csum = start_running_checksum();

	abort_on(block->b_parent != orig->b_parent);

	while (block->b_parent == orig->b_parent) {
Mark Fasheh's avatar
Mark Fasheh committed
231 232
		mark_block_seen(walk_best_off, block);
		mark_block_seen(orig_best_off, orig);
233 234 235 236 237 238 239 240

		end[0] = orig;
		end[1] = block;

		add_to_running_checksum(csum, digest_len,
					block->b_parent->dl_hash);

		/*
241
		 * Check that we don't walk off either tree
242
		 */
243 244
		if (rb_next(&orig->b_file_next) == NULL ||
		    rb_next(&block->b_file_next) == NULL)
245 246
			break;

247 248 249
		orig_blkno = orig->b_loff;
		walk_blkno = block->b_loff;

250 251 252 253
		node = rb_next(&orig->b_file_next);
		orig = rb_entry(node, struct file_block, b_file_next);
		node = rb_next(&block->b_file_next);
		block = rb_entry(node, struct file_block, b_file_next);
254 255 256 257 258 259 260 261 262

		/*
		 * Check that our next blocks are contiguous wrt the
		 * old ones. If they aren't, then this has to be the
		 * end of our extents.
		 */
		if (orig->b_loff != (orig_blkno + blocksize) ||
		    block->b_loff != (walk_blkno + blocksize))
			break;
263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278
	}

	finish_running_checksum(csum, match_id);

	record_match(res, match_id, orig_file, walk_file,
		     start, end);
out:
	return 0;
}

/*
 * Start an extent search (with orig_block) at each block in our dups
 * list which is owned by walk_file.
 */
static void lookup_walk_file_hash_head(struct file_block *orig_block,
				       struct filerec *walk_file,
Mark Fasheh's avatar
Mark Fasheh committed
279 280
				       struct results_tree *res,
				       uint64_t *file_off, uint64_t *walk_off)
281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296
{
	struct dupe_blocks_list *parent = orig_block->b_parent;
	struct file_block *cur;
	struct file_hash_head *head = find_file_hash_head(parent, walk_file);

	/* find_file_dups should have checked this for us already */
	abort_on(head == NULL);

	list_for_each_entry(cur, &head->h_blocks, b_head_list) {
		/* Ignore self. Technically this shouldn't happen (see above)
		 * until we allow walking a file against itself. */
		if (cur == orig_block)
			continue;

		abort_on(cur->b_file != walk_file);

Mark Fasheh's avatar
Mark Fasheh committed
297 298
		if (walk_dupe_block(orig_block->b_file, orig_block, file_off,
				    walk_file, cur, walk_off, res))
299 300 301 302 303 304 305 306
			break;
	}
}

static void find_file_dupes(struct filerec *file, struct filerec *walk_file,
			    struct results_tree *res)
{
	struct file_block *cur;
307
	struct rb_node *node;
Mark Fasheh's avatar
Mark Fasheh committed
308 309
	uint64_t file_off = 0;
	uint64_t walk_off = 0;
310

311 312 313
	vprintf("Compare files \"%s\" and "
		"\"%s\"\n", file->filename, walk_file->filename);

314 315
	for (node = rb_first(&file->block_tree); node; node = rb_next(node)) {
		cur = rb_entry(node, struct file_block, b_file_next);
316

Mark Fasheh's avatar
Mark Fasheh committed
317
		if (block_seen(file_off, cur))
318 319 320 321 322 323 324 325 326 327
			continue;

		if (!file_in_dups_list(cur->b_parent, walk_file))
			continue;

		/*
		 * For each file block with the same hash:
		 *  - Traverse, along with original file until we have no match
		 *     - record
		 */
Mark Fasheh's avatar
Mark Fasheh committed
328 329
		lookup_walk_file_hash_head(cur, walk_file, res, &file_off,
					   &walk_off);
330 331 332
	}
}

333 334 335 336 337
struct find_dupes_cmp {
	struct filerec *file1;
	struct filerec *file2;
};
declare_alloc_tracking(find_dupes_cmp);
338

339 340
static int find_dupes_worker(struct find_dupes_cmp *cmp,
			     struct results_tree *res)
341
{
342
	struct filerec *file1 = cmp->file1;
343
	struct filerec *file2 = cmp->file2;
Mark Fasheh's avatar
Mark Fasheh committed
344

345
	free_find_dupes_cmp(cmp);
346

347 348
	find_file_dupes(file1, file2, res);
	update_extent_search_status(1);
Mark Fasheh's avatar
Mark Fasheh committed
349

350
	return mark_filerecs_compared(file1, file2);
351 352
}

353 354
static int push_compares(GThreadPool *pool, struct dupe_blocks_list *dups,
			 unsigned long long *pushed)
355 356
{
	int ret;
357 358 359
	struct filerec *file1, *file2, *tmp1, *tmp2;
	LIST_HEAD(cmp_files);
	unsigned int cmp_tot = 0;
360 361
	struct rb_node *node;
	struct file_hash_head *fh;
362
	GError *err = NULL;
363

364 365
	dprintf("Gather files from hash: ");
	if (debug)
366
		debug_print_digest_short(stdout, dups->dl_hash);
367
	dprintf(" (%llu identical extents)\n", dups->dl_num_elem);
368

369 370 371
	for (node = rb_first(&dups->dl_files_root); node; node = rb_next(node)) {
		fh = rb_entry(node, struct file_hash_head, h_node);
		file1 = fh->h_file;
372

373 374 375
		abort_on(!list_empty(&file1->tmp_list));
		list_add_tail(&file1->tmp_list, &cmp_files);
		cmp_tot++;
376
	}
377

378
	vprintf("Process %u files.\n", cmp_tot);
379

380 381 382 383
	list_for_each_entry_safe(file1, tmp1, &cmp_files, tmp_list) {
		file2 = file1;/* start from file1 for list iter */
		list_for_each_entry_safe_continue(file2, tmp2, &cmp_files,
						  tmp_list) {
384
			if (filerec_deduped(file1) && filerec_deduped(file2))
385 386
				continue;

387
			if (filerecs_compared(file1, file2))
388 389
				continue;

390
			if (dedupe_same_file || file1 != file2) {
391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410
				/* fire this off to a worker */
				struct find_dupes_cmp *cmp;

				cmp = malloc_find_dupes_cmp();
				if (!cmp)
					return ENOMEM;

				cmp->file1 = file1;
				cmp->file2 = file2;

				g_thread_pool_push(pool, cmp, &err);
				if (err) {
					free_find_dupes_cmp(cmp);

					fprintf(stderr,
						"Error from thread pool: %s\n ",
						err->message);
					g_error_free(err);
					return ENOMEM;
				}
411
				(*pushed)++;
412 413
			}
		}
414 415 416 417 418 419 420 421 422 423 424
		/*
		 * Throttle how many compares we're allocating and
		 * queuing, otherwise we run the risk of running into
		 * an ENOMEM. We can abuse the progress conditional
		 * wait queue for this as each worker thread will
		 * activate it on the way out.
		 */
		g_mutex_lock(&progress_mutex);
		while (g_thread_pool_unprocessed(pool) >= 4096)
			g_cond_wait(&progress_updated, &progress_mutex);
		g_mutex_unlock(&progress_mutex);
425 426 427

		cmp_tot--;
		list_del_init(&file1->tmp_list);
428
	}
429 430

	ret = 0;
431

432 433 434 435
	list_for_each_entry_safe(file1, tmp1, &cmp_files, tmp_list)
		list_del_init(&file1->tmp_list);

	return ret;
436 437
}

438 439
static int find_all_dupes_filewise(struct hash_tree *tree,
				   struct results_tree *res)
440 441 442 443 444
{
	int ret = 0;
	struct rb_root *root = &tree->root;
	struct rb_node *node = rb_first(root);
	struct dupe_blocks_list *dups;
445 446
	GError *err = NULL;
	GThreadPool *pool = NULL;
447
	unsigned long long pushed = 0;
448

Mark Fasheh's avatar
Mark Fasheh committed
449 450
	qprintf("Hashing completed. Using %u threads to calculate duplicate "
		"extents. This may take some time.\n", cpu_threads);
451

452 453 454 455 456 457 458 459 460 461
	pool = g_thread_pool_new((GFunc) find_dupes_worker, res,
				 cpu_threads, TRUE, &err);
	if (err) {
		fprintf(stderr,
			"Unable to create find file dupes thread pool: %s\n",
			err->message);
		g_error_free(err);
		return ENOMEM;
	}

462 463 464 465 466 467 468
	while (1) {
		if (node == NULL)
			break;

		dups = rb_entry(node, struct dupe_blocks_list, dl_node);

		if (dups->dl_num_elem > 1) {
469
			ret = push_compares(pool, dups, &pushed);
470 471 472 473
			if (ret) {
				fprintf(stderr,
					"Error: %s while comparing files",
					strerror(ret));
474
				goto out;
475
			}
476 477 478 479 480
		}

		node = rb_next(node);
	}

481 482 483
	set_extent_search_status_count(pushed);
	wait_update_extent_search_status(pool);

484
out:
485
	g_thread_pool_free(pool, FALSE, TRUE);
486 487 488 489 490
	/*
	 * Save memory by freeing each filerec compared tree once all
	 * threads have finished.
	 */
	free_all_filerec_compared();
491 492
	return ret;
}
493 494 495 496 497 498 499 500 501 502

int find_all_dupes(struct hash_tree *tree, struct results_tree *res)
{
	int ret;
	struct filerec *file;

	if (block_dedupe) {
		sort_hashes_by_size(tree);
		return 0;
	} else {
503
		unsigned long long orig_extent_count = res->num_extents;
504 505 506 507 508 509

		ret = find_all_dupes_filewise(tree, res);

		vprintf("Removing overlapping extents\n");
		list_for_each_entry(file, &filerec_list, rec_list)
			remove_overlapping_extents(res, file);
510 511 512
		dprintf("Removed %llu extents (had %llu).\n",
			orig_extent_count - res->num_extents,
			orig_extent_count);
513 514 515 516

		return ret;
	}
}