zbox.h 2.68 KB
Newer Older
1
/*
2
 * Copyright 2011, Ben Langmead <langmea@cs.jhu.edu>
3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22
 *
 * This file is part of Bowtie 2.
 *
 * Bowtie 2 is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Bowtie 2 is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
 */

#ifndef ZBOX_H_
#define ZBOX_H_

23 24
#include "btypes.h"

25 26 27 28 29 30 31 32
/**
 * Fill z with Z-box information for s.  String z will not be resized
 * and will only be filled up to its size cap.  This is the linear-time
 * algorithm from Gusfield.  An optional sanity-check uses a naive
 * algorithm to double-check results.
 */
template<typename T>
void calcZ(const T& s,
33 34
           TIndexOffU off,
           EList<TIndexOffU>& z,
35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50
           bool verbose = false,
           bool sanityCheck = false)
{
	size_t lCur = 0, rCur = 0;
	size_t zlen = z.size();
	size_t slen = s.length();
	assert_gt(zlen, 0);
	assert_eq(z[0], 0);
	//assert_leq(zlen, slen);
	for (size_t k = 1; k < zlen && k+off < slen; k++) {
		assert_lt(lCur, k);
		assert(z[lCur] == 0 || z[lCur] == rCur - lCur + 1);
		if(k > rCur) {
			// compare starting at k with prefix starting at 0
			size_t ki = k;
			while(off+ki < s.length() && s[off+ki] == s[off+ki-k]) ki++;
51
			z[k] = (TIndexOffU)(ki - k);
52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68
			assert_lt(off+z[k], slen);
			if(z[k] > 0) {
				lCur = k;
				rCur = k + z[k] - 1;
			}
		} else {
			// position k is contained in a Z-box
			size_t betaLen = rCur - k + 1;
			size_t kPrime = k - lCur;
			assert_eq(s[off+k], s[off+kPrime]);
			if(z[kPrime] < betaLen) {
				z[k] = z[kPrime];
				assert_lt(off+z[k], slen);
				// lCur, rCur unchanged
			} else if (z[kPrime] > 0) {
				int q = 0;
				while (off+q+rCur+1 < s.length() && s[off+q+rCur+1] == s[off+betaLen+q]) q++;
69
				z[k] = (TIndexOffU)(betaLen + q);
70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97
				assert_lt(off+z[k], slen);
				rCur = rCur + q;
				assert_geq(k, lCur);
				lCur = k;
			} else {
				z[k] = 0;
				assert_lt(off+z[k], slen);
				// lCur, rCur unchanged
			}
		}
	}
#ifndef NDEBUG
	if(sanityCheck) {
		// Recalculate Z-boxes using naive quadratic-time algorithm and
		// compare to linear-time result
		assert_eq(0, z[0]);
		for(size_t i = 1; i < z.size(); i++) {
			size_t j;
			for(j = i; off+j < s.length(); j++) {
				if(s[off+j] != s[off+j-i]) break;
			}
			assert_eq(j-i, z[i]);
		}
	}
#endif
}

#endif /*ZBOX_H_*/