db.c 38.3 KB
Newer Older
Richard Curnow's avatar
Richard Curnow committed
1 2 3 4
/*
  mairix - message index builder and finder for maildir folders.

 **********************************************************************
5
 * Copyright (C) Richard P. Curnow  2002,2003,2004,2005,2006,2007,2009
Richard P. Curnow's avatar
Richard P. Curnow committed
6
 *
Richard Curnow's avatar
Richard Curnow committed
7 8 9
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
Richard P. Curnow's avatar
Richard P. Curnow committed
10
 *
Richard Curnow's avatar
Richard Curnow committed
11 12 13 14
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
Richard P. Curnow's avatar
Richard P. Curnow committed
15
 *
Richard Curnow's avatar
Richard Curnow committed
16 17
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
18
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
Richard P. Curnow's avatar
Richard P. Curnow committed
19
 *
Richard Curnow's avatar
Richard Curnow committed
20 21 22 23 24 25 26 27 28
 **********************************************************************
 */

/* Handle complete database */

#include "mairix.h"
#include "reader.h"
#include <ctype.h>
#include <assert.h>
29 30
#include <sys/time.h>
#include <unistd.h>
31
#include "imapinterface.h"
Richard Curnow's avatar
Richard Curnow committed
32

33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52
struct sortable_token {/*{{{*/
  char *text;
  int index;
};
/*}}}*/
static int compare_sortable_tokens(const void *a, const void *b)/*{{{*/
{
  const struct sortable_token *aa = (const struct sortable_token *) a;
  const struct sortable_token *bb = (const struct sortable_token *) b;
  int foo;
  foo = strcmp(aa->text, bb->text);
  if (foo) {
    return foo;
  } else {
    if (aa->index < bb->index) return -1;
    else if (aa->index > bb->index) return +1;
    else return 0;
  }
}
/*}}}*/
53
static void check_toktable_enc_integrity(int n_msgs, struct toktable *table)/*{{{*/
Richard Curnow's avatar
Richard Curnow committed
54 55 56 57 58
{
  /* FIXME : Check reachability of tokens that are displaced from their natural
   * hash bucket (if deletions have occurred during purge). */

  int idx, incr;
59
  int i, k;
Richard Curnow's avatar
Richard Curnow committed
60 61
  unsigned char *j, *last_char;
  int broken_chains = 0;
62 63
  struct sortable_token *sort_list;
  int any_duplicates;
Richard P. Curnow's avatar
Richard P. Curnow committed
64

Richard Curnow's avatar
Richard Curnow committed
65 66 67 68 69
  for (i=0; i<table->size; i++) {
    struct token *tok = table->tokens[i];
    if (tok) {
      idx = 0;
      incr = 0;
70 71
      last_char = tok->match0.msginfo + tok->match0.n;
      for (j = tok->match0.msginfo; j < last_char; ) {
Richard Curnow's avatar
Richard Curnow committed
72 73 74
        incr = read_increment(&j);
        idx += incr;
      }
75 76
      if (idx != tok->match0.highest) {
        fprintf(stderr, "broken encoding chain for token <%s>, highest=%ld\n", tok->text, tok->match0.highest);
Richard Curnow's avatar
Richard Curnow committed
77 78 79
        fflush(stderr);
        broken_chains = 1;
      }
80 81
      if (idx >= n_msgs) {
        fprintf(stderr, "end of chain higher than number of message paths (%d) for token <%s>\n", n_msgs, tok->text);
Richard Curnow's avatar
Richard Curnow committed
82 83 84 85 86 87 88 89
        fflush(stderr);
        broken_chains = 1;
      }
    }
  }

  assert(!broken_chains);

90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107
  /* Check there are no duplicated tokens in the table. */
  sort_list = new_array(struct sortable_token, table->n);
  k = 0;
  for (i=0; i<table->size; i++) {
    struct token *tok = table->tokens[i];
    if (tok) {
      sort_list[k].text = new_string(tok->text);
      sort_list[k].index = i;
      k++;
    }
  }
  assert(k == table->n);

  qsort(sort_list, table->n, sizeof(struct sortable_token), compare_sortable_tokens);
  /* Check for uniqueness of neighbouring token texts */
  any_duplicates = 0;
  for (i=0; i<(table->n - 1); i++) {
    if (!strcmp(sort_list[i].text, sort_list[i+1].text)) {
Richard P. Curnow's avatar
Richard P. Curnow committed
108
      fprintf(stderr, "Token table contains duplicated token %s at indices %d and %d\n",
109 110 111 112 113 114 115 116 117 118 119 120 121 122 123
               sort_list[i].text, sort_list[i].index, sort_list[i+1].index);
      any_duplicates = 1;
    }
  }

  /* release */
  for (i=0; i<table->n; i++) {
    free(sort_list[i].text);
  }
  free(sort_list);

  if (any_duplicates) {
    fprintf(stderr, "Token table contained duplicate entries, aborting\n");
    assert(0);
  }
Richard Curnow's avatar
Richard Curnow committed
124 125 126 127 128 129 130 131 132 133 134
}
/*}}}*/
static int compare_strings(const void *a, const void *b)/*{{{*/
{
  const char **aa = (const char **) a;
  const char **bb = (const char **) b;
  return strcmp(*aa, *bb);
}
/*}}}*/
static void check_message_path_integrity(struct database *db)/*{{{*/
{
135
  /* TODO : for now only checks integrity of non-mbox paths. */
Richard Curnow's avatar
Richard Curnow committed
136 137
  /* Check there are no duplicates */
  int i;
138
  int n, imap_n;
Richard Curnow's avatar
Richard Curnow committed
139
  int has_duplicate = 0;
Richard P. Curnow's avatar
Richard P. Curnow committed
140

141
  char **paths, **imaps;
142
  paths = new_array(char *, db->n_msgs);
143 144
  imaps = new_array(char *, db->n_msgs);
  for (i=0, n=0, imap_n=0; i<db->n_msgs; i++) {
145 146 147 148 149 150 151
    switch (db->type[i]) {
      case MTY_DEAD:
      case MTY_MBOX:
        break;
      case MTY_FILE:
        paths[n++] = db->msgs[i].src.mpf.path;
        break;
152 153 154
      case MTY_IMAP:
        imaps[imap_n++] = db->msgs[i].src.mpf.path;
        break;
Richard Curnow's avatar
Richard Curnow committed
155 156 157 158
    }
  }

  qsort(paths, n, sizeof(char *), compare_strings);
159
  qsort(imaps, imap_n, sizeof(char *), compare_strings);
Richard Curnow's avatar
Richard Curnow committed
160 161 162 163 164 165 166

  for (i=1; i<n; i++) {
    if (!strcmp(paths[i-1], paths[i])) {
      fprintf(stderr, "Path <%s> repeated\n", paths[i]);
      has_duplicate = 1;
    }
  }
167 168 169 170 171 172
  for (i=1; i<imap_n; i++) {
    if (!strcmp(imaps[i-1], imaps[i])) {
      fprintf(stderr, "IMAP message <%s> repeated\n", imaps[i]);
      has_duplicate = 1;
    }
  }
Richard Curnow's avatar
Richard Curnow committed
173 174 175 176 177

  fflush(stderr);
  assert(!has_duplicate);

  free(paths);
178
  free(imaps);
Richard Curnow's avatar
Richard Curnow committed
179 180 181 182 183 184 185
  return;
}
/*}}}*/
void check_database_integrity(struct database *db)/*{{{*/
{
  if (verbose) fprintf(stderr, "Checking message path integrity\n");
  check_message_path_integrity(db);
Richard P. Curnow's avatar
Richard P. Curnow committed
186

Richard Curnow's avatar
Richard Curnow committed
187 188
  /* Just check encoding chains for now */
  if (verbose) fprintf(stderr, "Checking to\n");
189
  check_toktable_enc_integrity(db->n_msgs, db->to);
Richard Curnow's avatar
Richard Curnow committed
190
  if (verbose) fprintf(stderr, "Checking cc\n");
191
  check_toktable_enc_integrity(db->n_msgs, db->cc);
Richard Curnow's avatar
Richard Curnow committed
192
  if (verbose) fprintf(stderr, "Checking from\n");
193
  check_toktable_enc_integrity(db->n_msgs, db->from);
Richard Curnow's avatar
Richard Curnow committed
194
  if (verbose) fprintf(stderr, "Checking subject\n");
195
  check_toktable_enc_integrity(db->n_msgs, db->subject);
Richard Curnow's avatar
Richard Curnow committed
196
  if (verbose) fprintf(stderr, "Checking body\n");
197
  check_toktable_enc_integrity(db->n_msgs, db->body);
198 199
  if (verbose) fprintf(stderr, "Checking attachment_name\n");
  check_toktable_enc_integrity(db->n_msgs, db->attachment_name);
Richard Curnow's avatar
Richard Curnow committed
200 201
}
/*}}}*/
Richard P. Curnow's avatar
Richard P. Curnow committed
202
struct database *new_database(unsigned int hash_key)/*{{{*/
Richard Curnow's avatar
Richard Curnow committed
203 204
{
  struct database *result = new(struct database);
205 206
  struct timeval tv;
  pid_t  pid;
Richard Curnow's avatar
Richard Curnow committed
207 208 209 210 211 212

  result->to = new_toktable();
  result->cc = new_toktable();
  result->from = new_toktable();
  result->subject = new_toktable();
  result->body = new_toktable();
213
  result->attachment_name = new_toktable();
Richard Curnow's avatar
Richard Curnow committed
214

215 216
  result->msg_ids = new_toktable2();

Richard P. Curnow's avatar
Richard P. Curnow committed
217 218 219 220 221 222 223
  if ( hash_key == CREATE_RANDOM_DATABASE_HASH )
    {
      gettimeofday(&tv, NULL);
      pid = getpid();
      hash_key = tv.tv_sec ^ (pid ^ (tv.tv_usec << 15));
    }
  result->hash_key = hash_key;
224 225 226 227 228

  result->msgs = NULL;
  result->type = NULL;
  result->n_msgs = 0;
  result->max_msgs = 0;
Richard Curnow's avatar
Richard Curnow committed
229

230 231 232
  result->mboxen = NULL;
  result->n_mboxen = 0;
  result->max_mboxen = 0;
Richard Curnow's avatar
Richard Curnow committed
233 234 235 236 237 238 239 240 241 242 243 244 245

  return result;
}
/*}}}*/
void free_database(struct database *db)/*{{{*/
{
  int i;

  free_toktable(db->to);
  free_toktable(db->cc);
  free_toktable(db->from);
  free_toktable(db->subject);
  free_toktable(db->body);
246
  free_toktable(db->attachment_name);
247 248 249 250 251 252 253 254 255 256
  free_toktable2(db->msg_ids);

  if (db->msgs) {
    for (i=0; i<db->n_msgs; i++) {
      switch (db->type[i]) {
        case MTY_DEAD:
          break;
        case MTY_MBOX:
          break;
        case MTY_FILE:
257
        case MTY_IMAP:
258 259 260 261
          assert(db->msgs[i].src.mpf.path);
          free(db->msgs[i].src.mpf.path);
          break;
      }
Richard Curnow's avatar
Richard Curnow committed
262
    }
263 264
    free(db->msgs);
    free(db->type);
Richard Curnow's avatar
Richard Curnow committed
265 266 267 268 269 270 271 272 273 274
  }

  free(db);
}
/*}}}*/

static int get_max (int a, int b) {/*{{{*/
  return (a > b) ? a : b;
}
/*}}}*/
275
static void import_toktable(char *data, unsigned int hash_key, int n_msgs, struct toktable_db *in, struct toktable *out)/*{{{*/
Richard Curnow's avatar
Richard Curnow committed
276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311
{
  int n, size, i;

  n = in->n;
  size = 1;
  while (size < n) size <<= 1;
  size <<= 1; /* safe hash table size */

  out->size = size;
  out->mask = size - 1;
  out->n = n;
  out->tokens = new_array(struct token *, size);
  memset(out->tokens, 0, size * sizeof(struct token *));
  out->hwm = (n + size) >> 1;

  for (i=0; i<n; i++) {
    unsigned int hash, index;
    char *text;
    unsigned char *enc;
    int enc_len;
    struct token *nt;
    int enc_hi;
    int idx, incr;
    unsigned char *j;

    /* Recover enc_len and enc_hi from the data */
    enc = (unsigned char *) data + in->enc_offsets[i];
    idx = 0;
    for (j = enc; *j != 0xff; ) {
      incr = read_increment(&j);
      idx += incr;
    }
    enc_len = j - enc;
    enc_hi = idx;

    text = data + in->tok_offsets[i];
312
    hash = hashfn((unsigned char *) text, strlen(text), hash_key);
Richard Curnow's avatar
Richard Curnow committed
313 314 315 316 317

    nt = new(struct token);
    nt->hashval = hash;
    nt->text = new_string(text);
    /* Allow a bit of headroom for adding more entries later */
318 319 320 321
    nt->match0.max = get_max(16, enc_len + (enc_len >> 1));
    nt->match0.n = enc_len;
    nt->match0.highest = enc_hi;
    assert(nt->match0.highest < n_msgs);
322
    nt->match0.msginfo = new_array(unsigned char, nt->match0.max);
323 324 325 326
    memcpy(nt->match0.msginfo, enc, nt->match0.n);

    index = hash & out->mask;
    while (out->tokens[index]) {
327 328 329 330 331 332 333 334 335
      /* Audit to look for corrupt database with multiple entries for the same
       * string. */
      if (!strcmp(nt->text, out->tokens[index]->text)) {
        fprintf(stderr, "\n!!! Corrupt token table found in database, token <%s> duplicated, aborting\n",
            nt->text);
        fprintf(stderr, "  Delete the database file and rebuild from scratch as a workaround\n");
        /* No point going on - need to find out why the database got corrupted
         * in the 1st place.  Workaround for user - rebuild database from
         * scratch by deleting it then rerunning. */
336
        unlock_and_exit(1);
337
      }
338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393
      ++index;
      index &= out->mask;
    }

    out->tokens[index] = nt;
  }
}
/*}}}*/
static void import_toktable2(char *data, unsigned int hash_key, int n_msgs, struct toktable2_db *in, struct toktable2 *out)/*{{{*/
{
  int n, size, i;

  n = in->n;
  size = 1;
  while (size < n) size <<= 1;
  size <<= 1; /* safe hash table size */

  out->size = size;
  out->mask = size - 1;
  out->n = n;
  out->tokens = new_array(struct token2 *, size);
  memset(out->tokens, 0, size * sizeof(struct token *));
  out->hwm = (n + size) >> 1;

  for (i=0; i<n; i++) {
    unsigned int hash, index;
    char *text;
    struct token2 *nt;
    unsigned char *enc0, *enc1;
    int enc0_len, enc1_len;
    int enc0_hi, enc1_hi;
    int idx, incr;
    unsigned char *j;

/*{{{ do enc0*/
    enc0 = (unsigned char *) data + in->enc0_offsets[i];
    idx = 0;
    for (j = enc0; *j != 0xff; ) {
      incr = read_increment(&j);
      idx += incr;
    }
    enc0_len = j - enc0;
    enc0_hi = idx;
/*}}}*/
/*{{{ do enc1*/
    enc1 = (unsigned char *) data + in->enc1_offsets[i];
    idx = 0;
    for (j = enc1; *j != 0xff; ) {
      incr = read_increment(&j);
      idx += incr;
    }
    enc1_len = j - enc1;
    enc1_hi = idx;
/*}}}*/

    text = data + in->tok_offsets[i];
394
    hash = hashfn((unsigned char *) text, strlen(text), hash_key);
395 396 397 398 399 400 401 402 403 404

    nt = new(struct token2);
    nt->hashval = hash;
    nt->text = new_string(text);
    /* Allow a bit of headroom for adding more entries later */
    /*{{{ set up match0 chain */
    nt->match0.max = get_max(16, enc0_len + (enc0_len >> 1));
    nt->match0.n = enc0_len;
    nt->match0.highest = enc0_hi;
    assert(nt->match0.highest < n_msgs);
405
    nt->match0.msginfo = new_array(unsigned char, nt->match0.max);
406 407 408 409 410 411 412
    memcpy(nt->match0.msginfo, enc0, nt->match0.n);
    /*}}}*/
    /*{{{ set up match1 chain */
    nt->match1.max = get_max(16, enc1_len + (enc1_len >> 1));
    nt->match1.n = enc1_len;
    nt->match1.highest = enc1_hi;
    assert(nt->match1.highest < n_msgs);
413
    nt->match1.msginfo = new_array(unsigned char, nt->match1.max);
414 415
    memcpy(nt->match1.msginfo, enc1, nt->match1.n);
    /*}}}*/
Richard Curnow's avatar
Richard Curnow committed
416 417 418 419 420 421 422 423 424 425 426

    index = hash & out->mask;
    while (out->tokens[index]) {
      ++index;
      index &= out->mask;
    }

    out->tokens[index] = nt;
  }
}
/*}}}*/
427
struct database *new_database_from_file(char *db_filename, int do_integrity_checks)/*{{{*/
Richard P. Curnow's avatar
Richard P. Curnow committed
428
{
Richard Curnow's avatar
Richard Curnow committed
429 430 431
  /* Read existing database from file for doing incremental update */
  struct database *result;
  struct read_db *input;
432
  int i, n, N;
Richard P. Curnow's avatar
Richard P. Curnow committed
433

Richard P. Curnow's avatar
Richard P. Curnow committed
434
  result = new_database( CREATE_RANDOM_DATABASE_HASH );
Richard Curnow's avatar
Richard Curnow committed
435
  input = open_db(db_filename);
Richard Curnow's avatar
Richard Curnow committed
436 437 438 439 440
  if (!input) {
    /* Nothing to initialise */
    if (verbose) printf("Database file was empty, creating a new database\n");
    return result;
  }
Richard Curnow's avatar
Richard Curnow committed
441 442

  /* Build pathname information */
443 444 445 446 447
  n = result->n_msgs = input->n_msgs;
  result->max_msgs = input->n_msgs; /* let it be extended as-and-when */
  result->msgs = new_array(struct msgpath, n);
  result->type = new_array(enum message_type, n);

448 449
  result->hash_key = input->hash_key;

450 451 452 453 454 455 456
  /* Set up mbox structures */
  N = result->n_mboxen = result->max_mboxen = input->n_mboxen;
  result->mboxen = N ? (new_array(struct mbox, N)) : NULL;
  for (i=0; i<N; i++) {
    int nn;
    if (input->mbox_paths_table[i]) {
      result->mboxen[i].path = new_string(input->data + input->mbox_paths_table[i]);
Richard Curnow's avatar
Richard Curnow committed
457
    } else {
458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475
      /* mbox is dead. */
      result->mboxen[i].path = NULL;
    }
    result->mboxen[i].file_mtime = input->mbox_mtime_table[i];
    result->mboxen[i].file_size  = input->mbox_size_table[i];
    nn = result->mboxen[i].n_msgs = input->mbox_entries_table[i];
    result->mboxen[i].max_msgs = nn;
    result->mboxen[i].start = new_array(off_t, nn);
    result->mboxen[i].len   = new_array(size_t, nn);
    result->mboxen[i].check_all = new_array(checksum_t, nn);
    /* Copy the entire checksum table in one go. */
    memcpy(result->mboxen[i].check_all,
           input->data + input->mbox_checksum_table[i],
           nn * sizeof(checksum_t));
    result->mboxen[i].n_so_far = 0;
  }

  for (i=0; i<n; i++) {
476
    switch (rd_msg_type(input, i)) {
477 478 479 480 481 482 483 484 485
      case DB_MSG_DEAD:
        result->type[i] = MTY_DEAD;
        break;
      case DB_MSG_FILE:
        result->type[i] = MTY_FILE;
        result->msgs[i].src.mpf.path = new_string(input->data + input->path_offsets[i]);
        result->msgs[i].src.mpf.mtime = input->mtime_table[i];
        result->msgs[i].src.mpf.size  = input->size_table[i];
        break;
486 487 488 489 490 491
      case DB_MSG_IMAP:
        result->type[i] = MTY_IMAP;
        result->msgs[i].src.mpf.path = new_string(input->data + input->path_offsets[i]);
        result->msgs[i].src.mpf.mtime = 0;
        result->msgs[i].src.mpf.size  = 0;
        break;
492 493
      case DB_MSG_MBOX:
        {
494 495
          unsigned int mbi, msgi;
          int n;
496 497 498 499 500 501 502 503 504 505 506 507 508 509
          struct mbox *mb;
          result->type[i] = MTY_MBOX;
          decode_mbox_indices(input->path_offsets[i], &mbi, &msgi);
          result->msgs[i].src.mbox.file_index = mbi;
          mb = &result->mboxen[mbi];
          assert(mb->n_so_far == msgi);
          n = mb->n_so_far;
          result->msgs[i].src.mbox.msg_index = n;
          mb->start[n] = input->mtime_table[i];
          mb->len[n] = input->size_table[i];
          ++mb->n_so_far;
        }

        break;
Richard Curnow's avatar
Richard Curnow committed
510
    }
511 512 513
    result->msgs[i].seen    = (input->msg_type_and_flags[i] & FLAG_SEEN)    ? 1:0;
    result->msgs[i].replied = (input->msg_type_and_flags[i] & FLAG_REPLIED) ? 1:0;
    result->msgs[i].flagged = (input->msg_type_and_flags[i] & FLAG_FLAGGED) ? 1:0;
514 515
    result->msgs[i].date  = input->date_table[i];
    result->msgs[i].tid   = input->tid_table[i];
Richard Curnow's avatar
Richard Curnow committed
516 517
  }

518 519 520 521 522
  import_toktable(input->data, input->hash_key, result->n_msgs, &input->to, result->to);
  import_toktable(input->data, input->hash_key, result->n_msgs, &input->cc, result->cc);
  import_toktable(input->data, input->hash_key, result->n_msgs, &input->from, result->from);
  import_toktable(input->data, input->hash_key, result->n_msgs, &input->subject, result->subject);
  import_toktable(input->data, input->hash_key, result->n_msgs, &input->body, result->body);
Richard P. Curnow's avatar
Richard P. Curnow committed
523
  import_toktable(input->data, input->hash_key, result->n_msgs, &input->attachment_name, result->attachment_name);
524
  import_toktable2(input->data, input->hash_key, result->n_msgs, &input->msg_ids, result->msg_ids);
Richard Curnow's avatar
Richard Curnow committed
525 526 527

  close_db(input);

528 529 530
  if (do_integrity_checks) {
    check_database_integrity(result);
  }
Richard P. Curnow's avatar
Richard P. Curnow committed
531

Richard Curnow's avatar
Richard Curnow committed
532 533 534 535
  return result;
}
/*}}}*/

536
static void add_angled_terms(int file_index, unsigned int hash_key, struct toktable2 *table, int add_to_chain1, char *s)/*{{{*/
Richard Curnow's avatar
Richard Curnow committed
537 538 539 540 541 542 543 544 545
{
  char *left, *right;

  if (s) {
    left = strchr(s, '<');
    while (left) {
      right = strchr(left, '>');
      if (right) {
        *right = '\0';
546
        add_token2_in_file(file_index, hash_key, left+1, table, add_to_chain1);
Richard Curnow's avatar
Richard Curnow committed
547 548 549 550 551 552 553 554 555 556
        *right = '>'; /* restore */
      } else {
        break;
      }
      left = strchr(right, '<');
    }
  }
}
/*}}}*/

557 558 559 560 561 562 563 564 565 566 567 568 569 570
/* Macro for what characters can make up token strings.

   The following characters have special meanings:
   0x2b +
   0x2d -
   0x2e .
   0x40 @
   0x5f _

   since they can occur within email addresses and message IDs when considered
   as a whole rather than as individual words.  Underscore (0x5f) is considered
   a word-character always too.

   */
571 572 573
static unsigned char special_table[256] = {
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 00-0f */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 10-1f */
574
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 2, 0, /* 20-2f */
575 576 577 578 579 580 581 582 583 584 585 586 587 588 589
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 30-3f */
  2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 40-4f */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, /* 50-5f */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 60-6f */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 70-7f */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8f */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 90-9f */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* a0-af */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* b0-bf */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* c0-cf */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* d0-df */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* e0-ef */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* f0-ff */
};

590 591 592 593 594 595 596 597 598 599 600
#if 0
#define CHAR_VALID(x,mask) (isalnum((unsigned char) x) || (special_table[(unsigned int)(unsigned char) x] & mask))
#endif
static inline int char_valid_p(char x, unsigned int mask)/*{{{*/
{
  unsigned char xx = (unsigned char) x;
  if (isalnum(xx)) return 1;
  else if (special_table[(unsigned int) xx] & mask) return 1;
  else return 0;
}
/*}}}*/
601
static void tokenise_string(int file_index, unsigned int hash_key, struct toktable *table, char *data, int match_mask)/*{{{*/
Richard Curnow's avatar
Richard Curnow committed
602
{
603 604
  char *ss, *es, old_es;
  ss = data;
Richard Curnow's avatar
Richard Curnow committed
605
  for (;;) {
606
    while (*ss && !char_valid_p(*ss,match_mask)) ss++;
Richard Curnow's avatar
Richard Curnow committed
607 608
    if (!*ss) break;
    es = ss + 1;
609
    while (*es && char_valid_p(*es,match_mask)) es++;
Richard P. Curnow's avatar
Richard P. Curnow committed
610

Richard Curnow's avatar
Richard Curnow committed
611 612 613 614
    /* deal with token [ss,es) */
    old_es = *es;
    *es = '\0';
    /* FIXME: Ought to do this by passing start and length - clean up later */
615
    add_token_in_file(file_index, hash_key, ss, table);
Richard Curnow's avatar
Richard Curnow committed
616 617 618 619 620 621 622
    *es = old_es;

    if (!*es) break;
    ss = es;
  }
}
/*}}}*/
623
static void tokenise_html_string(int file_index, unsigned int hash_key, struct toktable *table, char *data)/*{{{*/
Richard Curnow's avatar
Richard Curnow committed
624
{
625
  char *ss, *es, old_es;
Richard Curnow's avatar
Richard Curnow committed
626 627

  /* FIXME : Probably want to rewrite this as an explicit FSM */
Richard P. Curnow's avatar
Richard P. Curnow committed
628

629
  ss = data;
Richard Curnow's avatar
Richard Curnow committed
630 631
  for (;;) {
    /* Assume < and > are never valid token characters ! */
632
    while (*ss && !char_valid_p(*ss, 1)) {
Richard Curnow's avatar
Richard Curnow committed
633 634 635 636 637 638
      if (*ss++ == '<') {
        /* Skip over HTML tag */
        while (*ss && (*ss != '>')) ss++;
      }
    }
    if (!*ss) break;
Richard P. Curnow's avatar
Richard P. Curnow committed
639

Richard Curnow's avatar
Richard Curnow committed
640
    es = ss + 1;
641
    while (*es && char_valid_p(*es, 1)) es++;
Richard P. Curnow's avatar
Richard P. Curnow committed
642

Richard Curnow's avatar
Richard Curnow committed
643 644 645 646
    /* deal with token [ss,es) */
    old_es = *es;
    *es = '\0';
    /* FIXME: Ought to do this by passing start and length - clean up later */
647
    add_token_in_file(file_index, hash_key, ss, table);
Richard Curnow's avatar
Richard Curnow committed
648 649 650 651 652 653 654
    *es = old_es;

    if (!*es) break;
    ss = es;
  }
}
/*}}}*/
655
void tokenise_message(int file_index, struct database *db, struct rfc822 *msg)/*{{{*/
Richard Curnow's avatar
Richard Curnow committed
656 657 658
{
  struct attachment *a;

659 660
  /* Match on whole addresses in these headers as well as the individual words */
  if (msg->hdrs.to) {
661 662
    tokenise_string(file_index, db->hash_key, db->to, msg->hdrs.to, 1);
    tokenise_string(file_index, db->hash_key, db->to, msg->hdrs.to, 2);
663 664
  }
  if (msg->hdrs.cc) {
665 666
    tokenise_string(file_index, db->hash_key, db->cc, msg->hdrs.cc, 1);
    tokenise_string(file_index, db->hash_key, db->cc, msg->hdrs.cc, 2);
667 668
  }
  if (msg->hdrs.from) {
669 670
    tokenise_string(file_index, db->hash_key, db->from, msg->hdrs.from, 1);
    tokenise_string(file_index, db->hash_key, db->from, msg->hdrs.from, 2);
671
  }
672
  if (msg->hdrs.subject) tokenise_string(file_index, db->hash_key, db->subject, msg->hdrs.subject, 1);
Richard Curnow's avatar
Richard Curnow committed
673 674 675 676

  for (a=msg->atts.next; a!=&msg->atts; a=a->next) {
    switch (a->ct) {
      case CT_TEXT_PLAIN:
677
        tokenise_string(file_index, db->hash_key, db->body, a->data.normal.bytes, 1);
Richard Curnow's avatar
Richard Curnow committed
678 679
        break;
      case CT_TEXT_HTML:
680
        tokenise_html_string(file_index, db->hash_key, db->body, a->data.normal.bytes);
Richard Curnow's avatar
Richard Curnow committed
681 682 683 684 685
        break;
      case CT_MESSAGE_RFC822:
        /* Just recurse for now - maybe we should have separate token tables
         * for tokens occurring in embedded messages? */

Richard Curnow's avatar
Richard Curnow committed
686 687 688
        if (a->data.rfc822) {
          tokenise_message(file_index, db, a->data.rfc822);
        }
Richard Curnow's avatar
Richard Curnow committed
689 690 691 692 693 694 695 696 697
        break;
      default:
        /* Don't do anything - unknown text format or some nasty binary stuff.
         * In future, we could have all kinds of 'plug-ins' here, e.g.
         * something that can parse PDF to get the basic text strings out of
         * the pages? */
        break;
    }

698 699 700 701
    if (a->filename) {
      add_token_in_file(file_index, db->hash_key, a->filename, db->attachment_name);
    }

Richard Curnow's avatar
Richard Curnow committed
702 703 704
  }

  /* Deal with threading information */
705 706 707
  add_angled_terms(file_index, db->hash_key, db->msg_ids, 1, msg->hdrs.message_id);
  add_angled_terms(file_index, db->hash_key, db->msg_ids, 0, msg->hdrs.in_reply_to);
  add_angled_terms(file_index, db->hash_key, db->msg_ids, 0, msg->hdrs.references);
Richard Curnow's avatar
Richard Curnow committed
708 709
}
/*}}}*/
710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735

static void scan_maildir_flags(struct msgpath *m)/*{{{*/
{
  const char *p, *start;
  start = m->src.mpf.path;
  m->seen = 0;
  m->replied = 0;
  m->flagged = 0;
  for (p=start; *p; p++) {}
  for (p--; (p >= start) && ((*p) != ':'); p--) {}
  if (p >= start) {
    if (!strncmp(p, ":2,", 3)) {
      p += 3;
      while (*p) {
        switch (*p) {
          case 'F': m->flagged = 1; break;
          case 'R': m->replied = 1; break;
          case 'S': m->seen = 1; break;
          default: break;
        }
        p++;
      }
    }
  }
}
/*}}}*/
736
static void scan_new_messages(struct database *db, int start_at, struct imap_ll *imapc)/*{{{*/
Richard Curnow's avatar
Richard Curnow committed
737 738
{
  int i;
739 740
  for (i=start_at; i<db->n_msgs; i++) {
    struct rfc822 *msg = NULL;
741 742 743 744 745
    int len = strlen(db->msgs[i].src.mpf.path);

    if (len > 10 && !strcmp(db->msgs[i].src.mpf.path + len - 11, "/.gitignore"))
      continue;

746 747 748 749 750 751 752 753 754 755 756
    switch (db->type[i]) {
      case MTY_DEAD:
        assert(0);
        break;
      case MTY_MBOX:
        assert(0); /* Should never get here - mbox messages are scanned elsewhere. */
        break;
      case MTY_FILE:
        if (verbose) fprintf(stderr, "Scanning <%s>\n", db->msgs[i].src.mpf.path);
        msg = make_rfc822(db->msgs[i].src.mpf.path);
        break;
757 758 759 760
      case MTY_IMAP:
        if (verbose) fprintf(stderr, "Scanning IMAP <%s>\n", db->msgs[i].src.mpf.path);
        msg = make_rfc822_from_imap(db->msgs[i].src.mpf.path, imapc);
        break;
761
    }
Richard P. Curnow's avatar
Richard P. Curnow committed
762
    if(msg)
Richard Curnow's avatar
Richard Curnow committed
763
    {
764
      db->msgs[i].date = msg->hdrs.date;
765
      scan_maildir_flags(&db->msgs[i]);
Richard Curnow's avatar
Richard Curnow committed
766 767 768 769
      tokenise_message(i, db, msg);
      free_rfc822(msg);
    }
    else
770
      fprintf(stderr, "Skipping %s (could not parse message)\n", db->msgs[i].src.mpf.path);
Richard Curnow's avatar
Richard Curnow committed
771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810
  }
}
/*}}}*/

static inline void set_bit(unsigned long *x, int n)/*{{{*/
{
  int set;
  unsigned long mask;
  set = (n >> 5);
  mask = (1UL << (n & 31));
  x[set] |= mask;
}
/*}}}*/
static inline int isset_bit(unsigned long *x, int n)/*{{{*/
{
  int set;
  unsigned long mask;
  set = (n >> 5);
  mask = (1UL << (n & 31));
  return (x[set] & mask) ? 1 : 0;
}
/*}}}*/
static int find_base(int *table, int index) {/*{{{*/
  int a = index;

  /* TODO : make this compress the path lengths down to the base entry */
  while (table[a] != a) {
    a = table[a];
  }
  return a;
}
/*}}}*/
static void find_threading(struct database *db)/*{{{*/
{

  /* ix is a table mapping path array index to the lowest path array index that
   * is known to share at least one message ID in its hdrs somewhere (i.e. they
   * must be in the same thread) */
  int *ix;

811
  int i, m, np, sm;
Richard Curnow's avatar
Richard Curnow committed
812
  int next_tid;
Richard P. Curnow's avatar
Richard P. Curnow committed
813

814
  np = db->n_msgs;
Richard Curnow's avatar
Richard Curnow committed
815
  sm = db->msg_ids->size;
Richard P. Curnow's avatar
Richard P. Curnow committed
816

Richard Curnow's avatar
Richard Curnow committed
817 818 819 820 821 822
  ix = new_array(int, np);
  for (i=0; i<np; i++) {
    ix[i] = i; /* default - every message in a thread of its own */
  }

  for (m=0; m<sm; m++) {
823
    struct token2 *tok = db->msg_ids->tokens[m];
Richard Curnow's avatar
Richard Curnow committed
824
    if (tok) {
825 826
      unsigned char *j = tok->match0.msginfo;
      unsigned char *last_char = j + tok->match0.n;
Richard Curnow's avatar
Richard Curnow committed
827
      int cur = 0, incr, first=1;
828
      int new_base=-1, old_base;
Richard Curnow's avatar
Richard Curnow committed
829 830 831 832 833 834 835 836 837 838 839 840
      while (j < last_char) {
        incr = read_increment(&j);
        cur += incr;
        if (first) {
          new_base = find_base(ix, cur);
          first = 0;
        } else {
          old_base = find_base(ix, cur);
          if (old_base < new_base) {
            ix[new_base] = old_base;
            new_base = old_base;
          } else if (old_base > new_base) {
841
            assert(new_base != -1);
Richard Curnow's avatar
Richard Curnow committed
842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860
            ix[old_base] = new_base;
          }
        }
      }
    }
  }

  /* Now make each entry point directly to its base */
  for (i=0; i<np; i++) {
    if (ix[i] != i) {
      /* Sure to work as we're going up from the bottom */
      ix[i] = ix[ix[i]];
    }
  }

  /* Now allocate contiguous thread group numbers */
  next_tid = 0;
  for (i=0; i<np; i++) {
    if (ix[i] == i) {
861
      db->msgs[i].tid = next_tid++;
Richard Curnow's avatar
Richard Curnow committed
862
    } else {
863
      db->msgs[i].tid = db->msgs[ix[i]].tid;
Richard Curnow's avatar
Richard Curnow committed
864 865 866 867 868 869 870
    }
  }

  free(ix);
  return;
}
/*}}}*/
871
static int lookup_msgpath(struct msgpath *sorted_paths, int n_msgs, char *key, enum message_type type)/*{{{*/
Richard Curnow's avatar
Richard Curnow committed
872 873 874
{
  /* Implement bisection search */
 int l, h, m, r;
875
 l = 0, h = n_msgs;
Richard Curnow's avatar
Richard Curnow committed
876 877 878
 m = -1;
 while (h > l) {
   m = (h + l) >> 1;
879
   /* Should only get called on 'file' type messages - TBC */
880 881 882 883 884 885 886
   if (sorted_paths[m].type < type) {
     r = -1;
   } else if (sorted_paths[m].type > type) {
     r = 1;
   } else {
     r = strcmp(sorted_paths[m].src.mpf.path, key);
   }
Richard Curnow's avatar
Richard Curnow committed
887 888 889 890 891 892 893 894
   if (r == 0) break;
   if (l == m) return -1;
   if (r > 0) h = m;
   else       l = m;
 }
 return m;
}
/*}}}*/
895
void maybe_grow_message_arrays(struct database *db)/*{{{*/
Richard Curnow's avatar
Richard Curnow committed
896
{
897
  if (db->n_msgs == db->max_msgs) {
898
    if (db->max_msgs <= 128) {
899
      db->max_msgs = 256;
Richard Curnow's avatar
Richard Curnow committed
900
    } else {
901
      db->max_msgs += (db->max_msgs >> 1);
Richard Curnow's avatar
Richard Curnow committed
902
    }
903 904
    db->msgs  = grow_array(struct msgpath,    db->max_msgs, db->msgs);
    db->type  = grow_array(enum message_type, db->max_msgs, db->type);
Richard Curnow's avatar
Richard Curnow committed
905 906 907
  }
}
/*}}}*/
908
static void add_msg_path(struct database *db, char *path, time_t mtime, size_t message_size, enum message_type type)/*{{{*/
909 910
{
  maybe_grow_message_arrays(db);
911
  db->type[db->n_msgs] = type;
912 913 914 915 916 917
  db->msgs[db->n_msgs].src.mpf.path = new_string(path);
  db->msgs[db->n_msgs].src.mpf.mtime = mtime;
  db->msgs[db->n_msgs].src.mpf.size = message_size;
  ++db->n_msgs;
}
/*}}}*/
918 919 920 921 922

static int do_stat(struct msgpath *mp)/*{{{*/
{
  struct stat sb;
  int status;
923 924 925 926 927
  if (mp->type == MTY_IMAP) {
    mp->src.mpf.mtime = 0;
    mp->src.mpf.size = 0;
    return 1;
  }
928 929 930 931 932 933 934 935 936 937 938
  status = stat(mp->src.mpf.path, &sb);
  if ((status < 0) ||
      !S_ISREG(sb.st_mode)) {
    return 0;
  } else {
    mp->src.mpf.mtime = sb.st_mtime;
    mp->src.mpf.size = sb.st_size;
    return 1;
  }
}
/*}}}*/
939
int update_database(struct database *db, struct msgpath *sorted_paths, int n_msgs, int do_fast_index, struct imap_ll *imapc)/*{{{*/
Richard Curnow's avatar
Richard Curnow committed
940 941 942 943 944 945 946 947 948 949 950 951 952
{
  /* The incoming list must be sorted into order, to make binary searching
   * possible.  We search for each existing path in the incoming sorted array.
   * If the date differs, or the file no longer exist, the existing database
   * entry for that file is nulled.  (These are only recovered if the database
   * is actively compressed.)  If the date differed, a new entry for the file
   * is put at the end of the list.  Similarly, any new file goes at the end.
   * These new entries are all rescanned to find tokens and add them to the
   * database. */

  char *file_in_db, *file_in_new_list;
  int matched_index;
  int i, new_entries_start_at;
953
  int any_new, n_newly_pruned, n_already_dead;
954
  int status;
Richard P. Curnow's avatar
Richard P. Curnow committed
955

956 957 958 959 960 961 962 963 964 965 966
  file_in_db = new_array(char, n_msgs);
  file_in_new_list = new_array(char, db->n_msgs);
  bzero(file_in_db, n_msgs);
  bzero(file_in_new_list, db->n_msgs);

  n_already_dead = 0;
  n_newly_pruned = 0;

  for (i=0; i<db->n_msgs; i++) {
    switch (db->type[i]) {
      case MTY_FILE:
967
        matched_index = lookup_msgpath(sorted_paths, n_msgs, db->msgs[i].src.mpf.path, MTY_FILE);
968
        if (matched_index >= 0) {
969 970 971 972 973 974 975 976 977 978 979 980 981
          if (do_fast_index) {
            /* Assume the presence of a matching path is good enough without
             * even bothering to stat the file that's there now. */
            file_in_db[matched_index] = 1;
            file_in_new_list[i] = 1;
          } else {
            status = do_stat(sorted_paths + matched_index);
            if (status) {
              if (sorted_paths[matched_index].src.mpf.mtime == db->msgs[i].src.mpf.mtime) {
                /* Treat stale files as though the path has changed. */
                file_in_db[matched_index] = 1;
                file_in_new_list[i] = 1;
              }
982
            } else {
983 984
              /* This path will get treated as dead, and be re-stated below.
               * When that stat fails, the path won't get added to the db. */
985 986
            }
          }
987 988
        }
        break;
989 990 991 992 993 994 995
      case MTY_IMAP:
        matched_index = lookup_msgpath(sorted_paths, n_msgs, db->msgs[i].src.mpf.path, MTY_IMAP);
        if (matched_index >= 0) {
          file_in_db[matched_index] = 1;
          file_in_new_list[i] = 1;
        }
        break;
996 997 998 999 1000
      case MTY_MBOX:
        /* Nothing to do on this pass. */
        break;
      case MTY_DEAD:
        break;
Richard Curnow's avatar
Richard Curnow committed
1001 1002 1003 1004
    }
  }

  /* Add new entries to database */
1005
  new_entries_start_at = db->n_msgs;
Richard P. Curnow's avatar
Richard P. Curnow committed
1006

1007
  for (i=0; i<db->n_msgs; i++) {
Richard Curnow's avatar
Richard Curnow committed
1008
    /* Weed dead entries */
1009 1010
    switch (db->type[i]) {
      case MTY_FILE:
1011
      case MTY_IMAP:
1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037
        if (!file_in_new_list[i]) {
          free(db->msgs[i].src.mpf.path);
          db->msgs[i].src.mpf.path = NULL;
          db->type[i] = MTY_DEAD;
          ++n_newly_pruned;
        }
        break;
      case MTY_MBOX:
        {
          int msg_index, file_index, number_valid;
          int mbox_valid;
          msg_index = db->msgs[i].src.mbox.msg_index;
          file_index = db->msgs[i].src.mbox.file_index;
          assert (file_index < db->n_mboxen);
          mbox_valid = (db->mboxen[file_index].path) ? 1 : 0;
          number_valid = db->mboxen[file_index].n_old_msgs_valid;
          if (!mbox_valid || (msg_index >= number_valid)) {
            db->type[i] = MTY_DEAD;
            ++n_newly_pruned;
          }
        }
        break;
      case MTY_DEAD:
        /* already dead */
        ++n_already_dead;
        break;
Richard Curnow's avatar
Richard Curnow committed
1038 1039 1040 1041
    }
  }

  if (verbose) {
1042
    fprintf(stderr, "%d newly dead messages, %d messages now dead in total\n", n_newly_pruned, n_newly_pruned+n_already_dead);
Richard Curnow's avatar
Richard Curnow committed
1043 1044 1045
  }

  any_new = 0;
1046
  for (i=0; i<n_msgs; i++) {
Richard Curnow's avatar
Richard Curnow committed
1047
    if (!file_in_db[i]) {
1048
      int status;
Richard Curnow's avatar
Richard Curnow committed
1049
      any_new = 1;
1050
      /* The 'sorted_paths' array is only used for file-per-message folders. */
1051 1052 1053 1054
      status = do_stat(sorted_paths + i);
      if (status) {
        /* We only add files that could be successfully stat()'d as regular
         * files. */
1055
        add_msg_path(db, sorted_paths[i].src.mpf.path, sorted_paths[i].src.mpf.mtime, sorted_paths[i].src.mpf.size, sorted_paths[i].type);
1056 1057 1058
      } else {
        fprintf(stderr, "Cannot add '%s' to database; stat() failed\n", sorted_paths[i].src.mpf.path);
      }
Richard Curnow's avatar
Richard Curnow committed
1059 1060
    }
  }
1061

Richard Curnow's avatar
Richard Curnow committed
1062
  if (any_new) {
1063
    scan_new_messages(db, new_entries_start_at, imapc);
1064 1065 1066 1067
  }

  /* Add newly found mbox messages. */
  any_new |= add_mbox_messages(db);
Richard P. Curnow's avatar
Richard P. Curnow committed
1068

1069
  if (any_new) {
Richard Curnow's avatar
Richard Curnow committed
1070 1071 1072 1073
    find_threading(db);
  } else {
    if (verbose) fprintf(stderr, "No new messages found\n");
  }
Richard P. Curnow's avatar
Richard P. Curnow committed
1074

Richard Curnow's avatar
Richard Curnow committed
1075 1076 1077
  free(file_in_db);
  free(file_in_new_list);

1078
  return any_new || (n_newly_pruned > 0);
Richard Curnow's avatar
Richard Curnow committed
1079 1080
}
/*}}}*/
1081
static void recode_encoding(struct matches *m, int *new_idx)/*{{{*/
Richard Curnow's avatar
Richard Curnow committed
1082 1083 1084 1085
{
  unsigned char *new_enc, *old_enc;
  unsigned char *j, *last_char;
  int incr, idx, n_idx;
1086 1087 1088 1089

  old_enc = m->msginfo;
  j = old_enc;
  last_char = old_enc + m->n;
Richard P. Curnow's avatar
Richard P. Curnow committed
1090

1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111
  new_enc = new_array(unsigned char, m->max); /* Probably not bigger than this. */
  m->n = 0;
  m->highest = 0;
  m->msginfo = new_enc;
  idx = 0;

  while (j < last_char) {
    incr = read_increment(&j);
    idx += incr;
    n_idx = new_idx[idx];
    if (n_idx >= 0) {
      check_and_enlarge_encoding(m);
      insert_index_on_encoding(m, n_idx);
    }
  }
  free(old_enc);
}
/*}}}*/
static void recode_toktable(struct toktable *tbl, int *new_idx)/*{{{*/
{
  /* Re-encode the vectors according to the new path indices */
Richard Curnow's avatar
Richard Curnow committed
1112 1113 1114
  int i;
  int any_dead = 0;
  int any_moved, pass;
Richard P. Curnow's avatar
Richard P. Curnow committed
1115

Richard Curnow's avatar
Richard Curnow committed
1116 1117 1118
  for (i=0; i<tbl->size; i++) {
    struct token *tok = tbl->tokens[i];
    if (tok) {
1119 1120 1121 1122 1123 1124 1125
      recode_encoding(&tok->match0, new_idx);
      if (tok->match0.n == 0) {
        /* Delete this token.  Gotcha - there may be tokens further on in the
         * array that didn't get their natural hash bucket due to collisions.
         * Need to shuffle such tokens up to guarantee that the buckets between
         * the natural one and the one where they are now are all occupied, to
         * prevent their lookups failing. */
Richard Curnow's avatar
Richard Curnow committed
1126

1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182
#if 0
        fprintf(stderr, "Token <%s> (bucket %d) no longer has files containing it, deleting\n", tok->text, i);
#endif
        free_token(tok);
        tbl->tokens[i] = NULL;
        --tbl->n; /* Maintain number in use counter */
        any_dead = 1;
      }

    }
  }


  if (any_dead) {
    /* Now close gaps.  This has to be done in a second pass, otherwise we get a
     * problem with moving entries that need deleting back before the current
       scan point. */

    pass = 1;
    for (;;) {
      int i;

      if (verbose) {
        fprintf(stderr, "Pass %d\n", pass);
      }

      any_moved = 0;

      for (i=0; i<tbl->size; i++) {
        if (tbl->tokens[i]) {
          int nat_bucket_i;
          nat_bucket_i = tbl->tokens[i]->hashval & tbl->mask;
          if (nat_bucket_i != i) {
            /* Find earliest bucket that we could move i to */
            int j = nat_bucket_i;
            while (j != i) {
              if (!tbl->tokens[j]) {
                /* put it here */
#if 0
                fprintf(stderr, "Moved <%s> from bucket %d to %d (natural bucket %d)\n", tbl->tokens[i]->text, i, j, nat_bucket_i);
#endif
                tbl->tokens[j] = tbl->tokens[i];
                tbl->tokens[i] = NULL;
                any_moved = 1;
                break;
              } else {
                j++;
                j &= tbl->mask;
              }
            }
            if (tbl->tokens[i]) {
#if 0
              fprintf(stderr, "NOT moved <%s> from bucket %d (natural bucket %d)\n", tbl->tokens[i]->text, i, nat_bucket_i);
#endif
            }
          }
Richard Curnow's avatar
Richard Curnow committed
1183 1184
        }
      }
1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197

      if (!any_moved) break;
      pass++;
    }
  }
}
/*}}}*/
static void recode_toktable2(struct toktable2 *tbl, int *new_idx)/*{{{*/
{
  /* Re-encode the vectors according to the new path indices */
  int i;
  int any_dead = 0;
  int any_moved, pass;
Richard P. Curnow's avatar
Richard P. Curnow committed
1198

1199 1200 1201 1202 1203 1204
  for (i=0; i<tbl->size; i++) {
    struct token2 *tok = tbl->tokens[i];
    if (tok) {
      recode_encoding(&tok->match0, new_idx);
      recode_encoding(&tok->match1, new_idx);
      if ((tok->match0.n == 0) && (tok->match1.n == 0)) {
Richard Curnow's avatar
Richard Curnow committed
1205 1206 1207 1208 1209 1210
        /* Delete this token.  Gotcha - there may be tokens further on in the
         * array that didn't get their natural hash bucket due to collisions.
         * Need to shuffle such tokens up to guarantee that the buckets between
         * the natural one and the one where they are now are all occupied, to
         * prevent their lookups failing. */

Richard Curnow's avatar
Richard Curnow committed
1211
#if 0
Richard Curnow's avatar
Richard Curnow committed
1212
        fprintf(stderr, "Token <%s> (bucket %d) no longer has files containing it, deleting\n", tok->text, i);
Richard Curnow's avatar
Richard Curnow committed
1213
#endif
1214
        free_token2(tok);
Richard Curnow's avatar
Richard Curnow committed
1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230
        tbl->tokens[i] = NULL;
        --tbl->n; /* Maintain number in use counter */
        any_dead = 1;
      }
    }
  }

  if (any_dead) {
    /* Now close gaps.  This has to be done in a second pass, otherwise we get a
     * problem with moving entries that need deleting back before the current
       scan point. */

    pass = 1;
    for (;;) {
      int i;

1231 1232 1233 1234
      if (verbose) {
        fprintf(stderr, "Pass %d\n", pass);
      }

Richard Curnow's avatar
Richard Curnow committed
1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246
      any_moved = 0;

      for (i=0; i<tbl->size; i++) {
        if (tbl->tokens[i]) {
          int nat_bucket_i;
          nat_bucket_i = tbl->tokens[i]->hashval & tbl->mask;
          if (nat_bucket_i != i) {
            /* Find earliest bucket that we could move i to */
            int j = nat_bucket_i;
            while (j != i) {
              if (!tbl->tokens[j]) {
                /* put it here */
Richard Curnow's avatar
Richard Curnow committed
1247
#if 0
Richard Curnow's avatar
Richard Curnow committed
1248
                fprintf(stderr, "Moved <%s> from bucket %d to %d (natural bucket %d)\n", tbl->tokens[i]->text, i, j, nat_bucket_i);
Richard Curnow's avatar
Richard Curnow committed
1249
#endif
Richard Curnow's avatar
Richard Curnow committed
1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273
                tbl->tokens[j] = tbl->tokens[i];
                tbl->tokens[i] = NULL;
                any_moved = 1;
                break;
              } else {
                j++;
                j &= tbl->mask;
              }
            }
            if (tbl->tokens[i]) {
#if 0
              fprintf(stderr, "NOT moved <%s> from bucket %d (natural bucket %d)\n", tbl->tokens[i]->text, i, nat_bucket_i);
#endif
            }
          }
        }
      }

      if (!any_moved) break;
      pass++;
    }
  }
}
/*}}}*/
1274
int cull_dead_messages(struct database *db, int do_integrity_checks)/*{{{*/
Richard Curnow's avatar
Richard Curnow committed
1275 1276 1277 1278 1279 1280 1281 1282
{
  /* Return true if any culled */

  int *new_idx, i, j, n_old;
  int any_culled = 0;

  /* Check db is OK before we start on this. (Check afterwards is done in the
   * writer.c code.) */
1283 1284 1285
  if (do_integrity_checks) {
    check_database_integrity(db);
  }
Richard Curnow's avatar
Richard Curnow committed
1286 1287 1288 1289 1290

  if (verbose) {
    fprintf(stderr, "Culling dead messages\n");
  }

1291
  n_old = db->n_msgs;
Richard Curnow's avatar
Richard Curnow committed
1292 1293 1294

  new_idx = new_array(int, n_old);
  for (i=0, j=0; i<n_old; i++) {
1295 1296 1297
    switch (db->type[i]) {
      case MTY_FILE:
      case MTY_MBOX:
1298
      case MTY_IMAP:
1299 1300 1301 1302 1303 1304
        new_idx[i] = j++;
        break;
      case MTY_DEAD:
        new_idx[i] = -1;
        any_culled = 1;
        break;
Richard Curnow's avatar
Richard Curnow committed
1305 1306 1307 1308 1309 1310 1311 1312
    }
  }

  recode_toktable(db->to, new_idx);
  recode_toktable(db->cc, new_idx);
  recode_toktable(db->from, new_idx);
  recode_toktable(db->subject, new_idx);
  recode_toktable(db->body, new_idx);
1313
  recode_toktable(db->attachment_name, new_idx);
1314
  recode_toktable2(db->msg_ids, new_idx);
Richard P. Curnow's avatar
Richard P. Curnow committed
1315

Richard Curnow's avatar
Richard Curnow committed
1316 1317
  /* And crunch down the filename table */
  for (i=0, j=0; i<n_old; i++) {
1318 1319 1320 1321 1322
    switch (db->type[i]) {
      case MTY_DEAD:
        break;
      case MTY_FILE:
      case MTY_MBOX:
1323
      case MTY_IMAP:
1324 1325 1326 1327 1328 1329
        if (i > j) {
          db->msgs[j] = db->msgs[i];
          db->type[j]  = db->type[i];
        }
        j++;
        break;
Richard Curnow's avatar
Richard Curnow committed
1330 1331
    }
  }
1332
  db->n_msgs = j;
Richard Curnow's avatar
Richard Curnow committed
1333 1334 1335

  free(new_idx);

1336 1337 1338
  /* .. and cull dead mboxen */
  cull_dead_mboxen(db);

Richard Curnow's avatar
Richard Curnow committed
1339 1340 1341
  return any_culled;
}
/*}}}*/