tok.c 8.54 KB
Newer Older
Richard Curnow's avatar
Richard Curnow committed
1 2 3 4
/*
  mairix - message index builder and finder for maildir folders.

 **********************************************************************
5
 * Copyright (C) Richard P. Curnow  2002-2004, 2005
Richard P. Curnow's avatar
Richard P. Curnow committed
6
 *
Richard Curnow's avatar
Richard Curnow committed
7 8 9
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
Richard P. Curnow's avatar
Richard P. Curnow committed
10
 *
Richard Curnow's avatar
Richard Curnow committed
11 12 13 14
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
Richard P. Curnow's avatar
Richard P. Curnow committed
15
 *
Richard Curnow's avatar
Richard Curnow committed
16 17
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
18
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
Richard P. Curnow's avatar
Richard P. Curnow committed
19
 *
Richard Curnow's avatar
Richard Curnow committed
20 21 22 23 24 25 26 27 28
 **********************************************************************
 */

/* Functions for handling tokens */

#include <assert.h>
#include <ctype.h>
#include "mairix.h"

29 30 31 32 33 34 35
static void init_matches(struct matches *m) {/*{{{*/
  m->msginfo = NULL;
  m->n = 0;
  m->max = 0;
  m->highest = 0;
}
/*}}}*/
Richard Curnow's avatar
Richard Curnow committed
36 37 38 39
struct token *new_token(void)/*{{{*/
{
  struct token *result = new(struct token);
  result->text = NULL;
40 41 42 43 44 45 46 47 48 49
  init_matches(&result->match0);
  return result;
}
/*}}}*/
struct token2 *new_token2(void)/*{{{*/
{
  struct token2 *result = new(struct token2);
  result->text = NULL;
  init_matches(&result->match0);
  init_matches(&result->match1);
Richard Curnow's avatar
Richard Curnow committed
50 51 52 53 54 55
  return result;
}
/*}}}*/
void free_token(struct token *x)/*{{{*/
{
  if (x->text) free(x->text);
56 57 58 59 60 61 62 63 64
  if (x->match0.msginfo) free(x->match0.msginfo);
  free(x);
}
/*}}}*/
void free_token2(struct token2 *x)/*{{{*/
{
  if (x->text) free(x->text);
  if (x->match0.msginfo) free(x->match0.msginfo);
  if (x->match1.msginfo) free(x->match1.msginfo);
Richard Curnow's avatar
Richard Curnow committed
65 66 67 68 69 70 71 72 73 74 75 76 77
  free(x);
}
/*}}}*/
struct toktable *new_toktable(void)/*{{{*/
{
  struct toktable *result = new(struct toktable);
  result->tokens = NULL;
  result->n = 0;
  result->hwm = 0;
  result->size = 0;
  return result;
}
/*}}}*/
78 79 80 81 82 83 84 85 86 87
struct toktable2 *new_toktable2(void)/*{{{*/
{
  struct toktable2 *result = new(struct toktable2);
  result->tokens = NULL;
  result->n = 0;
  result->hwm = 0;
  result->size = 0;
  return result;
}
/*}}}*/
Richard Curnow's avatar
Richard Curnow committed
88 89 90 91 92 93 94 95 96 97 98 99 100 101
void free_toktable(struct toktable *x)/*{{{*/
{
  if (x->tokens) {
    int i;
    for (i=0; i<x->size; i++) {
      if (x->tokens[i]) {
        free_token(x->tokens[i]);
      }
    }
    free(x->tokens);
  }
  free(x);
}
/*}}}*/
102 103 104 105 106 107 108 109 110 111 112 113 114 115 116
void free_toktable2(struct toktable2 *x)/*{{{*/
{
  if (x->tokens) {
    int i;
    for (i=0; i<x->size; i++) {
      if (x->tokens[i]) {
        free_token2(x->tokens[i]);
      }
    }
    free(x->tokens);
  }
  free(x);
}
/*}}}*/
/* FIXME : This stuff really needs cleaning up. */
Richard Curnow's avatar
Richard Curnow committed
117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153
static void enlarge_toktable(struct toktable *table)/*{{{*/
{
  if (table->size == 0) {
    int i;
    /* initial allocation */
    table->size = 1024;
    table->mask = table->size - 1;
    table->tokens = new_array(struct token *, table->size);
    for (i=0; i<table->size; i++) {
      table->tokens[i] = NULL;
    }
  } else {
    struct token **old_tokens;
    int old_size = table->size;
    int i;
    /* reallocate */
    old_tokens = table->tokens;
    table->size <<= 1;
    table->mask = table->size - 1;
    table->tokens = new_array(struct token *, table->size);
    for (i=0; i<table->size; i++) {
      table->tokens[i] = NULL;
    }
    for (i=0; i<old_size; i++) {
      unsigned long new_index;
      if (old_tokens[i]) {
        new_index = old_tokens[i]->hashval & table->mask;
        while (table->tokens[new_index]) {
          new_index++;
          new_index &= table->mask;
        }
        table->tokens[new_index] = old_tokens[i];
      }
    }
    free(old_tokens);
  }
  table->hwm = (table->size >> 2) + (table->size >> 3); /* allow 3/8 of nodes to be used */
154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192
}
/*}}}*/
static void enlarge_toktable2(struct toktable2 *table)/*{{{*/
{
  if (table->size == 0) {
    int i;
    /* initial allocation */
    table->size = 1024;
    table->mask = table->size - 1;
    table->tokens = new_array(struct token2 *, table->size);
    for (i=0; i<table->size; i++) {
      table->tokens[i] = NULL;
    }
  } else {
    struct token2 **old_tokens;
    int old_size = table->size;
    int i;
    /* reallocate */
    old_tokens = table->tokens;
    table->size <<= 1;
    table->mask = table->size - 1;
    table->tokens = new_array(struct token2 *, table->size);
    for (i=0; i<table->size; i++) {
      table->tokens[i] = NULL;
    }
    for (i=0; i<old_size; i++) {
      unsigned long new_index;
      if (old_tokens[i]) {
        new_index = old_tokens[i]->hashval & table->mask;
        while (table->tokens[new_index]) {
          new_index++;
          new_index &= table->mask;
        }
        table->tokens[new_index] = old_tokens[i];
      }
    }
    free(old_tokens);
  }
  table->hwm = (table->size >> 2) + (table->size >> 3); /* allow 3/8 of nodes to be used */
Richard Curnow's avatar
Richard Curnow committed
193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215
}
/*}}}*/
static int insert_value(unsigned char *x, int val)/*{{{*/
{
  assert(val >= 0);
  if (val <= 127) {
    *x = val;
    return 1;
  } else if (val <= 16383) {
    *x++ = (val >> 8) | 0x80;
    *x   = (val & 0xff);
    return 2;
  } else {
    int a = (val >> 24);
    assert (a <= 63);
    *x++ = a | 0xc0;
    *x++ = ((val >> 16) & 0xff);
    *x++ = ((val >>  8) & 0xff);
    *x   = (val         & 0xff);
    return 4;
  }
}
/*}}}*/
216
void check_and_enlarge_encoding(struct matches *m)/*{{{*/
Richard Curnow's avatar
Richard Curnow committed
217
{
218 219 220
  if (m->n + 4 >= m->max) {
    if (m->max == 0) {
      m->max = 16;
Richard Curnow's avatar
Richard Curnow committed
221
    } else {
222
      m->max += (m->max >> 1);
Richard Curnow's avatar
Richard Curnow committed
223
    }
224
    m->msginfo = grow_array(unsigned char, m->max, m->msginfo);
Richard Curnow's avatar
Richard Curnow committed
225 226 227
  }
}
/*}}}*/
228
void insert_index_on_encoding(struct matches *m, int idx)/*{{{*/
Richard Curnow's avatar
Richard Curnow committed
229
{
230
  if (m->n == 0) {
Richard Curnow's avatar
Richard Curnow committed
231
    /* Always encode value */
232
    m->n += insert_value(m->msginfo + m->n, idx);
Richard Curnow's avatar
Richard Curnow committed
233
  } else {
234 235 236 237
    assert(idx >= m->highest);
    if (idx > m->highest) {
      int increment = idx - m->highest;
      m->n += insert_value(m->msginfo + m->n, increment);
Richard Curnow's avatar
Richard Curnow committed
238 239 240 241
    } else {
      /* token has already been seen in this file */
    }
  }
242
  m->highest = idx;
Richard Curnow's avatar
Richard Curnow committed
243 244
}
/*}}}*/
245
void add_token_in_file(int file_index, unsigned int hash_key, char *tok_text, struct toktable *table)/*{{{*/
Richard Curnow's avatar
Richard Curnow committed
246 247 248 249
{
  unsigned long hash;
  int index;
  struct token *tok;
250 251
  char *lc_tok_text;
  char *p;
Richard Curnow's avatar
Richard Curnow committed
252

253
  lc_tok_text = new_string((char*)tok_text);
Richard Curnow's avatar
Richard Curnow committed
254
  for (p = lc_tok_text; *p; p++) {
255
    *p = tolower(*(unsigned char *) p);
Richard Curnow's avatar
Richard Curnow committed
256 257
  }
  /* 2nd arg is string length */
258
  hash = hashfn((unsigned char *) lc_tok_text, p - lc_tok_text, hash_key);
Richard Curnow's avatar
Richard Curnow committed
259 260 261 262 263 264 265 266

  if (table->n >= table->hwm) {
    enlarge_toktable(table);
  }

  index = hash & table->mask;
  while (table->tokens[index]) {
    /* strcmp ok as text has been tolower'd earlier */
Richard P. Curnow's avatar
Richard P. Curnow committed
267
    if (!strcmp(lc_tok_text, table->tokens[index]->text))
Richard Curnow's avatar
Richard Curnow committed
268 269 270 271 272 273 274 275 276
      break;
    index++;
    index &= table->mask;
  }

  if (!table->tokens[index]) {
    /* Allocate new */
    struct token *new_tok = new_token();
    /* New token takes ownership of lc_tok_text, no need to free that later. */
277
    new_tok->text = (char *) lc_tok_text;
Richard Curnow's avatar
Richard Curnow committed
278 279 280 281 282 283
    new_tok->hashval = hash; /* save full width for later */
    table->tokens[index] = new_tok;
    ++table->n;
  } else {
    free(lc_tok_text);
  }
Richard P. Curnow's avatar
Richard P. Curnow committed
284

Richard Curnow's avatar
Richard Curnow committed
285
  tok = table->tokens[index];
Richard P. Curnow's avatar
Richard P. Curnow committed
286

287 288 289 290 291 292 293 294 295 296 297 298 299 300
  check_and_enlarge_encoding(&tok->match0);
  insert_index_on_encoding(&tok->match0, file_index);
}
/*}}}*/
void add_token2_in_file(int file_index, unsigned int hash_key, char *tok_text, struct toktable2 *table, int add_to_chain1)/*{{{*/
{
  unsigned long hash;
  int index;
  struct token2 *tok;
  char *lc_tok_text;
  char *p;

  lc_tok_text = new_string(tok_text);
  for (p = lc_tok_text; *p; p++) {
301
    *p = tolower(*(unsigned char *) p);
302 303
  }
  /* 2nd arg is string length */
304
  hash = hashfn((unsigned char *) lc_tok_text, p - lc_tok_text, hash_key);
305 306 307 308 309 310 311 312

  if (table->n >= table->hwm) {
    enlarge_toktable2(table);
  }

  index = hash & table->mask;
  while (table->tokens[index]) {
    /* strcmp ok as text has been tolower'd earlier */
Richard P. Curnow's avatar
Richard P. Curnow committed
313
    if (!strcmp(lc_tok_text, table->tokens[index]->text))
314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329
      break;
    index++;
    index &= table->mask;
  }

  if (!table->tokens[index]) {
    /* Allocate new */
    struct token2 *new_tok = new_token2();
    /* New token takes ownership of lc_tok_text, no need to free that later. */
    new_tok->text = lc_tok_text;
    new_tok->hashval = hash; /* save full width for later */
    table->tokens[index] = new_tok;
    ++table->n;
  } else {
    free(lc_tok_text);
  }
Richard P. Curnow's avatar
Richard P. Curnow committed
330

331
  tok = table->tokens[index];
Richard P. Curnow's avatar
Richard P. Curnow committed
332

333 334 335 336 337 338
  check_and_enlarge_encoding(&tok->match0);
  insert_index_on_encoding(&tok->match0, file_index);
  if (add_to_chain1) {
    check_and_enlarge_encoding(&tok->match1);
    insert_index_on_encoding(&tok->match1, file_index);
  }
Richard Curnow's avatar
Richard Curnow committed
339 340 341 342 343 344
}
/*}}}*/