similarity.h 6.71 KB
Newer Older
eulerto's avatar
eulerto committed
1 2 3 4 5 6 7 8 9
#ifndef SIMILARITY_H
#define	SIMILARITY_H

#include "postgres.h"

#include "fmgr.h"
#include "utils/builtins.h"
#include "utils/guc.h"

10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51
/*
 * XXX Windows workaround
 *
 * PGS_EXPORT decorates the symbols this module exposes. It expands to
 * nothing unless WIN32 is defined, in which case it becomes
 * __declspec(dllexport).
 */
#ifndef WIN32

#define PGS_EXPORT

#else

#define PGS_EXPORT		__declspec(dllexport)

/*
 * The stock PG_MODULE_MAGIC and PG_FUNCTION_INFO_V1 macros seem to be broken
 * here: they use PGDLLIMPORT, but the objects they declare are not imported
 * from postgres; they are exported by the user module. They must therefore
 * always be dllexported, so both macros are redefined below with PGS_EXPORT
 * on the declaration.
 */

/*
 * Declares (with PGS_EXPORT) and defines PG_MAGIC_FUNCTION_NAME, which
 * returns a pointer to a static Pg_magic_struct initialized from
 * PG_MODULE_MAGIC_DATA. The expansion ends in "extern int no_such_variable"
 * without a semicolon, so the semicolon written after the macro invocation
 * completes that declaration.
 */
#undef PG_MODULE_MAGIC
#define PG_MODULE_MAGIC \
extern PGS_EXPORT const Pg_magic_struct *PG_MAGIC_FUNCTION_NAME(void); \
const Pg_magic_struct * \
PG_MAGIC_FUNCTION_NAME(void) \
{ \
	static const Pg_magic_struct Pg_magic_data = PG_MODULE_MAGIC_DATA; \
	return &Pg_magic_data; \
} \
extern int no_such_variable

/*
 * Declares (with PGS_EXPORT) and defines pg_finfo_<funcname>, which returns
 * a pointer to a static Pg_finfo_record initialized with { 1 } (presumably
 * the function-manager API version -- confirm against fmgr.h). The expansion
 * ends the same way as PG_MODULE_MAGIC so the invocation's trailing
 * semicolon is consumed.
 */
#undef PG_FUNCTION_INFO_V1
#define PG_FUNCTION_INFO_V1(funcname) \
extern PGS_EXPORT const Pg_finfo_record * CppConcat(pg_finfo_,funcname)(void); \
const Pg_finfo_record * \
CppConcat(pg_finfo_,funcname) (void) \
{ \
	static const Pg_finfo_record my_finfo = { 1 }; \
	return &my_finfo; \
} \
extern int no_such_variable

#endif	/* Windows workaround */


eulerto's avatar
eulerto committed
52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206
/* case insensitive ? */
#define		PGS_IGNORE_CASE			1

/* maximum string length */
#define		PGS_MAX_STR_LEN			1024

/*
 * Jaro
 */

/*
 * operation's weight
 *
 * Each weight is one third. The quotient is parenthesized so the macro
 * behaves as a single value wherever it is used: without the parentheses
 * "x / PGS_JARO_W1" would expand to "x / 1.0 / 3.0".
 */
#define	PGS_JARO_W1					(1.0/3.0)
#define	PGS_JARO_W2					(1.0/3.0)
#define	PGS_JARO_WT					(1.0/3.0)

/* size of the initial prefix considered */
#define	PGS_JARO_PREFIX_SIZE		4

/* scaling factor */
#define	PGS_JARO_SCALING_FACTOR		0.1

/* minimum score for a string that gets boosted */
#define	PGS_JARO_BOOST_THRESHOLD	0.7

/*
 * Levenshtein
 */
#define		PGS_LEV_MIN_COST		0
#define		PGS_LEV_MAX_COST		1

/*
 * Needleman-Wunsch
 */

/*
 * Smith-Waterman
 *
 * Negative values are parenthesized so they stay a single operand in any
 * surrounding expression.
 */
/*
XXX simmetrics uses these values
#define		PGS_SW_MIN_COST			-2.0
#define		PGS_SW_MAX_COST			1.0
#define		PGS_SW_GAP_COST			0.5
*/
#define		PGS_SW_MIN_COST			(-1.0)
#define		PGS_SW_MAX_COST			2.0
#define		PGS_SW_GAP_COST			(-1.0)

/*
 * Smith-Waterman-Gotoh
 */
#define		PGS_SWG_WINDOW_SIZE		100

/*
 * Soundex
 */
#define		PGS_SOUNDEX_LEN			4
#define		PGS_SOUNDEX_INV_CODE		(-1)

/*
 * commonly used functions
 *
 * Minimum / maximum of two, three or four values. Every use of a parameter
 * is parenthesized so that arguments containing lower-precedence operators
 * (e.g. "x & y" or "c ? p : q") are compared as a whole.
 *
 * These are still macros: each argument may be evaluated more than once, so
 * do not pass expressions with side effects (i++, function calls that
 * modify state, ...).
 */
#define		min2(a, b)			(((a) < (b)) ? (a) : (b))
#define		max2(a, b)			(((a) > (b)) ? (a) : (b))
#define		min3(a, b, c)		(((a) < (b) && (a) < (c)) ? (a) : (((b) < (c)) ? (b) : (c)))
#define		max3(a, b, c)		(((a) > (b) && (a) > (c)) ? (a) : (((b) > (c)) ? (b) : (c)))
#define		max4(a, b, c, d)	(((a) > (b) && (a) > (c) && (a) > (d)) ? (a) : (((b) > (c) && (b) > (d)) ? (b) : (((c) > (d)) ? (c) : (d))))

/*
 * normalized results?
 *
 * One boolean flag per similarity function, named pgs_<function>_is_normalized.
 * They are only declared here; the definitions live in another translation
 * unit (presumably they are registered as configuration settings, given the
 * utils/guc.h include -- TODO confirm).
 */
extern bool	pgs_block_is_normalized;
extern bool	pgs_cosine_is_normalized;
extern bool	pgs_dice_is_normalized;
extern bool	pgs_euclidean_is_normalized;
extern bool	pgs_hamming_is_normalized;
extern bool	pgs_jaccard_is_normalized;
extern bool	pgs_jaro_is_normalized;
extern bool	pgs_jarowinkler_is_normalized;
extern bool	pgs_levenshtein_is_normalized;
extern bool	pgs_matching_is_normalized;
extern bool	pgs_mongeelkan_is_normalized;
extern bool	pgs_nw_is_normalized;
extern bool	pgs_overlap_is_normalized;
extern bool	pgs_qgram_is_normalized;
extern bool	pgs_sw_is_normalized;
extern bool	pgs_swg_is_normalized;

/*
 * Tokenization units: the available ways of splitting a string into
 * pieces.
 */
enum
{
	PGS_UNIT_WORD = 0,			/* split on spaces */
	PGS_UNIT_GRAM = 1,			/* split into n-grams */
	PGS_UNIT_ALNUM = 2,			/* split on non-alphanumeric characters */
	PGS_UNIT_CAMELCASE = 3		/* split on camel-case boundaries */
};

/*
 * tokenizers per function
 *
 * One int per token-based similarity function, named
 * pgs_<function>_tokenizer. Declared here, defined elsewhere. The values
 * are presumably the PGS_UNIT_* constants -- TODO confirm against the
 * definitions.
 */
extern int	pgs_block_tokenizer;
extern int	pgs_cosine_tokenizer;
extern int	pgs_dice_tokenizer;
extern int	pgs_euclidean_tokenizer;
extern int	pgs_jaccard_tokenizer;
extern int	pgs_matching_tokenizer;
extern int	pgs_mongeelkan_tokenizer;
extern int	pgs_overlap_tokenizer;
extern int	pgs_qgram_tokenizer;

/*
 * thresholds per function
 *
 * One float8 per similarity function, named pgs_<function>_threshold.
 * Declared here, defined elsewhere. Presumably each is the cut-off used by
 * the matching <function>_op entry point -- TODO confirm against the
 * implementation.
 */
extern float8	pgs_block_threshold;
extern float8	pgs_cosine_threshold;
extern float8	pgs_dice_threshold;
extern float8	pgs_euclidean_threshold;
extern float8	pgs_hamming_threshold;
extern float8	pgs_jaccard_threshold;
extern float8	pgs_jaro_threshold;
extern float8	pgs_jarowinkler_threshold;
extern float8	pgs_levenshtein_threshold;
extern float8	pgs_matching_threshold;
extern float8	pgs_mongeelkan_threshold;
extern float8	pgs_nw_threshold;
extern float8	pgs_overlap_threshold;
extern float8	pgs_qgram_threshold;
extern float8	pgs_sw_threshold;
extern float8	pgs_swg_threshold;

/*
 * gap penalty
 *
 * Declared here, defined elsewhere. The "nw" prefix matches the other
 * Needleman-Wunsch settings (pgs_nw_is_normalized, pgs_nw_threshold).
 */
extern float8	pgs_nw_gap_penalty;

/*
 * levenshtein.c
 *
 * Both functions take two C strings plus icost and dcost (presumably the
 * insertion and deletion costs -- confirm against the definitions) and
 * return an int. Judging by the name, _lev_slow is presumably a slower
 * variant of _lev -- TODO confirm.
 */
int _lev(char *a, char *b, int icost, int dcost);
int _lev_slow(char *a, char *b, int icost, int dcost);

/*
 * similarity.c
 *
 * Cost helpers plus the module initialization hook.
 *
 * levcost and nwcost take a pair of characters and return an int.
 * swcost and megapcost take two strings and two int indexes, i and j
 * (presumably positions within a and b -- confirm against the definitions),
 * and return a float. swggapcost takes only the two indexes.
 * _PG_init is the function name PostgreSQL calls when it loads a module.
 */
int levcost(char a, char b);
int nwcost(char a, char b);
float swcost(char *a, char *b, int i, int j);
float swggapcost(int i, int j);
float megapcost(char *a, char *b, int i, int j);
void _PG_init(void);

/*
 * external function declarations
 */
207 208 209 210 211 212 213 214 215 216
/*
 * Entry points using the PG_FUNCTION_ARGS calling convention and returning
 * a Datum, marked PGS_EXPORT. Each function comes paired with an "_op"
 * variant (presumably the boolean operator form that applies the
 * corresponding threshold -- TODO confirm against the implementation).
 */
extern Datum PGS_EXPORT block(PG_FUNCTION_ARGS);
extern Datum PGS_EXPORT block_op(PG_FUNCTION_ARGS);
extern Datum PGS_EXPORT cosine(PG_FUNCTION_ARGS);
extern Datum PGS_EXPORT cosine_op(PG_FUNCTION_ARGS);
extern Datum PGS_EXPORT dice(PG_FUNCTION_ARGS);
extern Datum PGS_EXPORT dice_op(PG_FUNCTION_ARGS);
extern Datum PGS_EXPORT euclidean(PG_FUNCTION_ARGS);
extern Datum PGS_EXPORT euclidean_op(PG_FUNCTION_ARGS);
extern Datum PGS_EXPORT hamming(PG_FUNCTION_ARGS);
extern Datum PGS_EXPORT hamming_op(PG_FUNCTION_ARGS);
217 218
/*
 * Hamming entry points with a "_text" suffix (presumably the variant that
 * operates on text arguments -- TODO confirm against the implementation).
 */
extern Datum PGS_EXPORT hamming_text(PG_FUNCTION_ARGS);
extern Datum PGS_EXPORT hamming_text_op(PG_FUNCTION_ARGS);
219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242
/*
 * Entry points using the PG_FUNCTION_ARGS calling convention and returning
 * a Datum, marked PGS_EXPORT. Each function comes paired with an "_op"
 * variant (presumably the boolean operator form that applies the
 * corresponding threshold -- TODO confirm against the implementation).
 */
extern Datum PGS_EXPORT jaccard(PG_FUNCTION_ARGS);
extern Datum PGS_EXPORT jaccard_op(PG_FUNCTION_ARGS);
extern Datum PGS_EXPORT jaro(PG_FUNCTION_ARGS);
extern Datum PGS_EXPORT jaro_op(PG_FUNCTION_ARGS);
extern Datum PGS_EXPORT jarowinkler(PG_FUNCTION_ARGS);
extern Datum PGS_EXPORT jarowinkler_op(PG_FUNCTION_ARGS);
extern Datum PGS_EXPORT lev(PG_FUNCTION_ARGS);
extern Datum PGS_EXPORT lev_op(PG_FUNCTION_ARGS);
extern Datum PGS_EXPORT levslow(PG_FUNCTION_ARGS);
extern Datum PGS_EXPORT levslow_op(PG_FUNCTION_ARGS);
extern Datum PGS_EXPORT matchingcoefficient(PG_FUNCTION_ARGS);
extern Datum PGS_EXPORT matchingcoefficient_op(PG_FUNCTION_ARGS);
extern Datum PGS_EXPORT mongeelkan(PG_FUNCTION_ARGS);
extern Datum PGS_EXPORT mongeelkan_op(PG_FUNCTION_ARGS);
extern Datum PGS_EXPORT needlemanwunsch(PG_FUNCTION_ARGS);
extern Datum PGS_EXPORT needlemanwunsch_op(PG_FUNCTION_ARGS);
extern Datum PGS_EXPORT overlapcoefficient(PG_FUNCTION_ARGS);
extern Datum PGS_EXPORT overlapcoefficient_op(PG_FUNCTION_ARGS);
extern Datum PGS_EXPORT qgram(PG_FUNCTION_ARGS);
extern Datum PGS_EXPORT qgram_op(PG_FUNCTION_ARGS);
extern Datum PGS_EXPORT smithwaterman(PG_FUNCTION_ARGS);
extern Datum PGS_EXPORT smithwaterman_op(PG_FUNCTION_ARGS);
extern Datum PGS_EXPORT smithwatermangotoh(PG_FUNCTION_ARGS);
extern Datum PGS_EXPORT smithwatermangotoh_op(PG_FUNCTION_ARGS);
243 244
/*
 * Soundex entry points: the function and its "_op" variant, both using the
 * PG_FUNCTION_ARGS calling convention and marked PGS_EXPORT.
 */
extern Datum PGS_EXPORT soundex(PG_FUNCTION_ARGS);
extern Datum PGS_EXPORT soundex_op(PG_FUNCTION_ARGS);
eulerto's avatar
eulerto committed
245 246

#endif