Commit fc87205d authored by Ralf Treinen's avatar Ralf Treinen

New upstream version 1.6

parent 2bc1b08f
......@@ -15,13 +15,17 @@ main.cmx: config.cmx configfile.cmx database.cmx mbox.cmx processing.cmx
mbox.cmo: mbox.cmi
mbox.cmx: mbox.cmi
processing.cmo: attachments.cmi config.cmi database.cmi mail.cmi rankmsg.cmi \
wordsplit.cmi processing.cmi
refhosts.cmi wordsplit.cmi processing.cmi
processing.cmx: attachments.cmx config.cmx database.cmx mail.cmx rankmsg.cmx \
wordsplit.cmx processing.cmi
rankmsg.cmo: config.cmi database.cmi mail.cmi wordsplit.cmi rankmsg.cmi
rankmsg.cmx: config.cmx database.cmx mail.cmx wordsplit.cmx rankmsg.cmi
testsplit.cmo: mail.cmi mbox.cmi wordsplit.cmi
testsplit.cmx: mail.cmx mbox.cmx wordsplit.cmx
refhosts.cmx wordsplit.cmx processing.cmi
rankmsg.cmo: config.cmi database.cmi mail.cmi refhosts.cmi wordsplit.cmi \
rankmsg.cmi
rankmsg.cmx: config.cmx database.cmx mail.cmx refhosts.cmx wordsplit.cmx \
rankmsg.cmi
refhosts.cmo: refhosts.cmi
refhosts.cmx: refhosts.cmi
virus.cmo: mail.cmi mbox.cmi zip.cmo
virus.cmx: mail.cmx mbox.cmx zip.cmx
wordsplit.cmo: wordsplit.cmi
wordsplit.cmx: wordsplit.cmi
attachments.cmi: mail.cmi
......
Release 1.6:
- Adapt to safe strings, using mutable byte arrays where needed.
Release 1.5:
- Be resilient to changes in OCaml's hash table implementation.
- Added command "spamoracle upgrade" to convert the database to
the current hash table format.
Release 1.4:
- More lenient rule for ignoring HTML comments.
- Recognition of S P A C E D O-U-T words.
......
......@@ -21,7 +21,7 @@ OCAMLOPT=ocamlopt
BYTEOBJS=configfile.cmo config.cmo \
htmlscan.cmo mail.cmo database.cmo mbox.cmo wordsplit.cmo \
rankmsg.cmo attachments.cmo processing.cmo main.cmo
refhosts.cmo rankmsg.cmo attachments.cmo processing.cmo main.cmo
BYTELIBS=unix.cma str.cma
NATOBJS=$(BYTEOBJS:.cmo=.cmx)
......
SpamOracle -- a spam classification tool
Version 1.3
Version 1.6
OVERVIEW:
......@@ -34,7 +34,7 @@ LICENSE:
REQUIREMENTS AND LIMITATIONS:
- To compile: Objective Caml, http://caml.inria.fr/
- To compile: Objective Caml, https://ocaml.org/, version 4.02 or later.
- To use:
......@@ -139,7 +139,7 @@ INITIALIZATION:
attachments for this message. Here, we have one attachment of type
"application/octect-stream", file name "Guangwen4.zip", and
character set "GB2312". The latter is an encoding for Chinese
and a sure sign that this is a Chinese spam
and a solid hint that this is a Chinese spam
(assuming that, like me, you can't read Chinese).
Normally, when running "spamoracle test goodmails", most messages
......
......@@ -10,7 +10,7 @@
(* *)
(***********************************************************************)
(* $Id: attachments.ml,v 1.3 2002/08/26 09:35:23 xleroy Exp $ *)
(* $Id$ *)
(* Summarize the attachments of a message as one line that can be
put in the header of the message. Allows procmail to filter
......
......@@ -10,7 +10,7 @@
(* *)
(***********************************************************************)
(* $Id: attachments.mli,v 1.2 2002/08/26 09:35:23 xleroy Exp $ *)
(* $Id$ *)
(* Summarize the attachments of a message as one line that can be
put in the header of the message. Allows procmail to filter
......
......@@ -12,6 +12,8 @@ let html_tag_attr = ref (Str.regexp_case_fold
let mail_headers = ref (Str.regexp_case_fold
"from:\\|subject:")
let alternative_favor_html = ref true
let spam_header = ref "X-Spam"
let attachments_header = ref "X-Attachments"
......@@ -22,14 +24,27 @@ let num_words_retained = ref 15
let max_repetitions = ref 2
let robinson_s = ref 0.0
let robinson_x = ref 0.5
let low_freq_limit = ref 0.01
let high_freq_limit = ref 0.99
let use_chi_square = ref false
let good_mail_prob = ref 0.2
let spam_mail_prob = ref 0.8
let min_meaningful_words = ref 5
let summarize_referenced = ref false
let referenced_header = ref "X-Referenced-Hosts"
let reassemble_words = ref false
let external_converter = ref ""
open Configfile
let options = [
......@@ -37,15 +52,23 @@ let options = [
"html_retain_tags", Bool html_add_tags;
"html_tag_attributes", Regexp html_tag_attr;
"mail_headers", Regexp mail_headers;
"alternative_favor_html", Bool alternative_favor_html;
"spam_header", String spam_header;
"attachments_header", String attachments_header;
"summarize_attachments", Bool summarize_attachments;
"referenced_header", String referenced_header;
"summarize_referenced", Bool summarize_referenced;
"num_meaningful_words", Int num_words_retained;
"max_repetitions", Int max_repetitions;
"low_freq_limit", Float low_freq_limit;
"high_freq_limit", Float high_freq_limit;
"min_meaningful_words", Int min_meaningful_words;
"good_mail_prob", Float good_mail_prob;
"spam_mail_prob", Float spam_mail_prob
"spam_mail_prob", Float spam_mail_prob;
"robinson_s", Float robinson_s;
"robinson_x", Float robinson_x;
"use_chi_square", Bool use_chi_square;
"reassemble_words", Bool reassemble_words;
"external_converter", String external_converter
]
......@@ -14,6 +14,10 @@ val html_tag_attr : Str.regexp ref
val mail_headers : Str.regexp ref
(** Regexp matching names of e-mail headers that must be analyzed. *)
val alternative_favor_html : bool ref
(** If true, consider only the HTML part of a multipart/alternative.
Otherwise, consider all parts. *)
val spam_header : string ref
(** Name of header added with spam / not-spam info (default: "X-Spam") *)
......@@ -31,10 +35,17 @@ val max_repetitions : int ref
can appear. *)
val low_freq_limit : float ref
(** Lower limit for word frequencies. Default is 0.01. *)
(** Lower limit for word frequencies. Default is 0.001. *)
val high_freq_limit : float ref
(** Upper limit for word frequencies. Default is 0.99. *)
(** Upper limit for word frequencies. Default is 0.999. *)
val robinson_s : float ref
val robinson_x : float ref
(** Robinson's parameters for taking word frequencies into account. *)
val use_chi_square : bool ref
(** Use Robinson's chi-square test *)
val min_meaningful_words : int ref
(** Number of meaningful words below which mails are classified as unknown *)
......@@ -43,5 +54,18 @@ val good_mail_prob : float ref
val spam_mail_prob : float ref
(** Spam probability below which mails are classified as spam *)
val summarize_referenced : bool ref
val referenced_header : string ref
val reassemble_words : bool ref
val external_converter : string ref
(** Program to be called on message parts that are not text.
The program receives the content-type as first argument
and the actual data on standard input.
It should output the corresponding text on standard output,
or exit with non-zero error code if it cannot extract text. *)
val options : (string * Configfile.value) list
(** List of configurable parameters *)
......@@ -98,4 +98,4 @@ let parse opts filename =
with End_of_file ->
close_in ic
end;
!errors
List.rev !errors
......@@ -10,7 +10,7 @@
(* *)
(***********************************************************************)
(* $Id: database.ml,v 1.8 2003/05/30 11:31:38 xleroy Exp $ *)
(* $Id$ *)
(* Word frequency database *)
......@@ -29,12 +29,17 @@ type full = {
f_low_freq: (string, int * int) Hashtbl.t
}
let magic = "Mailscrubber0001"
let magic = "Mailscrubber" (* + 4 digits for version number *)
let check_magic filename ic =
let buf = String.create (String.length magic) in
really_input ic buf 0 (String.length magic);
if buf <> magic then raise(Error(filename ^ ": bad magic number"))
let mlen = String.length magic in
let buf = really_input_string ic (mlen + 4) in
if String.sub buf 0 mlen <> magic then
raise(Error(filename ^ ": bad magic number"));
try
int_of_string (String.sub buf mlen 4)
with Failure _ ->
raise(Error(filename ^ ": bad magic number"));
type db_chan = {zipped : bool ; ic : in_channel}
......@@ -51,28 +56,43 @@ let close_db {zipped = zipped ; ic = ic } =
then ignore(Unix.close_process_in ic)
else close_in ic
let marshal_from_channel filename ic =
let current_version =
if Sys.ocaml_version < "4.03" then 1
else 2
let read_hashtbl filename ic version =
try
Marshal.from_channel ic
let tbl : ('a, 'b) Hashtbl.t = Marshal.from_channel ic in
if version = current_version then tbl
else if version > current_version then
raise (Error(filename ^ ": database version not supported"))
else begin
Printf.eprintf "%s: converting from version %d to version %d\n\
Run 'spamoracle upgrade' to suppress this warning.\n%!"
filename version current_version;
let tbl' = Hashtbl.create (Hashtbl.length tbl / 3) in
Hashtbl.iter (fun k d -> Hashtbl.add tbl' k d) tbl;
tbl'
end
with Failure _ ->
raise (Error(filename ^ ": database is corrupted"))
let read_short filename =
let {ic=ic ; zipped=zipped} as db_ic = open_db filename in
check_magic filename ic;
let version = check_magic filename ic in
let ng = input_binary_int ic in
let ns = input_binary_int ic in
let freq = marshal_from_channel filename ic in
let freq = read_hashtbl filename ic version in
close_db db_ic;
{ s_num_good = ng; s_num_spam = ns; s_freq = freq }
let read_full filename =
let {ic=ic ; zipped=zipped} as db_ic = open_db filename in
check_magic filename ic;
let version = check_magic filename ic in
let ng = input_binary_int ic in
let ns = input_binary_int ic in
let high_freq = marshal_from_channel filename ic in
let low_freq = marshal_from_channel filename ic in
let high_freq = read_hashtbl filename ic version in
let low_freq = read_hashtbl filename ic version in
close_db db_ic;
{ f_num_good = ng; f_num_spam = ns;
f_low_freq = low_freq; f_high_freq = high_freq }
......@@ -97,7 +117,7 @@ let write_full filename db =
else
filename, false in
let (tempname, oc) = temp_file (basename ^ ".tmp") in
output_string oc magic;
Printf.fprintf oc "%s%04d" magic current_version;
output_binary_int oc db.f_num_good;
output_binary_int oc db.f_num_spam;
Marshal.to_channel oc db.f_high_freq [Marshal.No_sharing];
......@@ -194,3 +214,6 @@ let restore ic =
()
end;
db
let in_short db w = Hashtbl.mem db.s_freq w
let in_full db w = Hashtbl.mem db.f_high_freq w
......@@ -10,7 +10,7 @@
(* *)
(***********************************************************************)
(* $Id: database.mli,v 1.4 2003/05/25 08:42:52 xleroy Exp $ *)
(* $Id$ *)
(** Word frequency database *)
......@@ -35,5 +35,8 @@ val add_good: full -> string -> unit
val add_spam: full -> string -> unit
val dump: full -> out_channel -> unit
val restore: in_channel -> full
val in_short: short -> string -> bool
val in_full: full -> string -> bool
val current_version: int
exception Error of string
......@@ -10,7 +10,7 @@
(* *)
(***********************************************************************)
(* $Id: htmlscan.mli,v 1.1 2003/01/02 08:52:15 xleroy Exp $ *)
(* $Id$ *)
(** Approximate HTML scanner. Extracts words and certain parameters
of certain tags (e.g. URLs) from HTML text. *)
......
......@@ -10,7 +10,7 @@
(* *)
(***********************************************************************)
(* $Id: htmlscan.mll,v 1.7 2003/08/28 14:39:02 xleroy Exp $ *)
(* $Id$ *)
(** Approximate HTML scanner. Extracts words from HTML text,
as well as certain parameters of certain tags (e.g. URLs). *)
......@@ -93,7 +93,7 @@ module Output = struct
if !Config.html_add_tags then add_extra ob t
let tag_attr ob t n s =
let n = String.lowercase n in
let n = String.lowercase_ascii n in
if Str.string_match !Config.html_tag_attr (t ^ "/" ^ n) 0 then
if n = "href" || n = "src"
then add_extra ob (decode_url s)
......@@ -117,12 +117,12 @@ rule main = parse
{ comment lexbuf; main lexbuf }
| "<" name
{ let s = Lexing.lexeme lexbuf in
tag := String.lowercase(String.sub s 1 (String.length s - 1));
tag := String.lowercase_ascii (String.sub s 1 (String.length s - 1));
tagbody lexbuf;
main lexbuf }
| "</" name
{ let s = Lexing.lexeme lexbuf in
tag := String.lowercase(String.sub s 2 (String.length s - 2));
tag := String.lowercase_ascii (String.sub s 2 (String.length s - 2));
tagbody lexbuf;
main lexbuf }
| "<" (* tolerance *)
......
......@@ -10,7 +10,7 @@
(* *)
(***********************************************************************)
(* $Id: mail.ml,v 1.8 2003/08/28 14:38:38 xleroy Exp $ *)
(* $Id$ *)
(** Parsing of e-mail messages, including attachments *)
......@@ -30,7 +30,7 @@ let base64_decode_char c =
let decode_base64 s =
let d = Buffer.create (String.length s * 3 / 4) in
let buf = Array.create 4 0 in
let buf = Array.make 4 0 in
let pos = ref 0 in
for i = 0 to String.length s - 1 do
let n = base64_decode_char s.[i] in
......@@ -119,7 +119,7 @@ let parse_header s =
[] -> List.rev accu
| line :: rem ->
if Str.string_match re_field line 0 then begin
let field_name = String.lowercase (Str.matched_group 1 line)
let field_name = String.lowercase_ascii (Str.matched_group 1 line)
and field_val = Str.matched_group 2 line in
parse_field ((field_name, decode_header field_val) :: accu) rem
end else
......@@ -169,6 +169,31 @@ let rec parse_message s =
body = s;
parts = [] }
let safe_remove fname = try Sys.remove fname with Sys_error _ -> ()
let run_body_through_external_converter cmd arg body =
let infile = Filename.temp_file "spamoracle" ".data" in
let outfile = Filename.temp_file "spamoracle" ".txt" in
let oc = open_out_bin infile in
let ic = open_in_bin outfile in
output_string oc body;
close_out oc;
let retcode =
Sys.command
(Printf.sprintf "%s %s < %s > %s"
cmd (Filename.quote arg) infile outfile) in
if retcode <> 0 then begin
close_in ic;
safe_remove infile; safe_remove outfile;
None
end else begin
let len = in_channel_length ic in
let res = really_input_string ic len in
close_in ic;
safe_remove infile; safe_remove outfile;
Some res
end
let header s msg =
let rec hdr = function
[] -> []
......@@ -187,8 +212,12 @@ let re_content_html =
Str.regexp_case_fold "text/html"
let re_content_message_rfc822 =
Str.regexp_case_fold "message/rfc822"
let re_content_alternative =
Str.regexp_case_fold "multipart/alternative"
let re_content_multipart =
Str.regexp_case_fold "multipart/"
let re_content_any_text =
Str.regexp_case_fold "text/"
let rec iter_text_parts fn m =
if header_matches "content-type:" re_content_text m
......@@ -196,12 +225,29 @@ let rec iter_text_parts fn m =
fn m
else if header_matches "content-type:" re_content_html m then
fn {m with body = Htmlscan.extract_text m.body}
else if header_matches "content-type:" re_content_multipart m then begin
else if header_matches "content-type:" re_content_alternative m then begin
try
if not !Config.alternative_favor_html then raise Not_found;
iter_text_parts fn
(List.find (header_matches "content-type:" re_content_html) m.parts)
with Not_found ->
fn m;
List.iter (iter_text_parts fn) m.parts
end else if header_matches "content-type:" re_content_multipart m then begin
fn m;
List.iter (iter_text_parts fn) m.parts
end else if header_matches "content-type:" re_content_message_rfc822 m then
iter_text_parts fn (parse_message m.body)
else
else if !Config.external_converter <> ""
&& not (header_matches "content-type:" re_content_any_text m)
then begin
match run_body_through_external_converter
!Config.external_converter
(header "content-type:" m)
m.body with
| None -> ()
| Some txt -> fn {m with body = txt}
end else
()
let iter_message fn msg =
......
......@@ -10,7 +10,7 @@
(* *)
(***********************************************************************)
(* $Id: mail.mli,v 1.3 2003/03/23 09:14:10 xleroy Exp $ *)
(* $Id$ *)
(** Parsing of e-mail messages, including attachments *)
......
......@@ -10,7 +10,7 @@
(* *)
(***********************************************************************)
(* $Id: main.ml,v 1.14 2003/08/30 13:14:11 xleroy Exp $ *)
(* $Id$ *)
(* Argument parsing and main program *)
......@@ -80,13 +80,9 @@ let list_command args =
let match_word w (g, s) =
if Str.string_match re w 0 then begin
let p =
if 2 * g + s < 5 then -1.0 else begin
let pgood = float (2 * g) /. float db.f_num_good
and pspam = float s /. float db.f_num_spam in
max !Config.low_freq_limit
(min !Config.high_freq_limit
(pspam /. (pgood +. pspam)))
end in
if 2 * g + s < 5
then -1.0
else Rankmsg.word_proba g s db.f_num_good db.f_num_spam in
res := (w, p, g, s) :: !res
end in
Hashtbl.iter match_word db.f_high_freq;
......@@ -148,23 +144,24 @@ let stat_command args =
let percentage a b =
100.0 *. float a /. float b in
if !num_msgs > 0 then
printf "%s: %.2f%% good, %.2f%% unknown, %.2f%% spam\n"
printf "%s: %d (%.2f%%) good, %d (%.2f%%) unknown, %d (%.2f%%) spam\n"
f
(percentage !num_good !num_msgs)
(percentage !num_unknown !num_msgs)
(percentage !num_spam !num_msgs)
!num_good (percentage !num_good !num_msgs)
!num_unknown (percentage !num_unknown !num_msgs)
!num_spam (percentage !num_spam !num_msgs)
in List.iter stat_mbox args
let words_command args =
let db = Database.read_short !Config.database_name in
if args = [] then
wordsplit_message (read_single_msg stdin)
wordsplit_message db (read_single_msg stdin)
else
List.iter
(fun f ->
mbox_file_iter f
(fun msg ->
print_string "----------------------------------------\n";
wordsplit_message msg))
wordsplit_message db msg))
args
let backup_command () =
......@@ -173,6 +170,13 @@ let backup_command () =
let restore_command () =
Database.write_full !Config.database_name (Database.restore stdin)
let upgrade_command () =
let db = Database.read_full !Config.database_name in
Database.write_full !Config.database_name db;
printf "Converted %s to version %d.\n"
!Config.database_name
Database.current_version
let rec parse_args_1 = function
"-config" :: file :: rem ->
parse_config_file file; parse_args_2 rem
......@@ -208,6 +212,8 @@ and parse_args_3 = function
restore_command ()
| "words" :: rem ->
words_command rem
| "upgrade" :: rem ->
upgrade_command ()
| s :: rem ->
raise(Usage("Unknown command " ^ s))
| [] ->
......@@ -249,6 +255,9 @@ Usage:
spamoracle [-config conf] [-f db] restore < database.backup
Restore database from text backup file read from standard input
spamoracle [-config conf] [-f db] upgrade
Convert database to the latest format
spamoracle [-config conf] [-f db] words {mailbox}*
Extract words from messages and print them
{mailbox}* Mailboxes containing messages to scan
......
......@@ -10,7 +10,7 @@
(* *)
(***********************************************************************)
(* $Id: mbox.ml,v 1.4 2002/08/26 09:35:25 xleroy Exp $ *)
(* $Id$ *)
(* Reading of a mailbox file and splitting into individual messages *)
......@@ -83,11 +83,11 @@ let mbox_channel_iter inchan fn =
let read_single_msg inchan =
let res = Buffer.create 10000 in
let buf = String.create 1024 in
let buf = Bytes.create 1024 in
let rec read () =
let n = input inchan buf 0 (String.length buf) in
let n = input inchan buf 0 (Bytes.length buf) in
if n > 0 then begin
Buffer.add_substring res buf 0 n;
Buffer.add_subbytes res buf 0 n;
read ()
end in
read ();
......
......@@ -10,7 +10,7 @@
(* *)
(***********************************************************************)
(* $Id: mbox.mli,v 1.3 2002/08/26 09:35:25 xleroy Exp $ *)
(* $Id$ *)
(** Reading of a mailbox file and splitting into individual messages *)
......
......@@ -10,7 +10,7 @@
(* *)
(***********************************************************************)
(* $Id: processing.ml,v 1.8 2003/08/30 13:14:11 xleroy Exp $ *)
(* $Id$ *)
(* Processing messages *)
......@@ -28,7 +28,7 @@ let mark_message db txt =
let r = rank_message db m in
try
let pos_sep = Str.search_forward re_nl_nl txt 0 in
output stdout txt 0 pos_sep;
output_substring stdout txt 0 pos_sep;
let verdict =
if r.spam_prob <= !Config.good_mail_prob
&& r.num_meaningful >= !Config.min_meaningful_words then "no"
......@@ -42,7 +42,12 @@ let mark_message db txt =
if att <> "" then
printf "\n%s: %s" !Config.attachments_header att;
end;
output stdout txt pos_sep (String.length txt - pos_sep)
if !Config.summarize_referenced then begin
let refh = Refhosts.summarize () in
if refh <> "" then
printf "\n%s: %s" !Config.referenced_header refh;
end;
output_substring stdout txt pos_sep (String.length txt - pos_sep)
with Not_found ->
print_string txt
......@@ -52,7 +57,7 @@ let record_words db is_spam txt =
Wordsplit.iter
(fun w ->
if is_spam then add_spam db w else add_good db w)
txt
(in_full db) txt
let add_message db verbose is_spam msg =
if verbose then begin
......@@ -79,6 +84,10 @@ let test_message db low high f txt =
let att = Attachments.summarize msg in
if att <> "" then printf "Attachments: %s\n" att
end;
if !Config.summarize_referenced then begin
let refh = Refhosts.summarize () in
if refh <> "" then printf "Referenced hosts: %s\n" refh
end;
printf "File: %s\n" f;
end
......@@ -95,13 +104,13 @@ let stat_message db txt =
(* Word splitting *)
let wordsplit_message txt =
let wordsplit_message db txt =
Format.open_hovbox 0;
Mail.iter_message
(fun txt ->
Wordsplit.iter
(fun word -> Format.print_string word; Format.print_space())
txt)
(in_short db) txt)
(parse_message txt);
Format.close_box();
Format.print_newline()
......@@ -10,7 +10,7 @@
(* *)
(***********************************************************************)
(* $Id: processing.mli,v 1.4 2003/08/30 13:14:11 xleroy Exp $ *)
(* $Id$ *)
(* Processing messages *)
......@@ -20,4 +20,4 @@ val add_message : Database.full -> bool -> bool -> string -> unit
val test_message : Database.short -> float -> float -> string -> string -> unit
type message_class = Msg_good | Msg_unknown | Msg_spam
val stat_message : Database.short -> string -> message_class
val wordsplit_message : string -> unit
val wordsplit_message : Database.short -> string -> unit
......@@ -10,7 +10,7 @@
(* *)
(***********************************************************************)
(* $Id: rankmsg.ml,v 1.4 2003/03/23 09:14:11 xleroy Exp $ *)
(* $Id$ *)
(* Message ranking *)
......@@ -41,32 +41,91 @@ let add_word w p res =
let normalize (p : float) low high =
if p > high then high else if p < low then low else p
let cap (p : float) =
if p > 1.0 then 1.0 else p
let word_proba g b num_g num_b =
let g = 2 * g in (* Graham's magic factor to bias in favor of ham *)
let pgood = cap (float g /. float num_g)
and pbad = cap (float b /. float num_b) in
let p = pbad /. (pgood +. pbad) in
if !Config.robinson_s = 0.0 then
normalize p !Config.low_freq_limit !Config.high_freq_limit
else begin
(* Robinson's adjustement *)
let n = float (g + b) in
let p =
(!Config.robinson_s *. !Config.robinson_x +. n *. p)
/. (!Config.robinson_s +. n) in
(* Result normalization *)
normalize p !Config.low_freq_limit !Config.high_freq_limit
end
let process_word (db, res) w =
try
let (g, b) = Hashtbl.find db.s_freq w in
if word_count_in w res < !Config.max_repetitions then begin
let g = 2 * g in
let pgood = float g /. float db.s_num_good
and pbad = float b /. float db.s_num_spam in
let p =
normalize (pbad /. (pgood +. pbad))
!Config.low_freq_limit !Config.high_freq_limit in
let p = word_proba g b db.s_num_good db.s_num_spam in
add_word w p res
end
with Not_found ->
()
let process_words ctx txt =
Wordsplit.iter (process_word ctx) txt
let process_words ((db, res) as ctx) txt =
Wordsplit.iter (process_word ctx) (in_short db) txt;
if !Config.summarize_referenced then Refhosts.add txt
let process_msg ctx m =
iter_message (process_words ctx) m
let bayes_rule res =
let probs = List.map snd (Array.to_list res) in
let prod = List.fold_left ( *. ) 1.0 probs
and cprod = List.fold_left ( *. ) 1.0 (List.map (fun x -> 1.0 -. x) probs) in
prod /. (prod +. cprod)
(* This is Graham's original approach *)
let spaminess_score_graham res =
let p = ref 1.0 and pexp = ref 0
and cp = ref 1.0 and cpexp = ref 0 in
for i = 0 to Array.length res - 1 do
let (_, x) = res.(i) in
p := !p *. x;
if !p <= 1e-100 then begin
let (m, e) = frexp !p in p := m; pexp := !pexp + e