Skip to content

Commits on Source 6

......@@ -15,13 +15,17 @@ main.cmx: config.cmx configfile.cmx database.cmx mbox.cmx processing.cmx
mbox.cmo: mbox.cmi
mbox.cmx: mbox.cmi
processing.cmo: attachments.cmi config.cmi database.cmi mail.cmi rankmsg.cmi \
wordsplit.cmi processing.cmi
refhosts.cmi wordsplit.cmi processing.cmi
processing.cmx: attachments.cmx config.cmx database.cmx mail.cmx rankmsg.cmx \
wordsplit.cmx processing.cmi
rankmsg.cmo: config.cmi database.cmi mail.cmi wordsplit.cmi rankmsg.cmi
rankmsg.cmx: config.cmx database.cmx mail.cmx wordsplit.cmx rankmsg.cmi
testsplit.cmo: mail.cmi mbox.cmi wordsplit.cmi
testsplit.cmx: mail.cmx mbox.cmx wordsplit.cmx
refhosts.cmx wordsplit.cmx processing.cmi
rankmsg.cmo: config.cmi database.cmi mail.cmi refhosts.cmi wordsplit.cmi \
rankmsg.cmi
rankmsg.cmx: config.cmx database.cmx mail.cmx refhosts.cmx wordsplit.cmx \
rankmsg.cmi
refhosts.cmo: refhosts.cmi
refhosts.cmx: refhosts.cmi
virus.cmo: mail.cmi mbox.cmi zip.cmo
virus.cmx: mail.cmx mbox.cmx zip.cmx
wordsplit.cmo: wordsplit.cmi
wordsplit.cmx: wordsplit.cmi
attachments.cmi: mail.cmi
......
Release 1.6:
- Adapt to safe strings, using mutable byte arrays where needed.
Release 1.5:
- Be resilient to changes in OCaml's hash table implementation.
- Added command "spamoracle upgrade" to convert the database to
the current hash table format.
Release 1.4:
- More lenient rule for ignoring HTML comments.
- Recognition of S P A C E D O-U-T words.
......
......@@ -21,7 +21,7 @@ OCAMLOPT=ocamlopt
BYTEOBJS=configfile.cmo config.cmo \
htmlscan.cmo mail.cmo database.cmo mbox.cmo wordsplit.cmo \
rankmsg.cmo attachments.cmo processing.cmo main.cmo
refhosts.cmo rankmsg.cmo attachments.cmo processing.cmo main.cmo
BYTELIBS=unix.cma str.cma
NATOBJS=$(BYTEOBJS:.cmo=.cmx)
......
SpamOracle -- a spam classification tool
Version 1.3
Version 1.6
OVERVIEW:
......@@ -34,7 +34,7 @@ LICENSE:
REQUIREMENTS AND LIMITATIONS:
- To compile: Objective Caml, http://caml.inria.fr/
- To compile: Objective Caml, https://ocaml.org/, version 4.02 or later.
- To use:
......@@ -139,7 +139,7 @@ INITIALIZATION:
attachments for this message. Here, we have one attachment of type
"application/octect-stream", file name "Guangwen4.zip", and
character set "GB2312". The latter is an encoding for Chinese
and a sure sign that this is a Chinese spam
and a solid hint that this is a Chinese spam
(assuming that, like me, you can't read Chinese).
Normally, when running "spamoracle test goodmails", most messages
......
......@@ -10,7 +10,7 @@
(* *)
(***********************************************************************)
(* $Id: attachments.ml,v 1.3 2002/08/26 09:35:23 xleroy Exp $ *)
(* $Id$ *)
(* Summarize the attachments of a message as one line that can be
put in the header of the message. Allows procmail to filter
......
......@@ -10,7 +10,7 @@
(* *)
(***********************************************************************)
(* $Id: attachments.mli,v 1.2 2002/08/26 09:35:23 xleroy Exp $ *)
(* $Id$ *)
(* Summarize the attachments of a message as one line that can be
put in the header of the message. Allows procmail to filter
......
......@@ -12,6 +12,8 @@ let html_tag_attr = ref (Str.regexp_case_fold
let mail_headers = ref (Str.regexp_case_fold
"from:\\|subject:")
let alternative_favor_html = ref true
let spam_header = ref "X-Spam"
let attachments_header = ref "X-Attachments"
......@@ -22,14 +24,27 @@ let num_words_retained = ref 15
let max_repetitions = ref 2
let robinson_s = ref 0.0
let robinson_x = ref 0.5
let low_freq_limit = ref 0.01
let high_freq_limit = ref 0.99
let use_chi_square = ref false
let good_mail_prob = ref 0.2
let spam_mail_prob = ref 0.8
let min_meaningful_words = ref 5
let summarize_referenced = ref false
let referenced_header = ref "X-Referenced-Hosts"
let reassemble_words = ref false
let external_converter = ref ""
open Configfile
let options = [
......@@ -37,15 +52,23 @@ let options = [
"html_retain_tags", Bool html_add_tags;
"html_tag_attributes", Regexp html_tag_attr;
"mail_headers", Regexp mail_headers;
"alternative_favor_html", Bool alternative_favor_html;
"spam_header", String spam_header;
"attachments_header", String attachments_header;
"summarize_attachments", Bool summarize_attachments;
"referenced_header", String referenced_header;
"summarize_referenced", Bool summarize_referenced;
"num_meaningful_words", Int num_words_retained;
"max_repetitions", Int max_repetitions;
"low_freq_limit", Float low_freq_limit;
"high_freq_limit", Float high_freq_limit;
"min_meaningful_words", Int min_meaningful_words;
"good_mail_prob", Float good_mail_prob;
"spam_mail_prob", Float spam_mail_prob
"spam_mail_prob", Float spam_mail_prob;
"robinson_s", Float robinson_s;
"robinson_x", Float robinson_x;
"use_chi_square", Bool use_chi_square;
"reassemble_words", Bool reassemble_words;
"external_converter", String external_converter
]
......@@ -14,6 +14,10 @@ val html_tag_attr : Str.regexp ref
val mail_headers : Str.regexp ref
(** Regexp matching names of e-mail headers that must be analyzed. *)
val alternative_favor_html : bool ref
(** If true, consider only the HTML part of a multipart/alternative.
Otherwise, consider all parts. *)
val spam_header : string ref
(** Name of header added with spam / not-spam info (default: "X-Spam") *)
......@@ -31,10 +35,17 @@ val max_repetitions : int ref
can appear. *)
val low_freq_limit : float ref
(** Lower limit for word frequencies. Default is 0.01. *)
(** Lower limit for word frequencies. Default is 0.001. *)
val high_freq_limit : float ref
(** Upper limit for word frequencies. Default is 0.99. *)
(** Upper limit for word frequencies. Default is 0.999. *)
val robinson_s : float ref
val robinson_x : float ref
(** Robinson's parameters for taking word frequencies into account. *)
val use_chi_square : bool ref
(** Use Robinson's chi-square test *)
val min_meaningful_words : int ref
(** Number of meaningful words below which mails are classified as unknown *)
......@@ -43,5 +54,18 @@ val good_mail_prob : float ref
val spam_mail_prob : float ref
(** Spam probability below which mails are classified as spam *)
val summarize_referenced : bool ref
val referenced_header : string ref
val reassemble_words : bool ref
val external_converter : string ref
(** Program to be called on message parts that are not text.
The program receives the content-type as first argument
and the actual data on standard input.
It should output the corresponding text on standard output,
or exit with non-zero error code if it cannot extract text. *)
val options : (string * Configfile.value) list
(** List of configurable parameters *)
......@@ -98,4 +98,4 @@ let parse opts filename =
with End_of_file ->
close_in ic
end;
!errors
List.rev !errors
......@@ -10,7 +10,7 @@
(* *)
(***********************************************************************)
(* $Id: database.ml,v 1.8 2003/05/30 11:31:38 xleroy Exp $ *)
(* $Id$ *)
(* Word frequency database *)
......@@ -29,12 +29,17 @@ type full = {
f_low_freq: (string, int * int) Hashtbl.t
}
let magic = "Mailscrubber0001"
let magic = "Mailscrubber" (* + 4 digits for version number *)
let check_magic filename ic =
let buf = String.create (String.length magic) in
really_input ic buf 0 (String.length magic);
if buf <> magic then raise(Error(filename ^ ": bad magic number"))
let mlen = String.length magic in
let buf = really_input_string ic (mlen + 4) in
if String.sub buf 0 mlen <> magic then
raise(Error(filename ^ ": bad magic number"));
try
int_of_string (String.sub buf mlen 4)
with Failure _ ->
raise(Error(filename ^ ": bad magic number"));
type db_chan = {zipped : bool ; ic : in_channel}
......@@ -51,28 +56,43 @@ let close_db {zipped = zipped ; ic = ic } =
then ignore(Unix.close_process_in ic)
else close_in ic
let marshal_from_channel filename ic =
let current_version =
if Sys.ocaml_version < "4.03" then 1
else 2
let read_hashtbl filename ic version =
try
Marshal.from_channel ic
let tbl : ('a, 'b) Hashtbl.t = Marshal.from_channel ic in
if version = current_version then tbl
else if version > current_version then
raise (Error(filename ^ ": database version not supported"))
else begin
Printf.eprintf "%s: converting from version %d to version %d\n\
Run 'spamoracle upgrade' to suppress this warning.\n%!"
filename version current_version;
let tbl' = Hashtbl.create (Hashtbl.length tbl / 3) in
Hashtbl.iter (fun k d -> Hashtbl.add tbl' k d) tbl;
tbl'
end
with Failure _ ->
raise (Error(filename ^ ": database is corrupted"))
let read_short filename =
let {ic=ic ; zipped=zipped} as db_ic = open_db filename in
check_magic filename ic;
let version = check_magic filename ic in
let ng = input_binary_int ic in
let ns = input_binary_int ic in
let freq = marshal_from_channel filename ic in
let freq = read_hashtbl filename ic version in
close_db db_ic;
{ s_num_good = ng; s_num_spam = ns; s_freq = freq }
let read_full filename =
let {ic=ic ; zipped=zipped} as db_ic = open_db filename in
check_magic filename ic;
let version = check_magic filename ic in
let ng = input_binary_int ic in
let ns = input_binary_int ic in
let high_freq = marshal_from_channel filename ic in
let low_freq = marshal_from_channel filename ic in
let high_freq = read_hashtbl filename ic version in
let low_freq = read_hashtbl filename ic version in
close_db db_ic;
{ f_num_good = ng; f_num_spam = ns;
f_low_freq = low_freq; f_high_freq = high_freq }
......@@ -97,7 +117,7 @@ let write_full filename db =
else
filename, false in
let (tempname, oc) = temp_file (basename ^ ".tmp") in
output_string oc magic;
Printf.fprintf oc "%s%04d" magic current_version;
output_binary_int oc db.f_num_good;
output_binary_int oc db.f_num_spam;
Marshal.to_channel oc db.f_high_freq [Marshal.No_sharing];
......@@ -194,3 +214,6 @@ let restore ic =
()
end;
db
let in_short db w = Hashtbl.mem db.s_freq w
let in_full db w = Hashtbl.mem db.f_high_freq w
......@@ -10,7 +10,7 @@
(* *)
(***********************************************************************)
(* $Id: database.mli,v 1.4 2003/05/25 08:42:52 xleroy Exp $ *)
(* $Id$ *)
(** Word frequency database *)
......@@ -35,5 +35,8 @@ val add_good: full -> string -> unit
val add_spam: full -> string -> unit
val dump: full -> out_channel -> unit
val restore: in_channel -> full
val in_short: short -> string -> bool
val in_full: full -> string -> bool
val current_version: int
exception Error of string
spamoracle (1.4-17) UNRELEASED; urgency=medium
spamoracle (1.6-1) unstable; urgency=medium
* Team upload
* New upstream release, fixes FTBFS with OCaml 4.08 (closes: #944235)
- drop our patch 0003-Fix-compilation-with-OCaml-4.08.0
* Build-depend on debian-compat
- remove debian/compat
* Debhelper compatibility level 12
* Set debian/watch to github
* Update homepage to github
* Drop Breaks/Replaces/Provides spamoracle-byte as that package has been
removed 10 years ago.
* Standards-Version 4.4.1 (no change)
-- Ralf Treinen <treinen@debian.org> Sat, 09 Nov 2019 18:47:46 +0100
-- Ralf Treinen <treinen@debian.org> Mon, 11 Nov 2019 16:10:22 +0100
spamoracle (1.4-16) unstable; urgency=medium
......
......@@ -3,18 +3,15 @@ Section: net
Priority: optional
Maintainer: Debian OCaml Maintainers <debian-ocaml-maint@lists.debian.org>
Uploaders: Samuel Mimram <smimram@debian.org>, Stefano Zacchiroli <zack@debian.org>
Build-Depends: debhelper (>= 10), dh-ocaml, ocaml-nox (>= 3.11)
Standards-Version: 3.9.1
Homepage: http://pauillac.inria.fr/~xleroy/software.html#spamoracle
Build-Depends: debhelper-compat (= 12), dh-ocaml, ocaml-nox (>= 3.11)
Standards-Version: 4.4.1
Homepage: https://github.com/xavierleroy/spamoracle/
Vcs-Git: https://salsa.debian.org/ocaml-team/spamoracle.git
Vcs-Browser: https://salsa.debian.org/ocaml-team/spamoracle
Package: spamoracle
Architecture: any
Depends: ${ocaml:Depends}, ${shlibs:Depends}, ${misc:Depends}
Conflicts: spamoracle-byte
Replaces: spamoracle-byte
Provides: spamoracle-byte
Description: statistical analysis spam filter based on Bayes' formula
SpamOracle, a.k.a. "Saint Peter", is a tool to help detect and filter away
"spam" (unsolicited commercial e-mail). It proceeds by statistical analysis
......
From: Stephane Glondu <steph@glondu.net>
Date: Sun, 8 Sep 2019 18:45:45 +0200
Subject: Fix compilation with OCaml 4.08.0
---
database.ml | 2 +-
mbox.ml | 4 ++--
processing.ml | 4 ++--
3 files changed, 5 insertions(+), 5 deletions(-)
diff --git a/database.ml b/database.ml
index 3e7e951..9c8aee1 100644
--- a/database.ml
+++ b/database.ml
@@ -34,7 +34,7 @@ let magic = "Mailscrubber0001"
let check_magic filename ic =
let buf = String.create (String.length magic) in
really_input ic buf 0 (String.length magic);
- if buf <> magic then raise(Error(filename ^ ": bad magic number"))
+ if Bytes.to_string buf <> magic then raise(Error(filename ^ ": bad magic number"))
type db_chan = {zipped : bool ; ic : in_channel}
diff --git a/mbox.ml b/mbox.ml
index 0c88d96..7d633d1 100644
--- a/mbox.ml
+++ b/mbox.ml
@@ -85,9 +85,9 @@ let read_single_msg inchan =
let res = Buffer.create 10000 in
let buf = String.create 1024 in
let rec read () =
- let n = input inchan buf 0 (String.length buf) in
+ let n = input inchan buf 0 (Bytes.length buf) in
if n > 0 then begin
- Buffer.add_substring res buf 0 n;
+ Buffer.add_subbytes res buf 0 n;
read ()
end in
read ();
diff --git a/processing.ml b/processing.ml
index d8c4cdf..d062418 100644
--- a/processing.ml
+++ b/processing.ml
@@ -28,7 +28,7 @@ let mark_message db txt =
let r = rank_message db m in
try
let pos_sep = Str.search_forward re_nl_nl txt 0 in
- output stdout txt 0 pos_sep;
+ output stdout (Bytes.of_string txt) 0 pos_sep;
let verdict =
if r.spam_prob <= !Config.good_mail_prob
&& r.num_meaningful >= !Config.min_meaningful_words then "no"
@@ -42,7 +42,7 @@ let mark_message db txt =
if att <> "" then
printf "\n%s: %s" !Config.attachments_header att;
end;
- output stdout txt pos_sep (String.length txt - pos_sep)
+ output stdout (Bytes.of_string txt) pos_sep (String.length txt - pos_sep)
with Not_found ->
print_string txt
0001-Makefiles-configure-scripts-and-other-build-stuff-ad.patch
0002-Remove-characters-which-are-not-allowed-in-manpages.patch
0003-Fix-compilation-with-OCaml-4.08.0.patch
......@@ -10,7 +10,7 @@
(* *)
(***********************************************************************)
(* $Id: htmlscan.mli,v 1.1 2003/01/02 08:52:15 xleroy Exp $ *)
(* $Id$ *)
(** Approximate HTML scanner. Extracts words and certain parameters
of certain tags (e.g. URLs) from HTML text. *)
......
......@@ -10,7 +10,7 @@
(* *)
(***********************************************************************)
(* $Id: htmlscan.mll,v 1.7 2003/08/28 14:39:02 xleroy Exp $ *)
(* $Id$ *)
(** Approximate HTML scanner. Extracts words from HTML text,
as well as certain parameters of certain tags (e.g. URLs). *)
......@@ -93,7 +93,7 @@ module Output = struct
if !Config.html_add_tags then add_extra ob t
let tag_attr ob t n s =
let n = String.lowercase n in
let n = String.lowercase_ascii n in
if Str.string_match !Config.html_tag_attr (t ^ "/" ^ n) 0 then
if n = "href" || n = "src"
then add_extra ob (decode_url s)
......@@ -117,12 +117,12 @@ rule main = parse
{ comment lexbuf; main lexbuf }
| "<" name
{ let s = Lexing.lexeme lexbuf in
tag := String.lowercase(String.sub s 1 (String.length s - 1));
tag := String.lowercase_ascii (String.sub s 1 (String.length s - 1));
tagbody lexbuf;
main lexbuf }
| "</" name
{ let s = Lexing.lexeme lexbuf in
tag := String.lowercase(String.sub s 2 (String.length s - 2));
tag := String.lowercase_ascii (String.sub s 2 (String.length s - 2));
tagbody lexbuf;
main lexbuf }
| "<" (* tolerance *)
......
......@@ -10,7 +10,7 @@
(* *)
(***********************************************************************)
(* $Id: mail.ml,v 1.8 2003/08/28 14:38:38 xleroy Exp $ *)
(* $Id$ *)
(** Parsing of e-mail messages, including attachments *)
......@@ -30,7 +30,7 @@ let base64_decode_char c =
let decode_base64 s =
let d = Buffer.create (String.length s * 3 / 4) in
let buf = Array.create 4 0 in
let buf = Array.make 4 0 in
let pos = ref 0 in
for i = 0 to String.length s - 1 do
let n = base64_decode_char s.[i] in
......@@ -119,7 +119,7 @@ let parse_header s =
[] -> List.rev accu
| line :: rem ->
if Str.string_match re_field line 0 then begin
let field_name = String.lowercase (Str.matched_group 1 line)
let field_name = String.lowercase_ascii (Str.matched_group 1 line)
and field_val = Str.matched_group 2 line in
parse_field ((field_name, decode_header field_val) :: accu) rem
end else
......@@ -169,6 +169,31 @@ let rec parse_message s =
body = s;
parts = [] }
let safe_remove fname = try Sys.remove fname with Sys_error _ -> ()
let run_body_through_external_converter cmd arg body =
let infile = Filename.temp_file "spamoracle" ".data" in
let outfile = Filename.temp_file "spamoracle" ".txt" in
let oc = open_out_bin infile in
let ic = open_in_bin outfile in
output_string oc body;
close_out oc;
let retcode =
Sys.command
(Printf.sprintf "%s %s < %s > %s"
cmd (Filename.quote arg) infile outfile) in
if retcode <> 0 then begin
close_in ic;
safe_remove infile; safe_remove outfile;
None
end else begin
let len = in_channel_length ic in
let res = really_input_string ic len in
close_in ic;
safe_remove infile; safe_remove outfile;
Some res
end
let header s msg =
let rec hdr = function
[] -> []
......@@ -187,8 +212,12 @@ let re_content_html =
Str.regexp_case_fold "text/html"
let re_content_message_rfc822 =
Str.regexp_case_fold "message/rfc822"
let re_content_alternative =
Str.regexp_case_fold "multipart/alternative"
let re_content_multipart =
Str.regexp_case_fold "multipart/"
let re_content_any_text =
Str.regexp_case_fold "text/"
let rec iter_text_parts fn m =
if header_matches "content-type:" re_content_text m
......@@ -196,12 +225,29 @@ let rec iter_text_parts fn m =
fn m
else if header_matches "content-type:" re_content_html m then
fn {m with body = Htmlscan.extract_text m.body}
else if header_matches "content-type:" re_content_multipart m then begin
else if header_matches "content-type:" re_content_alternative m then begin
try
if not !Config.alternative_favor_html then raise Not_found;
iter_text_parts fn
(List.find (header_matches "content-type:" re_content_html) m.parts)
with Not_found ->
fn m;
List.iter (iter_text_parts fn) m.parts
end else if header_matches "content-type:" re_content_multipart m then begin
fn m;
List.iter (iter_text_parts fn) m.parts
end else if header_matches "content-type:" re_content_message_rfc822 m then
iter_text_parts fn (parse_message m.body)
else
else if !Config.external_converter <> ""
&& not (header_matches "content-type:" re_content_any_text m)
then begin
match run_body_through_external_converter
!Config.external_converter
(header "content-type:" m)
m.body with
| None -> ()
| Some txt -> fn {m with body = txt}
end else
()
let iter_message fn msg =
......
......@@ -10,7 +10,7 @@
(* *)
(***********************************************************************)
(* $Id: mail.mli,v 1.3 2003/03/23 09:14:10 xleroy Exp $ *)
(* $Id$ *)
(** Parsing of e-mail messages, including attachments *)
......