OCaml regex-redux update
CONTRIBUTE SOURCE CODE
(* The Computer Language Benchmarks Game
* https://salsa.debian.org/benchmarksgame-team/benchmarksgame/
*
* regex-dna program contributed by Christophe TROESTLER
* converted from regex-dna program
*
* updated by Roman Kashitsyn: use Bytes instead of String
* updated by Gaëtan Dubreil: use the Re library and parallelize processing
*)
open Printf
(* Patterns whose occurrences in the cleaned sequence are counted and
   reported, in this order.  Each entry is an alternation of two 8-character
   patterns. *)
let variants =
  [
    "agggtaaa|tttaccct";
    "[cgt]gggtaaa|tttaccc[acg]";
    "a[act]ggtaaa|tttacc[agt]t";
    "ag[act]gtaaa|tttac[agt]ct";
    "agg[act]taaa|ttta[agt]cct";
    "aggg[acg]aaa|ttt[cgt]ccct";
    "agggt[cgt]aa|tt[acg]accct";
    "agggta[cgt]a|t[acg]taccct";
    "agggtaa[cgt]|[acg]ttaccct";
  ]
(* Ordered (pattern, replacement) pairs.  The main program applies them one
   after another, each to the result of the previous one, so the order
   matters. *)
let subst =
  [
    ("tHa[Nt]", "<4>");
    ("aND|caN|Ha[DS]|WaS", "<3>");
    ("a[NSt]|BY", "<2>");
    ("<[^>]*>", "|");
    ("\\|[^|][^|]*\\|", "-");
  ]
(* Slurp the whole of stdin (a redirected FASTA file) into memory.
   [file_data] is the raw input; [file_length] is its size in bytes. *)
let file_data, file_length =
  let chunk_size = 0xFFF in
  let acc = Buffer.create 0xFFFF in
  let chunk = Bytes.create chunk_size in
  (* [input] returns 0 only at end of file, which ends the recursion. *)
  let rec slurp () =
    match input stdin chunk 0 chunk_size with
    | 0 -> ()
    | got ->
      Buffer.add_subbytes acc chunk 0 got;
      slurp ()
  in
  slurp ();
  (Buffer.contents acc, Buffer.length acc)
(* Strip the FASTA sequence descriptions (">..." up to the linefeed) and every
   remaining linefeed, leaving only the sequence data. *)
let dna =
  let noise = Re.Pcre.regexp ">.*\n|\n" in
  Re.replace_string noise ~by:"" file_data

(* Length of the cleaned sequence, reported by the main program. *)
let code_length = String.length dna
(* [count re s] compiles the PCRE-syntax pattern [re] and returns the number
   of non-overlapping matches found by scanning [s] from left to right, each
   search starting where the previous match ended.

   A pattern that can match the empty string used to make the scan loop
   forever, because the end of an empty match equals the position the search
   started from.  The scan now steps one character past an empty match and
   stops once the position runs past the end of [s]. *)
let count re s =
  let re = Re.Pcre.regexp re in
  let len = String.length s in
  let rec scan pos n =
    if pos > len then n
    else
      (* [match ... with exception] keeps the recursive call in tail
         position, unlike a surrounding [try ... with]. *)
      match Re.exec ~pos re s with
      | exception Not_found -> n
      | grps ->
        let start = Re.Group.start grps 0 in
        let stop = Re.Group.stop grps 0 in
        (* Always make progress, even when the match is empty. *)
        let next = if stop > start then stop else stop + 1 in
        scan next (n + 1)
  in
  scan 0 0
(* Entry point.  The process forks: the child prints one "<pattern> <count>"
   line per entry of [variants], while the parent applies the [subst]
   replacements in order.  The parent prints the three lengths only after
   [Unix.wait] has returned. *)
let () =
  match Unix.fork () with
  | 0 ->
    (* Child: count and report every variant. *)
    List.iter
      (fun pattern -> printf "%s %i\n" pattern (count pattern dna))
      variants
  | _ ->
    (* Parent: thread the sequence through each substitution in turn. *)
    let replaced =
      List.fold_left
        (fun text (pattern, replacement) ->
          Re.replace_string (Re.Pcre.regexp pattern) replacement text)
        dna subst
    in
    ignore (Unix.wait ());
    printf "\n%i\n%i\n%i\n" file_length code_length (String.length replaced)
Provide a helpful Title
OCaml regex-redux Gaëtan Dubreil
Attach your source code file
Provide an example build command-line
opam install re
ocamlopt -noassert -unsafe -fPIC -nodynlink -inline 100 -O3 -I $OPAM_SWITCH_PREFIX/lib/re unix.cmxa re.cmxa -ccopt -march=ivybridge regexredux.ocaml-3.ml -o regexredux.ocaml-3.ocaml_run
time ./regexredux.ocaml-3.ocaml_run < input5000000.txt
This update replaces the Str library with the Re library. Using the Str library is deprecated because it is not thread-safe, which will make it incompatible with OCaml 5 (Multicore OCaml). The update also runs the counting and replacement tasks in parallel, as the programs for other languages already do.
Thank you for your exciting site.