Skip to content
Commits on Source (2)
1.9.0 (05-Apr-2019)
-------------------
* Fix regression in `Re.exec_partial` (#164)
* Move gen-related functions to `Re.Gen` and deprecate the old names (#167)
* Introduce `Re.View` that exposes the internal representation (#163)
1.8.0 (04-Aug-2018)
-------------------
* Fix index-out-of-bounds exception in Re.Perl.re (#160)
* Add seq based iterators (#170)
1.7.3 (05-Mar-2018)
-------------------
......
Requirements
The installation procedure defined in the Makefile requires findlib
(http://www.ocaml-programming.de/packages/documentation/findlib/).
Installation
- Compile with "make all".
- If you have ocamlopt, do also "make opt".
- Become super-user if necessary and do "make install"
(A "make uninstall" removes the library.)
JBUILDER ?= jbuilder
DUNE ?= dune
all:
@$(JBUILDER) build
@$(DUNE) build
test:
@$(JBUILDER) runtest
@$(DUNE) runtest
check: test
clean:
@$(JBUILDER) clean
@$(DUNE) clean
.PHONY: check test all clean
.PHONY: all-supported-ocaml-versions
all-supported-ocaml-versions:
jbuilder build @runtest --workspace jbuild-workspace.dev
dune build @runtest --workspace dune-workspace.dev
......@@ -18,20 +18,18 @@ Features
The following styles of regular expressions are supported:
- Perl-style regular expressions (module `Re.Perl`);
- Posix extended regular expressions (module `Re_posix`);
- Posix extended regular expressions (module `Re.Posix`);
- Emacs-style regular expressions (module `Re.Emacs`);
- Shell-style file globbing (module `Re_glob`).
- Shell-style file globbing (module `Re.Glob`).
It is also possible to build regular expressions by combining simpler
regular expressions (module `Re`).
It is also possible to build regular expressions by combining simpler regular
expressions (module `Re`).
The most notable missing features are **back-references** and
look-ahead/look-behind **assertions**.
There is also a subset of the PCRE interface available in the
`Re.pcre` library. This makes it easier to port code from that
library to Re without any changes beyond replacing the `pcre`
findlib package with `re.pcre`.
There is also a subset of the PCRE interface available in the `Re.Pcre` module.
This makes it easier to port code from that library to Re with minimal changes.
Performances
============
......
(executable
(libraries re threads core_bench)
(name benchmark))
(jbuild_version 1)
(executable
((libraries (re threads core_bench))
(name benchmark)))
(library
(name re_str)
(public_name re.str)
(wrapped false)
(modules re_str)
(synopsis "Deprecated. Use Re.Str")
(libraries re))
(library
(name re_pcre)
(public_name re.pcre)
(wrapped false)
(modules re_pcre)
(synopsis "Deprecated. Use Re.Pcre")
(libraries re))
(library
(name re_perl)
(public_name re.perl)
(wrapped false)
(modules re_perl)
(synopsis "Deprecated. Use Re.Perl")
(libraries re))
(library
(name re_posix)
(public_name re.posix)
(wrapped false)
(modules re_posix)
(synopsis "Deprecated. Use Re.Posix")
(libraries re))
(library
(name re_emacs)
(public_name re.emacs)
(wrapped false)
(modules re_emacs)
(synopsis "Deprecated. Use Re.Emacs")
(libraries re))
(library
(name re_glob)
(public_name re.glob)
(wrapped false)
(modules re_glob)
(synopsis "Deprecated. Use Re.Glob")
(libraries re))
(jbuild_version 1)
(library
((name re_str)
(public_name re.str)
(wrapped false)
(modules (re_str))
(synopsis "Deprecated. Use Re.Str")
(libraries (re))))
(library
((name re_pcre)
(public_name re.pcre)
(wrapped false)
(modules (re_pcre))
(synopsis "Deprecated. Use Re.Pcre")
(libraries (re))))
(library
((name re_perl)
(public_name re.perl)
(wrapped false)
(modules (re_perl))
(synopsis "Deprecated. Use Re.Perl")
(libraries (re))))
(library
((name re_posix)
(public_name re.posix)
(wrapped false)
(modules (re_posix))
(synopsis "Deprecated. Use Re.Posix")
(libraries (re))))
(library
((name re_emacs)
(public_name re.emacs)
(wrapped false)
(modules (re_emacs))
(synopsis "Deprecated. Use Re.Emacs")
(libraries (re))))
(library
((name re_glob)
(public_name re.glob)
(wrapped false)
(modules (re_glob))
(synopsis "Deprecated. Use Re.Glob")
(libraries (re))))
(env
(_ (flags (:standard -w -50))))
\ No newline at end of file
(lang dune 1.0)
(name re)
(lang dune 1.0)
;; This file is used by `make all-supported-ocaml-versions`
(context (opam (switch 4.02.3)))
(context (opam (switch 4.03.0)))
(context (opam (switch 4.04.2)))
(context (opam (switch 4.05.0)))
(context (opam (switch 4.06.1)))
(context (opam (switch 4.07.0)))
\ No newline at end of file
;; This file is used by `make all-supported-ocaml-versions`
(context ((switch 4.02.3)))
(context ((switch 4.03.0)))
(context ((switch 4.04.2)))
(context ((switch 4.05.0)))
(context ((switch 4.06.0)))
\ No newline at end of file
(* In reality, this can really be represented as a bool array.
The representation is best thought of as a list of all chars along with a
flag:
(a, 0), (b, 1), (c, 0), (d, 0), ...
characters belonging to the same color are represented by sequences of
characters with the flag set to 0.
*)
type t = Bytes.t
(* A fresh color map: 257 flag bytes, all cleared. There is one byte more than
   the 256 character codes because [split] marks position [j + 1] for every
   interval [(i, j)], and [j] may be 255. *)
let make () = Bytes.init 257 (fun _ -> '\000')
(* [flatten cm] turns the boundary flags of [cm] into an explicit coloring.

   Returns [(colors, reprs, count)] where:
   - [colors] has 256 bytes; byte [i] is the color of character code [i].
     Character 0 always has color 0 and the color is bumped by one at every
     flagged position in 1..255;
   - [reprs] has [count] bytes; byte [c] is the highest character code that
     received color [c];
   - [count] is the number of distinct colors. *)
let flatten cm =
  let colors = Bytes.make 256 '\000' in
  let reprs = Bytes.make 256 '\000' in
  (* Walk character codes 1..255 in order, threading the current color. *)
  let rec fill color code =
    if code > 255 then color
    else begin
      let color = if Bytes.get cm code = '\000' then color else color + 1 in
      Bytes.set colors code (Char.chr color);
      (* Later characters of the same color overwrite earlier ones. *)
      Bytes.set reprs color (Char.chr code);
      fill color (code + 1)
    end
  in
  let count = fill 0 1 + 1 in
  (colors, Bytes.sub reprs 0 count, count)
(* Record the boundaries of every interval of the character set [s] in [cm]:
   for each interval [(lo, hi)] handed out by [Cset.iter], the bytes at
   positions [lo] and [hi + 1] are set to '\001'. *)
let split s cm =
  let mark pos = Bytes.set cm pos '\001' in
  Cset.iter s ~f:(fun lo hi ->
    mark lo;
    mark (hi + 1))
(* Color maps exist to provide an optimization for the regex engine. The fact
that some characters are entirely equivalent for some regexes means that we
can use them interchangeably.
A color map assigns a color to every character in our character set. Any two
characters with the same color will be treated equivalently by the automaton.
*)
(** A mutable color map under construction. *)
type t

(** [make ()] returns a fresh map in which no interval boundary is marked. *)
val make : unit -> t

(** [flatten cm] computes the final coloring described by [cm] and returns
    [(colors, representatives, count)]:
    - [colors] holds, for each of the 256 character codes, the color assigned
      to that character (as a byte);
    - [representatives] holds, for each color, one character carrying that
      color (the highest such character code);
    - [count] is the number of colors. *)
val flatten : t -> bytes * bytes * int

(** [split s cm] marks in [cm] the endpoints of every interval of the
    character set [s]. [cm] is updated in place. *)
val split : Cset.t -> t -> unit
This diff is collapsed.
......@@ -28,9 +28,40 @@ type t
type re
(** Compiled regular expression *)
type groups
(** Manipulate matching groups. *)
module Group : sig
type t
(** Information about groups in a match. *)
val get : t -> int -> string
(** Raise [Not_found] if the group did not match *)
val offset : t -> int -> int * int
(** Raise [Not_found] if the group did not match *)
val start : t -> int -> int
(** Return the start of the match. Raise [Not_found] if the group did not match. *)
val stop : t -> int -> int
(** Return the end of the match. Raise [Not_found] if the group did not match. *)
val all : t -> string array
(** Return the empty string for each group which did not match *)
val all_offset : t -> (int * int) array
(** Return [(-1,-1)] for each group which did not match *)
val test : t -> int -> bool
(** Test whether a group matched *)
val nb_groups : t -> int
(** Returns the total number of groups defined - matched or not.
This function is experimental. *)
val pp : Format.formatter -> t -> unit
end
type groups = Group.t [@@ocaml.deprecated "Use Group.t"]
(** {2 Compilation and execution of a regular expression} *)
val compile : t -> re
......@@ -40,7 +71,7 @@ val compile : t -> re
val exec :
?pos:int -> (* Default: 0 *)
?len:int -> (* Default: -1 (until end of string) *)
re -> string -> groups
re -> string -> Group.t
(** [exec re str] matches [str] against the compiled expression [re],
and returns the matched groups if any.
@param pos optional beginning of the string (default 0)
......@@ -52,7 +83,7 @@ val exec :
val exec_opt :
?pos:int -> (* Default: 0 *)
?len:int -> (* Default: -1 (until end of string) *)
re -> string -> groups option
re -> string -> Group.t option
(** Similar to {!exec}, but returns an option instead of using an exception. *)
val execp :
......@@ -68,41 +99,6 @@ val exec_partial :
re -> string -> [ `Full | `Partial | `Mismatch ]
(** More detailed version of {!execp} *)
(** Manipulate matching groups. *)
module Group : sig
type t = groups
(** Information about groups in a match. *)
val get : t -> int -> string
(** Raise [Not_found] if the group did not match *)
val offset : t -> int -> int * int
(** Raise [Not_found] if the group did not match *)
val start : t -> int -> int
(** Return the start of the match. Raise [Not_found] if the group did not match. *)
val stop : t -> int -> int
(** Return the end of the match. Raise [Not_found] if the group did not match. *)
val all : t -> string array
(** Return the empty string for each group which did not match *)
val all_offset : t -> (int * int) array
(** Return [(-1,-1)] for each group which did not match *)
val test : t -> int -> bool
(** Test whether a group matched *)
val nb_groups : t -> int
(** Returns the total number of groups defined - matched or not.
This function is experimental. *)
val pp : Format.formatter -> t -> unit
end
(** Marks *)
module Mark : sig
......@@ -124,62 +120,84 @@ end
(** {2 High Level Operations} *)
type 'a gen = unit -> 'a option
type split_token =
[ `Text of string (** Text between delimiters *)
| `Delim of Group.t (** Delimiter *)
]
val all :
?pos:int -> (** Default: 0 *)
?len:int ->
re -> string -> Group.t list
(** Repeatedly calls {!exec} on the given string, starting at given
position and length.*)
type 'a seq = 'a Seq.t
val all_gen :
module Seq : sig
val all :
?pos:int -> (** Default: 0 *)
?len:int ->
re -> string -> Group.t gen
(** Same as {!all} but returns a generator *)
re -> string -> Group.t Seq.t
(** Same as {!all} but returns an iterator
@since NEXT_RELEASE *)
val matches :
?pos:int -> (** Default: 0 *)
?len:int ->
re -> string -> string list
(** Same as {!all}, but extracts the matched substring rather than
returning the whole group. This basically iterates over matched
strings *)
val matches_gen :
?pos:int -> (** Default: 0 *)
?len:int ->
re -> string -> string gen
(** Same as {!matches}, but returns a generator. *)
re -> string -> string Seq.t
(** Same as {!matches}, but returns an iterator
@since NEXT_RELEASE *)
val split :
?pos:int -> (** Default: 0 *)
?len:int ->
re -> string -> string list
(** [split re s] splits [s] into chunks separated by [re]. It yields
the chunks themselves, not the separator. For instance
this can be used with a whitespace-matching re such as ["[\t ]+"]. *)
re -> string -> string Seq.t
(** @since NEXT_RELEASE *)
val split_gen :
val split_full :
?pos:int -> (** Default: 0 *)
?len:int ->
re -> string -> string gen
re -> string -> split_token Seq.t
(** @since NEXT_RELEASE *)
end
type split_token =
[ `Text of string (** Text between delimiters *)
| `Delim of Group.t (** Delimiter *)
]
val all : ?pos:int -> ?len:int -> re -> string -> Group.t list
(** Repeatedly calls {!exec} on the given string, starting at given position and
length.*)
val split_full :
?pos:int -> (** Default: 0 *)
?len:int ->
re -> string -> split_token list
type 'a gen = unit -> 'a option
val split_full_gen :
?pos:int -> (** Default: 0 *)
?len:int ->
re -> string -> split_token gen
val all_gen : ?pos:int -> ?len:int -> re -> string -> Group.t gen
[@@ocaml.deprecated "Use Seq.all"]
val all_seq : ?pos:int -> ?len:int -> re -> string -> Group.t seq
[@@ocaml.deprecated "Use Seq.all"]
val matches : ?pos:int -> ?len:int -> re -> string -> string list
(** Same as {!all}, but extracts the matched substring rather than returning
the whole group. This basically iterates over matched strings *)
val matches_gen : ?pos:int -> ?len:int -> re -> string -> string gen
[@@ocaml.deprecated "Use Seq.matches"]
val matches_seq : ?pos:int -> ?len:int -> re -> string -> string seq
[@@ocaml.deprecated "Use Seq.matches"]
val split : ?pos:int -> ?len:int -> re -> string -> string list
(** [split re s] splits [s] into chunks separated by [re]. It yields the chunks
themselves, not the separator. For instance this can be used with a
whitespace-matching re such as ["[\t ]+"]. *)
val split_gen : ?pos:int -> ?len:int -> re -> string -> string gen
[@@ocaml.deprecated "Use Seq.split"]
val split_seq : ?pos:int -> ?len:int -> re -> string -> string seq
[@@ocaml.deprecated "Use Seq.split"]
val split_full : ?pos:int -> ?len:int -> re -> string -> split_token list
(** [split_full re s] splits [s] into chunks separated by [re]. It yields the chunks
along with the separators. For instance this can be used with a
whitespace-matching re such as ["[\t ]+"]. *)
val split_full_gen : ?pos:int -> ?len:int -> re -> string -> split_token gen
[@@ocaml.deprecated "Use Seq.split_full"]
val split_full_seq : ?pos:int -> ?len:int -> re -> string -> split_token seq
[@@ocaml.deprecated "Use Seq.split_full"]
val replace :
?pos:int -> (** Default: 0 *)
......@@ -372,6 +390,31 @@ val pp_re : Format.formatter -> re -> unit
(** Alias for {!pp_re}. Deprecated *)
val print_re : Format.formatter -> re -> unit
(* One-level, read-only view of a regular expression. The placeholder type
   [outer] is substituted by the enclosing signature's [t] through the
   [with type outer := t] constraint that closes this module type, so every
   [outer] below is a regular expression of the enclosing module. *)
module View : sig
type outer
(** A view of the top-level of a regex. This type is unstable and may change *)
type t =
Set of Cset.t
| Sequence of outer list
| Alternative of outer list
(* NOTE(review): the two integers of [Repeat] look like a minimum and an
   optional maximum repetition count -- confirm against the implementation. *)
| Repeat of outer * int * int option
| Beg_of_line | End_of_line
| Beg_of_word | End_of_word | Not_bound
| Beg_of_str | End_of_str
| Last_end_of_line | Start | Stop
| Sem of Automata.sem * outer
| Sem_greedy of Automata.rep_kind * outer
| Group of outer | No_group of outer | Nest of outer
| Case of outer | No_case of outer
| Intersection of outer list
| Complement of outer list
| Difference of outer * outer
| Pmark of Pmark.t * outer
(* [view r] returns the view of [r]; the sub-expressions carried by the
   returned constructor keep the type [outer], so deeper levels are inspected
   by calling [view] again on them. *)
val view : outer -> t
end with type outer := t
(** {2 Experimental functions}. *)
val witness : t -> string
......@@ -381,28 +424,37 @@ val witness : t -> string
(** {2 Deprecated functions} *)
type substrings = Group.t
[@@ocaml.deprecated "Use Group.t"]
(** Alias for {!Group.t}. Deprecated *)
val get : Group.t -> int -> string
[@@ocaml.deprecated "Use Group.get"]
(** Same as {!Group.get}. Deprecated *)
val get_ofs : Group.t -> int -> int * int
[@@ocaml.deprecated "Use Group.offset"]
(** Same as {!Group.offset}. Deprecated *)
val get_all : Group.t -> string array
[@@ocaml.deprecated "Use Group.all"]
(** Same as {!Group.all}. Deprecated *)
val get_all_ofs : Group.t -> (int * int) array
[@@ocaml.deprecated "Use Group.all_offset"]
(** Same as {!Group.all_offset}. Deprecated *)
val test : Group.t -> int -> bool
[@@ocaml.deprecated "Use Group.test"]
(** Same as {!Group.test}. Deprecated *)
type markid = Mark.t
[@@ocaml.deprecated "Use Mark.t"]
(** Alias for {!Mark.t}. Deprecated *)
val marked : Group.t -> Mark.t -> bool
[@@ocaml.deprecated "Use Mark.test"]
(** Same as {!Mark.test}. Deprecated *)
val mark_set : Group.t -> Mark.Set.t
[@@ocaml.deprecated "Use Mark.all"]
(** Same as {!Mark.all}. Deprecated *)
(library
(name re)
(synopsis "Pure OCaml regular expression library")
(libraries seq)
(public_name re))
......@@ -164,12 +164,12 @@ module State = struct
end
let one ~explicit_slash ~explicit_period =
Re.(compl (
Re.compl (
List.concat [
if explicit_slash then [char '/'] else [];
if explicit_period then [char '.'] else [];
if explicit_slash then [Re.char '/'] else [];
if explicit_period then [Re.char '.'] else [];
]
))
)
let enclosed enclosed =
match enclosed with
......
(* Result of a successful match. *)
type t =
{ s : string
(* Subject string; [get] and [all] cut their results out of it. *)
; marks : Automata.mark_infos
(* Indexed by group: [marks.(2*i)] and [marks.(2*i + 1)] are the indices in
   [gpos] of the start and stop of group [i]. A start entry of -1 means the
   group did not match. *)
; pmarks : Pmark.Set.t
(* Not read by the accessors defined in this file section.
   NOTE(review): presumably the set of marks hit by the match -- confirm. *)
; gpos : int array
(* Positions referenced through [marks]. Values are stored shifted by one:
   the accessors subtract 1 to obtain offsets in [s]. *)
; gcount : int
(* Number of groups; it is the length of the arrays returned by [all] and
   [all_offset] and the value returned by [nb_groups]. *)
}
(* [offset t i] is the [(start, stop)] pair of offsets in [t.s] delimiting the
   text matched by group [i].
   Raises [Not_found] when [i] lies beyond the recorded groups or when the
   group did not take part in the match (its start mark is -1). *)
let offset t i =
  let start_slot = 2 * i in
  let stop_slot = start_slot + 1 in
  if stop_slot >= Array.length t.marks then raise Not_found
  else
    match t.marks.(start_slot) with
    | -1 -> raise Not_found
    | start_mark ->
      (* Positions in [gpos] are stored shifted by one. *)
      let start = t.gpos.(start_mark) - 1 in
      let stop = t.gpos.(t.marks.(stop_slot)) - 1 in
      (start, stop)
(* [get t i] is the text matched by group [i], i.e. the slice of [t.s]
   delimited by [offset t i]. Exceptions raised by [offset] propagate. *)
let get t i =
  let first, last = offset t i in
  let len = last - first in
  String.sub t.s first len
(* Start offset of group [i]; first component of [offset subs i]. *)
let start subs i =
  let first, _ = offset subs i in
  first
(* Stop offset of group [i]; second component of [offset subs i]. *)
let stop subs i =
  let _, last = offset subs i in
  last
(* [test t i] tells whether group [i] matched: its start slot must exist in
   [t.marks] and hold something other than -1. *)
let test t i =
  let start_slot = 2 * i in
  start_slot < Array.length t.marks && t.marks.(start_slot) <> -1
(* Placeholder offsets reported for groups that did not match. *)
let dummy_offset = (-1, -1)

(* [all_offset t] is an array of [t.gcount] offset pairs, one per group.
   Groups that did not match keep [dummy_offset]. *)
let all_offset t =
  let offsets = Array.make t.gcount dummy_offset in
  let recorded = Array.length t.marks / 2 in
  let rec fill group =
    if group < recorded then begin
      let start_mark = t.marks.(2 * group) in
      if start_mark <> -1 then begin
        let start = t.gpos.(start_mark) in
        let stop = t.gpos.(t.marks.(2 * group + 1)) in
        (* Positions in [gpos] are stored shifted by one. *)
        offsets.(group) <- (start - 1, stop - 1)
      end;
      fill (group + 1)
    end
  in
  fill 0;
  offsets
(* Placeholder text reported for groups that did not match. *)
let dummy_string = ""

(* [all t] is an array of [t.gcount] strings, one per group, each holding the
   text matched by that group. Groups that did not match keep
   [dummy_string]. *)
let all t =
  let strings = Array.make t.gcount dummy_string in
  let recorded = Array.length t.marks / 2 in
  let group = ref 0 in
  while !group < recorded do
    let g = !group in
    (match t.marks.(2 * g) with
     | -1 -> ()
     | start_mark ->
       let start = t.gpos.(start_mark) in
       let stop = t.gpos.(t.marks.(2 * g + 1)) in
       (* [start] is shifted by one; the length [stop - start] is not. *)
       strings.(g) <- String.sub t.s (start - 1) (stop - start));
    incr group
  done;
  strings
(* Print every group of [t]: each group's text is paired with its
   (start, stop) offsets and the resulting list is handed to [Fmt.sexp] under
   the label "Group". Groups that did not match appear with the placeholder
   values produced by [all] and [all_offset]. *)
let pp fmt t =
  let matches =
    let offsets = all_offset t in
    let strs = all t in
    strs
    |> Array.mapi (fun i str -> (str, offsets.(i)))
    |> Array.to_list
  in
  let open Fmt in
  let pp_match ppf (str, (first, last)) =
    fprintf ppf "@[(%s (%d %d))@]" str first last
  in
  sexp fmt "Group" (list pp_match) matches
(* Total number of groups recorded in the match result, matched or not. *)
let nb_groups { gcount; _ } = gcount
(* Result of a successful match. *)
type t =
{ s : string
(* Input string. Matched strings are substrings of s *)
; marks : Automata.mark_infos
(* Mapping from group indices to positions in gpos. Group i uses the entries
   marks.(2*i) (start) and marks.(2*i + 1) (stop), each of which is an index
   into gpos. If the group wasn't matched, then its corresponding values in
   marks will be -1,-1 *)
; pmarks : Pmark.Set.t
(* Marks positions. i.e. those marks created with Re.marks *)
; gpos : int array
(* Group positions, indexed by the values in marks and stored shifted by one.
   So group i in an re would be the substring of s delimited by:
   start = t.gpos.(marks.(2*i)) - 1
   stop = t.gpos.(marks.(2*i + 1)) - 1 *)
; gcount : int
(* Number of groups the regular expression contains. Matched or not *)
}
(** Information about groups in a match. *)
val get : t -> int -> string
(** Raise [Not_found] if the group did not match *)
val offset : t -> int -> int * int
(** Raise [Not_found] if the group did not match *)
val start : t -> int -> int
(** Return the start of the match. Raise [Not_found] if the group did not match. *)
val stop : t -> int -> int
(** Return the end of the match. Raise [Not_found] if the group did not match. *)
val all : t -> string array
(** Return the empty string for each group which did not match *)
val all_offset : t -> (int * int) array
(** Return [(-1,-1)] for each group which did not match *)
val test : t -> int -> bool
(** Test whether a group matched *)
val nb_groups : t -> int
(** Returns the total number of groups defined - matched or not.
This function is experimental. *)
val pp : Format.formatter -> t -> unit