Stephane Glondu · Stephane Glondu · df81fe57 · 3e945b44 · df81fe57 · df81fe57
--- a/CHANGES.md
+++ b/CHANGES.md
+1.9.0 (05-Apr-2019)
+-------------------
+
+* Fix regression in `Re.exec_partial` (#164)
+* Move gen-related functions to `Re.Gen` and deprecate the old names (#167)
+* Introduce `Re.View` that exposes the internal representation (#163)
+
+1.8.0 (04-Aug-2018)
+-------------------
+
+* Fix index-out-of-bounds exception in Re.Perl.re (#160)
+* Add seq based iterators (#170)
+
 1.7.3 (05-Mar-2018)
 -------------------


--- a/INSTALL
+++ b/INSTALL
-
-Requirements
-
-  The installation procedure defined in the Makefile requires findlib
-  (http://www.ocaml-programming.de/packages/documentation/findlib/).
-
-Installation
-
- Compile with "make all".
-
- If you have ocamlopt, do also "make opt".
-
- Become super-user if necessary and do "make install"
-  (A "make uninstall" removes the library.)
--- a/Makefile
+++ b/Makefile
-JBUILDER ?= jbuilder
+DUNE ?= dune

 all:
-	@$(JBUILDER) build
+	@$(DUNE) build

 test:
-	@$(JBUILDER) runtest
+	@$(DUNE) runtest

 check: test

 clean:
-	@$(JBUILDER) clean
+	@$(DUNE) clean

 .PHONY: check test all clean

 .PHONY: all-supported-ocaml-versions
 all-supported-ocaml-versions:
-	jbuilder build @runtest --workspace jbuild-workspace.dev
+	$(DUNE) build @runtest --workspace dune-workspace.dev
--- a/README.md
+++ b/README.md
@@ -18,20 +18,18 @@ Features

 The following styles of regular expressions are supported:
 - Perl-style regular expressions (module `Re.Perl`);
- Posix extended regular expressions (module `Re_posix`);
+- Posix extended regular expressions (module `Re.Posix`);
 - Emacs-style regular expressions (module `Re.Emacs`);
- Shell-style file globbing (module `Re_glob`).
+- Shell-style file globbing (module `Re.Glob`).

-It is also possible to build regular expressions by combining simpler
-regular expressions (module `Re`).
+It is also possible to build regular expressions by combining simpler regular
+expressions (module `Re`).

 The most notable missing features are **back-references** and
 look-ahead/look-behind **assertions**.

-There is also a subset of the PCRE interface available in the
-`Re.pcre` library. This makes it easier to port code from that
-library to Re without any changes beyond replacing the `pcre`
-findlib package with `re.pcre`.
+There is also a subset of the PCRE interface available in the `Re.Pcre` module.
+This makes it easier to port code from that library to Re with minimal changes.

 Performances
 ============

--- a/benchmarks/dune
+++ b/benchmarks/dune
+(executable
+ (libraries re threads core_bench)
+ (name benchmark))
--- a/benchmarks/jbuild
+++ b/benchmarks/jbuild
-(jbuild_version 1)
-
-(executable
- ((libraries (re threads core_bench))
-  (name benchmark)))
--- a/deprecated/dune
+++ b/deprecated/dune
+(library
+ (name re_str)
+ (public_name re.str)
+ (wrapped false)
+ (modules re_str)
+ (synopsis "Deprecated. Use Re.Str")
+ (libraries re))
+
+(library
+ (name re_pcre)
+ (public_name re.pcre)
+ (wrapped false)
+ (modules re_pcre)
+ (synopsis "Deprecated. Use Re.Pcre")
+ (libraries re))
+
+(library
+ (name re_perl)
+ (public_name re.perl)
+ (wrapped false)
+ (modules re_perl)
+ (synopsis "Deprecated. Use Re.Perl")
+ (libraries re))
+
+(library
+ (name re_posix)
+ (public_name re.posix)
+ (wrapped false)
+ (modules re_posix)
+ (synopsis "Deprecated. Use Re.Posix")
+ (libraries re))
+
+(library
+ (name re_emacs)
+ (public_name re.emacs)
+ (wrapped false)
+ (modules re_emacs)
+ (synopsis "Deprecated. Use Re.Emacs")
+ (libraries re))
+
+(library
+ (name re_glob)
+ (public_name re.glob)
+ (wrapped false)
+ (modules re_glob)
+ (synopsis "Deprecated. Use Re.Glob")
+ (libraries re))
--- a/deprecated/jbuild
+++ b/deprecated/jbuild
-(jbuild_version 1)
-
-(library
- ((name re_str)
-  (public_name re.str)
-  (wrapped false)
-  (modules (re_str))
-  (synopsis "Deprecated. Use Re.Str")
-  (libraries (re))))
-
-(library
- ((name re_pcre)
-  (public_name re.pcre)
-  (wrapped false)
-  (modules (re_pcre))
-  (synopsis "Deprecated. Use Re.Pcre")
-  (libraries (re))))
-
-(library
- ((name re_perl)
-  (public_name re.perl)
-  (wrapped false)
-  (modules (re_perl))
-  (synopsis "Deprecated. Use Re.Perl")
-  (libraries (re))))
-
-(library
- ((name re_posix)
-  (public_name re.posix)
-  (wrapped false)
-  (modules (re_posix))
-  (synopsis "Deprecated. Use Re.Posix")
-  (libraries (re))))
-
-(library
- ((name re_emacs)
-  (public_name re.emacs)
-  (wrapped false)
-  (modules (re_emacs))
-  (synopsis "Deprecated. Use Re.Emacs")
-  (libraries (re))))
-
-(library
- ((name re_glob)
-  (public_name re.glob)
-  (wrapped false)
-  (modules (re_glob))
-  (synopsis "Deprecated. Use Re.Glob")
-  (libraries (re))))
--- a/dune
+++ b/dune
+(env
+ (_ (flags (:standard -w -50)))) ; keep the standard flags but turn off OCaml warning 50 (unexpected documentation comment)
\ No newline at end of file
--- a/dune-project
+++ b/dune-project
+(lang dune 1.0)
+
+(name re)
--- a/dune-workspace.dev
+++ b/dune-workspace.dev
+(lang dune 1.0)
+;; This file is used by `make all-supported-ocaml-versions`
+(context (opam (switch 4.02.3)))
+(context (opam (switch 4.03.0)))
+(context (opam (switch 4.04.2)))
+(context (opam (switch 4.05.0)))
+(context (opam (switch 4.06.1)))
+(context (opam (switch 4.07.0)))
\ No newline at end of file
--- a/jbuild-workspace.dev
+++ b/jbuild-workspace.dev
-;; This file is used by `make all-supported-ocaml-versions`
-(context ((switch 4.02.3)))
-(context ((switch 4.03.0)))
-(context ((switch 4.04.2)))
-(context ((switch 4.05.0)))
-(context ((switch 4.06.0)))
\ No newline at end of file
--- a/lib/color_map.ml
+++ b/lib/color_map.ml
+(* In reality, this can really be represented as a bool array.
+
+   The representation is best thought of as a list of all chars along with a
+   flag:
+
+   (a, 0), (b, 1), (c, 0), (d, 0), ...
+
+   characters belonging to the same color are represented by sequences of
+   characters with the flag set to 0.
+*)
+
+type t = Bytes.t (* one flag byte per index; '\000' means "not marked", see [split] *)
+
+let make () = Bytes.make 257 '\000' (* 257 cleared flags; [split] writes index [j + 1], presumably up to 256 -- confirm against Cset *)
+
+let flatten cm = (* Turn the flag bytes of [cm] into lookup tables.
+   Returns [(c, repr, n)]: [c] maps each char code 0..255 to its color number,
+   [repr] holds, for each color, the last char code that received it, and [n]
+   is the number of colors. Char code 0 always gets color 0. *)
+  let c = Bytes.create 256 in
+  let color_repr = Bytes.create 256 in
+  let v = ref 0 in (* current color; bumped at every flagged index *)
+  Bytes.set c 0 '\000';
+  Bytes.set color_repr 0 '\000';
+  for i = 1 to 255 do
+    if Bytes.get cm i <> '\000' then incr v;
+    Bytes.set c i (Char.chr !v);
+    Bytes.set color_repr !v (Char.chr i) (* overwritten until the color changes *)
+  done;
+  (c, Bytes.sub color_repr 0 (!v + 1), !v + 1)
+
+(* Mark the boundaries of every interval of the char set [s] in [cm]: for each
+   pair [i], [j] handed over by [Cset.iter], the flag at [i] and the flag at
+   [j + 1] are set to the 1 byte. [cm] is updated in place. *)
+let split s cm =
+  Cset.iter s ~f:(fun i j ->
+      Bytes.set cm i '\001';
+      Bytes.set cm (j + 1) '\001';
+    )
--- a/lib/color_map.mli
+++ b/lib/color_map.mli
+(* Color maps exist to provide an optimization for the regex engine. The fact
+   that some characters are entirely equivalent for some regexes means that we
+   can use them interchangeably.
+
+   A color map assigns a color to every character in our character set. Any two
+   characters with the same color will be treated equivalently by the automaton.
+*)
+type t
+
+val make : unit -> t (** A fresh color map with nothing marked yet. *)
+
+val flatten : t -> bytes * bytes * int (** [(char -> color table, color -> representative char table, number of colors)]. *)
+
+val split : Cset.t -> t -> unit (** Record the interval boundaries of the char set in the map, in place. *)
--- a/lib/core.ml
+++ b/lib/core.ml
--- a/lib/core.mli
+++ b/lib/core.mli
@@ -28,9 +28,40 @@ type t
 type re
 (** Compiled regular expression *)

-type groups
+(** Manipulate matching groups. *)
+module Group : sig
+  type t
  (** Information about groups in a match. *)

+  val get : t -> int -> string
+  (** Raise [Not_found] if the group did not match *)
+
+  val offset : t -> int -> int * int
+  (** Raise [Not_found] if the group did not match *)
+
+  val start : t -> int -> int
+  (** Return the start of the match. Raise [Not_found] if the group did not match. *)
+
+  val stop : t -> int -> int
+  (** Return the end of the match. Raise [Not_found] if the group did not match. *)
+
+  val all : t -> string array
+  (** Return the empty string for each group which did not match *)
+
+  val all_offset : t -> (int * int) array
+  (** Return [(-1,-1)] for each group which did not match *)
+
+  val test : t -> int -> bool
+  (** Test whether a group matched *)
+
+  val nb_groups : t -> int
+  (** Returns the total number of groups defined - matched or not.
+      This function is experimental. *)
+
+  val pp : Format.formatter -> t -> unit
+end
+type groups = Group.t [@@ocaml.deprecated "Use Group.t"]
+
 (** {2 Compilation and execution of a regular expression} *)

 val compile : t -> re
@@ -40,7 +71,7 @@ val compile : t -> re
 val exec :
  ?pos:int ->    (* Default: 0 *)
  ?len:int ->    (* Default: -1 (until end of string) *)
-  re -> string -> groups
+  re -> string -> Group.t
 (** [exec re str] matches [str] against the compiled expression [re],
    and returns the matched groups if any.
    @param pos optional beginning of the string (default 0)
@@ -52,7 +83,7 @@ val exec :
 val exec_opt :
  ?pos:int ->    (* Default: 0 *)
  ?len:int ->    (* Default: -1 (until end of string) *)
-  re -> string -> groups option
+  re -> string -> Group.t option
 (** Similar to {!exec}, but returns an option instead of using an exception. *)

 val execp :
@@ -68,41 +99,6 @@ val exec_partial :
  re -> string -> [ `Full | `Partial | `Mismatch ]
 (** More detailed version of {!exec_p} *)

-(** Manipulate matching groups. *)
-module Group : sig
-
-  type t = groups
-  (** Information about groups in a match. *)
-
-  val get : t -> int -> string
-  (** Raise [Not_found] if the group did not match *)
-
-  val offset : t -> int -> int * int
-  (** Raise [Not_found] if the group did not match *)
-
-  val start : t -> int -> int
-  (** Return the start of the match. Raise [Not_found] if the group did not match. *)
-
-  val stop : t -> int -> int
-  (** Return the end of the match. Raise [Not_found] if the group did not match. *)
-
-  val all : t -> string array
-  (** Return the empty string for each group which did not match *)
-
-  val all_offset : t -> (int * int) array
-  (** Return [(-1,-1)] for each group which did not match *)
-
-  val test : t -> int -> bool
-  (** Test whether a group matched *)
-
-  val nb_groups : t -> int
-  (** Returns the total number of groups defined - matched or not.
-      This function is experimental. *)
-
-  val pp : Format.formatter -> t -> unit
-
-end
-
 (** Marks *)
 module Mark : sig

@@ -124,62 +120,84 @@ end

 (** {2 High Level Operations} *)

-type 'a gen = unit -> 'a option
+type split_token =
+  [ `Text of string  (** Text between delimiters *)
+  | `Delim of Group.t (** Delimiter *)
+  ]

-val all :
-  ?pos:int ->    (** Default: 0 *)
-  ?len:int ->
-  re -> string -> Group.t list
-(** Repeatedly calls {!exec} on the given string, starting at given
-    position and length.*)
+type 'a seq = 'a Seq.t

-val all_gen :
+module Seq : sig
+  val all :
    ?pos:int ->    (** Default: 0 *)
    ?len:int ->
-  re -> string -> Group.t gen
-(** Same as {!all} but returns a generator *)
+    re -> string -> Group.t Seq.t
+    (** Same as {!all} but returns an iterator
+        @since NEXT_RELEASE *)

  val matches :
    ?pos:int ->    (** Default: 0 *)
    ?len:int ->
-  re -> string -> string list
-(** Same as {!all}, but extracts the matched substring rather than
-    returning the whole group. This basically iterates over matched
-    strings *)
-
-val matches_gen :
-  ?pos:int ->    (** Default: 0 *)
-  ?len:int ->
-  re -> string -> string gen
-(** Same as {!matches}, but returns a generator. *)
+    re -> string -> string Seq.t
+    (** Same as {!matches}, but returns an iterator
+        @since NEXT_RELEASE *)

  val split :
    ?pos:int ->    (** Default: 0 *)
    ?len:int ->
-  re -> string -> string list
-(** [split re s] splits [s] into chunks separated by [re]. It yields
-    the chunks themselves, not the separator. For instance
-    this can be used with a whitespace-matching re such as ["[\t ]+"]. *)
+    re -> string -> string Seq.t
+    (** @since NEXT_RELEASE *)

-val split_gen :
+  val split_full :
    ?pos:int ->    (** Default: 0 *)
    ?len:int ->
-  re -> string -> string gen
+    re -> string -> split_token Seq.t
+    (** @since NEXT_RELEASE *)
+end

-type split_token =
-  [ `Text of string  (** Text between delimiters *)
-  | `Delim of Group.t (** Delimiter *)
-  ]
+val all : ?pos:int -> ?len:int -> re -> string -> Group.t list
+(** Repeatedly calls {!exec} on the given string, starting at given position and
+    length.*)

-val split_full :
-  ?pos:int ->    (** Default: 0 *)
-  ?len:int ->
-  re -> string -> split_token list
+type 'a gen = unit -> 'a option

-val split_full_gen :
-  ?pos:int ->    (** Default: 0 *)
-  ?len:int ->
-  re -> string -> split_token gen
+val all_gen : ?pos:int -> ?len:int -> re -> string -> Group.t gen
+[@@ocaml.deprecated "Use Seq.all"]
+
+val all_seq : ?pos:int -> ?len:int -> re -> string -> Group.t seq
+[@@ocaml.deprecated "Use Seq.all"]
+
+val matches : ?pos:int -> ?len:int -> re -> string -> string list
+(** Same as {!all}, but extracts the matched substring rather than returning
+    the whole group. This basically iterates over matched strings *)
+
+val matches_gen : ?pos:int -> ?len:int -> re -> string -> string gen
+[@@ocaml.deprecated "Use Seq.matches"]
+
+val matches_seq : ?pos:int -> ?len:int -> re -> string -> string seq
+[@@ocaml.deprecated "Use Seq.matches"]
+
+val split : ?pos:int -> ?len:int -> re -> string -> string list
+(** [split re s] splits [s] into chunks separated by [re]. It yields the chunks
+    themselves, not the separator. For instance this can be used with a
+    whitespace-matching re such as ["[\t ]+"]. *)
+
+val split_gen : ?pos:int -> ?len:int -> re -> string -> string gen
+[@@ocaml.deprecated "Use Seq.split"]
+
+val split_seq : ?pos:int -> ?len:int -> re -> string -> string seq
+[@@ocaml.deprecated "Use Seq.split"]
+
+val split_full : ?pos:int -> ?len:int -> re -> string -> split_token list
+(** [split_full re s] splits [s] into chunks separated by [re]. It yields the
+    chunks along with the separators. For instance this can be used with a
+    whitespace-matching re such as ["[\t ]+"]. *)
+
+val split_full_gen : ?pos:int -> ?len:int -> re -> string -> split_token gen
+[@@ocaml.deprecated "Use Seq.split_full"]
+
+val split_full_seq : ?pos:int -> ?len:int -> re -> string -> split_token seq
+[@@ocaml.deprecated "Use Seq.split_full"]

 val replace :
  ?pos:int ->    (** Default: 0 *)
@@ -372,6 +390,31 @@ val pp_re : Format.formatter -> re -> unit
 (** Alias for {!pp_re}. Deprecated *)
 val print_re : Format.formatter -> re -> unit

+module View : sig
+  type outer
+
+  (** A view of the top-level of a regex. This type is unstable and may change *)
+  type t =
+      Set of Cset.t
+    | Sequence of outer list
+    | Alternative of outer list
+    | Repeat of outer * int * int option
+    | Beg_of_line | End_of_line
+    | Beg_of_word | End_of_word | Not_bound
+    | Beg_of_str | End_of_str
+    | Last_end_of_line | Start | Stop
+    | Sem of Automata.sem * outer
+    | Sem_greedy of Automata.rep_kind * outer
+    | Group of outer | No_group of outer | Nest of outer
+    | Case of outer | No_case of outer
+    | Intersection of outer list
+    | Complement of outer list
+    | Difference of outer * outer
+    | Pmark of Pmark.t * outer
+
+  val view : outer -> t
+end with type outer := t
+
 (** {2 Experimental functions}. *)

 val witness : t -> string
@@ -381,28 +424,37 @@ val witness : t -> string
 (** {2 Deprecated functions} *)

 type substrings = Group.t
+[@@ocaml.deprecated "Use Group.t"]
 (** Alias for {!Group.t}. Deprecated *)

 val get : Group.t -> int -> string
+[@@ocaml.deprecated "Use Group.get"]
 (** Same as {!Group.get}. Deprecated *)

 val get_ofs : Group.t -> int -> int * int
+[@@ocaml.deprecated "Use Group.offset"]
 (** Same as {!Group.offset}. Deprecated *)

 val get_all : Group.t -> string array
+[@@ocaml.deprecated "Use Group.all"]
 (** Same as {!Group.all}. Deprecated *)

 val get_all_ofs : Group.t -> (int * int) array
+[@@ocaml.deprecated "Use Group.all_offset"]
 (** Same as {!Group.all_offset}. Deprecated *)

 val test : Group.t -> int -> bool
+[@@ocaml.deprecated "Use Group.test"]
 (** Same as {!Group.test}. Deprecated *)

 type markid = Mark.t
+[@@ocaml.deprecated "Use Mark.t"]
 (** Alias for {!Mark.t}. Deprecated *)

 val marked : Group.t -> Mark.t -> bool
+[@@ocaml.deprecated "Use Mark.test"]
 (** Same as {!Mark.test}. Deprecated *)

 val mark_set : Group.t -> Mark.Set.t
+[@@ocaml.deprecated "Use Mark.all"]
 (** Same as {!Mark.all}. Deprecated *)
--- a/lib/dune
+++ b/lib/dune
+(library
+ (name re)
+ (synopsis "Pure OCaml regular expression library")
+ (libraries seq)
+ (public_name re))
--- a/lib/glob.ml
+++ b/lib/glob.ml
@@ -164,12 +164,12 @@ module State = struct
 end

 let one ~explicit_slash ~explicit_period =
-  Re.(compl (
+  Re.compl (
    List.concat [
-      if explicit_slash  then [char '/'] else [];
-      if explicit_period then [char '.'] else [];
+      if explicit_slash  then [Re.char '/'] else [];
+      if explicit_period then [Re.char '.'] else [];
    ]
-  ))
+  )

 let enclosed enclosed =
  match enclosed with

--- a/lib/group.ml
+++ b/lib/group.ml
+(* Result of a successful match. *)
+type t =
+  { s : string (* string the matched substrings are cut from *)
+  ; marks : Automata.mark_infos (* group i: entries 2*i and 2*i + 1 index into gpos; -1 at 2*i means unmatched *)
+  ; pmarks : Pmark.Set.t
+  ; gpos : int array (* positions stored shifted by one: readers subtract 1 *)
+  ; gcount : int (* length of the arrays built by [all] and [all_offset] *)
+  }
+
+let offset t i = (* [(start, stop)] offsets of group [i] within [t.s].
+   Raises [Not_found] when [i] lies past the recorded marks or when the group
+   did not take part in the match. *)
+  if 2 * i + 1 >= Array.length t.marks then raise Not_found;
+  let m1 = t.marks.(2 * i) in
+  if m1 = -1 then raise Not_found; (* -1 flags a group that did not match *)
+  let p1 = t.gpos.(m1) - 1 in (* gpos values are shifted by one *)
+  let p2 = t.gpos.(t.marks.(2 * i + 1)) - 1 in
+  (p1, p2)
+
+let get t i = (* substring of [t.s] matched by group [i]; [Not_found] from [offset] propagates *)
+  let (p1, p2) = offset t i in
+  String.sub t.s p1 (p2 - p1)
+
+let start subs i = fst (offset subs i) (* first component of [offset]: where group [i] starts *)
+
+let stop subs i = snd (offset subs i) (* second component of [offset]: where group [i] stops *)
+
+let test t i = (* [true] iff group [i] matched; an [i] past the end of marks yields [false] instead of raising *)
+  if 2 * i >= Array.length t.marks then
+    false
+  else
+    let idx = t.marks.(2 * i) in
+    idx <> -1 (* -1 flags a group that did not match *)
+
+let dummy_offset = (-1, -1) (* placeholder for groups that did not match *)
+
+let all_offset t = (* [(start, stop)] of every group in an array of length [t.gcount]; unmatched groups keep [dummy_offset] *)
+  let res = Array.make t.gcount dummy_offset in
+  for i = 0 to Array.length t.marks / 2 - 1 do
+    let m1 = t.marks.(2 * i) in
+    if m1 <> -1 then begin
+      let p1 = t.gpos.(m1) in
+      let p2 = t.gpos.(t.marks.(2 * i + 1)) in
+      res.(i) <- (p1 - 1, p2 - 1) (* undo the shift by one of gpos values *)
+    end
+  done;
+  res
+
+let dummy_string = "" (* placeholder for groups that did not match *)
+
+let all t = (* matched substring of every group in an array of length [t.gcount]; unmatched groups keep [dummy_string] *)
+  let res = Array.make t.gcount dummy_string in
+  for i = 0 to Array.length t.marks / 2 - 1 do
+    let m1 = t.marks.(2 * i) in
+    if m1 <> -1 then begin
+      let p1 = t.gpos.(m1) in
+      let p2 = t.gpos.(t.marks.(2 * i + 1)) in
+      res.(i) <- String.sub t.s (p1 - 1) (p2 - p1) (* start is shifted back by one; the length p2 - p1 is unaffected by the shift *)
+    end
+  done;
+  res
+
+let pp fmt t = (* Pretty-print every group as "(<string> (<start> <stop>))".
+   The entries are handed to [Fmt.sexp] under the label "Group"; presumably
+   that renders an s-expression -- confirm in Fmt. *)
+  let matches =
+    let offsets = all_offset t in
+    let strs = all t in
+    Array.to_list (
+      Array.init (Array.length strs) (fun i -> strs.(i), offsets.(i))
+    ) in
+  let open Fmt in
+  let pp_match fmt (str, (start, stop)) =
+    fprintf fmt "@[(%s (%d %d))@]" str start stop in
+  sexp fmt "Group" (list pp_match) matches
+
+let nb_groups t = t.gcount (* the stored group count, independent of which groups matched *)
--- a/lib/group.mli
+++ b/lib/group.mli
+(* Result of a successful match. *)
+type t =
+  { s : string
+  (* Input string. Matched strings are substrings of s *)
+
+  ; marks : Automata.mark_infos
+  (* Mapping from group indices to positions in gpos. Group i owns entries 2*i
+     and 2*i + 1 of marks, each an index into gpos. If the group wasn't
+     matched, then its corresponding values in marks will be -1,-1 *)
+
+  ; pmarks : Pmark.Set.t
+  (* Marks positions. i.e. those marks created with Re.mark *)
+
+  ; gpos : int array
+  (* Group positions. Adjacent elements are (start, stop) of group match.
+     indexed by the values in marks. So group i in an re would be the substring:
+
+     start = t.gpos.(marks.(2*i)) - 1
+     stop = t.gpos.(marks.(2*i + 1)) - 1 *)
+
+  ; gcount : int
+  (* Number of groups the regular expression contains. Matched or not *)
+  }
+
+(** Information about groups in a match. *)
+
+val get : t -> int -> string
+(** Raise [Not_found] if the group did not match *)
+
+val offset : t -> int -> int * int
+(** Raise [Not_found] if the group did not match *)
+
+val start : t -> int -> int
+(** Return the start of the match. Raise [Not_found] if the group did not match. *)
+
+val stop : t -> int -> int
+(** Return the end of the match. Raise [Not_found] if the group did not match. *)
+
+val all : t -> string array
+(** Return the empty string for each group which did not match *)
+
+val all_offset : t -> (int * int) array
+(** Return [(-1,-1)] for each group which did not match *)
+
+val test : t -> int -> bool
+(** Test whether a group matched *)
+
+val nb_groups : t -> int
+(** Returns the total number of groups defined - matched or not.
+    This function is experimental. *)
+
+val pp : Format.formatter -> t -> unit