*This article is a continuation of the earlier post, "Compiling regular expressions (I)".*

Automata are modeled as 'state' records with two fields. The `pos`

field contains the set of positions that are valid for recognition in the given state. Transitions are modeled as lists of pairs of symbols and states. In this way a state may contain transitions that reference itself.

type state = {

pos : Int_set.t;

mutable trans : (char * state) list ;

}

We will require a function that for each input symbol $a$ and a given set of positions $s$, computes the list of pairs $(a, s')$ where $s'$ is the subset of $s$ that correspond to $a$.

This function makes a list from a set of ints.

let (partition : char option array -> Int_set.t

-> (char option * Int_set.t) list) =

fun chars s ->

let f acc c =

match c with

| Some _ ->

if List.mem_assoc c acc then acc

else

let f i acc =

if chars.(i)

c then acc else Int_set.add i acc in

(c, Int_set.fold f s (Int_set.empty)) :: acc

| None ->

if List.mem_assoc c acc then acc else (c, Int_set.empty) :: acc in

List.rev (Array.fold_left f [] chars)

This function,

let list_of_int_set : Int_set.t -> Int_set.elt list =

fun s -> List.rev (Int_set.fold (fun e acc -> e :: acc) s [])

`accessible`

given a state, computes the list of sets that accessible from that state.

let (accessible : state -> Int_set.t array -> char option array

-> (char * Int_set.t) list) =

fun s follow chars ->

let part = partition chars s.pos in

let f p rest =

match p with

| (Some c, l) ->

(c,

List.fold_left

(Int_set.union)

(Int_set.empty)

(List.map (Array.get follow) (list_of_int_set l))

) :: rest

| _ -> rest in

List.fold_right f part []

`find_state`

takes a set $s$ and two lists of states (marked and unmarked). It searches for a state which has a `pos`

field equal to $s$ and returns this state or it fails.

let (find_state : Int_set.t -> state list -> state list -> state) =

fun s l m ->

let test e = e.pos = s in

try

List.find test l

with

| Not_found -> List.find test m

The algorithm to compute the automata works like this. Two lists are maintained, marked and unmarked states. The algorithm is initialized such that the only state is unmarked with a `pos`

field containing `first_pos`

$n_{0}$ where $n_{0}$ is the root of the syntax tree; the list of transitions is empty.

For an unmarked state $st$, the algorithm does these things:

- Calculate a set of numbers accessible from $st$. That is, a set of pairs $(c, s)$, where $c$ is a character and $s$ a set of positions. A position $j$ is accessible from $st$ by $c$ if there is an $i$ in
`st.pos`

such that $j$ is in`follow`

$i$ and $i$ numbers the character $c$. - For each of the pairs $(c, s)$
- If there exists a state
`st'`

(whether marked or unmarked) such that $s = $`st'.pos`

, it adds $(c, st')$ to the transitions of $st$; - Otherwise, a new state $st'$ without transitions is created, added to the transitions of $st$, and $st'$ is added to the list of unmarked states.

- If there exists a state
- It marks $st$.

`Accept`

. Here then is the algorithm in code.

let rec (compute_states : state list -> state list -> Int_set.t array

-> char option array -> state array) =

fun marked unmarked follow chars ->

match unmarked with

| [] -> Array.of_list marked

| st :: umsts ->

let access = accessible st follow chars in

let marked1 = st :: marked in

let f (c, s) umsts =

if Int_set.is_empty s then

umsts (*Suppress empty sets*)

else

try

st.trans <- (c, find_state s marked1 umsts) ::st.trans ;

umsts

with

| Not_found ->

let state1 = {pos = s; trans = []} in

st.trans <- (c, state1) :: st.trans;

state1 :: umsts in

let unmarked1 = List.fold_right f access umsts in

compute_states marked1 unmarked1 follow chars

We are just about ready to write the function to compute the automaton. It is fundamentally a call to `compute_states`

but does one more thing. That is, it searches the resulting array for the index of the initial state and puts the index in the first slot of the array. To do this it uses the utility function `array_indexq`

which performs the search for the index using physical equality. This is because the usual test using structural equality will not terminate on structures that loop.

So, here it is,

let (array_indexq : 'a array -> 'a -> int) =

fun arr e ->

let rec loop i =

if i = Array.length arr then

raise (Not_found)

else if Array.get arr i == e then i

else loop (i + 1) in

loop 0

`dfa_of`

, the function to compute the automaton.

let (dfa_of : augmented_regexp * Int_set.t array * char option array

-> state array) =

fun (e, follow, chars) ->

let init_state = {pos = first_pos e; trans = []} in

let dfa = compute_states [] [init_state] follow chars in

(*Installing initial state at index 0*)

let idx_start = array_indexq dfa init_state in

dfa.(idx_start) <- dfa.(0);

dfa.(0) <- init_state;

dfa

We are now on the home stretch. All that remains is to write a function to *interpret* the automaton. To do this, we'll make use of a mini-combinator library of recognizers. I'll not provide the OCaml code for that today - you could reverse engineer from my earlier 'Recognizers' blog-post or, consult [1].

We wrap up with a couple of high level convenience functions :

let (interpret_dfa : state array -> int -> char Recognizer.recognizer) =

fun dfa accept ->

let num_states = Array.length dfa in

let fvect = Array.make (num_states) (fun _ -> failwith "no value") in

for i = 0 to num_states - 1 do

let trans = dfa.(i).trans in

let f (c, st) =

let pc = Recognizer.recognizer_of_char c in

let j = array_indexq dfa st in

Recognizer.compose_and pc (fun l -> fvect.(j) l) in

let parsers = List.map f trans in

if Int_set.mem accept (dfa.(i).pos) then

fvect.(i) <- compose_or_list

(Recognizer.end_of_input) parsers

else match parsers with

| [] -> failwith "Impossible"

| p :: ps -> fvect.(i) <- Recognizer.compose_or_list p ps

done;

fvect.(0)

`compile`

produces a recognizer from a string representation of a regular expression and `match`

takes a recognizer (that is, a compiled regular expression) and a string and uses the recognizer to categorize the given string as admissible or not (where `explode`

is a simple function that transforms a `string`

into a `char list`

- recognizers operate on lists).

let compile xpr =

let ((e, follow, chars) as ast) = regexp_follow xpr in

let dfa = dfa_of ast in

let parser = interpret_dfa dfa (Array.length chars - 1) in

fun s -> parser (explode s)

let re_match xpr s =

let result = xpr s in

match result with

| Recognizer.Remains [] -> true

| _ -> false

Here's a simple test driver that shows how these functions can be used.

let test xpr s =

match re_match xpr s with

| true -> Printf.printf "\"%s\" : success\n" s

| false -> Printf.printf "\"%s\" : fail\n" s

let _ =

try

let xpr = compile "(a|b)*abb" in

Printf.printf "Pattern: \"%s\"\n" "(a|b)*abb" ;

test xpr "abb" ;

test xpr "aabb" ;

test xpr "baabb" ;

test xpr "bbbbbbbbbbbbbaabb" ;

test xpr "aaaaaaabbbaabbbaabbabaabb" ;

test xpr "baab" ;

test xpr "aa" ;

test xpr "ab" ;

test xpr "bb" ;

test xpr "" ;

test xpr "ccabb" ;

with

| Failure msg -> print_endline msg

So that's it for this series of posts on building recognizers for regular expressions. Hope you enjoyed it!

**References**

[1] "The Functional Approach to Programming" - Cousineau & Mauny

[2] "Compilers Principles, Techniques & Tools" - Aho et. al.

by Shayne Fletcher (noreply@blogger.com) at December 22, 2014 11:47 PM