source: proto/RE/Haskell/CanonicalRE.hs @ 3606

Last change on this file since 3606 was 3606, checked in by cameron, 5 years ago

Use individual helper functions for Seq, Alt, RE

File size: 8.1 KB
Line 
1-- Module Canonical_RE is a research compiler for prototyping
2-- algorithms and ideas in regular expression processing using
3-- parallel bit streams and MatchStar.
4-- It defines a canonical representation for regular expressions
5-- that uses a small number of alternative forms (e.g., combining
6-- all forms of repetition (Kleene star, Kleene plus, ?, {m,n}) into
7-- a single Rep structure. 
8
9-- Robert D. Cameron, 2013
10
11module CanonicalRE (RE(..), RepLimit(..), compile) where
12       
13import Data.Char
14import RunPablo
15
16-- RE is the data type for regular expressions
17
18data RE = CC String | Start | End | Seq [RE] | Alt [RE] | Rep (RE, Int, RepLimit)
19          deriving Show
20data RepLimit = UpperBound Int | Unbounded deriving Show
21
22-- CC "abcd" represents the character class with the 4 characters a, b, c and d, i.e., [a-d].
23-- Start represents the ^ metacharacter for start of line or string matching
24-- End represents the $ metacharacter for end of line or string matching
25-- Seq [CC "abcd", CC "e", CC "f", CC "ghkl"] represents the regexp [a-d]ef[ghkl]
26-- Alt [Seq[CC "r", CC "e", CC "d"],  Seq[CC "b", CC "l", CC "u", CC "e"]] represents red|blue
27-- Rep (CC "a", 0, UpperBound 1) represents a?
28-- Rep (Seq[CC "a", CC "b", CC "c"], 0, Unbounded)  represents (abc)*, (without substring capture)
29-- Rep (CC "abcedefghijklmnopqrstuvwxyz", 1, Unbounded) represents [a-z]+
30-- Rep (CC "ab", 5, Unbounded) represents [ab]{5,}
31-- Rep (CC "ha", 5, UpperBound 10) represents [ha]{5,10}
32
33-- Special cases:
34-- Seq [] represents the empty regular expression, matching only the empty string.
35-- Alt [] represents the empty set, i.e., matching nothing.
36
37--
38-- The Symbol Environment keeps track of the symbols generated
39-- during compilation, so that new unique symbols can be
40-- generated (gensym) as needed.   All symbols are formed from
41-- a string prefix and a numeric suffix.
42--
43-- For example, if the symbols "marker0", "marker1", "condition0"
44-- have been generated and no others have, the environment is
45-- [("marker", 1), ("condition", 0)]
46--
47type Env = [(String, Int)]
48
49gensym :: (Env, String) -> (Env, String)
50gensym ([], s) = ([(s, 0)], s ++ show 0)
51gensym ((prefix1, count1):env, s) 
52  | prefix1 == s = ((s, count1+1):env, s ++ show(count1+1))
53  | otherwise    = ((prefix1, count1):newenv, newsym)
54  where (newenv, newsym) = gensym(env, s)
55
56
57--
58-- To keep track of the current context for code generation during
59-- compilation, we define a CodeGenState that represents the output
60-- at each compilation step.   This state consists of an environment
61-- defining the symbols used so far, the sequence of generated PabloS
62-- statements generated so far and the name of the variable that
63-- holds the match result for the overall expression so far.
64--
65type CodeGenState = (Env, [PabloS], String)
66
67--
68-- In compiling to regular expression form, we use a helper function
69-- that takes a regular expression and a code generation state, and
70-- returns the code generation state.   The main compile routine
71-- initializes the process by setting an initial marker stream
72-- of all ones.
73--
74--
75compile :: RE -> CodeGenState
76re2pablo_helper :: (RE, CodeGenState) -> CodeGenState
77
78-- We also define helper functions for the working through
79-- the RE lists in Seq and Alt structures, as well as the
80-- iteration required for Rep structures.
81seq_helper :: ([RE], CodeGenState) -> CodeGenState
82alt_helper :: ([RE], CodeGenState) -> CodeGenState
83rep_helper :: (RE, Int, RepLimit, CodeGenState) -> CodeGenState
84
85compile(re) = re2pablo_helper(re, (env, [Assign(marker, All(1))], marker))
86  where 
87    (env, marker) = gensym([], "start_marker")
88
89-- To match a character class, use bitwise and of the marker and
90-- character class, then advance 1. 
91
92re2pablo_helper(CC(c), (env, stmts, last_marker)) = (newenv, stmts ++ [new_stmt], newsym)
93   where 
94     (newenv, newsym) = gensym(env, "marker")
95     new_stmt = Assign (newsym, Advance(And(Var(last_marker), CharClass(c))))
96
97-- To match "^" we must be at the start of line, i.e., one past
98-- a newline character or at the beginning of file.  We use the
99-- trick of advancing the negated newline stream to determine these
100-- positions.
101--
102re2pablo_helper(Start, (env, stmts, last_marker)) = (newenv, stmts ++ [new_stmt], newsym)
103   where 
104     (newenv, newsym) = gensym(env, "start_of_line_marker")
105     new_stmt = Assign (newsym, And(Var(last_marker), Not(Advance(Not(CharClass("\n"))))))
106
107-- To match "$" we must have reached end of line.
108--
109re2pablo_helper(End, (env, stmts, last_marker)) = (newenv, stmts ++ [new_stmt], newsym)
110   where 
111     (newenv, newsym) = gensym(env, "end_of_line_marker")
112     new_stmt = Assign (newsym, And(Var(last_marker), CharClass("\n")))
113
114-- For the structured types (Seq, Alt, Rep), just call the specific helper.
115--
116re2pablo_helper(Seq rs, cg_state) = seq_helper(rs, cg_state)
117re2pablo_helper(Alt rs, cg_state) = alt_helper(rs, cg_state)
118re2pablo_helper(Rep (r, lb, ub), cg_state) = rep_helper(r, lb, ub, cg_state)
119
120
121-- Seq [] is the empty regexp which matches the empty string. 
122-- We just leave the current marker unchanged, with no new statements
123-- needed to compute it.
124--
125seq_helper([], cg_state) = cg_state
126-- Seq (r1:rs): generate code to match r1, the first subexpression,
127-- then to match the rest, concatenating the statements.
128seq_helper((r1:rs), cg_state) = seq_helper(rs, re2pablo_helper(r1, cg_state))
129
130-- Alt[r] has a single alternative r to match, just match
131-- it with no other possibility.
132alt_helper([r], cg_state) = re2pablo_helper(r, cg_state)
133
134-- For completeness, we define Alt[] as the regular expression that
135-- always fails (since no alternatives will match).
136alt_helper([], (env, stmts, last_marker)) = (newenv, stmts ++ [new_stmt], newsym)
137   where 
138     (newenv, newsym) = gensym(env, "always_fail_marker")
139     new_stmt = Assign (newsym, All(0))
140
141-- Alt (r1:rs): we have r1 and some other alternatives, the
142-- match succeeds if either r1 matches or any of the others
143-- does.
144alt_helper(r1:rs, (env, stmts, last_marker)) = (newenv, new_stmts, newsym)
145  where 
146   (e1, s1, alt1_marker) = re2pablo_helper(r1, (env, stmts, last_marker))
147   (e2, s2, alt2_marker) = alt_helper(rs, (e1, s1, last_marker))
148   (newenv, newsym) = gensym(e2, "alt_marker")
149   new_stmts = s2 ++ [Assign (newsym, Or(Var(alt1_marker), Var(alt2_marker)))]
150
151-- For Kleene Star character class repetition, use MatchStar
152rep_helper(CC c, 0, Unbounded, (env, stmts, last_marker)) = (newenv, stmts ++ [new_stmt], newsym)
153   where 
154     (newenv, newsym) = gensym(env, "marker")
155     new_stmt = Assign (newsym, MatchStar(Var(last_marker), CharClass(c))) 
156
157-- For general Kleene Star, we need a while loop.
158rep_helper(r, 0, Unbounded, (env, stmts, last_marker)) = (newenv, stmts ++ while_init ++ [loop_stmt], while_accum)
159  where
160    (e1, while_test) = gensym(env, "while_test")
161    (e2, while_accum) = gensym(e1, "while_accum")
162    (newenv, loop_body_stmts, loop_result) = re2pablo_helper(r, (e2, [], while_test))
163    while_init = [Assign(while_test, Var(last_marker)), Assign(while_accum, Var(last_marker))]
164    repeat = [Assign(while_test, And(Var(loop_result), Not(Var(while_accum)))),
165              Assign(while_accum, Or(Var(while_accum), Var(loop_result)))]
166    loop_stmt = While(Var(while_test), loop_body_stmts ++ repeat)
167
168rep_helper(r, lb, Unbounded, cg_state) = rep_helper(r, lb-1, Unbounded, cg1_state)
169  where 
170   cg1_state = re2pablo_helper(r, cg_state)
171
172-- Now Bounded Repetition: use multiple copies
173   
174rep_helper(r, 0, UpperBound 0, cg_state) = cg_state
175
176rep_helper(r, 0, UpperBound ub, (env, stmts, last_marker)) = (newenv, new_stmts, newsym)
177  where 
178   (e1, s1, rep1_marker) = re2pablo_helper(r, (env, stmts, last_marker))
179   (e2, s2, rep2plus_marker) = rep_helper(r, 0, UpperBound(ub-1), (e1, s1, rep1_marker))
180   (newenv, newsym) = gensym(e2, "alt_marker")
181   new_stmts = s2 ++ [Assign (newsym, Or(Var(last_marker), Var(rep2plus_marker)))]
182
183rep_helper(r, lb, UpperBound ub, cg_state) = rep_helper(r, lb-1, UpperBound(ub-1), cg1_state)
184  where 
185   cg1_state = re2pablo_helper(r, cg_state)
186
187---------------
188
189searchRE :: (RE, String) -> PabloVal
190searchRE(regexp, str) = evalPabloS(compiled, marker, str)
191  where (e, compiled, marker) = compile(regexp)
192       
Note: See TracBrowser for help on using the repository browser.