source: proto/RE/Haskell/REcompile.hs @ 4223

Last change on this file since 4223 was 3903, checked in by cameron, 5 years ago

Revert inadvertent check-in

File size: 7.4 KB
Line 
1-- Module REcompile is a research compiler for prototyping
2-- algorithms and ideas in regular expression processing using
3-- parallel bit streams and MatchStar.
4-- It uses a canonical representation for regular expressions
5-- that uses a small number of alternative forms (e.g., combining
6-- all forms of repetition (Kleene star, Kleene plus, ?, {m,n}) into
7-- a single Rep structure. 
8
9-- Robert D. Cameron, 2014
10
11module REcompile (compile, searchRE, searchRegExp) where
12       
13import Data.Char
14import SparseCharSet
15import CanonicalRE
16import REparse
17import RunPablo
18
19--
20-- The Symbol Environment keeps track of the symbols generated
21-- during compilation, so that new unique symbols can be
22-- generated (gensym) as needed.   All symbols are formed from
23-- a string prefix and a numeric suffix.
24--
25-- For example, if the symbols "marker0", "marker1", "condition0"
26-- have been generated and no others have, the environment is
27-- [("marker", 1), ("condition", 0)]
28--
29type Env = [(String, Int)]
30
31gensym :: (Env, String) -> (Env, String)
32gensym ([], s) = ([(s, 0)], s ++ show 0)
33gensym ((prefix1, count1):env, s) 
34  | prefix1 == s = ((s, count1+1):env, s ++ show(count1+1))
35  | otherwise    = ((prefix1, count1):newenv, newsym)
36  where (newenv, newsym) = gensym(env, s)
37
38
39--
40-- To keep track of the current context for code generation during
41-- compilation, we define a CodeGenState that represents the output
42-- at each compilation step.   This state consists of an environment
43-- defining the symbols used so far, the sequence of generated PabloS
44-- statements generated so far and the name of the variable that
45-- holds the match result for the overall expression so far.
46--
47type CodeGenState = (Env, [PabloS], String)
48
49--
50-- In compiling to regular expression form, we use a helper function
51-- that takes a regular expression and a code generation state, and
52-- returns the code generation state.   The main compile routine
53-- initializes the process by setting an initial marker stream
54-- of all ones.
55--
56--
57compile :: RE -> CodeGenState
58re2pablo_helper :: (RE, CodeGenState) -> CodeGenState
59
60-- We also define helper functions for the working through
61-- the RE lists in Seq and Alt structures, as well as the
62-- iteration required for Rep structures.
63seq_helper :: ([RE], CodeGenState) -> CodeGenState
64alt_helper :: ([RE], CodeGenState) -> CodeGenState
65unbounded_rep_helper :: (RE, Int, CodeGenState) -> CodeGenState
66bounded_rep_helper :: (RE, Int, Int, CodeGenState) -> CodeGenState
67
68eol_CC = CharClass([CharRange(ord '\n', ord '\n')])
69
70compile(re) = re2pablo_helper(re, (env, [Assign(marker, All(1))], marker))
71  where 
72    (env, marker) = gensym([], "start_marker")
73
74-- To match a character class, use bitwise and of the marker and
75-- character class, then advance 1. 
76
77re2pablo_helper(CC(c), (env, stmts, last_marker)) = (newenv, stmts ++ [new_stmt], newsym)
78   where 
79     (newenv, newsym) = gensym(env, "marker")
80     new_stmt = Assign (newsym, Advance(And(Var(last_marker), CharClass(c)), 1))
81
82-- To match "^" we must be at the start of line, i.e., one past
83-- a newline character or at the beginning of file.  We use the
84-- trick of advancing the negated newline stream to determine these
85-- positions.
86--
87re2pablo_helper(Start, (env, stmts, last_marker)) = (newenv, stmts ++ [new_stmt], newsym)
88   where 
89     (newenv, newsym) = gensym(env, "start_of_line_marker")
90     new_stmt = Assign (newsym, And(Var(last_marker), Not(Advance(Not(eol_CC), 1))))
91
92-- To match "$" we must have reached end of line.
93--
94re2pablo_helper(End, (env, stmts, last_marker)) = (newenv, stmts ++ [new_stmt], newsym)
95   where 
96     (newenv, newsym) = gensym(env, "end_of_line_marker")
97     new_stmt = Assign (newsym, And(Var(last_marker), eol_CC))
98
99-- For the structured types (Seq, Alt, Rep), just call the specific helper.
100--
101re2pablo_helper(Seq rs, cg_state) = seq_helper(rs, cg_state)
102re2pablo_helper(Alt rs, cg_state) = alt_helper(rs, cg_state)
103re2pablo_helper(Rep (r, lb, ub), cg_state) 
104  | ub == unboundedRep  = unbounded_rep_helper(r, lb, cg_state)
105  | otherwise           = bounded_rep_helper(r, lb, ub, cg_state)
106
107
108-- Seq [] is the empty regexp which matches the empty string. 
109-- We just leave the current marker unchanged, with no new statements
110-- needed to compute it.
111--
112seq_helper([], cg_state) = cg_state
113-- Seq (r1:rs): generate code to match r1, the first subexpression,
114-- then to match the rest, concatenating the statements.
115seq_helper((r1:rs), cg_state) = seq_helper(rs, re2pablo_helper(r1, cg_state))
116
117-- Alt[r] has a single alternative r to match, just match
118-- it with no other possibility.
119alt_helper([r], cg_state) = re2pablo_helper(r, cg_state)
120
121-- For completeness, we define Alt[] as the regular expression that
122-- always fails (since no alternatives will match).
123alt_helper([], (env, stmts, last_marker)) = (newenv, stmts ++ [new_stmt], newsym)
124   where 
125     (newenv, newsym) = gensym(env, "always_fail_marker")
126     new_stmt = Assign (newsym, All(0))
127
128-- Alt (r1:rs): we have r1 and some other alternatives, the
129-- match succeeds if either r1 matches or any of the others
130-- does.
131alt_helper(r1:rs, (env, stmts, last_marker)) = (newenv, new_stmts, newsym)
132  where 
133   (e1, s1, alt1_marker) = re2pablo_helper(r1, (env, stmts, last_marker))
134   (e2, s2, alt2_marker) = alt_helper(rs, (e1, s1, last_marker))
135   (newenv, newsym) = gensym(e2, "alt_marker")
136   new_stmts = s2 ++ [Assign (newsym, Or(Var(alt1_marker), Var(alt2_marker)))] 
137
138-- For Kleene Star character class repetition, use MatchStar
139unbounded_rep_helper(CC c, 0, cg_state) = (newenv, stmts ++ [new_stmt], newsym)
140   where 
141     (env, stmts, last_marker) = cg_state
142     (newenv, newsym) = gensym(env, "marker")
143     new_stmt = Assign (newsym, MatchStar(Var(last_marker), CharClass(c))) 
144
145-- For general Kleene Star, we need a while loop.
146unbounded_rep_helper(r, 0, cg_state) = (newenv, stmts ++ while_init ++ [loop_stmt], while_accum)
147  where
148    (env, stmts, last_marker) = cg_state
149    (e1, while_test) = gensym(env, "while_test")
150    (e2, while_accum) = gensym(e1, "while_accum")
151    (newenv, loop_body_stmts, loop_result) = re2pablo_helper(r, (e2, [], while_test))
152    while_init = [Assign(while_test, Var(last_marker)), Assign(while_accum, Var(last_marker))]
153    repeat = [Assign(while_test, And(Var(loop_result), Not(Var(while_accum)))),
154              Assign(while_accum, Or(Var(while_accum), Var(loop_result)))]
155    loop_stmt = While(Var(while_test), loop_body_stmts ++ repeat)
156
157unbounded_rep_helper(r, lb, cg_state) = unbounded_rep_helper(r, lb-1, cg1_state)
158  where 
159   cg1_state = re2pablo_helper(r, cg_state)
160
161-- Now Bounded Repetition: use multiple copies
162   
163bounded_rep_helper(r, 0, 0, cg_state) = cg_state
164
165bounded_rep_helper(r, 0, ub, cg_state) = (newenv, new_stmts, newsym)
166  where 
167   (env, stmts, last_marker) = cg_state
168   (e1, s1, rep1_marker) = re2pablo_helper(r, (env, stmts, last_marker))
169   (e2, s2, rep2plus_marker) = bounded_rep_helper(r, 0, ub-1, (e1, s1, rep1_marker))
170   (newenv, newsym) = gensym(e2, "alt_marker")
171   new_stmts = s2 ++ [Assign (newsym, Or(Var(last_marker), Var(rep2plus_marker)))]
172
173bounded_rep_helper(r, lb, ub, cg_state) = bounded_rep_helper(r, lb-1, ub-1, cg1_state)
174  where 
175   cg1_state = re2pablo_helper(r, cg_state)
176
177---------------
178
179searchRE :: (RE, String) -> PabloVal
180searchRE(regexp, str) = evalPabloS(compiled, marker, str)
181  where (e, compiled, marker) = compile(regexp)
182
183searchRegExp :: (String, String) -> Maybe PabloVal
184searchRegExp(regexp, str) =
185  case parseRE(regexp) of
186    ParseSuccess r  -> Just (searchRE(r, str))
187    _ -> Nothing
188
Note: See TracBrowser for help on using the repository browser.