source: proto/RE/Haskell/CanonicalRE.hs @ 3600

Last change on this file since 3600 was 3600, checked in by cameron, 5 years ago

Clean out old prototypes; restructure

File size: 7.7 KB
Line 
1-- Module Canonical_RE is a research compiler for prototyping
2-- algorithms and ideas in regular expression processing using
3-- parallel bit streams and MatchStar.
4-- It defines a canonical representation for regular expressions
5-- that uses a small number of alternative forms (e.g., combining
6-- all forms of repetition (Kleene star, Kleene plus, ?, {m,n}) into
7-- a single Rep structure. 
8
9-- Robert D. Cameron, 2013
10
11module CanonicalRE (RE(..), RepLimit(..), compile) where
12       
13import Data.Char
14import RunPablo
15
16-- RE is the data type for regular expressions
17
18data RE = CC String | Start | End | Seq [RE] | Alt [RE] | Rep (RE, Int, RepLimit)
19          deriving Show
20data RepLimit = UpperBound Int | Unbounded deriving Show
21
22-- CC "abcd" represents the character class with the 4 characters a, b, c and d, i.e., [a-d].
23-- Start represents the ^ metacharacter for start of line or string matching
24-- End represents the $ metacharacter for end of line or string matching
25-- Seq [CC "abcd", CC "e", CC "f", CC "ghkl"] represents the regexp [a-d]ef[ghkl]
26-- Alt [Seq[CC "r", CC "e", CC "d"],  Seq[CC "b", CC "l", CC "u", CC "e"]] represents red|blue
27-- Rep (CC "a", 0, UpperBound 1) represents a?
28-- Rep (Seq[CC "a", CC "b", CC "c"], 0, Unbounded)  represents (abc)*, (without substring capture)
29-- Rep (CC "abcedefghijklmnopqrstuvwxyz", 1, Unbounded) represents [a-z]+
30-- Rep (CC "ab", 5, Unbounded) represents [ab]{5,}
31-- Rep (CC "ha", 5, UpperBound 10) represents [ha]{5,10}
32
33-- Special cases:
34-- Seq [] represents the empty regular expression, matching only the empty string.
35-- Alt [] represents the empty set, i.e., matching nothing.
36
37--
38-- The Symbol Environment keeps track of the symbols generated
39-- during compilation, so that new unique symbols can be
40-- generated (gensym) as needed.   All symbols are formed from
41-- a string prefix and a numeric suffix.
42--
43-- For example, if the symbols "marker0", "marker1", "condition0"
44-- have been generated and no others have, the environment is
45-- [("marker", 1), ("condition", 0)]
46--
47type Env = [(String, Int)]
48
49gensym :: (Env, String) -> (Env, String)
50gensym ([], s) = ([(s, 0)], s ++ show 0)
51gensym ((prefix1, count1):env, s) 
52  | prefix1 == s = ((s, count1+1):env, s ++ show(count1+1))
53  | otherwise    = ((prefix1, count1):newenv, newsym)
54  where (newenv, newsym) = gensym(env, s)
55
56
57--
58-- To keep track of the current context for code generation during
59-- compilation, we define a CodeGenState that represents the output
60-- at each compilation step.   This state consists of an environment
61-- defining the symbols used so far, the sequence of generated PabloS
62-- statements generated so far and the name of the variable that
63-- holds the match result for the overall expression so far.
64--
65type CodeGenState = (Env, [PabloS], String)
66
67--
68-- In compiling to regular expression form, we use a helper function
69-- that takes a regular expression and a code generation state, and
70-- returns the code generation state.   The main compile routine
71-- initializes the process by setting an initial marker stream
72-- of all ones.
73--
74--
75compile :: RE -> CodeGenState
76re2pablo_helper :: (RE, CodeGenState) -> CodeGenState
77
78
79compile(re) = re2pablo_helper(re, (env, [Assign(marker, All(1))], marker))
80  where 
81    (env, marker) = gensym([], "start_marker")
82
83-- To match a character class, use bitwise and of the marker and
84-- character class, then advance 1. 
85
86re2pablo_helper(CC(c), (env, stmts, last_marker)) = (newenv, stmts ++ [new_stmt], newsym)
87   where 
88     (newenv, newsym) = gensym(env, "marker")
89     new_stmt = Assign (newsym, Advance(And(Var(last_marker), CharClass(c))))
90
91-- To match "^" we must be at the start of line, i.e., one past
92-- a newline character or at the beginning of file.  We use the
93-- trick of advancing the negated newline stream to determine these
94-- positions.
95--
96re2pablo_helper(Start, (env, stmts, last_marker)) = (newenv, stmts ++ [new_stmt], newsym)
97   where 
98     (newenv, newsym) = gensym(env, "start_of_line_marker")
99     new_stmt = Assign (newsym, And(Var(last_marker), Not(Advance(Not(CharClass("\n"))))))
100
101-- To match "$" we must have reached end of line.
102--
103re2pablo_helper(End, (env, stmts, last_marker)) = (newenv, stmts ++ [new_stmt], newsym)
104   where 
105     (newenv, newsym) = gensym(env, "end_of_line_marker")
106     new_stmt = Assign (newsym, And(Var(last_marker), CharClass("\n")))
107
108-- Seq [] is the empty regexp which matches the empty string. 
109-- We just leave the current marker unchanged, with no new statements
110-- needed to compute it.
111--
112re2pablo_helper(Seq [], cg_state) = cg_state
113
114-- Seq (r1:rs): generate code to match r1, the first subexpression,
115-- then to match the rest, concatenating the statements.
116re2pablo_helper(Seq (r1:rs), cg_state) = re2pablo_helper(Seq rs, re2pablo_helper(r1, cg_state))
117 
118-- Alt[r] has a single alternative r to match, just match
119-- it with no other possibility.
120re2pablo_helper(Alt [r], cg_state) = re2pablo_helper(r, cg_state)
121
122-- For completeness, we define Alt[] as the regular expression that
123-- always fails (since no alternatives will match).
124re2pablo_helper(Alt [], (env, stmts, last_marker)) = (newenv, stmts ++ [new_stmt], newsym)
125   where 
126     (newenv, newsym) = gensym(env, "always_fail_marker")
127     new_stmt = Assign (newsym, All(0))
128
129-- Alt (r1:rs): we have r1 and some other alternatives, the
130-- match succeeds if either r1 matches or any of the others
131-- does.
132re2pablo_helper(Alt (r1:rs), (env, stmts, last_marker)) = (newenv, new_stmts, newsym)
133  where 
134   (e1, s1, alt1_marker) = re2pablo_helper(r1, (env, stmts, last_marker))
135   (e2, s2, alt2_marker) = re2pablo_helper(Alt rs, (e1, s1, last_marker))
136   (newenv, newsym) = gensym(e2, "alt_marker")
137   new_stmts = s2 ++ [Assign (newsym, Or(Var(alt1_marker), Var(alt2_marker)))]
138
139-- Repetition Matching Rules
140--
141-- For Kleene Star character class repetition, use MatchStar
142re2pablo_helper(Rep (CC c, 0, Unbounded), (env, stmts, last_marker)) = (newenv, stmts ++ [new_stmt], newsym)
143   where 
144     (newenv, newsym) = gensym(env, "marker")
145     new_stmt = Assign (newsym, MatchStar(Var(last_marker), CharClass(c))) 
146
147-- For general Kleene Star, we need a while loop.
148re2pablo_helper(Rep (r, 0, Unbounded), (env, stmts, last_marker)) = (newenv, stmts ++ while_init ++ [loop_stmt], while_accum)
149  where
150    (e1, while_test) = gensym(env, "while_test")
151    (e2, while_accum) = gensym(e1, "while_accum")
152    (newenv, loop_body_stmts, loop_result) = re2pablo_helper(r, (e2, [], while_test))
153    while_init = [Assign(while_test, Var(last_marker)), Assign(while_accum, Var(last_marker))]
154    repeat = [Assign(while_test, And(Var(loop_result), Not(Var(while_accum)))),
155              Assign(while_accum, Or(Var(while_accum), Var(loop_result)))]
156    loop_stmt = While(Var(while_test), loop_body_stmts ++ repeat)
157
158re2pablo_helper(Rep (r, lb, Unbounded), cg_state) = re2pablo_helper(Rep (r, lb-1, Unbounded), cg1_state)
159  where 
160   cg1_state = re2pablo_helper(r, cg_state)
161
162-- Now Bounded Repetition: use multiple copies
163   
164re2pablo_helper(Rep (r, 0, UpperBound 0), cg_state) = cg_state
165
166re2pablo_helper(Rep (r, 0, UpperBound ub), (env, stmts, last_marker)) = (newenv, new_stmts, newsym)
167  where 
168   (e1, s1, rep1_marker) = re2pablo_helper(r, (env, stmts, last_marker))
169   (e2, s2, rep2plus_marker) = re2pablo_helper(Rep (r, 0, UpperBound(ub-1)), (e1, s1, rep1_marker))
170   (newenv, newsym) = gensym(e2, "alt_marker")
171   new_stmts = s2 ++ [Assign (newsym, Or(Var(last_marker), Var(rep2plus_marker)))]
172
173re2pablo_helper(Rep (r, lb, UpperBound ub), cg_state) = re2pablo_helper(Rep (r, lb-1, UpperBound(ub-1)), cg1_state)
174  where 
175   cg1_state = re2pablo_helper(r, cg_state)
176
177---------------
178
179searchRE :: (RE, String) -> PabloVal
180searchRE(regexp, str) = evalPabloS(compiled, marker, str)
181  where (e, compiled, marker) = compile(regexp)
182       
Note: See TracBrowser for help on using the repository browser.