Changeset 3600 for proto/RE


Ignore:
Timestamp:
Dec 30, 2013, 6:52:00 AM (6 years ago)
Author:
cameron
Message:

Clean out old prototypes; restructure

Location:
proto/RE/Haskell
Files:
1 added
2 deleted
1 edited

Legend:

Unmodified
Added
Removed
  • proto/RE/Haskell/CanonicalRE.hs

    r3599 r3600  
    1 -- Module Canonical_RE is a canonical representation for regular
    2 -- expressions that uses a minimum number of alternative forms,
    3 -- for example, in comparison with module RE_Proto.
    4 -- Use of this canonical representation should simplify transformation and
    5 -- analysis algorithms. 
    6 -- Compare for example, RE_proto.minMatchLen with CanonicalRE.minMatchLen
     1-- Module Canonical_RE is a research compiler for prototyping
     2-- algorithms and ideas in regular expression processing using
     3-- parallel bit streams and MatchStar.
     4-- It defines a canonical representation for regular expressions
     5-- that uses a small number of alternative forms (e.g., combining
     6-- all forms of repetition (Kleene star, Kleene plus, ?, {m,n}) into
     7-- a single Rep structure. 
    78
     9-- Robert D. Cameron, 2013
    810
    9 module CanonicalRE (RE(..), canonicalize, minMatchLen, removeNullablePrefix) where
     11module CanonicalRE (RE(..), RepLimit(..), compile) where
    1012       
    11 import qualified RE_Proto as Proto
    1213import Data.Char
    1314import RunPablo
    1415
     16-- RE is the data type for regular expressions
     17
     18data RE = CC String | Start | End | Seq [RE] | Alt [RE] | Rep (RE, Int, RepLimit)
     19          deriving Show
    1520data RepLimit = UpperBound Int | Unbounded deriving Show
    16 upper_bound_dec :: RepLimit -> RepLimit
    17 upper_bound_dec (UpperBound j) = UpperBound (j - 1)
    18 upper_bound_dec Unbounded = Unbounded
    1921
    20 data RE = CC String | Seq [RE] | Alt [RE] | Rep RE Int RepLimit |
    21           Start | End  -- lookbehind/lookahead assertions for newline
    22           deriving Show
    23 -- Rep r i j represents UpperBound repetition, with lower bound i and
    24 -- upper bound j.   We use the convention that any negative value of
    25 -- j represents Unbounded repetition.
     22-- CC "abcd" represents the character class with the 4 characters a, b, c and d, i.e., [a-d].
     23-- Start represents the ^ metacharacter for start of line or string matching
     24-- End represents the $ metacharacter for end of line or string matching
     25-- Seq [CC "abcd", CC "e", CC "f", CC "ghkl"] represents the regexp [a-d]ef[ghkl]
     26-- Alt [Seq[CC "r", CC "e", CC "d"],  Seq[CC "b", CC "l", CC "u", CC "e"]] represents red|blue
     27-- Rep (CC "a", 0, UpperBound 1) represents a?
     28-- Rep (Seq[CC "a", CC "b", CC "c"], 0, Unbounded)  represents (abc)*, (without substring capture)
     29-- Rep (CC "abcedefghijklmnopqrstuvwxyz", 1, Unbounded) represents [a-z]+
     30-- Rep (CC "ab", 5, Unbounded) represents [ab]{5,}
     31-- Rep (CC "ha", 5, UpperBound 10) represents [ha]{5,10}
    2632
    27 canonicalize :: Proto.RE -> RE
    28 canonicalize (Proto.CC s) = CC s
    29 canonicalize (Proto.CCnot s) =  CC (filter (\c -> not(elem c s)) (map chr ([0..9]++[11..127])))
    30 canonicalize (Proto.Lit s) = Seq (map (\c -> CC [c]) s)
    31 canonicalize Proto.Start = Start
    32 canonicalize Proto.End = End
    33 canonicalize Proto.Any = CC (map chr ([0..9]++[11..127]))
    34 canonicalize (Proto.Join rs) = Seq (map canonicalize rs)
    35 canonicalize (Proto.Alt rs) = Alt (map canonicalize rs)
    36 canonicalize (Proto.Opt r) = Rep (canonicalize r) 0 (UpperBound 1)
    37 canonicalize (Proto.Kstar r) = Rep (canonicalize r) 0 Unbounded
    38 canonicalize (Proto.Kplus r) = Rep (canonicalize r) 1 Unbounded
    39 canonicalize (Proto.OrMore i r) = Rep (canonicalize r) i Unbounded
    40 canonicalize (Proto.Bounded i j r) = Rep (canonicalize r) i (UpperBound j)
    41 
    42 
    43 -- Write a minimal match length function that determines the minimum length
    44 -- string that can be matched by an RE, considering that Start and End match
    45 -- single \n characterProto.
    46 
    47 minMatchLen :: RE -> Int
    48 minMatchLen (CC s) = 1
    49 minMatchLen Start = 1
    50 minMatchLen End = 1
    51 minMatchLen (Seq []) = 0
    52 minMatchLen (Seq (r:rs)) = minMatchLen r + (minMatchLen (Seq rs))
    53 minMatchLen (Alt [r]) = minMatchLen r
    54 minMatchLen (Alt (r:rs)) = min (minMatchLen r) (minMatchLen (Alt rs))
    55 minMatchLen (Rep r i j) = i * (minMatchLen r)
    56 
    57 --- removeNullablePrefix takes a regular expression and returns
    58 --- the version of it that has been transformed to remove any nullable
    59 --- prefixeProto.  A prefix p is nullable if it can possibly match the empty
    60 --- string, i.e., minMatchLen(p) = 0.  For example, in the regexp
    61 --- [a-z]*-[0-9]+ the prefix [a-z]* is nullable.
    62 
    63 removeNullablePrefix :: RE -> RE
    64 removeNullablePrefix (CC s) = (CC s)
    65 removeNullablePrefix Start = Start
    66 removeNullablePrefix End = End
    67 removeNullablePrefix (Seq []) = Seq []
    68 removeNullablePrefix (Seq (r:rs))
    69    | minMatchLen(r) == 0  = removeNullablePrefix(Seq rs)
    70    | otherwise            = Seq ((removeNullablePrefix r):rs)
    71 removeNullablePrefix (Alt rs) = Alt (map removeNullablePrefix rs)
    72 removeNullablePrefix (Rep r 0 ub) = Seq []
    73 removeNullablePrefix (Rep r lb ub)
    74    | minMatchLen(r) == 0  = Seq []
    75    | otherwise            = Seq [removeNullablePrefix(r), Rep r (lb-1) (upper_bound_dec ub)]
    76 
     33-- Special cases:
     34-- Seq [] represents the empty regular expression, matching only the empty string.
     35-- Alt [] represents the empty set, i.e., matching nothing.
    7736
    7837--
     
    181140--
    182141-- For Kleene Star character class repetition, use MatchStar
    183 re2pablo_helper(Rep (CC c) 0 Unbounded, (env, stmts, last_marker)) = (newenv, stmts ++ [new_stmt], newsym)
     142re2pablo_helper(Rep (CC c, 0, Unbounded), (env, stmts, last_marker)) = (newenv, stmts ++ [new_stmt], newsym)
    184143   where
    185144     (newenv, newsym) = gensym(env, "marker")
     
    187146
    188147-- For general Kleene Star, we need a while loop.
    189 re2pablo_helper(Rep r 0 Unbounded, (env, stmts, last_marker)) = (newenv, stmts ++ while_init ++ [loop_stmt], while_accum)
     148re2pablo_helper(Rep (r, 0, Unbounded), (env, stmts, last_marker)) = (newenv, stmts ++ while_init ++ [loop_stmt], while_accum)
    190149  where
    191150    (e1, while_test) = gensym(env, "while_test")
     
    197156    loop_stmt = While(Var(while_test), loop_body_stmts ++ repeat)
    198157
    199 re2pablo_helper(Rep r lb Unbounded, cg_state) = re2pablo_helper(Rep r (lb-1) Unbounded, cg1_state)
     158re2pablo_helper(Rep (r, lb, Unbounded), cg_state) = re2pablo_helper(Rep (r, lb-1, Unbounded), cg1_state)
    200159  where
    201160   cg1_state = re2pablo_helper(r, cg_state)
     
    203162-- Now Bounded Repetition: use multiple copies
    204163   
    205 re2pablo_helper(Rep r 0 (UpperBound 0), cg_state) = cg_state
     164re2pablo_helper(Rep (r, 0, UpperBound 0), cg_state) = cg_state
    206165
    207 re2pablo_helper(Rep r 0 (UpperBound ub), (env, stmts, last_marker)) = (newenv, new_stmts, newsym)
     166re2pablo_helper(Rep (r, 0, UpperBound ub), (env, stmts, last_marker)) = (newenv, new_stmts, newsym)
    208167  where
    209168   (e1, s1, rep1_marker) = re2pablo_helper(r, (env, stmts, last_marker))
    210    (e2, s2, rep2plus_marker) = re2pablo_helper(Rep r 0 (UpperBound(ub-1)), (e1, s1, rep1_marker))
     169   (e2, s2, rep2plus_marker) = re2pablo_helper(Rep (r, 0, UpperBound(ub-1)), (e1, s1, rep1_marker))
    211170   (newenv, newsym) = gensym(e2, "alt_marker")
    212171   new_stmts = s2 ++ [Assign (newsym, Or(Var(last_marker), Var(rep2plus_marker)))]
    213172
    214 re2pablo_helper(Rep r lb (UpperBound ub), cg_state) = re2pablo_helper(Rep r (lb-1) (UpperBound(ub-1)), cg1_state)
     173re2pablo_helper(Rep (r, lb, UpperBound ub), cg_state) = re2pablo_helper(Rep (r, lb-1, UpperBound(ub-1)), cg1_state)
    215174  where
    216175   cg1_state = re2pablo_helper(r, cg_state)
Note: See TracChangeset for help on using the changeset viewer.