Changeset 5558


Ignore:
Timestamp:
Jul 9, 2017, 6:32:44 PM (3 months ago)
Author:
cameron
Message:

Add Unix lines mode (enabled by the 'd' mode flag) and support for 'Byte' character classes.

Location:
icGREP/icgrep-devel/icgrep
Files:
5 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/UCD/ucd_compiler.cpp

    r5285 r5558  
    523523inline void UCDCompiler::addTargets(PabloBuilder & entry, const NameMap & names) {
    524524    for (const auto t : names) {
     525        if (t.first->getType() == Name::Type::Byte) {
     526            continue;
     527        }
    525528        if (LLVM_LIKELY(isa<CC>(t.first->getDefinition()))) {
    526529            mTargetMap.emplace(cast<CC>(t.first->getDefinition()), t.second ? t.second : entry.createZeroes());
  • icGREP/icgrep-devel/icgrep/re/re_compiler.cpp

    r5556 r5558  
    252252        nextPos = AdvanceMarker(marker, MarkerPosition::FinalPostPositionUnit, pb);
    253253    }
    254     return makeMarker(MarkerPosition::FinalMatchUnit, pb.createAnd(markerVar(marker), pb.createAnd(mCCCompiler.compileCC(cc), mAny)));
     254    return makeMarker(MarkerPosition::FinalMatchUnit, pb.createAnd(markerVar(marker), pb.createAnd(mCCCompiler.compileCC(cc), mFinal)));
    255255}
    256256
  • icGREP/icgrep-devel/icgrep/re/re_name.h

    r5267 r5558  
    4646    friend Name * makeZeroWidth(const std::string & name, RE * zerowidth);
    4747    friend Name * makeName(CC * const cc);
     48    friend Name * makeByte(CC * const cc);
    4849    friend Name * makeName(const std::string &, Type);
    4950    friend Name * makeName(const std::string &, const std::string &, Type);
     
    161162}
    162163
    163 inline Name * makeCapture(const std::string & name, RE * captured) {
     164inline Name * makeByte(CC * const cc) {
     165    assert(cc->max_codepoint() <= 0xFF);
     166    const std::string name = cc->canonicalName(CC_type::ByteClass);
     167    return new Name(nullptr, 0, name.c_str(), name.length(), Name::Type::Byte, cc);
     168}
     169   
     170    inline Name * makeCapture(const std::string & name, RE * captured) {
    164171    return new Name(nullptr, 0, name.c_str(), name.length(), Name::Type::Capture, captured);
    165172}
  • icGREP/icgrep-devel/icgrep/re/re_parser.cpp

    r5554 r5558  
    156156            case '^':
    157157                ++mCursor;
     158                if ((fModeFlagSet & ModeFlagType::UNIX_LINES_MODE_FLAG) != 0) {
     159                    return makeNegativeLookBehindAssertion(makeByte(makeCC(makeCC(0, '\n'-1), makeCC('\n'+1, 0xFF))));
     160                }
    158161                return makeStart();
    159162            case '$':
    160163                ++mCursor;
     164                if ((fModeFlagSet & ModeFlagType::UNIX_LINES_MODE_FLAG) != 0) {
     165                    return makeLookAheadAssertion(makeCC('\n'));
     166                }
    161167                return makeEnd();
    162168            case '|': case ')':
     
    259265                        //case 's': modeBit = DOTALL_MODE_FLAG; break;
    260266                        case 'x': modeBit = IGNORE_SPACE_MODE_FLAG; break;
    261                         //case 'd': modeBit = UNIX_LINES_MODE_FLAG; break;
     267                        case 'd': modeBit = UNIX_LINES_MODE_FLAG; break;
    262268                        default: ParseFailure("Unsupported mode flag.");
    263269                    }
     
    412418    if (isSetEscapeChar(*mCursor)) {
    413419        return parseEscapedSet();
     420    }
     421    else if ((*mCursor == 'x') || (*mCursor == 'o') || (*mCursor == '0')) {
     422        codepoint_t cp = parse_escaped_codepoint();
     423        if ((cp >= 0x80) && (cp <= 0xFF)) {
     424            return makeByte(makeCC(cp));
     425        }
     426        else return createCC(cp);
    414427    }
    415428    else if (isdigit(*mCursor)) {
     
    771784    codepoint_t lastCodepointItem = 0;
    772785    bool havePendingOperation = false;
     786    bool possibleByteCodeEscape = false;  // set to true when \x, \o or \0 hex or octal escapes seen.
    773787    CharsetOperatorKind pendingOperationKind = intersectOp;
    774788    RE * pendingOperand = nullptr;
     
    831845                }
    832846                if (!cc->empty()) {
    833                     subexprs.push_back(mMemoizer.memoize(cc));
     847                    if (possibleByteCodeEscape && (cc->max_codepoint() <= 0xFF) && subexprs.empty() && !havePendingOperation) {
     848                        subexprs.push_back(makeByte(cc));
     849                    }
     850                    else {
     851                        subexprs.push_back(mMemoizer.memoize(cc));
     852                    }
    834853                }
    835854                RE * newOperand = makeAlt(subexprs.begin(), subexprs.end());
     
    893912                    ParseFailure("Range operator - has illegal left operand.");
    894913                }
    895                 insert_range(cc, lastCodepointItem, parse_codepoint());
     914                if (*mCursor == '\\') {
     915                    mCursor++;
     916                    if ((*mCursor == 'x') || (*mCursor == 'o') || (*mCursor == '0')) possibleByteCodeEscape = true;
     917                    insert_range(cc, lastCodepointItem, parse_escaped_codepoint());
     918                } else {
     919                    insert_range(cc, lastCodepointItem, parse_literal_codepoint());
     920                }
    896921                lastItemKind = RangeItem;
    897922                break;
     
    912937                }
    913938                else {
     939                    if ((*mCursor == 'x') || (*mCursor == 'o') || (*mCursor == '0')) possibleByteCodeEscape = true;
    914940                    lastCodepointItem = parse_escaped_codepoint();
    915941                    insert(cc, lastCodepointItem);
     
    927953}
    928954
    929 
    930 codepoint_t RE_Parser::parse_codepoint() {
    931     if (*mCursor == '\\') {
    932         mCursor++;
    933         return parse_escaped_codepoint();
    934     } else {
    935         return parse_literal_codepoint();
    936     }
    937 }
    938955
    939956// A backslash escape was found, and various special cases (back reference,
  • icGREP/icgrep-devel/icgrep/re/re_parser.h

    r5554 r5558  
    2525    MULTILINE_MODE_FLAG = 2,      // not currently implemented
    2626    DOTALL_MODE_FLAG = 4,         // not currently implemented
    27     IGNORE_SPACE_MODE_FLAG = 8,   // not currently implemented
    28     UNIX_LINES_MODE_FLAG = 16,    // not currently implemented
     27    IGNORE_SPACE_MODE_FLAG = 8,
     28    UNIX_LINES_MODE_FLAG = 16,
    2929    GRAPHEME_CLUSTER_MODE = 32
    3030};
Note: See TracChangeset for help on using the changeset viewer.