Changeset 5894


Ignore:
Timestamp:
Mar 8, 2018, 1:56:35 PM (12 months ago)
Author:
cameron
Message:

Line break controls for Unicode/LF/Null - initial check-in

Location:
icGREP/icgrep-devel/icgrep
Files:
3 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/grep/grep_engine.cpp

    r5892 r5894  
    146146    mNextFileToPrint(0),
    147147    grepMatchFound(false),
     148    mGrepRecordBreak(GrepRecordBreakKind::Unicode),
    148149    mMoveMatchesToEOL(true),
    149150    mEngineThread(pthread_self()) {}
     
    170171    mFileSuffix = InitialTabFlag ? "\t:" : ":";
    171172    if (LineRegexpFlag) mMoveMatchesToEOL = false;
     173}
     174
     175   
     176void GrepEngine::setRecordBreak(GrepRecordBreakKind b) {
     177    mGrepRecordBreak = b;
    172178}
    173179
     
    204210    std::set<re::Name *> UnicodeProperties;
    205211   
     212    re::CC * breakCC = nullptr;
     213    std::string breakName;
     214    if (mGrepRecordBreak == GrepRecordBreakKind::Unicode) {
     215        breakCC = re::makeCC(re::makeCC(0x0A, 0x0D), re::makeCC(re::makeCC(0x85), re::makeCC(0x2028, 0x2029)));
     216        breakName = "UTF8_LB";
     217    } else if (mGrepRecordBreak == GrepRecordBreakKind::Null) {
     218        breakCC = re::makeByte(0);  // Null
     219        breakName = "NULL";
     220    } else {
     221        breakCC = re::makeByte(0x0A); // LF
     222        breakName = "LF";
     223    }
     224
    206225    for(unsigned i = 0; i < nREs; ++i) {
    207226        REs[i] = resolveModesAndExternalSymbols(REs[i]);
    208         REs[i] = excludeUnicodeLineBreak(REs[i]);
     227        REs[i] = re::exclude_CC(REs[i], breakCC);
     228        if (mGrepRecordBreak != GrepRecordBreakKind::Unicode) {
     229            REs[i] = resolveAnchors(REs[i], breakCC);
     230        }
    209231        re::gatherUnicodeProperties(REs[i], UnicodeProperties);
    210        //re::Name * unicodeLB = re::makeName("UTF8_LB", re::Name::Type::Unicode);
    211         //unicodeLB->setDefinition(re::makeCC(0x0A));
    212         //REs[i] = resolveAnchors(REs[i], unicodeLB);
    213232        REs[i] = regular_expression_passes(REs[i]);
    214233        hasGCB[i] = hasGraphemeClusterBoundary(REs[i]);
     
    216235    }
    217236   
    218     StreamSetBuffer * LineFeedStream = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
    219 
    220     #ifdef USE_DIRECT_LF_BUILDER
    221     kernel::Kernel * linefeedK = mGrepDriver->addKernelInstance<kernel::LineFeedKernelBuilder>(idb, Binding{idb->getStreamSetTy(1, 8), "byteStream", FixedRate(), Principal()});
    222     mGrepDriver->makeKernelCall(linefeedK, {ByteStream}, {LineFeedStream});
    223     #endif
    224 
    225237    StreamSetBuffer * BasisBits = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(encodingBits, 1), baseBufferSize);
    226238    kernel::Kernel * s2pk = nullptr;
     
    233245    mGrepDriver->makeKernelCall(s2pk, {ByteStream}, {BasisBits});
    234246
    235     #ifndef USE_DIRECT_LF_BUILDER
     247    StreamSetBuffer * LineBreakStream = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
     248    StreamSetBuffer * RequiredStreams = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
     249    StreamSetBuffer * UnicodeLB = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
     250
     251    StreamSetBuffer * LineFeedStream = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
    236252    kernel::Kernel * linefeedK = mGrepDriver->addKernelInstance<kernel::LineFeedKernelBuilder>(idb, Binding{idb->getStreamSetTy(8), "basis", FixedRate(), Principal()});
    237253    mGrepDriver->makeKernelCall(linefeedK, {BasisBits}, {LineFeedStream});
    238     #endif
    239 
    240     StreamSetBuffer * LineBreakStream = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
    241 
     254   
    242255    kernel::Kernel * requiredStreamsK = mGrepDriver->addKernelInstance<kernel::RequiredStreams_UTF8>(idb);
    243     StreamSetBuffer * RequiredStreams = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
    244     mGrepDriver->makeKernelCall(requiredStreamsK, {BasisBits, LineFeedStream}, {RequiredStreams, LineBreakStream});
    245 
     256    mGrepDriver->makeKernelCall(requiredStreamsK, {BasisBits, LineFeedStream}, {RequiredStreams, UnicodeLB});
     257
     258    if (mGrepRecordBreak == GrepRecordBreakKind::LF) {
     259        LineBreakStream = LineFeedStream;
     260    } else if (mGrepRecordBreak == GrepRecordBreakKind::Null) {
     261        kernel::Kernel * breakK = mGrepDriver->addKernelInstance<kernel::ParabixCharacterClassKernelBuilder>(idb, "Null", std::vector<re::CC *>{breakCC}, 8);
     262        mGrepDriver->makeKernelCall(breakK, {BasisBits}, {LineBreakStream});
     263    } else {
     264        LineBreakStream = UnicodeLB;
     265    }
    246266   
    247267    std::map<std::string, StreamSetBuffer *> propertyStream;
     
    264284    std::vector<StreamSetBuffer *> MatchResultsBufs(nREs);
    265285    for(unsigned i = 0; i < nREs; ++i) {
    266         REs[i] = resolveModesAndExternalSymbols(REs[i]);
    267         REs[i] = excludeUnicodeLineBreak(REs[i]);
    268         //re::Name * unicodeLB = re::makeName("UTF8_LB", re::Name::Type::Unicode);
    269         //unicodeLB->setDefinition(re::makeCC(0x0A));
    270         //REs[i] = resolveAnchors(REs[i], unicodeLB);
    271         REs[i] = regular_expression_passes(REs[i]);
    272286        std::vector<std::string> externalStreamNames = std::vector<std::string>{"UTF8_LB", "UTF8_nonfinal"};
    273287        std::vector<StreamSetBuffer *> icgrepInputSets = {BasisBits, LineBreakStream, RequiredStreams};
  • icGREP/icgrep-devel/icgrep/grep/grep_engine.h

    r5892 r5894  
    2121
    2222namespace grep {
    23     class MatchAccumulator {
    24     public:
    25         MatchAccumulator() {}
    26         virtual void accumulate_match(const size_t lineNum, char * line_start, char * line_end) = 0;
    27         virtual void finalize_match(char * buffer_end) {}  // default: no op
    28     };
    2923   
    30     void accumulate_match_wrapper(intptr_t accum_addr, const size_t lineNum, char * line_start, char * line_end);
    31    
    32     void finalize_match_wrapper(intptr_t accum_addr, char * buffer_end);
    33    
    34     void grepBuffer(re::RE * pattern, const char * buffer, size_t bufferLength, MatchAccumulator * accum);
     24enum class GrepRecordBreakKind {Null, LF, Unicode};
     25
     26class MatchAccumulator {
     27public:
     28    MatchAccumulator() {}
     29    virtual void accumulate_match(const size_t lineNum, char * line_start, char * line_end) = 0;
     30    virtual void finalize_match(char * buffer_end) {}  // default: no op
     31};
     32
     33void accumulate_match_wrapper(intptr_t accum_addr, const size_t lineNum, char * line_start, char * line_end);
     34
     35void finalize_match_wrapper(intptr_t accum_addr, char * buffer_end);
     36
     37void grepBuffer(re::RE * pattern, const char * buffer, size_t bufferLength, MatchAccumulator * accum);
    3538
    3639class GrepEngine {
     
    4144    virtual ~GrepEngine();
    4245   
     46    void setRecordBreak(GrepRecordBreakKind b);
    4347    void initFileResult(std::vector<std::string> & filenames);
    4448    virtual void grepCodeGen(std::vector<re::RE *> REs);
     
    6165    std::vector<FileStatus> mFileStatus;
    6266    bool grepMatchFound;
     67    GrepRecordBreakKind mGrepRecordBreak;
    6368
    6469    std::unique_ptr<cc::MultiplexedAlphabet> mpx;
  • icGREP/icgrep-devel/icgrep/icgrep.cpp

    r5892 r5894  
    3737static cl::list<std::string> inputFiles(cl::Positional, cl::desc("<regex> <input file ...>"), cl::OneOrMore);
    3838
     39static cl::opt<bool> UnixBreaks("Unix-line-breaks", cl::desc("Enable Unix line breaks"));
    3940static cl::opt<bool> ByteMode("enable-byte-mode", cl::desc("Process regular expressions in byte mode"));
    4041
     
    197198    }
    198199               
     200    if (UnixBreaks) {
     201        grepEngine->setRecordBreak(grep::GrepRecordBreakKind::LF);
     202    } else if (grep::NullDataFlag) {
     203        grepEngine->setRecordBreak(grep::GrepRecordBreakKind::Null);
     204    }
     205   
    199206    grepEngine->grepCodeGen(REs);
    200207
Note: See TracChangeset for help on using the changeset viewer.