Changeset 6250


Ignore:
Timestamp:
Dec 21, 2018, 3:17:30 PM (4 weeks ago)
Author:
cameron
Message:

Avoid calculating Unicode line breaks unless necessary

Location:
icGREP/icgrep-devel/icgrep
Files:
3 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/grep/grep_engine.cpp

    r6249 r6250  
    308308
    309309        StreamSet * const RequiredStreams = P->CreateStreamSet();
    310         StreamSet * const UnicodeLB = P->CreateStreamSet();
    311         StreamSet * const LineFeedStream = P->CreateStreamSet();
    312 
    313         P->CreateKernelCall<LineFeedKernelBuilder>(BasisBits, LineFeedStream);
    314         P->CreateKernelCall<RequiredStreams_UTF8>(BasisBits, LineFeedStream, RequiredStreams, UnicodeLB);
    315 
    316         if (mGrepRecordBreak == GrepRecordBreakKind::LF) {
    317             LineBreakStream = LineFeedStream;
    318         } else if (mGrepRecordBreak == GrepRecordBreakKind::Null) {
    319             LineBreakStream = P->CreateStreamSet();
    320             P->CreateKernelCall<CharacterClassKernelBuilder>( "Null", std::vector<re::CC *>{mBreakCC}, BasisBits, LineBreakStream);
     310        StreamSet * UnicodeLB = nullptr;
     311       
     312        if (mGrepRecordBreak == GrepRecordBreakKind::Unicode) {
     313            UnicodeLB = P->CreateStreamSet();
     314            StreamSet * const LineFeedStream = P->CreateStreamSet();
     315            P->CreateKernelCall<LineFeedKernelBuilder>(BasisBits, LineFeedStream);
     316            P->CreateKernelCall<RequiredStreams_UTF8>(BasisBits, LineFeedStream, RequiredStreams, UnicodeLB);
     317            LineBreakStream = UnicodeLB;
    321318        } else {
    322             LineBreakStream = UnicodeLB;
     319            P->CreateKernelCall<UTF8_nonFinal>(BasisBits, RequiredStreams);
     320            if (mGrepRecordBreak == GrepRecordBreakKind::LF) {
     321                P->CreateKernelCall<LineFeedKernelBuilder>(BasisBits, LineBreakStream);
     322            } else { // if (mGrepRecordBreak == GrepRecordBreakKind::Null) {
     323                P->CreateKernelCall<CharacterClassKernelBuilder>( "Null", std::vector<re::CC *>{mBreakCC}, BasisBits, LineBreakStream);
     324            }
    323325        }
    324326
     
    341343        for(unsigned i = 0; i < numOfREs; ++i) {
    342344            std::unique_ptr<GrepKernelOptions> options = make_unique<GrepKernelOptions>();
     345            options->addExternal("UTF8_nonfinal", RequiredStreams);
    343346            if (mGrepRecordBreak == GrepRecordBreakKind::Unicode) {
    344347                options->addExternal("UTF8_LB", LineBreakStream);
    345                 options->addExternal("UTF8_nonfinal", RequiredStreams);
    346348            }
    347349            std::set<re::Name *> UnicodeProperties;
  • icGREP/icgrep-devel/icgrep/kernels/grep_kernel.cpp

    r6248 r6250  
    6565    Var * const UTF8_LB = getOutputStreamVar("UTF8_LB");
    6666    pb.createAssign(pb.createExtract(UTF8_LB, pb.getInteger(0)), breakStream);
     67}
     68
     69void UTF8_nonFinal::generatePabloMethod() {
     70    PabloBuilder pb(getEntryScope());
     71    std::unique_ptr<cc::CC_Compiler> ccc;
     72    bool useDirectCC = getInput(0)->getType()->getArrayNumElements() == 1;
     73    if (useDirectCC) {
     74        ccc = make_unique<cc::Direct_CC_Compiler>(getEntryScope(), pb.createExtract(getInput(0), pb.getInteger(0)));
     75    } else {
     76        ccc = make_unique<cc::Parabix_CC_Compiler>(getEntryScope(), getInputStreamSet("source"));
     77    }
     78
     79    Zeroes * const ZEROES = pb.createZeroes();
     80    PabloAST * const u8pfx = ccc->compileCC(makeByte(0xC0, 0xFF));
     81
     82
     83    Var * const nonFinal = pb.createVar("nonFinal", u8pfx);
     84    Var * const u8invalid = pb.createVar("u8invalid", ZEROES);
     85    Var * const valid_pfx = pb.createVar("valid_pfx", u8pfx);
     86
     87    auto it = pb.createScope();
     88    pb.createIf(u8pfx, it);
     89    PabloAST * const u8pfx2 = ccc->compileCC(makeByte(0xC2, 0xDF), it);
     90    PabloAST * const u8pfx3 = ccc->compileCC(makeByte(0xE0, 0xEF), it);
     91    PabloAST * const u8pfx4 = ccc->compileCC(makeByte(0xF0, 0xF4), it);
     92    PabloAST * const u8suffix = ccc->compileCC("u8suffix", makeByte(0x80, 0xBF), it);
     93   
     94    //
     95    // Two-byte sequences
     96    Var * const anyscope = it.createVar("anyscope", ZEROES);
     97    auto it2 = it.createScope();
     98    it.createIf(u8pfx2, it2);
     99    it2.createAssign(anyscope, it2.createAdvance(u8pfx2, 1));
     100
     101
     102    //
     103    // Three-byte sequences   
     104    Var * const EF_invalid = it.createVar("EF_invalid", ZEROES);
     105    auto it3 = it.createScope();
     106    it.createIf(u8pfx3, it3);
     107    PabloAST * const u8scope32 = it3.createAdvance(u8pfx3, 1);
     108    it3.createAssign(nonFinal, it3.createOr(nonFinal, u8scope32));
     109    PabloAST * const u8scope33 = it3.createAdvance(u8pfx3, 2);
     110    PabloAST * const u8scope3X = it3.createOr(u8scope32, u8scope33);
     111    it3.createAssign(anyscope, it3.createOr(anyscope, u8scope3X));
     112    PabloAST * const E0_invalid = it3.createAnd(it3.createAdvance(ccc->compileCC(makeByte(0xE0), it3), 1), ccc->compileCC(makeByte(0x80, 0x9F), it3));
     113    PabloAST * const ED_invalid = it3.createAnd(it3.createAdvance(ccc->compileCC(makeByte(0xED), it3), 1), ccc->compileCC(makeByte(0xA0, 0xBF), it3));
     114    PabloAST * const EX_invalid = it3.createOr(E0_invalid, ED_invalid);
     115    it3.createAssign(EF_invalid, EX_invalid);
     116
     117    //
     118    // Four-byte sequences
     119    auto it4 = it.createScope();
     120    it.createIf(u8pfx4, it4);
     121    PabloAST * const u8scope42 = it4.createAdvance(u8pfx4, 1, "u8scope42");
     122    PabloAST * const u8scope43 = it4.createAdvance(u8scope42, 1, "u8scope43");
     123    PabloAST * const u8scope44 = it4.createAdvance(u8scope43, 1, "u8scope44");
     124    PabloAST * const u8scope4nonfinal = it4.createOr(u8scope42, u8scope43);
     125    it4.createAssign(nonFinal, it4.createOr(nonFinal, u8scope4nonfinal));
     126    PabloAST * const u8scope4X = it4.createOr(u8scope4nonfinal, u8scope44);
     127    it4.createAssign(anyscope, it4.createOr(anyscope, u8scope4X));
     128    PabloAST * const F0_invalid = it4.createAnd(it4.createAdvance(ccc->compileCC(makeByte(0xF0), it4), 1), ccc->compileCC(makeByte(0x80, 0x8F), it4));
     129    PabloAST * const F4_invalid = it4.createAnd(it4.createAdvance(ccc->compileCC(makeByte(0xF4), it4), 1), ccc->compileCC(makeByte(0x90, 0xBF), it4));
     130    PabloAST * const FX_invalid = it4.createOr(F0_invalid, F4_invalid);
     131    it4.createAssign(EF_invalid, it4.createOr(EF_invalid, FX_invalid));
     132   
     133    //
     134    // Invalid cases
     135    PabloAST * const legalpfx = it.createOr(it.createOr(u8pfx2, u8pfx3), u8pfx4);
     136    //  Any scope that does not have a suffix byte, and any suffix byte that is not in
     137    //  a scope is a mismatch, i.e., invalid UTF-8.
     138    PabloAST * const mismatch = it.createXor(anyscope, u8suffix);
     139    //
     140    PabloAST * const pfx_invalid = it.createXor(valid_pfx, legalpfx);
     141    it.createAssign(u8invalid, it.createOr(pfx_invalid, it.createOr(mismatch, EF_invalid)));
     142    PabloAST * const u8valid = it.createNot(u8invalid, "u8valid");
     143    //
     144    //
     145    it.createAssign(nonFinal, it.createAnd(nonFinal, u8valid));
     146    //pb.createAssign(nonFinal, pb.createOr(nonFinal, CRLF));
     147    //PabloAST * unterminatedLineAtEOF = pb.createAtEOF(pb.createAdvance(pb.createNot(LineBreak), 1), "unterminatedLineAtEOF");
     148   
     149    Var * const required = getOutputStreamVar("nonFinal");
     150    pb.createAssign(pb.createExtract(required, pb.getInteger(0)), nonFinal);
     151}
     152
     153UTF8_nonFinal::UTF8_nonFinal(const std::unique_ptr<kernel::KernelBuilder> & kb, StreamSet * Source, StreamSet * u8nonFinal)
     154: PabloKernel(kb, "UTF8_nonFinal" + std::to_string(Source->getNumElements()) + "x" + std::to_string(Source->getFieldWidth()),
     155// input
     156{Binding{"source", Source}},
     157// output
     158{Binding{"nonFinal", u8nonFinal}}) {
     159
    67160}
    68161
  • icGREP/icgrep-devel/icgrep/kernels/grep_kernel.h

    r6218 r6250  
    1515
    1616   
    17 class UnicodeNonFinalKernel : public pablo::PabloKernel {
     17class UTF8_nonFinal : public pablo::PabloKernel {
    1818public:
    19     UnicodeNonFinalKernel(const std::unique_ptr<kernel::KernelBuilder> & kb);
     19    UTF8_nonFinal(const std::unique_ptr<kernel::KernelBuilder> & kb, StreamSet * Source, StreamSet * u8nonFinal);
    2020    bool isCachable() const override { return true; }
    2121    bool hasSignature() const override { return false; }
Note: See TracChangeset for help on using the changeset viewer.