Changeset 5770 for icGREP


Ignore:
Timestamp:
Dec 9, 2017, 4:37:51 PM (16 months ago)
Author:
cameron
Message:

Restructure to eliminate unnecessary dependencies on RegExpCompiler and UCDLIB

Location:
icGREP/icgrep-devel
Files:
11 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/UCD-scripts/UCD_properties.py

    r5751 r5770  
    294294};
    295295
    296 void caseInsensitiveInsertRange(UCD::UnicodeSet * const cc, const UCD::codepoint_t lo, const UCD::codepoint_t hi);
    297 
    298 inline void caseInsensitiveInsert(UCD::UnicodeSet * const cc, const UCD::codepoint_t cp) {
    299     caseInsensitiveInsertRange(cc, cp, cp);
    300 }
     296UCD::UnicodeSet * caseInsensitize(UCD::UnicodeSet * const cc);
     297
    301298"""
    302299
  • icGREP/icgrep-devel/icgrep/CMakeLists.txt

    r5765 r5770  
    9494add_library(CodeGen ${TOOLCHAIN_SRC} ${DRIVER_SRC} ${OBJECT_CACHE_SRC} ${KERNEL_SRC} ${IDISA_SRC})
    9595add_library(PabloADT ${PABLO_SRC})
    96 add_library(RegExpADT re/re_re.cpp re/re_cc.cpp re/re_rep.cpp re/re_diff.cpp re/re_intersect.cpp re/re_range.cpp re/re_assertion.cpp re/printer_re.cpp)
    97 add_library(RegExpCompiler re/casing.cpp re/to_utf8.cpp re/re_parser.cpp re/re_memoizer.cpp re/re_nullable.cpp re/re_simplifier.cpp re/re_star_normal.cpp re/re_minimizer.cpp re/re_local.cpp re/re_compiler.cpp re/re_analysis.cpp re/re_toolchain.cpp re/re_name_resolve.cpp re/re_name_gather.cpp re/re_collect_unicodesets.cpp re/re_multiplex.cpp re/re_parser_pcre.cpp re/re_parser_ere.cpp re/re_parser_bre.cpp re/re_parser_prosite.cpp re/parse_fixed_strings.cpp re/re_utility.cpp ${GREP_CORE_SRC})
    98 add_library(CCADT cc/cc_compiler.cpp utf8_encoder.cpp utf16_encoder.cpp UCD/CaseFolding.cpp cc/alphabet.cpp cc/multiplex_CCs.cpp)
    99 add_library(UCDlib UCD/unicode_set.cpp UCD/ucd_compiler.cpp UCD/PropertyObjects.cpp UCD/resolve_properties.cpp)
     96# CCADT is the core library for representing and compiling character classes
     97add_library(CCADT re/re_re.cpp re/re_cc.cpp cc/cc_compiler.cpp cc/alphabet.cpp cc/multiplex_CCs.cpp UCD/unicode_set.cpp)
     98# RegExpADT is the core library for representing, parsing and printing regular expressions
     99add_library(RegExpADT re/re_rep.cpp re/re_diff.cpp re/re_intersect.cpp re/re_range.cpp re/re_assertion.cpp re/printer_re.cpp re/re_parser_pcre.cpp re/re_parser_ere.cpp re/re_parser_bre.cpp re/re_parser_prosite.cpp re/parse_fixed_strings.cpp)
     100add_library(RegExpCompiler re/casing.cpp re/to_utf8.cpp re/re_parser.cpp re/re_memoizer.cpp re/re_nullable.cpp re/re_simplifier.cpp re/re_star_normal.cpp re/re_minimizer.cpp re/re_local.cpp re/re_compiler.cpp re/re_analysis.cpp re/re_toolchain.cpp re/re_name_resolve.cpp re/re_name_gather.cpp re/re_collect_unicodesets.cpp re/re_multiplex.cpp re/re_utility.cpp ${GREP_CORE_SRC})
     101add_library(UCDlib UCD/CaseFolding.cpp utf8_encoder.cpp utf16_encoder.cpp UCD/ucd_compiler.cpp UCD/PropertyObjects.cpp UCD/resolve_properties.cpp)
    100102
    101103# force the compiler to compile the object cache to ensure that the versioning information is up to date
     
    107109target_link_libraries (PabloADT CodeGen)
    108110target_link_libraries (CCADT PabloADT)
     111target_link_libraries (RegExpADT PabloADT CCADT)
    109112target_link_libraries (UCDlib RegExpCompiler RegExpADT PabloADT CCADT)
    110 target_link_libraries (RegExpADT PabloADT CCADT UCDlib)
    111 target_link_libraries (RegExpCompiler UCDlib RegExpADT)
     113target_link_libraries (RegExpCompiler UCDlib CCADT RegExpADT)
    112114
    113115add_executable(icgrep icgrep.cpp grep_interface.cpp grep_engine.cpp kernels/u8u32_kernel.cpp kernels/delmask_kernel.cpp kernels/cc_kernel.cpp kernels/cc_scan_kernel.cpp kernels/charclasses.cpp kernels/linebreak_kernel.cpp kernels/streams_merge.cpp kernels/grep_kernel.cpp kernels/until_n.cpp)
     
    122124
    123125target_link_libraries (icgrep UCDlib PabloADT RegExpCompiler CCADT CodeGen ${REQ_LLVM_LIBRARIES} ${Boost_LIBRARIES} ${CUDA_LIB})
    124 target_link_libraries (u8u16 UCDlib PabloADT RegExpCompiler CCADT CodeGen ${REQ_LLVM_LIBRARIES} ${Boost_LIBRARIES} ${CUDA_LIB})
    125 target_link_libraries (base64 UCDlib PabloADT RegExpCompiler CCADT CodeGen ${REQ_LLVM_LIBRARIES} ${Boost_LIBRARIES} ${CUDA_LIB})
    126 target_link_libraries (wc UCDlib PabloADT RegExpCompiler CCADT CodeGen ${REQ_LLVM_LIBRARIES} ${Boost_LIBRARIES} ${CUDA_LIB})
    127 target_link_libraries (editd UCDlib PabloADT RegExpCompiler CCADT CodeGen ${REQ_LLVM_LIBRARIES} ${Boost_LIBRARIES} ${CUDA_LIB})
     126target_link_libraries (u8u16 PabloADT CCADT CodeGen ${REQ_LLVM_LIBRARIES} ${Boost_LIBRARIES} ${CUDA_LIB})
     127target_link_libraries (base64 PabloADT CCADT CodeGen ${REQ_LLVM_LIBRARIES} ${Boost_LIBRARIES} ${CUDA_LIB})
     128target_link_libraries (wc PabloADT CCADT CodeGen ${REQ_LLVM_LIBRARIES} ${Boost_LIBRARIES} ${CUDA_LIB})
     129target_link_libraries (editd PabloADT CCADT CodeGen ${REQ_LLVM_LIBRARIES} ${Boost_LIBRARIES} ${CUDA_LIB})
    128130target_link_libraries (array-test PabloADT CodeGen ${REQ_LLVM_LIBRARIES} ${Boost_LIBRARIES} ${CUDA_LIB})
    129 target_link_libraries (lz4d PabloADT UCDlib RegExpCompiler CodeGen ${REQ_LLVM_LIBRARIES} ${Boost_LIBRARIES} ${CUDA_LIB})
     131target_link_libraries (lz4d PabloADT CCADT CodeGen ${REQ_LLVM_LIBRARIES} ${Boost_LIBRARIES} ${CUDA_LIB})
    130132target_link_libraries (core RegExpCompiler ${REQ_LLVM_LIBRARIES} ${Boost_LIBRARIES})
    131133
  • icGREP/icgrep-devel/icgrep/UCD/CaseFolding.cpp

    r5748 r5770  
    77
    88#include "CaseFolding.h"
     9#include <UCD/unicode_set.h>
    910#include <algorithm>
    1011
     
    7374    }
    7475}
     76inline codepoint_t lo_codepoint(const interval_t & i) {
     77    return std::get<0>(i);
     78}
     79
     80inline codepoint_t hi_codepoint(const interval_t & i) {
     81    return std::get<1>(i);
     82}
     83
     84UnicodeSet * caseInsensitize(UnicodeSet * const cc) {
     85    UnicodeSet * cci = new UnicodeSet();
     86    for (const interval_t i : *cc) {
     87        caseInsensitiveInsertRange(cci, lo_codepoint(i), hi_codepoint(i));
     88    }
     89    return cci;
     90}
    7591
    7692
  • icGREP/icgrep-devel/icgrep/UCD/CaseFolding.h

    r5751 r5770  
    2121};
    2222
    23 void caseInsensitiveInsertRange(UCD::UnicodeSet * const cc, const UCD::codepoint_t lo, const UCD::codepoint_t hi);
    24 
    25 inline void caseInsensitiveInsert(UCD::UnicodeSet * const cc, const UCD::codepoint_t cp) {
    26     caseInsensitiveInsertRange(cc, cp, cp);
    27 }
     23UCD::UnicodeSet * caseInsensitize(UCD::UnicodeSet * const cc);
     24
    2825
    2926const int foldTableSize = 246;
  • icGREP/icgrep-devel/icgrep/UCD/resolve_properties.cpp

    r5747 r5770  
    44 *  icgrep is a trademark of International Characters.
    55 */
     6#include <re/re_re.h>
    67#include "resolve_properties.h"
    78#include <re/re_alt.h>
     
    3132void UnicodePropertyExpressionError(std::string errmsg) {
    3233    llvm::report_fatal_error(errmsg);
    33 
    34 }
    35 
     34}
     35
     36#define Behind(x) makeLookBehindAssertion(x)
     37#define Ahead(x) makeLookAheadAssertion(x)
    3638
    3739void generateGraphemeClusterBoundaryRule(Name * const &property) {
    3840    // 3.1.1 Grapheme Cluster Boundary Rules
    39 #define Behind(x) makeLookBehindAssertion(x)
    40 #define Ahead(x) makeLookAheadAssertion(x)
    4141
    4242//    RE * GCB_Control = makeName("gcb", "cn", Name::Type::UnicodeProperty);
  • icGREP/icgrep-devel/icgrep/grep_engine.cpp

    r5769 r5770  
    2626#include <re/re_toolchain.h>
    2727#include <toolchain/toolchain.h>
    28 #include <re/re_name_resolve.h>   
     28#include <re/re_name_resolve.h>
    2929#include <re/re_collect_unicodesets.h>
    3030#include <re/re_multiplex.h>
     
    5050
    5151// Grep Engine construction and initialization.
    52    
     52
    5353GrepEngine::GrepEngine() :
    5454    mGrepDriver(nullptr),
     
    5858    mMoveMatchesToEOL(true),
    5959    mEngineThread(pthread_self()) {}
    60    
     60
    6161GrepEngine::~GrepEngine() {
    6262    delete mGrepDriver;
    6363}
    64    
     64
    6565QuietModeEngine::QuietModeEngine() : GrepEngine() {
    6666    mMoveMatchesToEOL = false;
     
    9292    inputFiles = filenames;
    9393}
    94    
    9594
    9695// Code Generation
     
    103102    const unsigned bufferSegments = codegen::BufferSegments * codegen::ThreadNum;
    104103    const unsigned encodingBits = 8;
    105    
     104
    106105    StreamSetBuffer * BasisBits = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(encodingBits, 1), segmentSize * bufferSegments);
    107106    kernel::Kernel * s2pk = mGrepDriver->addKernelInstance<kernel::S2PKernel>(idb);
    108107    mGrepDriver->makeKernelCall(s2pk, {ByteStream}, {BasisBits});
    109    
     108
    110109    StreamSetBuffer * LineBreakStream = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments);
    111110    kernel::Kernel * linebreakK = mGrepDriver->addKernelInstance<kernel::LineBreakKernelBuilder>(idb, encodingBits);
    112111    mGrepDriver->makeKernelCall(linebreakK, {BasisBits}, {LineBreakStream});
    113    
     112
    114113    kernel::Kernel * requiredStreamsK = mGrepDriver->addKernelInstance<kernel::RequiredStreams_UTF8>(idb);
    115114    StreamSetBuffer * RequiredStreams = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(4, 1), segmentSize * bufferSegments);
    116115    mGrepDriver->makeKernelCall(requiredStreamsK, {BasisBits}, {RequiredStreams});
    117    
     116
    118117    const auto n = REs.size();
    119    
    120118    std::vector<std::vector<re::CC *>> charclasses(n);
    121    
    122119    for (unsigned i = 0; i < n; i++) {
    123         REs[i] = resolveCaseInsensitiveMode(REs[i], false);
     120        REs[i] = resolveCaseInsensitiveMode(REs[i], grep::IgnoreCaseFlag);
    124121        REs[i] = re::resolveNames(REs[i]);
    125122        const auto UnicodeSets = re::collectUnicodeSets(REs[i]);
     
    129126        REs[i] = regular_expression_passes(REs[i]);
    130127  }
    131    
     128
    132129    std::vector<StreamSetBuffer *> MatchResultsBufs(n);
    133    
     130
    134131    for(unsigned i = 0; i < n; ++i){
    135132        const auto numOfCharacterClasses = charclasses[i].size();
     
    149146    }
    150147    StreamSetBuffer * Matches = MergedResults;
    151    
     148
    152149    if (mMoveMatchesToEOL) {
    153150        StreamSetBuffer * OriginalMatches = Matches;
     
    156153        mGrepDriver->makeKernelCall(matchedLinesK, {OriginalMatches, LineBreakStream}, {Matches});
    157154    }
    158    
     155
    159156    if (InvertMatchFlag) {
    160157        kernel::Kernel * invertK = mGrepDriver->addKernelInstance<kernel::InvertMatchesKernel>(idb);
     
    178175
    179176void GrepEngine::grepCodeGen(std::vector<re::RE *> REs) {
    180    
     177
    181178    assert (mGrepDriver == nullptr);
    182179    mGrepDriver = new ParabixDriver("engine");
    183180    auto & idb = mGrepDriver->getBuilder();
    184181    Module * M = idb->getModule();
    185    
     182
    186183    const auto segmentSize = codegen::SegmentSize;
    187184    const auto bufferSegments = codegen::BufferSegments * codegen::ThreadNum;
    188185
    189186    const unsigned encodingBits = 8;
    190    
     187
    191188    Function * mainFunc = cast<Function>(M->getOrInsertFunction("Main", idb->getInt64Ty(), idb->getInt32Ty(), nullptr));
    192189    mainFunc->setCallingConv(CallingConv::C);
    193190    idb->SetInsertPoint(BasicBlock::Create(M->getContext(), "entry", mainFunc, 0));
    194191    auto args = mainFunc->arg_begin();
    195    
     192
    196193    Value * const fileDescriptor = &*(args++);
    197194    fileDescriptor->setName("fileDescriptor");
    198    
     195
    199196    StreamSetBuffer * ByteStream = mGrepDriver->addBuffer<SourceBuffer>(idb, idb->getStreamSetTy(1, encodingBits));
    200197    kernel::Kernel * sourceK = mGrepDriver->addKernelInstance<kernel::FDSourceKernel>(idb, segmentSize * bufferSegments);
    201198    sourceK->setInitialArguments({fileDescriptor});
    202199    mGrepDriver->makeKernelCall(sourceK, {}, {ByteStream});
    203    
     200
    204201    StreamSetBuffer * LineBreakStream;
    205202    StreamSetBuffer * Matches;
    206203    std::tie(LineBreakStream, Matches) = grepPipeline(REs, ByteStream);
    207    
     204
    208205    kernel::Kernel * matchCountK = mGrepDriver->addKernelInstance<kernel::PopcountKernel>(idb);
    209206    mGrepDriver->makeKernelCall(matchCountK, {Matches}, {});
     
    279276    auto & idb = mGrepDriver->getBuilder();
    280277    Module * M = idb->getModule();
    281    
     278
    282279    const auto segmentSize = codegen::SegmentSize;
    283280    const auto bufferSegments = codegen::BufferSegments * codegen::ThreadNum;
    284281    const unsigned encodingBits = 8;
    285    
     282
    286283    Function * mainFunc = cast<Function>(M->getOrInsertFunction("Main", idb->getInt64Ty(), idb->getInt32Ty(), idb->getIntAddrTy(), nullptr));
    287284    mainFunc->setCallingConv(CallingConv::C);
    288285    idb->SetInsertPoint(BasicBlock::Create(M->getContext(), "entry", mainFunc, 0));
    289286    auto args = mainFunc->arg_begin();
    290    
     287
    291288    Value * const fileDescriptor = &*(args++);
    292289    fileDescriptor->setName("fileDescriptor");
    293290    Value * match_accumulator = &*(args++);
    294291    match_accumulator->setName("match_accumulator");
    295    
     292
    296293    StreamSetBuffer * ByteStream = mGrepDriver->addBuffer<SourceBuffer>(idb, idb->getStreamSetTy(1, encodingBits));
    297294    kernel::Kernel * sourceK = mGrepDriver->addKernelInstance<kernel::FDSourceKernel>(idb, segmentSize * bufferSegments);
    298295    sourceK->setInitialArguments({fileDescriptor});
    299296    mGrepDriver->makeKernelCall(sourceK, {}, {ByteStream});
    300    
     297
    301298    StreamSetBuffer * LineBreakStream;
    302299    StreamSetBuffer * Matches;
    303300    std::tie(LineBreakStream, Matches) = grepPipeline(REs, ByteStream);
    304    
     301
    305302    kernel::Kernel * scanMatchK = mGrepDriver->addKernelInstance<kernel::ScanMatchKernel>(idb);
    306303    scanMatchK->setInitialArguments({match_accumulator});
     
    308305    mGrepDriver->LinkFunction(*scanMatchK, "accumulate_match_wrapper", &accumulate_match_wrapper);
    309306    mGrepDriver->LinkFunction(*scanMatchK, "finalize_match_wrapper", &finalize_match_wrapper);
    310    
     307
    311308    mGrepDriver->generatePipelineIR();
    312309    mGrepDriver->deallocateBuffers();
     
    319316//  The doGrep methods apply a GrepEngine to a single file, processing the results
    320317//  differently based on the engine type.
    321    
     318
    322319uint64_t GrepEngine::doGrep(const std::string & fileName, const uint32_t fileIdx) {
    323320    typedef uint64_t (*GrepFunctionType)(int32_t fileDescriptor);
    324321    auto f = reinterpret_cast<GrepFunctionType>(mGrepDriver->getMain());
    325    
     322
    326323    int32_t fileDescriptor = openFile(fileName, mResultStrs[fileIdx].get());
    327324    if (fileDescriptor == -1) return 0;
    328    
     325
    329326    uint64_t grepResult = f(fileDescriptor);
    330327    close(fileDescriptor);
     
    347344    }
    348345}
    349    
     346
    350347uint64_t MatchOnlyEngine::doGrep(const std::string & fileName, const uint32_t fileIdx) {
    351348    uint64_t grepResult = GrepEngine::doGrep(fileName, fileIdx);
     
    359356    typedef uint64_t (*GrepFunctionType)(int32_t fileDescriptor, intptr_t accum_addr);
    360357    auto f = reinterpret_cast<GrepFunctionType>(mGrepDriver->getMain());
    361    
     358
    362359    int32_t fileDescriptor = openFile(fileName, mResultStrs[fileIdx].get());
    363360    if (fileDescriptor == -1) return 0;
     
    404401// The process of searching a group of files may use a sequential or a task
    405402// parallel approach.
    406    
     403
    407404void * DoGrepThreadFunction(void *args) {
    408405    return reinterpret_cast<GrepEngine *>(args)->DoGrepThreadMethod();
     
    412409    const unsigned numOfThreads = Threads; // <- convert the command line value into an integer to allow stack allocation
    413410    pthread_t threads[numOfThreads];
    414    
     411
    415412    for(unsigned long i = 1; i < numOfThreads; ++i) {
    416413        const int rc = pthread_create(&threads[i], nullptr, DoGrepThreadFunction, (void *)this);
     
    420417    }
    421418    // Main thread also does the work;
    422    
     419
    423420    DoGrepThreadMethod();
    424421    for(unsigned i = 1; i < numOfThreads; ++i) {
     
    480477
    481478}
    482 
  • icGREP/icgrep-devel/icgrep/re/casing.cpp

    r5768 r5770  
    11#include <re/casing.h>
    22#include <re/re_cc.h>
     3#include <UCD/unicode_set.h>
     4#include <UCD/CaseFolding.h>
    35#include <re/re_alt.h>             // for Alt, makeAlt
    46#include <re/re_any.h>             // for makeAny, Any
     
    2224RE * resolveCaseInsensitiveMode(RE * re, bool inCaseInsensitiveMode) {
    2325    if (isa<CC>(re)) {
    24         if (inCaseInsensitiveMode) return caseInsensitize(cast<CC>(re));
     26        if (inCaseInsensitiveMode) {
     27            UCD::UnicodeSet * cased = caseInsensitize(cast<CC>(re));
     28            return makeCC(std::move(*cased));
     29        }
    2530        else return re;
    2631    }
  • icGREP/icgrep-devel/icgrep/re/re_analysis.cpp

    r5736 r5770  
    1212#include <re/re_intersect.h>
    1313#include <re/re_assertion.h>
     14#include <re/re_group.h>
    1415#include <re/re_nullable.h>
    1516#include <re/printer_re.h>
     
    5253    } else if (isa<CC>(re)) {
    5354        return false;
     55    } else if (const Group * g = dyn_cast<Group>(re)) {
     56        return matchesEmptyString(g->getRE());
    5457    } else if (const Name * n = dyn_cast<Name>(re)) {
    5558        return matchesEmptyString(n->getDefinition());
     
    368371    } else if (isa<Start>(re) || isa<End>(re) || isa<Assertion>(re)) {
    369372        return true;
     373    } else if (const Group * g = dyn_cast<Group>(re)) {
     374        if ((g->getMode() == Group::Mode::GraphemeMode) && (g->getSense() == Group::Sense::On)) {
     375            return true;
     376        }
     377        else {
     378            return hasAssertion(g->getRE());
     379        }
    370380    }
    371381    else llvm_unreachable("Unknown RE type");
  • icGREP/icgrep-devel/icgrep/re/re_cc.cpp

    r5748 r5770  
    77#include "re_cc.h"
    88#include <llvm/Support/Compiler.h>
    9 #include <UCD/CaseFolding.h>
    109#include <sstream>
    1110
     
    3433}
    3534   
    36 CC * caseInsensitize(const CC * cc) {
    37     CC * cci = makeCC();
    38     for (const interval_t i : *cc) {
    39         caseInsensitiveInsertRange(cci, lo_codepoint(i), hi_codepoint(i));
    40     }
    41     return cci;
    42 }
    43 
    4435CC::CC()
    4536: RE(ClassTypeId::CC)
  • icGREP/icgrep-devel/icgrep/re/re_cc.h

    r5748 r5770  
    136136}
    137137
    138 CC * caseInsensitize(const CC * cc);
    139 
    140138}
    141139
  • icGREP/icgrep-devel/icgrep/re/re_parser.cpp

    r5769 r5770  
    856856                    }
    857857                }
    858                 if (fModeFlagSet & CASE_INSENSITIVE_MODE_FLAG) {
    859                     if (CC * cc1 = dyn_cast<CC>(newOperand)) {
    860                         newOperand = caseInsensitize(cc1);
    861                     }
    862                 }
    863858                return negated ? makeComplement(newOperand) : newOperand;
    864859            }
Note: See TracChangeset for help on using the changeset viewer.