Ignore:
Timestamp:
Jun 9, 2016, 3:34:07 PM (3 years ago)
Author:
xuedongx
Message:

Add support for the UTF-16 representation of Unicode

File:
1 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/grep_engine.cpp

    r5037 r5045  
    6666
    6767
    68 
    69 
    70 bool GrepEngine::finalLineIsUnterminated(const char * const fileBuffer, const size_t fileSize) {
     68bool isUTF_16 = false;
     69
     70bool GrepEngine::finalLineIsUnterminated(const char * const fileBuffer, const size_t fileSize, bool UTF_16) {
    7171    if (fileSize == 0) return false;
    7272    unsigned char end_byte = static_cast<unsigned char>(fileBuffer[fileSize-1]);
     
    7777    // NEL
    7878    unsigned char penult_byte = static_cast<unsigned char>(fileBuffer[fileSize-2]);
    79     if ((end_byte == 0x85) && (penult_byte == 0xC2)) return false;
     79    if ((end_byte == 0x85) && (penult_byte == (UTF_16 ? 0x00 : 0xC2))) return false;
    8080    if (fileSize == 2) return true;
    8181    // LS and PS
    8282    if ((end_byte < 0xA8) || (end_byte > 0xA9)) return true;
    83     return (static_cast<unsigned char>(fileBuffer[fileSize-3]) != 0xE2) || (penult_byte != 0x80);
    84 }
    85 
    86 void GrepEngine::doGrep(const std::string & fileName, const int fileIdx, bool CountOnly, std::vector<uint64_t> & total_CountOnly) {
     83        if (!UTF_16) {
     84        return (static_cast<unsigned char>(fileBuffer[fileSize-3]) != 0xE2) || (penult_byte != 0x80);
     85        }
     86        else {// UTF_16
     87                return (penult_byte != 0x20);
     88        }
     89}
     90
     91void GrepEngine::doGrep(const std::string & fileName, const int fileIdx, bool CountOnly, std::vector<uint64_t> & total_CountOnly, bool UTF_16) {
    8792    path file(fileName);
    8893    if (exists(file)) {
     
    101106            char * fileBuffer = const_cast<char *>(source.data());
    102107            if (CountOnly) {
    103                 total_CountOnly[fileIdx] = mGrepFunction_CountOnly(fileBuffer, fileSize, fileIdx, finalLineIsUnterminated(fileBuffer, fileSize));
     108                total_CountOnly[fileIdx] = mGrepFunction_CountOnly(fileBuffer, fileSize, fileIdx, finalLineIsUnterminated(fileBuffer, fileSize, UTF_16));
    104109            } else {
    105                 mGrepFunction(fileBuffer, fileSize, fileIdx, finalLineIsUnterminated(fileBuffer, fileSize));
     110                mGrepFunction(fileBuffer, fileSize, fileIdx, finalLineIsUnterminated(fileBuffer, fileSize, UTF_16));
    106111            }
    107112            source.close();
     
    119124
    120125
    121 void GrepEngine::grepCodeGen(std::string moduleName, re::RE * re_ast, bool CountOnly, bool isNameExpression) {
    122     Module * M = new Module(moduleName, getGlobalContext());
     126void GrepEngine::grepCodeGen(std::string moduleName, re::RE * re_ast, bool CountOnly, bool UTF_16, bool isNameExpression) {
     127    isUTF_16 = UTF_16;
     128        Module * M = new Module(moduleName, getGlobalContext());
    123129   
    124130    IDISA::IDISA_Builder * idb = IDISA::GetIDISA_Builder(M);
     
    126132    kernel::PipelineBuilder pipelineBuilder(M, idb);
    127133
    128     Encoding encoding(Encoding::Type::UTF_8, 8);
    129     mIsNameExpression = isNameExpression;
     134        Encoding::Type type;
     135        type = UTF_16 ? Encoding::Type::UTF_16 : Encoding::Type::UTF_8;
     136        unsigned bits;
     137        bits = UTF_16 ? 16 : 8;
     138
     139        Encoding encoding(type, bits);
     140
     141        mIsNameExpression = isNameExpression;
    130142    re_ast = re::regular_expression_passes(encoding, re_ast);   
    131143    pablo::PabloFunction * function = re::re2pablo_compiler(encoding, re_ast);
    132144   
    133145
    134     pipelineBuilder.CreateKernels(function, isNameExpression);
    135 
    136     llvm::Function * grepIR = pipelineBuilder.ExecuteKernels(CountOnly);
     146    pipelineBuilder.CreateKernels(function, UTF_16, isNameExpression);
     147
     148    llvm::Function * grepIR = pipelineBuilder.ExecuteKernels(CountOnly, UTF_16);
    137149
    138150    mEngine = JIT_to_ExecutionEngine(M);
     
    165177
    166178    uint64_t finalLineUnterminated = 0;
    167     if(finalLineIsUnterminated(mFileBuffer, mFileSize))
     179    if(finalLineIsUnterminated(mFileBuffer, mFileSize, isUTF_16))
    168180        finalLineUnterminated = 1;   
    169181    mGrepFunction(mFileBuffer, mFileSize, 0, finalLineUnterminated);
     
    197209extern "C" {
    198210    void wrapped_report_match(uint64_t lineNum, uint64_t line_start, uint64_t line_end, const char * buffer, uint64_t filesize, int fileIdx) {
    199        
    200         int idx = fileIdx;
     211                int index = isUTF_16 ? 2 : 1;
     212                int idx = fileIdx;
    201213       
    202214        if (ShowFileNames) {
     
    207219        }
    208220       
    209         if ((buffer[line_start] == 0xA) && (line_start != line_end)) {
     221        if ((!isUTF_16 && buffer[line_start] == 0xA) && (line_start != line_end)) {
    210222            // The line "starts" on the LF of a CRLF.  Really the end of the last line.
    211223            line_start++;
    212224        }
     225        if (((isUTF_16 && buffer[line_start] == 0x0) && buffer[line_start + 1] == 0xA) && (line_start != line_end)) {
     226            // The line "starts" on the LF of a CRLF.  Really the end of the last line.
     227            line_start += 2;
     228        }
    213229        if (line_end == filesize) {
    214230            // The match position is at end-of-file.   We have a final unterminated line.
    215             resultStrs[idx].write(&buffer[line_start], line_end - line_start);
     231            resultStrs[idx].write(&buffer[line_start * index], (line_end - line_start) * index);
    216232            if (NormalizeLineBreaks) {
    217233                resultStrs[idx] << '\n';  // terminate it
     
    220236        }
    221237        unsigned char end_byte = (unsigned char)buffer[line_end];
     238                unsigned char penult_byte = (unsigned char)(buffer[line_end - 1]);
    222239        if (NormalizeLineBreaks) {
    223240            if (end_byte == 0x85) {
     
    226243            } else if (end_byte > 0xD) {
    227244                // Line terminated with PS or LS, on the third byte.  Back up 2.
    228                 line_end -= 2;
    229             }
    230             resultStrs[idx].write(&buffer[line_start], line_end - line_start);
     245                isUTF_16 ? line_end-- : line_end -= 2;
     246            }
     247            resultStrs[idx].write(&buffer[line_start * index], (line_end - line_start) * index);
    231248            resultStrs[idx] << '\n';
    232249        }
    233250        else{   
    234             if (end_byte == 0x0D) {
     251            if ((!isUTF_16 && end_byte == 0x0D) || (isUTF_16 && (end_byte == 0x0D && penult_byte == 0x0))) {
    235252                // Check for line_end on first byte of CRLF;  note that we don't
    236253                // want to access past the end of buffer.
    237                 if ((line_end + 1 < filesize) && (buffer[line_end + 1] == 0x0A)) {
     254                                if (line_end + 1 < filesize) {
     255                                        if (!isUTF_16 && buffer[line_end + 1] == 0x0A) {
    238256                    // Found CRLF; preserve both bytes.
    239                     line_end++;
    240                 }
    241             }
    242             resultStrs[idx].write(&buffer[line_start], line_end - line_start + 1);
     257                                                line_end++;;
     258                                        }
     259                                        if (isUTF_16 && buffer[line_end + 1] == 0x0 && buffer[line_end + 2] == 0x0A) {
     260                    // Found CRLF; preserve both bytes.
     261                                                line_end += 2;
     262                                        }
     263                                }
     264            }
     265            resultStrs[idx].write(&buffer[line_start * index], (line_end - line_start + 1) * index);
    243266        }
    244267    }
Note: See TracChangeset for help on using the changeset viewer.