Changeset 5362 for icGREP/icgrep-devel


Ignore:
Timestamp:
Mar 13, 2017, 10:38:00 PM (2 years ago)
Author:
cameron
Message:

Adam's changes to add swizzling option to DeleteByPEXT kernel

Location:
icGREP/icgrep-devel/icgrep
Files:
4 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/kernels/alignedprint.cpp

    r5356 r5362  
    1212namespace kernel {
    1313
    14     void p2s_step(IDISA::IDISA_Builder * iBuilder, Value * p0, Value * p1, Value * hi_mask, unsigned shift, Value * &s1, Value * &s0) {
     14    void ap_p2s_step(IDISA::IDISA_Builder * iBuilder, Value * p0, Value * p1, Value * hi_mask, unsigned shift, Value * &s1, Value * &s0) {
    1515    Value * t0 = iBuilder->simd_if(1, hi_mask, p0, iBuilder->simd_srli(16, p1, shift));
    1616    Value * t1 = iBuilder->simd_if(1, hi_mask, iBuilder->simd_slli(16, p0, shift), p1);
     
    2424    Value * bit11115555[2];
    2525    Value * bit33337777[2];
    26     p2s_step(iBuilder, p[0], p[4], iBuilder->simd_himask(8), 4, bit00004444[1], bit00004444[0]);
    27     p2s_step(iBuilder, p[1], p[5], iBuilder->simd_himask(8), 4, bit11115555[1], bit11115555[0]);
    28     p2s_step(iBuilder, p[2], p[6], iBuilder->simd_himask(8), 4, bit22226666[1], bit22226666[0]);
    29     p2s_step(iBuilder, p[3], p[7], iBuilder->simd_himask(8), 4, bit33337777[1], bit33337777[0]);
     26    ap_p2s_step(iBuilder, p[0], p[4], iBuilder->simd_himask(8), 4, bit00004444[1], bit00004444[0]);
     27    ap_p2s_step(iBuilder, p[1], p[5], iBuilder->simd_himask(8), 4, bit11115555[1], bit11115555[0]);
     28    ap_p2s_step(iBuilder, p[2], p[6], iBuilder->simd_himask(8), 4, bit22226666[1], bit22226666[0]);
     29    ap_p2s_step(iBuilder, p[3], p[7], iBuilder->simd_himask(8), 4, bit33337777[1], bit33337777[0]);
    3030    Value * bit00224466[4];
    3131    Value * bit11335577[4];
    3232    for (unsigned j = 0; j<2; j++) {
    33         p2s_step(iBuilder, bit00004444[j], bit22226666[j],iBuilder->simd_himask(4), 2, bit00224466[2*j+1], bit00224466[2*j]);
    34         p2s_step(iBuilder, bit11115555[j], bit33337777[j],iBuilder->simd_himask(4), 2, bit11335577[2*j+1], bit11335577[2*j]);
     33        ap_p2s_step(iBuilder, bit00004444[j], bit22226666[j],iBuilder->simd_himask(4), 2, bit00224466[2*j+1], bit00224466[2*j]);
     34        ap_p2s_step(iBuilder, bit11115555[j], bit33337777[j],iBuilder->simd_himask(4), 2, bit11335577[2*j+1], bit11335577[2*j]);
    3535    }
    3636    for (unsigned j = 0; j<4; j++) {
    37         p2s_step(iBuilder, bit00224466[j], bit11335577[j], iBuilder->simd_himask(2), 1, s[2*j+1], s[2*j]);
     37        ap_p2s_step(iBuilder, bit00224466[j], bit11335577[j], iBuilder->simd_himask(2), 1, s[2*j+1], s[2*j]);
    3838    }
    3939}
  • icGREP/icgrep-devel/icgrep/kernels/deletion.cpp

    r5356 r5362  
    9292}
    9393
    94 
    95 
    9694const unsigned PEXT_width = 64;
    9795
     
    105103}
    106104
    107 inline Value * apply_PEXT_deletion(IDISA::IDISA_Builder * iBuilder, const std::vector<Value *> & masks, Value * strm) {   
     105// Apply PEXT deletion to a collection of blocks and swizzle the result.
     106// strms contains the blocks to process
     107inline std::vector<Value *> apply_PEXT_deletion_with_swizzle(IDISA::IDISA_Builder * iBuilder, const std::vector<Value *> & masks, std::vector<Value *> strms) {   
    108108    Value * PEXT_func = nullptr;
    109109    if (PEXT_width == 64) {
     
    112112        PEXT_func = Intrinsic::getDeclaration(iBuilder->getModule(), Intrinsic::x86_bmi_pext_32);
    113113    }
     114   
     115    std::vector<Value *> output;     
     116    for (unsigned i = 0; i < strms.size(); i++) {
     117        Value * v = iBuilder->fwCast(PEXT_width, strms[i]);
     118        output.push_back(Constant::getNullValue(v->getType()));
     119    }
     120
     121    // For each of the input streams
     122    for (unsigned j = 0; j < strms.size(); j++) {
     123        Value * v = iBuilder->fwCast(PEXT_width, strms[j]); // load stream j
     124        // Process the stream's block in PEXT_width chunks (PEXT operation can't do more than 64 bits at a time)
     125        for (unsigned i = 0; i < iBuilder->getBitBlockWidth()/PEXT_width; i++) {
     126            Value * field = iBuilder->CreateExtractElement(v, i); // Load from block j at index i (fw of j is PEXT_width)
     127            Value * compressed = iBuilder->CreateCall(PEXT_func, {field, masks[i]}); // Apply PEXT deletion to the block segment we just loaded
     128            /*
     129             We loaded from input at index i within stream j's block. We store the result in output within stream i's block at position j. This swizzles the output blocks. E.g.:
     130
     131             a b c d
     132             e f g h
     133             i j k l
     134             m n o p
     135
     136             Apply pext deletion at each position, then swizzle results:
     137
     138             a` e` i` m`
     139             b` f` j` n`
     140             c` g` k` o` 
     141             d` h` l` p`
     142            */     
     143            output[i] = iBuilder->CreateInsertElement(output[i], compressed, j);
     144        }
     145    }
     146   
     147    return output;
     148}
     149
     150inline Value * apply_PEXT_deletion(IDISA::IDISA_Builder * iBuilder, const std::vector<Value *> & masks, Value * strm) { 
     151    Value * PEXT_func = nullptr;
     152    if (PEXT_width == 64) {
     153        PEXT_func = Intrinsic::getDeclaration(iBuilder->getModule(), Intrinsic::x86_bmi_pext_64);
     154    } else if (PEXT_width == 32) {
     155        PEXT_func = Intrinsic::getDeclaration(iBuilder->getModule(), Intrinsic::x86_bmi_pext_32);
     156    }
     157       
    114158    Value * v = iBuilder->fwCast(PEXT_width, strm);
    115159    Value * output = Constant::getNullValue(v->getType());
     
    122166}
    123167
    124 // Apply deletion to a set of stream_count input streams to produce a set of output streams.
     168// Apply deletion to a set of stream_count input streams and produce a set of swizzled output streams.
    125169// Kernel inputs: stream_count data streams plus one del_mask stream
    126 // Outputs: the deleted streams, plus a partial sum popcount
     170// Outputs: swizzles containing the swizzled deleted streams, plus a partial sum popcount
    127171
    128172void DeleteByPEXTkernel::generateDoBlockMethod() {
    129173    Value * delMask = loadInputStreamBlock("delMaskSet", iBuilder->getInt32(0));
    130174    const auto masks = get_PEXT_masks(iBuilder, delMask);
    131     for (unsigned j = 0; j < mStreamCount; ++j) {
    132         Value * input = loadInputStreamBlock("inputStreamSet", iBuilder->getInt32(j));
    133         Value * output = apply_PEXT_deletion(iBuilder, masks, input);
    134         storeOutputStreamBlock("outputStreamSet", iBuilder->getInt32(j), output);
    135     }
    136     //Value * delCount = partial_sum_popcount(iBuilder, mDelCountFieldWidth, apply_PEXT_deletion(iBuilder, masks, iBuilder->simd_not(delMask)));
    137     Value * delCount = iBuilder->simd_popcount(mDelCountFieldWidth, iBuilder->simd_not(delMask));
    138     storeOutputStreamBlock("deletionCounts", iBuilder->getInt32(0), iBuilder->bitCast(delCount));
     175    generateProcessingLoop(masks, delMask);
    139176}
    140177
     
    145182    Value * delMask = iBuilder->CreateOr(EOF_del, loadInputStreamBlock("delMaskSet", iBuilder->getInt32(0)));
    146183    const auto masks = get_PEXT_masks(iBuilder, delMask);
     184    generateProcessingLoop(masks, delMask);
     185}
     186
     187void DeleteByPEXTkernel::generateProcessingLoop(const std::vector<Value *> & masks, Value * delMask) {
     188    if (mShouldSwizzle)   
     189        generatePEXTAndSwizzleLoop(masks);
     190    else
     191        generatePEXTLoop(masks);   
     192   
     193    //Value * delCount = partial_sum_popcount(iBuilder, mDelCountFieldWidth, apply_PEXT_deletion(iBuilder, masks, iBuilder->simd_not(delMask)));
     194    Value * delCount = iBuilder->simd_popcount(mDelCountFieldWidth, iBuilder->simd_not(delMask));
     195    storeOutputStreamBlock("deletionCounts", iBuilder->getInt32(0), iBuilder->bitCast(delCount));
     196}
     197
     198void DeleteByPEXTkernel::generatePEXTLoop(const std::vector<Value *> & masks) {
    147199    for (unsigned j = 0; j < mStreamCount; ++j) {
    148200        Value * input = loadInputStreamBlock("inputStreamSet", iBuilder->getInt32(j));
     
    150202        storeOutputStreamBlock("outputStreamSet", iBuilder->getInt32(j), output);
    151203    }
    152     //Value * delCount = partial_sum_popcount(iBuilder, mDelCountFieldWidth, apply_PEXT_deletion(iBuilder, masks, iBuilder->simd_not(delMask)));
    153     Value * delCount = iBuilder->simd_popcount(mDelCountFieldWidth, iBuilder->simd_not(delMask));
    154     storeOutputStreamBlock("deletionCounts", iBuilder->getInt32(0), iBuilder->bitCast(delCount));
    155 }
    156 
    157 DeleteByPEXTkernel::DeleteByPEXTkernel(IDISA::IDISA_Builder * iBuilder, unsigned fw, unsigned streamCount)
     204}
     205
     206void DeleteByPEXTkernel::generatePEXTAndSwizzleLoop(const std::vector<Value *> & masks) {
     207    // Group blocks together into input vectors of mSwizzleFactor blocks each. The number of groups is mStreamCount/mSwizzleFactor (e.g. for U8U16 16/4=4),
     208    // rounded up so that a partial final group is still processed: (mStreamCount + mSwizzleFactor - 1) / mSwizzleFactor
     209    for (unsigned j = 0; j < (mStreamCount + mSwizzleFactor - 1)/mSwizzleFactor; ++j) {
     210        std::vector<Value *> input;
     211        unsigned streamSelectionIndex = j * mSwizzleFactor;
     212        for (unsigned i = streamSelectionIndex; i < (streamSelectionIndex + mSwizzleFactor); ++i) {
     213                // Check if i >= mStreamCount. If it is, add null streams until we get mSwizzleFactor streams in the input vector
     214            if ( i >= mStreamCount)
     215                                input.push_back(iBuilder->allZeroes());
     216                        else
     217                        input.push_back(loadInputStreamBlock("inputStreamSet", iBuilder->getInt32(i)));
     218        }
     219        std::vector<Value *> output = apply_PEXT_deletion_with_swizzle(iBuilder, masks, input);
     220        for (unsigned i = 0; i < mSwizzleFactor; i++) {
     221             storeOutputStreamBlock(std::string(mOutputSwizzleNameBase) + std::to_string(j), iBuilder->getInt32(i), output[i]);
     222        }
     223    }
     224}
     225
     226DeleteByPEXTkernel::DeleteByPEXTkernel(IDISA::IDISA_Builder * iBuilder, unsigned fw, unsigned streamCount, bool shouldSwizzle)
    158227: BlockOrientedKernel(iBuilder, "PEXTdel",
    159228                      {Binding{iBuilder->getStreamSetTy(streamCount), "inputStreamSet"},
    160229                          Binding{iBuilder->getStreamSetTy(), "delMaskSet"}},
    161                       {Binding{iBuilder->getStreamSetTy(streamCount), "outputStreamSet"},
    162                           Binding{iBuilder->getStreamSetTy(), "deletionCounts"}},
    163                       {}, {}, {})
     230                      {}, {}, {}, {})
    164231, mDelCountFieldWidth(fw)
    165 , mStreamCount(streamCount) {
     232, mStreamCount(streamCount)
     233, mSwizzleFactor(iBuilder->getBitBlockWidth() / PEXT_width)
     234, mShouldSwizzle(shouldSwizzle)
     235{
     236    if(mShouldSwizzle) {       
     237        for (unsigned i = 0; i < (mStreamCount + mSwizzleFactor - 1)/mSwizzleFactor; i++) {
     238            mStreamSetOutputs.emplace_back(iBuilder->getStreamSetTy(mSwizzleFactor), std::string(mOutputSwizzleNameBase) + std::to_string(i));
     239        }
     240    } else {
     241        // No swizzling. Output results as single stream set
     242        mStreamSetOutputs.emplace_back(iBuilder->getStreamSetTy(mStreamCount), "outputStreamSet");
     243    }
     244    mStreamSetOutputs.emplace_back(iBuilder->getStreamSetTy(), "deletionCounts");
    166245}
    167246
  • icGREP/icgrep-devel/icgrep/kernels/deletion.h

    r5355 r5362  
    77
    88#include "kernel.h"
     9#include <llvm/IR/Value.h>
    910namespace IDISA { class IDISA_Builder; }
    1011
     
    4041class DeleteByPEXTkernel : public BlockOrientedKernel {
    4142public:
    42    
    43     DeleteByPEXTkernel(IDISA::IDISA_Builder * iBuilder, unsigned fw, unsigned streamCount);
     43
     44    DeleteByPEXTkernel(IDISA::IDISA_Builder * iBuilder, unsigned fw, unsigned streamCount, bool shouldSwizzle);
    4445   
    4546protected:
     
    4849   
    4950    void generateFinalBlockMethod(llvm::Value * remainingBytes) override;
     51
     52    void generatePEXTAndSwizzleLoop(const std::vector<llvm::Value *> & masks);
     53
     54    void generatePEXTLoop(const std::vector<llvm::Value *> & masks);
     55
     56    void generateProcessingLoop(const std::vector<llvm::Value *> & masks, llvm::Value * delMask);
    5057   
    5158private:
    5259    const unsigned mDelCountFieldWidth;
    5360    const unsigned mStreamCount;
     61    const unsigned mSwizzleFactor;
     62    const bool mShouldSwizzle;
     63    static constexpr const char* mOutputSwizzleNameBase = "outputStreamSet";
    5464};
    5565   
  • icGREP/icgrep-devel/icgrep/u8u16.cpp

    r5355 r5362  
    259259    const unsigned segmentSize = codegen::SegmentSize;
    260260    const unsigned bufferSegments = codegen::ThreadNum+1;
    261    
     261
    262262    assert (iBuilder);
    263263
     
    305305    u8u16_pablo(&u8u16k);
    306306    u8u16k.generateKernel({&BasisBits}, {&U8u16Bits, &DelMask, &ErrorMask});
    307    
    308    
     307
    309308    // Apply a deletion algorithm to discard all but the final position of the UTF-8
    310     // sequences for each UTF-16 code unit.
    311     CircularBuffer u16CompressedInFields(iBuilder, iBuilder->getStreamSetTy(16), segmentSize * bufferSegments);
    312     CircularBuffer DeletionCounts(iBuilder, iBuilder->getStreamSetTy(), segmentSize * bufferSegments);
    313 
    314     DeleteByPEXTkernel delK(iBuilder, 64, 16);
    315     delK.generateKernel({&U8u16Bits, &DelMask}, {&u16CompressedInFields, &DeletionCounts});
    316    
    317     // Swizzle for sequential compression within SIMD lanes.
     309    // sequences for each UTF-16 code unit. Swizzle the results.
    318310    CircularBuffer SwizzleFields0(iBuilder, iBuilder->getStreamSetTy(4), segmentSize * bufferSegments);
    319311    CircularBuffer SwizzleFields1(iBuilder, iBuilder->getStreamSetTy(4), segmentSize * bufferSegments);
    320312    CircularBuffer SwizzleFields2(iBuilder, iBuilder->getStreamSetTy(4), segmentSize * bufferSegments);
    321313    CircularBuffer SwizzleFields3(iBuilder, iBuilder->getStreamSetTy(4), segmentSize * bufferSegments);
    322     SwizzleGenerator swizzleK(iBuilder, 16, 4, 1);
    323     swizzleK.generateKernel({&u16CompressedInFields}, {&SwizzleFields0, &SwizzleFields1, &SwizzleFields2, &SwizzleFields3});
    324    
     314    CircularBuffer DeletionCounts(iBuilder, iBuilder->getStreamSetTy(), segmentSize * bufferSegments);
     315
     316    DeleteByPEXTkernel delK(iBuilder, 64, 16, true);
     317    delK.generateKernel({&U8u16Bits, &DelMask}, {&SwizzleFields0, &SwizzleFields1, &SwizzleFields2, &SwizzleFields3, &DeletionCounts});
     318;
    325319    //  Produce fully compressed swizzled UTF-16 bit streams
    326320    SwizzledCopybackBuffer u16Swizzle0(iBuilder, iBuilder->getStreamSetTy(4), segmentSize * (bufferSegments+2), 1);
     
    328322    SwizzledCopybackBuffer u16Swizzle2(iBuilder, iBuilder->getStreamSetTy(4), segmentSize * (bufferSegments+2), 1);
    329323    SwizzledCopybackBuffer u16Swizzle3(iBuilder, iBuilder->getStreamSetTy(4), segmentSize * (bufferSegments+2), 1);
    330     //
     324
    331325    SwizzledBitstreamCompressByCount compressK(iBuilder, 16);
    332326    compressK.generateKernel({&DeletionCounts, &SwizzleFields0, &SwizzleFields1, &SwizzleFields2, &SwizzleFields3},
    333327                             {&u16Swizzle0, &u16Swizzle1, &u16Swizzle2, &u16Swizzle3});
    334     
     328 
    335329    // Produce unswizzled UTF-16 bit streams
    336     //
    337330    CircularBuffer u16bits(iBuilder, iBuilder->getStreamSetTy(16), segmentSize * bufferSegments);
    338331    SwizzleGenerator unSwizzleK(iBuilder, 16, 1, 4);
     
    364357    DelMask.allocateBuffer();
    365358    ErrorMask.allocateBuffer();
    366     u16CompressedInFields.allocateBuffer();
    367359    DeletionCounts.allocateBuffer();
    368360    SwizzleFields0.allocateBuffer();
     
    375367    u16Swizzle3.allocateBuffer();
    376368    u16bits.allocateBuffer();
     369
    377370    if (mMapBuffering || memAlignBuffering) {
    378371        U16external.setEmptyBuffer(outputStream);
     
    384377
    385378    if (segmentPipelineParallel){
    386         generateSegmentParallelPipeline(iBuilder, {&mmapK, &s2pk, &u8u16k, &delK, &swizzleK, &compressK, &unSwizzleK, &p2sk, &outK});
    387     } else {
    388         generatePipelineLoop(iBuilder, {&mmapK, &s2pk, &u8u16k, &delK, &swizzleK, &compressK, &unSwizzleK, &p2sk, &outK});
     379        generateSegmentParallelPipeline(iBuilder, {&mmapK, &s2pk, &u8u16k, &delK, &compressK, &unSwizzleK, &p2sk, &outK});
     380    } else {
     381        generatePipelineLoop(iBuilder, {&mmapK, &s2pk, &u8u16k, &delK, &compressK, &unSwizzleK, &p2sk, &outK});
    389382    }
    390383
Note: See TracChangeset for help on using the changeset viewer.