Ignore:
Timestamp:
Jun 8, 2018, 8:51:15 AM (11 months ago)
Author:
cameron
Message:

u32u8.cpp initial check-in

File:
1 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/kernels/pdep_kernel.cpp

    r6055 r6071  
    11/*
    2  *  Copyright (c) 2017 International Characters.
     2 *  Copyright (c) 2018 International Characters.
    33 *  This software is licensed to the public under the Open Software License 3.0.
    44 */
     
    142142}
    143143   
    144 StreamExpandKernel::StreamExpandKernel(const std::unique_ptr<kernel::KernelBuilder> & kb, const unsigned fieldWidth, const unsigned streamCount)
    145 : MultiBlockKernel("streamExpand" + std::to_string(fieldWidth) + "_" + std::to_string(streamCount),
     144StreamExpandKernel::StreamExpandKernel(const std::unique_ptr<kernel::KernelBuilder> & kb, const unsigned fieldWidth, unsigned sourceStreamCount, unsigned selectedStreamBase, unsigned selectedStreamCount)
     145: MultiBlockKernel("streamExpand" + std::to_string(fieldWidth) + "_" + std::to_string(sourceStreamCount) + "_" + std::to_string(selectedStreamBase) + "_" + std::to_string(selectedStreamCount),
    146146                   {Binding{kb->getStreamSetTy(), "marker", FixedRate(), Principal()},
    147                        Binding{kb->getStreamSetTy(streamCount), "source", PopcountOf("marker")}},
    148                    {Binding{kb->getStreamSetTy(streamCount), "output", FixedRate()}},
     147                       Binding{kb->getStreamSetTy(sourceStreamCount), "source", PopcountOf("marker")}},
     148                   {Binding{kb->getStreamSetTy(selectedStreamCount), "output", FixedRate()}},
    149149                   {}, {}, {})
    150150, mFieldWidth(fieldWidth)
    151 , mStreamCount(streamCount) {
    152     for (unsigned i = 0; i < streamCount; i++) {
    153         addScalar(kb->getBitBlockType(), "pendingSourceBlock_" + std::to_string(i));
     151, mSelectedStreamBase(selectedStreamBase)
     152, mSelectedStreamCount(selectedStreamCount) {
     153    for (unsigned i = 0; i < mSelectedStreamCount; i++) {
     154        addScalar(kb->getBitBlockType(), "pendingSourceBlock_" + std::to_string(mSelectedStreamBase + i));
    154155    }
    155156}
     
    175176    Value * processedSourceItems = b->getProcessedItemCount("source");
    176177    Value * sourceOffset = b->CreateURem(processedSourceItems, bwConst);
    177    
    178     std::vector<Value *> pendingData(mStreamCount);
    179     for (unsigned i = 0; i < mStreamCount; i++) {
    180         pendingData[i] = b->getScalarField("pendingSourceBlock_" + std::to_string(i));
     178    std::vector<Value *> pendingData(mSelectedStreamCount);
     179    for (unsigned i = 0; i < mSelectedStreamCount; i++) {
     180        pendingData[i] = b->getScalarField("pendingSourceBlock_" + std::to_string(mSelectedStreamBase + i));
    181181    }
    182182   
     
    186186    PHINode * blockNoPhi = b->CreatePHI(b->getSizeTy(), 2);
    187187    PHINode * pendingItemsPhi = b->CreatePHI(b->getSizeTy(), 2);
    188     PHINode * pendingDataPhi[mStreamCount];
     188    PHINode * pendingDataPhi[mSelectedStreamCount];
    189189    blockNoPhi->addIncoming(ZERO, entry);
    190190    pendingItemsPhi->addIncoming(sourceOffset, entry);
    191     for (unsigned i = 0; i < mStreamCount; i++) {
     191    for (unsigned i = 0; i < mSelectedStreamCount; i++) {
    192192        pendingDataPhi[i] = b->CreatePHI(b->getBitBlockType(), 2);
    193193        pendingDataPhi[i]->addIncoming(pendingData[i], entry);
     
    205205    // B = b->simd_sll(fw, b->mvmd_dsll(fw, source, pending, field_offset_hi), shift_fwd);
    206206    // all_source_bits = simd_or(A, B);
    207     Value * pendingOffset = b->CreateURem(pendingBlockEnd, bwConst);
     207    Value * pendingOffset = b->CreateURem(pendingItemsPhi, bwConst);
    208208    Value * field_offset_lo =  b->CreateUDiv(pendingOffset, fwConst);
    209209    Value * bit_offset = b->simd_fill(fw, b->CreateURem(pendingOffset, fwConst));
    210    
    211210    // Carefully avoid a shift by the full field width (which gives a poison value).
    212211    // field_offset_lo + 1 unless the bit_offset is 0, in which case it is just field_offset_lo.
     
    214213    // fw - bit_offset, unless bit_offset is 0, in which case, the shift_fwd is 0.
    215214    Value * shift_fwd = b->CreateURem(b->CreateSub(fwSplat, bit_offset), fwSplat);
    216    
     215
    217216    // Once all source bits are assembled, they need to be distributed to the
    218217    // output fields in accord with the popcounts of the deposit mask fields.
     
    234233    Value * source_shift_lo = b->CreateAnd(partialSum, fw_sub1Splat);  // parallel URem
    235234    Value * source_shift_hi = b->CreateAnd(b->CreateSub(fwSplat, source_shift_lo), fw_sub1Splat);
    236    
     235
    237236    // Now load and process source streams.
    238     for (unsigned i = 0; i < mStreamCount; i++) {
    239         Value * source = b->loadInputStreamBlock("source", b->getInt32(i), srcBlockNo);
     237    for (unsigned i = 0; i < mSelectedStreamCount; i++) {
     238        Value * source = b->loadInputStreamBlock("source", b->getInt32(mSelectedStreamBase + i), srcBlockNo);
    240239        Value * A = b->simd_srlv(fw, b->mvmd_dsll(fw, source, pendingDataPhi[i], field_offset_lo), bit_offset);
    241240        Value * B = b->simd_sllv(fw, b->mvmd_dsll(fw, source, pendingDataPhi[i], field_offset_hi), shift_fwd);
    242241        Value * full_source_block = b->simd_or(A, B);
    243        
    244242        Value * C = b->simd_srlv(fw, b->mvmd_shuffle(fw, full_source_block, source_field_lo), source_shift_lo);
    245243        Value * D = b->simd_sllv(fw, b->mvmd_shuffle(fw, full_source_block, source_field_hi), source_shift_hi);
     
    262260    b->SetInsertPoint(expansionDone);
    263261    // Update kernel state.
    264     for (unsigned i = 0; i < mStreamCount; i++) {
    265         b->setScalarField("pendingSourceBlock_" + std::to_string(i), b->bitCast(pendingDataPhi[i]));
     262    for (unsigned i = 0; i < mSelectedStreamCount; i++) {
     263        b->setScalarField("pendingSourceBlock_" + std::to_string(mSelectedStreamBase + i), b->bitCast(pendingDataPhi[i]));
    266264    }
    267265}
     
    269267FieldDepositKernel::FieldDepositKernel(const std::unique_ptr<kernel::KernelBuilder> & kb, const unsigned fieldWidth, const unsigned streamCount)
    270268: MultiBlockKernel("FieldDeposit" + std::to_string(fieldWidth) + "_" + std::to_string(streamCount),
    271                    {Binding{kb->getStreamSetTy(), "depositMask"}, Binding{kb->getStreamSetTy(streamCount), "inputStreamSet"}},
     269                   {Binding{kb->getStreamSetTy(1), "depositMask"}, Binding{kb->getStreamSetTy(streamCount), "inputStreamSet"}},
    272270                   {Binding{kb->getStreamSetTy(streamCount), "outputStreamSet"}},
    273271                   {}, {}, {})
     
    298296}
    299297
    300 PDEPFieldDepositKernel::PDEPFieldDepositKernel(const std::unique_ptr<kernel::KernelBuilder> & kb, const unsigned fieldWidth, const unsigned streamCount)
    301 : MultiBlockKernel("PDEPFieldDeposit" + std::to_string(fieldWidth) + "_" + std::to_string(streamCount),
     298PDEPFieldDepositKernel::PDEPFieldDepositKernel(const std::unique_ptr<kernel::KernelBuilder> & kb, const unsigned fieldWidth, const unsigned streamCount, std::string suffix)
     299: MultiBlockKernel("PDEPFieldDeposit" + std::to_string(fieldWidth) + "_" + std::to_string(streamCount) + suffix,
    302300                   {Binding{kb->getStreamSetTy(), "depositMask"}, Binding{kb->getStreamSetTy(streamCount), "inputStreamSet"}},
    303301                   {Binding{kb->getStreamSetTy(streamCount), "outputStreamSet"}},
     
    305303, mPDEPWidth(fieldWidth)
    306304, mStreamCount(streamCount) {
    307     if ((fieldWidth != 32) && (fieldWidth != 64)) llvm::report_fatal_error("Unsupported PDEP width for PDEPFieldCompressKernel");
     305    if ((fieldWidth != 32) && (fieldWidth != 64)) llvm::report_fatal_error("Unsupported PDEP width for PDEPFieldDepositKernel");
    308306}
    309307
     
    327325    blockOffsetPhi->addIncoming(ZERO, entry);
    328326    std::vector<Value *> mask(fieldsPerBlock);
    329     Value * extractionMaskPtr = kb->getInputStreamBlockPtr("depositMask", ZERO, blockOffsetPhi);
    330     extractionMaskPtr = kb->CreatePointerCast(extractionMaskPtr, fieldPtrTy);
     327//  When operating on fields individually, we can use vector load/store with
     328//  extract/insert element operations, or we can use individual field loads
     329//  and stores.   Individual field operations require fewer total operations,
     330//  but more memory instructions.   It may be that vector load/extract is better,
     331//  while field store is better.   Vector insert then store creates long dependence
     332//  chains.
     333//
     334#define PREFER_FIELD_STORES_OVER_INSERT_ELEMENT
     335#ifdef PREFER_FIELD_LOADS_OVER_EXTRACT_ELEMENT
     336    Value * depositMaskPtr = kb->getInputStreamBlockPtr("depositMask", ZERO, blockOffsetPhi);
     337    depositMaskPtr = kb->CreatePointerCast(depositMaskPtr, fieldPtrTy);
    331338    for (unsigned i = 0; i < fieldsPerBlock; i++) {
    332         mask[i] = kb->CreateLoad(kb->CreateGEP(extractionMaskPtr, kb->getInt32(i)));
    333     }
     339        mask[i] = kb->CreateLoad(kb->CreateGEP(depositMaskPtr, kb->getInt32(i)));
     340    }
     341#else
     342    Value * depositMask = kb->fwCast(mPDEPWidth, kb->loadInputStreamBlock("depositMask", ZERO, blockOffsetPhi));
     343    for (unsigned i = 0; i < fieldsPerBlock; i++) {
     344        mask[i] = kb->CreateExtractElement(depositMask, kb->getInt32(i));
     345    }
     346#endif
    334347    for (unsigned j = 0; j < mStreamCount; ++j) {
     348#ifdef PREFER_FIELD_LOADS_OVER_EXTRACT_ELEMENT
    335349        Value * inputPtr = kb->getInputStreamBlockPtr("inputStreamSet", kb->getInt32(j), blockOffsetPhi);
    336350        inputPtr = kb->CreatePointerCast(inputPtr, fieldPtrTy);
     351#else
     352        Value * inputStrm = kb->fwCast(mPDEPWidth, kb->loadInputStreamBlock("inputStreamSet", kb->getInt32(j), blockOffsetPhi));
     353#endif
     354#ifdef PREFER_FIELD_STORES_OVER_INSERT_ELEMENT
    337355        Value * outputPtr = kb->getOutputStreamBlockPtr("outputStreamSet", kb->getInt32(j), blockOffsetPhi);
    338356        outputPtr = kb->CreatePointerCast(outputPtr, fieldPtrTy);
     357#else
     358        Value * outputStrm = kb->fwCast(mPDEPWidth, kb->allZeroes());
     359#endif
    339360        for (unsigned i = 0; i < fieldsPerBlock; i++) {
     361#ifdef PREFER_FIELD_LOADS_OVER_EXTRACT_ELEMENT
    340362            Value * field = kb->CreateLoad(kb->CreateGEP(inputPtr, kb->getInt32(i)));
     363#else
     364            Value * field = kb->CreateExtractElement(inputStrm, kb->getInt32(i));
     365#endif
    341366            Value * compressed = kb->CreateCall(PDEP_func, {field, mask[i]});
     367#ifdef PREFER_FIELD_STORES_OVER_INSERT_ELEMENT
    342368            kb->CreateStore(compressed, kb->CreateGEP(outputPtr, kb->getInt32(i)));
    343369        }
     370#else
     371            outputStrm = kb->CreateInsertElement(outputStrm, compressed, kb->getInt32(i));
     372        }
     373        kb->storeOutputStreamBlock("outputStreamSet", kb->getInt32(j), blockOffsetPhi, outputStrm);
     374#endif
    344375    }
    345376    Value * nextBlk = kb->CreateAdd(blockOffsetPhi, kb->getSize(1));
     
    354385        llvm::report_fatal_error("StreamDepositCompiler needs a non-zero bufferBlocks parameter (for now).");
    355386    }
    356     auto & iBuilder = mDriver.getBuilder();
    357     unsigned N = IDISA::getNumOfStreams(ssType);
    358     if (IDISA::getStreamFieldWidth(ssType) != 1) {
    359         llvm::report_fatal_error("StreamDepositCompiler only compresses bit streams (for now)");
    360     }
    361     parabix::StreamSetBuffer * expandedStreams = mDriver.addBuffer<parabix::StaticBuffer>(iBuilder, iBuilder->getStreamSetTy(N), mBufferBlocks);
    362     Kernel * streamK = mDriver.addKernelInstance<StreamExpandKernel>(iBuilder, mFieldWidth, N);
     387    auto & b = mDriver.getBuilder();
     388    unsigned N = mSelectedStreamCount;
     389    parabix::StreamSetBuffer * expandedStreams = mDriver.addBuffer<parabix::StaticBuffer>(b, b->getStreamSetTy(N), mBufferBlocks);
     390    Kernel * streamK = mDriver.addKernelInstance<StreamExpandKernel>(b, mFieldWidth, mSourceStreamCount, mSelectedStreamBase, N);
    363391    mDriver.makeKernelCall(streamK, {depositMask, inputs}, {expandedStreams});
    364392
    365393    Kernel * depositK = nullptr;
    366394    if (AVX2_available()) {
    367         depositK = mDriver.addKernelInstance<PDEPFieldDepositKernel>(iBuilder, mFieldWidth, N);
     395        depositK = mDriver.addKernelInstance<PDEPFieldDepositKernel>(b, mFieldWidth, N, std::to_string(mSelectedStreamBase));
    368396    } else {
    369         depositK = mDriver.addKernelInstance<FieldDepositKernel>(iBuilder, mFieldWidth, N);
     397        depositK = mDriver.addKernelInstance<FieldDepositKernel>(b, mFieldWidth, N);
    370398    }
    371399    mDriver.makeKernelCall(depositK, {depositMask, expandedStreams}, {outputs});
     
    373401
    374402}
     403
Note: See TracChangeset for help on using the changeset viewer.