Ignore:
Timestamp:
Apr 24, 2018, 2:57:34 PM (15 months ago)
Author:
nmedfort
Message:

Restructured MultiBlock? kernel. Removal of Swizzled buffers. Inclusion of PopCount? rates / non-linear access. Modifications to several kernels to better align them with the kernel and pipeline changes.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/kernels/pdep_kernel.cpp

    r5870 r5985  
    1616// input stream sets
    1717{Binding{b->getStreamSetTy(), "marker", FixedRate(), Principal()},
    18 Binding{b->getStreamSetTy(swizzleFactor), "source", FixedRate(), Deferred()}},
     18Binding{b->getStreamSetTy(swizzleFactor), "source", PopcountOf("marker"), BlockSize(b->getBitBlockWidth() / swizzleFactor) }},
    1919// output stream set
    20 {Binding{b->getStreamSetTy(swizzleFactor), "output"}},
    21 {}, {},
    22 // internal scalars
    23 {Binding{b->getBitBlockType(), "buffer"},
    24 Binding{b->getSizeTy(), "buffered"},
    25 Binding{b->getSizeTy(), "sourceOffset"}})
     20{Binding{b->getStreamSetTy(swizzleFactor), "output", FixedRate(), BlockSize(b->getBitBlockWidth() / swizzleFactor)}},
     21{}, {}, {})
    2622, mSwizzleFactor(swizzleFactor) {
    2723
     
    4541    }
    4642
    47     // We store an internal source offset here because this kernel processes items in an unusual way.
    48     // The pipeline and multiblock assume that if we report we're on the i-th bit of a stream we have
    49     // fully processed all of the bits up to the i-th position.
     43    Constant * const ZERO = b->getSize(0);
     44    Value * const sourceItemCount = b->getProcessedItemCount("source");
    5045
    51     //                                             v
    52     //                |XXXXXXXXXXXX XXXXXXXXXXXX XX                       |
    53     //                |XXXXXXXXXXXX XXXXXXXXXXXX XX                       |
    54     //                |XXXXXXXXXXXX XXXXXXXXXXXX XX                       |
    55     //                |XXXXXXXXXXXX XXXXXXXXXXXX XX                       |
    56 
    57     // However, this kernel divides the stream into K elements and fully consumes a single stream of
    58     // the stream set before consuming the next one. So the same i-th position above is actually:
    59 
    60     //                |XXXXXXXXXXXX|XXXXXXXXXXXX|XXXXXXXXXXXX|XXXXXXXXXXXX|
    61     //                |XXXXXXXXXXXX|XXXXXXXXXXXX|XXXXXXXXXXXX|XXXXXXXXXXXX|
    62     //                |XX          |XX          |XX          |XX          |
    63     //                |            |            |            |            |
    64 
    65     // In the future, we may want the pipeline and multiblock to understand this style of processing
    66     // but for now, we hide it by delaying writing the actual processed offset until we've fully
    67     // processed the entire block.
    68 
    69     Value * const initialBuffer = b->getScalarField("buffer");
    70     Value * const initialBufferSize = b->getScalarField("buffered");
    71     Value * const initialSourceOffset = b->getScalarField("sourceOffset");
     46    Value * const initialSourceOffset = b->CreateURem(sourceItemCount, BLOCK_WIDTH);
    7247    b->CreateBr(processBlock);
    7348
    7449    b->SetInsertPoint(processBlock);
    7550    PHINode * const strideIndex = b->CreatePHI(b->getSizeTy(), 2);
    76     strideIndex->addIncoming(b->getSize(0), entry);
    77     PHINode * const bufferPhi = b->CreatePHI(initialBuffer->getType(), 2);
    78     bufferPhi->addIncoming(initialBuffer, entry);
     51    strideIndex->addIncoming(ZERO, entry);
     52    PHINode * const bufferPhi = b->CreatePHI(b->getBitBlockType(), 2);
     53    bufferPhi->addIncoming(Constant::getNullValue(b->getBitBlockType()), entry);
    7954    PHINode * const sourceOffsetPhi = b->CreatePHI(b->getSizeTy(), 2);
    8055    sourceOffsetPhi->addIncoming(initialSourceOffset, entry);
    8156    PHINode * const bufferSizePhi = b->CreatePHI(b->getSizeTy(), 2);
    82     bufferSizePhi->addIncoming(initialBufferSize, entry);
     57    bufferSizePhi->addIncoming(ZERO, entry);
    8358
    8459    // Extract the values we will use in the main processing loop
    85     Value * const markerStream = b->getInputStreamBlockPtr("marker", b->getInt32(0), strideIndex);
    86     Value * const selectors = b->fwCast(pdepWidth, b->CreateBlockAlignedLoad(markerStream));
     60    Value * const markerStream = b->getInputStreamBlockPtr("marker", ZERO, strideIndex);
     61    Value * const markerValue = b->CreateBlockAlignedLoad(markerStream);
     62    Value * const selectors = b->fwCast(pdepWidth, markerValue);
    8763    Value * const numOfSelectors = b->simd_popcount(pdepWidth, selectors);
    88 
    89     // If we run out of source items here, it is a failure of the MultiBlockKernel and/or PipelineGenerator
    90     if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableAsserts))) {
    91         Value * requiredSourceItems = b->CreateExtractElement(numOfSelectors, b->getInt64(0));
    92         for (unsigned i = 1; i < mSwizzleFactor; i++) {
    93             requiredSourceItems = b->CreateAdd(requiredSourceItems, b->CreateExtractElement(numOfSelectors, b->getInt64(i)));
    94         }
    95         Value * const availableSourceItems = b->getAvailableItemCount("source");
    96         Value * const unreadSourceItems = b->CreateSub(availableSourceItems, sourceOffsetPhi);
    97         Value * const hasSufficientSourceItems = b->CreateICmpULE(requiredSourceItems, unreadSourceItems);
    98         b->CreateAssert(hasSufficientSourceItems, getName() + " has insufficient source items for the given marker stream");
    99     }
    10064
    10165    // For each element of the marker block
     
    10670
    10771        // How many bits will we deposit?
    108         Value * const required = b->CreateExtractElement(numOfSelectors, b->getInt32(i));
     72        Value * const required = b->CreateExtractElement(numOfSelectors, b->getSize(i));
    10973
    11074        // Aggressively enqueue any additional bits
     
    11478
    11579        b->SetInsertPoint(enqueueBits);
    116         PHINode * const bufferSize2 = b->CreatePHI(bufferSize->getType(), 2);
    117         bufferSize2->addIncoming(bufferSize, entry);
    118         PHINode * const sourceOffset2 = b->CreatePHI(sourceOffset->getType(), 2);
    119         sourceOffset2->addIncoming(sourceOffset, entry);
    120         PHINode * const buffer2 = b->CreatePHI(buffer->getType(), 2);
    121         buffer2->addIncoming(buffer, entry);
     80        PHINode * const updatedBufferSize = b->CreatePHI(bufferSize->getType(), 2);
     81        updatedBufferSize->addIncoming(bufferSize, entry);
     82        PHINode * const updatedSourceOffset = b->CreatePHI(sourceOffset->getType(), 2);
     83        updatedSourceOffset->addIncoming(sourceOffset, entry);
     84        PHINode * const updatedBuffer = b->CreatePHI(buffer->getType(), 2);
     85        updatedBuffer->addIncoming(buffer, entry);
    12286
    123         // Calculate the block and swizzle index of the "current" swizzle
    124         Value * const block_index = b->CreateUDiv(sourceOffset2, BLOCK_WIDTH);
    125         Value * const stream_index = b->CreateUDiv(b->CreateURem(sourceOffset2, BLOCK_WIDTH), PDEP_WIDTH);
    126         Value * const ptr = b->getInputStreamBlockPtr("source", stream_index, block_index);
    127         Value * const swizzle = b->CreateBlockAlignedLoad(ptr);
    128         Value * const swizzleOffset = b->CreateURem(sourceOffset2, PDEP_WIDTH);
     87        // Calculate the block and swizzle index of the current swizzle row
     88        Value * const blockOffset = b->CreateUDiv(updatedSourceOffset, BLOCK_WIDTH);
     89        Value * const swizzleIndex = b->CreateUDiv(b->CreateURem(updatedSourceOffset, BLOCK_WIDTH), PDEP_WIDTH);
     90        Value * const swizzle = b->CreateBlockAlignedLoad(b->getInputStreamBlockPtr("source", swizzleIndex, blockOffset));
     91        Value * const swizzleOffset = b->CreateURem(updatedSourceOffset, PDEP_WIDTH);
    12992
    13093        // Shift the swizzle to the right to clear off any used bits ...
     
    13396
    13497        // ... then to the left to align the bits with the buffer and combine them.
    135         Value * const bufferShift = b->simd_fill(pdepWidth, bufferSize2);
     98        Value * const bufferShift = b->simd_fill(pdepWidth, updatedBufferSize);
    13699        Value * const pendingBits = b->CreateShl(unreadBits, bufferShift);
    137         buffer = b->CreateOr(buffer, pendingBits);
    138         buffer2->addIncoming(buffer, enqueueBits);
    139100
    140         // Update the buffer size by the number of bits we have actually enqueued
    141         Value * const maxBufferSize = b->CreateAdd(b->CreateSub(PDEP_WIDTH, swizzleOffset), bufferSize2);
     101        buffer = b->CreateOr(updatedBuffer, pendingBits);
     102        updatedBuffer->addIncoming(buffer, enqueueBits);
     103
     104        // Update the buffer size with the number of bits we have actually enqueued
     105        Value * const maxBufferSize = b->CreateAdd(b->CreateSub(PDEP_WIDTH, swizzleOffset), updatedBufferSize);
    142106        bufferSize = b->CreateUMin(maxBufferSize, PDEP_WIDTH);
     107        updatedBufferSize->addIncoming(bufferSize, enqueueBits);
     108
    143109        // ... and increment the source offset by the number we actually inserted
    144         sourceOffset = b->CreateAdd(sourceOffset2, b->CreateSub(bufferSize, bufferSize2));
    145         bufferSize2->addIncoming(bufferSize, enqueueBits);
    146         sourceOffset2->addIncoming(sourceOffset, enqueueBits);
     110        Value * const inserted = b->CreateSub(bufferSize, updatedBufferSize);
     111        sourceOffset = b->CreateAdd(updatedSourceOffset, inserted);
     112        updatedSourceOffset->addIncoming(sourceOffset, enqueueBits);
     113
     114        // INVESTIGATE: we can branch at most once here. I'm not sure whether the potential
     115        // branch misprediction is better or worse than always filling from two swizzles to
     116        // ensure that we have enough bits to deposit.
    147117        BasicBlock * const depositBits = b->CreateBasicBlock();
    148         b->CreateCondBr(b->CreateICmpULT(bufferSize, required), enqueueBits, depositBits);
     118        b->CreateUnlikelyCondBr(b->CreateICmpULT(bufferSize, required), enqueueBits, depositBits);
    149119
    150120        b->SetInsertPoint(depositBits);
     121
    151122        // Apply PDEP to each element of the combined swizzle using the current PDEP mask
    152123        Value * result = UndefValue::get(buffer->getType());
     
    159130
    160131        // Store the result
    161         Value * const outputStreamPtr = b->getOutputStreamBlockPtr("output", b->getInt32(i), strideIndex);
     132        Value * const outputStreamPtr = b->getOutputStreamBlockPtr("output", b->getSize(i), strideIndex);
    162133        b->CreateBlockAlignedStore(result, outputStreamPtr);
    163134
     
    177148
    178149    b->SetInsertPoint(finishedStrides);
    179     Value * const sourceItemsProcessed = b->CreateMul(b->CreateUDiv(sourceOffset, BLOCK_WIDTH), BLOCK_WIDTH);
    180     b->setProcessedItemCount("source", b->CreateAdd(b->getProcessedItemCount("source"), sourceItemsProcessed));
    181     b->setScalarField("buffer", buffer);
    182     b->setScalarField("buffered", bufferSize);
    183     b->setScalarField("sourceOffset", b->CreateURem(sourceOffset, BLOCK_WIDTH));
    184150}
    185151
Note: See TracChangeset for help on using the changeset viewer.