source: icGREP/icgrep-devel/icgrep/pablo/carrypack_manager.cpp @ 5712

Last change on this file since 5712 was 5712, checked in by cameron, 20 months ago

Fixes for indexed advance

File size: 63.0 KB
Line 
1/*
2 *  Copyright (c) 2015 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 *  icgrep is a trademark of International Characters.
5 */
6
7#include "carrypack_manager.h"
8#include <pablo/carry_data.h>
9#include <pablo/codegenstate.h>
10#include <llvm/IR/BasicBlock.h>
11#include <llvm/IR/DerivedTypes.h>
12#include <pablo/branch.h>
13#include <pablo/pe_advance.h>
14#include <pablo/pe_scanthru.h>
15#include <pablo/pe_matchstar.h>
16#include <pablo/pe_var.h>
17#include <kernels/kernel_builder.h>
18#include <toolchain/toolchain.h>
19#include <array>
20
21#include <llvm/Support/CommandLine.h>
22
23#include <llvm/Support/raw_ostream.h>
24
25using namespace llvm;
26
27namespace pablo {
28
29inline static bool is_power_2(const unsigned n) {
30    return (n && ((n & (n - 1)) == 0));
31}
32
33inline static unsigned ceil_log2(const unsigned v) {
34    assert ("log2(0) is undefined!" && v != 0);
35    return (sizeof(unsigned) * CHAR_BIT) - __builtin_clz(v - 1U);
36}
37
38inline static unsigned floor_log2(const unsigned v) {
39    assert ("log2(0) is undefined!" && v != 0);
40    return ((sizeof(unsigned) * CHAR_BIT) - 1U) - __builtin_clz(v);
41}
42
43inline static unsigned nearest_pow2(const unsigned v) {
44    assert (v > 0 && v < (UINT32_MAX / 2));
45    return (v < 2) ? 1 : (1U << ceil_log2(v));
46}
47
48inline static unsigned nearest_multiple(const unsigned n, const unsigned m) {
49    assert (is_power_2(m));
50    return (n + m - 1U) & ~(m - 1U);
51}
52
53inline static bool is_multiple_of(const unsigned n, const unsigned m) {
54    return nearest_multiple(n, m) == n;
55}
56
57inline static unsigned udiv(const unsigned x, const unsigned y) {
58    assert (is_power_2(y));
59    const unsigned z = x >> floor_log2(y);
60    assert (z == (x / y));
61    return z;
62}
63
64inline static unsigned ceil_udiv(const unsigned x, const unsigned y) {
65    assert (is_power_2(x) && is_power_2(y));
66    return udiv(((x - 1U) | (y - 1U)) + 1U, y);
67}
68
69inline static unsigned getElementWidth(Type * ty) {
70    if (LLVM_LIKELY(isa<VectorType>(ty))) {
71        ty = cast<IntegerType>(ty->getVectorElementType());
72    }
73    return cast<IntegerType>(ty)->getBitWidth();
74}
75
76inline static unsigned gcd(unsigned a, unsigned b) {
77  while (a) {
78     assert (is_power_2(a));
79     const unsigned c = a;
80     a = b & (a - 1);
81     b = c;
82  }
83  return b;
84}
85
86inline static unsigned getPackingSize(const unsigned elementWidth, const unsigned depth) {
87    return std::max<unsigned>(elementWidth / nearest_pow2(depth + 1), 1);
88}
89
90using TypeId = PabloAST::ClassTypeId;
91
92#define ELEMENTS_IN_DYNAMICALLY_ALLOCATED_CARRY_STRUCT 3
93
94/** ------------------------------------------------------------------------------------------------------------- *
95 * @brief initializeCarryData
96 ** ------------------------------------------------------------------------------------------------------------- */
97void CarryManager::initializeCarryData(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, PabloKernel * const kernel) {
98
99    // Each scope constructs its own CarryData struct, which will be added to the final "carries" struct
100    // that is added to the Kernel. The scope index will indicate which struct to access.
101
102    // A CarryData struct either contains an array of CarryPackBlocks or an integer indicating the capacity of
103    // the variable length CarryData struct and pointer. A variable length CarryData struct is required whenever
104    // the streams accessed by a loop could vary between iterations. When resizing a CarryData struct for a
105    // particular loop, the current loop struct and all nested structs need to be resized. This accommodates
106    // the fact every pablo While loop must be executed at least once.
107
108    // A nested loop may also contain a variable length CarryData struct
109
110    // To determine whether we require a variable length CarryData struct, we test the escaped variables of
111    // each loop branch to see whether they are used as the index parameter of a nested Extract statement.
112    // Any scope that requires variable length CarryData, requires that all nested branches have a unique
113    // set of carries for that iteration.
114
115    assert (mKernel == nullptr);
116
117    mCurrentScope = kernel->getEntryBlock();
118    mKernel = kernel;
119
120    Type * const carryTy = iBuilder->getBitBlockType();
121    mVectorWidth = carryTy->getPrimitiveSizeInBits();
122    mElementWidth = getElementWidth(carryTy);
123    assert (is_power_2(mElementWidth));
124
125    mCarryScopes = 0;
126    mCarryMetadata.resize(getScopeCount(mCurrentScope));
127    mCarryGroup.resize(assignDefaultCarryGroups(kernel->getEntryBlock()));
128
129    kernel->setCarryDataTy(analyse(iBuilder, mCurrentScope));
130
131    kernel->addScalar(kernel->getCarryDataTy(), "carries");
132
133    if (mHasLoop) {
134        kernel->addScalar(iBuilder->getInt32Ty(), "selector");
135    }
136    if (mHasLongAdvance) {
137        kernel->addScalar(iBuilder->getSizeTy(), "CarryBlockIndex");
138    }
139}
140
141bool isDynamicallyAllocatedType(const Type * const ty) {
142    if (isa<StructType>(ty) && ty->getStructNumElements() == ELEMENTS_IN_DYNAMICALLY_ALLOCATED_CARRY_STRUCT) {
143        return (ty->getStructElementType(1)->isPointerTy() && ty->getStructElementType(2)->isPointerTy() && ty->getStructElementType(0)->isIntegerTy());
144    }
145    return false;
146}
147
148bool containsDynamicallyAllocatedType(const Type * const ty) {
149    if (isa<StructType>(ty)) {
150        for (unsigned i = 0; i < ty->getStructNumElements(); ++i) {
151            if (isDynamicallyAllocatedType(ty->getStructElementType(i))) {
152                return true;
153            }
154        }
155    }
156    return false;
157}
158
159void freeDynamicallyAllocatedMemory(const std::unique_ptr<kernel::KernelBuilder> & idb, Value * const frame) {
160    StructType * const ty = cast<StructType>(frame->getType()->getPointerElementType());
161    std::array<Value *, 3> indices;
162    indices[0] = idb->getInt32(0);
163    for (unsigned i = 0; i < ty->getStructNumElements(); ++i) {
164        if (isDynamicallyAllocatedType(ty->getStructElementType(i))) {
165            indices[1] = idb->getInt32(i);
166            indices[2] = idb->getInt32(1);
167            Value * const innerFrame = idb->CreateLoad(idb->CreateGEP(frame, ArrayRef<Value*>(indices.data(), 3)));
168            if (containsDynamicallyAllocatedType(innerFrame->getType())) {
169                indices[2] = indices[0];
170                Value *  const count = idb->CreateLoad(idb->CreateGEP(frame, ArrayRef<Value*>(indices.data(), 3)));
171                BasicBlock * const entry = idb->GetInsertBlock();
172                BasicBlock * const cond = idb->CreateBasicBlock("freeCarryDataCond");
173                BasicBlock * const body = idb->CreateBasicBlock("freeCarryDataLoop");
174                BasicBlock * const exit = idb->CreateBasicBlock("freeCarryDataExit");
175                idb->CreateBr(cond);
176                idb->SetInsertPoint(cond);
177                PHINode * const index = idb->CreatePHI(count->getType(), 2);
178                index->addIncoming(ConstantInt::getNullValue(count->getType()), entry);
179                Value * test = idb->CreateICmpNE(index, count);
180                idb->CreateCondBr(test, body, exit);
181                idb->SetInsertPoint(body);
182                freeDynamicallyAllocatedMemory(idb, idb->CreateGEP(innerFrame, index));
183                index->addIncoming(idb->CreateAdd(index, ConstantInt::get(count->getType(), 1)), body);
184                idb->CreateBr(cond);
185                idb->SetInsertPoint(exit);
186            }
187            idb->CreateFree(innerFrame);
188            indices[2] = idb->getInt32(2);
189            Value *  const summary = idb->CreateLoad(idb->CreateGEP(frame, ArrayRef<Value*>(indices.data(), 3)));
190            idb->CreateFree(summary);
191        }
192    }
193}
194
195/** ------------------------------------------------------------------------------------------------------------- *
196 * @brief releaseCarryData
197 ** ------------------------------------------------------------------------------------------------------------- */
198void CarryManager::releaseCarryData(const std::unique_ptr<kernel::KernelBuilder> & idb) {
199    if (mHasNonCarryCollapsingLoops) {
200        freeDynamicallyAllocatedMemory(idb, idb->getScalarFieldPtr("carries"));
201    }
202}
203
204/** ------------------------------------------------------------------------------------------------------------- *
205 * @brief initializeCodeGen
206 ** ------------------------------------------------------------------------------------------------------------- */
207void CarryManager::initializeCodeGen(const std::unique_ptr<kernel::KernelBuilder> & iBuilder) {
208
209    assert(!mCarryMetadata.empty());
210    mCarryInfo = &mCarryMetadata[0];
211    assert (!mCarryInfo->hasSummary());
212
213    mCurrentFrame = iBuilder->getScalarFieldPtr("carries");
214    mCurrentFrameIndex = 0;
215    mCarryScopes = 0;
216    mCarryScopeIndex.push_back(0);
217
218    assert (mCurrentFrameOffset.empty());
219
220    mCurrentFrameOffset.push_back(iBuilder->getInt32(0));
221
222    assert (mNonCarryCollapsingLoopCarryFrameStack.empty());
223
224    assert (mCarrySummaryStack.empty());
225
226    Type * const carryTy = iBuilder->getBitBlockType();
227
228    mCarrySummaryStack.push_back(Constant::getNullValue(carryTy));
229
230    if (mHasLoop) {       
231        mLoopSelector[0] = iBuilder->getScalarField("selector");
232        mLoopSelector[1] = iBuilder->CreateXor(mLoopSelector[0], ConstantInt::get(mLoopSelector[0]->getType(), 1));
233    }
234
235}
236
237/** ------------------------------------------------------------------------------------------------------------- *
238 * @brief finalizeCodeGen
239 ** ------------------------------------------------------------------------------------------------------------- */
240void CarryManager::finalizeCodeGen(const std::unique_ptr<kernel::KernelBuilder> & iBuilder) {
241    if (mHasLoop) {
242        iBuilder->setScalarField("selector", mLoopSelector[1]);
243    }
244    if (mHasLongAdvance) {
245        Value * idx = iBuilder->getScalarField("CarryBlockIndex");
246        idx = iBuilder->CreateAdd(idx, iBuilder->getSize(1));
247        iBuilder->setScalarField("CarryBlockIndex", idx);
248    }
249    assert (mNonCarryCollapsingLoopCarryFrameStack.empty());
250    assert ("base summary value was deleted!" && mCarrySummaryStack.size() == 1);
251    assert ("base summary value was overwritten with non-zero value!" && isa<Constant>(mCarrySummaryStack[0]) && cast<Constant>(mCarrySummaryStack[0])->isNullValue());
252    mCarrySummaryStack.clear();
253    assert (mCarryScopeIndex.size() == 1);
254    mCarryScopeIndex.clear();
255}
256
257/** ------------------------------------------------------------------------------------------------------------- *
258 * @brief enterLoopScope
259 ** ------------------------------------------------------------------------------------------------------------- */
260void CarryManager::enterLoopScope(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, const PabloBlock * const scope) {
261    assert (scope);
262    assert (mHasLoop);
263    ++mLoopDepth;
264    enterScope(iBuilder, scope);
265}
266
267/** ------------------------------------------------------------------------------------------------------------- *
268 * @brief enterLoopBody
269 ** ------------------------------------------------------------------------------------------------------------- */
270void CarryManager::enterLoopBody(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, BasicBlock * const entryBlock) {
271    if (mCarryInfo->hasSummary()) {
272        Type * const carryTy = iBuilder->getBitBlockType();
273        PHINode * phiCarryOutSummary = iBuilder->CreatePHI(carryTy, 2, "summary");
274        assert (!mCarrySummaryStack.empty());
275        phiCarryOutSummary->addIncoming(mCarrySummaryStack.back(), entryBlock);
276        // Replace the incoming carry summary with the phi node and add the phi node to the stack  so that we can
277        // properly OR it into the outgoing summary value.
278        // NOTE: this may change the base summary value; when exiting to the base scope, replace this summary with
279        // a null value to prevent subsequent nested scopes from inheriting the summary of this scope.
280        mCarrySummaryStack.back() = phiCarryOutSummary;
281        mCarrySummaryStack.push_back(phiCarryOutSummary);
282    }
283    if (LLVM_UNLIKELY(mCarryInfo->nonCarryCollapsingMode())) {
284
285        assert (mCarryInfo->hasSummary());
286
287        Type * const int8PtrTy = iBuilder->getInt8PtrTy();
288        Type * const carryTy = iBuilder->getBitBlockType();
289        PointerType * const carryPtrTy = carryTy->getPointerTo();
290
291        // Check whether we need to resize the carry state
292        PHINode * index = iBuilder->CreatePHI(iBuilder->getSizeTy(), 2);
293        mLoopIndicies.push_back(index);
294        index->addIncoming(iBuilder->getSize(0), entryBlock);
295        Value * capacityPtr = iBuilder->CreateGEP(mCurrentFrame, {iBuilder->getInt32(0), iBuilder->getInt32(0)});
296        Value * capacity = iBuilder->CreateLoad(capacityPtr, "capacity");
297        Constant * const ONE = ConstantInt::get(capacity->getType(), 1);
298        Value * arrayPtr = iBuilder->CreateGEP(mCurrentFrame, {iBuilder->getInt32(0), iBuilder->getInt32(1)});
299        Value * array = iBuilder->CreateLoad(arrayPtr, "array");
300        BasicBlock * const entry = iBuilder->GetInsertBlock();
301        BasicBlock * const resizeCarryState = iBuilder->CreateBasicBlock("ResizeCarryState");
302        BasicBlock * const reallocExisting = iBuilder->CreateBasicBlock("ReallocExisting");
303        BasicBlock * const createNew = iBuilder->CreateBasicBlock("CreateNew");
304        BasicBlock * const resumeKernel = iBuilder->CreateBasicBlock("ResumeKernel");
305        iBuilder->CreateLikelyCondBr(iBuilder->CreateICmpNE(index, capacity), resumeKernel, resizeCarryState);
306
307        // RESIZE CARRY BLOCK
308        iBuilder->SetInsertPoint(resizeCarryState);
309        const auto BlockWidth = iBuilder->getBitBlockWidth() / 8;
310        const auto Log2BlockWidth = floor_log2(BlockWidth);
311        Constant * const carryStateWidth = ConstantExpr::getIntegerCast(ConstantExpr::getSizeOf(array->getType()->getPointerElementType()), iBuilder->getSizeTy(), false);
312        Value * const summaryPtr = iBuilder->CreateGEP(mCurrentFrame, {iBuilder->getInt32(0), iBuilder->getInt32(2)});
313        Value * const hasCarryState = iBuilder->CreateICmpNE(array, ConstantPointerNull::get(cast<PointerType>(array->getType())));
314        iBuilder->CreateLikelyCondBr(hasCarryState, reallocExisting, createNew);
315
316        // REALLOCATE EXISTING
317        iBuilder->SetInsertPoint(reallocExisting);
318        Value * const capacitySize = iBuilder->CreateMul(capacity, carryStateWidth);
319        Value * const newCapacitySize = iBuilder->CreateShl(capacitySize, 1); // x 2
320        Value * const newArray = iBuilder->CreateCacheAlignedMalloc(newCapacitySize);
321        iBuilder->CreateMemCpy(newArray, array, capacitySize, iBuilder->getCacheAlignment());
322        iBuilder->CreateFree(array);
323        iBuilder->CreateStore(newArray, arrayPtr);
324        Value * const startNewArrayPtr = iBuilder->CreateGEP(iBuilder->CreatePointerCast(newArray, int8PtrTy), capacitySize);
325        iBuilder->CreateMemZero(startNewArrayPtr, capacitySize, BlockWidth);
326        Value * const newCapacity = iBuilder->CreateShl(capacity, 1);
327        iBuilder->CreateStore(newCapacity, capacityPtr);
328        Value * const summary = iBuilder->CreateLoad(summaryPtr, false);
329        Value * const summarySize = iBuilder->CreateShl(iBuilder->CreateAdd(iBuilder->CreateCeilLog2(capacity), ONE), Log2BlockWidth + 1);
330        Constant * const additionalSpace = iBuilder->getSize(2 * BlockWidth);
331        Value * const newSummarySize = iBuilder->CreateAdd(summarySize, additionalSpace);
332        Value * const newSummary = iBuilder->CreateBlockAlignedMalloc(newSummarySize);
333        iBuilder->CreateMemCpy(newSummary, summary, summarySize, BlockWidth);
334        iBuilder->CreateFree(summary);
335        iBuilder->CreateStore(iBuilder->CreatePointerCast(newSummary, carryPtrTy), summaryPtr);
336        Value * const startNewSummaryPtr = iBuilder->CreateGEP(iBuilder->CreatePointerCast(newSummary, int8PtrTy), summarySize);
337        iBuilder->CreateMemZero(startNewSummaryPtr, additionalSpace, BlockWidth);
338        iBuilder->CreateBr(resumeKernel);
339
340        // CREATE NEW
341        iBuilder->SetInsertPoint(createNew);
342        Constant * const initialLog2Capacity = iBuilder->getInt64(4);
343        Constant * const initialCapacity = ConstantExpr::getShl(ONE, initialLog2Capacity);
344        iBuilder->CreateStore(initialCapacity, capacityPtr);
345        Constant * const initialCapacitySize = ConstantExpr::getMul(initialCapacity, carryStateWidth);
346        Value * initialArray = iBuilder->CreateCacheAlignedMalloc(initialCapacitySize);
347        iBuilder->CreateMemZero(initialArray, initialCapacitySize, BlockWidth);
348        initialArray = iBuilder->CreatePointerCast(initialArray, array->getType());
349        iBuilder->CreateStore(initialArray, arrayPtr);
350        Constant * initialSummarySize = ConstantExpr::getShl(ConstantExpr::getAdd(initialLog2Capacity, iBuilder->getInt64(1)), iBuilder->getInt64(Log2BlockWidth + 1));
351        Value * initialSummary = iBuilder->CreateBlockAlignedMalloc(initialSummarySize);
352        iBuilder->CreateMemZero(initialSummary, initialSummarySize, BlockWidth);
353        initialSummary = iBuilder->CreatePointerCast(initialSummary, carryPtrTy);
354        iBuilder->CreateStore(initialSummary, summaryPtr);
355        iBuilder->CreateBr(resumeKernel);
356
357        // RESUME KERNEL
358        iBuilder->SetInsertPoint(resumeKernel);
359        PHINode * phiArrayPtr = iBuilder->CreatePHI(array->getType(), 3);
360        phiArrayPtr->addIncoming(array, entry);
361        phiArrayPtr->addIncoming(initialArray, createNew);
362        phiArrayPtr->addIncoming(newArray, reallocExisting);
363        mNonCarryCollapsingLoopCarryFrameStack.emplace_back(mCurrentFrame, std::vector<Value *>{mCurrentFrameOffset.begin(), mCurrentFrameOffset.end()});
364        mCurrentFrame = iBuilder->CreateGEP(phiArrayPtr, index);
365        assert (&std::get<1>(mNonCarryCollapsingLoopCarryFrameStack.back()) != &mCurrentFrameOffset);
366        mCurrentFrameOffset.resize(1);
367    }
368}
369
370/** ------------------------------------------------------------------------------------------------------------- *
371 * @brief leaveLoopBody
372 ** ------------------------------------------------------------------------------------------------------------- */
373void CarryManager::leaveLoopBody(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, BasicBlock * /* exitBlock */) {
374
375    Type * const carryTy = iBuilder->getBitBlockType();
376
377    if (LLVM_UNLIKELY(mCarryInfo->nonCarryCollapsingMode())) {
378
379        assert (mCarryInfo->hasSummary());
380
381        ConstantInt * const summaryIndex = iBuilder->getInt32(mCarryInfo->hasExplicitSummary() ? mCurrentFrameIndex : (mCurrentFrameIndex - 1));
382
383        Value * const carryInAccumulator = readCarryInSummary(iBuilder, summaryIndex);
384        Value * const carryOutAccumulator = mCarrySummaryStack.back();
385
386        if (mCarryInfo->hasExplicitSummary()) {
387            writeCarryOutSummary(iBuilder, carryOutAccumulator, summaryIndex);
388        }
389        std::tie(mCurrentFrame, mCurrentFrameOffset) = mNonCarryCollapsingLoopCarryFrameStack.back();
390        mNonCarryCollapsingLoopCarryFrameStack.pop_back();
391
392        // In non-carry-collapsing mode, we cannot rely on the fact that performing a single iteration of this
393        // loop will consume all of the incoming carries from the prior block. We need to subtract the carries
394        // consumed by this iteration from our carry summary state. To do so in parallel, we use the the half-
395        // subtractor circuit to do it in ceil log2 steps. Similarly, we compute our carry out summary state
396        // (for the subsequent block to subtract) using a half-adder circuit.
397
398        // NOTE: this requires that, for all loop iterations, i, and all block iterations, j, the carry in
399        // summary, CI_i,j, matches the carry out summary of the prior block iteration, CO_i,j - 1.
400        // Otherwise we will end up with an incorrect result or being trapped in an infinite loop.
401
402        Value * capacityPtr = iBuilder->CreateGEP(mCurrentFrame, {iBuilder->getInt32(0), iBuilder->getInt32(0)});
403        Value * capacity = iBuilder->CreateLoad(capacityPtr, false);
404        Value * summaryPtr = iBuilder->CreateGEP(mCurrentFrame, {iBuilder->getInt32(0), iBuilder->getInt32(2)});
405        Value * summary = iBuilder->CreateLoad(summaryPtr, false);
406
407        Constant * const ONE = ConstantInt::get(capacity->getType(), 1);
408
409        Value * loopSelector = iBuilder->CreateZExt(mLoopSelector[0], capacity->getType());
410
411        BasicBlock * entry = iBuilder->GetInsertBlock();
412        BasicBlock * update = iBuilder->CreateBasicBlock("UpdateNonCarryCollapsingSummary");
413        BasicBlock * resume = iBuilder->CreateBasicBlock("ResumeAfterUpdatingNonCarryCollapsingSummary");
414
415        iBuilder->CreateBr(update);
416
417        iBuilder->SetInsertPoint(update);
418        PHINode * i = iBuilder->CreatePHI(capacity->getType(), 2);
419        i->addIncoming(ConstantInt::getNullValue(capacity->getType()), entry);
420        PHINode * const borrow = iBuilder->CreatePHI(carryInAccumulator->getType(), 2);
421        borrow->addIncoming(carryInAccumulator, entry);
422        PHINode * const carry = iBuilder->CreatePHI(carryOutAccumulator->getType(), 2);
423        carry->addIncoming(carryOutAccumulator, entry);
424        // OR the updated carry in summary later for the summaryTest
425        PHINode * const carryInSummary = iBuilder->CreatePHI(carryTy, 2);
426        carryInSummary->addIncoming(Constant::getNullValue(carryTy), entry);
427
428        // half subtractor
429        Value * const carryInOffset = iBuilder->CreateOr(iBuilder->CreateShl(i, 1), loopSelector);
430        Value * const carryInPtr = iBuilder->CreateGEP(summary, carryInOffset);
431        Value * const carryIn = iBuilder->CreateBlockAlignedLoad(carryInPtr);
432        Value * const nextCarryIn = iBuilder->CreateXor(carryIn, borrow);
433        Value * const nextSummary = iBuilder->CreateOr(carryInSummary, nextCarryIn);
434        iBuilder->CreateBlockAlignedStore(nextCarryIn, carryInPtr);
435        carryInSummary->addIncoming(nextSummary, update);
436        Value * finalBorrow = iBuilder->CreateAnd(iBuilder->CreateNot(carryIn), borrow);
437        borrow->addIncoming(finalBorrow, update);
438
439        // half adder
440        Value * const carryOutOffset = iBuilder->CreateXor(carryInOffset, ConstantInt::get(carryInOffset->getType(), 1));
441        Value * const carryOutPtr = iBuilder->CreateGEP(summary, carryOutOffset);
442        Value * const carryOut = iBuilder->CreateBlockAlignedLoad(carryOutPtr);
443        Value * const nextCarryOut = iBuilder->CreateXor(carryOut, carry);
444        iBuilder->CreateBlockAlignedStore(nextCarryOut, carryOutPtr);
445        Value * finalCarry = iBuilder->CreateAnd(carryOut, carry);
446        carry->addIncoming(finalCarry, update);
447
448        // loop condition
449        i->addIncoming(iBuilder->CreateAdd(i, ONE), update);
450        iBuilder->CreateCondBr(iBuilder->CreateICmpNE(iBuilder->CreateShl(ONE, i), capacity), update, resume);
451
452        iBuilder->SetInsertPoint(resume);
453
454        if (codegen::EnableAsserts) {
455            iBuilder->CreateAssertZero(iBuilder->CreateOr(finalBorrow, finalCarry),
456                                       "CarryPackManager: loop post-condition violated: final borrow and carry must be zero!");
457        }
458
459        assert (!mLoopIndicies.empty());
460        PHINode * index = mLoopIndicies.back();
461        index->addIncoming(iBuilder->CreateAdd(index, iBuilder->getSize(1)), resume);
462        mLoopIndicies.pop_back();
463
464        mNextSummaryTest = nextSummary;
465    }
466    if (mCarryInfo->hasSummary()) {
467        const auto n = mCarrySummaryStack.size(); assert (n > 1);
468        Value * carryOut = mCarrySummaryStack.back();
469        mCarrySummaryStack.pop_back();
470        PHINode * phiCarryOut = cast<PHINode>(mCarrySummaryStack.back());
471        phiCarryOut->addIncoming(carryOut, iBuilder->GetInsertBlock());
472        // If we're returning to the base scope, reset our accumulated summary value.
473        if (n == 2) {
474            carryOut = Constant::getNullValue(carryTy);
475        }
476        mCarrySummaryStack.back() = carryOut;
477    }
478}
479
480/** ------------------------------------------------------------------------------------------------------------- *
481 * @brief leaveLoopScope
482 ** ------------------------------------------------------------------------------------------------------------- */
483void CarryManager::leaveLoopScope(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, BasicBlock * const /* entryBlock */, BasicBlock * const /* exitBlock */) {
484    assert (mLoopDepth > 0);
485    --mLoopDepth;
486    leaveScope(iBuilder);
487}
488
489/** ------------------------------------------------------------------------------------------------------------- *
490 * @brief enterIfScope
491 ** ------------------------------------------------------------------------------------------------------------- */
492void CarryManager::enterIfScope(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, const PabloBlock * const scope) {
493    ++mIfDepth;
494    enterScope(iBuilder, scope);
495    // We zero-initialized the nested summary value and later OR in the current summary into the escaping summary
496    // so that upon processing the subsequent block iteration, we branch into this If scope iff a carry out was
497    // generated by a statement within this If scope and not by a dominating statement in the outer scope.
498    if (LLVM_LIKELY(mCarryInfo->hasSummary())) {
499        assert (mCurrentFrameIndex == 0);
500        mNextSummaryTest = readCarryInSummary(iBuilder, iBuilder->getInt32(0));
501        if (mCarryInfo->hasExplicitSummary()) {
502            mCurrentFrameIndex = 1;
503        }
504    }
505    Type * const carryTy = iBuilder->getBitBlockType();
506    mCarrySummaryStack.push_back(Constant::getNullValue(carryTy));
507}
508
509/** ------------------------------------------------------------------------------------------------------------- *
510 * @brief generateSummaryTest
511 ** ------------------------------------------------------------------------------------------------------------- */
512Value * CarryManager::generateSummaryTest(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, Value * condition) {
513    if (LLVM_LIKELY(mCarryInfo->hasSummary())) {
514        assert ("summary test was not generated" && mNextSummaryTest);
515        condition = iBuilder->simd_or(condition, mNextSummaryTest);
516        mNextSummaryTest = nullptr;
517    }
518    assert ("summary test was not consumed" && (mNextSummaryTest == nullptr));
519    return condition;
520}
521
522/** ------------------------------------------------------------------------------------------------------------- *
523 * @brief enterIfBody
524 ** ------------------------------------------------------------------------------------------------------------- */
525void CarryManager::enterIfBody(const std::unique_ptr<kernel::KernelBuilder> & /* iBuilder */, BasicBlock * const entryBlock) {
526    assert (entryBlock);
527}
528
529/** ------------------------------------------------------------------------------------------------------------- *
530 * @brief leaveIfBody
531 ** ------------------------------------------------------------------------------------------------------------- */
532void CarryManager::leaveIfBody(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, BasicBlock * const exitBlock) {
533    assert (exitBlock);
534    const auto n = mCarrySummaryStack.size();
535    if (LLVM_LIKELY(mCarryInfo->hasExplicitSummary())) {
536        writeCarryOutSummary(iBuilder, mCarrySummaryStack[n - 1], iBuilder->getInt32(0));
537    }
538    if (n > 2) {
539        mCarrySummaryStack[n - 1] = iBuilder->CreateOr(mCarrySummaryStack[n - 1], mCarrySummaryStack[n - 2], "summary");
540    }
541}
542
543/** ------------------------------------------------------------------------------------------------------------- *
544 * @brief leaveIfScope
545 ** ------------------------------------------------------------------------------------------------------------- */
546void CarryManager::leaveIfScope(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, BasicBlock * const entryBlock, BasicBlock * const exitBlock) {
547    assert (mIfDepth > 0);
548    if (LLVM_LIKELY(mCarryInfo->hasSummary())) {
549        const auto n = mCarrySummaryStack.size(); assert (n > 0);
550        if (n > 2) {
551            // When leaving a nested If scope with a summary value, phi out the summary to ensure the
552            // appropriate summary is stored in the outer scope.
553            Value * nested = mCarrySummaryStack[n - 1];
554            Value * outer = mCarrySummaryStack[n - 2];
555            assert (nested->getType() == outer->getType());
556            PHINode * const phi = iBuilder->CreatePHI(nested->getType(), 2, "summary");
557            phi->addIncoming(outer, entryBlock);
558            phi->addIncoming(nested, exitBlock);
559            mCarrySummaryStack[n - 2] = phi;
560        }
561    }
562    --mIfDepth;
563    leaveScope(iBuilder);
564    mCarrySummaryStack.pop_back();
565}
566
567/** ------------------------------------------------------------------------------------------------------------ *
568 * @brief enterScope
569 ** ------------------------------------------------------------------------------------------------------------- */
570void CarryManager::enterScope(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, const PabloBlock * const scope) {
571    assert (scope);
572    // Store the state of the current frame and update the scope state
573    mCurrentFrameOffset.push_back(iBuilder->getInt32(mCurrentFrameIndex));
574    mCurrentScope = scope;
575    mCarryScopeIndex.push_back(++mCarryScopes);
576    mCarryInfo = &mCarryMetadata[mCarryScopes];
577    mCurrentFrameIndex = 0;
578}
579
580/** ------------------------------------------------------------------------------------------------------------- *
581 * @brief leaveScope
582 ** ------------------------------------------------------------------------------------------------------------- */
583void CarryManager::leaveScope(const std::unique_ptr<kernel::KernelBuilder> & /* iBuilder */) {
584
585    assert (mCurrentFrameOffset.size() > 1);
586    ConstantInt * const carryIndex = cast<ConstantInt>(mCurrentFrameOffset.back());
587    mCurrentFrameIndex = carryIndex->getLimitedValue() + 1;
588    mCurrentFrameOffset.pop_back();
589
590    mCarryScopeIndex.pop_back();
591    assert (!mCarryScopeIndex.empty());
592    mCurrentScope = mCurrentScope->getPredecessor();
593    assert (mCurrentScope);
594    mCarryInfo = &mCarryMetadata[mCarryScopeIndex.back()];
595}
596
597/** ------------------------------------------------------------------------------------------------------------- *
598 * @brief addCarryInCarryOut
599 ** ------------------------------------------------------------------------------------------------------------- */
600Value * CarryManager::addCarryInCarryOut(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, const CarryProducingStatement * const op, Value * const e1, Value * const e2) {
601    assert (op && !isa<Advance>(op));
602    Value * const carryIn = getCarryIn(iBuilder, op);
603    Value * carryOut, * result;
604    std::tie(carryOut, result) = iBuilder->bitblock_add_with_carry(e1, e2, carryIn);
605    setCarryOut(iBuilder, op, carryOut);
606    assert (result->getType() == iBuilder->getBitBlockType());
607    return result;
608}
609
610/** ------------------------------------------------------------------------------------------------------------- *
611 * @brief advanceCarryInCarryOut
612 ** ------------------------------------------------------------------------------------------------------------- */
613Value * CarryManager::advanceCarryInCarryOut(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, const Advance * const advance, Value * const value) {
614    const auto shiftAmount = advance->getAmount();
615    if (LLVM_LIKELY(shiftAmount < mElementWidth)) {
616        Value * const carryIn = getCarryIn(iBuilder, advance);
617        Value * carryOut, * result;
618        std::tie(carryOut, result) = iBuilder->bitblock_advance(value, carryIn, shiftAmount);
619        setCarryOut(iBuilder, advance, carryOut);
620        assert (result->getType() == iBuilder->getBitBlockType());
621        return result;
622    } else {
623        return longAdvanceCarryInCarryOut(iBuilder, value, shiftAmount);
624    }
625}
626
627
628/** ------------------------------------------------------------------------------------------------------------- *
629 * @brief indexedAdvanceCarryInCarryOut
630 ** ------------------------------------------------------------------------------------------------------------- */
631Value * CarryManager::indexedAdvanceCarryInCarryOut(const std::unique_ptr<kernel::KernelBuilder> & b, const IndexedAdvance * const advance, Value * const strm, Value * const index_strm) {
632    const auto shiftAmount = advance->getAmount();
633    Value * popcount_f = Intrinsic::getDeclaration(b->getModule(), Intrinsic::ctpop, b->getSizeTy());
634    Value * PEXT_f = nullptr;
635    Value * PDEP_f = nullptr;
636    unsigned bitWidth = sizeof(size_t) * 8;
637    if (bitWidth == 64) {
638        PEXT_f = Intrinsic::getDeclaration(b->getModule(), Intrinsic::x86_bmi_pext_64);
639        PDEP_f = Intrinsic::getDeclaration(b->getModule(), Intrinsic::x86_bmi_pdep_64);
640    }
641    else if ((bitWidth == 32)  && (shiftAmount < 32)) {
642        PEXT_f = Intrinsic::getDeclaration(b->getModule(), Intrinsic::x86_bmi_pext_32);
643        PDEP_f = Intrinsic::getDeclaration(b->getModule(), Intrinsic::x86_bmi_pdep_32);
644    }
645    else {
646        llvm::report_fatal_error("indexed_advance unsupported bit width");
647    }
648    if (LLVM_LIKELY(shiftAmount < bitWidth)) {
649        Value * const carryIn = getNextCarryIn(b);
650        Value * shiftVal = b->getSize(shiftAmount);
651        Value * carry = b->mvmd_extract(bitWidth, carryIn, 0);
652        Value * result = b->allZeroes();
653        for (unsigned i = 0; i < b->getBitBlockWidth()/bitWidth; i++) {
654            Value * s = b->mvmd_extract(bitWidth, strm, i);
655            Value * ix = b->mvmd_extract(bitWidth, index_strm, i);
656            Value * ix_popcnt = b->CreateCall(popcount_f, {ix});
657            Value * bits = b->CreateCall(PEXT_f, {s, ix});
658            Value * adv = b->CreateOr(b->CreateShl(bits, shiftAmount), carry);
659            Value * popcount_small = b->CreateICmpULT(ix_popcnt, shiftVal);
660            Value * carry_if_popcount_small = 
661            b->CreateOr(b->CreateShl(bits, b->CreateSub(shiftVal, ix_popcnt)),
662                        b->CreateLShr(carry, ix_popcnt));
663            Value * carry_if_popcount_large = b->CreateLShr(bits, b->CreateSub(ix_popcnt, shiftVal));
664            carry = b->CreateSelect(popcount_small, carry_if_popcount_small, carry_if_popcount_large);
665            result = b->mvmd_insert(bitWidth, result, b->CreateCall(PDEP_f, {adv, ix}), i);
666        }
667        Value * carryOut = b->mvmd_insert(bitWidth, b->allZeroes(), carry, 0);
668        setNextCarryOut(b, carryOut);
669        return result;
670    } else if (shiftAmount <= b->getBitBlockWidth()) {
671        // A single bitblock still holds all the shifted bits.   In this case, we know
672        // that the shift amount is always greater than the popcount of the individual
673        // elements that we deal with.   This simplifies some of the logic.
674        Type * iBitBlock = b->getIntNTy(b->getBitBlockWidth());
675        Value * carryInPtr = b->CreateGEP(mCurrentFrame, {b->getInt32(0), b->getInt32(mCurrentFrameIndex++), b->getInt32(0)});
676        Value * carryIn = b->CreateBlockAlignedLoad(carryInPtr);
677        Value * shiftVal = b->getSize(shiftAmount);
678        Value * carry = b->CreateBitCast(carryIn, iBitBlock);
679        Value * result = b->allZeroes();
680        for (unsigned i = 0; i < b->getBitBlockWidth()/bitWidth; i++) {
681            Value * s = b->mvmd_extract(bitWidth, strm, i);
682            Value * ix = b->mvmd_extract(bitWidth, index_strm, i);
683            Value * ix_popcnt = b->CreateCall(popcount_f, {ix});
684            Value * bits = b->CreateCall(PEXT_f, {s, ix});  // All these bits are shifted out (appended to carry).
685            result = b->mvmd_insert(bitWidth, result, b->CreateCall(PDEP_f, {b->mvmd_extract(bitWidth, carry, 0), ix}), i);
686            carry = b->CreateLShr(carry, b->CreateZExt(ix_popcnt, iBitBlock)); // Remove the carry bits consumed, make room for new bits.
687            carry = b->CreateOr(carry, b->CreateShl(b->CreateZExt(bits, iBitBlock), b->CreateZExt(b->CreateSub(shiftVal, ix_popcnt), iBitBlock)));
688        }
689        b->CreateBlockAlignedStore(b->CreateBitCast(carry, b->getBitBlockType()), carryInPtr);
690        if ((mIfDepth > 0) && mCarryInfo->hasExplicitSummary()) {
691            addToCarryOutSummary(b, strm);
692        }
693        return result;
694    } else {
695        mIndexedLongAdvanceIndex++;
696        llvm::report_fatal_error("IndexedAdvance > BlockSize not yet supported.");
697    }
698}
699
700
701
702/** ------------------------------------------------------------------------------------------------------------- *
703 * @brief longAdvanceCarryInCarryOut
704 ** ------------------------------------------------------------------------------------------------------------- */
705inline Value * CarryManager::longAdvanceCarryInCarryOut(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, Value * const value, const unsigned shiftAmount) {
706
707    assert (mHasLongAdvance);
708    assert (shiftAmount >= mElementWidth);
709
710    Type * const streamTy = iBuilder->getIntNTy(iBuilder->getBitBlockWidth());
711
712    if (mIfDepth > 0) {
713        if (shiftAmount > iBuilder->getBitBlockWidth()) {
714
715            mCurrentFrameOffset.push_back(iBuilder->getInt32(mCurrentFrameIndex++));
716            mCurrentFrameOffset.push_back(nullptr);
717
718            Value * carry = iBuilder->CreateZExt(iBuilder->bitblock_any(value), streamTy);
719            const unsigned summarySize = ceil_udiv(shiftAmount, iBuilder->getBitBlockWidth() * iBuilder->getBitBlockWidth());
720            for (unsigned i = 0;;++i) {
721
722                mCurrentFrameOffset.back() = iBuilder->getInt32(i);
723
724                Value * const ptr = iBuilder->CreateGEP(mCurrentFrame, mCurrentFrameOffset);
725                Value * const prior = iBuilder->CreateBitCast(iBuilder->CreateBlockAlignedLoad(ptr), streamTy);
726                Value * const stream = iBuilder->CreateBitCast(iBuilder->CreateOr(iBuilder->CreateShl(prior, 1), carry), iBuilder->getBitBlockType());
727                if (LLVM_LIKELY(i == summarySize)) {
728                    Value * const maskedStream = iBuilder->CreateAnd(stream, iBuilder->bitblock_mask_from(iBuilder->getInt32(summarySize % iBuilder->getBitBlockWidth())));
729                    addToCarryOutSummary(iBuilder, maskedStream);
730                    iBuilder->CreateBlockAlignedStore(maskedStream, ptr);
731                    break;
732                }
733                addToCarryOutSummary(iBuilder, stream);
734                iBuilder->CreateBlockAlignedStore(stream, ptr);
735                carry = iBuilder->CreateLShr(prior, iBuilder->getBitBlockWidth() - 1);
736            }
737
738            mCurrentFrameOffset.pop_back();
739            mCurrentFrameOffset.pop_back();
740
741        } else if (LLVM_LIKELY(mCarryInfo->hasExplicitSummary())) {
742            addToCarryOutSummary(iBuilder, value);
743        }
744    }
745    mCurrentFrameOffset.push_back(iBuilder->getInt32(mCurrentFrameIndex++));
746    Value * result = nullptr;
747    // special case using a single buffer entry and the carry_out value.
748    if (LLVM_LIKELY((shiftAmount < iBuilder->getBitBlockWidth()) && (mLoopDepth == 0))) {
749        mCurrentFrameOffset.push_back(iBuilder->getInt32(0));
750        Value * const buffer = iBuilder->CreateGEP(mCurrentFrame, mCurrentFrameOffset);
751        assert (buffer->getType()->getPointerElementType() == iBuilder->getBitBlockType());
752        Value * carryIn = iBuilder->CreateBlockAlignedLoad(buffer);
753        iBuilder->CreateBlockAlignedStore(value, buffer);
754        /* Very special case - no combine */
755        if (LLVM_UNLIKELY(shiftAmount == iBuilder->getBitBlockWidth())) {
756            return iBuilder->CreateBitCast(carryIn, iBuilder->getBitBlockType());
757        }
758        Value * block0_shr = iBuilder->CreateLShr(iBuilder->CreateBitCast(carryIn, streamTy), iBuilder->getBitBlockWidth() - shiftAmount);
759        Value * block1_shl = iBuilder->CreateShl(iBuilder->CreateBitCast(value, streamTy), shiftAmount);
760        result = iBuilder->CreateBitCast(iBuilder->CreateOr(block1_shl, block0_shr), iBuilder->getBitBlockType());
761    } else { //
762        const unsigned blockShift = shiftAmount % iBuilder->getBitBlockWidth();
763        const unsigned blocks = ceil_udiv(shiftAmount, iBuilder->getBitBlockWidth());
764        // Create a mask to implement circular buffer indexing
765        Value * const indexMask = iBuilder->getSize(nearest_pow2(blocks + ((mLoopDepth != 0) ? 1 : 0)) - 1);
766        Value * const blockIndex = iBuilder->getScalarField("CarryBlockIndex");
767        Value * const carryIndex0 = iBuilder->CreateSub(blockIndex, iBuilder->getSize(blocks));
768        Value * const loadIndex0 = iBuilder->CreateAnd(carryIndex0, indexMask);
769        mCurrentFrameOffset.push_back(loadIndex0);
770
771        Value * const carryInPtr = iBuilder->CreateGEP(mCurrentFrame, mCurrentFrameOffset);
772        Value * const carryIn = iBuilder->CreateBlockAlignedLoad(carryInPtr);
773        Value * const storeIndex = iBuilder->CreateAnd(blockIndex, indexMask);
774        mCurrentFrameOffset.back() = storeIndex;
775
776        Value * const carryOutPtr = iBuilder->CreateGEP(mCurrentFrame, mCurrentFrameOffset);
777        assert (carryIn->getType() == iBuilder->getBitBlockType());
778        // If the long advance is an exact multiple of BitBlockWidth, we simply return the oldest
779        // block in the long advance carry data area.
780        if (LLVM_UNLIKELY(blockShift == 0)) {
781            iBuilder->CreateBlockAlignedStore(value, carryOutPtr);
782            result = carryIn;
783        } else { // Otherwise we need to combine data from the two oldest blocks.
784            Value * carryIndex1 = iBuilder->CreateSub(blockIndex, iBuilder->getSize(blocks - 1));
785            Value * loadIndex1 = iBuilder->CreateAnd(carryIndex1, indexMask);
786            mCurrentFrameOffset.back() = loadIndex1;
787            Value * const carryInPtr2 = iBuilder->CreateGEP(mCurrentFrame, mCurrentFrameOffset);
788            Value * carry_block1 = iBuilder->CreateBlockAlignedLoad(carryInPtr2);
789            Value * block0_shr = iBuilder->CreateLShr(iBuilder->CreateBitCast(carryIn, streamTy), iBuilder->getBitBlockWidth() - blockShift);
790            Value * block1_shl = iBuilder->CreateShl(iBuilder->CreateBitCast(carry_block1, streamTy), blockShift);
791            iBuilder->CreateBlockAlignedStore(value, carryOutPtr);
792            result = iBuilder->CreateBitCast(iBuilder->CreateOr(block1_shl, block0_shr), iBuilder->getBitBlockType());
793        }
794    }
795    mCurrentFrameOffset.pop_back();
796    mCurrentFrameOffset.pop_back();
797    return result;
798}
799
800/** ------------------------------------------------------------------------------------------------------------- *
801 * @brief getNextCarryIn
802 ** ------------------------------------------------------------------------------------------------------------- */
803Value * CarryManager::getCarryIn(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, const CarryProducingStatement * const op) {
804
805    VectorType * const carryTy = iBuilder->getBitBlockType();
806
807    const auto carryGroup = op->getCarryGroup();
808    assert (carryGroup < mCarryGroup.size());
809    CarryGroup & group = mCarryGroup[carryGroup];
810
811    Value * carryIn = group.carryIn;
812    if (carryIn) {
813        // if this carry crosses a non-carry-collapsing loop frame, compute the address for this carry in the 'new' frame.
814        Value * const frame = cast<GetElementPtrInst>(cast<LoadInst>(carryIn)->getPointerOperand())->getPointerOperand();
815        carryIn = (frame == mCurrentFrame) ? carryIn : nullptr;
816    }
817
818    if (carryIn == nullptr) {
819        mCurrentFrameOffset.push_back(iBuilder->getInt32(mCurrentFrameIndex));
820        if (LLVM_UNLIKELY(mLoopDepth != 0)) {
821            assert ("Loop selector was not initialized!" && mLoopSelector[0]);
822            mCurrentFrameOffset.push_back(mLoopSelector[0]);
823        }
824        Value * ptr = iBuilder->CreateGEP(mCurrentFrame, mCurrentFrameOffset);
825        carryIn = iBuilder->CreateBlockAlignedLoad(ptr);
826        carryIn = iBuilder->CreateBitCast(carryIn, carryTy);
827
828        group.carryIn = carryIn;
829        group.carryOut = nullptr;
830        group.frameIndex = mCurrentFrameIndex++;
831        group.packedSize = 0;
832
833        // If we're within a loop, we need to clear out the carry in value for the subsequent loop iteration
834        if (LLVM_UNLIKELY(mLoopDepth != 0)) {
835            iBuilder->CreateBlockAlignedStore(Constant::getNullValue(carryTy), ptr);
836            mCurrentFrameOffset.pop_back();
837        }
838        mCurrentFrameOffset.pop_back();
839        if (LLVM_UNLIKELY(group.groupSize == 1)) {
840            return carryIn;
841        }
842    }
843
844    // Initially, our carry in is densely packed. Our goal is to move the carry to the 0-th position and leave
845    // all other values zeroed out. We can do so easily using ShuffleVector by moving the appropriate element
846    // and selecting 0s for all other elements but this produces poor code by LLVM.
847
848    // Instead we divide this process into two stages:
849
850    // First use the blend instruction to select only the elements of the carry we want
851
852
853
854    const auto width = std::max<unsigned>(gcd(std::min<unsigned>(op->getCarryWidth(), mElementWidth), group.packedSize), 8);
855    const auto n = udiv(iBuilder->getBitBlockWidth(), width);
856
857    VectorType * packTy = VectorType::get(iBuilder->getIntNTy(width), n);
858
859    Constant * const zero = Constant::getNullValue(packTy);
860
861    carryIn = iBuilder->CreateBitCast(carryIn, packTy);
862
863    if (LLVM_UNLIKELY(group.groupSize > 1)) {
864        int blend_mask[n];
865        for (unsigned i = 0; i < n; ++i) {
866            const auto w = i * width;
867            if (w >= group.packedSize && w < (group.packedSize + op->getCarryWidth())) {
868                blend_mask[i] = i;
869            } else {
870                blend_mask[i] = i + n;
871            }
872        }
873        carryIn = iBuilder->CreateShuffleVector(carryIn, zero, ArrayRef<int>(blend_mask, n));
874    }
875
876    // Then use a byte shift to move them to the 0-th position
877
878    const auto offset = udiv(group.packedSize, width);
879
880    if (offset) {
881        int shift[n];
882        for (unsigned i = 0; i < n; ++i) {
883            shift[i] = offset + i;
884        }
885        carryIn = iBuilder->CreateShuffleVector(carryIn, zero, ArrayRef<int>(shift, n));
886    }
887
888    // if this element contains more than one carry, we need to mask mask off any subsequent carries
889    const unsigned nextPackedSize = group.packedSize + op->getCarryWidth();
890    const unsigned subsequentOffset = (-nextPackedSize) & (width - 1U);
891    if (LLVM_UNLIKELY(subsequentOffset != 0)) {
892        carryIn = iBuilder->CreateShl(carryIn, subsequentOffset);
893    }
894
895    // then shift out the prior carries while moving the current carry to the correct position
896    const unsigned trailingOffset = subsequentOffset + (group.packedSize & (width - 1U));
897    assert (trailingOffset < width);
898    if (LLVM_UNLIKELY(trailingOffset != 0)) {
899        carryIn = iBuilder->CreateLShr(carryIn, trailingOffset);
900    }
901
902    carryIn = iBuilder->CreateBitCast(carryIn, carryTy, "CarryIn_" + op->getName());
903
904    return carryIn;
905}
906
907/** ------------------------------------------------------------------------------------------------------------- *
908 * @brief setNextCarryOut
909 ** ------------------------------------------------------------------------------------------------------------- */
910void CarryManager::setCarryOut(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, const CarryProducingStatement * const op, Value * carryOut) {
911
912    VectorType * const carryTy = iBuilder->getBitBlockType();
913
914    assert ("Unexpected carry out value type!" && carryOut->getType()->canLosslesslyBitCastTo(carryTy));
915
916    const auto carryGroup = op->getCarryGroup();
917    assert (carryGroup < mCarryGroup.size());
918    CarryGroup & group = mCarryGroup[carryGroup];
919
920    const auto width = std::max<unsigned>(gcd(op->getCarryWidth(), group.packedSize), 8);
921    const auto count = udiv(iBuilder->getBitBlockWidth(), width);
922    VectorType * packTy = VectorType::get(iBuilder->getIntNTy(width), count);
923
924    carryOut = iBuilder->CreateBitCast(carryOut, packTy);
925
926    // shuffle the carryOut to have the 0-th slot being in the correct packed offset
927    const auto index = udiv(group.packedSize, width);
928
929    if (index) {
930        assert (index < count);
931        int position[count];
932        for (unsigned i = 0; i < count; ++i) {
933            position[i] = ((count - index) + i);
934        }
935        Constant * const zero = Constant::getNullValue(packTy);
936        carryOut = iBuilder->CreateShuffleVector(zero, carryOut, ArrayRef<int>(position, count));
937    }
938
939    const auto offset = group.packedSize & (width - 1U);
940    if (LLVM_UNLIKELY(offset != 0)) {
941        carryOut = iBuilder->CreateShl(carryOut, offset);
942    }
943    carryOut = iBuilder->CreateBitCast(carryOut, carryTy);
944
945    assert (op->getCarryWidth() > 0);
946
947    if (group.carryOut == nullptr) {
948        group.carryOut = carryOut;
949    } else {
950        #ifndef NDEBUG
951        iBuilder->CreateAssertZero(iBuilder->CreateAnd(group.carryOut, carryOut),
952                               "CarryPackManager: bit collision detected when combining partial carry outs");
953        #endif
954        group.carryOut = iBuilder->CreateOr(group.carryOut, carryOut);
955    }
956
957    group.packedSize += op->getCarryWidth();
958
959    if (LLVM_UNLIKELY(group.groupSize == 1)) {
960        Value * ptr = nullptr;
961        if (LLVM_LIKELY(mLoopDepth == 0)) {
962            ptr = cast<LoadInst>(group.carryIn)->getPointerOperand();
963        } else {
964            mCurrentFrameOffset.push_back(iBuilder->getInt32(group.frameIndex));
965            mCurrentFrameOffset.push_back(mLoopSelector[1]);
966            ptr = iBuilder->CreateGEP(mCurrentFrame, mCurrentFrameOffset);
967            mCurrentFrameOffset.pop_back();
968            mCurrentFrameOffset.pop_back();
969            if (LLVM_LIKELY(!mCarryInfo->nonCarryCollapsingMode())) {
970                group.carryOut = iBuilder->CreateOr(group.carryOut, iBuilder->CreateBlockAlignedLoad(ptr));
971            }
972        }
973
974        if (mCarryInfo->hasSummary()) {
975            // TODO: if this group is all within a single scope, only add the packed carry to the summary
976            addToCarryOutSummary(iBuilder, group.carryOut);
977        }
978
979        iBuilder->CreateBlockAlignedStore(group.carryOut, ptr);
980    }
981
982    group.groupSize -= 1;
983
984}
985
986/** ------------------------------------------------------------------------------------------------------------- *
987 * @brief readCarryInSummary
988 ** ------------------------------------------------------------------------------------------------------------- */
989Value * CarryManager::readCarryInSummary(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, ConstantInt * index) {
990    assert (mCarryInfo->hasSummary());
991    const auto size = mCurrentFrameOffset.size();
992    // If we have a borrowed summary, we cannot be sure how deeply nested it is but do know that it will be the first
993    // element of a nested struct that is not a struct. Traverse inwards until we find it.
994    if (LLVM_UNLIKELY(mCarryInfo->hasBorrowedSummary())) {
995        Type * frameTy = mCurrentFrame->getType()->getPointerElementType();
996        for (unsigned i = 1; i < mCurrentFrameOffset.size(); ++i) {
997            frameTy = frameTy->getStructElementType(cast<ConstantInt>(mCurrentFrameOffset[i])->getLimitedValue());
998        }
999        assert (frameTy->isStructTy());
1000        unsigned count = 1;
1001        for (;;) {
1002            frameTy = frameTy->getStructElementType(0);
1003            if (!frameTy->isStructTy()) {
1004                break;
1005            }
1006            ++count;
1007        }
1008        assert (count > 0);
1009        mCurrentFrameOffset.insert(mCurrentFrameOffset.end(), count - 1, iBuilder->getInt32(0));
1010    }
1011    mCurrentFrameOffset.push_back(index);
1012    if (mLoopDepth != 0) {
1013        mCurrentFrameOffset.push_back(mLoopSelector[0]);
1014    }
1015    Value * const summaryPtr = iBuilder->CreateGEP(mCurrentFrame, mCurrentFrameOffset);
1016    mCurrentFrameOffset.resize(size);
1017    Value * const summary = iBuilder->CreateBlockAlignedLoad(summaryPtr);
1018    if (mLoopDepth != 0 && mCarryInfo->hasExplicitSummary()) {
1019        Type * const carryTy = iBuilder->getBitBlockType();
1020        iBuilder->CreateBlockAlignedStore(Constant::getNullValue(carryTy), summaryPtr);
1021    }
1022    return summary;
1023}
1024
1025/** ------------------------------------------------------------------------------------------------------------- *
1026 * @brief writeCarryOutSummary
1027 ** ------------------------------------------------------------------------------------------------------------- */
1028inline void CarryManager::writeCarryOutSummary(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, Value * const summary, ConstantInt * index) {
1029    assert (mCarryInfo->hasExplicitSummary());
1030    mCurrentFrameOffset.push_back(index);
1031    if (mLoopDepth != 0) {
1032        mCurrentFrameOffset.push_back(mLoopSelector[1]);
1033    }
1034    Value * const summaryPtr = iBuilder->CreateGEP(mCurrentFrame, mCurrentFrameOffset);
1035    iBuilder->CreateBlockAlignedStore(summary, summaryPtr);
1036    if (mLoopDepth != 0) {
1037        mCurrentFrameOffset.pop_back();
1038    }
1039    mCurrentFrameOffset.pop_back();
1040}
1041
1042/** ------------------------------------------------------------------------------------------------------------- *
1043 * @brief addToCarryOutSummary
1044 ** ------------------------------------------------------------------------------------------------------------- */
1045inline void CarryManager::addToCarryOutSummary(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, Value * const value) {
1046    assert ("cannot add null summary value!" && value);   
1047    assert ("summary stack is empty!" && !mCarrySummaryStack.empty());
1048    assert ("current scope does not have a summary!" && mCarryInfo->hasSummary());
1049    Value * const currentSummary = mCarrySummaryStack.back();
1050    mCarrySummaryStack.back() = iBuilder->CreateOr(value, currentSummary);
1051}
1052
1053/** ------------------------------------------------------------------------------------------------------------- *
1054 * @brief enumerate
1055 ** ------------------------------------------------------------------------------------------------------------- */
1056unsigned CarryManager::getScopeCount(const PabloBlock * const scope, unsigned index) {
1057    for (const Statement * stmt : *scope) {
1058        if (LLVM_UNLIKELY(isa<Branch>(stmt))) {
1059            index = getScopeCount(cast<Branch>(stmt)->getBody(), index);
1060        }
1061    }
1062    return index + 1;
1063}
1064
1065/** ------------------------------------------------------------------------------------------------------------- *
1066 * @brief hasIterationSpecificAssignment
1067 ** ------------------------------------------------------------------------------------------------------------- */
1068bool CarryManager::hasIterationSpecificAssignment(const PabloBlock * const scope) {
1069#if 0
1070    return dyn_cast_or_null<While>(scope->getBranch()) != nullptr;
1071#else
1072    if (const While * const br = dyn_cast_or_null<While>(scope->getBranch())) {
1073        for (const Var * var : br->getEscaped()) {
1074            for (const PabloAST * user : var->users()) {
1075                if (const Extract * e = dyn_cast<Extract>(user)) {
1076                    if (LLVM_UNLIKELY(e->getIndex() == var)) {
1077                        // If we assign this Var a value and read the value as the index parameter
1078                        // of a nested Extract statement, then we cannot collapse the carries.
1079                        const PabloBlock * parent = e->getParent();
1080                        for (;;) {
1081                            if (parent == scope) {
1082                                return true;
1083                            }
1084                            parent = parent->getPredecessor();
1085                            if (parent == nullptr) {
1086                                break;
1087                            }
1088                        }
1089                    }
1090                }
1091            }
1092        }
1093    }
1094    return false;
1095#endif
1096}
1097
1098/** ------------------------------------------------------------------------------------------------------------- *
1099 * @brief assignDefaultCarryGroups
1100 ** ------------------------------------------------------------------------------------------------------------- */
1101unsigned CarryManager::assignDefaultCarryGroups(PabloBlock * const scope, const unsigned ifDepth, const unsigned loopDepth, unsigned carryGroups) {
1102    const auto packingSize = getPackingSize(mVectorWidth, (ifDepth + loopDepth));
1103    unsigned packedWidth = 0;
1104    for (Statement * stmt : *scope) {
1105        if (LLVM_UNLIKELY(isa<CarryProducingStatement>(stmt))) {
1106            unsigned amount = 1;
1107            bool canPack = true;
1108            if (LLVM_LIKELY(isa<Advance>(stmt))) {
1109                amount = cast<Advance>(stmt)->getAmount();
1110                canPack = (amount < mElementWidth);
1111            }
1112            else if (isa<IndexedAdvance>(stmt)) {
1113                amount = cast<Advance>(stmt)->getAmount();
1114                canPack = (amount < mElementWidth);
1115            }
1116            if (packedWidth == 0) {
1117                ++carryGroups;
1118            }
1119            if (LLVM_LIKELY(canPack)) {
1120                amount = nearest_multiple(amount, packingSize);
1121                // if we cannot insert this carry into the current pack, start a new group
1122                if ((packedWidth + amount) > mVectorWidth) {
1123                    ++carryGroups;
1124                    packedWidth = amount;
1125                } else {
1126                    packedWidth += amount;
1127                }
1128            } else if (LLVM_LIKELY(packedWidth != 0)) {
1129                ++carryGroups;
1130                packedWidth = 0;
1131            }
1132            assert (carryGroups > 0);
1133            cast<CarryProducingStatement>(stmt)->setCarryGroup(carryGroups - 1);
1134            cast<CarryProducingStatement>(stmt)->setCarryWidth(amount);
1135        } else if (LLVM_UNLIKELY(isa<Branch>(stmt))) {
1136            const auto nestedIfDepth = ifDepth + isa<If>(stmt) ? 1 : 0;
1137            const auto nestedLoopDepth = loopDepth + isa<If>(stmt) ? 0 : 1;
1138            carryGroups = assignDefaultCarryGroups(cast<Branch>(stmt)->getBody(), nestedIfDepth, nestedLoopDepth, carryGroups);
1139            packedWidth = 0;
1140        }
1141    }
1142    return carryGroups;
1143}
1144
1145
1146/** ------------------------------------------------------------------------------------------------------------- *
1147 * @brief analyse
1148 ** ------------------------------------------------------------------------------------------------------------- */
1149StructType * CarryManager::analyse(const std::unique_ptr<kernel::KernelBuilder> & iBuilder,
1150                                   const PabloBlock * const scope,
1151                                   const unsigned ifDepth, const unsigned loopDepth,
1152                                   const bool isNestedWithinNonCarryCollapsingLoop) {
1153
1154    assert ("scope cannot be null!" && scope);
1155    assert ("the entry scope -- and only the entry scope -- must be in carry scope 0"
1156            && (mCarryScopes == 0 ? (scope == mKernel->getEntryBlock()) : (scope != mKernel->getEntryBlock())));
1157    assert (mCarryScopes < mCarryMetadata.size());
1158
1159    Type * const carryTy = iBuilder->getBitBlockType();
1160
1161    CarryData & cd = mCarryMetadata[mCarryScopes++];
1162    const bool nonCarryCollapsingMode = hasIterationSpecificAssignment(scope);
1163    Type * const carryPackTy = (loopDepth == 0) ? carryTy : ArrayType::get(carryTy, 2);
1164    std::vector<Type *> state;
1165
1166    for (const Statement * stmt : *scope) {
1167        if (LLVM_UNLIKELY(isa<CarryProducingStatement>(stmt))) {
1168            const auto index = cast<CarryProducingStatement>(stmt)->getCarryGroup();
1169            assert (index < mCarryGroup.size());
1170            CarryGroup & carryGroup = mCarryGroup[index];
1171            if (carryGroup.groupSize == 0) {
1172                Type * packTy = carryPackTy;
1173                if (LLVM_UNLIKELY(isa<Advance>(stmt))) {
1174                    const auto amount = cast<Advance>(stmt)->getAmount();
1175                    if (LLVM_UNLIKELY(amount >= mElementWidth)) {
1176                        if (LLVM_UNLIKELY(ifDepth > 0 && amount > iBuilder->getBitBlockWidth())) {
1177                            // 1 bit will mark the presense of any bit in each block.
1178                            state.push_back(ArrayType::get(carryTy, ceil_udiv(amount, std::pow(iBuilder->getBitBlockWidth(), 2))));
1179                        }
1180                        mHasLongAdvance = true;
1181                        const auto blocks = ceil_udiv(amount, iBuilder->getBitBlockWidth());
1182                        packTy = ArrayType::get(carryTy, nearest_pow2(blocks + ((loopDepth != 0) ? 1 : 0)));
1183                    }
1184                }
1185                if (LLVM_UNLIKELY(isa<IndexedAdvance>(stmt))) {
1186                    // The carry data for the indexed advance stores N bits of carry data,
1187                    // organized in packs that can be processed with GR instructions (such as PEXT, PDEP, popcount).
1188                    // A circular buffer is used.  Because the number of bits to be dequeued
1189                    // and enqueued is variable (based on the popcount of the index), an extra
1190                    // pack stores the offset position in the circular buffer.
1191                    const auto amount = cast<IndexedAdvance>(stmt)->getAmount();
1192                    const auto packWidth = sizeof(size_t) * 8;
1193                    const auto packs = ceil_udiv(amount, packWidth);
1194                    packTy = ArrayType::get(iBuilder->getSizeTy(), nearest_pow2(packs) + 1);
1195                }
1196                state.push_back(packTy);
1197            }
1198            carryGroup.groupSize += 1;
1199        } else if (LLVM_UNLIKELY(isa<Branch>(stmt))) {
1200            unsigned nestedIfDepth = ifDepth;
1201            unsigned nestedLoopDepth = loopDepth;
1202            if (LLVM_LIKELY(isa<If>(stmt))) {
1203                ++nestedIfDepth;
1204            } else {
1205                mHasLoop = true;
1206                ++nestedLoopDepth;
1207            }
1208            state.push_back(analyse(iBuilder, cast<Branch>(stmt)->getBody(),
1209                                    nestedIfDepth, nestedLoopDepth, nonCarryCollapsingMode | isNestedWithinNonCarryCollapsingLoop));
1210        }
1211    }
1212    // Build the carry state struct and add the summary pack if needed.
1213    StructType * carryState = nullptr;
1214    CarryData::SummaryType summaryType = CarryData::NoSummary;
1215    if (LLVM_UNLIKELY(state.empty())) {
1216        carryState = StructType::get(iBuilder->getContext());
1217    } else {
1218        if (dyn_cast_or_null<If>(scope->getBranch()) || nonCarryCollapsingMode || isNestedWithinNonCarryCollapsingLoop) {
1219            if (LLVM_LIKELY(state.size() > 1)) {
1220                summaryType = CarryData::ExplicitSummary;
1221                // NOTE: summaries are stored differently depending whether we're entering an If or While branch. With an If branch, they
1222                // preceed the carry state data and with a While loop they succeed it. This is to help cache prefectching performance.
1223                state.insert(isa<If>(scope->getBranch()) ? state.begin() : state.end(), carryPackTy);
1224            } else {
1225                summaryType = CarryData::ImplicitSummary;
1226                if (state[0]->isStructTy()) {
1227                    summaryType = CarryData::BorrowedSummary;
1228                }
1229            }           
1230        }
1231        carryState = StructType::get(iBuilder->getContext(), state);
1232        // If we're in a loop and cannot use collapsing carry mode, convert the carry state struct into a capacity,
1233        // carry state pointer, and summary pointer struct.
1234        if (LLVM_UNLIKELY(nonCarryCollapsingMode)) {
1235            mHasNonCarryCollapsingLoops = true;
1236            carryState = StructType::get(iBuilder->getSizeTy(), carryState->getPointerTo(), carryTy->getPointerTo(), nullptr);
1237            assert (isDynamicallyAllocatedType(carryState));
1238        }
1239        cd.setNonCollapsingCarryMode(nonCarryCollapsingMode);
1240    }
1241    cd.setSummaryType(summaryType);
1242
1243    return carryState;
1244}
1245
1246/** ------------------------------------------------------------------------------------------------------------- *
1247 * @brief constructor
1248 ** ------------------------------------------------------------------------------------------------------------- */
1249CarryManager::CarryManager() noexcept
1250: mKernel(nullptr)
1251, mVectorWidth(0)
1252, mElementWidth(0)
1253, mCurrentFrame(nullptr)
1254, mCurrentFrameIndex(0)
1255, mCurrentScope(nullptr)
1256, mCarryInfo(nullptr)
1257, mNextSummaryTest(nullptr)
1258, mIfDepth(0)
1259, mHasLongAdvance(false)
1260, mHasNonCarryCollapsingLoops(false)
1261, mHasLoop(false)
1262, mLoopDepth(0)
1263, mLoopSelector{nullptr, nullptr}
1264, mCarryScopes(0) {
1265
1266}
1267
1268}
Note: See TracBrowser for help on using the repository browser.