- Timestamp:
- Jan 14, 2018, 3:30:04 PM (13 months ago)
- Location:
- icGREP/icgrep-devel/icgrep
- Files:
-
- 14 edited
Legend:
- Unmodified
- Added
- Removed
-
icGREP/icgrep-devel/icgrep/grep_engine.cpp
r5824 r5831 97 97 // 98 98 // All engines share a common pipeline to compute a stream of Matches from a given input Bytestream. 99 //#define USE_DIRECT_LF_BUILDER 1 99 100 unsigned LLVM_READNONE calculateMaxCountRate(const std::unique_ptr<kernel::KernelBuilder> & b) { 101 const unsigned packSize = b->getSizeTy()->getBitWidth(); 102 return (packSize * packSize) / b->getBitBlockWidth(); 103 } 100 104 101 105 std::pair<StreamSetBuffer *, StreamSetBuffer *> GrepEngine::grepPipeline(std::vector<re::RE *> & REs, StreamSetBuffer * ByteStream) { … … 103 107 const unsigned segmentSize = codegen::SegmentSize; 104 108 const unsigned bufferSegments = codegen::BufferSegments * codegen::ThreadNum; 109 // TODO: until we automate stream buffer sizing, use this calculation to determine how large our matches buffer needs to be. 110 const unsigned baseBufferSize = segmentSize * (MaxCountFlag > 0 ? (std::max(bufferSegments, calculateMaxCountRate(idb))) : bufferSegments); 105 111 const unsigned encodingBits = 8; 106 112 107 StreamSetBuffer * BasisBits = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(encodingBits, 1), segmentSize * bufferSegments); 113 StreamSetBuffer * LineFeedStream = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize); 114 115 #ifdef USE_DIRECT_LF_BUILDER 116 kernel::Kernel * linefeedK = mGrepDriver->addKernelInstance<kernel::LineFeedKernelBuilder>(idb, encodingBits); 117 mGrepDriver->makeKernelCall(linefeedK, {ByteStream}, {LineFeedStream}); 118 #endif 119 120 StreamSetBuffer * BasisBits = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(encodingBits, 1), baseBufferSize); 108 121 kernel::Kernel * s2pk = mGrepDriver->addKernelInstance<kernel::S2PKernel>(idb); 109 122 mGrepDriver->makeKernelCall(s2pk, {ByteStream}, {BasisBits}); 110 123 111 StreamSetBuffer * LineFeedStream = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments); 112 #ifdef 
USE_DIRECT_LF_BUILDER 113 kernel::Kernel * linefeedK = mGrepDriver->addKernelInstance<kernel::DirectLineFeedBuilder>(idb); 114 mGrepDriver->makeKernelCall(linefeedK, {ByteStream}, {LineFeedStream}); 115 #else 124 #ifndef USE_DIRECT_LF_BUILDER 116 125 kernel::Kernel * linefeedK = mGrepDriver->addKernelInstance<kernel::LineFeedKernelBuilder>(idb, encodingBits); 117 126 mGrepDriver->makeKernelCall(linefeedK, {BasisBits}, {LineFeedStream}); 118 #endif119 120 StreamSetBuffer * LineBreakStream = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments);121 StreamSetBuffer * CRLFStream = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments);127 #endif 128 129 StreamSetBuffer * LineBreakStream = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize); 130 StreamSetBuffer * CRLFStream = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize); 122 131 kernel::Kernel * linebreakK = mGrepDriver->addKernelInstance<kernel::LineBreakKernelBuilder>(idb, encodingBits); 123 132 mGrepDriver->makeKernelCall(linebreakK, {BasisBits, LineFeedStream}, {LineBreakStream, CRLFStream}); 124 133 125 134 kernel::Kernel * requiredStreamsK = mGrepDriver->addKernelInstance<kernel::RequiredStreams_UTF8>(idb); 126 StreamSetBuffer * RequiredStreams = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(3, 1), segmentSize * bufferSegments);135 StreamSetBuffer * RequiredStreams = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(3, 1), baseBufferSize); 127 136 mGrepDriver->makeKernelCall(requiredStreamsK, {BasisBits}, {RequiredStreams}); 128 137 129 138 const auto n = REs.size(); 130 139 std::vector<StreamSetBuffer *> MatchResultsBufs(n); 131 132 140 for(unsigned i = 0; i < n; ++i) { 133 141 REs[i] = resolveModesAndExternalSymbols(REs[i]); … … 137 145 #ifdef USE_MULTIPLEX_CC 138 146 const std::vector<const re::CC *> UnicodeSets 
= re::collectUnicodeSets(REs[i]); 147 148 StreamSetBuffer * const MatchResults = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize); 139 149 if (UnicodeSets.size() <= 1) { 140 StreamSetBuffer * MatchResults = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments);141 150 kernel::Kernel * icgrepK = mGrepDriver->addKernelInstance<kernel::ICGrepKernel>(idb, REs[i]); 142 151 mGrepDriver->makeKernelCall(icgrepK, {BasisBits, LineBreakStream, CRLFStream, RequiredStreams}, {MatchResults}); 143 152 MatchResultsBufs[i] = MatchResults; 144 } 145 else { 153 } else { 146 154 mpx = make_unique<MultiplexedAlphabet>("mpx", UnicodeSets); 147 155 REs[i] = transformCCs(mpx.get(), REs[i]); 148 156 std::vector<re::CC *> mpx_basis = mpx->getMultiplexedCCs(); 149 157 auto numOfCharacterClasses = mpx_basis.size(); 150 StreamSetBuffer * CharClasses = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(numOfCharacterClasses), segmentSize * bufferSegments);158 StreamSetBuffer * CharClasses = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(numOfCharacterClasses), baseBufferSize); 151 159 kernel::Kernel * ccK = mGrepDriver->addKernelInstance<kernel::CharClassesKernel>(idb, std::move(mpx_basis)); 152 mGrepDriver->makeKernelCall(ccK, {BasisBits}, {CharClasses}); 153 StreamSetBuffer * MatchResults = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments); 160 mGrepDriver->makeKernelCall(ccK, {BasisBits}, {CharClasses}); 154 161 kernel::Kernel * icgrepK = mGrepDriver->addKernelInstance<kernel::ICGrepKernel>(idb, REs[i], std::vector<cc::Alphabet *>{mpx.get()}); 155 162 mGrepDriver->makeKernelCall(icgrepK, {BasisBits, LineBreakStream, CRLFStream, RequiredStreams, CharClasses}, {MatchResults}); … … 157 164 } 158 165 #else 159 StreamSetBuffer * MatchResults = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * 
bufferSegments);166 StreamSetBuffer * MatchResults = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize); 160 167 kernel::Kernel * icgrepK = mGrepDriver->addKernelInstance<kernel::ICGrepKernel>(idb, REs[i]); 161 168 mGrepDriver->makeKernelCall(icgrepK, {BasisBits, LineBreakStream, CRLFStream, RequiredStreams}, {MatchResults}); … … 165 172 StreamSetBuffer * MergedResults = MatchResultsBufs[0]; 166 173 if (REs.size() > 1) { 167 MergedResults = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments);174 MergedResults = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize); 168 175 kernel::Kernel * streamsMergeK = mGrepDriver->addKernelInstance<kernel::StreamsMerge>(idb, 1, REs.size()); 169 176 mGrepDriver->makeKernelCall(streamsMergeK, MatchResultsBufs, {MergedResults}); … … 174 181 StreamSetBuffer * OriginalMatches = Matches; 175 182 kernel::Kernel * matchedLinesK = mGrepDriver->addKernelInstance<kernel::MatchedLinesKernel>(idb); 176 Matches = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments);183 Matches = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize); 177 184 mGrepDriver->makeKernelCall(matchedLinesK, {OriginalMatches, LineBreakStream}, {Matches}); 178 185 } … … 181 188 kernel::Kernel * invertK = mGrepDriver->addKernelInstance<kernel::InvertMatchesKernel>(idb); 182 189 StreamSetBuffer * OriginalMatches = Matches; 183 Matches = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments);190 Matches = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize); 184 191 mGrepDriver->makeKernelCall(invertK, {OriginalMatches, LineBreakStream}, {Matches}); 185 192 } … … 187 194 kernel::Kernel * untilK = mGrepDriver->addKernelInstance<kernel::UntilNkernel>(idb); 188 195 
untilK->setInitialArguments({idb->getSize(MaxCountFlag)}); 189 StreamSetBuffer * AllMatches = Matches;190 Matches = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments);196 StreamSetBuffer * const AllMatches = Matches; 197 Matches = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize); 191 198 mGrepDriver->makeKernelCall(untilK, {AllMatches}, {Matches}); 192 199 } -
icGREP/icgrep-devel/icgrep/kernels/kernel.cpp
r5793 r5831 593 593 * @brief requiresBufferedFinalStride 594 594 ** ------------------------------------------------------------------------------------------------------------- */ 595 inline bool requiresBufferedFinalStride(const Binding & binding) {595 inline bool LLVM_READNONE requiresBufferedFinalStride(const Binding & binding) { 596 596 if (LLVM_LIKELY(isa<ArrayType>(binding.getType()))) { 597 597 return binding.getType()->getArrayNumElements() == 1; … … 603 603 * @brief getItemWidth 604 604 ** ------------------------------------------------------------------------------------------------------------- */ 605 inline unsigned getItemWidth(const Binding & b) {605 inline unsigned LLVM_READNONE getItemWidth(const Binding & b) { 606 606 Type * ty = b.getType(); 607 607 if (LLVM_LIKELY(isa<ArrayType>(ty))) { … … 612 612 613 613 /** ------------------------------------------------------------------------------------------------------------- * 614 * @brief getUpperBound615 ** ------------------------------------------------------------------------------------------------------------- */ 616 bool MultiBlockKernel::isTransitivelyUnknownRate(const ProcessingRate & rate) const {614 * @brief isTransitivelyUnknownRate 615 ** ------------------------------------------------------------------------------------------------------------- */ 616 bool LLVM_READNONE MultiBlockKernel::isTransitivelyUnknownRate(const ProcessingRate & rate) const { 617 617 if (rate.isUnknown()) { 618 618 return true; … … 624 624 625 625 /** ------------------------------------------------------------------------------------------------------------- * 626 * @brief requiresTemporaryInputBuffer 627 ** ------------------------------------------------------------------------------------------------------------- */ 628 inline bool LLVM_READNONE MultiBlockKernel::requiresTemporaryInputBuffer(const Binding & binding, const ProcessingRate & rate) const { 629 if (requiresBufferedFinalStride(binding)) { 630 
return true; 631 } else if (LLVM_UNLIKELY(isTransitivelyUnknownRate(rate))) { 632 report_fatal_error("MultiBlock kernels do not support unknown rate input streams or streams relative to an unknown rate input."); 633 } else { 634 return !rate.isFixed(); 635 } 636 } 637 638 /** ------------------------------------------------------------------------------------------------------------- * 639 * @brief requiresTemporaryOutputBuffer 640 ** ------------------------------------------------------------------------------------------------------------- */ 641 inline bool LLVM_READNONE MultiBlockKernel::requiresTemporaryOutputBuffer(const Binding & binding, const ProcessingRate & rate) const { 642 if (requiresBufferedFinalStride(binding)) { 643 return true; 644 } else { 645 return !(rate.isFixed() || isTransitivelyUnknownRate(rate)); 646 } 647 } 648 649 /** ------------------------------------------------------------------------------------------------------------- * 626 650 * @brief getItemAlignment 627 651 ** ------------------------------------------------------------------------------------------------------------- */ 628 inline unsigned MultiBlockKernel::getItemAlignment(const Binding & binding) const {652 inline unsigned LLVM_READNONE MultiBlockKernel::getItemAlignment(const Binding & binding) const { 629 653 const auto & rate = binding.getRate(); 630 654 if (rate.isFixed() && binding.nonDeferred() && !binding.isMisaligned()) { … … 641 665 642 666 /** ------------------------------------------------------------------------------------------------------------- * 667 * @brief getCopyAlignment 668 ** ------------------------------------------------------------------------------------------------------------- */ 669 inline unsigned LLVM_READNONE MultiBlockKernel::getCopyAlignment(const Binding & binding) const { 670 return ((getItemAlignment(binding) * getItemWidth(binding)) + 7) / 8; 671 } 672 673 /** 
------------------------------------------------------------------------------------------------------------- * 643 674 * @brief getStrideSize 644 675 ** ------------------------------------------------------------------------------------------------------------- */ 645 llvm::Value * MultiBlockKernel::getStrideSize(const std::unique_ptr<KernelBuilder> & b, const ProcessingRate & rate) {676 llvm::Value * LLVM_READNONE MultiBlockKernel::getStrideSize(const std::unique_ptr<KernelBuilder> & b, const ProcessingRate & rate) { 646 677 // NOTE: if we ever support feedback loops, using upper bound could lead to a deadlock due to data starvation 647 678 const auto r = getUpperBound(rate); … … 669 700 const auto outputSetCount = mStreamSetOutputs.size(); 670 701 671 // Define and allocate the temporary buffer area in the prolog. 702 // Define and allocate the temporary buffer area in the prolog. 672 703 const auto blockAlignment = b->getBitBlockWidth() / 8; 673 704 AllocaInst * temporaryInputBuffer[inputSetCount]; 674 for (unsigned i = 0; i < inputSetCount; ++i) { 705 for (unsigned i = 0; i < inputSetCount; ++i) { 675 706 const Binding & input = mStreamSetInputs[i]; 676 707 const ProcessingRate & rate = input.getRate(); 677 if (isTransitivelyUnknownRate(rate)) { 678 report_fatal_error("MultiBlock kernels do not support unknown rate input streams or streams relative to an unknown rate input."); 679 } else if (rate.isFixed() && !requiresBufferedFinalStride(input)) { 680 temporaryInputBuffer[i] = nullptr; 681 } else { 708 temporaryInputBuffer[i] = nullptr; 709 if (requiresTemporaryInputBuffer(input, rate)) { 682 710 Type * const ty = mStreamSetInputBuffers[i]->getStreamSetBlockType(); 683 711 auto ub = getUpperBound(rate); … … 696 724 const Binding & output = mStreamSetOutputs[i]; 697 725 const ProcessingRate & rate = output.getRate(); 698 if (LLVM_UNLIKELY(isTransitivelyUnknownRate(rate) || (rate.isFixed() && !requiresBufferedFinalStride(output)))) { 699 
temporaryOutputBuffer[i] = nullptr; 700 } else { 726 temporaryOutputBuffer[i] = nullptr; 727 if (requiresTemporaryOutputBuffer(output, rate)) { 701 728 auto ub = getUpperBound(rate); 702 729 if (LLVM_UNLIKELY(mStreamSetOutputBuffers[i]->supportsCopyBack() && requiresCopyBack(rate))) { … … 710 737 } 711 738 } 712 713 // Now we iteratively process these blocks using the doMultiBlock method.714 // In each iteration, we check how many linearly accessible / writable715 // items can be processed with our current input / output buffers. If we716 // cannot support an full stride, we check whether (a) there is enough717 // input data to process but it is not linearly accessible, in which case718 // we move the data into temporary buffers or (b) there is not enough data719 // to process, in which case we abort unless IsFinal was set.720 739 721 740 Constant * const ZERO = b->getSize(0); … … 738 757 } 739 758 740 // Now proceed with creation of the doSegment method.741 BasicBlock * const segmentLoop = b->CreateBasicBlock("SegmentLoop");742 743 b->CreateBr(segmentLoop);744 745 /// DO SEGMENT LOOP746 747 b->SetInsertPoint(segmentLoop);748 749 // For each input buffer, get the initial processed item count, base input pointer, and the number of750 // linearly available strides.751 Value * numOfStrides = nullptr;752 759 mInitialAvailableItemCount.assign(mAvailableItemCount.begin(), mAvailableItemCount.end()); 753 760 mInitialProcessedItemCount.resize(inputSetCount); 754 761 mStreamSetInputBaseAddress.resize(inputSetCount); 762 763 // Now proceed with creation of the doSegment method. 764 BasicBlock * const segmentLoop = b->CreateBasicBlock("SegmentLoop"); 765 766 b->CreateBr(segmentLoop); 767 768 /// DO SEGMENT LOOP 769 770 b->SetInsertPoint(segmentLoop); 771 772 Value * numOfStrides = nullptr; 773 774 // TODO: we don't want the our available output space to limit how many conditional blocks we 775 // can check. 
When we have a conditional region, split computation of input/output strides and 776 // check as many input strides as possible but leave the kernel in a state that respects our 777 // available output space. NOTE: we know coming into this block that the pipeline or kernel has 778 // ensured there is at least one stride worth of space. 779 780 781 // For each input buffer, get the initial processed item count, base input pointer, and the number of 782 // linearly available strides. 755 783 Value * inputStrideSize[inputSetCount]; 784 Value * linearlyAccessible[inputSetCount]; 756 785 for (unsigned i = 0; i < inputSetCount; i++) { 757 786 const Binding & input = mStreamSetInputs[i]; 758 787 const auto & name = input.getName(); 759 const ProcessingRate & rate = input.getRate();760 788 Value * const processed = b->getProcessedItemCount(name); 761 762 789 mInitialProcessedItemCount[i] = processed; 763 Value * baseBuffer = b->getBlockAddress(name, b->CreateLShr(processed, LOG_2_BLOCK_WIDTH)); 764 765 if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableAsserts))) { 790 mStreamSetInputBaseAddress[i] = b->getBlockAddress(name, b->CreateLShr(processed, LOG_2_BLOCK_WIDTH)); 791 if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableAsserts))) { 766 792 b->CreateAssert(b->CreateICmpULE(processed, mAvailableItemCount[i]), 767 793 getName() + ": " + name + " processed item count exceeds its available item count"); 768 794 } 769 770 // Ensure that everything between S⌈P/S⌉, and S⌈n*(P + L)/S⌉ is linearly available, where S is the stride size, 771 // P is the current processed position, L is the lookahead amount and n ∈ ℤ+. 
772 773 Value * const unprocessed = b->CreateSub(mAvailableItemCount[i], processed); 774 Value * avail = b->getLinearlyAccessibleItems(name, processed, unprocessed); 775 Value * remaining = avail; 776 if (LLVM_UNLIKELY(input.hasLookahead())) { 777 Constant * const lookahead = b->getSize(input.getLookahead()); 778 remaining = b->CreateSelect(b->CreateICmpULT(lookahead, remaining), b->CreateSub(remaining, lookahead), ZERO); 779 } 780 781 inputStrideSize[i] = getStrideSize(b, rate); 782 Value * accessibleStrides = b->CreateUDiv(remaining, inputStrideSize[i]); 795 Value * const unprocessed = b->CreateNUWSub(mAvailableItemCount[i], processed); 796 mAvailableItemCount[i] = unprocessed; 797 Value * const accessible = b->getLinearlyAccessibleItems(name, processed, unprocessed); 798 linearlyAccessible[i] = accessible; 799 inputStrideSize[i] = getStrideSize(b, input.getRate()); 800 Value * const accessibleStrides = b->CreateUDiv(accessible, inputStrideSize[i]); 801 numOfStrides = b->CreateUMin(numOfStrides, accessibleStrides); 802 } 803 804 BasicBlock * const checkInputAvailability = b->CreateBasicBlock("CheckInputAvailability"); 805 BasicBlock * const selectOutputBuffers = b->CreateBasicBlock("SelectOutputBuffers"); 806 b->CreateLikelyCondBr(b->CreateICmpNE(numOfStrides, ZERO), selectOutputBuffers, checkInputAvailability); 807 808 // Ensure that everything between S⌈P/S⌉ and S⌈n*(P + L)/S⌉ is linearly available, where S is the stride size, 809 // P is the current processed position, L is the lookahead amount and n is our number of accessible strides ∈ ℤ+. 
810 b->SetInsertPoint(checkInputAvailability); 811 Value * const initiallyFinal = mIsFinal; 812 Value * linearlyCopyable[inputSetCount]; 813 PHINode * selectedInputBuffer[inputSetCount]; 814 for (unsigned i = 0; i < inputSetCount; i++) { 783 815 AllocaInst * const tempBuffer = temporaryInputBuffer[i]; 816 selectedInputBuffer[i] = nullptr; 784 817 if (tempBuffer) { 785 818 786 // Since we trust that the pipeline won't call this kernel unless there is enough data to process a stride, whenever 787 // we discover that there isn't enough linearly available data, optimistically copy the data to the temporary buffer. 819 const Binding & input = mStreamSetInputs[i]; 820 const auto & name = input.getName(); 821 Value * const processed = mInitialProcessedItemCount[i]; 822 Value * const unprocessed = mAvailableItemCount[i]; 823 Value * const accessible = linearlyAccessible[i]; 788 824 789 825 BasicBlock * const entry = b->GetInsertBlock(); … … 792 828 BasicBlock * const resume = b->CreateBasicBlock(name + "Resume"); 793 829 794 b->CreateUnlikelyCondBr(b->CreateICmpEQ(accessibleStrides, ZERO), copyFromBack, resume); 830 Value * strideSize = inputStrideSize[i]; 831 if (LLVM_UNLIKELY(input.hasLookahead())) { 832 Constant * const lookahead = b->getSize(input.getLookahead()); 833 strideSize = b->CreateNUWAdd(strideSize, lookahead); 834 } 835 Value * const requiresCopy = b->CreateICmpULT(accessible, strideSize); 836 b->CreateUnlikelyCondBr(requiresCopy, copyFromBack, resume); 795 837 796 838 b->SetInsertPoint(copyFromBack); 797 839 Value * const arraySize = b->CreateZExt(tempBuffer->getArraySize(), b->getInt64Ty()); 798 Value * const temporarySize = b->CreateTrunc(b->CreateMul(arraySize, b->getInt64(mStride)), unprocessed->getType());799 Value * const temporaryAvailable = b->CreateUMin(unprocessed, temporarySize);840 Value * const temporarySize = b->CreateTrunc(b->CreateMul(arraySize, b->getInt64(mStride)), accessible->getType()); 841 Value * const copyable = 
b->CreateUMin(unprocessed, temporarySize); // <- we only really need strideSize items 800 842 Value * const offset = b->CreateAnd(processed, BLOCK_WIDTH_MASK); 801 843 Value * const bufferSize = b->CreateMul(ConstantExpr::getSizeOf(tempBuffer->getAllocatedType()), arraySize); 802 844 b->CreateMemZero(tempBuffer, bufferSize, blockAlignment); 803 const auto copyAlignment = getItemAlignment(mStreamSetInputs[i]); 804 b->CreateStreamCpy(name, tempBuffer, ZERO, baseBuffer, offset, avail, copyAlignment); 805 Value * const temporaryStrides = b->CreateSelect(b->CreateICmpULT(unprocessed, inputStrideSize[i]), ZERO, ONE); 845 b->CreateStreamCpy(name, tempBuffer, ZERO, mStreamSetInputBaseAddress[i], offset, accessible, getItemAlignment(input)); 806 846 BasicBlock * const copyToBackEnd = b->GetInsertBlock(); 807 b->CreateCondBr(b->CreateICmpNE( temporaryAvailable, unprocessed), copyFromFront, resume);847 b->CreateCondBr(b->CreateICmpNE(copyable, accessible), copyFromFront, resume); 808 848 809 849 b->SetInsertPoint(copyFromFront); 810 Value * const remaining = b->CreateSub( temporaryAvailable, avail);850 Value * const remaining = b->CreateSub(copyable, accessible); 811 851 Value * const baseAddress = b->getBaseAddress(name); 812 b->CreateStreamCpy(name, tempBuffer, avail, baseAddress, ZERO, remaining, copyAlignment); 852 b->CreateStreamCpy(name, tempBuffer, accessible, baseAddress, ZERO, remaining, getItemAlignment(input)); 853 Value * const isPartialStride = b->CreateICmpUGE(copyable, strideSize); 813 854 BasicBlock * const copyToFrontEnd = b->GetInsertBlock(); 814 855 b->CreateBr(resume); 815 856 816 857 b->SetInsertPoint(resume); 817 PHINode * const bufferPtr = b->CreatePHI(baseBuffer->getType(), 4); 818 bufferPtr->addIncoming(baseBuffer, entry); 819 bufferPtr->addIncoming(tempBuffer, copyToBackEnd); 820 bufferPtr->addIncoming(tempBuffer, copyToFrontEnd); 821 baseBuffer = bufferPtr; 822 823 PHINode * const phiAvailItemCount = b->CreatePHI(b->getSizeTy(), 4); 824 
phiAvailItemCount->addIncoming(avail, entry); 825 phiAvailItemCount->addIncoming(temporaryAvailable, copyToBackEnd); 826 phiAvailItemCount->addIncoming(temporaryAvailable, copyToFrontEnd); 827 avail = phiAvailItemCount; 828 829 PHINode * const phiStrides = b->CreatePHI(b->getSizeTy(), 4); 830 phiStrides->addIncoming(accessibleStrides, entry); 831 phiStrides->addIncoming(temporaryStrides, copyToBackEnd); 832 phiStrides->addIncoming(temporaryStrides, copyToFrontEnd); 833 accessibleStrides = phiStrides; 834 } 835 mAvailableItemCount[i] = avail; 836 mStreamSetInputBaseAddress[i] = baseBuffer; 837 numOfStrides = b->CreateUMin(numOfStrides, accessibleStrides); 838 } 858 PHINode * const address = b->CreatePHI(tempBuffer->getType(), 3); 859 address->addIncoming(mStreamSetInputBaseAddress[i], entry); 860 address->addIncoming(tempBuffer, copyToBackEnd); 861 address->addIncoming(tempBuffer, copyToFrontEnd); 862 selectedInputBuffer[i] = address; 863 PHINode * const available = b->CreatePHI(accessible->getType(), 3); 864 available->addIncoming(accessible, entry); 865 available->addIncoming(copyable, copyToBackEnd); 866 available->addIncoming(copyable, copyToFrontEnd); 867 linearlyCopyable[i] = available; 868 PHINode * const finalStride = b->CreatePHI(b->getInt1Ty(), 3); 869 finalStride->addIncoming(mIsFinal, entry); 870 finalStride->addIncoming(b->getTrue(), copyToBackEnd); 871 finalStride->addIncoming(isPartialStride, copyToFrontEnd); 872 mIsFinal = finalStride; 873 if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableAsserts))) { 874 Value * const hasStride = b->CreateOr(initiallyFinal, b->CreateNot(finalStride)); 875 b->CreateAssert(hasStride, getName() + ": " + name + " has insufficient input data for one stride"); 876 } 877 } 878 } 879 880 BasicBlock * const endCheckInputAvailability = b->GetInsertBlock(); 881 selectOutputBuffers->moveAfter(endCheckInputAvailability); 882 b->CreateBr(selectOutputBuffers); 883 884 b->SetInsertPoint(selectOutputBuffers); 885 PHINode * 
const final = b->CreatePHI(mIsFinal->getType(), 2); 886 final->addIncoming(b->getFalse(), segmentLoop); 887 final->addIncoming(mIsFinal, endCheckInputAvailability); 888 mIsFinal = final; 889 for (unsigned i = 0; i < inputSetCount; i++) { 890 if (selectedInputBuffer[i]) { 891 PHINode * const address = b->CreatePHI(selectedInputBuffer[i]->getType(), 2); 892 address->addIncoming(mStreamSetInputBaseAddress[i], segmentLoop); 893 address->addIncoming(selectedInputBuffer[i], endCheckInputAvailability); 894 mStreamSetInputBaseAddress[i] = address; 895 PHINode * const accessible = b->CreatePHI(linearlyAccessible[i]->getType(), 2); 896 accessible->addIncoming(linearlyAccessible[i], segmentLoop); 897 accessible->addIncoming(linearlyCopyable[i], endCheckInputAvailability); 898 linearlyAccessible[i] = accessible; 899 } 900 } 901 PHINode * const strides = b->CreatePHI(numOfStrides->getType(), 2); 902 strides->addIncoming(numOfStrides, segmentLoop); 903 strides->addIncoming(ONE, endCheckInputAvailability); 904 numOfStrides = strides; 839 905 840 906 // Now determine the linearly writeable strides 907 Value * outputStrideSize[outputSetCount]; 841 908 Value * linearlyWritable[outputSetCount]; 842 Value * outputStrideSize[outputSetCount];843 909 mInitialProducedItemCount.resize(outputSetCount); 844 910 mStreamSetOutputBaseAddress.resize(outputSetCount); … … 846 912 const auto & output = mStreamSetOutputs[i]; 847 913 const auto & name = output.getName(); 848 const ProcessingRate & rate = output.getRate();849 914 Value * const produced = b->getProducedItemCount(name); 850 915 Value * baseBuffer = b->getBlockAddress(name, b->CreateLShr(produced, LOG_2_BLOCK_WIDTH)); 851 assert (baseBuffer->getType()->isPointerTy());852 linearlyWritable[i] = b->getLinearlyWritableItems(name, produced);853 outputStrideSize[i] = getStrideSize(b, rate); 916 mInitialProducedItemCount[i] = produced; 917 mStreamSetOutputBaseAddress[i] = baseBuffer; 918 854 919 // Is the number of linearly writable items 
sufficient for a stride? 920 outputStrideSize[i] = getStrideSize(b, output.getRate()); 855 921 if (outputStrideSize[i]) { 922 linearlyWritable[i] = b->getLinearlyWritableItems(name, produced); 923 Value * writableStrides = b->CreateUDiv(linearlyWritable[i], outputStrideSize[i]); 924 numOfStrides = b->CreateUMin(numOfStrides, writableStrides); 925 // Do we require a temporary buffer to write to? 856 926 AllocaInst * const tempBuffer = temporaryOutputBuffer[i]; 857 Value * writableStrides = b->CreateUDiv(linearlyWritable[i], outputStrideSize[i]);858 // Do we require a temporary buffer to write to?859 927 if (tempBuffer) { 860 928 assert (tempBuffer->getType() == baseBuffer->getType()); … … 876 944 baseBuffer = phiBuffer; 877 945 PHINode * const phiStrides = b->CreatePHI(b->getSizeTy(), 2); 878 phiStrides->addIncoming( writableStrides, entry);946 phiStrides->addIncoming(numOfStrides, entry); 879 947 phiStrides->addIncoming(ONE, prepareTempBuffer); 880 writableStrides = phiStrides; 881 } 882 numOfStrides = b->CreateUMin(numOfStrides, writableStrides); 883 } 884 mInitialProducedItemCount[i] = produced; 885 mStreamSetOutputBaseAddress[i] = baseBuffer; 886 } 887 888 BasicBlock * const segmentDone = b->CreateBasicBlock("SegmentDone"); 889 890 Value * const initiallyFinal = mIsFinal; 891 if (LLVM_LIKELY(numOfStrides != nullptr)) { 892 mIsFinal = b->CreateAnd(mIsFinal, b->CreateICmpEQ(numOfStrides, ZERO)); 893 if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableAsserts))) { 894 Value * const hasStride = b->CreateOr(b->CreateICmpNE(numOfStrides, ZERO), mIsFinal); 895 b->CreateAssert(hasStride, getName() + " has insufficient input data or output space for one stride"); 896 } 897 for (unsigned i = 0; i < inputSetCount; ++i) { 898 const auto & input = mStreamSetInputs[i]; 899 const ProcessingRate & rate = input.getRate(); 900 if (rate.isFixed() && input.nonDeferred()) { 901 mAvailableItemCount[i] = b->CreateSelect(mIsFinal, mAvailableItemCount[i], 
b->CreateMul(numOfStrides, inputStrideSize[i])); 902 } 903 } 904 } 905 906 // We have one or more blocks of input data and output buffer space for all stream sets. 948 numOfStrides = phiStrides; 949 } 950 if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableAsserts))) { 951 b->CreateAssert(numOfStrides, getName() + ": " + name + " has insufficient output space for one stride"); 952 } 953 } 954 } 955 956 // Update the locally available item count to reflect the current state 957 for (unsigned i = 0; i < inputSetCount; i++) { 958 const Binding & input = mStreamSetInputs[i]; 959 if (input.getRate().isFixed() && input.nonDeferred()) { 960 Value * const processable = b->CreateMul(numOfStrides, inputStrideSize[i]); 961 linearlyAccessible[i] = b->CreateSelect(mIsFinal, linearlyAccessible[i], processable); 962 } 963 mAvailableItemCount[i] = linearlyAccessible[i]; 964 } 965 966 // We have one or more strides of input data and output buffer space for all stream sets. 907 967 generateMultiBlockLogic(b, numOfStrides); 908 968 … … 920 980 const ProcessingRate & rate = output.getRate(); 921 981 if (rate.isFixed()) { 922 assert (output.nonDeferred());923 982 Value * const produced = b->CreateMul(numOfStrides, outputStrideSize[i]); 924 Value * const ic = b->Create Add(mInitialProducedItemCount[i], produced);983 Value * const ic = b->CreateNUWAdd(mInitialProducedItemCount[i], produced); 925 984 b->setProducedItemCount(output.getName(), ic); 926 985 } … … 950 1009 // Copy back data to the actual output buffers. 
951 1010 for (unsigned i = 0; i < outputSetCount; i++) { 1011 952 1012 AllocaInst * const tempBuffer = temporaryOutputBuffer[i]; 953 1013 if (LLVM_UNLIKELY(tempBuffer == nullptr)) { … … 955 1015 } 956 1016 1017 const auto & name = mStreamSetOutputs[i].getName(); 1018 Value * const produced = b->getProducedItemCount(name); 957 1019 Value * const baseBuffer = mStreamSetOutputBaseAddress[i]; 958 1020 assert ("stack corruption likely" && (tempBuffer->getType() == baseBuffer->getType())); 959 const auto & name = mStreamSetOutputs[i].getName();1021 //const auto & name = mStreamSetOutputs[i].getName(); 960 1022 BasicBlock * const copyToBack = b->CreateBasicBlock(name + "CopyToBack"); 961 1023 BasicBlock * const copyToFront = b->CreateBasicBlock(name + "CopyToFront"); 962 1024 BasicBlock * const resume = b->CreateBasicBlock(name + "ResumeCopyBack"); 963 1025 // If we used a temporary buffer, copy it back to the original output buffer 964 b->CreateCondBr(b->CreateICmpEQ(tempBuffer, baseBuffer), copyToBack, resume); 1026 Value * const requiresCopy = b->CreateICmpEQ(tempBuffer, baseBuffer); 1027 b->CreateCondBr(requiresCopy, copyToBack, resume); 965 1028 966 1029 b->SetInsertPoint(copyToBack); 967 1030 Value * const offset = b->CreateAnd(mInitialProducedItemCount[i], BLOCK_WIDTH_MASK); 968 Value * const newProducedItemCount = b->getProducedItemCount(name);969 Value * const newlyProduced = b->Create Sub(newProducedItemCount, mInitialProducedItemCount[i]);1031 //Value * const newProducedItemCount = b->getProducedItemCount(name); 1032 Value * const newlyProduced = b->CreateNUWSub(produced, mInitialProducedItemCount[i]); 970 1033 Value * const toWrite = b->CreateUMin(newlyProduced, linearlyWritable[i]); 971 1034 const auto alignment = getItemAlignment(mStreamSetOutputs[i]); … … 975 1038 976 1039 b->SetInsertPoint(copyToFront); 977 Value * const remaining = b->Create Sub(newlyProduced, toWrite);1040 Value * const remaining = b->CreateNUWSub(newlyProduced, toWrite); 978 1041 Value 
* const baseAddress = b->getBaseAddress(name); 979 1042 b->CreateStreamCpy(name, baseAddress, ZERO, tempBuffer, toWrite, remaining, alignment); … … 987 1050 BasicBlock * const setTermination = b->CreateBasicBlock("setTermination"); 988 1051 b->CreateCondBr(mIsFinal, setTermination, strideDone); 989 990 1052 b->SetInsertPoint(setTermination); 991 1053 b->setTerminationSignal(); 1054 BasicBlock * const segmentDone = b->CreateBasicBlock("SegmentDone"); 992 1055 b->CreateBr(segmentDone); 993 1056 … … 1007 1070 } 1008 1071 Value * remaining = b->CreateSub(avail, processed); 1072 Value * strideSize = inputStrideSize[i]; 1009 1073 if (LLVM_UNLIKELY(input.hasLookahead())) { 1010 Constant * const lookahead = b->getSize(input.getLookahead()); 1011 remaining = b->CreateSelect(b->CreateICmpULT(lookahead, remaining), b->CreateSub(remaining, lookahead), ZERO); 1012 } 1013 Value * const remainingStrides = b->CreateUDiv(remaining, inputStrideSize[i]); 1014 Value * const hasRemainingStrides = b->CreateICmpNE(remainingStrides, ZERO); 1074 strideSize = b->CreateNUWAdd(strideSize, b->getSize(input.getLookahead())); 1075 } 1076 Value * const hasRemainingStrides = b->CreateICmpUGE(remaining, strideSize); 1015 1077 hasMoreStrides = b->CreateAnd(hasMoreStrides, hasRemainingStrides); 1016 1078 } … … 1037 1099 } 1038 1100 Value * const remaining = b->CreateSub(capacity, unconsumed); 1039 Value * const remainingStrides = b->CreateUDiv(remaining, outputStrideSize[i]); 1040 Value * const hasRemainingStrides = b->CreateICmpNE(remainingStrides, ZERO); 1041 1101 Value * const hasRemainingStrides = b->CreateICmpUGE(remaining, outputStrideSize[i]); 1042 1102 hasMoreStrides = b->CreateAnd(hasMoreStrides, hasRemainingStrides); 1043 1103 } … … 1225 1285 * @brief generateMultiBlockLogic 1226 1286 ** ------------------------------------------------------------------------------------------------------------- */ 1227 Value *BlockOrientedKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> 
& b, Value * const numOfBlocks) {1287 void BlockOrientedKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & b, Value * const numOfBlocks) { 1228 1288 1229 1289 if (LLVM_UNLIKELY(mStride != b->getBitBlockWidth())) { … … 1239 1299 BasicBlock * const doFinalBlock = b->CreateBasicBlock(getName() + "_doFinalBlock"); 1240 1300 BasicBlock * const segmentDone = b->CreateBasicBlock(getName() + "_segmentDone"); 1241 if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableAsserts))) { 1242 b->CreateAssert(b->CreateXor(b->CreateIsNotNull(numOfBlocks), mIsFinal), 1243 "numOfStrides cannot be 0 unless this is the final stride and must be 0 if it is"); 1244 } 1301 1245 1302 const auto inputSetCount = mStreamSetInputs.size(); 1246 1303 Value * baseProcessedIndex[inputSetCount]; … … 1344 1401 } 1345 1402 1346 Value * const remainingItems = getRemainingItems(b); 1347 1348 // b->CallPrintInt(getName() + "_remainingItems", remainingItems); 1349 1350 writeFinalBlockMethod(b, remainingItems); 1403 writeFinalBlockMethod(b, getRemainingItems(b)); 1351 1404 1352 1405 b->CreateBr(segmentDone); … … 1368 1421 } 1369 1422 1370 return numOfBlocks;1371 1423 } 1372 1424 -
icGREP/icgrep-devel/icgrep/kernels/kernel.h
r5793 r5831 433 433 // exit the RetVoid instruction will be added to complete the method. 434 434 // 435 virtual llvm::Value *generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & b, llvm::Value * const numOfStrides) = 0;435 virtual void generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & b, llvm::Value * const numOfStrides) = 0; 436 436 437 437 private: … … 444 444 unsigned getItemAlignment(const Binding & binding) const; 445 445 446 unsigned getCopyAlignment(const Binding & binding) const; 447 446 448 bool isTransitivelyUnknownRate(const ProcessingRate & rate) const; 449 450 bool requiresTemporaryInputBuffer(const Binding & binding, const ProcessingRate & rate) const; 451 452 bool requiresTemporaryOutputBuffer(const Binding & binding, const ProcessingRate & rate) const; 447 453 448 454 llvm::Value * getStrideSize(const std::unique_ptr<KernelBuilder> & b, const ProcessingRate & rate); … … 488 494 private: 489 495 490 llvm::Value *generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & b, llvm::Value * const numOfStrides) final;496 void generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & b, llvm::Value * const numOfStrides) final; 491 497 492 498 void writeDoBlockMethod(const std::unique_ptr<KernelBuilder> & b); -
icGREP/icgrep-devel/icgrep/kernels/lz4_bytestream_decoder.cpp
r5793 r5831 11 11 using namespace kernel; 12 12 13 Value *LZ4ByteStreamDecoderKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & b, Value * numOfStrides) {13 void LZ4ByteStreamDecoderKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & b, Value * numOfStrides) { 14 14 15 15 BasicBlock * entry_block = b->GetInsertBlock(); … … 169 169 170 170 b->SetInsertPoint(loopExit); 171 return numOfStrides;172 171 } 173 172 -
icGREP/icgrep-devel/icgrep/kernels/lz4_bytestream_decoder.h
r5755 r5831 18 18 LZ4ByteStreamDecoderKernel(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, size_t bufferSize); 19 19 protected: 20 llvm::Value *generateMultiBlockLogic(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, llvm::Value * numOfStrides) override;20 void generateMultiBlockLogic(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, llvm::Value * numOfStrides) override; 21 21 private: 22 22 size_t mBufferSize; -
icGREP/icgrep-devel/icgrep/kernels/radix64.cpp
r5755 r5831 39 39 // of bytes to the actual output stream. 40 40 41 Value *expand3_4Kernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> &iBuilder, Value * const numOfStrides) {41 void expand3_4Kernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> &iBuilder, Value * const numOfStrides) { 42 42 43 43 BasicBlock * expand2_3entry = iBuilder->GetInsertBlock(); … … 131 131 iBuilder->SetInsertPoint(expand3_4_exit); 132 132 133 return numOfStrides;134 133 } 135 134 -
icGREP/icgrep-devel/icgrep/kernels/radix64.h
r5755 r5831 25 25 bool hasSignature() const override { return false; } 26 26 private: 27 llvm::Value *generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> &iBuilder, llvm::Value * const numOfStrides) override;27 void generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> &iBuilder, llvm::Value * const numOfStrides) override; 28 28 }; 29 29 -
icGREP/icgrep-devel/icgrep/kernels/scanmatchgen.cpp
r5782 r5831 21 21 namespace kernel { 22 22 23 Value * ScanMatchKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> &iBuilder, Value * const numOfStrides) {23 void ScanMatchKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & iBuilder, Value * const numOfStrides) { 24 24 25 25 Module * const m = iBuilder->getModule(); … … 47 47 Value * line_break = iBuilder->getInputStreamBlockPtr("lineBreak", iBuilder->getInt32(0)); 48 48 49 Value * blocksToDo = iBuilder->CreateAdd(numOfStrides, iBuilder->CreateZExt(mIsFinal, numOfStrides->getType())); 50 blocksToDo = iBuilder->CreateMul(blocksToDo, iBuilder->getSize(mStride / iBuilder->getBitBlockWidth())); 49 Value * const blocksToDo = iBuilder->CreateMul(numOfStrides, iBuilder->getSize(mStride / iBuilder->getBitBlockWidth())); 51 50 52 51 Value * match_result_ptr = iBuilder->CreateBitCast(match_result, scanwordVectorType->getPointerTo()); … … 205 204 206 205 iBuilder->SetInsertPoint(scanReturn); 207 return numOfStrides;208 206 } 209 207 -
icGREP/icgrep-devel/icgrep/kernels/scanmatchgen.h
r5755 r5831 20 20 bool hasSignature() const override { return false; } 21 21 private: 22 llvm::Value *generateMultiBlockLogic(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, llvm::Value * const numOfStrides) override;22 void generateMultiBlockLogic(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, llvm::Value * const numOfStrides) override; 23 23 }; 24 24 -
icGREP/icgrep-devel/icgrep/kernels/stdout_kernel.cpp
r5793 r5831 15 15 namespace kernel { 16 16 17 Value *StdOutKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & b, llvm::Value * const numOfStrides) {17 void StdOutKernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & b, llvm::Value * const numOfStrides) { 18 18 Value * codeUnitBuffer = b->getInputStreamBlockPtr("codeUnitBuffer", b->getInt32(0)); 19 19 codeUnitBuffer = b->CreatePointerCast(codeUnitBuffer, b->getInt8PtrTy()); … … 25 25 } 26 26 b->CreateWriteCall(b->getInt32(1), codeUnitBuffer, bytesToDo); 27 return numOfStrides;28 27 } 29 28 … … 64 63 } 65 64 66 Value *FileSink::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & b, Value * const numOfStrides) {65 void FileSink::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & b, Value * const numOfStrides) { 67 66 Value * const fileDes = b->getScalarField("fileDes"); 68 67 Value * codeUnitBuffer = b->getInputStreamBlockPtr("codeUnitBuffer", b->getInt32(0)); … … 75 74 } 76 75 b->CreateWriteCall(fileDes, codeUnitBuffer, bytesToDo); 77 return numOfStrides;78 76 } 79 77 -
icGREP/icgrep-devel/icgrep/kernels/stdout_kernel.h
r5793 r5831 16 16 StdOutKernel(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, unsigned codeUnitWidth); 17 17 private: 18 llvm::Value *generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & iBuilder, llvm::Value * const numOfStrides) override;18 void generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & iBuilder, llvm::Value * const numOfStrides) override; 19 19 private: 20 20 const unsigned mCodeUnitWidth; … … 27 27 protected: 28 28 void generateInitializeMethod(const std::unique_ptr<KernelBuilder> & iBuilder) override; 29 llvm::Value *generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & iBuilder, llvm::Value * const numOfStrides) override;29 void generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & iBuilder, llvm::Value * const numOfStrides) override; 30 30 void generateFinalizeMethod(const std::unique_ptr<KernelBuilder> & b) override; 31 31 private: -
icGREP/icgrep-devel/icgrep/kernels/until_n.cpp
r5830 r5831 17 17 namespace kernel { 18 18 19 const unsigned packSize = 64; 20 21 llvm::Value * UntilNkernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & b, llvm::Value * const numOfStrides) { 19 void UntilNkernel::generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & b, llvm::Value * const numOfStrides) { 22 20 23 21 /* … … 39 37 */ 40 38 39 const unsigned packSize = b->getSizeTy()->getBitWidth(); 41 40 Constant * const ZERO = b->getSize(0); 42 41 Constant * const ONE = b->getSize(1); … … 101 100 //Value * const packPtr = b->CreateGEP(b->CreatePointerCast(groupPtr, packPtrTy), packOffset); 102 101 //Value * const packBits = b->CreateLoad(packPtr); 103 Value * const packCount = b->Create Popcount(packBits);102 Value * const packCount = b->CreateZExtOrTrunc(b->CreatePopcount(packBits), b->getSizeTy()); 104 103 Value * const observedUpTo = b->CreateNUWAdd(observed, packCount); 105 104 … … 173 172 b->setProducedItemCount("uptoN", producedCount); 174 173 175 return numOfStrides; 174 } 175 176 unsigned LLVM_READNONE calculateRate(const std::unique_ptr<kernel::KernelBuilder> & b) { 177 const unsigned packSize = b->getSizeTy()->getBitWidth(); 178 return (packSize * packSize) / b->getBitBlockWidth(); 176 179 } 177 180 178 181 UntilNkernel::UntilNkernel(const std::unique_ptr<kernel::KernelBuilder> & b) 179 : MultiBlockKernel("UntilN ",182 : MultiBlockKernel("UntilN_" + std::to_string(calculateRate(b)), 180 183 // inputs 181 {Binding{b->getStreamSetTy(), "bits", FixedRate( (packSize * packSize) / b->getBitBlockWidth())}},184 {Binding{b->getStreamSetTy(), "bits", FixedRate(calculateRate(b))}}, 182 185 // outputs 183 {Binding{b->getStreamSetTy(), "uptoN", BoundedRate(0, (packSize * packSize) / b->getBitBlockWidth())}},186 {Binding{b->getStreamSetTy(), "uptoN", BoundedRate(0, calculateRate(b))}}, 184 187 // input scalar 185 188 {Binding{b->getSizeTy(), "N"}}, {}, -
icGREP/icgrep-devel/icgrep/kernels/until_n.h
r5830 r5831 14 14 UntilNkernel(const std::unique_ptr<kernel::KernelBuilder> & iBuilder); 15 15 private: 16 llvm::Value *generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & b, llvm::Value * const numOfStrides) final;16 void generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & b, llvm::Value * const numOfStrides) final; 17 17 18 18 }; -
icGREP/icgrep-devel/icgrep/pablo/pablo_compiler.cpp
r5828 r5831 42 42 using TypeId = PabloAST::ClassTypeId; 43 43 44 inline static unsigned getAlignment(const Value * const type) { 45 return type->getType()->getPrimitiveSizeInBits() / 8; 44 inline static unsigned getAlignment(const Type * const type) { 45 return type->getPrimitiveSizeInBits() / 8; 46 } 47 48 inline static unsigned getAlignment(const Value * const expr) { 49 return getAlignment(expr->getType()); 46 50 } 47 51 48 52 inline static unsigned getPointerElementAlignment(const Value * const ptr) { 49 return ptr->getType()->getPointerElementType()->getPrimitiveSizeInBits() / 8;53 return getAlignment(ptr->getType()->getPointerElementType()); 50 54 } 51 55 … … 672 676 lhvStreamIndex = compileExpression(b, cast<Extract>(lh)->getIndex()); 673 677 } else { 674 baseLhv = compileExpression(b, lh , false);678 baseLhv = compileExpression(b, lh); 675 679 } 676 680 … … 682 686 rhvStreamIndex = compileExpression(b, cast<Extract>(rh)->getIndex()); 683 687 } else { 684 baseRhv = compileExpression(b, rh , false);688 baseRhv = compileExpression(b, rh); 685 689 } 686 690 … … 688 692 689 693 if (LLVM_UNLIKELY(typeId == TypeId::Add || typeId == TypeId::Subtract)) { 690 691 692 694 693 695 value = b->CreateAlloca(vTy, b->getInt32(n)); … … 700 702 } else { 701 703 lhv = getPointerToVar(b, cast<Var>(lh), lhvStreamIndex, index); 702 lhv = b->Create AlignedLoad(lhv, getAlignment(lhv));704 lhv = b->CreateBlockAlignedLoad(lhv); 703 705 } 704 706 lhv = b->CreateBitCast(lhv, vTy); … … 709 711 } else { 710 712 rhv = getPointerToVar(b, cast<Var>(rh), rhvStreamIndex, index); 711 rhv = b->Create AlignedLoad(rhv, getAlignment(rhv));713 rhv = b->CreateBlockAlignedLoad(rhv); 712 714 } 713 715 rhv = b->CreateBitCast(rhv, vTy); … … 716 718 if (typeId == TypeId::Add) { 717 719 result = b->CreateAdd(lhv, rhv); 718 } else { 720 } else { // if (typeId == TypeId::Subtract) { 719 721 result = b->CreateSub(lhv, rhv); 720 722 } 721 723 b->CreateAlignedStore(result, b->CreateGEP(value, {b->getInt32(0), 
b->getInt32(i)}), getAlignment(result)); 722 724 } 723 724 725 725 726 726 } else { … … 735 735 } else { 736 736 lhv = getPointerToVar(b, cast<Var>(lh), lhvStreamIndex, index); 737 lhv = b->Create AlignedLoad(lhv, getAlignment(lhv));737 lhv = b->CreateBlockAlignedLoad(lhv); 738 738 } 739 739 lhv = b->CreateBitCast(lhv, vTy); … … 744 744 } else { 745 745 rhv = getPointerToVar(b, cast<Var>(rh), rhvStreamIndex, index); 746 rhv = b->Create AlignedLoad(rhv, getAlignment(rhv));746 rhv = b->CreateBlockAlignedLoad(rhv); 747 747 } 748 748 rhv = b->CreateBitCast(rhv, vTy); … … 764 764 default: llvm_unreachable("invalid vector operator id"); 765 765 } 766 Value * const mask = b->Create BitCast(b->hsimd_signmask(n, comp), fw);766 Value * const mask = b->CreateZExtOrTrunc(b->hsimd_signmask(n, comp), fw); 767 767 value = b->mvmd_insert(m, value, mask, i); 768 768 }
Note: See TracChangeset
for help on using the changeset viewer.