source: icGREP/icgrep-devel/icgrep/kernels/kernel.h @ 5902

Last change on this file since 5902 was 5883, checked in by nmedfort, 17 months ago

MustExplicitlyTerminate? attribute for Xiangyu

File size: 21.8 KB
Line 
1/*
2 *  Copyright (c) 2016 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 */
5
6#ifndef KERNEL_H
7#define KERNEL_H
8
9#include "interface.h"
10#include <boost/container/flat_map.hpp>
11
12namespace llvm { class BasicBlock; }
13namespace llvm { class Constant; }
14namespace llvm { class Function; }
15namespace llvm { class IntegerType; }
16namespace llvm { class IndirectBrInst; }
17namespace llvm { class PHINode; }
18namespace llvm { class LoadInst; }
19namespace llvm { class Type; }
20namespace llvm { class Value; }
21namespace parabix { class StreamSetBuffer; }
22
23namespace kernel {
24   
25class KernelBuilder;
26
27class Kernel : public KernelInterface {
28    friend class KernelBuilder;
29protected:
30
31    static const std::string DO_BLOCK_SUFFIX;
32    static const std::string FINAL_BLOCK_SUFFIX;
33    static const std::string MULTI_BLOCK_SUFFIX;
34    static const std::string LOGICAL_SEGMENT_NO_SCALAR;
35    static const std::string PROCESSED_ITEM_COUNT_SUFFIX;
36    static const std::string CONSUMED_ITEM_COUNT_SUFFIX;
37    static const std::string PRODUCED_ITEM_COUNT_SUFFIX;
38    static const std::string TERMINATION_SIGNAL;
39    static const std::string BUFFER_PTR_SUFFIX;
40    static const std::string CONSUMER_SUFFIX;
41    static const std::string CYCLECOUNT_SCALAR;
42
43public:
44   
45    enum class Port { Input, Output };
46    using StreamPort = std::pair<Port, unsigned>;
47    using StreamMap = boost::container::flat_map<std::string, StreamPort>;
48    using KernelFieldMap = boost::container::flat_map<std::string, unsigned>;
49    using StreamSetBuffers = std::vector<parabix::StreamSetBuffer *>;
50
51    // Kernel Signatures and Module IDs
52    //
53    // A kernel signature uniquely identifies a kernel and its full functionality.
54    // In the event that a particular kernel instance is to be generated and compiled
55    // to produce object code, and we have a cached kernel object code instance with
56    // the same signature and targetting the same IDISA architecture, then the cached
57    // object code may safely be used to avoid recompilation.
58    //
59    // A kernel signature is a byte string of arbitrary length.
60    //
61    // Kernel developers should take responsibility for designing appropriate signature
62    // mechanisms that are short, inexpensive to compute and guarantee uniqueness
63    // based on the semantics of the kernel.
64    //
65    // If no other mechanism is available, the default makeSignature() method uses the
66    // full LLVM IR (before optimization) of the kernel instance.
67    //
68    // A kernel Module ID is short string that is used as a name for a particular kernel
69    // instance.  Kernel Module IDs are used to look up and retrieve cached kernel
70    // instances and so should be highly likely to uniquely identify a kernel instance.
71    //
72    // The ideal case is that a kernel Module ID serves as a full kernel signature thus
73    // guaranteeing uniqueness.  In this case, hasSignature() should return false.
74    //
75
76    //
77    // Kernel builder subtypes define their logic of kernel construction
78    // in terms of 3 virtual methods for
79    // (a) preparing the Kernel state data structure
80    // (c) defining the logic of the finalBlock function.
81    //
82    // Note: the kernel state data structure must only be finalized after
83    // all scalar fields have been added.   If there are no fields to
84    // be added, the default method for preparing kernel state may be used.
85
86
87    std::string makeSignature(const std::unique_ptr<KernelBuilder> & idb) override;
88
89    // Can the module ID itself serve as the unique signature?
90    virtual bool hasSignature() const { return true; }
91
92    bool isCachable() const override { return false; }
93
94    void bindPorts(const StreamSetBuffers & inputs, const StreamSetBuffers & outputs);
95
96    llvm::Module * setModule(llvm::Module * const module);
97
98    llvm::Module * makeModule(const std::unique_ptr<kernel::KernelBuilder> & idb);
99
100    llvm::Module * getModule() const {
101        return mModule;
102    }
103
104    void generateKernel(const std::unique_ptr<kernel::KernelBuilder> & idb);
105   
106    llvm::Value * createInstance(const std::unique_ptr<kernel::KernelBuilder> & idb) final;
107
108    void initializeInstance(const std::unique_ptr<KernelBuilder> & idb) final;
109
110    void finalizeInstance(const std::unique_ptr<kernel::KernelBuilder> & idb) final;
111
112    StreamPort getStreamPort(const std::string & name) const;
113
114    const Binding & getBinding(const std::string & name) const;
115
116    ProcessingRate::RateValue getLowerBound(const ProcessingRate &rate) const;
117
118    ProcessingRate::RateValue getUpperBound(const ProcessingRate & rate) const;
119
120    const StreamSetBuffers & getStreamSetInputBuffers() const {
121        return mStreamSetInputBuffers;
122    }
123
124    const parabix::StreamSetBuffer * getStreamSetInputBuffer(const unsigned i) const {
125        assert (i < mStreamSetInputBuffers.size());
126        assert (mStreamSetInputBuffers[i]);
127        return mStreamSetInputBuffers[i];
128    }
129
130    const parabix::StreamSetBuffer * getInputStreamSetBuffer(const std::string & name) const {
131        const auto port = getStreamPort(name);
132        assert (port.first == Port::Input);
133        return getStreamSetInputBuffer(port.second);
134    }
135
136    const StreamSetBuffers & getStreamSetOutputBuffers() const {
137        return mStreamSetOutputBuffers;
138    }
139
140    const Binding & getStreamInput(const unsigned i) const {
141        return KernelInterface::getStreamInput(i);
142    }
143
144    const Binding & getStreamInput(const std::string & name) const {
145        const auto port = getStreamPort(name);
146        assert (port.first == Port::Input);
147        return KernelInterface::getStreamInput(port.second);
148    }
149
150    const Binding & getStreamInput(const parabix::StreamSetBuffer * const buffer) const {
151        for (unsigned i = 0; i < mStreamSetInputBuffers.size(); ++i) {
152            if (mStreamSetInputBuffers[i] == buffer) {
153                return getStreamInput(i);
154            }
155        }
156        throw std::runtime_error("no output binding found given buffer");
157    }
158
159    const parabix::StreamSetBuffer * getStreamSetOutputBuffer(const unsigned i) const {
160        assert (i < mStreamSetOutputBuffers.size());
161        assert (mStreamSetOutputBuffers[i]);
162        return mStreamSetOutputBuffers[i];
163    }
164
165    const parabix::StreamSetBuffer * getOutputStreamSetBuffer(const std::string & name) const {
166        const auto port = getStreamPort(name);
167        assert (port.first == Port::Output);
168        return getStreamSetOutputBuffer(port.second);
169    }
170
171    const Binding & getStreamOutput(const unsigned i) const {
172        return KernelInterface::getStreamOutput(i);
173    }
174
175    const Binding & getStreamOutput(const parabix::StreamSetBuffer * const buffer) const {
176        for (unsigned i = 0; i < mStreamSetOutputBuffers.size(); ++i) {
177            if (mStreamSetOutputBuffers[i] == buffer) {
178                return getStreamOutput(i);
179            }
180        }
181        throw std::runtime_error("no output binding found given buffer");
182    }
183
184    const Binding & getStreamOutput(const std::string & name) const {
185        const auto port = getStreamPort(name);
186        assert (port.first == Port::Output);
187        return KernelInterface::getStreamOutput(port.second);
188    }
189   
190    // Kernels typically perform block-at-a-time processing, but some kernels may require
191    // a different stride.   In the case of multiblock kernels, the stride attribute
192    // determines the number of minimum number of items that will be provided to the kernel
193    // on each doMultiBlock call.
194    //
195   
196    unsigned getStride() const { return mStride; }
197   
198    virtual ~Kernel() = 0;
199
200    void prepareKernel(const std::unique_ptr<KernelBuilder> & idb);
201
202    void prepareCachedKernel(const std::unique_ptr<KernelBuilder> & idb);
203
204    std::string getCacheName(const std::unique_ptr<KernelBuilder> & idb) const;
205
206protected:
207
208    virtual void addInternalKernelProperties(const std::unique_ptr<KernelBuilder> & idb) { }
209
210    void getDoSegmentFunctionArguments(const std::vector<llvm::Value *> & availItems) const;
211
212    // Constructor
213    Kernel(std::string && kernelName, Bindings && stream_inputs,
214          Bindings && stream_outputs,
215          Bindings && scalar_parameters,
216          Bindings && scalar_outputs,
217          Bindings && internal_scalars);
218
219    llvm::Value * getPrincipalItemCount() const {
220        return mAvailablePrincipalItemCount;
221    }
222
223    unsigned getScalarIndex(const std::string & name) const;
224
225    void prepareStreamSetNameMap();
226   
227    void linkExternalMethods(const std::unique_ptr<kernel::KernelBuilder> &) override { }
228
229    virtual void generateInitializeMethod(const std::unique_ptr<kernel::KernelBuilder> & iBuilder) { }
230   
231    virtual void generateKernelMethod(const std::unique_ptr<KernelBuilder> & iBuilder) = 0;
232
233    virtual void generateFinalizeMethod(const std::unique_ptr<KernelBuilder> & iBuilder) { }
234
235    // Add an additional scalar field to the KernelState struct.
236    // Must occur before any call to addKernelDeclarations or createKernelModule.
237    unsigned addScalar(llvm::Type * type, const std::string & name);
238
239    unsigned addUnnamedScalar(llvm::Type * type);
240
241    void callGenerateInitializeMethod(const std::unique_ptr<KernelBuilder> & idb);
242
243    void callGenerateDoSegmentMethod(const std::unique_ptr<KernelBuilder> & idb);
244
245    void callGenerateFinalizeMethod(const std::unique_ptr<KernelBuilder> & idb);
246
247    const parabix::StreamSetBuffer * getAnyStreamSetBuffer(const std::string & name) const {
248        unsigned index; Port port;
249        std::tie(port, index) = getStreamPort(name);
250        if (port == Port::Input) {
251            assert (index < mStreamSetInputBuffers.size());
252            assert (mStreamSetInputBuffers[index]);
253            return mStreamSetInputBuffers[index];
254        } else {
255            assert (index < mStreamSetOutputBuffers.size());
256            assert (mStreamSetOutputBuffers[index]);
257            return mStreamSetOutputBuffers[index];
258        }
259    }
260
261    void setStride(unsigned stride) { mStride = stride; }
262
263private:
264
265    void addBaseKernelProperties(const std::unique_ptr<KernelBuilder> & idb);
266
267    llvm::Value * getStreamSetInputAddress(const std::string & name) const {
268        const Kernel::StreamPort p = getStreamPort(name);
269        assert (p.first == Port::Input);
270        return mStreamSetInputBaseAddress[p.second];
271    }
272
273    llvm::Value * getStreamSetOutputAddress(const std::string & name) const {
274        const Kernel::StreamPort p = getStreamPort(name);
275        assert (p.first == Port::Output);
276        return mStreamSetOutputBaseAddress[p.second];
277    }
278
279    llvm::Value * getAvailableItemCount(const unsigned i) const {
280        return mAvailableItemCount[i];
281    }
282
283    void normalizeStreamProcessingRates();
284
285    bool normalizeRelativeToFixedProcessingRate(const ProcessingRate & base, ProcessingRate & toUpdate);
286
287protected:
288
289    llvm::Function *                    mCurrentMethod;
290    llvm::Value *                       mAvailablePrincipalItemCount;   
291    unsigned                            mStride;
292    llvm::Value *                       mIsFinal;
293    llvm::Value *                       mOutputScalarResult;
294    mutable bool                        mIsGenerated;
295
296    std::vector<llvm::Value *>          mAvailableItemCount;
297
298    KernelFieldMap                      mKernelFieldMap;
299    std::vector<llvm::Type *>           mKernelFields;
300
301    StreamMap                           mStreamMap;
302
303    // TODO: once the kernel no longer needs to be aware of what type of buffers its working with,
304    // these should be removed from the Kernel class and put into the Pipeline
305    StreamSetBuffers                    mStreamSetInputBuffers;
306    std::vector<llvm::Value *>          mStreamSetInputBaseAddress;
307    StreamSetBuffers                    mStreamSetOutputBuffers;
308    std::vector<llvm::Value *>          mStreamSetOutputBaseAddress;
309};
310
311using Kernels = std::vector<Kernel *>;
312
313class SegmentOrientedKernel : public Kernel {
314protected:
315
316    SegmentOrientedKernel(std::string && kernelName,
317                          Bindings && stream_inputs,
318                          Bindings && stream_outputs,
319                          Bindings && scalar_parameters,
320                          Bindings && scalar_outputs,
321                          Bindings && internal_scalars);
322protected:
323
324    void generateKernelMethod(const std::unique_ptr<KernelBuilder> & b) final;
325
326    virtual void generateDoSegmentMethod(const std::unique_ptr<KernelBuilder> & b) = 0;
327
328};
329
330/*
331The Multi-Block Kernel Builder
332------------------------------
333
334The Multi-Block Kernel Builder is designed to simplify the programming of
335efficient kernels with possibly variable and/or nonaligned output, subject to
336exact or MaxRatio processing constraints.   The following restrictions apply.
337
338#.  The input consists of one or more stream sets, the first of which is
339    known as the principal input stream set.
340
341#.  If there is more than one input stream set, the additional stream sets
342    are first classified as having either a derived processing rate or
343    a variable processing rate.   Stream sets with a derived processing rate
344    have a processing rate defined with respect to the input stream set of one
345    of the following types:  FixedRate, Add1 or RoundUp.    Note that stream sets
346    declared without a processing rate attribute have the FixedRate(1) attribute
347    by default and therefore satisfy this constraint.  All other processing rate
348    types are classified as variable rate.
349
350#.  All output stream sets must be declared with processing rate attributes
351    of one of the following types:
352    *  FixedRate, Add1, Roundup, or MaxRatio with respect to the principal input stream set.
353    *  FixedRate with respect to some other output stream set.
354
355    When using the Multi-Block Kernel Builder to program a new type of kernel,
356    the programmer must implement the generateDoMultiBlockMethod for normal
357    multi-block processing according to the requirements below, as well as
358    providing for special final block processing, if necessary.
359
360#.  The doMultiBlockMethod will be called with the following parameters:
361    * the number of items of the principal input stream to process (itemsToDo),
362    * additional items available parameters for each additional input stream set
363      that is classified as a variable rate stream set
364    * pointers to linear contiguous buffer areas for each of the input stream sets, and
365    * pointers to linear contiguous output buffer areas for each of the output stream sets.
366
367    Notes:
368    * if the kernel has a Lookahead dependency declared on any input stream set, then
369      there will be two buffer pointers for that stream set, one for accessing stream set
370      items without lookahead and one for accessing the items with lookahead.   
371    * pointers are to the beginning of the block corresponding to the
372      processedItemCount or producedItemCount of the given stream set.
373    * the base type of each pointer is the StreamSetBlockType of that streamset
374
375#.  The Multi-Block Kernel Builder will arrange that these input parameters may be
376    processed under the following simplifying assumptions.
377    * the number of itemsToDo will either be an exact multiple of the kernel stride,
378      or, for processing the final block, a value less than the kernel stride
379    * the input buffer of the principal stream set and all input buffers of stream sets
380      with derived processing rates will be safe to access and have data available in
381      accord with their processing rates based on the given number of itemsToDo
382      of the principal input stream set; no further bounds checking is needed. 
383    * input buffers of stream sets with MaxRatio attributes will be safe to access,
384      but will only have valid data as specified by the available items parameter for
385      that stream set.
386    * the kernel programmer is responsible for safe access and bounds checking for any
387      input stream set classified as Unknown rate.   No temporary buffers are used
388      for such stream sets.
389    * all output buffers will be safe to access and have space available
390      for the given maximum output generation rates based on the given number
391      of itemsToDo of the principal input stream set; no further bounds checking
392      is needed.
393    * for final block processing, all input buffers will be extended to be safely
394      treated as containing data corresponding to a full block of the principal
395      input stream set, with the actual data in each buffer padded with null values
396      beyond the end of input.  Similarly, all output buffers will contain space
397      sufficient for the maximum output that can be generated for a full block of
398      input processing.
399    * input and output pointers will be typed to allow convenient and logical access
400      to corresponding streams based on their declared stream set type and processing rate.
401    * for any input pointer p, a GEP instruction with a single int32 index i
402      will produce a pointer to the buffer position corresponding to the ith block of the
403      input stream set, relative to the initial block based on the processedItemCount.
404    * for any output stream set declared with a Fixed or Add1 processing rate with respect
405      to the principal input stream set, a GEP instruction with a single int32 index i
406      will produce a pointer to the buffer position corresponding to the ith block of the
407      stream set, relative to the initial block based on the producedItemCount.
408
409#.  Upon completion of multi-block processing, the Multi-Block Kernel Builder will arrange that
410    processed and produced item counts are updated for all stream sets that have exact
411    processing rate attributes.   Programmers are responsible for updating the counts
412    of any stream set declared with a variable attribute (MaxRatio or Unknown).
413
414#.  An important caveat is that buffer areas may change arbitrarily between
415    calls to the doMultiBlockMethod.   In no case should a kernel store a
416    buffer pointer in its internal state.   Furthermore a kernel must not make
417    any assumptions about the accessibility of stream set data outside of the
418    processing range outside of the block boundaries associated with the given itemsToDo.
419*/
420
421class MultiBlockKernel : public Kernel {
422protected:
423
424    MultiBlockKernel(std::string && kernelName,
425                     Bindings && stream_inputs,
426                     Bindings && stream_outputs,
427                     Bindings && scalar_parameters,
428                     Bindings && scalar_outputs,
429                     Bindings && internal_scalars);
430
431    // Each multi-block kernel subtype must provide its own logic for handling
432    // doMultiBlock calls, subject to the requirements laid out above.
433    // The generateMultiBlockLogic must be written to generate this logic, given
434    // a created but empty function.  Upon entry to generateMultiBlockLogic,
435    // the builder insertion point will be set to the entry block; upone
436    // exit the RetVoid instruction will be added to complete the method.
437    //
438    virtual void generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & b, llvm::Value * const numOfStrides) = 0;
439
440private:
441
442    // Given a kernel subtype with an appropriate interface, the generateDoSegment
443    // method of the multi-block kernel builder makes all the necessary arrangements
444    // to translate doSegment calls into a minimal sequence of doMultiBlock calls.
445    void generateKernelMethod(const std::unique_ptr<KernelBuilder> & b) final;
446
447    unsigned getItemAlignment(const Binding & binding) const;
448
449    unsigned getCopyAlignment(const Binding & binding) const;
450
451    bool isTransitivelyUnknownRate(const ProcessingRate & rate) const;
452
453    bool requiresTemporaryInputBuffer(const Binding & binding, const ProcessingRate & rate) const;
454
455    bool requiresTemporaryOutputBuffer(const Binding & binding, const ProcessingRate & rate) const;
456
457    llvm::Value * getStrideSize(const std::unique_ptr<KernelBuilder> & b, const ProcessingRate & rate);
458
459    bool requiresCopyBack(const ProcessingRate & rate) const;
460
461    void reviseFinalProducedItemCounts(const std::unique_ptr<KernelBuilder> & b);
462
463protected:
464
465    std::vector<llvm::Value *>      mInitialAvailableItemCount;
466    std::vector<llvm::Value *>      mInitialProcessedItemCount;
467    std::vector<llvm::Value *>      mInitialProducedItemCount;
468
469};
470
471
472class BlockOrientedKernel : public MultiBlockKernel {
473protected:
474
475    void CreateDoBlockMethodCall(const std::unique_ptr<KernelBuilder> & b);
476
477    // Each kernel builder subtype must provide its own logic for generating
478    // doBlock calls.
479    virtual void generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & b) = 0;
480
481    // Each kernel builder subtypre must also specify the logic for processing the
482    // final block of stream data, if there is any special processing required
483    // beyond simply calling the doBlock function.   In the case that the final block
484    // processing may be trivially implemented by dispatching to the doBlock method
485    // without additional preparation, the default generateFinalBlockMethod need
486    // not be overridden.
487
488    virtual void generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> & b, llvm::Value * remainingItems);
489
490    BlockOrientedKernel(std::string && kernelName,
491                        Bindings && stream_inputs,
492                        Bindings && stream_outputs,
493                        Bindings && scalar_parameters,
494                        Bindings && scalar_outputs,
495                        Bindings && internal_scalars);
496
497private:
498
499    void generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & b, llvm::Value * const numOfStrides) final;
500
501    void writeDoBlockMethod(const std::unique_ptr<KernelBuilder> & b);
502
503    void writeFinalBlockMethod(const std::unique_ptr<KernelBuilder> & b, llvm::Value * remainingItems);
504
505    llvm::Value * getRemainingItems(const std::unique_ptr<KernelBuilder> & b);
506
507private:
508
509    llvm::Function *            mDoBlockMethod;
510    llvm::BasicBlock *          mStrideLoopBody;
511    llvm::IndirectBrInst *      mStrideLoopBranch;
512    llvm::PHINode *             mStrideLoopTarget;
513    llvm::PHINode *             mStrideBlockIndex;
514};
515
516}
517
518#endif
Note: See TracBrowser for help on using the repository browser.