source: icGREP/icgrep-devel/icgrep/kernels/kernel.h @ 5856

Last change on this file since 5856 was 5856, checked in by nmedfort, 15 months ago

Revised pipeline structure to better control I/O rates

File size: 21.9 KB
Line 
1/*
2 *  Copyright (c) 2016 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 */
5
6#ifndef KERNEL_H
7#define KERNEL_H
8
9#include "interface.h"
10#include <boost/container/flat_map.hpp>
11
12namespace llvm { class BasicBlock; }
13namespace llvm { class Constant; }
14namespace llvm { class Function; }
15namespace llvm { class IntegerType; }
16namespace llvm { class IndirectBrInst; }
17namespace llvm { class PHINode; }
18namespace llvm { class LoadInst; }
19namespace llvm { class Type; }
20namespace llvm { class Value; }
21namespace parabix { class StreamSetBuffer; }
22
23namespace kernel {
24   
25class KernelBuilder;
26
27class Kernel : public KernelInterface {
28    friend class KernelBuilder;
29protected:
30
31    static const std::string DO_BLOCK_SUFFIX;
32    static const std::string FINAL_BLOCK_SUFFIX;
33    static const std::string MULTI_BLOCK_SUFFIX;
34    static const std::string LOGICAL_SEGMENT_NO_SCALAR;
35    static const std::string PROCESSED_ITEM_COUNT_SUFFIX;
36    static const std::string CONSUMED_ITEM_COUNT_SUFFIX;
37    static const std::string PRODUCED_ITEM_COUNT_SUFFIX;
38    static const std::string TERMINATION_SIGNAL;
39    static const std::string BUFFER_PTR_SUFFIX;
40    static const std::string CONSUMER_SUFFIX;
41    static const std::string CYCLECOUNT_SCALAR;
42
43public:
44   
45    enum class Port { Input, Output };
46    using StreamPort = std::pair<Port, unsigned>;
47    using StreamMap = boost::container::flat_map<std::string, StreamPort>;
48    using KernelFieldMap = boost::container::flat_map<std::string, unsigned>;
49    using StreamSetBuffers = std::vector<parabix::StreamSetBuffer *>;
50
51    // Kernel Signatures and Module IDs
52    //
53    // A kernel signature uniquely identifies a kernel and its full functionality.
54    // In the event that a particular kernel instance is to be generated and compiled
55    // to produce object code, and we have a cached kernel object code instance with
56    // the same signature and targetting the same IDISA architecture, then the cached
57    // object code may safely be used to avoid recompilation.
58    //
59    // A kernel signature is a byte string of arbitrary length.
60    //
61    // Kernel developers should take responsibility for designing appropriate signature
62    // mechanisms that are short, inexpensive to compute and guarantee uniqueness
63    // based on the semantics of the kernel.
64    //
65    // If no other mechanism is available, the default makeSignature() method uses the
66    // full LLVM IR (before optimization) of the kernel instance.
67    //
68    // A kernel Module ID is short string that is used as a name for a particular kernel
69    // instance.  Kernel Module IDs are used to look up and retrieve cached kernel
70    // instances and so should be highly likely to uniquely identify a kernel instance.
71    //
72    // The ideal case is that a kernel Module ID serves as a full kernel signature thus
73    // guaranteeing uniqueness.  In this case, hasSignature() should return false.
74    //
75
76    //
77    // Kernel builder subtypes define their logic of kernel construction
78    // in terms of 3 virtual methods for
79    // (a) preparing the Kernel state data structure
80    // (c) defining the logic of the finalBlock function.
81    //
82    // Note: the kernel state data structure must only be finalized after
83    // all scalar fields have been added.   If there are no fields to
84    // be added, the default method for preparing kernel state may be used.
85
86
87    std::string makeSignature(const std::unique_ptr<KernelBuilder> & idb) override;
88
89    // Can the module ID itself serve as the unique signature?
90    virtual bool hasSignature() const { return true; }
91
92    bool isCachable() const override { return false; }
93
94    void bindPorts(const StreamSetBuffers & inputs, const StreamSetBuffers & outputs);
95
96    llvm::Module * setModule(llvm::Module * const module);
97
98    llvm::Module * makeModule(const std::unique_ptr<kernel::KernelBuilder> & idb);
99
100    llvm::Module * getModule() const {
101        return mModule;
102    }
103
104    void generateKernel(const std::unique_ptr<kernel::KernelBuilder> & idb);
105   
106    llvm::Value * createInstance(const std::unique_ptr<kernel::KernelBuilder> & idb) final;
107
108    void initializeInstance(const std::unique_ptr<KernelBuilder> & idb) final;
109
110    void finalizeInstance(const std::unique_ptr<kernel::KernelBuilder> & idb) final;
111
112    StreamPort getStreamPort(const std::string & name) const;
113
114    const Binding & getBinding(const std::string & name) const;
115
116    ProcessingRate::RateValue getLowerBound(const ProcessingRate &rate) const;
117
118    ProcessingRate::RateValue getUpperBound(const ProcessingRate & rate) const;
119
120    const StreamSetBuffers & getStreamSetInputBuffers() const {
121        return mStreamSetInputBuffers;
122    }
123
124    const parabix::StreamSetBuffer * getStreamSetInputBuffer(const unsigned i) const {
125        assert (i < mStreamSetInputBuffers.size());
126        assert (mStreamSetInputBuffers[i]);
127        return mStreamSetInputBuffers[i];
128    }
129
130    const parabix::StreamSetBuffer * getInputStreamSetBuffer(const std::string & name) const {
131        const auto port = getStreamPort(name);
132        assert (port.first == Port::Input);
133        return getStreamSetInputBuffer(port.second);
134    }
135
136    const StreamSetBuffers & getStreamSetOutputBuffers() const {
137        return mStreamSetOutputBuffers;
138    }
139
140    const Binding & getStreamInput(const unsigned i) const {
141        return KernelInterface::getStreamInput(i);
142    }
143
144    const Binding & getStreamInput(const std::string & name) const {
145        const auto port = getStreamPort(name);
146        assert (port.first == Port::Input);
147        return KernelInterface::getStreamInput(port.second);
148    }
149
150    const Binding & getStreamInput(const parabix::StreamSetBuffer * const buffer) const {
151        for (unsigned i = 0; i < mStreamSetInputBuffers.size(); ++i) {
152            if (mStreamSetInputBuffers[i] == buffer) {
153                return getStreamInput(i);
154            }
155        }
156        throw std::runtime_error("no output binding found given buffer");
157    }
158
159    const parabix::StreamSetBuffer * getStreamSetOutputBuffer(const unsigned i) const {
160        assert (i < mStreamSetOutputBuffers.size());
161        assert (mStreamSetOutputBuffers[i]);
162        return mStreamSetOutputBuffers[i];
163    }
164
165    const parabix::StreamSetBuffer * getOutputStreamSetBuffer(const std::string & name) const {
166        const auto port = getStreamPort(name);
167        assert (port.first == Port::Output);
168        return getStreamSetOutputBuffer(port.second);
169    }
170
171    const Binding & getStreamOutput(const unsigned i) const {
172        return KernelInterface::getStreamOutput(i);
173    }
174
175    const Binding & getStreamOutput(const parabix::StreamSetBuffer * const buffer) const {
176        for (unsigned i = 0; i < mStreamSetOutputBuffers.size(); ++i) {
177            if (mStreamSetOutputBuffers[i] == buffer) {
178                return getStreamOutput(i);
179            }
180        }
181        throw std::runtime_error("no output binding found given buffer");
182    }
183
184    const Binding & getStreamOutput(const std::string & name) const {
185        const auto port = getStreamPort(name);
186        assert (port.first == Port::Output);
187        return KernelInterface::getStreamOutput(port.second);
188    }
189   
190    // Kernels typically perform block-at-a-time processing, but some kernels may require
191    // a different stride.   In the case of multiblock kernels, the stride attribute
192    // determines the number of minimum number of items that will be provided to the kernel
193    // on each doMultiBlock call.
194    //
195   
196    unsigned getStride() const { return mStride; }
197   
198    virtual ~Kernel() = 0;
199
200    void prepareKernel(const std::unique_ptr<KernelBuilder> & idb);
201
202    void prepareCachedKernel(const std::unique_ptr<KernelBuilder> & idb);
203
204    std::string getCacheName(const std::unique_ptr<KernelBuilder> & idb) const;
205
206    bool canTerminateEarly() const { return hasAttribute(Attribute::KindId::CanTerminateEarly); }
207
208protected:
209
210    virtual void addInternalKernelProperties(const std::unique_ptr<KernelBuilder> & idb) { }
211
212    void getDoSegmentFunctionArguments(const std::vector<llvm::Value *> & availItems) const;
213
214    // Constructor
215    Kernel(std::string && kernelName, Bindings && stream_inputs,
216          Bindings && stream_outputs,
217          Bindings && scalar_parameters,
218          Bindings && scalar_outputs,
219          Bindings && internal_scalars);
220
221    llvm::Value * getPrincipalItemCount() const {
222        return mAvailablePrincipalItemCount;
223    }
224
225    unsigned getScalarIndex(const std::string & name) const;
226
227    void prepareStreamSetNameMap();
228   
229    void linkExternalMethods(const std::unique_ptr<kernel::KernelBuilder> &) override { }
230
231    virtual void generateInitializeMethod(const std::unique_ptr<kernel::KernelBuilder> & iBuilder) { }
232   
233    virtual void generateKernelMethod(const std::unique_ptr<KernelBuilder> & iBuilder) = 0;
234
235    virtual void generateFinalizeMethod(const std::unique_ptr<KernelBuilder> & iBuilder) { }
236
237    // Add an additional scalar field to the KernelState struct.
238    // Must occur before any call to addKernelDeclarations or createKernelModule.
239    unsigned addScalar(llvm::Type * type, const std::string & name);
240
241    unsigned addUnnamedScalar(llvm::Type * type);
242
243    void callGenerateInitializeMethod(const std::unique_ptr<KernelBuilder> & idb);
244
245    void callGenerateDoSegmentMethod(const std::unique_ptr<KernelBuilder> & idb);
246
247    void callGenerateFinalizeMethod(const std::unique_ptr<KernelBuilder> & idb);
248
249    const parabix::StreamSetBuffer * getAnyStreamSetBuffer(const std::string & name) const {
250        unsigned index; Port port;
251        std::tie(port, index) = getStreamPort(name);
252        if (port == Port::Input) {
253            assert (index < mStreamSetInputBuffers.size());
254            assert (mStreamSetInputBuffers[index]);
255            return mStreamSetInputBuffers[index];
256        } else {
257            assert (index < mStreamSetOutputBuffers.size());
258            assert (mStreamSetOutputBuffers[index]);
259            return mStreamSetOutputBuffers[index];
260        }
261    }
262
263    void setStride(unsigned stride) { mStride = stride; }
264
265private:
266
267    void addBaseKernelProperties(const std::unique_ptr<KernelBuilder> & idb);
268
269    llvm::Value * getStreamSetInputAddress(const std::string & name) const {
270        const Kernel::StreamPort p = getStreamPort(name);
271        assert (p.first == Port::Input);
272        return mStreamSetInputBaseAddress[p.second];
273    }
274
275    llvm::Value * getStreamSetOutputAddress(const std::string & name) const {
276        const Kernel::StreamPort p = getStreamPort(name);
277        assert (p.first == Port::Output);
278        return mStreamSetOutputBaseAddress[p.second];
279    }
280
281    llvm::Value * getAvailableItemCount(const unsigned i) const {
282        return mAvailableItemCount[i];
283    }
284
285    void normalizeStreamProcessingRates();
286
287    bool normalizeRelativeToFixedProcessingRate(const ProcessingRate & base, ProcessingRate & toUpdate);
288
289protected:
290
291    llvm::Function *                    mCurrentMethod;
292    llvm::Value *                       mAvailablePrincipalItemCount;   
293    unsigned                            mStride;
294    llvm::Value *                       mIsFinal;
295    llvm::Value *                       mOutputScalarResult;
296    mutable bool                        mIsGenerated;
297
298    std::vector<llvm::Value *>          mAvailableItemCount;
299
300    KernelFieldMap                      mKernelFieldMap;
301    std::vector<llvm::Type *>           mKernelFields;
302
303    StreamMap                           mStreamMap;
304
305    // TODO: once the kernel no longer needs to be aware of what type of buffers its working with,
306    // these should be removed from the Kernel class and put into the Pipeline
307    StreamSetBuffers                    mStreamSetInputBuffers;
308    std::vector<llvm::Value *>          mStreamSetInputBaseAddress;
309    StreamSetBuffers                    mStreamSetOutputBuffers;
310    std::vector<llvm::Value *>          mStreamSetOutputBaseAddress;
311};
312
313using Kernels = std::vector<Kernel *>;
314
315class SegmentOrientedKernel : public Kernel {
316protected:
317
318    SegmentOrientedKernel(std::string && kernelName,
319                          Bindings && stream_inputs,
320                          Bindings && stream_outputs,
321                          Bindings && scalar_parameters,
322                          Bindings && scalar_outputs,
323                          Bindings && internal_scalars);
324protected:
325
326    void generateKernelMethod(const std::unique_ptr<KernelBuilder> & b) final;
327
328    virtual void generateDoSegmentMethod(const std::unique_ptr<KernelBuilder> & b) = 0;
329
330};
331
332/*
333The Multi-Block Kernel Builder
334------------------------------
335
336The Multi-Block Kernel Builder is designed to simplify the programming of
337efficient kernels with possibly variable and/or nonaligned output, subject to
338exact or MaxRatio processing constraints.   The following restrictions apply.
339
340#.  The input consists of one or more stream sets, the first of which is
341    known as the principal input stream set.
342
343#.  If there is more than one input stream set, the additional stream sets
344    are first classified as having either a derived processing rate or
345    a variable processing rate.   Stream sets with a derived processing rate
346    have a processing rate defined with respect to the input stream set of one
347    of the following types:  FixedRate, Add1 or RoundUp.    Note that stream sets
348    declared without a processing rate attribute have the FixedRate(1) attribute
349    by default and therefore satisfy this constraint.  All other processing rate
350    types are classified as variable rate.
351
352#.  All output stream sets must be declared with processing rate attributes
353    of one of the following types:
354    *  FixedRate, Add1, Roundup, or MaxRatio with respect to the principal input stream set.
355    *  FixedRate with respect to some other output stream set.
356
357    When using the Multi-Block Kernel Builder to program a new type of kernel,
358    the programmer must implement the generateDoMultiBlockMethod for normal
359    multi-block processing according to the requirements below, as well as
360    providing for special final block processing, if necessary.
361
362#.  The doMultiBlockMethod will be called with the following parameters:
363    * the number of items of the principal input stream to process (itemsToDo),
364    * additional items available parameters for each additional input stream set
365      that is classified as a variable rate stream set
366    * pointers to linear contiguous buffer areas for each of the input stream sets, and
367    * pointers to linear contiguous output buffer areas for each of the output stream sets.
368
369    Notes:
370    * if the kernel has a Lookahead dependency declared on any input stream set, then
371      there will be two buffer pointers for that stream set, one for accessing stream set
372      items without lookahead and one for accessing the items with lookahead.   
373    * pointers are to the beginning of the block corresponding to the
374      processedItemCount or producedItemCount of the given stream set.
375    * the base type of each pointer is the StreamSetBlockType of that streamset
376
377#.  The Multi-Block Kernel Builder will arrange that these input parameters may be
378    processed under the following simplifying assumptions.
379    * the number of itemsToDo will either be an exact multiple of the kernel stride,
380      or, for processing the final block, a value less than the kernel stride
381    * the input buffer of the principal stream set and all input buffers of stream sets
382      with derived processing rates will be safe to access and have data available in
383      accord with their processing rates based on the given number of itemsToDo
384      of the principal input stream set; no further bounds checking is needed. 
385    * input buffers of stream sets with MaxRatio attributes will be safe to access,
386      but will only have valid data as specified by the available items parameter for
387      that stream set.
388    * the kernel programmer is responsible for safe access and bounds checking for any
389      input stream set classified as Unknown rate.   No temporary buffers are used
390      for such stream sets.
391    * all output buffers will be safe to access and have space available
392      for the given maximum output generation rates based on the given number
393      of itemsToDo of the principal input stream set; no further bounds checking
394      is needed.
395    * for final block processing, all input buffers will be extended to be safely
396      treated as containing data corresponding to a full block of the principal
397      input stream set, with the actual data in each buffer padded with null values
398      beyond the end of input.  Similarly, all output buffers will contain space
399      sufficient for the maximum output that can be generated for a full block of
400      input processing.
401    * input and output pointers will be typed to allow convenient and logical access
402      to corresponding streams based on their declared stream set type and processing rate.
403    * for any input pointer p, a GEP instruction with a single int32 index i
404      will produce a pointer to the buffer position corresponding to the ith block of the
405      input stream set, relative to the initial block based on the processedItemCount.
406    * for any output stream set declared with a Fixed or Add1 processing rate with respect
407      to the principal input stream set, a GEP instruction with a single int32 index i
408      will produce a pointer to the buffer position corresponding to the ith block of the
409      stream set, relative to the initial block based on the producedItemCount.
410
411#.  Upon completion of multi-block processing, the Multi-Block Kernel Builder will arrange that
412    processed and produced item counts are updated for all stream sets that have exact
413    processing rate attributes.   Programmers are responsible for updating the counts
414    of any stream set declared with a variable attribute (MaxRatio or Unknown).
415
416#.  An important caveat is that buffer areas may change arbitrarily between
417    calls to the doMultiBlockMethod.   In no case should a kernel store a
418    buffer pointer in its internal state.   Furthermore a kernel must not make
419    any assumptions about the accessibility of stream set data outside of the
420    processing range outside of the block boundaries associated with the given itemsToDo.
421*/
422
423class MultiBlockKernel : public Kernel {
424protected:
425
426    MultiBlockKernel(std::string && kernelName,
427                     Bindings && stream_inputs,
428                     Bindings && stream_outputs,
429                     Bindings && scalar_parameters,
430                     Bindings && scalar_outputs,
431                     Bindings && internal_scalars);
432
433    // Each multi-block kernel subtype must provide its own logic for handling
434    // doMultiBlock calls, subject to the requirements laid out above.
435    // The generateMultiBlockLogic must be written to generate this logic, given
436    // a created but empty function.  Upon entry to generateMultiBlockLogic,
437    // the builder insertion point will be set to the entry block; upone
438    // exit the RetVoid instruction will be added to complete the method.
439    //
440    virtual void generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & b, llvm::Value * const numOfStrides) = 0;
441
442private:
443
444    // Given a kernel subtype with an appropriate interface, the generateDoSegment
445    // method of the multi-block kernel builder makes all the necessary arrangements
446    // to translate doSegment calls into a minimal sequence of doMultiBlock calls.
447    void generateKernelMethod(const std::unique_ptr<KernelBuilder> & b) final;
448
449    unsigned getItemAlignment(const Binding & binding) const;
450
451    unsigned getCopyAlignment(const Binding & binding) const;
452
453    bool isTransitivelyUnknownRate(const ProcessingRate & rate) const;
454
455    bool requiresTemporaryInputBuffer(const Binding & binding, const ProcessingRate & rate) const;
456
457    bool requiresTemporaryOutputBuffer(const Binding & binding, const ProcessingRate & rate) const;
458
459    llvm::Value * getStrideSize(const std::unique_ptr<KernelBuilder> & b, const ProcessingRate & rate);
460
461    bool requiresCopyBack(const ProcessingRate & rate) const;
462
463    void reviseFinalProducedItemCounts(const std::unique_ptr<KernelBuilder> & b);
464
465protected:
466
467    std::vector<llvm::Value *>      mInitialAvailableItemCount;
468    std::vector<llvm::Value *>      mInitialProcessedItemCount;
469    std::vector<llvm::Value *>      mInitialProducedItemCount;
470
471};
472
473
474class BlockOrientedKernel : public MultiBlockKernel {
475protected:
476
477    void CreateDoBlockMethodCall(const std::unique_ptr<KernelBuilder> & b);
478
479    // Each kernel builder subtype must provide its own logic for generating
480    // doBlock calls.
481    virtual void generateDoBlockMethod(const std::unique_ptr<KernelBuilder> & b) = 0;
482
483    // Each kernel builder subtypre must also specify the logic for processing the
484    // final block of stream data, if there is any special processing required
485    // beyond simply calling the doBlock function.   In the case that the final block
486    // processing may be trivially implemented by dispatching to the doBlock method
487    // without additional preparation, the default generateFinalBlockMethod need
488    // not be overridden.
489
490    virtual void generateFinalBlockMethod(const std::unique_ptr<KernelBuilder> & b, llvm::Value * remainingItems);
491
492    BlockOrientedKernel(std::string && kernelName,
493                        Bindings && stream_inputs,
494                        Bindings && stream_outputs,
495                        Bindings && scalar_parameters,
496                        Bindings && scalar_outputs,
497                        Bindings && internal_scalars);
498
499private:
500
501    void generateMultiBlockLogic(const std::unique_ptr<KernelBuilder> & b, llvm::Value * const numOfStrides) final;
502
503    void writeDoBlockMethod(const std::unique_ptr<KernelBuilder> & b);
504
505    void writeFinalBlockMethod(const std::unique_ptr<KernelBuilder> & b, llvm::Value * remainingItems);
506
507    llvm::Value * getRemainingItems(const std::unique_ptr<KernelBuilder> & b);
508
509private:
510
511    llvm::Function *            mDoBlockMethod;
512    llvm::BasicBlock *          mStrideLoopBody;
513    llvm::IndirectBrInst *      mStrideLoopBranch;
514    llvm::PHINode *             mStrideLoopTarget;
515    llvm::PHINode *             mStrideBlockIndex;
516};
517
518}
519
520#endif
Note: See TracBrowser for help on using the repository browser.