source: icGREP/icgrep-devel/icgrep/kernels/attributes.h @ 6297

Last change on this file since 6297 was 6272, checked in by nmedfort, 8 months ago

simplification of pop count logic

File size: 16.6 KB
Line 
1#ifndef ATTRIBUTES_H
2#define ATTRIBUTES_H
3
4#include <vector>
5#include <llvm/Support/Compiler.h>
6#include <assert.h>
7
8namespace llvm { class raw_ostream; }
9
10namespace kernel {
11
12struct Attribute {
13
14    friend struct AttributeSet;
15
16    enum class KindId {
17
18        /** INPUT STREAM ATTRIBUTES **/
19
20        LookAhead,
21
22        // A LookAhead(n) attribute on an input stream set S declares that the kernel
23        // looks ahead n positions in the input stream.  That is, processing of item
24        // S[i, j] may be defined in terms of S[i, j+n].
25
26        // Guarantee required: the pipeline compiler must ensure that, when
27        // the kernel is called with an available item count of N for the
28        // input stream, that the corresponding input data buffer has
29        // valid items in the buffer up to and including position N+n,
30        // and that the blocks containing the items from N-1 through N+n are
31        // linearly contiguous.  When the final doMultiBlock call is made,
32        // the pipeline compiler must ensure that the n items at lookahead
33        // positions are zero-initialized.
34
35        // (Note: this guarantee anticipates using a single buffer pointer
36        // for both normal stream access and lookahead access.   This avoids
37        // the cost of extra parameters per doMultiBlock call, but requires
38        // that the corresponding circular buffer have a "lookahead extension area"
39        // that holds a copy of the data at the physical start of buffer).
40
41        LookBehind, /// NOT DONE
42
43        // A LookBehind(n) attribute on an input stream S declares that the kernel
44        // requires access to input items up to n positions prior to the current
45        // processed item position.
46
47        // (Notes: this may allow more efficient advances within n (without saving state).
48        // However, a lookbehind extension area prior to the normal buffer base address
49        // is necessary, which must be initially zero-filled.)
50
51        // A LookBehind(n) attribute on an output stream S declares that the kernel
52        // requires access to up to n previously generated output items.
53        // (Example: lz4d lookbehind(65536)).
54
55        Principal,
56
57        // One input stream can be declared as the principal input buffer for a kernel.
58        // If a kernel has a principal input stream, when processing the final stride,
59        // a MultiBlockKernel assumes the item count of the principle is the correct
60        // one and zero extends / truncates all other input streams to match it.
61
62        Deferred,
63
64        // Normally, the processed item count of fixed rate streams is automatically
65        // updated by the MultiBlock kernel. However, some streams behave like Fixed
66        // rate streams (in that they will always eventually process a Fixed amount of
67        // data) but the kernel processes the data in unpredictable chunks. Rather than
68        // declaring those as Unknown or Bounded rates, marking their rate calculation
69        // as Deferred provides the pipeline with a stronger guarantee when it comes to
70        // buffer size calculations.
71
72        ZeroExtend, /// NOT DONE
73
74        // If the available item count of an input stream is less than some other input
75        // stream(s), the stream will be zero-extended to the length of the larger stream.
76        // If this option is not set and the kernel does not have a MustExplicitlyTerminate
77        // attribute, it will end once any input has been exhausted.
78
79        // NOTE: zero-extended streams are not considered by the pipeline when ascertaining
80        // whether it is entering the final segment. At least one input stream must not be
81        // zero-extended and a stream cannot have both Principal and ZeroExtend attributes.
82
83        IndependentRegionBegin, IndependentRegionEnd, /// NOT DONE
84
85        // Some kernels can divide their processing into concrete non-overlapping regions
86        // between a beginning and ending position. This is a hard guarantee that regardless
87        // of the computations between the start of the stream and the beginning of the first
88        // independent region or between the *beginning* of any two independent regions, A,
89        // B, the calculations that occur prior to the beginning of B do not affect the
90        // calculations after it --- even if A is started at an arbitrary position with a
91        // zeroed-out kernel state.
92
93        // If a kernel K is processed simultaneously by two threads, K_0 and K_1, and K_1 is
94        // waiting K_0 to finish and update it's kernel state for K_1 to resume at, K_1 can
95        // compute what its state will be and begin processing before K_0 is finished. This
96        // requires a the pipeline to intervene and call an optimized "output-less" instance
97        // of the kernel prior to calling B.
98
99        ConditionalRegionBegin, ConditionalRegionEnd, /// NOT DONE
100
101        // Some kernels have clearly demarcated regions in which a MultiBlock kernel will
102        // produce useful outputs for only the inputs within those regions. This attribute
103        // instructs the kernel to "zero-fill" the output of any non-selected regions,
104        // skipping strides entirely whenever possible.
105
106        // If the same regions are also independent, we can avoid the overhead of "masking
107        // out" the input streams. Otherwise a MultiBlock will use temporary buffers for all
108        // uses of the streams and zero out any non-regions from the data.
109
110        RequiresPopCountArray, RequiresNegatedPopCountArray,
111
112        // Marks whether a particular input stream requires a popcount or negated popcount
113        // array for its own internal processing.
114
115        /** OUTPUT STREAM ATTRIBUTES **/
116
117        Add,
118
119        // An Add(K) attribute states that K items will be added to this stream after
120        // processing the final block.
121
122        RoundUpTo,
123
124        // A RoundUpTo(k) attribute indicates the final item count of this stream will
125        // be rounded up to the nearest multiple of k
126
127        ManagedBuffer,
128
129        // Generally, kernels do not require knowledge about how many items are consumed
130        // from their produced streams or who is consuming them and instead rely on the
131        // pipeline to manage their output buffers for them. The major exception are source
132        // kernels since they produce data "autonomously" and may manage their memory
133        // internally. Thus this attribute instructs both the kernel compiler and pipeline
134        // that a particular output stream needs both the consumed item count and a pointer
135        // to each of its consumers logical segment number for its internal logic.
136
137        /** INPUT/OUTPUT STREAM ATTRIBUTES **/
138
139        Misaligned,
140
141        // Assume that we cannot statically compute the alignment of this stream set and
142        // perform any operations accordingly
143
144        BlockSize,
145
146        // Typically a kernel assumes that each stream of a stream set is a linear sequence
147        // of items. The BlockSize(K) attribute informs the kernel is actually divided into
148        // (BlockWidth / K) elements and each "stream" actually contains K items of the
149        // first stream followed by K elements of the second stream and so on. The notion
150        // of produced/processed item count changes to suite. I.e., when typical kernels
151        // report that they've processed/produced up to the i-th position, it means:
152
153        //                                         v
154        //                 ...|AAAAAAAA AAAAAAAA AA*              |...
155        //                 ...|BBBBBBBB BBBBBBBB BB*              |...
156        //                 ...|CCCCCCCC CCCCCCCC CC*              |...
157        //                 ...|DDDDDDDD DDDDDDDD DD*              |...
158
159        // However, if (BlockWidth / K) is 4, the same i-th position above is actually:
160
161        //                 ...|AAAAAAAA|BBBBBBBB|CCCCCCCC|DDDDDDDD|...
162        //                 ...|AAAAAAAA|BBBBBBBB|CCCCCCCC|DDDDDDDD|...
163        //                 ...|AA*     |BB*     |CC*     |DD*     |...
164        //                 ...|        |        |        |        |...
165
166        // (Note: this replaces the concept of swizzling and anticipates that the pipeline
167        // will take on the role of automatically inserting the swizzling code necessary).
168
169        ReverseRegionBegin, ReverseRegionEnd, /// NOT DONE
170
171        // Conceptually, reversing a stream S is simple: {S_1,...,S_n} -> {S_n,...,S_1}.
172        // However, this means all of the input data must be computed and stored prior to
173        // executing this kernel. In practice, this is unnecessary as in the context of
174        // text parsing, we're almost always searching for the true starting position of
175        // something ambigious after we've found its end position in some prior kernel.
176
177
178//        Here is a revised definition of SegmentedReverse:
179
180//        Given a stream of data bits S that is considered to be divided into
181//        segments, and a marker stream S having a one bit at the final position
182//        of each segment, the R = SegmentedReverse(S, M) when
183
184//        R_{i} = S_{l + (h - i)}
185//              where l = the maximum j such that j <= i and either j = 0 or M_{j-1} = 1
186//          and where h = the minimum j such that j >= i and either j = length(S) -  or M_j = 1
187//          (l and h are the low and high positions of the segment containing i)
188
189//        This is an invertible operation, so we can apply R to a kernel's input
190//        and then to its output to get a SegmentedReverse version of a kernel
191
192//        A kernel which computes segmented reverse is feasible, but seems complex
193//        to implement, and probably too slow.  I have played around with several
194//        ways of tackling it, no good method yet.
195
196//        If there are multiple segments within a block, we could instead use
197//        the following:
198
199//        BlockSegmentedReverse
200
201//        B_{i} = S_{L + (H - i)}
202//             where l = the maximum j such that j <= i and either j = 0 or M_{j-1} = 1
203//                   h = the minimum j such that j >= i and either j = length(S) -  or M_j = 1
204//                   L = l if l div BlockSize < h divBlockSize, otherwise (i div BlockSize) * BlockSize
205//                   H = h if l div BlockSize < h divBlockSize, otherwise L + BlockSize - 1
206
207//        An alternative way of looking at this is to eliminate all but the first
208//        and last marker positions within a block.
209
210//        The main point is that, if we apply B to inputs, perform the kernel
211//        and the apply B to outputs, we get the same result if we applied R
212//        (assuming that the kernel computations do not cross boundaries in M).
213
214//        This will be more efficient to compute, but still involves overhead
215//        for shifting and combining streams.
216
217//        I think it may be better to focus on the ReverseKernel adapter, that
218//        handles the reverse operations for both input and output.   This actually
219//        gives more flexibility, because, in a multiblock scenario, we can process
220//        the longest sequence of blocks such that both the beginning and end blocks
221//        have a one bit.   If there are any interior blocks with one bits, then
222//        they will be handled automatically without special shifting and masking.
223
224//        By the way, in my designs, I am wanting to have a callable Multiblock
225//        function, so that the Multiblock function for a Reversed Kernel just
226//        does a little work before calling the Multiblock function of the base kernel.
227//        That seems to have disappeared in the current system.
228
229
230        SliceOffset, /// NOT DONE
231
232        // Given a SliceOffset of k, the k-th stream set be the base (zeroth) stream set
233        // for the kernel. Internally, this stores a scalar in the kernel state and loads
234        // it once at the start of each segment.
235
236        Expandable, /// NOT DONE
237
238        // Indicates that the number of stream sets in this buffer can increase.
239
240        /** KERNEL ATTRIBUTES **/
241
242        CanTerminateEarly,
243
244        // Indicates that this kernel can call setTerminationSignal() to terminate the
245        // kernel prior to processing all of its input streams.
246
247        MustExplicitlyTerminate,
248
249        // This kernel terminates *only* after the programmer calls setTerminationSignal()
250        // and will be called even when there is no new input data when the prior kernels
251        // in the pipeline have also terminated.
252
253        SideEffecting,
254
255        // Mark this kernel as side-effecting, which will prevent the pipeline compiler
256        // from wrongly removing it from the pipeline. All sink kernels that produce a
257        // result through StdOut/StdErr should be marked as SideEffecting.
258
259        Family,
260
261        // Marks that this kernel is belongs to a named family and whether an input scalar
262        // should be surpressed from the generated "main" function because it will be bound
263        // within it to the appropriate handle/function pointers.
264
265    };
266
267    KindId getKind() const {
268        return mKind;
269    }
270
271    bool isAdd() const {
272        return mKind == KindId::Add;
273    }
274
275    bool isPrincipal() const {
276        return mKind == KindId::Principal;
277    }
278
279    bool isRoundUpTo() const {
280        return mKind == KindId::RoundUpTo;
281    }
282
283    bool isBlockSize() const {
284        return mKind == KindId::BlockSize;
285    }
286
287    unsigned amount() const {
288        return mAmount;
289    }
290
291    void setAmount(const unsigned amount) {
292        mAmount = amount;
293    }
294
295    bool operator == (const Attribute & other) const {
296        return mKind == other.mKind && mAmount == other.mAmount;
297    }
298
299    bool operator != (const Attribute & other) const {
300        return !(*this == other);
301    }
302
303    void print(llvm::raw_ostream & out) const noexcept;
304
305    Attribute(const KindId kind, const unsigned k) : mKind(kind), mAmount(k) { }
306
307private:
308
309    const KindId    mKind;
310    unsigned        mAmount;
311};
312
313struct AttributeSet : public std::vector<Attribute> {
314
315    using AttributeId = Attribute::KindId;
316
317    const AttributeSet & getAttributes() const {
318        return *this;
319    }
320
321    Attribute & findOrAddAttribute(const AttributeId id) {
322        if (Attribute * const attr = __findAttribute(id)) {
323            return *attr;
324        } else {
325            return addAttribute(Attribute(id, 0));
326        }
327    }
328
329    Attribute & findAttribute(const AttributeId id) const LLVM_READNONE {
330        return *__findAttribute(id);
331    }
332
333    Attribute & addAttribute(Attribute attribute);
334
335    bool hasAttributes() const LLVM_READNONE {
336        return !empty();
337    }
338
339    bool hasAttribute(const AttributeId id) const LLVM_READNONE {
340        return __findAttribute(id) != nullptr;
341    }
342
343    AttributeSet() = default;
344
345    AttributeSet(const AttributeSet &) = default;
346
347    AttributeSet(Attribute && attr) { emplace_back(std::move(attr)); }
348
349    AttributeSet(std::initializer_list<Attribute> attrs) : std::vector<Attribute>(attrs) { }
350
351protected:
352
353    void print(llvm::raw_ostream & out) const noexcept;
354
355private:
356
357    Attribute * __findAttribute(const AttributeId id) const;
358
359};
360
361inline Attribute Add1() {
362    return Attribute(Attribute::KindId::Add, 1);
363}
364
365inline Attribute RoundUpTo(const unsigned k) {
366    return Attribute(Attribute::KindId::RoundUpTo, k);
367}
368
369inline Attribute ManagedBuffer() {
370    return Attribute(Attribute::KindId::ManagedBuffer, 0);
371}
372
373inline Attribute Principal() {
374    return Attribute(Attribute::KindId::Principal, 0);
375}
376
377inline Attribute LookAhead(const unsigned k) {
378    return Attribute(Attribute::KindId::LookAhead, k);
379}
380
381inline Attribute LookBehind(const unsigned k) {
382    return Attribute(Attribute::KindId::LookBehind, k);
383}
384
385inline Attribute Deferred() {
386    return Attribute(Attribute::KindId::Deferred, 0);
387}
388
389inline Attribute BlockSize(const unsigned k) {
390    assert (k && ((k & (k - 1)) == 0));
391    return Attribute(Attribute::KindId::BlockSize, k);
392}
393
394inline Attribute Swizzled() {
395    return BlockSize(64);
396}
397
398inline Attribute Misaligned() {
399    return Attribute(Attribute::KindId::Misaligned, 0);
400}
401
402inline Attribute ConditionalRegionBegin() {
403    return Attribute(Attribute::KindId::ConditionalRegionBegin, 0);
404}
405
406inline Attribute ConditionalRegionEnd() {
407    return Attribute(Attribute::KindId::ConditionalRegionEnd, 0);
408}
409
410inline Attribute CanTerminateEarly() {
411    return Attribute(Attribute::KindId::CanTerminateEarly, 0);
412}
413
414inline Attribute MustExplicitlyTerminate() {
415    return Attribute(Attribute::KindId::MustExplicitlyTerminate, 0);
416}
417
418inline Attribute SideEffecting() {
419    return Attribute(Attribute::KindId::SideEffecting, 0);
420}
421
422inline Attribute Family() {
423    return Attribute(Attribute::KindId::Family, 0);
424}
425
426inline Attribute RequiresPopCountArray() {
427    return Attribute(Attribute::KindId::RequiresPopCountArray, 0);
428}
429
430inline Attribute RequiresNegatedPopCountArray() {
431    return Attribute(Attribute::KindId::RequiresNegatedPopCountArray, 0);
432}
433
434}
435#endif // ATTRIBUTES_H
Note: See TracBrowser for help on using the repository browser.