source: proto/parabix2/src/TagMatcher.hpp @ 2170

Last change on this file since 2170 was 2170, checked in by ksherdy, 7 years ago

Refactored TagMatcher and pablo template.

File size: 10.1 KB
Line 
1#ifndef TAGMATCHER_HPP_
2#define TAGMATCHER_HPP_
3
#include "../lib/bitblock.hpp"

#include <algorithm>
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
9using namespace std;
10
// Capacity of the open-element stack (tag_stack / tag_lgth_stack).
#define MAX_DEPTH 100
// Capacity of the per-element attribute-name table (Attr).
#define MAX_ATTS 100

// A name recorded as a (pointer, length) pair; the bytes are not
// NUL-terminated and are not owned by this struct.
struct attribute
{
  char * start;   // first byte of the name
  int lgth;       // number of bytes in the name
};
18
19template <uint64_t BUF_SIZE, uint64_t LA_SIZE> // buffer size, look ahead size
20class TagMatcher {
21
22public:
23  TagMatcher();
24  ~TagMatcher();
25  void setSrc(char * src);
26  int StreamScan(int chars_avail);
27  void store_streams(BitBlock tagMark, BitBlock NameFollow, BitBlock miscMarks, int chars_avail);
28  int tag_match(int pos, int chars_avail);
29  void Advance_buffer();
30  int does_match(char * s1, char * s2, int lgth);
31  int lookup_or_insert(char*s, int lgth);
32  int depth;
33
34private:
35  BitBlock tagMarks[BUF_SIZE/BLOCK_SIZE];
36  BitBlock miscMarks[BUF_SIZE/BLOCK_SIZE];
37  char tags_buf[BUF_SIZE];
38  int tags_buf_cur;
39  int stream_index;
40  char * srcbuf;
41
42  int inTagPos;
43  int finalStartPos;
44  char* tag_stack[MAX_DEPTH];
45  int tag_lgth_stack[MAX_DEPTH];
46  BitBlock NameFollows[BUF_SIZE/BLOCK_SIZE+1]; // 1 extra block for sentinel
47  int buf_base;
48  enum TagMatchState {InStartTag, InEndTag, InAttName, Clear} state;
49  enum TagMatchMode {StartOfFile, InFile} mode;
50  struct attribute Attr[MAX_ATTS];
51  struct attribute InAtt;
52  int att_index;
53  int InFinalEndTag;
54
55};
56
57template <uint64_t BUF_SIZE, uint64_t LA_SIZE>
58int TagMatcher<BUF_SIZE, LA_SIZE>::does_match(char * s1, char * s2, int lgth){
59    int matchlen = 0;
60    int i=0;
61    while (lgth > sizeof(BitBlock)) {
62      /* full 16 byte match */
63
64      if (bitblock::any(simd_xor(bitblock::load_unaligned((BitBlock*)&s1[i]),
65                 bitblock::load_unaligned((BitBlock*)&s2[i])))) {
66        return 0;
67      }
68      else {
69        lgth -= sizeof(BitBlock);
70        i +=sizeof(BitBlock);
71      }
72    }
73
74    scanword_t temp = ~hsimd<8>::signmask(simd<8>::eq(bitblock::load_unaligned((BitBlock*)&s1[i]),
75                                                  bitblock::load_unaligned((BitBlock*)&s2[i])));
76
77    return lgth <= scan_forward_zeroes(temp);
78}
79
80
81template <uint64_t BUF_SIZE, uint64_t LA_SIZE>
82int TagMatcher<BUF_SIZE, LA_SIZE>::lookup_or_insert(char* s, int lgth){
83  for(int i=0; i< this->att_index; i++)
84    if(lgth == this->Attr[i].lgth &&  this->does_match(s,this->Attr[i].start,lgth))
85      return 1;
86
87  this->Attr[att_index].start = s;
88  this->Attr[att_index].lgth = lgth;
89  this->att_index++;
90  return 0;
91}
92
93
94template <uint64_t BUF_SIZE, uint64_t LA_SIZE>
95int TagMatcher<BUF_SIZE, LA_SIZE>:: tag_match(int pos, int chars_avail) {
96        int rt_val=0;
97//      end tag
98    if(this->srcbuf[pos]=='/' ){
99          pos++;
100      this->depth--;
101      if (this->depth<0)
102            return pos;
103      int lgth = this->tag_lgth_stack[depth];
104
105      if (does_match(this->tag_stack[depth],&this->srcbuf[pos],lgth) && ((this->srcbuf[pos+lgth] == '>') ||(this->srcbuf[pos+lgth] <= ' '))) rt_val=0;
106          else if (pos + lgth >= BUF_SIZE + LA_SIZE) {
107        this->state = InEndTag;
108        this-> inTagPos = BUF_SIZE - pos;
109            rt_val=0;
110          }
111          else {
112//            cout << "end tag is " << string(&srcbuf[pos],tag_lgth_stack[depth]) << endl ;
113//            cout << "start tag is " << string(tag_stack[depth],tag_lgth_stack[depth]) << endl ;
114              fprintf(stderr,"tag name mismatch at position = %i\n",buf_base+pos);
115              exit(-1);
116          }
117
118      if (this->depth == 0){
119        while(this->srcbuf[pos]!='>'){
120              pos++;
121              if(pos>=chars_avail){
122        this->InFinalEndTag = 1;
123                return 0;
124              }
125            }
126        pos = bitstream_scan(this->miscMarks,pos+1);
127            if(pos!=chars_avail){
128          fprintf(stderr,"illegal content after root element at position = %i\n",this->buf_base+pos);
129              exit(-1);
130            }
131          }
132          return rt_val;
133        }
134//      empty tag
135    else if(this->srcbuf[pos]=='>'){
136      this->depth--;
137      if (this->depth == 0){
138        while(this->srcbuf[pos]!='>')
139              pos++;
140        pos = bitstream_scan(this->miscMarks,pos+1);
141
142            if(pos!=chars_avail){
143              fprintf(stderr,"illegal content after root element at position = %i\n",buf_base+pos);
144              exit(-1);
145            }
146          }
147        }
148//      start tag
149    else if(this->srcbuf[pos-1]=='<'){
150      this->att_index = 0;
151      if(this->depth<MAX_DEPTH){
152        int end_pos = bitstream_scan(this->NameFollows,pos);
153        this->tag_lgth_stack[this->depth] = end_pos-pos;
154        this->tag_stack[depth] = &this->srcbuf[pos];
155        if(end_pos<BUF_SIZE){
156         this->depth++;
157            }
158            else{
159          this->state = InStartTag;
160          this->finalStartPos = pos;
161            }
162          }
163          else{
164        fprintf(stderr,"Max nesting depth exceeded at position =%i. depth = %i\n",this->buf_base+pos, this->depth);
165            exit(-1);
166          }
167        }
168//      attribute
169        else{
170      int end_pos = bitstream_scan(this->NameFollows,pos);
171      if(end_pos<BUF_SIZE){
172        if(lookup_or_insert(&this->srcbuf[pos], end_pos-pos)){
173          fprintf(stderr,"Attribute name is not unique at position =%i.\n",this->buf_base+pos);
174              exit(-1);
175            }
176          }
177          else{
178        this->state = InAttName;
179        this->InAtt.start = &this->srcbuf[pos];
180        this->InAtt.lgth = BUF_SIZE-pos;
181          }
182        }
183        return 0;
184}
185
186template <uint64_t BUF_SIZE, uint64_t LA_SIZE>
187int TagMatcher<BUF_SIZE, LA_SIZE>::StreamScan(int chars_avail) {
188
189        int blk;
190        int blk_counts = (chars_avail+sizeof(scanword_t)*8-1)/(sizeof(scanword_t)*8);
191        int block_pos = 0;
192
193    if(this->mode == StartOfFile){
194          int pos = bitstream_scan(miscMarks,0);
195          if (pos==chars_avail){
196        fprintf(stderr,"no element at position =%i.\n",this->buf_base+pos);
197            exit(-1);
198          }
199      if(this->srcbuf[pos-1]!='<'|| this->srcbuf[pos]=='!'|| this->srcbuf[pos]=='/'){
200#ifdef DUMP
201print_register<BitBlock>("srcbuf", bitblock::load_unaligned((BitBlock *) srcbuf));
202#endif
203            fprintf(stderr,"illegal content before root element at position =%i.\n",buf_base+pos);
204            exit(-1);
205          }
206      this->mode = InFile;
207        }
208        for (blk = 0; blk < blk_counts; blk++) {
209        scanword_t s = ((scanword_t*)this->tagMarks)[blk];
210                while(s) {
211                        int code = tag_match(scan_forward_zeroes(s) + block_pos, chars_avail);
212                        if (code) return code;
213                        s = s & (s-1);  // clear rightmost bit.
214                }
215                block_pos += 8 * sizeof(scanword_t);
216        }
217
218        return 0;
219}
220
221template <uint64_t BUF_SIZE, uint64_t LA_SIZE>
222void TagMatcher<BUF_SIZE, LA_SIZE>::store_streams(BitBlock tagMark, BitBlock NameFollow, BitBlock miscMark, int chars_avail){
223#ifdef DUMP
224print_register<BitBlock>("tagMark", tagMark);
225print_register<BitBlock>("NameFollow", NameFollow);
226print_register<BitBlock>("miscMark", miscMark);
227printf("chars_avail = %i\n", chars_avail);
228printf("stream_index = %i\n", stream_index);
229#endif
230  this->tagMarks[this->stream_index] = tagMark;
231  this->miscMarks[this->stream_index] = simd_not(miscMark);
232  this->NameFollows[this->stream_index] = NameFollow;
233  this->stream_index++;
234  if(this->stream_index==1){
235
236    if (this->InFinalEndTag == 1){
237      int pos = -1;
238      while(this->srcbuf[pos]!='>'){
239        pos++;
240        if(pos>=chars_avail){
241      this->InFinalEndTag = 1;
242          return;
243        }
244      }
245      pos = bitstream_scan(this->miscMarks,pos+1);
246#ifdef DUMP
247print_register<BitBlock>("miscMarks[0]", miscMarks[0]);
248printf("pos = %i\n", pos);
249#endif
250      if(pos!=chars_avail){
251        fprintf(stderr,"illegal content after root element at position = %i\n",buf_base+pos);
252        exit(-1);
253      }
254    }
255
256    if(this->state == InStartTag) {
257      this->state = this->Clear;
258      int remain_lgth = bitstream_scan(this->NameFollows,0);
259      memcpy(&this->tags_buf[this->tags_buf_cur],this->srcbuf,remain_lgth);
260      this->tag_lgth_stack[this->depth] += remain_lgth;
261      this->depth++;
262    }
263    else if (this->state == InEndTag) {
264      this->state = Clear;
265      int lgth = this->tag_lgth_stack[this->depth];
266      if (does_match(this->tag_stack[this->depth]+this->inTagPos,this->srcbuf,lgth-this->inTagPos) && ((this->srcbuf[lgth-this->inTagPos] == '>') ||(this->srcbuf[lgth-this->inTagPos] <= ' '))) return ;
267      else {
268          fprintf(stderr,"tag name mismatch at position = %i\n",buf_base);
269          exit(-1);
270      }
271    }
272    else if (this->state == InAttName) {
273      this->state = Clear;
274      int remain_lgth = bitstream_scan(this->NameFollows,0);
275      memcpy(&this->tags_buf[this->tags_buf_cur],this->srcbuf,remain_lgth);
276      if(lookup_or_insert(this->InAtt.start, this->InAtt.lgth+remain_lgth)){
277              fprintf(stderr,"Attribute name is not unique at position =%i.\n",buf_base);
278              exit(-1);
279      }
280    }
281  }
282}
283
284template <uint64_t BUF_SIZE, uint64_t LA_SIZE>
285TagMatcher<BUF_SIZE, LA_SIZE>::TagMatcher(){
286  this->stream_index = 0;
287  this->depth = 0;
288  this->buf_base = 0;
289  this->state = Clear;
290  this->mode = StartOfFile;
291  this->InFinalEndTag = 0;
292  this->NameFollows[BUF_SIZE/BLOCK_SIZE]=simd<1>::constant<1>();// TODO - verify simd_const_1(1);  //sentinel
293}
294
295template <uint64_t BUF_SIZE, uint64_t LA_SIZE>
296TagMatcher<BUF_SIZE, LA_SIZE>::~TagMatcher(){
297
298}
299
300template <uint64_t BUF_SIZE, uint64_t LA_SIZE>
301void TagMatcher<BUF_SIZE, LA_SIZE>::setSrc(char * src){
302  this->srcbuf = src;
303}
304
305template <uint64_t BUF_SIZE, uint64_t LA_SIZE>
306void TagMatcher<BUF_SIZE, LA_SIZE>::Advance_buffer(){
307  this->buf_base += BUF_SIZE;
308  this->stream_index=0;
309  this->tags_buf_cur = 0;
310  this->att_index = 0;
311  for(int i=0; i< this->depth; i++){
312    if(&this->tags_buf[this->tags_buf_cur]!=this->tag_stack[i])
313      memcpy(&this->tags_buf[this->tags_buf_cur],this->tag_stack[i],this->tag_lgth_stack[i]);
314    this->tag_stack[i] = &this->tags_buf[tags_buf_cur];
315    this->tags_buf_cur += this->tag_lgth_stack[i];
316  }
317  if(this->state == InStartTag) {
318      memcpy(&this->tags_buf[this->tags_buf_cur],&this->srcbuf[this->finalStartPos],this->tag_lgth_stack[this->depth]);
319      this->tag_stack[depth] = &this->tags_buf[this->tags_buf_cur];
320      this->tags_buf_cur += this->tag_lgth_stack[this->depth];
321  }
322  else if(this->state == InEndTag) {
323     memcpy(&this->tags_buf[this->tags_buf_cur],this->tag_stack[this->depth],this->tag_lgth_stack[this->depth]);
324    this->tag_stack[depth] = &this->tags_buf[this->tags_buf_cur];
325    this->tags_buf_cur += this->tag_lgth_stack[this->depth];
326  }
327  else if(this->state == InAttName) {
328      memcpy(&this->tags_buf[this->tags_buf_cur],this->InAtt.start,this->InAtt.lgth);
329      this->InAtt.start = &this->tags_buf[tags_buf_cur];
330      this->tags_buf_cur += this->InAtt.lgth;
331  }
332  this->srcbuf[-1] = this->srcbuf[BUF_SIZE-1];
333}
334
335
336#endif /* TAGMATCHER_HPP_ */
Note: See TracBrowser for help on using the repository browser.