Changeset 2160
- Timestamp: 05/24/12 20:12:42 (13 months ago)
- Location: proto/parabix2
- Files: 2 modified
  - pablo_template.cpp (modified) (5 diffs)
  - src/TagMatcher.hpp (modified) (16 diffs)
Legend:
- Unmodified
- Added
- Removed
proto/parabix2/pablo_template.cpp
r2155 r2160 13 13 #include "../lib/bitblock_iterator.hpp" 14 14 #include "../lib/s2p.hpp" 15 16 #define SEGMENT_BLOCKS 12 17 #define BUFFER_SIZE (BLOCK_SIZE * SEGMENT_BLOCKS) 18 #define OVERLAP_BUFSIZE (sizeof(BitBlock)) 15 #include "../lib/perflib/perfsec.h" 19 16 20 17 #include "xmldecl.h" 21 18 #include "namechars.h" 22 #include "../lib/perflib/perfsec.h"23 24 19 #include "TagMatcher.hpp" 25 20 #include "LineColTracker.hpp" … … 29 24 30 25 #ifdef BUFFER_PROFILING 31 BOM_Table * parser_timer; 32 26 BOM_Table * parser_timer; 33 27 #elif CODE_CLOCKER 34 #define NUM_EVENTS 1 35 int Events[NUM_EVENTS] = {PAPI_TOT_CYC}; 36 //int Events[NUM_EVENTS] = {PAPI_L2_DCM}; 37 //int Events[NUM_EVENTS] = {PAPI_TOT_CYC, PAPI_BR_MSP}; 38 int cal_size = 20; 39 CC * parser_timer = new CC(Events,NUM_EVENTS,cal_size); 28 //#define NUM_EVENTS 1 29 //int Events[NUM_EVENTS] = {PAPI_TOT_CYC}; 30 //int Events[NUM_EVENTS] = {PAPI_L2_DCM}; 31 #define NUM_EVENTS 2 32 int Events[NUM_EVENTS] = {PAPI_TOT_CYC, PAPI_BR_MSP}; 33 int cal_size = 20; 34 CC * parser_timer = new CC(Events,NUM_EVENTS,cal_size); 40 35 #else 41 void * parser_timer;36 void * parser_timer; 42 37 #endif 43 38 44 int block_base=0;45 int buffer_base=0;46 char * source;47 48 LineColTracker tracker;49 TagMatcher matcher;50 39 ErrorTracker error_tracker; 51 40 BitBlock EOF_mask = simd<1>::constant<1>(); 52 41 53 static inline int NameStrt_check(int pos); 54 static inline int Name_check(int pos); 55 static inline int PIName_check(int pos); 56 static inline int CD_check(int pos); 57 static inline int GenRef_check(int pos); 58 static inline int HexRef_check(int pos); 59 static inline int DecRef_check(int pos); 60 static inline int AttRef_check(int pos); 61 42 ////////////////////////////////////////////////////////////////////////////////////////// 43 // Buffer Management 44 ////////////////////////////////////////////////////////////////////////////////////////// 45 #include "../lib/buffer.hpp" 46 47 #define OVERLAP_BUFSIZE 
PADDING_SIZE //sizeof(BitBlock) 48 49 ////////////////////////////////////////////////////////////////////////////////////////// 50 // @ global depends on 'error_tracker' and 'EOF_mask' definitions. 51 ////////////////////////////////////////////////////////////////////////////////////////// 62 52 @global 63 53 64 static inline void s2p_do_block(BytePack U8[], Basis_bits & basis_bits); 65 static inline void s2p_do_final_block(BytePack U8[], Basis_bits & basis_bits, BitBlock EOF_mask); 66 static inline void postprocess_do_block(Lex & lex, CtCDPI_Callouts & ctCDPI_Callouts, Ref_Callouts & ref_Callouts, Check_streams & check_streams, int chars_avail); 67 68 void do_process(FILE *infile, FILE *outfile); 69 70 static inline void validate_block(BitBlockForwardIterator & start, int block_base, int is_valid(int)); 71 static inline void validate_block(BitBlockForwardIterator & start, int block_base, int is_valid(int,int)); 54 ////////////////////////////////////////////////////////////////////////////////////////// 55 // Headers that depend @ global stream struct types. 
56 ////////////////////////////////////////////////////////////////////////////////////////// 57 #include "../lib/transpose.hpp" 58 #include "post_process.hpp" 59 60 static void do_process(FILE *infile, FILE *outfile); 72 61 73 62 int main(int argc, char * argv[]) { … … 100 89 } 101 90 102 //PERF_SEC_BIND(1);91 PERF_SEC_BIND(1); 103 92 104 93 PERF_SEC_INIT(parser_timer); … … 116 105 } 117 106 118 /* s2p Definitions */119 static inline void s2p_do_block(BytePack U8[], Basis_bits & basis_bits) {120 s2p(U8[0], U8[1], U8[2], U8[3], U8[4], U8[5], U8[6], U8[7],121 basis_bits.bit_0, basis_bits.bit_1, basis_bits.bit_2, basis_bits.bit_3, basis_bits.bit_4, basis_bits.bit_5, basis_bits.bit_6, basis_bits.bit_7);122 }123 124 static inline void s2p_do_final_block(BytePack U8[], Basis_bits & basis_bits, BitBlock EOF_mask) {125 s2p_do_block(U8, basis_bits);126 basis_bits.bit_0 = simd_and(basis_bits.bit_0, EOF_mask);127 basis_bits.bit_1 = simd_and(basis_bits.bit_1, EOF_mask);128 basis_bits.bit_2 = simd_and(basis_bits.bit_2, EOF_mask);129 basis_bits.bit_3 = simd_and(basis_bits.bit_3, EOF_mask);130 basis_bits.bit_4 = simd_and(basis_bits.bit_4, EOF_mask);131 basis_bits.bit_5 = simd_and(basis_bits.bit_5, EOF_mask);132 basis_bits.bit_6 = simd_and(basis_bits.bit_6, EOF_mask);133 basis_bits.bit_7 = simd_and(basis_bits.bit_7, EOF_mask);134 }135 136 137 static inline int NameStrt_check(int pos) {138 if(XML_10_UTF8_NameStrt_bytes((unsigned char*)&source[pos]) == 0){139 return XMLTestSuiteError::NAME_START;140 }141 return 0;142 }143 144 static inline int Name_check(int pos) {145 if(XML_10_UTF8_NameChar_bytes((unsigned char*)&source[pos]) == 0){146 return XMLTestSuiteError::NAME;147 }148 return 0;149 }150 151 static inline int PIName_check(int pos, int file_pos) {152 if (at_XxMmLll<ASCII>((unsigned char*)&source[pos]) && (source[pos+3]=='?' 
|| source[pos+3]<= ' ')) {153 // "<?xml" legal at start of file.154 if (!((file_pos == 2) && at_XmlDecl_start<ASCII>((unsigned char*)&source[0]))) {155 return XMLTestSuiteError::XMLPINAME;156 }157 }158 return 0;159 }160 161 static inline int CD_check(int pos) {162 if (!at_CDATA1<ASCII>((unsigned char*)&source[pos])){163 return XMLTestSuiteError::CDATA;164 }165 return 0;166 }167 168 static inline int GenRef_check(int pos) {169 unsigned char* s = (unsigned char*)&source[pos];170 if (!(at_Ref_gt<ASCII>(s)||at_Ref_lt<ASCII>(s)||at_Ref_amp<ASCII>(s)||at_Ref_quot<ASCII>(s)||at_Ref_apos<ASCII>(s))){171 return XMLTestSuiteError::UNDEFREF;172 }173 return 0;174 }175 176 static inline int HexRef_check(int pos) {177 unsigned char* s = (unsigned char*)&source[pos];178 int ch_val = 0;179 while(at_HexDigit<ASCII>(s)){180 ch_val = HexVal<ASCII>(s[0]) + (ch_val<<4);181 if (ch_val> 0x10FFFF ){182 return XMLTestSuiteError::CHARREF;183 }184 s++;185 }186 if ((ch_val == 0x0) || ((ch_val | 0x7FF) == 0xDFFF)|| ((ch_val | 0x1) == 0xFFFF)){187 return XMLTestSuiteError::CHARREF;188 }189 else if (((ch_val < 0x20) && (ch_val != 0x9) && (ch_val != 0xD) && (ch_val != 0xA))){190 return XMLTestSuiteError::XML10CHARREF;191 }192 return 0;193 }194 195 static inline int DecRef_check(int pos) {196 unsigned char* s = (unsigned char*)&source[pos];197 int ch_val = 0;198 while(at_HexDigit<ASCII>(s)){199 ch_val = DigitVal<ASCII>(s[0]) + ch_val*10;200 if (ch_val> 0x10FFFF ){201 return XMLTestSuiteError::CHARREF;202 }203 s++;204 }205 if ((ch_val == 0x0) || ((ch_val | 0x7FF) == 0xDFFF)|| ((ch_val | 0x1) == 0xFFFF)){206 return XMLTestSuiteError::CHARREF;207 }208 else if (((ch_val < 0x20) && (ch_val != 0x9) && (ch_val != 0xD) && (ch_val != 0xA))){209 return XMLTestSuiteError::XML10CHARREF;210 }211 return 0;212 }213 214 static inline int AttRef_check(int pos) {215 unsigned char* s = (unsigned char*)&source[pos];216 int ch_val = 0;217 if(s[0]=='#'){218 s++;219 if(s[0]=='x' || s[0]=='X'){220 s++;221 
while(at_HexDigit<ASCII>(s)){222 ch_val = HexVal<ASCII>(s[0]) + (ch_val<<4);223 s++;224 }225 }226 else{227 while(at_HexDigit<ASCII>(s)){228 ch_val = DigitVal<ASCII>(s[0]) + ch_val*10;229 s++;230 }231 }232 if (ch_val==60){233 return XMLTestSuiteError::ATTREF;234 }235 }236 else if(at_Ref_lt<ASCII>(s)){237 return XMLTestSuiteError::ATTREF;238 }239 return 0;240 }241 242 static inline void validate_block(BitBlockForwardIterator & start, int block_base, int is_valid(int)) {243 244 int pos, block_pos;245 BitBlockForwardIterator end;246 while(start != end) {247 248 block_pos = block_base + *start;249 int rv = is_valid(block_pos);250 251 if (rv) {252 int error_line, error_column;253 tracker.get_Line_and_Column(block_pos, error_line, error_column);254 ReportError(XMLTestSuiteError::get_msg(rv), error_line, error_column);255 exit(-1);256 }257 start++;258 }259 }260 261 static inline void validate_block(BitBlockForwardIterator & start, int block_base, int buffer_base, int is_valid(int,int)) {262 263 int pos, block_pos, file_pos;264 BitBlockForwardIterator end;265 while(start != end) {266 267 block_pos = block_base + *start;268 file_pos = block_pos+buffer_base;269 270 271 int rv = is_valid(block_pos, file_pos);272 273 if (rv) {274 int error_line, error_column;275 tracker.get_Line_and_Column(block_pos, error_line, error_column);276 ReportError(XMLTestSuiteError::get_msg(rv), error_line, error_column);277 exit(-1);278 }279 start++;280 }281 }282 283 static inline void postprocess_do_block(Lex & lex, CtCDPI_Callouts & ctCDPI_Callouts, Ref_Callouts & ref_Callouts, Check_streams & check_streams, int chars_avail){284 BitBlockForwardIterator iter;285 286 tracker.StoreNewlines(lex.LF);287 288 if (bitblock::any(simd_or(check_streams.non_ascii_name_starts, check_streams.non_ascii_names))) {289 iter.init(&check_streams.non_ascii_name_starts);290 validate_block(iter, block_base, NameStrt_check);291 iter.init(&check_streams.non_ascii_names);292 validate_block(iter, block_base, Name_check);293 
}294 if (bitblock::any(ctCDPI_Callouts.PI_name_starts)){295 iter.init(&(ctCDPI_Callouts.PI_name_starts));296 validate_block(iter, block_base, buffer_base, PIName_check);297 }298 if (bitblock::any(ctCDPI_Callouts.CD_starts)){299 iter.init(&ctCDPI_Callouts.CD_starts);300 validate_block(iter, block_base, CD_check);301 }302 if(bitblock::any(ref_Callouts.GenRef_starts)){303 iter.init(&ref_Callouts.GenRef_starts);304 validate_block(iter, block_base, GenRef_check);305 }306 if(bitblock::any(ref_Callouts.DecRef_starts)){307 iter.init(&ref_Callouts.DecRef_starts);308 validate_block(iter, block_base, DecRef_check);309 }310 if(bitblock::any(ref_Callouts.HexRef_starts)){311 iter.init(&ref_Callouts.HexRef_starts);312 validate_block(iter, block_base, HexRef_check);313 }314 if(bitblock::any(check_streams.att_refs)){315 iter.init(&check_streams.att_refs);316 validate_block(iter, block_base, AttRef_check);317 }318 319 if(error_tracker.Has_Noted_Error()){320 int error_line, error_column;321 tracker.get_Line_and_Column(error_tracker.Noted_Pos_In_Block(), error_line, error_column);322 ReportError(error_tracker.Noted_Error_Msg(), error_line, error_column);323 exit(-1);324 }325 326 matcher.store_streams(check_streams.tag_marks, check_streams.name_follows, check_streams.misc_mask, chars_avail);327 tracker.AdvanceBlock();328 329 }330 331 107 void do_process(FILE *infile, FILE *outfile) { 332 108 333 @decl 334 335 int buf_pos = 0; 336 int block_pos = 0; 337 int chars_avail = 0; 338 int check_pos = 0; 339 int chars_read = 0; 340 BytePack buf[(BUFFER_SIZE+BLOCK_SIZE+OVERLAP_BUFSIZE*2)/sizeof(BitBlock)]; 341 342 char * srcbuf = ((char *) buf) + OVERLAP_BUFSIZE; 343 buffer_base = buf_pos; 344 source = srcbuf; 345 346 chars_read = fread((void *)srcbuf, 1, BUFFER_SIZE + OVERLAP_BUFSIZE, infile); 347 chars_avail = chars_read; 348 if (chars_avail > BUFFER_SIZE) chars_avail = BUFFER_SIZE; 349 350 matcher.setSrc(srcbuf); 351 352 if(chars_read<4){ 109 @decl 110 111 LineColTracker tracker; 112 
TagMatcher<SEGMENT_SIZE,OVERLAP_BUFSIZE> matcher; 113 114 uint8_t * src_buf; 115 int block_base=0; 116 int buffer_base=0; 117 int buffer_pos = 0; 118 int block_pos = 0; 119 int chars_avail = 0; 120 int check_pos = 0; 121 int chars_read = 0; 122 123 ////////////////////////////////////////////////////////////////////////////////////////// 124 // Buffer Management 125 ////////////////////////////////////////////////////////////////////////////////////////// 126 BitBlock buf[(PADDING_SIZE + SEGMENT_SIZE + PADDING_SIZE)/sizeof(BitBlock)]; 127 src_buf = (uint8_t *)buf + PADDING_SIZE; 128 129 //ALLOC_STATIC_ALIGNED_BYTE_BUFFER(src_buf, (PADDING_SIZE + SEGMENT_SIZE + PADDING_SIZE)); 130 131 buffer_base = buffer_pos; 132 chars_read = fread((void *)src_buf, 1, BUFFER_SIZE, infile); 133 chars_avail = chars_read; 134 if (chars_avail > SEGMENT_SIZE) chars_avail = SEGMENT_SIZE; 135 136 ////////////////////////////////////////////////////////////////////////////////////////// 137 // XML Validation / Content Model 138 ////////////////////////////////////////////////////////////////////////////////////////// 139 if(chars_read<4){ 353 140 fprintf(stderr,"File is too short. 
Not well formed.\n"); 354 141 exit(-1); 355 }356 357 Entity_Info * e = new Entity_Info;358 e->AnalyzeSignature((unsigned char *)srcbuf);359 360 if (e->code_unit_base == ASCII) {361 362 XML_Decl_Parser<ASCII> decl_parser((unsigned char *)src buf);142 } 143 144 Entity_Info * e = new Entity_Info; 145 e->AnalyzeSignature((unsigned char *)src_buf); 146 147 if (e->code_unit_base == ASCII) { 148 149 XML_Decl_Parser<ASCII> decl_parser((unsigned char *)src_buf); 363 150 364 151 decl_parser.ReadXMLInfo(*e); 365 152 366 153 if (e->code_unit_size != SingleByte || (e->has_encoding_decl && (!at_UTF_8(e->encoding)))){ 367 fprintf(stderr,"Sorry, this xmlwf demo only works for UTF-8.\n"); 368 exit(-1); 369 } 370 } 371 else { 372 fprintf(stderr,"Sorry, this xmlwf demo does not process EBCDIC.\n"); 373 exit(-1); 374 } 375 376 if (e->content_start != 0) { 377 memmove(&srcbuf[0], &srcbuf[e->content_start], chars_read - e->content_start); 378 buf_pos = e->content_start; 379 if (chars_avail == BUFFER_SIZE) { 380 chars_read = chars_read - e->content_start + 381 fread(&srcbuf[chars_read-e->content_start], 1, e->content_start, infile); 382 chars_avail = chars_read; 383 if (chars_avail > BUFFER_SIZE) chars_avail = BUFFER_SIZE; 384 } 385 else { 386 chars_read -=e->content_start; 387 chars_avail -=e->content_start; 388 } 389 } 390 391 @stream_stmts 392 393 /* Full Buffers */ 394 395 while (chars_avail == BUFFER_SIZE) { 154 fprintf(stderr,"Sorry, this xmlwf demo only works for UTF-8.\n"); 155 exit(-1); 156 } 157 } 158 else { 159 fprintf(stderr,"Sorry, this xmlwf demo does not process EBCDIC.\n"); 160 exit(-1); 161 } 162 163 if (e->content_start != 0) { 164 memmove(&src_buf[0], &src_buf[e->content_start], chars_read - e->content_start); 165 buffer_pos = e->content_start; 166 if (chars_avail == SEGMENT_SIZE) { 167 chars_read = chars_read - e->content_start + fread(&src_buf[chars_read-e->content_start], 1, e->content_start, infile); 168 chars_avail = chars_read; 169 if (chars_avail > SEGMENT_SIZE) 
chars_avail = SEGMENT_SIZE; 170 } 171 else { 172 chars_read -=e->content_start; 173 chars_avail -=e->content_start; 174 } 175 } 176 177 @stream_stmts 178 179 180 ////////////////////////////////////////////////////////////////////////////////////////// 181 // Full Segments 182 ////////////////////////////////////////////////////////////////////////////////////////// 183 matcher.setSrc((char *)src_buf); 184 while (chars_avail == SEGMENT_SIZE) { 396 185 PERF_SEC_START(parser_timer); 397 186 for (int blk = 0; blk < SEGMENT_BLOCKS; blk++) { 398 187 block_base = blk*BLOCK_SIZE; 399 s2p_do_block((BytePack *) &srcbuf[block_base], basis_bits); 188 s2p_do_block((BytePack *) &src_buf[block_base], basis_bits); 189 400 190 @block_stmts 401 postprocess_do_block(lex, ctCDPI_Callouts, ref_Callouts, check_streams, chars_avail); 191 192 tracker.StoreNewlines(lex.LF); 193 postprocess_do_block(lex, ctCDPI_Callouts, ref_Callouts, check_streams, (char *)src_buf, buffer_base, block_base, chars_avail, tracker); 194 matcher.store_streams(check_streams.tag_marks, check_streams.name_follows, check_streams.misc_mask, chars_avail); 195 tracker.AdvanceBlock(); 402 196 } 403 197 matcher.StreamScan(chars_avail); … … 406 200 407 201 int bytes_left = chars_read - chars_avail; 408 memmove(src buf, &srcbuf[BUFFER_SIZE], bytes_left);409 chars_read = fread(&src buf[bytes_left],1, BUFFER_SIZE + OVERLAP_BUFSIZE - bytes_left, infile) + bytes_left;202 memmove(src_buf, &src_buf[SEGMENT_SIZE], bytes_left); 203 chars_read = fread(&src_buf[bytes_left], 1, BUFFER_SIZE - bytes_left, infile) + bytes_left; 410 204 chars_avail = chars_read; 411 if (chars_avail > BUFFER_SIZE) chars_avail = BUFFER_SIZE; 412 buf_pos += chars_avail; 413 buffer_base = buf_pos; 414 } 415 /* Final Partial Buffer */ 205 if (chars_avail > SEGMENT_SIZE) chars_avail = SEGMENT_SIZE; 206 buffer_pos += chars_avail; 207 buffer_base = buffer_pos; 208 } 209 210 
////////////////////////////////////////////////////////////////////////////////////////// 211 // Final Partial Segment 212 ////////////////////////////////////////////////////////////////////////////////////////// 416 213 PERF_SEC_START(parser_timer); 417 214 418 215 block_pos = 0; 419 216 int remaining = chars_avail; 420 /* Full Blocks */ 217 218 /* Full Blocks */ 421 219 while (remaining >= BLOCK_SIZE) { 422 block_base = block_pos;423 s2p_do_block((BytePack *) &srcbuf[block_pos], basis_bits);220 block_base = block_pos; 221 s2p_do_block((BytePack *) &src_buf[block_pos], basis_bits); 424 222 @block_stmts 425 postprocess_do_block(lex, ctCDPI_Callouts, ref_Callouts, check_streams, chars_avail); 426 block_pos += BLOCK_SIZE; 223 tracker.StoreNewlines(lex.LF); 224 postprocess_do_block(lex, ctCDPI_Callouts, ref_Callouts, check_streams, (char *)src_buf, buffer_base, block_base, chars_avail, tracker); 225 matcher.store_streams(check_streams.tag_marks, check_streams.name_follows, check_streams.misc_mask, chars_avail); 226 tracker.AdvanceBlock(); 227 block_pos += BLOCK_SIZE; 427 228 remaining -= BLOCK_SIZE; 428 229 } 429 230 block_base = block_pos; 231 232 /* Partial Block or Any Carry */ 430 233 if (remaining > 0 || @any_carry) { 431 234 EOF_mask = bitblock::srl(simd<1>::constant<1>(), convert(BLOCK_SIZE-remaining)); 432 s2p_do_final_block((BytePack *) &src buf[block_pos], basis_bits, EOF_mask);235 s2p_do_final_block((BytePack *) &src_buf[block_pos], basis_bits, EOF_mask); 433 236 @final_block_stmts 434 postprocess_do_block(lex, ctCDPI_Callouts, ref_Callouts, check_streams, chars_avail); 435 } 436 buf_pos += chars_avail; 437 buffer_base = buf_pos; 237 tracker.StoreNewlines(lex.LF); 238 postprocess_do_block(lex, ctCDPI_Callouts, ref_Callouts, check_streams, (char *)src_buf, buffer_base, block_base, chars_avail, tracker); 239 matcher.store_streams(check_streams.tag_marks, check_streams.name_follows, check_streams.misc_mask, chars_avail); 240 tracker.AdvanceBlock(); 241 242 } 
243 244 buffer_pos += chars_avail; 245 buffer_base = buffer_pos; 438 246 439 247 matcher.StreamScan(chars_avail); 440 248 matcher.Advance_buffer(); 441 442 249 443 250 PERF_SEC_END(parser_timer, chars_avail); -
proto/parabix2/src/TagMatcher.hpp
r2155 r2160 4 4 #include "../lib/bitblock.hpp" 5 5 6 #define MAX_DEPTH 1007 6 #include <algorithm> 8 7 #include <iostream> … … 10 9 using namespace std; 11 10 12 13 11 #define MAX_DEPTH 100 14 12 #define MAX_ATTS 100 … … 19 17 }; 20 18 19 template <uint64_t BUF_SIZE, uint64_t OVER_SIZE> 21 20 class TagMatcher { 22 public: 23 BitBlock tagMarks[BUFFER_SIZE/BLOCK_SIZE]; 24 BitBlock miscMarks[BUFFER_SIZE/BLOCK_SIZE]; 25 char tags_buf[BUFFER_SIZE]; 26 int tags_buf_cur; 27 int stream_index; 28 char * srcbuf; 29 int depth; 30 int inTagPos; 31 int finalStartPos; 32 char* tag_stack[MAX_DEPTH]; 33 int tag_lgth_stack[MAX_DEPTH]; 34 BitBlock NameFollows[BUFFER_SIZE/BLOCK_SIZE+1]; // 1 extra block for sentinel 35 int buf_base; 36 enum TagMatchState {InStartTag, InEndTag, InAttName, Clear} state; 37 enum TagMatchMode {StartOfFile, InFile} mode; 38 struct attribute Attr[MAX_ATTS]; 39 struct attribute InAtt; 40 int att_index; 41 int InFinalEndTag; 42 21 22 public: 43 23 TagMatcher(); 44 24 ~TagMatcher(); … … 50 30 int does_match(char * s1, char * s2, int lgth); 51 31 int lookup_or_insert(char*s, int lgth); 32 33 int depth; 34 35 private: 36 BitBlock tagMarks[BUF_SIZE/BLOCK_SIZE]; 37 BitBlock miscMarks[BUF_SIZE/BLOCK_SIZE]; 38 char tags_buf[BUF_SIZE]; 39 int tags_buf_cur; 40 int stream_index; 41 char * srcbuf; 42 43 int inTagPos; 44 int finalStartPos; 45 char* tag_stack[MAX_DEPTH]; 46 int tag_lgth_stack[MAX_DEPTH]; 47 BitBlock NameFollows[BUF_SIZE/BLOCK_SIZE+1]; // 1 extra block for sentinel 48 int buf_base; 49 enum TagMatchState {InStartTag, InEndTag, InAttName, Clear} state; 50 enum TagMatchMode {StartOfFile, InFile} mode; 51 struct attribute Attr[MAX_ATTS]; 52 struct attribute InAtt; 53 int att_index; 54 int InFinalEndTag; 55 52 56 }; 53 57 54 int TagMatcher::lookup_or_insert(char* s, int lgth){ 55 for(int i=0; i< att_index; i++) 56 if(lgth == Attr[i].lgth && does_match(s,Attr[i].start,lgth)) 57 return 1; 58 59 Attr[att_index].start = s; 60 Attr[att_index].lgth = lgth; 61 
att_index++; 62 return 0; 63 } 64 65 int TagMatcher::does_match(char * s1, char * s2, int lgth){ 58 template <uint64_t BUF_SIZE, uint64_t OVER_SIZE> 59 int TagMatcher<BUF_SIZE, OVER_SIZE>::does_match(char * s1, char * s2, int lgth){ 66 60 int matchlen = 0; 67 61 int i=0; … … 69 63 /* full 16 byte match */ 70 64 71 if (bitblock::any(simd_xor(bitblock::load_unaligned((BitBlock*)&s1[i]), 72 bitblock::load_unaligned((BitBlock*)&s2[i])))) {73 return 0;65 if (bitblock::any(simd_xor(bitblock::load_unaligned((BitBlock*)&s1[i]), 66 bitblock::load_unaligned((BitBlock*)&s2[i])))) { 67 return 0; 74 68 } 75 69 else { 76 lgth -= sizeof(BitBlock);77 i +=sizeof(BitBlock);78 } 79 } 80 81 scanword_t temp = ~hsimd<8>::signmask(simd<8>::eq(bitblock::load_unaligned((BitBlock*)&s1[i]),70 lgth -= sizeof(BitBlock); 71 i +=sizeof(BitBlock); 72 } 73 } 74 75 scanword_t temp = ~hsimd<8>::signmask(simd<8>::eq(bitblock::load_unaligned((BitBlock*)&s1[i]), 82 76 bitblock::load_unaligned((BitBlock*)&s2[i]))); 83 77 84 return lgth <= scan_forward_zeroes(temp); 85 } 86 87 88 int TagMatcher:: tag_match(int pos, int chars_avail) { 78 return lgth <= scan_forward_zeroes(temp); 79 } 80 81 82 template <uint64_t BUF_SIZE, uint64_t OVER_SIZE> 83 int TagMatcher<BUF_SIZE, OVER_SIZE>::lookup_or_insert(char* s, int lgth){ 84 for(int i=0; i< this->att_index; i++) 85 if(lgth == this->Attr[i].lgth && this->does_match(s,this->Attr[i].start,lgth)) 86 return 1; 87 88 this->Attr[att_index].start = s; 89 this->Attr[att_index].lgth = lgth; 90 this->att_index++; 91 return 0; 92 } 93 94 95 template <uint64_t BUF_SIZE, uint64_t OVER_SIZE> 96 int TagMatcher<BUF_SIZE, OVER_SIZE>:: tag_match(int pos, int chars_avail) { 89 97 int rt_val=0; 90 98 // end tag 91 if(srcbuf[pos]=='/' ){99 if(this->srcbuf[pos]=='/' ){ 92 100 pos++; 93 depth--;94 if (depth<0)101 this->depth--; 102 if (this->depth<0) 95 103 return pos; 96 int lgth =tag_lgth_stack[depth];97 98 if (does_match(tag_stack[depth],&srcbuf[pos],lgth) && ((srcbuf[pos+lgth] == 
'>') ||(srcbuf[pos+lgth] <= ' '))) rt_val=0;99 else if (pos + lgth >= BUFFER_SIZE + OVERLAP_BUFSIZE) {100 state = InEndTag;101 inTagPos = BUFFER_SIZE - pos;104 int lgth = this->tag_lgth_stack[depth]; 105 106 if (does_match(this->tag_stack[depth],&this->srcbuf[pos],lgth) && ((this->srcbuf[pos+lgth] == '>') ||(this->srcbuf[pos+lgth] <= ' '))) rt_val=0; 107 else if (pos + lgth >= BUF_SIZE + OVER_SIZE) { 108 this->state = InEndTag; 109 this-> inTagPos = BUF_SIZE - pos; 102 110 rt_val=0; 103 111 } … … 109 117 } 110 118 111 if (depth == 0){112 while(srcbuf[pos]!='>'){119 if (this->depth == 0){ 120 while(this->srcbuf[pos]!='>'){ 113 121 pos++; 114 122 if(pos>=chars_avail){ 115 InFinalEndTag = 1;123 this->InFinalEndTag = 1; 116 124 return 0; 117 125 } 118 126 } 119 pos = bitstream_scan(miscMarks,pos+1); 127 pos = bitstream_scan(this->miscMarks,pos+1); 128 if(pos!=chars_avail){ 129 fprintf(stderr,"illegal content after root element at position = %i\n",this->buf_base+pos); 130 exit(-1); 131 } 132 } 133 return rt_val; 134 } 135 // empty tag 136 else if(this->srcbuf[pos]=='>'){ 137 this->depth--; 138 if (this->depth == 0){ 139 while(this->srcbuf[pos]!='>') 140 pos++; 141 pos = bitstream_scan(this->miscMarks,pos+1); 142 120 143 if(pos!=chars_avail){ 121 144 fprintf(stderr,"illegal content after root element at position = %i\n",buf_base+pos); … … 123 146 } 124 147 } 125 return rt_val;126 }127 // empty tag128 else if(srcbuf[pos]=='>'){129 depth--;130 if (depth == 0){131 while(srcbuf[pos]!='>')132 pos++;133 pos = bitstream_scan(miscMarks,pos+1);134 135 if(pos!=chars_avail){136 fprintf(stderr,"illegal content after root element at position = %i\n",buf_base+pos);137 exit(-1);138 }139 }140 148 } 141 149 // start tag 142 else if(srcbuf[pos-1]=='<'){143 att_index = 0;144 if(depth<MAX_DEPTH){145 int end_pos = bitstream_scan(NameFollows,pos);146 tag_lgth_stack[depth] = end_pos-pos;147 tag_stack[depth] = &srcbuf[pos];148 if(end_pos<BUFFER_SIZE){149 depth++;150 else 
if(this->srcbuf[pos-1]=='<'){ 151 this->att_index = 0; 152 if(this->depth<MAX_DEPTH){ 153 int end_pos = bitstream_scan(this->NameFollows,pos); 154 this->tag_lgth_stack[this->depth] = end_pos-pos; 155 this->tag_stack[depth] = &this->srcbuf[pos]; 156 if(end_pos<BUF_SIZE){ 157 this->depth++; 150 158 } 151 159 else{ 152 state = InStartTag;153 finalStartPos = pos;160 this->state = InStartTag; 161 this->finalStartPos = pos; 154 162 } 155 163 } 156 164 else{ 157 fprintf(stderr,"Max nesting depth exceeded at position =%i. depth = %i\n",buf_base+pos,depth);165 fprintf(stderr,"Max nesting depth exceeded at position =%i. depth = %i\n",this->buf_base+pos, this->depth); 158 166 exit(-1); 159 167 } … … 161 169 // attribute 162 170 else{ 163 int end_pos = bitstream_scan(NameFollows,pos);164 if(end_pos<BUFFER_SIZE){165 if(lookup_or_insert(&srcbuf[pos], end_pos-pos)){166 fprintf(stderr,"Attribute name is not unique at position =%i.\n",buf_base+pos);171 int end_pos = bitstream_scan(this->NameFollows,pos); 172 if(end_pos<BUF_SIZE){ 173 if(lookup_or_insert(&this->srcbuf[pos], end_pos-pos)){ 174 fprintf(stderr,"Attribute name is not unique at position =%i.\n",this->buf_base+pos); 167 175 exit(-1); 168 176 } 169 177 } 170 178 else{ 171 state = InAttName;172 InAtt.start = &srcbuf[pos];173 InAtt.lgth = BUFFER_SIZE-pos;179 this->state = InAttName; 180 this->InAtt.start = &this->srcbuf[pos]; 181 this->InAtt.lgth = BUF_SIZE-pos; 174 182 } 175 183 } … … 177 185 } 178 186 179 180 int TagMatcher ::StreamScan(int chars_avail) {187 template <uint64_t BUF_SIZE, uint64_t OVER_SIZE> 188 int TagMatcher<BUF_SIZE, OVER_SIZE>::StreamScan(int chars_avail) { 181 189 182 190 int blk; … … 184 192 int block_pos = 0; 185 193 186 if(mode == StartOfFile){194 if(this->mode == StartOfFile){ 187 195 int pos = bitstream_scan(miscMarks,0); 188 196 if (pos==chars_avail){ 189 fprintf(stderr,"no element at position =%i.\n",buf_base+pos);197 fprintf(stderr,"no element at position =%i.\n",this->buf_base+pos); 190 198 
exit(-1); 191 199 } 192 if(srcbuf[pos-1]!='<'|| srcbuf[pos]=='!'||srcbuf[pos]=='/'){200 if(this->srcbuf[pos-1]!='<'|| this->srcbuf[pos]=='!'|| this->srcbuf[pos]=='/'){ 193 201 #ifdef DUMP 194 202 print_register<BitBlock>("srcbuf", bitblock::load_unaligned((BitBlock *) srcbuf)); … … 197 205 exit(-1); 198 206 } 199 mode = InFile;207 this->mode = InFile; 200 208 } 201 209 for (blk = 0; blk < blk_counts; blk++) { 202 scanword_t s = ((scanword_t*)tagMarks)[blk];210 scanword_t s = ((scanword_t*)this->tagMarks)[blk]; 203 211 while(s) { 204 212 int code = tag_match(scan_forward_zeroes(s) + block_pos, chars_avail); … … 212 220 } 213 221 214 void TagMatcher::store_streams(BitBlock tagMark, BitBlock NameFollow, BitBlock miscMark, int chars_avail){ 222 template <uint64_t BUF_SIZE, uint64_t OVER_SIZE> 223 void TagMatcher<BUF_SIZE, OVER_SIZE>::store_streams(BitBlock tagMark, BitBlock NameFollow, BitBlock miscMark, int chars_avail){ 215 224 #ifdef DUMP 216 225 print_register<BitBlock>("tagMark", tagMark); … … 220 229 printf("stream_index = %i\n", stream_index); 221 230 #endif 222 t agMarks[stream_index] = tagMark;223 miscMarks[stream_index] = simd_not(miscMark);224 NameFollows[stream_index] = NameFollow;225 stream_index++;226 if( stream_index==1){227 228 if ( InFinalEndTag == 1){231 this->tagMarks[this->stream_index] = tagMark; 232 this->miscMarks[this->stream_index] = simd_not(miscMark); 233 this->NameFollows[this->stream_index] = NameFollow; 234 this->stream_index++; 235 if(this->stream_index==1){ 236 237 if (this->InFinalEndTag == 1){ 229 238 int pos = -1; 230 while( srcbuf[pos]!='>'){239 while(this->srcbuf[pos]!='>'){ 231 240 pos++; 232 241 if(pos>=chars_avail){ 233 InFinalEndTag = 1;242 this->InFinalEndTag = 1; 234 243 return; 235 244 } 236 245 } 237 pos = bitstream_scan( miscMarks,pos+1);246 pos = bitstream_scan(this->miscMarks,pos+1); 238 247 #ifdef DUMP 239 248 print_register<BitBlock>("miscMarks[0]", miscMarks[0]); … … 246 255 } 247 256 248 if( state == InStartTag) {249 
state =Clear;250 int remain_lgth = bitstream_scan( NameFollows,0);251 memcpy(&t ags_buf[tags_buf_cur],srcbuf,remain_lgth);252 t ag_lgth_stack[depth] += remain_lgth;253 depth++;254 } 255 else if ( state == InEndTag) {256 state = Clear;257 int lgth = t ag_lgth_stack[depth];258 if (does_match(t ag_stack[depth]+inTagPos,srcbuf,lgth-inTagPos) && ((srcbuf[lgth-inTagPos] == '>') ||(srcbuf[lgth-inTagPos] <= ' '))) return ;257 if(this->state == InStartTag) { 258 this->state = this->Clear; 259 int remain_lgth = bitstream_scan(this->NameFollows,0); 260 memcpy(&this->tags_buf[this->tags_buf_cur],this->srcbuf,remain_lgth); 261 this->tag_lgth_stack[this->depth] += remain_lgth; 262 this->depth++; 263 } 264 else if (this->state == InEndTag) { 265 this->state = Clear; 266 int lgth = this->tag_lgth_stack[this->depth]; 267 if (does_match(this->tag_stack[this->depth]+this->inTagPos,this->srcbuf,lgth-this->inTagPos) && ((this->srcbuf[lgth-this->inTagPos] == '>') ||(this->srcbuf[lgth-this->inTagPos] <= ' '))) return ; 259 268 else { 260 269 fprintf(stderr,"tag name mismatch at position = %i\n",buf_base); … … 262 271 } 263 272 } 264 else if ( state == InAttName) {265 state = Clear;266 int remain_lgth = bitstream_scan( NameFollows,0);267 memcpy(&t ags_buf[tags_buf_cur],srcbuf,remain_lgth);268 if(lookup_or_insert( InAtt.start,InAtt.lgth+remain_lgth)){273 else if (this->state == InAttName) { 274 this->state = Clear; 275 int remain_lgth = bitstream_scan(this->NameFollows,0); 276 memcpy(&this->tags_buf[this->tags_buf_cur],this->srcbuf,remain_lgth); 277 if(lookup_or_insert(this->InAtt.start, this->InAtt.lgth+remain_lgth)){ 269 278 fprintf(stderr,"Attribute name is not unique at position =%i.\n",buf_base); 270 279 exit(-1); … … 274 283 } 275 284 276 TagMatcher::TagMatcher(){ 277 stream_index = 0; 278 depth = 0; 279 buf_base = 0; 280 state = Clear; 281 mode = StartOfFile; 282 InFinalEndTag = 0; 283 NameFollows[BUFFER_SIZE/BLOCK_SIZE]=simd<1>::constant<1>();// TODO - verify simd_const_1(1); 
//sentinel 284 } 285 286 287 288 TagMatcher::~TagMatcher(){ 289 290 } 291 292 void TagMatcher::setSrc(char * src){ 293 srcbuf = src; 294 } 295 296 void TagMatcher::Advance_buffer(){ 297 buf_base += BUFFER_SIZE; 298 stream_index=0; 299 tags_buf_cur = 0; 300 att_index = 0; 301 for(int i=0; i< depth; i++){ 302 if(&tags_buf[tags_buf_cur]!=tag_stack[i]) 303 memcpy(&tags_buf[tags_buf_cur],tag_stack[i],tag_lgth_stack[i]); 304 tag_stack[i] = &tags_buf[tags_buf_cur]; 305 tags_buf_cur += tag_lgth_stack[i]; 306 } 307 if(state == InStartTag) { 308 memcpy(&tags_buf[tags_buf_cur],&srcbuf[finalStartPos],tag_lgth_stack[depth]); 309 tag_stack[depth] = &tags_buf[tags_buf_cur]; 310 tags_buf_cur += tag_lgth_stack[depth]; 311 } 312 else if(state == InEndTag) { 313 memcpy(&tags_buf[tags_buf_cur],tag_stack[depth],tag_lgth_stack[depth]); 314 tag_stack[depth] = &tags_buf[tags_buf_cur]; 315 tags_buf_cur += tag_lgth_stack[depth]; 316 } 317 else if(state == InAttName) { 318 memcpy(&tags_buf[tags_buf_cur],InAtt.start,InAtt.lgth); 319 InAtt.start = &tags_buf[tags_buf_cur]; 320 tags_buf_cur += InAtt.lgth; 321 } 322 srcbuf[-1] = srcbuf[BUFFER_SIZE-1]; 323 } 285 template <uint64_t BUF_SIZE, uint64_t OVER_SIZE> 286 TagMatcher<BUF_SIZE, OVER_SIZE>::TagMatcher(){ 287 this->stream_index = 0; 288 this->depth = 0; 289 this->buf_base = 0; 290 this->state = Clear; 291 this->mode = StartOfFile; 292 this->InFinalEndTag = 0; 293 this->NameFollows[BUF_SIZE/BLOCK_SIZE]=simd<1>::constant<1>();// TODO - verify simd_const_1(1); //sentinel 294 } 295 296 template <uint64_t BUF_SIZE, uint64_t OVER_SIZE> 297 TagMatcher<BUF_SIZE, OVER_SIZE>::~TagMatcher(){ 298 299 } 300 301 template <uint64_t BUF_SIZE, uint64_t OVER_SIZE> 302 void TagMatcher<BUF_SIZE, OVER_SIZE>::setSrc(char * src){ 303 this->srcbuf = src; 304 } 305 306 template <uint64_t BUF_SIZE, uint64_t OVER_SIZE> 307 void TagMatcher<BUF_SIZE, OVER_SIZE>::Advance_buffer(){ 308 this->buf_base += BUF_SIZE; 309 this->stream_index=0; 310 this->tags_buf_cur = 0; 311 
this->att_index = 0; 312 for(int i=0; i< this->depth; i++){ 313 if(&this->tags_buf[this->tags_buf_cur]!=this->tag_stack[i]) 314 memcpy(&this->tags_buf[this->tags_buf_cur],this->tag_stack[i],this->tag_lgth_stack[i]); 315 this->tag_stack[i] = &this->tags_buf[tags_buf_cur]; 316 this->tags_buf_cur += this->tag_lgth_stack[i]; 317 } 318 if(this->state == InStartTag) { 319 memcpy(&this->tags_buf[this->tags_buf_cur],&this->srcbuf[this->finalStartPos],this->tag_lgth_stack[this->depth]); 320 this->tag_stack[depth] = &this->tags_buf[this->tags_buf_cur]; 321 this->tags_buf_cur += this->tag_lgth_stack[this->depth]; 322 } 323 else if(this->state == InEndTag) { 324 memcpy(&this->tags_buf[this->tags_buf_cur],this->tag_stack[this->depth],this->tag_lgth_stack[this->depth]); 325 this->tag_stack[depth] = &this->tags_buf[this->tags_buf_cur]; 326 this->tags_buf_cur += this->tag_lgth_stack[this->depth]; 327 } 328 else if(this->state == InAttName) { 329 memcpy(&this->tags_buf[this->tags_buf_cur],this->InAtt.start,this->InAtt.lgth); 330 this->InAtt.start = &this->tags_buf[tags_buf_cur]; 331 this->tags_buf_cur += this->InAtt.lgth; 332 } 333 this->srcbuf[-1] = this->srcbuf[BUF_SIZE-1]; 334 } 335 324 336 325 337 #endif /* TAGMATCHER_HPP_ */
