Changeset 366 for proto/u16u8/template.c


Ignore:
Timestamp:
Feb 24, 2010, 4:35:15 PM (9 years ago)
Author:
lindanl
Message:

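Buffered version: u16u8 now converts between in-memory source and target buffers instead of reading and writing FILE streams; fix the error report for a surrogate at a block boundary.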

File:
1 edited

Legend:

Unmodified
Added
Removed
  • proto/u16u8/template.c

    r361 r366  
    22#include <stdlib.h>
    33#include <errno.h>
     4#include <stdint.h>
     5#include <string.h>
     6#include <sys/types.h>
     7#include <sys/stat.h>
    48#include "lib/lib_simd.h"
    5 
     9#define min(x,y) ((x) <(y) ?(x) :(y) )  \
    610
    711typedef SIMD_type BytePack;
     
    1620#endif
    1721
     22/*===========================================================================*/
     23/* UErrorCode */
     24/* Extracted from ICU */
     25/*===========================================================================*/
     26
     27typedef enum UErrorCode {
     28
     29    U_ZERO_ERROR              =  0,     /**< No error, no warning. */
     30    U_TRUNCATED_CHAR_FOUND    = 11,     /**< Character conversion: Incomplete input sequence. */
     31    U_ILLEGAL_CHAR_FOUND      = 12,     /**< Character conversion: Illegal input sequence/combination of input units. */
     32} UErrorCode;
     33
     34typedef  uint16_t UChar;
    1835
    1936#define s2p_step(s0,s1,hi_mask,shift,p0,p1)  \
     
    173190delcounts_16 = simd_add_16_lh(delcounts_8, delcounts_8);
    174191sisd_store_aligned(simd_sub_16(simd_const_16(16), delcounts_16), (BytePack *) &u8_bytes_per_reg[0]);
    175 
    176192}
    177193
     
    210226#define BLOCK_SIZE 128
    211227
    212 void do_process(FILE *infile, FILE *outfile) {
     228void u16u8(char ** targetbuf, const char * targetlimit, const UChar ** srcbuf, const UChar * srclimit, UErrorCode * err){
    213229
    214230  @decl
    215231
    216   BytePack U16[16];
     232  BytePack * U16;
    217233  BytePack U16h[8];
    218234  BytePack U16l[8];
     
    226242  BitBlock U8[32];
    227243
    228   int block_pos = 0;
     244  int target_bckup_steps = 0;
    229245  int pos = 0;
    230246
    231   int bytes_read;
     247  int  units_read  = 0;
    232248
    233249  bool error_found = false;
    234250  int err_pos;
    235 //  while (bytes_read > 0) {
     251
     252
     253
    236254  do {
    237     pos = 0;
    238     bytes_read = fread(&U16[0], 1, BLOCK_SIZE*2, infile);
    239     if(bytes_read < BLOCK_SIZE*2){
    240         endmask = sisd_sll(simd_const_1(1),sisd_from_int(bytes_read/2));
    241 //      ((uint16_t *) U16)[bytes_read/2] = 0;
     255
     256    U16 = (BytePack *)(*srcbuf);
     257    units_read = min(srclimit-(*srcbuf),BLOCK_SIZE);
     258
     259    if(units_read < BLOCK_SIZE){
     260        endmask = sisd_sll(simd_const_1(1),sisd_from_int(units_read));
     261//      ((uint16_t *) U16)[units_read/2] = 0;
    242262    }
    243263    else endmask = simd_const_1(0);
    244264
    245     mask = simd_const_1(1);
    246265
    247266    for (int i=0; i< 8; i++){
    248       U16h[i] = simd_pack_16_ll(U16[i*2+1],U16[i*2]);
    249       U16l[i] = simd_pack_16_hh(U16[i*2+1],U16[i*2]);
     267      U16h[i] = simd_pack_16_ll(sisd_load_unaligned(&U16[i*2+1]),sisd_load_unaligned(&U16[i*2]));
     268      U16l[i] = simd_pack_16_hh(sisd_load_unaligned(&U16[i*2+1]),sisd_load_unaligned(&U16[i*2]));
    250269    }
    251270
     
    263282       err_pos = count_forward_zeroes(u16.error);
    264283       error_found = true;
    265         if ((err_pos * 2 == bytes_read)) {
    266           err_pos--;
    267            fprintf(stderr, "EOF with incomplete UTF-16 sequence at position %i in source.\n", block_pos + err_pos*2);
     284        if ((err_pos == units_read)) {
     285                err_pos--;
     286                *err = U_TRUNCATED_CHAR_FOUND;
     287                (*srcbuf) += err_pos;
    268288        }
    269289        else {
    270           if((((unsigned char *)U16h)[err_pos]< 0xD8) || (((unsigned char *)U16h)[err_pos] >= 0xE0))
    271             err_pos--;
    272            fprintf(stderr, "Illegal UTF-16 sequence at position %i in source.\n", block_pos + err_pos*2);
     290          if((((unsigned char *)U16h)[err_pos]< 0xDC) || (((unsigned char *)U16h)[err_pos] >= 0xE0))
     291                err_pos--;
     292                *err = U_ILLEGAL_CHAR_FOUND;
     293                (*srcbuf) += err_pos;
     294                if(err_pos<0){
     295                        target_bckup_steps = 2;
     296                        (*targetbuf) -= target_bckup_steps;
     297                        return;
     298                }
    273299        }
    274300        endmask = sisd_sll(simd_const_1(1),sisd_from_int(err_pos));
     
    293319
    294320    short u8_bytes_per_reg[8];
    295     char * U8_as_char = (char *) U8;
    296321
    297322    for (int i=0; i< 4; i++){
     
    300325       for(int k=0; k<8; k++) u8_bytes_per_reg[k] = 0;
    301326       del_count(delmask[i],u8_bytes_per_reg);
    302 //        print_bit_block("delmask",delmask[i]);
    303327 
    304328       for(int j=0; j<8; j++){
    305         sisd_store_unaligned(U8[i*8+j],(SIMD_type *) &U8_as_char[pos]);
    306         pos += u8_bytes_per_reg[j];
     329        sisd_store_unaligned(U8[i*8+j],(SIMD_type *) (*targetbuf));
     330        *targetbuf += u8_bytes_per_reg[j];
    307331       }
    308332    }
    309     fwrite(U8_as_char , 1 , pos , outfile );
    310 
    311     block_pos += BLOCK_SIZE*2;
    312 //     fwrite(U8_as_char , 1 , 512 , outfile );
    313   } while ((bytes_read == BLOCK_SIZE * 2) && !error_found);
    314     fclose(infile);
    315     fclose(outfile);
    316     if (error_found) {
    317         exit(-1);
    318     }
    319 
    320   fclose(infile);
    321   fclose(outfile);
     333
     334    if(!error_found) (*srcbuf) += units_read;
     335
     336    } while ((units_read == BLOCK_SIZE) && !error_found);
     337
    322338}
    323339
     
    327343int
    328344main(int argc, char * argv[]) {
    329   if (argc < 2) {
    330     printf("Usage: %s <filename> [<outputfile>]\n", argv[0]);
    331           exit(-1);
    332   }
    333   char * filename = argv[1];
    334 #ifdef BUFFER_PROFILING
    335   transcode_timer = init_BOM_timer(BUFFER_SIZE);
    336 #endif
    337   FILE *infile, *outfile;
    338   infile = fopen(filename, "rb");
    339   if (!infile) {
    340       fprintf(stderr, "Error: cannot open %s for input.\n", filename);
    341       exit(-1);
    342   }
    343 
    344   if (argc < 3) outfile = stdout;
    345   else {
    346     outfile = fopen(argv[2], "wb");
    347     if (!outfile) {
    348       fprintf(stderr, "Error: cannot open %s for writing.\n", argv[2]);
    349       exit(-1);
    350     }
    351   }
    352 
    353   do_process(infile, outfile);
    354 
    355 #ifdef BUFFER_PROFILING
    356   printf("Buffer conversion timing.\n");
    357   dump_BOM_table(transcode_timer);
    358 #endif
    359   return(0);
    360 }
     345        char * infilename, * outfilename;       
     346        FILE *infile, *outfile;
     347        struct stat fileinfo;
     348        const UChar * srcbuf;
     349        const UChar * srclimit;
     350        const char * targetlimit;
     351        char * targetbuf;
     352        char * target_head;
     353        UErrorCode status;
     354        int chars_read;
     355
     356        if (argc < 2) {
     357                printf("Usage: %s <filename> [<outputfile>]\n", argv[0]);
     358                exit(-1);
     359        }
     360
     361        infilename = argv[1];
     362        stat(infilename, &fileinfo);
     363        infile = fopen(infilename, "rb");
     364        if (!infile) {
     365                fprintf(stderr, "Error: cannot open %s for input.\n", infilename);
     366                exit(-1);
     367        }
     368       
     369        if (argc < 3) outfile = stdout;
     370        else {
     371                outfilename = argv[2];
     372                outfile = fopen(outfilename, "wb");
     373                if (!outfile) {
     374                        fprintf(stderr, "Error: cannot open %s for writing.\n", outfilename);
     375                        exit(-1);
     376                }
     377        }
     378
     379
     380        srcbuf = (UChar *) malloc(fileinfo.st_size+1);
     381        if (!srcbuf) {
     382              fprintf(stderr, "Error: buffer for %s of size %i cannot be created.\n", infilename, fileinfo.st_size+1);
     383              exit(-1);
     384        }
     385
     386        chars_read = fread((void *)srcbuf, 1, fileinfo.st_size, infile);
     387
     388        srclimit = srcbuf + chars_read/2;
     389//      (*srcbuf)[fileinfo.st_size] = '\0';
     390        fclose(infile);
     391
     392        targetbuf = (char *) malloc(chars_read*2);
     393        targetlimit = targetbuf + chars_read*2;
     394        target_head = targetbuf;
     395
     396        u16u8(&targetbuf, targetlimit, &srcbuf, srclimit, &status);
     397        fwrite(target_head , 1 ,  targetbuf - target_head, outfile );
     398
     399        if (status == U_ILLEGAL_CHAR_FOUND) {
     400                fprintf(stderr, "Illegal UTF-16 sequence at position %i in source.\n", chars_read-(srclimit-srcbuf)*2);
     401        }
     402        else if (status == U_TRUNCATED_CHAR_FOUND) {
     403                fprintf(stderr, "EOF with incomplete UTF-16 sequence at position %i in source.\n",chars_read-(srclimit-srcbuf)*2);
     404        }
     405
     406//      fclose(infile);
     407//      fclose(outfile);
     408
     409
     410
     411        return(0);
     412}
Note: See TracChangeset for help on using the changeset viewer.