source: u8u16/trunk/src/u8u16.c @ 5877

Last change on this file since 5877 was 5877, checked in by cameron, 14 months ago

Adding old u8u16 for Teradata

File size: 6.2 KB
Line 
1#include <stdio.h>
2#include <stdlib.h>
3#include <errno.h>
4#include <stdint.h>
5#ifndef UNICODE_CONVERT
6#ifndef _MSC_VER
7#include <iconv.h>
8#endif
9#endif
10
11#ifdef _MSC_VER
12//#undef BUFFER_PROFILING
13#undef STD_ICONV
14#endif
15// Profiling
16
17#ifdef BUFFER_PROFILING
18#include "../Profiling/BOM_Profiler.h"
19BOM_Table * transcode_timer;
20#endif
21
22/* Counting the number of blocks classified by the
23   maximum UTF-8 sequence length within the block. */
24#ifdef BLOCK_COUNTING
25int block_counts[4] = {0,0,0,0};
26#endif
27
/* Number of UTF-8 input bytes read and converted per pass. */
#define BUFFER_SIZE 2048
/* Capacity of the UTF-16 output buffer, in 16-bit units: one unit per input
   byte plus 16 units of slack.
   NOTE(review): the slack is presumably headroom for block-at-a-time stores
   in the fast converter -- confirm against libu8u16.c. */
#define U16BUFFER_UNITS (BUFFER_SIZE + 16)

/* Result-classification macros.
   Every expansion is fully parenthesized so that the macros compose safely
   with unary operators: without the outer parentheses,
   `!incomplete_sequence_error(r)` would parse as `(!x) == EINVAL`. */
#ifdef UNICODE_ORG_CONVERT
#undef STD_ICONV
#undef FAST_U8U16
#define error_in_result(rslt) ((rslt) != conversionOK)
#define illegal_sequence_error(rslt) ((rslt) == sourceIllegal)
#define incomplete_sequence_error(rslt) ((rslt) == sourceExhausted)
#endif

#ifndef UNICODE_ORG_CONVERT
/* iconv-style interface: failure is signalled by (size_t) -1 and the cause
   is read from the caller's saved errno copy, `u8u16_errno`. */
#define error_in_result(rslt) ((rslt) == (size_t) -1)
#define illegal_sequence_error(rslt) (u8u16_errno == EILSEQ)
#define incomplete_sequence_error(rslt) (u8u16_errno == EINVAL)


/* Exactly one of STD_ICONV / FAST_U8U16 is active in this configuration. */
#ifdef STD_ICONV
#undef FAST_U8U16
#endif
#ifndef STD_ICONV
#define FAST_U8U16 1
#endif
#endif
52
53// Include the conversion routines.
54#ifdef UNICODE_ORG_CONVERT
55//#include "Unicode.org/ConvertUTF.h"
56#include "Unicode.org/ConvertUTF.c"
57#endif
58#ifdef FAST_U8U16
59#include "libu8u16.c"
60#endif
61
62
63void do_UTF8toUTF16(FILE *infile, FILE *outfile) {
64
65#ifdef __GNUC__
66  unsigned char UTF8_buffer[BUFFER_SIZE] __attribute__((aligned(16)));
67#endif
68#ifdef _MSC_VER
69  __declspec(align(16)) unsigned char UTF8_buffer[BUFFER_SIZE];
70#endif
71  unsigned short UTF16_buffer[U16BUFFER_UNITS];
72  unsigned char * srcbuf_ptr = &UTF8_buffer[0];
73  unsigned short * trgtbuf_ptr = &UTF16_buffer[0];
74  size_t inbytes_left, outbytes_left;
75  int file_position = 0;
76
77  intptr_t chars_read, i, UTF16_chars;
78  int u8u16_errno;
79   
80#ifdef UNICODE_ORG_CONVERT
81   ConversionResult rslt;
82#endif
83#ifdef STD_ICONV
84   iconv_t cd = iconv_open("UTF-16BE", "UTF-8");
85   size_t rslt;
86#endif
87#ifdef FAST_U8U16
88   size_t rslt;
89#endif
90
91  chars_read = fread(&UTF8_buffer, 1, BUFFER_SIZE, infile);
92
93  while (chars_read > 0) {
94#ifdef BUFFER_PROFILING
95    start_BOM_interval(transcode_timer);
96#endif
97    srcbuf_ptr = &UTF8_buffer[0];
98    trgtbuf_ptr = &UTF16_buffer[0];
99    inbytes_left = chars_read;
100    outbytes_left = U16BUFFER_UNITS*2;
101#ifdef STD_ICONV
102    rslt = iconv(cd,
103                 (char **) &srcbuf_ptr, 
104                 &inbytes_left,
105                 (char **) &trgtbuf_ptr, 
106                 &outbytes_left);
107    u8u16_errno = errno;
108#endif
109#ifdef UNICODE_ORG_CONVERT
110    rslt = ConvertUTF8toUTF16((const UTF8**) &srcbuf_ptr, 
111                              srcbuf_ptr + chars_read,
112                              &trgtbuf_ptr, 
113                              &UTF16_buffer[BUFFER_SIZE],
114                              strictConversion);
115    inbytes_left = chars_read - (srcbuf_ptr - &UTF8_buffer[0]);
116#endif
117#ifdef FAST_U8U16
118#ifndef BUFFERED_U8U16
119    rslt = u8u16((char **) &srcbuf_ptr, 
120                 &inbytes_left,
121                 (char **) &trgtbuf_ptr, 
122                 &outbytes_left);
123#endif
124#ifdef BUFFERED_U8U16
125    rslt = buffered_u8u16((char **) &srcbuf_ptr, 
126                 &inbytes_left,
127                 (char **) &trgtbuf_ptr, 
128                 &outbytes_left);
129#endif
130    u8u16_errno = errno;
131#endif
132
133#ifdef BUFFER_PROFILING
134     end_BOM_interval(transcode_timer, chars_read - inbytes_left);
135#endif
136    file_position += chars_read - inbytes_left;
137    UTF16_chars = ((intptr_t) trgtbuf_ptr - (intptr_t) &UTF16_buffer[0])/2;
138    fwrite(&UTF16_buffer, 2, UTF16_chars, outfile);
139    if (error_in_result(rslt)) {
140      if (illegal_sequence_error(rslt)) {
141        int pos = 0;
142        fprintf(stderr, "Illegal UTF-8 sequence at position %i in source.\n", file_position);
143#ifdef DEBUG_ERROR
144        for (pos =0; (pos < 5) && (pos < inbytes_left); pos++) {
145          fprintf(stderr, " %02X", (unsigned char) srcbuf_ptr[pos]);
146        }
147        fprintf(stderr, "\n");
148#endif
149        fclose(infile);
150        fclose(outfile);
151        exit(-1);
152      }
153      if (!incomplete_sequence_error(rslt)) {
154        fprintf(stderr, "Unknown error %i at position %i in source.\n", errno, file_position);
155        fclose(infile);
156        fclose(outfile);
157        exit(-1);
158      }
159      // errno == EINVAL  or rslt == sourceExhausted
160      // Incomplete sequence at end of input buffer.
161      if (chars_read < BUFFER_SIZE) {
162        int pos = 0;
163        fprintf(stderr, "EOF with incomplete UTF-8 sequence at position %i in source.\n", file_position);
164#ifdef DEBUG_ERROR
165        for (pos =0; (pos < 5) && (pos < inbytes_left); pos++) {
166          fprintf(stderr, " %02X", (unsigned char) srcbuf_ptr[pos]);
167        }
168        fprintf(stderr, "\n");
169#endif
170        fclose(infile);
171        fclose(outfile);
172        exit(-1);
173      }
174      // Move unprocessed characters to beginning.
175      for (i = 0; i < inbytes_left; i++) {
176        UTF8_buffer[i] = UTF8_buffer[chars_read - inbytes_left + i];
177      }
178
179    }
180    chars_read = fread(&UTF8_buffer[inbytes_left], 1, BUFFER_SIZE-inbytes_left, infile);
181    chars_read += inbytes_left;
182  }
183#ifdef STD_ICONV
184  iconv_close(cd);
185#endif
186  fclose(infile);
187  fclose(outfile);
188}
189
190
191
192
/*
 * Command-line entry point: u8u16 <filename.u8> [<filename.u16>].
 *
 * Opens the named UTF-8 input file and converts it to UTF-16, writing to the
 * named output file or to stdout when no output name is given.  Exits with
 * status -1 on a usage error or when either file cannot be opened; returns 0
 * after a completed conversion.
 *
 * All diagnostics and optional statistics go to stderr: do_UTF8toUTF16 closes
 * the output stream, which is stdout itself when no output file was named, so
 * stdout must not be used after the conversion call.
 */
int
main(int argc, char * argv[]) {
  /* Declarations are kept ahead of all statements so that C89-only
     compilers accept the file. */
  char * filename;
  FILE *infile, *outfile;

  if (argc < 2) {
    fprintf(stderr, "Usage: %s <filename.u8> [<filename.u16>].\n", argv[0]);
    exit(-1);
  }
  filename = argv[1];
#ifdef BUFFER_PROFILING
  transcode_timer = init_BOM_timer();
#endif
  infile = fopen(filename, "rb");
  if (!infile) {
      fprintf(stderr, "Error: cannot open %s for input.\n", filename);
      exit(-1);
  }

  if (argc < 3) outfile = stdout;
  else {
    outfile = fopen(argv[2], "wb");
    if (!outfile) {
      fprintf(stderr, "Error: cannot open %s for writing.\n", argv[2]);
      fclose(infile);
      exit(-1);
    }
  }

  do_UTF8toUTF16(infile, outfile);

#ifdef BUFFER_PROFILING
  fprintf(stderr, "Buffer conversion timing.\n");
  /* NOTE(review): if dump_BOM_table writes to stdout it will hit a closed
     stream when no output file was named -- confirm in BOM_Profiler.h. */
  dump_BOM_table(transcode_timer);
#endif
#ifdef BLOCK_COUNTING
  fprintf(stderr, "%i bytes in pure ASCII blocks.\n", block_counts[0]);
  fprintf(stderr, "%i bytes in blocks confined to two-byte subplane.\n", block_counts[1]);
  fprintf(stderr, "%i bytes in blocks confined to basic multilingual plane.\n", block_counts[2]);
  fprintf(stderr, "%i bytes in blocks containing 4-byte UTF-8 sequences.\n", block_counts[3]);
#endif
  return(0);
}
Note: See TracBrowser for help on using the repository browser.