test_unicode.c
Go to the documentation of this file.
1 /* ========================================================================== */
2 /*! \file
3  * \brief Test of \c enc_convert_to_utf8_nfc() implementation
4  *
5  * Copyright (c) 2012-2022 by the developers. See the LICENSE file for details.
6  */
7 
8 
9 /* ========================================================================== */
10 /* Include headers */
11 
12 #include "posix.h" /* Include this first because of feature test macros */
13 
14 #include <stdio.h>
15 #include <string.h>
16 
17 #include "config.h"
18 
19 #include "encoding.h"
20 #include "test.h"
21 #include "test_unicode.h"
22 
23 
24 /* ========================================================================== */
25 /* Data types */
26 
27 enum sm_state /* Parser states used while extracting test data records */
28 {
29  SM_SRC, /* Collecting codepoints of the source field */
30  SM_NFC, /* Collecting codepoints of the NFC field */
31  SM_NFD, /* Collecting codepoints of the NFD field (up to the record separator) */
32  SM_INVALID /* Not referenced by the code visible in this file */
33 };
34 
35 
36 /* ========================================================================== */
37 /* Constants */
38 
/*
 * Buffer size in codepoint units
 *
 * The replacement list is fully parenthesized so that the macro behaves like
 * a single primary expression in every context (e.g. as operand of sizeof).
 */
#define BUFSIZE  ((size_t) 16)

/*
 * Buffer size in bytes for the UTF-8 representation of BUFSIZE codepoints
 *
 * Every codepoint can require up to 4 bytes in UTF-8
 * +1 for leading space
 * +1 for NUL termination
 */
#define BUFSIZE_UTF8  (BUFSIZE * (size_t) 4 + (size_t) 2)
48 
49 #include "uc_test_nfc.c"
50 
51 
52 /* ========================================================================== */
53 /*! \addtogroup TEST */
54 /*! @{ */
55 
56 
57 /* ========================================================================== */
58 /* Test Unicode conformance with official test data
59  *
60  * \param[in] record Record number
61  * \param[in] src_utf8 Source data
62  * \param[in] nfc_utf8 Data in normal form C (NFC)
63  * \param[in] nfd_utf8 Data in normal form D (NFD)
64  *
65  * According to the Unicode standard, NFC normalization must behave like this:
66  *
67  * nfc_utf8 == toNFC(src_utf8) == toNFC(nfc_utf8) == toNFC(nfd_utf8)
68  *
69  * \return
70  * - \c EXIT_SUCCESS on success
71  * - \c EXIT_FAILURE on error
72  */
73 
74 static int test_unicode_conformance(size_t record, const char* src_utf8,
75  const char* nfc_utf8,
76  const char* nfd_utf8)
77 {
78  int res = POSIX_EXIT_SUCCESS;
79  const char* buf1 = NULL;
80  const char* buf2 = NULL;
81  const char* buf3 = NULL;
82  const char* loc;
83  const char* input = NULL;
84  const char* err = NULL;
85 
86 #if 0
87  /* For debugging */
88  printf("================\nrecord: %u\n", (unsigned int) record);
89  printf("src_utf8: %s\n", src_utf8);
90  printf("nfc_utf8: %s\n", nfc_utf8);
91  printf("nfd_utf8: %s\n", nfd_utf8);
92  printf("================\n"),
93 #endif
94 
95  /* nfc_utf8 == toNFC(src_utf8) */
96  input = src_utf8;
98  if(NULL == buf1)
99  {
100  print_error("Conversion SRC => NFC failed");
101  res = POSIX_EXIT_FAILURE;
102  }
103  else if(strcmp(nfc_utf8, buf1))
104  {
105  print_error("Result mismatch for SRC => NFC");
106  err = buf1;
107  res = POSIX_EXIT_FAILURE;
108  }
109 
110  /* nfc_utf8 == toNFC(nfc_utf8) */
111  if(POSIX_EXIT_SUCCESS == res)
112  {
113  input = nfc_utf8;
114  buf2 = enc_convert_to_utf8_nfc(ENC_CS_UTF_8, input);
115  if(NULL == buf2)
116  {
117  print_error("Conversion NFC => NFC failed");
118  res = POSIX_EXIT_FAILURE;
119  }
120  else if(strcmp(nfc_utf8, buf2))
121  {
122  print_error("Result mismatch for NFC => NFC");
123  err = buf2;
124  res = POSIX_EXIT_FAILURE;
125  }
126  }
127 
128  /* nfc_utf8 == toNFC(nfd_utf8) */
129  if(POSIX_EXIT_SUCCESS == res)
130  {
131  input = nfd_utf8;
132  buf3 = enc_convert_to_utf8_nfc(ENC_CS_UTF_8, input);
133  if(NULL == buf3)
134  {
135  print_error("Conversion NFD => NFC failed");
136  res = POSIX_EXIT_FAILURE;
137  }
138  else if(strcmp(nfc_utf8, buf3))
139  {
140  print_error("Result mismatch for NFD => NFC");
141  err = buf3;
142  res = POSIX_EXIT_FAILURE;
143  }
144  }
145 
146  /* For debugging */
147  if(POSIX_EXIT_SUCCESS != res && NULL != err)
148  {
149 #if CFG_USE_POSIX_API >= 200112 || CFG_USE_XSI
150  loc = posix_setlocale(POSIX_LC_CTYPE, "");
151 #else /* CFG_USE_POSIX_API >= 200112 || CFG_USE_XSI */
152  loc = NULL;
153 #endif /* CFG_USE_POSIX_API >= 200112 || CFG_USE_XSI */
154  if(NULL == loc)
155  {
156  print_error("Setting locale for debug messages failed");
157  }
158  else
159  {
160  /* Print Unicode data only if terminal use Unicode locale */
161  if(NULL == strstr(loc, "UTF") && NULL == strstr(loc, "utf"))
162  {
163  print_error(
164  "Debug messages can't be printed with current locale");
165  }
166  else
167  {
168  fprintf(stderr, TEST_TAB "Record number in test data file: %lu\n",
169  (unsigned long int) record);
170  fprintf(stderr, TEST_TAB "Input data: \"%s\"\n", input);
171  fprintf(stderr, TEST_TAB "Result is : \"%s\"\n", err);
172  fprintf(stderr, TEST_TAB "Should be : \"%s\"\n", nfc_utf8);
173  }
174  }
175  }
176 
177  /* Release memory */
178  if(buf1 != src_utf8) { enc_free((void*) buf1); }
179  if(buf2 != nfc_utf8) { enc_free((void*) buf2); }
180  if(buf3 != nfd_utf8) { enc_free((void*) buf3); }
181 
182  return(res);
183 }
184 
185 
186 /* ========================================================================== */
187 /* Extract official Unicode test data records
188  *
189  * \return
190  * - \c EXIT_SUCCESS on success
191  * - \c EXIT_FAILURE on error
192  */
193 
194 static int test_unicode_part2(void)
195 {
196  int res = POSIX_EXIT_SUCCESS;
197  size_t i = 0;
198  long int ucp = -1L;
199  enum sm_state state = SM_SRC;
200  long int src[BUFSIZE];
201  long int nfc[BUFSIZE];
202  long int nfd[BUFSIZE];
203  size_t src_i = 0;
204  size_t nfc_i = 0;
205  size_t nfd_i = 0;
206  char src_utf8[BUFSIZE_UTF8];
207  char nfc_utf8[BUFSIZE_UTF8];
208  char nfd_utf8[BUFSIZE_UTF8];
209  size_t utf8_i;
210  size_t rec = 0;
211 
212  /* Assignment in truth expression is intended */
213  while(POSIX_EXIT_SUCCESS == res && -1 != (ucp = uc_test_nfc_table[i++]))
214  {
215  /* Test sequence parser (-2: Field separator, -3: Record separator) */
216  switch(state)
217  {
218  case SM_SRC:
219  {
220  if(-2L == ucp) { state = SM_NFC; }
221  else if(0L > ucp)
222  {
223  print_error("Invalid data found");
224  res = POSIX_EXIT_FAILURE;
225  }
226  else
227  {
228  if(BUFSIZE <= src_i)
229  {
230  print_error("SRC data buffer too small");
231  res = POSIX_EXIT_FAILURE;
232  }
233  else { src[src_i++] = ucp; }
234  }
235  break;
236  }
237  case SM_NFC:
238  {
239  if(-2L == ucp) { state = SM_NFD; }
240  else if(0L > ucp)
241  {
242  print_error("Invalid data found");
243  res = POSIX_EXIT_FAILURE;
244  }
245  else
246  {
247  if(BUFSIZE <= nfc_i)
248  {
249  print_error("NFC data buffer too small");
250  res = POSIX_EXIT_FAILURE;
251  }
252  else { nfc[nfc_i++] = ucp; }
253  }
254  break;
255  }
256  case SM_NFD:
257  {
258  if(-3L == ucp)
259  {
260  /* Data extraction from record complete, convert data to UTF-8 */
261  src_utf8[0] = ' '; utf8_i = 1;
262  enc_uc_encode_utf8(src_utf8, &utf8_i, src, &src_i);
263  src_utf8[utf8_i] = 0;
264  nfc_utf8[0] = ' '; utf8_i = 1;
265  enc_uc_encode_utf8(nfc_utf8, &utf8_i, nfc, &nfc_i);
266  nfc_utf8[utf8_i] = 0;
267  nfd_utf8[0] = ' '; utf8_i = 1;
268  enc_uc_encode_utf8(nfd_utf8, &utf8_i, nfd, &nfd_i);
269  nfd_utf8[utf8_i] = 0;
270  if(src_i || nfc_i || nfd_i)
271  {
272  print_error("Encoding test data to UTF-8 failed");
273  res = POSIX_EXIT_FAILURE;
274  }
275  else
276  {
277  /* Execute Unicode conformance checks */
278  res = test_unicode_conformance(rec++,
279  src_utf8, nfc_utf8, nfd_utf8);
280  /* Extract next record */
281  state = SM_SRC;
282  }
283  }
284  else if(0L > ucp)
285  {
286  print_error("Invalid data found");
287  res = POSIX_EXIT_FAILURE;
288  }
289  else
290  {
291  if(BUFSIZE <= nfd_i)
292  {
293  print_error("NFD data buffer too small");
294  res = POSIX_EXIT_FAILURE;
295  }
296  else { nfd[nfd_i++] = ucp; }
297  }
298  break;
299  }
300  default:
301  {
302  print_error("Parser state machine error");
303  res = POSIX_EXIT_FAILURE;
304  break;
305  }
306  }
307  }
308 
309  return(res);
310 }
311 
312 
313 /* ========================================================================== */
314 /*! \brief Test \c enc_convert_to_utf8_nfc() implementation
315  *
316  * \note
317  * The UTF-7 transformation format use base64 encoded UTF-16BE as internal
318  * representation. Therefore all Unicode codepoints beyond the BMP must be
319  * encoded using surrogate codepoints (that are forbidden in UTF-8).
320  *
321  * Part 1: The following cases are tested:
322  * - ASCII only (trivial)
323  * - Unicode already in NFC normalization (NFC quick check)
324  * - Unicode precomposed but with composition exception (requires lookup table)
325  * - Unicode with NFD normalization (trivial canonical composition)
326  * - Unicode with noncanonical order A (canonical reordering and composition)
327  * - Unicode with noncanonical order B (canonical reordering)
328  * - Unicode singleton (decomposition to another single codepoint)
329  * - Unicode algorithmic composition (used for hangul syllables)
330  * - UTF-7 to UTF-8 conversion (and conversion from NFD to NFC normalization)
331  * - UTF-7 to UTF-8 conversion (with codepoint that require surrogate pair)
332  * - UTF-7 to UTF-8 conversion (shift sequence terminated by SP or end-of-data)
333  *
334  * Part2: The Unicode normalization conformance test data file is used.
335  *
336  * \note
337  * For part 2 all test strings are prepended with a space because our
338  * normalization implementation will intentionally strip "defective combining
339  * character sequences" at the start of strings (even if they are not
340  * "ill-formed" according to the standard).
341  *
342  * \return
343  * - \c EXIT_SUCCESS on success
344  * - \c EXIT_FAILURE on error
345  */
346 
347 int test_unicode(void)
348 {
349 #define TS_NUM (size_t) 11 /* Number of test strings */
350 #define TS_UTF7 (size_t) 8 /* First index of UTF-7 section */
351  static const char* ts[TS_NUM] =
352  {
353  /* UTF-8 section */
354  "This is an ASCII string",
355  "This is an Unicode string: \xC3\xA4word",
356  "This is an Unicode string: \xE0\xAD\x9Cword",
357  "This is an Unicode string: a\xCC\x88word",
358  "This is an Unicode string: start\xCE\xB1\xCC\x94\xCC\x81\xCD\x85word",
359  "This is an Unicode string: start\xCE\xB1\xCC\x81\xCC\x94\xCD\x85word",
360  "This is an Unicode string: \xE2\x84\xA6word",
361  "Composition of hangul jamo: \xE1\x84\x91\xE1\x85\xB1\xE1\x86\xB6",
362  /* UTF-7 section */
363  "This is an Unicode string: hundertf+AHUDCA-nfzig",
364  "This is an Unicode string: Violinschl+APw-ssel (+2DTdHg-)",
365  "Shift sequence terminated by SP or end-of-data: A+AMQ- O+ANY U+ANw"
366  };
367  static const char* rs[TS_NUM] = {
368  /* UTF-8 section */
369  "This is an ASCII string",
370  "This is an Unicode string: \xC3\xA4word",
371  "This is an Unicode string: \xE0\xAC\xA1\xE0\xAC\xBCword",
372  "This is an Unicode string: \xC3\xA4word",
373  "This is an Unicode string: start\xE1\xBE\x85word",
374  "This is an Unicode string: start\xE1\xBE\xB4\xCC\x94word",
375  "This is an Unicode string: \xCE\xA9word",
376  "Composition of hangul jamo: \xED\x93\x9B",
377  /* UTF-7 section */
378  "This is an Unicode string: hundertf\xC3\xBCnfzig",
379  "This is an Unicode string: Violinschl\xC3\xBCssel (\xF0\x9D\x84\x9E)",
380  "Shift sequence terminated by SP or end-of-data: A\xC3\x84 O\xC3\x96 U\xC3\x9C"
381  };
382  int res = POSIX_EXIT_SUCCESS;
383  size_t i;
384  const char* buf;
385  const char* loc;
386 
387  /* Part 1: Check with internal test data */
388  for(i = 0; i < TS_NUM; ++i)
389  {
390  if(TS_UTF7 <= i)
391  {
393  }
394  else
395  {
397  }
398  if(NULL == buf)
399  {
400  print_error("Conversion of data to UTF-8 failed");
401  res = POSIX_EXIT_FAILURE;
402  break;
403  }
404  if(strcmp(rs[i], buf))
405  {
406  print_error("Result is not correct");
407  /* For debugging */
408 #if CFG_USE_POSIX_API >= 200112 || CFG_USE_XSI
409  loc = posix_setlocale(POSIX_LC_CTYPE, "");
410 #else /* CFG_USE_POSIX_API >= 200112 || CFG_USE_XSI */
411  loc = NULL;
412 #endif /* CFG_USE_POSIX_API >= 200112 || CFG_USE_XSI */
413  if(NULL == loc)
414  {
415  print_error("Setting locale for debug messages failed");
416  }
417  else
418  {
419  /* Print Unicode data only if terminal use Unicode locale */
420  if(NULL == strstr(loc, "UTF") && NULL == strstr(loc, "utf"))
421  {
422  print_error(
423  "Debug messages can't be printed with current locale");
424  }
425  else
426  {
427  fprintf(stderr, TEST_TAB "Input data: \"%s\"\n", ts[i]);
428  fprintf(stderr, TEST_TAB "Result is : \"%s\"\n", buf);
429  fprintf(stderr, TEST_TAB "Should be : \"%s\"\n", rs[i]);
430  }
431  }
432  res = POSIX_EXIT_FAILURE;
433  break;
434  }
435  if(buf != ts[i]) { enc_free((void*) buf); }
436  }
437 
438  /* Part 2: Check with external test data (from Unicode data file) */
439  if(POSIX_EXIT_SUCCESS == res)
440  {
441  res = test_unicode_part2();
442  }
443 
444  return(res);
445 }
446 
447 
448 /*! @} */
449 
450 /* EOF */
TEST_TAB
#define TEST_TAB
Tabulator to indent messages from test programs.
Definition: test.h:13
enc_free
void enc_free(void *p)
Free an object allocated by encoding module.
Definition: encoding.c:8868
enc_convert_to_utf8_nfc
const char * enc_convert_to_utf8_nfc(enum enc_mime_cs charset, const char *s)
Convert string from supported character set to Unicode (UTF-8 NFC)
Definition: encoding.c:5788
test_unicode
int test_unicode(void)
Test enc_convert_to_utf8_nfc() implementation.
Definition: test_unicode.c:347
enc_uc_encode_utf8
void enc_uc_encode_utf8(char *buf, size_t *i, long int *dbuf, size_t *di)
Encode Unicode codepoints to UTF-8.
Definition: encoding.c:1008
ENC_CS_UTF_7
Definition: encoding.h:98
ENC_CS_UTF_8
Definition: encoding.h:99
print_error
void print_error(const char *)
Print error message.
Definition: main.cxx:276

Generated at 2024-04-27 using  doxygen