test_unicode.c
Go to the documentation of this file.
1 /* ========================================================================== */
2 /*! \file
3  * \brief Test of \c enc_convert_to_utf8_nfc() implementation
4  *
5  * Copyright (c) 2012-2024 by the developers. See the LICENSE file for details.
6  */
7 
8 
9 /* ========================================================================== */
10 /* Include headers */
11 
12 #include "posix.h" /* Include this first because of feature test macros */
13 
14 #include <stdio.h>
15 #include <string.h>
16 
17 #include "config.h"
18 
19 #include "encoding.h"
20 #include "test.h"
21 #include "test_unicode.h"
22 
23 
24 /* ========================================================================== */
25 /* Data types */
26 
/* States of the parser for the test data table (one state per record field) */
enum sm_state
{
   SM_SRC     = 0,  /* Collecting codepoints of the source field */
   SM_NFC     = 1,  /* Collecting codepoints of the NFC field */
   SM_NFD     = 2,  /* Collecting codepoints of the NFD field */
   SM_INVALID = 3   /* Not a valid parser state */
};
34 
35 
36 /* ========================================================================== */
37 /* Constants */
38 
/*
 * Buffer size in codepoint units
 *
 * The expansion is fully parenthesized so that the macro behaves like a single
 * primary expression in every context (e.g. as operand of sizeof).
 */
#define BUFSIZE  ((size_t) 16)

/*
 * Every codepoint can require up to 4 bytes in UTF-8
 * +1 for leading space
 * +1 for NUL termination
 */
#define BUFSIZE_UTF8  (BUFSIZE * (size_t) 4 + (size_t) 2)
48 
49 #include "uc_test_nfc.c"
50 
51 
52 /* ========================================================================== */
53 /*! \addtogroup TEST */
54 /*! @{ */
55 
56 
57 /* ========================================================================== */
/* Create octet representation for debug output
 *
 * \param[out] ob         Pointer to buffer for human readable octets
 * \param[in]  len_ob     Size of buffer at address \e ob
 * \param[in]  uc_string  Unicode string in UTF-8 format
 *
 * Every octet of \e uc_string is written as two uppercase hexadecimal digits.
 * The octets are separated by single spaces and the result is NUL-terminated
 * (example: "41 C3 A4").
 *
 * \e len_ob must be at least (3 * strlen( \e uc_string ))
 *
 * \note
 * If the buffer is too small for the complete representation, or if
 * \e uc_string is empty or \c NULL , an empty string is stored in \e ob .
 * Nothing is written if \e ob is \c NULL or \e len_ob is zero.
 */

void test_unicode_octets(char* ob, size_t len_ob, const char* uc_string)
{
   static const char hex[] = "0123456789ABCDEF";
   size_t len;
   size_t i;

   /* Without a usable buffer there is nothing that can be written safely */
   if (NULL == ob || 0U == len_ob)
   {
      return;
   }

   len = (NULL != uc_string) ? strlen(uc_string) : 0U;

   /*
    * Empty input must be handled here, otherwise the index of the final
    * terminator (3 * len - 1) would wrap around.
    * The size check uses a division so that it cannot overflow.
    */
   if (0U == len || len > len_ob / 3U)
   {
      ob[0] = 0;
      return;
   }

   for (i = 0; len > i; ++i)
   {
      unsigned int octet = (unsigned int) (unsigned char) uc_string[i];

      ob[3U * i]      = hex[(octet >> 4) & 0x0FU];
      ob[3U * i + 1U] = hex[octet & 0x0FU];
      ob[3U * i + 2U] = ' ';
   }

   /* Replace the trailing space with the terminator */
   ob[3U * len - 1U] = 0;
}
88 
89 
90 /* ========================================================================== */
91 /* Test Unicode conformance with official test data
92  *
93  * \param[in] record Record number
94  * \param[in] src_utf8 Source data
95  * \param[in] nfc_utf8 Data in normal form C (NFC)
96  * \param[in] nfd_utf8 Data in normal form D (NFD)
97  *
98  * According to the Unicode standard, NFC normalization must behave like this:
99  *
100  * nfc_utf8 == toNFC(src_utf8) == toNFC(nfc_utf8) == toNFC(nfd_utf8)
101  *
102  * \return
103  * - \c EXIT_SUCCESS on success
104  * - \c EXIT_FAILURE on error
105  */
106 
107 static int test_unicode_conformance(size_t record, const char* src_utf8,
108  const char* nfc_utf8,
109  const char* nfd_utf8)
110 {
111  int res = API_POSIX_EXIT_SUCCESS;
112  const char* buf1 = NULL;
113  const char* buf2 = NULL;
114  const char* buf3 = NULL;
115  const char* loc;
116  const char* input = NULL;
117  const char* err = NULL;
118 
119 #if 0
120  /* For debugging */
121  printf("================\nrecord: %u\n", (unsigned int) record);
122  printf("src_utf8: \"%s\"\n", src_utf8);
123  printf("nfc_utf8: \"%s\"\n", nfc_utf8);
124  printf("nfd_utf8: \"%s\"\n", nfd_utf8);
125  printf("================\n"),
126 #endif
127 
128  /* nfc_utf8 == toNFC(src_utf8) */
129  input = src_utf8;
130  buf1 = enc_convert_to_utf8_nfc(ENC_CS_UTF_8, input);
131  if(NULL == buf1)
132  {
133  print_error("Conversion SRC => NFC failed");
134  res = API_POSIX_EXIT_FAILURE;
135  }
136  else if(strcmp(nfc_utf8, buf1))
137  {
138  print_error("Result mismatch for SRC => NFC");
139  err = buf1;
140  res = API_POSIX_EXIT_FAILURE;
141  }
142 
143  /* nfc_utf8 == toNFC(nfc_utf8) */
144  if(API_POSIX_EXIT_SUCCESS == res)
145  {
146  input = nfc_utf8;
147  buf2 = enc_convert_to_utf8_nfc(ENC_CS_UTF_8, input);
148  if(NULL == buf2)
149  {
150  print_error("Conversion NFC => NFC failed");
151  res = API_POSIX_EXIT_FAILURE;
152  }
153  else if(strcmp(nfc_utf8, buf2))
154  {
155  print_error("Result mismatch for NFC => NFC");
156  err = buf2;
157  res = API_POSIX_EXIT_FAILURE;
158  }
159  }
160 
161  /* nfc_utf8 == toNFC(nfd_utf8) */
162  if(API_POSIX_EXIT_SUCCESS == res)
163  {
164  input = nfd_utf8;
165  buf3 = enc_convert_to_utf8_nfc(ENC_CS_UTF_8, input);
166  if(NULL == buf3)
167  {
168  print_error("Conversion NFD => NFC failed");
169  res = API_POSIX_EXIT_FAILURE;
170  }
171  else if(strcmp(nfc_utf8, buf3))
172  {
173  print_error("Result mismatch for NFD => NFC");
174  err = buf3;
175  res = API_POSIX_EXIT_FAILURE;
176  }
177  }
178 
179  /* For debugging */
180  if(API_POSIX_EXIT_SUCCESS != res && NULL != err)
181  {
182 #if CFG_USE_POSIX_API >= 200112 || CFG_USE_XSI
183  loc = api_posix_setlocale(API_POSIX_LC_CTYPE, "");
184 #else /* CFG_USE_POSIX_API >= 200112 || CFG_USE_XSI */
185  loc = NULL;
186 #endif /* CFG_USE_POSIX_API >= 200112 || CFG_USE_XSI */
187  if(NULL == loc)
188  {
189  print_error("Setting locale for debug messages failed");
190  }
191  else
192  {
193  /* Print Unicode data only if terminal use Unicode locale */
194  if(NULL == strstr(loc, "UTF") && NULL == strstr(loc, "utf"))
195  {
196  print_error(
197  "Debug messages can't be printed with current locale");
198  }
199  else
200  {
201 #define TEST_BUFSIZE 60U
202  char ob[TEST_BUFSIZE]; /* Buffer for human readable octets */
203 
204  fprintf(stderr, TEST_TAB "Record number in test data file: %lu\n",
205  (unsigned long int) record);
206  test_unicode_octets(ob, TEST_BUFSIZE, input);
207  fprintf(stderr, TEST_TAB "Input data: \"%s\" (%s)\n", input, ob);
208  test_unicode_octets(ob, TEST_BUFSIZE, err);
209  fprintf(stderr, TEST_TAB "Result is : \"%s\" (%s)\n", err, ob);
210  test_unicode_octets(ob, TEST_BUFSIZE, nfc_utf8);
211  fprintf(stderr, TEST_TAB "Should be : \"%s\" (%s)\n", nfc_utf8, ob);
212  }
213  }
214  }
215 
216  /* Release memory */
217  if(buf1 != src_utf8) { enc_free((void*) buf1); }
218  if(buf2 != nfc_utf8) { enc_free((void*) buf2); }
219  if(buf3 != nfd_utf8) { enc_free((void*) buf3); }
220 
221  return(res);
222 }
223 
224 
225 /* ========================================================================== */
226 /* Extract official Unicode test data records
227  *
228  * \return
229  * - \c EXIT_SUCCESS on success
230  * - \c EXIT_FAILURE on error
231  */
232 
233 static int test_unicode_part2(void)
234 {
235  int res = API_POSIX_EXIT_SUCCESS;
236  size_t i = 0;
237  long int ucp = -1L;
238  enum sm_state state = SM_SRC;
239  long int src[BUFSIZE];
240  long int nfc[BUFSIZE];
241  long int nfd[BUFSIZE];
242  size_t src_i = 0;
243  size_t nfc_i = 0;
244  size_t nfd_i = 0;
245  char src_utf8[BUFSIZE_UTF8];
246  char nfc_utf8[BUFSIZE_UTF8];
247  char nfd_utf8[BUFSIZE_UTF8];
248  size_t utf8_i;
249  size_t rec = 0;
250 
251  /* Assignment in truth expression is intended */
252  while(API_POSIX_EXIT_SUCCESS == res && -1 != (ucp = uc_test_nfc_table[i++]))
253  {
254  /* Test sequence parser (-2: Field separator, -3: Record separator) */
255  switch(state)
256  {
257  case SM_SRC:
258  {
259  if(-2L == ucp) { state = SM_NFC; }
260  else if(0L > ucp)
261  {
262  print_error("Invalid data found");
263  res = API_POSIX_EXIT_FAILURE;
264  }
265  else
266  {
267  if(BUFSIZE <= src_i)
268  {
269  print_error("SRC data buffer too small");
270  res = API_POSIX_EXIT_FAILURE;
271  }
272  else { src[src_i++] = ucp; }
273  }
274  break;
275  }
276  case SM_NFC:
277  {
278  if(-2L == ucp) { state = SM_NFD; }
279  else if(0L > ucp)
280  {
281  print_error("Invalid data found");
282  res = API_POSIX_EXIT_FAILURE;
283  }
284  else
285  {
286  if(BUFSIZE <= nfc_i)
287  {
288  print_error("NFC data buffer too small");
289  res = API_POSIX_EXIT_FAILURE;
290  }
291  else { nfc[nfc_i++] = ucp; }
292  }
293  break;
294  }
295  case SM_NFD:
296  {
297  if(-3L == ucp)
298  {
299  /* Data extraction from record complete, convert data to UTF-8 */
300  src_utf8[0] = ' '; utf8_i = 1;
301  enc_uc_encode_utf8(src_utf8, &utf8_i, src, &src_i);
302  src_utf8[utf8_i] = 0;
303  nfc_utf8[0] = ' '; utf8_i = 1;
304  enc_uc_encode_utf8(nfc_utf8, &utf8_i, nfc, &nfc_i);
305  nfc_utf8[utf8_i] = 0;
306  nfd_utf8[0] = ' '; utf8_i = 1;
307  enc_uc_encode_utf8(nfd_utf8, &utf8_i, nfd, &nfd_i);
308  nfd_utf8[utf8_i] = 0;
309  if(src_i || nfc_i || nfd_i)
310  {
311  print_error("Encoding test data to UTF-8 failed");
312  res = API_POSIX_EXIT_FAILURE;
313  }
314  else
315  {
316  /* Execute Unicode conformance checks */
317  res = test_unicode_conformance(rec++,
318  src_utf8, nfc_utf8, nfd_utf8);
319  /* Extract next record */
320  state = SM_SRC;
321  }
322  }
323  else if(0L > ucp)
324  {
325  print_error("Invalid data found");
326  res = API_POSIX_EXIT_FAILURE;
327  }
328  else
329  {
330  if(BUFSIZE <= nfd_i)
331  {
332  print_error("NFD data buffer too small");
333  res = API_POSIX_EXIT_FAILURE;
334  }
335  else { nfd[nfd_i++] = ucp; }
336  }
337  break;
338  }
339  default:
340  {
341  print_error("Parser state machine error");
342  res = API_POSIX_EXIT_FAILURE;
343  break;
344  }
345  }
346  }
347 
348  return(res);
349 }
350 
351 
352 /* ========================================================================== */
353 /*! \brief Test \c enc_convert_to_utf8_nfc() implementation
354  *
355  * \note
356  * The UTF-7 transformation format use base64 encoded UTF-16BE as internal
357  * representation. Therefore all Unicode codepoints beyond the BMP must be
358  * encoded using surrogate codepoints (that are forbidden in UTF-8).
359  *
360  * Part 1: The following cases are tested:
361  * - ASCII only (trivial)
362  * - Unicode already in NFC normalization (NFC quick check)
363  * - Unicode precomposed but with composition exception (requires lookup table)
364  * - Unicode with NFD normalization (trivial canonical composition)
365  * - Unicode with noncanonical order A (canonical reordering and composition)
366  * - Unicode with noncanonical order B (canonical reordering)
367  * - Unicode singleton (decomposition to another single codepoint)
368  * - Unicode algorithmic composition (used for hangul syllables)
369  * - UTF-7 to UTF-8 conversion (and conversion from NFD to NFC normalization)
370  * - UTF-7 to UTF-8 conversion (with codepoint that require surrogate pair)
371  * - UTF-7 to UTF-8 conversion (shift sequence terminated by SP or end-of-data)
372  *
373  * Part2: The Unicode normalization conformance test data file is used.
374  *
375  * \note
376  * For part 2 all test strings are prepended with a space because our
377  * normalization implementation will intentionally strip "defective combining
378  * character sequences" at the start of strings (even if they are not
379  * "ill-formed" according to the standard).
380  *
381  * \return
382  * - \c EXIT_SUCCESS on success
383  * - \c EXIT_FAILURE on error
384  */
385 
386 int test_unicode(void)
387 {
388 #define TS_NUM (size_t) 11 /* Number of test strings */
389 #define TS_UTF7 (size_t) 8 /* First index of UTF-7 section */
390  static const char* ts[TS_NUM] =
391  {
392  /* UTF-8 section */
393  "This is an ASCII string",
394  "This is an Unicode string: \xC3\xA4word",
395  "This is an Unicode string: \xE0\xAD\x9Cword",
396  "This is an Unicode string: a\xCC\x88word",
397  "This is an Unicode string: start\xCE\xB1\xCC\x94\xCC\x81\xCD\x85word",
398  "This is an Unicode string: start\xCE\xB1\xCC\x81\xCC\x94\xCD\x85word",
399  "This is an Unicode string: \xE2\x84\xA6word",
400  "Composition of hangul jamo: \xE1\x84\x91\xE1\x85\xB1\xE1\x86\xB6",
401  /* UTF-7 section */
402  "This is an Unicode string: hundertf+AHUDCA-nfzig",
403  "This is an Unicode string: Violinschl+APw-ssel (+2DTdHg-)",
404  "Shift sequence terminated by SP or end-of-data: A+AMQ- O+ANY U+ANw"
405  };
406  static const char* rs[TS_NUM] = {
407  /* UTF-8 section */
408  "This is an ASCII string",
409  "This is an Unicode string: \xC3\xA4word",
410  "This is an Unicode string: \xE0\xAC\xA1\xE0\xAC\xBCword",
411  "This is an Unicode string: \xC3\xA4word",
412  "This is an Unicode string: start\xE1\xBE\x85word",
413  "This is an Unicode string: start\xE1\xBE\xB4\xCC\x94word",
414  "This is an Unicode string: \xCE\xA9word",
415  "Composition of hangul jamo: \xED\x93\x9B",
416  /* UTF-7 section */
417  "This is an Unicode string: hundertf\xC3\xBCnfzig",
418  "This is an Unicode string: Violinschl\xC3\xBCssel (\xF0\x9D\x84\x9E)",
419  "Shift sequence terminated by SP or end-of-data: A\xC3\x84 O\xC3\x96 U\xC3\x9C"
420  };
421  int res = API_POSIX_EXIT_SUCCESS;
422  size_t i;
423  const char* buf;
424  const char* loc;
425 
426  /* Part 1: Check with internal test data */
427  for(i = 0; i < TS_NUM; ++i)
428  {
429  if(TS_UTF7 <= i)
430  {
432  }
433  else
434  {
436  }
437  if(NULL == buf)
438  {
439  print_error("Conversion of data to UTF-8 failed");
440  res = API_POSIX_EXIT_FAILURE;
441  break;
442  }
443  if(strcmp(rs[i], buf))
444  {
445  print_error("Result is not correct");
446  /* For debugging */
447 #if CFG_USE_POSIX_API >= 200112 || CFG_USE_XSI
448  loc = api_posix_setlocale(API_POSIX_LC_CTYPE, "");
449 #else /* CFG_USE_POSIX_API >= 200112 || CFG_USE_XSI */
450  loc = NULL;
451 #endif /* CFG_USE_POSIX_API >= 200112 || CFG_USE_XSI */
452  if(NULL == loc)
453  {
454  print_error("Setting locale for debug messages failed");
455  }
456  else
457  {
458  /* Print Unicode data only if terminal use Unicode locale */
459  if(NULL == strstr(loc, "UTF") && NULL == strstr(loc, "utf"))
460  {
461  print_error(
462  "Debug messages can't be printed with current locale");
463  }
464  else
465  {
466  fprintf(stderr, TEST_TAB "Input data: \"%s\"\n", ts[i]);
467  fprintf(stderr, TEST_TAB "Result is : \"%s\"\n", buf);
468  fprintf(stderr, TEST_TAB "Should be : \"%s\"\n", rs[i]);
469  }
470  }
471  res = API_POSIX_EXIT_FAILURE;
472  break;
473  }
474  if(buf != ts[i]) { enc_free((void*) buf); }
475  }
476 
477  /* Part 2: Check with external test data (from Unicode data file) */
478  if(API_POSIX_EXIT_SUCCESS == res)
479  {
480  res = test_unicode_part2();
481  }
482 
483  return(res);
484 }
485 
486 
487 /*! @} */
488 
489 /* EOF */
TEST_TAB
#define TEST_TAB
Tabulator to indent messages from test programs.
Definition: test.h:13
enc_free
void enc_free(void *p)
Free an object allocated by encoding module.
Definition: encoding.c:7856
enc_convert_to_utf8_nfc
const char * enc_convert_to_utf8_nfc(enum enc_mime_cs charset, const char *s)
Convert string from supported character set to Unicode (UTF-8 NFC)
Definition: encoding.c:4777
test_unicode
int test_unicode(void)
Test enc_convert_to_utf8_nfc() implementation.
Definition: test_unicode.c:386
enc_uc_encode_utf8
void enc_uc_encode_utf8(char *buf, size_t *i, long int *dbuf, size_t *di)
Encode Unicode codepoints to UTF-8.
Definition: encoding.c:572
ENC_CS_UTF_7
Definition: encoding.h:98
ENC_CS_UTF_8
Definition: encoding.h:99
print_error
void print_error(const char *)
Print error message.
Definition: main.cxx:276

Generated at 2026-01-27 using  doxygen