 /*
  * flnews V1.2.1
  *
  * Go to the documentation of this file.
  */
 /* ========================================================================== */
 /*! \file
  * \brief Test of \c enc_convert_to_utf8_nfc() implementation
  *
  * Copyright (c) 2012-2022 by the developers. See the LICENSE file for details.
  */
  
  
 /* ========================================================================== */
 /* Include headers */
  
 #include "posix.h"  /* Include this first because of feature test macros */
  
 #include <stdio.h>
 #include <string.h>
  
 #include "config.h"
  
 #include "encoding.h"
 #include "test.h"
 #include "test_unicode.h"
  
  
 /* ========================================================================== */
 /* Data types */
  
 enum sm_state
 {
    SM_SRC,
    SM_NFC,
    SM_NFD,
    SM_INVALID
 };
  
  
 /* ========================================================================== */
 /* Constants */
  
 /* Buffer size in codepoint units */
 #define BUFSIZE  (size_t) 16
  
 /*
  * Every codepoint can require up to 4 byte in UTF-8
  * +1 for leading space
  * +1 for NUL termination
  */
 #define BUFSIZE_UTF8  (BUFSIZE * (size_t) 4 + (size_t) 2)
  
 #include "uc_test_nfc.c"
  
  
 /* ========================================================================== */
 /*! \addtogroup TEST */
 /*! @{ */
  
  
 /* ========================================================================== */
 /* Test Unicode conformance with official test data
  *
  * \param[in] record    Record number
  * \param[in] src_utf8  Source data
  * \param[in] nfc_utf8  Data in normal form C (NFC)
  * \param[in] nfd_utf8  Data in normal form D (NFD)
  *
  * According to the Unicode standard, NFC normalization must behave like this:
  *
  *    nfc_utf8 == toNFC(src_utf8) == toNFC(nfc_utf8) == toNFC(nfd_utf8)
  *
  * \return
  * - \c EXIT_SUCCESS on success
  * - \c EXIT_FAILURE on error
  */
  
 static int  test_unicode_conformance(size_t  record, const char*  src_utf8,
                                      const char*  nfc_utf8,
                                      const char*  nfd_utf8)
 {
    int  res = POSIX_EXIT_SUCCESS;
    const char*  buf1 = NULL;
    const char*  buf2 = NULL;
    const char*  buf3 = NULL;
    const char*  loc;
    const char*  input = NULL;
    const char*  err = NULL;
  
 #if 0
    /* For debugging */
    printf("================\nrecord: %u\n", (unsigned int) record);
    printf("src_utf8: %s\n", src_utf8);
    printf("nfc_utf8: %s\n", nfc_utf8);
    printf("nfd_utf8: %s\n", nfd_utf8);
    printf("================\n"),
 #endif
  
    /* nfc_utf8 == toNFC(src_utf8) */
    input = src_utf8;
    buf1 = enc_convert_to_utf8_nfc(ENC_CS_UTF_8, input);
    if(NULL == buf1)
    {
       print_error("Conversion SRC => NFC failed");
       res = POSIX_EXIT_FAILURE;
    }
    else if(strcmp(nfc_utf8, buf1))
    {
       print_error("Result mismatch for SRC => NFC");
       err = buf1;
       res = POSIX_EXIT_FAILURE;
    }
  
    /* nfc_utf8 == toNFC(nfc_utf8) */
    if(POSIX_EXIT_SUCCESS == res)
    {
       input = nfc_utf8;
       buf2 = enc_convert_to_utf8_nfc(ENC_CS_UTF_8, input);
       if(NULL == buf2)
       {
          print_error("Conversion NFC => NFC failed");
          res = POSIX_EXIT_FAILURE;
       }
       else if(strcmp(nfc_utf8, buf2))
       {
          print_error("Result mismatch for NFC => NFC");
          err = buf2;
          res = POSIX_EXIT_FAILURE;
       }
    }
  
    /* nfc_utf8 == toNFC(nfd_utf8) */
    if(POSIX_EXIT_SUCCESS == res)
    {
       input = nfd_utf8;
       buf3 = enc_convert_to_utf8_nfc(ENC_CS_UTF_8, input);
       if(NULL == buf3)
       {
          print_error("Conversion NFD => NFC failed");
          res = POSIX_EXIT_FAILURE;
       }
       else if(strcmp(nfc_utf8, buf3))
       {
          print_error("Result mismatch for NFD => NFC");
          err = buf3;
          res = POSIX_EXIT_FAILURE;
       }
    }
  
    /* For debugging */
    if(POSIX_EXIT_SUCCESS != res && NULL != err)
    {
 #if CFG_USE_POSIX_API >= 200112 || CFG_USE_XSI
       loc = posix_setlocale(POSIX_LC_CTYPE, "");
 #else  /* CFG_USE_POSIX_API >= 200112 || CFG_USE_XSI */
       loc = NULL;
 #endif  /* CFG_USE_POSIX_API >= 200112 || CFG_USE_XSI */
       if(NULL == loc)
       {
          print_error("Setting locale for debug messages failed");
       }
       else
       {
          /* Print Unicode data only if terminal use Unicode locale */
          if(NULL == strstr(loc, "UTF") && NULL == strstr(loc, "utf"))
          {
             print_error(
                "Debug messages can't be printed with current locale");
          }
          else
          {
             fprintf(stderr, TEST_TAB "Record number in test data file: %lu\n",
                     (unsigned long int) record);
             fprintf(stderr, TEST_TAB "Input data: \"%s\"\n", input);
             fprintf(stderr, TEST_TAB "Result is : \"%s\"\n", err);
             fprintf(stderr, TEST_TAB "Should be : \"%s\"\n", nfc_utf8);
          }
       }
    }
  
    /* Release memory */
    if(buf1 != src_utf8)  { enc_free((void*) buf1); }
    if(buf2 != nfc_utf8)  { enc_free((void*) buf2); }
    if(buf3 != nfd_utf8)  { enc_free((void*) buf3); }
  
    return(res);
 }
  
  
 /* ========================================================================== */
 /* Extract official Unicode test data records
  *
  * \return
  * - \c EXIT_SUCCESS on success
  * - \c EXIT_FAILURE on error
  */
  
 static int  test_unicode_part2(void)
 {
    int  res = POSIX_EXIT_SUCCESS;
    size_t  i = 0;
    long int  ucp = -1L;
    enum sm_state  state = SM_SRC;
    long int  src[BUFSIZE];
    long int  nfc[BUFSIZE];
    long int  nfd[BUFSIZE];
    size_t  src_i = 0;
    size_t  nfc_i = 0;
    size_t  nfd_i = 0;
    char  src_utf8[BUFSIZE_UTF8];
    char  nfc_utf8[BUFSIZE_UTF8];
    char  nfd_utf8[BUFSIZE_UTF8];
    size_t  utf8_i;
    size_t  rec = 0;
  
    /* Assignment in truth expression is intended */
    while(POSIX_EXIT_SUCCESS == res && -1 != (ucp = uc_test_nfc_table[i++]))
    {
       /* Test sequence parser (-2: Field separator, -3: Record separator) */
       switch(state)
       {
          case SM_SRC:
          {
             if(-2L == ucp)  { state = SM_NFC; }
             else if(0L > ucp)
             {
                print_error("Invalid data found");
                res = POSIX_EXIT_FAILURE;
             }
             else
             {
                if(BUFSIZE <= src_i)
                {
                   print_error("SRC data buffer too small");
                   res = POSIX_EXIT_FAILURE;
                }
                else  { src[src_i++] = ucp; }
             }
             break;
          }
          case SM_NFC:
          {
             if(-2L == ucp)  { state = SM_NFD; }
             else if(0L > ucp)
             {
                print_error("Invalid data found");
                res = POSIX_EXIT_FAILURE;
             }
             else
             {
                if(BUFSIZE <= nfc_i)
                {
                   print_error("NFC data buffer too small");
                   res = POSIX_EXIT_FAILURE;
                }
                else  { nfc[nfc_i++] = ucp; }
             }
             break;
          }
          case SM_NFD:
          {
             if(-3L == ucp)
             {
                /* Data extraction from record complete, convert data to UTF-8 */
                src_utf8[0] = ' '; utf8_i = 1;
                enc_uc_encode_utf8(src_utf8, &utf8_i, src, &src_i);
                src_utf8[utf8_i] = 0;
                nfc_utf8[0] = ' '; utf8_i = 1;
                enc_uc_encode_utf8(nfc_utf8, &utf8_i, nfc, &nfc_i);
                nfc_utf8[utf8_i] = 0;
                nfd_utf8[0] = ' '; utf8_i = 1;
                enc_uc_encode_utf8(nfd_utf8, &utf8_i, nfd, &nfd_i);
                nfd_utf8[utf8_i] = 0;
                if(src_i || nfc_i || nfd_i)
                {
                   print_error("Encoding test data to UTF-8 failed");
                   res = POSIX_EXIT_FAILURE;
                }
                else
                {
                   /* Execute Unicode conformance checks */
                   res = test_unicode_conformance(rec++,
                                                  src_utf8, nfc_utf8, nfd_utf8);
                   /* Extract next record */
                   state = SM_SRC;
                }
             }
             else if(0L > ucp)
             {
                print_error("Invalid data found");
                res = POSIX_EXIT_FAILURE;
             }
             else
             {
                if(BUFSIZE <= nfd_i)
                {
                   print_error("NFD data buffer too small");
                   res = POSIX_EXIT_FAILURE;
                }
                else  { nfd[nfd_i++] = ucp; }
             }
             break;
          }
          default:
          {
             print_error("Parser state machine error");
             res = POSIX_EXIT_FAILURE;
             break;
          }
       }
    }
  
    return(res);
 }
  
  
 /* ========================================================================== */
 /*! \brief Test \c enc_convert_to_utf8_nfc() implementation
  *
  * \note
  * The UTF-7 transformation format uses base64 encoded UTF-16BE as internal
  * representation. Therefore all Unicode codepoints beyond the BMP must be
  * encoded using surrogate codepoints (that are forbidden in UTF-8).
  *
  * Part 1: The following cases are tested:
  * - ASCII only (trivial)
  * - Unicode already in NFC normalization (NFC quick check)
  * - Unicode precomposed but with composition exception (requires lookup table)
  * - Unicode with NFD normalization (trivial canonical composition)
  * - Unicode with noncanonical order A (canonical reordering and composition)
  * - Unicode with noncanonical order B (canonical reordering)
  * - Unicode singleton (decomposition to another single codepoint)
  * - Unicode algorithmic composition (used for hangul syllables)
  * - UTF-7 to UTF-8 conversion (and conversion from NFD to NFC normalization)
  * - UTF-7 to UTF-8 conversion (with a codepoint that requires a surrogate pair)
  * - UTF-7 to UTF-8 conversion (shift sequence terminated by SP or end-of-data)
  *
  * Part 2: The Unicode normalization conformance test data file is used.
  *
  * \note
  * For part 2 all test strings are prepended with a space because our
  * normalization implementation will intentionally strip "defective combining
  * character sequences" at the start of strings (even if they are not
  * "ill-formed" according to the standard).
  *
  * \return
  * - \c EXIT_SUCCESS on success
  * - \c EXIT_FAILURE on error
  */
  
 int  test_unicode(void)
 {
 #define TS_NUM  (size_t) 11  /* Number of test strings */
 #define TS_UTF7  (size_t) 8  /* First index of UTF-7 section */
    static const char*  ts[TS_NUM] =
    {
       /* UTF-8 section */
       "This is an ASCII string",
       "This is an Unicode string: \xC3\xA4word",
       "This is an Unicode string: \xE0\xAD\x9Cword",
       "This is an Unicode string: a\xCC\x88word",
       "This is an Unicode string: start\xCE\xB1\xCC\x94\xCC\x81\xCD\x85word",
       "This is an Unicode string: start\xCE\xB1\xCC\x81\xCC\x94\xCD\x85word",
       "This is an Unicode string: \xE2\x84\xA6word",
       "Composition of hangul jamo: \xE1\x84\x91\xE1\x85\xB1\xE1\x86\xB6",
       /* UTF-7 section */
       "This is an Unicode string: hundertf+AHUDCA-nfzig",
       "This is an Unicode string: Violinschl+APw-ssel (+2DTdHg-)",
       "Shift sequence terminated by SP or end-of-data: A+AMQ- O+ANY U+ANw"
    };
    static const char*  rs[TS_NUM] =  {
       /* UTF-8 section */
       "This is an ASCII string",
       "This is an Unicode string: \xC3\xA4word",
       "This is an Unicode string: \xE0\xAC\xA1\xE0\xAC\xBCword",
       "This is an Unicode string: \xC3\xA4word",
       "This is an Unicode string: start\xE1\xBE\x85word",
       "This is an Unicode string: start\xE1\xBE\xB4\xCC\x94word",
       "This is an Unicode string: \xCE\xA9word",
       "Composition of hangul jamo: \xED\x93\x9B",
       /* UTF-7 section */
       "This is an Unicode string: hundertf\xC3\xBCnfzig",
       "This is an Unicode string: Violinschl\xC3\xBCssel (\xF0\x9D\x84\x9E)",
       "Shift sequence terminated by SP or end-of-data: A\xC3\x84 O\xC3\x96 U\xC3\x9C"
    };
    int  res = POSIX_EXIT_SUCCESS;
    size_t  i;
    const char*  buf;
    const char*  loc;
  
    /* Part 1: Check with internal test data */
    for(i = 0; i < TS_NUM; ++i)
    {
       if(TS_UTF7 <= i)
       {
          buf = enc_convert_to_utf8_nfc(ENC_CS_UTF_7, ts[i]);
       }
       else
       {
          buf = enc_convert_to_utf8_nfc(ENC_CS_UTF_8, ts[i]);
       }
       if(NULL == buf)
       {
          print_error("Conversion of data to UTF-8 failed");
          res = POSIX_EXIT_FAILURE;
          break;
       }
       if(strcmp(rs[i], buf))
       {
          print_error("Result is not correct");
          /* For debugging */
 #if CFG_USE_POSIX_API >= 200112 || CFG_USE_XSI
          loc = posix_setlocale(POSIX_LC_CTYPE, "");
 #else  /* CFG_USE_POSIX_API >= 200112 || CFG_USE_XSI */
          loc = NULL;
 #endif  /* CFG_USE_POSIX_API >= 200112 || CFG_USE_XSI */
          if(NULL == loc)
          {
             print_error("Setting locale for debug messages failed");
          }
          else
          {
             /* Print Unicode data only if terminal use Unicode locale */
             if(NULL == strstr(loc, "UTF") && NULL == strstr(loc, "utf"))
             {
                print_error(
                   "Debug messages can't be printed with current locale");
             }
             else
             {
                fprintf(stderr, TEST_TAB "Input data: \"%s\"\n", ts[i]);
                fprintf(stderr, TEST_TAB "Result is : \"%s\"\n", buf);
                fprintf(stderr, TEST_TAB "Should be : \"%s\"\n", rs[i]);
             }
          }
          res = POSIX_EXIT_FAILURE;
          break;
       }
       if(buf != ts[i])  { enc_free((void*) buf); }
    }
  
    /* Part 2: Check with external test data (from Unicode data file) */
    if(POSIX_EXIT_SUCCESS == res)
    {
       res = test_unicode_part2();
    }
  
    return(res);
 }
  
  
 /*! @} */
  
 /* EOF */