encoding.c
Go to the documentation of this file.
1 /* ========================================================================== */
2 /*! \file
3  * \brief Shared encoding related functions
4  *
5  * Copyright (c) 2012-2024 by the developers. See the LICENSE file for details.
6  *
7  * If nothing else is specified, functions return zero to indicate success
8  * and a negative value to indicate an error.
9  */
10 
11 
12 /* ========================================================================== */
13 /* Include headers */
14 
15 #include "posix.h" /* Include this first because of feature test macros */
16 
17 #include <ctype.h>
18 #include <stddef.h>
19 #include <string.h>
20 
21 #include <libbasexx-0/base64_encode.h>
22 #include <libbasexx-0/base64_decode.h>
23 #include <libjpiconv-0/iconv.h>
24 #include <libssiconv-0/iconv.h>
25 #include <libuciconv-0/iconv.h>
26 
27 #include "conf.h"
28 #include "encoding.h"
29 #include "fileutils.h"
30 #include "main.h"
31 
32 
33 /* ========================================================================== */
34 /*! \defgroup ENCODING ENC: Codeset and header field handling
35  *
36  * The functions in this group should be conformant to the following standards:
37  * ANSI X3.4,
38  * ISO 2022, ISO 8601, ISO 8859, ISO 10646,
39  * RFC 1468, RFC 2045, RFC 2046, RFC 2047, RFC 2049, RFC 2152, RFC 2183,
40  * RFC 2231, RFC 2646, RFC 3629, RFC 3676, RFC 5198, RFC 5536, RFC 6657,
41  * POSIX.1-1996,
42  * Unicode 14.0.0
43  *
44  * \todo
45  * We don't use \c iconv() because on old operating systems there may be no
46  * Unicode support. And even on such old machines we don't want an external
47  * dependency from GNU iconv.
48  * <br>
49  * There should be an option to use the system's \c iconv() on request.
50  */
51 /*! @{ */
52 
53 
54 /* ========================================================================== */
55 /* Constants */
56 
/*! \brief Message prefix for ENCODING module */
#define MAIN_ERR_PREFIX  "ENC: "

/* Define this to nonzero to enable Unicode NFC normalization debugging */
#define ENC_UC_NORM_DEBUG  0

/*! \brief Maximum length of MIME parameter attribute tokens
 *
 * The expansion is fully parenthesized so that the macro behaves like a single
 * primary expression in every context (e.g. as operand of \c sizeof ).
 */
#define ENC_MIME_PARA_LENGTH_MAX  ((size_t) 127)

/*! \brief MIME word encoder folding behaviour
 *
 * If this is defined to nonzero, all lines of RFC 2047 conformant header fields
 * that contain MIME encoded words are folded before 76 characters. Otherwise
 * all lines that contain no encoded-words are not folded before 998 characters.
 *
 * RFC 2047 is ambiguous regarding this rule:
 * <br>
 * https://tools.ietf.org/html/rfc2047#section-2
 * <br>
 * The default value 1 is safe in any case. Please read section 2, paragraph 5
 * carefully before redefining this to 0!
 */
#define ENC_MIME_HEADER_FOLD_ASCII_LINES  1
80 
81 
82 /* ========================================================================== */
83 /* Data types */
84 
/*
 * States of the ISO 2022 decoder
 *
 * The numeric values are the ones the compiler would assign implicitly
 * (counting up from zero); they are only spelled out for readability.
 */
enum iso2022_state
{
   ISO2022_ASCII      = 0,
   ISO2022_ISO646     = 1,
   ISO2022_JIS_X_0208 = 2
};
92 
/*
 * Unicode hangul syllable type
 *
 * The numeric values are the ones the compiler would assign implicitly
 * (counting up from zero); they are only spelled out for readability.
 */
enum uc_hs_type
{
   UC_HST_NONE = 0,
   UC_HST_L    = 1,
   UC_HST_V    = 2,
   UC_HST_T    = 3,
   UC_HST_LV   = 4,
   UC_HST_LVT  = 5
};
103 
/*
 * Unicode canonical decomposition (and character combining class)
 *
 * A value of -1 in a decomposition field means "not present".
 */
struct uc_cdc
{
   /* Codepoint */
   long int cp;
   /* Canonical combining class (zero for starters) */
   unsigned char ccc;
   /* Decomposition mapping (recursive part 1) */
   long int dc1;
   /* Decomposition mapping (non-recursive part 2) */
   long int dc2;
};
112 
113 /* Unicode hangul syllable type ranges */
114 struct uc_hst
115 {
116  long int first; /* First codepoint of range */
117  long int last; /* Last codepoint of range */
118  enum uc_hs_type hst; /* Hangul syllable type */
119 };
120 
/*
 * Unicode NFC quick check codepoint ranges indicating normalization required
 * (both range limits are inclusive)
 */
struct uc_qc_nfc
{
   /* First codepoint of range */
   long int first;
   /* Last codepoint of range */
   long int last;
};
127 
/*
 * Unicode full composition exclusion codepoint ranges
 * (both range limits are inclusive)
 */
struct uc_fce
{
   /* First codepoint of range */
   long int first;
   /* Last codepoint of range */
   long int last;
};
134 
/*
 * Unicode default case folding mapping
 *
 * A codepoint is folded to a sequence of up to three codepoints.
 * The lookup function documents -1 as the marker for unused fields.
 */
struct uc_cf
{
   /* Codepoint to fold */
   long int cp;
   /* First codepoint of mapping */
   long int first;
   /* Second codepoint of mapping */
   long int second;
   /* Third codepoint of mapping */
   long int third;
};
143 
/* ISO-2022-JP to Unicode codepoint mapping */
struct iso2022_jp
{
   /* JIS X 0208 codepoint */
   long int jis;
   /* Unicode codepoint */
   long int uc;
};
150 
151 /* MIME parameter (for RFC 2231 decoder) */
152 struct mime_parameter
153 {
154  int valid;
155  char attribute[ENC_MIME_PARA_LENGTH_MAX + 1];
156  size_t attribute_len;
157  unsigned int section;
158  char charset[ENC_MIME_PARA_LENGTH_MAX + 1];
159  const char* value_start;
160  const char* value_end;
161 };
162 
163 
164 /* ========================================================================== */
165 /* Constants */
166 
167 
/* Unicode codepoint inserted for rejected control characters */
#define ENC_RC  0xFFFDL  /* U+FFFD */

/*
 * Size of Unicode decomposition buffer (number of codepoints)
 * Minimum size: 8
 *
 * Note: All size constants below are fully parenthesized so that they behave
 * like a single primary expression in every context.
 */
#define ENC_UC_DECOMPOSITION_BUFSIZE  ((size_t) 16)

/* Maximum size of header line */
#define ENC_HDR_BUFSIZE  ((size_t) 998)

/* Buffer size for "Format" parameter of MIME "Content-Type" header field */
#define ENC_FMT_BUFLEN  ((size_t) 7)
182 
183 /* Unicode canonical decomposition data */
184 #include "../uc_cdc.c"
185 
186 /* Unicode hangul syllable data */
187 #include "../uc_hst.c"
188 
189 /* Unicode NFC quick check data */
190 #include "../uc_qc_nfc.c"
191 
192 /* Unicode full composition exclusion data */
193 #include "../uc_fce.c"
194 
195 /* Unicode default case folding data */
196 #include "../uc_cf.c"
197 
198 
199 /* ========================================================================== */
200 /* Variables */
201 
/*!
 * Dummy target for values that are intentionally ignored
 * (assigning a value to it silences the corresponding compiler warning)
 */
static volatile int ign;
204 
205 
206 /* ========================================================================== */
/* Decode hexadecimal nibble from ASCII to integer
 *
 * \param[in] nibble  ASCII encoded hexadecimal nibble to decode
 *
 * Digits and both uppercase and lowercase letters A..F are accepted. The
 * comparisons use the ASCII code values directly, not character constants.
 *
 * \return
 * - Integer value of \e nibble (0..15)
 * - Negative value on error (an error message is printed in this case)
 */

static int enc_hex_decode_nibble(char nibble)
{
   const int code = nibble;

   /* Digits '0'..'9' */
   if(0x30 <= code && 0x39 >= code) { return(code - 0x30); }
   /* Uppercase letters 'A'..'F' */
   if(0x41 <= code && 0x46 >= code) { return(code - 0x41 + 10); }
   /* Lowercase letters 'a'..'f' */
   if(0x61 <= code && 0x66 >= code) { return(code - 0x61 + 10); }

   PRINT_ERROR("Can't decode invalid hexadecimal nibble");

   return(-1);
}
229 
230 
231 /* ========================================================================== */
232 /* Convert from supported 8 bit character sets to UTF-8
233  *
234  * \param[in] charset 8 bit character set used for string \e s
235  * \param[in] s String to convert
236  *
237  * \return
238  * - Pointer to result (if not equal to \e s , a new memory block was allocated)
239  * - NULL on error
240  */
241 
242 static const char* enc_8bit_convert_to_utf8(enum enc_mime_cs charset,
243  const char* s)
244 {
245  const char* res = NULL;
246  size_t inlen = 0;
247  unsigned char us_ascii = 1;
248 
249  /*
250  * Check whether data contains only of US-ASCII characters
251  * (all supported character sets are US-ASCII extensions)
252  */
253  while (s[inlen])
254  {
255  if ((const unsigned char) 0x80 & (const unsigned char) s[inlen])
256  {
257  us_ascii = 0;
258  }
259  ++inlen;
260  }
261  if (us_ascii)
262  {
263  res = s;
264  }
265  else
266  {
267  char* inbuf = api_posix_malloc(inlen + (size_t) 1); /* +1 for NUL */
268 
269  if (NULL != inbuf)
270  {
271  size_t outlen = inlen * (size_t) 4; /* 4 octets per CP in UTF-8 */
272  size_t len = outlen;
273  const char* cs_name = NULL;
274  char* outbuf = api_posix_malloc(outlen + (size_t) 1); /* +1 for NUL */
275 
276  if (NULL != outbuf)
277  {
278  switch(charset)
279  {
280  case ENC_CS_ASCII: { cs_name = "US-ASCII"; break; }
281  case ENC_CS_ISO8859_1: { cs_name = "ISO-8859-1"; break; }
282  case ENC_CS_ISO8859_2: { cs_name = "ISO-8859-2"; break; }
283  case ENC_CS_ISO8859_3: { cs_name = "ISO-8859-3"; break; }
284  case ENC_CS_ISO8859_4: { cs_name = "ISO-8859-4"; break; }
285  case ENC_CS_ISO8859_5: { cs_name = "ISO-8859-5"; break; }
286  case ENC_CS_ISO8859_6: { cs_name = "ISO-8859-6"; break; }
287  case ENC_CS_ISO8859_7: { cs_name = "ISO-8859-7"; break; }
288  case ENC_CS_ISO8859_8: { cs_name = "ISO-8859-8"; break; }
289  case ENC_CS_ISO8859_9: { cs_name = "ISO-8859-9"; break; }
290  case ENC_CS_ISO8859_10: { cs_name = "ISO-8859-10"; break; }
291  case ENC_CS_ISO8859_11: { cs_name = "ISO-8859-11"; break; }
292  case ENC_CS_ISO8859_13: { cs_name = "ISO-8859-13"; break; }
293  case ENC_CS_ISO8859_14: { cs_name = "ISO-8859-14"; break; }
294  case ENC_CS_ISO8859_15: { cs_name = "ISO-8859-15"; break; }
295  case ENC_CS_ISO8859_16: { cs_name = "ISO-8859-16"; break; }
296  case ENC_CS_MACINTOSH: { cs_name = "Macintosh"; break; }
297  case ENC_CS_KOI8R: { cs_name = "KOI8-R"; break; }
298  case ENC_CS_KOI8U: { cs_name = "KOI8-U"; break; }
299  case ENC_CS_WINDOWS_1250: { cs_name = "Windows-1250"; break; }
300  case ENC_CS_WINDOWS_1251: { cs_name = "Windows-1251"; break; }
301  case ENC_CS_WINDOWS_1252: { cs_name = "Windows-1252"; break; }
302  case ENC_CS_WINDOWS_1253: { cs_name = "Windows-1253"; break; }
303  case ENC_CS_WINDOWS_1254: { cs_name = "Windows-1254"; break; }
304  case ENC_CS_WINDOWS_1255: { cs_name = "Windows-1255"; break; }
305  case ENC_CS_WINDOWS_1256: { cs_name = "Windows-1256"; break; }
306  case ENC_CS_WINDOWS_1257: { cs_name = "Windows-1257"; break; }
307  case ENC_CS_WINDOWS_1258: { cs_name = "Windows-1258"; break; }
308  case ENC_CS_IBM437: { cs_name = "IBM437"; break; }
309  case ENC_CS_IBM775: { cs_name = "IBM775"; break; }
310  case ENC_CS_IBM850: { cs_name = "IBM850"; break; }
311  case ENC_CS_IBM852: { cs_name = "IBM852"; break; }
312  case ENC_CS_IBM858: { cs_name = "IBM00858"; break; }
313  default: { break; }
314  }
315  if (NULL != cs_name)
316  {
317  size_t rv = (size_t) -1;
318 
319  memcpy(inbuf, s, inlen + (size_t) 1);
320  rv = ssic0_iconvstr("UTF-8", cs_name,
321  inbuf, &inlen, outbuf, &outlen,
322  SSIC0_ICONV_REPLACE_INVALID);
323  if ((size_t) -1 == rv || (size_t) 0 != inlen)
324  {
325  /* Failed */
326  PRINT_ERROR("Conversion from 8-bit codepage to UTF-8 failed");
327  api_posix_free((void*) outbuf);
328  }
329  else
330  {
331  /* Success => Shrink output buffer to size of result */
332  len -= outlen;
333  outbuf[len] = 0;
334  res = api_posix_realloc((void*) outbuf, len + (size_t) 1);
335  if (NULL == res) { api_posix_free((void*) outbuf); }
336  }
337  }
338  }
339  api_posix_free((void*) inbuf);
340  }
341  }
342 
343  return res;
344 }
345 
346 
347 /* ========================================================================== */
/*! \brief Verify CESU-8 or UTF-8 encoding
 *
 * \param[in] s    String to verify
 * \param[in] utf  Reject surrogate codepoints if nonzero.
 *
 * \note CESU-8 is defined in Unicode Technical Report #26.
 *
 * \attention
 * Read chapter 10 of RFC 3629 for UTF-8 security considerations.
 *
 * According to RFC 3629 the following rules are applied:
 * - Character code points beyond 0x10FFFF are invalid => We reject them.
 * - Only the shortest possible code sequence is allowed => We verify this.
 * - Surrogate character code points are invalid for UTF-8 => We reject them
 *   (only if \e utf is nonzero).
 *
 * An incomplete code sequence at the end of the string is rejected too
 * (without error message).
 *
 * \return
 * - 0 on success
 * - Negative value on error
 */

static int enc_uc_check_cesu8(const char* s, unsigned char utf)
{
   int res = 0;
   size_t pos;
   size_t len = 0;                 /* Length of current code sequence */
   size_t remaining = 0;           /* Pending continuation octets */
   unsigned long int mbc = 0;      /* Codepoint of current code sequence */

   for(pos = 0; s[pos]; ++pos)
   {
      const unsigned int octet = (unsigned int) (unsigned char) s[pos];

      if(!remaining)
      {
         /* Singlebyte characters need no further verification */
         if(0x80U > octet) { continue; }

         /* Start of multibyte code sequence */
         if(0xC0U == (octet & 0xE0U))
         {
            len = 2;
            mbc = (unsigned long int) (octet & 0x1FU) << 6;
         }
         else if(0xE0U == (octet & 0xF0U))
         {
            len = 3;
            mbc = (unsigned long int) (octet & 0x0FU) << 12;
         }
         else if(0xF0U == (octet & 0xF8U))
         {
            len = 4;
            mbc = (unsigned long int) (octet & 0x07U) << 18;
         }
         else
         {
            PRINT_ERROR("Invalid start of code sequence in UTF-8 data");
            res = -1;
            break;
         }
         remaining = len - (size_t) 1;
         continue;
      }

      /* Continuation of multibyte code sequence */
      if(0x80U != (octet & 0xC0U))
      {
         PRINT_ERROR("Invalid continuation character in UTF-8 sequence");
         res = -1;
         break;
      }
      --remaining;
      mbc |= (unsigned long int) (octet & 0x3FU) << remaining * (size_t) 6;
      if(remaining) { continue; }

      /* Code sequence complete => Verify character code */
      switch(len)
      {
         case 2:
         {
            /* Overlong encoding */
            if(0x000080UL > mbc)
            {
               PRINT_ERROR("Invalid UTF-8 2-byte code sequence");
               res = -1;
            }
            break;
         }
         case 3:
         {
            /* Overlong encoding or (for UTF-8 only) surrogate codepoint */
            if(0x000800UL > mbc
               || (utf && 0x00D800UL <= mbc && 0x00DFFFUL >= mbc))
            {
               PRINT_ERROR("Invalid UTF-8 3-byte code sequence");
               res = -1;
            }
            break;
         }
         case 4:
         {
            /* Overlong encoding or beyond the Unicode codespace */
            if(0x010000UL > mbc || 0x10FFFFUL < mbc)
            {
               PRINT_ERROR("Invalid UTF-8 4-byte code sequence");
               res = -1;
            }
            break;
         }
         default:
         {
            PRINT_ERROR("Bug in UTF-8 verify state machine");
            res = -1;
            break;
         }
      }
      if(res) { break; }
      /* Code sequence completely checked => Reset state machine */
      mbc = 0;
   }
   /* Check for incomplete multibyte code sequence at end of string */
   if(remaining) { res = -1; }

   return(res);
}
475 
476 
477 /* ========================================================================== */
/* Decode next Unicode codepoint from UTF-8 string
 *
 * \param[in]     s  UTF-8 string to decode
 * \param[in,out] i  Pointer to current index in string
 *
 * \attention
 * The string \e s MUST be already checked for valid UTF-8 encoding before
 * calling this function! Continuation octets are not verified here.
 *
 * On success, the index of the next codepoint is written to the location
 * pointed to by \e i .
 *
 * \note
 * The index is incremented for every octet that is read, including the NUL
 * termination. After the end of the string was reported, the index points
 * behind the termination and must not be used for another call.
 *
 * \return
 * - Next Unicode codepoint on success
 * - -1 on error (or if the end of the string was reached)
 */

static long int enc_uc_decode_utf8(const char* s, size_t* i)
{
   unsigned int octet;
   unsigned long int ucp;
   size_t pending;  /* Number of continuation octets still expected */

   octet = (unsigned int) (unsigned char) s[(*i)++];
   /* End of string */
   if(!octet) { return(-1L); }
   /* Singlebyte codepoint */
   if(0x80U > octet) { return((long int) octet); }

   /* First octet of multibyte codepoint defines the sequence length */
   if(0xC0U == (octet & 0xE0U))
   {
      pending = 1;
      ucp = (unsigned long int) (octet & 0x1FU) << 6;
   }
   else if(0xE0U == (octet & 0xF0U))
   {
      pending = 2;
      ucp = (unsigned long int) (octet & 0x0FU) << 12;
   }
   else if(0xF0U == (octet & 0xF8U))
   {
      pending = 3;
      ucp = (unsigned long int) (octet & 0x07U) << 18;
   }
   else
   {
      PRINT_ERROR("UTF-8 decoder called with invalid data");
      return(-1L);
   }

   /* Every continuation octet contributes 6 bits */
   while(pending)
   {
      octet = (unsigned int) (unsigned char) s[(*i)++];
      /* String ends inside of code sequence */
      if(!octet) { return(-1L); }
      --pending;
      ucp |= (unsigned long int) (octet & 0x3FU) << pending * (size_t) 6;
   }

   /* Codepoint decoding complete */
   return((long int) ucp);
}
553 
554 
555 /* ========================================================================== */
/*! \brief Encode Unicode codepoints to UTF-8
 *
 * \param[out]    buf   Encoded UTF-8 string
 * \param[in,out] i     Current index in \e buf
 * \param[in]     dbuf  Codepoint buffer
 * \param[in,out] di    Number of codepoints in \e dbuf
 *
 * \attention
 * The target buffer \e buf must be large enough for the encoded data. This must
 * be ensured by the caller using worst case calculations (up to 4 octets are
 * written per codepoint). No NUL termination is written.
 *
 * Invalid codepoints (negative or beyond 0x10FFFF) are reported and replaced
 * with a question mark.
 *
 * On success, the start index of the next codepoint is written to the location
 * pointed to by \e i and zero is written to the location pointed to by \e di .
 */

/* 'static' removed because test program must call this function */
void enc_uc_encode_utf8(char* buf, size_t* i, long int* dbuf, size_t* di)
{
   size_t n;
   long int cp;

   for(n = 0; n < *di; ++n)
   {
      cp = dbuf[n];
      if(0L > cp || 0x10FFFFL < cp)
      {
         PRINT_ERROR("Unicode UTF-8 encoder: Invalid codepoint detected");
         buf[(*i)++] = '?';
      }
      else if(0x00007FL >= cp)
      {
         /* 1 octet: 0xxxxxxx */
         buf[(*i)++] = (char) cp;
      }
      else if(0x0007FFL >= cp)
      {
         /* 2 octets: 110xxxxx 10xxxxxx */
         buf[(*i)++] = (char) (0xC0L | ((cp >> 6) & 0x1FL));
         buf[(*i)++] = (char) (0x80L | (cp & 0x3FL));
      }
      else if(0x00FFFFL >= cp)
      {
         /* 3 octets: 1110xxxx 10xxxxxx 10xxxxxx */
         buf[(*i)++] = (char) (0xE0L | ((cp >> 12) & 0x0FL));
         buf[(*i)++] = (char) (0x80L | ((cp >> 6) & 0x3FL));
         buf[(*i)++] = (char) (0x80L | (cp & 0x3FL));
      }
      else
      {
         /* 4 octets: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
         buf[(*i)++] = (char) (0xF0L | ((cp >> 18) & 0x07L));
         buf[(*i)++] = (char) (0x80L | ((cp >> 12) & 0x3FL));
         buf[(*i)++] = (char) (0x80L | ((cp >> 6) & 0x3FL));
         buf[(*i)++] = (char) (0x80L | (cp & 0x3FL));
      }
   }
   /* Codepoint buffer is now empty */
   *di = 0;
}
629 
630 
631 /* ========================================================================== */
/* Check whether Unicode codepoint is an unwanted control character
 *
 * \param[in] ucp  Unicode codepoint to check
 *
 * RFC 5198 forbids C1 control characters => We reject them.
 *
 * Unicode variation selectors and control characters for bidirectional text
 * should be accepted. The Cocoa backend of FLTK (for Apple macOS) is able to
 * process them.
 *
 * The range U+E0020..U+E007F associated with LANGUAGE TAG is accepted:
 * It was deprecated since Unicode 5.1, but was reintroduced for another purpose
 * in Unicode 9.0.
 *
 * \return
 * - 0 if nothing was found
 * - -1 if unwanted control characters are present
 */

static int enc_uc_check_control(long int ucp)
{
   /* ASCII C0 control characters plus DEL */
   if(0x1FL >= ucp || 0x7FL == ucp)
   {
      /* Accept only HT, LF and CR (required for canonical format) */
      switch(ucp)
      {
         case 0x09L:
         case 0x0AL:
         case 0x0DL:
         {
            return(0);
         }
         default:
         {
            return(-1);
         }
      }
   }
   /* ISO 8859 C1 control characters */
   if(0x80L <= ucp && 0x9FL >= ucp) { return(-1); }
   /* Unicode INTERLINEAR ANNOTATION special characters */
   if(0xFFF9L <= ucp && 0xFFFBL >= ucp) { return(-1); }
   /* Unicode LINE SEPARATOR and PARAGRAPH SEPARATOR */
   if(0x2028L == ucp || 0x2029L == ucp) { return(-1); }
   /* Unicode LANGUAGE TAG */
   if(0xE0001L == ucp) { return(-1); }

   return(0);
}
678 
679 
680 /* ========================================================================== */
681 /* Lookup canonical decomposition and combining class of Unicode codepoint
682  *
683  * \param[in] ucp Unicode codepoint to lookup
684  * \param[out] res Result
685  */
686 
687 static void enc_uc_lookup_cdc(long int ucp, struct uc_cdc* res)
688 {
689  size_t i = 0;
690 
691  res->cp = ucp;
692  res->ccc = 0;
693  res->dc1 = -1L;
694  res->dc2 = -1L;
695  /* ASCII codepoints are always starters without canonical decomposition */
696  if(128L <= ucp)
697  {
698  /* Lookup codepoint in Unicode database */
699  while(-1L != uc_cdc_table[i].cp)
700  {
701  if(ucp == uc_cdc_table[i].cp)
702  {
703  /* Found nonstarter or canonical decomposable */
704  res->ccc = uc_cdc_table[i].ccc;
705  res->dc1 = uc_cdc_table[i].dc1;
706  res->dc2 = uc_cdc_table[i].dc2;
707  break;
708  }
709  ++i;
710  }
711  }
712  /* Use codepoint as decomposition if no canonical decomposition was found */
713  if(-1L == res->dc1) { res->dc1 = ucp; res->dc2 = -1L; }
714 }
715 
716 
717 /* ========================================================================== */
718 /* Lookup canonical composition of codepoint pair
719  *
720  * \param[in] starter Unicode codepoint of starter
721  * \param[in] cm Unicode codepoint of combination mark (or other starter)
722  *
723  * \note
724  * If \e starter is not a codepoint with canoncial combining class zero, the
725  * function returns error.
726  *
727  * \return
728  * - Codepoint of composition character on success
729  * - -1 on error
730  */
731 
732 static long int enc_uc_lookup_cc(long int starter, long int cm)
733 {
734  long int res = -1;
735  struct uc_cdc cdc; /* Canonical decomposition data */
736  size_t i;
737  size_t ii;
738  long int first;
739  long int last;
740 
741  /* Only used for hangul algorithmic canonical composition */
742  const long int SBase = 0xAC00L;
743  const long int LBase = 0x1100L;
744  const long int VBase = 0x1161L;
745  const long int TBase = 0x11A7L;
746  const long int NCount = 588L;
747  const long int LCount = 19L;
748  const long int VCount = 21L;
749  const long int TCount = 28L;
750  int jamo = 0; /* Number of jamo in syllable */
751  enum uc_hs_type hst;
752  enum uc_hs_type hst2;
753  long int LIndex;
754  long int VIndex;
755  long int TIndex;
756 
757  /* Check whether starter really is a starter */
758  enc_uc_lookup_cdc(starter, &cdc);
759  if(!cdc.ccc)
760  {
761  /* Yes => Lookup decomposition in Unicode database */
762  i = 0;
763  while(-1L != uc_cdc_table[i].cp)
764  {
765  if(uc_cdc_table[i].dc1 == starter && uc_cdc_table[i].dc2 == cm)
766  {
767  /* Found composition */
768  res = uc_cdc_table[i].cp;
769  /* Check for composition exception */
770  ii = 0;
771  while(-1L != uc_fce_table[ii].first)
772  {
773  first = uc_fce_table[ii].first;
774  last = uc_fce_table[ii].last;
775  if(first <= res && last >= res)
776  {
777  /* Composition exception found */
778 #if ENC_UC_NORM_DEBUG
779  printf(" Canonical composition exception\n");
780 #endif /* ENC_UC_NORM_DEBUG */
781  res = -1;
782  break;
783  }
784  ++ii;
785  }
786  break;
787  }
788  ++i;
789  }
790 
791  /*
792  * On error, check whether algorithmic composition is possible using
793  * Unicode hangul syllable type database
794  */
795  enc_uc_lookup_cdc(cm, &cdc);
796  if(!cdc.ccc)
797  {
798  i = 0;
799  while(-1L == res && -1L != uc_hst_table[i].first)
800  {
801  first = uc_hst_table[i].first;
802  last = uc_hst_table[i].last;
803  hst = uc_hst_table[i].hst;
804  if(first <= starter && last >= starter)
805  {
806  if(UC_HST_L == hst || UC_HST_LV == hst)
807  {
808  /* Starter is a hangul L-type consonant or LV-type syllable */
809  ii = 0;
810  while(-1L != uc_hst_table[ii].first)
811  {
812  first = uc_hst_table[ii].first;
813  last = uc_hst_table[ii].last;
814  hst2 = uc_hst_table[ii].hst;
815  if(first <= cm && last >= cm)
816  {
817  if(UC_HST_L == hst && UC_HST_V == hst2)
818  {
819  if(LBase <= starter && VBase <= cm)
820  {
821  LIndex = starter - LBase;
822  if(LIndex < LCount)
823  {
824  VIndex = cm - VBase;
825  if(VIndex < VCount)
826  {
827 #if ENC_UC_NORM_DEBUG
828  printf(" Canonical composition"
829  " for hangul LV-syllable found\n");
830  printf("Hangul LIndex: %ld\n", LIndex);
831  printf("Hangul VIndex: %ld\n", VIndex);
832 #endif /* ENC_UC_NORM_DEBUG */
833  jamo = 2;
834  res = SBase
835  + LIndex * NCount + VIndex * TCount;
836  }
837  }
838  }
839  }
840  else if(UC_HST_LV == hst && UC_HST_T == hst2)
841  {
842  if(TBase <= cm)
843  {
844  TIndex = cm - TBase;
845  if(TIndex < TCount)
846  {
847 #if ENC_UC_NORM_DEBUG
848  printf(" Canonical composition"
849  " for hangul LVT-syllable found\n");
850  printf("Hangul TIndex: %ld\n", TIndex);
851 #endif /* ENC_UC_NORM_DEBUG */
852  jamo = 3;
853  res = starter + TIndex;
854  }
855  }
856  }
857  if(jamo)
858  {
859  if(-1L == res)
860  {
861  PRINT_ERROR("Unicode algorithmic composition"
862  " for hangul syllable failed");
863  }
864 #if 1
865  /* Optional: Check hangul syllable type of result */
866  else
867  {
868  ii = 0;
869  while(-1L != uc_hst_table[ii].first)
870  {
871  first = uc_hst_table[ii].first;
872  last = uc_hst_table[ii].last;
873  hst = uc_hst_table[ii].hst;
874  if(first <= res && last >= res)
875  {
876  if(2 == jamo && UC_HST_LV != hst)
877  {
878  /* Result should be a LV-syllable! */
879  res = -1L;
880  }
881  if(3 == jamo && UC_HST_LVT != hst)
882  {
883  /* Result should be a LVT-syllable! */
884  res = -1L;
885  }
886  break;
887  }
888  ++ii;
889  }
890  if(-1L == res)
891  {
892  PRINT_ERROR("Invalid Unicode hangul syllable"
893  " detected (Bug)");
894  }
895  }
896 #endif
897  break;
898  }
899  }
900  ++ii;
901  }
902  }
903  }
904  ++i;
905  }
906  }
907  }
908 
909  return(res);
910 }
911 
912 
913 /* ========================================================================== */
914 /* Lookup mapping for default case folding
915  *
916  * \param[in] ucp Unicode codepoint
917  * \param[in] cfm Case folded mapping (up to 3 codepoints)
918  *
919  * If the case fold mapping is smaller than 3 codepoints, -1 is written to the
920  * unused fields.
921  *
922  * \note
923  * If \e ucp has no mapping for default case folding, \e ucp itself is
924  * returned in the first field.
925  */
926 
927 static void enc_uc_lookup_cf(long int ucp, long int mapping[3])
928 {
929  size_t i = 0;
930  int found = 0;
931 
932  /* Lookup codepoint in Unicode database */
933  while(-1L != uc_cf_table[i].cp)
934  {
935  if(ucp == uc_cf_table[i].cp)
936  {
937  /* Found mapping for Unicode default case folding */
938  mapping[0] = uc_cf_table[i].first;
939  mapping[1] = uc_cf_table[i].second;
940  mapping[2] = uc_cf_table[i].third;
941  found = 1;
942  break;
943  }
944  ++i;
945  }
946  if(!found)
947  {
948  mapping[0] = ucp;
949  mapping[1] = -1L;
950  mapping[2] = -1L;
951  }
952 }
953 
954 
955 /* ========================================================================== */
956 /* Get number of Unicode glyphs from UTF-8 string
957  *
958  * \param[in] s UTF-8 string to decode
959  * \param[in] end Check glyph count up to (but not including) this index
960  *
961  * To check all glyphs of a string, set \e end to zero.
962  *
963  * \attention
964  * The string \e s MUST be already checked for valid UTF-8 encoding before
965  * calling this function!
966  *
967  * \note
968  * Soft hyphens (SHY) are not counted, except directly before \e end .
969  * <br>
970  * For this function the glyph count is defined as "number of starters".
971  * For complex scripts this may not match the display width.
972  *
973  * \return
974  * - Glyph count
975  */
976 
977 static size_t enc_uc_get_glyph_count(const char* s, size_t end)
978 {
979  size_t res = 0;
980  size_t i = 0;
981  long int ucp;
982  struct uc_cdc cdc; /* Canonical decomposition data */
983 
984  while(1)
985  {
986  ucp = enc_uc_decode_utf8(s, &i);
987  if(-1L == ucp) { break; }
988  else
989  {
990  /* Do not count SHY characters, except at the end */
991  if (!((0x00ADL == ucp) && (end && i < end)))
992  {
993  /* Check whether codepoint is a starter */
994  enc_uc_lookup_cdc(ucp, &cdc);
995  if(!cdc.ccc) { ++res; }
996  }
997  }
998  if(end && i >= end) { break; }
999  }
1000 
1001  return(res);
1002 }
1003 
1004 
1005 /* ========================================================================== */
1006 /* Quick check for Unicode NFC normalization
1007  *
1008  * This function verify that:
1009  * - all codepoints are allowed in NFC
1010  * - the canonical ordering of combining marks is correct
1011  *
1012  * \param[in] s UTF-8 string to check
1013  *
1014  * \attention
1015  * The string \e s MUST be already checked for valid UTF-8 encoding before
1016  * calling this function!
1017  */
1018 
1019 static int enc_uc_check_nfc(const char* s)
1020 {
1021  int res = 0;
1022  size_t i = 0;
1023  long int ucp = 0;
1024  long int first;
1025  long int last;
1026  size_t ii;
1027  struct uc_cdc cdc; /* Canonical decomposition data */
1028  unsigned char ccc_last = 0;
1029 
1030  while(1)
1031  {
1032  ucp = enc_uc_decode_utf8(s, &i);
1033  if(-1L == ucp) { break; }
1034  /* Quick check for ASCII */
1035  if(128L <= ucp)
1036  {
1037  /* Lookup codepoint in Unicode database */
1038  ii = 0;
1039  while(-1L != uc_qc_nfc_table[ii].first)
1040  {
1041  first = uc_qc_nfc_table[ii].first;
1042  last = uc_qc_nfc_table[ii].last;
1043  if(first <= ucp && last >= ucp)
1044  {
1045  /* Codepoint is (maybe) not allowed in NFC */
1046  res = -1;
1047  break;
1048  }
1049  ++ii;
1050  }
1051  if(res) { break; }
1052  /* Check ordering of combining marks */
1053  enc_uc_lookup_cdc(ucp, &cdc);
1054  if(cdc.ccc && (cdc.ccc < ccc_last)) { res = -1; break; }
1055  ccc_last = cdc.ccc;
1056  }
1057  else { ccc_last = 0; }
1058  }
1059 
1060 #if ENC_UC_NORM_DEBUG
1061  if(res)
1062  {
1063  printf("Maybe not NFC: %s (len: %u)\n ", s, (unsigned int) strlen(s));
1064  i = 0; while(s[i])
1065  {
1066  printf(" 0x%02X", (unsigned int) (unsigned char) s[i++]);
1067  }
1068  printf("\n");
1069  }
1070 #endif /* ENC_UC_NORM_DEBUG */
1071 
1072  return(res);
1073 }
1074 
1075 
1076 /* ========================================================================== */
1077 /* Unicode canonical decomposition engine
1078  *
1079  * This function is reentrant and calls itself for recursive decomposition.
1080  *
1081  * \attention
1082  * The decomposition buffer \e dbuf must have the fixed size
1083  * \ref ENC_UC_DECOMPOSITION_BUFSIZE bytes.
1084  *
1085  * \param[in] ucp Unicode codepoint
1086  * \param[in,out] dbuf Decomposition buffer
1087  * \param[in] di Pointer to index in decomposition buffer
1088  */
1089 
1090 static int enc_uc_engine_decompose(long int ucp, long int* dbuf, size_t* di)
1091 {
1092  int res = 0;
1093  struct uc_cdc cdc; /* Canonical decomposition data */
1094 
1095  /* Ensure that there is space for 2 codepoints in decomposition buffer */
1096  if(ENC_UC_DECOMPOSITION_BUFSIZE - (size_t) 2 <= *di)
1097  {
1098  /* Decomposition buffer not large enough */
1099  PRINT_ERROR("Unicode canonical decomposition engine failed");
1100  dbuf[0] = (long int) (unsigned char) '[';
1101  dbuf[1] = (long int) (unsigned char) 'E';
1102  dbuf[2] = (long int) (unsigned char) 'r';
1103  dbuf[3] = (long int) (unsigned char) 'r';
1104  dbuf[4] = (long int) (unsigned char) 'o';
1105  dbuf[5] = (long int) (unsigned char) 'r';
1106  dbuf[6] = (long int) (unsigned char) ']';
1107  *di = 7;
1108  res = -1;
1109  }
1110  else
1111  {
1112  /* Recursively decompose */
1113  enc_uc_lookup_cdc(ucp, &cdc);
1114  if(cdc.dc1 != ucp) { res = enc_uc_engine_decompose(cdc.dc1, dbuf, di); }
1115  else { dbuf[(*di)++] = cdc.dc1; }
1116  if(-1L != cdc.dc2) { dbuf[(*di)++] = cdc.dc2; }
1117  }
1118 
1119  return(res);
1120 }
1121 
1122 
1123 /* ========================================================================== */
1124 /* Unicode canonical ordering engine
1125  *
1126  * All bursts of codepoints with nonzero canonical combining class are
1127  * stable sorted in ascending order.
1128  *
1129  * \param[in,out] dbuf Decomposition buffer
1130  * \param[in] di Number of codepoints in decomposition buffer
1131  */
1132 
1133 static void enc_uc_engine_reorder(long int* dbuf, size_t di)
1134 {
1135  size_t i, ii, iii, iiii;
1136  struct uc_cdc cdc; /* Canonical decomposition data */
1137  size_t len;
1138  long int tmp;
1139  unsigned char ccc1;
1140  unsigned char ccc2;
1141 
1142  for(i = 0; i < di; ++i)
1143  {
1144  enc_uc_lookup_cdc(dbuf[i], &cdc);
1145  /* Starters (Ccc = 0) always stay in place */
1146  if(cdc.ccc)
1147  {
1148 #if ENC_UC_NORM_DEBUG
1149  printf(" Nonstarter: U+%04lX (ccc=%u)\n",
1150  dbuf[i], (unsigned int) cdc.ccc);
1151 #endif /* ENC_UC_NORM_DEBUG */
1152  ii = i;
1153  while(++ii < di)
1154  {
1155  enc_uc_lookup_cdc(dbuf[ii], &cdc);
1156  if(!cdc.ccc) { break; }
1157  }
1158  len = ii - i;
1159  /* Sort burst of nonstarter codepoints (ccc != 0) to canonical order */
1160 #if ENC_UC_NORM_DEBUG
1161  printf(" Sort burst: len=%u\n", (unsigned int) len);
1162 #endif /* ENC_UC_NORM_DEBUG */
1163  for(iii = i; iii < i + len; ++iii)
1164  {
1165  /*
1166  * Bubble sort from end of buffer
1167  * This is very inefficient because the ccc lookup data is not
1168  * buffered. For european languages there are seldom more than
1169  * two or three codepoints combined => Keep it simple.
1170  */
1171  for(iiii = i + len - (size_t) 1; iiii > iii; --iiii)
1172  {
1173  enc_uc_lookup_cdc(dbuf[iiii - (size_t) 1], &cdc);
1174  ccc1 = cdc.ccc;
1175  enc_uc_lookup_cdc(dbuf[iiii], &cdc);
1176  ccc2 = cdc.ccc;
1177  if(ccc2 < ccc1)
1178  {
1179  tmp = dbuf[iiii - (size_t) 1];
1180  dbuf[iiii - (size_t) 1] = dbuf[iiii];
1181  dbuf[iiii] = tmp;
1182  }
1183  }
1184  }
1185  }
1186 #if ENC_UC_NORM_DEBUG
1187  else { printf(" Starter : U+%04lX\n", dbuf[i]); }
1188 #endif /* ENC_UC_NORM_DEBUG */
1189  }
1190 }
1191 
1192 
1193 /* ========================================================================== */
1194 /* Unicode canonical composition engine
1195  *
1196  * \param[in,out] dbuf Codepoint buffer
1197  * \param[in] di Pointer to number of codepoints in buffer
1198  */
1199 
1200 static void enc_uc_engine_compose(long int* dbuf, size_t* di)
1201 {
1202  size_t i = 0;
1203  size_t ii;
1204  long int ucp; /* Unicode codepoint */
1205  struct uc_cdc cdc; /* Canonical decomposition data */
1206  unsigned char ccc;
1207  int skip;
1208 
1209  while(++i < *di)
1210  {
1211  /* Check whether codepoint i can be canonically composed with starter */
1212 #if ENC_UC_NORM_DEBUG
1213  printf(" ---\n");
1214  printf(" Starter at beginning : U+%04lX\n", dbuf[0]);
1215  printf(" Codepoint in question: U+%04lX\n", dbuf[i]);
1216 #endif /* ENC_UC_NORM_DEBUG */
1217  ucp = enc_uc_lookup_cc(dbuf[0], dbuf[i]);
1218  if(-1L != ucp)
1219  {
1220  /* Yes => Get canonical combining class */
1221  enc_uc_lookup_cdc(dbuf[i], &cdc);
1222  ccc = cdc.ccc;
1223 #if ENC_UC_NORM_DEBUG
1224  printf(" Codepoint has ccc : %u\n", (unsigned int) ccc);
1225  printf(" Canonical composition: U+%04lX\n", ucp);
1226 #endif /* ENC_UC_NORM_DEBUG */
1227  /* Search for other codepoints with same canonical combining class */
1228  skip = 0;
1229  for(ii = 1; ii < i; ++ii)
1230  {
1231  enc_uc_lookup_cdc(dbuf[ii], &cdc);
1232  if(cdc.ccc >= ccc)
1233  {
1234  /* Found => Preserve canonical ordering => Don't compose */
1235 #if ENC_UC_NORM_DEBUG
1236  printf(" => Don't compose\n");
1237 #endif /* ENC_UC_NORM_DEBUG */
1238  skip = 1;
1239  break;
1240  }
1241  }
1242  if(skip) { continue; }
1243  /* Not found => Compose */
1244 #if ENC_UC_NORM_DEBUG
1245  printf(" => Compose\n");
1246 #endif /* ENC_UC_NORM_DEBUG */
1247  for(ii = i; ii < *di - (size_t) 1; ++ii)
1248  {
1249  dbuf[ii] = dbuf[ii + (size_t) 1];
1250  }
1251  dbuf[--*di] = -1L;
1252  dbuf[0] = ucp;
1253  /* Rewind index for now missing codepoint */
1254  --i;
1255  }
1256 #if ENC_UC_NORM_DEBUG
1257  printf(" ---\n");
1258 #endif /* ENC_UC_NORM_DEBUG */
1259  }
1260 }
1261 
1262 
1263 /* ========================================================================== */
1264 /* Unicode normalization engine (shared part)
1265  *
1266  * \param[in] s Valid Unicode string with arbitrary or no normalization
1267  * \param[out] l Pointer to length of result
1268  * \param[in] nfc Normalization form (NFC if nonzero, otherwise NFD)
1269  *
1270  * \attention
1271  * The string \e s MUST be already checked for valid UTF-8 encoding before
1272  * calling this function!
1273  *
1274  * \return
1275  * - Pointer to processed Unicode data
1276  * A new memory block was allocated
1277  * - NULL on error (Original memory block for \e s is still allocated)
1278  */
1279 
1280 static const char* enc_uc_engine_n(const char* s, size_t* l, int nfc)
1281 {
1282  char* res = NULL;
1283  size_t rlen = 0;
1284  size_t ri = 0;
1285  char* p;
1286  size_t i = 0;
1287  size_t last;
1288  long int ucp; /* Unicode codepoint */
1289  struct uc_cdc cdc; /* Canonical decomposition data */
1290  long int dbuf[ENC_UC_DECOMPOSITION_BUFSIZE];
1291  size_t di = 0;
1292  int error = 0; /* Error flag => Skip to next starter */
1293 
1294  while(1)
1295  {
1296  /* Allocate memory in exponentially increasing chunks */
1297  if(rlen - ri <= (size_t) 4 * ENC_UC_DECOMPOSITION_BUFSIZE)
1298  {
1299  /*
1300  * Ensure there is space in the result buffer for at least 4 times the
1301  * decompositon buffer size. Reason: Every Unicode codepoint can
1302  * consume up to 4 bytes after encoded to UTF-8 in worst case.
1303  */
1304  if(!rlen) { rlen = (size_t) 4 * ENC_UC_DECOMPOSITION_BUFSIZE; }
1305  rlen *= (size_t) 2;
1306  p = api_posix_realloc((void*) res, rlen);
1307  if(NULL == p) { api_posix_free((void*) res); res = NULL; break; }
1308  else { res = p; }
1309  }
1310  /* Check whether next codepoint is a starter (ccc = 0) */
1311  last = i;
1312  ucp = enc_uc_decode_utf8(s, &i);
1313  if(-1L == ucp) { break; }
1314  enc_uc_lookup_cdc(ucp, &cdc);
1315  if(!cdc.ccc)
1316  {
1317  /* Yes => Check for buffered sequence */
1318  if(di)
1319  {
1320  /* Present => Push last codepoint back and flush buffer first */
1321  i = last;
1322  enc_uc_engine_reorder(dbuf, di);
1323  if(nfc) { enc_uc_engine_compose(dbuf, &di); }
1324  enc_uc_encode_utf8(res, &ri, dbuf, &di);
1325  error = 0;
1326  continue;
1327  }
1328  }
1329  /* Recursive canonical decomposition */
1330  if(!error) { error = enc_uc_engine_decompose(ucp, dbuf, &di); }
1331  }
1332  /* Flush buffer */
1333  if(di)
1334  {
1335  enc_uc_engine_reorder(dbuf, di);
1336  if(nfc) { enc_uc_engine_compose(dbuf, &di); }
1337  enc_uc_encode_utf8(res, &ri, dbuf, &di);
1338  }
1339 
1340  /* Terminate result string */
1341  if(NULL != res)
1342  {
1343  res[ri] = 0;
1344  *l = ri;
1345  }
1346 
1347  return(res);
1348 }
1349 
1350 
/* ========================================================================== */
/* Unicode NFD normalization engine
 *
 * Thin wrapper that runs the shared normalization engine without the
 * canonical composition step.
 *
 * \param[in]  s  Valid Unicode string with arbitrary or no normalization
 * \param[out] l  Pointer to length of result
 *
 * \attention
 * The string \e s MUST be already checked for valid UTF-8 encoding before
 * calling this function!
 *
 * \return
 * - Pointer to decomposed Unicode data (UTF-8 encoded with NFD normalization)
 *   A new memory block was allocated
 * - NULL on error (Original memory block for \e s is still allocated)
 */

static const char* enc_uc_engine_nfd(const char* s, size_t* l)
{
   const int compose = 0;  /* Zero selects NFD */
   const char* res;

   res = enc_uc_engine_n(s, l, compose);

   return(res);
}
1371 
1372 
/* ========================================================================== */
/* Unicode NFC normalization engine part 1
 *
 * Thin wrapper that runs the shared normalization engine with the canonical
 * composition step enabled.
 *
 * \param[in]  s  Valid Unicode string with arbitrary or no normalization
 * \param[out] l  Pointer to length of result
 *
 * \attention
 * The string \e s MUST be already checked for valid UTF-8 encoding before
 * calling this function!
 *
 * Part 1 does all the work but cannot compose starters, therefore the result
 * may contain starter pairs with canonical composition and must be
 * postprocessed by part 2 of the engine.
 *
 * \return
 * - Pointer to processed data (input for part 2 of the engine)
 *   A new memory block was allocated
 * - NULL on error (Original memory block for \e s is still allocated)
 */

static const char* enc_uc_engine_nfc_part1(const char* s, size_t* l)
{
   const int compose = 1;  /* Nonzero selects NFC */
   const char* res;

   res = enc_uc_engine_n(s, l, compose);

   return(res);
}
1397 
1398 
1399 /* ========================================================================== */
1400 /* Unicode NFC normalization engine part 2
1401  *
1402  * \param[in] s Unicode string processed by part 1 of the engine
1403  * \param[in] l Length of \e s
1404  * \param[out] flag Flag indicating modified data
1405  *
1406  * Part 2 is for canonical composition of codepoint pairs that are both
1407  * starters. This includes algorithmic canonical composition for hangul
1408  * syllables.
1409  *
1410  * \return
1411  * - Pointer to precomposed Unicode data (UTF-8 encoded with NFC normalization)
1412  * A new memory block was allocated
1413  * - NULL on error (Undefined data written to \e flag)
1414  */
1415 
1416 static const char* enc_uc_engine_nfc_part2(const char* s, size_t l,
1417  int* flag)
1418 {
1419  char* res = NULL;
1420  size_t ri = 0;
1421  size_t i = 0;
1422  long int ucp; /* Unicode codepoint */
1423  struct uc_cdc cdc; /* Canonical decomposition data */
1424  long int dbuf[2];
1425  size_t di = 0;
1426 
1427  *flag = 0;
1428 #if ENC_UC_NORM_DEBUG
1429  printf(" *** Part 2 ***\n");
1430 #endif /* ENC_UC_NORM_DEBUG */
1431  res = api_posix_malloc(++l);
1432  if(NULL != res)
1433  {
1434  while(1)
1435  {
1436  /* Append next codepoint to buffer */
1437  ucp = enc_uc_decode_utf8(s, &i);
1438  if(-1L == ucp) { break; }
1439  dbuf[di++] = ucp;
1440  /* Check whether codepoint is a starter (ccc = 0) */
1441  enc_uc_lookup_cdc(ucp, &cdc);
1442  if(cdc.ccc)
1443  {
1444  /* No => Flush buffer */
1445  enc_uc_encode_utf8(res, &ri, dbuf, &di);
1446  }
1447  else
1448  {
1449  /* Yes => Check for canonical composition of starter pair */
1450  if((size_t) 2 == di)
1451  {
1452  enc_uc_engine_compose(dbuf, &di);
1453  /* Flush first starter if there was no canonical composition */
1454  if((size_t) 2 == di)
1455  {
1456  di = 1;
1457  enc_uc_encode_utf8(res, &ri, dbuf, &di);
1458  dbuf[0] = ucp;
1459  di = 1;
1460  }
1461  else
1462  {
1463  /* Canonical composition found for starter pair */
1464  *flag = 1;
1465  }
1466  }
1467  }
1468  }
1469  }
1470  /* Flush buffer */
1471  if(di) { enc_uc_encode_utf8(res, &ri, dbuf, &di); }
1472  /* Terminate result string */
1473  if(NULL != res) { res[ri] = 0; }
1474 
1475 #if ENC_UC_NORM_DEBUG
1476  if(NULL != res)
1477  {
1478  printf("Now NFC: %s (len: %u)\n ", res, (unsigned int) strlen(res));
1479  i = 0; while(res[i])
1480  {
1481  printf(" 0x%02X", (unsigned int) (unsigned char) res[i++]);
1482  }
1483  printf("\n\n");
1484  }
1485 #endif /* ENC_UC_NORM_DEBUG */
1486 
1487  return(res);
1488 }
1489 
1490 
1491 /* ========================================================================== */
1492 /* Strip defective combining character sequences (at the beginning of string)
1493  *
1494  * \param[in] s UTF-8 string to process
1495  *
1496  * \attention
1497  * The string \e s MUST be already checked for valid UTF-8 encoding before
1498  * calling this function!
1499  *
1500  * This function strips defective combining character sequences at the
1501  * beginning so that the result becomes semantically valid when standing
1502  * alone.
1503  *
1504  * \return
1505  * - Pointer to processed Unicode data
1506  * If the result is not equal to \e s , a new memory block was allocated
1507  * - NULL on error
1508  */
1509 
1510 static const char* enc_uc_strip_dccs(const char* s)
1511 {
1512  const char* res = NULL;
1513  int skip = 0; /* Garbage at the beginning of string must be skipped */
1514  long int ucp; /* Unicode codepoint */
1515  struct uc_cdc cdc; /* Canonical decomposition data */
1516  size_t i = 0;
1517  size_t last;
1518  size_t len;
1519 
1520  while(1)
1521  {
1522  last = i;
1523  ucp = enc_uc_decode_utf8(s, &i);
1524  if(-1L == ucp) { break; }
1525  enc_uc_lookup_cdc(ucp, &cdc);
1526  if(!cdc.ccc) { break; }
1527  else
1528  {
1529  /* The Unicode data tries to compose something with void */
1530  if(!skip) { PRINT_ERROR("Semantic error in Unicode string"); }
1531  skip = 1;
1532  }
1533  }
1534  i = last;
1535  if(skip)
1536  {
1537  len = strlen(&s[i]);
1538  res = (const char*) api_posix_malloc(++len);
1539  if(NULL != res) { memcpy((void*) res, &s[i], len); }
1540  }
1541  else { res = s; }
1542 
1543  return(res);
1544 }
1545 
1546 
/* ========================================================================== */
/* Normalize UTF-8 string to NFD
 *
 * Documentation about Unicode normalization:
 * <br>
 * http://www.unicode.org/reports/tr15/
 *
 * \param[in] s  UTF-8 string to normalize
 *
 * \attention
 * The string \e s MUST be already checked for valid UTF-8 encoding before
 * calling this function!
 *
 * \note
 * An Unicode string with valid transformation encoding can still be
 * semantically nonsense. If it starts with a codepoint that is not a "starter"
 * in terms of the standard, the resulting string starts with a "defective
 * combining character sequence" - but may still make sense if concatenated
 * with other data in front of it.
 * This function always strips defective combining character sequences at the
 * beginning so that the result becomes semantically valid even when standing
 * alone.
 *
 * \return
 * - Pointer to decomposed Unicode data (UTF-8 encoded with NFD normalization)
 *   If the result is not equal to \e s , a new memory block was allocated
 * - NULL on error (an error message was printed)
 */

static const char* enc_uc_normalize_to_nfd(const char* s)
{
   const char* stripped;
   const char* nfd = NULL;
   size_t len = 0;

   /* Strip all nonstarters at the beginning */
   stripped = enc_uc_strip_dccs(s);
   if(NULL != stripped)
   {
      /* Normalize string to NFD */
      nfd = enc_uc_engine_nfd(stripped, &len);
      /* Release intermediate copy (if stripping has created one) */
      if(stripped != s) { api_posix_free((void*) stripped); }
   }

   /* Check for error */
   if(NULL == nfd) { PRINT_ERROR("Unicode NFD normalization failed"); }

   return(nfd);
}
1597 
1598 
/* ========================================================================== */
/* Normalize UTF-8 string to NFC
 *
 * Documentation about Unicode normalization:
 * <br>
 * http://www.unicode.org/reports/tr15/
 *
 * \param[in] s  UTF-8 string to normalize
 *
 * RFC 5198 recommends NFC for use in general Internet text messages
 * => We do so.
 *
 * \attention
 * The string \e s MUST be already checked for valid UTF-8 encoding before
 * calling this function!
 *
 * \note
 * An Unicode string with valid transformation encoding can still be
 * semantically nonsense. If it starts with a codepoint that is not a "starter"
 * in terms of the standard, the resulting string starts with a "defective
 * combining character sequence" - but may still make sense if concatenated
 * with other data in front of it.
 * This function always strips defective combining character sequences at the
 * beginning so that the result becomes semantically valid even when standing
 * alone.
 *
 * Processing steps (only if the quick check reports that the string is maybe
 * not in NFC): NFD, NFC engine part 1, NFC engine part 2 and, if part 2 has
 * composed starters, NFC engine part 1 again.
 *
 * \return
 * - Pointer to precomposed Unicode data (UTF-8 encoded with NFC normalization)
 *   If the result is not equal to \e s , a new memory block was allocated
 * - NULL on error (an error message was printed)
 */

static const char* enc_uc_normalize_to_nfc(const char* s)
{
   const char* cur;  /* Result of the last completed step */
   const char* next;  /* Result of the step in progress */
   size_t len = 0;
   int composed;  /* Part 2 has composed starters */

   /* Strip all nonstarters at the beginning */
   cur = enc_uc_strip_dccs(s);

   /* Quick check whether the string is already in NFC */
   if(NULL != cur && enc_uc_check_nfc(cur))
   {
      /* No => Normalize string to NFD first (required since Unicode 16.0.0) */
      next = enc_uc_engine_nfd(cur, &len);
      if(cur != s) { api_posix_free((void*) cur); }
      cur = next;

      /* Normalize string to NFC (every step requires the former result) */
      if(NULL != cur)
      {
         next = enc_uc_engine_nfc_part1(cur, &len);
         if(cur != s) { api_posix_free((void*) cur); }
         cur = next;
      }
      if(NULL != cur)
      {
         /* Fixme: Should be single pass */
         next = enc_uc_engine_nfc_part2(cur, len, &composed);
         api_posix_free((void*) cur);
         cur = next;
         /* 'composed' is only evaluated if part 2 was successful */
         if(NULL != cur && composed)
         {
            /* Part 1 must be repeated if starters were composed */
            next = enc_uc_engine_nfc_part1(cur, &len);
            api_posix_free((void*) cur);
            cur = next;
         }
      }
   }

   /* Check for error */
   if(NULL == cur) { PRINT_ERROR("Unicode NFC normalization failed"); }

   return(cur);
}
1676 
1677 
1678 /* ========================================================================== */
1679 /* Convert nonstandard Unicode Transformation Formats to UTF-8
1680  *
1681  * CESU-8 and UTF-7 are supported.
1682  *
1683  * \param[in] s String to convert
1684  * \param[in] e Source encoding
1685  *
1686  * \return
1687  * - Pointer to result (if not equal to \e s , a new memory block was allocated)
1688  * - NULL on error
1689  */
1690 
1691 static const char* enc_uc_convert_nsutf_to_utf8(const char* s, const char* e)
1692 {
1693  const char* res = NULL;
1694  size_t inlen = 0;
1695  unsigned char us_ascii = 1;
1696 
1697  /* Check whether data contains no switch from US-ASCII character set */
1698  while (s[inlen])
1699  {
1700  if ( ((const unsigned char) 0x80 & (const unsigned char) s[inlen]) ||
1701  ((const unsigned char) 0x1B == (const unsigned char) s[inlen]) ||
1702  ((const unsigned char) 0x2B == (const unsigned char) s[inlen]) )
1703  {
1704  us_ascii = 0;
1705  }
1706  ++inlen;
1707  }
1708  if (us_ascii)
1709  {
1710  res = s;
1711  }
1712  else
1713  {
1714  char* inbuf = api_posix_malloc(inlen + (size_t) 1); /* +1 for NUL */
1715 
1716  if (NULL != inbuf)
1717  {
1718  size_t outlen = inlen * (size_t) 4; /* 4 octets per CP in UTF-8 */
1719  size_t len = outlen;
1720  char* outbuf = api_posix_malloc(outlen + (size_t) 1); /* +1 for NUL */
1721 
1722  if (NULL != outbuf)
1723  {
1724  size_t rv = (size_t) -1;
1725 
1726  memcpy(inbuf, s, inlen + (size_t) 1);
1727  rv = ucic0_iconvstr("UTF-8", e, inbuf, &inlen, outbuf, &outlen,
1728  UCIC0_ICONV_REPLACE_INVALID);
1729  if ((size_t) -1 == rv || (size_t) 0 != inlen)
1730  {
1731  /* Failed */
1732  PRINT_ERROR("Conversion from CESU-8 or UTF-7 to UTF-8 failed");
1733  api_posix_free((void*) outbuf);
1734  }
1735  else
1736  {
1737  /* Success => Shrink output buffer to size of result */
1738  len -= outlen;
1739  outbuf[len] = 0;
1740  res = api_posix_realloc((void*) outbuf, len + (size_t) 1);
1741  if (NULL == res) { api_posix_free((void*) outbuf); }
1742  }
1743  }
1744  api_posix_free((void*) inbuf);
1745  }
1746  }
1747 
1748  return res;
1749 }
1750 
1751 
1752 /* ========================================================================== */
1753 /* Convert from ISO-2022-JP encoding to UTF-8
1754  *
1755  * \param[in] s String to convert
1756  *
1757  * \return
1758  * - Pointer to result (if not equal to \e s , a new memory block was allocated)
1759  * - NULL on error
1760  */
1761 
1762 static const char* enc_iso2022jp_convert_to_utf8(const char* s)
1763 {
1764  const char* res = NULL;
1765  size_t inlen = 0;
1766  unsigned char us_ascii = 1;
1767 
1768  /*
1769  * Check whether data contains no switch from US-ASCII character set
1770  * (ISO-2022-JP is an US-ASCII extension)
1771  */
1772  while (s[inlen])
1773  {
1774  if ( ((const unsigned char) 0x80 & (const unsigned char) s[inlen]) ||
1775  ((const unsigned char) 0x1B == (const unsigned char) s[inlen]) )
1776  {
1777  us_ascii = 0;
1778  }
1779  ++inlen;
1780  }
1781  if (us_ascii)
1782  {
1783  res = s;
1784  }
1785  else
1786  {
1787  char* inbuf = api_posix_malloc(inlen + (size_t) 1); /* +1 for NUL */
1788 
1789  if (NULL != inbuf)
1790  {
1791  size_t outlen = inlen * (size_t) 4; /* 4 octets per CP in UTF-8 */
1792  size_t len = outlen;
1793  char* outbuf = api_posix_malloc(outlen + (size_t) 1); /* +1 for NUL */
1794 
1795  if (NULL != outbuf)
1796  {
1797  size_t rv = (size_t) -1;
1798 
1799  memcpy(inbuf, s, inlen + (size_t) 1);
1800  rv = jpic0_iconvstr("UTF-8", "ISO-2022-JP",
1801  inbuf, &inlen, outbuf, &outlen,
1802  JPIC0_ICONV_REPLACE_INVALID);
1803  if ((size_t) -1 == rv || (size_t) 0 != inlen)
1804  {
1805  /* Failed */
1806  PRINT_ERROR("Conversion from ISO-2022-JP to UTF-8 failed");
1807  api_posix_free((void*) outbuf);
1808  }
1809  else
1810  {
1811  /* Success => Shrink output buffer to size of result */
1812  len -= outlen;
1813  outbuf[len] = 0;
1814  res = api_posix_realloc((void*) outbuf, len + (size_t) 1);
1815  if (NULL == res) { api_posix_free((void*) outbuf); }
1816  }
1817  }
1818  api_posix_free((void*) inbuf);
1819  }
1820  }
1821 
1822  return res;
1823 }
1824 
1825 
1826 /* ========================================================================== */
1827 /* Decode IANA character set description
1828  *
1829  * \param[in] s Description to decode
1830  * \param[in] len Length of string \e s
1831  *
1832  * This function checks whether the description \e s represents a supported IANA
1833  * character set name and return the corresponding ID for it.
1834  * According to RFC 2047 the character set is treated case-insensitive.
1835  *
1836  * \result
1837  * - MIME character set ID (from \ref enc_mime_cs )
1838  * - \c ENC_CS_UNKNOWN on error
1839  */
1840 
1841 static enum enc_mime_cs enc_mime_get_charset(const char* s, size_t len)
1842 {
1843  enum enc_mime_cs res = ENC_CS_UNKNOWN;
1844  char buf[ENC_CS_BUFLEN];
1845  size_t i;
1846  const char not_supported[] = "MIME: Unsupported character set: ";
1847  char* p;
1848  size_t l;
1849  int isoinv = 0;
1850  int macinv = 0;
1851  int ibminv = 0;
1852 
1853  if(ENC_CS_BUFLEN <= len)
1854  {
1855  /* If you get this error, the value of 'ENC_CS_BUFLEN' is too small */
1856  PRINT_ERROR("MIME: Name of character set too long");
1857  }
1858  else
1859  {
1860  /* Convert description to upper case */
1861  for(i = 0; i < len; ++i)
1862  {
1863  buf[i] = (char) toupper((int) s[i]);
1864  }
1865  buf[len] = 0;
1866  /* Check for all known character sets */
1867  if(!strcmp(buf, "US-ASCII")) { res = ENC_CS_ASCII; }
1868  else if(!strcmp(buf, "UTF-8")) { res = ENC_CS_UTF_8; }
1869  else if(!strcmp(buf, "CESU-8")) { res = ENC_CS_CESU_8; }
1870  else if(!strcmp(buf, "UTF-7")) { res = ENC_CS_UTF_7; }
1871  else if(!strcmp(buf, "ISO-8859-1")) { res = ENC_CS_ISO8859_1; }
1872  else if(!strcmp(buf, "ISO-8859-2")) { res = ENC_CS_ISO8859_2; }
1873  else if(!strcmp(buf, "ISO-8859-3")) { res = ENC_CS_ISO8859_3; }
1874  else if(!strcmp(buf, "ISO-8859-4")) { res = ENC_CS_ISO8859_4; }
1875  else if(!strcmp(buf, "ISO-8859-5")) { res = ENC_CS_ISO8859_5; }
1876  else if(!strcmp(buf, "ISO-8859-6")) { res = ENC_CS_ISO8859_6; }
1877  else if(!strcmp(buf, "ISO-8859-7")) { res = ENC_CS_ISO8859_7; }
1878  else if(!strcmp(buf, "ISO-8859-8")) { res = ENC_CS_ISO8859_8; }
1879  else if(!strcmp(buf, "ISO-8859-9")) { res = ENC_CS_ISO8859_9; }
1880  else if(!strcmp(buf, "ISO-8859-10")) { res = ENC_CS_ISO8859_10; }
1881  else if(!strcmp(buf, "TIS-620")) { res = ENC_CS_ISO8859_11; }
1882  /* Note: The proposed draft for ISO 8859-12 was released as ISO 8859-14 */
1883  else if(!strcmp(buf, "ISO-8859-13")) { res = ENC_CS_ISO8859_13; }
1884  else if(!strcmp(buf, "ISO-8859-14")) { res = ENC_CS_ISO8859_14; }
1885  else if(!strcmp(buf, "ISO-8859-15")) { res = ENC_CS_ISO8859_15; }
1886  else if(!strcmp(buf, "ISO-8859-16")) { res = ENC_CS_ISO8859_16; }
1887  else if(!strcmp(buf, "WINDOWS-1250")) { res = ENC_CS_WINDOWS_1250; }
1888  else if(!strcmp(buf, "WINDOWS-1251")) { res = ENC_CS_WINDOWS_1251; }
1889  else if(!strcmp(buf, "WINDOWS-1252")) { res = ENC_CS_WINDOWS_1252; }
1890  else if(!strcmp(buf, "WINDOWS-1253")) { res = ENC_CS_WINDOWS_1253; }
1891  else if(!strcmp(buf, "WINDOWS-1254")) { res = ENC_CS_WINDOWS_1254; }
1892  else if(!strcmp(buf, "WINDOWS-1255")) { res = ENC_CS_WINDOWS_1255; }
1893  else if(!strcmp(buf, "WINDOWS-1256")) { res = ENC_CS_WINDOWS_1256; }
1894  else if(!strcmp(buf, "WINDOWS-1257")) { res = ENC_CS_WINDOWS_1257; }
1895  else if(!strcmp(buf, "WINDOWS-1258")) { res = ENC_CS_WINDOWS_1258; }
1896  else if(!strcmp(buf, "KOI8-R")) { res = ENC_CS_KOI8R; }
1897  else if(!strcmp(buf, "KOI8-U")) { res = ENC_CS_KOI8U; }
1898  else if(!strcmp(buf, "MACINTOSH")) { res = ENC_CS_MACINTOSH; }
1899  else if(!strcmp(buf, "IBM437")) { res = ENC_CS_IBM437; }
1900  else if(!strcmp(buf, "IBM775")) { res = ENC_CS_IBM775; }
1901  else if(!strcmp(buf, "IBM850")) { res = ENC_CS_IBM850; }
1902  else if(!strcmp(buf, "IBM852")) { res = ENC_CS_IBM852; }
1903  else if(!strcmp(buf, "IBM00858")) { res = ENC_CS_IBM858; }
1904  else if(!strcmp(buf, "ISO-2022-JP")) { res = ENC_CS_ISO2022_JP; }
1905 
1906  /* Check for official IANA aliases */
1907  /* US-ASCII */
1908  else if(!strcmp(buf, "ANSI_X3.4-1968")) { res = ENC_CS_ASCII; }
1909  else if(!strcmp(buf, "ANSI_X3.4-1986")) { res = ENC_CS_ASCII; }
1910  else if(!strcmp(buf, "ISO-IR-6")) { res = ENC_CS_ASCII; }
1911  else if(!strcmp(buf, "ISO_646.IRV:1991")) { res = ENC_CS_ASCII; }
1912  else if(!strcmp(buf, "ISO646-US")) { res = ENC_CS_ASCII; }
1913  else if(!strcmp(buf, "IBM367")) { res = ENC_CS_ASCII; }
1914  else if(!strcmp(buf, "CP367")) { res = ENC_CS_ASCII; }
1915  else if(!strcmp(buf, "CSASCII")) { res = ENC_CS_ASCII; }
1916  else if(!strcmp(buf, "US")) { res = ENC_CS_ASCII; }
1917  /* UTF-8 */
1918  else if(!strcmp(buf, "CSUTF-8")) { res = ENC_CS_UTF_8; }
1919  /* CESU-8 */
1920  else if(!strcmp(buf, "CSCESU8")) { res = ENC_CS_CESU_8; }
1921  else if(!strcmp(buf, "CSCESU-8")) { res = ENC_CS_CESU_8; }
1922  /* UTF-7 */
1923  else if(!strcmp(buf, "CSUTF-7")) { res = ENC_CS_UTF_7; }
1924 
1925  /* Check for official IANA aliases of ISO 8859 parts */
1926  /* ISO 8859-1 */
1927  else if(!strcmp(buf, "ISO_8859-1:1987")) { res = ENC_CS_ISO8859_1; }
1928  else if(!strcmp(buf, "ISO_8859-1")) { res = ENC_CS_ISO8859_1; }
1929  else if(!strcmp(buf, "ISO-IR-100")) { res = ENC_CS_ISO8859_1; }
1930  else if(!strcmp(buf, "IBM819")) { res = ENC_CS_ISO8859_1; }
1931  else if(!strcmp(buf, "CP819")) { res = ENC_CS_ISO8859_1; }
1932  else if(!strcmp(buf, "CSISOLATIN1")) { res = ENC_CS_ISO8859_1; }
1933  else if(!strcmp(buf, "LATIN1")) { res = ENC_CS_ISO8859_1; }
1934  else if(!strcmp(buf, "L1")) { res = ENC_CS_ISO8859_1; }
1935  /* ISO 8859-2 */
1936  else if(!strcmp(buf, "ISO_8859-2:1987")) { res = ENC_CS_ISO8859_2; }
1937  else if(!strcmp(buf, "ISO_8859-2")) { res = ENC_CS_ISO8859_2; }
1938  else if(!strcmp(buf, "ISO-IR-101")) { res = ENC_CS_ISO8859_2; }
1939  else if(!strcmp(buf, "CSISOLATIN2")) { res = ENC_CS_ISO8859_2; }
1940  else if(!strcmp(buf, "LATIN2")) { res = ENC_CS_ISO8859_2; }
1941  else if(!strcmp(buf, "L2")) { res = ENC_CS_ISO8859_2; }
1942  /* ISO 8859-3 */
1943  else if(!strcmp(buf, "ISO_8859-3:1988")) { res = ENC_CS_ISO8859_3; }
1944  else if(!strcmp(buf, "ISO_8859-3")) { res = ENC_CS_ISO8859_3; }
1945  else if(!strcmp(buf, "ISO-IR-109")) { res = ENC_CS_ISO8859_3; }
1946  else if(!strcmp(buf, "CSISOLATIN3")) { res = ENC_CS_ISO8859_3; }
1947  else if(!strcmp(buf, "LATIN3")) { res = ENC_CS_ISO8859_3; }
1948  else if(!strcmp(buf, "L3")) { res = ENC_CS_ISO8859_3; }
1949  /* ISO 8859-4 */
1950  else if(!strcmp(buf, "ISO_8859-4:1988")) { res = ENC_CS_ISO8859_4; }
1951  else if(!strcmp(buf, "ISO_8859-4")) { res = ENC_CS_ISO8859_4; }
1952  else if(!strcmp(buf, "ISO-IR-110")) { res = ENC_CS_ISO8859_4; }
1953  else if(!strcmp(buf, "CSISOLATIN4")) { res = ENC_CS_ISO8859_4; }
1954  else if(!strcmp(buf, "LATIN4")) { res = ENC_CS_ISO8859_4; }
1955  else if(!strcmp(buf, "L4")) { res = ENC_CS_ISO8859_4; }
1956  /* ISO 8859-5 */
1957  else if(!strcmp(buf, "ISO_8859-5:1988")) { res = ENC_CS_ISO8859_5; }
1958  else if(!strcmp(buf, "ISO_8859-5")) { res = ENC_CS_ISO8859_5; }
1959  else if(!strcmp(buf, "ISO-IR-144")) { res = ENC_CS_ISO8859_5; }
1960  else if(!strcmp(buf, "CSISOLATINCYRILLIC")) { res = ENC_CS_ISO8859_5; }
1961  else if(!strcmp(buf, "CYRILLIC")) { res = ENC_CS_ISO8859_5; }
1962  /* ISO 8859-6 */
1963  else if(!strcmp(buf, "ISO_8859-6:1987")) { res = ENC_CS_ISO8859_6; }
1964  else if(!strcmp(buf, "ISO_8859-6")) { res = ENC_CS_ISO8859_6; }
1965  else if(!strcmp(buf, "ISO-IR-127")) { res = ENC_CS_ISO8859_6; }
1966  else if(!strcmp(buf, "ECMA-114")) { res = ENC_CS_ISO8859_6; }
1967  else if(!strcmp(buf, "ASMO-708")) { res = ENC_CS_ISO8859_6; }
1968  else if(!strcmp(buf, "CSISOLATINARABIC")) { res = ENC_CS_ISO8859_6; }
1969  else if(!strcmp(buf, "ARABIC")) { res = ENC_CS_ISO8859_6; }
1970  /* ISO 8859-7 */
1971  else if(!strcmp(buf, "ISO_8859-7:1987")) { res = ENC_CS_ISO8859_7; }
1972  else if(!strcmp(buf, "ISO_8859-7")) { res = ENC_CS_ISO8859_7; }
1973  else if(!strcmp(buf, "ISO-IR-126")) { res = ENC_CS_ISO8859_7; }
1974  else if(!strcmp(buf, "ECMA-118")) { res = ENC_CS_ISO8859_7; }
1975  else if(!strcmp(buf, "ELOT_928")) { res = ENC_CS_ISO8859_7; }
1976  else if(!strcmp(buf, "CSISOLATINGREEK")) { res = ENC_CS_ISO8859_7; }
1977  else if(!strcmp(buf, "GREEK8")) { res = ENC_CS_ISO8859_7; }
1978  else if(!strcmp(buf, "GREEK")) { res = ENC_CS_ISO8859_7; }
1979  /* ISO 8859-8 */
1980  else if(!strcmp(buf, "ISO_8859-8:1988")) { res = ENC_CS_ISO8859_8; }
1981  else if(!strcmp(buf, "ISO_8859-8")) { res = ENC_CS_ISO8859_8; }
1982  else if(!strcmp(buf, "CSISOLATIN8")) { res = ENC_CS_ISO8859_8; }
1983  else if(!strcmp(buf, "LATIN8")) { res = ENC_CS_ISO8859_8; }
1984  else if(!strcmp(buf, "L8")) { res = ENC_CS_ISO8859_8; }
1985  else if(!strcmp(buf, "ISO-IR-138")) { res = ENC_CS_ISO8859_8; }
1986  else if(!strcmp(buf, "HEBREW")) { res = ENC_CS_ISO8859_8; }
1987  else if(!strcmp(buf, "CSISOLATINHEBREW")) { res = ENC_CS_ISO8859_8; }
1988  /* ISO 8859-9 */
1989  else if(!strcmp(buf, "ISO_8859-9:1989")) { res = ENC_CS_ISO8859_9; }
1990  else if(!strcmp(buf, "ISO_8859-9")) { res = ENC_CS_ISO8859_9; }
1991  else if(!strcmp(buf, "ISO-IR-148")) { res = ENC_CS_ISO8859_9; }
1992  else if(!strcmp(buf, "CSISOLATIN5")) { res = ENC_CS_ISO8859_9; }
1993  else if(!strcmp(buf, "LATIN5")) { res = ENC_CS_ISO8859_9; }
1994  else if(!strcmp(buf, "L5")) { res = ENC_CS_ISO8859_9; }
1995  /* ISO 8859-10 */
1996  else if(!strcmp(buf, "ISO_8859-10:1992")) { res = ENC_CS_ISO8859_10; }
1997  else if(!strcmp(buf, "ISO-IR-157")) { res = ENC_CS_ISO8859_10; }
1998  else if(!strcmp(buf, "CSISOLATIN6")) { res = ENC_CS_ISO8859_10; }
1999  else if(!strcmp(buf, "LATIN6")) { res = ENC_CS_ISO8859_10; }
2000  else if(!strcmp(buf, "L6")) { res = ENC_CS_ISO8859_10; }
2001  /* ISO 8859-11 */
2002  else if(!strcmp(buf, "ISO_8859-11")) { res = ENC_CS_ISO8859_11; }
2003  else if(!strcmp(buf, "CSTIS620")) { res = ENC_CS_ISO8859_11; }
2004  /* ISO 8859-13 */
2005  else if(!strcmp(buf, "CSISO885913")) { res = ENC_CS_ISO8859_13; }
2006  /* ISO 8859-14 */
2007  else if(!strcmp(buf, "ISO_8859-14:1998")) { res = ENC_CS_ISO8859_14; }
2008  else if(!strcmp(buf, "ISO_8859-14")) { res = ENC_CS_ISO8859_14; }
2009  else if(!strcmp(buf, "ISO-IR-199")) { res = ENC_CS_ISO8859_14; }
2010  else if(!strcmp(buf, "CSISO885914")) { res = ENC_CS_ISO8859_14; }
2011  else if(!strcmp(buf, "ISO-CELTIC")) { res = ENC_CS_ISO8859_14; }
2012  else if(!strcmp(buf, "LATIN8")) { res = ENC_CS_ISO8859_14; }
2013  else if(!strcmp(buf, "L8")) { res = ENC_CS_ISO8859_14; }
2014  /* ISO 8859-15 */
2015  else if(!strcmp(buf, "ISO_8859-15")) { res = ENC_CS_ISO8859_15; }
2016  else if(!strcmp(buf, "CSISO885915")) { res = ENC_CS_ISO8859_15; }
2017  else if(!strcmp(buf, "LATIN9")) { res = ENC_CS_ISO8859_15; }
2018  /* ISO 8859-16 */
2019  else if(!strcmp(buf, "ISO_8859-16:2001")) { res = ENC_CS_ISO8859_16; }
2020  else if(!strcmp(buf, "ISO_8859-16")) { res = ENC_CS_ISO8859_16; }
2021  else if(!strcmp(buf, "ISO-IR-226")) { res = ENC_CS_ISO8859_16; }
2022  else if(!strcmp(buf, "CSISO885916")) { res = ENC_CS_ISO8859_16; }
2023  else if(!strcmp(buf, "LATIN10")) { res = ENC_CS_ISO8859_16; }
2024  else if(!strcmp(buf, "L10")) { res = ENC_CS_ISO8859_16; }
2025 
2026  /* Check for official IANA aliases of Windows codepages */
2027  /* Windows-1250 */
2028  else if(!strcmp(buf, "CSWINDOWS1250")) { res = ENC_CS_WINDOWS_1250; }
2029  /* Windows-1251 */
2030  else if(!strcmp(buf, "CSWINDOWS1251")) { res = ENC_CS_WINDOWS_1251; }
2031  /* Windows-1252 */
2032  else if(!strcmp(buf, "CSWINDOWS1252")) { res = ENC_CS_WINDOWS_1252; }
2033  /* Windows-1253 */
2034  else if(!strcmp(buf, "CSWINDOWS1253")) { res = ENC_CS_WINDOWS_1253; }
2035  /* Windows-1254 */
2036  else if(!strcmp(buf, "CSWINDOWS1254")) { res = ENC_CS_WINDOWS_1254; }
2037  /* Windows-1255 */
2038  else if(!strcmp(buf, "CSWINDOWS1255")) { res = ENC_CS_WINDOWS_1255; }
2039  /* Windows-1256 */
2040  else if(!strcmp(buf, "CSWINDOWS1256")) { res = ENC_CS_WINDOWS_1256; }
2041  /* Windows-1257 */
2042  else if(!strcmp(buf, "CSWINDOWS1257")) { res = ENC_CS_WINDOWS_1257; }
2043  /* Windows-1258 */
2044  else if(!strcmp(buf, "CSWINDOWS1258")) { res = ENC_CS_WINDOWS_1258; }
2045 
2046  /* Check for official IANA aliases of KOI8 codepages */
2047  else if(!strcmp(buf, "CSKOI8R")) { res = ENC_CS_KOI8R; }
2048  else if(!strcmp(buf, "CSKOI8U")) { res = ENC_CS_KOI8U; }
2049 
2050  /* Check for official IANA aliases of Macintosh */
2051  else if(!strcmp(buf, "CSMACINTOSH")) { res = ENC_CS_MACINTOSH; }
2052  else if(!strcmp(buf, "MAC")) { res = ENC_CS_MACINTOSH; }
2053 
2054  /* Check for official IANA aliases of IBM codepages */
2055  /* IBM437 */
2056  else if(!strcmp(buf, "CSPC8CODEPAGE437")) { res = ENC_CS_IBM437; }
2057  else if(!strcmp(buf, "CP437")) { res = ENC_CS_IBM437; }
2058  else if(!strcmp(buf, "437")) { res = ENC_CS_IBM437; }
2059  /* IBM775 */
2060  else if(!strcmp(buf, "CSPC775BALTIC")) { res = ENC_CS_IBM775; }
2061  else if(!strcmp(buf, "CP775")) { res = ENC_CS_IBM775; }
2062  /* IBM850 */
2063  else if(!strcmp(buf, "CSPC850MULTILINGUAL")) { res = ENC_CS_IBM850; }
2064  else if(!strcmp(buf, "CP850")) { res = ENC_CS_IBM850; }
2065  else if(!strcmp(buf, "850")) { res = ENC_CS_IBM850; }
2066  /* IBM852 */
2067  else if(!strcmp(buf, "CSPCP852")) { res = ENC_CS_IBM852; }
2068  else if(!strcmp(buf, "CP852")) { res = ENC_CS_IBM852; }
2069  else if(!strcmp(buf, "852")) { res = ENC_CS_IBM852; }
2070  /* IBM00858 */
2071  else if(!strcmp(buf, "PC-MULTILINGUAL-850+EURO"))
2072  { res = ENC_CS_IBM858; }
2073  else if(!strcmp(buf, "CSIBM00858")) { res = ENC_CS_IBM858; }
2074  else if(!strcmp(buf, "CCSID00858")) { res = ENC_CS_IBM858; }
2075  else if(!strcmp(buf, "CP00858")) { res = ENC_CS_IBM858; }
2076  /* ISO 2022-JP */
2077  else if(!strcmp(buf, "CSISO2022JP")) { res = ENC_CS_ISO2022_JP; }
2078 
2079  /* -------------------------------------------------------------------- */
2080  /* To be more tolerant: Check again for invalid ISO 8859 declarations */
2081  else if(!strcmp(buf, "ISO8859-1"))
2082  { isoinv = 1; res = ENC_CS_ISO8859_1; }
2083  else if(!strcmp(buf, "ISO8859-2"))
2084  { isoinv = 1; res = ENC_CS_ISO8859_2; }
2085  else if(!strcmp(buf, "ISO8859-3"))
2086  { isoinv = 1; res = ENC_CS_ISO8859_3; }
2087  else if(!strcmp(buf, "ISO8859-4"))
2088  { isoinv = 1; res = ENC_CS_ISO8859_4; }
2089  else if(!strcmp(buf, "ISO8859-5"))
2090  { isoinv = 1; res = ENC_CS_ISO8859_5; }
2091  else if(!strcmp(buf, "ISO8859-6"))
2092  { isoinv = 1; res = ENC_CS_ISO8859_6; }
2093  else if(!strcmp(buf, "ISO8859-7"))
2094  { isoinv = 1; res = ENC_CS_ISO8859_7; }
2095  else if(!strcmp(buf, "ISO8859-8"))
2096  { isoinv = 1; res = ENC_CS_ISO8859_8; }
2097  else if(!strcmp(buf, "ISO8859-9"))
2098  { isoinv = 1; res = ENC_CS_ISO8859_9; }
2099  else if(!strcmp(buf, "ISO8859-10"))
2100  { isoinv = 1; res = ENC_CS_ISO8859_10; }
2101  else if(!strcmp(buf, "ISO-8859-11"))
2102  { isoinv = 1; res = ENC_CS_ISO8859_11; }
2103  else if(!strcmp(buf, "ISO8859-11"))
2104  { isoinv = 1; res = ENC_CS_ISO8859_11; }
2105  else if(!strcmp(buf, "ISO8859-13"))
2106  { isoinv = 1; res = ENC_CS_ISO8859_13; }
2107  else if(!strcmp(buf, "ISO8859-14"))
2108  { isoinv = 1; res = ENC_CS_ISO8859_14; }
2109  else if(!strcmp(buf, "ISO8859-15"))
2110  { isoinv = 1; res = ENC_CS_ISO8859_15; }
2111  else if(!strcmp(buf, "ISO8859-16"))
2112  { isoinv = 1; res = ENC_CS_ISO8859_16; }
2113 
2114  /* To be more tolerant: Check again for invalid MACINTOSH declarations */
2115  else if(!strcmp(buf, "MAC")) { macinv = 1; res = ENC_CS_MACINTOSH; }
2116  else if(!strcmp(buf, "MACROMAN"))
2117  { macinv = 1; res = ENC_CS_MACINTOSH; }
2118  else if(!strcmp(buf, "X-MAC-ROMAN"))
2119  { macinv = 1; res = ENC_CS_MACINTOSH; }
2120 
2121  /* To be more tolerant: Check again for invalid IBM declarations */
2122  else if(!strcmp(buf, "CP-437")) { ibminv = 1; res = ENC_CS_IBM437; }
2123  else if(!strcmp(buf, "IBM858")) { ibminv = 1; res = ENC_CS_IBM858; }
2124  else if(!strcmp(buf, "CP858")) { ibminv = 1; res = ENC_CS_IBM858; }
2125  else if(!strcmp(buf, "CP1250"))
2126  { ibminv = 1; res = ENC_CS_WINDOWS_1250; }
2127  else if(!strcmp(buf, "CP1251"))
2128  { ibminv = 1; res = ENC_CS_WINDOWS_1251; }
2129  else if(!strcmp(buf, "CP1252"))
2130  { ibminv = 1; res = ENC_CS_WINDOWS_1252; }
2131  else if(!strcmp(buf, "CP1253"))
2132  { ibminv = 1; res = ENC_CS_WINDOWS_1253; }
2133  else if(!strcmp(buf, "CP1254"))
2134  { ibminv = 1; res = ENC_CS_WINDOWS_1254; }
2135  else if(!strcmp(buf, "CP1255"))
2136  { ibminv = 1; res = ENC_CS_WINDOWS_1255; }
2137  else if(!strcmp(buf, "CP1256"))
2138  { ibminv = 1; res = ENC_CS_WINDOWS_1256; }
2139  else if(!strcmp(buf, "CP1257"))
2140  { ibminv = 1; res = ENC_CS_WINDOWS_1257; }
2141  else if(!strcmp(buf, "CP1258"))
2142  { ibminv = 1; res = ENC_CS_WINDOWS_1258; }
2143 
2144  /* To be more tolerant: Check again for invalid UTF declarations */
2145  else if(!strcmp(buf, "UTF7"))
2146  {
2147  PRINT_ERROR("MIME: Invalid character set UTF7 accepted as UTF-7");
2148  res = ENC_CS_UTF_7;
2149  }
2150  else if(!strcmp(buf, "UTF8"))
2151  {
2152  PRINT_ERROR("MIME: Invalid character set UTF8 accepted as UTF-8");
2153  res = ENC_CS_UTF_8;
2154  }
2155  /* -------------------------------------------------------------------- */
2156 
2157  /* Check whether character set is supported */
2158  if(ENC_CS_UNKNOWN == res)
2159  {
2160  l = strlen(MAIN_ERR_PREFIX) + strlen(not_supported) + len;
2161  p = (char*) api_posix_malloc(++l);
2162  if(NULL != p)
2163  {
2164  /* No => Character set not supported */
2165  strcpy(p, MAIN_ERR_PREFIX);
2166  strcat(p, not_supported);
2167  strncat(p, buf, len);
2168  print_error(p);
2169  api_posix_free((void*) p);
2170  }
2171  /* Special check for ISO 8859-x character sets that aren't supported */
2172  buf[8] = 0;
2173  if(!strcmp(buf, "ISO-8859")) { res = ENC_CS_ISO8859_X; }
2174  /* To be more tolerant: Check again for invalid ISO 8859 declaration */
2175  if(!strcmp(buf, "ISO8859")) { isoinv = 1; res = ENC_CS_ISO8859_X; }
2176  }
2177  if(isoinv)
2178  {
2179  PRINT_ERROR("MIME: Invalid ISO 8859 character set accepted");
2180  }
2181  else if(macinv)
2182  {
2183  PRINT_ERROR("MIME: Invalid Macintosh character set accepted");
2184  }
2185  else if(ibminv)
2186  {
2187  PRINT_ERROR("MIME: Invalid IBM codepage accepted");
2188  }
2189  }
2190 
2191  return(res);
2192 }
2193 
2194 
2195 /* ========================================================================== */
2196 /* Decode MIME quoted printable data
2197  *
2198  * \param[in] start Pointer to start of data
2199  * \param[in] end Pointer to end of data
2200  * \param[in] ec Flag to switch between normal and encoded-word syntax
2201  * \param[out] dlen Pointer to length of decoded data
2202  *
2203  * \e end must never be smaller than \e start and must point to the location
2204  * after the last character.
2205  *
2206  * \attention
2207  * If \e ec is true, the syntax is switched to that for encoded-words according
2208  * to RFC 2047.
2209  *
2210  * According to RFC 2045 the following rules are applied:
2211  * - Whitespace at end of lines must be ignored => We do so.
2212  * - A robust decoder may exclude invalid input data and continue
2213  * => We do so by decoding all invalid characters as '?'.
2214  * - The hexadecimal representation of a character must use upper case letters
2215  * A decoder is allowed to accept lower case letters too => We do so.
2216  * - If an invalid sequence follows a '=' character, it is allowed to accept
2217  * this data as plain ASCII => We do so.
2218  * - Lines are not allowed to be longer than 76 characters, but it is allowed
2219  * that a decoder accept lines of arbitrary length => We do so.
2220  *
2221  * \note
2222  * A NUL termination is always appended to the decoded data (but not calculated
2223  * for \e dlen ). This means that if the decoded data is text, the result buffer
2224  * can be directly used as C string.
2225  *
2226  * \result
2227  * - Pointer to new memory block containing the decoded data
2228  * - NULL on error (Value at location \e dlen is not valid)
2229  */
2230 
2231 static char* enc_mime_decode_qp(const char* start, const char* end,
2232  int ec, size_t* dlen)
2233 {
2234  char* res = NULL;
2235  size_t len;
2236  size_t bi = 0;
2237  size_t i;
2238  char* src = NULL;
2239  size_t ws = API_POSIX_SIZE_MAX;
2240  char* tmp = NULL;
2241  char* p;
2242  char current;
2243  unsigned char c;
2244  int state = 0;
2245  char nibble_high = 0;
2246  int v;
2247  int invalid;
2248 
2249  /* Delete whitespace at end of lines */
2250  len = (size_t) (end - start);
2251  p = api_posix_malloc(len + (size_t) 1);
2252  if(NULL == p) { return(NULL); }
2253  else
2254  {
2255  src = p;
2256  for(i = 0; i < len; ++i)
2257  {
2258  /* Check for EOL */
2259  if((char) 0x0A == start[i] && i)
2260  {
2261  if((char) 0x0D == start[i - (size_t) 1] && API_POSIX_SIZE_MAX != ws)
2262  {
2263  /* Seek back to remove whitespace */
2264  bi = ws;
2265  src[bi++] = 0x0D;
2266  }
2267  }
2268  /* Check for whitespace */
2269  if((char) 0x09 == start[i] || (char) 0x20 == start[i])
2270  {
2271  if(API_POSIX_SIZE_MAX == ws) { ws = bi; }
2272  }
2273  else if((char) 0x0D != start[i]) { ws = API_POSIX_SIZE_MAX; }
2274  src[bi++] = start[i];
2275  }
2276  /* Terminate string in source buffer */
2277  src[bi] = 0;
2278  /* Reassign start and end pointers */
2279  start = src;
2280  end = &src[bi];
2281  }
2282 
2283  /* Decode data */
2284  len = 0;
2285  bi = 0;
2286  for(i = 0; i < (size_t) (end - start); ++i)
2287  {
2288  /* Allocate more memory in exponentially increasing chunks */
2289  /* Attention: An invalid QP sequence stays undecoded and 3 octets long! */
2290  if(bi + (size_t) 4 >= len) /* We need (3 + NUL) additional bytes */
2291  {
2292  if(!len) { len = 64; }
2293  p = api_posix_realloc((void*) tmp, len *= (size_t) 2);
2294  if(NULL == p)
2295  {
2296  api_posix_free((void*) tmp);
2297  tmp = NULL;
2298  break;
2299  }
2300  else { tmp = p; }
2301  }
2302  /* Parse current character */
2303  current = start[i];
2304  /* Only printable ASCII characters, SPACE, HT, LF and CR are allowed */
2305  invalid = 0;
2306  v = (int) current;
2307  if(!((9 <= v && 10 >= v) || 13 == v || (32 <= v && 126 >= v)))
2308  {
2309  invalid = 1;
2310  }
2311  /* SPACE and HT are not allowed in encoded-words */
2312  if(ec && !invalid && (9 == v || 32 == v)) { invalid = 1; }
2313  if(invalid)
2314  {
2315  /* Invalid character detected */
2316  PRINT_ERROR("MIME: Decoding invalid quoted printable data");
2317  current = '?';
2318  }
2319  /* Equal sign sequence decoder state machine */
2320  c = 0;
2321  if(!state && '=' == current) { ++state; }
2322  switch(state)
2323  {
2324  case 1:
2325  {
2326  /* Skip equal sign */
2327  ++state;
2328  break;
2329  }
2330  case 2:
2331  {
2332  /* Store CR of soft line break or high nibble of encoded octet */
2333  nibble_high = current;
2334  ++state;
2335  break;
2336  }
2337  case 3:
2338  {
2339  /* SPACE and HT at end of line must be ignored */
2340  if( !ec &&
2341  ((char) 0x09 == nibble_high || (char) 0x20 == nibble_high) )
2342  {
2343  if((char) 0x09 == current || (char) 0x20 == current) { break; }
2344  else if((char) 0x0D == current)
2345  {
2346  nibble_high = current;
2347  break;
2348  }
2349  }
2350  ++state;
2351  /* No break here is intended! */
2352  }
2353  /* FALLTHROUGH */
2354  case 4:
2355  {
2356  state = 0;
2357  /* Check for soft line break */
2358  if(!ec && (char) 0x0D == nibble_high && (char) 0x0A == current)
2359  {
2360  /* printf("Soft line break\n"); */
2361  break;
2362  }
2363  /* Decode octet */
2364  invalid = 0;
2365  v = enc_hex_decode_nibble(nibble_high);
2366  if(0 > v) { invalid = 1; }
2367  else
2368  {
2369  c = (unsigned char) (v * 16);
2370  v = enc_hex_decode_nibble(current);
2371  if(0 > v) { invalid = 1; }
2372  else { c += (unsigned char) v; }
2373  }
2374  if(invalid)
2375  {
2376  /* Invalid encoding => Accept data as ASCII */
2377  PRINT_ERROR("MIME: Invalid quoted printable encoded data");
2378  tmp[bi++] = '=';
2379  tmp[bi++] = nibble_high;
2380  c = (unsigned char) current;
2381  }
2382  break;
2383  }
2384  default:
2385  {
2386  /* Decode underscore to space */
2387  if(ec && '_' == current) { c = (unsigned char) 0x20; }
2388  else { c = (unsigned char) current; }
2389  break;
2390  }
2391  }
2392  if(c) { tmp[bi++] = (char) c; }
2393  }
2394 
2395  /* Terminate decoded data (for use as C string) */
2396  if(NULL == tmp) { res = NULL; }
2397  else
2398  {
2399  tmp[bi] = 0;
2400  /* Report length without the NUL termination */
2401  *dlen = bi;
2402  res = tmp;
2403  }
2404  api_posix_free((void*) src);
2405 
2406  return(res);
2407 }
2408 
2409 
2410 /* ========================================================================== */
2411 /* Convert MIME quoted printable encoded text to Unicode (UTF-8 NFC)
2412  *
2413  * \param[in] charset Character set of data
2414  * \param[in] start Pointer to start of data
2415  * \param[in] end Pointer to end of data
2416  * \param[in] ec Flag to switch between normal and encoded-word syntax
2417  *
2418  * \e end must never be smaller than \e start and must point to the location
2419  * after the last character.
2420  *
2421  * \attention
2422  * If \e ec is true, the syntax is switched to that for encoded-words according
2423  * to RFC 2047.
2424  *
2425  * \result
2426  * - Pointer to new memory block containing the decoded data
2427  * - NULL on error
2428  */
2429 
2430 static const char* enc_mime_decode_q(enum enc_mime_cs charset,
2431  const char* start, const char* end,
2432  int ec)
2433 {
2434  const char* res = NULL;
2435  size_t len;
2436  char* tmp = NULL;
2437 
2438  tmp = enc_mime_decode_qp(start, end, ec, &len);
2439  if(NULL != tmp)
2440  {
2441  /* Convert result to Unicode and normalize to NFC */
2442  res = enc_convert_to_utf8_nfc(charset, tmp);
2443  if(tmp != res) { api_posix_free((void*) tmp); }
2444  }
2445 
2446  return(res);
2447 }
2448 
2449 
2450 /* ========================================================================== */
2451 /* Decode MIME base64 data
2452  *
2453  * \param[in] start Pointer to start of data
2454  * \param[in] end Pointer to end of data
2455  * \param[out] dlen Pointer to length of decoded data
2456  *
2457  * \e end must never be smaller than \e start and must point to the location
2458  * after the last character.
2459  *
2460  * \note
2461  * A NUL termination is always appended to the decoded data (but not calculated
2462  * for \e dlen ). This means that if the decoded data is text, the result buffer
2463  * can be directly used as C string.
2464  *
2465  * \result
2466  * - Pointer to new memory block containing the decoded data
2467  * - NULL on error (Value at location \e dlen is not valid)
2468  */
2469 
2470 static char* enc_mime_decode_base64(const char* start, const char* end,
2471  size_t* dlen)
2472 {
2473  const unsigned char* in = (const unsigned char*) start;
2474  size_t len_in = end - start;
2475  unsigned char* out = NULL;
2476  size_t len_out = BXX0_BASE64_DECODE_LEN_OUT(len_in);
2477  unsigned char flags = BXX0_BASE64_DECODE_FLAG_IGNORE; /* Mandatory */
2478 
2479  {
2480  /* Allocate an additional byte for NUL termination */
2481  unsigned char* p = api_posix_malloc(len_out + (size_t) 1);
2482 
2483  if(NULL == p)
2484  {
2485  PRINT_ERROR("MIME: Base 64: Memory allocation for decoder failed");
2486  return NULL;
2487  }
2488  out = p;
2489  }
2490 
2491  /* Flags for error tolerance ("be liberal in what you accept from others") */
2492  flags |= BXX0_BASE64_DECODE_FLAG_NOPAD;
2493  flags |= BXX0_BASE64_DECODE_FLAG_CONCAT;
2494  flags |= BXX0_BASE64_DECODE_FLAG_INVTAIL;
2495 
2496  {
2497  size_t len_out_orig = len_out;
2498  signed char rv = bxx0_base64_decode(out, &len_out, in, &len_in, flags);
2499 
2500  if(0 > rv)
2501  {
2502  /* Error */
2503  if(BXX0_BASE64_DECODE_ERROR_SIZE == rv)
2504  PRINT_ERROR("MIME: Base 64: Error: Output buffer too small (bug)");
2505  else if(BXX0_BASE64_DECODE_ERROR_TAIL == rv)
2506  PRINT_ERROR("MIME: Base 64: Error: Invalid tail before padding");
2507  else if(BXX0_BASE64_DECODE_ERROR_PAD == rv)
2508  PRINT_ERROR("MIME: Base 64: Error: Invalid padding");
2509  else if(BXX0_BASE64_DECODE_ERROR_DAP == rv)
2510  PRINT_ERROR("MIME: Base 64: Error: Data after padding");
2511  else
2512  PRINT_ERROR("MIME: Base 64: Error: Unknown error");
2513 
2514  api_posix_free((void*) out);
2515  return NULL;
2516  }
2517 
2518  if(0 < rv)
2519  {
2520  /* Warning */
2521  if(BXX0_BASE64_DECODE_FLAG_INVTAIL & rv)
2522  PRINT_ERROR("MIME: Base 64: Warning: "
2523  "Unused bits with nonzero value in tail");
2524  else if(BXX0_BASE64_DECODE_FLAG_CONCAT & rv)
2525  PRINT_ERROR("MIME: Base 64: Warning: "
2526  "Accepted additional data after correctly padded tail");
2527  }
2528 
2529  if(0 != len_in)
2530  {
2531  /* Error */
2532  PRINT_ERROR("MIME: Base 64: Error: Decoding data aborted (bug)");
2533  api_posix_free((void*) out);
2534  return NULL;
2535  }
2536 
2537  *dlen = len_out_orig - len_out;
2538 
2539  /* NUL termination */
2540  out[*dlen] = 0;
2541  }
2542 
2543  return (char*) out;
2544 }
2545 
2546 
2547 /* ========================================================================== */
2548 /* Convert MIME base64 encoded text to Unicode (UTF-8)
2549  *
2550  * \param[in] charset Character set of data
2551  * \param[in] start Pointer to start of data
2552  * \param[in] end Pointer to end of data
2553  *
2554  * \e end must never be smaller than \e start and must point to the location
2555  * after the last character.
2556  *
2557  * \result
2558  * - Pointer to new memory block containing the decoded data
2559  * - NULL on error
2560  */
2561 
2562 static const char* enc_mime_decode_b(enum enc_mime_cs charset,
2563  const char* start, const char* end)
2564 {
2565  const char* res = NULL;
2566  size_t len = 0;
2567  char* tmp = NULL;
2568 
2569  tmp = enc_mime_decode_base64(start, end, &len);
2570  if(NULL != tmp)
2571  {
2572  /* Convert result to Unicode and normalize to NFC */
2573  res = enc_convert_to_utf8_nfc(charset, tmp);
2574  if(tmp != res) { api_posix_free((void*) tmp); }
2575  }
2576 
2577  return(res);
2578 }
2579 
2580 
2581 /* ========================================================================== */
/* Check for leap year in terms of gregorian calendar
 *
 * \param[in] year  Year to check
 *
 * A year is a leap year if it is divisible by 4, except for century years,
 * which must be divisible by 400.
 *
 * \return
 * - 0 if \e year is not a leap year
 * - Nonzero if \e year is a leap year
 */

static int  enc_check_leap_year(unsigned int  year)
{
   /* Century years follow the 400 year rule */
   if(0U == year % 100U)  { return(0U == year % 400U); }

   return(0U == year % 4U);
}
2594 
2595 
2596 /* ========================================================================== */
2597 /* Encode date and time to POSIX timestamp (seconds since epoche)
2598  *
2599  * \param[out] pts Pointer to seconds since epoche (as defined by POSIX.1)
2600  * \param[in] year Years
2601  * \param[in] month Months
2602  * \param[in] day Days
2603  * \param[in] hour Hours
2604  * \param[in] minute Minutes
2605  * \param[in] seconds Seconds
2606  * \param[in] zone Timezone correction in minutes (Zero for UTC)
2607  *
2608  * \attention
2609  * This function accepts no timestamps before the epoche (the Usenet has not
2610  * existed yet at that time).
2611  *
2612  * On error, zero is written to \e pts .
2613  */
2614 
2615 static int enc_encode_posix_timestamp(core_time_t* pts, unsigned int year,
2616  unsigned int month, unsigned int day,
2617  unsigned int hour, unsigned int minute,
2618  unsigned int second, int zone)
2619 {
2620  static const unsigned int dom[12] = { 31U, 29U, 31U, 30U, 31U, 30U,
2621  31U, 31U, 30U, 31U, 30U, 31U };
2622  int res = -1;
2623  core_time_t ts = 0;
2624  core_time_t zone_seconds;
2625  unsigned int i;
2626 
2627  /* Clamp year down to 1970 */
2628  if(1970U <= year)
2629  {
2630  /* Check for 'core_time_t' overflow (leave at least one year) */
2631  if(2104U < year)
2632  {
2633  PRINT_ERROR("Warning: core_time_t overflow while decoding timestamp");
2634  year = 2104U;
2635  }
2636  for(i = 1970U; i < year; ++i)
2637  {
2638  ts += (core_time_t) 365 * (core_time_t) 86400;
2639  /* Add an additional day for leap years */
2640  if(enc_check_leap_year(i)) { ts += (core_time_t) 86400; }
2641  }
2642  for(i = 0; i < month - 1U; ++i)
2643  {
2644  ts += (core_time_t) dom[i] * (core_time_t) 86400;
2645  /* Subtract one day if current year is not a leap year */
2646  if(1U == i && !enc_check_leap_year(year))
2647  {
2648  ts -= (core_time_t) 86400;
2649  }
2650  }
2651  ts += (core_time_t) (day - 1U) * (core_time_t) 86400;
2652  ts += (core_time_t) hour * (core_time_t) 3600;
2653  ts += (core_time_t) minute * (core_time_t) 60;
2654  ts += (core_time_t) second;
2655  if(0 > zone)
2656  {
2657  zone_seconds = (core_time_t) -zone * (core_time_t) 60;
2658  ts += zone_seconds;
2659  res = 0;
2660  }
2661  else
2662  {
2663  zone_seconds = (core_time_t) zone * (core_time_t) 60;
2664  if(ts >= zone_seconds)
2665  {
2666  ts -= zone_seconds;
2667  res = 0;
2668  }
2669  }
2670  }
2671 
2672  /* Store result */
2673  if(res)
2674  {
2675  PRINT_ERROR("Encoding POSIX timestamp failed");
2676  *pts = 0;
2677  }
2678  else { *pts = ts; }
2679 
2680 #if 0
2681  if(!res)
2682  {
2683  /* For debugging (not thread safe) */
2684  printf("Seconds : %lu\n", (long int) ts);
2685  struct tm* t;
2686  t = gmtime((api_posix_time_t*) &ts);
2687  printf("Conv. UTC: %04d-%02d-%02d %02d:%02d:%02d\n",
2688  t->tm_year + 1900, t->tm_mon + 1, t->tm_mday,
2689  t->tm_hour, t->tm_min, t->tm_sec);
2690  }
2691 #endif
2692 
2693  return(res);
2694 }
2695 
2696 
2697 /* ========================================================================== */
/* Check RFC 5322 atom
 *
 * \param[in] s  Pointer to single character
 *
 * Only the character at location \e s is checked against the \c atext set
 * (ASCII letters, decimal digits and the special characters listed below).
 *
 * \return
 * - 0 if character pointed to by \e s is allowed
 * - Negative value on error
 */

static int  enc_check_atom(const char*  s)
{
   const int  c = (int) *s;

   /* Decimal digits, upper case and lower case letters */
   if(   (0x30 <= c && 0x39 >= c)
      || (0x41 <= c && 0x5A >= c)
      || (0x61 <= c && 0x7A >= c) )
   {
      return(0);
   }

   /* Special characters of 'atext' */
   switch(c)
   {
      case '!':
      case '#':
      case '$':
      case '%':
      case '&':
      case 0x27:  /* Apostrophe */
      case '*':
      case '+':
      case '-':
      case '/':
      case '=':
      case '?':
      case '^':
      case '_':
      case '`':
      case '{':
      case '|':
      case '}':
      case '~':
      {
         return(0);
      }
      default:
      {
         break;
      }
   }

   return(-1);
}
2738 
2739 
2740 /* ========================================================================== */
/* Check RFC 5322 dot-atom
 *
 * \param[in] s  Pointer to single character
 *
 * A dot is accepted in addition to the characters accepted for an atom.
 * The position of the dot is not checked here.
 *
 * \return
 * - 0 if character pointed to by \e s is allowed
 * - Negative value on error
 */

static int  enc_check_dotatom(const char*  s)
{
   /* Allow dot, everything else must be atext */
   return(('.' == *s) ? 0 : enc_check_atom(s));
}
2760 
2761 
2762 /* ========================================================================== */
2763 /* Encode words in display-name
2764  *
2765  * Words containing only 7 bit characters are encoded to atom or quoted-string.
2766  * Words containing 8 bit characters are preserved unchanged for MIME encoder.
2767  *
2768  * \param[in,out] s Pointer to data buffer
2769  */
2770 
2771 static void enc_encode_dispname(char* s)
2772 {
2773  size_t i = 0;
2774  char buf[ENC_HDR_BUFSIZE + (size_t) 1];
2775  size_t bi = 0;
2776  char word[ENC_HDR_BUFSIZE + (size_t) 1];
2777  char* w;
2778  size_t word_len;
2779  int last_word = 0;
2780  size_t ii;
2781  int atom;
2782  size_t start;
2783  int error;
2784  char cbuf[2];
2785 
2786  while(s[i])
2787  {
2788  /* Extract next word */
2789  ii = i; while(' ' == s[ii]) { ++ii; }
2790  w = strchr(&s[ii], (int) ' ');
2791  if(NULL == w)
2792  {
2793  word_len = strlen(&s[i]);
2794  last_word = 1;
2795  }
2796  else { word_len = (size_t) (w - &s[i]); }
2797  if(ENC_HDR_BUFSIZE < word_len) { word[0] = 0; }
2798  else
2799  {
2800  memcpy((void*) word, (void*) &s[i], word_len);
2801  word[word_len] = 0;
2802  }
2803  i += word_len;
2804  if(!last_word) { ++i; } /* Skip SP delimiter */
2805 
2806  /* Check word */
2807  atom = 1;
2808  ii = 0;
2809  while(word[ii])
2810  {
2811  if(0x80U <= (unsigned int) (unsigned char) word[ii])
2812  {
2813  atom = 1;
2814  break;
2815  }
2816  if(enc_check_atom(&word[ii])) { atom = 0; }
2817  ++ii;
2818  }
2819 
2820  /* SP delimiter between words */
2821  if(bi)
2822  {
2823  if(ENC_HDR_BUFSIZE <= bi) { break; } else { buf[bi++] = ' '; }
2824  }
2825 
2826  /* Copy data to buffer */
2827  if(atom)
2828  {
2829  if(ENC_HDR_BUFSIZE - bi < word_len) { break; }
2830  else
2831  {
2832  memcpy((void*) &buf[bi], (void*) word, word_len);
2833  bi += word_len;
2834  }
2835  }
2836  else
2837  {
2838  /* Create quoted-string */
2839  start = bi;
2840  error = 0;
2841  /* Leading DQUOTE delimiter */
2842  if(ENC_HDR_BUFSIZE <= bi) { error = 1; } else { buf[bi++] = '"'; }
2843  /* Process data */
2844  for(ii = 0; ii < word_len; ++ii)
2845  {
2846  /* Skip control characters */
2847  cbuf[0] = word[ii]; cbuf[1] = 0;
2848  if(enc_ascii_check_printable(cbuf)) { continue; }
2849  /* Check remaining buffer size */
2850  if(ENC_HDR_BUFSIZE - bi < (size_t) 2) { error = 1; break; }
2851  /* Check whether quoted pair is required */
2852  if('"' == word[ii] || 0x5C == (int) word[ii]) { buf[bi++] = 0x5C; }
2853  buf[bi++] = word[ii];
2854  }
2855  /* Trailing DQUOTE delimiter */
2856  if(ENC_HDR_BUFSIZE <= bi) { error = 1; } else { buf[bi++] = '"'; }
2857  if(error) { bi = start; }
2858  }
2859  if(last_word) { break; }
2860  }
2861  /* Terminate buffer */
2862  buf[bi] = 0;
2863  /* Copy data back to callers buffer */
2864  strncpy(s, buf, ++bi);
2865 
2866  return;
2867 }
2868 
2869 
2870 /* ========================================================================== */
2871 /* Decode MIME parameter percent encoding
2872  *
2873  * \param[in] buf Pointer to data buffer
2874  * \param[in] cs IANA name of character set
2875  *
2876  * This function decodes the percent encoding defined for MIME parameters by
2877  * RFC 2231. The IANA name of the character set for the resulting octet stream
2878  * must be specified by \e cs . The data is converted to Unicode in UTF-8
2879  * representation with NFC normalization.
2880  *
2881  * \return
2882  * - Pointer to decoded data (a new memory block was allocated)
2883  * - NULL on error
2884  */
2885 
2886 static char* enc_mime_decode_parameter(const char* buf, const char* cs)
2887 {
2888  char* res = NULL;
2889  char* tmp = NULL;
2890  const char* tmp2 = NULL;
2891  size_t len;
2892  int rv;
2893  enum enc_mime_cs charset;
2894 
2895  if(NULL != buf)
2896  {
2897  /* Percent decoder */
2898  len = strlen(buf);
2899  tmp = (char*) api_posix_malloc(++len);
2900  if(NULL != tmp)
2901  {
2902  memcpy((void*) tmp, (void*) buf, len);
2903  if(enc_ascii_check_printable(tmp))
2904  {
2905  PRINT_ERROR("MIME: Nonprintable characters in parameter");
2906  }
2907  else
2908  {
2909  rv = enc_percent_decode(tmp, 1);
2910  if(0 > rv)
2911  {
2912  PRINT_ERROR("MIME: Percent encoding failed for parameter");
2913  }
2914  else
2915  {
2916  charset = enc_mime_get_charset(cs, strlen(cs));
2917  tmp2 = enc_convert_to_utf8_nfc(charset, tmp);
2918  if(NULL == tmp2)
2919  {
2920  PRINT_ERROR("MIME: Parameter charset not supported");
2921  }
2922  else
2923  {
2924  len = strlen(tmp2);
2925  res = (char*) api_posix_malloc(++len);
2926  if(NULL != res)
2927  {
2928  memcpy((void*) res, (void*) tmp2, len);
2929  }
2930  if(tmp != tmp2) { api_posix_free((void*) tmp2); }
2931  }
2932  }
2933  }
2934  }
2935  }
2936  api_posix_free((void*) tmp);
2937 
2938  return(res);
2939 }
2940 
2941 
2942 /* ========================================================================== */
2943 /*! \brief Create a "name-addr" construct according to RFC 5322
2944  *
2945  * This function is intended to create the "From" and "Reply-To" header fields.
2946  *
2947  * \param[in] data Input data
2948  * \param[in] offset Folding offset, e.g. \c sizeof("From: ")
2949  *
2950  * The input data must have the following format: \c name \c <addr-spec> .
2951  *
2952  * \attention
2953  * The \c addr-spec construct is not allowed to contain comments or quoted
2954  * strings. Both parts, \c name and \c <addr-spec> must fit on a single header
2955  * line of 998 characters. Note that \e offset adds to the length of \c name .
2956  *
2957  * \c name must be an Unicode identifier corresponding to \c addr-spec . If it
2958  * contains non-ASCII characters, it is converted to a valid \c display-name
2959  * token. The result will be folded according to RFC 2047.
2960  *
2961  * On success the caller is responsible to free the memory allocated for the
2962  * result.
2963  *
2964  * \return
2965  * - Pointer to encoded data (a new memory block was allocated)
2966  * - NULL on error
2967  */
2968 
2969 const char* enc_create_name_addr(const char* data, size_t offset)
2970 {
2971  const char* res = NULL;
2972  size_t len = 4; /* The space after name, the angle brackets and NUL */
2973  size_t i;
2974  size_t counter = 0;
2975  int error = 0;
2976  char c;
2977  char* buf;
2978  int rv;
2979  char name[(size_t) 2 * ENC_HDR_BUFSIZE + (size_t) 1];
2980  char addr_spec[ENC_HDR_BUFSIZE + (size_t) 1];
2981 
2982  /* Extract name and addr-spec parts from input data */
2983  if((size_t) 2 * ENC_HDR_BUFSIZE < strlen(data)) { error = 1; }
2984  else
2985  {
2986  strcpy(name, data);
2987  addr_spec[0] = 0;
2988  if(!strlen(name)) { error = 1; }
2989  else
2990  {
2991  i = 0;
2992  while(name[i])
2993  {
2994  if('<' == name[i])
2995  {
2996  if(NULL == strchr(&name[i + (size_t) 1], (int) '<'))
2997  {
2998  if(!i) { name[0] = 0; }
2999  else { name[i - (size_t) 1] = 0; }
3000  if(ENC_HDR_BUFSIZE < strlen(&name[i])) { error = 1; }
3001  else
3002  {
3003  strcpy(addr_spec, &name[++i]);
3004  i = strlen(addr_spec) - (size_t) 1;
3005  if('>' != addr_spec[i]) { addr_spec[0] = 0; }
3006  else { addr_spec[i] = 0; }
3007  }
3008  break;
3009  }
3010  }
3011  ++i;
3012  }
3013  }
3014  }
3015 
3016  /* Prepare display-name */
3017  if(!error)
3018  {
3019  enc_encode_dispname(name);
3020  len += strlen(name);
3021  }
3022 
3023  /* Check addr-spec */
3024  if(!error)
3025  {
3026  len += strlen(addr_spec);
3027  error = enc_ascii_check(addr_spec);
3028  if(!error)
3029  {
3030  i = 0;
3031  do
3032  {
3033  c = addr_spec[i]; if(!c) { break; }
3034  if('@' != c && enc_check_dotatom(&c))
3035  {
3036  /* Invalid dot-atom found */
3037  error = 1;
3038  }
3039  /* Handle "@" separator */
3040  else if('@' == c)
3041  {
3042  ++counter;
3043  if(!i || !addr_spec[i + (size_t) 1])
3044  {
3045  /* Invalid separator found (at beginning or end) */
3046  error = 1;
3047  }
3048  /* Verify that dot-atoms don't have dots at beginning or end */
3049  if('.' == addr_spec[i - (size_t) 1]
3050  || '.' == addr_spec[i + (size_t) 1])
3051  {
3052  /* Invalid dot-atom found */
3053  error = 1;
3054  }
3055  }
3056  /* Verify that dot-atoms don't have dots at beginning or end */
3057  if(!error && '.' == c)
3058  {
3059  if(!i || !addr_spec[i + (size_t) 1])
3060  {
3061  /* Invalid dot-atom found */
3062  error = 1;
3063  }
3064  }
3065  ++i;
3066  }
3067  while(!error);
3068  }
3069  /* Final checks */
3070  if(! (error || (size_t) 1 != counter || (size_t) 5 > strlen(addr_spec)) )
3071  {
3072  /* Allocate buffer */
3073  buf = (char*) api_posix_malloc(len);
3074  if(NULL != buf)
3075  {
3076  /* Copy name and add trailing space (if not empty string) */
3077  if(name[0])
3078  {
3079  strcpy(buf, name);
3080  strcat(buf, " <");
3081  }
3082  else { strcpy(buf, "<"); }
3083  /* Copy addr-spec between angle brackets */
3084  strcat(buf, addr_spec);
3085  strcat(buf, ">");
3086  /* MIME encoding */
3087  rv = enc_mime_word_encode(&res, buf, offset);
3088  if(0 >= rv) { api_posix_free((void*) buf); }
3089  /* For positive return value, 'buf' was assigned to 'res'! */
3090  }
3091  }
3092  }
3093 
3094  /* Check for error */
3095  if(error) { PRINT_ERROR("Creating name-addr construct failed"); }
3096 
3097  /* For code review: Do not 'free()' memory pointed to by 'buf' here! */
3098 
3099  return(res);
3100 }
3101 
3102 /* ========================================================================== */
/*! \brief Decode number of lines
 *
 * \param[in] lines  Number of lines
 *
 * \e lines must be a RFC 5536 conformant body of the (now obsolete) "Lines"
 * header field.
 *
 * Leading white space and a single plus sign are skipped. Decoding stops at
 * the first character that is not an ASCII digit, trailing data is ignored.
 * A value that is negative or too large for <tt>unsigned long int</tt> is
 * treated as an error.
 *
 * The string is parsed manually because \c sscanf() has undefined behaviour
 * if the value is not representable and silently wraps negative numbers.
 *
 * \return
 * - Number of lines
 * - 0 on error
 */

unsigned long int enc_lines_decode(const char* lines)
{
   unsigned long int res = 0;
   unsigned long int digit;
   size_t i = 0;
   int overflow = 0;

   /* Skip leading white space (SP, HT, LF, VT, FF and CR) */
   while(32 == (int) lines[i] || (9 <= (int) lines[i] && 13 >= (int) lines[i]))
   {
      ++i;
   }
   /* Tolerate an explicit plus sign */
   if('+' == lines[i]) { ++i; }

   /* Accumulate decimal digits (locale independent, based on ASCII) */
   while(48 <= (int) lines[i] && 57 >= (int) lines[i])
   {
      digit = (unsigned long int) ((int) lines[i] - 48);
      /* Check before the multiplication so that nothing can wrap around */
      if((ULONG_MAX - digit) / 10UL < res) { overflow = 1;  break; }
      res = res * 10UL + digit;
      ++i;
   }

   /* Without any digit 'res' is still zero here */
   if(overflow) { res = 0; }

   return(res);
}
3123 
3124 
3125 /* ========================================================================== */
/*! \brief Convert number of lines to string
 *
 * \param[out] l      Pointer to result buffer (at least 11 characters large)
 * \param[in]  l_raw  Number of lines
 *
 * \attention
 * The value of \e l_raw must be representable as decimal number with not more
 * than 10 digits. Otherwise the string \c "Error" is written to \e l instead.
 */

void enc_convert_lines_to_string(char* l, unsigned long int l_raw)
{
   const int written = api_posix_snprintf(l, 11, "%lu", l_raw);

   /* Negative: conversion failed, more than 10: number was truncated */
   if(!(0 <= written && 10 >= written)) { strcpy(l, "Error"); }
}
3151 
3152 
3153 /* ========================================================================== */
3154 /*! \brief Decode canonical timestamp to POSIX time (seconds since epoche)
3155  *
3156  * According to RFC 5322 all military timezones should be
3157  * treated as UTC because there was an error in RFC 822
3158  * => We do so and accept "Z" as valid because it means UTC
3159  *
3160  * \note
3161  * This function accepts no timestamps before the epoche (the Usenet has not
3162  * existed yet at that time).
3163  *
3164  * \param[in] timestamp RFC 5536 conformant timestamp string
3165  *
3166  * \return
3167  * - Seconds since epoche (as defined by POSIX.1)
3168  * - 0 on error
3169  */
3170 
3171 core_time_t enc_timestamp_decode(const char* timestamp)
3172 {
3173  static const char* months[12] = { "JAN", "FEB", "MAR", "APR", "MAY", "JUN",
3174  "JUL", "AUG", "SEP", "OCT", "NOV", "DEC" };
3175  static const unsigned int dom[12] = { 31U, 29U, 31U, 30U, 31U, 30U,
3176  31U, 31U, 30U, 31U, 30U, 31U };
3177  core_time_t res = 0;
3178  int error = 1;
3179  const char* p;
3180  const char* q;
3181  int rv;
3182  char m[4];
3183  char z[6];
3184  unsigned int zh = 0;
3185  unsigned int zm = 0;
3186  unsigned int i;
3187  unsigned int year;
3188  unsigned int month = 13U;
3189  unsigned int day;
3190  unsigned int hour = 0;
3191  unsigned int minute = 0;
3192  unsigned int second = 0;
3193  int zone = 1; /* Correction in minutes */
3194  int pos = 0;
3195 
3196 #if 0
3197  /* For debugging */
3198  printf("------------------------------------------------------------\n");
3199  printf("Timestamp: %s\n", timestamp);
3200 #endif
3201 
3202  /* Skip optional day-of-week */
3203  p = strchr(timestamp, (int) ',');
3204  if(NULL == p) { p = timestamp; } else { ++p; }
3205 
3206  /* Extract date */
3207  rv = sscanf(p, "%u %3c %u%n", &day, m, &year, &pos);
3208  if(3 != rv) { PRINT_ERROR("Invalid date in timestamp"); }
3209  else
3210  {
3211  /* Check for obsolete year format as defined by RFC 5322 */
3212  if(1000U > year)
3213  {
3214  if(50U > year) { year += 2000U; } else { year += 1900U; }
3215  }
3216  /* Decode month */
3217  m[0] = (char) toupper((int) m[0]);
3218  m[1] = (char) toupper((int) m[1]);
3219  m[2] = (char) toupper((int) m[2]);
3220  m[3] = 0;
3221  for(i = 0; i < 12; ++i)
3222  {
3223  if(!strcmp(months[i], m)) { month = i + 1U; break; }
3224  }
3225  if(13U <= month) { PRINT_ERROR("Invalid month in timestamp"); }
3226  else if(i < 12)
3227  {
3228  /* Check day */
3229  if(1U > day || dom[i] < day) { month = 13U; }
3230  if(13U > month)
3231  {
3232  if(2U == month && 29U == day)
3233  {
3234  /* Check for leap year in terms of gregorian calendar */
3235  if(!enc_check_leap_year(year)) { month = 13U; }
3236  }
3237  }
3238  if(13U <= month)
3239  {
3240  PRINT_ERROR("Invalid day of month in timestamp");
3241  }
3242  }
3243  }
3244 
3245  /* Extract time if date was found */
3246  if(13U > month)
3247  {
3248  p += pos;
3249  rv = sscanf(p, "%u : %u%n", &hour, &minute, &pos);
3250  if(2 != rv)
3251  {
3252  PRINT_ERROR("Invalid time in timestamp");
3253  }
3254  else
3255  {
3256  p += pos;
3257  q = strchr(p, (int) ':');
3258  if(NULL != q) { p = q; }
3259  rv = sscanf(p, ": %u%n", &second, &pos);
3260  if(1 == rv) { p += pos; }
3261  rv = sscanf(p, "%5s", z);
3262  z[5] = 0;
3263  if(1 != rv)
3264  {
3265  PRINT_ERROR("Missing timezone in timestamp");
3266  }
3267  else
3268  {
3269  /* Check time (accept leap second according to RFC 5322) */
3270  if(23U < hour || 59U < minute || 60U < second)
3271  {
3272  PRINT_ERROR("Invalid time in timestamp");
3273  }
3274  else
3275  {
3276  /* Decode timezone */
3277  if('+' == z[0] || '-' == z[0])
3278  {
3279  for(i = 1; i < 5; ++i)
3280  {
3281  if(enc_ascii_check_digit(&z[i])) { zone = 0; break; }
3282  }
3283  if(zone)
3284  {
3285  zh = ((unsigned int) z[1] - 0x30) * 10U;
3286  zh += ((unsigned int) z[2] - 0x30);
3287  zm = ((unsigned int) z[3] - 0x30) * 10U;
3288  zm += ((unsigned int) z[4] - 0x30);
3289  if(59U < zm) { zone = 0; }
3290  }
3291  if(!zone)
3292  {
3293  PRINT_ERROR("Invalid timezone in timestamp");
3294  }
3295  else
3296  {
3297  zone = (int) (zh * 60U + zm);
3298  if('-' == z[0]) { zone *= -1; }
3299  }
3300  }
3301  else
3302  {
3303  /* Check for obsolete timezone format */
3304  if(!strcmp("GMT", z)) { zone = 0; }
3305  else if(!strcmp("UT", z)) { zone = 0; }
3306  else if(!strcmp("EDT", z)) { zone = -4 * 60; }
3307  else if(!strcmp("EST", z)) { zone = -5 * 60; }
3308  else if(!strcmp("CDT", z)) { zone = -5 * 60; }
3309  else if(!strcmp("CST", z)) { zone = -6 * 60; }
3310  else if(!strcmp("MDT", z)) { zone = -6 * 60; }
3311  else if(!strcmp("MST", z)) { zone = -7 * 60; }
3312  else if(!strcmp("PDT", z)) { zone = -7 * 60; }
3313  else if(!strcmp("PST", z)) { zone = -8 * 60; }
3314  else if(!strcmp("Z", z)) { zone = 0; }
3315  else
3316  {
3317  zone = 0;
3318  PRINT_ERROR("Decode unknown timezone in timestamp as UTC");
3319  }
3320  }
3321 #if 0
3322  /* For debugging */
3323  printf("Decoded : %04u-%02u-%02u %02u:%02u:%02u %+d minutes\n",
3324  year, month, day, hour, minute, second, zone);
3325 #endif
3326  /* Decoding successful */
3327  error = 0;
3328  }
3329  }
3330  }
3331  }
3332 
3333  /* Calculate seconds since epoche */
3334  if(!error)
3335  {
3336  enc_encode_posix_timestamp(&res, year, month, day, hour, minute, second,
3337  zone);
3338  }
3339 
3340  return(res);
3341 }
3342 
3343 
3344 /* ========================================================================== */
3345 /*! \brief Convert POSIX timestamp to ISO 8601 conformant local date and time
3346  *
3347  * \param[out] isodate Buffer for date string (at least 20 characters)
3348  * \param[in] pts Seconds since epoch (as defined by POSIX.1)
3349  *
3350  * ISO 8601 allows to omit the 'T' character between the date and time fields
3351  * if there is no risk of confusing a date and time of day representation.
3352  * This is the case here => We omit the 'T' for better human readability
3353  *
3354  * \return
3355  * - 0 on success
3356  * - Negative value on error
3357  */
3358 
/*
 * NOTE(review): The function header line that belongs to the documentation
 * block above is missing in this listing. Judging from the body it returns
 * 'int' and takes the parameters 'isodate' and 'pts' - confirm against the
 * real source file before editing.
 *
 * The body converts 'pts' to broken-down local time and prints it as
 * "YYYY-MM-DD HH:MM:SS" into 'isodate' (at most 20 bytes including NUL).
 * It returns 0 if the local time conversion succeeded and -1 otherwise.
 */
3360 {
3361  int res = -1;
3362  api_posix_time_t ts;
3363  api_posix_struct_tm t_data;
3364  api_posix_struct_tm* t;
3365 
3366  /*
3367  * Check for potential 'time_t' overflow
3368  * Many historical 32 bit Unix systems use 'signed int' for 'time_t'.
3369  * We clamp the 'pts' to this lowest common denominator.
3370  */
3371  if((core_time_t) INT_MAX < pts)
3372  {
3373  /* Clamp time up to 2038-01-19T03:14:07Z if system uses 32 bit 'int' */
3374  PRINT_ERROR("Warning: time_t overflow while converting timestamp");
 /*
  * NOTE(review): 'ts' has type 'api_posix_time_t' but the value is cast to
  * 'core_time_t' - presumably 'api_posix_time_t' was intended, confirm.
  */
3375  ts = (core_time_t) INT_MAX;
3376  }
3377  else { ts = (api_posix_time_t) pts; }
3378 
3379  /* Convert POSIX timestamp to ISO 8601 date */
3380  /*! \todo
3381  * Calling operating system for date conversion should be replaced until the
3382  * year 2038 (when 32 bit signed \c time_t implementations will overflow).
3383  */
3384  t = api_posix_localtime_r(&ts, &t_data);
3385  if(NULL != t)
3386  {
3387  /* Return value is intentionally ignored, use ign to silence compiler */
 /* 'ign' is not declared in this block, presumably at file scope */
3388  ign = api_posix_snprintf(isodate, 20, "%04d-%02d-%02d %02d:%02d:%02d",
3389  t->tm_year + 1900, t->tm_mon + 1, t->tm_mday,
3390  t->tm_hour, t->tm_min, t->tm_sec);
3391  res = 0;
3392  }
3393 
 /* Only a failed local time conversion leaves 'res' negative */
3394  if(0 > res) { PRINT_ERROR("Timestamp conversion failed"); }
3395 
3396  return(res);
3397 }
3398 
3399 
3400 /* ========================================================================== */
3401 /*! \brief Get current UTC date in ISO 8601 conformant format
3402  *
3403  * \param[out] isodate Buffer for date string (at least 21 characters)
3404  *
3405  * The date is written to \e isodate in \c YYYY-MM-DDTHH-MM-SSZ format.
3406  *
3407  * \return
3408  * - 0 on success
3409  * - Negative value on error
3410  */
3411 
3412 int enc_get_iso8601_utc(char* isodate)
3413 {
3414  int res = -1;
3415  api_posix_time_t ts;
3416  api_posix_struct_tm t_data;
3417  api_posix_struct_tm* t;
3418 
3419  /*
3420  * Check for potential 'time_t' overflow
3421  * Many historical 32 bit Unix systems use 'signed int' for 'time_t'.
3422  * We clamp the 'pts' to this lowest common denominator.
3423  */
3424  api_posix_time(&ts);
3425  if((api_posix_time_t) 0 > ts) { res = -1; }
3426  else
3427  {
3428  /* Convert POSIX timestamp to ISO 8601 date */
3429  /*! \todo
3430  * Calling operating system for date conversion should be replaced until the
3431  * year 2038 (when 32 bit signed \c time_t implementations will overflow).
3432  */
3433  t = api_posix_gmtime_r(&ts, &t_data);
3434  if(NULL != t)
3435  {
3436  /* Return value is intentionally ignred, use ign to silence compiler */
3437  ign = api_posix_snprintf(isodate, 21, "%04d-%02d-%02dT%02d:%02d:%02dZ",
3438  t->tm_year + 1900, t->tm_mon + 1, t->tm_mday,
3439  t->tm_hour, t->tm_min, t->tm_sec);
3440  res = 0;
3441  }
3442  }
3443 
3444  if(0 > res) { PRINT_ERROR("ISO 8601 date request failed"); }
3445 
3446  return(res);
3447 }
3448 
3449 
3450 /* ========================================================================== */
3451 /*! \brief Convert ISO 8601 conformant UTC date and time to POSIX timestamp
3452  *
3453  * \param[out] pts Seconds since epoche (as defined by POSIX.1)
3454  * \param[in] isodate Buffer for date string (at least 20 characters)
3455  *
3456  * \attention
3457  * The parameter \e isodate must be in \c YYYY-MM-DDTHH-MM-SSZ format (UTC).
3458  *
3459  * \note
3460  * This function accepts no date input before the epoche.
3461  *
3462  * \return
3463  * - 0 on success
3464  * - Negative value on error
3465  */
3466 
3467 int enc_convert_iso8601_to_posix(core_time_t* pts, const char* isodate)
3468 {
3469  int res = -1;
3470  int rv;
3471  unsigned int year;
3472  unsigned int month;
3473  unsigned int mday;
3474  unsigned int hour;
3475  unsigned int minute;
3476  unsigned int second;
3477 
3478  /* Split ISO 8601 date */
3479  rv = sscanf(isodate, "%u-%u-%uT%u:%u:%uZ", &year, &month, &mday,
3480  &hour, &minute, &second);
3481  if(6 != rv) { PRINT_ERROR("ISO 8601 timestamp has invalid format"); }
3482  else
3483  {
3484  if(1970U <= year && 9999U >= year
3485  && 1U <= month && 12U >= month
3486  && 1U <= mday && 31U >= mday
3487  && 23U >= hour && 59U >= minute && 59U >= second) { res = 0; }
3488  }
3489 
3490  /* Calculate seconds since epoche */
3491  if(!res)
3492  {
3493  res = enc_encode_posix_timestamp(pts, year, month, mday,
3494  hour, minute, second, 0);
3495  }
3496 
3497  return(res);
3498 }
3499 
3500 
3501 /* ========================================================================== */
/*! \brief Convert ISO 8601 conformant date to canonical timestamp
 *
 * \param[out] ts       Pointer to canonical timestamp as defined by RFC 5322
 * \param[in]  isodate  ISO 8601 date string (exactly 10 characters)
 *
 * \attention
 * The parameter \e isodate must be in \c YYYY-MM-DD format (only date, time is
 * not supported). The time of the result is always \c 00:00:00 with timezone
 * \c -0000 .
 *
 * The year must be in the range 1900..9999, the month in the range 1..12 and
 * the day in the range 1..31.
 *
 * \note
 * On success, the caller is responsible to free the memory allocated for the
 * result string. On error nothing is written to \e ts .
 *
 * \return
 * - 0 on success
 * - Negative value on error
 */

int enc_convert_iso8601_to_timestamp(const char** ts, const char* isodate)
{
   static const char* months[12] = { "Jan", "Feb", "Mar", "Apr", "May", "Jun",
                                     "Jul", "Aug", "Sep", "Oct", "Nov", "Dec" };
   const size_t bufsize = 50;
   unsigned int year;
   unsigned int month;
   unsigned int mday;
   char* out;

   /* Split ISO 8601 date */
   if(3 != sscanf(isodate, "%u-%u-%u", &year, &month, &mday))
   {
      PRINT_ERROR("ISO 8601 timestamp has invalid format");
      return(-1);
   }

   /* Range checks (failure is reported without message) */
   if(year < 1900U || year > 9999U) { return(-1); }
   if(month < 1U || month > 12U) { return(-1); }
   if(mday < 1U || mday > 31U) { return(-1); }

   /* Allocate buffer for result */
   out = (char*) api_posix_malloc(bufsize);
   if(NULL == out) { return(-1); }

   api_posix_snprintf(out, bufsize, "%u %s %04u %02d:%02d:%02d -0000",
                      mday, months[month - 1U], year, 0, 0, 0);
   *ts = (const char*) out;

   return(0);
}
3556 
3557 
3558 /* ========================================================================== */
3559 /*! \brief Convert article number from numerical format to ASCII
3560  *
3561  * \param[out] result Pointer to result string buffer (Size: 17 bytes)
3562  * \param[out] len Pointer to length of result string (Maximum value: 16)
3563  * \param[in] wm Article number (watermark) to convert
3564  *
3565  * RFC 3977 allows max. 16 digits.
3566  *
3567  * \note
3568  * The output is locale independent.
3569  *
3570  * \return
3571  * - 0 on success
3572  * - Negative value on error (\e result and \e len are not valid)
3573  */
3574 
3575 int enc_convert_anum_to_ascii(char result[17], size_t* len, core_anum_t wm)
3576 {
3577  int res = -1;
3578  int rv;
3579 
3580  /* C90 compilers are not required to support more than 32 bit data types */
3581  if (CORE_ANUM_T_MAX > ULONG_MAX)
3582  {
3583  PRINT_ERROR("Value of CORE_ANUM_T_MAX is too large");
3584  }
3585  else
3586  {
3587  rv = api_posix_snprintf(result, 17, "%lu", (unsigned long int) wm);
3588  if(rv > 0 && rv <= 16)
3589  {
3590  *len = (size_t) rv;
3591  res = 0;
3592  }
3593  }
3594 
3595  return(res);
3596 }
3597 
3598 
3599 /* ========================================================================== */
3600 /*! \brief Convert number from ASCII to numerical format
3601  *
3602  * \param[out] result Pointer to result
3603  * \param[in] wm Article number (watermark) string to convert
3604  * \param[in] len Length of string \e wm
3605  *
3606  * Max. 20 digits are supported, sufficient for 64-bit article numbers.
3607  * RFC 3977 allows max. 16 digits.
3608  *
3609  * This function correctly processes leading zeros and does not use standard
3610  * library functions with locale dependent behaviour.
3611  *
3612  * \note
3613  * \e wm needs no termination, the first \e len characters are used.
3614  *
3615  * \return
3616  * - 0 on success
3617  * - Negative value on error
3618  * - -2 means larger than \ref NNTP_ANUM_T_MAX
3619  */
3620 
3621 int enc_convert_ascii_to_anum(core_anum_t* result, const char* wm,
3622  int len)
3623 {
3624  int res = -1;
3625  unsigned char c;
3626  nntp_anum_t pot = 1; /* 10^0 (0th power of 10) */
3627  nntp_anum_t d;
3628  nntp_anum_t v = 0;
3629 
3630  /* Check length */
3631  if(0 < len && 20 >= len)
3632  {
3633  /* Process every digit as a power of ten */
3634  while(len)
3635  {
3636  /* Get character and check whether it is a digit */
3637  c = (unsigned char) wm[len-- - 1];
3638  if(enc_ascii_check_digit((char*) &c)) { len = -1; break; }
3639  /* ASCII decode it to numerical digit */
3640  d = (nntp_anum_t) (unsigned char) (c - 0x30U);
3641  /* Calculate value of digit */
3642  d *= pot;
3643  /* Avoid overflow */
3644  if(NNTP_ANUM_T_MAX - v < d) { res = -2; break; }
3645  /* Add value of digit to result */
3646  v += d;
3647  /* Calculate next power of ten */
3648  pot *= 10;
3649  }
3650  /* Check whether processing was successful */
3651  if(!len)
3652  {
3653  *result = v;
3654  res = 0;
3655  }
3656  }
3657 
3658  if(0 > res) { PRINT_ERROR("Article number conversion failed"); }
3659 
3660  return(res);
3661 }
3662 
3663 
3664 /* ========================================================================== */
/*! \brief Convert octet to hexadecimal (ASCII) format
 *
 * \param[out] result  Pointer to result
 * \param[in]  octet   Octet to convert
 *
 * Exactly 3 bytes are written to the buffer pointed to by \e result
 * (two uppercase hexadecimal digits and the termination).
 * If \e octet is smaller than 16, a leading zero is created.
 * On error (\e octet larger than 255 or failed conversion), the result "XX"
 * is generated.
 * The \e result is always a zero terminated string.
 *
 * \return
 * - 0 on success
 * - Negative value on error
 */

int enc_convert_octet_to_hex(char* result, unsigned int octet)
{
   int ok = 0;

   if(octet <= 255U)
   {
      ok = (2 == api_posix_snprintf(result, 3, "%02X", octet));
   }

   /* Check for error */
   if(!ok)
   {
      strcpy(result, "XX");
      return(-1);
   }

   return(0);
}
3695 
3696 
3697 /* ========================================================================== */
/*! \brief Encode or decode data with ROT13 algorithm
 *
 * \param[in,out] data  Pointer to buffer with zero terminated data
 *
 * Every latin ASCII letter (A..Z and a..z) is rotated by 13 positions inside
 * its own alphabet. All other characters stay unchanged. Applying the
 * function twice gives the original data.
 *
 * No memory is allocated. The operation is executed in the buffer pointed to
 * by \e data .
 */

void enc_rot13(char* data)
{
   char* p;
   int c;
   int base;

   for(p = data; *p; ++p)
   {
      c = (int) *p;
      /* Select first letter of the matching alphabet (ASCII codes) */
      if(65 <= c && 90 >= c) { base = 65; }         /* Capital letter */
      else if(97 <= c && 122 >= c) { base = 97; }   /* Small letter */
      else { continue; }                            /* Not a letter */
      /* Rotate with wrap around at the end of the alphabet */
      *p = (char) (base + (c - base + 13) % 26);
   }
}
3739 
3740 
3741 /* ========================================================================== */
/*! \brief Encode binary data to base64
 *
 * \param[out] enc   Pointer to result (zero terminated string)
 * \param[in]  data  Data to encode
 * \param[in]  len   Data length
 *
 * If \e len is zero, \e data is not dereferenced and the result will be an
 * empty string.
 *
 * On error, nothing is written to \e enc .
 *
 * On success a pointer to the result buffer will be written to \e enc .
 * The caller is responsible to free the memory allocated for this buffer.
 *
 * \return
 * - 0 on success
 * - Negative value on error
 */

int enc_mime_encode_base64(const char** enc, const char* data, size_t len)
{
   const size_t capacity = BXX0_BASE64_ENCODE_LEN_OUT(len) + 1U;  /* +1: NUL */
   size_t unused = capacity;
   unsigned char* buf = (unsigned char*) api_posix_malloc(capacity);
   signed char rv;
   int res = -1;

   if(NULL != buf)
   {
      if(!len) { res = 0; }
      else
      {
         /*
          * The encoder updates both sizes. Presumably they are decremented
          * by the number of bytes written/consumed (the result is terminated
          * below at offset "capacity - unused").
          */
         rv = bxx0_base64_encode(buf, &unused, (const unsigned char*) data,
                                 &len, 0);
         /* Input data must have been consumed completely */
         if(0 <= rv && 0U == len) { res = 0; }
      }

      if(res) { api_posix_free((void*) buf); }
      else
      {
         buf[capacity - unused] = 0;
         *enc = (const char*) buf;
      }
   }

   return(res);
}
3785 
3786 
3787 /* ========================================================================== */
/*! \brief Extract addr-spec token from RFC 5322 mailbox
 *
 * \param[in] mailbox  RFC 5322 mailbox
 *
 * If \e mailbox ends with an \c angle-addr token (searched backward for
 * \c '>' , \c '@' and \c '<' in this order), the data between the angle
 * brackets is used. If there is no \c '>' at all, the whole \e mailbox is
 * treated as \c addr-spec token.
 *
 * \attention
 * The checks are more restrictive than the formal specification of RFC 5322.
 * White space is not allowed inside the \c addr-spec token!
 *
 * \note
 * It is tolerated that \e mailbox contains an invalid \c name-addr token
 * because it is ignored anyway.
 *
 * On success a pointer to the result buffer is returned.
 * The caller is responsible to free the memory allocated for this buffer.
 *
 * \return
 * - Pointer to new memory block containing the \c addr-spec token
 * - NULL on error
 */

const char* enc_extract_addr_spec(const char* mailbox)
{
   char* copy = NULL;
   const char* first;
   const char* last;
   unsigned int phase = 0;
   size_t total;
   size_t pos;
   size_t k = 0;
   int bad = 0;
   char ch;

   total = (NULL != mailbox) ? strlen(mailbox) : (size_t) 0;

   /* A valid addr-spec is at least 3 characters long */
   if((size_t) 3 <= total)
   {
      /* Default to assumption that whole mailbox is an 'addr-spec' token */
      first = mailbox;
      last = &mailbox[total];
      /*
       * To tolerate arbitrary garbage for the 'name-addr' token search
       * backward from the end for an 'angle-addr' token.
       * Phases: 0 => Wait for '>', 1 => Wait for '@', 2 => Wait for '<'
       */
      for(pos = total; pos; --pos)
      {
         ch = mailbox[pos - (size_t) 1];
         if(0U == phase)
         {
            if('>' == ch) { last = &mailbox[pos - (size_t) 1];  phase = 1U; }
         }
         else if(1U == phase)
         {
            if('@' == ch) { phase = 2U; }
         }
         else if('<' == ch)
         {
            /* 'angle-addr' token found at end of mailbox */
            first = &mailbox[pos];
            phase = 3U;
            break;
         }
      }
      /* Reject incomplete 'angle-addr' tokens and too short content */
      if((0U == phase || 3U <= phase) && last > first + 2)
      {
         /* Allocate new memory block and copy 'addr-spec' to it */
         total = (size_t) (last - first);
         copy = (char*) api_posix_malloc(total + (size_t) 1);
         if(NULL != copy)
         {
            memcpy((void*) copy, (void*) first, total);
            copy[total] = 0;
            /* Check local part (everything up to the '@') */
            while(copy[k] && '@' != copy[k])
            {
               if(enc_check_dotatom(&copy[k])) { bad = 1;  break; }
               ++k;
            }
            if(!bad)
            {
               /* Local part must not be empty and must end with '@' */
               if(!k || '@' != copy[k]) { bad = 1; }
               /* Skip '@' and verify that there is more data */
               else if(!copy[++k]) { bad = 1; }
            }
            /* Check domain part */
            while(!bad && copy[k])
            {
               /*
                * Relaxed check for 'domain-literal'
                * Check for printable ASCII only
                */
               if('[' == copy[k])
               {
                  if(enc_ascii_check_printable(&copy[k])) { bad = 1; }
                  break;
               }
               /* Check 'dot-atom' */
               if(enc_check_dotatom(&copy[k])) { bad = 1; }
               else { ++k; }
            }
            if(bad)
            {
               /* Invalid address format */
               api_posix_free((void*) copy);
               copy = NULL;
            }
         }
      }
   }

   /* Check for error */
   if(NULL == copy) { PRINT_ERROR("Invalid e-mail address"); }

   return(copy);
}
3910 
3911 
3912 /* ========================================================================== */
/*! \brief Verify ASCII encoding
 *
 * \param[in] s  String to verify
 *
 * Every byte of the zero terminated string \e s must have a value in the
 * range 0..127.
 *
 * \return
 * - 0 on success
 * - Negative value on error
 */

int enc_ascii_check(const char* s)
{
   const char* p;

   for(p = s; *p; ++p)
   {
      /* Plain 'char' may be signed, therefore check both limits */
      if(0 > (int) *p || 127 < (int) *p) { return(-1); }
   }

   return(0);
}
3936 
3937 
3938 /* ========================================================================== */
/*! \brief Check for ASCII alphabetic characters
 *
 * \param[in] s  Pointer to single character
 *
 * Locale independent check based on ASCII (ranges A..Z and a..z).
 * Only the character \e s points to is examined.
 *
 * \return
 * - 0 if \e s is an alphabetic character
 * - Negative value if \e s is not an alphabetic character
 */

int enc_ascii_check_alpha(const char* s)
{
   const int c = (int) *s;
   const int capital = (c >= 65 && c <= 90);
   const int small = (c >= 97 && c <= 122);

   return((capital || small) ? 0 : -1);
}
3959 
3960 
3961 /* ========================================================================== */
/*! \brief Check for ASCII digit characters
 *
 * \param[in] s  Pointer to single character
 *
 * Locale independent check based on ASCII (range 0..9).
 * Only the character \e s points to is examined.
 *
 * \return
 * - 0 if \e s is a digit character
 * - Negative value if \e s is not a digit character
 */

int enc_ascii_check_digit(const char* s)
{
   const int c = (int) *s;

   /* ASCII codes 48..57 are the digits '0'..'9' */
   if(c < 48 || c > 57) { return(-1); }

   return(0);
}
3982 
3983 
3984 /* ========================================================================== */
/*! \brief Check for printable ASCII characters
 *
 * \param[in] s  String to check
 *
 * HT (9) and SPACE (32, 0x20) inside \e s are treated as "printable" to make
 * this function suitable to check header field bodies according to RFC 5322.
 * All other bytes must be in the range 33..126.
 *
 * \note
 * The function \ref enc_ascii_convert_to_printable() can be used on error.
 *
 * \return
 * - 0 on success
 * - Negative value on error
 */

int enc_ascii_check_printable(const char* s)
{
   const char* p;
   int c;

   for(p = s; *p; ++p)
   {
      c = (int) *p;
      if(9 == c) { continue; }                  /* HT is accepted */
      if(c < 32 || c > 126) { return(-1); }     /* Control or non-ASCII */
   }

   return(0);
}
4014 
4015 
4016 /* ========================================================================== */
/*! \brief Convert to printable ASCII format
 *
 * \param[in,out] s  String to convert
 *
 * This function should be used to repair a string in-place after the function
 * \ref enc_ascii_check_printable() has reported an error.
 *
 * Every invalid byte is replaced with '?'. HT (9), SPACE (32) and the bytes
 * in the range 33..126 are valid. The length of the string is not changed.
 */

void enc_ascii_convert_to_printable(char* s)
{
   size_t i = 0;
   int c;

   while(s[i])
   {
      c = (int) s[i];
      if(!(9 == c || (32 <= c && 126 >= c))) { s[i] = '?'; }
      ++i;
   }
}
4039 
4040 
4041 /* ========================================================================== */
4042 /*! \brief Convert body of distribution header field
4043  *
4044  * \param[in] s String with unfolded body to convert
4045  *
4046  * This function processes \e s in-place. The result is never longer than the
4047  * original data.
4048  *
4049  * Every element of \c dist-list that contains invalid characters is removed.
4050  */
4051 
/*
 * NOTE(review): The function header line that belongs to the documentation
 * block above is missing in this listing. Judging from the body the function
 * returns nothing and takes a writable string 's' - confirm name and
 * signature against the real source file before editing.
 *
 * The body works in two passes on 's':
 * 1. All HT and SPACE characters are removed.
 * 2. The comma separated entries are checked. An entry that violates the
 *    rules is cut out together with one adjacent separator.
 */
4053 {
4054  size_t i;
4055  size_t len;
4056  int c;
 /* Index of the last separator seen (zero while inside the first entry) */
4057  size_t start = 0;
4058  int error = 0;
4059  char* p;
4060 
4061  /* Remove whitespace */
4062  i = 0;
4063  while(s[i])
4064  {
4065  c = (int) s[i];
4066  if(9 == c || 32 == c)
4067  {
4068  len = strlen(&s[i + (size_t) 1]);
4069  /* Move including NUL termination */
 /* 'i' is not incremented, the moved character is checked next */
4070  memmove((void*) &s[i], (void*) &s[i + (size_t) 1], ++len);
4071  }
4072  else { ++i; }
4073  }
4074 
4075  /* Check content */
4076  i = 0;
4077  while(s[i])
4078  {
4079  /* Check for alphanumeric characters */
 /* Both helpers return nonzero on mismatch: Neither letter nor digit */
4080  if(enc_ascii_check_alpha(&s[i]) && enc_ascii_check_digit(&s[i]))
4081  {
4082  /* No => Check first character of 'dist-name' token */
 /* First character of the string or directly behind a separator */
4083  if(!start && !i) { error = 1; }
4084  else if(start && start + (size_t) 1 == i) { error = 1; }
4085  else
4086  {
4087  /* Not first => Check other characters of 'dist-name' token */
4088  if('+' != s[i] && '-' != s[i] && '_' != s[i] && ',' != s[i])
4089  {
4090  error = 1;
4091  }
4092  }
4093  /* Check for separator between entries */
4094  if(!error && ',' == s[i])
4095  {
4096  start = i;
 /* A separator at the end of the string is an error too */
4097  if(!s[i + (size_t) 1]) { error = 1; }
4098  }
4099  }
4100  /* Check for error */
4101  if(error)
4102  {
4103  PRINT_ERROR("Invalid entry in distribution list removed");
 /* Look for the separator that terminates the invalid entry */
4104  p = strchr(&s[i + (size_t) 1], (int) ',');
 /* Checking is resumed at the position of the last separator */
4105  i = start;
 /* No further separator: Truncate the string at 'start' */
4106  if(NULL == p) { s[i] = 0; }
4107  else
4108  {
4109  /* Remove invalid entry */
4110  if(!start) { p += 1; } /* No separator before first entry */
4111  len = strlen(p);
4112  /* Move including NUL termination */
4113  memmove((void*) &s[start], (void*) p, ++len);
4114  }
4115  error = 0;
4116  }
4117  else { ++i; }
4118  }
4119 }
4120 
4121 
/* ========================================================================== */
/*! \brief Check whether a string is valid UTF-8
 *
 * \param[in] s  String to check
 *
 * \attention
 * Read chapter 10 of RFC 3629 for UTF-8 security considerations.
 *
 * The rules of RFC 3629 that are enforced:
 * - Code points beyond 0x10FFFF are rejected.
 * - Only the shortest possible code sequence is accepted.
 * - Surrogate code points are rejected.
 *
 * The work is delegated to \c enc_uc_check_cesu8() with the additional
 * surrogate check enabled.
 *
 * \return
 * - 0 on success
 * - Negative value on error
 */

int enc_uc_check_utf8(const char* s)
{
   /* Nonzero enables the additional check for surrogate-pairs */
   const int check_surrogates = 1;
   int res = enc_uc_check_cesu8(s, check_surrogates);

   return(res);
}
4145 
4146 
/* ========================================================================== */
/*! \brief Repair UTF-8 encoding
 *
 * \param[in] s  String to repair
 *
 * Invalid UTF-8 sequences and invalid codepoints (non-shortest form,
 * surrogates, values beyond 0x10FFFF) are replaced with U+FFFD. A multibyte
 * sequence that is cut off by the end of the string is replaced too.
 *
 * The result buffer is allocated for the worst case: Every input octet can
 * expand to at most the 3 octets of one replacement character, plus one octet
 * for the NUL termination.
 *
 * The repaired data is verified again with \c enc_uc_check_utf8() before it is
 * returned. On success the caller is responsible to free the result.
 *
 * \return
 * - Pointer to new memory block on success
 * - \c NULL on error
 */

const char* enc_uc_repair_utf8(const char* s)
{
   /* U+FFFD in UTF-8 transformation format */
   static const char  rc[3] = { (char) 0xEF, (char) 0xBF, (char) 0xBD };
   char*  res = NULL;
   size_t  slen;
   size_t  i = 0;
   size_t  ri = 0;
   int  c;
   int  multibyte = 0;
   size_t  len = 0;
   size_t  remaining = 0;
   unsigned long int  mbc = 0;
   int  error = 0;
   int  invalid;

   if(NULL == s)  { return(NULL); }

   /* Triple size for worst case plus NUL termination (with overflow check) */
   slen = strlen(s);
   if(((size_t) -1 - (size_t) 1) / (size_t) 3 >= slen)
   {
      res = (char*) api_posix_malloc(slen * (size_t) 3 + (size_t) 1);
   }

   if(NULL != res)
   {
      /* Assignment in truth expression is intended */
      while((c = (int) s[i++]))
      {
         /* Resync after error: Skip all following continuation octets */
         if(error)
         {
            if((c & 0xC0) == 0x80)  { continue; }
            else  { multibyte = 0; }
         }
         /* Verify singlebyte character */
         if(!multibyte)
         {
            if(!(0 <= c && 127 >= c))  { multibyte = 1; }
            else  { res[ri++] = (char) c; }
            /* Reset state machine */
            remaining = 0;
            mbc = 0;
            error = 0;
         }
         /* Verify multibyte character */
         if(multibyte)
         {
            if(!remaining)
            {
               /* First octet defines the length of the code sequence */
               if((c & 0xE0) == 0xC0)  { len = 2; }
               else if((c & 0xF0) == 0xE0)  { len = 3; }
               else if((c & 0xF8) == 0xF0)  { len = 4; }
               else
               {
                  /* Invalid start of code sequence in UTF-8 data */
                  memcpy((void*) &res[ri], (const void*) rc, sizeof(rc));
                  ri += sizeof(rc);
                  error = 1;
                  /* Don't reuse the length of a former code sequence */
                  len = 0;
               }
               switch(len)
               {
                  case 2:  mbc |= (unsigned long int) (c & 0x1F) << 6;  break;
                  case 3:  mbc |= (unsigned long int) (c & 0x0F) << 12;  break;
                  case 4:  mbc |= (unsigned long int) (c & 0x07) << 18;  break;
                  default:  break;
               }
               remaining = len ? len - (size_t) 1 : (size_t) 0;
            }
            else
            {
               if((c & 0xC0) != 0x80)
               {
                  /* Invalid continuation character in UTF-8 sequence */
                  memcpy((void*) &res[ri], (const void*) rc, sizeof(rc));
                  ri += sizeof(rc);
                  /* An ASCII character that terminated the sequence is kept */
                  if(0 <= c && 127 >= c)  { res[ri++] = (char) c; }
                  error = 1;
               }
               else
               {
                  --remaining;
                  mbc |= (unsigned long int) (c & 0x3F)
                         << remaining * (size_t) 6;
               }
               if(!remaining && !error)
               {
                  /* Verify character code */
                  invalid = 0;
                  switch(len)
                  {
                     case 2:
                     {
                        /* Non-shortest form */
                        invalid = (0x000080UL > mbc);
                        break;
                     }
                     case 3:
                     {
                        /* Non-shortest form or surrogate */
                        invalid = (0x000800UL > mbc
                                   || (0x00D800UL <= mbc && 0x00DFFFUL >= mbc));
                        break;
                     }
                     case 4:
                     {
                        /* Non-shortest form or beyond Unicode range */
                        invalid = (0x010000UL > mbc || 0x10FFFFUL < mbc);
                        break;
                     }
                     default:
                     {
                        PRINT_ERROR("Bug in UTF-8 repair state machine");
                        api_posix_free((void*) res);
                        res = NULL;
                        break;
                     }
                  }
                  /* Abort processing, the result buffer is gone */
                  if(NULL == res)  { break; }
                  if(invalid)
                  {
                     /* Invalid UTF-8 code sequence */
                     memcpy((void*) &res[ri], (const void*) rc, sizeof(rc));
                     ri += sizeof(rc);
                     error = 1;
                  }
                  else
                  {
                     /* Copy the complete (valid) code sequence unchanged */
                     memcpy((void*) &res[ri], (const void*) &s[i - len], len);
                     ri += len;
                  }
                  /* Code sequence complete */
                  multibyte = 0;
               }
            }
         }
      }
   }

   /* Check for code sequence that was truncated by the end of the string */
   if(NULL != res && multibyte && remaining && !error)
   {
      memcpy((void*) &res[ri], (const void*) rc, sizeof(rc));
      ri += sizeof(rc);
   }

   /* Check for error */
   if(NULL != res)
   {
      /* Terminate new string */
      res[ri] = 0;
      /* Verify again */
      if(enc_uc_check_utf8(res))
      {
         PRINT_ERROR("UTF-8 data still invalid after repair (bug)");
         api_posix_free((void*) res);
         res = NULL;
      }
      else  { PRINT_ERROR("UTF-8 data repaired"); }
   }

   return(res);
}
4323 
4324 
4325 /* ========================================================================== */
4326 /*! \brief Create wildmat pattern array
4327  *
4328  * \param[out] obj Pointer to wildmat pattern array
4329  * \param[in] wm RFC 3977 conformant wildmat
4330  *
4331  * This function splits a RFC 3977 conformant \c wildmat into its elements of
4332  * type \c wildmat-pattern . Every \c wildmat-pattern is converted to a POSIX
4333  * extended regular expression and stored together with a negation flag (that
4334  * is set if the \c wildmat-pattern was preceded by an exclamation mark) in
4335  * the array \e obj .
4336  *
4337  * On success the caller is responsible to free the memoy allocated for the
4338  * resulting array with the function \e enc_destory_wildmat() .
4339  *
4340  * \attention
4341  * If the wildmat \e wm contains Unicode data, it must be normalized to NFC by
4342  * the caller.
4343  *
4344  * \return
4345  * - Number of patterns in the object on success
4346  * - Negative value on error (\c NULL was written to \e obj)
4347  */
4348 
4349 int enc_create_wildmat(struct enc_wm_pattern** obj, const char* wm)
4350 {
4351  int res = 0;
4352  size_t len;
4353  size_t i = 0;
4354  char* buf = NULL;
4355  size_t bi = 0;
4356  int error = 0;
4357  int negate = 0;
4358  int store = 0;
4359  int eod = 0;
4360  struct enc_wm_pattern* p;
4361  size_t obj_len = 0;
4362 
4363  *obj = NULL;
4364 
4365  /* Wildmat must have valid UTF-8 encoding */
4366  if(!enc_uc_check_utf8(wm))
4367  {
4368  /* Check for invalid characters (backslash and brackets) */
4369  if(NULL == strpbrk(wm, "\x5C[]"))
4370  {
4371  /* Extract wildmat-pattern elements */
4372  do
4373  {
4374  store = 0;
4375  negate = 0;
4376  /* Allocate ERE buffer for next pattern */
4377  len = strlen(&wm[i]);
4378  /* Required buffer size (see below): Triple + 2 + NUL */
4379  buf = (char*) api_posix_malloc(len * (size_t) 3 + (size_t) 3);
4380  if(NULL == buf) { break; }
4381  else
4382  {
4383  bi = 0;
4384  buf[bi++] = '^';
4385  while(!store)
4386  {
4387  /* Check for EOD */
4388  if(!wm[i])
4389  {
4390  if((size_t) 1 < bi) { store = 1; }
4391  eod = 1;
4392  break;
4393  }
4394  /* Check for (remaining) special character */
4395  if(NULL != strchr(".()*+?{|^$", (int) wm[i]))
4396  {
4397  switch((int) wm[i])
4398  {
4399  /* Match arbitrary single UTF-8 codepoint (not octet) */
4400  case (int) ')':
4401  {
4402  /* Replace with "[)]" (*3) */
4403  buf[bi++] = '[';
4404  buf[bi++] = ')';
4405  buf[bi++] = ']';
4406  break;
4407  }
4408  case (int) '*':
4409  {
4410  /* Replace with ".*" (*2) */
4411  buf[bi++] = '.';
4412  buf[bi++] = '*';
4413  break;
4414  }
4415  case (int) '?':
4416  {
4417  /* Replace with dot (*1) */
4418  buf[bi++] = '.';
4419  break;
4420  }
4421  default:
4422  {
4423  /* Escape special character with backslash (*2) */
4424  buf[bi++] = 0x5C;
4425  buf[bi++] = wm[i];
4426  break;
4427  }
4428  }
4429  }
4430  else
4431  {
4432  switch((int) wm[i])
4433  {
4434  case 0x09:
4435  case 0x20:
4436  {
4437  /* Ignore whitespace */
4438  break;
4439  }
4440  case (int) '!':
4441  {
4442  negate = 1;
4443  break;
4444  }
4445  case (int) ',':
4446  {
4447  store = 1;
4448  break;
4449  }
4450  default:
4451  {
4452  /* Ordinary character */
4453  buf[bi++] = wm[i];
4454  break;
4455  }
4456  }
4457  }
4458  ++i;
4459  }
4460  /* Store element into object */
4461  if(!store) { api_posix_free((void*) buf); }
4462  else
4463  {
4464  if(INT_MAX == res) { error = 1; }
4465  else
4466  {
4467  buf[bi++] = '$';
4468  buf[bi] = 0;
4469  /* printf("Pattern converted to ERE: %s\n", buf); */
4470  obj_len += sizeof(struct enc_wm_pattern);
4471  p = (struct enc_wm_pattern*) api_posix_realloc(*obj,
4472  obj_len);
4473  if(NULL == p) { error = 1; }
4474  else
4475  {
4476  *obj = p;
4477  (*obj)[res].negate = negate;
4478  (*obj)[res].ere = buf;
4479  ++res;
4480  }
4481  }
4482  if(error)
4483  {
4484  api_posix_free((void*) buf);
4485  break;
4486  }
4487  }
4488  }
4489  }
4490  while(!eod);
4491  }
4492  }
4493 
4494  /* Check for error */
4495  if(error || !eod || 0 >= res)
4496  {
4497  PRINT_ERROR("Failed to convert RFC 3977 wildmat");
4498  enc_destroy_wildmat(obj, res);
4499  res = -1;
4500  }
4501 
4502  return(res);
4503 }
4504 
4505 
4506 /* ========================================================================== */
4507 /*! \brief Destroy wildmat pattern array
4508  *
4509  * \param[in,out] obj Pointer to wildmat pattern array
4510  * \param[in] num Number of elements in array
4511  *
4512  * \c NULL is written to the location pointed to by \e obj after releasing the
4513  * memory allocated for the array.
4514  */
4515 
4516 void enc_destroy_wildmat(struct enc_wm_pattern** obj, int num)
4517 {
4518  int i;
4519 
4520  if(NULL != obj && NULL != *obj)
4521  {
4522  for(i = 0; i < num; ++i)
4523  {
4524  api_posix_free((void*) (*obj)[i].ere);
4525  }
4526  api_posix_free((void*) *obj);
4527  *obj = NULL;
4528  }
4529 }
4530 
4531 
4532 /* ========================================================================== */
4533 /*! \brief Convert from canonical (RFC 822) to local (POSIX) form
4534  *
4535  * \param[in] s String to convert
4536  * \param[in] rcr Replace invalid CR control characters if nonzero
4537  * \param[in] rlf Replace invalid LF control characters if nonzero
4538  *
4539  * According to RFC 822 and RFC 2049 this function accepts plain text article
4540  * content in canonical form and convert the CRLF line breaks to local (POSIX,
4541  * single LF) form.
4542  *
4543  * \attention
4544  * Single CR and LF control characters (not part of a CRLF sequence) are
4545  * forbidden in canonical format of text by RFC 2045 and RFC 2046.
4546  * Default behaviour is to preserve single CR and LF control characters.
4547  * The Unicode codepoint defined by \c ENC_RC can be inserted as replacement
4548  * for CR or/and LF by setting \e rcr or/and \e rlf respectively to a nonzero
4549  * value.
4550  *
4551  * On success the caller is responsible to free the allocated memory.
4552  *
4553  * \return
4554  * - Pointer to decoded data (a new memory block was allocated)
4555  * - NULL on error
4556  */
4557 
4558 const char* enc_convert_canonical_to_posix(const char* s, int rcr, int rlf)
4559 {
4560  const char* res = NULL;
4561  size_t i = 0;
4562  char* buf = NULL;
4563  size_t len = 0;
4564  size_t bi = 0;
4565  char* p;
4566  size_t escr = 0;
4567  size_t eslf = 0;
4568  long int rc_ucp = ENC_RC;
4569  size_t di;
4570 
4571  if(NULL != s)
4572  {
4573  /* Check for empty string and accept it */
4574  if(!s[0])
4575  {
4576  p = (char*) api_posix_malloc((size_t) 1);
4577  if(NULL != p) { p[0] = 0; res = p; }
4578  }
4579  else
4580  {
4581  while(s[i])
4582  {
4583  /*
4584  * Reserve space for one Unicode codepoint in UTF-8 transformation
4585  * format (4 octets in worst case).
4586  * At least 1 octet must stay available for NUL termination.
4587  */
4588  if(bi + (size_t) 4 + (size_t) 1 >= len)
4589  {
4590  /* Allocate more memory in exponentially increasing chunks */
4591  if(!len) { len = 64; }
4592  p = (char*) api_posix_realloc((void*) buf, len *= (size_t) 2);
4593  if(NULL == p)
4594  {
4595  api_posix_free((void*) buf);
4596  buf = NULL;
4597  break;
4598  }
4599  else { buf = p; }
4600  }
4601  /* Check for end of line (CRLF) */
4602  if(bi && i && 0x0A == (int) s[i] && 0x0D == (int) s[i - (size_t) 1])
4603  {
4604  /* Yes => Replace the CR with LF and don't increment position */
4605  buf[bi - (size_t) 1] = 0x0A;
4606  }
4607  else if(i && 0x0A != (int) s[i] && 0x0D == (int) s[i - (size_t) 1])
4608  {
4609  /* Single CR character (not part of a CRLF sequence) detected */
4610  ++escr;
4611  if(rcr)
4612  {
4613  --bi;
4614  di = 1;
4615  enc_uc_encode_utf8(buf, &bi, &rc_ucp, &di);
4616  }
4617  buf[bi++] = s[i];
4618  }
4619  else if(0x0A == (int) s[i])
4620  {
4621  /* Single LF character (not part of a CRLF sequence) detected */
4622  ++eslf;
4623  if(rlf)
4624  {
4625  di = 1;
4626  enc_uc_encode_utf8(buf, &bi, &rc_ucp, &di);
4627  }
4628  else
4629  {
4630  buf[bi++] = s[i];
4631  }
4632  }
4633  else { buf[bi++] = s[i]; }
4634  ++i;
4635  }
4636  if(NULL != buf)
4637  {
4638  buf[bi] = 0;
4639  res = buf;
4640  /* Print stored errors */
4641  if(escr)
4642  {
4643  /* Print error message only once */
4644  PRINT_ERROR("Invalid CR control character(s) detected"
4645  " while decoding canonical format");
4646  }
4647  if(eslf)
4648  {
4649  /* Print error message only once */
4650  PRINT_ERROR("Invalid LF control character(s) detected"
4651  " while decoding canonical format");
4652  }
4653  }
4654  }
4655  }
4656 
4657  return(res);
4658 }
4659 
4660 
/* ========================================================================== */
/*! \brief Convert from local (POSIX) to canonical (RFC 822) form
 *
 * \param[in] s  String to convert
 *
 * According to RFC 822 and RFC 2049 this function accepts plain text article
 * content in local (POSIX) form and converts every single LF line break to
 * canonical (CRLF) form.
 *
 * According to RFC 2045 and RFC 2046 single CR characters are deleted (an
 * error message is printed for every deleted CR).
 *
 * A nonempty result that doesn't end with LF gets a CRLF appended. An empty
 * string is accepted and converted to an empty string.
 *
 * On success the caller is responsible to free the allocated memory.
 *
 * \return
 * - Pointer to decoded data (a new memory block was allocated)
 * - NULL on error
 */

const char* enc_convert_posix_to_canonical(const char* s)
{
   char*  out = NULL;
   char*  grown;
   size_t  cap = 0;    /* Allocated size of output buffer */
   size_t  used = 0;   /* Number of octets written to output buffer */
   size_t  pos;

   if(NULL == s)  { return(NULL); }

   /* Empty input gives empty output */
   if(!s[0])
   {
      out = (char*) api_posix_malloc((size_t) 1);
      if(NULL != out)  { out[0] = 0; }
      return(out);
   }

   for(pos = 0; s[pos]; ++pos)
   {
      /* At least 3 octets must stay available for CR + LF + NUL */
      if(used + (size_t) 4 >= cap)
      {
         /* Buffer size is doubled (first allocation is 128 octets) */
         if(!cap)  { cap = 64; }
         cap *= (size_t) 2;
         grown = (char*) api_posix_realloc((void*) out, cap);
         if(NULL == grown)
         {
            api_posix_free((void*) out);
            return(NULL);
         }
         out = grown;
      }
      switch((int) s[pos])
      {
         case 0x0A:
         {
            /* End of line: Insert a CR before the LF */
            out[used++] = 0x0D;
            out[used++] = 0x0A;
            break;
         }
         case 0x0D:
         {
            /* CR is not copied to the output */
            PRINT_ERROR("Invalid CR control character deleted"
                        " while converting to canonical format");
            break;
         }
         default:
         {
            out[used++] = s[pos];
            break;
         }
      }
   }

   /* Ensure that last line of nonempty result ends with CRLF */
   if(used && 0x0A != (int) out[used - (size_t) 1])
   {
      out[used++] = 0x0D;
      out[used++] = 0x0A;
   }
   /* Add termination */
   out[used] = 0;

   return(out);
}
4750 
4751 
4752 /* ========================================================================== */
4753 /*! \brief Convert string from supported character set to Unicode (UTF-8 NFC)
4754  *
4755  * \param[in] charset Character set of string \e s
4756  * \param[in] s String to convert
4757  *
4758  * According to RFC 2049 the following rules are applied:
4759  * - For all character sets from the ISO 8859 family that are not supported,
4760  * at least the ASCII characters must be decoded correctly
4761  * => We decode all non ASCII characters as "?" in this case.
4762  *
4763  * According to RFC 3629 the following rules are applied:
4764  * - If the input data is already UTF-8 is is not allowed to accept it
4765  * unchecked. It is mandatory to check the validity of the encoding
4766  * => We do so.
4767  *
4768  * \note
4769  * Some control characters that may cause problems are removed.
4770  *
4771  * \return
4772  * - Pointer to decoded Unicode data (UTF-8 encoded with NFC normalization)
4773  * If the result is not equal to \e s , a new memory block was allocated
4774  * - NULL on error (Original memory block for \e s is still allocated)
4775  */
4776 
4777 const char* enc_convert_to_utf8_nfc(enum enc_mime_cs charset, const char* s)
4778 {
4779  const char* res = NULL;
4780  char* p;
4781  const char* tmp;
4782  size_t len;
4783  size_t i;
4784  size_t ii;
4785  long int ucp;
4786  long int rc_ucp = ENC_RC;
4787  char rc_utf8[5] = { 0 };
4788  int cc_flag = 0; /* Flag indicating unwanted control characters */
4789  size_t di;
4790 
4791  switch(charset)
4792  {
4793  case ENC_CS_ISO8859_X:
4794  {
4795  PRINT_ERROR("Convert unsupported ISO 8859 character set as US-ASCII");
4796  /* No break here is intended */
4797  }
4798  /* FALLTHROUGH */
4799  case ENC_CS_ASCII:
4800  {
4801  len = strlen(s);
4802  p = (char*) api_posix_malloc(++len);
4803  if(NULL == p) { break; }
4804  for(i = 0; i < len; ++i)
4805  {
4806  p[i] = s[i];
4807  if((unsigned char) 127 < (unsigned char) p[i]) { p[i] = '?'; }
4808  }
4809  res = p;
4810  break;
4811  }
4812  case ENC_CS_ISO8859_1:
4813  case ENC_CS_ISO8859_2:
4814  case ENC_CS_ISO8859_3:
4815  case ENC_CS_ISO8859_4:
4816  case ENC_CS_ISO8859_5:
4817  case ENC_CS_ISO8859_6:
4818  case ENC_CS_ISO8859_7:
4819  case ENC_CS_ISO8859_8:
4820  case ENC_CS_ISO8859_9:
4821  case ENC_CS_ISO8859_10:
4822  case ENC_CS_ISO8859_11:
4823  case ENC_CS_ISO8859_13:
4824  case ENC_CS_ISO8859_14:
4825  case ENC_CS_ISO8859_15:
4826  case ENC_CS_ISO8859_16:
4827  case ENC_CS_MACINTOSH:
4828  case ENC_CS_KOI8R:
4829  case ENC_CS_KOI8U:
4830  case ENC_CS_WINDOWS_1250:
4831  case ENC_CS_WINDOWS_1251:
4832  case ENC_CS_WINDOWS_1252:
4833  case ENC_CS_WINDOWS_1253:
4834  case ENC_CS_WINDOWS_1254:
4835  case ENC_CS_WINDOWS_1255:
4836  case ENC_CS_WINDOWS_1256:
4837  case ENC_CS_WINDOWS_1257:
4838  case ENC_CS_WINDOWS_1258:
4839  case ENC_CS_IBM437:
4840  case ENC_CS_IBM775:
4841  case ENC_CS_IBM850:
4842  case ENC_CS_IBM852:
4843  case ENC_CS_IBM858:
4844  {
4845  res = enc_8bit_convert_to_utf8(charset, s);
4846  break;
4847  }
4848  case ENC_CS_ISO2022_JP:
4849  {
4850  res = enc_iso2022jp_convert_to_utf8(s);
4851  break;
4852  }
4853  case ENC_CS_UTF_7:
4854  {
4855  res = enc_uc_convert_nsutf_to_utf8(s, "UTF-7");
4856  break;
4857  }
4858  case ENC_CS_CESU_8:
4859  {
4860  res = enc_uc_convert_nsutf_to_utf8(s, "CESU-8");
4861  break;
4862  }
4863  case ENC_CS_UTF_8:
4864  {
4865  res = s;
4866  break;
4867  }
4868  default:
4869  {
4870  /* Not supported */
4871  res = NULL;
4872  break;
4873  }
4874  }
4875 
4876  /* Check encoding */
4877  if(NULL != res)
4878  {
4879  if(enc_uc_check_utf8(res))
4880  {
4881  /* Encoding is invalid */
4882  if(ENC_CS_UTF_8 != charset && ENC_CS_CESU_8 != charset
4883  && ENC_CS_UTF_7 != charset)
4884  {
4885  PRINT_ERROR("Invalid UTF-8 encoding detected");
4886  }
4887  /* Repair encoding */
4888  tmp = enc_uc_repair_utf8(res);
4889  if(res != tmp && res != s) { api_posix_free((void*) res); }
4890  res = tmp;
4891  }
4892  }
4893 
4894  /* Normalize to NFC */
4895  if(NULL != res)
4896  {
4897  tmp = enc_uc_normalize_to_nfc(res);
4898  if(res != tmp && res != s) { api_posix_free((void*) res); }
4899  res = tmp;
4900  }
4901 
4902  /* Remove unwanted control characters */
4903  if(NULL != res)
4904  {
4905  i = 0;
4906  while(1)
4907  {
4908  ucp = enc_uc_decode_utf8(res, &i);
4909  if(-1L == ucp) { break; }
4910  if(enc_uc_check_control(ucp)) { cc_flag = 1; break; }
4911  }
4912  if(cc_flag)
4913  {
4914  /* Unwanted control characters found */
4915  len = strlen(res);
4916  di = 1;
4917  i = 0;
4918  enc_uc_encode_utf8(rc_utf8, &i, &rc_ucp, &di);
4919  rc_utf8[i] = 0;
4920  len *= strlen(rc_utf8);
4921  p = (char*) api_posix_malloc(++len);
4922  if(NULL == p)
4923  {
4924  if(s != res) { api_posix_free((void*) res); }
4925  res = NULL;
4926  }
4927  else
4928  {
4929  i = 0; ii = 0;
4930  while(1)
4931  {
4932  ucp = enc_uc_decode_utf8(res, &i);
4933  if(-1L == ucp) { break; }
4934  if(enc_uc_check_control(ucp))
4935  {
4936  /* Replace them */
4937  di = 1;
4938  enc_uc_encode_utf8(p, &ii, &rc_ucp, &di);
4939  }
4940  else
4941  {
4942  di = 1;
4943  enc_uc_encode_utf8(p, &ii, &ucp, &di);
4944  }
4945  }
4946  p[ii] = 0;
4947  if(s != res) { api_posix_free((void*) res); }
4948  res = p;
4949  }
4950  PRINT_ERROR("Unwanted control characters detected and replaced");
4951  }
4952  }
4953 
4954  return(res);
4955 }
4956 
4957 
4958 /* ========================================================================== */
4959 /*! \brief Convert string from Unicode (UTF-8 NFC) to an 8bit character set
4960  *
4961  * \param[out] charset Pointer to character set of result (or \c NULL)
4962  * \param[in] s Unicode string to convert in UTF-8 NFC format
4963  * \param[out] cs_iana Pointer to IANA charset name of result (or \c NULL)
4964  *
4965  * \attention
4966  * Ensure that the string \e s is valid UTF-8 and normalized to NFC. Otherwise
4967  * this function will not work as expected.
4968  *
4969  * According to RFC 2046 the following rules are applied:
4970  * - In general, composition software should always use the "lowest common
4971  * denominator" character set possible
4972  * => We do so by preferring the widely supported ISO 8859-1 character set.
4973  *
4974  * \note
4975  * If this function supports more character sets in the future, ISO 8859-1 must
4976  * always stay the preferred one (because this is our fallback locale character
4977  * set to allow the use of POSIX regular expressions without Unicode support
4978  * from the system).
4979  *
4980  * If \c NULL is passed as parameter \e charset or \e cs_iana , this indicates
4981  * that the caller is not interested in this information. The corresponding
4982  * data is discarded in this case.
4983  *
4984  * \return
4985  * - Pointer to encoded data (the character set is written to \e charset)
4986  * If the result is not equal to \e s , a new memory block was allocated
4987  * - NULL on error (Original memory block for \e s is still allocated)
4988  * Nothing is written to \e charset and \e cs_iana in this case
4989  */
4990 
4991 const char* enc_convert_to_8bit(enum enc_mime_cs* charset, const char* s,
4992  const char** cs_iana)
4993 {
4994  const char* res = NULL;
4995  size_t i = 0;
4996  size_t ii = 0;
4997  long int ucp = 0;
4998  char* p = NULL;
4999  size_t len;
5000  int error = 0;
5001 
5002  /*
5003  * Allocate target buffer with same size as source buffer.
5004  * This is always sufficient for every 8bit character set.
5005  */
5006  len = strlen(s);
5007  p = (char*) api_posix_malloc(++len);
5008  if(NULL != p)
5009  {
5010  while(1)
5011  {
5012  ucp = enc_uc_decode_utf8(s, &i);
5013  if(-1L == ucp) { break; }
5014  /* ISO 8859-1 is mapped 1:1 into the Unicode codepoint space */
5015  if(256L <= ucp) { error = 1; break; }
5016  else { p[ii++] = (char) (unsigned char) ucp; }
5017  }
5018  /* Check for error */
5019  if(error) { api_posix_free((void*) p); }
5020  else
5021  {
5022  p[ii] = 0;
5023  res = p;
5024  if(NULL != charset) { *charset = ENC_CS_ISO8859_1; }
5025  if(NULL != cs_iana) { *cs_iana = "ISO-8859-1"; }
5026  }
5027  }
5028 
5029  return(res);
5030 }
5031 
5032 
5033 /* ========================================================================== */
5034 /*! \brief Encode header field body using MIME \c encoded-word tokens
5035  *
5036  * This function uses quoted-printable encoding.
5037  *
5038  * \param[out] r Pointer to result string pointer
5039  * \param[in] b Header field body that contains potential Unicode data
5040  * \param[in] pl Length of header field prefix (Length limit: 25)
5041  *
5042  * The header field body \e b must be verified by the caller to be valid UTF-8
5043  * (this function will do the normalization to NFC).
5044  * The CRLF termination must be removed before calling this function.
5045  *
5046  * The length \e pl must include the header field name, the colon and any
5047  * potential white space not included in \e b .
5048  *
5049  * According to RFC 5536 the following rules are applied:
5050  * - A header field line is not allowed to be empty
5051  * => The header field is never folded immediately after the name separator.
5052  * - Lines are not allowed to contain more than 1000 characters
5053  * => We respect this by rejecting words that are longer than 998 characters.
5054  *
5055  * According to RFC 2047 the following rules are applied:
5056  * - White space between encoded-words is semantically ignored
5057  * => A single space between encoded-words is included in the trailing word,
5058  * additional LWSP characters are included into the leading word.
5059  * - A header line containing encoded-words must be no longer than 76 characters
5060  * => We fold before this limit.
5061  * - If folding is required, each encoded-word must contain an integral number
5062  * of characters and must be self-contained
5063  * => We only split between Unicode combining character sequences when using
5064  * UTF-8
5065  * (between grapheme clusters would be better, but is not supported yet)
5066  * - If there is more than one character set that can represent the 8-bit
5067  * content of an encoded-word, ISO 8859 should be preferred
5068  * => We do so if the required ISO 8859 encoder is available
5069  * (can be disabled with the \c force_unicode option in configfile).
5070  * - If encoded-word is not used because of 8-bit data, US-ASCII should be used
5071  * => We do so
5072  * (can be disabled with the \c force_unicode option in configfile).
5073  *
5074  * According to RFC 5198 the following rules are applied:
5075  * - It's recommended to use NFC normalization in general Internet text messages
5076  * => We do so.
5077  *
5078  * On success, the address of the result buffer is written to the location
5079  * pointed to by \e r (this may be the same as \e b if there is nothing to do).
5080  * The caller is responsible to free the potentially allocated memory.
5081  * On error \c NULL is written to the location pointed to by \e r .
5082  *
5083  * \return
5084  * - 0 on success if a new memory block was allocated
5085  * - 1 on success if there was nothing to encode and no memory was allocated
5086  * - -1 on error
5087  */
5088 
5089 int enc_mime_word_encode(const char** r, const char* b, size_t pl)
5090 {
5091  static const char error_msg[] = "[Error]";
5092  static const char folding[] = "\n "; /* Line break must be in POSIX form */
5093  int res = 0;
5094  char* rbuf = NULL;
5095  size_t rbuf_len = 0;
5096  size_t ri = 0;
5097  const char* body = NULL;
5098  const char* body_tmp = NULL;
5099  const char* cs_iana = "UTF-8";
5100  enum enc_mime_cs cs = ENC_CS_UTF_8;
5101  size_t start = 0;
5102  size_t end = 0;
5103  size_t i = 0;
5104  size_t ii;
5105  size_t iii;
5106  int enc_flag = 0;
5107  int enc_last = 0;
5108  int enc_split = 0;
5109  char enc_word[1001]; /* sizeof(folding) + 998 + NUL */
5110  size_t ei;
5111  size_t word_len;
5112  unsigned int dh, dl;
5113  char* p;
5114  size_t rem = 0;
5115  int init = 1; /* Flag indicating initial word */
5116  int first = 1; /* Flag indicating first line of header field */
5117  int uc_split; /* Flag indicating Unicode must be split here */
5118 #if !ENC_MIME_HEADER_FOLD_ASCII_LINES
5119  int no_ec = 1; /* Flag indicating line contains no encoded-words */
5120 #endif /* ENC_MIME_HEADER_FOLD_ASCII_LINES */
5121  long int ucp; /* Unicode code point */
5122  struct uc_cdc cdc; /* Unicode canonical decomposition data */
5123  size_t gcpsl; /* Unicode combing character sequence length */
5124  int eod; /* End of data */
5125 
5126  /* Check parameters */
5127  if((size_t) 25 < pl)
5128  {
5129  PRINT_ERROR("MIME: Header field name too long");
5130  res = -1;
5131  }
5132  else
5133  {
5134  /* Calculate remaining bytes for folding */
5135  rem = (size_t) 76 - pl;
5136  /*
5137  * Check whether header field body contains only printable ASCII
5138  * and no "=?" or "?=" (to be more friendly) sequences
5139  */
5141  && NULL == strstr(b, "=?") && NULL == strstr(b, "?="))
5142  {
5143  /* Nothing to do => Data can be used "as is" */
5144  res = 1;
5145  }
5146  else
5147  {
5148  /* Check Unicode */
5149  if(enc_uc_check_utf8(b))
5150  {
5151  /* Invalid Unicode */
5152  PRINT_ERROR("MIME: Encoding of header field failed");
5153  p = (char*) api_posix_malloc(strlen(error_msg) + (size_t) 1);
5154  if(NULL != p) { strcpy(p, error_msg); }
5155  body_tmp = p;
5156  }
5157  else
5158  {
5159  /* Normalize Unicode */
5160  body_tmp = enc_uc_normalize_to_nfc(b);
5161  }
5162  if(NULL == body_tmp) { res = -1; }
5163  }
5164  }
5165 
5166  /* Check for error */
5167  if(!res)
5168  {
5169  /* Check whether user has forced Unicode */
5170  if (config[CONF_FORCE_UNICODE].val.i) { body = body_tmp; }
5171  else
5172  {
5173  /* Convert body to target character set */
5174  body = enc_convert_to_8bit(&cs, body_tmp, &cs_iana);
5175  if(NULL == body) { body = body_tmp; }
5176  else
5177  {
5178  /* Check for 7bit data */
5179  if (0 == enc_ascii_check(body))
5180  {
5181  cs = ENC_CS_ASCII;
5182  cs_iana = "US-ASCII";
5183  }
5184  }
5185  }
5186  /* Split body into words using SP delimiter */
5187  do
5188  {
5189  end = i++;
5190  if(!body[i] || ' ' == body[i])
5191  {
5192  /* Check for 2*LWSP */
5193  if(body[i])
5194  {
5195  if(' ' == body[i + (size_t) 1]
5196  || (char) 0x09 == body[i + (size_t) 1])
5197  {
5198  continue;
5199  }
5200  }
5201  /* Check whether word needs encoding */
5202  enc_last = enc_flag; enc_flag = 0;
5203  ei = 0;
5204  for(ii = start; ii <= end; ++ii)
5205  {
5206  enc_word[ei++] = body[ii];
5207  if(128U <= (unsigned int) body[ii]) { enc_flag = 1; break; }
5208  if('=' == (unsigned int) body[ii])
5209  {
5210  if((ii < end && '?' == body[ii + (size_t) 1])
5211  || (ii > start && '?' == body[ii - (size_t) 1]))
5212  {
5213  enc_flag = 1;
5214  break;
5215  }
5216  }
5217  }
5218  if(enc_split) { enc_flag = 1; }
5219  if(enc_flag)
5220  {
5221  /* Create MIME encoded word using quoted printable encoding */
5222 #if !ENC_MIME_HEADER_FOLD_ASCII_LINES
5223  no_ec = 0;
5224 #endif /* ENC_MIME_HEADER_FOLD_ASCII_LINES */
5225  strcpy(enc_word, "=?");
5226  strcat(enc_word, cs_iana);
5227  strcat(enc_word, "?Q?");
5228  uc_split = 0;
5229  if(enc_last && !enc_split)
5230  {
5231  /* The space between encoded words is not semantical */
5232  strcat(enc_word, "_");
5233  }
5234  ei = strlen(enc_word);
5235  for(ii = start; ii <= end; ++ii)
5236  {
5237  /* Check for start of UTF-8 sequence */
5238  if(ENC_CS_UTF_8 == cs && 0x80 != ((int) body[ii] & 0xC0))
5239  {
5240  /* Search for next starter */
5241  eod = 0;
5242  iii = 0;
5243  while(!uc_split)
5244  {
5245  /* Count bytes as "=XX", even if encoded "as is" */
5246  gcpsl = iii * (size_t) 3;
5247  /* Check for end of data */
5248  if(!body[ii + iii]) { eod = 1; }
5249  else
5250  {
5251  /* Decode UTF-8 sequence for codepoint */
5252  if(!body[ii + iii]) { break; }
5253  ucp = enc_uc_decode_utf8(&body[ii], &iii);
5254  if(0L > ucp)
5255  {
5256  PRINT_ERROR("MIME: Decoding UCP failed");
5257  break;
5258  }
5259  enc_uc_lookup_cdc(ucp, &cdc);
5260  /* Check for starter */
5261  if(!gcpsl)
5262  {
5263  /* If Starter => Skip */
5264  if(!cdc.ccc) { continue; }
5265  /* Else abort */
5266  else { break; }
5267  }
5268  }
5269  /*
5270  * Check for next Unicode combining character sequence
5271  * boundary
5272  */
5273  if(eod || !cdc.ccc) /* Check eod first */
5274  {
5275  /* Combining character sequence boundary found */
5276  /*
5277  * Reserve space for encoded word prefix and suffix:
5278  * "=?UTF-8?Q??="
5279  * => 12 characters with the folding space
5280  */
5281  if((size_t) (75 - 12) < gcpsl)
5282  {
5283  /* Combining character sequence too long */
5284  PRINT_ERROR("MIME: "
5285  "Combining character sequence too long");
5286  /*
5287  * Replace with '?' (U+FFFD is too large!)
5288  * Maximum allowed length is one "=XX" triplet.
5289  */
5290  enc_word[ei++] = '=';
5291  enc_word[ei++] = '3';
5292  enc_word[ei++] = 'F';
5293  ii += iii - (size_t) 1;
5294  uc_split = 1;
5295  }
5296  /*
5297  * Check for length limit
5298  * Reserve 2 characters for closing "?="
5299  * Special handling for first line with less space
5300  */
5301  else if(first && ((size_t) (rem - 2) - gcpsl < ei))
5302  {
5303  uc_split = 1;
5304  }
5305  else if((size_t) (75 - 2) - gcpsl < ei)
5306  {
5307  uc_split = 1;
5308  }
5309  break;
5310  }
5311  }
5312  }
5313  if(uc_split) { /* Rewind current byte */ --ii; }
5314  else
5315  {
5316  /* Check whether character can be encoded "as is" */
5317  if( ('0' <= body[ii] && '9' >= body[ii])
5318  || ('A' <= body[ii] && 'Z' >= body[ii])
5319  || ('a' <= body[ii] && 'z' >= body[ii])
5320  || '!' == body[ii] || '*' == body[ii] || '+' == body[ii]
5321  || '-' == body[ii] || '/' == body[ii] )
5322  {
5323  /* Yes */
5324  enc_word[ei++] = body[ii];
5325  }
5326  else
5327  {
5328  /* No => Encode with hexadecimal syntax */
5329  enc_word[ei++] = '=';
5330  dh = (unsigned int) (unsigned char) body[ii] / 16U;
5331  if(10U > dh) { enc_word[ei++] = (char) (48U + dh); }
5332  else { enc_word[ei++] = (char) (65U + dh - 10U); }
5333  dl = (unsigned int) (unsigned char) body[ii] % 16U;
5334  if(10U > dl) { enc_word[ei++] = (char) (48U + dl); }
5335  else { enc_word[ei++] = (char) (65U + dl - 10U); }
5336  }
5337  }
5338  /*
5339  * Check for length limit
5340  * Reserve 3 characters for next hexdecimal value
5341  * Reserve 2 characters for closing "?="
5342  */
5343  if(uc_split || (size_t) (75 - 3 - 2) < ei)
5344  {
5345  /* Terminate normally if there are no more characters */
5346  if(ii < end)
5347  {
5348  enc_split = 1;
5349  /* Rewind index to process skipped data in next run */
5350  i -= (end - ii);
5351  --i;
5352  break;
5353  }
5354  }
5355  else { enc_split = 0; }
5356  }
5357  /* End mark of encoded-word */
5358  enc_word[ei++] = '?';
5359  enc_word[ei++] = '=';
5360  }
5361  /* Terminate word */
5362  enc_word[ei] = 0;
5363  /* printf("Word: |%s|\n", enc_word); */
5364  /* One additional character for potential delimiting space */
5365  word_len = strlen(enc_word) + (size_t) 1;
5366  if((size_t) 998 < word_len)
5367  {
5368  PRINT_ERROR("MIME: Encoded-word too long");
5369  res = -1;
5370  break;
5371  }
5372  /* Fold header field if lines get too long otherwise */
5373  if(word_len && (word_len > rem)
5375  && !(no_ec && !enc_flag && (word_len < rem + (size_t) 922))
5376 #endif /* ENC_MIME_HEADER_FOLD_ASCII_LINES */
5377  )
5378  {
5379  /* Fold => This automatically creates SP delimiter */
5380  if(first)
5381  {
5382  PRINT_ERROR("MIME: Encoded-word too long for first line");
5383  res = -1;
5384  break;
5385  }
5386  else if(word_len > rem)
5387  {
5388  memmove((void*) &enc_word[strlen(folding)], (void*) enc_word,
5389  word_len--);
5390  /* Decrement because SP delimitier is part of folding mark */
5391  memcpy((void*) enc_word, (void*) folding, strlen(folding));
5392  word_len += strlen(folding);
5393  rem = (size_t) 75;
5394  }
5395 #if !ENC_MIME_HEADER_FOLD_ASCII_LINES
5396  /* Check whether last word was an encoded word */
5397  if(!enc_flag) { no_ec = 1; }
5398 #endif /* ENC_MIME_HEADER_FOLD_ASCII_LINES */
5399  }
5400  else
5401  {
5402  /*
5403  * Prepend SP delimiter
5404  * Note that this delimiter is always syntactical, but not sematical
5405  * between two encoded words!
5406  */
5407  if(init) { init = 0; --word_len; }
5408  else
5409  {
5410  memmove((void*) &enc_word[1], (void*) enc_word, word_len);
5411  enc_word[0] = ' ';
5412  }
5413  }
5414  /* Allocate more memory in exponentially increasing chunks */
5415  /* Attention: Be prepared for large data (ASCII only lines) */
5416  while(ri + word_len >= rbuf_len) /* One additional byte for NUL */
5417  {
5418  if(!rbuf_len) { rbuf_len = 128; }
5419  p = api_posix_realloc((void*) rbuf, rbuf_len *= (size_t) 2);
5420  if(NULL == p) { res = -1; break; }
5421  else { rbuf = p; }
5422  }
5423  if(-1 == res) { break; }
5424  /* Copy word to result buffer */
5425  memcpy((void*) &rbuf[ri], (void*) enc_word, word_len);
5426  ri += word_len;
5427  if(rem < word_len) { rem = 0; }
5428  else { rem -= word_len; }
5429  first = 0;
5430  /* Store new start index */
5431  start = i + (size_t) 1;
5432  }
5433  }
5434  while(body[i]);
5435  }
5436  if(body != body_tmp) { api_posix_free((void*) body); }
5437  if(body_tmp != b) { api_posix_free((void*) body_tmp); }
5438  /* Terminate result string */
5439  if(NULL != rbuf) { rbuf[ri] = 0; }
5440 
5441  /* Check result */
5442  switch(res)
5443  {
5444  case 0:
5445  {
5446  *r = (const char*) rbuf;
5447  break;
5448  }
5449  case 1:
5450  {
5451  *r = b;
5452  break;
5453  }
5454  default:
5455  {
5456  api_posix_free((void*) rbuf);
5457  *r = NULL;
5458  break;
5459  }
5460  }
5461  /* if(0 <= res) { printf("Result: %s\n", *r); } */
5462 
5463  return(res);
5464 }
5465 
5466 
5467 /* ========================================================================== */
5468 /*! \brief Decode header field containing potential MIME \c encoded-word tokens
5469  *
5470  * \param[out] r Pointer to result string pointer
5471  * \param[in] b Header field body that contains potential encoded-words
5472  *
5473  * The header field body \e b must be unfolded before calling this function.
5474  *
5475  * According to RFC 2047 the following rules are applied:
5476  * - An encoded-word is not allowed to be longer than 75 characters
5477  * => We decode encoded-word of arbitrary length.
5478  * - An encoded-word not at the beginning can start after a 'linear-white-space'
5479  * token => We resync the parser after every white space.
5480  * - Any amount of linear-space-white between 'encoded-word's must be ignored
5481  * => We do so.
5482  * - The character set and encoding fields must be treated case-insensitive
5483  * => We do so.
5484  * - All character sets from the ISO 8859 family that are not supported must be
5485  * handled in a way that contained ASCII characters are decoded correctly
5486  * => We do so.
5487  *
5488  * According to RFC 3629 the following rules are applied:
5489  * - If the content of an encoded word is UTF-8 encoded, it is is not allowed
5490  * to accept it unchecked. It is mandatory to check the validity of the
5491  * encoding => We do so.
5492  *
5493  * On success, the address of the result buffer is written to the location
5494  * pointed to by \e r (this may be the same as \e b if there is nothing to do).
5495  * The caller is responsible to free the potentially allocated memory.
5496  * On error \c NULL is written to the location pointed to by \e r .
5497  *
5498  * \return
5499  * - 0 on success if something was decoded and a new memory block was allocated
5500  * - 1 on success if there was nothing to decode and no memory was allocated
5501  * - -1 on error
5502  */
5503 
5504 int enc_mime_word_decode(const char** r, const char* b)
5505 {
5506  int res = 0;
5507  char* rbuf = NULL;
5508  size_t rbuf_len = 0;
5509  size_t ri = 0;
5510  size_t i = 0;
5511  const char* target;
5512  char* p;
5513  char* p2;
5514  enum enc_mime_cs charset;
5515  char encoding;
5516  const char* wbuf;
5517  const char* nbuf;
5518  size_t nbuf_len;
5519  int word_flag = 0;
5520  size_t word_trailing_space = 0;
5521  size_t ii;
5522  int ctrl = 0; /* Indicates unwanted control characters to remove */
5523  size_t len = 0;
5524 
5525  /* Fast special check for "no equal sign" */
5526  target = strchr(&b[i], (int) '=');
5527  if(NULL == target)
5528  {
5529  /* ... and no unwanted LF and CR control characters */
5530  target = strchr(&b[i], 0x0A);
5531  if(NULL == target)
5532  {
5533  target = strchr(&b[i], 0x0D);
5534  if(NULL == target) { res = 1; }
5535  }
5536  }
5537 
5538  while(!res && b[i])
5539  {
5540  wbuf = NULL;
5541  /* Skip white space */
5542  nbuf_len = 0;
5543  while(b[i] &&
5544  (' ' == b[i + nbuf_len] || (const char) 0x09 == b[i + nbuf_len]))
5545  {
5546  ++nbuf_len;
5547  }
5548  if(!nbuf_len)
5549  {
5550  /* Check for encoded word */
5551  p = NULL;
5552  target = &b[i];
5553  if('=' == target[0])
5554  {
5555  if('?' == target[1])
5556  {
5557  /* Start delimiter detected */
5558  p = strchr(&target[2], (int) '?');
5559  if(NULL != p)
5560  {
5561  /* Extract character set (ignore RF2231 language tokens) */
5562  p2 = strchr(&target[2], (int) '*');
5563  if(NULL == p2) { p2 = p; }
5564  else if(p < p2) { p2 = p; }
5565  charset = enc_mime_get_charset(&target[2],
5566  (size_t) (p2 - &target[2]));
5567  /* Extract encoding */
5568  if(p[1])
5569  {
5570  encoding = (char) toupper((int) p[1]);
5571  if('?' != p[2])
5572  {
5573  PRINT_ERROR("MIME: Syntax error in encoded-word");
5574  }
5575  else
5576  {
5577  /* Extract payload */
5578  target = &p[3];
5579  p = strchr(target, (int) '?');
5580  if(NULL != p)
5581  {
5582  if('=' != p[1])
5583  {
5584  PRINT_ERROR("MIME: "
5585  "Too many fields in encoded-word");
5586  }
5587  else
5588  {
5589  /* End delimiter detected */
5590  switch(encoding)
5591  {
5592  case 'Q':
5593  {
5594  /* Use quoted printable decoder */
5595  wbuf = enc_mime_decode_q(charset,
5596  target, p, 1);
5597  break;
5598  }
5599  case 'B':
5600  {
5601  /* Use base64 decoder */
5602  wbuf = enc_mime_decode_b(charset,
5603  target, p);
5604  break;
5605  }
5606  default:
5607  {
5608  PRINT_ERROR("MIME: Encoding not supported");
5609  break;
5610  }
5611  }
5612  }
5613  }
5614  }
5615  }
5616  }
5617  }
5618  }
5619  if(NULL != wbuf)
5620  {
5621  /* Rewind white space between encoded words */
5622  if(word_flag)
5623  {
5624  while( ri && (' ' == rbuf[ri - (size_t) 1] ||
5625  0x09 == (int) rbuf[ri - (size_t) 1]) )
5626  {
5627  --ri;
5628  }
5629  ri += word_trailing_space;
5630  }
5631  /* Copy encoded word */
5632  word_flag = 1;
5633  nbuf = wbuf;
5634  nbuf_len = strlen(nbuf);
5635  i += (size_t) (&p[2] - &b[i]);
5636  /* Store number of trailing spaces */
5637  word_trailing_space = 0;
5638  if(nbuf_len)
5639  {
5640  ii = nbuf_len;
5641  while(ii--)
5642  {
5643  if(' ' != nbuf[ii]) { break; }
5644  else { ++word_trailing_space; }
5645  }
5646  }
5647  }
5648  else
5649  {
5650  /* Copy as ASCII up to next white space */
5651  word_flag = 0;
5652  nbuf = &b[i];
5653  p = strchr(nbuf, (int) ' ');
5654  p2 = strchr(nbuf, 0x09);
5655  if(NULL != p2 && p2 < p) { p = p2; }
5656  if(NULL == p) { nbuf_len = strlen(nbuf); }
5657  else { nbuf_len = (size_t) (p - nbuf); }
5658  i += nbuf_len;
5659  }
5660  }
5661  else
5662  {
5663  /* Copy white space */
5664  nbuf = &b[i];
5665  i += nbuf_len;
5666  }
5667 
5668  /* Allocate more memory in exponentially increasing chunks */
5669  while(ri + nbuf_len >= rbuf_len) /* 1 additional byte for termination */
5670  {
5671  if(!rbuf_len) { rbuf_len = 128; }
5672  if(API_POSIX_SIZE_MAX / (size_t) 2 < rbuf_len) { res = -1; break; }
5673  p = (char*) api_posix_realloc((void*) rbuf, rbuf_len *= (size_t) 2);
5674  if(NULL == p) { res = -1; break; }
5675  else { rbuf = p; }
5676  }
5677 
5678  /* Copy decoded word to result buffer */
5679  memcpy((void*) &rbuf[ri], (void*) nbuf, nbuf_len);
5680  ri += nbuf_len;
5681  if(NULL != wbuf) { api_posix_free((void*) wbuf); }
5682  }
5683  /* Terminate result string */
5684  if(NULL != rbuf) { len = ri; rbuf[len] = 0; }
5685 
5686  /* Replace unwanted LF and CR control characters with U+FFFE */
5687  if(NULL != rbuf)
5688  {
5689  ri = 0;
5690  while(rbuf[ri])
5691  {
5692  if(0x0A == (int) rbuf[ri] || 0x0D == (int) rbuf[ri])
5693  {
5694  ctrl = 1;
5695  break;
5696  }
5697  ++ri;
5698  }
5699  if(ctrl)
5700  {
5701  if(API_POSIX_SIZE_MAX / (size_t) 3 <= len) { res = -1; }
5702  else
5703  {
5704  /* Multiply string length by 3 and add 1 for NUL termination */
5705  rbuf_len = len * (size_t) 3 + (size_t) 1;
5706  p = (char*) api_posix_realloc((void*) rbuf, rbuf_len);
5707  if(NULL == p) { res = -1; }
5708  else
5709  {
5710  rbuf = p;
5711  /* Use U+FFFE (3 octets) as replacement character */
5712  ri = 0;
5713  do
5714  {
5715  if(0x0A == (int) rbuf[ri] || 0x0D == (int) rbuf[ri])
5716  {
5717  memmove((void*) &rbuf[ri + (size_t) 2], (void*) &rbuf[ri],
5718  rbuf_len - (ri + (size_t) 2));
5719  rbuf[ri++] = (unsigned char) 0xEF;
5720  rbuf[ri++] = (unsigned char) 0xBF;
5721  rbuf[ri] = (unsigned char) 0xBD;
5722  }
5723  }
5724  while(rbuf[++ri]);
5725  }
5726  PRINT_ERROR("MIME: "
5727  "Unwanted CR and/or LF detected in header field");
5728  }
5729  }
5730  }
5731 
5732  /* Check result */
5733  switch(res)
5734  {
5735  case 0:
5736  {
5737  *r = (const char*) rbuf;
5738  break;
5739  }
5740  case 1:
5741  {
5742  *r = b;
5743  break;
5744  }
5745  default:
5746  {
5747  api_posix_free((void*) rbuf);
5748  *r = NULL;
5749  break;
5750  }
5751  }
5752 
5753  return(res);
5754 }
5755 
5756 
5757 /* ========================================================================== */
5758 /*! \brief Decode header field containing potential MIME parameters
5759  *
5760  * \param[out] r Pointer to result string pointer
5761  * \param[in] b Prepared header field body that contains potential parameters
5762  * \param[in] m Operating mode (see description below)
5763  *
5764  * The parameter \e m enable special processing if set to a nonzero value.
5765  * \e m should be set to 1 for the \c Content-Type header field.
5766  *
5767  * \attention
5768  * This function must be called after unfolding the field body, with comments
5769  * stripped and after decoding of \c quoted-string tokens. Whitespace must
5770  * already be merged into the semantically equivalent single SP (and removed
5771  * completely before semicolons and around equal signs) by the caller.
5772  *
5773  * According to RFC 2231 the following rules are applied:
5774  * - Parameters can be split into multiple sections which can be listed in
5775  * arbitrary order inside the header field body
5776  * => We accept parameter sections in any order and merge them in ascending
5777  * order.
5778  * - Parameter sections are allowed to contain literal content as well as
5779  * \c quoted-string tokens. Mixing sections of both types is allowed
5780  * => \c quoted-string tokens must already be decoded in \e b by the caller.
5781  * - Parameters can contain character set information
5782  * => We accept content in any supported character set and decode it to
5783  * Unicode NFC (non-US_ASCII octets of unsupported character sets are
5784  * decoded to the underscore character).
5785  * - Parameter can contain language information => We accept and ignore it.
5786  *
5787  * According to RFC 3629 the following rules are applied:
5788  * - If the content of a parameter is UTF-8 encoded, it is is not allowed to
5789  * accept it unchecked. It is mandatory to check the validity of the encoding
5790  * => We do so.
5791  *
5792  * On success, the address of the result buffer is written to the location
5793  * pointed to by \e r (this may be the same as \e b if there is nothing to do).
5794  * The caller is responsible to free the potentially allocated memory.
5795  * On error \c NULL is written to the location pointed to by \e r .
5796  *
5797  * \return
5798  * - 0 on success if something was decoded and a new memory block was allocated
5799  * - 1 on success if there was nothing to decode and no memory was allocated
5800  * - -1 on error
5801  */
5802 
5803 int enc_mime_para_decode(const char** r, const char* b, int m)
5804 {
5805  int res = -1;
5806  api_posix_locale_t loc_ctype_posix = 0;
5807  int finished = 0;
5808  struct mime_parameter** parray = NULL;
5809  size_t ppsize = sizeof(struct mime_parameter*);
5810  struct mime_parameter* pdata;
5811  size_t psize = sizeof(struct mime_parameter);
5812  const char* first_end; /* Semicolon after first element */
5813  const char* p; /* Start of parameter */
5814  const char* p_cs; /* Start of charset name */
5815  const char* p_start; /* Start of parameter value */
5816  const char* p_end; /* End of parameter */
5817  const char* p_eq_sign; /* Position of equal sign */
5818  const char* p_asterisk; /* Position of asterisk (or 'p_eq_sign') */
5819  size_t alen; /* Length of attribute token */
5820  size_t clen; /* Length of charset token */
5821  size_t i = 0;
5822  size_t ii = 0;
5823  int rv;
5824  struct mime_parameter** tmp;
5825  char* tmp2;
5826  char* tmp3;
5827  unsigned int sec_num;
5828  char ext_mark; /* Flag indicating extended-parameter (with charset) */
5829  const char* q;
5830  char* rbuf = NULL;
5831  size_t rbuf_len;
5832  size_t ri = 0;
5833  size_t len;
5834  size_t len2;
5835  int rewind;
5836  int error = 0;
5837  char* para_charset; /* Pointer to charset declaration of first section */
5838 
5839  /* Create a locale object with LC_CTYPE == POSIX */
5840  loc_ctype_posix = api_posix_newlocale(API_POSIX_LC_CTYPE_MASK, "POSIX",
5841  (api_posix_locale_t) 0);
5842  if((api_posix_locale_t) 0 == loc_ctype_posix)
5843  {
5844  PRINT_ERROR("MIME: Cannot create locale object");
5845  return(res);
5846  }
5847 
5848  /* Nothing to do if there are no asterisks */
5849  if(NULL == strchr(b, (int) '*')) { *r = b; res = 1; }
5850  else
5851  {
5852 #if 0
5853  /* For debugging */
5854  printf("---------------\n");
5855  printf("Header field body : %s\n", b);
5856 #endif
5857  /* Skip to end of content */
5858  first_end = strchr(b, (int) ';');
5859  if(NULL == first_end) { *r = b; res = 1; }
5860  else
5861  {
5862  /* Initialize parameter section array */
5863  parray = (struct mime_parameter**) api_posix_malloc(ppsize);
5864  if(NULL != parray)
5865  {
5866  parray[0] = NULL;
5867  /* Parse parameters */
5868  p_end = first_end;
5869  do
5870  {
5871  p = p_end + 1;
5872  sec_num = 0;
5873  ext_mark = ' ';
5874  clen = 0;
5875  /* Skip potential space after semicolon */
5876  if(' ' == *p) { ++p; }
5877  /* Seach for end of parameter section content */
5878  p_end = strchr(p, (int) ';');
5879  if(NULL == p_end)
5880  {
5881  p_end = p + strlen(p);
5882  /* Strip potential trailing space */
5883  if(' ' == *(p_end - 1)) { --p_end; }
5884  finished = 1;
5885  }
5886  /* Search for end of parameter name */
5887  p_eq_sign = strchr(p, (int) '=');
5888  if(NULL == p_eq_sign) { break; }
5889  if(p_end < p_eq_sign) { break; }
5890  /* Search for end of attribute token (asterisk) */
5891  p_asterisk = strchr(p, (int) '*');
5892  if(NULL != p_asterisk && p_eq_sign > p_asterisk)
5893  {
5894  /* Extract section number */
5895  rv = sscanf(p_asterisk, " * %u", &sec_num);
5896  if(1 != rv)
5897  {
5898  /* No section number specified */
5899  sec_num = 0;
5900  /* Check for extended-parameter */
5901  sscanf(p_asterisk, " %c", &ext_mark);
5902  }
5903  else
5904  {
5905  /* Check for extended-parameter */
5906  sscanf(p_asterisk, " * %*u %c", &ext_mark);
5907  }
5908  }
5909  else { p_asterisk = p_eq_sign; }
5910  alen = (size_t) (p_asterisk - p);
5911  if(alen && ' ' == p[alen - (size_t) 1])
5912  {
5913  /* Strip potential trailing space */
5914  --alen;
5915  }
5916  /* Check for parameter attribute length limit */
5917  if(ENC_MIME_PARA_LENGTH_MAX < alen)
5918  {
5919  PRINT_ERROR("MIME: Parameter attribute too long");
5920  continue;
5921  }
5922  /* Extract charset */
5923  p_start = p_eq_sign + 1;
5924  p_cs = p_start;
5925  if(!sec_num && '*' == ext_mark)
5926  {
5927  q = strchr(p_start, 0x27);
5928  if(NULL == q)
5929  {
5930  PRINT_ERROR("MIME: Parameter charset field missing");
5931  }
5932  else
5933  {
5934  clen = (size_t) (q - p_start);
5935  if(ENC_MIME_PARA_LENGTH_MAX < clen)
5936  {
5937  PRINT_ERROR("MIME: Parameter charset too long");
5938  clen = 0;
5939  }
5940  p_start = q + 1;
5941  q = strchr(p_start, 0x27);
5942  if(NULL == q)
5943  {
5944  PRINT_ERROR("MIME: Parameter language field missing");
5945  }
5946  else { p_start = q + 1; }
5947  }
5948  }
5949  /* Remove unknown parameters for "Content-Type" mode */
5950  if(1 == m)
5951  {
5952  if(api_posix_strncasecmp_l(p, "Charset", alen, loc_ctype_posix)
5953  && api_posix_strncasecmp_l(p, "Format", alen, loc_ctype_posix)
5954  && api_posix_strncasecmp_l(p, "DelSp", alen, loc_ctype_posix)
5955  && api_posix_strncasecmp_l(p, "InsLine", alen, loc_ctype_posix)
5956  && api_posix_strncasecmp_l(p, "Boundary", alen, loc_ctype_posix)
5957  )
5958  {
5959  /* Ignore all other parameters */
5960  continue;
5961  }
5962  }
5963  /* Increase size of array */
5964  tmp = (struct mime_parameter**)
5965  api_posix_realloc(parray,
5966  ppsize += sizeof(struct mime_parameter*));
5967  if(NULL == tmp)
5968  {
5969  PRINT_ERROR("MIME: Parameter memory allocation failed");
5970  break;
5971  }
5972  parray = tmp;
5973  /* Construct parameter structure ... */
5974  pdata = (struct mime_parameter*) api_posix_malloc(psize);
5975  if(NULL == pdata)
5976  {
5977  PRINT_ERROR("MIME: Parameter memory allocation failed");
5978  break;
5979  }
5980  strncpy(pdata->attribute, p, alen);
5981  pdata->attribute[alen] = 0;
5982  pdata->attribute_len = alen;
5983  pdata->section = sec_num;
5984  strncpy(pdata->charset, p_cs, clen);
5985  pdata->charset[clen] = 0;
5986  pdata->value_start = p_start;
5987  pdata->value_end = p_end;
5988  pdata->valid = 1;
5989 #if 0
5990  /* For debugging */
5991  printf("Index : %u / Section: %u (%s): ", (unsigned int) i,
5992  sec_num, pdata->attribute);
5993  if(strlen(pdata->charset))
5994  {
5995  printf("[Charset: %s] ", pdata->charset);
5996  }
5997  for(size_t iii = 0; (size_t) (p_end - p_start) > iii; ++iii)
5998  {
5999  printf("%c", pdata->value_start[iii]);
6000  }
6001  printf("\n");
6002 #endif
6003  /* ... and append it to array */
6004  parray[i] = pdata;
6005  parray[++i] = NULL;
6006  }
6007  while(!finished);
6008  /* -------------------------------------------------------------- */
6009  /* Allocate new memory buffer for result */
6010  rbuf_len = (size_t) (first_end - b);
6011  /* 3 additional bytes for "; " separator and NUL termination */
6012  rbuf = (char*) api_posix_malloc(rbuf_len + (size_t) 3);
6013  if(NULL != rbuf)
6014  {
6015  /* Copy first element (including the semicolon) */
6016  strncpy(rbuf, b, rbuf_len);
6017  rbuf[rbuf_len] = 0;
6018  /* Strip SPs from first element for "Content-Type" mode */
6019  if(1 == m)
6020  {
6021  /* Assignment in truth expression is intended */
6022  while(NULL != (q = strchr(rbuf, (int) ' ')))
6023  {
6024  memmove((void*) q, (void*) (q + 1),
6025  strlen(q + 1) + (size_t) 1);
6026  if(rbuf_len) { --rbuf_len; }
6027  }
6028  }
6029  ri += rbuf_len;
6030  rbuf_len += (size_t) 3;
6031  /* Merge parameter sections */
6032  res = 0;
6033  i = 0;
6034  if(NULL == parray[i])
6035  {
6036  PRINT_ERROR("MIME: Missing parameters");
6037  }
6038  else do
6039  {
6040  /* Array contain at least 1 element */
6041  if(!parray[i]->valid) { continue; }
6042  if(parray[i]->section) { continue; }
6043  /* Found initial section => Insert separator */
6044  rbuf[ri++] = ';'; rbuf[ri++] = ' ';
6045  /* Select initial section to force first match */
6046  sec_num = 0;
6047  rewind = 0;
6048  ii = 0;
6049  para_charset = NULL;
6050  do
6051  {
6052  /* Search for next segment */
6053  if(rewind) { rewind = 0; ii = 0; }
6054  if(!parray[ii]->valid) { continue; }
6055  if(sec_num != parray[ii]->section) { continue; }
6056  else if(!strcmp(parray[i]->attribute,
6057  parray[ii]->attribute))
6058  {
6059  /* Calculate length */
6060  if(!sec_num)
6061  {
6062  /* One additional byte for NUL (and later '=') */
6063  alen = parray[ii]->attribute_len + (size_t) 1;
6064  }
6065  else { alen = 0; }
6066  len = alen;
6067  len += (size_t) (parray[ii]->value_end
6068  - parray[ii]->value_start);
6069  /* Allocate memory in exponentially increasing chunks */
6070  len += 3; /* For "; " separator and NUL termination */
6071  while(ri + len >= rbuf_len)
6072  {
6073  tmp2 = api_posix_realloc((void*) rbuf,
6074  rbuf_len *= (size_t) 2);
6075  if(NULL == tmp2)
6076  {
6077  PRINT_ERROR("MIME: Memory allocation"
6078  " for result buffer failed");
6079  error = 1;
6080  continue;
6081  }
6082  else { rbuf = tmp2; }
6083  }
6084  len -= (size_t) 3;
6085  /* Append attribute to result buffer for section 0 */
6086  if(!sec_num)
6087  {
6088  strncpy(&rbuf[ri], parray[ii]->attribute, alen);
6089  rbuf[ri + alen - (size_t) 1] = '=';
6090  ri += alen;
6091  len -= alen;
6092  }
6093  /* Only first parameter section has a charset field */
6094  if(!sec_num) { para_charset = parray[ii]->charset; }
6095  /* Append decoded value section to result buffer */
6096  tmp3 = NULL;
6097  if(NULL != para_charset)
6098  {
6099  /* Interpret zero length charset as "US-ASCII" */
6100  if((size_t) 0 == strlen(para_charset))
6101  {
6102  para_charset="US-ASCII";
6103  }
6104  /* Decode charset of value section */
6105  tmp2 = api_posix_malloc(len + (size_t) 1);
6106  if(NULL != tmp2)
6107  {
6108  strncpy(tmp2, parray[ii]->value_start, len);
6109  tmp2[len] = 0;
6110  tmp3 = enc_mime_decode_parameter(tmp2,
6111  para_charset);
6112  if(NULL != tmp3)
6113  {
6114  len2 = strlen(tmp3);
6115  if(len < len2)
6116  {
6117  PRINT_ERROR("MIME: Decoding error");
6118  api_posix_free((void*) tmp3);
6119  tmp3 = NULL;
6120  }
6121  else
6122  {
6123  strcpy(&rbuf[ri], tmp3);
6124  ri += len2;
6125  }
6126  }
6127  api_posix_free((void*) tmp2);
6128  }
6129  }
6130  if(NULL == tmp3)
6131  {
6132  strncpy(&rbuf[ri], parray[ii]->value_start, len);
6133  rbuf[ri + len] = 0;
6134  ri += len;
6135  }
6136  api_posix_free((void*) tmp3);
6137  parray[ii]->valid = 0;
6138  /* Rewind index for next section */
6139  rewind = 1;
6140  ++sec_num;
6141  }
6142  }
6143  while(!error && (NULL != parray[++ii] || rewind));
6144  parray[i]->valid = 0;
6145  }
6146  while(!error && NULL != parray[++i]);
6147  }
6148  /* Destroy parameter section array */
6149  i = 0;
6150  while(NULL != parray[i]) { api_posix_free((void*) parray[i++]); }
6151  api_posix_free((void*) parray);
6152  }
6153  }
6154  }
6155  if(error) { res = -1; }
6156 
6157  /* Destroy locale object */
6158  if((api_posix_locale_t) 0 != loc_ctype_posix)
6159  {
6160  api_posix_freelocale(loc_ctype_posix);
6161  }
6162 
6163  /* Check for error */
6164  if(0 > res) { *r = NULL; }
6165  if(!res)
6166  {
6167 #if 0
6168  /* For debugging (Attention: Terminal must use UTF-8 encoding!) */
6169  printf("Result: %s\n", rbuf);
6170  printf("---------------\n");
6171 #endif
6172  *r = rbuf;
6173  } else { api_posix_free((void*) rbuf); }
6174 
6175  return(res);
6176 }
6177 
6178 
6179 /* ========================================================================== */
6180 /*! \brief Decode MIME "Content-Type" header field
6181  *
6182  * \param[out] ct Pointer to result structure
6183  * \param[in] hf_body Header field body that contains the MIME content type
6184  * \param[out] bo Pointer to buffer for multipart boundary delimiter
6185  *
6186  * The header field body \e hf_body is decoded and content IDs are written to
6187  * the structure pointed to by \e ct .
6188  *
6189  * The buffer for the boundary string used in messages with content type
6190  * "multipart" must be allocated by the caller with a size of at least
6191  * \ref ENC_BO_BUFLEN and a pointer to the start of this buffer must be passed
6192  * as \e bo parameter. It is allowed to pass \c NULL for \e bo if the caller
6193  * is not interested in the boundary string.
6194  *
6195  * According to RFC 2045 the following rules are applied:
6196  * - If the content type is not present, "text/plain" and "US-ASCII" must be
6197  * used as default => We do so.
6198  *
6199  * According to RFC 2046 the following rules are applied:
6200  * - The content type and subtype must be treated case insensitive => We do so.
6201  * - The parameter names must be treated case insensitive => We do so.
6202  * - The default character set must be assumed as "US-ASCII" if the "charset"
6203  * parameter is missing for "text/plain" content type => We do so.
6204  *
6205  * According to RFC 3676 the following rules are applied:
6206  * - The values of parameters "Format" and "DelSp" must be treated case
6207  * insensitive => We do so.
6208  * - The parameter "DelSp" should be ignored if content type is not "text/plain"
6209  * with "format=flowed" => We do so.
6210  *
6211  * The experimental parameter "InsLine" set to "yes" adds an empty line
6212  * separator after every paragraph that ends with an empty line.
6213  * This allows declaring single lines as paragraphs, e.g. for smartphones,
6214  * without losing the separation to the following text (or creating double empty
6215  * line separation in compatibility view).
6216  *
6217  * \note
6218  * This function never fails, instead \c ENC_xxx_UNKNOWN IDs are returned.
6219  */
6220 
6221 void enc_mime_get_ct(struct enc_mime_ct* ct, const char* hf_body, char* bo)
6222 {
6223  char* body = NULL;
6224  size_t len;
6225  size_t i;
6226  size_t ii;
6227  char fmt[ENC_FMT_BUFLEN];
6228  char cs[ENC_CS_BUFLEN];
6229  size_t bo_len, bo_len_valid;
6230  int trailing_sp = 0;
6231 
6232  /* Initialize result */
6233  ct->type = ENC_CT_UNKNOWN;
6234  ct->subtype = ENC_CTS_UNKNOWN;
6235  ct->charset = ENC_CS_UNKNOWN;
6236  ct->flags = 0;
6237 
6238  /* Accept NULL pointer (treat as "field is not present") */
6239  if(NULL == hf_body)
6240  {
6241 #if 0
6242  /* For debugging */
6243  printf("Content-Type: Not specified\n");
6244 #endif
6245  ct->type = ENC_CT_TEXT;
6246  ct->subtype = ENC_CTS_PLAIN;
6247  ct->charset = ENC_CS_ASCII;
6248  return;
6249  }
6250 
6251  /* Allocate memory for case conversion */
6252  len = strlen(hf_body);
6253  body = (char*) api_posix_malloc(len + (size_t) 1);
6254  if(NULL != body)
6255  {
6256  /* Convert header field body to upper case */
6257  for(i = 0; i < len; ++i) { body[i] = (char) toupper((int) hf_body[i]); }
6258  body[len] = 0;
6259 #if 0
6260  /* For debugging */
6261  printf("Content-Type: %s\n", body);
6262 #endif
6263  /* Check for content type "text" */
6264  if(!strncmp("TEXT", body, 4))
6265  {
6266  ct->type = ENC_CT_TEXT;
6267  ct->charset = ENC_CS_ASCII;
6268  if(!strncmp("TEXT/PLAIN", body, 10))
6269  {
6270  ct->subtype = ENC_CTS_PLAIN;
6271  /* Search for RFC 3676 "Format" parameter (case insensitive) */
6272  for(i = 0; i < len; ++i)
6273  {
6274  if(!strncmp("FORMAT", &body[i], 6))
6275  {
6276  /* Extract parameter value */
6277  ii = i + (size_t) 6;
6278  while(body[ii])
6279  {
6280  if('=' != body[ii] && ' ' != body[ii]) { break; }
6281  else { ++ii; }
6282  }
6283  for(i = 0; i < ENC_FMT_BUFLEN; ++i)
6284  {
6285  if(!body[ii + i]
6286  || ';' == body[ii + i] || ' ' == body[ii + i])
6287  {
6288  fmt[i] = 0; break;
6289  }
6290  else { fmt[i] = body[ii + i]; }
6291  }
6292  fmt[ENC_FMT_BUFLEN - (size_t) 1] = 0;
6293  if(!strncmp("FLOWED", fmt, 6))
6294  {
6295  ct->flags |= ENC_CT_FLAG_FLOWED;
6296  }
6297  break;
6298  }
6299  }
6300  if(ct->flags & ENC_CT_FLAG_FLOWED)
6301  {
6302  /* Search for RFC 3676 "DelSp" parameter (case insensitive) */
6303  for(i = 0; i < len; ++i)
6304  {
6305  if(!strncmp("DELSP", &body[i], 5))
6306  {
6307  /* Extract parameter value */
6308  ii = i + (size_t) 5;
6309  while(body[ii])
6310  {
6311  if('=' != body[ii] && ' ' != body[ii]) { break; }
6312  else { ++ii; }
6313  }
6314  for(i = 0; i < ENC_FMT_BUFLEN; ++i)
6315  {
6316  if(!body[ii + i]
6317  || ';' == body[ii + i] || ' ' == body[ii + i])
6318  {
6319  fmt[i] = 0; break;
6320  }
6321  else { fmt[i] = body[ii + i]; }
6322  }
6323  fmt[ENC_FMT_BUFLEN - (size_t) 1] = 0;
6324  if(!strncmp("YES", fmt, 3))
6325  {
6326  ct->flags |= ENC_CT_FLAG_DELSP;
6327  }
6328  break;
6329  }
6330  }
6331  /* Search for "InsLine" parameter (case insensitive) */
6332  for(i = 0; i < len; ++i)
6333  {
6334  if(!strncmp("INSLINE", &body[i], 7))
6335  {
6336  /* Extract parameter value */
6337  ii = i + (size_t) 7;
6338  while(body[ii])
6339  {
6340  if('=' != body[ii] && ' ' != body[ii]) { break; }
6341  else { ++ii; }
6342  }
6343  for(i = 0; i < ENC_FMT_BUFLEN; ++i)
6344  {
6345  if(!body[ii + i]
6346  || ';' == body[ii + i] || ' ' == body[ii + i])
6347  {
6348  fmt[i] = 0; break;
6349  }
6350  else { fmt[i] = body[ii + i]; }
6351  }
6352  fmt[ENC_FMT_BUFLEN - (size_t) 1] = 0;
6353  if(!strncmp("YES", fmt, 3))
6354  {
6355  ct->flags |= ENC_CT_FLAG_INSLINE;
6356  }
6357  break;
6358  }
6359  }
6360  }
6361  }
6362  /* Search for "charset" parameter */
6363  for(i = 0; i < len; ++i)
6364  {
6365  if(!strncmp("CHARSET", &body[i], 7))
6366  {
6367  /* Extract parameter value */
6368  ii = i + (size_t) 7;
6369  while(body[ii])
6370  {
6371  if('=' != body[ii] && ' ' != body[ii]) { break; }
6372  else { ++ii; }
6373  }
6374  for(i = 0; i < ENC_CS_BUFLEN; ++i)
6375  {
6376  if(!body[ii + i]
6377  || ';' == body[ii + i] || ' ' == body[ii + i])
6378  {
6379  cs[i] = 0; break;
6380  }
6381  else { cs[i] = body[ii + i]; }
6382  }
6383  cs[ENC_CS_BUFLEN - (size_t) 1] = 0;
6384  ct->charset = enc_mime_get_charset(cs, strlen(cs));
6385  break;
6386  }
6387  }
6388  }
6389  /* Check for content type "image" */
6390  else if(!strncmp("IMAGE", body, 5))
6391  {
6392  ct->type = ENC_CT_IMAGE;
6393  }
6394  /* Check for content type "audio" */
6395  else if(!strncmp("AUDIO", body, 5))
6396  {
6397  ct->type = ENC_CT_AUDIO;
6398  }
6399  /* Check for content type "video" */
6400  else if(!strncmp("VIDEO", body, 5))
6401  {
6402  ct->type = ENC_CT_VIDEO;
6403  }
6404  /* Check for content type "message" (only subtype "rfc822" supported) */
6405  else if(!strncmp("MESSAGE/RFC822", body, 14))
6406  {
6407  ct->type = ENC_CT_MESSAGE;
6408  ct->subtype = ENC_CTS_RFC822;
6409  }
6410  /* Check for content type "multipart", map unknown subtypes to "mixed" */
6411  else if(!strncmp("MULTIPART", body, 9))
6412  {
6413  ct->type = ENC_CT_MULTIPART;
6414  ct->subtype = ENC_CTS_MIXED;
6415  if(!strncmp("MULTIPART/ALTERNATIVE", body, 21))
6416  {
6418  }
6419  else if(!strncmp("MULTIPART/DIGEST", body, 16))
6420  {
6421  ct->subtype = ENC_CTS_DIGEST;
6422  }
6423  bo[0] = 0;
6424  /* Search for "boundary" parameter */
6425  for(i = 0; i < len; ++i)
6426  {
6427  if(!strncmp("BOUNDARY", &body[i], 8))
6428  {
6429  /* Extract case sensitive parameter value */
6430  ii = i + (size_t) 8;
6431  if('=' != hf_body[ii++])
6432  {
6433  PRINT_ERROR("MIME: "
6434  "Missing multipart boundary parameter value");
6435  }
6436  else
6437  {
6438  /* Start of boundary parameter value found */
6439  for(i = 0; i < ENC_BO_BUFLEN; ++i)
6440  {
6441  if(!hf_body[ii + i] || ';' == hf_body[ii + i])
6442  {
6443  bo[i] = 0; break;
6444  }
6445  else { bo[i] = hf_body[ii + i]; }
6446  }
6447  bo[ENC_BO_BUFLEN - (size_t) 1] = 0;
6448  /* Check boundary */
6449  bo_len = strlen(bo);
6450  bo_len_valid = strspn(bo,
6451  "0123456789"
6452  "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
6453  "'()+_,-./:=?"
6454  " ");
6455  if (bo_len_valid != bo_len)
6456  {
6457  PRINT_ERROR("MIME: Invalid multipart boundary parameter");
6458  if(bo_len_valid && ' ' != bo[bo_len_valid - (size_t) 1])
6459  {
6460  /* Strip invalid tail */
6461  bo[bo_len_valid] = 0;
6462  bo_len = bo_len_valid;
6463  }
6464  else
6465  {
6466  /* Boundary not usable, use "?" as dummy replacement */
6467  bo[0] = '?';
6468  bo_len = 1;
6469  bo[bo_len] = 0;
6470  }
6471  }
6472  while(bo_len)
6473  {
6474  if(' ' != bo[bo_len - (size_t) 1]) { break; }
6475  /* Strip trailing SP */
6476  trailing_sp = 1;
6477  bo[--bo_len] = 0;
6478  }
6479  if(trailing_sp)
6480  {
6481  PRINT_ERROR("MIME: Stripped trailing whitespace "
6482  "from multipart boundary parameter");
6483  }
6484  }
6485  break;
6486  }
6487  }
6488  }
6489  }
6490  api_posix_free((void*) body);
6491 }
6492 
6493 
6494 /* ========================================================================== */
6495 /*! \brief Decode content transfer encoding description
6496  *
6497  * \param[in] hf_body MIME content transfer encoding description string
6498  *
6499  * This function checks whether the string \e hf_body represents a supported
6500  * content transfer encoding and return the corresponding ID for it.
6501  * According to RFC 2047 the content transfer encoding is treated
6502  * case-insensitive.
6503  *
6504  * \note
6505  * It is allowed to call this function with \e hf_body set to \c NULL. This is
6506  * treated as an error and the return value will indicate an unknown transfer
6507  * encoding.
6508  *
6509  * \note
6510  * RFC 2049 requires that every non-7bit MIME content must be labeled with a
6511  * content transfer encoding header field of "8bit" or "binary".
6512  *
6513  * \result
6514  * - MIME content transfer encoding ID (from \ref enc_mime_cte )
6515  * - \c ENC_CTE_UNKNOWN on error
6516  */
6517 /* If this header field is missing, we assume "binary" instead of "7bit".
6518  * Don't change this because it is required for handling unknown transfer
6519  * encodings!
6520  */
6521 
6522 enum enc_mime_cte enc_mime_get_cte(const char* hf_body)
6523 {
6524  enum enc_mime_cte res = ENC_CTE_BIN;
6525  char buf[ENC_CTE_BUFLEN];
6526  size_t len;
6527  size_t i;
6528  const char not_supported[]
6529  = "ENC: MIME: Unsupported content transfer encoding: ";
6530  char* p;
6531  size_t l;
6532 
6533  /* Accept NULL pointer */
6534  if(NULL != hf_body)
6535  {
6536  res = ENC_CTE_UNKNOWN;
6537  len = strlen(hf_body);
6538  if(ENC_CTE_BUFLEN <= len)
6539  {
6540  /* If you get this error, the value of 'ENC_CTE_BUFLEN' is too small */
6541  PRINT_ERROR("MIME: Name of content transfer encoding too long");
6542  }
6543  else
6544  {
6545  /* Convert description to upper case */
6546  for(i = 0; i < len; ++i)
6547  {
6548  buf[i] = (char) toupper((int) hf_body[i]);
6549  }
6550  buf[len] = 0;
6551  /* Check for all known content transfer encodings */
6552  if(!strcmp(buf, "7BIT")) { res = ENC_CTE_7BIT; }
6553  if(!strcmp(buf, "8BIT")) { res = ENC_CTE_8BIT; }
6554  if(!strcmp(buf, "BINARY")) { res = ENC_CTE_BIN; }
6555  if(!strcmp(buf, "QUOTED-PRINTABLE")) { res = ENC_CTE_Q; }
6556  if(!strcmp(buf, "BASE64")) { res = ENC_CTE_B; }
6557  /* To be more tolerant: Check again for invalid identity declaration */
6558  if(!strcmp(buf, "7-BIT"))
6559  {
6560  PRINT_ERROR("MIME: "
6561  "Invalid content transfer encoding 7-bit accepted as 7bit");
6562  res = ENC_CTE_7BIT;
6563  }
6564  if(!strcmp(buf, "8-BIT"))
6565  {
6566  PRINT_ERROR("MIME: "
6567  "Invalid content transfer encoding 8-bit accepted as 8bit");
6568  res = ENC_CTE_8BIT;
6569  }
6570  /* Check whether content transfer encoding is supported */
6571  if(ENC_CTE_UNKNOWN == res)
6572  {
6573  l = strlen(not_supported) + len;
6574  p = (char*) api_posix_malloc(++l);
6575  if(NULL != p)
6576  {
6577  strcpy(p, not_supported);
6578  strncat(p, buf, len);
6579  print_error(p);
6580  api_posix_free((void*) p);
6581  }
6582  }
6583  }
6584  }
6585 
6586  return(res);
6587 }
6588 
6589 
6590 /* ========================================================================== */
6591 /*! \brief Decode content disposition
6592  *
6593  * \param[in] hf_body Body of Content-Disposition header field
6594  * \param[out] type Pointer to content disposition type ID
6595  * \param[out] filename Pointer to filename
6596  *
6597  * The field body \e hf_body must be unfolded and preprocessed (parameters must
6598  * ne already decoded according to RFC 2231).
6599  * The value for the filename parameter must be already converted to UTF-8.
6600  *
6601  * If a filename parameter is present, a new memory block is allocated for
6602  * \e filename . Otherwise \c NULL is returned.
6603  */
6604 
6605 void enc_mime_get_cd(const char* hf_body,
6606  enum enc_mime_cd* type, const char** filename)
6607 {
6608  api_posix_locale_t loc_ctype_posix;
6609  char* body = NULL;
6610  const char* fn_para = "FILENAME=";
6611  const char* p;
6612  const char* q;
6613  size_t len;
6614  char* buf;
6615  size_t i;
6616 
6617  /* Prepare values to return if an error occurs */
6618  *type = ENC_CD_UNKNOWN;
6619  *filename = NULL;
6620 
6621  /* Extract disposition type (case-insensitive) */
6622  loc_ctype_posix = api_posix_newlocale(API_POSIX_LC_CTYPE_MASK, "POSIX",
6623  (api_posix_locale_t) 0);
6624  if((api_posix_locale_t) 0 == loc_ctype_posix)
6625  {
6626  PRINT_ERROR("MIME: Cannot create locale object");
6627  return;
6628  }
6629  else
6630  {
6631  if(!api_posix_strncasecmp_l(hf_body, "inline", strlen("inline"),
6632  loc_ctype_posix))
6633  {
6634  *type = ENC_CD_INLINE;
6635  }
6636  else if(!api_posix_strncasecmp_l(hf_body, "attachment",
6637  strlen("attachment"),
6638  loc_ctype_posix))
6639  {
6640  *type = ENC_CD_ATTACHMENT;
6641  }
6642  api_posix_freelocale(loc_ctype_posix);
6643  }
6644 
6645  /* Extract filename */
6646  len = strlen(hf_body);
6647  body = (char*) api_posix_malloc(len + (size_t) 1);
6648  if(NULL != body)
6649  {
6650  /* Convert header field body to upper case */
6651  for(i = 0; i < len; ++i) { body[i] = (char) toupper((int) hf_body[i]); }
6652  body[len] = 0;
6653  /* Check for parameter "filename" */
6654  p = strstr(body, fn_para);
6655  if(NULL != p)
6656  {
6657  p += strlen(fn_para);
6658  q = strchr(p, ';');
6659  if(NULL != q) { len = (size_t) (q - p); }
6660  else { len = strlen(p); }
6661  /* Copy filename case-sensitive */
6662  buf = (char*) malloc(len + (size_t) 1);
6663  if(NULL != buf)
6664  {
6665  i = (size_t) (p - body);
6666  strncpy(buf, &hf_body[i], len);
6667  buf[len] = 0;
6668  /* Strip path, if present */
6669  p = strrchr(buf, '/');
6670  if(NULL != p) { ++p; memmove(buf, p, strlen(p) + (size_t) 1); }
6671  /*
6672  * Reject filename if it contains '~', '|' or '\' characters.
6673  * See RFC 2183 Section 5 "Security Considerations" for details
6674  */
6675  p = strpbrk(buf, "~|\x5C");
6676  if(NULL != p)
6677  {
6678  PRINT_ERROR("MIME: "
6679  "Filename in Content-Disposition rejected");
6680  }
6681  else { *filename = buf; }
6682  }
6683  }
6684  }
6685  api_posix_free((void*) body);
6686 }
6687 
6688 
6689 /* ========================================================================== */
6690 /*! \brief Decode MIME content transfer encoding and save to file
6691  *
6692  * \param[in] pn Pathname of file
6693  * \param[in] cte MIME content transfer encoding
6694  * \param[in] entity MIME entity body
6695  *
6696  * According to RFC 2049 all transfer encodings not defined in MIME 1.0 are
6697  * rejected.
6698  *
6699  * \return
6700  * - 0 on success
6701  * - -1 on error
6702  */
6703 
6704 int enc_mime_save_to_file(const char* pn, enum enc_mime_cte cte,
6705  const char* entity)
6706 {
6707  int res = -1;
6708  size_t len = strlen(entity);
6709  const char* p = entity;
6710  const char* buf = NULL;
6711  int fd;
6712  int rv;
6713  api_posix_mode_t perm = API_POSIX_S_IRUSR | API_POSIX_S_IWUSR |
6714  API_POSIX_S_IRGRP | API_POSIX_S_IWGRP |
6715  API_POSIX_S_IROTH | API_POSIX_S_IWOTH;
6716 
6717  /* Decode transfer encoding */
6718  switch(cte)
6719  {
6720  case ENC_CTE_Q:
6721  {
6722  buf = enc_mime_decode_qp(entity, &entity[len], 0, &len);
6723  p = buf;
6724  break;
6725  }
6726  case ENC_CTE_B:
6727  {
6728  buf = enc_mime_decode_base64(entity, &entity[len], &len);
6729  p = buf;
6730  break;
6731  }
6732  case ENC_CTE_7BIT:
6733  case ENC_CTE_8BIT:
6734  case ENC_CTE_BIN:
6735  {
6736  break;
6737  }
6738  default:
6739  {
6740  PRINT_ERROR("MIME: Content transfer encoding not supported");
6741  break;
6742  }
6743  }
6744 
6745  /* Save to file */
6746  if(NULL != p)
6747  {
6748  rv = fu_open_file(pn, &fd,
6749  API_POSIX_O_WRONLY | API_POSIX_O_CREAT
6750  | API_POSIX_O_TRUNC, perm);
6751  if(rv)
6752  {
6753  PRINT_ERROR("MIME: Opening file failed");
6754  }
6755  else
6756  {
6757  rv = fu_write_to_filedesc(fd, p, len);
6758  if(rv)
6759  {
6760  PRINT_ERROR("MIME: Writing to file failed");
6761  }
6762  else { res = 0; }
6763  fu_close_file(&fd, NULL);
6764  }
6765  }
6766 
6767  enc_free((void*) buf);
6768 
6769  return(res);
6770 }
6771 
6772 
6773 /* ========================================================================== */
6774 /*! \brief Decode MIME text content to UTF-8 NFC
6775  *
6776  * \param[in] cte MIME content transfer encoding
6777  * \param[in] charset MIME character set
6778  * \param[in] s MIME encoded data
6779  *
6780  * According to RFC 2049 all transfer encodings not defined in MIME 1.0 are
6781  * rejected.
6782  *
6783  * \return
6784  * - Pointer to decoded data.
6785  * If the result is not equal to \e s , a new memory block was allocated
6786  * - NULL on error (Original memory block for \e s is still allocated)
6787  */
6788 
6789 const char* enc_mime_decode(enum enc_mime_cte cte, enum enc_mime_cs charset,
6790  const char* s)
6791 {
6792  const char* res = NULL;
6793  size_t len = strlen(s);
6794 
6795  /* Decode transfer encoding and convert charset to Unicode */
6796  switch(cte)
6797  {
6798  case ENC_CTE_Q:
6799  {
6800  res = enc_mime_decode_q(charset, s, &s[len], 0);
6801  break;
6802  }
6803  case ENC_CTE_B:
6804  {
6805  res = enc_mime_decode_b(charset, s, &s[len]);
6806  break;
6807  }
6808  case ENC_CTE_7BIT:
6809  case ENC_CTE_8BIT:
6810  case ENC_CTE_BIN:
6811  {
6812  res = enc_convert_to_utf8_nfc(charset, s);
6813  break;
6814  }
6815  default:
6816  {
6817  PRINT_ERROR("MIME: Content transfer encoding not supported");
6818  break;
6819  }
6820  }
6821 
6822  return(res);
6823 }
6824 
6825 
6826 /* ========================================================================== */
6827 /*! \brief Decode MIME "text/plain" content with "format=flowed" parameter
6828  *
6829  * \param[in] s MIME encoded data in canonical form
6830  * \param[in] delsp Delete spaces at EOL if nonzero
6831  * \param[in] insline Add empty line separator after paragraphs if nonzero
6832  *
6833  * \attention
6834  * The encoding of the data referenced by \e s must be valid Unicode in UTF-8
6835  * representation. This must be checked by the caller before this function is
6836  * used.
6837  *
6838  * \return
6839  * - Pointer to decoded data
6840  * (if the result is not equal to \e s , a new memory block was allocated)
6841  * - NULL on error (Original memory block for \e s is still allocated)
6842  */
6843 
6844 const char* enc_mime_flowed_decode(const char* s, unsigned int delsp,
6845  unsigned int insline)
6846 {
6847  const char* quote_mark;
6848  int error = 0;
6849  int abort;
6850  int check;
6851  char* p;
6852  size_t ii;
6853  /* Index in input buffer */
6854  size_t i = 0;
6855  /* Target buffer */
6856  char* buf = NULL;
6857  size_t len = 0;
6858  size_t bi = 0;
6859  int insert_crlf = 0;
6860  /* Paragraph buffer */
6861  char* para = NULL;
6862  size_t plen = 0;
6863  size_t pi = 0;
6864  int pflowed;
6865  int pell; /* Empty last line */
6866  /* Line buffer */
6867  size_t start;
6868  size_t end;
6869  size_t llen = 0;
6870  size_t llimit;
6871  int flowed;
6872  /* Quote depth */
6873  int qdepth;
6874  size_t qd;
6875  /* Index after last space (or SHY) suitable for line break */
6876  size_t last_space;
6877  size_t ustring_len;
6878 
6879  /* Set quote mark style according to config file */
6880  switch(config[CONF_QUOTESTYLE].val.i)
6881  {
6882  case 0: { quote_mark = ">"; break; }
6883  case 1: { quote_mark = "> "; break; }
6884  default:
6885  {
6886  PRINT_ERROR("Quoting style configuration not supported");
6887  /* Use default from old versions that can't be configured */
6888  quote_mark = "> ";
6889  break;
6890  }
6891  }
6892  /* Process data */
6893  while(s[i])
6894  {
6895  /* Process next paragraph */
6896  pi = 0;
6897  pflowed = 0;
6898  pell = 0;
6899  qdepth = -1;
6900  do
6901  {
6902  /* Process next line */
6903  flowed = 0;
6904  /* Calculate quoting depth */
6905  qd = 0;
6906  while('>' == s[i])
6907  {
6908  if(API_POSIX_INT_MAX <= qd) { break; }
6909  ++qd;
6910  ++i;
6911  }
6912  if(-1 == qdepth) { qdepth = (int) qd; }
6913  else
6914  {
6915  if((int) qd != qdepth)
6916  {
6917  PRINT_ERROR("MIME: Invalid paragraph format"
6918  " (format=flowed)");
6919  i -= qd;
6920  break;
6921  }
6922  }
6923  /* Remove space stuffing */
6924  if(' ' == s[i]) { ++i; }
6925  start = end = i;
6926  /* Search for EOL */
6927  while(s[i])
6928  {
6929  if(i && 0x0A == (int) s[i])
6930  {
6931  if(0x0D != (int) s[i - (size_t) 1])
6932  {
6933  /* Canonical line termination must be CR+LF */
6934  PRINT_ERROR("MIME: Invalid line termination"
6935  " (format=flowed)");
6936  end = i;
6937  }
6938  else { end = i - (size_t) 1; }
6939  ++i;
6940  break;
6941  }
6942  /* Special handling for last line without CR+LF */
6943  if(!s[++i]) { end = i; }
6944  }
6945  llen = end - start;
6946  /* Check for flowed line */
6947  if(llen && ' ' == s[end - (size_t) 1])
6948  {
6949  /* Check for signature separator */
6950  if(!((size_t) 3 == llen
6951  && '-' == s[start] && '-' == s[start + (size_t) 1]))
6952  {
6953  flowed = 1;
6954  pflowed = 1;
6955  if(delsp) { --llen; --end; }
6956  }
6957  }
6958  /* Allocate memory in exponentially increasing chunks */
6959  while(pi + llen + (size_t) 1 >= plen) /* At least 1 additional byte */
6960  {
6961  if(!plen) { plen = 128; }
6962  p = (char*) api_posix_realloc((void*) para, plen *= (size_t) 2);
6963  if(NULL == p)
6964  {
6965  PRINT_ERROR("Memory allocation failed");
6966  error = 1;
6967  break;
6968  }
6969  else { para = p; }
6970  }
6971  if(error) { break; }
6972  /* Copy line to paragraph buffer */
6973  strncpy(&para[pi], &s[start], llen);
6974  pi += llen;
6975  }
6976  while(flowed);
6977  if(error) { break; }
6978  para[pi] = 0;
6979  /* Set flag if paragraph ends with empty line */
6980  if(pflowed && !llen) { pell = 1; };
6981  /* Copy fixed line or flowed paragraph to target buffer */
6982  pi = 0;
6983  do
6984  {
6985  llen = (size_t) qdepth * strlen(quote_mark);
6986  if(!pflowed)
6987  {
6988  start = 0;
6989  end = strlen(para);
6990  llen += end;
6991  }
6992  else
6993  {
6994  /* Rewrap flowed lines before 72 characters if possible */
6995  start = pi;
6996  last_space = 0;
6997  abort = 0;
6998  while(!abort)
6999  {
7000  check = 0;
7001  if(!para[pi]) { abort = 1; }
7002  else
7003  {
7004  /* Check for SP */
7005  if(' ' == para[pi]) { check = 1; }
7006  /* Check for SHY (in UTF-8 encoding) */
7007  else if(pi
7008  && 0xADU == (unsigned int) (unsigned char) para[pi]
7009  && 0xC2U == (unsigned int)
7010  (unsigned char) para[pi - (size_t) 1])
7011  {
7012  check = 1;
7013  }
7014  /* Check for ZWSP (in UTF-8 encoding) */
7015  else if(1 < pi
7016  && 0x8BU == (unsigned int) (unsigned char) para[pi]
7017  && 0x80U == (unsigned int)
7018  (unsigned char) para[pi - (size_t) 1]
7019  && 0xE2U == (unsigned int)
7020  (unsigned char) para[pi - (size_t) 2])
7021  {
7022  check = 1;
7023  }
7024  ++pi;
7025  }
7026  if(abort || check)
7027  {
7028  /* Allow max. 78 characters for quoted content */
7029  llimit = (size_t) 72;
7030  if(1 == qdepth) { llimit = (size_t) 74; }
7031  else if(2 == qdepth) { llimit = (size_t) 76; }
7032  else if(3 <= qdepth) { llimit = (size_t) 78; }
7033  /* Use 20 characters as minimum content width */
7034  if(llimit - (size_t) 20
7035  <= (size_t) qdepth * strlen(quote_mark))
7036  {
7037  llimit = (size_t) 20;
7038  }
7039  else
7040  {
7041  llimit -= (size_t) qdepth * strlen(quote_mark);
7042  }
7043  /* Check for line length limit */
7044  ustring_len = pi - start;
7045  if(ustring_len)
7046  {
7047  /* Do not count trailing SP */
7048  if(pi && ' ' == para[pi - (size_t) 1]) { --ustring_len; }
7049  }
7050  if(llimit < enc_uc_get_glyph_count(&para[start], ustring_len))
7051  {
7052  /* Check for second last line */
7053  if(last_space) { pi = last_space; }
7054  /* Check for last line */
7055  else if(abort)
7056  {
7057  pflowed = 0;
7058  if(pell) { insert_crlf = 1; }
7059  }
7060  break;
7061  }
7062  /* Check for end of paragraph */
7063  if(abort)
7064  {
7065  pflowed = 0;
7066  if(pell) { insert_crlf = 1; }
7067  }
7068  else { last_space = pi; }
7069  }
7070  }
7071  /* Skip trailing SP */
7072  if(start < pi && ' ' == para[pi - (size_t) 1])
7073  {
7074  end = pi - (size_t) 1;
7075  }
7076  else { end = pi; }
7077  llen += end - start;
7078  }
7079  /* Two additional characters for CR+LF line termination */
7080  llen += (size_t) 2;
7081  /* InsLine parameter has precedence over configfile entry */
7082  if(!insline)
7083  {
7084  /* Reset request for empty line separator if not configured */
7085  if(!config[CONF_FLOWED_CRLF].val.i) { insert_crlf = 0; }
7086  }
7087  /* Two additional characters for optional empty line after paragraph */
7088  if(insert_crlf) { llen += (size_t) 2; }
7089  /* Allocate memory in exponentially increasing chunks */
7090  while(bi + llen + (size_t) 1 >= len) /* At least 1 additional byte */
7091  {
7092  if(!len) { len = 256; }
7093  p = (char*) api_posix_realloc((void*) buf, len *= (size_t) 2);
7094  if(NULL == p)
7095  {
7096  PRINT_ERROR("Memory allocation failed");
7097  error = 1;
7098  break;
7099  }
7100  else { buf = p; }
7101  }
7102  if(error) { break; }
7103  /* Copy quote marks */
7104  for(ii = 0; ii < (size_t) qdepth; ++ii)
7105  {
7106  strncpy(&buf[bi], quote_mark, strlen(quote_mark));
7107  bi += strlen(quote_mark);
7108  }
7109  /* Copy line */
7110  strncpy(&buf[bi], &para[start], end - start);
7111  bi += end - start;
7112  /* Copy line termination */
7113  buf[bi++] = (char) 0x0D; buf[bi++] = (char) 0x0A;
7114  }
7115  while(pflowed);
7116  /* Insert optional empty line separator after paragraph */
7117  if(insert_crlf)
7118  {
7119  buf[bi++] = (char) 0x0D; buf[bi++] = (char) 0x0A;
7120  insert_crlf = 0;
7121  }
7122  if(error) { break; }
7123  }
7124  api_posix_free((void*) para);
7125  if(error)
7126  {
7127  PRINT_ERROR("MIME: Decoding of format=flowed content failed");
7128  api_posix_free((void*) buf);
7129  buf = NULL;
7130  }
7131  else if(NULL != buf)
7132  {
7133  /* Terminate string in target buffer */
7134  buf[bi] = 0;
7135  }
7136 
7137  return(buf);
7138 }
7139 
7140 
7141 /* ========================================================================== */
7142 /*! \brief Extract MIME encapsulated message
7143  *
7144  * \param[in] s MIME encapsulated message
7145  * \param[in] len Length of encapsulated message
7146  * \param[out] mpe MIME multipart entity locations
7147  *
7148  * On success a pointer to the result array is written to \e mpe . The caller
7149  * is responsible to free the memory allocated for this array.
7150  *
7151  * \return
7152  * - 1 on success
7153  * - 0 on error
7154  */
7155 
7156 size_t enc_mime_message(const char* s, size_t len,
7157  struct enc_mime_mpe** mpe)
7158 {
7159  size_t res = 0;
7160  struct enc_mime_mpe* array;
7161 
7162  /* Allocate memory for array element */
7163  array = (struct enc_mime_mpe*) api_posix_malloc(sizeof(struct enc_mime_mpe));
7164  if(NULL == array)
7165  {
7166  PRINT_ERROR("Parsing encapsulated message aborted");
7167  }
7168  else
7169  {
7170  /* Store start index and length of entity */
7171  array[res].start = s;
7172  array[res++].len = len;
7173  }
7174 
7175  /* Check for success */
7176  *mpe = NULL;
7177  if(res) { *mpe = array; }
7178 
7179  return(res);
7180 }
7181 
7182 
7183 /* ========================================================================== */
7184 /*! \brief Parse MIME multipart content
7185  *
7186  * \param[in] s MIME encoded multipart data
7187  * \param[in] b MIME boundary delimiter
7188  * \param[out] mpe MIME multipart entity locations
7189  *
7190  * On success a pointer to the result array is written to \e mpe . The caller
7191  * is responsible to free the memory allocated for this array.
7192  *
7193  * \return
7194  * - Nonzero number of entities in multipart data on success
7195  * - 0 on error
7196  */
7197 
7198 size_t enc_mime_multipart(const char* s, const char* b,
7199  struct enc_mime_mpe** mpe)
7200 {
7201  size_t res = 0;
7202  size_t b_len;
7203  char boundary[ENC_BO_BUFLEN] = "--";
7204  size_t i = 0;
7205  int preamble = 1;
7206  size_t match;
7207  size_t start = 0;
7208  size_t end = 0;
7209  size_t e_len;
7210  struct enc_mime_mpe* array = NULL;
7211  struct enc_mime_mpe* tmp;
7212 
7213  b_len = strlen(b);
7214  /* RFC 2046 limits the boundary delimiter length to 70 characters */
7215  if(!b_len || (size_t) 70 < b_len)
7216  {
7217  PRINT_ERROR("Invalid MIME multipart boundary delimiter");
7218  }
7219  else if ((size_t) 75 > ENC_BO_BUFLEN)
7220  {
7221  PRINT_ERROR("Value of ENC_BO_BUFLEN must be at least 75");
7222  }
7223  else
7224  {
7225  /* Add "--" prefix to boundary */
7226  strncpy(&boundary[2], b, 71);
7227  b_len += (size_t) 2;
7228  /* Parse content */
7229  while(s[i])
7230  {
7231  /*
7232  * Store potential end of entity
7233  * RFC 2046 specifies that last CRLF of an entity is part of the
7234  * following boundary delimiter.
7235  */
7236  if((size_t) 2 <= i) { end = i - (size_t) 2; }
7237  /* Compare boundary with beginning of line */
7238  match = 0;
7239  if(!strncmp(&s[i], boundary, b_len)) { match = 1; }
7240  /* Skip to beginning of next line (this also consumes potential LWS) */
7241  while(1)
7242  {
7243  if(!s[i]) { break; }
7244  else if((char) 0x0D == s[i++])
7245  {
7246  if((char) 0x0A == s[i++]) { break; }
7247  }
7248  }
7249  /* Check for start of entity */
7250  if(match)
7251  {
7252  /* Ignore preamble */
7253  if(!preamble && end > start)
7254  {
7255  /* Allocate memory for new array element */
7256  e_len = end - start;
7257  tmp = (struct enc_mime_mpe*)
7258  api_posix_realloc(array, (res + (size_t) 1)
7259  * sizeof(struct enc_mime_mpe));
7260  if(NULL == tmp)
7261  {
7262  PRINT_ERROR("Parsing multipart message aborted");
7263  break;
7264  }
7265  else
7266  {
7267  array = tmp;
7268  /* Store start index and length of entity */
7269  array[res].start = &s[start];
7270  array[res++].len = e_len;
7271  }
7272  }
7273  /* Prepare for next entity */
7274  start = i;
7275  preamble = 0;
7276  }
7277  }
7278  }
7279 
7280  /* Check for success */
7281  *mpe = NULL;
7282  if(res) { *mpe = array; }
7283 
7284  return(res);
7285 }
7286 
7287 
7288 /* ========================================================================== */
7289 /*! \brief Percent decoder
7290  *
7291  * \param[in] s String to decode (URI or MIME parameter value)
7292  * \param[in] clean Replace NUL and ';' with '_' if nonzero
7293  *
7294  * \note
7295  * The data is decoded in place because it can't be larger after the decoding
7296  * operation.
7297  *
7298  * If \e s is \c NULL no operation is performed and success is returned.
7299  *
7300  * \return
7301  * - Positive value on success (if data in \e s was decoded)
7302  * - 0 on success (if there was nothing to do)
7303  * - Negative value if percent encoding in \e s is invalid
7304  */
7305 
int enc_percent_decode(char* s, int clean)
{
   int res = 0;
   char* pos = s;
   int hi;
   int lo;
   unsigned char c;

   /* A NULL pointer for 's' skips the loop */
   while(NULL != pos)
   {
      /* Search for next percent sign */
      pos = strchr(pos, (int) '%');
      if(NULL == pos) { break; }

      /* Percent sign found => Two hexadecimal digits must follow */
      res = 1;
      if(!pos[1] || !pos[2]) { res = -1;  break; }
      hi = enc_hex_decode_nibble(pos[1]);
      if(0 > hi) { res = -1;  break; }
      lo = enc_hex_decode_nibble(pos[2]);
      if(0 > lo) { res = -1;  break; }

      /* Replace percent sign with decoded character */
      c = (unsigned char) (hi * 16);
      c += (unsigned char) lo;
      pos[0] = (char) c;
      if(clean && (!pos[0] || ';' == pos[0]))
      {
         /* Replace NUL and ';' with '_' */
         pos[0] = '_';
      }

      /* Close the gap (including NUL termination), continue behind it */
      memmove((void*) &pos[1], (void*) &pos[3],
              strlen(&pos[3]) + (size_t) 1);
      ++pos;
   }
   if(-1 == res) { PRINT_ERROR("Percent decoding of URI failed"); }

   return(res);
}
7355 
7356 
7357 /* ========================================================================== */
7358 /*! \brief Percent encoding for URI content
7359  *
7360  * \param[in] s URI body to encode
7361  * \param[in] sch URI scheme
7362  *
7363  * Passing \c NULL for parameter \e s is allowed and treated as error.
7364  *
7365  * Generic URI syntax is defined in RFC 3986.
7366  * <br>
7367  * The scheme "ftp" is defined in RFC 1738.
7368  * <br>
7369  * The scheme "http" is defined in RFC 7230.
7370  * <br>
7371  * The scheme "mailto" is defined in RFC 6068.
7372  * <br>
7373  * The scheme "news" is defined in RFC 5538.
7374  *
7375  * The following characters are percent encoded:
7376  * - Space (not allowed for "mailto" and "news" schemes)
7377  * - The literal percent sign
7378  * - The list "gen-delims" defined in RFC 3986
7379  * - Anything not in the list "unreserved" for "http" and "ftp" schemes
7380  * - For the "mailto" scheme exactly one "commercial at" sign is required and
7381  * treated literally
7382  * - For the "news" scheme a single "commercial at" sign is accepted literally
7383  *
7384  * \return
7385  * - Pointer to result on success.
7386  * If the result is not equal to \e s , a new memory block was allocated
7387  * - NULL on error
7388  */
7389 
7390 const char* enc_uri_percent_encode(const char* s, enum enc_uri_scheme sch)
7391 {
7392  const char* res = NULL;
7393  const char* gen_delims =
7394  ":/?#[]@"; /* gen-delims */
7395  const char* sub_delims =
7396  "!$&'()*+,;="; /* sub-delims */
7397  const char* unreserved =
7398  "abcdefghijklmnopqrstuvwxyz" /* Small letters */
7399  "ABCDEFGHIJKLMNOPQRSTUVWXYZ" /* Capital letters */
7400  "0123456789" /* Digits */
7401  "-._~"; /* Hyphen, Period, Underscore and Tilde */
7402  int process = 0;
7403  size_t i = 0;
7404  char* buf = NULL;
7405  size_t bi = 0;
7406  int error = 0;
7407  int encode;
7408  size_t commercial_at = 0;
7409  unsigned int nibble;
7410 
7411  if(NULL != s)
7412  {
7413  /* Check whether only unreserved characters are present */
7414  while(s[i])
7415  {
7416  if(NULL == strchr(unreserved, (int) s[i])) { process = 1; break; }
7417  ++i;
7418  }
7419  if(!process) { res = s; }
7420  else
7421  {
7422  /* Allocate new buffer (Triple size is always sufficient) */
7423  buf = (char*) api_posix_malloc(strlen(s) * (size_t) 3 + (size_t) 1);
7424  if(NULL != buf)
7425  {
7426  i = 0;
7427  while(s[i])
7428  {
7429  encode = 0;
7430  switch(sch)
7431  {
7432  case ENC_URI_SCHEME_HTTP:
7433  case ENC_URI_SCHEME_FTP:
7434  {
7435  /*
7436  * Because we don't parse the URI syntax, it is not
7437  * possible to decide whether a slash is allowed or not
7438  * here => Always accept it.
7439  */
7440  if('/' == s[i]) { encode = 0; }
7441  else if(NULL == strchr(unreserved, (int) s[i]))
7442  {
7443  encode = 1;
7444  }
7445  break;
7446  }
7447  case ENC_URI_SCHEME_MAILTO:
7448  case ENC_URI_SCHEME_NEWS:
7449  {
7450  if(ENC_URI_SCHEME_NEWS == sch && '>' == s[i])
7451  {
7452  /* As defined by RFC 5536 Section 3.1.3 */
7453  error = 1;
7454  }
7455  if(' ' == s[i]) { error = 1; }
7456  else if('%' == s[i]) { encode = 1; }
7457  else if('@' == s[i])
7458  {
7459  if(!commercial_at)
7460  {
7461  /* Accept zero or one "commercial at" signs */
7462  ++commercial_at;
7463  }
7464  else { error = 1; }
7465  }
7466  else if(NULL != strchr(gen_delims, (int) s[i]))
7467  {
7468  encode = 1;
7469  }
7470  else if(ENC_URI_SCHEME_MAILTO == sch
7471  && NULL != strchr(sub_delims, (int) s[i]))
7472  {
7473  /* Some listed in RFC 6068 Section 2 => Encode all */
7474  encode = 1;
7475  }
7476  break;
7477  }
7478  default:
7479  {
7480  PRINT_ERROR("Invalid URI scheme for percent encoding");
7481  error = 1;
7482  break;
7483  }
7484  }
7485  if(error) { break; }
7486  if(!encode) { buf[bi++] = s[i]; }
7487  else
7488  {
7489  /* Percent encoder */
7490  buf[bi++] = '%';
7491  /* High nibble */
7492  nibble = ((unsigned int) s[i] & 0xF0U) >> 4;
7493  if(10U > nibble) { buf[bi] = 0x30; }
7494  else { buf[bi] = 0x41; nibble -= 10U; }
7495  buf[bi++] += (char) nibble;
7496  /* Low nibble */
7497  nibble = (unsigned int) s[i] & 0x0FU;
7498  if(10 > nibble) { buf[bi] = 0x30; }
7499  else { buf[bi] = 0x41; nibble -= 10U; }
7500  buf[bi++] += (char) nibble;
7501  }
7502  ++i;
7503  }
7504  /* Terminate result string */
7505  buf[bi] = 0;
7506  /* Check for error */
7507  if(!error) { res = buf; }
7508  }
7509  }
7510  }
7511  if(NULL != res)
7512  {
7513  /* Ensure that one "@" is present in URI with scheme "mailto" */
7514  if(ENC_URI_SCHEME_MAILTO == sch && !commercial_at)
7515  {
7516  PRINT_ERROR("Missing \"@\" in URI with scheme \"mailto\"");
7517  error = 1;
7518  }
7519  }
7520  if(error)
7521  {
7522  PRINT_ERROR("Percent encoding of URI failed");
7523  api_posix_free((void*) buf);
7524  res = NULL;
7525  }
7526 
7527  return(res);
7528 }
7529 
7530 
7531 /* ========================================================================== */
7532 /* Unicode search (case insensitive)
7533  *
7534  * This function uses the Unicode Default Case Folding algorithm.
7535  * This means it is based on the full case folding operations without the
7536  * context-dependent mappings sensitive to the casing context.
7537  *
7538  * https://www.unicode.org/versions/Unicode13.0.0/UnicodeStandard-13.0.pdf
7539  *
7540  * According to Unicode 13.0.0 Section 3.13 the following algorithm is required
7541  * for caseless matching of two strings:
7542  *
7543  * NFD(toCasefold(NFD(X))) == NFD(toCasefold(NFD(Y)))
7544  *
7545  * \param[in] s String to search in
7546  * \param[in] start_pos Start index in \e s
7547  * \param[in] search_s Search string
7548  * \param[out] found_pos Position in \e s where \e search_s was found
7549  * \param[out] found_len Length of match in \e s
7550  *
7551  * \attention
7552  * The strings \e s and \e search_s must be normalized to NFC by the caller!
7553  *
7554  * \note
7555  * It is treated as error if \e start_pos points inside a combing character
7556  * sequence (to a codepoint with nonzero canonical combining class).
7557  *
7558  * \return
7559  * - 0 on success (\e found_pos and \e found_len are valid)
7560  * - -1 on error (nothing was written to \e found_pos and \e found_len )
7561  */
7562 
7563 int enc_uc_search(const char* s, size_t start_pos, const char* search_s,
7564  size_t* found_pos, size_t* found_len)
7565 {
7566  int res = -1;
7567  int ok = 0, ok2 = 0, ok3 = 0, ok4 = 0, ok5 = 0;
7568  size_t search_s_len; /* Length of search string (in NFD) */
7569  size_t s_len; /* Length of target from start_pos (in NFD) */
7570  size_t i, j;
7571  size_t bi; /* Index in case folding target buffer */
7572  long int ucp;
7573  struct uc_cdc ucp_attr;
7574  long int mapping[3];
7575  size_t di;
7576  size_t match_pos = 0, match_len = 0; /* With case folding */
7577  size_t tmp_pos = 0, end_pos = 0; /* Without case folding */
7578  const char* s_nfd = NULL;
7579  const char* search_s_nfd = NULL;
7580  const char* s_cf = NULL;
7581  const char* search_s_cf = NULL;
7582  char* p;
7583  const char* q;
7584  char utf[4];
7585  size_t inc;
7586 
7587  /*
7588  * Only non-ASCII codepoints (at least 2 bytes in UTF-8) will increase the
7589  * length. The maximum length of an UTF-8 sequence is 4 bytes.
7590  * => Worst case increase factor is 2 (per codepoint for UTF-8).
7591  * Maximum increase factor on codepoint basis is 3 according to
7592  * Unicode 13.0.0 standard.
7593  * => Worst case increase factor is 6 (2 * 3 for case folding with UTF-8).
7594  */
7595  const size_t mem_factor = 6;
7596 
7597  /* Check input data and normalize to NFD from start position */
7598  if(NULL == s || enc_uc_check_utf8(s)) { goto error; }
7599  i = 0; ucp = enc_uc_decode_utf8(s, &i);
7600  if(-1L == ucp) { goto error; }
7601  enc_uc_lookup_cdc(ucp, &ucp_attr);
7602  if(ucp_attr.ccc) { goto error; }
7603  s_nfd = enc_uc_normalize_to_nfd(&s[start_pos]);
7604  if(NULL == s_nfd) { goto error; }
7605 
7606  /* Check search string and normalize it to NFD */
7607  if(NULL == search_s || enc_uc_check_utf8(search_s)) { goto error; }
7608  search_s_nfd = enc_uc_normalize_to_nfd(search_s);
7609  if(NULL == search_s_nfd) { goto error; }
7610 
7611  /* Unicode case folding for search_s */
7612  search_s_len = strlen(search_s_nfd);
7613  if(!search_s_len) { goto error; }
7614  if(search_s_len * mem_factor + (size_t) 1 < search_s_len)
7615  {
7616  /* Wraparound in memory size calculation */
7617  PRINT_ERROR("Memory allocation failed");
7618  }
7619  else
7620  {
7621  p = (char*) api_posix_malloc(search_s_len * mem_factor + (size_t) 1);
7622  if(NULL == p) { PRINT_ERROR("Memory allocation failed"); }
7623  else
7624  {
7625  i = 0;
7626  bi = 0;
7627  while(1)
7628  {
7629  ucp = enc_uc_decode_utf8(search_s_nfd, &i);
7630  if(-1L == ucp) { break; }
7631  else
7632  {
7633  enc_uc_lookup_cf(ucp, mapping);
7634  for(j = 0; (size_t) 3 > j; ++j)
7635  {
7636  if(-1L == mapping[j]) { break; }
7637  else
7638  {
7639  di = 1;
7640  enc_uc_encode_utf8(p, &bi, &mapping[j], &di);
7641  }
7642  }
7643  }
7644  }
7645  p[bi] = 0;
7646  /* Normalize target string to NFD again after case folding */
7647  q = enc_uc_normalize_to_nfd(p);
7648  if(NULL == q) { enc_free((void*) p); }
7649  else
7650  {
7651  if(p == q) { search_s_cf = p; }
7652  else
7653  {
7654  enc_free((void*) p);
7655  search_s_cf = q;
7656  }
7657  match_len = strlen(search_s_cf);
7658  ok = 1;
7659  }
7660  }
7661  }
7662 
7663  /* Unicode case folding for s */
7664  if(ok)
7665  {
7666  s_len = strlen(s_nfd);
7667  if(s_len * mem_factor + (size_t) 1 < s_len)
7668  {
7669  /* Wraparound in memory size calculation */
7670  PRINT_ERROR("Memory allocation failed");
7671  }
7672  else
7673  {
7674  p = (char*) api_posix_malloc(s_len * mem_factor + (size_t) 1);
7675  if(NULL == p) { PRINT_ERROR("Memory allocation failed"); }
7676  else
7677  {
7678  i = 0;
7679  bi = 0;
7680  while(1)
7681  {
7682  ucp = enc_uc_decode_utf8(s_nfd, &i);
7683  if(-1L == ucp) { break; }
7684  else
7685  {
7686  enc_uc_lookup_cf(ucp, mapping);
7687  for(j = 0; (size_t) 3 > j; ++j)
7688  {
7689  if(-1L == mapping[j]) { break; }
7690  else
7691  {
7692  di = 1;
7693  enc_uc_encode_utf8(p, &bi, &mapping[j], &di);
7694  }
7695  }
7696  }
7697  }
7698  p[bi] = 0;
7699  /* Normalize target string to NFD again after case folding */
7700  q = enc_uc_normalize_to_nfd(p);
7701  if(NULL == q) { enc_free((void*) p); }
7702  else
7703  {
7704  if(strlen(p) != strlen(q))
7705  {
7706  /*
7707  * The result must have the same length (only reordering is
7708  * allowed), otherwise the position calculation below will
7709  * fail!
7710  *
7711  * For this implementation:
7712  * It is assumed, that the NFD normalization after case
7713  * folding of NFD data will not change the length in UTF-8.
7714  * Fail gracefully if this assumption is wrong or become
7715  * wrong with the database from a future Unicode version
7716  * => Report no match and print a bug warning in this case.
7717  */
7718  PRINT_ERROR("Case folding failed, length changed (bug)");
7719  }
7720  else
7721  {
7722  if(p == q) { s_cf = p; }
7723  else
7724  {
7725  enc_free((void*) p);
7726  s_cf = q;
7727  }
7728  ok2 = 1;
7729  }
7730  }
7731  }
7732  }
7733  }
7734 
7735  /* Search with binary compare in case folded data */
7736  if(ok && ok2)
7737  {
7738  p = strstr(s_cf, search_s_cf);
7739  if(NULL != p)
7740  {
7741  match_pos = (size_t) (p - s_cf);
7742  ok3 = 1;
7743  }
7744  }
7745 
7746  /*
7747  * Unicode normalization and full case folding may have changed the length
7748  * of the data (in codepoints and in bytes for UTF-8).
7749  *
7750  * Therefore both positions of a match, start and end, may be different
7751  * compared to the original data. The corresponding positions for the
7752  * original data must be calculated.
7753  */
7754  if(ok3)
7755  {
7756  /* Calculate start and end offsets of match in (unfolded) NFD data */
7757  i = 0;
7758  bi = 0;
7759  while(1)
7760  {
7761  if(bi == match_pos)
7762  {
7763  tmp_pos = i;
7764  ok4 = 1;
7765  }
7766  ucp = enc_uc_decode_utf8(s_nfd, &i);
7767  if(-1L == ucp) { break; }
7768  else
7769  {
7770  enc_uc_lookup_cf(ucp, mapping);
7771  for(j = 0; (size_t) 3 > j; ++j)
7772  {
7773  if(-1L == mapping[j]) { break; }
7774  else
7775  {
7776  di = 1;
7777  inc = 0;
7778  enc_uc_encode_utf8(utf, &inc, &mapping[j], &di);
7779  /* Result data is thrown away, only its length is used */
7780  bi += inc;
7781  }
7782  }
7783  }
7784  if(ok4 && (bi == match_pos + match_len))
7785  {
7786  end_pos = i;
7787  ok5 = 1;
7788  break;
7789  }
7790  }
7791 
7792  /* Calculate start and end offsets of match in NFC data */
7793  if(ok5 && tmp_pos < end_pos)
7794  {
7795  p = (char*) api_posix_malloc(end_pos + (size_t) 1);
7796  if(NULL == p) { PRINT_ERROR("Memory allocation failed"); }
7797  else
7798  {
7799  strncpy(p, s_nfd, end_pos); p[end_pos] = 0;
7800  q = enc_uc_normalize_to_nfc(p);
7801  if(NULL != q)
7802  {
7803  j = strlen(q);
7804  if(p != q) { api_posix_free((void*) q); }
7805  if(end_pos >= j)
7806  {
7807  end_pos -= (end_pos - j);
7808  /* Calculate start offset of match in original NFC data */
7809  p[tmp_pos] = 0;
7810  q = enc_uc_normalize_to_nfc(p);
7811  if(NULL != q)
7812  {
7813  j = strlen(q);
7814  if(p != q) { api_posix_free((void*) q); }
7815  if(tmp_pos >= j)
7816  {
7817  tmp_pos -= (tmp_pos - j);
7818  *found_pos = start_pos + tmp_pos;
7819  *found_len = end_pos - tmp_pos;
7820  res = 0;
7821  }
7822  }
7823  }
7824  }
7825  api_posix_free((void*) p);
7826  }
7827  }
7828  }
7829 
7830  api_posix_free((void*) search_s_cf);
7831  api_posix_free((void*) s_cf);
7832 
7833 error:
7834  if(search_s != search_s_nfd) { api_posix_free((void*) search_s_nfd); }
7835  if(&s[start_pos] != s_nfd) { api_posix_free((void*) s_nfd); }
7836 
7837  return(res);
7838 }
7839 
7840 
7841 /* ========================================================================== */
7842 /*! \brief Free an object allocated by encoding module
7843  *
7844  * Use this function to release dynamic memory that was allocated by the
7845  * encoding module.
7846  *
7847  * \param[in] p Pointer to object
7848  *
7849  * Release the memory for the object pointed to by \e p.
7850  *
7851  * \note
7852  * The pointer \e p is allowed to be \c NULL and no operation is performed in
7853  * this case.
7854  */
7855 
7856 void enc_free(void* p)
7857 {
7858  /*
7859  * Attention:
7860  * Parts of the CORE module (for historical reasons) still release memory
7861  * allocated by this module with \c api_posix_free() directly. Until that
7862  * separation is complete, this function must forward to the same function
7863  * and the memory manager of this module cannot be changed.
7864  */
7865  api_posix_free(p);
7866 }
7867 
7868 
7869 /*! @} */
7870 
7871 /* EOF */
ENC_CS_ISO8859_14
Definition: encoding.h:75
fu_write_to_filedesc
int fu_write_to_filedesc(int filedesc, const char *buffer, size_t len)
Write data block to filedescriptor.
Definition: fileutils.c:552
enc_get_iso8601_utc
int enc_get_iso8601_utc(char *isodate)
Get current UTC date in ISO 8601 conformant format.
Definition: encoding.c:3412
ENC_CS_ISO8859_5
Definition: encoding.h:67
enc_ascii_convert_distribution
void enc_ascii_convert_distribution(char *s)
Convert body of distribution header field.
Definition: encoding.c:4052
ENC_CS_KOI8U
Definition: encoding.h:81
enc_mime_ct::subtype
enum enc_mime_ct_subtype subtype
Definition: encoding.h:115
ENC_CS_IBM850
Definition: encoding.h:93
CONF_FORCE_UNICODE
Definition: conf.h:77
ENC_URI_SCHEME_HTTP
Definition: encoding.h:132
ENC_CT_VIDEO
Definition: encoding.h:29
ENC_CS_ASCII
Definition: encoding.h:62
ENC_CTE_BIN
Definition: encoding.h:53
enc_mime_mpe
Locations of MIME multipart entities.
Definition: encoding.h:122
enc_free
void enc_free(void *p)
Free an object allocated by encoding module.
Definition: encoding.c:7856
ENC_CS_IBM775
Definition: encoding.h:92
enc_convert_to_utf8_nfc
const char * enc_convert_to_utf8_nfc(enum enc_mime_cs charset, const char *s)
Convert string from supported character set to Unicode (UTF-8 NFC)
Definition: encoding.c:4777
MAIN_ERR_PREFIX
#define MAIN_ERR_PREFIX
Message prefix for ENCODING module.
Definition: encoding.c:58
core_anum_t
#define core_anum_t
Article number data type (value zero is always reserved)
Definition: core.h:24
enc_mime_cs
enc_mime_cs
IDs for supported MIME character sets.
Definition: encoding.h:59
enc_uri_percent_encode
const char * enc_uri_percent_encode(const char *s, enum enc_uri_scheme sch)
Percent encoding for URI content.
Definition: encoding.c:7390
ENC_URI_SCHEME_NEWS
Definition: encoding.h:134
ENC_CT_MULTIPART
Definition: encoding.h:30
enc_mime_flowed_decode
const char * enc_mime_flowed_decode(const char *s, unsigned int delsp, unsigned int insline)
Decode MIME "text/plain" content with "format=flowed" parameter.
Definition: encoding.c:6844
ENC_CS_BUFLEN
#define ENC_CS_BUFLEN
Buffer size for character set name strings.
Definition: encoding.h:153
core_time_t
unsigned long int core_time_t
Time in seconds since the epoch (in terms of POSIX.1)
Definition: core.h:54
enc_mime_save_to_file
int enc_mime_save_to_file(const char *pn, enum enc_mime_cte cte, const char *entity)
Decode MIME content transfer encoding and save to file.
Definition: encoding.c:6704
enc_rot13
void enc_rot13(char *data)
Encode or decode data with ROT13 algorithm.
Definition: encoding.c:3709
ENC_CS_ISO8859_4
Definition: encoding.h:66
enc_mime_word_decode
int enc_mime_word_decode(const char **r, const char *b)
Decode header field containing potential MIME encoded-word tokens.
Definition: encoding.c:5504
ENC_CS_ISO8859_8
Definition: encoding.h:70
ENC_CS_ISO8859_10
Definition: encoding.h:72
enc_mime_get_cte
enum enc_mime_cte enc_mime_get_cte(const char *hf_body)
Decode content transfer encoding description.
Definition: encoding.c:6522
ENC_CS_WINDOWS_1256
Definition: encoding.h:88
enc_ascii_check
int enc_ascii_check(const char *s)
Verify ASCII encoding.
Definition: encoding.c:3922
config
struct conf config[CONF_NUM]
Global configuration.
Definition: conf.c:63
ENC_CTE_8BIT
Definition: encoding.h:52
enc_convert_ascii_to_anum
int enc_convert_ascii_to_anum(core_anum_t *result, const char *wm, int len)
Convert number from ASCII to numerical format.
Definition: encoding.c:3621
ENC_CS_KOI8R
Definition: encoding.h:80
enc_percent_decode
int enc_percent_decode(char *s, int clean)
Percent decoder.
Definition: encoding.c:7306
ENC_BO_BUFLEN
#define ENC_BO_BUFLEN
Buffer size for multipart boundary strings.
Definition: encoding.h:176
ENC_CS_ISO8859_X
Definition: encoding.h:78
ENC_CS_ISO8859_16
Definition: encoding.h:77
enc_mime_ct::type
enum enc_mime_ct_type type
Definition: encoding.h:114
enc_mime_mpe::start
const char * start
Definition: encoding.h:124
ENC_CTE_Q
Definition: encoding.h:54
enc_convert_lines_to_string
void enc_convert_lines_to_string(char *l, unsigned long int l_raw)
Convert number of lines to string.
Definition: encoding.c:3136
ENC_CS_WINDOWS_1255
Definition: encoding.h:87
enc_mime_mpe::len
size_t len
Definition: encoding.h:125
enc_wm_pattern
Wildmat array element (for RFC 3977 wildmat-pattern)
Definition: encoding.h:139
ENC_CS_ISO8859_6
Definition: encoding.h:68
ENC_CS_ISO8859_7
Definition: encoding.h:69
enc_convert_posix_to_iso8601
int enc_convert_posix_to_iso8601(char *isodate, core_time_t pts)
Convert POSIX timestamp to ISO 8601 conformant local date and time.
Definition: encoding.c:3359
enc_extract_addr_spec
const char * enc_extract_addr_spec(const char *mailbox)
Extract addr-spec token from RFC 5322 mailbox.
Definition: encoding.c:3808
enc_ascii_check_alpha
int enc_ascii_check_alpha(const char *s)
Check for ASCII alphabetic characters.
Definition: encoding.c:3950
CONF_QUOTESTYLE
Definition: conf.h:59
enc_uc_encode_utf8
void enc_uc_encode_utf8(char *buf, size_t *i, long int *dbuf, size_t *di)
Encode Unicode codepoints to UTF-8.
Definition: encoding.c:572
enc_timestamp_decode
core_time_t enc_timestamp_decode(const char *timestamp)
Decode canonical timestamp to POSIX time (seconds since epoch)
Definition: encoding.c:3171
ENC_CTE_BUFLEN
#define ENC_CTE_BUFLEN
Buffer size for content transfer encoding name strings.
Definition: encoding.h:150
enc_ascii_check_printable
int enc_ascii_check_printable(const char *s)
Check for printable ASCII characters.
Definition: encoding.c:4000
ENC_CS_ISO2022_JP
Definition: encoding.h:96
ENC_CS_WINDOWS_1257
Definition: encoding.h:89
ENC_CT_AUDIO
Definition: encoding.h:28
enc_ascii_check_digit
int enc_ascii_check_digit(const char *s)
Check for ASCII digit characters.
Definition: encoding.c:3973
enc_mime_message
size_t enc_mime_message(const char *s, size_t len, struct enc_mime_mpe **mpe)
Extract MIME encapsulated message.
Definition: encoding.c:7156
ENC_CS_ISO8859_13
Definition: encoding.h:74
enc_mime_para_decode
int enc_mime_para_decode(const char **r, const char *b, int m)
Decode header field containing potential MIME parameters.
Definition: encoding.c:5803
ENC_CS_WINDOWS_1253
Definition: encoding.h:85
enc_uc_check_utf8
int enc_uc_check_utf8(const char *s)
Verify UTF-8 encoding.
Definition: encoding.c:4140
enc_mime_decode
const char * enc_mime_decode(enum enc_mime_cte cte, enum enc_mime_cs charset, const char *s)
Decode MIME text content to UTF-8 NFC.
Definition: encoding.c:6789
enc_convert_anum_to_ascii
int enc_convert_anum_to_ascii(char result[17], size_t *len, core_anum_t wm)
Convert article number from numerical format to ASCII.
Definition: encoding.c:3575
ENC_CTS_MIXED
Definition: encoding.h:40
enc_mime_ct::charset
enum enc_mime_cs charset
Definition: encoding.h:116
PRINT_ERROR
#define PRINT_ERROR(s)
Prepend module prefix and print error message.
Definition: main.h:19
data
struct core_data data
Global data object (shared by all threads)
Definition: core.c:242
ENC_URI_SCHEME_MAILTO
Definition: encoding.h:135
ENC_CTS_PLAIN
Definition: encoding.h:39
ENC_MIME_PARA_LENGTH_MAX
#define ENC_MIME_PARA_LENGTH_MAX
Maximum length of MIME parameter attribute tokens.
Definition: encoding.c:64
ENC_CTS_DIGEST
Definition: encoding.h:42
enc_destroy_wildmat
void enc_destroy_wildmat(struct enc_wm_pattern **obj, int num)
Destroy wildmat pattern array.
Definition: encoding.c:4516
enc_convert_octet_to_hex
int enc_convert_octet_to_hex(char *result, unsigned int octet)
Convert octet to hexadecimal (ASCII) format.
Definition: encoding.c:3681
ENC_CS_WINDOWS_1258
Definition: encoding.h:90
NNTP_ANUM_T_MAX
#define NNTP_ANUM_T_MAX
Maximum value this implementation supports for nntp_anum_t.
Definition: nntp.h:52
ENC_CTE_7BIT
Definition: encoding.h:51
ENC_CTS_RFC822
Definition: encoding.h:43
ENC_URI_SCHEME_FTP
Definition: encoding.h:133
ENC_CS_UTF_7
Definition: encoding.h:98
ENC_CS_MACINTOSH
Definition: encoding.h:79
enc_mime_ct
MIME content type information.
Definition: encoding.h:112
enc_ascii_convert_to_printable
void enc_ascii_convert_to_printable(char *s)
Convert to printable ASCII format.
Definition: encoding.c:4027
enc_convert_posix_to_canonical
const char * enc_convert_posix_to_canonical(const char *s)
Convert from local (POSIX) to canonical (RFC 822) form.
Definition: encoding.c:4679
enc_uri_scheme
enc_uri_scheme
URI schemes.
Definition: encoding.h:129
ENC_CS_ISO8859_9
Definition: encoding.h:71
ENC_CS_IBM852
Definition: encoding.h:94
ENC_CS_WINDOWS_1251
Definition: encoding.h:83
ENC_CS_WINDOWS_1252
Definition: encoding.h:84
ENC_CS_ISO8859_11
Definition: encoding.h:73
enc_create_wildmat
int enc_create_wildmat(struct enc_wm_pattern **obj, const char *wm)
Create wildmat pattern array.
Definition: encoding.c:4349
enc_convert_iso8601_to_timestamp
int enc_convert_iso8601_to_timestamp(const char **ts, const char *isodate)
Convert ISO 8601 conformant date to canonical timestamp.
Definition: encoding.c:3520
fu_close_file
void fu_close_file(int *filedesc, FILE **stream)
Close file (and potentially associated I/O stream)
Definition: fileutils.c:297
ENC_CTS_ALTERNATIVE
Definition: encoding.h:41
enc_uc_repair_utf8
const char * enc_uc_repair_utf8(const char *s)
Repair UTF-8 encoding.
Definition: encoding.c:4159
ENC_CS_IBM858
Definition: encoding.h:95
ENC_CS_ISO8859_15
Definition: encoding.h:76
ENC_CS_WINDOWS_1254
Definition: encoding.h:86
ENC_CT_MESSAGE
Definition: encoding.h:31
ENC_CT_IMAGE
Definition: encoding.h:27
ENC_CS_ISO8859_3
Definition: encoding.h:65
ENC_MIME_HEADER_FOLD_ASCII_LINES
#define ENC_MIME_HEADER_FOLD_ASCII_LINES
MIME word encoder folding behaviour.
Definition: encoding.c:79
ENC_CS_CESU_8
Definition: encoding.h:97
enc_mime_encode_base64
int enc_mime_encode_base64(const char **enc, const char *data, size_t len)
Encode binary data to base64.
Definition: encoding.c:3761
enc_convert_canonical_to_posix
const char * enc_convert_canonical_to_posix(const char *s, int rcr, int rlf)
Convert from canonical (RFC 822) to local (POSIX) form.
Definition: encoding.c:4558
enc_create_name_addr
const char * enc_create_name_addr(const char *data, size_t offset)
Create a "name-addr" construct according to RFC 5322.
Definition: encoding.c:2969
enc_lines_decode
unsigned long int enc_lines_decode(const char *lines)
Decode number of lines.
Definition: encoding.c:3115
ENC_CTE_B
Definition: encoding.h:55
enc_mime_get_cd
void enc_mime_get_cd(const char *hf_body, enum enc_mime_cd *type, const char **filename)
Decode content disposition.
Definition: encoding.c:6605
ENC_CS_ISO8859_2
Definition: encoding.h:64
enc_mime_ct::flags
unsigned int flags
Definition: encoding.h:118
ENC_CS_WINDOWS_1250
Definition: encoding.h:82
ENC_CS_UTF_8
Definition: encoding.h:99
nntp_anum_t
unsigned long int nntp_anum_t
Article number.
Definition: nntp.h:28
enc_mime_cd
enc_mime_cd
IDs for supported MIME content disposition.
Definition: encoding.h:104
CONF_FLOWED_CRLF
Definition: conf.h:76
ENC_CS_ISO8859_1
Definition: encoding.h:63
enc_mime_multipart
size_t enc_mime_multipart(const char *s, const char *b, struct enc_mime_mpe **mpe)
Parse MIME multipart content.
Definition: encoding.c:7198
CORE_ANUM_T_MAX
#define CORE_ANUM_T_MAX
Article number limit.
Definition: core.h:180
enc_mime_word_encode
int enc_mime_word_encode(const char **r, const char *b, size_t pl)
Encode header field body using MIME encoded-word tokens.
Definition: encoding.c:5089
enc_mime_get_ct
void enc_mime_get_ct(struct enc_mime_ct *ct, const char *hf_body, char *bo)
Decode MIME "Content-Type" header field.
Definition: encoding.c:6221
ENC_CS_IBM437
Definition: encoding.h:91
enc_mime_cte
enc_mime_cte
IDs for supported MIME content transfer encodings.
Definition: encoding.h:48
enc_convert_to_8bit
const char * enc_convert_to_8bit(enum enc_mime_cs *charset, const char *s, const char **cs_iana)
Convert string from Unicode (UTF-8 NFC) to an 8bit character set.
Definition: encoding.c:4991
enc_convert_iso8601_to_posix
int enc_convert_iso8601_to_posix(core_time_t *pts, const char *isodate)
Convert ISO 8601 conformant UTC date and time to POSIX timestamp.
Definition: encoding.c:3467
fu_open_file
int fu_open_file(const char *pathname, int *filedesc, int mode, api_posix_mode_t perm)
Open file.
Definition: fileutils.c:246
ENC_CT_TEXT
Definition: encoding.h:26
print_error
void print_error(const char *)
Print error message.
Definition: main.cxx:276

Generated at 2026-01-27 using  doxygen