encoding.c
Go to the documentation of this file.
1 /* ========================================================================== */
2 /*! \file
3  * \brief Shared encoding related functions
4  *
5  * Copyright (c) 2012-2023 by the developers. See the LICENSE file for details.
6  *
7  * If nothing else is specified, functions return zero to indicate success
8  * and a negative value to indicate an error.
9  */
10 
11 
12 /* ========================================================================== */
13 /* Include headers */
14 
15 #include "posix.h" /* Include this first because of feature test macros */
16 
17 #include <ctype.h>
18 #include <stddef.h>
19 #include <string.h>
20 
21 #include "conf.h"
22 #include "encoding.h"
23 #include "fileutils.h"
24 #include "main.h"
25 
26 
27 /* ========================================================================== */
28 /*! \defgroup ENCODING ENC: Codeset and header field handling
29  *
30  * The functions in this group should be conformant to the following standards:
31  * ANSI X3.4,
32  * ISO 2022, ISO 8601, ISO 8859, ISO 10646,
33  * RFC 1468, RFC 2045, RFC 2046, RFC 2047, RFC 2049, RFC 2152, RFC 2183,
34  * RFC 2231, RFC 2646, RFC 3629, RFC 3676, RFC 5198, RFC 5536, RFC 6657,
35  * POSIX.1-1996,
36  * Unicode 14.0.0
37  *
38  * \todo
39  * We don't use \c iconv() because on old operating systems there may be no
40  * Unicode support. And even on such old machines we don't want an external
41  * dependency on GNU iconv.
42  * <br>
43  * There should be an option to use the system's \c iconv() on request.
44  */
45 /*! @{ */
46 
47 
48 /* ========================================================================== */
49 /* Constants */
50 
51 /*! \brief Message prefix for ENCODING module */
52 #define MAIN_ERR_PREFIX "ENC: "
53 
54 /* Define this to nonzero to enable Unicode NFC normalization debugging */
55 #define ENC_UC_NORM_DEBUG 0
56 
/*! \brief Maximum length of MIME parameter attribute tokens
 *
 * The replacement list is fully parenthesized so that the macro always expands
 * to one primary expression of type \c size_t . Without the outer parentheses
 * the leading cast makes constructs like \c sizeof \c ENC_MIME_PARA_LENGTH_MAX
 * fail to parse.
 */
#define ENC_MIME_PARA_LENGTH_MAX  ((size_t) 127)
59 
60 /*! \brief MIME word encoder folding behaviour
61  *
62  * If this is defined to nonzero, all lines of RFC 2047 conformant header fields
63  * that contain MIME encoded words are folded before 76 characters. Otherwise
64  * all lines that contain no encoded-words are not folded before 998 characters.
65  *
66  * RFC 2047 is ambiguous regarding this rule:
67  * <br>
68  * https://tools.ietf.org/html/rfc2047#section-2
69  * <br>
70  * The default value 1 is safe in any case. Please read section 2, paragraph 5
71  * carefully before redefining this to 0!
72  */
73 #define ENC_MIME_HEADER_FOLD_ASCII_LINES 1
74 
75 
76 /* ========================================================================== */
77 /* Data types */
78 
/*
 * States of the ISO 2022 decoder
 *
 * The numeric values are the ones the compiler assigned implicitly before
 * (0, 1, 2); they are only spelled out here.
 * NOTE(review): judging by the names, every state stands for the character set
 * that is currently selected by the decoder -- confirm against the decoder.
 */
enum iso2022_state
{
   ISO2022_ASCII      = 0,
   ISO2022_ISO646     = 1,
   ISO2022_JIS_X_0208 = 2
};
86 
/*
 * Unicode hangul syllable type
 *
 * The numeric values are the ones the compiler assigned implicitly before
 * (0 .. 5); they are only spelled out here.
 */
enum uc_hs_type
{
   UC_HST_NONE = 0,  /* No hangul syllable type */
   UC_HST_L    = 1,  /* Type L */
   UC_HST_V    = 2,  /* Type V */
   UC_HST_T    = 3,  /* Type T */
   UC_HST_LV   = 4,  /* Type LV */
   UC_HST_LVT  = 5   /* Type LVT */
};
97 
/*
 * Record of the Unicode canonical decomposition data (includes the canonical
 * combining class of the codepoint)
 *
 * NOTE(review): the member order is kept unchanged because the data included
 * from uc_cdc.c presumably uses positional initializers -- verify before
 * reordering anything.
 */
struct uc_cdc
{
   long           cp;   /* Codepoint described by this record */
   unsigned char  ccc;  /* Canonical combining class */
   long           dc1;  /* Decomposition mapping, recursive part 1 */
   long           dc2;  /* Decomposition mapping, non-recursive part 2 */
};
106 
107 /* Unicode hangul syllable type ranges */
108 struct uc_hst
109 {
110  long int first; /* First codepoint of range */
111  long int last; /* Last codepoint of range */
112  enum uc_hs_type hst; /* Hangul syllable type */
113 };
114 
/*
 * Codepoint range of the Unicode NFC quick check data
 *
 * A range listed in this data indicates that normalization is required.
 */
struct uc_qc_nfc
{
   long  first;  /* First codepoint of the range */
   long  last;   /* Last codepoint of the range */
};
121 
/* Codepoint range of the Unicode full composition exclusion data */
struct uc_fce
{
   long  first;  /* First codepoint of the range */
   long  last;   /* Last codepoint of the range */
};
128 
/*
 * Record of the Unicode default case folding mapping
 *
 * NOTE(review): the former member comments called first/second/third
 * "codepoint of range". For a mapping they are presumably the (up to three)
 * codepoints that cp is folded to -- verify against the data in uc_cf.c.
 */
struct uc_cf
{
   long  cp;      /* Codepoint to be mapped */
   long  first;   /* First codepoint (presumably of the mapping result) */
   long  second;  /* Second codepoint (presumably of the mapping result) */
   long  third;   /* Third codepoint (presumably of the mapping result) */
};
137 
/* ISO 2022-JP to Unicode codepoint mapping (one table entry) */
struct iso2022_jp
{
   long  jis;  /* JIS X 0208 codepoint */
   long  uc;   /* Unicode codepoint */
};
144 
145 /* MIME parameter (for RFC 2231 decoder) */
146 struct mime_parameter
147 {
148  int valid;
149  char attribute[ENC_MIME_PARA_LENGTH_MAX + 1];
150  size_t attribute_len;
151  unsigned int section;
152  char charset[ENC_MIME_PARA_LENGTH_MAX + 1];
153  const char* value_start;
154  const char* value_end;
155 };
156 
157 
158 /* ========================================================================== */
159 /* Constants */
160 
161 /*
162  * UTF-8 sequence to insert for unassigned codepoints in 8 bit character sets
163  *
164  * \attention
165  * The sequence must represent a single Unicode codepoint!
166  */
167 #define ENC_UA "\xEF\xBF\xBD" /* U+FFFD */
168 
169 /* Unicode codepoint inserted for rejected control characters */
170 #define ENC_RC 0xFFFDL /* U+FFFD */
171 
/*
 * Size of Unicode decomposition buffer
 * Minimum size: 8
 *
 * The replacement list is fully parenthesized so that the macro always expands
 * to one primary expression of type size_t (the bare leading cast broke
 * constructs like "sizeof ENC_UC_DECOMPOSITION_BUFSIZE").
 */
#define ENC_UC_DECOMPOSITION_BUFSIZE  ((size_t) 16)
177 
/*
 * Maximum size of header line
 *
 * Fully parenthesized so that the macro always expands to one primary
 * expression of type size_t.
 */
#define ENC_HDR_BUFSIZE  ((size_t) 998)
180 
/*
 * Buffer size for "Format" parameter of MIME "Content-Type" header field
 *
 * Fully parenthesized so that the macro always expands to one primary
 * expression of type size_t.
 */
#define ENC_FMT_BUFLEN  ((size_t) 7)
183 
184 /* Unicode canonical decomposition data */
185 #include "../uc_cdc.c"
186 
187 /* Unicode hangul syllable data */
188 #include "../uc_hst.c"
189 
190 /* Unicode NFC quick check data */
191 #include "../uc_qc_nfc.c"
192 
193 /* Unicode full composition exclusion data */
194 #include "../uc_fce.c"
195 
196 /* Unicode default case folding data */
197 #include "../uc_cf.c"
198 
199 /* ISO 2022-JP to Unicode mapping data */
200 #include "../iso2022-jp.c"
201 
/*
 * MIME base64 encoding table
 *
 * Entry n is the character that represents the 6 bit value n. The table has
 * exactly 64 entries and is not NUL-terminated.
 */
static const char enc_base64[64] =
{
   /* Values 0 .. 25: Uppercase letters */
   'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
   'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
   /* Values 26 .. 51: Lowercase letters */
   'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
   'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
   /* Values 52 .. 61: Digits */
   '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
   /* Values 62 and 63 */
   '+', '/'
};
214 
215 
216 /* ========================================================================== */
217 /* Variables */
218 
/*! Ignored value that was assigned to silence compiler warning
 *
 * Results that are intentionally not evaluated can be stored here. The object
 * is \c volatile so that such a store counts as a side effect and the compiler
 * must not remove it.
 */
static volatile int ign;
221 
222 
223 /* ========================================================================== */
224 /* Character sets */
225 /*
226  * Lines with overlength are accepted here to keep 16 (one hexadecimal digit)
227  * character codes on all lines. This is the most common representation in code
228  * tables.
229  */
230 
/*
 * ISO 8859-1, upper half: UTF-8 NFC sequence for every octet 0x80 .. 0xFF
 * (table index = octet value - 0x80, one row per high nibble)
 */
static const char *enc_iso8859_1[128] =
{
   /* 8x */ "\xC2\x80", "\xC2\x81", "\xC2\x82", "\xC2\x83", "\xC2\x84", "\xC2\x85", "\xC2\x86", "\xC2\x87", "\xC2\x88", "\xC2\x89", "\xC2\x8A", "\xC2\x8B", "\xC2\x8C", "\xC2\x8D", "\xC2\x8E", "\xC2\x8F",
   /* 9x */ "\xC2\x90", "\xC2\x91", "\xC2\x92", "\xC2\x93", "\xC2\x94", "\xC2\x95", "\xC2\x96", "\xC2\x97", "\xC2\x98", "\xC2\x99", "\xC2\x9A", "\xC2\x9B", "\xC2\x9C", "\xC2\x9D", "\xC2\x9E", "\xC2\x9F",
   /* Ax */ "\xC2\xA0", "\xC2\xA1", "\xC2\xA2", "\xC2\xA3", "\xC2\xA4", "\xC2\xA5", "\xC2\xA6", "\xC2\xA7", "\xC2\xA8", "\xC2\xA9", "\xC2\xAA", "\xC2\xAB", "\xC2\xAC", "\xC2\xAD", "\xC2\xAE", "\xC2\xAF",
   /* Bx */ "\xC2\xB0", "\xC2\xB1", "\xC2\xB2", "\xC2\xB3", "\xC2\xB4", "\xC2\xB5", "\xC2\xB6", "\xC2\xB7", "\xC2\xB8", "\xC2\xB9", "\xC2\xBA", "\xC2\xBB", "\xC2\xBC", "\xC2\xBD", "\xC2\xBE", "\xC2\xBF",
   /* Cx */ "\xC3\x80", "\xC3\x81", "\xC3\x82", "\xC3\x83", "\xC3\x84", "\xC3\x85", "\xC3\x86", "\xC3\x87", "\xC3\x88", "\xC3\x89", "\xC3\x8A", "\xC3\x8B", "\xC3\x8C", "\xC3\x8D", "\xC3\x8E", "\xC3\x8F",
   /* Dx */ "\xC3\x90", "\xC3\x91", "\xC3\x92", "\xC3\x93", "\xC3\x94", "\xC3\x95", "\xC3\x96", "\xC3\x97", "\xC3\x98", "\xC3\x99", "\xC3\x9A", "\xC3\x9B", "\xC3\x9C", "\xC3\x9D", "\xC3\x9E", "\xC3\x9F",
   /* Ex */ "\xC3\xA0", "\xC3\xA1", "\xC3\xA2", "\xC3\xA3", "\xC3\xA4", "\xC3\xA5", "\xC3\xA6", "\xC3\xA7", "\xC3\xA8", "\xC3\xA9", "\xC3\xAA", "\xC3\xAB", "\xC3\xAC", "\xC3\xAD", "\xC3\xAE", "\xC3\xAF",
   /* Fx */ "\xC3\xB0", "\xC3\xB1", "\xC3\xB2", "\xC3\xB3", "\xC3\xB4", "\xC3\xB5", "\xC3\xB6", "\xC3\xB7", "\xC3\xB8", "\xC3\xB9", "\xC3\xBA", "\xC3\xBB", "\xC3\xBC", "\xC3\xBD", "\xC3\xBE", "\xC3\xBF"
};
243 
/*
 * ISO 8859-2, upper half: UTF-8 NFC sequence for every octet 0x80 .. 0xFF
 * (table index = octet value - 0x80, one row per high nibble)
 */
static const char *enc_iso8859_2[128] =
{
   /* 8x */ "\xC2\x80", "\xC2\x81", "\xC2\x82", "\xC2\x83", "\xC2\x84", "\xC2\x85", "\xC2\x86", "\xC2\x87", "\xC2\x88", "\xC2\x89", "\xC2\x8A", "\xC2\x8B", "\xC2\x8C", "\xC2\x8D", "\xC2\x8E", "\xC2\x8F",
   /* 9x */ "\xC2\x90", "\xC2\x91", "\xC2\x92", "\xC2\x93", "\xC2\x94", "\xC2\x95", "\xC2\x96", "\xC2\x97", "\xC2\x98", "\xC2\x99", "\xC2\x9A", "\xC2\x9B", "\xC2\x9C", "\xC2\x9D", "\xC2\x9E", "\xC2\x9F",
   /* Ax */ "\xC2\xA0", "\xC4\x84", "\xCB\x98", "\xC5\x81", "\xC2\xA4", "\xC4\xBD", "\xC5\x9A", "\xC2\xA7", "\xC2\xA8", "\xC5\xA0", "\xC5\x9E", "\xC5\xA4", "\xC5\xB9", "\xC2\xAD", "\xC5\xBD", "\xC5\xBB",
   /* Bx */ "\xC2\xB0", "\xC4\x85", "\xCB\x9B", "\xC5\x82", "\xC2\xB4", "\xC4\xBE", "\xC5\x9B", "\xCB\x87", "\xC2\xB8", "\xC5\xA1", "\xC5\x9F", "\xC5\xA5", "\xC5\xBA", "\xCB\x9D", "\xC5\xBE", "\xC5\xBC",
   /* Cx */ "\xC5\x94", "\xC3\x81", "\xC3\x82", "\xC4\x82", "\xC3\x84", "\xC4\xB9", "\xC4\x86", "\xC3\x87", "\xC4\x8C", "\xC3\x89", "\xC4\x98", "\xC3\x8B", "\xC4\x9A", "\xC3\x8D", "\xC3\x8E", "\xC4\x8E",
   /* Dx */ "\xC4\x90", "\xC5\x83", "\xC5\x87", "\xC3\x93", "\xC3\x94", "\xC5\x90", "\xC3\x96", "\xC3\x97", "\xC5\x98", "\xC5\xAE", "\xC3\x9A", "\xC5\xB0", "\xC3\x9C", "\xC3\x9D", "\xC5\xA2", "\xC3\x9F",
   /* Ex */ "\xC5\x95", "\xC3\xA1", "\xC3\xA2", "\xC4\x83", "\xC3\xA4", "\xC4\xBA", "\xC4\x87", "\xC3\xA7", "\xC4\x8D", "\xC3\xA9", "\xC4\x99", "\xC3\xAB", "\xC4\x9B", "\xC3\xAD", "\xC3\xAE", "\xC4\x8F",
   /* Fx */ "\xC4\x91", "\xC5\x84", "\xC5\x88", "\xC3\xB3", "\xC3\xB4", "\xC5\x91", "\xC3\xB6", "\xC3\xB7", "\xC5\x99", "\xC5\xAF", "\xC3\xBA", "\xC5\xB1", "\xC3\xBC", "\xC3\xBD", "\xC5\xA3", "\xCB\x99"
};
256 
257 /* UTF-8 NFC sequences for upper half of ISO 8859-3 character set */
258 static const char* enc_iso8859_3[128] =
259 {
260  "\xC2\x80", "\xC2\x81", "\xC2\x82", "\xC2\x83", "\xC2\x84", "\xC2\x85", "\xC2\x86", "\xC2\x87", "\xC2\x88", "\xC2\x89", "\xC2\x8A", "\xC2\x8B", "\xC2\x8C", "\xC2\x8D", "\xC2\x8E", "\xC2\x8F",
261  "\xC2\x90", "\xC2\x91", "\xC2\x92", "\xC2\x93", "\xC2\x94", "\xC2\x95", "\xC2\x96", "\xC2\x97", "\xC2\x98", "\xC2\x99", "\xC2\x9A", "\xC2\x9B", "\xC2\x9C", "\xC2\x9D", "\xC2\x9E", "\xC2\x9F",
262  "\xC2\xA0", "\xC4\xA6", "\xCB\x98", "\xC2\xA3", "\xC2\xA4", ENC_UA, "\xC4\xA4", "\xC2\xA7", "\xC2\xA8", "\xC4\xB0", "\xC5\x9E", "\xC4\x9E", "\xC4\xB4", "\xC2\xAD", ENC_UA, "\xC5\xBB",
263  "\xC2\xB0", "\xC4\xA7", "\xC2\xB2", "\xC2\xB3", "\xC2\xB4", "\xC2\xB5", "\xC4\xA5", "\xC2\xB7", "\xC2\xB8", "\xC4\xB1", "\xC5\x9F", "\xC4\x9F", "\xC4\xB5", "\xC2\xBD", ENC_UA, "\xC5\xBC",
264  "\xC3\x80", "\xC3\x81", "\xC3\x82", ENC_UA, "\xC3\x84", "\xC4\x8A", "\xC4\x88", "\xC3\x87", "\xC3\x88", "\xC3\x89", "\xC3\x8A", "\xC3\x8B", "\xC3\x8C", "\xC3\x8D", "\xC3\x8E", "\xC3\x8F",
265  ENC_UA, "\xC3\x91", "\xC3\x92", "\xC3\x93", "\xC3\x94", "\xC4\xA0", "\xC3\x96", "\xC3\x97", "\xC4\x9C", "\xC3\x99", "\xC3\x9A", "\xC3\x9B", "\xC3\x9C", "\xC5\xAC", "\xC5\x9C", "\xC3\x9F",
266  "\xC3\xA0", "\xC3\xA1", "\xC3\xA2", ENC_UA, "\xC3\xA4", "\xC4\x8B", "\xC4\x89", "\xC3\xA7", "\xC3\xA8", "\xC3\xA9", "\xC3\xAA", "\xC3\xAB", "\xC3\xAC", "\xC3\xAD", "\xC3\xAE", "\xC3\xAF",
267  ENC_UA, "\xC3\xB1", "\xC3\xB2", "\xC3\xB3", "\xC3\xB4", "\xC4\xA1", "\xC3\xB6", "\xC3\xB7", "\xC4\x9D", "\xC3\xB9", "\xC3\xBA", "\xC3\xBB", "\xC3\xBC", "\xC5\xAD", "\xC5\x9D", "\xCB\x99"
268 };
269 
/*
 * ISO 8859-4, upper half: UTF-8 NFC sequence for every octet 0x80 .. 0xFF
 * (table index = octet value - 0x80, one row per high nibble)
 */
static const char *enc_iso8859_4[128] =
{
   /* 8x */ "\xC2\x80", "\xC2\x81", "\xC2\x82", "\xC2\x83", "\xC2\x84", "\xC2\x85", "\xC2\x86", "\xC2\x87", "\xC2\x88", "\xC2\x89", "\xC2\x8A", "\xC2\x8B", "\xC2\x8C", "\xC2\x8D", "\xC2\x8E", "\xC2\x8F",
   /* 9x */ "\xC2\x90", "\xC2\x91", "\xC2\x92", "\xC2\x93", "\xC2\x94", "\xC2\x95", "\xC2\x96", "\xC2\x97", "\xC2\x98", "\xC2\x99", "\xC2\x9A", "\xC2\x9B", "\xC2\x9C", "\xC2\x9D", "\xC2\x9E", "\xC2\x9F",
   /* Ax */ "\xC2\xA0", "\xC4\x84", "\xC4\xB8", "\xC5\x96", "\xC2\xA4", "\xC4\xA8", "\xC4\xBB", "\xC2\xA7", "\xC2\xA8", "\xC5\xA0", "\xC4\x92", "\xC4\xA2", "\xC5\xA6", "\xC2\xAD", "\xC5\xBD", "\xC2\xAF",
   /* Bx */ "\xC2\xB0", "\xC4\x85", "\xCB\x9B", "\xC5\x97", "\xC2\xB4", "\xC4\xA9", "\xC4\xBC", "\xCB\x87", "\xC2\xB8", "\xC5\xA1", "\xC4\x93", "\xC4\xA3", "\xC5\xA7", "\xC5\x8A", "\xC5\xBE", "\xC5\x8B",
   /* Cx */ "\xC4\x80", "\xC3\x81", "\xC3\x82", "\xC3\x83", "\xC3\x84", "\xC3\x85", "\xC3\x86", "\xC4\xAE", "\xC4\x8C", "\xC3\x89", "\xC4\x98", "\xC3\x8B", "\xC4\x96", "\xC3\x8D", "\xC3\x8E", "\xC4\xAA",
   /* Dx */ "\xC4\x90", "\xC5\x85", "\xC5\x8C", "\xC4\xB6", "\xC3\x94", "\xC3\x95", "\xC3\x96", "\xC3\x97", "\xC3\x98", "\xC5\xB2", "\xC3\x9A", "\xC3\x9B", "\xC3\x9C", "\xC5\xA8", "\xC5\xAA", "\xC3\x9F",
   /* Ex */ "\xC4\x81", "\xC3\xA1", "\xC3\xA2", "\xC3\xA3", "\xC3\xA4", "\xC3\xA5", "\xC3\xA6", "\xC4\xAF", "\xC4\x8D", "\xC3\xA9", "\xC4\x99", "\xC3\xAB", "\xC4\x97", "\xC3\xAD", "\xC3\xAE", "\xC4\xAB",
   /* Fx */ "\xC4\x91", "\xC5\x86", "\xC5\x8D", "\xC4\xB7", "\xC3\xB4", "\xC3\xB5", "\xC3\xB6", "\xC3\xB7", "\xC3\xB8", "\xC5\xB3", "\xC3\xBA", "\xC3\xBB", "\xC3\xBC", "\xC5\xA9", "\xC5\xAB", "\xCB\x99"
};
282 
/*
 * ISO 8859-5, upper half: UTF-8 NFC sequence for every octet 0x80 .. 0xFF
 * (table index = octet value - 0x80, one row per high nibble)
 */
static const char *enc_iso8859_5[128] =
{
   /* 8x */ "\xC2\x80", "\xC2\x81", "\xC2\x82", "\xC2\x83", "\xC2\x84", "\xC2\x85", "\xC2\x86", "\xC2\x87", "\xC2\x88", "\xC2\x89", "\xC2\x8A", "\xC2\x8B", "\xC2\x8C", "\xC2\x8D", "\xC2\x8E", "\xC2\x8F",
   /* 9x */ "\xC2\x90", "\xC2\x91", "\xC2\x92", "\xC2\x93", "\xC2\x94", "\xC2\x95", "\xC2\x96", "\xC2\x97", "\xC2\x98", "\xC2\x99", "\xC2\x9A", "\xC2\x9B", "\xC2\x9C", "\xC2\x9D", "\xC2\x9E", "\xC2\x9F",
   /* Ax */ "\xC2\xA0", "\xD0\x81", "\xD0\x82", "\xD0\x83", "\xD0\x84", "\xD0\x85", "\xD0\x86", "\xD0\x87", "\xD0\x88", "\xD0\x89", "\xD0\x8A", "\xD0\x8B", "\xD0\x8C", "\xC2\xAD", "\xD0\x8E", "\xD0\x8F",
   /* Bx */ "\xD0\x90", "\xD0\x91", "\xD0\x92", "\xD0\x93", "\xD0\x94", "\xD0\x95", "\xD0\x96", "\xD0\x97", "\xD0\x98", "\xD0\x99", "\xD0\x9A", "\xD0\x9B", "\xD0\x9C", "\xD0\x9D", "\xD0\x9E", "\xD0\x9F",
   /* Cx */ "\xD0\xA0", "\xD0\xA1", "\xD0\xA2", "\xD0\xA3", "\xD0\xA4", "\xD0\xA5", "\xD0\xA6", "\xD0\xA7", "\xD0\xA8", "\xD0\xA9", "\xD0\xAA", "\xD0\xAB", "\xD0\xAC", "\xD0\xAD", "\xD0\xAE", "\xD0\xAF",
   /* Dx */ "\xD0\xB0", "\xD0\xB1", "\xD0\xB2", "\xD0\xB3", "\xD0\xB4", "\xD0\xB5", "\xD0\xB6", "\xD0\xB7", "\xD0\xB8", "\xD0\xB9", "\xD0\xBA", "\xD0\xBB", "\xD0\xBC", "\xD0\xBD", "\xD0\xBE", "\xD0\xBF",
   /* Ex */ "\xD1\x80", "\xD1\x81", "\xD1\x82", "\xD1\x83", "\xD1\x84", "\xD1\x85", "\xD1\x86", "\xD1\x87", "\xD1\x88", "\xD1\x89", "\xD1\x8A", "\xD1\x8B", "\xD1\x8C", "\xD1\x8D", "\xD1\x8E", "\xD1\x8F",
   /* Fx */ "\xE2\x84\x96", "\xD1\x91", "\xD1\x92", "\xD1\x93", "\xD1\x94", "\xD1\x95", "\xD1\x96", "\xD1\x97", "\xD1\x98", "\xD1\x99", "\xD1\x9A", "\xD1\x9B", "\xD1\x9C", "\xC2\xA7", "\xD1\x9E", "\xD1\x9F"
};
295 
296 /* UTF-8 NFC sequences for upper half of ISO 8859-6 character set */
297 static const char* enc_iso8859_6[128] =
298 {
299  "\xC2\x80", "\xC2\x81", "\xC2\x82", "\xC2\x83", "\xC2\x84", "\xC2\x85", "\xC2\x86", "\xC2\x87", "\xC2\x88", "\xC2\x89", "\xC2\x8A", "\xC2\x8B", "\xC2\x8C", "\xC2\x8D", "\xC2\x8E", "\xC2\x8F",
300  "\xC2\x90", "\xC2\x91", "\xC2\x92", "\xC2\x93", "\xC2\x94", "\xC2\x95", "\xC2\x96", "\xC2\x97", "\xC2\x98", "\xC2\x99", "\xC2\x9A", "\xC2\x9B", "\xC2\x9C", "\xC2\x9D", "\xC2\x9E", "\xC2\x9F",
301  "\xC2\xA0", ENC_UA, ENC_UA, ENC_UA, "\xC2\xA4", ENC_UA, ENC_UA, ENC_UA, ENC_UA, ENC_UA, ENC_UA, ENC_UA, "\xD8\x8C", "\xC2\xAD", ENC_UA, ENC_UA,
302  ENC_UA, ENC_UA, ENC_UA, ENC_UA, ENC_UA, ENC_UA, ENC_UA, ENC_UA, ENC_UA, ENC_UA, ENC_UA, "\xD8\x9B", ENC_UA, ENC_UA, ENC_UA, "\xD8\x9F",
303  ENC_UA, "\xD8\xA1", "\xD8\xA2", "\xD8\xA3", "\xD8\xA4", "\xD8\xA5", "\xD8\xA6", "\xD8\xA7", "\xD8\xA8", "\xD8\xA9", "\xD8\xAA", "\xD8\xAB", "\xD8\xAC", "\xD8\xAD", "\xD8\xAE", "\xD8\xAF",
304  "\xD8\xB0", "\xD8\xB1", "\xD8\xB2", "\xD8\xB3", "\xD8\xB4", "\xD8\xB5", "\xD8\xB6", "\xD8\xB7", "\xD8\xB8", "\xD8\xB9", "\xD8\xBA", ENC_UA, ENC_UA, ENC_UA, ENC_UA, ENC_UA,
305  "\xD9\x80", "\xD9\x81", "\xD9\x82", "\xD9\x83", "\xD9\x84", "\xD9\x85", "\xD9\x86", "\xD9\x87", "\xD9\x88", "\xD9\x89", "\xD9\x8A", "\xD9\x8B", "\xD9\x8C", "\xD9\x8D", "\xD9\x8E", "\xD9\x8F",
306  "\xD9\x90", "\xD9\x91", "\xD9\x92", ENC_UA, ENC_UA, ENC_UA, ENC_UA, ENC_UA, ENC_UA, ENC_UA, ENC_UA, ENC_UA, ENC_UA, ENC_UA, ENC_UA, ENC_UA
307 };
308 
309 /* UTF-8 NFC sequences for upper half of ISO 8859-7 character set */
310 static const char* enc_iso8859_7[128] =
311 {
312  "\xC2\x80", "\xC2\x81", "\xC2\x82", "\xC2\x83", "\xC2\x84", "\xC2\x85", "\xC2\x86", "\xC2\x87", "\xC2\x88", "\xC2\x89", "\xC2\x8A", "\xC2\x8B", "\xC2\x8C", "\xC2\x8D", "\xC2\x8E", "\xC2\x8F",
313  "\xC2\x90", "\xC2\x91", "\xC2\x92", "\xC2\x93", "\xC2\x94", "\xC2\x95", "\xC2\x96", "\xC2\x97", "\xC2\x98", "\xC2\x99", "\xC2\x9A", "\xC2\x9B", "\xC2\x9C", "\xC2\x9D", "\xC2\x9E", "\xC2\x9F",
314  "\xC2\xA0", "\xE2\x80\x98", "\xE2\x80\x99", "\xC2\xA3", "\xE2\x82\xAC", "\xE2\x82\xAF", "\xC2\xA6", "\xC2\xA7", "\xC2\xA8", "\xC2\xA9", "\xCD\xBA", "\xC2\xAB", "\xC2\xAC", "\xC2\xAD", ENC_UA, "\xE2\x80\x95",
315  "\xC2\xB0", "\xC2\xB1", "\xC2\xB2", "\xC2\xB3", "\xCE\x84", "\xCE\x85", "\xCE\x86", "\xC2\xB7", "\xCE\x88", "\xCE\x89", "\xCE\x8A", "\xC2\xBB", "\xCE\x8C", "\xC2\xBD", "\xCE\x8E", "\xCE\x8F",
316  "\xCE\x90", "\xCE\x91", "\xCE\x92", "\xCE\x93", "\xCE\x94", "\xCE\x95", "\xCE\x96", "\xCE\x97", "\xCE\x98", "\xCE\x99", "\xCE\x9A", "\xCE\x9B", "\xCE\x9C", "\xCE\x9D", "\xCE\x9E", "\xCE\x9F",
317  "\xCE\xA0", "\xCE\xA1", ENC_UA, "\xCE\xA3", "\xCE\xA4", "\xCE\xA5", "\xCE\xA6", "\xCE\xA7", "\xCE\xA8", "\xCE\xA9", "\xCE\xAA", "\xCE\xAB", "\xCE\xAC", "\xCE\xAD", "\xCE\xAE", "\xCE\xAF",
318  "\xCE\xB0", "\xCE\xB1", "\xCE\xB2", "\xCE\xB3", "\xCE\xB4", "\xCE\xB5", "\xCE\xB6", "\xCE\xB7", "\xCE\xB8", "\xCE\xB9", "\xCE\xBA", "\xCE\xBB", "\xCE\xBC", "\xCE\xBD", "\xCE\xBE", "\xCE\xBF",
319  "\xCF\x80", "\xCF\x81", "\xCF\x82", "\xCF\x83", "\xCF\x84", "\xCF\x85", "\xCF\x86", "\xCF\x87", "\xCF\x88", "\xCF\x89", "\xCF\x8A", "\xCF\x8B", "\xCF\x8C", "\xCF\x8D", "\xCF\x8E", ENC_UA
320 };
321 
322 /* UTF-8 NFC sequences for upper half of ISO 8859-8 character set */
323 static const char* enc_iso8859_8[128] =
324 {
325  "\xC2\x80", "\xC2\x81", "\xC2\x82", "\xC2\x83", "\xC2\x84", "\xC2\x85", "\xC2\x86", "\xC2\x87", "\xC2\x88", "\xC2\x89", "\xC2\x8A", "\xC2\x8B", "\xC2\x8C", "\xC2\x8D", "\xC2\x8E", "\xC2\x8F",
326  "\xC2\x90", "\xC2\x91", "\xC2\x92", "\xC2\x93", "\xC2\x94", "\xC2\x95", "\xC2\x96", "\xC2\x97", "\xC2\x98", "\xC2\x99", "\xC2\x9A", "\xC2\x9B", "\xC2\x9C", "\xC2\x9D", "\xC2\x9E", "\xC2\x9F",
327  "\xC2\xA0", ENC_UA, "\xC2\xA2", "\xC2\xA3", "\xC2\xA4", "\xC2\xA5", "\xC2\xA6", "\xC2\xA7", "\xC2\xA8", "\xC2\xA9", "\xC3\x97", "\xC2\xAB", "\xC2\xAC", "\xC2\xAD", "\xC2\xAE", "\xC2\xAF",
328  "\xC2\xB0", "\xC2\xB1", "\xC2\xB2", "\xC2\xB3", "\xC2\xB4", "\xC2\xB5", "\xC2\xB6", "\xC2\xB7", "\xC2\xB8", "\xC2\xB9", "\xC3\xB7", "\xC2\xBB", "\xC2\xBC", "\xC2\xBD", "\xC2\xBE", ENC_UA,
329  ENC_UA, ENC_UA, ENC_UA, ENC_UA, ENC_UA, ENC_UA, ENC_UA, ENC_UA, ENC_UA, ENC_UA, ENC_UA, ENC_UA, ENC_UA, ENC_UA, ENC_UA, ENC_UA,
330  ENC_UA, ENC_UA, ENC_UA, ENC_UA, ENC_UA, ENC_UA, ENC_UA, ENC_UA, ENC_UA, ENC_UA, ENC_UA, ENC_UA, ENC_UA, ENC_UA, ENC_UA, "\xE2\x80\x97",
331  "\xD7\x90", "\xD7\x91", "\xD7\x92", "\xD7\x93", "\xD7\x94", "\xD7\x95", "\xD7\x96", "\xD7\x97", "\xD7\x98", "\xD7\x99", "\xD7\x9A", "\xD7\x9B", "\xD7\x9C", "\xD7\x9D", "\xD7\x9E", "\xD7\x9F",
332  "\xD7\xA0", "\xD7\xA1", "\xD7\xA2", "\xD7\xA3", "\xD7\xA4", "\xD7\xA5", "\xD7\xA6", "\xD7\xA7", "\xD7\xA8", "\xD7\xA9", "\xD7\xAA", ENC_UA, ENC_UA, "\xE2\x80\x8E", "\xE2\x80\x8F", ENC_UA
333 };
334 
/*
 * ISO 8859-9, upper half: UTF-8 NFC sequence for every octet 0x80 .. 0xFF
 * (table index = octet value - 0x80, one row per high nibble)
 */
static const char *enc_iso8859_9[128] =
{
   /* 8x */ "\xC2\x80", "\xC2\x81", "\xC2\x82", "\xC2\x83", "\xC2\x84", "\xC2\x85", "\xC2\x86", "\xC2\x87", "\xC2\x88", "\xC2\x89", "\xC2\x8A", "\xC2\x8B", "\xC2\x8C", "\xC2\x8D", "\xC2\x8E", "\xC2\x8F",
   /* 9x */ "\xC2\x90", "\xC2\x91", "\xC2\x92", "\xC2\x93", "\xC2\x94", "\xC2\x95", "\xC2\x96", "\xC2\x97", "\xC2\x98", "\xC2\x99", "\xC2\x9A", "\xC2\x9B", "\xC2\x9C", "\xC2\x9D", "\xC2\x9E", "\xC2\x9F",
   /* Ax */ "\xC2\xA0", "\xC2\xA1", "\xC2\xA2", "\xC2\xA3", "\xC2\xA4", "\xC2\xA5", "\xC2\xA6", "\xC2\xA7", "\xC2\xA8", "\xC2\xA9", "\xC2\xAA", "\xC2\xAB", "\xC2\xAC", "\xC2\xAD", "\xC2\xAE", "\xC2\xAF",
   /* Bx */ "\xC2\xB0", "\xC2\xB1", "\xC2\xB2", "\xC2\xB3", "\xC2\xB4", "\xC2\xB5", "\xC2\xB6", "\xC2\xB7", "\xC2\xB8", "\xC2\xB9", "\xC2\xBA", "\xC2\xBB", "\xC2\xBC", "\xC2\xBD", "\xC2\xBE", "\xC2\xBF",
   /* Cx */ "\xC3\x80", "\xC3\x81", "\xC3\x82", "\xC3\x83", "\xC3\x84", "\xC3\x85", "\xC3\x86", "\xC3\x87", "\xC3\x88", "\xC3\x89", "\xC3\x8A", "\xC3\x8B", "\xC3\x8C", "\xC3\x8D", "\xC3\x8E", "\xC3\x8F",
   /* Dx */ "\xC4\x9E", "\xC3\x91", "\xC3\x92", "\xC3\x93", "\xC3\x94", "\xC3\x95", "\xC3\x96", "\xC3\x97", "\xC3\x98", "\xC3\x99", "\xC3\x9A", "\xC3\x9B", "\xC3\x9C", "\xC4\xB0", "\xC5\x9E", "\xC3\x9F",
   /* Ex */ "\xC3\xA0", "\xC3\xA1", "\xC3\xA2", "\xC3\xA3", "\xC3\xA4", "\xC3\xA5", "\xC3\xA6", "\xC3\xA7", "\xC3\xA8", "\xC3\xA9", "\xC3\xAA", "\xC3\xAB", "\xC3\xAC", "\xC3\xAD", "\xC3\xAE", "\xC3\xAF",
   /* Fx */ "\xC4\x9F", "\xC3\xB1", "\xC3\xB2", "\xC3\xB3", "\xC3\xB4", "\xC3\xB5", "\xC3\xB6", "\xC3\xB7", "\xC3\xB8", "\xC3\xB9", "\xC3\xBA", "\xC3\xBB", "\xC3\xBC", "\xC4\xB1", "\xC5\x9F", "\xC3\xBF"
};
347 
/*
 * ISO 8859-10, upper half: UTF-8 NFC sequence for every octet 0x80 .. 0xFF
 * (table index = octet value - 0x80, one row per high nibble)
 */
static const char *enc_iso8859_10[128] =
{
   /* 8x */ "\xC2\x80", "\xC2\x81", "\xC2\x82", "\xC2\x83", "\xC2\x84", "\xC2\x85", "\xC2\x86", "\xC2\x87", "\xC2\x88", "\xC2\x89", "\xC2\x8A", "\xC2\x8B", "\xC2\x8C", "\xC2\x8D", "\xC2\x8E", "\xC2\x8F",
   /* 9x */ "\xC2\x90", "\xC2\x91", "\xC2\x92", "\xC2\x93", "\xC2\x94", "\xC2\x95", "\xC2\x96", "\xC2\x97", "\xC2\x98", "\xC2\x99", "\xC2\x9A", "\xC2\x9B", "\xC2\x9C", "\xC2\x9D", "\xC2\x9E", "\xC2\x9F",
   /* Ax */ "\xC2\xA0", "\xC4\x84", "\xC4\x92", "\xC4\xA2", "\xC4\xAA", "\xC4\xA8", "\xC4\xB6", "\xC2\xA7", "\xC4\xBB", "\xC4\x90", "\xC5\xA0", "\xC5\xA6", "\xC5\xBD", "\xC2\xAD", "\xC5\xAA", "\xC5\x8A",
   /* Bx */ "\xC2\xB0", "\xC4\x85", "\xC4\x93", "\xC4\xA3", "\xC4\xAB", "\xC4\xA9", "\xC4\xB7", "\xC2\xB7", "\xC4\xBC", "\xC4\x91", "\xC5\xA1", "\xC5\xA7", "\xC5\xBE", "\xE2\x80\x95", "\xC5\xAB", "\xC5\x8B",
   /* Cx */ "\xC4\x80", "\xC3\x81", "\xC3\x82", "\xC3\x83", "\xC3\x84", "\xC3\x85", "\xC3\x86", "\xC4\xAE", "\xC4\x8C", "\xC3\x89", "\xC4\x98", "\xC3\x8B", "\xC4\x96", "\xC3\x8D", "\xC3\x8E", "\xC3\x8F",
   /* Dx */ "\xC3\x90", "\xC5\x85", "\xC5\x8C", "\xC3\x93", "\xC3\x94", "\xC3\x95", "\xC3\x96", "\xC5\xA8", "\xC3\x98", "\xC5\xB2", "\xC3\x9A", "\xC3\x9B", "\xC3\x9C", "\xC3\x9D", "\xC3\x9E", "\xC3\x9F",
   /* Ex */ "\xC4\x81", "\xC3\xA1", "\xC3\xA2", "\xC3\xA3", "\xC3\xA4", "\xC3\xA5", "\xC3\xA6", "\xC4\xAF", "\xC4\x8D", "\xC3\xA9", "\xC4\x99", "\xC3\xAB", "\xC4\x97", "\xC3\xAD", "\xC3\xAE", "\xC3\xAF",
   /* Fx */ "\xC3\xB0", "\xC5\x86", "\xC5\x8D", "\xC3\xB3", "\xC3\xB4", "\xC3\xB5", "\xC3\xB6", "\xC5\xA9", "\xC3\xB8", "\xC5\xB3", "\xC3\xBA", "\xC3\xBB", "\xC3\xBC", "\xC3\xBD", "\xC3\xBE", "\xC4\xB8"
};
360 
361 /* UTF-8 NFC sequences for upper half of ISO 8859-11 character set */
362 static const char* enc_iso8859_11[128] =
363 {
364  "\xC2\x80", "\xC2\x81", "\xC2\x82", "\xC2\x83", "\xC2\x84", "\xC2\x85", "\xC2\x86", "\xC2\x87", "\xC2\x88", "\xC2\x89", "\xC2\x8A", "\xC2\x8B", "\xC2\x8C", "\xC2\x8D", "\xC2\x8E", "\xC2\x8F",
365  "\xC2\x90", "\xC2\x91", "\xC2\x92", "\xC2\x93", "\xC2\x94", "\xC2\x95", "\xC2\x96", "\xC2\x97", "\xC2\x98", "\xC2\x99", "\xC2\x9A", "\xC2\x9B", "\xC2\x9C", "\xC2\x9D", "\xC2\x9E", "\xC2\x9F",
366  "\xC2\xA0", "\xE0\xB8\x81", "\xE0\xB8\x82", "\xE0\xB8\x83", "\xE0\xB8\x84", "\xE0\xB8\x85", "\xE0\xB8\x86", "\xE0\xB8\x87", "\xE0\xB8\x88", "\xE0\xB8\x89", "\xE0\xB8\x8A", "\xE0\xB8\x8B", "\xE0\xB8\x8C", "\xE0\xB8\x8D", "\xE0\xB8\x8E", "\xE0\xB8\x8F",
367  "\xE0\xB8\x90", "\xE0\xB8\x91", "\xE0\xB8\x92", "\xE0\xB8\x93", "\xE0\xB8\x94", "\xE0\xB8\x95", "\xE0\xB8\x96", "\xE0\xB8\x97", "\xE0\xB8\x98", "\xE0\xB8\x99", "\xE0\xB8\x9A", "\xE0\xB8\x9B", "\xE0\xB8\x9C", "\xE0\xB8\x9D", "\xE0\xB8\x9E", "\xE0\xB8\x9F",
368  "\xE0\xB8\xA0", "\xE0\xB8\xA1", "\xE0\xB8\xA2", "\xE0\xB8\xA3", "\xE0\xB8\xA4", "\xE0\xB8\xA5", "\xE0\xB8\xA6", "\xE0\xB8\xA7", "\xE0\xB8\xA8", "\xE0\xB8\xA9", "\xE0\xB8\xAA", "\xE0\xB8\xAB", "\xE0\xB8\xAC", "\xE0\xB8\xAD", "\xE0\xB8\xAE", "\xE0\xB8\xAF",
369  "\xE0\xB8\xB0", "\xE0\xB8\xB1", "\xE0\xB8\xB2", "\xE0\xB8\xB3", "\xE0\xB8\xB4", "\xE0\xB8\xB5", "\xE0\xB8\xB6", "\xE0\xB8\xB7", "\xE0\xB8\xB8", "\xE0\xB8\xB9", "\xE0\xB8\xBA", ENC_UA, ENC_UA, ENC_UA, ENC_UA, "\xE0\xB8\xBF",
370  "\xE0\xB9\x80", "\xE0\xB9\x81", "\xE0\xB9\x82", "\xE0\xB9\x83", "\xE0\xB9\x84", "\xE0\xB9\x85", "\xE0\xB9\x86", "\xE0\xB9\x87", "\xE0\xB9\x88", "\xE0\xB9\x89", "\xE0\xB9\x8A", "\xE0\xB9\x8B", "\xE0\xB9\x8C", "\xE0\xB9\x8D", "\xE0\xB9\x8E", "\xE0\xB9\x8F",
371  "\xE0\xB9\x90", "\xE0\xB9\x91", "\xE0\xB9\x92", "\xE0\xB9\x93", "\xE0\xB9\x94", "\xE0\xB9\x95", "\xE0\xB9\x96", "\xE0\xB9\x97", "\xE0\xB9\x98", "\xE0\xB9\x99", "\xE0\xB9\x9A", "\xE0\xB9\x9B", ENC_UA, ENC_UA, ENC_UA, ENC_UA
372 };
373 
/*
 * ISO 8859-13, upper half: UTF-8 NFC sequence for every octet 0x80 .. 0xFF
 * (table index = octet value - 0x80, one row per high nibble)
 */
static const char *enc_iso8859_13[128] =
{
   /* 8x */ "\xC2\x80", "\xC2\x81", "\xC2\x82", "\xC2\x83", "\xC2\x84", "\xC2\x85", "\xC2\x86", "\xC2\x87", "\xC2\x88", "\xC2\x89", "\xC2\x8A", "\xC2\x8B", "\xC2\x8C", "\xC2\x8D", "\xC2\x8E", "\xC2\x8F",
   /* 9x */ "\xC2\x90", "\xC2\x91", "\xC2\x92", "\xC2\x93", "\xC2\x94", "\xC2\x95", "\xC2\x96", "\xC2\x97", "\xC2\x98", "\xC2\x99", "\xC2\x9A", "\xC2\x9B", "\xC2\x9C", "\xC2\x9D", "\xC2\x9E", "\xC2\x9F",
   /* Ax */ "\xC2\xA0", "\xE2\x80\x9D", "\xC2\xA2", "\xC2\xA3", "\xC2\xA4", "\xE2\x80\x9E", "\xC2\xA6", "\xC2\xA7", "\xC3\x98", "\xC2\xA9", "\xC5\x96", "\xC2\xAB", "\xC2\xAC", "\xC2\xAD", "\xC2\xAE", "\xC3\x86",
   /* Bx */ "\xC2\xB0", "\xC2\xB1", "\xC2\xB2", "\xC2\xB3", "\xE2\x80\x9C", "\xC2\xB5", "\xC2\xB6", "\xC2\xB7", "\xC3\xB8", "\xC2\xB9", "\xC5\x97", "\xC2\xBB", "\xC2\xBC", "\xC2\xBD", "\xC2\xBE", "\xC3\xA6",
   /* Cx */ "\xC4\x84", "\xC4\xAE", "\xC4\x80", "\xC4\x86", "\xC3\x84", "\xC3\x85", "\xC4\x98", "\xC4\x92", "\xC4\x8C", "\xC3\x89", "\xC5\xB9", "\xC4\x96", "\xC4\xA2", "\xC4\xB6", "\xC4\xAA", "\xC4\xBB",
   /* Dx */ "\xC5\xA0", "\xC5\x83", "\xC5\x85", "\xC3\x93", "\xC5\x8C", "\xC3\x95", "\xC3\x96", "\xC3\x97", "\xC5\xB2", "\xC5\x81", "\xC5\x9A", "\xC5\xAA", "\xC3\x9C", "\xC5\xBB", "\xC5\xBD", "\xC3\x9F",
   /* Ex */ "\xC4\x85", "\xC4\xAF", "\xC4\x81", "\xC4\x87", "\xC3\xA4", "\xC3\xA5", "\xC4\x99", "\xC4\x93", "\xC4\x8D", "\xC3\xA9", "\xC5\xBA", "\xC4\x97", "\xC4\xA3", "\xC4\xB7", "\xC4\xAB", "\xC4\xBC",
   /* Fx */ "\xC5\xA1", "\xC5\x84", "\xC5\x86", "\xC3\xB3", "\xC5\x8D", "\xC3\xB5", "\xC3\xB6", "\xC3\xB7", "\xC5\xB3", "\xC5\x82", "\xC5\x9B", "\xC5\xAB", "\xC3\xBC", "\xC5\xBC", "\xC5\xBE", "\xE2\x80\x99"
};
386 
/*
 * ISO 8859-14, upper half: UTF-8 NFC sequence for every octet 0x80 .. 0xFF
 * (table index = octet value - 0x80, one row per high nibble)
 */
static const char *enc_iso8859_14[128] =
{
   /* 8x */ "\xC2\x80", "\xC2\x81", "\xC2\x82", "\xC2\x83", "\xC2\x84", "\xC2\x85", "\xC2\x86", "\xC2\x87", "\xC2\x88", "\xC2\x89", "\xC2\x8A", "\xC2\x8B", "\xC2\x8C", "\xC2\x8D", "\xC2\x8E", "\xC2\x8F",
   /* 9x */ "\xC2\x90", "\xC2\x91", "\xC2\x92", "\xC2\x93", "\xC2\x94", "\xC2\x95", "\xC2\x96", "\xC2\x97", "\xC2\x98", "\xC2\x99", "\xC2\x9A", "\xC2\x9B", "\xC2\x9C", "\xC2\x9D", "\xC2\x9E", "\xC2\x9F",
   /* Ax */ "\xC2\xA0", "\xE1\xB8\x82", "\xE1\xB8\x83", "\xC2\xA3", "\xC4\x8A", "\xC4\x8B", "\xE1\xB8\x8A", "\xC2\xA7", "\xE1\xBA\x80", "\xC2\xA9", "\xE1\xBA\x82", "\xE1\xB8\x8B", "\xE1\xBB\xB2", "\xC2\xAD", "\xC2\xAE", "\xC5\xB8",
   /* Bx */ "\xE1\xB8\x9E", "\xE1\xB8\x9F", "\xC4\xA0", "\xC4\xA1", "\xE1\xB9\x80", "\xE1\xB9\x81", "\xC2\xB6", "\xE1\xB9\x96", "\xE1\xBA\x81", "\xE1\xB9\x97", "\xE1\xBA\x83", "\xE1\xB9\xA0", "\xE1\xBB\xB3", "\xE1\xBA\x84", "\xE1\xBA\x85", "\xE1\xB9\xA1",
   /* Cx */ "\xC3\x80", "\xC3\x81", "\xC3\x82", "\xC3\x83", "\xC3\x84", "\xC3\x85", "\xC3\x86", "\xC3\x87", "\xC3\x88", "\xC3\x89", "\xC3\x8A", "\xC3\x8B", "\xC3\x8C", "\xC3\x8D", "\xC3\x8E", "\xC3\x8F",
   /* Dx */ "\xC5\xB4", "\xC3\x91", "\xC3\x92", "\xC3\x93", "\xC3\x94", "\xC3\x95", "\xC3\x96", "\xE1\xB9\xAA", "\xC3\x98", "\xC3\x99", "\xC3\x9A", "\xC3\x9B", "\xC3\x9C", "\xC3\x9D", "\xC5\xB6", "\xC3\x9F",
   /* Ex */ "\xC3\xA0", "\xC3\xA1", "\xC3\xA2", "\xC3\xA3", "\xC3\xA4", "\xC3\xA5", "\xC3\xA6", "\xC3\xA7", "\xC3\xA8", "\xC3\xA9", "\xC3\xAA", "\xC3\xAB", "\xC3\xAC", "\xC3\xAD", "\xC3\xAE", "\xC3\xAF",
   /* Fx */ "\xC5\xB5", "\xC3\xB1", "\xC3\xB2", "\xC3\xB3", "\xC3\xB4", "\xC3\xB5", "\xC3\xB6", "\xE1\xB9\xAB", "\xC3\xB8", "\xC3\xB9", "\xC3\xBA", "\xC3\xBB", "\xC3\xBC", "\xC3\xBD", "\xC5\xB7", "\xC3\xBF"
};
399 
/*
 * ISO 8859-15, upper half: UTF-8 NFC sequence for every octet 0x80 .. 0xFF
 * (table index = octet value - 0x80, one row per high nibble)
 */
static const char *enc_iso8859_15[128] =
{
   /* 8x */ "\xC2\x80", "\xC2\x81", "\xC2\x82", "\xC2\x83", "\xC2\x84", "\xC2\x85", "\xC2\x86", "\xC2\x87", "\xC2\x88", "\xC2\x89", "\xC2\x8A", "\xC2\x8B", "\xC2\x8C", "\xC2\x8D", "\xC2\x8E", "\xC2\x8F",
   /* 9x */ "\xC2\x90", "\xC2\x91", "\xC2\x92", "\xC2\x93", "\xC2\x94", "\xC2\x95", "\xC2\x96", "\xC2\x97", "\xC2\x98", "\xC2\x99", "\xC2\x9A", "\xC2\x9B", "\xC2\x9C", "\xC2\x9D", "\xC2\x9E", "\xC2\x9F",
   /* Ax */ "\xC2\xA0", "\xC2\xA1", "\xC2\xA2", "\xC2\xA3", "\xE2\x82\xAC", "\xC2\xA5", "\xC5\xA0", "\xC2\xA7", "\xC5\xA1", "\xC2\xA9", "\xC2\xAA", "\xC2\xAB", "\xC2\xAC", "\xC2\xAD", "\xC2\xAE", "\xC2\xAF",
   /* Bx */ "\xC2\xB0", "\xC2\xB1", "\xC2\xB2", "\xC2\xB3", "\xC5\xBD", "\xC2\xB5", "\xC2\xB6", "\xC2\xB7", "\xC5\xBE", "\xC2\xB9", "\xC2\xBA", "\xC2\xBB", "\xC5\x92", "\xC5\x93", "\xC5\xB8", "\xC2\xBF",
   /* Cx */ "\xC3\x80", "\xC3\x81", "\xC3\x82", "\xC3\x83", "\xC3\x84", "\xC3\x85", "\xC3\x86", "\xC3\x87", "\xC3\x88", "\xC3\x89", "\xC3\x8A", "\xC3\x8B", "\xC3\x8C", "\xC3\x8D", "\xC3\x8E", "\xC3\x8F",
   /* Dx */ "\xC3\x90", "\xC3\x91", "\xC3\x92", "\xC3\x93", "\xC3\x94", "\xC3\x95", "\xC3\x96", "\xC3\x97", "\xC3\x98", "\xC3\x99", "\xC3\x9A", "\xC3\x9B", "\xC3\x9C", "\xC3\x9D", "\xC3\x9E", "\xC3\x9F",
   /* Ex */ "\xC3\xA0", "\xC3\xA1", "\xC3\xA2", "\xC3\xA3", "\xC3\xA4", "\xC3\xA5", "\xC3\xA6", "\xC3\xA7", "\xC3\xA8", "\xC3\xA9", "\xC3\xAA", "\xC3\xAB", "\xC3\xAC", "\xC3\xAD", "\xC3\xAE", "\xC3\xAF",
   /* Fx */ "\xC3\xB0", "\xC3\xB1", "\xC3\xB2", "\xC3\xB3", "\xC3\xB4", "\xC3\xB5", "\xC3\xB6", "\xC3\xB7", "\xC3\xB8", "\xC3\xB9", "\xC3\xBA", "\xC3\xBB", "\xC3\xBC", "\xC3\xBD", "\xC3\xBE", "\xC3\xBF"
};
412 
/*
 * ISO 8859-16, upper half: UTF-8 NFC sequence for every octet 0x80 .. 0xFF
 * (table index = octet value - 0x80, one row per high nibble)
 */
static const char *enc_iso8859_16[128] =
{
   /* 8x */ "\xC2\x80", "\xC2\x81", "\xC2\x82", "\xC2\x83", "\xC2\x84", "\xC2\x85", "\xC2\x86", "\xC2\x87", "\xC2\x88", "\xC2\x89", "\xC2\x8A", "\xC2\x8B", "\xC2\x8C", "\xC2\x8D", "\xC2\x8E", "\xC2\x8F",
   /* 9x */ "\xC2\x90", "\xC2\x91", "\xC2\x92", "\xC2\x93", "\xC2\x94", "\xC2\x95", "\xC2\x96", "\xC2\x97", "\xC2\x98", "\xC2\x99", "\xC2\x9A", "\xC2\x9B", "\xC2\x9C", "\xC2\x9D", "\xC2\x9E", "\xC2\x9F",
   /* Ax */ "\xC2\xA0", "\xC4\x84", "\xC4\x85", "\xC5\x81", "\xE2\x82\xAC", "\xE2\x80\x9E", "\xC5\xA0", "\xC2\xA7", "\xC5\xA1", "\xC2\xA9", "\xC8\x98", "\xC2\xAB", "\xC5\xB9", "\xC2\xAD", "\xC5\xBA", "\xC5\xBB",
   /* Bx */ "\xC2\xB0", "\xC2\xB1", "\xC4\x8C", "\xC5\x82", "\xC5\xBD", "\xE2\x80\x9D", "\xC2\xB6", "\xC2\xB7", "\xC5\xBE", "\xC4\x8D", "\xC8\x99", "\xC2\xBB", "\xC5\x92", "\xC5\x93", "\xC5\xB8", "\xC5\xBC",
   /* Cx */ "\xC3\x80", "\xC3\x81", "\xC3\x82", "\xC4\x82", "\xC3\x84", "\xC4\x86", "\xC3\x86", "\xC3\x87", "\xC3\x88", "\xC3\x89", "\xC3\x8A", "\xC3\x8B", "\xC3\x8C", "\xC3\x8D", "\xC3\x8E", "\xC3\x8F",
   /* Dx */ "\xC4\x90", "\xC5\x83", "\xC3\x92", "\xC3\x93", "\xC3\x94", "\xC5\x90", "\xC3\x96", "\xC5\x9A", "\xC5\xB0", "\xC3\x99", "\xC3\x9A", "\xC3\x9B", "\xC3\x9C", "\xC4\x98", "\xC8\x9A", "\xC3\x9F",
   /* Ex */ "\xC3\xA0", "\xC3\xA1", "\xC3\xA2", "\xC4\x83", "\xC3\xA4", "\xC4\x87", "\xC3\xA6", "\xC3\xA7", "\xC3\xA8", "\xC3\xA9", "\xC3\xAA", "\xC3\xAB", "\xC3\xAC", "\xC3\xAD", "\xC3\xAE", "\xC3\xAF",
   /* Fx */ "\xC4\x91", "\xC5\x84", "\xC3\xB2", "\xC3\xB3", "\xC3\xB4", "\xC5\x91", "\xC3\xB6", "\xC5\x9B", "\xC5\xB1", "\xC3\xB9", "\xC3\xBA", "\xC3\xBB", "\xC3\xBC", "\xC4\x99", "\xC8\x9B", "\xC3\xBF"
};
425 
426 /* UTF-8 NFC sequences for upper half of MacRoman character set */
427 static const char* enc_mac_roman[128] =
428 {
429  "\xC3\x84", "\xC3\x85", "\xC3\x87", "\xC3\x89", "\xC3\x91", "\xC3\x96", "\xC3\x9C", "\xC3\xA1", "\xC3\xA0", "\xC3\xA2", "\xC3\xA4", "\xC3\xA3", "\xC3\xA5", "\xC3\xA7", "\xC3\xA9", "\xC3\xA8",
430  "\xC3\xAA", "\xC3\xAB", "\xC3\xAD", "\xC3\xAC", "\xC3\xAE", "\xC3\xAF", "\xC3\xB1", "\xC3\xB3", "\xC3\xB2", "\xC3\xB4", "\xC3\xB6", "\xC3\xB5", "\xC3\xBA", "\xC3\xB9", "\xC3\xBB", "\xC3\xBC",
431  "\xE2\x80\xA0", "\xC2\xB0", "\xC2\xA2", "\xC2\xA3", "\xC2\xA7", "\xE2\x80\xA2", "\xC2\xB6", "\xC3\x9F", "\xC2\xAE", "\xC2\xA9", "\xE2\x84\xA2", "\xC2\xB4", "\xC2\xA8", "\xE2\x89\xA0", "\xC3\x86", "\xC3\x98",
432  "\xE2\x88\x9E", "\xC2\xB1", "\xE2\x89\xA4", "\xE2\x89\xA5", "\xC2\xA5", "\xC2\xB5", "\xE2\x88\x82", "\xE2\x88\x91", "\xE2\x88\x8F", "\xCF\x80", "\xE2\x88\xAB", "\xC2\xAA", "\xC2\xBA", "\xCE\xA9", "\xC3\xA6", "\xC3\xB8",
433  "\xC2\xBF", "\xC2\xA1", "\xC2\xAC", "\xE2\x88\x9A", "\xC6\x92", "\xE2\x89\x88", "\xE2\x88\x86", "\xC2\xAB", "\xC2\xBB", "\xE2\x80\xA6", "\xC2\xA0", "\xC3\x80", "\xC3\x83", "\xC3\x95", "\xC5\x92", "\xC5\x93",
434  "\xE2\x80\x93", "\xE2\x80\x94", "\xE2\x80\x9C", "\xE2\x80\x9D", "\xE2\x80\x98", "\xE2\x80\x99", "\xC3\xB7", "\xE2\x97\x8A", "\xC3\xBF", "\xC5\xB8", "\xE2\x81\x84", "\xE2\x82\xAC", "\xE2\x80\xB9", "\xE2\x80\xBA", "\xEF\xAC\x81", "\xEF\xAC\x82",
435  "\xE2\x80\xA1", "\xC2\xB7", "\xE2\x80\x9A", "\xE2\x80\x9E", "\xE2\x80\xB0", "\xC3\x82", "\xC3\x8A", "\xC3\x81", "\xC3\x8B", "\xC3\x88", "\xC3\x8D", "\xC3\x8E", "\xC3\x8F", "\xC3\x8C", "\xC3\x93", "\xC3\x94",
436  ENC_UA, "\xC3\x92", "\xC3\x9A", "\xC3\x9B", "\xC3\x99", "\xC4\xB1", "\xCB\x86", "\xCB\x9C", "\xC2\xAF", "\xCB\x98", "\xCB\x99", "\xCB\x9A", "\xC2\xB8", "\xCB\x9D", "\xCB\x9B", "\xCB\x87"
437 };
438 
439 
440 /* UTF-8 NFC sequences for upper half of Windows-1250 character set */
441 static const char* enc_windows_1250[128] =
442 {
443  "\xE2\x82\xAC", ENC_UA, "\xE2\x80\x9A", ENC_UA, "\xE2\x80\x9E", "\xE2\x80\xA6", "\xE2\x80\xA0", "\xE2\x80\xA1", ENC_UA, "\xE2\x80\xB0", "\xC5\xA0", "\xE2\x80\xB9", "\xC5\x9A", "\xC5\xA4", "\xC5\xBD", "\xC5\xB9",
444  ENC_UA, "\xE2\x80\x98", "\xE2\x80\x99", "\xE2\x80\x9C", "\xE2\x80\x9D", "\xE2\x80\xA2", "\xE2\x80\x93", "\xE2\x80\x94", ENC_UA, "\xE2\x84\xA2", "\xC5\xA1", "\xE2\x80\xBA", "\xC5\x9B", "\xC5\xA5", "\xC5\xBE", "\xC5\xBA",
445  "\xC2\xA0", "\xCB\x87", "\xCB\x98", "\xC5\x81", "\xC2\xA4", "\xC4\x84", "\xC2\xA6", "\xC2\xA7", "\xC2\xA8", "\xC2\xA9", "\xC5\x9E", "\xC2\xAB", "\xC2\xAC", "\xC2\xAD", "\xC2\xAE", "\xC5\xBB",
446  "\xC2\xB0", "\xC2\xB1", "\xCB\x9B", "\xC5\x82", "\xC2\xB4", "\xC2\xB5", "\xC2\xB6", "\xC2\xB7", "\xC2\xB8", "\xC4\x85", "\xC5\x9F", "\xC2\xBB", "\xC4\xBD", "\xCB\x9D", "\xC4\xBE", "\xC5\xBC",
447  "\xC5\x94", "\xC3\x81", "\xC3\x82", "\xC4\x82", "\xC3\x84", "\xC4\xB9", "\xC4\x86", "\xC3\x87", "\xC4\x8C", "\xC3\x89", "\xC4\x98", "\xC3\x8B", "\xC4\x9A", "\xC3\x8D", "\xC3\x8E", "\xC4\x8E",
448  "\xC4\x90", "\xC5\x83", "\xC5\x87", "\xC3\x93", "\xC3\x94", "\xC5\x90", "\xC3\x96", "\xC3\x97", "\xC5\x98", "\xC5\xAE", "\xC3\x9A", "\xC5\xB0", "\xC3\x9C", "\xC3\x9D", "\xC5\xA2", "\xC3\x9F",
449  "\xC5\x95", "\xC3\xA1", "\xC3\xA2", "\xC4\x83", "\xC3\xA4", "\xC4\xBA", "\xC4\x87", "\xC3\xA7", "\xC4\x8D", "\xC3\xA9", "\xC4\x99", "\xC3\xAB", "\xC4\x9B", "\xC3\xAD", "\xC3\xAE", "\xC4\x8F",
450  "\xC4\x91", "\xC5\x84", "\xC5\x88", "\xC3\xB3", "\xC3\xB4", "\xC5\x91", "\xC3\xB6", "\xC3\xB7", "\xC5\x99", "\xC5\xAF", "\xC3\xBA", "\xC5\xB1", "\xC3\xBC", "\xC3\xBD", "\xC5\xA3", "\xCB\x99"
451 };
452 
453 /* UTF-8 NFC sequences for upper half of Windows-1251 character set */
454 static const char* enc_windows_1251[128] =
455 {
456  "\xD0\x82", "\xD0\x83", "\xE2\x80\x9A", "\xD1\x93", "\xE2\x80\x9E", "\xE2\x80\xA6", "\xE2\x80\xA0", "\xE2\x80\xA1", "\xE2\x82\xAC", "\xE2\x80\xB0", "\xD0\x89", "\xE2\x80\xB9", "\xD0\x8A", "\xD0\x8C", "\xD0\x8B", "\xD0\x8F",
457  "\xD1\x92", "\xE2\x80\x98", "\xE2\x80\x99", "\xE2\x80\x9C", "\xE2\x80\x9D", "\xE2\x80\xA2", "\xE2\x80\x93", "\xE2\x80\x94", ENC_UA, "\xE2\x84\xA2", "\xD1\x99", "\xE2\x80\xBA", "\xD1\x9A", "\xD1\x9C", "\xD1\x9B", "\xD1\x9F",
458  "\xC2\xA0", "\xD0\x8E", "\xD1\x9E", "\xD0\x88", "\xC2\xA4", "\xD2\x90", "\xC2\xA6", "\xC2\xA7", "\xD0\x81", "\xC2\xA9", "\xD0\x84", "\xC2\xAB", "\xC2\xAC", "\xC2\xAD", "\xC2\xAE", "\xD0\x87",
459  "\xC2\xB0", "\xC2\xB1", "\xD0\x86", "\xD1\x96", "\xD2\x91", "\xC2\xB5", "\xC2\xB6", "\xC2\xB7", "\xD1\x91", "\xE2\x84\x96", "\xD1\x94", "\xC2\xBB", "\xD1\x98", "\xD0\x85", "\xD1\x95", "\xD1\x97",
460  "\xD0\x90", "\xD0\x91", "\xD0\x92", "\xD0\x93", "\xD0\x94", "\xD0\x95", "\xD0\x96", "\xD0\x97", "\xD0\x98", "\xD0\x99", "\xD0\x9A", "\xD0\x9B", "\xD0\x9C", "\xD0\x9D", "\xD0\x9E", "\xD0\x9F",
461  "\xD0\xA0", "\xD0\xA1", "\xD0\xA2", "\xD0\xA3", "\xD0\xA4", "\xD0\xA5", "\xD0\xA6", "\xD0\xA7", "\xD0\xA8", "\xD0\xA9", "\xD0\xAA", "\xD0\xAB", "\xD0\xAC", "\xD0\xAD", "\xD0\xAE", "\xD0\xAF",
462  "\xD0\xB0", "\xD0\xB1", "\xD0\xB2", "\xD0\xB3", "\xD0\xB4", "\xD0\xB5", "\xD0\xB6", "\xD0\xB7", "\xD0\xB8", "\xD0\xB9", "\xD0\xBA", "\xD0\xBB", "\xD0\xBC", "\xD0\xBD", "\xD0\xBE", "\xD0\xBF",
463  "\xD1\x80", "\xD1\x81", "\xD1\x82", "\xD1\x83", "\xD1\x84", "\xD1\x85", "\xD1\x86", "\xD1\x87", "\xD1\x88", "\xD1\x89", "\xD1\x8A", "\xD1\x8B", "\xD1\x8C", "\xD1\x8D", "\xD1\x8E", "\xD1\x8F"
464 };
465 
466 /* UTF-8 NFC sequences for upper half of Windows-1252 character set */
467 static const char* enc_windows_1252[128] =
468 {
469  "\xE2\x82\xAC", ENC_UA, "\xE2\x80\x9A", "\xC6\x92", "\xE2\x80\x9E", "\xE2\x80\xA6", "\xE2\x80\xA0", "\xE2\x80\xA1", "\xCB\x86", "\xE2\x80\xB0", "\xC5\xA0", "\xE2\x80\xB9", "\xC5\x92", ENC_UA, "\xC5\xBD", ENC_UA,
470  ENC_UA, "\xE2\x80\x98", "\xE2\x80\x99", "\xE2\x80\x9C", "\xE2\x80\x9D", "\xE2\x80\xA2", "\xE2\x80\x93", "\xE2\x80\x94", "\xCB\x9C", "\xE2\x84\xA2", "\xC5\xA1", "\xE2\x80\xBA", "\xC5\x93", ENC_UA, "\xC5\xBE", "\xC5\xB8",
471  "\xC2\xA0", "\xC2\xA1", "\xC2\xA2", "\xC2\xA3", "\xC2\xA4", "\xC2\xA5", "\xC2\xA6", "\xC2\xA7", "\xC2\xA8", "\xC2\xA9", "\xC2\xAA", "\xC2\xAB", "\xC2\xAC", "\xC2\xAD", "\xC2\xAE", "\xC2\xAF",
472  "\xC2\xB0", "\xC2\xB1", "\xC2\xB2", "\xC2\xB3", "\xC2\xB4", "\xC2\xB5", "\xC2\xB6", "\xC2\xB7", "\xC2\xB8", "\xC2\xB9", "\xC2\xBA", "\xC2\xBB", "\xC2\xBC", "\xC2\xBD", "\xC2\xBE", "\xC2\xBF",
473  "\xC3\x80", "\xC3\x81", "\xC3\x82", "\xC3\x83", "\xC3\x84", "\xC3\x85", "\xC3\x86", "\xC3\x87", "\xC3\x88", "\xC3\x89", "\xC3\x8A", "\xC3\x8B", "\xC3\x8C", "\xC3\x8D", "\xC3\x8E", "\xC3\x8F",
474  "\xC3\x90", "\xC3\x91", "\xC3\x92", "\xC3\x93", "\xC3\x94", "\xC3\x95", "\xC3\x96", "\xC3\x97", "\xC3\x98", "\xC3\x99", "\xC3\x9A", "\xC3\x9B", "\xC3\x9C", "\xC3\x9D", "\xC3\x9E", "\xC3\x9F",
475  "\xC3\xA0", "\xC3\xA1", "\xC3\xA2", "\xC3\xA3", "\xC3\xA4", "\xC3\xA5", "\xC3\xA6", "\xC3\xA7", "\xC3\xA8", "\xC3\xA9", "\xC3\xAA", "\xC3\xAB", "\xC3\xAC", "\xC3\xAD", "\xC3\xAE", "\xC3\xAF",
476  "\xC3\xB0", "\xC3\xB1", "\xC3\xB2", "\xC3\xB3", "\xC3\xB4", "\xC3\xB5", "\xC3\xB6", "\xC3\xB7", "\xC3\xB8", "\xC3\xB9", "\xC3\xBA", "\xC3\xBB", "\xC3\xBC", "\xC3\xBD", "\xC3\xBE", "\xC3\xBF"
477 };
478 
479 /* UTF-8 NFC sequences for upper half of Windows-1253 character set */
480 static const char* enc_windows_1253[128] =
481 {
482  "\xE2\x82\xAC", ENC_UA, "\xE2\x80\x9A", "\xC6\x92", "\xE2\x80\x9E", "\xE2\x80\xA6", "\xE2\x80\xA0", "\xE2\x80\xA1", ENC_UA, "\xE2\x80\xB0", ENC_UA, "\xE2\x80\xB9", ENC_UA, ENC_UA, ENC_UA, ENC_UA,
483  ENC_UA, "\xE2\x80\x98", "\xE2\x80\x99", "\xE2\x80\x9C", "\xE2\x80\x9D", "\xE2\x80\xA2", "\xE2\x80\x93", "\xE2\x80\x94", ENC_UA, "\xE2\x84\xA2", ENC_UA, "\xE2\x80\xBA", ENC_UA, ENC_UA, ENC_UA, ENC_UA,
484  "\xC2\xA0", "\xCE\x85", "\xCE\x86", "\xC2\xA3", "\xC2\xA4", "\xC2\xA5", "\xC2\xA6", "\xC2\xA7", "\xC2\xA8", "\xC2\xA9", ENC_UA, "\xC2\xAB", "\xC2\xAC", "\xC2\xAD", "\xC2\xAE", "\xE2\x80\x95",
485  "\xC2\xB0", "\xC2\xB1", "\xC2\xB2", "\xC2\xB3", "\xCE\x84", "\xC2\xB5", "\xC2\xB6", "\xC2\xB7", "\xCE\x88", "\xCE\x89", "\xCE\x8A", "\xC2\xBB", "\xCE\x8C", "\xC2\xBD", "\xCE\x8E", "\xCE\x8F",
486  "\xCE\x90", "\xCE\x91", "\xCE\x92", "\xCE\x93", "\xCE\x94", "\xCE\x95", "\xCE\x96", "\xCE\x97", "\xCE\x98", "\xCE\x99", "\xCE\x9A", "\xCE\x9B", "\xCE\x9C", "\xCE\x9D", "\xCE\x9E", "\xCE\x9F",
487  "\xCE\xA0", "\xCE\xA1", ENC_UA, "\xCE\xA3", "\xCE\xA4", "\xCE\xA5", "\xCE\xA6", "\xCE\xA7", "\xCE\xA8", "\xCE\xA9", "\xCE\xAA", "\xCE\xAB", "\xCE\xAC", "\xCE\xAD", "\xCE\xAE", "\xCE\xAF",
488  "\xCE\xB0", "\xCE\xB1", "\xCE\xB2", "\xCE\xB3", "\xCE\xB4", "\xCE\xB5", "\xCE\xB6", "\xCE\xB7", "\xCE\xB8", "\xCE\xB9", "\xCE\xBA", "\xCE\xBB", "\xCE\xBC", "\xCE\xBD", "\xCE\xBE", "\xCE\xBF",
489  "\xCF\x80", "\xCF\x81", "\xCF\x82", "\xCF\x83", "\xCF\x84", "\xCF\x85", "\xCF\x86", "\xCF\x87", "\xCF\x88", "\xCF\x89", "\xCF\x8A", "\xCF\x8B", "\xCF\x8C", "\xCF\x8D", "\xCF\x8E", ENC_UA
490 };
491 
492 /* UTF-8 NFC sequences for upper half of Windows-1254 character set */
493 static const char* enc_windows_1254[128] =
494 {
495  "\xE2\x82\xAC", ENC_UA, "\xE2\x80\x9A", "\xC6\x92", "\xE2\x80\x9E", "\xE2\x80\xA6", "\xE2\x80\xA0", "\xE2\x80\xA1", "\xCB\x86", "\xE2\x80\xB0", "\xC5\xA0", "\xE2\x80\xB9", "\xC5\x92", ENC_UA, ENC_UA, ENC_UA,
496  ENC_UA, "\xE2\x80\x98", "\xE2\x80\x99", "\xE2\x80\x9C", "\xE2\x80\x9D", "\xE2\x80\xA2", "\xE2\x80\x93", "\xE2\x80\x94", "\xCB\x9C", "\xE2\x84\xA2", "\xC5\xA1", "\xE2\x80\xBA", "\xC5\x93", ENC_UA, ENC_UA, "\xC5\xB8",
497  "\xC2\xA0", "\xC2\xA1", "\xC2\xA2", "\xC2\xA3", "\xC2\xA4", "\xC2\xA5", "\xC2\xA6", "\xC2\xA7", "\xC2\xA8", "\xC2\xA9", "\xC2\xAA", "\xC2\xAB", "\xC2\xAC", "\xC2\xAD", "\xC2\xAE", "\xC2\xAF",
498  "\xC2\xB0", "\xC2\xB1", "\xC2\xB2", "\xC2\xB3", "\xC2\xB4", "\xC2\xB5", "\xC2\xB6", "\xC2\xB7", "\xC2\xB8", "\xC2\xB9", "\xC2\xBA", "\xC2\xBB", "\xC2\xBC", "\xC2\xBD", "\xC2\xBE", "\xC2\xBF",
499  "\xC3\x80", "\xC3\x81", "\xC3\x82", "\xC3\x83", "\xC3\x84", "\xC3\x85", "\xC3\x86", "\xC3\x87", "\xC3\x88", "\xC3\x89", "\xC3\x8A", "\xC3\x8B", "\xC3\x8C", "\xC3\x8D", "\xC3\x8E", "\xC3\x8F",
500  "\xC4\x9E", "\xC3\x91", "\xC3\x92", "\xC3\x93", "\xC3\x94", "\xC3\x95", "\xC3\x96", "\xC3\x97", "\xC3\x98", "\xC3\x99", "\xC3\x9A", "\xC3\x9B", "\xC3\x9C", "\xC4\xB0", "\xC5\x9E", "\xC3\x9F",
501  "\xC3\xA0", "\xC3\xA1", "\xC3\xA2", "\xC3\xA3", "\xC3\xA4", "\xC3\xA5", "\xC3\xA6", "\xC3\xA7", "\xC3\xA8", "\xC3\xA9", "\xC3\xAA", "\xC3\xAB", "\xC3\xAC", "\xC3\xAD", "\xC3\xAE", "\xC3\xAF",
502  "\xC4\x9F", "\xC3\xB1", "\xC3\xB2", "\xC3\xB3", "\xC3\xB4", "\xC3\xB5", "\xC3\xB6", "\xC3\xB7", "\xC3\xB8", "\xC3\xB9", "\xC3\xBA", "\xC3\xBB", "\xC3\xBC", "\xC4\xB1", "\xC5\x9F", "\xC3\xBF"
503 };
504 
505 /* UTF-8 NFC sequences for upper half of Windows-1255 character set */
506 static const char* enc_windows_1255[128] =
507 {
508  "\xE2\x82\xAC", ENC_UA, "\xE2\x80\x9A", "\xC6\x92", "\xE2\x80\x9E", "\xE2\x80\xA6", "\xE2\x80\xA0", "\xE2\x80\xA1", "\xCB\x86", "\xE2\x80\xB0", ENC_UA, "\xE2\x80\xB9", ENC_UA, ENC_UA, ENC_UA, ENC_UA,
509  ENC_UA, "\xE2\x80\x98", "\xE2\x80\x99", "\xE2\x80\x9C", "\xE2\x80\x9D", "\xE2\x80\xA2", "\xE2\x80\x93", "\xE2\x80\x94", "\xCB\x9C", "\xE2\x84\xA2", ENC_UA, "\xE2\x80\xBA", ENC_UA, ENC_UA, ENC_UA, ENC_UA,
510  "\xC2\xA0", "\xC2\xA1", "\xC2\xA2", "\xC2\xA3", "\xE2\x82\xAA", "\xC2\xA5", "\xC2\xA6", "\xC2\xA7", "\xC2\xA8", "\xC2\xA9", "\xC3\x97", "\xC2\xAB", "\xC2\xAC", "\xC2\xAD", "\xC2\xAE", "\xC2\xAF",
511  "\xC2\xB0", "\xC2\xB1", "\xC2\xB2", "\xC2\xB3", "\xC2\xB4", "\xC2\xB5", "\xC2\xB6", "\xC2\xB7", "\xC2\xB8", "\xC2\xB9", "\xC3\xB7", "\xC2\xBB", "\xC2\xBC", "\xC2\xBD", "\xC2\xBE", "\xC2\xBF",
512  "\xD6\xB0", "\xD6\xB1", "\xD6\xB2", "\xD6\xB3", "\xD6\xB4", "\xD6\xB5", "\xD6\xB6", "\xD6\xB7", "\xD6\xB8", "\xD6\xB9", "\xD6\xBA", "\xD6\xBB", "\xD6\xBC", "\xD6\xBD", "\xD6\xBE", "\xD6\xBF",
513  "\xD7\x80", "\xD7\x81", "\xD7\x82", "\xD7\x83", "\xD7\xB0", "\xD7\xB1", "\xD7\xB2", "\xD7\xB3", "\xD7\xB4", ENC_UA, ENC_UA, ENC_UA, ENC_UA, ENC_UA, ENC_UA, ENC_UA,
514  "\xD7\x90", "\xD7\x91", "\xD7\x92", "\xD7\x93", "\xD7\x94", "\xD7\x95", "\xD7\x96", "\xD7\x97", "\xD7\x98", "\xD7\x99", "\xD7\x9A", "\xD7\x9B", "\xD7\x9C", "\xD7\x9D", "\xD7\x9E", "\xD7\x9F",
515  "\xD7\xA0", "\xD7\xA1", "\xD7\xA2", "\xD7\xA3", "\xD7\xA4", "\xD7\xA5", "\xD7\xA6", "\xD7\xA7", "\xD7\xA8", "\xD7\xA9", "\xD7\xAA", ENC_UA, ENC_UA, "\xE2\x80\x8E", "\xE2\x80\x8F", ENC_UA
516 };
517 
/*
 * Windows-1256, upper half: UTF-8 NFC byte sequence for every octet
 * 0x80 .. 0xFF (entry i belongs to octet 0x80 + i)
 */
static const char *enc_windows_1256[128] =
{
   /* 0x80 - 0x8F */
   "\xE2\x82\xAC", "\xD9\xBE", "\xE2\x80\x9A", "\xC6\x92", "\xE2\x80\x9E", "\xE2\x80\xA6", "\xE2\x80\xA0", "\xE2\x80\xA1",
   "\xCB\x86", "\xE2\x80\xB0", "\xD9\xB9", "\xE2\x80\xB9", "\xC5\x92", "\xDA\x86", "\xDA\x98", "\xDA\x88",
   /* 0x90 - 0x9F */
   "\xDA\xAF", "\xE2\x80\x98", "\xE2\x80\x99", "\xE2\x80\x9C", "\xE2\x80\x9D", "\xE2\x80\xA2", "\xE2\x80\x93", "\xE2\x80\x94",
   "\xDA\xA9", "\xE2\x84\xA2", "\xDA\x91", "\xE2\x80\xBA", "\xC5\x93", "\xE2\x80\x8C", "\xE2\x80\x8D", "\xDA\xBA",
   /* 0xA0 - 0xAF */
   "\xC2\xA0", "\xD8\x8C", "\xC2\xA2", "\xC2\xA3", "\xC2\xA4", "\xC2\xA5", "\xC2\xA6", "\xC2\xA7",
   "\xC2\xA8", "\xC2\xA9", "\xDA\xBE", "\xC2\xAB", "\xC2\xAC", "\xC2\xAD", "\xC2\xAE", "\xC2\xAF",
   /* 0xB0 - 0xBF */
   "\xC2\xB0", "\xC2\xB1", "\xC2\xB2", "\xC2\xB3", "\xC2\xB4", "\xC2\xB5", "\xC2\xB6", "\xC2\xB7",
   "\xC2\xB8", "\xC2\xB9", "\xD8\x9B", "\xC2\xBB", "\xC2\xBC", "\xC2\xBD", "\xC2\xBE", "\xD8\x9F",
   /* 0xC0 - 0xCF */
   "\xDB\x81", "\xD8\xA1", "\xD8\xA2", "\xD8\xA3", "\xD8\xA4", "\xD8\xA5", "\xD8\xA6", "\xD8\xA7",
   "\xD8\xA8", "\xD8\xA9", "\xD8\xAA", "\xD8\xAB", "\xD8\xAC", "\xD8\xAD", "\xD8\xAE", "\xD8\xAF",
   /* 0xD0 - 0xDF */
   "\xD8\xB0", "\xD8\xB1", "\xD8\xB2", "\xD8\xB3", "\xD8\xB4", "\xD8\xB5", "\xD8\xB6", "\xC3\x97",
   "\xD8\xB7", "\xD8\xB8", "\xD8\xB9", "\xD8\xBA", "\xD9\x80", "\xD9\x81", "\xD9\x82", "\xD9\x83",
   /* 0xE0 - 0xEF */
   "\xC3\xA0", "\xD9\x84", "\xC3\xA2", "\xD9\x85", "\xD9\x86", "\xD9\x87", "\xD9\x88", "\xC3\xA7",
   "\xC3\xA8", "\xC3\xA9", "\xC3\xAA", "\xC3\xAB", "\xD9\x89", "\xD9\x8A", "\xC3\xAE", "\xC3\xAF",
   /* 0xF0 - 0xFF */
   "\xD9\x8B", "\xD9\x8C", "\xD9\x8D", "\xD9\x8E", "\xC3\xB4", "\xD9\x8F", "\xD9\x90", "\xC3\xB7",
   "\xD9\x91", "\xC3\xB9", "\xD9\x92", "\xC3\xBB", "\xC3\xBC", "\xE2\x80\x8E", "\xE2\x80\x8F", "\xDB\x92"
};
530 
531 /* UTF-8 NFC sequences for upper half of Windows-1257 character set */
532 static const char* enc_windows_1257[128] =
533 {
534  "\xE2\x82\xAC", ENC_UA, "\xE2\x80\x9A", ENC_UA, "\xE2\x80\x9E", "\xE2\x80\xA6", "\xE2\x80\xA0", "\xE2\x80\xA1", ENC_UA, "\xE2\x80\xB0", ENC_UA, "\xE2\x80\xB9", ENC_UA, "\xC2\xA8", "\xCB\x87", "\xC2\xB8",
535  ENC_UA, "\xE2\x80\x98", "\xE2\x80\x99", "\xE2\x80\x9C", "\xE2\x80\x9D", "\xE2\x80\xA2", "\xE2\x80\x93", "\xE2\x80\x94", ENC_UA, "\xE2\x84\xA2", ENC_UA, "\xE2\x80\xBA", ENC_UA, "\xC2\xAF", "\xCB\x9B", ENC_UA,
536  "\xC2\xA0", ENC_UA, "\xC2\xA2", "\xC2\xA3", "\xC2\xA4", ENC_UA, "\xC2\xA6", "\xC2\xA7", "\xC3\x98", "\xC2\xA9", "\xC5\x96", "\xC2\xAB", "\xC2\xAC", "\xC2\xAD", "\xC2\xAE", "\xC3\x86",
537  "\xC2\xB0", "\xC2\xB1", "\xC2\xB2", "\xC2\xB3", "\xC2\xB4", "\xC2\xB5", "\xC2\xB6", "\xC2\xB7", "\xC3\xB8", "\xC2\xB9", "\xC5\x97", "\xC2\xBB", "\xC2\xBC", "\xC2\xBD", "\xC2\xBE", "\xC3\xA6",
538  "\xC4\x84", "\xC4\xAE", "\xC4\x80", "\xC4\x86", "\xC3\x84", "\xC3\x85", "\xC4\x98", "\xC4\x92", "\xC4\x8C", "\xC3\x89", "\xC5\xB9", "\xC4\x96", "\xC4\xA2", "\xC4\xB6", "\xC4\xAA", "\xC4\xBB",
539  "\xC5\xA0", "\xC5\x83", "\xC5\x85", "\xC3\x93", "\xC5\x8C", "\xC3\x95", "\xC3\x96", "\xC3\x97", "\xC5\xB2", "\xC5\x81", "\xC5\x9A", "\xC5\xAA", "\xC3\x9C", "\xC5\xBB", "\xC5\xBD", "\xC3\x9F",
540  "\xC4\x85", "\xC4\xAF", "\xC4\x81", "\xC4\x87", "\xC3\xA4", "\xC3\xA5", "\xC4\x99", "\xC4\x93", "\xC4\x8D", "\xC3\xA9", "\xC5\xBA", "\xC4\x97", "\xC4\xA3", "\xC4\xB7", "\xC4\xAB", "\xC4\xBC",
541  "\xC5\xA1", "\xC5\x84", "\xC5\x86", "\xC3\xB3", "\xC5\x8D", "\xC3\xB5", "\xC3\xB6", "\xC3\xB7", "\xC5\xB3", "\xC5\x82", "\xC5\x9B", "\xC5\xAB", "\xC3\xBC", "\xC5\xBC", "\xC5\xBE", "\xCB\x99"
542 };
543 
544 /* UTF-8 NFC sequences for upper half of Windows-1258 character set */
545 static const char* enc_windows_1258[128] =
546 {
547  "\xE2\x82\xAC", ENC_UA, "\xE2\x80\x9A", "\xC6\x92", "\xE2\x80\x9E", "\xE2\x80\xA6", "\xE2\x80\xA0", "\xE2\x80\xA1", "\xCB\x86", "\xE2\x80\xB0", ENC_UA, "\xE2\x80\xB9", "\xC5\x92", ENC_UA, ENC_UA, ENC_UA,
548  ENC_UA, "\xE2\x80\x98", "\xE2\x80\x99", "\xE2\x80\x9C", "\xE2\x80\x9D", "\xE2\x80\xA2", "\xE2\x80\x93", "\xE2\x80\x94", "\xCB\x9C", "\xE2\x84\xA2", ENC_UA, "\xE2\x80\xBA", "\xC5\x93", ENC_UA, ENC_UA, "\xC5\xB8",
549  "\xC2\xA0", "\xC2\xA1", "\xC2\xA2", "\xC2\xA3", "\xC2\xA4", "\xC2\xA5", "\xC2\xA6", "\xC2\xA7", "\xC2\xA8", "\xC2\xA9", "\xC2\xAA", "\xC2\xAB", "\xC2\xAC", "\xC2\xAD", "\xC2\xAE", "\xC2\xAF",
550  "\xC2\xB0", "\xC2\xB1", "\xC2\xB2", "\xC2\xB3", "\xC2\xB4", "\xC2\xB5", "\xC2\xB6", "\xC2\xB7", "\xC2\xB8", "\xC2\xB9", "\xC2\xBA", "\xC2\xBB", "\xC2\xBC", "\xC2\xBD", "\xC2\xBE", "\xC2\xBF",
551  "\xC3\x80", "\xC3\x81", "\xC3\x82", "\xC4\x82", "\xC3\x84", "\xC3\x85", "\xC3\x86", "\xC3\x87", "\xC3\x88", "\xC3\x89", "\xC3\x8A", "\xC3\x8B", "\xCC\x80", "\xC3\x8D", "\xC3\x8E", "\xC3\x8F",
552  "\xC4\x90", "\xC3\x91", "\xCC\x89", "\xC3\x93", "\xC3\x94", "\xC6\xA0", "\xC3\x96", "\xC3\x97", "\xC3\x98", "\xC3\x99", "\xC3\x9A", "\xC3\x9B", "\xC3\x9C", "\xC6\xAF", "\xCC\x83", "\xC3\x9F",
553  "\xC3\xA0", "\xC3\xA1", "\xC3\xA2", "\xC4\x83", "\xC3\xA4", "\xC3\xA5", "\xC3\xA6", "\xC3\xA7", "\xC3\xA8", "\xC3\xA9", "\xC3\xAA", "\xC3\xAB", "\xCC\x81", "\xC3\xAD", "\xC3\xAE", "\xC3\xAF",
554  "\xC4\x91", "\xC3\xB1", "\xCC\xA3", "\xC3\xB3", "\xC3\xB4", "\xC6\xA1", "\xC3\xB6", "\xC3\xB7", "\xC3\xB8", "\xC3\xB9", "\xC3\xBA", "\xC3\xBB", "\xC3\xBC", "\xC6\xB0", "\xE2\x82\xAB", "\xC3\xBF"
555 };
556 
/*
 * KOI8-R, upper half: UTF-8 NFC byte sequence for every octet
 * 0x80 .. 0xFF (entry i belongs to octet 0x80 + i)
 */
static const char *enc_koi8r[128] =
{
   /* 0x80 - 0x8F */
   "\xE2\x94\x80", "\xE2\x94\x82", "\xE2\x94\x8C", "\xE2\x94\x90", "\xE2\x94\x94", "\xE2\x94\x98", "\xE2\x94\x9C", "\xE2\x94\xA4",
   "\xE2\x94\xAC", "\xE2\x94\xB4", "\xE2\x94\xBC", "\xE2\x96\x80", "\xE2\x96\x84", "\xE2\x96\x88", "\xE2\x96\x8C", "\xE2\x96\x90",
   /* 0x90 - 0x9F */
   "\xE2\x96\x91", "\xE2\x96\x92", "\xE2\x96\x93", "\xE2\x8C\xA0", "\xE2\x96\xA0", "\xE2\x88\x99", "\xE2\x88\x9A", "\xE2\x89\x88",
   "\xE2\x89\xA4", "\xE2\x89\xA5", "\xC2\xA0", "\xE2\x8C\xA1", "\xC2\xB0", "\xC2\xB2", "\xC2\xB7", "\xC3\xB7",
   /* 0xA0 - 0xAF */
   "\xE2\x95\x90", "\xE2\x95\x91", "\xE2\x95\x92", "\xD1\x91", "\xE2\x95\x93", "\xE2\x95\x94", "\xE2\x95\x95", "\xE2\x95\x96",
   "\xE2\x95\x97", "\xE2\x95\x98", "\xE2\x95\x99", "\xE2\x95\x9A", "\xE2\x95\x9B", "\xE2\x95\x9C", "\xE2\x95\x9D", "\xE2\x95\x9E",
   /* 0xB0 - 0xBF */
   "\xE2\x95\x9F", "\xE2\x95\xA0", "\xE2\x95\xA1", "\xD0\x81", "\xE2\x95\xA2", "\xE2\x95\xA3", "\xE2\x95\xA4", "\xE2\x95\xA5",
   "\xE2\x95\xA6", "\xE2\x95\xA7", "\xE2\x95\xA8", "\xE2\x95\xA9", "\xE2\x95\xAA", "\xE2\x95\xAB", "\xE2\x95\xAC", "\xC2\xA9",
   /* 0xC0 - 0xCF */
   "\xD1\x8E", "\xD0\xB0", "\xD0\xB1", "\xD1\x86", "\xD0\xB4", "\xD0\xB5", "\xD1\x84", "\xD0\xB3",
   "\xD1\x85", "\xD0\xB8", "\xD0\xB9", "\xD0\xBA", "\xD0\xBB", "\xD0\xBC", "\xD0\xBD", "\xD0\xBE",
   /* 0xD0 - 0xDF */
   "\xD0\xBF", "\xD1\x8F", "\xD1\x80", "\xD1\x81", "\xD1\x82", "\xD1\x83", "\xD0\xB6", "\xD0\xB2",
   "\xD1\x8C", "\xD1\x8B", "\xD0\xB7", "\xD1\x88", "\xD1\x8D", "\xD1\x89", "\xD1\x87", "\xD1\x8A",
   /* 0xE0 - 0xEF */
   "\xD0\xAE", "\xD0\x90", "\xD0\x91", "\xD0\xA6", "\xD0\x94", "\xD0\x95", "\xD0\xA4", "\xD0\x93",
   "\xD0\xA5", "\xD0\x98", "\xD0\x99", "\xD0\x9A", "\xD0\x9B", "\xD0\x9C", "\xD0\x9D", "\xD0\x9E",
   /* 0xF0 - 0xFF */
   "\xD0\x9F", "\xD0\xAF", "\xD0\xA0", "\xD0\xA1", "\xD0\xA2", "\xD0\xA3", "\xD0\x96", "\xD0\x92",
   "\xD0\xAC", "\xD0\xAB", "\xD0\x97", "\xD0\xA8", "\xD0\xAD", "\xD0\xA9", "\xD0\xA7", "\xD0\xAA"
};
569 
/*
 * KOI8-U, upper half: UTF-8 NFC byte sequence for every octet
 * 0x80 .. 0xFF (entry i belongs to octet 0x80 + i)
 */
static const char *enc_koi8u[128] =
{
   /* 0x80 - 0x8F */
   "\xE2\x94\x80", "\xE2\x94\x82", "\xE2\x94\x8C", "\xE2\x94\x90", "\xE2\x94\x94", "\xE2\x94\x98", "\xE2\x94\x9C", "\xE2\x94\xA4",
   "\xE2\x94\xAC", "\xE2\x94\xB4", "\xE2\x94\xBC", "\xE2\x96\x80", "\xE2\x96\x84", "\xE2\x96\x88", "\xE2\x96\x8C", "\xE2\x96\x90",
   /* 0x90 - 0x9F */
   "\xE2\x96\x91", "\xE2\x96\x92", "\xE2\x96\x93", "\xE2\x8C\xA0", "\xE2\x96\xA0", "\xE2\x88\x99", "\xE2\x88\x9A", "\xE2\x89\x88",
   "\xE2\x89\xA4", "\xE2\x89\xA5", "\xC2\xA0", "\xE2\x8C\xA1", "\xC2\xB0", "\xC2\xB2", "\xC2\xB7", "\xC3\xB7",
   /* 0xA0 - 0xAF */
   "\xE2\x95\x90", "\xE2\x95\x91", "\xE2\x95\x92", "\xD1\x91", "\xD1\x94", "\xE2\x95\x94", "\xD1\x96", "\xD1\x97",
   "\xE2\x95\x97", "\xE2\x95\x98", "\xE2\x95\x99", "\xE2\x95\x9A", "\xE2\x95\x9B", "\xD2\x91", "\xE2\x95\x9D", "\xE2\x95\x9E",
   /* 0xB0 - 0xBF */
   "\xE2\x95\x9F", "\xE2\x95\xA0", "\xE2\x95\xA1", "\xD0\x81", "\xD0\x84", "\xE2\x95\xA3", "\xD0\x86", "\xD0\x87",
   "\xE2\x95\xA6", "\xE2\x95\xA7", "\xE2\x95\xA8", "\xE2\x95\xA9", "\xE2\x95\xAA", "\xD2\x90", "\xE2\x95\xAC", "\xC2\xA9",
   /* 0xC0 - 0xCF */
   "\xD1\x8E", "\xD0\xB0", "\xD0\xB1", "\xD1\x86", "\xD0\xB4", "\xD0\xB5", "\xD1\x84", "\xD0\xB3",
   "\xD1\x85", "\xD0\xB8", "\xD0\xB9", "\xD0\xBA", "\xD0\xBB", "\xD0\xBC", "\xD0\xBD", "\xD0\xBE",
   /* 0xD0 - 0xDF */
   "\xD0\xBF", "\xD1\x8F", "\xD1\x80", "\xD1\x81", "\xD1\x82", "\xD1\x83", "\xD0\xB6", "\xD0\xB2",
   "\xD1\x8C", "\xD1\x8B", "\xD0\xB7", "\xD1\x88", "\xD1\x8D", "\xD1\x89", "\xD1\x87", "\xD1\x8A",
   /* 0xE0 - 0xEF */
   "\xD0\xAE", "\xD0\x90", "\xD0\x91", "\xD0\xA6", "\xD0\x94", "\xD0\x95", "\xD0\xA4", "\xD0\x93",
   "\xD0\xA5", "\xD0\x98", "\xD0\x99", "\xD0\x9A", "\xD0\x9B", "\xD0\x9C", "\xD0\x9D", "\xD0\x9E",
   /* 0xF0 - 0xFF */
   "\xD0\x9F", "\xD0\xAF", "\xD0\xA0", "\xD0\xA1", "\xD0\xA2", "\xD0\xA3", "\xD0\x96", "\xD0\x92",
   "\xD0\xAC", "\xD0\xAB", "\xD0\x97", "\xD0\xA8", "\xD0\xAD", "\xD0\xA9", "\xD0\xA7", "\xD0\xAA"
};
582 
/*
 * IBM437, upper half: UTF-8 NFC byte sequence for every octet
 * 0x80 .. 0xFF (entry i belongs to octet 0x80 + i)
 */
static const char *enc_ibm437[128] =
{
   /* 0x80 - 0x8F */
   "\xC3\x87", "\xC3\xBC", "\xC3\xA9", "\xC3\xA2", "\xC3\xA4", "\xC3\xA0", "\xC3\xA5", "\xC3\xA7",
   "\xC3\xAA", "\xC3\xAB", "\xC3\xA8", "\xC3\xAF", "\xC3\xAE", "\xC3\xAC", "\xC3\x84", "\xC3\x85",
   /* 0x90 - 0x9F */
   "\xC3\x89", "\xC3\xA6", "\xC3\x86", "\xC3\xB4", "\xC3\xB6", "\xC3\xB2", "\xC3\xBB", "\xC3\xB9",
   "\xC3\xBF", "\xC3\x96", "\xC3\x9C", "\xC2\xA2", "\xC2\xA3", "\xC2\xA5", "\xE2\x82\xA7", "\xC6\x92",
   /* 0xA0 - 0xAF */
   "\xC3\xA1", "\xC3\xAD", "\xC3\xB3", "\xC3\xBA", "\xC3\xB1", "\xC3\x91", "\xC2\xAA", "\xC2\xBA",
   "\xC2\xBF", "\xE2\x8C\x90", "\xC2\xAC", "\xC2\xBD", "\xC2\xBC", "\xC2\xA1", "\xC2\xAB", "\xC2\xBB",
   /* 0xB0 - 0xBF */
   "\xE2\x96\x91", "\xE2\x96\x92", "\xE2\x96\x93", "\xE2\x94\x82", "\xE2\x94\xA4", "\xE2\x95\xA1", "\xE2\x95\xA2", "\xE2\x95\x96",
   "\xE2\x95\x95", "\xE2\x95\xA3", "\xE2\x95\x91", "\xE2\x95\x97", "\xE2\x95\x9D", "\xE2\x95\x9C", "\xE2\x95\x9B", "\xE2\x94\x90",
   /* 0xC0 - 0xCF */
   "\xE2\x94\x94", "\xE2\x94\xB4", "\xE2\x94\xAC", "\xE2\x94\x9C", "\xE2\x94\x80", "\xE2\x94\xBC", "\xE2\x95\x9E", "\xE2\x95\x9F",
   "\xE2\x95\x9A", "\xE2\x95\x94", "\xE2\x95\xA9", "\xE2\x95\xA6", "\xE2\x95\xA0", "\xE2\x95\x90", "\xE2\x95\xAC", "\xE2\x95\xA7",
   /* 0xD0 - 0xDF */
   "\xE2\x95\xA8", "\xE2\x95\xA4", "\xE2\x95\xA5", "\xE2\x95\x99", "\xE2\x95\x98", "\xE2\x95\x92", "\xE2\x95\x93", "\xE2\x95\xAB",
   "\xE2\x95\xAA", "\xE2\x94\x98", "\xE2\x94\x8C", "\xE2\x96\x88", "\xE2\x96\x84", "\xE2\x96\x8C", "\xE2\x96\x90", "\xE2\x96\x80",
   /* 0xE0 - 0xEF */
   "\xCE\xB1", "\xC3\x9F", "\xCE\x93", "\xCF\x80", "\xCE\xA3", "\xCF\x83", "\xC2\xB5", "\xCF\x84",
   "\xCE\xA6", "\xCE\x98", "\xCE\xA9", "\xCE\xB4", "\xE2\x88\x9E", "\xCF\x86", "\xCE\xB5", "\xE2\x88\xA9",
   /* 0xF0 - 0xFF */
   "\xE2\x89\xA1", "\xC2\xB1", "\xE2\x89\xA5", "\xE2\x89\xA4", "\xE2\x8C\xA0", "\xE2\x8C\xA1", "\xC3\xB7", "\xE2\x89\x88",
   "\xC2\xB0", "\xE2\x88\x99", "\xC2\xB7", "\xE2\x88\x9A", "\xE2\x81\xBF", "\xC2\xB2", "\xE2\x96\xA0", "\xC2\xA0"
};
595 
/*
 * IBM775, upper half: UTF-8 NFC byte sequence for every octet
 * 0x80 .. 0xFF (entry i belongs to octet 0x80 + i)
 */
static const char *enc_ibm775[128] =
{
   /* 0x80 - 0x8F */
   "\xC4\x86", "\xC3\xBC", "\xC3\xA9", "\xC4\x81", "\xC3\xA4", "\xC4\xA3", "\xC3\xA5", "\xC4\x87",
   "\xC5\x82", "\xC4\x93", "\xC5\x96", "\xC5\x97", "\xC4\xAB", "\xC5\xB9", "\xC3\x84", "\xC3\x85",
   /* 0x90 - 0x9F */
   "\xC3\x89", "\xC3\xA6", "\xC3\x86", "\xC5\x8D", "\xC3\xB6", "\xC4\xA2", "\xC2\xA2", "\xC5\x9A",
   "\xC5\x9B", "\xC3\x96", "\xC3\x9C", "\xC3\xB8", "\xC2\xA3", "\xC3\x98", "\xC3\x97", "\xC2\xA4",
   /* 0xA0 - 0xAF */
   "\xC4\x80", "\xC4\xAA", "\xC3\xB3", "\xC5\xBB", "\xC5\xBC", "\xC5\xBA", "\xE2\x80\x9D", "\xC2\xA6",
   "\xC2\xA9", "\xC2\xAE", "\xC2\xAC", "\xC2\xBD", "\xC2\xBC", "\xC5\x81", "\xC2\xAB", "\xC2\xBB",
   /* 0xB0 - 0xBF */
   "\xE2\x96\x91", "\xE2\x96\x92", "\xE2\x96\x93", "\xE2\x94\x82", "\xE2\x94\xA4", "\xC4\x84", "\xC4\x8C", "\xC4\x98",
   "\xC4\x96", "\xE2\x95\xA3", "\xE2\x95\x91", "\xE2\x95\x97", "\xE2\x95\x9D", "\xC4\xAE", "\xC5\xA0", "\xE2\x94\x90",
   /* 0xC0 - 0xCF */
   "\xE2\x94\x94", "\xE2\x94\xB4", "\xE2\x94\xAC", "\xE2\x94\x9C", "\xE2\x94\x80", "\xE2\x94\xBC", "\xC5\xB2", "\xC5\xAA",
   "\xE2\x95\x9A", "\xE2\x95\x94", "\xE2\x95\xA9", "\xE2\x95\xA6", "\xE2\x95\xA0", "\xE2\x95\x90", "\xE2\x95\xAC", "\xC5\xBD",
   /* 0xD0 - 0xDF */
   "\xC4\x85", "\xC4\x8D", "\xC4\x99", "\xC4\x97", "\xC4\xAF", "\xC5\xA1", "\xC5\xB3", "\xC5\xAB",
   "\xC5\xBE", "\xE2\x94\x98", "\xE2\x94\x8C", "\xE2\x96\x88", "\xE2\x96\x84", "\xE2\x96\x8C", "\xE2\x96\x90", "\xE2\x96\x80",
   /* 0xE0 - 0xEF */
   "\xC3\x93", "\xC3\x9F", "\xC5\x8C", "\xC5\x83", "\xC3\xB5", "\xC3\x95", "\xC2\xB5", "\xC5\x84",
   "\xC4\xB6", "\xC4\xB7", "\xC4\xBB", "\xC4\xBC", "\xC5\x86", "\xC4\x92", "\xC5\x85", "\xE2\x80\x99",
   /* 0xF0 - 0xFF */
   "\xC2\xAD", "\xC2\xB1", "\xE2\x80\x9C", "\xC2\xBE", "\xC2\xB6", "\xC2\xA7", "\xC3\xB7", "\xE2\x80\x9E",
   "\xC2\xB0", "\xE2\x88\x99", "\xC2\xB7", "\xC2\xB9", "\xC2\xB3", "\xC2\xB2", "\xE2\x96\xA0", "\xC2\xA0"
};
608 
/*
 * IBM850, upper half: UTF-8 NFC byte sequence for every octet
 * 0x80 .. 0xFF (entry i belongs to octet 0x80 + i)
 */
static const char *enc_ibm850[128] =
{
   /* 0x80 - 0x8F */
   "\xC3\x87", "\xC3\xBC", "\xC3\xA9", "\xC3\xA2", "\xC3\xA4", "\xC3\xA0", "\xC3\xA5", "\xC3\xA7",
   "\xC3\xAA", "\xC3\xAB", "\xC3\xA8", "\xC3\xAF", "\xC3\xAE", "\xC3\xAC", "\xC3\x84", "\xC3\x85",
   /* 0x90 - 0x9F */
   "\xC3\x89", "\xC3\xA6", "\xC3\x86", "\xC3\xB4", "\xC3\xB6", "\xC3\xB2", "\xC3\xBB", "\xC3\xB9",
   "\xC3\xBF", "\xC3\x96", "\xC3\x9C", "\xC3\xB8", "\xC2\xA3", "\xC3\x98", "\xC3\x97", "\xC6\x92",
   /* 0xA0 - 0xAF */
   "\xC3\xA1", "\xC3\xAD", "\xC3\xB3", "\xC3\xBA", "\xC3\xB1", "\xC3\x91", "\xC2\xAA", "\xC2\xBA",
   "\xC2\xBF", "\xC2\xAE", "\xC2\xAC", "\xC2\xBD", "\xC2\xBC", "\xC2\xA1", "\xC2\xAB", "\xC2\xBB",
   /* 0xB0 - 0xBF */
   "\xE2\x96\x91", "\xE2\x96\x92", "\xE2\x96\x93", "\xE2\x94\x82", "\xE2\x94\xA4", "\xC3\x81", "\xC3\x82", "\xC3\x80",
   "\xC2\xA9", "\xE2\x95\xA3", "\xE2\x95\x91", "\xE2\x95\x97", "\xE2\x95\x9D", "\xC2\xA2", "\xC2\xA5", "\xE2\x94\x90",
   /* 0xC0 - 0xCF */
   "\xE2\x94\x94", "\xE2\x94\xB4", "\xE2\x94\xAC", "\xE2\x94\x9C", "\xE2\x94\x80", "\xE2\x94\xBC", "\xC3\xA3", "\xC3\x83",
   "\xE2\x95\x9A", "\xE2\x95\x94", "\xE2\x95\xA9", "\xE2\x95\xA6", "\xE2\x95\xA0", "\xE2\x95\x90", "\xE2\x95\xAC", "\xC2\xA4",
   /* 0xD0 - 0xDF */
   "\xC3\xB0", "\xC3\x90", "\xC3\x8A", "\xC3\x8B", "\xC3\x88", "\xC4\xB1", "\xC3\x8D", "\xC3\x8E",
   "\xC3\x8F", "\xE2\x94\x98", "\xE2\x94\x8C", "\xE2\x96\x88", "\xE2\x96\x84", "\xC2\xA6", "\xC3\x8C", "\xE2\x96\x80",
   /* 0xE0 - 0xEF */
   "\xC3\x93", "\xC3\x9F", "\xC3\x94", "\xC3\x92", "\xC3\xB5", "\xC3\x95", "\xC2\xB5", "\xC3\xBE",
   "\xC3\x9E", "\xC3\x9A", "\xC3\x9B", "\xC3\x99", "\xC3\xBD", "\xC3\x9D", "\xC2\xAF", "\xC2\xB4",
   /* 0xF0 - 0xFF */
   "\xC2\xAD", "\xC2\xB1", "\xE2\x80\x97", "\xC2\xBE", "\xC2\xB6", "\xC2\xA7", "\xC3\xB7", "\xC2\xB8",
   "\xC2\xB0", "\xC2\xA8", "\xC2\xB7", "\xC2\xB9", "\xC2\xB3", "\xC2\xB2", "\xE2\x96\xA0", "\xC2\xA0"
};
621 
/*
 * IBM852, upper half: UTF-8 NFC byte sequence for every octet
 * 0x80 .. 0xFF (entry i belongs to octet 0x80 + i)
 */
static const char *enc_ibm852[128] =
{
   /* 0x80 - 0x8F */
   "\xC3\x87", "\xC3\xBC", "\xC3\xA9", "\xC3\xA2", "\xC3\xA4", "\xC5\xAF", "\xC4\x87", "\xC3\xA7",
   "\xC5\x82", "\xC3\xAB", "\xC5\x90", "\xC5\x91", "\xC3\xAE", "\xC5\xB9", "\xC3\x84", "\xC4\x86",
   /* 0x90 - 0x9F */
   "\xC3\x89", "\xC4\xB9", "\xC4\xBA", "\xC3\xB4", "\xC3\xB6", "\xC4\xBD", "\xC4\xBE", "\xC5\x9A",
   "\xC5\x9B", "\xC3\x96", "\xC3\x9C", "\xC5\xA4", "\xC5\xA5", "\xC5\x81", "\xC3\x97", "\xC4\x8D",
   /* 0xA0 - 0xAF */
   "\xC3\xA1", "\xC3\xAD", "\xC3\xB3", "\xC3\xBA", "\xC4\x84", "\xC4\x85", "\xC5\xBD", "\xC5\xBE",
   "\xC4\x98", "\xC4\x99", "\xC2\xAC", "\xC5\xBA", "\xC4\x8C", "\xC5\x9F", "\xC2\xAB", "\xC2\xBB",
   /* 0xB0 - 0xBF */
   "\xE2\x96\x91", "\xE2\x96\x92", "\xE2\x96\x93", "\xE2\x94\x82", "\xE2\x94\xA4", "\xC3\x81", "\xC3\x82", "\xC4\x9A",
   "\xC5\x9E", "\xE2\x95\xA3", "\xE2\x95\x91", "\xE2\x95\x97", "\xE2\x95\x9D", "\xC5\xBB", "\xC5\xBC", "\xE2\x94\x90",
   /* 0xC0 - 0xCF */
   "\xE2\x94\x94", "\xE2\x94\xB4", "\xE2\x94\xAC", "\xE2\x94\x9C", "\xE2\x94\x80", "\xE2\x94\xBC", "\xC4\x82", "\xC4\x83",
   "\xE2\x95\x9A", "\xE2\x95\x94", "\xE2\x95\xA9", "\xE2\x95\xA6", "\xE2\x95\xA0", "\xE2\x95\x90", "\xE2\x95\xAC", "\xC2\xA4",
   /* 0xD0 - 0xDF */
   "\xC4\x91", "\xC4\x90", "\xC4\x8E", "\xC3\x8B", "\xC4\x8F", "\xC5\x87", "\xC3\x8D", "\xC3\x8E",
   "\xC4\x9B", "\xE2\x94\x98", "\xE2\x94\x8C", "\xE2\x96\x88", "\xE2\x96\x84", "\xC5\xA2", "\xC5\xAE", "\xE2\x96\x80",
   /* 0xE0 - 0xEF */
   "\xC3\x93", "\xC3\x9F", "\xC3\x94", "\xC5\x83", "\xC5\x84", "\xC5\x88", "\xC5\xA0", "\xC5\xA1",
   "\xC5\x94", "\xC3\x9A", "\xC5\x95", "\xC5\xB0", "\xC3\xBD", "\xC3\x9D", "\xC5\xA3", "\xC2\xB4",
   /* 0xF0 - 0xFF */
   "\xC2\xAD", "\xCB\x9D", "\xCB\x9B", "\xCB\x87", "\xCB\x98", "\xC2\xA7", "\xC3\xB7", "\xC2\xB8",
   "\xC2\xB0", "\xC2\xA8", "\xCB\x99", "\xC5\xB1", "\xC5\x98", "\xC5\x99", "\xE2\x96\xA0", "\xC2\xA0"
};
634 
/*
 * IBM858, upper half: UTF-8 NFC byte sequence for every octet
 * 0x80 .. 0xFF (entry i belongs to octet 0x80 + i)
 */
static const char *enc_ibm858[128] =
{
   /* 0x80 - 0x8F */
   "\xC3\x87", "\xC3\xBC", "\xC3\xA9", "\xC3\xA2", "\xC3\xA4", "\xC3\xA0", "\xC3\xA5", "\xC3\xA7",
   "\xC3\xAA", "\xC3\xAB", "\xC3\xA8", "\xC3\xAF", "\xC3\xAE", "\xC3\xAC", "\xC3\x84", "\xC3\x85",
   /* 0x90 - 0x9F */
   "\xC3\x89", "\xC3\xA6", "\xC3\x86", "\xC3\xB4", "\xC3\xB6", "\xC3\xB2", "\xC3\xBB", "\xC3\xB9",
   "\xC3\xBF", "\xC3\x96", "\xC3\x9C", "\xC3\xB8", "\xC2\xA3", "\xC3\x98", "\xC3\x97", "\xC6\x92",
   /* 0xA0 - 0xAF */
   "\xC3\xA1", "\xC3\xAD", "\xC3\xB3", "\xC3\xBA", "\xC3\xB1", "\xC3\x91", "\xC2\xAA", "\xC2\xBA",
   "\xC2\xBF", "\xC2\xAE", "\xC2\xAC", "\xC2\xBD", "\xC2\xBC", "\xC2\xA1", "\xC2\xAB", "\xC2\xBB",
   /* 0xB0 - 0xBF */
   "\xE2\x96\x91", "\xE2\x96\x92", "\xE2\x96\x93", "\xE2\x94\x82", "\xE2\x94\xA4", "\xC3\x81", "\xC3\x82", "\xC3\x80",
   "\xC2\xA9", "\xE2\x95\xA3", "\xE2\x95\x91", "\xE2\x95\x97", "\xE2\x95\x9D", "\xC2\xA2", "\xC2\xA5", "\xE2\x94\x90",
   /* 0xC0 - 0xCF */
   "\xE2\x94\x94", "\xE2\x94\xB4", "\xE2\x94\xAC", "\xE2\x94\x9C", "\xE2\x94\x80", "\xE2\x94\xBC", "\xC3\xA3", "\xC3\x83",
   "\xE2\x95\x9A", "\xE2\x95\x94", "\xE2\x95\xA9", "\xE2\x95\xA6", "\xE2\x95\xA0", "\xE2\x95\x90", "\xE2\x95\xAC", "\xC2\xA4",
   /* 0xD0 - 0xDF */
   "\xC3\xB0", "\xC3\x90", "\xC3\x8A", "\xC3\x8B", "\xC3\x88", "\xE2\x82\xAC", "\xC3\x8D", "\xC3\x8E",
   "\xC3\x8F", "\xE2\x94\x98", "\xE2\x94\x8C", "\xE2\x96\x88", "\xE2\x96\x84", "\xC2\xA6", "\xC3\x8C", "\xE2\x96\x80",
   /* 0xE0 - 0xEF */
   "\xC3\x93", "\xC3\x9F", "\xC3\x94", "\xC3\x92", "\xC3\xB5", "\xC3\x95", "\xC2\xB5", "\xC3\xBE",
   "\xC3\x9E", "\xC3\x9A", "\xC3\x9B", "\xC3\x99", "\xC3\xBD", "\xC3\x9D", "\xC2\xAF", "\xC2\xB4",
   /* 0xF0 - 0xFF */
   "\xC2\xAD", "\xC2\xB1", "\xE2\x80\x97", "\xC2\xBE", "\xC2\xB6", "\xC2\xA7", "\xC3\xB7", "\xC2\xB8",
   "\xC2\xB0", "\xC2\xA8", "\xC2\xB7", "\xC2\xB9", "\xC2\xB3", "\xC2\xB2", "\xE2\x96\xA0", "\xC2\xA0"
};
647 
648 
/* ========================================================================== */
/* Decode a single hexadecimal digit from ASCII to its integer value
 *
 * \param[in] nibble  ASCII encoded hexadecimal nibble to decode
 *
 * The digit is classified by its ASCII code: 0x30-0x39 ('0'-'9'),
 * 0x41-0x46 ('A'-'F') and 0x61-0x66 ('a'-'f') are accepted.
 * For every other input an error message is emitted via PRINT_ERROR().
 *
 * \return
 * - Integer value of \e nibble (0 to 15)
 * - Negative value on error
 */

static int  enc_hex_decode_nibble(char  nibble)
{
   const int  code = nibble;

   /* Decimal digit */
   if(0x30 <= code && 0x39 >= code)  { return(code - 0x30); }

   /* Uppercase letter digit */
   if(0x41 <= code && 0x46 >= code)  { return(code - 0x41 + 10); }

   /* Lowercase letter digit */
   if(0x61 <= code && 0x66 >= code)  { return(code - 0x61 + 10); }

   /* Not a hexadecimal digit */
   PRINT_ERROR("Can't decode invalid hexadecimal nibble");

   return(-1);
}
672 
673 
674 /* ========================================================================== */
675 /* Convert from supported 8 bit character sets to UTF-8 NFC
676  *
677  * \param[in] charset 8 bit character set used for string \e s
678  * \param[in] s String to convert
679  *
680  * \return
681  * - Pointer to result (if not equal to \e s , a new memory block was allocated)
682  * - NULL on error
683  */
684 
685 static const char* enc_8bit_convert_to_utf8_nfc(enum enc_mime_cs charset,
686  const char* s)
687 {
688  static const char qm[] = "?"; /* Substitute used for ENC_CS_ASCII */
689  const char* res = s;
690  size_t i = 0; /* Read index in s */
691  char* buf = NULL; /* Result buffer (grows on demand) */
692  size_t len = 0; /* Allocated size of buf in bytes */
693  size_t bi = 0; /* Write index in buf */
694  char* p;
695  const char* seq; /* Replacement sequence for current character */
696  size_t ii;
697  unsigned char c; /* Offset of current character in upper half (s[i] - 128) */
698
699  /*
700  * Check whether data consists only of printable ASCII characters
701  * (all supported character sets are ASCII extensions)
702  */
  /*
   * NOTE(review): No conditional for the check described above is visible in
   * this listing, the following block is entered unconditionally as shown.
   * Confirm against the original source file.
   */
704  {
705  /* Decode */
706  res = NULL;
707  while(s[i])
708  {
709  /* Attention: This assumes valid UTF-8 encoding for decoded data! */
710  if(bi + (size_t) 5 >= len) /* At least (4 + NUL) additional bytes */
711  {
712  /* Allocate more memory in exponentially increasing chunks */
  /* First allocation is 128 bytes, afterwards the size is doubled */
713  if(!len) { len = 64; }
714  p = posix_realloc((void*) buf, len *= (size_t) 2);
  /* On failure the old buffer is released and NULL is returned */
715  if(NULL == p) { posix_free((void*) buf); buf = NULL; break; }
716  else { buf = p; }
717  }
718  /* Decode next character */
719  if(128U > (unsigned char) s[i])
720  {
721  /* ASCII */
  /* Copied unchanged (all handled character sets share the ASCII range) */
722  buf[bi++] = s[i++];
723  continue;
724  }
725  /* Decode non-ASCII character */
  /* Select replacement sequence from the table of the character set */
726  c = (unsigned char) s[i] - (unsigned char) 128;
727  switch(charset)
728  {
729  case ENC_CS_ASCII: { seq = qm; break; }
730  case ENC_CS_ISO8859_1: { seq = enc_iso8859_1[c]; break; }
731  case ENC_CS_ISO8859_2: { seq = enc_iso8859_2[c]; break; }
732  case ENC_CS_ISO8859_3: { seq = enc_iso8859_3[c]; break; }
733  case ENC_CS_ISO8859_4: { seq = enc_iso8859_4[c]; break; }
734  case ENC_CS_ISO8859_5: { seq = enc_iso8859_5[c]; break; }
735  case ENC_CS_ISO8859_6: { seq = enc_iso8859_6[c]; break; }
736  case ENC_CS_ISO8859_7: { seq = enc_iso8859_7[c]; break; }
737  case ENC_CS_ISO8859_8: { seq = enc_iso8859_8[c]; break; }
738  case ENC_CS_ISO8859_9: { seq = enc_iso8859_9[c]; break; }
739  case ENC_CS_ISO8859_10: { seq = enc_iso8859_10[c]; break; }
740  case ENC_CS_ISO8859_11: { seq = enc_iso8859_11[c]; break; }
741  case ENC_CS_ISO8859_13: { seq = enc_iso8859_13[c]; break; }
742  case ENC_CS_ISO8859_14: { seq = enc_iso8859_14[c]; break; }
743  case ENC_CS_ISO8859_15: { seq = enc_iso8859_15[c]; break; }
744  case ENC_CS_ISO8859_16: { seq = enc_iso8859_16[c]; break; }
745  case ENC_CS_MACINTOSH: { seq = enc_mac_roman[c]; break; }
746  case ENC_CS_KOI8R: { seq = enc_koi8r[c]; break; }
747  case ENC_CS_KOI8U: { seq = enc_koi8u[c]; break; }
748  case ENC_CS_WINDOWS_1250: { seq = enc_windows_1250[c]; break; }
749  case ENC_CS_WINDOWS_1251: { seq = enc_windows_1251[c]; break; }
750  case ENC_CS_WINDOWS_1252: { seq = enc_windows_1252[c]; break; }
751  case ENC_CS_WINDOWS_1253: { seq = enc_windows_1253[c]; break; }
752  case ENC_CS_WINDOWS_1254: { seq = enc_windows_1254[c]; break; }
753  case ENC_CS_WINDOWS_1255: { seq = enc_windows_1255[c]; break; }
754  case ENC_CS_WINDOWS_1256: { seq = enc_windows_1256[c]; break; }
755  case ENC_CS_WINDOWS_1257: { seq = enc_windows_1257[c]; break; }
756  case ENC_CS_WINDOWS_1258: { seq = enc_windows_1258[c]; break; }
757  case ENC_CS_IBM437: { seq = enc_ibm437[c]; break; }
758  case ENC_CS_IBM775: { seq = enc_ibm775[c]; break; }
759  case ENC_CS_IBM850: { seq = enc_ibm850[c]; break; }
760  case ENC_CS_IBM852: { seq = enc_ibm852[c]; break; }
761  case ENC_CS_IBM858: { seq = enc_ibm858[c]; break; }
  /* Character set without table here => Abort with error */
762  default: { seq = NULL; break; }
763  }
764  if(NULL == seq) { posix_free((void*) buf); buf = NULL; break; }
  /* Copy sequence up to its NUL termination, but not more than 4 bytes */
765  for(ii = 0; ii < 4; ++ii)
766  {
767  if(!seq[ii]) { break; }
768  else { buf[bi++] = seq[ii]; }
769  }
770  ++i;
771  }
  /* buf is NULL here after an error (or if the loop body was never executed) */
772  if(NULL != buf)
773  {
774  buf[bi] = 0;
775  res = buf;
776  }
777  }
778
779  return(res);
780 }
781 
782 
783 /* ========================================================================== */
/*! \brief Verify CESU-8 or UTF-8 encoding
 *
 * \param[in] s    String to verify
 * \param[in] utf  Reject surrogate codepoints if nonzero.
 *
 * \note CESU-8 is defined in Unicode Technical Report #26.
 *
 * \attention
 * Read chapter 10 of RFC 3629 for UTF-8 security considerations.
 *
 * According to RFC 3629 the following rules are applied:
 * - Character code points beyond 0x10FFFF are invalid => We reject them.
 * - Only the shortest possible code sequence is allowed => We verify this.
 * - Surrogate character code points are invalid for UTF-8 => We reject them
 *   (only if \e utf is nonzero).
 *
 * A code sequence that is cut off by the end of the string is rejected too
 * (without printing an error message).
 *
 * \return
 * - 0 on success
 * - Negative value on error
 */

static int enc_uc_check_cesu8(const char* s, unsigned char utf)
{
   const char* p = s;
   unsigned long int cp;  /* Codepoint assembled from current sequence */
   size_t seqlen;  /* Number of octets in current sequence */
   size_t todo;  /* Number of continuation octets still expected */
   int octet;

   while(*p)
   {
      octet = (int) *p++;
      /* Singlebyte character => Nothing to verify */
      if(0 <= octet && 127 >= octet) { continue; }

      /* Start octet defines the length and the most significant bits */
      if(0xC0 == (octet & 0xE0))
      {
         seqlen = 2;
         cp = (unsigned long int) (octet & 0x1F) << 6;
      }
      else if(0xE0 == (octet & 0xF0))
      {
         seqlen = 3;
         cp = (unsigned long int) (octet & 0x0F) << 12;
      }
      else if(0xF0 == (octet & 0xF8))
      {
         seqlen = 4;
         cp = (unsigned long int) (octet & 0x07) << 18;
      }
      else
      {
         PRINT_ERROR("Invalid start of code sequence in UTF-8 data");
         return(-1);
      }

      /* Collect the continuation octets (6 payload bits each) */
      todo = seqlen - (size_t) 1;
      while(todo)
      {
         octet = (int) *p;
         /* Incomplete multibyte code sequence at end of string */
         if(!octet) { return(-1); }
         ++p;
         if(0x80 != (octet & 0xC0))
         {
            PRINT_ERROR("Invalid continuation character in UTF-8 sequence");
            return(-1);
         }
         --todo;
         cp |= (unsigned long int) (octet & 0x3F) << todo * (size_t) 6;
      }

      /* Verify character code (shortest form, surrogates, upper limit) */
      if((size_t) 2 == seqlen)
      {
         if(0x000080UL > cp)
         {
            PRINT_ERROR("Invalid UTF-8 2-byte code sequence");
            return(-1);
         }
      }
      else if((size_t) 3 == seqlen)
      {
         if(0x000800UL > cp
            || (utf && 0x00D800UL <= cp && 0x00DFFFUL >= cp))
         {
            PRINT_ERROR("Invalid UTF-8 3-byte code sequence");
            return(-1);
         }
      }
      else if((size_t) 4 == seqlen)
      {
         if(0x010000UL > cp || 0x10FFFFUL < cp)
         {
            PRINT_ERROR("Invalid UTF-8 4-byte code sequence");
            return(-1);
         }
      }
      else
      {
         PRINT_ERROR("Bug in UTF-8 verify state machine");
         return(-1);
      }
   }

   return(0);
}
911 
912 
913 /* ========================================================================== */
/* Decode next Unicode codepoint from UTF-8 string
 *
 * \param[in]     s  UTF-8 string to decode
 * \param[in,out] i  Pointer to current index in string
 *
 * \attention
 * The string \e s MUST be already checked for valid UTF-8 encoding before
 * calling this function! Continuation octets are not verified here.
 *
 * On success, the index of the next codepoint is written to the location
 * pointed to by \e i .
 * <br>
 * On error, the index was advanced over all octets that were read (this
 * includes the NUL termination if the end of the string was reached).
 *
 * \return
 * - Next Unicode codepoint on success
 * - -1 on error (or if the end of the string was reached)
 */

static long int enc_uc_decode_utf8(const char* s, size_t* i)
{
   unsigned long int cp;  /* Codepoint under construction */
   size_t todo;  /* Number of continuation octets still expected */
   int octet;

   octet = (int) s[(*i)++];
   /* End of string */
   if(!octet) { return(-1L); }
   /* Singlebyte codepoint */
   if(0 <= octet && 127 >= octet) { return((long int) octet); }

   /* Start octet of multibyte codepoint */
   if(0xC0 == (octet & 0xE0))
   {
      todo = 1;
      cp = (unsigned long int) (octet & 0x1F) << 6;
   }
   else if(0xE0 == (octet & 0xF0))
   {
      todo = 2;
      cp = (unsigned long int) (octet & 0x0F) << 12;
   }
   else if(0xF0 == (octet & 0xF8))
   {
      todo = 3;
      cp = (unsigned long int) (octet & 0x07) << 18;
   }
   else
   {
      PRINT_ERROR("UTF-8 decoder called with invalid data");
      return(-1L);
   }

   /* Merge payload of continuation octets */
   while(todo)
   {
      octet = (int) s[(*i)++];
      /* End of string inside of code sequence */
      if(!octet) { return(-1L); }
      --todo;
      cp |= (unsigned long int) (octet & 0x3F) << todo * (size_t) 6;
   }

   /* Codepoint decoding complete */
   return((long int) cp);
}
989 
990 
991 /* ========================================================================== */
/*! \brief Encode Unicode codepoints to UTF-8
 *
 * \param[out]    buf   Encoded UTF-8 string
 * \param[in,out] i     Current index in \e buf
 * \param[in]     dbuf  Codepoint buffer
 * \param[in,out] di    Number of codepoints in \e dbuf
 *
 * \attention
 * The target buffer \e buf must be large enough for the encoded data. This must
 * be ensured by the caller using worst case calculations (up to 4 bytes are
 * written per codepoint, no NUL termination is added).
 *
 * Negative codepoints and codepoints beyond 0x10FFFF are replaced with a
 * question mark and an error message is printed.
 *
 * On return, the start index of the next codepoint is written to the location
 * pointed to by \e i and zero is written to the location pointed to by \e di .
 */

/* 'static' removed because test program must call this function */
void enc_uc_encode_utf8(char* buf, size_t* i, long int* dbuf, size_t* di)
{
   const size_t count = *di;
   size_t n;
   size_t tail;  /* Number of continuation octets */
   long int cp;
   unsigned char lead;

   for(n = 0; n < count; ++n)
   {
      cp = dbuf[n];
      if(0L > cp || 0x10FFFFL < cp)
      {
         PRINT_ERROR("Unicode UTF-8 encoder: Invalid codepoint detected");
         buf[(*i)++] = '?';
         continue;
      }
      /* ASCII is stored as single octet */
      if(0x00007FL >= cp) { buf[(*i)++] = (char) cp; continue; }

      /* Start octet: Length prefix plus most significant payload bits */
      if(0x0007FFL >= cp)
      {
         tail = 1;
         lead = (unsigned char) (0xC0L | ((cp >> 6) & 0x1FL));
      }
      else if(0x00FFFFL >= cp)
      {
         tail = 2;
         lead = (unsigned char) (0xE0L | ((cp >> 12) & 0x0FL));
      }
      else
      {
         tail = 3;
         lead = (unsigned char) (0xF0L | ((cp >> 18) & 0x07L));
      }
      buf[(*i)++] = (char) lead;

      /* Continuation octets with 6 payload bits each */
      while(tail)
      {
         --tail;
         buf[(*i)++] = (char) (0x80L | ((cp >> tail * (size_t) 6) & 0x3FL));
      }
   }
   *di = 0;
}
1065 
1066 
1067 /* ========================================================================== */
/* Convert from UTF-16BE to UTF-8 (without normalization)
 *
 * \param[in] mb   Multibyte input data to convert (in UTF-16BE format)
 * \param[in] len  Length in bytes
 *
 * If \e len is an odd number, the last byte of input data \e mb is ignored.
 *
 * Unpaired surrogates are replaced with a question mark (a codepoint that
 * follows a lead surrogate, but is no trail surrogate, is replaced too).
 * A lead surrogate at the very end of the input data is dropped.
 *
 * \attention
 * The result string is only guaranteed to have valid UTF-8 encoding. It
 * nevertheless may be semantically invalid and is not normalized!
 *
 * \return
 * - Pointer to result string in UTF-8 format (a new memory block was allocated)
 * - NULL on error (this includes the case that no output was created)
 */

static const char* enc_uc_convert_utf16be_to_utf8(const char* mb, size_t len)
{
   char* out = NULL;
   size_t oi = 0;  /* Write index in output buffer */
   size_t pos;
   size_t n;  /* Number of codepoints for UTF-8 encoder */
   long int cp;  /* Unicode codepoint */
   long int lead = 0;  /* Payload of pending lead surrogate */
   int pending = 0;  /* Flag indicating that trail surrogate is expected */
   int is_lead;
   int is_trail;

   /* Ensure even byte count */
   len -= len & (size_t) 1;
   /* Allocate buffer */
   if(len) { out = (char*) posix_malloc(len * (size_t) 2 + (size_t) 1); }
   if(NULL != out)
   {
      /* Process 16 bit words */
      for(pos = 0; pos < len; pos += 2)
      {
         /* Decode UTF-16 using big endian byte order */
         cp = (long int) (unsigned char) mb[pos] << 8
              | (long int) (unsigned char) mb[pos + (size_t) 1];
         is_lead = (0x00D800L <= cp && 0x00DBFFL >= cp);
         is_trail = (0x00DC00L <= cp && 0x00DFFFL >= cp);

         if(pending)
         {
            /* Former word was a lead surrogate */
            pending = 0;
            if(is_trail)
            {
               /* Surrogate pair complete => Extract codepoint */
               cp = ((lead << 10L) | (cp - 0x00DC00L)) + 0x010000L;
            }
            else if(is_lead)
            {
               PRINT_ERROR("Missing UTF-16 lead surrogate");
               cp = (long int) (unsigned char) '?';
            }
            else
            {
               PRINT_ERROR("Invalid UTF-16 trail surrogate");
               cp = (long int) (unsigned char) '?';
            }
         }
         else if(is_lead)
         {
            /* Store payload and wait for trail surrogate */
            lead = cp - 0x00D800L;
            pending = 1;
            continue;
         }
         else if(is_trail)
         {
            PRINT_ERROR("Missing UTF-16 lead surrogate");
            cp = (long int) (unsigned char) '?';
         }

         /* Encode codepoint as UTF-8 into target buffer */
         n = 1;
         enc_uc_encode_utf8(out, &oi, &cp, &n);
      }
   }

   /* Check for error */
   if(!oi)
   {
      PRINT_ERROR("Unicode UTF-16 to UTF-8 converter failed");
      if(NULL != out) { posix_free((void*) out); }
      return(NULL);
   }

   /* Terminate result string on success */
   out[oi] = 0;

   return(out);
}
1161 
1162 
1163 /* ========================================================================== */
/* Check whether Unicode codepoint is an unwanted control character
 *
 * \param[in] ucp  Unicode codepoint to check
 *
 * The following codepoints are rejected:
 * - ASCII C0 control characters and DEL, except HT, LF and CR
 * - C1 control characters (forbidden by RFC 5198)
 * - LINE SEPARATOR and PARAGRAPH SEPARATOR
 * - INTERLINEAR ANNOTATION special characters
 * - LANGUAGE TAG
 *
 * Unicode variation selectors and control characters for bidirectional text
 * should be accepted. The Cocoa backend of FLTK (for Apple macOS) is able to
 * process them.
 *
 * \note
 * The range associated with LANGUAGE TAG (U+E0020 to U+E007F) is accepted.
 * It was deprecated since Unicode 5.1, but was reintroduced for another
 * purpose in Unicode 9.0.
 *
 * \return
 * - 0 if nothing was found
 * - -1 if unwanted control characters are present
 */

static int enc_uc_check_control(long int ucp)
{
   /* HT, LF and CR are always accepted (required for canonical format) */
   if(0x09L == ucp || 0x0AL == ucp || 0x0DL == ucp) { return(0); }

   /* All other ASCII C0 control characters plus DEL */
   if(ucp <= 0x1FL || 0x7FL == ucp) { return(-1); }
   /* ISO 8859 C1 control characters */
   if(ucp >= 0x80L && ucp <= 0x9FL) { return(-1); }
   /* Unicode LINE SEPARATOR and PARAGRAPH SEPARATOR */
   if(0x2028L == ucp || 0x2029L == ucp) { return(-1); }
   /* Unicode INTERLINEAR ANNOTATION special characters */
   if(ucp >= 0xFFF9L && ucp <= 0xFFFBL) { return(-1); }
   /* Unicode LANGUAGE TAG */
   if(0xE0001L == ucp) { return(-1); }

   return(0);
}
1210 
1211 
1212 /* ========================================================================== */
1213 /* Lookup canonical decomposition and combining class of Unicode codepoint
1214  *
1215  * \param[in] ucp Unicode codepoint to lookup
1216  * \param[out] res Result
1217  */
1218 
1219 static void enc_uc_lookup_cdc(long int ucp, struct uc_cdc* res)
1220 {
1221  size_t i = 0;
1222 
1223  res->cp = ucp;
1224  res->ccc = 0;
1225  res->dc1 = -1L;
1226  res->dc2 = -1L;
1227  /* ASCII codepoints are always starters without canonical decomposition */
1228  if(128L <= ucp)
1229  {
1230  /* Lookup codepoint in Unicode database */
1231  while(-1L != uc_cdc_table[i].cp)
1232  {
1233  if(ucp == uc_cdc_table[i].cp)
1234  {
1235  /* Found nonstarter or canonical decomposable */
1236  res->ccc = uc_cdc_table[i].ccc;
1237  res->dc1 = uc_cdc_table[i].dc1;
1238  res->dc2 = uc_cdc_table[i].dc2;
1239  break;
1240  }
1241  ++i;
1242  }
1243  }
1244  /* Use codepoint as decomposition if no canonical decomposition was found */
1245  if(-1L == res->dc1) { res->dc1 = ucp; res->dc2 = -1L; }
1246 }
1247 
1248 
1249 /* ========================================================================== */
1250 /* Lookup canonical composition of codepoint pair
1251  *
1252  * \param[in] starter Unicode codepoint of starter
1253  * \param[in] cm Unicode codepoint of combination mark (or other starter)
1254  *
1255  * \note
1256  * If \e starter is not a codepoint with canonical combining class zero, the
1257  * function returns error.
1258  *
1259  * \return
1260  * - Codepoint of composition character on success
1261  * - -1 on error
1262  */
1263 
1264 static long int enc_uc_lookup_cc(long int starter, long int cm)
1265 {
1266  long int res = -1;
1267  struct uc_cdc cdc; /* Canonical decomposition data */
1268  size_t i;
1269  size_t ii;
1270  long int first;
1271  long int last;
1272
1273  /* Only used for hangul algorithmic canonical composition */
  /* Base values and counts for the index arithmetic in the second part below */
1274  const long int SBase = 0xAC00L;
1275  const long int LBase = 0x1100L;
1276  const long int VBase = 0x1161L;
1277  const long int TBase = 0x11A7L;
1278  const long int NCount = 588L;
1279  const long int LCount = 19L;
1280  const long int VCount = 21L;
1281  const long int TCount = 28L;
1282  int jamo = 0; /* Number of jamo in syllable */
1283  enum uc_hs_type hst;
1284  enum uc_hs_type hst2;
1285  long int LIndex;
1286  long int VIndex;
1287  long int TIndex;
1288
1289  /* Check whether starter really is a starter */
1290  enc_uc_lookup_cdc(starter, &cdc);
1291  if(!cdc.ccc)
1292  {
1293  /* Yes => Lookup decomposition in Unicode database */
  /*
   * First part: Search for a codepoint whose decomposition is exactly the
   * pair (starter, cm). The search stops at the first matching entry.
   */
1294  i = 0;
1295  while(-1L != uc_cdc_table[i].cp)
1296  {
1297  if(uc_cdc_table[i].dc1 == starter && uc_cdc_table[i].dc2 == cm)
1298  {
1299  /* Found composition */
1300  res = uc_cdc_table[i].cp;
1301  /* Check for composition exception */
  /* Exceptions are stored as ranges, the list ends with first = -1 */
1302  ii = 0;
1303  while(-1L != uc_fce_table[ii].first)
1304  {
1305  first = uc_fce_table[ii].first;
1306  last = uc_fce_table[ii].last;
1307  if(first <= res && last >= res)
1308  {
1309  /* Composition exception found */
1310 #if ENC_UC_NORM_DEBUG
1311  printf(" Canonical composition exception\n");
1312 #endif /* ENC_UC_NORM_DEBUG */
1313  res = -1;
1314  break;
1315  }
1316  ++ii;
1317  }
1318  break;
1319  }
1320  ++i;
1321  }
1322
1323  /*
1324  * On error, check whether algorithmic composition is possible using
1325  * Unicode hangul syllable type database
1326  */
  /*
   * Second part: Only executed if cm has canonical combining class zero.
   * The outer loop terminates as soon as a result is available.
   */
1327  enc_uc_lookup_cdc(cm, &cdc);
1328  if(!cdc.ccc)
1329  {
1330  i = 0;
1331  while(-1L == res && -1L != uc_hst_table[i].first)
1332  {
  /* Determine hangul syllable type of starter */
1333  first = uc_hst_table[i].first;
1334  last = uc_hst_table[i].last;
1335  hst = uc_hst_table[i].hst;
1336  if(first <= starter && last >= starter)
1337  {
1338  if(UC_HST_L == hst || UC_HST_LV == hst)
1339  {
1340  /* Starter is a hangul L-type consonant or LV-type syllable */
  /* Determine hangul syllable type of cm (stored in hst2) */
1341  ii = 0;
1342  while(-1L != uc_hst_table[ii].first)
1343  {
1344  first = uc_hst_table[ii].first;
1345  last = uc_hst_table[ii].last;
1346  hst2 = uc_hst_table[ii].hst;
1347  if(first <= cm && last >= cm)
1348  {
  /* L + V => LV-syllable */
1349  if(UC_HST_L == hst && UC_HST_V == hst2)
1350  {
  /* Ensure that both indices are not negative */
1351  if(LBase <= starter && VBase <= cm)
1352  {
1353  LIndex = starter - LBase;
1354  if(LIndex < LCount)
1355  {
1356  VIndex = cm - VBase;
1357  if(VIndex < VCount)
1358  {
1359 #if ENC_UC_NORM_DEBUG
1360  printf(" Canonical composition"
1361  " for hangul LV-syllable found\n");
1362  printf("Hangul LIndex: %ld\n", LIndex);
1363  printf("Hangul VIndex: %ld\n", VIndex);
1364 #endif /* ENC_UC_NORM_DEBUG */
1365  jamo = 2;
1366  res = SBase
1367  + LIndex * NCount + VIndex * TCount;
1368  }
1369  }
1370  }
1371  }
  /* LV + T => LVT-syllable */
1372  else if(UC_HST_LV == hst && UC_HST_T == hst2)
1373  {
1374  if(TBase <= cm)
1375  {
1376  TIndex = cm - TBase;
1377  if(TIndex < TCount)
1378  {
1379 #if ENC_UC_NORM_DEBUG
1380  printf(" Canonical composition"
1381  " for hangul LVT-syllable found\n");
1382  printf("Hangul TIndex: %ld\n", TIndex);
1383 #endif /* ENC_UC_NORM_DEBUG */
1384  jamo = 3;
  /* Starter already is the LV-syllable => Add index of T only */
1385  res = starter + TIndex;
1386  }
1387  }
1388  }
1389  if(jamo)
1390  {
1391  if(-1L == res)
1392  {
1393  PRINT_ERROR("Unicode algorithmic composition"
1394  " for hangul syllable failed");
1395  }
1396 #if 1
1397  /* Optional: Check hangul syllable type of result */
  /*
   * The variables ii, first, last and hst are reused here. This does
   * not disturb the enclosing loop over ii, because it is always left
   * with the break statement below. The outer loop reloads first, last
   * and hst at the beginning of its next iteration.
   */
1398  else
1399  {
1400  ii = 0;
1401  while(-1L != uc_hst_table[ii].first)
1402  {
1403  first = uc_hst_table[ii].first;
1404  last = uc_hst_table[ii].last;
1405  hst = uc_hst_table[ii].hst;
1406  if(first <= res && last >= res)
1407  {
1408  if(2 == jamo && UC_HST_LV != hst)
1409  {
1410  /* Result should be a LV-syllable! */
1411  res = -1L;
1412  }
1413  if(3 == jamo && UC_HST_LVT != hst)
1414  {
1415  /* Result should be a LVT-syllable! */
1416  res = -1L;
1417  }
1418  break;
1419  }
1420  ++ii;
1421  }
1422  if(-1L == res)
1423  {
1424  PRINT_ERROR("Invalid Unicode hangul syllable"
1425  " detected (Bug)");
1426  }
1427  }
1428 #endif
1429  break;
1430  }
1431  }
1432  ++ii;
1433  }
1434  }
1435  }
1436  ++i;
1437  }
1438  }
1439  }
1440
1441  return(res);
1442 }
1443 
1444 
1445 /* ========================================================================== */
1446 /* Lookup mapping for default case folding
1447  *
1448  * \param[in] ucp Unicode codepoint
1449  * \param[in] cfm Case folded mapping (up to 3 codepoints)
1450  *
1451  * If the case fold mapping is smaller than 3 codepoints, -1 is written to the
1452  * unused fields.
1453  *
1454  * \note
1455  * If \e ucp has no mapping for default case folding, \e ucp itself is
1456  * returned in the first field.
1457  */
1458 
1459 static void enc_uc_lookup_cf(long int ucp, long int mapping[3])
1460 {
1461  size_t i = 0;
1462  int found = 0;
1463 
1464  /* Lookup codepoint in Unicode database */
1465  while(-1L != uc_cf_table[i].cp)
1466  {
1467  if(ucp == uc_cf_table[i].cp)
1468  {
1469  /* Found mapping for Unicode default case folding */
1470  mapping[0] = uc_cf_table[i].first;
1471  mapping[1] = uc_cf_table[i].second;
1472  mapping[2] = uc_cf_table[i].third;
1473  found = 1;
1474  break;
1475  }
1476  ++i;
1477  }
1478  if(!found)
1479  {
1480  mapping[0] = ucp;
1481  mapping[1] = -1L;
1482  mapping[2] = -1L;
1483  }
1484 }
1485 
1486 
1487 /* ========================================================================== */
1488 /* Get number of Unicode glyphs from UTF-8 string
1489  *
1490  * \param[in] s UTF-8 string to decode
1491  * \param[in] end Check glyph count up to (but not including) this index
1492  *
1493  * To check all glyphs of a string, set \e end to zero.
1494  *
1495  * \attention
1496  * The string \e s MUST be already checked for valid UTF-8 encoding before
1497  * calling this function!
1498  *
1499  * \note
1500  * Soft hyphens (SHY) are not counted, except directly before \e end .
1501  * <br>
1502  * For this function the glyph count is defined as "number of starters".
1503  * For complex scripts this may not match the display width.
1504  *
1505  * \return
1506  * - Glyph count
1507  */
1508 
1509 static size_t enc_uc_get_glyph_count(const char* s, size_t end)
1510 {
1511  size_t res = 0;
1512  size_t i = 0;
1513  long int ucp;
1514  struct uc_cdc cdc; /* Canonical decomposition data */
1515 
1516  while(1)
1517  {
1518  ucp = enc_uc_decode_utf8(s, &i);
1519  if(-1L == ucp) { break; }
1520  else
1521  {
1522  /* Do not count SHY characters, except at the end */
1523  if (!((0x00ADL == ucp) && (end && i < end)))
1524  {
1525  /* Check whether codepoint is a starter */
1526  enc_uc_lookup_cdc(ucp, &cdc);
1527  if(!cdc.ccc) { ++res; }
1528  }
1529  }
1530  if(end && i >= end) { break; }
1531  }
1532 
1533  return(res);
1534 }
1535 
1536 
1537 /* ========================================================================== */
1538 /* Quick check for Unicode NFC normalization
1539  *
1540  * This function verify that:
1541  * - all codepoints are allowed in NFC
1542  * - the canonical ordering of combining marks is correct
1543  *
1544  * \param[in] s UTF-8 string to check
1545  *
1546  * \attention
1547  * The string \e s MUST be already checked for valid UTF-8 encoding before
1548  * calling this function!
1549  */
1550 
1551 static int enc_uc_check_nfc(const char* s)
1552 {
1553  int res = 0;
1554  size_t i = 0;
1555  long int ucp = 0;
1556  long int first;
1557  long int last;
1558  size_t ii;
1559  struct uc_cdc cdc; /* Canonical decomposition data */
1560  unsigned char ccc_last = 0;
1561 
1562  while(1)
1563  {
1564  ucp = enc_uc_decode_utf8(s, &i);
1565  if(-1L == ucp) { break; }
1566  /* Quick check for ASCII */
1567  if(128L <= ucp)
1568  {
1569  /* Lookup codepoint in Unicode database */
1570  ii = 0;
1571  while(-1L != uc_qc_nfc_table[ii].first)
1572  {
1573  first = uc_qc_nfc_table[ii].first;
1574  last = uc_qc_nfc_table[ii].last;
1575  if(first <= ucp && last >= ucp)
1576  {
1577  /* Codepoint is (maybe) not allowed in NFC */
1578  res = -1;
1579  break;
1580  }
1581  ++ii;
1582  }
1583  if(res) { break; }
1584  /* Check ordering of combining marks */
1585  enc_uc_lookup_cdc(ucp, &cdc);
1586  if(cdc.ccc && (cdc.ccc < ccc_last)) { res = -1; break; }
1587  ccc_last = cdc.ccc;
1588  }
1589  else { ccc_last = 0; }
1590  }
1591 
1592 #if ENC_UC_NORM_DEBUG
1593  if(res)
1594  {
1595  printf("Maybe not NFC: %s (len: %u)\n ", s, (unsigned int) strlen(s));
1596  i = 0; while(s[i])
1597  {
1598  printf(" 0x%02X", (unsigned int) (unsigned char) s[i++]);
1599  }
1600  printf("\n");
1601  }
1602 #endif /* ENC_UC_NORM_DEBUG */
1603 
1604  return(res);
1605 }
1606 
1607 
1608 /* ========================================================================== */
1609 /* Unicode canonical decomposition engine
1610  *
1611  * This function is reentrant and calls itself for recursive decomposition.
1612  *
1613  * \attention
1614  * The decomposition buffer \e dbuf must have the fixed size
1615  * \ref ENC_UC_DECOMPOSITION_BUFSIZE bytes.
1616  *
1617  * \param[in] ucp Unicode codepoint
1618  * \param[in,out] dbuf Decomposition buffer
1619  * \param[in] di Pointer to index in decomposition buffer
1620  */
1621 
1622 static int enc_uc_engine_decompose(long int ucp, long int* dbuf, size_t* di)
1623 {
1624  int res = 0;
1625  struct uc_cdc cdc; /* Canonical decomposition data */
1626 
1627  /* Ensure that there is space for 2 codepoints in decomposition buffer */
1628  if(ENC_UC_DECOMPOSITION_BUFSIZE - (size_t) 2 <= *di)
1629  {
1630  /* Decomposition buffer not large enough */
1631  PRINT_ERROR("Unicode canonical decomposition engine failed");
1632  dbuf[0] = (long int) (unsigned char) '[';
1633  dbuf[1] = (long int) (unsigned char) 'E';
1634  dbuf[2] = (long int) (unsigned char) 'r';
1635  dbuf[3] = (long int) (unsigned char) 'r';
1636  dbuf[4] = (long int) (unsigned char) 'o';
1637  dbuf[5] = (long int) (unsigned char) 'r';
1638  dbuf[6] = (long int) (unsigned char) ']';
1639  *di = 7;
1640  res = -1;
1641  }
1642  else
1643  {
1644  /* Recursively decompose */
1645  enc_uc_lookup_cdc(ucp, &cdc);
1646  if(cdc.dc1 != ucp) { res = enc_uc_engine_decompose(cdc.dc1, dbuf, di); }
1647  else { dbuf[(*di)++] = cdc.dc1; }
1648  if(-1L != cdc.dc2) { dbuf[(*di)++] = cdc.dc2; }
1649  }
1650 
1651  return(res);
1652 }
1653 
1654 
1655 /* ========================================================================== */
1656 /* Unicode canonical ordering engine
1657  *
1658  * All bursts of codepoints with nonzero canonical combining class are
1659  * stable sorted in ascending order.
1660  *
1661  * \param[in,out] dbuf Decomposition buffer
1662  * \param[in] di Number of codepoints in decomposition buffer
1663  */
1664 
1665 static void enc_uc_engine_reorder(long int* dbuf, size_t di)
1666 {
1667  size_t i, ii, iii, iiii;
1668  struct uc_cdc cdc; /* Canonical decomposition data */
1669  size_t len;
1670  long int tmp;
1671  unsigned char ccc1;
1672  unsigned char ccc2;
1673 
1674  for(i = 0; i < di; ++i)
1675  {
1676  enc_uc_lookup_cdc(dbuf[i], &cdc);
1677  /* Starters (Ccc = 0) always stay in place */
1678  if(cdc.ccc)
1679  {
1680 #if ENC_UC_NORM_DEBUG
1681  printf(" Nonstarter: U+%04lX (ccc=%u)\n",
1682  dbuf[i], (unsigned int) cdc.ccc);
1683 #endif /* ENC_UC_NORM_DEBUG */
1684  ii = i;
1685  while(++ii < di)
1686  {
1687  enc_uc_lookup_cdc(dbuf[ii], &cdc);
1688  if(!cdc.ccc) { break; }
1689  }
1690  len = ii - i;
1691  /* Sort burst of nonstarter codepoints (ccc != 0) to canonical order */
1692 #if ENC_UC_NORM_DEBUG
1693  printf(" Sort burst: len=%u\n", (unsigned int) len);
1694 #endif /* ENC_UC_NORM_DEBUG */
1695  for(iii = i; iii < i + len; ++iii)
1696  {
1697  /*
1698  * Bubble sort from end of buffer
1699  * This is very inefficient because the ccc lookup data is not
1700  * buffered. For european languages there are seldom more than
1701  * two or three codepoints combined => Keep it simple.
1702  */
1703  for(iiii = i + len - (size_t) 1; iiii > iii; --iiii)
1704  {
1705  enc_uc_lookup_cdc(dbuf[iiii - (size_t) 1], &cdc);
1706  ccc1 = cdc.ccc;
1707  enc_uc_lookup_cdc(dbuf[iiii], &cdc);
1708  ccc2 = cdc.ccc;
1709  if(ccc2 < ccc1)
1710  {
1711  tmp = dbuf[iiii - (size_t) 1];
1712  dbuf[iiii - (size_t) 1] = dbuf[iiii];
1713  dbuf[iiii] = tmp;
1714  }
1715  }
1716  }
1717  }
1718 #if ENC_UC_NORM_DEBUG
1719  else { printf(" Starter : U+%04lX\n", dbuf[i]); }
1720 #endif /* ENC_UC_NORM_DEBUG */
1721  }
1722 }
1723 
1724 
1725 /* ========================================================================== */
1726 /* Unicode canonical composition engine
1727  *
1728  * \param[in,out] dbuf Codepoint buffer
1729  * \param[in] di Pointer to number of codepoints in buffer
1730  */
1731 
1732 static void enc_uc_engine_compose(long int* dbuf, size_t* di)
1733 {
1734  size_t i = 0;
1735  size_t ii;
1736  long int ucp; /* Unicode codepoint */
1737  struct uc_cdc cdc; /* Canonical decomposition data */
1738  unsigned char ccc;
1739  int skip;
1740 
1741  while(++i < *di)
1742  {
1743  /* Check whether codepoint i can be canonically composed with starter */
1744 #if ENC_UC_NORM_DEBUG
1745  printf(" ---\n");
1746  printf(" Starter at beginning : U+%04lX\n", dbuf[0]);
1747  printf(" Codepoint in question: U+%04lX\n", dbuf[i]);
1748 #endif /* ENC_UC_NORM_DEBUG */
1749  ucp = enc_uc_lookup_cc(dbuf[0], dbuf[i]);
1750  if(-1L != ucp)
1751  {
1752  /* Yes => Get canonical combining class */
1753  enc_uc_lookup_cdc(dbuf[i], &cdc);
1754  ccc = cdc.ccc;
1755 #if ENC_UC_NORM_DEBUG
1756  printf(" Codepoint has ccc : %u\n", (unsigned int) ccc);
1757  printf(" Canonical composition: U+%04lX\n", ucp);
1758 #endif /* ENC_UC_NORM_DEBUG */
1759  /* Search for other codepoints with same canonical combining class */
1760  skip = 0;
1761  for(ii = 1; ii < i; ++ii)
1762  {
1763  enc_uc_lookup_cdc(dbuf[ii], &cdc);
1764  if(cdc.ccc >= ccc)
1765  {
1766  /* Found => Preserve canonical ordering => Don't compose */
1767 #if ENC_UC_NORM_DEBUG
1768  printf(" => Don't compose\n");
1769 #endif /* ENC_UC_NORM_DEBUG */
1770  skip = 1;
1771  break;
1772  }
1773  }
1774  if(skip) { continue; }
1775  /* Not found => Compose */
1776 #if ENC_UC_NORM_DEBUG
1777  printf(" => Compose\n");
1778 #endif /* ENC_UC_NORM_DEBUG */
1779  for(ii = i; ii < *di - (size_t) 1; ++ii)
1780  {
1781  dbuf[ii] = dbuf[ii + (size_t) 1];
1782  }
1783  dbuf[--*di] = -1L;
1784  dbuf[0] = ucp;
1785  /* Rewind index for now missing codepoint */
1786  --i;
1787  }
1788 #if ENC_UC_NORM_DEBUG
1789  printf(" ---\n");
1790 #endif /* ENC_UC_NORM_DEBUG */
1791  }
1792 }
1793 
1794 
1795 /* ========================================================================== */
1796 /* Unicode normalization engine (shared part)
1797  *
1798  * \param[in] s Valid Unicode string with arbitrary or no normalization
1799  * \param[out] l Pointer to length of result
1800  * \param[in] nfc Normalization form (NFC if nonzero, otherwise NFD)
1801  *
1802  * \attention
1803  * The string \e s MUST be already checked for valid UTF-8 encoding before
1804  * calling this function!
1805  *
1806  * \return
1807  * - Pointer to processed Unicode data
1808  * A new memory block was allocated
1809  * - NULL on error (Original memory block for \e s is still allocated)
1810  */
1811 
1812 static const char* enc_uc_engine_n(const char* s, size_t* l, int nfc)
1813 {
1814  char* res = NULL;
1815  size_t rlen = 0;
1816  size_t ri = 0;
1817  char* p;
1818  size_t i = 0;
1819  size_t last;
1820  long int ucp; /* Unicode codepoint */
1821  struct uc_cdc cdc; /* Canonical decomposition data */
1822  long int dbuf[ENC_UC_DECOMPOSITION_BUFSIZE];
1823  size_t di = 0;
1824  int error = 0; /* Error flag => Skip to next starter */
1825 
1826  while(1)
1827  {
1828  /* Allocate memory in exponentially increasing chunks */
1829  if(rlen - ri <= (size_t) 4 * ENC_UC_DECOMPOSITION_BUFSIZE)
1830  {
1831  /*
1832  * Ensure there is space in the result buffer for at least 4 times the
1833  * decompositon buffer size. Reason: Every Unicode codepoint can
1834  * consume up to 4 bytes after encoded to UTF-8 in worst case.
1835  */
1836  if(!rlen) { rlen = (size_t) 4 * ENC_UC_DECOMPOSITION_BUFSIZE; }
1837  rlen *= (size_t) 2;
1838  p = posix_realloc((void*) res, rlen);
1839  if(NULL == p) { posix_free((void*) res); res = NULL; break; }
1840  else { res = p; }
1841  }
1842  /* Check whether next codepoint is a starter (ccc = 0) */
1843  last = i;
1844  ucp = enc_uc_decode_utf8(s, &i);
1845  if(-1L == ucp) { break; }
1846  enc_uc_lookup_cdc(ucp, &cdc);
1847  if(!cdc.ccc)
1848  {
1849  /* Yes => Check for buffered sequence */
1850  if(di)
1851  {
1852  /* Present => Push last codepoint back and flush buffer first */
1853  i = last;
1854  enc_uc_engine_reorder(dbuf, di);
1855  if(nfc) { enc_uc_engine_compose(dbuf, &di); }
1856  enc_uc_encode_utf8(res, &ri, dbuf, &di);
1857  error = 0;
1858  continue;
1859  }
1860  }
1861  /* Recursive canonical decomposition */
1862  if(!error) { error = enc_uc_engine_decompose(ucp, dbuf, &di); }
1863  }
1864  /* Flush buffer */
1865  if(di)
1866  {
1867  enc_uc_engine_reorder(dbuf, di);
1868  if(nfc) { enc_uc_engine_compose(dbuf, &di); }
1869  enc_uc_encode_utf8(res, &ri, dbuf, &di);
1870  }
1871 
1872  /* Terminate result string */
1873  if(NULL != res)
1874  {
1875  res[ri] = 0;
1876  *l = ri;
1877  }
1878 
1879  return(res);
1880 }
1881 
1882 
/* ========================================================================== */
/* Unicode NFD normalization engine
 *
 * \param[in]  s  Valid Unicode string with arbitrary or no normalization
 * \param[out] l  Pointer to length of result
 *
 * \attention
 * The string \e s MUST be already checked for valid UTF-8 encoding before
 * calling this function!
 *
 * This is the shared normalization engine with composition disabled.
 *
 * \return
 * - Pointer to decomposed Unicode data (UTF-8 encoded with NFD normalization)
 *   A new memory block was allocated
 * - NULL on error (Original memory block for \e s is still allocated)
 */

static const char*  enc_uc_engine_nfd(const char*  s, size_t*  l)
{
   const char*  decomposed;

   /* Third argument zero selects NFD */
   decomposed = enc_uc_engine_n(s, l, 0);

   return(decomposed);
}
1903 
1904 
/* ========================================================================== */
/* Unicode NFC normalization engine part 1
 *
 * \param[in]  s  Valid Unicode string with arbitrary or no normalization
 * \param[out] l  Pointer to length of result
 *
 * \attention
 * The string \e s MUST be already checked for valid UTF-8 encoding before
 * calling this function!
 *
 * Part 1 does all the work but cannot compose starters. The result may
 * therefore still contain starter pairs that have a canonical composition
 * and must be postprocessed by part 2 of the engine.
 *
 * \return
 * - Pointer to processed data (input for part 2 of the engine)
 *   A new memory block was allocated
 * - NULL on error (Original memory block for \e s is still allocated)
 */

static const char*  enc_uc_engine_nfc_part1(const char*  s, size_t*  l)
{
   const char*  processed;

   /* Third argument nonzero selects NFC (composition enabled) */
   processed = enc_uc_engine_n(s, l, 1);

   return(processed);
}
1929 
1930 
1931 /* ========================================================================== */
1932 /* Unicode NFC normalization engine part 2
1933  *
1934  * \param[in] s Unicode string processed by part 1 of the engine
1935  * \param[in] l Length of \e s
1936  * \param[out] flag Flag indicating modified data
1937  *
1938  * Part 2 is for canonical composition of codepoint pairs that are both
1939  * starters. This includes algorithmic canonical composition for hangul
1940  * syllables.
1941  *
1942  * \return
1943  * - Pointer to precomposed Unicode data (UTF-8 encoded with NFC normalization)
1944  * A new memory block was allocated
1945  * - NULL on error (Undefined data written to \e flag)
1946  */
1947 
1948 static const char* enc_uc_engine_nfc_part2(const char* s, size_t l,
1949  int* flag)
1950 {
1951  char* res = NULL;
1952  size_t ri = 0;
1953  size_t i = 0;
1954  long int ucp; /* Unicode codepoint */
1955  struct uc_cdc cdc; /* Canonical decomposition data */
1956  long int dbuf[2];
1957  size_t di = 0;
1958 
1959  *flag = 0;
1960 #if ENC_UC_NORM_DEBUG
1961  printf(" *** Part 2 ***\n");
1962 #endif /* ENC_UC_NORM_DEBUG */
1963  res = posix_malloc(++l);
1964  if(NULL != res)
1965  {
1966  while(1)
1967  {
1968  /* Append next codepoint to buffer */
1969  ucp = enc_uc_decode_utf8(s, &i);
1970  if(-1L == ucp) { break; }
1971  dbuf[di++] = ucp;
1972  /* Check whether codepoint is a starter (ccc = 0) */
1973  enc_uc_lookup_cdc(ucp, &cdc);
1974  if(cdc.ccc)
1975  {
1976  /* No => Flush buffer */
1977  enc_uc_encode_utf8(res, &ri, dbuf, &di);
1978  }
1979  else
1980  {
1981  /* Yes => Check for canonical composition of starter pair */
1982  if((size_t) 2 == di)
1983  {
1984  enc_uc_engine_compose(dbuf, &di);
1985  /* Flush first starter if there was no canonical composition */
1986  if((size_t) 2 == di)
1987  {
1988  di = 1;
1989  enc_uc_encode_utf8(res, &ri, dbuf, &di);
1990  dbuf[0] = ucp;
1991  di = 1;
1992  }
1993  else
1994  {
1995  /* Canonical composition found for starter pair */
1996  *flag = 1;
1997  }
1998  }
1999  }
2000  }
2001  }
2002  /* Flush buffer */
2003  if(di) { enc_uc_encode_utf8(res, &ri, dbuf, &di); }
2004  /* Terminate result string */
2005  if(NULL != res) { res[ri] = 0; }
2006 
2007 #if ENC_UC_NORM_DEBUG
2008  if(NULL != res)
2009  {
2010  printf("Now NFC: %s (len: %u)\n ", res, (unsigned int) strlen(res));
2011  i = 0; while(res[i])
2012  {
2013  printf(" 0x%02X", (unsigned int) (unsigned char) res[i++]);
2014  }
2015  printf("\n\n");
2016  }
2017 #endif /* ENC_UC_NORM_DEBUG */
2018 
2019  return(res);
2020 }
2021 
2022 
2023 /* ========================================================================== */
2024 /* Strip defective combining character sequences (at the beginning of string)
2025  *
2026  * \param[in] s UTF-8 string to process
2027  *
2028  * \attention
2029  * The string \e s MUST be already checked for valid UTF-8 encoding before
2030  * calling this function!
2031  *
2032  * This function strips defective combining character sequences at the
2033  * beginning so that the result becomes semantically valid when standing
2034  * alone.
2035  *
2036  * \return
2037  * - Pointer to processed Unicode data
2038  * If the result is not equal to \e s , a new memory block was allocated
2039  * - NULL on error
2040  */
2041 
2042 static const char* enc_uc_strip_dccs(const char* s)
2043 {
2044  const char* res = NULL;
2045  int skip = 0; /* Garbage at the beginning of string must be skipped */
2046  long int ucp; /* Unicode codepoint */
2047  struct uc_cdc cdc; /* Canonical decomposition data */
2048  size_t i = 0;
2049  size_t last;
2050  size_t len;
2051 
2052  while(1)
2053  {
2054  last = i;
2055  ucp = enc_uc_decode_utf8(s, &i);
2056  if(-1L == ucp) { break; }
2057  enc_uc_lookup_cdc(ucp, &cdc);
2058  if(!cdc.ccc) { break; }
2059  else
2060  {
2061  /* The Unicode data tries to compose something with void */
2062  if(!skip) { PRINT_ERROR("Semantic error in Unicode string"); }
2063  skip = 1;
2064  }
2065  }
2066  i = last;
2067  if(skip)
2068  {
2069  len = strlen(&s[i]);
2070  res = (const char*) posix_malloc(++len);
2071  if(NULL != res) { memcpy((void*) res, &s[i], len); }
2072  }
2073  else { res = s; }
2074 
2075  return(res);
2076 }
2077 
2078 
/* ========================================================================== */
/* Normalize UTF-8 string to NFD
 *
 * Documentation about Unicode normalization:
 * <br>
 * http://www.unicode.org/reports/tr15/
 *
 * \param[in] s  UTF-8 string to normalize
 *
 * \attention
 * The string \e s MUST be already checked for valid UTF-8 encoding before
 * calling this function!
 *
 * \note
 * A Unicode string with valid transformation encoding can still be
 * semantically nonsense. If it starts with a codepoint that is not a "starter"
 * in terms of the standard, the resulting string starts with a "defective
 * combining character sequence" - but may still make sense if concatenated
 * with other data in front of it.
 * This function always strips defective combining character sequences at the
 * beginning so that the result becomes semantically valid even when standing
 * alone.
 *
 * \return
 * - Pointer to decomposed Unicode data (UTF-8 encoded with NFD normalization)
 *   If the result is not equal to \e s , a new memory block was allocated
 * - NULL on error
 */

static const char*  enc_uc_normalize_to_nfd(const char*  s)
{
   const char*  stripped;
   const char*  res = NULL;
   size_t  l = 0;

   /* Strip all nonstarters at the beginning */
   stripped = enc_uc_strip_dccs(s);
   if(NULL != stripped)
   {
      /* Normalize string to NFD */
      res = enc_uc_engine_nfd(stripped, &l);
      /* Release intermediate copy (if the stripper created one) */
      if(stripped != s)  { posix_free((void*) stripped); }
   }

   /* Check for error */
   if(NULL == res)  { PRINT_ERROR("Unicode NFD normalization failed"); }

   return(res);
}
2129 
2130 
/* ========================================================================== */
/* Normalize UTF-8 string to NFC
 *
 * Documentation about Unicode normalization:
 * <br>
 * http://www.unicode.org/reports/tr15/
 *
 * \param[in] s  UTF-8 string to normalize
 *
 * RFC 5198 recommends NFC for use in general Internet text messages
 * => We do so.
 *
 * \attention
 * The string \e s MUST be already checked for valid UTF-8 encoding before
 * calling this function!
 *
 * \note
 * A Unicode string with valid transformation encoding can still be
 * semantically nonsense. If it starts with a codepoint that is not a "starter"
 * in terms of the standard, the resulting string starts with a "defective
 * combining character sequence" - but may still make sense if concatenated
 * with other data in front of it.
 * This function always strips defective combining character sequences at the
 * beginning so that the result becomes semantically valid even when standing
 * alone.
 *
 * \return
 * - Pointer to precomposed Unicode data (UTF-8 encoded with NFC normalization)
 *   If the result is not equal to \e s , a new memory block was allocated
 * - NULL on error
 */

static const char*  enc_uc_normalize_to_nfc(const char*  s)
{
   const char*  res;
   const char*  next;
   size_t  l = 0;
   int  flag;

   /* Strip all nonstarters at the beginning */
   res = enc_uc_strip_dccs(s);

   /* Quick check: Nonzero result means normalization is required */
   if(NULL != res && enc_uc_check_nfc(res))
   {
      /* Pass 1: Everything except composition of starter pairs */
      next = enc_uc_engine_nfc_part1(res, &l);
      if(res != s)  { posix_free((void*) res); }
      res = next;

      /* Fixme: This is ugly and very slow => Should be single pass */
      if(NULL != res)
      {
         /* Pass 2: Composition of starter pairs */
         next = enc_uc_engine_nfc_part2(res, l, &flag);
         posix_free((void*) res);
         res = next;
      }
      /* The flag is only valid (and only evaluated) if pass 2 succeeded */
      if(NULL != res && flag)
      {
         /* Part 1 must be repeated if starters were composed */
         next = enc_uc_engine_nfc_part1(res, &l);
         posix_free((void*) res);
         res = next;
      }
   }

   /* Check for error */
   if(NULL == res)  { PRINT_ERROR("Unicode NFC normalization failed"); }

   return(res);
}
2201 
2202 
2203 /* ========================================================================== */
2204 /* Convert from supported ISO 2022 character sets to UTF-8 NFC
2205  *
2206  * \note
2207  * This is a quick&dirty, incomplete implementation. If there should be added
2208  * support for other ISO 2022 character sets in the future, this function should
2209  * be rewritten to use a shared, full-featured escape sequence decoder.
2210  *
2211  * \param[in] charset 8 bit character set used for string \e s
2212  * \param[in] s String to convert
2213  *
2214  * \return
2215  * - Pointer to result (if not equal to \e s , a new memory block was allocated)
2216  * - NULL on error
2217  */
2218 
2219 static const char* enc_iso2022_convert_to_utf8_nfc(enum enc_mime_cs charset,
2220  const char* s)
2221 {
2222  const char* res = s;
2223  size_t len = strlen(s);
2224  size_t i;
2225  enum iso2022_state state = ISO2022_ASCII;
2226  int esc = 0;
2227  int mode = 1;
2228  int first = 1;
2229  unsigned char ku = 0;
2230  unsigned char ten;
2231  char* buf;
2232  size_t bi = 0;
2233  size_t ii;
2234  long int jis;
2235  long int ucp;
2236 
2237  switch(charset)
2238  {
2239  case ENC_CS_ISO2022_JP:
2240  {
2241  /* Original japanese JUNET variant according to RFC 1468 */
2242  if(NULL != strchr(s, 0x1B))
2243  {
2244  /*
2245  * Allocate the result buffer twice as large as the original data
2246  * This is always sufficient because the escape sequences are deleted
2247  * and every multibyte character can't use more that 4 bytes in UTF-8.
2248  */
2249  buf = (char*) posix_malloc(++len * (size_t) 2);
2250  if(NULL != buf)
2251  {
2252  for(i = 0; i < len; ++i)
2253  {
2254  /* Check for escape sequence */
2255  if(0x1BU == (unsigned int) s[i]) { esc = 1; continue; }
2256  /* Decode escape sequence */
2257  if(1 == esc)
2258  {
2259  mode = 0;
2260  if('(' == s[i]) { mode = 1; }
2261  else if('$' == s[i]) { mode = 2; }
2262  esc = 2;
2263  continue;
2264  }
2265  else if(2 == esc)
2266  {
2267  if(1 == mode)
2268  {
2269  if('B' == s[i]) { state = ISO2022_ASCII; }
2270  else if('J' == s[i]) { state = ISO2022_ISO646; }
2271  }
2272  else if(2 == mode)
2273  {
2274  /* Don't distinguish between the JIS X 0208 variants */
2275  state = ISO2022_JIS_X_0208;
2276  first = 1;
2277  }
2278  else
2279  {
2280  /* Switch back to ASCII if escape sequence is invalid */
2281  state = ISO2022_ASCII;
2282  PRINT_ERROR("Invalid ISO 2022 escape sequence");
2283  }
2284  esc = 0;
2285  continue;
2286  }
2287  /* Copy next character */
2288  switch(state)
2289  {
2290  case ISO2022_ISO646:
2291  {
2292  if(0x80U & (unsigned int) s[i])
2293  {
2294  PRINT_ERROR("Invalid ISO 646 codepoint"
2295  " in ISO 2022-JP data");
2296  buf[bi++] = '?';
2297  }
2298  else
2299  {
2300  /* Handle Yen sign */
2301  if(0x5CU == (unsigned int) s[i])
2302  {
2303  buf[bi++] = (char) 0xC2;
2304  buf[bi++] = (char) 0xA5;
2305  }
2306  /* Handle overline */
2307  else if('~' == s[i])
2308  {
2309  buf[bi++] = (char) 0xE2;
2310  buf[bi++] = (char) 0x80;
2311  buf[bi++] = (char) 0xBE;
2312  }
2313  else { buf[bi++] = s[i]; }
2314  }
2315  break;
2316  }
2317  case ISO2022_JIS_X_0208:
2318  {
2319  /*
2320  * All characters are 2 byte sequences
2321  * These codepoints are called "kuten".
2322  * They form a 94x94 character table.
2323  */
2324  if(first) { ku = (unsigned char) s[i]; }
2325  else
2326  {
2327  ten = (unsigned char) s[i];
2328  if(33U > ku || 126U < ku || 33U > ten || 126U < ten)
2329  {
2330  PRINT_ERROR("Invalid kuten in ISO 2022-JP data");
2331  buf[bi++] = '?';
2332  }
2333  else
2334  {
2335  /* Convert kuten to Unicode */
2336  if( (32U + 9U <= ku && 32U + 15U >= ku)
2337  || (32U + 85U <= ku && 32U + 94U >= ku) )
2338  {
2339  /* Unassigned codepoint */
2340  PRINT_ERROR(
2341  "Unassigned codepoint in ISO 2022-JP data");
2342  buf[bi++] = (char) 0xEF;
2343  buf[bi++] = (char) 0xBF;
2344  buf[bi++] = (char) 0xBD;
2345  }
2346  else
2347  {
2348  jis = (long int) ((unsigned long int) ku << 8);
2349  jis |= (long int) (unsigned long int) ten;
2350  ucp = -1L;
2351  ii = 0;
2352  while(1)
2353  {
2354  if(iso2022_jp_table[ii].jis == -1L)
2355  {
2356  PRINT_ERROR("Decoding of kuten in "
2357  "ISO 2022-JP data failed");
2358  break;
2359  }
2360  if(iso2022_jp_table[ii].jis == jis)
2361  {
2362  ucp = iso2022_jp_table[ii].uc;
2363  break;
2364  }
2365  ++ii;
2366  }
2367  if(0L > ucp)
2368  {
2369  buf[bi++] = (char) 0xEF;
2370  buf[bi++] = (char) 0xBF;
2371  buf[bi++] = (char) 0xBD;
2372  }
2373  else
2374  {
2375  ii = 1;
2376  enc_uc_encode_utf8(buf, &bi, &ucp, &ii);
2377  }
2378  }
2379  }
2380  }
2381  first = !first;
2382  break;
2383  }
2384  default:
2385  {
2386  if(0x80U & (unsigned int) s[i])
2387  {
2388  PRINT_ERROR("Invalid ISO 646 codepoint"
2389  " in US-ASCII data");
2390  buf[bi++] = '?';
2391  }
2392  else
2393  {
2394  buf[bi++] = s[i];
2395  }
2396  break;
2397  }
2398  }
2399  }
2400  /* Ensure result is terminated in any case */
2401  buf[bi] = 0;
2402  /* Check for correct exit state */
2403  if(ISO2022_ASCII != state)
2404  {
2405  PRINT_ERROR("ISO 2022-JP data doesn't end in ASCII state");
2406  }
2407  }
2408  res = buf;
2409  }
2410  break;
2411  }
2412  default:
2413  {
2414  /* Not supported */
2415  res = NULL;
2416  break;
2417  }
2418  }
2419 
2420  return(res);
2421 }
2422 
2423 
2424 /* ========================================================================== */
2425 /* Decode IANA character set description
2426  *
2427  * \param[in] s Description to decode
2428  * \param[in] len Length of string \e s
2429  *
2430  * This function checks whether the description \e s represents a supported IANA
2431  * character set name and return the corresponding ID for it.
2432  * According to RFC 2047 the character set is treated case-insensitive.
2433  *
2434  * \result
2435  * - MIME character set ID (from \ref enc_mime_cs )
2436  * - \c ENC_CS_UNKNOWN on error
2437  */
2438 
2439 static enum enc_mime_cs enc_mime_get_charset(const char* s, size_t len)
2440 {
2441  enum enc_mime_cs res = ENC_CS_UNKNOWN;
2442  char buf[ENC_CS_BUFLEN];
2443  size_t i;
2444  const char not_supported[] = "MIME: Unsupported character set: ";
2445  char* p;
2446  size_t l;
2447  int isoinv = 0;
2448  int macinv = 0;
2449  int ibminv = 0;
2450 
2451  if(ENC_CS_BUFLEN <= len)
2452  {
2453  /* If you get this error, the value of 'ENC_CS_BUFLEN' is too small */
2454  PRINT_ERROR("MIME: Name of character set too long");
2455  }
2456  else
2457  {
2458  /* Convert description to upper case */
2459  for(i = 0; i < len; ++i)
2460  {
2461  buf[i] = (char) toupper((int) s[i]);
2462  }
2463  buf[len] = 0;
2464  /* Check for all known character sets */
2465  if(!strcmp(buf, "US-ASCII")) { res = ENC_CS_ASCII; }
2466  else if(!strcmp(buf, "UTF-8")) { res = ENC_CS_UTF_8; }
2467  else if(!strcmp(buf, "CESU-8")) { res = ENC_CS_CESU_8; }
2468  else if(!strcmp(buf, "UTF-7")) { res = ENC_CS_UTF_7; }
2469  else if(!strcmp(buf, "ISO-8859-1")) { res = ENC_CS_ISO8859_1; }
2470  else if(!strcmp(buf, "ISO-8859-2")) { res = ENC_CS_ISO8859_2; }
2471  else if(!strcmp(buf, "ISO-8859-3")) { res = ENC_CS_ISO8859_3; }
2472  else if(!strcmp(buf, "ISO-8859-4")) { res = ENC_CS_ISO8859_4; }
2473  else if(!strcmp(buf, "ISO-8859-5")) { res = ENC_CS_ISO8859_5; }
2474  else if(!strcmp(buf, "ISO-8859-6")) { res = ENC_CS_ISO8859_6; }
2475  else if(!strcmp(buf, "ISO-8859-7")) { res = ENC_CS_ISO8859_7; }
2476  else if(!strcmp(buf, "ISO-8859-8")) { res = ENC_CS_ISO8859_8; }
2477  else if(!strcmp(buf, "ISO-8859-9")) { res = ENC_CS_ISO8859_9; }
2478  else if(!strcmp(buf, "ISO-8859-10")) { res = ENC_CS_ISO8859_10; }
2479  else if(!strcmp(buf, "TIS-620")) { res = ENC_CS_ISO8859_11; }
2480  /* Note: The proposed draft for ISO 8859-12 was released as ISO 8859-14 */
2481  else if(!strcmp(buf, "ISO-8859-13")) { res = ENC_CS_ISO8859_13; }
2482  else if(!strcmp(buf, "ISO-8859-14")) { res = ENC_CS_ISO8859_14; }
2483  else if(!strcmp(buf, "ISO-8859-15")) { res = ENC_CS_ISO8859_15; }
2484  else if(!strcmp(buf, "ISO-8859-16")) { res = ENC_CS_ISO8859_16; }
2485  else if(!strcmp(buf, "WINDOWS-1250")) { res = ENC_CS_WINDOWS_1250; }
2486  else if(!strcmp(buf, "WINDOWS-1251")) { res = ENC_CS_WINDOWS_1251; }
2487  else if(!strcmp(buf, "WINDOWS-1252")) { res = ENC_CS_WINDOWS_1252; }
2488  else if(!strcmp(buf, "WINDOWS-1253")) { res = ENC_CS_WINDOWS_1253; }
2489  else if(!strcmp(buf, "WINDOWS-1254")) { res = ENC_CS_WINDOWS_1254; }
2490  else if(!strcmp(buf, "WINDOWS-1255")) { res = ENC_CS_WINDOWS_1255; }
2491  else if(!strcmp(buf, "WINDOWS-1256")) { res = ENC_CS_WINDOWS_1256; }
2492  else if(!strcmp(buf, "WINDOWS-1257")) { res = ENC_CS_WINDOWS_1257; }
2493  else if(!strcmp(buf, "WINDOWS-1258")) { res = ENC_CS_WINDOWS_1258; }
2494  else if(!strcmp(buf, "KOI8-R")) { res = ENC_CS_KOI8R; }
2495  else if(!strcmp(buf, "KOI8-U")) { res = ENC_CS_KOI8U; }
2496  else if(!strcmp(buf, "MACINTOSH")) { res = ENC_CS_MACINTOSH; }
2497  else if(!strcmp(buf, "IBM437")) { res = ENC_CS_IBM437; }
2498  else if(!strcmp(buf, "IBM775")) { res = ENC_CS_IBM775; }
2499  else if(!strcmp(buf, "IBM850")) { res = ENC_CS_IBM850; }
2500  else if(!strcmp(buf, "IBM852")) { res = ENC_CS_IBM852; }
2501  else if(!strcmp(buf, "IBM00858")) { res = ENC_CS_IBM858; }
2502  else if(!strcmp(buf, "ISO-2022-JP")) { res = ENC_CS_ISO2022_JP; }
2503 
2504  /* Check for official IANA aliases */
2505  /* US-ASCII */
2506  else if(!strcmp(buf, "ANSI_X3.4-1968")) { res = ENC_CS_ASCII; }
2507  else if(!strcmp(buf, "ANSI_X3.4-1986")) { res = ENC_CS_ASCII; }
2508  else if(!strcmp(buf, "ISO-IR-6")) { res = ENC_CS_ASCII; }
2509  else if(!strcmp(buf, "ISO_646.IRV:1991")) { res = ENC_CS_ASCII; }
2510  else if(!strcmp(buf, "ISO646-US")) { res = ENC_CS_ASCII; }
2511  else if(!strcmp(buf, "IBM367")) { res = ENC_CS_ASCII; }
2512  else if(!strcmp(buf, "CP367")) { res = ENC_CS_ASCII; }
2513  else if(!strcmp(buf, "CSASCII")) { res = ENC_CS_ASCII; }
2514  else if(!strcmp(buf, "US")) { res = ENC_CS_ASCII; }
2515  /* UTF-8 */
2516  else if(!strcmp(buf, "CSUTF-8")) { res = ENC_CS_UTF_8; }
2517  /* CESU-8 */
2518  else if(!strcmp(buf, "CSCESU8")) { res = ENC_CS_CESU_8; }
2519  else if(!strcmp(buf, "CSCESU-8")) { res = ENC_CS_CESU_8; }
2520  /* UTF-7 */
2521  else if(!strcmp(buf, "CSUTF-7")) { res = ENC_CS_UTF_7; }
2522 
2523  /* Check for official IANA aliases of ISO 8859 parts */
2524  /* ISO 8859-1 */
2525  else if(!strcmp(buf, "ISO_8859-1:1987")) { res = ENC_CS_ISO8859_1; }
2526  else if(!strcmp(buf, "ISO_8859-1")) { res = ENC_CS_ISO8859_1; }
2527  else if(!strcmp(buf, "ISO-IR-100")) { res = ENC_CS_ISO8859_1; }
2528  else if(!strcmp(buf, "IBM819")) { res = ENC_CS_ISO8859_1; }
2529  else if(!strcmp(buf, "CP819")) { res = ENC_CS_ISO8859_1; }
2530  else if(!strcmp(buf, "CSISOLATIN1")) { res = ENC_CS_ISO8859_1; }
2531  else if(!strcmp(buf, "LATIN1")) { res = ENC_CS_ISO8859_1; }
2532  else if(!strcmp(buf, "L1")) { res = ENC_CS_ISO8859_1; }
2533  /* ISO 8859-2 */
2534  else if(!strcmp(buf, "ISO_8859-2:1987")) { res = ENC_CS_ISO8859_2; }
2535  else if(!strcmp(buf, "ISO_8859-2")) { res = ENC_CS_ISO8859_2; }
2536  else if(!strcmp(buf, "ISO-IR-101")) { res = ENC_CS_ISO8859_2; }
2537  else if(!strcmp(buf, "CSISOLATIN2")) { res = ENC_CS_ISO8859_2; }
2538  else if(!strcmp(buf, "LATIN2")) { res = ENC_CS_ISO8859_2; }
2539  else if(!strcmp(buf, "L2")) { res = ENC_CS_ISO8859_2; }
2540  /* ISO 8859-3 */
2541  else if(!strcmp(buf, "ISO_8859-3:1988")) { res = ENC_CS_ISO8859_3; }
2542  else if(!strcmp(buf, "ISO_8859-3")) { res = ENC_CS_ISO8859_3; }
2543  else if(!strcmp(buf, "ISO-IR-109")) { res = ENC_CS_ISO8859_3; }
2544  else if(!strcmp(buf, "CSISOLATIN3")) { res = ENC_CS_ISO8859_3; }
2545  else if(!strcmp(buf, "LATIN3")) { res = ENC_CS_ISO8859_3; }
2546  else if(!strcmp(buf, "L3")) { res = ENC_CS_ISO8859_3; }
2547  /* ISO 8859-4 */
2548  else if(!strcmp(buf, "ISO_8859-4:1988")) { res = ENC_CS_ISO8859_4; }
2549  else if(!strcmp(buf, "ISO_8859-4")) { res = ENC_CS_ISO8859_4; }
2550  else if(!strcmp(buf, "ISO-IR-110")) { res = ENC_CS_ISO8859_4; }
2551  else if(!strcmp(buf, "CSISOLATIN4")) { res = ENC_CS_ISO8859_4; }
2552  else if(!strcmp(buf, "LATIN4")) { res = ENC_CS_ISO8859_4; }
2553  else if(!strcmp(buf, "L4")) { res = ENC_CS_ISO8859_4; }
2554  /* ISO 8859-5 */
2555  else if(!strcmp(buf, "ISO_8859-5:1988")) { res = ENC_CS_ISO8859_5; }
2556  else if(!strcmp(buf, "ISO_8859-5")) { res = ENC_CS_ISO8859_5; }
2557  else if(!strcmp(buf, "ISO-IR-144")) { res = ENC_CS_ISO8859_5; }
2558  else if(!strcmp(buf, "CSISOLATINCYRILLIC")) { res = ENC_CS_ISO8859_5; }
2559  else if(!strcmp(buf, "CYRILLIC")) { res = ENC_CS_ISO8859_5; }
2560  /* ISO 8859-6 */
2561  else if(!strcmp(buf, "ISO_8859-6:1987")) { res = ENC_CS_ISO8859_6; }
2562  else if(!strcmp(buf, "ISO_8859-6")) { res = ENC_CS_ISO8859_6; }
2563  else if(!strcmp(buf, "ISO-IR-127")) { res = ENC_CS_ISO8859_6; }
2564  else if(!strcmp(buf, "ECMA-114")) { res = ENC_CS_ISO8859_6; }
2565  else if(!strcmp(buf, "ASMO-708")) { res = ENC_CS_ISO8859_6; }
2566  else if(!strcmp(buf, "CSISOLATINARABIC")) { res = ENC_CS_ISO8859_6; }
2567  else if(!strcmp(buf, "ARABIC")) { res = ENC_CS_ISO8859_6; }
2568  /* ISO 8859-7 */
2569  else if(!strcmp(buf, "ISO_8859-7:1987")) { res = ENC_CS_ISO8859_7; }
2570  else if(!strcmp(buf, "ISO_8859-7")) { res = ENC_CS_ISO8859_7; }
2571  else if(!strcmp(buf, "ISO-IR-126")) { res = ENC_CS_ISO8859_7; }
2572  else if(!strcmp(buf, "ECMA-118")) { res = ENC_CS_ISO8859_7; }
2573  else if(!strcmp(buf, "ELOT_928")) { res = ENC_CS_ISO8859_7; }
2574  else if(!strcmp(buf, "CSISOLATINGREEK")) { res = ENC_CS_ISO8859_7; }
2575  else if(!strcmp(buf, "GREEK8")) { res = ENC_CS_ISO8859_7; }
2576  else if(!strcmp(buf, "GREEK")) { res = ENC_CS_ISO8859_7; }
2577  /* ISO 8859-8 */
2578  else if(!strcmp(buf, "ISO_8859-8:1988")) { res = ENC_CS_ISO8859_8; }
2579  else if(!strcmp(buf, "ISO_8859-8")) { res = ENC_CS_ISO8859_8; }
2580  else if(!strcmp(buf, "CSISOLATIN8")) { res = ENC_CS_ISO8859_8; }
2581  else if(!strcmp(buf, "LATIN8")) { res = ENC_CS_ISO8859_8; }
2582  else if(!strcmp(buf, "L8")) { res = ENC_CS_ISO8859_8; }
2583  else if(!strcmp(buf, "ISO-IR-138")) { res = ENC_CS_ISO8859_8; }
2584  else if(!strcmp(buf, "HEBREW")) { res = ENC_CS_ISO8859_8; }
2585  else if(!strcmp(buf, "CSISOLATINHEBREW")) { res = ENC_CS_ISO8859_8; }
2586  /* ISO 8859-9 */
2587  else if(!strcmp(buf, "ISO_8859-9:1989")) { res = ENC_CS_ISO8859_9; }
2588  else if(!strcmp(buf, "ISO_8859-9")) { res = ENC_CS_ISO8859_9; }
2589  else if(!strcmp(buf, "ISO-IR-148")) { res = ENC_CS_ISO8859_9; }
2590  else if(!strcmp(buf, "CSISOLATIN5")) { res = ENC_CS_ISO8859_9; }
2591  else if(!strcmp(buf, "LATIN5")) { res = ENC_CS_ISO8859_9; }
2592  else if(!strcmp(buf, "L5")) { res = ENC_CS_ISO8859_9; }
2593  /* ISO 8859-10 */
2594  else if(!strcmp(buf, "ISO_8859-10:1992")) { res = ENC_CS_ISO8859_10; }
2595  else if(!strcmp(buf, "ISO-IR-157")) { res = ENC_CS_ISO8859_10; }
2596  else if(!strcmp(buf, "CSISOLATIN6")) { res = ENC_CS_ISO8859_10; }
2597  else if(!strcmp(buf, "LATIN6")) { res = ENC_CS_ISO8859_10; }
2598  else if(!strcmp(buf, "L6")) { res = ENC_CS_ISO8859_10; }
2599  /* ISO 8859-11 */
2600  else if(!strcmp(buf, "ISO_8859-11")) { res = ENC_CS_ISO8859_11; }
2601  else if(!strcmp(buf, "CSTIS620")) { res = ENC_CS_ISO8859_11; }
2602  /* ISO 8859-13 */
2603  else if(!strcmp(buf, "CSISO885913")) { res = ENC_CS_ISO8859_13; }
2604  /* ISO 8859-14 */
2605  else if(!strcmp(buf, "ISO_8859-14:1998")) { res = ENC_CS_ISO8859_14; }
2606  else if(!strcmp(buf, "ISO_8859-14")) { res = ENC_CS_ISO8859_14; }
2607  else if(!strcmp(buf, "ISO-IR-199")) { res = ENC_CS_ISO8859_14; }
2608  else if(!strcmp(buf, "CSISO885914")) { res = ENC_CS_ISO8859_14; }
2609  else if(!strcmp(buf, "ISO-CELTIC")) { res = ENC_CS_ISO8859_14; }
2610  else if(!strcmp(buf, "LATIN8")) { res = ENC_CS_ISO8859_14; }
2611  else if(!strcmp(buf, "L8")) { res = ENC_CS_ISO8859_14; }
2612  /* ISO 8859-15 */
2613  else if(!strcmp(buf, "ISO_8859-15")) { res = ENC_CS_ISO8859_15; }
2614  else if(!strcmp(buf, "CSISO885915")) { res = ENC_CS_ISO8859_15; }
2615  else if(!strcmp(buf, "LATIN9")) { res = ENC_CS_ISO8859_15; }
2616  /* ISO 8859-16 */
2617  else if(!strcmp(buf, "ISO_8859-16:2001")) { res = ENC_CS_ISO8859_16; }
2618  else if(!strcmp(buf, "ISO_8859-16")) { res = ENC_CS_ISO8859_16; }
2619  else if(!strcmp(buf, "ISO-IR-226")) { res = ENC_CS_ISO8859_16; }
2620  else if(!strcmp(buf, "CSISO885916")) { res = ENC_CS_ISO8859_16; }
2621  else if(!strcmp(buf, "LATIN10")) { res = ENC_CS_ISO8859_16; }
2622  else if(!strcmp(buf, "L10")) { res = ENC_CS_ISO8859_16; }
2623 
2624  /* Check for official IANA aliases of Windows codepages */
2625  /* Windows-1250 */
2626  else if(!strcmp(buf, "CSWINDOWS1250")) { res = ENC_CS_WINDOWS_1250; }
2627  /* Windows-1251 */
2628  else if(!strcmp(buf, "CSWINDOWS1251")) { res = ENC_CS_WINDOWS_1251; }
2629  /* Windows-1252 */
2630  else if(!strcmp(buf, "CSWINDOWS1252")) { res = ENC_CS_WINDOWS_1252; }
2631  /* Windows-1253 */
2632  else if(!strcmp(buf, "CSWINDOWS1253")) { res = ENC_CS_WINDOWS_1253; }
2633  /* Windows-1254 */
2634  else if(!strcmp(buf, "CSWINDOWS1254")) { res = ENC_CS_WINDOWS_1254; }
2635  /* Windows-1255 */
2636  else if(!strcmp(buf, "CSWINDOWS1255")) { res = ENC_CS_WINDOWS_1255; }
2637  /* Windows-1256 */
2638  else if(!strcmp(buf, "CSWINDOWS1256")) { res = ENC_CS_WINDOWS_1256; }
2639  /* Windows-1257 */
2640  else if(!strcmp(buf, "CSWINDOWS1257")) { res = ENC_CS_WINDOWS_1257; }
2641  /* Windows-1258 */
2642  else if(!strcmp(buf, "CSWINDOWS1258")) { res = ENC_CS_WINDOWS_1258; }
2643 
2644  /* Check for official IANA aliases of KOI8 codepages */
2645  else if(!strcmp(buf, "CSKOI8R")) { res = ENC_CS_KOI8R; }
2646  else if(!strcmp(buf, "CSKOI8U")) { res = ENC_CS_KOI8U; }
2647 
2648  /* Check for official IANA aliases of Macintosh */
2649  else if(!strcmp(buf, "CSMACINTOSH")) { res = ENC_CS_MACINTOSH; }
2650  else if(!strcmp(buf, "MAC")) { res = ENC_CS_MACINTOSH; }
2651 
2652  /* Check for official IANA aliases of IBM codepages */
2653  /* IBM437 */
2654  else if(!strcmp(buf, "CSPC8CODEPAGE437")) { res = ENC_CS_IBM437; }
2655  else if(!strcmp(buf, "CP437")) { res = ENC_CS_IBM437; }
2656  else if(!strcmp(buf, "437")) { res = ENC_CS_IBM437; }
2657  /* IBM775 */
2658  else if(!strcmp(buf, "CSPC775BALTIC")) { res = ENC_CS_IBM775; }
2659  else if(!strcmp(buf, "CP775")) { res = ENC_CS_IBM775; }
2660  /* IBM850 */
2661  else if(!strcmp(buf, "CSPC850MULTILINGUAL")) { res = ENC_CS_IBM850; }
2662  else if(!strcmp(buf, "CP850")) { res = ENC_CS_IBM850; }
2663  else if(!strcmp(buf, "850")) { res = ENC_CS_IBM850; }
2664  /* IBM852 */
2665  else if(!strcmp(buf, "CSPCP852")) { res = ENC_CS_IBM852; }
2666  else if(!strcmp(buf, "CP852")) { res = ENC_CS_IBM852; }
2667  else if(!strcmp(buf, "852")) { res = ENC_CS_IBM852; }
2668  /* IBM00858 */
2669  else if(!strcmp(buf, "PC-MULTILINGUAL-850+EURO"))
2670  { res = ENC_CS_IBM858; }
2671  else if(!strcmp(buf, "CSIBM00858")) { res = ENC_CS_IBM858; }
2672  else if(!strcmp(buf, "CCSID00858")) { res = ENC_CS_IBM858; }
2673  else if(!strcmp(buf, "CP00858")) { res = ENC_CS_IBM858; }
2674  /* ISO 2022-JP */
2675  else if(!strcmp(buf, "CSISO2022JP")) { res = ENC_CS_ISO2022_JP; }
2676 
2677  /* -------------------------------------------------------------------- */
2678  /* To be more tolerant: Check again for invalid ISO 8859 declarations */
2679  else if(!strcmp(buf, "ISO8859-1"))
2680  { isoinv = 1; res = ENC_CS_ISO8859_1; }
2681  else if(!strcmp(buf, "ISO8859-2"))
2682  { isoinv = 1; res = ENC_CS_ISO8859_2; }
2683  else if(!strcmp(buf, "ISO8859-3"))
2684  { isoinv = 1; res = ENC_CS_ISO8859_3; }
2685  else if(!strcmp(buf, "ISO8859-4"))
2686  { isoinv = 1; res = ENC_CS_ISO8859_4; }
2687  else if(!strcmp(buf, "ISO8859-5"))
2688  { isoinv = 1; res = ENC_CS_ISO8859_5; }
2689  else if(!strcmp(buf, "ISO8859-6"))
2690  { isoinv = 1; res = ENC_CS_ISO8859_6; }
2691  else if(!strcmp(buf, "ISO8859-7"))
2692  { isoinv = 1; res = ENC_CS_ISO8859_7; }
2693  else if(!strcmp(buf, "ISO8859-8"))
2694  { isoinv = 1; res = ENC_CS_ISO8859_8; }
2695  else if(!strcmp(buf, "ISO8859-9"))
2696  { isoinv = 1; res = ENC_CS_ISO8859_9; }
2697  else if(!strcmp(buf, "ISO8859-10"))
2698  { isoinv = 1; res = ENC_CS_ISO8859_10; }
2699  else if(!strcmp(buf, "ISO-8859-11"))
2700  { isoinv = 1; res = ENC_CS_ISO8859_11; }
2701  else if(!strcmp(buf, "ISO8859-11"))
2702  { isoinv = 1; res = ENC_CS_ISO8859_11; }
2703  else if(!strcmp(buf, "ISO8859-13"))
2704  { isoinv = 1; res = ENC_CS_ISO8859_13; }
2705  else if(!strcmp(buf, "ISO8859-14"))
2706  { isoinv = 1; res = ENC_CS_ISO8859_14; }
2707  else if(!strcmp(buf, "ISO8859-15"))
2708  { isoinv = 1; res = ENC_CS_ISO8859_15; }
2709  else if(!strcmp(buf, "ISO8859-16"))
2710  { isoinv = 1; res = ENC_CS_ISO8859_16; }
2711 
2712  /* To be more tolerant: Check again for invalid MACINTOSH declarations */
2713  else if(!strcmp(buf, "MAC")) { macinv = 1; res = ENC_CS_MACINTOSH; }
2714  else if(!strcmp(buf, "MACROMAN"))
2715  { macinv = 1; res = ENC_CS_MACINTOSH; }
2716  else if(!strcmp(buf, "X-MAC-ROMAN"))
2717  { macinv = 1; res = ENC_CS_MACINTOSH; }
2718 
2719  /* To be more tolerant: Check again for invalid IBM declarations */
2720  else if(!strcmp(buf, "CP-437")) { ibminv = 1; res = ENC_CS_IBM437; }
2721  else if(!strcmp(buf, "IBM858")) { ibminv = 1; res = ENC_CS_IBM858; }
2722  else if(!strcmp(buf, "CP858")) { ibminv = 1; res = ENC_CS_IBM858; }
2723  else if(!strcmp(buf, "CP1250"))
2724  { ibminv = 1; res = ENC_CS_WINDOWS_1250; }
2725  else if(!strcmp(buf, "CP1251"))
2726  { ibminv = 1; res = ENC_CS_WINDOWS_1251; }
2727  else if(!strcmp(buf, "CP1252"))
2728  { ibminv = 1; res = ENC_CS_WINDOWS_1252; }
2729  else if(!strcmp(buf, "CP1253"))
2730  { ibminv = 1; res = ENC_CS_WINDOWS_1253; }
2731  else if(!strcmp(buf, "CP1254"))
2732  { ibminv = 1; res = ENC_CS_WINDOWS_1254; }
2733  else if(!strcmp(buf, "CP1255"))
2734  { ibminv = 1; res = ENC_CS_WINDOWS_1255; }
2735  else if(!strcmp(buf, "CP1256"))
2736  { ibminv = 1; res = ENC_CS_WINDOWS_1256; }
2737  else if(!strcmp(buf, "CP1257"))
2738  { ibminv = 1; res = ENC_CS_WINDOWS_1257; }
2739  else if(!strcmp(buf, "CP1258"))
2740  { ibminv = 1; res = ENC_CS_WINDOWS_1258; }
2741 
2742  /* To be more tolerant: Check again for invalid UTF declarations */
2743  else if(!strcmp(buf, "UTF7"))
2744  {
2745  PRINT_ERROR("MIME: Invalid character set UTF7 accepted as UTF-7");
2746  res = ENC_CS_UTF_7;
2747  }
2748  else if(!strcmp(buf, "UTF8"))
2749  {
2750  PRINT_ERROR("MIME: Invalid character set UTF8 accepted as UTF-8");
2751  res = ENC_CS_UTF_8;
2752  }
2753  /* -------------------------------------------------------------------- */
2754 
2755  /* Check whether character set is supported */
2756  if(ENC_CS_UNKNOWN == res)
2757  {
2758  l = strlen(MAIN_ERR_PREFIX) + strlen(not_supported) + len;
2759  p = (char*) posix_malloc(++l);
2760  if(NULL != p)
2761  {
2762  /* No => Character set not supported */
2763  strcpy(p, MAIN_ERR_PREFIX);
2764  strcat(p, not_supported);
2765  strncat(p, buf, len);
2766  print_error(p);
2767  posix_free((void*) p);
2768  }
2769  /* Special check for ISO 8859-x character sets that aren't supported */
2770  buf[8] = 0;
2771  if(!strcmp(buf, "ISO-8859")) { res = ENC_CS_ISO8859_X; }
2772  /* To be more tolerant: Check again for invalid ISO 8859 declaration */
2773  if(!strcmp(buf, "ISO8859")) { isoinv = 1; res = ENC_CS_ISO8859_X; }
2774  }
2775  if(isoinv)
2776  {
2777  PRINT_ERROR("MIME: Invalid ISO 8859 character set accepted");
2778  }
2779  else if(macinv)
2780  {
2781  PRINT_ERROR("MIME: Invalid Macintosh character set accepted");
2782  }
2783  else if(ibminv)
2784  {
2785  PRINT_ERROR("MIME: Invalid IBM codepage accepted");
2786  }
2787  }
2788 
2789  return(res);
2790 }
2791 
2792 
2793 /* ========================================================================== */
2794 /* Decode MIME quoted printable data
2795  *
2796  * \param[in] start Pointer to start of data
2797  * \param[in] end Pointer to end of data
2798  * \param[in] ec Flag to switch between normal and encoded-word syntax
2799  * \param[out] dlen Pointer to length of decoded data
2800  *
2801  * \e end must never be smaller than \e start and must point to the location
2802  * after the last character.
2803  *
2804  * \attention
2805  * If \e ec is true, the syntax is switched to that for encoded-words according
2806  * to RFC 2047.
2807  *
2808  * According to RFC 2045 the following rules are applied:
2809  * - Whitespace at end of lines must be ignored => We do so.
2810  * - A robust decoder may exclude invalid input data and continue
2811  * => We do so by decoding all invalid characters as '?'.
2812  * - The hexadecimal representation of a character must use upper case letters
2813  * A decoder is allowed to accept lower case letters too => We do so.
2814  * - If an invalid sequence follows a '=' character, it is allowed to accept
2815  * this data as plain ASCII => We do so.
2816  * - Lines are not allowed to be longer than 76 characters, but it is allowed
2817  * that a decoder accept lines of arbitrary length => We do so.
2818  *
2819  * \note
2820  * A NUL termination is always appended to the decoded data (but not calculated
2821  * for \e dlen ). This means that if the decoded data is text, the result buffer
2822  * can be directly used as C string.
2823  *
2824  * \result
2825  * - Pointer to new memory block containing the decoded data
2826  * - NULL on error (Value at location \e dlen is not valid)
2827  */
2828 
2829 static char* enc_mime_decode_qp(const char* start, const char* end,
2830  int ec, size_t* dlen)
2831 {
2832  char* res = NULL;
2833  size_t len;
2834  size_t bi = 0;
2835  size_t i;
2836  char* src = NULL;
2837  size_t ws = POSIX_SIZE_MAX;
2838  char* tmp = NULL;
2839  char* p;
2840  char current;
2841  unsigned char c;
2842  int state = 0;
2843  char nibble_high = 0;
2844  int v;
2845  int invalid;
2846 
2847  /* Delete whitespace at end of lines */
2848  len = (size_t) (end - start);
2849  p = posix_malloc(len + (size_t) 1);
2850  if(NULL == p) { return(NULL); }
2851  else
2852  {
2853  src = p;
2854  for(i = 0; i < len; ++i)
2855  {
2856  /* Check for EOL */
2857  if((char) 0x0A == start[i] && i)
2858  {
2859  if((char) 0x0D == start[i - (size_t) 1] && POSIX_SIZE_MAX != ws)
2860  {
2861  /* Seek back to remove whitespace */
2862  bi = ws;
2863  src[bi++] = 0x0D;
2864  }
2865  }
2866  /* Check for whitespace */
2867  if((char) 0x09 == start[i] || (char) 0x20 == start[i])
2868  {
2869  if(POSIX_SIZE_MAX == ws) { ws = bi; }
2870  }
2871  else if((char) 0x0D != start[i]) { ws = POSIX_SIZE_MAX; }
2872  src[bi++] = start[i];
2873  }
2874  /* Terminate string in source buffer */
2875  src[bi] = 0;
2876  /* Reassign start and end pointers */
2877  start = src;
2878  end = &src[bi];
2879  }
2880 
2881  /* Decode data */
2882  len = 0;
2883  bi = 0;
2884  for(i = 0; i < (size_t) (end - start); ++i)
2885  {
2886  /* Allocate more memory in exponentially increasing chunks */
2887  /* Attention: An invalid QP sequence stays undecoded and 3 octets long! */
2888  if(bi + (size_t) 4 >= len) /* We need (3 + NUL) additional bytes */
2889  {
2890  if(!len) { len = 64; }
2891  p = posix_realloc((void*) tmp, len *= (size_t) 2);
2892  if(NULL == p)
2893  {
2894  posix_free((void*) tmp);
2895  tmp = NULL;
2896  break;
2897  }
2898  else { tmp = p; }
2899  }
2900  /* Parse current character */
2901  current = start[i];
2902  /* Only printable ASCII characters, SPACE, HT, LF and CR are allowed */
2903  invalid = 0;
2904  v = (int) current;
2905  if(!((9 <= v && 10 >= v) || 13 == v || (32 <= v && 126 >= v)))
2906  {
2907  invalid = 1;
2908  }
2909  /* SPACE and HT are not allowed in encoded-words */
2910  if(ec && !invalid && (9 == v || 32 == v)) { invalid = 1; }
2911  if(invalid)
2912  {
2913  /* Invalid character detected */
2914  PRINT_ERROR("MIME: Decoding invalid quoted printable data");
2915  current = '?';
2916  }
2917  /* Equal sign sequence decoder state machine */
2918  c = 0;
2919  if(!state && '=' == current) { ++state; }
2920  switch(state)
2921  {
2922  case 1:
2923  {
2924  /* Skip equal sign */
2925  ++state;
2926  break;
2927  }
2928  case 2:
2929  {
2930  /* Store CR of soft line break or high nibble of encoded octet */
2931  nibble_high = current;
2932  ++state;
2933  break;
2934  }
2935  case 3:
2936  {
2937  /* SPACE and HT at end of line must be ignored */
2938  if( !ec &&
2939  ((char) 0x09 == nibble_high || (char) 0x20 == nibble_high) )
2940  {
2941  if((char) 0x09 == current || (char) 0x20 == current) { break; }
2942  else if((char) 0x0D == current)
2943  {
2944  nibble_high = current;
2945  break;
2946  }
2947  }
2948  ++state;
2949  /* No break here is intended! */
2950  }
2951  case 4:
2952  {
2953  state = 0;
2954  /* Check for soft line break */
2955  if(!ec && (char) 0x0D == nibble_high && (char) 0x0A == current)
2956  {
2957  /* printf("Soft line break\n"); */
2958  break;
2959  }
2960  /* Decode octet */
2961  invalid = 0;
2962  v = enc_hex_decode_nibble(nibble_high);
2963  if(0 > v) { invalid = 1; }
2964  else
2965  {
2966  c = (unsigned char) (v * 16);
2967  v = enc_hex_decode_nibble(current);
2968  if(0 > v) { invalid = 1; }
2969  else { c += (unsigned char) v; }
2970  }
2971  if(invalid)
2972  {
2973  /* Invalid encoding => Accept data as ASCII */
2974  PRINT_ERROR("MIME: Invalid quoted printable encoded data");
2975  tmp[bi++] = '=';
2976  tmp[bi++] = nibble_high;
2977  c = (unsigned char) current;
2978  }
2979  break;
2980  }
2981  default:
2982  {
2983  /* Decode underscore to space */
2984  if(ec && '_' == current) { c = (unsigned char) 0x20; }
2985  else { c = (unsigned char) current; }
2986  break;
2987  }
2988  }
2989  if(c) { tmp[bi++] = (char) c; }
2990  }
2991 
2992  /* Terminate decoded data (for use as C string) */
2993  if(NULL == tmp) { res = NULL; }
2994  else
2995  {
2996  tmp[bi] = 0;
2997  /* Report length without the NUL termination */
2998  *dlen = bi;
2999  res = tmp;
3000  }
3001  posix_free((void*) src);
3002 
3003  return(res);
3004 }
3005 
3006 
3007 /* ========================================================================== */
3008 /* Convert MIME quoted printable encoded text to Unicode (UTF-8 NFC)
3009  *
3010  * \param[in] charset Character set of data
3011  * \param[in] start Pointer to start of data
3012  * \param[in] end Pointer to end of data
3013  * \param[in] ec Flag to switch between normal and encoded-word syntax
3014  *
3015  * \e end must never be smaller than \e start and must point to the location
3016  * after the last character.
3017  *
3018  * \attention
3019  * If \e ec is true, the syntax is switched to that for encoded-words according
3020  * to RFC 2047.
3021  *
3022  * \result
3023  * - Pointer to new memory block containing the decoded data
3024  * - NULL on error
3025  */
3026 
3027 static const char* enc_mime_decode_q(enum enc_mime_cs charset,
3028  const char* start, const char* end,
3029  int ec)
3030 {
3031  const char* res = NULL;
3032  size_t len;
3033  char* tmp = NULL;
3034 
3035  tmp = enc_mime_decode_qp(start, end, ec, &len);
3036  if(NULL != tmp)
3037  {
3038  /* Convert result to Unicode and normalize to NFC */
3039  res = enc_convert_to_utf8_nfc(charset, tmp);
3040  if(tmp != res) { posix_free((void*) tmp); }
3041  }
3042 
3043  return(res);
3044 }
3045 
3046 
3047 /* ========================================================================== */
3048 /* Decode MIME base64 data
3049  *
3050  * \param[in] start Pointer to start of data
3051  * \param[in] end Pointer to end of data
3052  * \param[in] np Flag to request no padding
3053  * \param[out] dlen Pointer to length of decoded data
3054  *
3055  * \e end must never be smaller than \e start and must point to the location
3056  * after the last character.
3057  *
3058  * If \e np is true, the syntax is switched to that for base64 inside of UTF-7
3059  * according to RFC 2152.
3060  *
3061  * \note
3062  * A NUL termination is always appended to the decoded data (but not calculated
3063  * for \e dlen ). This means that if the decoded data is text, the result buffer
3064  * can be directly used as C string.
3065  *
3066  * \result
3067  * - Pointer to new memory block containing the decoded data
3068  * - NULL on error (Value at location \e dlen is not valid)
3069  */
3070 
3071 static char* enc_mime_decode_base64(const char* start, const char* end,
3072  int np, size_t* dlen)
3073 {
3074  char* res = NULL;
3075  size_t len = 0;
3076  size_t bi = 0;
3077  size_t i; /* Index in input data */
3078  char* tmp = NULL;
3079  char* p;
3080  int valid = 0;
3081  int state = 0;
3082  unsigned long int ii = 0; /* Index in base64 alphabet */
3083  unsigned long int input = 0;
3084  int padding = 0;
3085  int abort = 0;
3086 
3087  for(i = 0; i < (size_t) (end - start); ++i)
3088  {
3089  /* Allocate more memory in exponentially increasing chunks */
3090  if(bi + (size_t) 4 >= len) /* We need (3 + NUL) additional bytes */
3091  {
3092  if(!len) { len = 64; }
3093  p = posix_realloc((void*) tmp, len *= (size_t) 2);
3094  if(NULL == p)
3095  {
3096  PRINT_ERROR("MIME: Out of memory while decoding base64 data");
3097  abort = 1;
3098  break;
3099  }
3100  else { tmp = p; }
3101  }
3102  valid = 0;
3103  /* Check for padding */
3104  if('=' == start[i])
3105  {
3106  ii = 0;
3107  if(0 == state || 1 == state)
3108  {
3109  /* Invalid padding character detected => Abort */
3110  PRINT_ERROR("MIME: Invalid padding in base64 encoded data");
3111  abort = 1;
3112  break;
3113  }
3114  else { valid = 1; }
3115  }
3116  /* Check for valid encoding */
3117  else
3118  {
3119  if(' ' == start[i] || 0x09 == start[i]
3120  || 0x0A == start[i] || 0x0D == start[i])
3121  {
3122  /* Accept and ignore whitespace and line breaks */
3123  continue;
3124  }
3125  for(ii = 0; ii < 64UL; ++ii)
3126  {
3127  if(enc_base64[ii] == start[i])
3128  {
3129  valid = 1;
3130  break;
3131  }
3132  }
3133  }
3134  if(!valid)
3135  {
3136  /* Invalid character detected => Abort */
3137  PRINT_ERROR("MIME: Invalid character in base64 encoded data");
3138  abort = 1;
3139  break;
3140  }
3141  /* Parse current character */
3142  switch(state)
3143  {
3144  case 0:
3145  {
3146  /* Get first input character */
3147  input = ii << 18;
3148  padding = 0;
3149  ++state;
3150  break;
3151  }
3152  case 1:
3153  {
3154  /* Get second input character */
3155  input |= ii << 12;
3156  ++state;
3157  break;
3158  }
3159  case 2:
3160  {
3161  /* Get third input character */
3162  if('=' == start[i]) { padding = 2; }
3163  else { input |= ii << 6; }
3164  ++state;
3165  break;
3166  }
3167  case 3:
3168  {
3169  /* Get fourth input character */
3170  if('=' == start[i])
3171  {
3172  if(!padding) { padding = 1; }
3173  }
3174  else if(padding)
3175  {
3176  PRINT_ERROR("MIME: Invalid padding in base64 encoded data");
3177  abort = 1;
3178  break;
3179  }
3180  else { input |= ii; }
3181  /* Decode 3 output octet from 4 input characters */
3182  tmp[bi++] = (char) (unsigned char) ((input >> 16) & 0xFFUL);
3183  if (2 > padding)
3184  {
3185  tmp[bi++] = (char) (unsigned char) ((input >> 8) & 0xFFUL);
3186  if (1 > padding)
3187  {
3188  tmp[bi++] = (char) (unsigned char) (input & 0xFFUL);
3189  }
3190  }
3191  state = 0;
3192  break;
3193  }
3194  default:
3195  {
3196  PRINT_ERROR("MIME: base64 state machine failed (Bug)");
3197  abort = 1;
3198  break;
3199  }
3200  }
3201  if(abort) { break; }
3202  if(padding && 0 == state) { break; }
3203  }
3204  if(abort)
3205  {
3206  posix_free((void*) tmp);
3207  tmp = NULL;
3208  }
3209 
3210  /* Check for unpadded format */
3211  if(!abort && np && state)
3212  {
3213  switch(state)
3214  {
3215  case 2:
3216  {
3217  /* One character left */
3218  if(input & 0x000FFFUL) { abort = 1; }
3219  else
3220  {
3221  tmp[bi++] = (char) (unsigned char) ((input >> 16) & 0xFFUL);
3222  }
3223  break;
3224  }
3225  case 3:
3226  {
3227  /* Two characters left */
3228  if(input & 0x00003FUL) { abort = 1; }
3229  else
3230  {
3231  tmp[bi++] = (char) (unsigned char) ((input >> 16) & 0xFFUL);
3232  tmp[bi++] = (char) (unsigned char) ((input >> 8) & 0xFFUL);
3233  }
3234  break;
3235  }
3236  default:
3237  {
3238  abort = 1;
3239  break;
3240  }
3241  }
3242  if(abort)
3243  {
3244  PRINT_ERROR("MIME: Invalid end of unpadded base64 data");
3245  }
3246  }
3247 
3248  /* Terminate decoded data (for use as C string) */
3249  if(NULL == tmp) { res = NULL; }
3250  else
3251  {
3252  tmp[bi] = 0;
3253  /* Report length without the NUL termination */
3254  *dlen = bi;
3255 #if 0
3256  /* For debugging */
3257  printf("Length of base64 decoded data: %u\n", (unsigned int) *dlen);
3258 #endif
3259  res = tmp;
3260  }
3261 
3262  return(res);
3263 }
3264 
3265 
3266 /* ========================================================================== */
3267 /* Convert MIME base64 encoded text to Unicode (UTF-8)
3268  *
3269  * \param[in] charset Character set of data
3270  * \param[in] start Pointer to start of data
3271  * \param[in] end Pointer to end of data
3272  * \param[in] np Flag to request no padding
3273  *
3274  * \e end must never be smaller than \e start and must point to the location
3275  * after the last character.
3276  *
3277  * \attention
3278  * If \e np is true, the syntax is switched to that for base64 inside of UTF-7
3279  * according to RFC 2152. The resulting Unicode data is not normalized because
3280  * an UTF-7 shift sequence may start with a non-starter codepoint and represent
3281  * a "Defective combining character sequence" if processed without context.
3282  * The UTF-7 converter must normalize the data together with the context.
3283  *
3284  * \result
3285  * - Pointer to new memory block containing the decoded data
3286  * - NULL on error
3287  */
3288 
3289 static const char* enc_mime_decode_b(enum enc_mime_cs charset,
3290  const char* start, const char* end,
3291  int np)
3292 {
3293  const char* res = NULL;
3294  size_t len = 0;
3295  char* tmp = NULL;
3296  const char* tmp2 = NULL;
3297 
3298  tmp = enc_mime_decode_base64(start, end, np, &len);
3299  if(NULL != tmp)
3300  {
3301  tmp2 = tmp;
3302  /* Convert UTF-16BE with separate function because data may contain NUL */
3303  if(ENC_CS_UTF_16BE == charset)
3304  {
3305  tmp2 = enc_uc_convert_utf16be_to_utf8(tmp, len);
3306  if(NULL == tmp2)
3307  {
3308  /* Error => Return empty string */
3309  tmp[0] = 0;
3310  tmp2 = tmp;
3311  }
3312  else { posix_free((void*) tmp); }
3313  /* Do not normalize to NFC yet (maybe context is required) */
3314  res = tmp2;
3315  }
3316  else
3317  {
3318  /* Convert result to Unicode and normalize to NFC */
3319  res = enc_convert_to_utf8_nfc(charset, tmp2);
3320  }
3321  if(tmp2 != res) { posix_free((void*) tmp2); }
3322  }
3323 
3324  return(res);
3325 }
3326 
3327 
3328 /* ========================================================================== */
/* Convert from CESU-8 to UTF-8
 *
 * \param[in] s  String to convert
 *
 * Surrogate pairs (encoded as two 3 byte sequences in CESU-8) are combined
 * to the codepoint they represent. All other codepoints are copied.
 * A surrogate without matching partner is replaced with U+FFFD. This covers
 * a low surrogate without preceding high surrogate and a high surrogate that
 * is not directly followed by a low surrogate (including one at the end of
 * the data).
 *
 * \todo Data with invalid UTF-8 sequences is rejected. Should be repaired.
 *
 * \return
 * - Pointer to result (a new memory block is allocated)
 * - NULL on error
 */

static const char*  enc_uc_convert_cesu8_to_utf8(const char*  s)
{
   const char*  res = NULL;
   char*  buf;
   size_t  len = strlen(s);
   size_t  i = 0;
   size_t  bi = 0;
   long int  mbc;
   long int  hs = 0;  /* Pending high surrogate (zero if there is none) */
   long int  ucp;
   size_t  num;
   int  is_high;
   int  is_low;

   /* Reject data that is rejected by the CESU-8 checker */
   if(0 != enc_uc_check_cesu8(s, 0))  { return(NULL); }

   /*
    * Allocate the result buffer for same size as the original data.
    * This is always sufficient: A surrogate pair shrinks from 6 to 4 bytes
    * and the replacement character has the same size as a single surrogate.
    */
   buf = (char*) posix_malloc(len + (size_t) 1);
   if(NULL == buf)  { return(NULL); }
   res = buf;

   while(len > i)
   {
      /* Decode multibyte character */
      mbc = enc_uc_decode_utf8(s, &i);
      if(-1L == mbc)
      {
         /* Release with the deallocator matching posix_malloc() */
         posix_free((void*) buf);
         res = NULL;
         break;
      }
      is_high = (0x00D800L <= mbc && 0x00DBFFL >= mbc);
      is_low = (0x00DC00L <= mbc && 0x00DFFFL >= mbc);

      /* A pending high surrogate must be followed by a low surrogate */
      if(hs && !is_low)
      {
         PRINT_ERROR("Invalid high surrogate in CESU-8 data");
         /* Replacement character has same size in UTF-8 */
         ucp = 0x00FFFDL;
         num = 1;
         enc_uc_encode_utf8(buf, &bi, &ucp, &num);
         hs = 0;
      }

      if(is_high)
      {
         /* Wait for the low surrogate */
         hs = mbc;
         continue;
      }
      else if(is_low)
      {
         if(!hs)
         {
            PRINT_ERROR("Invalid low surrogate in CESU-8 data");
            /* Replacement character has same size in UTF-8 */
            ucp = 0x00FFFDL;
         }
         else
         {
            /* Calculate Unicode codepoint */
            ucp = (hs & 0x0003FFL) << 10;
            ucp |= (mbc & 0x0003FFL);
            ucp += 0x010000L;  /* Add size of BMP */
            hs = 0;
         }
      }
      else  { ucp = mbc; }
      num = 1;
      enc_uc_encode_utf8(buf, &bi, &ucp, &num);
   }

   if(NULL != res)
   {
      /* High surrogate at end of data has no partner */
      if(hs)
      {
         PRINT_ERROR("Invalid high surrogate in CESU-8 data");
         ucp = 0x00FFFDL;
         num = 1;
         enc_uc_encode_utf8(buf, &bi, &ucp, &num);
      }
      /* Terminate result string */
      buf[bi++] = 0;
   }

   return(res);
}
3425 
3426 
3427 /* ========================================================================== */
3428 /* Convert from UTF-7 to UTF-8 NFC
3429  *
3430  * \param[in] s String to convert
3431  *
3432  * \return
3433  * - Pointer to result (if not equal to \e s , a new memory block was allocated)
3434  * - NULL on error
3435  */
3436 
3437 static const char* enc_uc_convert_utf7_to_utf8(const char* s)
3438 {
3439  const char* res = NULL;
3440  size_t i = 0;
3441  int shift = 0; /* State of decoder */
3442  char* buf;
3443  size_t bi = 0;
3444  size_t len = strlen(s);
3445  int c;
3446  size_t ii;
3447  const char* start = NULL;
3448  const char* end = NULL;
3449  const char* utf8;
3450  size_t utf8_len;
3451  int eod = 0; /* Flag indicating end of (input) data */
3452 
3453  /*
3454  * Allocate the result buffer twice the size as the original data.
3455  * This is always sufficient.
3456  *
3457  * The same buffer size may not be sufficient even if the base64 decoding
3458  * always reduce the size of the data. The reason is the UTF-16BE
3459  * representation of the UTF-7 data that we want to convert to UTF-8.
3460  * The UTF-8 representation of Unicode can be larger than UTF-16!
3461  * This is the case for the codepoints that require 3 bytes in UTF-8
3462  * and therefore an enlargement factor of 1.5 must be calculated.
3463  */
3464  buf = (char*) posix_malloc(++len * (size_t) 2);
3465  if(NULL != buf)
3466  {
3467  while(s[i])
3468  {
3469  /* Check current state */
3470  if(!shift)
3471  {
3472  /* Check for shift */
3473  if('+' == s[i])
3474  {
3475  if('-' == s[i + (size_t) 1])
3476  {
3477  /* Literal '+' detected */
3478  ++i;
3479  buf[bi++] = '+';
3480  }
3481  else
3482  {
3483  start = &s[i + (size_t) 1];
3484  shift = 1;
3485  }
3486  }
3487  else
3488  {
3489  c = (int) s[i];
3490  /* Check for control characters (Rule 3) */
3491  if(0x09 == c || 0x0A == c || 0x0D == c|| 0x20 == c)
3492  {
3493  buf[bi++] = s[i];
3494  }
3495  /* Check for character set D (Rule 1) */
3496  else if( (0x30 <= c && 0x39 >= c)
3497  || (0x41 <= c && 0x5A >= c) || (0x61 <= c && 0x7A >= c)
3498  || (0x27 <= c && 0x29 >= c) || (0x2C <= c && 0x2F >= c)
3499  || 0x3A == c || 0x3F == c )
3500  {
3501  buf[bi++] = s[i];
3502  }
3503  /* Check for character set O (Rule 1) */
3504  else if( (0x21 <= c && 0x26 >= c) || 0x2A == c
3505  || (0x3B <= c && 0x3E >= c)
3506  || 0x40 == c || 0x5B == c
3507  || (0x5D <= c && 0x60 >= c) || (0x7B <= c && 0x7D >= c) )
3508  {
3509  buf[bi++] = s[i];
3510  }
3511  else
3512  {
3513  PRINT_ERROR("Invalid character in UTF-7 data ignored");
3514  }
3515  }
3516  }
3517  else
3518  {
3519  /* Check for character set B (Rule 2) */
3520  c = 0;
3521  for(ii = 0; ii < (size_t) 64; ++ii)
3522  {
3523  if(enc_base64[ii] == s[i]) { c = (int) s[i]; break; }
3524  }
3525  /* Treat end of data like a termination character */
3526  if(!s[i + (size_t) 1]) { eod = 1; }
3527  if(!c || eod)
3528  {
3529  /* Attention: eod must have lower precedence if both are true */
3530  if(!c) { end = &s[i]; }
3531  else { end = &s[i + (size_t) 1]; }
3532  /* Decode Unicode data as UTF-16BE */
3533  utf8 = enc_mime_decode_b(ENC_CS_UTF_16BE, start, end, 1);
3534  if(NULL == utf8) { buf[bi++] = '?'; }
3535  else
3536  {
3537  utf8_len = strlen(utf8);
3538  for(ii = 0; ii < utf8_len; ++ii) { buf[bi++] = utf8[ii]; }
3539  posix_free((void*) utf8);
3540  }
3541  /* Preserve any terminating character except '-' */
3542  if(!c && '-' != s[i]) { --i; }
3543  /* Terminate shift state */
3544  shift = 0;
3545  }
3546  }
3547  ++i;
3548  }
3549  /* Terminate result string */
3550  buf[bi] = 0;
3551  res = buf;
3552  }
3553 
3554  /* Check for error */
3555  if(shift)
3556  {
3557  PRINT_ERROR("UTF-7 converter terminated in shift state (bug)");
3558  }
3559 
3560  return(res);
3561 }
3562 
3563 
3564 /* ========================================================================== */
/* Check for leap year in terms of gregorian calendar
 *
 * \param[in] year  Year to check
 *
 * A year is a leap year if it is divisible by 4, except for years divisible
 * by 100 that are not divisible by 400.
 *
 * \return
 * - 0 if \e year is not a leap year
 * - Nonzero if \e year is a leap year
 */

static int  enc_check_leap_year(unsigned int  year)
{
   /* Not divisible by 4 => Never a leap year */
   if(year % 4U)  { return(0); }
   /* Divisible by 4 but not by 100 => Always a leap year */
   if(year % 100U)  { return(1); }
   /* Centuries are leap years only if divisible by 400 */
   return((year % 400U) ? 0 : 1);
}
3577 
3578 
3579 /* ========================================================================== */
3580 /* Encode date and time to POSIX timestamp (seconds since epoche)
3581  *
3582  * \param[out] pts Pointer to seconds since epoche (as defined by POSIX.1)
3583  * \param[in] year Years
3584  * \param[in] month Months
3585  * \param[in] day Days
3586  * \param[in] hour Hours
3587  * \param[in] minute Minutes
3588  * \param[in] seconds Seconds
3589  * \param[in] zone Timezone correction in minutes (Zero for UTC)
3590  *
3591  * \attention
3592  * This function accepts no timestamps before the epoche (the Usenet has not
3593  * existed yet at that time).
3594  *
3595  * On error, zero is written to \e pts .
3596  */
3597 
3598 static int enc_encode_posix_timestamp(core_time_t* pts, unsigned int year,
3599  unsigned int month, unsigned int day,
3600  unsigned int hour, unsigned int minute,
3601  unsigned int second, int zone)
3602 {
3603  static const unsigned int dom[12] = { 31U, 29U, 31U, 30U, 31U, 30U,
3604  31U, 31U, 30U, 31U, 30U, 31U };
3605  int res = -1;
3606  core_time_t ts = 0;
3607  core_time_t zone_seconds;
3608  unsigned int i;
3609 
3610  /* Clamp year down to 1970 */
3611  if(1970U <= year)
3612  {
3613  /* Check for 'core_time_t' overflow (leave at least one year) */
3614  if(2104U < year)
3615  {
3616  PRINT_ERROR("Warning: core_time_t overflow while decoding timestamp");
3617  year = 2104U;
3618  }
3619  for(i = 1970U; i < year; ++i)
3620  {
3621  ts += (core_time_t) 365 * (core_time_t) 86400;
3622  /* Add an additional day for leap years */
3623  if(enc_check_leap_year(i)) { ts += (core_time_t) 86400; }
3624  }
3625  for(i = 0; i < month - 1U; ++i)
3626  {
3627  ts += (core_time_t) dom[i] * (core_time_t) 86400;
3628  /* Subtract one day if current year is not a leap year */
3629  if(1U == i && !enc_check_leap_year(year))
3630  {
3631  ts -= (core_time_t) 86400;
3632  }
3633  }
3634  ts += (core_time_t) (day - 1U) * (core_time_t) 86400;
3635  ts += (core_time_t) hour * (core_time_t) 3600;
3636  ts += (core_time_t) minute * (core_time_t) 60;
3637  ts += (core_time_t) second;
3638  if(0 > zone)
3639  {
3640  zone_seconds = (core_time_t) -zone * (core_time_t) 60;
3641  ts += zone_seconds;
3642  res = 0;
3643  }
3644  else
3645  {
3646  zone_seconds = (core_time_t) zone * (core_time_t) 60;
3647  if(ts >= zone_seconds)
3648  {
3649  ts -= zone_seconds;
3650  res = 0;
3651  }
3652  }
3653  }
3654 
3655  /* Store result */
3656  if(res)
3657  {
3658  PRINT_ERROR("Encoding POSIX timestamp failed");
3659  *pts = 0;
3660  }
3661  else { *pts = ts; }
3662 
3663 #if 0
3664  if(!res)
3665  {
3666  /* For debugging (not thread safe) */
3667  printf("Seconds : %lu\n", (long int) ts);
3668  struct tm* t;
3669  t = gmtime((posix_time_t*) &ts);
3670  printf("Conv. UTC: %04d-%02d-%02d %02d:%02d:%02d\n",
3671  t->tm_year + 1900, t->tm_mon + 1, t->tm_mday,
3672  t->tm_hour, t->tm_min, t->tm_sec);
3673  }
3674 #endif
3675 
3676  return(res);
3677 }
3678 
3679 
3680 /* ========================================================================== */
/* Check RFC 5322 atom
 *
 * \param[in] s  Pointer to single character
 *
 * Only the character at the location \e s is checked against the 'atext'
 * set: ASCII digits, ASCII letters and the special characters listed below.
 *
 * \return
 * - 0 if character pointed to by \e s is allowed
 * - Negative value on error
 */

static int  enc_check_atom(const char*  s)
{
   const int  c = (int) *s;

   /* Digits, upper case and lower case letters (ASCII) */
   if( (0x30 <= c && 0x39 >= c)
       || (0x41 <= c && 0x5A >= c)
       || (0x61 <= c && 0x7A >= c) )
   {
      return(0);
   }

   /* Special characters of 'atext' */
   switch(c)
   {
      case (int) '!':
      case (int) '#':
      case (int) '$':
      case (int) '%':
      case (int) '&':
      case 0x27:  /* Apostrophe */
      case (int) '*':
      case (int) '+':
      case (int) '-':
      case (int) '/':
      case (int) '=':
      case (int) '?':
      case (int) '^':
      case (int) '_':
      case (int) '`':
      case (int) '{':
      case (int) '|':
      case (int) '}':
      case (int) '~':
      {
         return(0);
      }
      default:
      {
         /* Everything else (including NUL) is not allowed */
         return(-1);
      }
   }
}
3721 
3722 
3723 /* ========================================================================== */
/* Check RFC 5322 dot-atom
 *
 * \param[in] s  Pointer to single character
 *
 * Accepts a dot in addition to the characters accepted by
 * \c enc_check_atom() . Only the character at the location \e s is checked.
 *
 * \return
 * - 0 if character pointed to by \e s is allowed
 * - Negative value on error
 */

static int  enc_check_dotatom(const char*  s)
{
   /* Allow dot, delegate everything else to the check for 'atext' */
   return(('.' == *s) ? 0 : enc_check_atom(s));
}
3743 
3744 
3745 /* ========================================================================== */
3746 /* Encode words in display-name
3747  *
3748  * Words containing only 7 bit characters are encoded to atom or quoted-string.
3749  * Words containing 8 bit characters are preserved unchanged for MIME encoder.
3750  *
3751  * \param[in,out] s Pointer to data buffer
3752  */
3753 
3754 static void enc_encode_dispname(char* s)
3755 {
3756  size_t i = 0;
3757  char buf[ENC_HDR_BUFSIZE + (size_t) 1];
3758  size_t bi = 0;
3759  char word[ENC_HDR_BUFSIZE + (size_t) 1];
3760  char* w;
3761  size_t word_len;
3762  int last_word = 0;
3763  size_t ii;
3764  int atom;
3765  size_t start;
3766  int error;
3767  char cbuf[2];
3768 
3769  while(s[i])
3770  {
3771  /* Extract next word */
3772  ii = i; while(' ' == s[ii]) { ++ii; }
3773  w = strchr(&s[ii], (int) ' ');
3774  if(NULL == w)
3775  {
3776  word_len = strlen(&s[i]);
3777  last_word = 1;
3778  }
3779  else { word_len = (size_t) (w - &s[i]); }
3780  if(ENC_HDR_BUFSIZE < word_len) { word[0] = 0; }
3781  else
3782  {
3783  memcpy((void*) word, (void*) &s[i], word_len);
3784  word[word_len] = 0;
3785  }
3786  i += word_len;
3787  if(!last_word) { ++i; } /* Skip SP delimiter */
3788 
3789  /* Check word */
3790  atom = 1;
3791  ii = 0;
3792  while(word[ii])
3793  {
3794  if(0x80U <= (unsigned int) (unsigned char) word[ii])
3795  {
3796  atom = 1;
3797  break;
3798  }
3799  if(enc_check_atom(&word[ii])) { atom = 0; }
3800  ++ii;
3801  }
3802 
3803  /* SP delimiter between words */
3804  if(bi)
3805  {
3806  if(ENC_HDR_BUFSIZE <= bi) { break; } else { buf[bi++] = ' '; }
3807  }
3808 
3809  /* Copy data to buffer */
3810  if(atom)
3811  {
3812  if(ENC_HDR_BUFSIZE - bi < word_len) { break; }
3813  else
3814  {
3815  memcpy((void*) &buf[bi], (void*) word, word_len);
3816  bi += word_len;
3817  }
3818  }
3819  else
3820  {
3821  /* Create quoted-string */
3822  start = bi;
3823  error = 0;
3824  /* Leading DQUOTE delimiter */
3825  if(ENC_HDR_BUFSIZE <= bi) { error = 1; } else { buf[bi++] = '"'; }
3826  /* Process data */
3827  for(ii = 0; ii < word_len; ++ii)
3828  {
3829  /* Skip control characters */
3830  cbuf[0] = word[ii]; cbuf[1] = 0;
3831  if(enc_ascii_check_printable(cbuf)) { continue; }
3832  /* Check remaining buffer size */
3833  if(ENC_HDR_BUFSIZE - bi < (size_t) 2) { error = 1; break; }
3834  /* Check whether quoted pair is required */
3835  if('"' == word[ii] || 0x5C == (int) word[ii]) { buf[bi++] = 0x5C; }
3836  buf[bi++] = word[ii];
3837  }
3838  /* Trailing DQUOTE delimiter */
3839  if(ENC_HDR_BUFSIZE <= bi) { error = 1; } else { buf[bi++] = '"'; }
3840  if(error) { bi = start; }
3841  }
3842  if(last_word) { break; }
3843  }
3844  /* Terminate buffer */
3845  buf[bi] = 0;
3846  /* Copy data back to callers buffer */
3847  strncpy(s, buf, ++bi);
3848 
3849  return;
3850 }
3851 
3852 
3853 /* ========================================================================== */
3854 /* Decode MIME parameter percent encoding
3855  *
3856  * \param[in] buf Pointer to data buffer
3857  * \param[in] cs IANA name of character set
3858  *
3859  * This function decodes the percent encoding defined for MIME parameters by
3860  * RFC 2231. The IANA name of the character set for the resulting octet stream
3861  * must be specified by \e cs . The data is converted to Unicode in UTF-8
3862  * representation with NFC normalization.
3863  *
3864  * \return
3865  * - Pointer to decoded data (a new memory block was allocated)
3866  * - NULL on error
3867  */
3868 
3869 static char* enc_mime_decode_parameter(const char* buf, const char* cs)
3870 {
3871  char* res = NULL;
3872  char* tmp = NULL;
3873  const char* tmp2 = NULL;
3874  size_t len;
3875  int rv;
3876  enum enc_mime_cs charset;
3877 
3878  if(NULL != buf)
3879  {
3880  /* Percent decoder */
3881  len = strlen(buf);
3882  tmp = (char*) posix_malloc(++len);
3883  if(NULL != tmp)
3884  {
3885  memcpy((void*) tmp, (void*) buf, len);
3886  if(enc_ascii_check_printable(tmp))
3887  {
3888  PRINT_ERROR("MIME: Nonprintable characters in parameter");
3889  }
3890  else
3891  {
3892  rv = enc_percent_decode(tmp, 1);
3893  if(0 > rv)
3894  {
3895  PRINT_ERROR("MIME: Percent encoding failed for parameter");
3896  }
3897  else
3898  {
3899  charset = enc_mime_get_charset(cs, strlen(cs));
3900  tmp2 = enc_convert_to_utf8_nfc(charset, tmp);
3901  if(NULL == tmp2)
3902  {
3903  PRINT_ERROR("MIME: Parameter charset not supported");
3904  }
3905  else
3906  {
3907  len = strlen(tmp2);
3908  res = (char*) posix_malloc(++len);
3909  if(NULL != res)
3910  {
3911  memcpy((void*) res, (void*) tmp2, len);
3912  }
3913  if(tmp != tmp2) { posix_free((void*) tmp2); }
3914  }
3915  }
3916  }
3917  }
3918  }
3919  posix_free((void*) tmp);
3920 
3921  return(res);
3922 }
3923 
3924 
3925 /* ========================================================================== */
3926 /*! \brief Create a "name-addr" construct according to RFC 5322
3927  *
3928  * This function is intended to create the "From" and "Reply-To" header fields.
3929  *
3930  * \param[in] data Input data
3931  * \param[in] offset Folding offset, e.g. \c sizeof("From: ")
3932  *
3933  * The input data must have the following format: \c name \c <addr-spec> .
3934  *
3935  * \attention
3936  * The \c addr-spec construct is not allowed to contain comments or quoted
3937  * strings. Both parts, \c name and \c <addr-spec> must fit on a single header
3938  * line of 998 characters. Note that \e offset adds to the length of \c name .
3939  *
3940  * \c name must be an Unicode identifier corresponding to \c addr-spec . If it
3941  * contains non-ASCII characters, it is converted to a valid \c display-name
3942  * token. The result will be folded according to RFC 2047.
3943  *
3944  * On success the caller is responsible to free the memory allocated for the
3945  * result.
3946  *
3947  * \return
3948  * - Pointer to encoded data (a new memory block was allocated)
3949  * - NULL on error
3950  */
3951 
3952 const char* enc_create_name_addr(const char* data, size_t offset)
3953 {
3954  const char* res = NULL;
3955  size_t len = 4; /* The space after name, the angle brackets and NUL */
3956  size_t i;
3957  size_t counter = 0;
3958  int error = 0;
3959  char c;
3960  char* buf;
3961  int rv;
3962  char name[(size_t) 2 * ENC_HDR_BUFSIZE + (size_t) 1];
3963  char addr_spec[ENC_HDR_BUFSIZE + (size_t) 1];
3964 
3965  /* Extract name and addr-spec parts from input data */
3966  if((size_t) 2 * ENC_HDR_BUFSIZE < strlen(data)) { error = 1; }
3967  else
3968  {
3969  strcpy(name, data);
3970  addr_spec[0] = 0;
3971  if(!strlen(name)) { error = 1; }
3972  else
3973  {
3974  i = 0;
3975  while(name[i])
3976  {
3977  if('<' == name[i])
3978  {
3979  if(NULL == strchr(&name[i + (size_t) 1], (int) '<'))
3980  {
3981  if(!i) { name[0] = 0; }
3982  else { name[i - (size_t) 1] = 0; }
3983  if(ENC_HDR_BUFSIZE < strlen(&name[i])) { error = 1; }
3984  else
3985  {
3986  strcpy(addr_spec, &name[++i]);
3987  i = strlen(addr_spec) - (size_t) 1;
3988  if('>' != addr_spec[i]) { addr_spec[0] = 0; }
3989  else { addr_spec[i] = 0; }
3990  }
3991  break;
3992  }
3993  }
3994  ++i;
3995  }
3996  }
3997  }
3998 
3999  /* Prepare display-name */
4000  if(!error)
4001  {
4002  enc_encode_dispname(name);
4003  len += strlen(name);
4004  }
4005 
4006  /* Check addr-spec */
4007  if(!error)
4008  {
4009  len += strlen(addr_spec);
4010  error = enc_ascii_check(addr_spec);
4011  if(!error)
4012  {
4013  i = 0;
4014  do
4015  {
4016  c = addr_spec[i]; if(!c) { break; }
4017  if('@' != c && enc_check_dotatom(&c))
4018  {
4019  /* Invalid dot-atom found */
4020  error = 1;
4021  }
4022  /* Handle "@" separator */
4023  else if('@' == c)
4024  {
4025  ++counter;
4026  if(!i || !addr_spec[i + (size_t) 1])
4027  {
4028  /* Invalid separator found (at beginning or end) */
4029  error = 1;
4030  }
4031  /* Verify that dot-atoms don't have dots at beginning or end */
4032  if('.' == addr_spec[i - (size_t) 1]
4033  || '.' == addr_spec[i + (size_t) 1])
4034  {
4035  /* Invalid dot-atom found */
4036  error = 1;
4037  }
4038  }
4039  /* Verify that dot-atoms don't have dots at beginning or end */
4040  if(!error && '.' == c)
4041  {
4042  if(!i || !addr_spec[i + (size_t) 1])
4043  {
4044  /* Invalid dot-atom found */
4045  error = 1;
4046  }
4047  }
4048  ++i;
4049  }
4050  while(!error);
4051  }
4052  /* Final checks */
4053  if(! (error || (size_t) 1 != counter || (size_t) 5 > strlen(addr_spec)) )
4054  {
4055  /* Allocate buffer */
4056  buf = (char*) posix_malloc(len);
4057  if(NULL != buf)
4058  {
4059  /* Copy name and add trailing space (if not empty string) */
4060  if(name[0])
4061  {
4062  strcpy(buf, name);
4063  strcat(buf, " <");
4064  }
4065  else { strcpy(buf, "<"); }
4066  /* Copy addr-spec between angle brackets */
4067  strcat(buf, addr_spec);
4068  strcat(buf, ">");
4069  /* MIME encoding */
4070  rv = enc_mime_word_encode(&res, buf, offset);
4071  if(0 >= rv) { posix_free((void*) buf); }
4072  /* For positive return value, 'buf' was assigned to 'res'! */
4073  }
4074  }
4075  }
4076 
4077  /* Check for error */
4078  if(error) { PRINT_ERROR("Creating name-addr construct failed"); }
4079 
4080  /* For code review: Do not 'free()' memory pointed to by 'buf' here! */
4081 
4082  return(res);
4083 }
4084 
/* ========================================================================== */
/*! \brief Decode number of lines
 *
 * \param[in] lines  Number of lines
 *
 * \e lines must be a RFC 5536 conformant body of the (now obsolete) "Lines"
 * header field.
 *
 * Leading white space is skipped, then a sequence of decimal digits is
 * decoded. Data behind the last digit is ignored. The decoder is independent
 * of the locale.
 *
 * A missing number, a sign character in front of the number, a value that is
 * not representable as <tt>unsigned long int</tt> and a \c NULL pointer are
 * treated as error.
 *
 * \return
 * - Number of lines
 * - 0 on error
 */

unsigned long int  enc_lines_decode(const char*  lines)
{
   unsigned long int  res = 0;
   unsigned long int  digit;
   size_t  i = 0;

   if(NULL == lines)  { return(0); }

   /* Skip leading white space (SP, HT, LF, VT, FF and CR) */
   while(0x20 == (int) lines[i]
         || (0x09 <= (int) lines[i] && 0x0D >= (int) lines[i]))
   {
      ++i;
   }

   /* At least one digit is required */
   if(0x30 > (int) lines[i] || 0x39 < (int) lines[i])  { return(0); }

   /* Decode digits */
   while(0x30 <= (int) lines[i] && 0x39 >= (int) lines[i])
   {
      digit = (unsigned long int) ((int) lines[i] - 0x30);
      /* Check for overflow before the value is modified */
      if((ULONG_MAX - digit) / 10UL < res)  { return(0); }
      res = res * 10UL + digit;
      ++i;
   }

   return(res);
}
4106 
4107 
/* ========================================================================== */
/*! \brief Convert number of lines to string
 *
 * \param[out] l      Pointer to result buffer (at least 11 characters large)
 * \param[in]  l_raw  Number of lines
 *
 * \attention
 * The value of \e l_raw must be representable as decimal number with not more
 * than 10 digits. Otherwise the string \c "Error" is returned.
 */

void  enc_convert_lines_to_string(char*  l, unsigned long int  l_raw)
{
   const int  written = posix_snprintf(l, 11, "%lu", l_raw);

   /* Done if the number (plus NUL termination) fits into 11 characters */
   if(0 <= written && 11 > written)  { return; }

   /* Conversion failed or result was truncated */
   strcpy(l, "Error");
}
4134 
4135 
4136 /* ========================================================================== */
4137 /*! \brief Decode canonical timestamp to POSIX time (seconds since epoche)
4138  *
4139  * According to RFC 5322 all military timezones should be
4140  * treated as UTC because there was an error in RFC 822
4141  * => We do so and accept "Z" as valid because it means UTC
4142  *
4143  * \note
4144  * This function accepts no timestamps before the epoche (the Usenet has not
4145  * existed yet at that time).
4146  *
4147  * \param[in] timestamp RFC 5536 conformant timestamp string
4148  *
4149  * \return
4150  * - Seconds since epoche (as defined by POSIX.1)
4151  * - 0 on error
4152  */
4153 
4154 core_time_t enc_timestamp_decode(const char* timestamp)
4155 {
4156  static const char* months[12] = { "JAN", "FEB", "MAR", "APR", "MAY", "JUN",
4157  "JUL", "AUG", "SEP", "OCT", "NOV", "DEC" };
4158  static const unsigned int dom[12] = { 31U, 29U, 31U, 30U, 31U, 30U,
4159  31U, 31U, 30U, 31U, 30U, 31U };
4160  core_time_t res = 0;
4161  int error = 1;
4162  const char* p;
4163  const char* q;
4164  int rv;
4165  char m[4];
4166  char z[6];
4167  unsigned int zh = 0;
4168  unsigned int zm = 0;
4169  unsigned int i;
4170  unsigned int year;
4171  unsigned int month = 13U;
4172  unsigned int day;
4173  unsigned int hour = 0;
4174  unsigned int minute = 0;
4175  unsigned int second = 0;
4176  int zone = 1; /* Correction in minutes */
4177  int pos = 0;
4178 
4179 #if 0
4180  /* For debugging */
4181  printf("------------------------------------------------------------\n");
4182  printf("Timestamp: %s\n", timestamp);
4183 #endif
4184 
4185  /* Skip optional day-of-week */
4186  p = strchr(timestamp, (int) ',');
4187  if(NULL == p) { p = timestamp; } else { ++p; }
4188 
4189  /* Extract date */
4190  rv = sscanf(p, "%u %3c %u%n", &day, m, &year, &pos);
4191  if(3 != rv) { PRINT_ERROR("Invalid date in timestamp"); }
4192  else
4193  {
4194  /* Check for obsolete year format as defined by RFC 5322 */
4195  if(1000U > year)
4196  {
4197  if(50U > year) { year += 2000U; } else { year += 1900U; }
4198  }
4199  /* Decode month */
4200  m[0] = (char) toupper((int) m[0]);
4201  m[1] = (char) toupper((int) m[1]);
4202  m[2] = (char) toupper((int) m[2]);
4203  m[3] = 0;
4204  for(i = 0; i < 12; ++i)
4205  {
4206  if(!strcmp(months[i], m)) { month = i + 1U; break; }
4207  }
4208  if(13U <= month) { PRINT_ERROR("Invalid month in timestamp"); }
4209  else if(i < 12)
4210  {
4211  /* Check day */
4212  if(1U > day || dom[i] < day) { month = 13U; }
4213  if(13U > month)
4214  {
4215  if(2U == month && 29U == day)
4216  {
4217  /* Check for leap year in terms of gregorian calendar */
4218  if(!enc_check_leap_year(year)) { month = 13U; }
4219  }
4220  }
4221  if(13U <= month)
4222  {
4223  PRINT_ERROR("Invalid day of month in timestamp");
4224  }
4225  }
4226  }
4227 
4228  /* Extract time if date was found */
4229  if(13U > month)
4230  {
4231  p += pos;
4232  rv = sscanf(p, "%u : %u%n", &hour, &minute, &pos);
4233  if(2 != rv)
4234  {
4235  PRINT_ERROR("Invalid time in timestamp");
4236  }
4237  else
4238  {
4239  p += pos;
4240  q = strchr(p, (int) ':');
4241  if(NULL != q) { p = q; }
4242  rv = sscanf(p, ": %u%n", &second, &pos);
4243  if(1 == rv) { p += pos; }
4244  rv = sscanf(p, "%5s", z);
4245  z[5] = 0;
4246  if(1 != rv)
4247  {
4248  PRINT_ERROR("Missing timezone in timestamp");
4249  }
4250  else
4251  {
4252  /* Check time (accept leap second according to RFC 5322) */
4253  if(23U < hour || 59U < minute || 60U < second)
4254  {
4255  PRINT_ERROR("Invalid time in timestamp");
4256  }
4257  else
4258  {
4259  /* Decode timezone */
4260  if('+' == z[0] || '-' == z[0])
4261  {
4262  for(i = 1; i < 5; ++i)
4263  {
4264  if(enc_ascii_check_digit(&z[i])) { zone = 0; break; }
4265  }
4266  if(zone)
4267  {
4268  zh = ((unsigned int) z[1] - 0x30) * 10U;
4269  zh += ((unsigned int) z[2] - 0x30);
4270  zm = ((unsigned int) z[3] - 0x30) * 10U;
4271  zm += ((unsigned int) z[4] - 0x30);
4272  if(59U < zm) { zone = 0; }
4273  }
4274  if(!zone)
4275  {
4276  PRINT_ERROR("Invalid timezone in timestamp");
4277  }
4278  else
4279  {
4280  zone = (int) (zh * 60U + zm);
4281  if('-' == z[0]) { zone *= -1; }
4282  }
4283  }
4284  else
4285  {
4286  /* Check for obsolete timezone format */
4287  if(!strcmp("GMT", z)) { zone = 0; }
4288  else if(!strcmp("UT", z)) { zone = 0; }
4289  else if(!strcmp("EDT", z)) { zone = -4 * 60; }
4290  else if(!strcmp("EST", z)) { zone = -5 * 60; }
4291  else if(!strcmp("CDT", z)) { zone = -5 * 60; }
4292  else if(!strcmp("CST", z)) { zone = -6 * 60; }
4293  else if(!strcmp("MDT", z)) { zone = -6 * 60; }
4294  else if(!strcmp("MST", z)) { zone = -7 * 60; }
4295  else if(!strcmp("PDT", z)) { zone = -7 * 60; }
4296  else if(!strcmp("PST", z)) { zone = -8 * 60; }
4297  else if(!strcmp("Z", z)) { zone = 0; }
4298  else
4299  {
4300  zone = 0;
4301  PRINT_ERROR("Decode unknown timezone in timestamp as UTC");
4302  }
4303  }
4304 #if 0
4305  /* For debugging */
4306  printf("Decoded : %04u-%02u-%02u %02u:%02u:%02u %+d minutes\n",
4307  year, month, day, hour, minute, second, zone);
4308 #endif
4309  /* Decoding successful */
4310  error = 0;
4311  }
4312  }
4313  }
4314  }
4315 
4316  /* Calculate seconds since epoche */
4317  if(!error)
4318  {
4319  enc_encode_posix_timestamp(&res, year, month, day, hour, minute, second,
4320  zone);
4321  }
4322 
4323  return(res);
4324 }
4325 
4326 
4327 /* ========================================================================== */
4328 /*! \brief Convert POSIX timestamp to ISO 8601 conformant local date and time
4329  *
4330  * \param[out] isodate Buffer for date string (at least 20 characters)
4331  * \param[in] pts Seconds since epoche (as defined by POSIX.1)
4332  *
4333  * ISO 8601 allows to omit the 'T' character between the date and time fields
4334  * if there is no risk of confusing a date and time of day representation.
4335  * This is the case here => We omit the 'T' for better human readability
4336  *
4337  * \return
4338  * - 0 on success
4339  * - Negative value on error
4340  */
4341 
4343 {
4344  int res = -1;
4345  posix_time_t ts;
4346  struct_posix_tm t_data;
4347  struct_posix_tm* t;
4348 
4349  /*
4350  * Check for potential 'time_t' overflow
4351  * Many historical 32 bit Unix systems use 'signed int' for 'time_t'.
4352  * We clamp the 'pts' to this lowest common denominator.
4353  */
4354  if((core_time_t) INT_MAX < pts)
4355  {
4356  /* Clamp time up to 2038-01-19T03:14:07Z if system uses 32 bit 'int' */
4357  PRINT_ERROR("Warning: time_t overflow while converting timestamp");
4358  ts = (core_time_t) INT_MAX;
4359  }
4360  else { ts = (posix_time_t) pts; }
4361 
4362  /* Convert POSIX timestamp to ISO 8601 date */
4363  /*! \todo
4364  * Calling operating system for date conversion should be replaced until the
4365  * year 2038 (when 32 bit signed \c time_t implementations will overflow).
4366  */
4367  t = posix_localtime_r(&ts, &t_data);
4368  if(NULL != t)
4369  {
4370  /* Return value is intentionally ignred, use ign to silence compiler */
4371  ign = posix_snprintf(isodate, 20, "%04d-%02d-%02d %02d:%02d:%02d",
4372  t->tm_year + 1900, t->tm_mon + 1, t->tm_mday,
4373  t->tm_hour, t->tm_min, t->tm_sec);
4374  res = 0;
4375  }
4376 
4377  if(0 > res) { PRINT_ERROR("Timestamp conversion failed"); }
4378 
4379  return(res);
4380 }
4381 
4382 
4383 /* ========================================================================== */
4384 /*! \brief Get current UTC date in ISO 8601 conformant format
4385  *
4386  * \param[out] isodate Buffer for date string (at least 21 characters)
4387  *
4388  * The date is written to \e isodate in \c YYYY-MM-DDTHH-MM-SSZ format.
4389  *
4390  * \return
4391  * - 0 on success
4392  * - Negative value on error
4393  */
4394 
4395 int enc_get_iso8601_utc(char* isodate)
4396 {
4397  int res = -1;
4398  posix_time_t ts;
4399  struct_posix_tm t_data;
4400  struct_posix_tm* t;
4401 
4402  /*
4403  * Check for potential 'time_t' overflow
4404  * Many historical 32 bit Unix systems use 'signed int' for 'time_t'.
4405  * We clamp the 'pts' to this lowest common denominator.
4406  */
4407  posix_time(&ts);
4408  if((posix_time_t) 0 > ts) { res = -1; }
4409  else
4410  {
4411  /* Convert POSIX timestamp to ISO 8601 date */
4412  /*! \todo
4413  * Calling operating system for date conversion should be replaced until the
4414  * year 2038 (when 32 bit signed \c time_t implementations will overflow).
4415  */
4416  t = posix_gmtime_r(&ts, &t_data);
4417  if(NULL != t)
4418  {
4419  /* Return value is intentionally ignred, use ign to silence compiler */
4420  ign = posix_snprintf(isodate, 21, "%04d-%02d-%02dT%02d:%02d:%02dZ",
4421  t->tm_year + 1900, t->tm_mon + 1, t->tm_mday,
4422  t->tm_hour, t->tm_min, t->tm_sec);
4423  res = 0;
4424  }
4425  }
4426 
4427  if(0 > res) { PRINT_ERROR("ISO 8601 date request failed"); }
4428 
4429  return(res);
4430 }
4431 
4432 
4433 /* ========================================================================== */
4434 /*! \brief Convert ISO 8601 conformant UTC date and time to POSIX timestamp
4435  *
4436  * \param[out] pts Seconds since epoche (as defined by POSIX.1)
4437  * \param[in] isodate Buffer for date string (at least 20 characters)
4438  *
4439  * \attention
4440  * The parameter \e isodate must be in \c YYYY-MM-DDTHH-MM-SSZ format (UTC).
4441  *
4442  * \note
4443  * This function accepts no date input before the epoche.
4444  *
4445  * \return
4446  * - 0 on success
4447  * - Negative value on error
4448  */
4449 
4450 int enc_convert_iso8601_to_posix(core_time_t* pts, const char* isodate)
4451 {
4452  int res = -1;
4453  int rv;
4454  unsigned int year;
4455  unsigned int month;
4456  unsigned int mday;
4457  unsigned int hour;
4458  unsigned int minute;
4459  unsigned int second;
4460 
4461  /* Split ISO 8601 date */
4462  rv = sscanf(isodate, "%u-%u-%uT%u:%u:%uZ", &year, &month, &mday,
4463  &hour, &minute, &second);
4464  if(6 != rv) { PRINT_ERROR("ISO 8601 timestamp has invalid format"); }
4465  else
4466  {
4467  if(1970U <= year && 9999U >= year
4468  && 1U <= month && 12U >= month
4469  && 1U <= mday && 31U >= mday
4470  && 23U >= hour && 59U >= minute && 59U >= second) { res = 0; }
4471  }
4472 
4473  /* Calculate seconds since epoche */
4474  if(!res)
4475  {
4476  res = enc_encode_posix_timestamp(pts, year, month, mday,
4477  hour, minute, second, 0);
4478  }
4479 
4480  return(res);
4481 }
4482 
4483 
/* ========================================================================== */
/*! \brief Convert ISO 8601 conformant date to canonical timestamp
 *
 * \param[out] ts       Pointer to canonical timestamp as defined by RFC 5322
 * \param[in]  isodate  ISO 8601 date string (exactly 10 characters)
 *
 * \attention
 * The parameter \e isodate must be in \c YYYY-MM-DD format (only date, time is
 * not supported).
 *
 * The time of the result is always \c 00:00:00 with timezone \c -0000 .
 * Accepted ranges are 1900..9999 for the year, 1..12 for the month and 1..31
 * for the day of month. Nothing is written to \e ts on error.
 *
 * \note
 * On success, the caller is responsible to free the memory allocated for the
 * result string.
 *
 * \return
 * - 0 on success
 * - Negative value on error
 */

int  enc_convert_iso8601_to_timestamp(const char**  ts, const char*  isodate)
{
   static const char*  month_names[12] = { "Jan", "Feb", "Mar", "Apr", "May",
                                           "Jun", "Jul", "Aug", "Sep", "Oct",
                                           "Nov", "Dec" };
   unsigned int  y;
   unsigned int  mon;
   unsigned int  d;
   char*  out;

   /* Split ISO 8601 date */
   if(3 != sscanf(isodate, "%u-%u-%u", &y, &mon, &d))
   {
      PRINT_ERROR("ISO 8601 timestamp has invalid format");
      return(-1);
   }

   /* Reject fields that are out of range */
   if(1900U > y || 9999U < y)  { return(-1); }
   if(1U > mon || 12U < mon)  { return(-1); }
   if(1U > d || 31U < d)  { return(-1); }

   /* Allocate buffer for result */
   out = (char*) posix_malloc((size_t) 50);
   if(NULL == out)  { return(-1); }

   posix_snprintf(out, 50, "%u %s %04u %02d:%02d:%02d -0000",
                  d, month_names[mon - 1U], y, 0, 0, 0);
   *ts = (const char*) out;

   return(0);
}
4539 
4540 
4541 /* ========================================================================== */
4542 /*! \brief Convert article number from numerical format to ASCII
4543  *
4544  * \param[out] result Pointer to result string buffer (Size: 17 bytes)
4545  * \param[out] len Pointer to length of result string (Maximum value: 16)
4546  * \param[in] wm Article number (watermark) to convert
4547  *
4548  * RFC 3977 allows max. 16 digits.
4549  *
4550  * \note
4551  * The output is locale independent.
4552  *
4553  * \return
4554  * - 0 on success
4555  * - Negative value on error (\e result and \e len are not valid)
4556  */
4557 
4558 int enc_convert_anum_to_ascii(char result[17], size_t* len, core_anum_t wm)
4559 {
4560  int res = -1;
4561  int rv;
4562 
4563  /* C90 compilers are not required to support more than 32 bit data types */
4564  if (CORE_ANUM_T_MAX > ULONG_MAX)
4565  {
4566  PRINT_ERROR("Value of CORE_ANUM_T_MAX is too large");
4567  }
4568  else
4569  {
4570  rv = posix_snprintf(result, 17, "%lu", (unsigned long int) wm);
4571  if(rv > 0 && rv <= 16)
4572  {
4573  *len = (size_t) rv;
4574  res = 0;
4575  }
4576  }
4577 
4578  return(res);
4579 }
4580 
4581 
4582 /* ========================================================================== */
4583 /*! \brief Convert number from ASCII to numerical format
4584  *
4585  * \param[out] result Pointer to result
4586  * \param[in] wm Article number (watermark) string to convert
4587  * \param[in] len Length of string \e wm
4588  *
4589  * Max. 20 digits are supported, sufficient for 64-bit article numbers.
4590  * RFC 3977 allows max. 16 digits.
4591  *
4592  * This function correctly processes leading zeros and does not use standard
4593  * library functions with locale dependent behaviour.
4594  *
4595  * \note
4596  * \e wm needs no termination, the first \e len characters are used.
4597  *
4598  * \return
4599  * - 0 on success
4600  * - Negative value on error
4601  * - -2 means larger than \ref NNTP_ANUM_T_MAX
4602  */
4603 
4604 int enc_convert_ascii_to_anum(core_anum_t* result, const char* wm,
4605  int len)
4606 {
4607  int res = -1;
4608  unsigned char c;
4609  nntp_anum_t pot = 1; /* 10^0 (0th power of 10) */
4610  nntp_anum_t d;
4611  nntp_anum_t v = 0;
4612 
4613  /* Check length */
4614  if(0 < len && 20 >= len)
4615  {
4616  /* Process every digit as a power of ten */
4617  while(len)
4618  {
4619  /* Get character and check whether it is a digit */
4620  c = (unsigned char) wm[len-- - 1];
4621  if(enc_ascii_check_digit((char*) &c)) { len = -1; break; }
4622  /* ASCII decode it to numerical digit */
4623  d = (nntp_anum_t) (unsigned char) (c - 0x30U);
4624  /* Calculate value of digit */
4625  d *= pot;
4626  /* Avoid overflow */
4627  if(NNTP_ANUM_T_MAX - v < d) { res = -2; break; }
4628  /* Add value of digit to result */
4629  v += d;
4630  /* Calculate next power of ten */
4631  pot *= 10;
4632  }
4633  /* Check whether processing was successful */
4634  if(!len)
4635  {
4636  *result = v;
4637  res = 0;
4638  }
4639  }
4640 
4641  if(0 > res) { PRINT_ERROR("Article number conversion failed"); }
4642 
4643  return(res);
4644 }
4645 
4646 
/* ========================================================================== */
/*! \brief Convert octet to hexadecimal (ASCII) format
 *
 * \param[out] result  Pointer to result
 * \param[in]  octet   Octet to convert
 *
 * Exactly 3 bytes are written to the buffer pointed to by \e result .
 * If \e octet is smaller than 16, a leading zero is created.
 * Capital letters are used for the digits above 9.
 * On error (this includes values of \e octet above 255), the result "XX" is
 * generated.
 * The \e result is always a zero terminated string.
 *
 * \return
 * - 0 on success
 * - Negative value on error
 */

int  enc_convert_octet_to_hex(char*  result, unsigned int  octet)
{
   /* The conversion is attempted only for values inside of the octet range */
   if(255U >= octet && 2 == posix_snprintf(result, 3, "%02X", octet))
   {
      return(0);
   }

   /* Error marker */
   strcpy(result, "XX");

   return(-1);
}
4678 
4679 
/* ========================================================================== */
/*! \brief Encode or decode data with ROT13 algorithm
 *
 * \param[in,out] data  Pointer to buffer with data to encode/decode
 *
 * Any character that is not a latin ASCII character in the ranges A..Z and
 * a..z will stay unchanged.
 *
 * No memory is allocated. The operation is executed in the buffer pointed to
 * by \e data (up to the NUL termination).
 */

void  enc_rot13(char*  data)
{
   char*  p;
   int  ch;
   int  base;

   for(p = data; *p; ++p)
   {
      ch = (int) *p;
      /* Select first letter of the matching alphabet (ASCII code) */
      if(65 <= ch && 90 >= ch)  { base = 65; }
      else if(97 <= ch && 122 >= ch)  { base = 97; }
      else  { continue; }
      /* Rotate by 13 positions inside of the 26 letter alphabet */
      *p = (char) (base + (ch - base + 13) % 26);
   }
}
4722 
4723 
4724 /* ========================================================================== */
4725 /*! \brief Encode binary data to base64
4726  *
4727  * \param[out] enc Pointer to result (zero terminated string)
4728  * \param[in] data Data to encode
4729  * \param[in] len Data length
4730  *
4731  * If \e len is zero, \e data is not dereferenced and the result will be an
4732  * empty string.
4733  *
4734  * On error, nothing is written to \e enc .
4735  *
4736  * On success a pointer to the result buffer will be written to \e enc .
4737  * The caller is responsible to free the memory allocated for this buffer.
4738  *
4739  * \return
4740  * - 0 on success
4741  * - Negative value on error
4742  */
4743 
4744 int enc_mime_encode_base64(const char** enc, const char* data, size_t len)
4745 {
4746  int res = -1;
4747  size_t enc_len = (size_t) 4 * len / (size_t) 3;
4748  char* buf = NULL;
4749  size_t i = 0;
4750  size_t enc_i = 0;
4751  unsigned long int word;
4752  unsigned char b64[4];
4753 
4754  enc_len += (size_t) 5; /* Last quartet and termination */
4755  buf = (char*) posix_malloc(enc_len);
4756  if(NULL != buf)
4757  {
4758  if(len)
4759  {
4760  while((size_t) 3 <= len)
4761  {
4762  /* Compose 24 bit input word */
4763  word = (unsigned long int) (unsigned char) data[i++] << 16;
4764  word |= (unsigned long int) (unsigned char) data[i++] << 8;
4765  word |= (unsigned long int) (unsigned char) data[i++];
4766  len -= (size_t) 3;
4767  /* Encode */
4768  b64[3] = (unsigned char) (word & 0x00003FUL);
4769  b64[2] = (unsigned char) ((word & 0x000FC0UL) >> 6);
4770  b64[1] = (unsigned char) ((word & 0x03F000UL) >> 12);
4771  b64[0] = (unsigned char) ((word & 0xFC0000UL) >> 18);
4772  buf[enc_i++] = enc_base64[b64[0]];
4773  buf[enc_i++] = enc_base64[b64[1]];
4774  buf[enc_i++] = enc_base64[b64[2]];
4775  buf[enc_i++] = enc_base64[b64[3]];
4776  }
4777  if(len)
4778  {
4779  /* End of data and padding */
4780  word = (unsigned long int) (unsigned char) data[i++] << 16;
4781  if((size_t) 1 < len)
4782  {
4783  word |= (unsigned long int) (unsigned char) data[i++] << 8;
4784  }
4785  /* Encode */
4786  b64[3] = (unsigned char) (word & 0x00003FUL);
4787  b64[2] = (unsigned char) ((word & 0x000FC0UL) >> 6);
4788  b64[1] = (unsigned char) ((word & 0x03F000UL) >> 12);
4789  b64[0] = (unsigned char) ((word & 0xFC0000UL) >> 18);
4790  buf[enc_i++] = enc_base64[b64[0]];
4791  buf[enc_i++] = enc_base64[b64[1]];
4792  if((size_t) 1 < len) { buf[enc_i++] = enc_base64[b64[2]]; }
4793  else { buf[enc_i++] = '='; }
4794  buf[enc_i++] = '=';
4795  }
4796  }
4797  buf[enc_i] = 0;
4798  *enc = buf;
4799  res = 0;
4800  }
4801 
4802  /* Check for error */
4803  if(res) { posix_free((void*) buf); }
4804 
4805  return(res);
4806 }
4807 
4808 
4809 /* ========================================================================== */
4810 /*! \brief Extract addr-spec token from RFC 5322 mailbox
4811  *
4812  * \param[in] mailbox RFC 5322 mailbox
4813  *
4814  * \attention
4815  * The checks are more restrictive than the formal specification of RFC 5322.
4816  * White space is not allowed inside the \c addr-spec token!
4817  *
4818  * \note
4819  * It is tolerated that \e mailbox contains an invalid \c name-addr token
4820  * because it is ignored anyway.
4821  *
4822  * On success a pointer to the result buffer is returned.
4823  * The caller is responsible to free the memory allocated for this buffer.
4824  *
4825  * \return
4826  * - Pointer to new memory block containing the \c addr-spec token
4827  * - NULL on error
4828  */
4829 
const char* enc_extract_addr_spec(const char* mailbox)
{
   char* spec = NULL;
   const char* first;
   const char* last;  /* Points behind the last octet of the candidate */
   size_t n;
   size_t pos;
   unsigned int phase = 0;
   int invalid = 0;
   char ch;

   if(NULL == mailbox) { goto done; }
   n = strlen(mailbox);
   /* Nothing shorter than 3 octets is accepted as 'addr-spec' */
   if((size_t) 3 > n) { goto done; }

   /* Without an 'angle-addr' the whole mailbox is the candidate */
   first = mailbox;
   last = mailbox + n;

   /*
    * Scan backward from the end for '>', then '@', then '<'.
    * Whatever is located in front of the '<' is never inspected, this
    * tolerates arbitrary garbage in the 'name-addr' part.
    */
   for(pos = n; pos; --pos)
   {
      ch = mailbox[pos - (size_t) 1];
      if(0U == phase)
      {
         if('>' == ch) { last = &mailbox[pos - (size_t) 1]; phase = 1U; }
      }
      else if(1U == phase)
      {
         if('@' == ch) { phase = 2U; }
      }
      else if('<' == ch)
      {
         /* Complete 'angle-addr' found, candidate starts behind the '<' */
         first = &mailbox[pos];
         phase = 3U;
         break;
      }
   }

   /* Reject incomplete 'angle-addr' and candidates shorter than 3 octets */
   if(1U == phase || 2U == phase) { goto done; }
   if(3 > last - first) { goto done; }

   /* Copy candidate into a new memory block */
   n = (size_t) (last - first);
   spec = (char*) posix_malloc(n + (size_t) 1);
   if(NULL == spec) { goto done; }
   memcpy((void*) spec, (const void*) first, n);
   spec[n] = 0;

   /* Part in front of '@': Every position must pass the 'dot-atom' check */
   for(pos = 0; spec[pos] && '@' != spec[pos]; ++pos)
   {
      if(enc_check_dotatom(&spec[pos])) { invalid = 1; break; }
   }
   if(!invalid)
   {
      /* Empty 'local-part' or missing '@' */
      if(!pos || '@' != spec[pos]) { invalid = 1; }
      /* Step over '@', there must be data behind it */
      else if(!spec[++pos]) { invalid = 1; }
   }

   /* Part behind '@' */
   while(!invalid && spec[pos])
   {
      if('[' == spec[pos])
      {
         /* Relaxed 'domain-literal' check: Rest must be printable ASCII */
         if(enc_ascii_check_printable(&spec[pos])) { invalid = 1; }
         break;
      }
      if(enc_check_dotatom(&spec[pos])) { invalid = 1; }
      else { ++pos; }
   }

   if(invalid)
   {
      /* Invalid address format */
      posix_free((void*) spec);
      spec = NULL;
   }

done:
   /* Every failure (including out of memory) is reported the same way */
   if(NULL == spec) { PRINT_ERROR("Invalid e-mail address"); }

   return(spec);
}
4932 
4933 
4934 /* ========================================================================== */
4935 /*! \brief Verify ASCII encoding
4936  *
4937  * \param[in] s String to verify
4938  *
4939  * \return
4940  * - 0 on success
4941  * - Negative value on error
4942  */
4943 
int enc_ascii_check(const char* s)
{
   const char* p;
   int c;

   for(p = s; *p; ++p)
   {
      c = (int) *p;
      /*
       * Octets with the high bit set show up as negative values if plain
       * 'char' is signed and as values above 127 if it is unsigned.
       */
      if(0 > c || 127 < c) { return(-1); }
   }

   return(0);
}
4958 
4959 
4960 /* ========================================================================== */
4961 /*! \brief Check for ASCII alphabetic characters
4962  *
4963  * \param[in] s Pointer to single character
4964  *
4965  * Locale independent check based on ASCII.
4966  *
4967  * \return
4968  * - 0 if \e s is an alphabetic character
4969  * - Negative value if \e s is not an alphabetic character
4970  */
4971 
int enc_ascii_check_alpha(const char* s)
{
   const int c = (int) *s;
   /* ASCII code ranges: 'A'..'Z' is 65..90, 'a'..'z' is 97..122 */
   const int upper = (65 <= c && 90 >= c);
   const int lower = (97 <= c && 122 >= c);

   return((upper || lower) ? 0 : -1);
}
4981 
4982 
4983 /* ========================================================================== */
4984 /*! \brief Check for ASCII digit characters
4985  *
4986  * \param[in] s Pointer to single character
4987  *
4988  * Locale independent check based on ASCII.
4989  *
4990  * \return
4991  * - 0 if \e s is a digit character
4992  * - Negative value if \e s is not a digit character
4993  */
4994 
int enc_ascii_check_digit(const char* s)
{
   const int c = (int) *s;

   /* ASCII code range '0'..'9' is 48..57 */
   return((48 <= c && 57 >= c) ? 0 : -1);
}
5004 
5005 
5006 /* ========================================================================== */
5007 /*! \brief Check for printable ASCII characters
5008  *
5009  * \param[in] s String to check
5010  *
5011  * HT (9) and SPACE (32, 0x20) inside \e s are treated as "printable" to make
5012  * this function suitable to check header field bodies according to RFC 5322.
5013  *
5014  * \note
5015  * The function \ref enc_ascii_convert_to_printable() can be used on error.
5016  *
5017  * \return
5018  * - 0 on success
5019  * - Negative value on error
5020  */
5021 
int enc_ascii_check_printable(const char* s)
{
   const char* p;
   int c;

   for(p = s; *p; ++p)
   {
      c = (int) *p;
      /* Accepted: HT (9) and the range SPACE (32) .. '~' (126) */
      if(9 != c && (32 > c || 126 < c)) { return(-1); }
   }

   return(0);
}
5036 
5037 
5038 /* ========================================================================== */
5039 /*! \brief Convert to printable ASCII format
5040  *
5041  * \param[in] s String to convert
5042  *
5043  * This function should be used to repair a string in-place after the function
5044  * \ref enc_ascii_check_printable() has reported an error.
5045  *
5046  * Every invalid byte is replaced with '?'.
5047  */
5048 
void enc_ascii_convert_to_printable(char* s)
{
   size_t i;
   int c;

   /* The string is modified in-place, the length never changes */
   for(i = 0; s[i]; ++i)
   {
      c = (int) s[i];
      /*
       * Same definition of "printable" as in enc_ascii_check_printable():
       * HT (9) and the range SPACE (32) .. '~' (126).
       * Octets with the high bit set never match, regardless of the
       * signedness of plain 'char'.
       */
      if(!(9 == c || (32 <= c && 126 >= c))) { s[i] = '?'; }
   }
}
5061 
5062 
5063 /* ========================================================================== */
5064 /*! \brief Convert body of distribution header field
5065  *
5066  * \param[in] s String with unfolded body to convert
5067  *
5068  * This function processes \e s in-place. The result will always be shorter or
5069  * same length as the original data.
5070  *
5071  * Every element of \c dist-list that contains invalid characters is removed.
5072  */
5073 
/*
 * NOTE(review): The line with the function header is missing in this listing.
 * The name below follows the naming pattern of this module, confirm it
 * against the prototype in encoding.h.
 */
void enc_ascii_convert_distribution(char* s)
{
   size_t i;
   size_t wi = 0;
   size_t len;
   size_t start = 0;  /* Index of separator in front of entry (0: None yet) */
   int error = 0;
   char* p;

   /* Remove whitespace (HT and SPACE) with a single in-place pass */
   for(i = 0; s[i]; ++i)
   {
      if(9 != (int) s[i] && 32 != (int) s[i]) { s[wi++] = s[i]; }
   }
   s[wi] = 0;

   /* Check content */
   i = 0;
   while(s[i])
   {
      /* Both checks return nonzero => Character is not alphanumeric */
      if(enc_ascii_check_alpha(&s[i]) && enc_ascii_check_digit(&s[i]))
      {
         /* First character of an entry must be alphanumeric */
         if(!start && !i) { error = 1; }
         else if(start && start + (size_t) 1 == i) { error = 1; }
         /* Other positions additionally accept '+', '-', '_' and ',' */
         else if('+' != s[i] && '-' != s[i] && '_' != s[i] && ',' != s[i])
         {
            error = 1;
         }
         /* Check for separator between entries */
         if(!error && ',' == s[i])
         {
            start = i;
            /* A separator at the end of the list is not allowed */
            if(!s[i + (size_t) 1]) { error = 1; }
         }
      }
      if(error)
      {
         PRINT_ERROR("Invalid entry in distribution list removed");
         /* Locate the separator that terminates the invalid entry */
         p = strchr(&s[i + (size_t) 1], (int) ',');
         /* Continue at the position where the invalid entry started */
         i = start;
         if(NULL == p) { s[i] = 0; }  /* Last entry => Truncate */
         else
         {
            /* Remove invalid entry */
            if(!start) { p += 1; }  /* No separator before first entry */
            len = strlen(p);
            /* Move including NUL termination */
            memmove((void*) &s[start], (void*) p, ++len);
         }
         error = 0;
      }
      else { ++i; }
   }
}
5142 
5143 
5144 /* ========================================================================== */
5145 /*! \brief Verify UTF-8 encoding
5146  *
5147  * \param[in] s String to verify
5148  *
5149  * \attention
5150  * Read chapter 10 of RFC 3629 for UTF-8 security considerations.
5151  *
5152  * According to RFC 3629 the following rules are applied:
5153  * - Character code points beyond 0x10FFFF are invalid => We reject them.
5154  * - Only the shortest possible code sequence is allowed => We verify this.
5155  * - Surrogate character code points are invalid for UTF-8 => We reject them.
5156  *
5157  * \return
5158  * - 0 on success
5159  * - Negative value on error
5160  */
5161 
int enc_uc_check_utf8(const char* s)
{
   /* Nonzero enables the additional check for surrogate-pairs */
   const int check_surrogates = 1;

   return(enc_uc_check_cesu8(s, check_surrogates));
}
5167 
5168 
5169 /* ========================================================================== */
5170 /*! \brief Repair UTF-8 encoding
5171  *
5172  * \param[in] s String to repair
5173  *
5174  * Invalid UTF-8 sequences and invalid codepoints are replaced with U+FFFD.
5175  *
5176  * \return
5177  * - Pointer to new memory block on success
5178  * - \c NULL on error
5179  */
5180 
const char* enc_uc_repair_utf8(const char* s)
{
   /* U+FFFD in UTF-8 transformation format */
   const char rc[3] = { (char) 0xEF, (char) 0xBF, (char) 0xBD };
   char* res;
   size_t slen = strlen(s);
   size_t i = 0;
   size_t ri = 0;
   int c;
   int multibyte = 0;
   size_t len = 0;
   size_t remaining = 0;
   unsigned long int mbc = 0;
   int error = 0;
   int valid;

   /*
    * Worst case: Every octet of the source is replaced with U+FFFD (3 octets).
    * One additional octet is required for the NUL termination.
    */
   if(((size_t) -1 - (size_t) 1) / (size_t) 3 < slen) { return(NULL); }
   res = (char*) posix_malloc(slen * (size_t) 3 + (size_t) 1);
   if(NULL == res) { return(NULL); }

   /* Assignment in truth expression is intended */
   while((c = (int) s[i++]))
   {
      /* Resync after error: Skip the continuation octets that follow */
      if(error)
      {
         if((c & 0xC0) == 0x80) { continue; }
         multibyte = 0;
      }

      /* Start of new character */
      if(!multibyte)
      {
         /* Reset state machine */
         remaining = 0;
         mbc = 0;
         error = 0;
         if(0 <= c && 127 >= c) { res[ri++] = (char) c; continue; }
         multibyte = 1;
      }

      /* First octet of multibyte sequence */
      if(!remaining)
      {
         if((c & 0xE0) == 0xC0)
         {
            len = 2;
            mbc = (unsigned long int) (c & 0x1F) << 6;
         }
         else if((c & 0xF0) == 0xE0)
         {
            len = 3;
            mbc = (unsigned long int) (c & 0x0F) << 12;
         }
         else if((c & 0xF8) == 0xF0)
         {
            len = 4;
            mbc = (unsigned long int) (c & 0x07) << 18;
         }
         else
         {
            /* Invalid start of code sequence in UTF-8 data */
            memcpy((void*) &res[ri], (const void*) rc, (size_t) 3);
            ri += (size_t) 3;
            error = 1;
            continue;
         }
         remaining = len - (size_t) 1;
         continue;
      }

      /* Continuation octet expected */
      if((c & 0xC0) != 0x80)
      {
         /* Invalid continuation character in UTF-8 sequence */
         memcpy((void*) &res[ri], (const void*) rc, (size_t) 3);
         ri += (size_t) 3;
         /* An ASCII character that terminated the sequence is preserved */
         if(0 <= c && 127 >= c) { res[ri++] = (char) c; }
         error = 1;
         continue;
      }
      --remaining;
      mbc |= (unsigned long int) (c & 0x3F) << remaining * (size_t) 6;
      if(remaining) { continue; }

      /* Code sequence complete => Verify character code */
      switch(len)
      {
         case 2:
         {
            /* Reject non-shortest form */
            valid = (0x000080UL <= mbc);
            break;
         }
         case 3:
         {
            /* Reject non-shortest form and surrogate code points */
            valid = (0x000800UL <= mbc
                     && !(0x00D800UL <= mbc && 0x00DFFFUL >= mbc));
            break;
         }
         case 4:
         {
            /* Reject non-shortest form and code points beyond U+10FFFF */
            valid = (0x010000UL <= mbc && 0x10FFFFUL >= mbc);
            break;
         }
         default:
         {
            /* Abort here, the result buffer must not be used after release */
            PRINT_ERROR("Bug in UTF-8 repair state machine");
            posix_free((void*) res);
            return(NULL);
         }
      }
      if(valid)
      {
         /* Copy the verified code sequence unchanged */
         memcpy((void*) &res[ri], (const void*) &s[i - len], len);
         ri += len;
      }
      else
      {
         /* Invalid UTF-8 code sequence */
         memcpy((void*) &res[ri], (const void*) rc, (size_t) 3);
         ri += (size_t) 3;
         error = 1;
      }
      multibyte = 0;
   }

   /* Code sequence truncated by end of data is invalid too */
   if(multibyte && remaining && !error)
   {
      memcpy((void*) &res[ri], (const void*) rc, (size_t) 3);
      ri += (size_t) 3;
   }

   /* Terminate new string */
   res[ri] = 0;
   /* Verify again */
   if(enc_uc_check_utf8(res))
   {
      PRINT_ERROR("UTF-8 data still invalid after repair (bug)");
      posix_free((void*) res);
      res = NULL;
   }
   else { PRINT_ERROR("UTF-8 data repaired"); }

   return(res);
}
5345 
5346 
5347 /* ========================================================================== */
5348 /*! \brief Create wildmat pattern array
5349  *
5350  * \param[out] obj Pointer to wildmat pattern array
5351  * \param[in] wm RFC 3977 conformant wildmat
5352  *
5353  * This function splits a RFC 3977 conformant \c wildmat into its elements of
5354  * type \c wildmat-pattern . Every \c wildmat-pattern is converted to a POSIX
5355  * extended regular expression and stored together with a negation flag (that
5356  * is set if the \c wildmat-pattern was preceded by an exclamation mark) in
5357  * the array \e obj .
5358  *
5359  * On success the caller is responsible to free the memory allocated for the
5360  * resulting array with the function \e enc_destroy_wildmat() .
5361  *
5362  * \attention
5363  * If the wildmat \e wm contains Unicode data, it must be normalized to NFC by
5364  * the caller.
5365  *
5366  * \return
5367  * - Number of patterns in the object on success
5368  * - Negative value on error (\c NULL was written to \e obj)
5369  */
5370 
5371 int enc_create_wildmat(struct enc_wm_pattern** obj, const char* wm)
5372 {
5373  int res = 0;
5374  size_t len;
5375  size_t i = 0;
5376  char* buf = NULL;
5377  size_t bi = 0;
5378  int error = 0;
5379  int negate = 0;
5380  int store = 0;
5381  int eod = 0;
5382  struct enc_wm_pattern* p;
5383  size_t obj_len = 0;
5384 
5385  *obj = NULL;
5386 
5387  /* Wildmat must have valid UTF-8 encoding */
5388  if(!enc_uc_check_utf8(wm))
5389  {
5390  /* Check for invalid characters (backslash and brackets) */
5391  if(NULL == strpbrk(wm, "\x5C[]"))
5392  {
5393  /* Extract wildmat-pattern elements */
5394  do
5395  {
5396  store = 0;
5397  negate = 0;
5398  /* Allocate ERE buffer for next pattern */
5399  len = strlen(&wm[i]);
5400  /* Required buffer size (see below): Triple + 2 + NUL */
5401  buf = (char*) posix_malloc(len * (size_t) 3 + (size_t) 3);
5402  if(NULL == buf) { break; }
5403  else
5404  {
5405  bi = 0;
5406  buf[bi++] = '^';
5407  while(!store)
5408  {
5409  /* Check for EOD */
5410  if(!wm[i])
5411  {
5412  if((size_t) 1 < bi) { store = 1; }
5413  eod = 1;
5414  break;
5415  }
5416  /* Check for (remaining) special character */
5417  if(NULL != strchr(".()*+?{|^$", (int) wm[i]))
5418  {
5419  switch((int) wm[i])
5420  {
5421  /* Match arbitrary single UTF-8 codepoint (not octet) */
5422  case (int) ')':
5423  {
5424  /* Replace with "[)]" (*3) */
5425  buf[bi++] = '[';
5426  buf[bi++] = ')';
5427  buf[bi++] = ']';
5428  break;
5429  }
5430  case (int) '*':
5431  {
5432  /* Replace with ".*" (*2) */
5433  buf[bi++] = '.';
5434  buf[bi++] = '*';
5435  break;
5436  }
5437  case (int) '?':
5438  {
5439  /* Replace with dot (*1) */
5440  buf[bi++] = '.';
5441  break;
5442  }
5443  default:
5444  {
5445  /* Escape special character with backslash (*2) */
5446  buf[bi++] = 0x5C;
5447  buf[bi++] = wm[i];
5448  break;
5449  }
5450  }
5451  }
5452  else
5453  {
5454  switch((int) wm[i])
5455  {
5456  case 0x09:
5457  case 0x20:
5458  {
5459  /* Ignore whitespace */
5460  break;
5461  }
5462  case (int) '!':
5463  {
5464  negate = 1;
5465  break;
5466  }
5467  case (int) ',':
5468  {
5469  store = 1;
5470  break;
5471  }
5472  default:
5473  {
5474  /* Ordinary character */
5475  buf[bi++] = wm[i];
5476  break;
5477  }
5478  }
5479  }
5480  ++i;
5481  }
5482  /* Store element into object */
5483  if(!store) { posix_free((void*) buf); }
5484  else
5485  {
5486  if(INT_MAX == res) { error = 1; }
5487  else
5488  {
5489  buf[bi++] = '$';
5490  buf[bi] = 0;
5491  /* printf("Pattern converted to ERE: %s\n", buf); */
5492  obj_len += sizeof(struct enc_wm_pattern);
5493  p = (struct enc_wm_pattern*) posix_realloc(*obj, obj_len);
5494  if(NULL == p) { error = 1; }
5495  else
5496  {
5497  *obj = p;
5498  (*obj)[res].negate = negate;
5499  (*obj)[res].ere = buf;
5500  ++res;
5501  }
5502  }
5503  if(error)
5504  {
5505  posix_free((void*) buf);
5506  break;
5507  }
5508  }
5509  }
5510  }
5511  while(!eod);
5512  }
5513  }
5514 
5515  /* Check for error */
5516  if(error || !eod || 0 >= res)
5517  {
5518  PRINT_ERROR("Failed to convert RFC 3977 wildmat");
5519  enc_destroy_wildmat(obj, res);
5520  res = -1;
5521  }
5522 
5523  return(res);
5524 }
5525 
5526 
5527 /* ========================================================================== */
5528 /*! \brief Destroy wildmat pattern array
5529  *
5530  * \param[in,out] obj Pointer to wildmat pattern array
5531  * \param[in] num Number of elements in array
5532  *
5533  * \c NULL is written to the location pointed to by \e obj after releasing the
5534  * memory allocated for the array.
5535  */
5536 
5537 void enc_destroy_wildmat(struct enc_wm_pattern** obj, int num)
5538 {
5539  int i;
5540 
5541  if(NULL != obj && NULL != *obj)
5542  {
5543  for(i = 0; i < num; ++i)
5544  {
5545  posix_free((void*) (*obj)[i].ere);
5546  }
5547  posix_free((void*) *obj);
5548  *obj = NULL;
5549  }
5550 }
5551 
5552 
5553 /* ========================================================================== */
5554 /*! \brief Convert from canonical (RFC 822) to local (POSIX) form
5555  *
5556  * \param[in] s String to convert
5557  * \param[in] rcr Replace invalid CR control characters if nonzero
5558  * \param[in] rlf Replace invalid LF control characters if nonzero
5559  *
5560  * According to RFC 822 and RFC 2049 this function accepts plain text article
5561  * content in canonical form and converts the CRLF line breaks to local (POSIX,
5562  * single LF) form.
5563  *
5564  * \attention
5565  * Single CR and LF control characters (not part of a CRLF sequence) are
5566  * forbidden in canonical format of text by RFC 2045 and RFC 2046.
5567  * Default behaviour is to preserve single CR and LF control characters.
5568  * The Unicode codepoint defined by \c ENC_RC can be inserted as replacement
5569  * for CR or/and LF by setting \e rcr or/and \e rlf respectively to a nonzero
5570  * value.
5571  *
5572  * On success the caller is responsible to free the allocated memory.
5573  *
5574  * \return
5575  * - Pointer to decoded data (a new memory block was allocated)
5576  * - NULL on error
5577  */
5578 
5579 const char* enc_convert_canonical_to_posix(const char* s, int rcr, int rlf)
5580 {
5581  const char* res = NULL;
5582  size_t i = 0;
5583  char* buf = NULL;
5584  size_t len = 0;
5585  size_t bi = 0;
5586  char* p;
5587  size_t escr = 0;
5588  size_t eslf = 0;
5589  long int rc_ucp = ENC_RC;
5590  size_t di;
5591 
5592  if(NULL != s)
5593  {
5594  /* Check for empty string and accept it */
5595  if(!s[0])
5596  {
5597  p = (char*) posix_malloc((size_t) 1);
5598  if(NULL != p) { p[0] = 0; res = p; }
5599  }
5600  else
5601  {
5602  while(s[i])
5603  {
5604  /*
5605  * Reserve space for one Unicode codepoint in UTF-8 transformation
5606  * format (4 octets in worst case).
5607  * At least 1 octet must stay available for NUL termination.
5608  */
5609  if(bi + (size_t) 4 + (size_t) 1 >= len)
5610  {
5611  /* Allocate more memory in exponentially increasing chunks */
5612  if(!len) { len = 64; }
5613  p = (char*) posix_realloc((void*) buf, len *= (size_t) 2);
5614  if(NULL == p) { posix_free((void*) buf); buf = NULL; break; }
5615  else { buf = p; }
5616  }
5617  /* Check for end of line (CRLF) */
5618  if(bi && i && 0x0A == (int) s[i] && 0x0D == (int) s[i - (size_t) 1])
5619  {
5620  /* Yes => Replace the CR with LF and don't increment position */
5621  buf[bi - (size_t) 1] = 0x0A;
5622  }
5623  else if(i && 0x0A != (int) s[i] && 0x0D == (int) s[i - (size_t) 1])
5624  {
5625  /* Single CR character (not part of a CRLF sequence) detected */
5626  ++escr;
5627  if(rcr)
5628  {
5629  --bi;
5630  di = 1;
5631  enc_uc_encode_utf8(buf, &bi, &rc_ucp, &di);
5632  }
5633  buf[bi++] = s[i];
5634  }
5635  else if(0x0A == (int) s[i])
5636  {
5637  /* Single LF character (not part of a CRLF sequence) detected */
5638  ++eslf;
5639  if(rlf)
5640  {
5641  di = 1;
5642  enc_uc_encode_utf8(buf, &bi, &rc_ucp, &di);
5643  }
5644  else
5645  {
5646  buf[bi++] = s[i];
5647  }
5648  }
5649  else { buf[bi++] = s[i]; }
5650  ++i;
5651  }
5652  if(NULL != buf)
5653  {
5654  buf[bi] = 0;
5655  res = buf;
5656  /* Print stored errors */
5657  if(escr)
5658  {
5659  /* Print error message only once */
5660  PRINT_ERROR("Invalid CR control character(s) detected"
5661  " while decoding canonical format");
5662  }
5663  if(eslf)
5664  {
5665  /* Print error message only once */
5666  PRINT_ERROR("Invalid LF control character(s) detected"
5667  " while decoding canonical format");
5668  }
5669  }
5670  }
5671  }
5672 
5673  return(res);
5674 }
5675 
5676 
5677 /* ========================================================================== */
5678 /*! \brief Convert from local (POSIX) to canonical (RFC 822) form
5679  *
5680  * \param[in] s String to convert
5681  *
5682  * According to RFC 822 and RFC 2049 this function accepts plain text article
5683  * content in local (POSIX) form and converts the single LF line breaks to
5684  * canonical (CRLF) form.
5685  *
5686  * According to RFC 2045 and RFC 2046 single CR characters are deleted.
5687  *
5688  * On success the caller is responsible to free the allocated memory.
5689  *
5690  * \return
5691  * - Pointer to decoded data (a new memory block was allocated)
5692  * - NULL on error
5693  */
5694 
const char* enc_convert_posix_to_canonical(const char* s)
{
   char* out = NULL;
   char* grown;
   size_t cap = 0;
   size_t used = 0;
   const char* rd;

   if(NULL == s) { return(NULL); }

   /* An empty string is accepted and stays empty */
   if(!s[0])
   {
      out = (char*) posix_malloc((size_t) 1);
      if(NULL != out) { out[0] = 0; }
      return(out);
   }

   for(rd = s; *rd; ++rd)
   {
      /* Keep enough room for the octets written below plus CR + LF + NUL */
      if(used + (size_t) 4 >= cap)
      {
         /* Buffer grows in exponentially increasing chunks */
         if(!cap) { cap = 64; }
         cap *= (size_t) 2;
         grown = (char*) posix_realloc((void*) out, cap);
         if(NULL == grown) { posix_free((void*) out); return(NULL); }
         out = grown;
      }
      switch((int) *rd)
      {
         case 0x0A:
         {
            /* End of line => Store CR + LF */
            out[used++] = 0x0D;
            out[used++] = 0x0A;
            break;
         }
         case 0x0D:
         {
            /* CR is not copied to the result */
            PRINT_ERROR("Invalid CR control character deleted"
                        " while converting to canonical format");
            break;
         }
         default:
         {
            out[used++] = *rd;
            break;
         }
      }
   }

   /* Last line of nonempty result must end with CR + LF */
   if(used && 0x0A != (int) out[used - (size_t) 1])
   {
      out[used++] = 0x0D;
      out[used++] = 0x0A;
   }
   out[used] = 0;

   return(out);
}
5761 
5762 
5763 /* ========================================================================== */
5764 /*! \brief Convert string from supported character set to Unicode (UTF-8 NFC)
5765  *
5766  * \param[in] charset Character set of string \e s
5767  * \param[in] s String to convert
5768  *
5769  * According to RFC 2049 the following rules are applied:
5770  * - For all character sets from the ISO 8859 family that are not supported,
5771  * at least the ASCII characters must be decoded correctly
5772  * => We decode all non ASCII characters as "?" in this case.
5773  *
5774  * According to RFC 3629 the following rules are applied:
5775  * - If the input data is already UTF-8 it is not allowed to accept it
5776  * unchecked. It is mandatory to check the validity of the encoding
5777  * => We do so.
5778  *
5779  * \note
5780  * Some control characters that may cause problems are removed.
5781  *
5782  * \return
5783  * - Pointer to decoded Unicode data (UTF-8 encoded with NFC normalization)
5784  * If the result is not equal to \e s , a new memory block was allocated
5785  * - NULL on error (Original memory block for \e s is still allocated)
5786  */
5787 
5788 const char* enc_convert_to_utf8_nfc(enum enc_mime_cs charset, const char* s)
5789 {
5790  const char* res = NULL;
5791  char* p;
5792  const char* tmp;
5793  size_t len;
5794  size_t i;
5795  size_t ii;
5796  long int ucp;
5797  long int rc_ucp = ENC_RC;
5798  char rc_utf8[5] = { 0 };
5799  int cc_flag = 0; /* Flag indicating unwanted control characters */
5800  size_t di;
5801 
5802  switch(charset)
5803  {
5804  case ENC_CS_ISO8859_X:
5805  {
5806  PRINT_ERROR("Convert unsupported ISO 8859 character set as US-ASCII");
5807  /* No break here is intended */
5808  }
5809  case ENC_CS_ASCII:
5810  {
5811  len = strlen(s);
5812  p = (char*) posix_malloc(++len);
5813  if(NULL == p) { break; }
5814  for(i = 0; i < len; ++i)
5815  {
5816  p[i] = s[i];
5817  if((unsigned char) 127 < (unsigned char) p[i]) { p[i] = '?'; }
5818  }
5819  res = p;
5820  break;
5821  }
5822  case ENC_CS_ISO8859_1:
5823  case ENC_CS_ISO8859_2:
5824  case ENC_CS_ISO8859_3:
5825  case ENC_CS_ISO8859_4:
5826  case ENC_CS_ISO8859_5:
5827  case ENC_CS_ISO8859_6:
5828  case ENC_CS_ISO8859_7:
5829  case ENC_CS_ISO8859_8:
5830  case ENC_CS_ISO8859_9:
5831  case ENC_CS_ISO8859_10:
5832  case ENC_CS_ISO8859_11:
5833  case ENC_CS_ISO8859_13:
5834  case ENC_CS_ISO8859_14:
5835  case ENC_CS_ISO8859_15:
5836  case ENC_CS_ISO8859_16:
5837  case ENC_CS_MACINTOSH:
5838  case ENC_CS_KOI8R:
5839  case ENC_CS_KOI8U:
5840  case ENC_CS_WINDOWS_1250:
5841  case ENC_CS_WINDOWS_1251:
5842  case ENC_CS_WINDOWS_1252:
5843  case ENC_CS_WINDOWS_1253:
5844  case ENC_CS_WINDOWS_1254:
5845  case ENC_CS_WINDOWS_1255:
5846  case ENC_CS_WINDOWS_1256:
5847  case ENC_CS_WINDOWS_1257:
5848  case ENC_CS_WINDOWS_1258:
5849  case ENC_CS_IBM437:
5850  case ENC_CS_IBM775:
5851  case ENC_CS_IBM850:
5852  case ENC_CS_IBM852:
5853  case ENC_CS_IBM858:
5854  {
5855  res = enc_8bit_convert_to_utf8_nfc(charset, s);
5856  break;
5857  }
5858  case ENC_CS_ISO2022_JP:
5859  {
5860  res = enc_iso2022_convert_to_utf8_nfc(charset, s);
5861  break;
5862  }
5863  case ENC_CS_UTF_7:
5864  {
5865  res = enc_uc_convert_utf7_to_utf8(s);
5866  break;
5867  }
5868  case ENC_CS_CESU_8:
5869  {
5870  res = enc_uc_convert_cesu8_to_utf8(s);
5871  break;
5872  }
5873  case ENC_CS_UTF_8:
5874  {
5875  res = s;
5876  break;
5877  }
5878  default:
5879  {
5880  /* Not supported */
5881  res = NULL;
5882  break;
5883  }
5884  }
5885 
5886  /* Check encoding */
5887  if(NULL != res)
5888  {
5889  if(enc_uc_check_utf8(res))
5890  {
5891  /* Encoding is invalid */
5892  if(ENC_CS_UTF_8 != charset && ENC_CS_CESU_8 != charset
5893  && ENC_CS_UTF_7 != charset)
5894  {
5895  /*
5896  * Encoding was created by our own converters
5897  * (Likely a bug in the conversion tables or the ISO 2022 decoder)
5898  */
5899  PRINT_ERROR("Invalid UTF-8 encoding detected (bug)");
5900  }
5901  /* Repair encoding */
5902  tmp = enc_uc_repair_utf8(res);
5903  if(res != tmp && res != s) { posix_free((void*) res); }
5904  res = tmp;
5905  }
5906  }
5907 
5908  /* Normalize to NFC */
5909  if(NULL != res)
5910  {
5911  tmp = enc_uc_normalize_to_nfc(res);
5912  if(res != tmp && res != s) { posix_free((void*) res); }
5913  res = tmp;
5914  }
5915 
5916  /* Remove unwanted control characters */
5917  if(NULL != res)
5918  {
5919  i = 0;
5920  while(1)
5921  {
5922  ucp = enc_uc_decode_utf8(res, &i);
5923  if(-1L == ucp) { break; }
5924  if(enc_uc_check_control(ucp)) { cc_flag = 1; break; }
5925  }
5926  if(cc_flag)
5927  {
5928  /* Unwanted control characters found */
5929  len = strlen(res);
5930  di = 1;
5931  i = 0;
5932  enc_uc_encode_utf8(rc_utf8, &i, &rc_ucp, &di);
5933  rc_utf8[i] = 0;
5934  len *= strlen(rc_utf8);
5935  p = (char*) posix_malloc(++len);
5936  if(NULL == p)
5937  {
5938  if(s != res) { posix_free((void*) res); }
5939  res = NULL;
5940  }
5941  else
5942  {
5943  i = 0; ii = 0;
5944  while(1)
5945  {
5946  ucp = enc_uc_decode_utf8(res, &i);
5947  if(-1L == ucp) { break; }
5948  if(enc_uc_check_control(ucp))
5949  {
5950  /* Replace them */
5951  di = 1;
5952  enc_uc_encode_utf8(p, &ii, &rc_ucp, &di);
5953  }
5954  else
5955  {
5956  di = 1;
5957  enc_uc_encode_utf8(p, &ii, &ucp, &di);
5958  }
5959  }
5960  p[ii] = 0;
5961  if(s != res) { posix_free((void*) res); }
5962  res = p;
5963  }
5964  PRINT_ERROR("Unwanted control characters detected and replaced");
5965  }
5966  }
5967 
5968  return(res);
5969 }
5970 
5971 
5972 /* ========================================================================== */
5973 /*! \brief Convert string from Unicode (UTF-8 NFC) to an 8bit character set
5974  *
5975  * \param[out] charset Pointer to character set of result (or \c NULL)
5976  * \param[in] s Unicode string to convert in UTF-8 NFC format
5977  * \param[out] cs_iana Pointer to IANA charset name of result (or \c NULL)
5978  *
5979  * \attention
5980  * Ensure that the string \e s is valid UTF-8 and normalized to NFC. Otherwise
5981  * this function will not work as expected.
5982  *
5983  * According to RFC 2046 the following rules are applied:
5984  * - In general, composition software should always use the "lowest common
5985  * denominator" character set possible
5986  * => We do so by preferring the widely supported ISO 8859-1 character set.
5987  *
5988  * \note
5989  * If this function supports more character sets in the future, ISO 8859-1 must
5990  * always stay the preferred one (because this is our fallback locale character
5991  * set to allow the use of POSIX regular expressions without Unicode support
5992  * from the system).
5993  *
5994  * If \c NULL is passed as parameter \e charset or \e cs_iana , this indicates
5995  * that the caller is not interested in this information. The corresponding
5996  * data is discarded in this case.
5997  *
5998  * \return
5999  * - Pointer to encoded data (the character set is written to \e charset)
6000  * If the result is not equal to \e s , a new memory block was allocated
6001  * - NULL on error (Original memory block for \e s is still allocated)
6002  * Nothing is written to \e charset and \e cs_iana in this case
6003  */
6004 
6005 const char* enc_convert_to_8bit(enum enc_mime_cs* charset, const char* s,
6006  const char** cs_iana)
6007 {
6008  const char* res = NULL;
6009  size_t i = 0;
6010  size_t ii = 0;
6011  long int ucp = 0;
6012  char* p = NULL;
6013  size_t len;
6014  int error = 0;
6015 
6016  /*
6017  * Allocate target buffer with same size as source buffer.
6018  * This is always sufficient for every 8bit character set.
6019  */
6020  len = strlen(s);
6021  p = (char*) posix_malloc(++len);
6022  if(NULL != p)
6023  {
6024  while(1)
6025  {
6026  ucp = enc_uc_decode_utf8(s, &i);
6027  if(-1L == ucp) { break; }
6028  /* ISO 8859-1 is mapped 1:1 into the Unicode codepoint space */
6029  if(256L <= ucp) { error = 1; break; }
6030  else { p[ii++] = (char) (unsigned char) ucp; }
6031  }
6032  /* Check for error */
6033  if(error) { posix_free((void*) p); }
6034  else
6035  {
6036  p[ii] = 0;
6037  res = p;
6038  if(NULL != charset) { *charset = ENC_CS_ISO8859_1; }
6039  if(NULL != cs_iana) { *cs_iana = "ISO-8859-1"; }
6040  }
6041  }
6042 
6043  return(res);
6044 }
6045 
6046 
6047 /* ========================================================================== */
6048 /*! \brief Encode header field body using MIME \c encoded-word tokens
6049  *
6050  * This function use quoted-printable encoding.
6051  *
6052  * \param[out] r Pointer to result string pointer
6053  * \param[in] b Header field body that contains potential Unicode data
6054  * \param[in] pl Length of header field prefix (Length limit: 25)
6055  *
6056  * The header field body \e b must be verified by the caller to be valid UTF-8
6057  * (this function will do the normalization to NFC).
6058  * The CRLF termination must be removed before calling this function.
6059  *
6060  * The length \e pl must include the header field name, the colon and any
6061  * potential white space not included in \e b .
6062  *
6063  * According to RFC 5536 the following rules are applied:
6064  * - A header field line is not allowed to be empty
6065  * => The header field is never folded immediately after the name separator.
6066  * - Lines are not allowed to contain more than 1000 characters
6067  * => We respect this by rejecting words that are longer than 998 characters.
6068  *
6069  * According to RFC 2047 the following rules are applied:
6070  * - White space between encoded-words is semantically ignored
6071  * => A single space between encoded-words is included in the trailing word,
6072  * additional LWSP characters are included into the leading word.
6073  * - A header line containing encoded-words must be no longer than 76 characters
6074  * => We fold before this limit.
6075  * - If folding is required, each encoded-word must contain an integral number
6076  * of characters and must be self-contained
6077  * => We only split between Unicode combining character sequences when using
6078  * UTF-8
6079  * (between grapheme clusters would be better, but is not supported yet)
6080  * - If there is more than one character set that can represent the 8-bit
6081  * content of an encoded-word, ISO 8859 should be preferred
6082  * => We do so if the required ISO 8859 encoder is available
6083  * (can be disabled with the \c force_unicode option in configfile).
6084  * - If encoded-word is not used because of 8-bit data, US-ASCII should be used
6085  * => We do so
6086  * (can be disabled with the \c force_unicode option in configfile).
6087  *
6088  * According to RFC 5198 the following rules are applied:
6089  * - It's recommended to use NFC normalization in general Internet text messages
6090  * => We do so.
6091  *
6092  * On success, the address of the result buffer is written to the location
6093  * pointed to by \e r (this may be the same as \e b if there is nothing to do).
6094  * The caller is responsible to free the potentially allocated memory.
6095  * On error \c NULL is written to the location pointed to by \e r .
6096  *
6097  * \return
6098  * - 0 on success if a new memory block was allocated
6099  * - 1 on success if there was nothing to encode and no memory was allocated
6100  * - -1 on error
6101  */
6102 
6103 int enc_mime_word_encode(const char** r, const char* b, size_t pl)
6104 {
6105  static const char error_msg[] = "[Error]";
6106  static const char folding[] = "\n "; /* Line break must be in POSIX form */
6107  int res = 0;
6108  char* rbuf = NULL;
6109  size_t rbuf_len = 0;
6110  size_t ri = 0;
6111  const char* body = NULL;
6112  const char* body_tmp = NULL;
6113  const char* cs_iana = "UTF-8";
6114  enum enc_mime_cs cs = ENC_CS_UTF_8;
6115  size_t start = 0;
6116  size_t end = 0;
6117  size_t i = 0;
6118  size_t ii;
6119  size_t iii;
6120  int enc_flag = 0;
6121  int enc_last = 0;
6122  int enc_split = 0;
6123  char enc_word[1001]; /* sizeof(folding) + 998 + NUL */
6124  size_t ei;
6125  size_t word_len;
6126  unsigned int dh, dl;
6127  char* p;
6128  size_t rem = 0;
6129  int init = 1; /* Flag indicating initial word */
6130  int first = 1; /* Flag indicating first line of header field */
6131  int uc_split; /* Flag indicating Unicode must be split here */
6132 #if !ENC_MIME_HEADER_FOLD_ASCII_LINES
6133  int no_ec = 1; /* Flag indicating line contains no encoded-words */
6134 #endif /* ENC_MIME_HEADER_FOLD_ASCII_LINES */
6135  long int ucp; /* Unicode code point */
6136  struct uc_cdc cdc; /* Unicode canonical decomposition data */
6137  size_t gcpsl; /* Unicode combing character sequence length */
6138  int eod; /* End of data */
6139 
6140  /* Check parameters */
6141  if((size_t) 25 < pl)
6142  {
6143  PRINT_ERROR("MIME: Header field name too long");
6144  res = -1;
6145  }
6146  else
6147  {
6148  /* Calculate remaining bytes for folding */
6149  rem = (size_t) 76 - pl;
6150  /*
6151  * Check whether header field body contains only printable ASCII
6152  * and no "=?" or "?=" (to be more friendly) sequences
6153  */
6155  && NULL == strstr(b, "=?") && NULL == strstr(b, "?="))
6156  {
6157  /* Nothing to do => Data can be used "as is" */
6158  res = 1;
6159  }
6160  else
6161  {
6162  /* Check Unicode */
6163  if(enc_uc_check_utf8(b))
6164  {
6165  /* Invalid Unicode */
6166  PRINT_ERROR("MIME: Encoding of header field failed");
6167  p = (char*) posix_malloc(strlen(error_msg) + (size_t) 1);
6168  if(NULL != p) { strcpy(p, error_msg); }
6169  body_tmp = p;
6170  }
6171  else
6172  {
6173  /* Normalize Unicode */
6174  body_tmp = enc_uc_normalize_to_nfc(b);
6175  }
6176  if(NULL == body_tmp) { res = -1; }
6177  }
6178  }
6179 
6180  /* Check for error */
6181  if(!res)
6182  {
6183  /* Check whether user has forced Unicode */
6184  if (config[CONF_FORCE_UNICODE].val.i) { body = body_tmp; }
6185  else
6186  {
6187  /* Convert body to target character set */
6188  body = enc_convert_to_8bit(&cs, body_tmp, &cs_iana);
6189  if(NULL == body) { body = body_tmp; }
6190  else
6191  {
6192  /* Check for 7bit data */
6193  if (0 == enc_ascii_check(body))
6194  {
6195  cs = ENC_CS_ASCII;
6196  cs_iana = "US-ASCII";
6197  }
6198  }
6199  }
6200  /* Split body into words using SP delimiter */
6201  do
6202  {
6203  end = i++;
6204  if(!body[i] || ' ' == body[i])
6205  {
6206  /* Check for 2*LWSP */
6207  if(body[i])
6208  {
6209  if(' ' == body[i + (size_t) 1]
6210  || (char) 0x09 == body[i + (size_t) 1])
6211  {
6212  continue;
6213  }
6214  }
6215  /* Check whether word needs encoding */
6216  enc_last = enc_flag; enc_flag = 0;
6217  ei = 0;
6218  for(ii = start; ii <= end; ++ii)
6219  {
6220  enc_word[ei++] = body[ii];
6221  if(128U <= (unsigned int) body[ii]) { enc_flag = 1; break; }
6222  if('=' == (unsigned int) body[ii])
6223  {
6224  if((ii < end && '?' == body[ii + (size_t) 1])
6225  || (ii > start && '?' == body[ii - (size_t) 1]))
6226  {
6227  enc_flag = 1;
6228  break;
6229  }
6230  }
6231  }
6232  if(enc_split) { enc_flag = 1; }
6233  if(enc_flag)
6234  {
6235  /* Create MIME encoded word using quoted printable encoding */
6236 #if !ENC_MIME_HEADER_FOLD_ASCII_LINES
6237  no_ec = 0;
6238 #endif /* ENC_MIME_HEADER_FOLD_ASCII_LINES */
6239  strcpy(enc_word, "=?");
6240  strcat(enc_word, cs_iana);
6241  strcat(enc_word, "?Q?");
6242  uc_split = 0;
6243  if(enc_last && !enc_split)
6244  {
6245  /* The space between encoded words is not semantical */
6246  strcat(enc_word, "_");
6247  }
6248  ei = strlen(enc_word);
6249  for(ii = start; ii <= end; ++ii)
6250  {
6251  /* Check for start of UTF-8 sequence */
6252  if(ENC_CS_UTF_8 == cs && 0x80 != ((int) body[ii] & 0xC0))
6253  {
6254  /* Search for next starter */
6255  eod = 0;
6256  iii = 0;
6257  while(!uc_split)
6258  {
6259  /* Count bytes as "=XX", even if encoded "as is" */
6260  gcpsl = iii * (size_t) 3;
6261  /* Check for end of data */
6262  if(!body[ii + iii]) { eod = 1; }
6263  else
6264  {
6265  /* Decode UTF-8 sequence for codepoint */
6266  if(!body[ii + iii]) { break; }
6267  ucp = enc_uc_decode_utf8(&body[ii], &iii);
6268  if(0L > ucp)
6269  {
6270  PRINT_ERROR("MIME: Decoding UCP failed");
6271  break;
6272  }
6273  enc_uc_lookup_cdc(ucp, &cdc);
6274  /* Check for starter */
6275  if(!gcpsl)
6276  {
6277  /* If Starter => Skip */
6278  if(!cdc.ccc) { continue; }
6279  /* Else abort */
6280  else { break; }
6281  }
6282  }
6283  /*
6284  * Check for next Unicode combining character sequence
6285  * boundary
6286  */
6287  if(eod || !cdc.ccc) /* Check eod first */
6288  {
6289  /* Combining character sequence boundary found */
6290  /*
6291  * Reserve space for encoded word prefix and suffix:
6292  * "=?UTF-8?Q??="
6293  * => 12 characters with the folding space
6294  */
6295  if((size_t) (75 - 12) < gcpsl)
6296  {
6297  /* Combining character sequence too long */
6298  PRINT_ERROR("MIME: "
6299  "Combining character sequence too long");
6300  /*
6301  * Replace with '?' (U+FFFD is too large!)
6302  * Maximum allowed length is one "=XX" triplet.
6303  */
6304  enc_word[ei++] = '=';
6305  enc_word[ei++] = '3';
6306  enc_word[ei++] = 'F';
6307  ii += iii - (size_t) 1;
6308  uc_split = 1;
6309  }
6310  /*
6311  * Check for length limit
6312  * Reserve 2 characters for closing "?="
6313  * Special handling for first line with less space
6314  */
6315  else if(first && ((size_t) (rem - 2) - gcpsl < ei))
6316  {
6317  uc_split = 1;
6318  }
6319  else if((size_t) (75 - 2) - gcpsl < ei)
6320  {
6321  uc_split = 1;
6322  }
6323  break;
6324  }
6325  }
6326  }
6327  if(uc_split) { /* Rewind current byte */ --ii; }
6328  else
6329  {
6330  /* Check whether character can be encoded "as is" */
6331  if( ('0' <= body[ii] && '9' >= body[ii])
6332  || ('A' <= body[ii] && 'Z' >= body[ii])
6333  || ('a' <= body[ii] && 'z' >= body[ii])
6334  || '!' == body[ii] || '*' == body[ii] || '+' == body[ii]
6335  || '-' == body[ii] || '/' == body[ii] )
6336  {
6337  /* Yes */
6338  enc_word[ei++] = body[ii];
6339  }
6340  else
6341  {
6342  /* No => Encode with hexadecimal syntax */
6343  enc_word[ei++] = '=';
6344  dh = (unsigned int) (unsigned char) body[ii] / 16U;
6345  if(10U > dh) { enc_word[ei++] = (char) (48U + dh); }
6346  else { enc_word[ei++] = (char) (65U + dh - 10U); }
6347  dl = (unsigned int) (unsigned char) body[ii] % 16U;
6348  if(10U > dl) { enc_word[ei++] = (char) (48U + dl); }
6349  else { enc_word[ei++] = (char) (65U + dl - 10U); }
6350  }
6351  }
6352  /*
6353  * Check for length limit
6354  * Reserve 3 characters for next hexdecimal value
6355  * Reserve 2 characters for closing "?="
6356  */
6357  if(uc_split || (size_t) (75 - 3 - 2) < ei)
6358  {
6359  /* Terminate normally if there are no more characters */
6360  if(ii < end)
6361  {
6362  enc_split = 1;
6363  /* Rewind index to process skipped data in next run */
6364  i -= (end - ii);
6365  --i;
6366  break;
6367  }
6368  }
6369  else { enc_split = 0; }
6370  }
6371  /* End mark of encoded-word */
6372  enc_word[ei++] = '?';
6373  enc_word[ei++] = '=';
6374  }
6375  /* Terminate word */
6376  enc_word[ei] = 0;
6377  /* printf("Word: |%s|\n", enc_word); */
6378  /* One additional character for potential delimiting space */
6379  word_len = strlen(enc_word) + (size_t) 1;
6380  if((size_t) 998 < word_len)
6381  {
6382  PRINT_ERROR("MIME: Encoded-word too long");
6383  res = -1;
6384  break;
6385  }
6386  /* Fold header field if lines get too long otherwise */
6387  if(word_len && (word_len > rem)
6389  && !(no_ec && !enc_flag && (word_len < rem + (size_t) 922))
6390 #endif /* ENC_MIME_HEADER_FOLD_ASCII_LINES */
6391  )
6392  {
6393  /* Fold => This automatically creates SP delimiter */
6394  if(first)
6395  {
6396  PRINT_ERROR("MIME: Encoded-word too long for first line");
6397  res = -1;
6398  break;
6399  }
6400  else if(word_len > rem)
6401  {
6402  memmove((void*) &enc_word[strlen(folding)], (void*) enc_word,
6403  word_len--);
6404  /* Decrement because SP delimitier is part of folding mark */
6405  memcpy((void*) enc_word, (void*) folding, strlen(folding));
6406  word_len += strlen(folding);
6407  rem = (size_t) 75;
6408  }
6409 #if !ENC_MIME_HEADER_FOLD_ASCII_LINES
6410  /* Check whether last word was an encoded word */
6411  if(!enc_flag) { no_ec = 1; }
6412 #endif /* ENC_MIME_HEADER_FOLD_ASCII_LINES */
6413  }
6414  else
6415  {
6416  /*
6417  * Prepend SP delimiter
6418  * Note that this delimiter is always syntactical, but not sematical
6419  * between two encoded words!
6420  */
6421  if(init) { init = 0; --word_len; }
6422  else
6423  {
6424  memmove((void*) &enc_word[1], (void*) enc_word, word_len);
6425  enc_word[0] = ' ';
6426  }
6427  }
6428  /* Allocate more memory in exponentially increasing chunks */
6429  /* Attention: Be prepared for large data (ASCII only lines) */
6430  while(ri + word_len >= rbuf_len) /* One additional byte for NUL */
6431  {
6432  if(!rbuf_len) { rbuf_len = 128; }
6433  p = posix_realloc((void*) rbuf, rbuf_len *= (size_t) 2);
6434  if(NULL == p) { res = -1; break; }
6435  else { rbuf = p; }
6436  }
6437  if(-1 == res) { break; }
6438  /* Copy word to result buffer */
6439  memcpy((void*) &rbuf[ri], (void*) enc_word, word_len);
6440  ri += word_len;
6441  if(rem < word_len) { rem = 0; }
6442  else { rem -= word_len; }
6443  first = 0;
6444  /* Store new start index */
6445  start = i + (size_t) 1;
6446  }
6447  }
6448  while(body[i]);
6449  }
6450  if(body != body_tmp) { posix_free((void*) body); }
6451  if(body_tmp != b) { posix_free((void*) body_tmp); }
6452  /* Terminate result string */
6453  if(NULL != rbuf) { rbuf[ri] = 0; }
6454 
6455  /* Check result */
6456  switch(res)
6457  {
6458  case 0:
6459  {
6460  *r = (const char*) rbuf;
6461  break;
6462  }
6463  case 1:
6464  {
6465  *r = b;
6466  break;
6467  }
6468  default:
6469  {
6470  posix_free((void*) rbuf);
6471  *r = NULL;
6472  break;
6473  }
6474  }
6475  /* if(0 <= res) { printf("Result: %s\n", *r); } */
6476 
6477  return(res);
6478 }
6479 
6480 
6481 /* ========================================================================== */
6482 /*! \brief Decode header field containing potential MIME \c encoded-word tokens
6483  *
6484  * \param[out] r Pointer to result string pointer
6485  * \param[in] b Header field body that contains potential encoded-words
6486  *
6487  * The header field body \e b must be unfolded before calling this function.
6488  *
6489  * According to RFC 2047 the following rules are applied:
6490  * - An encoded-word is not allowed to be longer than 75 characters
6491  * => We decode encoded-word of arbitrary length.
6492  * - An encoded-word not at the beginning can start after a 'linear-white-space'
6493  * token => We resync the parser after every white space.
6494  * - Any amount of linear-space-white between 'encoded-word's must be ignored
6495  * => We do so.
6496  * - The character set and encoding fields must be treated case-insensitive
6497  * => We do so.
6498  * - All character sets from the ISO 8859 family that are not supported must be
6499  * handled in a way that contained ASCII characters are decoded correctly
6500  * => We do so.
6501  *
6502  * According to RFC 3629 the following rules are applied:
6503  * - If the content of an encoded word is UTF-8 encoded, it is is not allowed
6504  * to accept it unchecked. It is mandatory to check the validity of the
6505  * encoding => We do so.
6506  *
6507  * On success, the address of the result buffer is written to the location
6508  * pointed to by \e r (this may be the same as \e b if there is nothing to do).
6509  * The caller is responsible to free the potentially allocated memory.
6510  * On error \c NULL is written to the location pointed to by \e r .
6511  *
6512  * \return
6513  * - 0 on success if something was decoded and a new memory block was allocated
6514  * - 1 on success if there was nothing to decode and no memory was allocated
6515  * - -1 on error
6516  */
6517 
6518 int enc_mime_word_decode(const char** r, const char* b)
6519 {
6520  int res = 0;
6521  char* rbuf = NULL;
6522  size_t rbuf_len = 0;
6523  size_t ri = 0;
6524  size_t i = 0;
6525  const char* target;
6526  char* p;
6527  char* p2;
6528  enum enc_mime_cs charset;
6529  char encoding;
6530  const char* wbuf;
6531  const char* nbuf;
6532  size_t nbuf_len;
6533  int word_flag = 0;
6534  size_t word_trailing_space = 0;
6535  size_t ii;
6536  int ctrl = 0; /* Indicates unwanted control characters to remove */
6537  size_t len = 0;
6538 
6539  /* Fast special check for "no equal sign" */
6540  target = strchr(&b[i], (int) '=');
6541  if(NULL == target)
6542  {
6543  /* ... and no unwanted LF and CR control characters */
6544  target = strchr(&b[i], 0x0A);
6545  if(NULL == target)
6546  {
6547  target = strchr(&b[i], 0x0D);
6548  if(NULL == target) { res = 1; }
6549  }
6550  }
6551 
6552  while(!res && b[i])
6553  {
6554  wbuf = NULL;
6555  /* Skip white space */
6556  nbuf_len = 0;
6557  while(b[i] &&
6558  (' ' == b[i + nbuf_len] || (const char) 0x09 == b[i + nbuf_len]))
6559  {
6560  ++nbuf_len;
6561  }
6562  if(!nbuf_len)
6563  {
6564  /* Check for encoded word */
6565  p = NULL;
6566  target = &b[i];
6567  if('=' == target[0])
6568  {
6569  if('?' == target[1])
6570  {
6571  /* Start delimiter detected */
6572  p = strchr(&target[2], (int) '?');
6573  if(NULL != p)
6574  {
6575  /* Extract character set (ignore RF2231 language tokens) */
6576  p2 = strchr(&target[2], (int) '*');
6577  if(NULL == p2) { p2 = p; }
6578  else if(p < p2) { p2 = p; }
6579  charset = enc_mime_get_charset(&target[2],
6580  (size_t) (p2 - &target[2]));
6581  /* Extract encoding */
6582  if(p[1])
6583  {
6584  encoding = (char) toupper((int) p[1]);
6585  if('?' != p[2])
6586  {
6587  PRINT_ERROR("MIME: Syntax error in encoded-word");
6588  }
6589  else
6590  {
6591  /* Extract payload */
6592  target = &p[3];
6593  p = strchr(target, (int) '?');
6594  if(NULL != p)
6595  {
6596  if('=' != p[1])
6597  {
6598  PRINT_ERROR("MIME: "
6599  "Too many fields in encoded-word");
6600  }
6601  else
6602  {
6603  /* End delimiter detected */
6604  switch(encoding)
6605  {
6606  case 'Q':
6607  {
6608  /* Use quoted printable decoder */
6609  wbuf = enc_mime_decode_q(charset,
6610  target, p, 1);
6611  break;
6612  }
6613  case 'B':
6614  {
6615  /* Use base64 decoder */
6616  wbuf = enc_mime_decode_b(charset,
6617  target, p, 0);
6618  break;
6619  }
6620  default:
6621  {
6622  PRINT_ERROR("MIME: Encoding not supported");
6623  break;
6624  }
6625  }
6626  }
6627  }
6628  }
6629  }
6630  }
6631  }
6632  }
6633  if(NULL != wbuf)
6634  {
6635  /* Rewind white space between encoded words */
6636  if(word_flag)
6637  {
6638  while( ri && (' ' == rbuf[ri - (size_t) 1] ||
6639  0x09 == (int) rbuf[ri - (size_t) 1]) )
6640  {
6641  --ri;
6642  }
6643  ri += word_trailing_space;
6644  }
6645  /* Copy encoded word */
6646  word_flag = 1;
6647  nbuf = wbuf;
6648  nbuf_len = strlen(nbuf);
6649  i += (size_t) (&p[2] - &b[i]);
6650  /* Store number of trailing spaces */
6651  word_trailing_space = 0;
6652  if(nbuf_len)
6653  {
6654  ii = nbuf_len;
6655  while(ii--)
6656  {
6657  if(' ' != nbuf[ii]) { break; }
6658  else { ++word_trailing_space; }
6659  }
6660  }
6661  }
6662  else
6663  {
6664  /* Copy as ASCII up to next white space */
6665  word_flag = 0;
6666  nbuf = &b[i];
6667  p = strchr(nbuf, (int) ' ');
6668  p2 = strchr(nbuf, 0x09);
6669  if(NULL != p2 && p2 < p) { p = p2; }
6670  if(NULL == p) { nbuf_len = strlen(nbuf); }
6671  else { nbuf_len = (size_t) (p - nbuf); }
6672  i += nbuf_len;
6673  }
6674  }
6675  else
6676  {
6677  /* Copy white space */
6678  nbuf = &b[i];
6679  i += nbuf_len;
6680  }
6681 
6682  /* Allocate more memory in exponentially increasing chunks */
6683  while(ri + nbuf_len >= rbuf_len) /* 1 additional byte for termination */
6684  {
6685  if(!rbuf_len) { rbuf_len = 128; }
6686  if(POSIX_SIZE_MAX / (size_t) 2 < rbuf_len) { res = -1; break; }
6687  p = (char*) posix_realloc((void*) rbuf, rbuf_len *= (size_t) 2);
6688  if(NULL == p) { res = -1; break; }
6689  else { rbuf = p; }
6690  }
6691 
6692  /* Copy decoded word to result buffer */
6693  memcpy((void*) &rbuf[ri], (void*) nbuf, nbuf_len);
6694  ri += nbuf_len;
6695  if(NULL != wbuf) { posix_free((void*) wbuf); }
6696  }
6697  /* Terminate result string */
6698  if(NULL != rbuf) { len = ri; rbuf[len] = 0; }
6699 
6700  /* Replace unwanted LF and CR control characters with U+FFFE */
6701  if(NULL != rbuf)
6702  {
6703  ri = 0;
6704  while(rbuf[ri])
6705  {
6706  if(0x0A == (int) rbuf[ri] || 0x0D == (int) rbuf[ri])
6707  {
6708  ctrl = 1;
6709  break;
6710  }
6711  ++ri;
6712  }
6713  if(ctrl)
6714  {
6715  if(POSIX_SIZE_MAX / (size_t) 3 <= len) { res = -1; }
6716  else
6717  {
6718  /* Multiply string length by 3 and add 1 for NUL termination */
6719  rbuf_len = len * (size_t) 3 + (size_t) 1;
6720  p = (char*) posix_realloc((void*) rbuf, rbuf_len);
6721  if(NULL == p) { res = -1; }
6722  else
6723  {
6724  rbuf = p;
6725  /* Use U+FFFE (3 octets) as replacement character */
6726  ri = 0;
6727  do
6728  {
6729  if(0x0A == (int) rbuf[ri] || 0x0D == (int) rbuf[ri])
6730  {
6731  memmove((void*) &rbuf[ri + (size_t) 2], (void*) &rbuf[ri],
6732  rbuf_len - (ri + (size_t) 2));
6733  rbuf[ri++] = 0xEF;
6734  rbuf[ri++] = 0xBF;
6735  rbuf[ri] = 0xBD;
6736  }
6737  }
6738  while(rbuf[++ri]);
6739  }
6740  PRINT_ERROR("MIME: "
6741  "Unwanted CR and/or LF detected in header field");
6742  }
6743  }
6744  }
6745 
6746  /* Check result */
6747  switch(res)
6748  {
6749  case 0:
6750  {
6751  *r = (const char*) rbuf;
6752  break;
6753  }
6754  case 1:
6755  {
6756  *r = b;
6757  break;
6758  }
6759  default:
6760  {
6761  posix_free((void*) rbuf);
6762  *r = NULL;
6763  break;
6764  }
6765  }
6766 
6767  return(res);
6768 }
6769 
6770 
6771 /* ========================================================================== */
6772 /*! \brief Decode header field containing potential MIME parameters
6773  *
6774  * \param[out] r Pointer to result string pointer
6775  * \param[in] b Prepared header field body that contains potential parameters
6776  * \param[in] m Operating mode (see description below)
6777  *
6778  * The parameter \e m enables special processing if set to a nonzero value.
6779  * \e m should be set to 1 for the \c Content-Type header field.
6780  *
6781  * \attention
6782  * This function must be called after unfolding the field body, with comments
6783  * stripped and after decoding of \c quoted-string tokens. Whitespace must
6784  * already be merged into the semantically equivalent single SP (and removed
6785  * completely before semicolons and around equal signs) by the caller.
6786  *
6787  * According to RFC 2231 the following rules are applied:
6788  * - Parameters can be split into multiple sections which can be listed in
6789  * arbitrary order inside the header field body
6790  * => We accept parameter sections in any order and merge them in ascending
6791  * order.
6792  * - Parameter sections are allowed to contain literal content as well as
6793  * \c quoted-string tokens. Mixing sections of both types is allowed
6794  * => \c quoted-string tokens must already be decoded in \e b by the caller.
6795  * - Parameters can contain character set information
6796  * => We accept content in any supported character set and decode it to
6797  * Unicode NFC (non-US_ASCII octets of unsupported character sets are
6798  * decoded to the underscore character).
6799  * - Parameter can contain language information => We accept and ignore it.
6800  *
6801  * According to RFC 3629 the following rules are applied:
6802  * - If the content of a parameter is UTF-8 encoded, it is not allowed to
6803  * accept it unchecked. It is mandatory to check the validity of the encoding
6804  * => We do so.
6805  *
6806  * On success, the address of the result buffer is written to the location
6807  * pointed to by \e r (this may be the same as \e b if there is nothing to do).
6808  * The caller is responsible to free the potentially allocated memory.
6809  * On error \c NULL is written to the location pointed to by \e r .
6810  *
6811  * \return
6812  * - 0 on success if something was decoded and a new memory block was allocated
6813  * - 1 on success if there was nothing to decode and no memory was allocated
6814  * - -1 on error
6815  */
6816 
6817 int enc_mime_para_decode(const char** r, const char* b, int m)
6818 {
6819  int res = -1;
6820  posix_locale_t loc_ctype_posix = 0;
6821  int finished = 0;
6822  struct mime_parameter** parray = NULL;
6823  size_t ppsize = sizeof(struct mime_parameter*);
6824  struct mime_parameter* pdata;
6825  size_t psize = sizeof(struct mime_parameter);
6826  const char* first_end; /* Semicolon after first element */
6827  const char* p; /* Start of parameter */
6828  const char* p_cs; /* Start of charset name */
6829  const char* p_start; /* Start of parameter value */
6830  const char* p_end; /* End of parameter */
6831  const char* p_eq_sign; /* Position of equal sign */
6832  const char* p_asterisk; /* Position of asterisk (or 'p_eq_sign') */
6833  size_t alen; /* Length of attribute token */
6834  size_t clen; /* Length of charset token */
6835  size_t i = 0;
6836  size_t ii = 0;
6837  int rv;
6838  struct mime_parameter** tmp;
6839  char* tmp2;
6840  char* tmp3;
6841  unsigned int sec_num;
6842  char ext_mark; /* Flag indicating extended-parameter (with charset) */
6843  const char* q;
6844  char* rbuf = NULL;
6845  size_t rbuf_len;
6846  size_t ri = 0;
6847  size_t len;
6848  size_t len2;
6849  int rewind;
6850  int error = 0;
6851  char* para_charset; /* Pointer to charset declaration of first section */
6852 
6853  /* Create a locale object with LC_CTYPE == POSIX */
6854  loc_ctype_posix = posix_newlocale(POSIX_LC_CTYPE_MASK, "POSIX",
6855  (posix_locale_t) 0);
6856  if((posix_locale_t) 0 == loc_ctype_posix)
6857  {
6858  PRINT_ERROR("MIME: Cannot create locale object");
6859  return(res);
6860  }
6861 
6862  /* Nothing to do if there are no asterisks */
6863  if(NULL == strchr(b, (int) '*')) { *r = b; res = 1; }
6864  else
6865  {
6866 #if 0
6867  /* For debugging */
6868  printf("---------------\n");
6869  printf("Header field body : %s\n", b);
6870 #endif
6871  /* Skip to end of content */
6872  first_end = strchr(b, (int) ';');
6873  if(NULL == first_end) { *r = b; res = 1; }
6874  else
6875  {
6876  /* Initialize parameter section array */
6877  parray = (struct mime_parameter**) posix_malloc(ppsize);
6878  if(NULL != parray)
6879  {
6880  parray[0] = NULL;
6881  /* Parse parameters */
6882  p_end = first_end;
6883  do
6884  {
6885  p = p_end + 1;
6886  sec_num = 0;
6887  ext_mark = ' ';
6888  clen = 0;
6889  /* Skip potential space after semicolon */
6890  if(' ' == *p) { ++p; }
6891  /* Seach for end of parameter section content */
6892  p_end = strchr(p, (int) ';');
6893  if(NULL == p_end)
6894  {
6895  p_end = p + strlen(p);
6896  /* Strip potential trailing space */
6897  if(' ' == *(p_end - 1)) { --p_end; }
6898  finished = 1;
6899  }
6900  /* Search for end of parameter name */
6901  p_eq_sign = strchr(p, (int) '=');
6902  if(NULL == p_eq_sign) { break; }
6903  if(p_end < p_eq_sign) { break; }
6904  /* Search for end of attribute token (asterisk) */
6905  p_asterisk = strchr(p, (int) '*');
6906  if(NULL != p_asterisk && p_eq_sign > p_asterisk)
6907  {
6908  /* Extract section number */
6909  rv = sscanf(p_asterisk, " * %u", &sec_num);
6910  if(1 != rv)
6911  {
6912  /* No section number specified */
6913  sec_num = 0;
6914  /* Check for extended-parameter */
6915  sscanf(p_asterisk, " %c", &ext_mark);
6916  }
6917  else
6918  {
6919  /* Check for extended-parameter */
6920  sscanf(p_asterisk, " * %*u %c", &ext_mark);
6921  }
6922  }
6923  else { p_asterisk = p_eq_sign; }
6924  alen = (size_t) (p_asterisk - p);
6925  if(alen && ' ' == p[alen - (size_t) 1])
6926  {
6927  /* Strip potential trailing space */
6928  --alen;
6929  }
6930  /* Check for parameter attribute length limit */
6931  if(ENC_MIME_PARA_LENGTH_MAX < alen)
6932  {
6933  PRINT_ERROR("MIME: Parameter attribute too long");
6934  continue;
6935  }
6936  /* Extract charset */
6937  p_start = p_eq_sign + 1;
6938  p_cs = p_start;
6939  if(!sec_num && '*' == ext_mark)
6940  {
6941  q = strchr(p_start, 0x27);
6942  if(NULL == q)
6943  {
6944  PRINT_ERROR("MIME: Parameter charset field missing");
6945  }
6946  else
6947  {
6948  clen = (size_t) (q - p_start);
6949  if(ENC_MIME_PARA_LENGTH_MAX < clen)
6950  {
6951  PRINT_ERROR("MIME: Parameter charset too long");
6952  clen = 0;
6953  }
6954  p_start = q + 1;
6955  q = strchr(p_start, 0x27);
6956  if(NULL == q)
6957  {
6958  PRINT_ERROR("MIME: Parameter language field missing");
6959  }
6960  else { p_start = q + 1; }
6961  }
6962  }
6963  /* Remove unknown parameters for "Content-Type" mode */
6964  if(1 == m)
6965  {
6966  if(posix_strncasecmp_l(p, "Charset", alen, loc_ctype_posix)
6967  && posix_strncasecmp_l(p, "Format", alen, loc_ctype_posix)
6968  && posix_strncasecmp_l(p, "DelSp", alen, loc_ctype_posix)
6969  && posix_strncasecmp_l(p, "InsLine", alen, loc_ctype_posix)
6970  && posix_strncasecmp_l(p, "Boundary", alen, loc_ctype_posix)
6971  )
6972  {
6973  /* Ignore all other parameters */
6974  continue;
6975  }
6976  }
6977  /* Increase size of array */
6978  tmp = (struct mime_parameter**)
6979  posix_realloc(parray,
6980  ppsize += sizeof(struct mime_parameter*));
6981  if(NULL == tmp)
6982  {
6983  PRINT_ERROR("MIME: Parameter memory allocation failed");
6984  break;
6985  }
6986  parray = tmp;
6987  /* Construct parameter structure ... */
6988  pdata = (struct mime_parameter*) posix_malloc(psize);
6989  if(NULL == pdata)
6990  {
6991  PRINT_ERROR("MIME: Parameter memory allocation failed");
6992  break;
6993  }
6994  strncpy(pdata->attribute, p, alen);
6995  pdata->attribute[alen] = 0;
6996  pdata->attribute_len = alen;
6997  pdata->section = sec_num;
6998  strncpy(pdata->charset, p_cs, clen);
6999  pdata->charset[clen] = 0;
7000  pdata->value_start = p_start;
7001  pdata->value_end = p_end;
7002  pdata->valid = 1;
7003 #if 0
7004  /* For debugging */
7005  printf("Index : %u / Section: %u (%s): ", (unsigned int) i,
7006  sec_num, pdata->attribute);
7007  if(strlen(pdata->charset))
7008  {
7009  printf("[Charset: %s] ", pdata->charset);
7010  }
7011  for(size_t iii = 0; (size_t) (p_end - p_start) > iii; ++iii)
7012  {
7013  printf("%c", pdata->value_start[iii]);
7014  }
7015  printf("\n");
7016 #endif
7017  /* ... and append it to array */
7018  parray[i] = pdata;
7019  parray[++i] = NULL;
7020  }
7021  while(!finished);
7022  /* -------------------------------------------------------------- */
7023  /* Allocate new memory buffer for result */
7024  rbuf_len = (size_t) (first_end - b);
7025  /* 3 additional bytes for "; " separator and NUL termination */
7026  rbuf = (char*) posix_malloc(rbuf_len + (size_t) 3);
7027  if(NULL != rbuf)
7028  {
7029  /* Copy first element (including the semicolon) */
7030  strncpy(rbuf, b, rbuf_len);
7031  rbuf[rbuf_len] = 0;
7032  /* Strip SPs from first element for "Content-Type" mode */
7033  if(1 == m)
7034  {
7035  /* Assignment in truth expression is intended */
7036  while(NULL != (q = strchr(rbuf, (int) ' ')))
7037  {
7038  memmove((void*) q, (void*) (q + 1),
7039  strlen(q + 1) + (size_t) 1);
7040  if(rbuf_len) { --rbuf_len; }
7041  }
7042  }
7043  ri += rbuf_len;
7044  rbuf_len += (size_t) 3;
7045  /* Merge parameter sections */
7046  res = 0;
7047  i = 0;
7048  if(NULL == parray[i])
7049  {
7050  PRINT_ERROR("MIME: Missing parameters");
7051  }
7052  else do
7053  {
7054  /* Array contain at least 1 element */
7055  if(!parray[i]->valid) { continue; }
7056  if(parray[i]->section) { continue; }
7057  /* Found initial section => Insert separator */
7058  rbuf[ri++] = ';'; rbuf[ri++] = ' ';
7059  /* Select initial section to force first match */
7060  sec_num = 0;
7061  rewind = 0;
7062  ii = 0;
7063  para_charset = NULL;
7064  do
7065  {
7066  /* Search for next segment */
7067  if(rewind) { rewind = 0; ii = 0; }
7068  if(!parray[ii]->valid) { continue; }
7069  if(sec_num != parray[ii]->section) { continue; }
7070  else if(!strcmp(parray[i]->attribute,
7071  parray[ii]->attribute))
7072  {
7073  /* Calculate length */
7074  if(!sec_num)
7075  {
7076  /* One additional byte for NUL (and later '=') */
7077  alen = parray[ii]->attribute_len + (size_t) 1;
7078  }
7079  else { alen = 0; }
7080  len = alen;
7081  len += (size_t) (parray[ii]->value_end
7082  - parray[ii]->value_start);
7083  /* Allocate memory in exponentially increasing chunks */
7084  len += 3; /* For "; " separator and NUL termination */
7085  while(ri + len >= rbuf_len)
7086  {
7087  tmp2 = posix_realloc((void*) rbuf,
7088  rbuf_len *= (size_t) 2);
7089  if(NULL == tmp2)
7090  {
7091  PRINT_ERROR("MIME: Memory allocation"
7092  " for result buffer failed");
7093  error = 1;
7094  continue;
7095  }
7096  else { rbuf = tmp2; }
7097  }
7098  len -= (size_t) 3;
7099  /* Append attribute to result buffer for section 0 */
7100  if(!sec_num)
7101  {
7102  strncpy(&rbuf[ri], parray[ii]->attribute, alen);
7103  rbuf[ri + alen - (size_t) 1] = '=';
7104  ri += alen;
7105  len -= alen;
7106  }
7107  /* Only first parameter section has a charset field */
7108  if(!sec_num) { para_charset = parray[ii]->charset; }
7109  /* Append decoded value section to result buffer */
7110  tmp3 = NULL;
7111  if(NULL != para_charset)
7112  {
7113  /* Interpret zero length charset as "US-ASCII" */
7114  if((size_t) 0 == strlen(para_charset))
7115  {
7116  para_charset="US-ASCII";
7117  }
7118  /* Decode charset of value section */
7119  tmp2 = posix_malloc(len + (size_t) 1);
7120  if(NULL != tmp2)
7121  {
7122  strncpy(tmp2, parray[ii]->value_start, len);
7123  tmp2[len] = 0;
7124  tmp3 = enc_mime_decode_parameter(tmp2,
7125  para_charset);
7126  if(NULL != tmp3)
7127  {
7128  len2 = strlen(tmp3);
7129  if(len < len2)
7130  {
7131  PRINT_ERROR("MIME: Decoding error");
7132  posix_free((void*) tmp3);
7133  tmp3 = NULL;
7134  }
7135  else
7136  {
7137  strcpy(&rbuf[ri], tmp3);
7138  ri += len2;
7139  }
7140  }
7141  posix_free((void*) tmp2);
7142  }
7143  }
7144  if(NULL == tmp3)
7145  {
7146  strncpy(&rbuf[ri], parray[ii]->value_start, len);
7147  rbuf[ri + len] = 0;
7148  ri += len;
7149  }
7150  posix_free((void*) tmp3);
7151  parray[ii]->valid = 0;
7152  /* Rewind index for next section */
7153  rewind = 1;
7154  ++sec_num;
7155  }
7156  }
7157  while(!error && (NULL != parray[++ii] || rewind));
7158  parray[i]->valid = 0;
7159  }
7160  while(!error && NULL != parray[++i]);
7161  }
7162  /* Destroy parameter section array */
7163  i = 0;
7164  while(NULL != parray[i]) { posix_free((void*) parray[i++]); }
7165  posix_free((void*) parray);
7166  }
7167  }
7168  }
7169  if(error) { res = -1; }
7170 
7171  /* Destroy locale object */
7172  if((posix_locale_t) 0 != loc_ctype_posix)
7173  {
7174  posix_freelocale(loc_ctype_posix);
7175  }
7176 
7177  /* Check for error */
7178  if(0 > res) { *r = NULL; }
7179  if(!res)
7180  {
7181 #if 0
7182  /* For debugging (Attention: Terminal must use UTF-8 encoding!) */
7183  printf("Result: %s\n", rbuf);
7184  printf("---------------\n");
7185 #endif
7186  *r = rbuf;
7187  } else { posix_free((void*) rbuf); }
7188 
7189  return(res);
7190 }
7191 
7192 
7193 /* ========================================================================== */
7194 /*! \brief Decode MIME "Content-Type" header field
7195  *
7196  * \param[out] ct Pointer to result structure
7197  * \param[in] hf_body Header field body that contains the MIME content type
7198  * \param[out] bo Pointer to buffer for multipart boundary delimiter
7199  *
7200  * The header field body \e hf_body is decoded and content IDs are written to
7201  * the structure pointed to by \e ct .
7202  *
7203  * The buffer for the boundary string used in messages with content type
7204  * "multipart" must be allocated by the caller with a size of at least
7205  * \ref ENC_BO_BUFLEN and a pointer to the start of this buffer must be passed
7206  * as \e bo parameter. It is allowed to pass \c NULL for \e bo if the caller
7207  * is not interested in the boundary string.
7208  *
7209  * According to RFC 2045 the following rules are applied:
7210  * - If the content type is not present, "text/plain" and "US-ASCII" must be
7211  * used as default => We do so.
7212  *
7213  * According to RFC 2046 the following rules are applied:
7214  * - The content type and subtype must be treated case insensitive => We do so.
7215  * - The parameter names must be treated case insensitive => We do so.
7216  * - The default character set must be assumed as "US-ASCII" if the "charset"
7217  * parameter is missing for "text/plain" content type => We do so.
7218  *
7219  * According to RFC 3676 the following rules are applied:
7220  * - The values of parameters "Format" and "DelSp" must be treated case
7221  * insensitive => We do so.
7222  * - The parameter "DelSp" should be ignored if content type is not "text/plain"
7223  * with "format=flowed" => We do so.
7224  *
7225  * The experimental parameter "InsLine" set to "yes" adds an empty line
7226  * separator after every paragraph that end with an empty line.
7227  * This allows to declare single lines as paragraphs, e.g. for Smartphones,
7228  * without losing the separation to the following text (or creating double empty
7229  * line separation in compatibility view).
7230  *
7231  * \note
7232  * This function never fails, instead \c ENC_xxx_UNKNOWN IDs are returned.
7233  */
7234 
7235 void enc_mime_get_ct(struct enc_mime_ct* ct, const char* hf_body, char* bo)
7236 {
7237  char* body = NULL;
7238  size_t len;
7239  size_t i;
7240  size_t ii;
7241  char fmt[ENC_FMT_BUFLEN];
7242  char cs[ENC_CS_BUFLEN];
7243  size_t bo_len, bo_len_valid;
7244  int trailing_sp = 0;
7245 
7246  /* Initialize result */
7247  ct->type = ENC_CT_UNKNOWN;
7248  ct->subtype = ENC_CTS_UNKNOWN;
7249  ct->charset = ENC_CS_UNKNOWN;
7250  ct->flags = 0;
7251 
7252  /* Accept NULL pointer (treat as "field is not present") */
7253  if(NULL == hf_body)
7254  {
7255 #if 0
7256  /* For debugging */
7257  printf("Content-Type: Not specified\n");
7258 #endif
7259  ct->type = ENC_CT_TEXT;
7260  ct->subtype = ENC_CTS_PLAIN;
7261  ct->charset = ENC_CS_ASCII;
7262  return;
7263  }
7264 
7265  /* Allocate memory for case conversion */
7266  len = strlen(hf_body);
7267  body = (char*) posix_malloc(len + (size_t) 1);
7268  if(NULL != body)
7269  {
7270  /* Convert header field body to upper case */
7271  for(i = 0; i < len; ++i) { body[i] = (char) toupper((int) hf_body[i]); }
7272  body[len] = 0;
7273 #if 0
7274  /* For debugging */
7275  printf("Content-Type: %s\n", body);
7276 #endif
7277  /* Check for content type "text" */
7278  if(!strncmp("TEXT", body, 4))
7279  {
7280  ct->type = ENC_CT_TEXT;
7281  ct->charset = ENC_CS_ASCII;
7282  if(!strncmp("TEXT/PLAIN", body, 10))
7283  {
7284  ct->subtype = ENC_CTS_PLAIN;
7285  /* Search for RFC 3676 "Format" parameter (case insensitive) */
7286  for(i = 0; i < len; ++i)
7287  {
7288  if(!strncmp("FORMAT", &body[i], 6))
7289  {
7290  /* Extract parameter value */
7291  ii = i + (size_t) 6;
7292  while(body[ii])
7293  {
7294  if('=' != body[ii] && ' ' != body[ii]) { break; }
7295  else { ++ii; }
7296  }
7297  for(i = 0; i < ENC_FMT_BUFLEN; ++i)
7298  {
7299  if(!body[ii + i]
7300  || ';' == body[ii + i] || ' ' == body[ii + i])
7301  {
7302  fmt[i] = 0; break;
7303  }
7304  else { fmt[i] = body[ii + i]; }
7305  }
7306  fmt[ENC_FMT_BUFLEN - (size_t) 1] = 0;
7307  if(!strncmp("FLOWED", fmt, 6))
7308  {
7309  ct->flags |= ENC_CT_FLAG_FLOWED;
7310  }
7311  break;
7312  }
7313  }
7314  if(ct->flags & ENC_CT_FLAG_FLOWED)
7315  {
7316  /* Search for RFC 3676 "DelSp" parameter (case insensitive) */
7317  for(i = 0; i < len; ++i)
7318  {
7319  if(!strncmp("DELSP", &body[i], 5))
7320  {
7321  /* Extract parameter value */
7322  ii = i + (size_t) 5;
7323  while(body[ii])
7324  {
7325  if('=' != body[ii] && ' ' != body[ii]) { break; }
7326  else { ++ii; }
7327  }
7328  for(i = 0; i < ENC_FMT_BUFLEN; ++i)
7329  {
7330  if(!body[ii + i]
7331  || ';' == body[ii + i] || ' ' == body[ii + i])
7332  {
7333  fmt[i] = 0; break;
7334  }
7335  else { fmt[i] = body[ii + i]; }
7336  }
7337  fmt[ENC_FMT_BUFLEN - (size_t) 1] = 0;
7338  if(!strncmp("YES", fmt, 3))
7339  {
7340  ct->flags |= ENC_CT_FLAG_DELSP;
7341  }
7342  break;
7343  }
7344  }
7345  /* Search for "InsLine" parameter (case insensitive) */
7346  for(i = 0; i < len; ++i)
7347  {
7348  if(!strncmp("INSLINE", &body[i], 7))
7349  {
7350  /* Extract parameter value */
7351  ii = i + (size_t) 7;
7352  while(body[ii])
7353  {
7354  if('=' != body[ii] && ' ' != body[ii]) { break; }
7355  else { ++ii; }
7356  }
7357  for(i = 0; i < ENC_FMT_BUFLEN; ++i)
7358  {
7359  if(!body[ii + i]
7360  || ';' == body[ii + i] || ' ' == body[ii + i])
7361  {
7362  fmt[i] = 0; break;
7363  }
7364  else { fmt[i] = body[ii + i]; }
7365  }
7366  fmt[ENC_FMT_BUFLEN - (size_t) 1] = 0;
7367  if(!strncmp("YES", fmt, 3))
7368  {
7369  ct->flags |= ENC_CT_FLAG_INSLINE;
7370  }
7371  break;
7372  }
7373  }
7374  }
7375  }
7376  /* Search for "charset" parameter */
7377  for(i = 0; i < len; ++i)
7378  {
7379  if(!strncmp("CHARSET", &body[i], 7))
7380  {
7381  /* Extract parameter value */
7382  ii = i + (size_t) 7;
7383  while(body[ii])
7384  {
7385  if('=' != body[ii] && ' ' != body[ii]) { break; }
7386  else { ++ii; }
7387  }
7388  for(i = 0; i < ENC_CS_BUFLEN; ++i)
7389  {
7390  if(!body[ii + i]
7391  || ';' == body[ii + i] || ' ' == body[ii + i])
7392  {
7393  cs[i] = 0; break;
7394  }
7395  else { cs[i] = body[ii + i]; }
7396  }
7397  cs[ENC_CS_BUFLEN - (size_t) 1] = 0;
7398  ct->charset = enc_mime_get_charset(cs, strlen(cs));
7399  break;
7400  }
7401  }
7402  }
7403  /* Check for content type "image" */
7404  else if(!strncmp("IMAGE", body, 5))
7405  {
7406  ct->type = ENC_CT_IMAGE;
7407  }
7408  /* Check for content type "audio" */
7409  else if(!strncmp("AUDIO", body, 5))
7410  {
7411  ct->type = ENC_CT_AUDIO;
7412  }
7413  /* Check for content type "video" */
7414  else if(!strncmp("VIDEO", body, 5))
7415  {
7416  ct->type = ENC_CT_VIDEO;
7417  }
7418  /* Check for content type "message" (only subtype "rfc822" supported) */
7419  else if(!strncmp("MESSAGE/RFC822", body, 14))
7420  {
7421  ct->type = ENC_CT_MESSAGE;
7422  ct->subtype = ENC_CTS_RFC822;
7423  }
7424  /* Check for content type "multipart", map unknown subtypes to "mixed" */
7425  else if(!strncmp("MULTIPART", body, 9))
7426  {
7427  ct->type = ENC_CT_MULTIPART;
7428  ct->subtype = ENC_CTS_MIXED;
7429  if(!strncmp("MULTIPART/ALTERNATIVE", body, 21))
7430  {
7432  }
7433  else if(!strncmp("MULTIPART/DIGEST", body, 16))
7434  {
7435  ct->subtype = ENC_CTS_DIGEST;
7436  }
7437  bo[0] = 0;
7438  /* Search for "boundary" parameter */
7439  for(i = 0; i < len; ++i)
7440  {
7441  if(!strncmp("BOUNDARY", &body[i], 8))
7442  {
7443  /* Extract case sensitive parameter value */
7444  ii = i + (size_t) 8;
7445  if('=' != hf_body[ii++])
7446  {
7447  PRINT_ERROR("MIME: "
7448  "Missing multipart boundary parameter value");
7449  }
7450  else
7451  {
7452  /* Start of boundary parameter value found */
7453  for(i = 0; i < ENC_BO_BUFLEN; ++i)
7454  {
7455  if(!hf_body[ii + i] || ';' == hf_body[ii + i])
7456  {
7457  bo[i] = 0; break;
7458  }
7459  else { bo[i] = hf_body[ii + i]; }
7460  }
7461  bo[ENC_BO_BUFLEN - (size_t) 1] = 0;
7462  /* Check boundary */
7463  bo_len = strlen(bo);
7464  bo_len_valid = strspn(bo,
7465  "0123456789"
7466  "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
7467  "'()+_,-./:=?"
7468  " ");
7469  if (bo_len_valid != bo_len)
7470  {
7471  PRINT_ERROR("MIME: Invalid multipart boundary parameter");
7472  if(bo_len_valid && ' ' != bo[bo_len_valid - (size_t) 1])
7473  {
7474  /* Strip invalid tail */
7475  bo[bo_len_valid] = 0;
7476  bo_len = bo_len_valid;
7477  }
7478  else
7479  {
7480  /* Boundary not usable, use "?" as dummy replacement */
7481  bo[0] = '?';
7482  bo_len = 1;
7483  bo[bo_len] = 0;
7484  }
7485  }
7486  while(bo_len)
7487  {
7488  if(' ' != bo[bo_len - (size_t) 1]) { break; }
7489  /* Strip trailing SP */
7490  trailing_sp = 1;
7491  bo[--bo_len] = 0;
7492  }
7493  if(trailing_sp)
7494  {
7495  PRINT_ERROR("MIME: Stripped trailing whitespace "
7496  "from multipart boundary parameter");
7497  }
7498  }
7499  break;
7500  }
7501  }
7502  }
7503  }
7504  posix_free((void*) body);
7505 }
7506 
7507 
7508 /* ========================================================================== */
7509 /*! \brief Decode content transfer encoding description
7510  *
7511  * \param[in] hf_body MIME content transfer encoding description string
7512  *
7513  * This function checks whether the string \e hf_body represents a supported
7514  * content transfer encoding and return the corresponding ID for it.
7515  * According to RFC 2045 the content transfer encoding is treated
7516  * case-insensitive.
7517  *
7518  * \note
7519  * It is allowed to call this function with \e hf_body set to \c NULL. This is
7520  * treated as a missing header field and the return value will be
7521  * \c ENC_CTE_BIN (not \c ENC_CTE_UNKNOWN ).
7522  *
7523  * \note
7524  * RFC 2049 requires that every non-7bit MIME content must be labeled with a
7525  * content transfer encoding header field of "8bit" or "binary".
7526  *
7527  * \result
7528  * - MIME content transfer encoding ID (from \ref enc_mime_cte )
7529  * - \c ENC_CTE_UNKNOWN on error
7530  */
7531 /* If this header field is missing, we assume "binary" instead of "7bit".
7532  * Don't change this because it is required for handling unknown transfer
7533  * encodings!
7534  */
7535 
7536 enum enc_mime_cte enc_mime_get_cte(const char* hf_body)
7537 {
7538  enum enc_mime_cte res = ENC_CTE_BIN;
7539  char buf[ENC_CTE_BUFLEN];
7540  size_t len;
7541  size_t i;
7542  const char not_supported[]
7543  = "ENC: MIME: Unsupported content transfer encoding: ";
7544  char* p;
7545  size_t l;
7546 
7547  /* Accept NULL pointer */
7548  if(NULL != hf_body)
7549  {
7550  res = ENC_CTE_UNKNOWN;
7551  len = strlen(hf_body);
7552  if(ENC_CTE_BUFLEN <= len)
7553  {
7554  /* If you get this error, the value of 'ENC_CTE_BUFLEN' is too small */
7555  PRINT_ERROR("MIME: Name of content transfer encoding too long");
7556  }
7557  else
7558  {
7559  /* Convert description to upper case */
7560  for(i = 0; i < len; ++i)
7561  {
7562  buf[i] = (char) toupper((int) hf_body[i]);
7563  }
7564  buf[len] = 0;
7565  /* Check for all known content transfer encodings */
7566  if(!strcmp(buf, "7BIT")) { res = ENC_CTE_7BIT; }
7567  if(!strcmp(buf, "8BIT")) { res = ENC_CTE_8BIT; }
7568  if(!strcmp(buf, "BINARY")) { res = ENC_CTE_BIN; }
7569  if(!strcmp(buf, "QUOTED-PRINTABLE")) { res = ENC_CTE_Q; }
7570  if(!strcmp(buf, "BASE64")) { res = ENC_CTE_B; }
7571  /* To be more tolerant: Check again for invalid identity declaration */
7572  if(!strcmp(buf, "7-BIT"))
7573  {
7574  PRINT_ERROR("MIME: "
7575  "Invalid content transfer encoding 7-bit accepted as 7bit");
7576  res = ENC_CTE_7BIT;
7577  }
7578  if(!strcmp(buf, "8-BIT"))
7579  {
7580  PRINT_ERROR("MIME: "
7581  "Invalid content transfer encoding 8-bit accepted as 8bit");
7582  res = ENC_CTE_8BIT;
7583  }
7584  /* Check whether content transfer encoding is supported */
7585  if(ENC_CTE_UNKNOWN == res)
7586  {
7587  l = strlen(not_supported) + len;
7588  p = (char*) posix_malloc(++l);
7589  if(NULL != p)
7590  {
7591  strcpy(p, not_supported);
7592  strncat(p, buf, len);
7593  print_error(p);
7594  posix_free((void*) p);
7595  }
7596  }
7597  }
7598  }
7599 
7600  return(res);
7601 }
7602 
7603 
7604 /* ========================================================================== */
7605 /*! \brief Decode content disposition
7606  *
7607  * \param[in] hf_body Body of Content-Disposition header field
7608  * \param[out] type Pointer to content disposition type ID
7609  * \param[out] filename Pointer to filename
7610  *
7611  * The field body \e hf_body must be unfolded and preprocessed (parameters must
7612  * be already decoded according to RFC 2231).
7613  * The value for the filename parameter must be already converted to UTF-8.
7614  *
7615  * If a filename parameter is present, a new memory block is allocated for
7616  * \e filename . Otherwise \c NULL is returned.
7617  */
7618 
7619 void enc_mime_get_cd(const char* hf_body,
7620  enum enc_mime_cd* type, const char** filename)
7621 {
7622  posix_locale_t loc_ctype_posix;
7623  char* body = NULL;
7624  const char* fn_para = "FILENAME=";
7625  const char* p;
7626  const char* q;
7627  size_t len;
7628  char* buf;
7629  size_t i;
7630 
7631  /* Prepare values to return if an error occurs */
7632  *type = ENC_CD_UNKNOWN;
7633  *filename = NULL;
7634 
7635  /* Extract disposition type (case-insensitive) */
7636  loc_ctype_posix = posix_newlocale(POSIX_LC_CTYPE_MASK, "POSIX",
7637  (posix_locale_t) 0);
7638  if((posix_locale_t) 0 == loc_ctype_posix)
7639  {
7640  PRINT_ERROR("MIME: Cannot create locale object");
7641  return;
7642  }
7643  else
7644  {
7645  if(!posix_strncasecmp_l(hf_body, "inline", strlen("inline"),
7646  loc_ctype_posix))
7647  {
7648  *type = ENC_CD_INLINE;
7649  }
7650  else if(!posix_strncasecmp_l(hf_body, "attachment", strlen("attachment"),
7651  loc_ctype_posix))
7652  {
7653  *type = ENC_CD_ATTACHMENT;
7654  }
7655  posix_freelocale(loc_ctype_posix);
7656  }
7657 
7658  /* Extract filename */
7659  len = strlen(hf_body);
7660  body = (char*) posix_malloc(len + (size_t) 1);
7661  if(NULL != body)
7662  {
7663  /* Convert header field body to upper case */
7664  for(i = 0; i < len; ++i) { body[i] = (char) toupper((int) hf_body[i]); }
7665  body[len] = 0;
7666  /* Check for parameter "filename" */
7667  p = strstr(body, fn_para);
7668  if(NULL != p)
7669  {
7670  p += strlen(fn_para);
7671  q = strchr(p, ';');
7672  if(NULL != q) { len = (size_t) (q - p); }
7673  else { len = strlen(p); }
7674  /* Copy filename case-sensitive */
7675  buf = (char*) malloc(len + (size_t) 1);
7676  if(NULL != buf)
7677  {
7678  i = (size_t) (p - body);
7679  strncpy(buf, &hf_body[i], len);
7680  buf[len] = 0;
7681  /* Strip path, if present */
7682  p = strrchr(buf, '/');
7683  if(NULL != p) { ++p; memmove(buf, p, strlen(p) + (size_t) 1); }
7684  /*
7685  * Reject filename if it contains '~', '|' or '\' characters.
7686  * See RFC 2183 Section 5 "Security Considerations" for details
7687  */
7688  p = strpbrk(buf, "~|\x5C");
7689  if(NULL != p)
7690  {
7691  PRINT_ERROR("MIME: "
7692  "Filename in Content-Disposition rejected");
7693  }
7694  else { *filename = buf; }
7695  }
7696  }
7697  }
7698  posix_free((void*) body);
7699 }
7700 
7701 
7702 /* ========================================================================== */
7703 /*! \brief Decode MIME content transfer encoding and save to file
7704  *
7705  * \param[in] pn Pathname of file
7706  * \param[in] cte MIME content transfer encoding
7707  * \param[in] entity MIME entity body
7708  *
7709  * According to RFC 2049 all transfer encodings not defined in MIME 1.0 are
7710  * rejected.
7711  *
7712  * \return
7713  * - 0 on success
7714  * - -1 on error
7715  */
7716 
7717 int enc_mime_save_to_file(const char* pn, enum enc_mime_cte cte,
7718  const char* entity)
7719 {
7720  int res = -1;
7721  size_t len = strlen(entity);
7722  const char* p = entity;
7723  const char* buf = NULL;
7724  int fd;
7725  int rv;
7726  posix_mode_t perm = POSIX_S_IRUSR | POSIX_S_IWUSR |
7727  POSIX_S_IRGRP | POSIX_S_IWGRP |
7728  POSIX_S_IROTH | POSIX_S_IWOTH;
7729 
7730  /* Decode transfer encoding */
7731  switch(cte)
7732  {
7733  case ENC_CTE_Q:
7734  {
7735  buf = enc_mime_decode_qp(entity, &entity[len], 0, &len);
7736  p = buf;
7737  break;
7738  }
7739  case ENC_CTE_B:
7740  {
7741  buf = enc_mime_decode_base64(entity, &entity[len], 0, &len);
7742  p = buf;
7743  break;
7744  }
7745  case ENC_CTE_7BIT:
7746  case ENC_CTE_8BIT:
7747  case ENC_CTE_BIN:
7748  {
7749  break;
7750  }
7751  default:
7752  {
7753  PRINT_ERROR("MIME: Content transfer encoding not supported");
7754  break;
7755  }
7756  }
7757 
7758  /* Save to file */
7759  if(NULL != p)
7760  {
7761  rv = fu_open_file(pn, &fd, POSIX_O_WRONLY | POSIX_O_CREAT | POSIX_O_TRUNC,
7762  perm);
7763  if(rv)
7764  {
7765  PRINT_ERROR("MIME: Opening file failed");
7766  }
7767  else
7768  {
7769  rv = fu_write_to_filedesc(fd, p, len);
7770  if(rv)
7771  {
7772  PRINT_ERROR("MIME: Writing to file failed");
7773  }
7774  else { res = 0; }
7775  fu_close_file(&fd, NULL);
7776  }
7777  }
7778 
7779  enc_free((void*) buf);
7780 
7781  return(res);
7782 }
7783 
7784 
7785 /* ========================================================================== */
7786 /*! \brief Decode MIME text content to UTF-8 NFC
7787  *
7788  * \param[in] cte MIME content transfer encoding
7789  * \param[in] charset MIME character set
7790  * \param[in] s MIME encoded data
7791  *
7792  * According to RFC 2049 all transfer encodings not defined in MIME 1.0 are
7793  * rejected.
7794  *
7795  * \return
7796  * - Pointer to decoded data.
7797  * If the result is not equal to \e s , a new memory block was allocated
7798  * - NULL on error (Original memory block for \e s is still allocated)
7799  */
7800 
7801 const char* enc_mime_decode(enum enc_mime_cte cte, enum enc_mime_cs charset,
7802  const char* s)
7803 {
7804  const char* res = NULL;
7805  size_t len = strlen(s);
7806 
7807  /* Decode transfer encoding and convert charset to Unicode */
7808  switch(cte)
7809  {
7810  case ENC_CTE_Q:
7811  {
7812  res = enc_mime_decode_q(charset, s, &s[len], 0);
7813  break;
7814  }
7815  case ENC_CTE_B:
7816  {
7817  res = enc_mime_decode_b(charset, s, &s[len], 0);
7818  break;
7819  }
7820  case ENC_CTE_7BIT:
7821  case ENC_CTE_8BIT:
7822  case ENC_CTE_BIN:
7823  {
7824  res = enc_convert_to_utf8_nfc(charset, s);
7825  break;
7826  }
7827  default:
7828  {
7829  PRINT_ERROR("MIME: Content transfer encoding not supported");
7830  break;
7831  }
7832  }
7833 
7834  return(res);
7835 }
7836 
7837 
7838 /* ========================================================================== */
7839 /*! \brief Decode MIME "text/plain" content with "format=flowed" parameter
7840  *
7841  * \param[in] s MIME encoded data in canonical form
7842  * \param[in] delsp Delete spaces at EOL if nonzero
7843  * \param[in] insline Add empty line separator after paragraphs if nonzero
7844  *
7845  * \attention
7846  * The encoding of the data referenced by \e s must be valid Unicode in UTF-8
7847  * representation. This must be checked by the caller before this function is
7848  * used.
7849  *
7850  * \return
7851  * - Pointer to decoded data
7852  * (if the result is not equal to \e s , a new memory block was allocated)
7853  * - NULL on error (Original memory block for \e s is still allocated)
7854  */
7855 
7856 const char* enc_mime_flowed_decode(const char* s, unsigned int delsp,
7857  unsigned int insline)
7858 {
7859  const char* quote_mark;
7860  int error = 0;
7861  int abort;
7862  int check;
7863  char* p;
7864  size_t ii;
7865  /* Index in input buffer */
7866  size_t i = 0;
7867  /* Target buffer */
7868  char* buf = NULL;
7869  size_t len = 0;
7870  size_t bi = 0;
7871  int insert_crlf = 0;
7872  /* Paragraph buffer */
7873  char* para = NULL;
7874  size_t plen = 0;
7875  size_t pi = 0;
7876  int pflowed;
7877  int pell; /* Empty last line */
7878  /* Line buffer */
7879  size_t start;
7880  size_t end;
7881  size_t llen = 0;
7882  size_t llimit;
7883  int flowed;
7884  /* Quote depth */
7885  int qdepth;
7886  size_t qd;
7887  /* Index after last space (or SHY) suitable for line break */
7888  size_t last_space;
7889  size_t ustring_len;
7890 
7891  /* Set quote mark style according to config file */
7892  switch(config[CONF_QUOTESTYLE].val.i)
7893  {
7894  case 0: { quote_mark = ">"; break; }
7895  case 1: { quote_mark = "> "; break; }
7896  default:
7897  {
7898  PRINT_ERROR("Quoting style configuration not supported");
7899  /* Use default from old versions that can't be configured */
7900  quote_mark = "> ";
7901  break;
7902  }
7903  }
7904  /* Process data */
7905  while(s[i])
7906  {
7907  /* Process next paragraph */
7908  pi = 0;
7909  pflowed = 0;
7910  pell = 0;
7911  qdepth = -1;
7912  do
7913  {
7914  /* Process next line */
7915  flowed = 0;
7916  /* Calculate quoting depth */
7917  qd = 0;
7918  while('>' == s[i])
7919  {
7920  if(POSIX_INT_MAX <= qd) { break; }
7921  ++qd;
7922  ++i;
7923  }
7924  if(-1 == qdepth) { qdepth = (int) qd; }
7925  else
7926  {
7927  if((int) qd != qdepth)
7928  {
7929  PRINT_ERROR("MIME: Invalid paragraph format"
7930  " (format=flowed)");
7931  i -= qd;
7932  break;
7933  }
7934  }
7935  /* Remove space stuffing */
7936  if(' ' == s[i]) { ++i; }
7937  start = end = i;
7938  /* Search for EOL */
7939  while(s[i])
7940  {
7941  if(i && 0x0A == (int) s[i])
7942  {
7943  if(0x0D != (int) s[i - (size_t) 1])
7944  {
7945  /* Canonical line termination must be CR+LF */
7946  PRINT_ERROR("MIME: Invalid line termination"
7947  " (format=flowed)");
7948  end = i;
7949  }
7950  else { end = i - (size_t) 1; }
7951  ++i;
7952  break;
7953  }
7954  /* Special handling for last line without CR+LF */
7955  if(!s[++i]) { end = i; }
7956  }
7957  llen = end - start;
7958  /* Check for flowed line */
7959  if(llen && ' ' == s[end - (size_t) 1])
7960  {
7961  /* Check for signature separator */
7962  if(!((size_t) 3 == llen
7963  && '-' == s[start] && '-' == s[start + (size_t) 1]))
7964  {
7965  flowed = 1;
7966  pflowed = 1;
7967  if(delsp) { --llen; --end; }
7968  }
7969  }
7970  /* Allocate memory in exponentially increasing chunks */
7971  while(pi + llen + (size_t) 1 >= plen) /* At least 1 additional byte */
7972  {
7973  if(!plen) { plen = 128; }
7974  p = (char*) posix_realloc((void*) para, plen *= (size_t) 2);
7975  if(NULL == p)
7976  {
7977  PRINT_ERROR("Memory allocation failed");
7978  error = 1;
7979  break;
7980  }
7981  else { para = p; }
7982  }
7983  if(error) { break; }
7984  /* Copy line to paragraph buffer */
7985  strncpy(&para[pi], &s[start], llen);
7986  pi += llen;
7987  }
7988  while(flowed);
7989  if(error) { break; }
7990  para[pi] = 0;
7991  /* Set flag if paragraph ends with empty line */
7992  if(pflowed && !llen) { pell = 1; };
7993  /* Copy fixed line or flowed paragraph to target buffer */
7994  pi = 0;
7995  do
7996  {
7997  llen = (size_t) qdepth * strlen(quote_mark);
7998  if(!pflowed)
7999  {
8000  start = 0;
8001  end = strlen(para);
8002  llen += end;
8003  }
8004  else
8005  {
8006  /* Rewrap flowed lines before 72 characters if possible */
8007  start = pi;
8008  last_space = 0;
8009  abort = 0;
8010  while(!abort)
8011  {
8012  check = 0;
8013  if(!para[pi]) { abort = 1; }
8014  else
8015  {
8016  /* Check for SP */
8017  if(' ' == para[pi]) { check = 1; }
8018  /* Check for SHY (in UTF-8 encoding) */
8019  else if(pi
8020  && 0xADU == (unsigned int) (unsigned char) para[pi]
8021  && 0xC2U == (unsigned int)
8022  (unsigned char) para[pi - (size_t) 1])
8023  {
8024  check = 1;
8025  }
8026  /* Check for ZWSP (in UTF-8 encoding) */
8027  else if(1 < pi
8028  && 0x8BU == (unsigned int) (unsigned char) para[pi]
8029  && 0x80U == (unsigned int)
8030  (unsigned char) para[pi - (size_t) 1]
8031  && 0xE2U == (unsigned int)
8032  (unsigned char) para[pi - (size_t) 2])
8033  {
8034  check = 1;
8035  }
8036  ++pi;
8037  }
8038  if(abort || check)
8039  {
8040  /* Allow max. 78 characters for quoted content */
8041  llimit = (size_t) 72;
8042  if(1 == qdepth) { llimit = (size_t) 74; }
8043  else if(2 == qdepth) { llimit = (size_t) 76; }
8044  else if(3 <= qdepth) { llimit = (size_t) 78; }
8045  /* Use 20 characters as minimum content width */
8046  if(llimit - (size_t) 20
8047  <= (size_t) qdepth * strlen(quote_mark))
8048  {
8049  llimit = (size_t) 20;
8050  }
8051  else
8052  {
8053  llimit -= (size_t) qdepth * strlen(quote_mark);
8054  }
8055  /* Check for line length limit */
8056  ustring_len = pi - start;
8057  if(ustring_len)
8058  {
8059  /* Do not count trailing SP */
8060  if(pi && ' ' == para[pi - (size_t) 1]) { --ustring_len; }
8061  }
8062  if(llimit < enc_uc_get_glyph_count(&para[start], ustring_len))
8063  {
8064  /* Check for second last line */
8065  if(last_space) { pi = last_space; }
8066  /* Check for last line */
8067  else if(abort)
8068  {
8069  pflowed = 0;
8070  if(pell) { insert_crlf = 1; }
8071  }
8072  break;
8073  }
8074  /* Check for end of paragraph */
8075  if(abort)
8076  {
8077  pflowed = 0;
8078  if(pell) { insert_crlf = 1; }
8079  }
8080  else { last_space = pi; }
8081  }
8082  }
8083  /* Skip trailing SP */
8084  if(start < pi && ' ' == para[pi - (size_t) 1])
8085  {
8086  end = pi - (size_t) 1;
8087  }
8088  else { end = pi; }
8089  llen += end - start;
8090  }
8091  /* Two additional characters for CR+LF line termination */
8092  llen += (size_t) 2;
8093  /* InsLine parameter has precedence over configfile entry */
8094  if(!insline)
8095  {
8096  /* Reset request for empty line separator if not configured */
8097  if(!config[CONF_FLOWED_CRLF].val.i) { insert_crlf = 0; }
8098  }
8099  /* Two additional characters for optional empty line after paragraph */
8100  if(insert_crlf) { llen += (size_t) 2; }
8101  /* Allocate memory in exponentially increasing chunks */
8102  while(bi + llen + (size_t) 1 >= len) /* At least 1 additional byte */
8103  {
8104  if(!len) { len = 256; }
8105  p = (char*) posix_realloc((void*) buf, len *= (size_t) 2);
8106  if(NULL == p)
8107  {
8108  PRINT_ERROR("Memory allocation failed");
8109  error = 1;
8110  break;
8111  }
8112  else { buf = p; }
8113  }
8114  if(error) { break; }
8115  /* Copy quote marks */
8116  for(ii = 0; ii < (size_t) qdepth; ++ii)
8117  {
8118  strncpy(&buf[bi], quote_mark, strlen(quote_mark));
8119  bi += strlen(quote_mark);
8120  }
8121  /* Copy line */
8122  strncpy(&buf[bi], &para[start], end - start);
8123  bi += end - start;
8124  /* Copy line termination */
8125  buf[bi++] = (char) 0x0D; buf[bi++] = (char) 0x0A;
8126  }
8127  while(pflowed);
8128  /* Insert optional empty line separator after paragraph */
8129  if(insert_crlf)
8130  {
8131  buf[bi++] = (char) 0x0D; buf[bi++] = (char) 0x0A;
8132  insert_crlf = 0;
8133  }
8134  if(error) { break; }
8135  }
8136  posix_free((void*) para);
8137  if(error)
8138  {
8139  PRINT_ERROR("MIME: Decoding of format=flowed content failed");
8140  posix_free((void*) buf);
8141  buf = NULL;
8142  }
8143  else if(NULL != buf)
8144  {
8145  /* Terminate string in target buffer */
8146  buf[bi] = 0;
8147  }
8148 
8149  return(buf);
8150 }
8151 
8152 
8153 /* ========================================================================== */
8154 /*! \brief Extract MIME encapsulated message
8155  *
8156  * \param[in] s MIME encapsulated message
8157  * \param[in] len Length of encapsulated message
8158  * \param[out] mpe MIME multipart entity locations
8159  *
8160  * On success a pointer to the result array is written to \e mpe . The caller
8161  * is responsible to free the memory allocated for this array.
8162  *
8163  * \return
8164  * - 1 on success
8165  * - 0 on error
8166  */
8167 
8168 size_t enc_mime_message(const char* s, size_t len,
8169  struct enc_mime_mpe** mpe)
8170 {
8171  size_t res = 0;
8172  struct enc_mime_mpe* array;
8173 
8174  /* Allocate memory for array element */
8175  array = (struct enc_mime_mpe*) posix_malloc(sizeof(struct enc_mime_mpe));
8176  if(NULL == array)
8177  {
8178  PRINT_ERROR("Parsing encapsulated message aborted");
8179  }
8180  else
8181  {
8182  /* Store start index and length of entity */
8183  array[res].start = s;
8184  array[res++].len = len;
8185  }
8186 
8187  /* Check for success */
8188  *mpe = NULL;
8189  if(res) { *mpe = array; }
8190 
8191  return(res);
8192 }
8193 
8194 
8195 /* ========================================================================== */
8196 /*! \brief Parse MIME multipart content
8197  *
8198  * \param[in] s MIME encoded multipart data
8199  * \param[in] b MIME boundary delimiter
8200  * \param[out] mpe MIME multipart entity locations
8201  *
8202  * On success a pointer to the result array is written to \e mpe . The caller
8203  * is responsible to free the memory allocated for this array.
8204  *
8205  * \return
8206  * - Nonzero number of entities in multipart data on success
8207  * - 0 on error
8208  */
8209 
8210 size_t enc_mime_multipart(const char* s, const char* b,
8211  struct enc_mime_mpe** mpe)
8212 {
8213  size_t res = 0;
8214  size_t b_len;
8215  char boundary[ENC_BO_BUFLEN] = "--";
8216  size_t i = 0;
8217  int preamble = 1;
8218  size_t match;
8219  size_t start = 0;
8220  size_t end = 0;
8221  size_t e_len;
8222  struct enc_mime_mpe* array = NULL;
8223  struct enc_mime_mpe* tmp;
8224 
8225  b_len = strlen(b);
8226  /* RFC 2046 limits the boundary delimiter length to 70 characters */
8227  if(!b_len || (size_t) 70 < b_len)
8228  {
8229  PRINT_ERROR("Invalid MIME multipart boundary delimiter");
8230  }
8231  else if ((size_t) 75 > ENC_BO_BUFLEN)
8232  {
8233  PRINT_ERROR("Value of ENC_BO_BUFLEN must be at least 75");
8234  }
8235  else
8236  {
8237  /* Add "--" prefix to boundary */
8238  strncpy(&boundary[2], b, 71);
8239  b_len += (size_t) 2;
8240  /* Parse content */
8241  while(s[i])
8242  {
8243  /*
8244  * Store potential end of entity
8245  * RFC 2046 specifies that last CRLF of an entity is part of the
8246  * following boundary delimiter.
8247  */
8248  if((size_t) 2 <= i) { end = i - (size_t) 2; }
8249  /* Compare boundary with beginning of line */
8250  match = 0;
8251  if(!strncmp(&s[i], boundary, b_len)) { match = 1; }
8252  /* Skip to beginning of next line (this also consumes potential LWS) */
8253  while(1)
8254  {
8255  if(!s[i]) { break; }
8256  else if((char) 0x0D == s[i++])
8257  {
8258  if((char) 0x0A == s[i++]) { break; }
8259  }
8260  }
8261  /* Check for start of entity */
8262  if(match)
8263  {
8264  /* Ignore preamble */
8265  if(!preamble && end > start)
8266  {
8267  /* Allocate memory for new array element */
8268  e_len = end - start;
8269  tmp = (struct enc_mime_mpe*)
8270  posix_realloc(array, (res + (size_t) 1)
8271  * sizeof(struct enc_mime_mpe));
8272  if(NULL == tmp)
8273  {
8274  PRINT_ERROR("Parsing multipart message aborted");
8275  break;
8276  }
8277  else
8278  {
8279  array = tmp;
8280  /* Store start index and length of entity */
8281  array[res].start = &s[start];
8282  array[res++].len = e_len;
8283  }
8284  }
8285  /* Prepare for next entity */
8286  start = i;
8287  preamble = 0;
8288  }
8289  }
8290  }
8291 
8292  /* Check for success */
8293  *mpe = NULL;
8294  if(res) { *mpe = array; }
8295 
8296  return(res);
8297 }
8298 
8299 
8300 /* ========================================================================== */
/*! \brief Percent decoder
 *
 * \param[in,out] s      String to decode (URI or MIME parameter value)
 * \param[in]     clean  Replace NUL and ';' with '_' if nonzero
 *
 * Every "%XX" triplet in \e s is replaced by the octet with the hexadecimal
 * value XX. Decoded octets are not scanned again (a decoded percent sign
 * never starts a new triplet).
 *
 * \note
 * The data is decoded in place because it can't be larger after the decoding
 * operation. If an error is detected, the triplets in front of the invalid
 * one are already decoded.
 *
 * If \e s is \c NULL no operation is performed and success is returned.
 *
 * \return
 * - Positive value on success (if data in \e s was decoded)
 * - 0 on success (if there was nothing to do)
 * - Negative value if percent encoding in \e s is invalid
 */

int enc_percent_decode(char* s, int clean)
{
   int status = 0;
   char* cur = s;
   int hi;
   int lo;
   char decoded;

   while(NULL != cur)
   {
      /* Locate next percent sign behind the data already processed */
      cur = strchr(cur, (int) '%');
      if(NULL == cur)  { break; }
      status = 1;

      /* A percent sign must be followed by two hexadecimal digits */
      if(!cur[1] || !cur[2])  { status = -1;  break; }
      hi = enc_hex_decode_nibble(cur[1]);
      if(0 > hi)  { status = -1;  break; }
      lo = enc_hex_decode_nibble(cur[2]);
      if(0 > lo)  { status = -1;  break; }

      /* Replace the percent sign with the decoded octet */
      decoded = (char) (unsigned char) (hi * 16 + lo);
      if(clean && (!decoded || ';' == decoded))  { decoded = '_'; }
      cur[0] = decoded;

      /* Close the gap left by the digits (including NUL-termination) */
      memmove((void*) &cur[1], (void*) &cur[3],
              strlen(&cur[3]) + (size_t) 1);
      ++cur;
   }
   if(-1 == status)  { PRINT_ERROR("Percent decoding of URI failed"); }

   return(status);
}
8367 
8368 
8369 /* ========================================================================== */
8370 /*! \brief Percent encoding for URI content
8371  *
8372  * \param[in] s URI body to encode
8373  * \param[in] sch URI scheme
8374  *
8375  * Passing \c NULL for parameter \e s is allowed and treated as error.
8376  *
8377  * Generic URI syntax is defined in RFC 3986.
8378  * <br>
8379  * The scheme "ftp" is defined in RFC 1738.
8380  * <br>
8381  * The scheme "http" is defined in RFC 7230.
8382  * <br>
8383  * The scheme "mailto" is defined in RFC 6068.
8384  * <br>
8385  * The scheme "news" is defined in RFC 5538.
8386  *
8387  * The following characters are percent encoded:
8388  * - Space (not allowed for "mailto" and "news" schemes)
8389  * - The literal percent sign
8390  * - The list "gen-delims" defined in RFC 3986
8391  * - Anything not in the list "unreserved" for "http" and "ftp" schemes
8392  * - For the "mailto" scheme exactly one "commercial at" sign is required and
8393  * treated literally
8394  * - For the "news" scheme a single "commercial at" sign is accepted literally
8395  *
8396  * \return
8397  * - Pointer to result on success.
8398  * If the result is not equal to \e s , a new memory block was allocated
8399  * - NULL on error
8400  */
8401 
8402 const char* enc_uri_percent_encode(const char* s, enum enc_uri_scheme sch)
8403 {
8404  const char* res = NULL;
8405  const char* gen_delims =
8406  ":/?#[]@"; /* gen-delims */
8407  const char* sub_delims =
8408  "!$&'()*+,;="; /* sub-delims */
8409  const char* unreserved =
8410  "abcdefghijklmnopqrstuvwxyz" /* Small letters */
8411  "ABCDEFGHIJKLMNOPQRSTUVWXYZ" /* Capital letters */
8412  "0123456789" /* Digits */
8413  "-._~"; /* Hyphen, Period, Underscore and Tilde */
8414  int process = 0;
8415  size_t i = 0;
8416  char* buf = NULL;
8417  size_t bi = 0;
8418  int error = 0;
8419  int encode;
8420  size_t commercial_at = 0;
8421  unsigned int nibble;
8422 
8423  if(NULL != s)
8424  {
8425  /* Check whether only unreserved characters are present */
8426  while(s[i])
8427  {
8428  if(NULL == strchr(unreserved, (int) s[i])) { process = 1; break; }
8429  ++i;
8430  }
8431  if(!process) { res = s; }
8432  else
8433  {
8434  /* Allocate new buffer (Triple size is always sufficient) */
8435  buf = (char*) posix_malloc(strlen(s) * (size_t) 3 + (size_t) 1);
8436  if(NULL != buf)
8437  {
8438  i = 0;
8439  while(s[i])
8440  {
8441  encode = 0;
8442  switch(sch)
8443  {
8444  case ENC_URI_SCHEME_HTTP:
8445  case ENC_URI_SCHEME_FTP:
8446  {
8447  /*
8448  * Because we don't parse the URI syntax, it is not
8449  * possible to decide whether a slash is allowed or not
8450  * here => Always accept it.
8451  */
8452  if('/' == s[i]) { encode = 0; }
8453  else if(NULL == strchr(unreserved, (int) s[i]))
8454  {
8455  encode = 1;
8456  }
8457  break;
8458  }
8459  case ENC_URI_SCHEME_MAILTO:
8460  case ENC_URI_SCHEME_NEWS:
8461  {
8462  if(ENC_URI_SCHEME_NEWS == sch && '>' == s[i])
8463  {
8464  /* As defined by RFC 5536 Section 3.1.3 */
8465  error = 1;
8466  }
8467  if(' ' == s[i]) { error = 1; }
8468  else if('%' == s[i]) { encode = 1; }
8469  else if('@' == s[i])
8470  {
8471  if(!commercial_at)
8472  {
8473  /* Accept zero or one "commercial at" signs */
8474  ++commercial_at;
8475  }
8476  else { error = 1; }
8477  }
8478  else if(NULL != strchr(gen_delims, (int) s[i]))
8479  {
8480  encode = 1;
8481  }
8482  else if(ENC_URI_SCHEME_MAILTO == sch
8483  && NULL != strchr(sub_delims, (int) s[i]))
8484  {
8485  /* Some listed in RFC 6068 Section 2 => Encode all */
8486  encode = 1;
8487  }
8488  break;
8489  }
8490  default:
8491  {
8492  PRINT_ERROR("Invalid URI scheme for percent encoding");
8493  error = 1;
8494  break;
8495  }
8496  }
8497  if(error) { break; }
8498  if(!encode) { buf[bi++] = s[i]; }
8499  else
8500  {
8501  /* Percent encoder */
8502  buf[bi++] = '%';
8503  /* High nibble */
8504  nibble = ((unsigned int) s[i] & 0xF0U) >> 4;
8505  if(10U > nibble) { buf[bi] = 0x30; }
8506  else { buf[bi] = 0x41; nibble -= 10U; }
8507  buf[bi++] += (char) nibble;
8508  /* Low nibble */
8509  nibble = (unsigned int) s[i] & 0x0FU;
8510  if(10 > nibble) { buf[bi] = 0x30; }
8511  else { buf[bi] = 0x41; nibble -= 10U; }
8512  buf[bi++] += (char) nibble;
8513  }
8514  ++i;
8515  }
8516  /* Terminate result string */
8517  buf[bi] = 0;
8518  /* Check for error */
8519  if(!error) { res = buf; }
8520  }
8521  }
8522  }
8523  if(NULL != res)
8524  {
8525  /* Ensure that one "@" is present in URI with scheme "mailto" */
8526  if(ENC_URI_SCHEME_MAILTO == sch && !commercial_at)
8527  {
8528  PRINT_ERROR("Missing \"@\" in URI with scheme \"mailto\"");
8529  error = 1;
8530  }
8531  }
8532  if(error)
8533  {
8534  PRINT_ERROR("Percent encoding of URI failed");
8535  posix_free((void*) buf);
8536  res = NULL;
8537  }
8538 
8539  return(res);
8540 }
8541 
8542 
8543 /* ========================================================================== */
8544 /* Unicode search (case insensitive)
8545  *
8546  * This function uses the Unicode Default Case Folding algorithm.
8547  * This means it is based on the full case folding operations without the
8548  * context-dependent mappings sensitive to the casing context.
8549  *
8550  * https://www.unicode.org/versions/Unicode13.0.0/UnicodeStandard-13.0.pdf
8551  *
8552  * According to Unicode 13.0.0 Section 3.13 the following algorithm is required
8553  * for caseless matching of two strings:
8554  *
8555  * NFD(toCasefold(NFD(X))) == NFD(toCasefold(NFD(Y)))
8556  *
8557  * \param[in] s String to search in
8558  * \param[in] start_pos Start index in \e s
8559  * \param[in] search_s Search string
8560  * \param[out] found_pos Position in \e s where \e search_s was found
8561  * \param[out] found_len Length of match in \e s
8562  *
8563  * \attention
8564  * The strings \e s and \e search_s must be normalized to NFC by the caller!
8565  *
8566  * \note
8567  * It is treated as error if \e start_pos points inside a combing character
8568  * sequence (to a codepoint with nonzero canonical combining class).
8569  *
8570  * \return
8571  * - 0 on success (\e found_pos and \e found_len are valid)
8572  * - -1 on error (nothing was written to \e found_pos and \e found_len )
8573  */
8574 
8575 int enc_uc_search(const char* s, size_t start_pos, const char* search_s,
8576  size_t* found_pos, size_t* found_len)
8577 {
8578  int res = -1;
8579  int ok = 0, ok2 = 0, ok3 = 0, ok4 = 0, ok5 = 0;
8580  size_t search_s_len; /* Length of search string (in NFD) */
8581  size_t s_len; /* Length of target from start_pos (in NFD) */
8582  size_t i, j;
8583  size_t bi; /* Index in case folding target buffer */
8584  long int ucp;
8585  struct uc_cdc ucp_attr;
8586  long int mapping[3];
8587  size_t di;
8588  size_t match_pos = 0, match_len = 0; /* With case folding */
8589  size_t tmp_pos = 0, end_pos = 0; /* Without case folding */
8590  const char* s_nfd = NULL;
8591  const char* search_s_nfd = NULL;
8592  const char* s_cf = NULL;
8593  const char* search_s_cf = NULL;
8594  char* p;
8595  const char* q;
8596  char utf[4];
8597  size_t inc;
8598 
8599  /*
8600  * Only non-ASCII codepoints (at least 2 bytes in UTF-8) will increase the
8601  * length. The maximum length of an UTF-8 sequence is 4 bytes.
8602  * => Worst case increase factor is 2 (per codepoint for UTF-8).
8603  * Maximum increase factor on codepoint basis is 3 according to
8604  * Unicode 13.0.0 standard.
8605  * => Worst case increase factor is 6 (2 * 3 for case folding with UTF-8).
8606  */
8607  const size_t mem_factor = 6;
8608 
8609  /* Check input data and normalize to NFD from start position */
8610  if(NULL == s || enc_uc_check_utf8(s)) { goto error; }
8611  i = 0; ucp = enc_uc_decode_utf8(s, &i);
8612  if(-1L == ucp) { goto error; }
8613  enc_uc_lookup_cdc(ucp, &ucp_attr);
8614  if(ucp_attr.ccc) { goto error; }
8615  s_nfd = enc_uc_normalize_to_nfd(&s[start_pos]);
8616  if(NULL == s_nfd) { goto error; }
8617 
8618  /* Check search string and normalize it to NFD */
8619  if(NULL == search_s || enc_uc_check_utf8(search_s)) { goto error; }
8620  search_s_nfd = enc_uc_normalize_to_nfd(search_s);
8621  if(NULL == search_s_nfd) { goto error; }
8622 
8623  /* Unicode case folding for search_s */
8624  search_s_len = strlen(search_s_nfd);
8625  if(!search_s_len) { goto error; }
8626  if(search_s_len * mem_factor + (size_t) 1 < search_s_len)
8627  {
8628  /* Wraparound in memory size calculation */
8629  PRINT_ERROR("Memory allocation failed");
8630  }
8631  else
8632  {
8633  p = (char*) posix_malloc(search_s_len * mem_factor + (size_t) 1);
8634  if(NULL == p) { PRINT_ERROR("Memory allocation failed"); }
8635  else
8636  {
8637  i = 0;
8638  bi = 0;
8639  while(1)
8640  {
8641  ucp = enc_uc_decode_utf8(search_s_nfd, &i);
8642  if(-1L == ucp) { break; }
8643  else
8644  {
8645  enc_uc_lookup_cf(ucp, mapping);
8646  for(j = 0; (size_t) 3 > j; ++j)
8647  {
8648  if(-1L == mapping[j]) { break; }
8649  else
8650  {
8651  di = 1;
8652  enc_uc_encode_utf8(p, &bi, &mapping[j], &di);
8653  }
8654  }
8655  }
8656  }
8657  p[bi] = 0;
8658  /* Normalize target string to NFD again after case folding */
8659  q = enc_uc_normalize_to_nfd(p);
8660  if(NULL == q) { enc_free((void*) p); }
8661  else
8662  {
8663  if(p == q) { search_s_cf = p; }
8664  else
8665  {
8666  enc_free((void*) p);
8667  search_s_cf = q;
8668  }
8669  match_len = strlen(search_s_cf);
8670  ok = 1;
8671  }
8672  }
8673  }
8674 
8675  /* Unicode case folding for s */
8676  if(ok)
8677  {
8678  s_len = strlen(s_nfd);
8679  if(s_len * mem_factor + (size_t) 1 < s_len)
8680  {
8681  /* Wraparound in memory size calculation */
8682  PRINT_ERROR("Memory allocation failed");
8683  }
8684  else
8685  {
8686  p = (char*) posix_malloc(s_len * mem_factor + (size_t) 1);
8687  if(NULL == p) { PRINT_ERROR("Memory allocation failed"); }
8688  else
8689  {
8690  i = 0;
8691  bi = 0;
8692  while(1)
8693  {
8694  ucp = enc_uc_decode_utf8(s_nfd, &i);
8695  if(-1L == ucp) { break; }
8696  else
8697  {
8698  enc_uc_lookup_cf(ucp, mapping);
8699  for(j = 0; (size_t) 3 > j; ++j)
8700  {
8701  if(-1L == mapping[j]) { break; }
8702  else
8703  {
8704  di = 1;
8705  enc_uc_encode_utf8(p, &bi, &mapping[j], &di);
8706  }
8707  }
8708  }
8709  }
8710  p[bi] = 0;
8711  /* Normalize target string to NFD again after case folding */
8712  q = enc_uc_normalize_to_nfd(p);
8713  if(NULL == q) { enc_free((void*) p); }
8714  else
8715  {
8716  if(strlen(p) != strlen(q))
8717  {
8718  /*
8719  * The result must have the same length (only reordering is
8720  * allowed), otherwise the position calculation below will
8721  * fail!
8722  *
8723  * For this implementation:
8724  * It is assumed, that the NFD normalization after case
8725  * folding of NFD data will not change the length in UTF-8.
8726  * Fail gracefully if this assumption is wrong or become
8727  * wrong with the database from a future Unicode version
8728  * => Report no match and print a bug warning in this case.
8729  */
8730  PRINT_ERROR("Case folding failed, length changed (bug)");
8731  }
8732  else
8733  {
8734  if(p == q) { s_cf = p; }
8735  else
8736  {
8737  enc_free((void*) p);
8738  s_cf = q;
8739  }
8740  ok2 = 1;
8741  }
8742  }
8743  }
8744  }
8745  }
8746 
8747  /* Search with binary compare in case folded data */
8748  if(ok && ok2)
8749  {
8750  p = strstr(s_cf, search_s_cf);
8751  if(NULL != p)
8752  {
8753  match_pos = (size_t) (p - s_cf);
8754  ok3 = 1;
8755  }
8756  }
8757 
8758  /*
8759  * Unicode normalization and full case folding may have changed the length
8760  * of the data (in codepoints and in bytes for UTF-8).
8761  *
8762  * Therefore both positions of a match, start and end, may be different
8763  * compared to the original data. The corresponding positions for the
8764  * original data must be calculated.
8765  */
8766  if(ok3)
8767  {
8768  /* Calculate start and end offsets of match in (unfolded) NFD data */
8769  i = 0;
8770  bi = 0;
8771  while(1)
8772  {
8773  if(bi == match_pos)
8774  {
8775  tmp_pos = i;
8776  ok4 = 1;
8777  }
8778  ucp = enc_uc_decode_utf8(s_nfd, &i);
8779  if(-1L == ucp) { break; }
8780  else
8781  {
8782  enc_uc_lookup_cf(ucp, mapping);
8783  for(j = 0; (size_t) 3 > j; ++j)
8784  {
8785  if(-1L == mapping[j]) { break; }
8786  else
8787  {
8788  di = 1;
8789  inc = 0;
8790  enc_uc_encode_utf8(utf, &inc, &mapping[j], &di);
8791  /* Result data is thrown away, only its length is used */
8792  bi += inc;
8793  }
8794  }
8795  }
8796  if(ok4 && (bi == match_pos + match_len))
8797  {
8798  end_pos = i;
8799  ok5 = 1;
8800  break;
8801  }
8802  }
8803 
8804  /* Calculate start and end offsets of match in NFC data */
8805  if(ok5 && tmp_pos < end_pos)
8806  {
8807  p = (char*) posix_malloc(end_pos + (size_t) 1);
8808  if(NULL == p) { PRINT_ERROR("Memory allocation failed"); }
8809  else
8810  {
8811  strncpy(p, s_nfd, end_pos); p[end_pos] = 0;
8812  q = enc_uc_normalize_to_nfc(p);
8813  if(NULL != q)
8814  {
8815  j = strlen(q);
8816  if(p != q) { posix_free((void*) q); }
8817  if(end_pos >= j)
8818  {
8819  end_pos -= (end_pos - j);
8820  /* Calculate start offset of match in original NFC data */
8821  p[tmp_pos] = 0;
8822  q = enc_uc_normalize_to_nfc(p);
8823  if(NULL != q)
8824  {
8825  j = strlen(q);
8826  if(p != q) { posix_free((void*) q); }
8827  if(tmp_pos >= j)
8828  {
8829  tmp_pos -= (tmp_pos - j);
8830  *found_pos = start_pos + tmp_pos;
8831  *found_len = end_pos - tmp_pos;
8832  res = 0;
8833  }
8834  }
8835  }
8836  }
8837  posix_free((void*) p);
8838  }
8839  }
8840  }
8841 
8842  posix_free((void*) search_s_cf);
8843  posix_free((void*) s_cf);
8844 
8845 error:
8846  if(search_s != search_s_nfd) { posix_free((void*) search_s_nfd); }
8847  if(&s[start_pos] != s_nfd) { posix_free((void*) s_nfd); }
8848 
8849  return(res);
8850 }
8851 
8852 
8853 /* ========================================================================== */
/*! \brief Free an object allocated by encoding module
 *
 * Every memory block handed out by a function of the encoding module should
 * be released with this function.
 *
 * \param[in] p  Pointer to object
 *
 * The memory for the object pointed to by \e p is released.
 *
 * \note
 * Passing \c NULL for \e p is allowed (this is handed through to
 * \c posix_free() unchanged).
 */

void enc_free(void* p)
{
   /*
    * Attention:
    * For historical reasons, parts of the CORE module still call
    * \c posix_free() directly for memory allocated by this module.
    * The memory manager used here must stay compatible with that until the
    * separation is complete.
    */
   posix_free(p);
}
8879 
8880 
8881 /*! @} */
8882 
8883 /* EOF */
ENC_CS_ISO8859_14
Definition: encoding.h:75
fu_write_to_filedesc
int fu_write_to_filedesc(int filedesc, const char *buffer, size_t len)
Write data block to filedescriptor.
Definition: fileutils.c:542
enc_get_iso8601_utc
int enc_get_iso8601_utc(char *isodate)
Get current UTC date in ISO 8601 conformant format.
Definition: encoding.c:4395
ENC_CS_ISO8859_5
Definition: encoding.h:67
enc_ascii_convert_distribution
void enc_ascii_convert_distribution(char *s)
Convert body of distribution header field.
Definition: encoding.c:5074
ENC_CS_KOI8U
Definition: encoding.h:81
enc_mime_ct::subtype
enum enc_mime_ct_subtype subtype
Definition: encoding.h:115
ENC_CS_IBM850
Definition: encoding.h:93
CONF_FORCE_UNICODE
Definition: conf.h:77
ENC_URI_SCHEME_HTTP
Definition: encoding.h:132
ENC_CT_VIDEO
Definition: encoding.h:29
ENC_CS_ASCII
Definition: encoding.h:62
ENC_CTE_BIN
Definition: encoding.h:53
enc_mime_mpe
Locations of MIME multipart entities.
Definition: encoding.h:122
enc_free
void enc_free(void *p)
Free an object allocated by encoding module.
Definition: encoding.c:8868
ENC_CS_IBM775
Definition: encoding.h:92
enc_convert_to_utf8_nfc
const char * enc_convert_to_utf8_nfc(enum enc_mime_cs charset, const char *s)
Convert string from supported character set to Unicode (UTF-8 NFC)
Definition: encoding.c:5788
MAIN_ERR_PREFIX
#define MAIN_ERR_PREFIX
Message prefix for ENCODING module.
Definition: encoding.c:52
core_anum_t
#define core_anum_t
Article number data type (value zero is always reserved)
Definition: core.h:24
enc_mime_cs
enc_mime_cs
IDs for supported MIME character sets.
Definition: encoding.h:59
enc_uri_percent_encode
const char * enc_uri_percent_encode(const char *s, enum enc_uri_scheme sch)
Percent encoding for URI content.
Definition: encoding.c:8402
ENC_URI_SCHEME_NEWS
Definition: encoding.h:134
ENC_CT_MULTIPART
Definition: encoding.h:30
enc_mime_flowed_decode
const char * enc_mime_flowed_decode(const char *s, unsigned int delsp, unsigned int insline)
Decode MIME "text/plain" content with "format=flowed" parameter.
Definition: encoding.c:7856
ENC_CS_BUFLEN
#define ENC_CS_BUFLEN
Buffer size for character set name strings.
Definition: encoding.h:153
core_time_t
unsigned long int core_time_t
Time in seconds since the epoch (in terms of POSIX.1)
Definition: core.h:54
enc_mime_save_to_file
int enc_mime_save_to_file(const char *pn, enum enc_mime_cte cte, const char *entity)
Decode MIME content transfer encoding and save to file.
Definition: encoding.c:7717
enc_rot13
void enc_rot13(char *data)
Encode or decode data with ROT13 algorithm.
Definition: encoding.c:4692
ENC_CS_ISO8859_4
Definition: encoding.h:66
enc_mime_word_decode
int enc_mime_word_decode(const char **r, const char *b)
Decode header field containing potential MIME encoded-word tokens.
Definition: encoding.c:6518
ENC_CS_ISO8859_8
Definition: encoding.h:70
ENC_CS_ISO8859_10
Definition: encoding.h:72
enc_mime_get_cte
enum enc_mime_cte enc_mime_get_cte(const char *hf_body)
Decode content transfer encoding description.
Definition: encoding.c:7536
ENC_CS_WINDOWS_1256
Definition: encoding.h:88
enc_ascii_check
int enc_ascii_check(const char *s)
Verify ASCII encoding.
Definition: encoding.c:4944
config
struct conf config[CONF_NUM]
Global configuration.
Definition: conf.c:63
ENC_CTE_8BIT
Definition: encoding.h:52
enc_convert_ascii_to_anum
int enc_convert_ascii_to_anum(core_anum_t *result, const char *wm, int len)
Convert number from ASCII to numerical format.
Definition: encoding.c:4604
ENC_CS_KOI8R
Definition: encoding.h:80
enc_percent_decode
int enc_percent_decode(char *s, int clean)
Percent decoder.
Definition: encoding.c:8318
ENC_BO_BUFLEN
#define ENC_BO_BUFLEN
Buffer size for multipart boundary strings.
Definition: encoding.h:176
ENC_CS_ISO8859_X
Definition: encoding.h:78
ENC_CS_ISO8859_16
Definition: encoding.h:77
enc_mime_ct::type
enum enc_mime_ct_type type
Definition: encoding.h:114
enc_mime_mpe::start
const char * start
Definition: encoding.h:124
ENC_CTE_Q
Definition: encoding.h:54
enc_convert_lines_to_string
void enc_convert_lines_to_string(char *l, unsigned long int l_raw)
Convert number of lines to string.
Definition: encoding.c:4119
ENC_CS_WINDOWS_1255
Definition: encoding.h:87
enc_mime_mpe::len
size_t len
Definition: encoding.h:125
enc_wm_pattern
Wildmat array element (for RFC 3977 wildmat-pattern)
Definition: encoding.h:139
ENC_CS_ISO8859_6
Definition: encoding.h:68
ENC_CS_ISO8859_7
Definition: encoding.h:69
enc_convert_posix_to_iso8601
int enc_convert_posix_to_iso8601(char *isodate, core_time_t pts)
Convert POSIX timestamp to ISO 8601 conformant local date and time.
Definition: encoding.c:4342
enc_extract_addr_spec
const char * enc_extract_addr_spec(const char *mailbox)
Extract addr-spec token from RFC 5322 mailbox.
Definition: encoding.c:4830
enc_ascii_check_alpha
int enc_ascii_check_alpha(const char *s)
Check for ASCII alphabetic characters.
Definition: encoding.c:4972
CONF_QUOTESTYLE
Definition: conf.h:59
enc_uc_encode_utf8
void enc_uc_encode_utf8(char *buf, size_t *i, long int *dbuf, size_t *di)
Encode Unicode codepoints to UTF-8.
Definition: encoding.c:1008
enc_timestamp_decode
core_time_t enc_timestamp_decode(const char *timestamp)
Decode canonical timestamp to POSIX time (seconds since epoch)
Definition: encoding.c:4154
ENC_CTE_BUFLEN
#define ENC_CTE_BUFLEN
Buffer size for content transfer encoding name strings.
Definition: encoding.h:150
enc_ascii_check_printable
int enc_ascii_check_printable(const char *s)
Check for printable ASCII characters.
Definition: encoding.c:5022
ENC_CS_ISO2022_JP
Definition: encoding.h:96
ENC_CS_WINDOWS_1257
Definition: encoding.h:89
ENC_CT_AUDIO
Definition: encoding.h:28
enc_ascii_check_digit
int enc_ascii_check_digit(const char *s)
Check for ASCII digit characters.
Definition: encoding.c:4995
enc_mime_message
size_t enc_mime_message(const char *s, size_t len, struct enc_mime_mpe **mpe)
Extract MIME encapsulated message.
Definition: encoding.c:8168
ENC_CS_ISO8859_13
Definition: encoding.h:74
enc_mime_para_decode
int enc_mime_para_decode(const char **r, const char *b, int m)
Decode header field containing potential MIME parameters.
Definition: encoding.c:6817
ENC_CS_WINDOWS_1253
Definition: encoding.h:85
enc_uc_check_utf8
int enc_uc_check_utf8(const char *s)
Verify UTF-8 encoding.
Definition: encoding.c:5162
enc_mime_decode
const char * enc_mime_decode(enum enc_mime_cte cte, enum enc_mime_cs charset, const char *s)
Decode MIME text content to UTF-8 NFC.
Definition: encoding.c:7801
enc_convert_anum_to_ascii
int enc_convert_anum_to_ascii(char result[17], size_t *len, core_anum_t wm)
Convert article number from numerical format to ASCII.
Definition: encoding.c:4558
ENC_CTS_MIXED
Definition: encoding.h:40
enc_mime_ct::charset
enum enc_mime_cs charset
Definition: encoding.h:116
PRINT_ERROR
#define PRINT_ERROR(s)
Prepend module prefix and print error message.
Definition: main.h:19
data
struct core_data data
Global data object (shared by all threads)
Definition: core.c:242
ENC_URI_SCHEME_MAILTO
Definition: encoding.h:135
ENC_CTS_PLAIN
Definition: encoding.h:39
ENC_MIME_PARA_LENGTH_MAX
#define ENC_MIME_PARA_LENGTH_MAX
Maximum length of MIME parameter attribute tokens.
Definition: encoding.c:58
ENC_CTS_DIGEST
Definition: encoding.h:42
enc_destroy_wildmat
void enc_destroy_wildmat(struct enc_wm_pattern **obj, int num)
Destroy wildmat pattern array.
Definition: encoding.c:5537
enc_convert_octet_to_hex
int enc_convert_octet_to_hex(char *result, unsigned int octet)
Convert octet to hexadecimal (ASCII) format.
Definition: encoding.c:4664
ENC_CS_WINDOWS_1258
Definition: encoding.h:90
NNTP_ANUM_T_MAX
#define NNTP_ANUM_T_MAX
Maximum value this implementation supports for nntp_anum_t.
Definition: nntp.h:52
ENC_CTE_7BIT
Definition: encoding.h:51
ENC_CTS_RFC822
Definition: encoding.h:43
ENC_URI_SCHEME_FTP
Definition: encoding.h:133
ENC_CS_UTF_7
Definition: encoding.h:98
ENC_CS_UTF_16BE
Definition: encoding.h:100
ENC_CS_MACINTOSH
Definition: encoding.h:79
enc_mime_ct
MIME content type information.
Definition: encoding.h:112
enc_ascii_convert_to_printable
void enc_ascii_convert_to_printable(char *s)
Convert to printable ASCII format.
Definition: encoding.c:5049
enc_convert_posix_to_canonical
const char * enc_convert_posix_to_canonical(const char *s)
Convert from local (POSIX) to canonical (RFC 822) form.
Definition: encoding.c:5695
enc_uri_scheme
enc_uri_scheme
URI schemes.
Definition: encoding.h:129
ENC_CS_ISO8859_9
Definition: encoding.h:71
ENC_CS_IBM852
Definition: encoding.h:94
ENC_CS_WINDOWS_1251
Definition: encoding.h:83
ENC_CS_WINDOWS_1252
Definition: encoding.h:84
ENC_CS_ISO8859_11
Definition: encoding.h:73
enc_create_wildmat
int enc_create_wildmat(struct enc_wm_pattern **obj, const char *wm)
Create wildmat pattern array.
Definition: encoding.c:5371
enc_convert_iso8601_to_timestamp
int enc_convert_iso8601_to_timestamp(const char **ts, const char *isodate)
Convert ISO 8601 conformant date to canonical timestamp.
Definition: encoding.c:4503
fu_close_file
void fu_close_file(int *filedesc, FILE **stream)
Close file (and potentially associated I/O stream)
Definition: fileutils.c:290
ENC_CTS_ALTERNATIVE
Definition: encoding.h:41
enc_uc_repair_utf8
const char * enc_uc_repair_utf8(const char *s)
Repair UTF-8 encoding.
Definition: encoding.c:5181
ENC_CS_IBM858
Definition: encoding.h:95
ENC_CS_ISO8859_15
Definition: encoding.h:76
ENC_CS_WINDOWS_1254
Definition: encoding.h:86
ENC_CT_MESSAGE
Definition: encoding.h:31
ENC_CT_IMAGE
Definition: encoding.h:27
ENC_CS_ISO8859_3
Definition: encoding.h:65
ENC_MIME_HEADER_FOLD_ASCII_LINES
#define ENC_MIME_HEADER_FOLD_ASCII_LINES
MIME word encoder folding behaviour.
Definition: encoding.c:73
ENC_CS_CESU_8
Definition: encoding.h:97
enc_mime_encode_base64
int enc_mime_encode_base64(const char **enc, const char *data, size_t len)
Encode binary data to base64.
Definition: encoding.c:4744
enc_convert_canonical_to_posix
const char * enc_convert_canonical_to_posix(const char *s, int rcr, int rlf)
Convert from canonical (RFC 822) to local (POSIX) form.
Definition: encoding.c:5579
enc_create_name_addr
const char * enc_create_name_addr(const char *data, size_t offset)
Create a "name-addr" construct according to RFC 5322.
Definition: encoding.c:3952
enc_lines_decode
unsigned long int enc_lines_decode(const char *lines)
Decode number of lines.
Definition: encoding.c:4098
ENC_CTE_B
Definition: encoding.h:55
enc_mime_get_cd
void enc_mime_get_cd(const char *hf_body, enum enc_mime_cd *type, const char **filename)
Decode content disposition.
Definition: encoding.c:7619
ENC_CS_ISO8859_2
Definition: encoding.h:64
enc_mime_ct::flags
unsigned int flags
Definition: encoding.h:118
ENC_CS_WINDOWS_1250
Definition: encoding.h:82
ENC_CS_UTF_8
Definition: encoding.h:99
nntp_anum_t
unsigned long int nntp_anum_t
Article number.
Definition: nntp.h:28
enc_mime_cd
enc_mime_cd
IDs for supported MIME content disposition.
Definition: encoding.h:104
CONF_FLOWED_CRLF
Definition: conf.h:76
ENC_CS_ISO8859_1
Definition: encoding.h:63
enc_mime_multipart
size_t enc_mime_multipart(const char *s, const char *b, struct enc_mime_mpe **mpe)
Parse MIME multipart content.
Definition: encoding.c:8210
CORE_ANUM_T_MAX
#define CORE_ANUM_T_MAX
Article number limit.
Definition: core.h:180
enc_mime_word_encode
int enc_mime_word_encode(const char **r, const char *b, size_t pl)
Encode header field body using MIME encoded-word tokens.
Definition: encoding.c:6103
enc_mime_get_ct
void enc_mime_get_ct(struct enc_mime_ct *ct, const char *hf_body, char *bo)
Decode MIME "Content-Type" header field.
Definition: encoding.c:7235
ENC_CS_IBM437
Definition: encoding.h:91
enc_mime_cte
enc_mime_cte
IDs for supported MIME content transfer encodings.
Definition: encoding.h:48
enc_convert_to_8bit
const char * enc_convert_to_8bit(enum enc_mime_cs *charset, const char *s, const char **cs_iana)
Convert string from Unicode (UTF-8 NFC) to an 8bit character set.
Definition: encoding.c:6005
enc_convert_iso8601_to_posix
int enc_convert_iso8601_to_posix(core_time_t *pts, const char *isodate)
Convert ISO 8601 conformant UTC date and time to POSIX timestamp.
Definition: encoding.c:4450
fu_open_file
int fu_open_file(const char *pathname, int *filedesc, int mode, posix_mode_t perm)
Open file.
Definition: fileutils.c:243
ENC_CT_TEXT
Definition: encoding.h:26
print_error
void print_error(const char *)
Print error message.
Definition: main.cxx:276

Generated at 2024-04-27 using  doxygen