GNU Linux-libre 4.14.254-gnu1
[releases.git] / fs / udf / unicode.c
1 /*
2  * unicode.c
3  *
4  * PURPOSE
5  *      Routines for converting between UTF-8 and OSTA Compressed Unicode.
6  *      Also handles filename mangling
7  *
8  * DESCRIPTION
9  *      OSTA Compressed Unicode is explained in the OSTA UDF specification.
10  *              http://www.osta.org/
 *      UTF-8 is explained in IETF RFC 3629.
 *              https://www.rfc-editor.org/rfc/rfc3629.txt
13  *
14  * COPYRIGHT
15  *      This file is distributed under the terms of the GNU General Public
16  *      License (GPL). Copies of the GPL can be obtained from:
17  *              ftp://prep.ai.mit.edu/pub/gnu/GPL
18  *      Each contributing author retains all rights to their own work.
19  */
20
21 #include "udfdecl.h"
22
23 #include <linux/kernel.h>
24 #include <linux/string.h>       /* for memset */
25 #include <linux/nls.h>
26 #include <linux/crc-itu-t.h>
27 #include <linux/slab.h>
28
29 #include "udf_sb.h"
30
/*
 * A character is a UTF-16 surrogate when its bits selected by SURROGATE_MASK
 * are equal to SURROGATE_PAIR, i.e. when it lies in 0xd800-0xdfff.
 */
#define SURROGATE_MASK 0xfffff800
#define SURROGATE_PAIR 0x0000d800

/*
 * Encode a single Unicode character as UTF-8.
 *
 * @uni:      character to encode
 * @out:      destination buffer
 * @boundlen: number of bytes available in @out
 *
 * Returns the number of bytes stored in @out (1 to 3), -ENAMETOOLONG when
 * @out cannot hold the encoded character, or -EINVAL when @uni is a
 * surrogate.
 */
static int udf_uni2char_utf8(wchar_t uni,
                             unsigned char *out,
                             int boundlen)
{
        int need;

        if (boundlen <= 0)
                return -ENAMETOOLONG;

        if ((uni & SURROGATE_MASK) == SURROGATE_PAIR)
                return -EINVAL;

        /* Work out how many bytes the encoding takes before writing any */
        if (uni < 0x80)
                need = 1;
        else if (uni < 0x800)
                need = 2;
        else
                need = 3;

        if (boundlen < need)
                return -ENAMETOOLONG;

        switch (need) {
        case 1:
                out[0] = (unsigned char)uni;
                break;
        case 2:
                out[0] = (unsigned char)(0xc0 | (uni >> 6));
                out[1] = (unsigned char)(0x80 | (uni & 0x3f));
                break;
        default:
                out[0] = (unsigned char)(0xe0 | (uni >> 12));
                out[1] = (unsigned char)(0x80 | ((uni >> 6) & 0x3f));
                out[2] = (unsigned char)(0x80 | (uni & 0x3f));
                break;
        }

        return need;
}
62
/*
 * Decode one UTF-8 sequence into a Unicode character.
 *
 * @in:       input bytes
 * @boundlen: number of bytes available in @in
 * @uni:      where the decoded character is stored
 *
 * Returns the number of input bytes consumed. When @boundlen is not
 * positive nothing is read, 0 is returned and *@uni is left untouched.
 *
 * Returns -EINVAL and stores '?' in *@uni when the input is not a valid
 * sequence: an unexpected lead byte, a sequence cut short by @boundlen, a
 * continuation byte that is not of the form 10xxxxxx, or a decoded value
 * that does not fit into wchar_t.
 */
static int udf_char2uni_utf8(const unsigned char *in,
                             int boundlen,
                             wchar_t *uni)
{
        unsigned int utf_char;
        unsigned char c;
        int utf_cnt, u_len;

        utf_char = 0;
        utf_cnt = 0;
        for (u_len = 0; u_len < boundlen;) {
                c = in[u_len++];

                /* Complete a multi-byte UTF-8 character */
                if (utf_cnt) {
                        /*
                         * Every byte after the lead byte must look like
                         * 10xxxxxx. Leaving with utf_cnt != 0 reports the
                         * sequence as invalid below.
                         */
                        if ((c & 0xc0) != 0x80)
                                break;
                        utf_char = (utf_char << 6) | (c & 0x3f);
                        if (--utf_cnt)
                                continue;
                } else {
                        /* Check for a multi-byte UTF-8 character */
                        if (c & 0x80) {
                                /* Start a multi-byte UTF-8 character */
                                if ((c & 0xe0) == 0xc0) {
                                        utf_char = c & 0x1f;
                                        utf_cnt = 1;
                                } else if ((c & 0xf0) == 0xe0) {
                                        utf_char = c & 0x0f;
                                        utf_cnt = 2;
                                } else if ((c & 0xf8) == 0xf0) {
                                        utf_char = c & 0x07;
                                        utf_cnt = 3;
                                } else if ((c & 0xfc) == 0xf8) {
                                        utf_char = c & 0x03;
                                        utf_cnt = 4;
                                } else if ((c & 0xfe) == 0xfc) {
                                        utf_char = c & 0x01;
                                        utf_cnt = 5;
                                } else {
                                        /* Stray continuation or bad lead */
                                        utf_cnt = -1;
                                        break;
                                }
                                continue;
                        } else {
                                /* Single byte UTF-8 character (most common) */
                                utf_char = c;
                        }
                }

                /*
                 * wchar_t may be narrower than the decoded value; refuse
                 * the character rather than store a truncated one.
                 */
                if ((unsigned int)(wchar_t)utf_char != utf_char) {
                        utf_cnt = -1;
                        break;
                }
                *uni = utf_char;
                break;
        }
        if (utf_cnt) {
                *uni = '?';
                return -EINVAL;
        }
        return u_len;
}
119
/* Character written in place of illegal characters in translated names */
#define ILLEGAL_CHAR_MARK       '_'
/* Separator between a name and its extension */
#define EXT_MARK                '.'
/* First character of the CRC suffix */
#define CRC_MARK                '#'
/* Number of trailing input characters examined when looking for EXT_MARK */
#define EXT_SIZE                5
/* Number of chars we need to store generated CRC to make filename unique */
#define CRC_LEN                 5

/*
 * Convert the next character of a CS0 name and append it to the output.
 *
 * @str_o:         output buffer
 * @str_o_max_len: size of @str_o
 * @str_o_idx:     current output position, advanced by the bytes written
 * @str_i:         CS0 input without the compression ID byte
 * @str_i_max_len: length of @str_i in bytes
 * @str_i_idx:     current input position, advanced past consumed characters
 * @u_ch:          bytes per input character (values above 1 read two bytes)
 * @needsCRC:      set to 1 when the output no longer matches the input
 *                 exactly (no room, illegal or unconvertible character)
 * @conv_f:        converts one Unicode character to the output charset
 * @translate:     non-zero to treat '/' and NUL as illegal characters
 *
 * With @translate set, a run of consecutive illegal characters is consumed
 * as a whole and produces a single ILLEGAL_CHAR_MARK.
 *
 * Returns 1 when a character was appended to @str_o and 0 when the input is
 * exhausted or the output has no room left.
 */
static int udf_name_conv_char(uint8_t *str_o, int str_o_max_len,
                              int *str_o_idx,
                              const uint8_t *str_i, int str_i_max_len,
                              int *str_i_idx,
                              int u_ch, int *needsCRC,
                              int (*conv_f)(wchar_t, unsigned char *, int),
                              int translate)
{
        uint32_t ch = 0;
        int saw_illegal = 0;
        int have_char = 0;
        int n;

        while (!have_char && *str_i_idx < str_i_max_len) {
                int bad;

                /* No room for even one more byte of output */
                if (*str_o_idx >= str_o_max_len) {
                        *needsCRC = 1;
                        return 0;
                }

                /* Expand OSTA compressed Unicode to Unicode */
                ch = str_i[*str_i_idx];
                if (u_ch > 1)
                        ch = (ch << 8) | str_i[*str_i_idx + 1];

                bad = translate && (ch == '/' || ch == 0);

                /*
                 * A legal character ends a run of illegal ones; leave it
                 * unconsumed so that the next call picks it up.
                 */
                if (!bad && saw_illegal)
                        break;

                if (bad)
                        saw_illegal = 1;
                else
                        have_char = 1;
                *str_i_idx += u_ch;
        }

        if (saw_illegal) {
                /* The whole run collapses into one replacement character */
                *needsCRC = 1;
                ch = ILLEGAL_CHAR_MARK;
        } else if (!have_char) {
                return 0;
        }

        n = conv_f(ch, &str_o[*str_o_idx], str_o_max_len - *str_o_idx);
        /* Valid character? */
        if (n >= 0) {
                *str_o_idx += n;
                return 1;
        }

        *needsCRC = 1;
        if (n == -ENAMETOOLONG)
                return 0;

        /* Character cannot be represented in the output charset */
        str_o[(*str_o_idx)++] = '?';
        return 1;
}
177
178 static int udf_name_from_CS0(uint8_t *str_o, int str_max_len,
179                              const uint8_t *ocu, int ocu_len,
180                              int (*conv_f)(wchar_t, unsigned char *, int),
181                              int translate)
182 {
183         uint32_t c;
184         uint8_t cmp_id;
185         int idx, len;
186         int u_ch;
187         int needsCRC = 0;
188         int ext_i_len, ext_max_len;
189         int str_o_len = 0;      /* Length of resulting output */
190         int ext_o_len = 0;      /* Extension output length */
191         int ext_crc_len = 0;    /* Extension output length if used with CRC */
192         int i_ext = -1;         /* Extension position in input buffer */
193         int o_crc = 0;          /* Rightmost possible output pos for CRC+ext */
194         unsigned short valueCRC;
195         uint8_t ext[EXT_SIZE * NLS_MAX_CHARSET_SIZE + 1];
196         uint8_t crc[CRC_LEN];
197
198         if (str_max_len <= 0)
199                 return 0;
200
201         if (ocu_len == 0) {
202                 memset(str_o, 0, str_max_len);
203                 return 0;
204         }
205
206         cmp_id = ocu[0];
207         if (cmp_id != 8 && cmp_id != 16) {
208                 memset(str_o, 0, str_max_len);
209                 pr_err("unknown compression code (%d)\n", cmp_id);
210                 return -EINVAL;
211         }
212         u_ch = cmp_id >> 3;
213
214         ocu++;
215         ocu_len--;
216
217         if (ocu_len % u_ch) {
218                 pr_err("incorrect filename length (%d)\n", ocu_len + 1);
219                 return -EINVAL;
220         }
221
222         if (translate) {
223                 /* Look for extension */
224                 for (idx = ocu_len - u_ch, ext_i_len = 0;
225                      (idx >= 0) && (ext_i_len < EXT_SIZE);
226                      idx -= u_ch, ext_i_len++) {
227                         c = ocu[idx];
228                         if (u_ch > 1)
229                                 c = (c << 8) | ocu[idx + 1];
230
231                         if (c == EXT_MARK) {
232                                 if (ext_i_len)
233                                         i_ext = idx;
234                                 break;
235                         }
236                 }
237                 if (i_ext >= 0) {
238                         /* Convert extension */
239                         ext_max_len = min_t(int, sizeof(ext), str_max_len);
240                         ext[ext_o_len++] = EXT_MARK;
241                         idx = i_ext + u_ch;
242                         while (udf_name_conv_char(ext, ext_max_len, &ext_o_len,
243                                                   ocu, ocu_len, &idx,
244                                                   u_ch, &needsCRC,
245                                                   conv_f, translate)) {
246                                 if ((ext_o_len + CRC_LEN) < str_max_len)
247                                         ext_crc_len = ext_o_len;
248                         }
249                 }
250         }
251
252         idx = 0;
253         while (1) {
254                 if (translate && (idx == i_ext)) {
255                         if (str_o_len > (str_max_len - ext_o_len))
256                                 needsCRC = 1;
257                         break;
258                 }
259
260                 if (!udf_name_conv_char(str_o, str_max_len, &str_o_len,
261                                         ocu, ocu_len, &idx,
262                                         u_ch, &needsCRC, conv_f, translate))
263                         break;
264
265                 if (translate &&
266                     (str_o_len <= (str_max_len - ext_o_len - CRC_LEN)))
267                         o_crc = str_o_len;
268         }
269
270         if (translate) {
271                 if (str_o_len <= 2 && str_o[0] == '.' &&
272                     (str_o_len == 1 || str_o[1] == '.'))
273                         needsCRC = 1;
274                 if (needsCRC) {
275                         str_o_len = o_crc;
276                         valueCRC = crc_itu_t(0, ocu, ocu_len);
277                         crc[0] = CRC_MARK;
278                         crc[1] = hex_asc_upper_hi(valueCRC >> 8);
279                         crc[2] = hex_asc_upper_lo(valueCRC >> 8);
280                         crc[3] = hex_asc_upper_hi(valueCRC);
281                         crc[4] = hex_asc_upper_lo(valueCRC);
282                         len = min_t(int, CRC_LEN, str_max_len - str_o_len);
283                         memcpy(&str_o[str_o_len], crc, len);
284                         str_o_len += len;
285                         ext_o_len = ext_crc_len;
286                 }
287                 if (ext_o_len > 0) {
288                         memcpy(&str_o[str_o_len], ext, ext_o_len);
289                         str_o_len += ext_o_len;
290                 }
291         }
292
293         return str_o_len;
294 }
295
/*
 * Convert a name in the input charset to OSTA CS0 (compressed Unicode).
 *
 * @ocu:         output buffer; its first byte receives the compression ID
 * @ocu_max_len: size of @ocu in bytes
 * @str_i:       input name
 * @str_len:     length of @str_i in bytes
 * @conv_f:      decodes one input character, returning the bytes consumed
 *
 * The name is first written with one byte per character (ID 8). As soon as
 * a character above 0xff shows up, conversion starts over with two bytes
 * per character (ID 16). Characters @conv_f cannot decode become '?'.
 *
 * Returns the number of bytes used in @ocu, compression ID included, or 0
 * when @ocu_max_len is not positive or the name does not fit.
 */
static int udf_name_to_CS0(uint8_t *ocu, int ocu_max_len,
                           const uint8_t *str_i, int str_len,
                           int (*conv_f)(const unsigned char *, int, wchar_t *))
{
        unsigned int limit = 0xff;
        int width = 1;
        int pos, out, consumed;
        wchar_t wc;

        if (ocu_max_len <= 0)
                return 0;

        memset(ocu, 0, ocu_max_len);
        ocu[0] = 8;

restart:
        out = 1;
        pos = 0;
        while (pos < str_len) {
                /* Name didn't fit? */
                if (out + width > ocu_max_len)
                        return 0;

                consumed = conv_f(&str_i[pos], str_len - pos, &wc);
                if (consumed == 0) {
                        /* Nothing decoded, step over this byte */
                        pos++;
                        continue;
                }
                /* Invalid character, deal with it */
                if (consumed < 0) {
                        consumed = 1;
                        wc = '?';
                }

                if (wc > limit) {
                        /* Does not fit into one byte, redo with two */
                        limit = 0xffff;
                        ocu[0] = 0x10;
                        width = 2;
                        goto restart;
                }

                if (limit == 0xffff)
                        ocu[out++] = (uint8_t)(wc >> 8);
                ocu[out++] = (uint8_t)(wc & 0xff);
                pos += consumed;
        }

        return out;
}
343
344 /*
345  * Convert CS0 dstring to output charset. Warning: This function may truncate
346  * input string if it is too long as it is used for informational strings only
347  * and it is better to truncate the string than to refuse mounting a media.
348  */
349 int udf_dstrCS0toUTF8(uint8_t *utf_o, int o_len,
350                       const uint8_t *ocu_i, int i_len)
351 {
352         int s_len = 0;
353
354         if (i_len > 0) {
355                 s_len = ocu_i[i_len - 1];
356                 if (s_len >= i_len) {
357                         pr_warn("incorrect dstring lengths (%d/%d),"
358                                 " truncating\n", s_len, i_len);
359                         s_len = i_len - 1;
360                         /* 2-byte encoding? Need to round properly... */
361                         if (ocu_i[0] == 16)
362                                 s_len -= (s_len - 1) & 2;
363                 }
364         }
365
366         return udf_name_from_CS0(utf_o, o_len, ocu_i, s_len,
367                                  udf_uni2char_utf8, 0);
368 }
369
370 int udf_get_filename(struct super_block *sb, const uint8_t *sname, int slen,
371                      uint8_t *dname, int dlen)
372 {
373         int (*conv_f)(wchar_t, unsigned char *, int);
374         int ret;
375
376         if (!slen)
377                 return -EIO;
378
379         if (dlen <= 0)
380                 return 0;
381
382         if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) {
383                 conv_f = udf_uni2char_utf8;
384         } else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) {
385                 conv_f = UDF_SB(sb)->s_nls_map->uni2char;
386         } else
387                 BUG();
388
389         ret = udf_name_from_CS0(dname, dlen, sname, slen, conv_f, 1);
390         /* Zero length filename isn't valid... */
391         if (ret == 0)
392                 ret = -EINVAL;
393         return ret;
394 }
395
396 int udf_put_filename(struct super_block *sb, const uint8_t *sname, int slen,
397                      uint8_t *dname, int dlen)
398 {
399         int (*conv_f)(const unsigned char *, int, wchar_t *);
400
401         if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) {
402                 conv_f = udf_char2uni_utf8;
403         } else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) {
404                 conv_f = UDF_SB(sb)->s_nls_map->char2uni;
405         } else
406                 BUG();
407
408         return udf_name_to_CS0(dname, dlen, sname, slen, conv_f);
409 }
410