1 // SPDX-License-Identifier: GPL-2.0-or-later
3 * Some of the source code in this file came from fs/cifs/cifs_unicode.c
5 * Copyright (c) International Business Machines Corp., 2000,2009
6 * Modified by Steve French (sfrench@us.ibm.com)
7 * Modified by Namjae Jeon (linkinjeon@kernel.org)
10 #include <linux/slab.h>
11 #include <asm/unaligned.h>
15 #include "smb_common.h"
18 * cifs_mapchar() - convert a host-endian char to proper char in codepage
19 * @target: where converted character should be copied
20 * @from: host-endian source string
21 * @cp: codepage to which character should be converted
22 * @mapchar: should character be mapped according to mapchars mount option?
24 * This function handles the conversion of a single character. It is the
25 * responsibility of the caller to ensure that the target buffer is large
26 * enough to hold the result of the conversion (at least NLS_MAX_CHARSET_SIZE).
28 * Return: string length after conversion
31 cifs_mapchar(char *target, const __u16 *from, const struct nls_table *cp,
43 * BB: Cannot handle remapping UNI_SLASH until all the calls to
44 * build_path_from_dentry are modified, as they use slash as
74 len = cp->uni2char(src_char, target, NLS_MAX_CHARSET_SIZE);
81 /* convert SURROGATE_PAIR and IVS */
82 if (strcmp(cp->charset, "utf8"))
84 len = utf16s_to_utf8s(from, 3, UTF16_LITTLE_ENDIAN, target, 6);
96 * smb_utf16_bytes() - compute converted string length
97 * @from: pointer to input string
98 * @maxbytes: input string length
99 * @codepage: destination codepage
101 * Walk a utf16le string and return the number of bytes that the string will
102 * be after being converted to the given charset, not including any null
103 * termination required. Don't walk past maxbytes in the source buffer.
105 * Return: string length after conversion
107 static int smb_utf16_bytes(const __le16 *from, int maxbytes,
108 const struct nls_table *codepage)
111 int charlen, outlen = 0;
112 int maxwords = maxbytes / 2;
113 char tmp[NLS_MAX_CHARSET_SIZE];
116 for (i = 0; i < maxwords; i++) {
117 ftmp[0] = get_unaligned_le16(&from[i]);
120 for (j = 1; j <= 2; j++) {
121 if (i + j < maxwords)
122 ftmp[j] = get_unaligned_le16(&from[i + j]);
127 charlen = cifs_mapchar(tmp, ftmp, codepage, 0);
138 * smb_from_utf16() - convert utf16le string to local charset
139 * @to: destination buffer
140 * @from: source buffer
141 * @tolen: destination buffer size (in bytes)
142 * @fromlen: source buffer size (in bytes)
143 * @codepage: codepage to which characters should be converted
144 * @mapchar: should characters be remapped according to the mapchars option?
146 * Convert a little-endian utf16le string (as sent by the server) to a string
147 * in the provided codepage. The tolen and fromlen parameters are to ensure
148 * that the code doesn't walk off of the end of the buffer (which is always
149 * a danger if the alignment of the source buffer is off). The destination
150 * string is always properly null terminated and fits in the destination
151 * buffer. Returns the length of the destination string in bytes (including
154 * Note that some Windows versions actually send multiword UTF-16 characters
155 * instead of plain UCS-2. The Linux nls routines however aren't able to
156 * deal with those characters properly. In the event that we get some of
157 * those characters, they won't be translated properly.
159 * Return: string length after conversion
161 static int smb_from_utf16(char *to, const __le16 *from, int tolen, int fromlen,
162 const struct nls_table *codepage, bool mapchar)
164 int i, j, charlen, safelen;
166 int nullsize = nls_nullsize(codepage);
167 int fromwords = fromlen / 2;
168 char tmp[NLS_MAX_CHARSET_SIZE];
169 __u16 ftmp[3]; /* ftmp[3] = 3array x 2bytes = 6bytes UTF-16 */
172 * because the chars can be of varying widths, we need to take care
173 * not to overflow the destination buffer when we get close to the
174 * end of it. Until we get to this offset, we don't need to check
175 * for overflow however.
177 safelen = tolen - (NLS_MAX_CHARSET_SIZE + nullsize);
179 for (i = 0; i < fromwords; i++) {
180 ftmp[0] = get_unaligned_le16(&from[i]);
183 for (j = 1; j <= 2; j++) {
184 if (i + j < fromwords)
185 ftmp[j] = get_unaligned_le16(&from[i + j]);
191 * check to see if converting this character might make the
192 * conversion bleed into the null terminator
194 if (outlen >= safelen) {
195 charlen = cifs_mapchar(tmp, ftmp, codepage, mapchar);
196 if ((outlen + charlen) > (tolen - nullsize))
200 /* put converted char into 'to' buffer */
201 charlen = cifs_mapchar(&to[outlen], ftmp, codepage, mapchar);
205 * charlen (=bytes of UTF-8 for 1 character)
206 * 4bytes UTF-8(surrogate pair) is charlen=4
207 * (4bytes UTF-16 code)
208 * 7-8bytes UTF-8(IVS) is charlen=3+4 or 4+4
209 * (2 UTF-8 pairs divided to 2 UTF-16 pairs)
213 else if (charlen >= 5)
218 /* properly null-terminate string */
219 for (i = 0; i < nullsize; i++)
226 * smb_strtoUTF16() - Convert character string to unicode string
227 * @to: destination buffer
228 * @from: source buffer
229 * @len: length of the source string (in bytes)
230 * @codepage: codepage of the source string
232 * Return: string length after conversion
234 int smb_strtoUTF16(__le16 *to, const char *from, int len,
235 const struct nls_table *codepage)
239 wchar_t wchar_to; /* needed to quiet sparse */
241 /* special case for utf8 to handle no plane0 chars */
242 if (!strcmp(codepage->charset, "utf8")) {
244 * convert utf8 -> utf16, we assume we have enough space
245 * as caller should have assumed conversion does not overflow
246 * in destination len is length in wchar_t units (16bits)
248 i = utf8s_to_utf16s(from, len, UTF16_LITTLE_ENDIAN,
251 /* if success terminate and exit */
255 * if fails fall back to UCS encoding as this
256 * function should not return negative values
257 * currently can fail only if source contains
258 * invalid encoded characters
262 for (i = 0; len > 0 && *from; i++, from += charlen, len -= charlen) {
263 charlen = codepage->char2uni(from, len, &wchar_to);
265 /* A question mark */
269 put_unaligned_le16(wchar_to, &to[i]);
273 put_unaligned_le16(0, &to[i]);
278 * smb_strndup_from_utf16() - copy a string from wire format to the local
280 * @src: source string
281 * @maxlen: don't walk past this many bytes in the source string
282 * @is_unicode: is this a unicode string?
283 * @codepage: destination codepage
285 * Take a string given by the server, convert it to the local codepage and
286 * put it in a new buffer. Returns a pointer to the new string or ERR_PTR on
289 * Return: destination string buffer or error ptr
291 char *smb_strndup_from_utf16(const char *src, const int maxlen,
292 const bool is_unicode,
293 const struct nls_table *codepage)
299 len = smb_utf16_bytes((__le16 *)src, maxlen, codepage);
300 len += nls_nullsize(codepage);
301 dst = kmalloc(len, GFP_KERNEL);
303 return ERR_PTR(-ENOMEM);
304 ret = smb_from_utf16(dst, (__le16 *)src, len, maxlen, codepage,
308 return ERR_PTR(-EINVAL);
311 len = strnlen(src, maxlen);
313 dst = kmalloc(len, GFP_KERNEL);
315 return ERR_PTR(-ENOMEM);
316 strscpy(dst, src, len);
323 * Convert a pathname from a string in the current code page to 16 bit
324 * Unicode wire format. Conversion may involve remapping the six characters
325 * that are only legal in POSIX-like OS (if they are present in the string).
326 * Path names are little endian 16 bit Unicode on the wire
329 * smbConvertToUTF16() - convert string from local charset to utf16
330 * @target: destination buffer
331 * @source: source buffer
332 * @srclen: source buffer size (in bytes)
333 * @cp: codepage of the source string
334 * @mapchars: should characters be remapped according to the mapchars option?
336 * Convert a pathname from a string in the current code page to 16 bit
337 * Unicode wire format. Conversion may involve remapping the six characters
338 * that are only legal in POSIX-like OS (if they are present in the string).
339 * Path names are little endian 16 bit Unicode on the wire
341 * Return: char length after conversion
343 int smbConvertToUTF16(__le16 *target, const char *source, int srclen,
344 const struct nls_table *cp, int mapchars)
350 wchar_t wchar_to[6]; /* UTF-16 */
355 return smb_strtoUTF16(target, source, srclen, cp);
357 for (i = 0, j = 0; i < srclen; j++) {
358 src_char = source[i];
362 put_unaligned(0, &target[j]);
365 dst_char = cpu_to_le16(UNI_COLON);
368 dst_char = cpu_to_le16(UNI_ASTERISK);
371 dst_char = cpu_to_le16(UNI_QUESTION);
374 dst_char = cpu_to_le16(UNI_LESSTHAN);
377 dst_char = cpu_to_le16(UNI_GRTRTHAN);
380 dst_char = cpu_to_le16(UNI_PIPE);
383 * FIXME: We can not handle remapping backslash (UNI_SLASH)
384 * until all the calls to build_path_from_dentry are modified,
385 * as they use backslash as separator.
388 charlen = cp->char2uni(source + i, srclen - i, &tmp);
389 dst_char = cpu_to_le16(tmp);
392 * if no match, use question mark, which at least in
393 * some cases serves as wild card
398 /* convert SURROGATE_PAIR */
399 if (strcmp(cp->charset, "utf8"))
401 if (*(source + i) & 0x80) {
402 charlen = utf8_to_utf32(source + i, 6, &u);
407 ret = utf8s_to_utf16s(source + i, charlen,
414 dst_char = cpu_to_le16(*wchar_to);
416 /* 1-3bytes UTF-8 to 2bytes UTF-16 */
417 put_unaligned(dst_char, &target[j]);
418 else if (charlen == 4) {
420 * 4bytes UTF-8(surrogate pair) to 4bytes UTF-16
421 * 7-8bytes UTF-8(IVS) divided to 2 UTF-16
422 * (charlen=3+4 or 4+4)
424 put_unaligned(dst_char, &target[j]);
425 dst_char = cpu_to_le16(*(wchar_to + 1));
427 put_unaligned(dst_char, &target[j]);
428 } else if (charlen >= 5) {
429 /* 5-6bytes UTF-8 to 6bytes UTF-16 */
430 put_unaligned(dst_char, &target[j]);
431 dst_char = cpu_to_le16(*(wchar_to + 1));
433 put_unaligned(dst_char, &target[j]);
434 dst_char = cpu_to_le16(*(wchar_to + 2));
436 put_unaligned(dst_char, &target[j]);
441 dst_char = cpu_to_le16(0x003f);
447 * character may take more than one byte in the source string,
448 * but will take exactly two bytes in the target string
451 put_unaligned(dst_char, &target[j]);