Initial import
[samba] / source / modules / charset_macosxfs.c
1 /* 
2    Unix SMB/CIFS implementation.
3    Samba charset module for Mac OS X/Darwin
4    Copyright (C) Benjamin Riefenstahl 2003
5    
6    This program is free software; you can redistribute it and/or modify
7    it under the terms of the GNU General Public License as published by
8    the Free Software Foundation; either version 2 of the License, or
9    (at your option) any later version.
10    
11    This program is distributed in the hope that it will be useful,
12    but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14    GNU General Public License for more details.
15    
16    You should have received a copy of the GNU General Public License
17    along with this program; if not, write to the Free Software
18    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 */
20
21 /*
22  * modules/charset_macosxfs.c
23  *
24  * A Samba charset module to use on Mac OS X/Darwin as the filesystem
25  * and display encoding.
26  *
27  * Actually two implementations are provided here.  The default
28  * implementation is based on the official CFString API.  The other is
29  * based on internal CFString APIs as defined in the OpenDarwin
30  * source.
31  */
32
33 #include "includes.h"
34
35 /*
36  * Include OS frameworks.  These are only needed in this module.
37  */
38 #include <CoreFoundation/CFString.h>
39
40 /*
41  * See if autoconf has found us the internal headers in some form.
42  */
43 #if HAVE_COREFOUNDATION_CFSTRINGENCODINGCONVERTER_H
44 #       include <Corefoundation/CFStringEncodingConverter.h>
45 #       include <Corefoundation/CFUnicodePrecomposition.h>
46 #       define USE_INTERNAL_API 1
47 #elif HAVE_CFSTRINGENCODINGCONVERTER_H
48 #       include <CFStringEncodingConverter.h>
49 #       include <CFUnicodePrecomposition.h>
50 #       define USE_INTERNAL_API 1
51 #endif
52
53 /*
54  * Compile time configuration: Do we want debug output?
55  */
56 /* #define DEBUG_STRINGS 1 */
57
58 /*
59  * A simple, but efficient memory provider for our buffers.
60  */
/*
 * Grow a reusable heap buffer so that it holds at least `newsize` bytes.
 *
 *   buffer   current allocation (may be NULL when *size is 0)
 *   size     in/out: capacity of `buffer` in bytes
 *   newsize  number of bytes the caller needs
 *
 * Returns the (possibly moved) buffer.  The buffer never shrinks; when it
 * has to grow, 128 bytes of slack are added to reduce reallocations.
 *
 * On failure (allocation error, or newsize+128 not representable) the old
 * block is released, *size is reset to 0 and NULL is returned, so that the
 * caller's pointer/capacity pair stays consistent and a later call simply
 * retries the allocation.
 */
static inline void *resize_buffer (void *buffer, size_t *size, size_t newsize)
{
        size_t wanted;
        void *grown;

        if (newsize <= *size) {
                return buffer;
        }

        /* Refuse requests for which the slack would wrap around. */
        if (newsize > (size_t)-1 - 128) {
                goto fail;
        }

        wanted = newsize + 128;
        grown = realloc(buffer, wanted);
        if (NULL == grown) {
                goto fail;
        }

        *size = wanted;
        return grown;

fail:
        free(buffer);
        *size = 0;
        return NULL;
}
69
70 /*
71  * While there is a version of OpenDarwin for intel, the usual case is
72  * big-endian PPC.  So we need byte swapping to handle the
73  * little-endian byte order of the network protocol.  We also need an
74  * additional dynamic buffer to do this work for incoming data blocks,
75  * because we have to consider the original data as constant.
76  *
77  * We abstract the differences away by providing a simple facade with
78  * these functions/macros:
79  *
80  *      le_to_native(dst,src,len)
81  *      native_to_le(cp,len)
82  *      set_ucbuffer_with_le(buffer,bufsize,data,size)
83  *      set_ucbuffer_with_le_copy(buffer,bufsize,data,size,reserve)
84  */
85 #ifdef WORDS_BIGENDIAN
86
/*
 * Copy `len` bytes from `src` to `dst`, exchanging the two bytes of every
 * complete 16-bit unit.  The regions must not overlap.
 *
 * A trailing odd byte (incomplete unit) is copied unchanged, so exactly
 * `len` bytes are read and written and nothing beyond either buffer is
 * touched.
 */
static inline void swap_bytes (char * dst, const char * src, size_t len)
{
        size_t paired = len & ~(size_t)1;
        size_t i;

        for (i = 0; i < paired; i += 2) {
                dst[i] = src[i+1];
                dst[i+1] = src[i];
        }
        if (paired != len) {
                dst[paired] = src[paired];
        }
}
/*
 * Exchange the two bytes of every complete 16-bit unit in the `len` bytes
 * starting at `cp`.  A trailing odd byte is left untouched, so nothing
 * outside the given range is read or written.
 */
static inline void swap_bytes_inplace (char * cp, size_t len)
{
        char *end = cp + (len & ~(size_t)1);

        for (; cp < end; cp += 2) {
                char second = cp[1];
                cp[1] = cp[0];
                cp[0] = second;
        }
}
108
/*
 * Big-endian host: UTF-16-LE data has to be byte-swapped in both
 * directions, and incoming data must be copied into a private buffer
 * before swapping because the source is treated as constant.
 */
#define le_to_native(dst,src,len)       swap_bytes((dst),(src),(len))
#define native_to_le(cp,len)            swap_bytes_inplace((cp),(len))
#define set_ucbuffer_with_le(buffer,bufsize,data,size) \
        set_ucbuffer_with_le_copy((buffer),(bufsize),(data),(size),0)
113
114 #else   /* ! WORDS_BIGENDIAN */
115
/*
 * Little-endian host: UTF-16-LE already is the native byte order, so
 * copying is a plain memcpy, the in-place conversion is a no-op, and
 * incoming data can be used directly without a private copy (the
 * buffer argument is ignored, bufsize is only evaluated).
 */
#define le_to_native(dst,src,len)       memcpy((dst),(src),(len))
#define native_to_le(cp,len)            ((void)0)
#define set_ucbuffer_with_le(buffer,bufsize,data,size) \
        ((void)(bufsize), (UniChar*)(data))
120
121 #endif
122
123 static inline UniChar *set_ucbuffer_with_le_copy (
124         UniChar *buffer, size_t *bufsize,
125         const void *data, size_t size, size_t reserve)
126 {
127         buffer = resize_buffer(buffer, bufsize, size+reserve);
128         le_to_native((char*)buffer,data,size);
129         return buffer;
130 }
131
132
133 /*
134  * A simple hexdump function for debugging error conditions.
135  */
/*
 * Emit the string `s` verbatim at debug level 0.
 *
 * The string is passed as an argument to an explicit "%s" format rather
 * than as the format itself: hexdump() feeds labels and lines built from
 * raw input data through this macro, and those may contain '%'.
 */
#define debug_out(s)    DEBUG(0,("%s",(s)))
137
138 #ifdef DEBUG_STRINGS
139
/*
 * Write a hex/ASCII dump of the `len` bytes at `s` to the debug log,
 * framed by "<<<<<<<" / ">>>>>>>" markers and preceded by `label`.
 *
 * Each output line shows the offset, up to 16 bytes in hex (split into
 * two groups of 8, padded with blanks on the last line) and the same
 * bytes as characters, with anything outside printable ASCII shown
 * as '.'.
 */
static void hexdump( const char * label, const char * s, size_t len )
{
        size_t remaining = len;

        debug_out("<<<<<<<\n");
        debug_out(label);
        debug_out("\n");

        while (remaining > 0) {
                char line[100];
                char *out = line;
                size_t chunk = (remaining < 16) ? remaining : 16;
                size_t col;
#undef sprintf
                /* Offset column, followed by an extra separator blank. */
                out += sprintf(out, "%04X ", (unsigned)(len-remaining));
                *out++ = ' ';

                /* 16 hex columns with a gap between the two halves. */
                for (col = 0; col < 16; ++col) {
                        if (8 == col) {
                                *out++ = ' ';
                        }
                        if (col < chunk) {
                                out += sprintf(out, "%02X ",
                                               ((unsigned)s[col]) & 0xFF);
                        } else {
                                out += sprintf(out, "   ");
                        }
                }
                *out++ = ' ';

                /* Character column. */
                for (col = 0; col < chunk; ++col) {
                        if (s[col] >= ' ' && s[col] < 0x7F && isprint(s[col])) {
                                *out++ = s[col];
                        } else {
                                *out++ = '.';
                        }
                }
                *out++ = '\n';
                *out = 0;

                debug_out(line);

                s += chunk;
                remaining -= chunk;
        }

        debug_out(">>>>>>>\n");
}
181
182 #else   /* !DEBUG_STRINGS */
183
/* Debug output disabled: hexdump() expands to a no-op, arguments unevaluated. */
#define hexdump(label,s,len) ((void)0)
185
186 #endif
187
188
189 #if !USE_INTERNAL_API
190
/*
 * An implementation based on documented Mac OS X APIs.
 *
 * This does a certain amount of memory management, creating and
 * manipulating CFString objects.  We try to minimize the impact by
 * keeping those objects around and re-using them.  We also use an
 * external backing store for the CFStrings where this is possible and
 * beneficial.
 *
 * The Unicode normalization forms available at this level are
 * generic, not specifically for the file system.  So they may not be
 * perfect fits.
 */
204 static size_t macosxfs_encoding_pull(
205         void *cd,                               /* Encoder handle */
206         char **inbuf, size_t *inbytesleft,      /* Script string */
207         char **outbuf, size_t *outbytesleft)    /* UTF-16-LE string */
208 {
209         static const int script_code = kCFStringEncodingUTF8;
210         static CFMutableStringRef cfstring = NULL;
211         size_t outsize;
212         CFRange range;
213
214         (void) cd; /* UNUSED */
215
216         if (0 == *inbytesleft) {
217                 return 0;
218         }
219
220         if (NULL == cfstring) {
221                 /*
222                  * A version with an external backing store as in the
223                  * push function should have been more efficient, but
224                  * testing shows, that it is actually slower (!).
225                  * Maybe kCFAllocatorDefault gets shortcut evaluation
226                  * internally, while kCFAllocatorNull doesn't.
227                  */
228                 cfstring = CFStringCreateMutable(kCFAllocatorDefault,0);
229         }
230
231         /*
232          * Three methods of appending to a CFString, choose the most
233          * efficient.
234          */
235         if (0 == (*inbuf)[*inbytesleft-1]) {
236                 CFStringAppendCString(cfstring, *inbuf, script_code);
237         } else if (*inbytesleft <= 255) {
238                 Str255 buffer;
239                 buffer[0] = *inbytesleft;
240                 memcpy(buffer+1, *inbuf, buffer[0]);
241                 CFStringAppendPascalString(cfstring, buffer, script_code);
242         } else {
243                 /*
244                  * We would like to use a fixed buffer and a loop
245                  * here, but than we can't garantee that the input is
246                  * well-formed UTF-8, as we are supposed to do.
247                  */
248                 static char *buffer = NULL;
249                 static size_t buflen = 0;
250                 buffer = resize_buffer(buffer, &buflen, *inbytesleft+1);
251                 memcpy(buffer, *inbuf, *inbytesleft);
252                 buffer[*inbytesleft] = 0;
253                 CFStringAppendCString(cfstring, *inbuf, script_code);
254         }
255
256         /*
257          * Compose characters, using the non-canonical composition
258          * form.
259          */
260         CFStringNormalize(cfstring, kCFStringNormalizationFormC);
261
262         outsize = CFStringGetLength(cfstring);
263         range = CFRangeMake(0,outsize);
264
265         if (outsize == 0) {
266                 /*
267                  * HACK: smbd/mangle_hash2.c:is_legal_name() expects
268                  * errors here.  That function will always pass 2
269                  * characters.  smbd/open.c:check_for_pipe() cuts a
270                  * patchname to 10 characters blindly.  Suppress the
271                  * debug output in those cases.
272                  */
273                 if(2 != *inbytesleft && 10 != *inbytesleft) {
274                         debug_out("String conversion: "
275                                   "An unknown error occurred\n");
276                         hexdump("UTF8->UTF16LE (old) input",
277                                 *inbuf, *inbytesleft);
278                 }
279                 errno = EILSEQ; /* Not sure, but this is what we have
280                                  * actually seen. */
281                 return -1;
282         }
283         if (outsize*2 > *outbytesleft) {
284                 CFStringDelete(cfstring, range);
285                 debug_out("String conversion: "
286                           "Output buffer too small\n");
287                 hexdump("UTF8->UTF16LE (old) input",
288                         *inbuf, *inbytesleft);
289                 errno = E2BIG;
290                 return -1;
291         }
292
293         CFStringGetCharacters(cfstring, range, (UniChar*)*outbuf);
294         CFStringDelete(cfstring, range);
295
296         native_to_le(*outbuf, outsize*2);
297
298         /*
299          * Add a converted null byte, if the CFString conversions
300          * prevented that until now.
301          */
302         if (0 == (*inbuf)[*inbytesleft-1] && 
303             (0 != (*outbuf)[outsize*2-1] || 0 != (*outbuf)[outsize*2-2])) {
304
305                 if ((outsize*2+2) > *outbytesleft) {
306                         debug_out("String conversion: "
307                                   "Output buffer too small\n");
308                         hexdump("UTF8->UTF16LE (old) input",
309                                 *inbuf, *inbytesleft);
310                         errno = E2BIG;
311                         return -1;
312                 }
313
314                 (*outbuf)[outsize*2] = (*outbuf)[outsize*2+1] = 0;
315                 outsize += 2;
316         }
317
318         *inbuf += *inbytesleft;
319         *inbytesleft = 0;
320         *outbuf += outsize*2;
321         *outbytesleft -= outsize*2;
322
323         return 0;
324 }
325
326 static size_t macosxfs_encoding_push(
327         void *cd,                               /* Encoder handle */
328         char **inbuf, size_t *inbytesleft,      /* UTF-16-LE string */
329         char **outbuf, size_t *outbytesleft)    /* Script string */
330 {
331         static const int script_code = kCFStringEncodingUTF8;
332         static CFMutableStringRef cfstring = NULL;
333         static UniChar *buffer = NULL;
334         static size_t buflen = 0;
335         CFIndex outsize, cfsize, charsconverted;
336
337         (void) cd; /* UNUSED */
338
339         if (0 == *inbytesleft) {
340                 return 0;
341         }
342
343         /*
344          * We need a buffer that can hold 4 times the original data,
345          * because that is the theoretical maximum that decomposition
346          * can create currently (in Unicode 4.0).
347          */
348         buffer = set_ucbuffer_with_le_copy(
349                 buffer, &buflen, *inbuf, *inbytesleft, 3 * *inbytesleft);
350
351         if (NULL == cfstring) {
352                 cfstring = CFStringCreateMutableWithExternalCharactersNoCopy(
353                         kCFAllocatorDefault,
354                         buffer, *inbytesleft/2, buflen/2,
355                         kCFAllocatorNull);
356         } else {
357                 CFStringSetExternalCharactersNoCopy(
358                         cfstring,
359                         buffer, *inbytesleft/2, buflen/2);
360         }
361
362         /*
363          * Decompose characters, using the non-canonical decomposition
364          * form.
365          *
366          * NB: This isn't exactly what HFS+ wants (see note on
367          * kCFStringEncodingUseHFSPlusCanonical in
368          * CFStringEncodingConverter.h), but AFAIK it's the best that
369          * the official API can do.
370          */
371         CFStringNormalize(cfstring, kCFStringNormalizationFormD);
372
373         cfsize = CFStringGetLength(cfstring);
374         charsconverted = CFStringGetBytes(
375                 cfstring, CFRangeMake(0,cfsize),
376                 script_code, 0, False,
377                 *outbuf, *outbytesleft, &outsize);
378
379         if (0 == charsconverted) {
380                 debug_out("String conversion: "
381                           "Buffer too small or not convertable\n");
382                 hexdump("UTF16LE->UTF8 (old) input",
383                         *inbuf, *inbytesleft);
384                 errno = EILSEQ; /* Probably more likely. */
385                 return -1;
386         }
387
388         /*
389          * Add a converted null byte, if the CFString conversions
390          * prevented that until now.
391          */
392         if (0 == (*inbuf)[*inbytesleft-1] && 0 == (*inbuf)[*inbytesleft-2] &&
393             (0 != (*outbuf)[outsize-1])) {
394
395                 if (((size_t)outsize+1) > *outbytesleft) {
396                         debug_out("String conversion: "
397                                   "Output buffer too small\n");
398                         hexdump("UTF16LE->UTF8 (old) input",
399                                 *inbuf, *inbytesleft);
400                         errno = E2BIG;
401                         return -1;
402                 }
403
404                 (*outbuf)[outsize] = 0;
405                 ++outsize;
406         }
407
408         *inbuf += *inbytesleft;
409         *inbytesleft = 0;
410         *outbuf += outsize;
411         *outbytesleft -= outsize;
412
413         return 0;
414 }
415
416 #else /* USE_INTERNAL_API */
417
/*
 * An implementation based on internal code as known from the
 * OpenDarwin CVS.
 *
 * This code doesn't need much memory management because it uses
 * functions that operate on the raw memory directly.
 *
 * The push routine here is faster and more compatible with HFS+ than
 * the other implementation above.  The pull routine is only faster
 * for some strings and slightly slower for others.  The pull routine
 * loses because it has to iterate over the data twice, once to
 * decode UTF-8 and then to do the character composition required by
 * Windows.
 */
432 static size_t macosxfs_encoding_pull(
433         void *cd,                               /* Encoder handle */
434         char **inbuf, size_t *inbytesleft,      /* Script string */
435         char **outbuf, size_t *outbytesleft)    /* UTF-16-LE string */
436 {
437         static const int script_code = kCFStringEncodingUTF8;
438         UInt32 srcCharsUsed = 0;
439         UInt32 dstCharsUsed = 0;
440         UInt32 result;
441         uint32_t dstDecomposedUsed = 0;
442         uint32_t dstPrecomposedUsed = 0;
443
444         (void) cd; /* UNUSED */
445
446         if (0 == *inbytesleft) {
447                 return 0;
448         }
449
450         result = CFStringEncodingBytesToUnicode(
451                 script_code, kCFStringEncodingComposeCombinings,
452                 *inbuf, *inbytesleft, &srcCharsUsed,
453                 (UniChar*)*outbuf, *outbytesleft, &dstCharsUsed);
454
455         switch(result) {
456         case kCFStringEncodingConversionSuccess:
457                 if (*inbytesleft == srcCharsUsed)
458                         break;
459                 else
460                         ; /*fall through*/
461         case kCFStringEncodingInsufficientOutputBufferLength:
462                 debug_out("String conversion: "
463                           "Output buffer too small\n");
464                 hexdump("UTF8->UTF16LE (new) input",
465                         *inbuf, *inbytesleft);
466                 errno = E2BIG;
467                 return -1;
468         case kCFStringEncodingInvalidInputStream:
469                 /*
470                  * HACK: smbd/mangle_hash2.c:is_legal_name() expects
471                  * errors here.  That function will always pass 2
472                  * characters.  smbd/open.c:check_for_pipe() cuts a
473                  * patchname to 10 characters blindly.  Suppress the
474                  * debug output in those cases.
475                  */
476                 if(2 != *inbytesleft && 10 != *inbytesleft) {
477                         debug_out("String conversion: "
478                                   "Invalid input sequence\n");
479                         hexdump("UTF8->UTF16LE (new) input",
480                                 *inbuf, *inbytesleft);
481                 }
482                 errno = EILSEQ;
483                 return -1;
484         case kCFStringEncodingConverterUnavailable:
485                 debug_out("String conversion: "
486                           "Unknown encoding\n");
487                 hexdump("UTF8->UTF16LE (new) input",
488                         *inbuf, *inbytesleft);
489                 errno = EINVAL;
490                 return -1;
491         }
492
493         /*
494          * It doesn't look like CFStringEncodingBytesToUnicode() can
495          * produce precomposed characters (flags=ComposeCombinings
496          * doesn't do it), so we need another pass over the data here.
497          * We can do this in-place, as the string can only get
498          * shorter.
499          *
500          * (Actually in theory there should be an internal
501          * decomposition and reordering before the actual composition
502          * step.  But we should be able to rely on that we always get
503          * fully decomposed strings for input, so this can't create
504          * problems in reality.)
505          */
506         CFUniCharPrecompose(
507                 (const UTF16Char *)*outbuf, dstCharsUsed, &dstDecomposedUsed,
508                 (UTF16Char *)*outbuf, dstCharsUsed, &dstPrecomposedUsed);
509
510         native_to_le(*outbuf, dstPrecomposedUsed*2);
511
512         *inbuf += srcCharsUsed;
513         *inbytesleft -= srcCharsUsed;
514         *outbuf += dstPrecomposedUsed*2;
515         *outbytesleft -= dstPrecomposedUsed*2;
516
517         return 0;
518 }
519
520 static size_t macosxfs_encoding_push(
521         void *cd,                               /* Encoder handle */
522         char **inbuf, size_t *inbytesleft,      /* UTF-16-LE string */
523         char **outbuf, size_t *outbytesleft)    /* Script string */
524 {
525         static const int script_code = kCFStringEncodingUTF8;
526         static UniChar *buffer = NULL;
527         static size_t buflen = 0;
528         UInt32 srcCharsUsed=0, dstCharsUsed=0, result;
529
530         (void) cd; /* UNUSED */
531
532         if (0 == *inbytesleft) {
533                 return 0;
534         }
535
536         buffer = set_ucbuffer_with_le(
537                 buffer, &buflen, *inbuf, *inbytesleft);
538
539         result = CFStringEncodingUnicodeToBytes(
540                 script_code, kCFStringEncodingUseHFSPlusCanonical,
541                 buffer, *inbytesleft/2, &srcCharsUsed,
542                 *outbuf, *outbytesleft, &dstCharsUsed);
543
544         switch(result) {
545         case kCFStringEncodingConversionSuccess:
546                 if (*inbytesleft/2 == srcCharsUsed)
547                         break;
548                 else
549                         ; /*fall through*/
550         case kCFStringEncodingInsufficientOutputBufferLength:
551                 debug_out("String conversion: "
552                           "Output buffer too small\n");
553                 hexdump("UTF16LE->UTF8 (new) input",
554                         *inbuf, *inbytesleft);
555                 errno = E2BIG;
556                 return -1;
557         case kCFStringEncodingInvalidInputStream:
558                 /*
559                  * HACK: smbd/open.c:check_for_pipe():is_legal_name()
560                  * cuts a pathname to 10 characters blindly.  Suppress
561                  * the debug output in those cases.
562                  */
563                 if(10 != *inbytesleft) {
564                         debug_out("String conversion: "
565                                   "Invalid input sequence\n");
566                         hexdump("UTF16LE->UTF8 (new) input",
567                                 *inbuf, *inbytesleft);
568                 }
569                 errno = EILSEQ;
570                 return -1;
571         case kCFStringEncodingConverterUnavailable:
572                 debug_out("String conversion: "
573                           "Unknown encoding\n");
574                 hexdump("UTF16LE->UTF8 (new) input",
575                         *inbuf, *inbytesleft);
576                 errno = EINVAL;
577                 return -1;
578         }
579
580         *inbuf += srcCharsUsed*2;
581         *inbytesleft -= srcCharsUsed*2;
582         *outbuf += dstCharsUsed;
583         *outbytesleft -= dstCharsUsed;
584
585         return 0;
586 }
587
588 #endif /* USE_INTERNAL_API */
589
590 /*
591  * For initialization, actually install the encoding as "macosxfs".
592  */
593 static struct charset_functions macosxfs_encoding_functions = {
594         "MACOSXFS", macosxfs_encoding_pull, macosxfs_encoding_push
595 };
596
597 NTSTATUS init_module(void)
598 {
599         return smb_register_charset(&macosxfs_encoding_functions);
600 }
601
602 /* eof */