00001 00021 #include "lightmediascanner_charset_conv.h" 00022 #include <iconv.h> 00023 #include <stdio.h> 00024 #include <stdlib.h> 00025 #include <string.h> 00026 #include <errno.h> 00027 #include <ctype.h> 00028 00029 struct lms_charset_conv { 00030 iconv_t check; 00031 iconv_t fallback; 00032 unsigned int size; 00033 iconv_t *convs; 00034 char **names; 00035 }; 00036 00046 lms_charset_conv_t * 00047 lms_charset_conv_new_full(int use_check, int use_fallback) 00048 { 00049 lms_charset_conv_t *lcc; 00050 00051 lcc = malloc(sizeof(*lcc)); 00052 if (!lcc) { 00053 perror("malloc"); 00054 return NULL; 00055 } 00056 00057 if (!use_check) 00058 lcc->check = (iconv_t)-1; 00059 else { 00060 lcc->check = iconv_open("UTF-8", "UTF-8"); 00061 if (lcc->check == (iconv_t)-1) { 00062 perror("ERROR: could not create conversion checker"); 00063 goto error_check; 00064 } 00065 } 00066 00067 if (!use_fallback) 00068 lcc->fallback = (iconv_t)-1; 00069 else { 00070 lcc->fallback = iconv_open("UTF-8//IGNORE", "UTF-8"); 00071 if (lcc->fallback == (iconv_t)-1) { 00072 perror("ERROR: could not create conversion fallback"); 00073 goto error_fallback; 00074 } 00075 } 00076 00077 lcc->size = 0; 00078 lcc->convs = NULL; 00079 lcc->names = NULL; 00080 return lcc; 00081 00082 error_fallback: 00083 if (lcc->check != (iconv_t)-1) 00084 iconv_close(lcc->check); 00085 error_check: 00086 free(lcc); 00087 00088 return NULL; 00089 } 00090 00100 lms_charset_conv_t * 00101 lms_charset_conv_new(void) 00102 { 00103 return lms_charset_conv_new_full(1, 1); 00104 } 00105 00111 void 00112 lms_charset_conv_free(lms_charset_conv_t *lcc) 00113 { 00114 int i; 00115 00116 if (!lcc) 00117 return; 00118 00119 if (lcc->check != (iconv_t)-1) 00120 iconv_close(lcc->check); 00121 if (lcc->fallback != (iconv_t)-1) 00122 iconv_close(lcc->fallback); 00123 00124 for (i = 0; i < lcc->size; i++) { 00125 iconv_close(lcc->convs[i]); 00126 free(lcc->names[i]); 00127 } 00128 00129 if (lcc->convs) 00130 free(lcc->convs); 00131 if (lcc->names) 00132 free(lcc->names); 00133 free(lcc); 00134 } 00135 00144 int 00145 lms_charset_conv_add(lms_charset_conv_t *lcc, const char *charset) 00146 { 00147 iconv_t cd, *convs; 00148 char **names; 00149 int idx, ns; 00150 00151 if (!lcc) 00152 return -1; 00153 00154 if (!charset) 00155 return -2; 00156 00157 cd = iconv_open("UTF-8", charset); 00158 if (cd == (iconv_t)-1) { 00159 fprintf(stderr, "ERROR: could not add conversion charset '%s': %s\n", 00160 charset, strerror(errno)); 00161 return -3; 00162 } 00163 00164 idx = lcc->size; 00165 ns = lcc->size + 1; 00166 00167 convs = realloc(lcc->convs, ns * sizeof(*convs)); 00168 if (!convs) 00169 goto realloc_error; 00170 lcc->convs = convs; 00171 lcc->convs[idx] = cd; 00172 00173 names = realloc(lcc->names, ns * sizeof(*names)); 00174 if (!names) 00175 goto realloc_error; 00176 lcc->names = names; 00177 lcc->names[idx] = strdup(charset); 00178 if (!lcc->names[idx]) 00179 goto realloc_error; 00180 00181 lcc->size = ns; 00182 return 0; 00183 00184 realloc_error: 00185 perror("realloc"); 00186 iconv_close(cd); 00187 return -4; 00188 } 00189 00190 static int 00191 _find(const lms_charset_conv_t *lcc, const char *charset) 00192 { 00193 int i; 00194 00195 for (i = 0; i < lcc->size; i++) 00196 if (strcmp(lcc->names[i], charset) == 0) 00197 return i; 00198 00199 return -1; 00200 } 00201 00210 int 00211 lms_charset_conv_del(lms_charset_conv_t *lcc, const char *charset) 00212 { 00213 iconv_t *convs; 00214 char **names; 00215 int idx; 00216 00217 if (!lcc) 00218 return -1; 00219 00220 if (!charset) 00221 return -2; 00222 00223 idx = _find(lcc, charset); 00224 if (idx < 0) { 00225 fprintf(stderr, "ERROR: could not find charset '%s'\n", charset); 00226 return -3; 00227 } 00228 00229 iconv_close(lcc->convs[idx]); 00230 free(lcc->names[idx]); 00231 00232 lcc->size--; 00233 for (; idx < lcc->size; idx++) { 00234 lcc->convs[idx] = lcc->convs[idx + 1]; 00235 lcc->names[idx] = lcc->names[idx + 1]; 00236 } 00237 00238 convs = realloc(lcc->convs, lcc->size * sizeof(*convs)); 00239 if (convs) 00240 lcc->convs = convs; 00241 else 00242 perror("could not realloc 'convs'"); 00243 00244 names = realloc(lcc->names, lcc->size * sizeof(*names)); 00245 if (names) 00246 lcc->names = names; 00247 else 00248 perror("could not realloc 'names'"); 00249 00250 return 0; 00251 } 00252 00253 static int 00254 _check(lms_charset_conv_t *lcc, const char *istr, unsigned int ilen, char *ostr, unsigned int olen) 00255 { 00256 char *inbuf, *outbuf; 00257 size_t r, inlen, outlen; 00258 00259 if (lcc->check == (iconv_t)-1) 00260 return -1; 00261 00262 inbuf = (char *)istr; 00263 inlen = ilen; 00264 outbuf = ostr; 00265 outlen = olen; 00266 00267 iconv(lcc->check, NULL, NULL, NULL, NULL); 00268 r = iconv(lcc->check, &inbuf, &inlen, &outbuf, &outlen); 00269 if (r == (size_t)-1) 00270 return -1; 00271 else 00272 return 0; 00273 } 00274 00275 static int 00276 _conv(iconv_t cd, char **p_str, unsigned int *p_len, char *ostr, unsigned int olen) 00277 { 00278 char *inbuf, *outbuf; 00279 size_t r, inlen, outlen; 00280 00281 inbuf = *p_str; 00282 inlen = *p_len; 00283 outbuf = ostr; 00284 outlen = olen; 00285 00286 iconv(cd, NULL, NULL, NULL, NULL); 00287 r = iconv(cd, &inbuf, &inlen, &outbuf, &outlen); 00288 if (r == (size_t)-1) 00289 return -1; 00290 00291 *p_len = olen - outlen; 00292 free(*p_str); 00293 *p_str = ostr; 00294 00295 outbuf = realloc(*p_str, *p_len + 1); 00296 if (!outbuf) 00297 perror("realloc"); 00298 else 00299 *p_str = outbuf; 00300 00301 (*p_str)[*p_len] = '\0'; 00302 00303 return 0; 00304 } 00305 00306 static void 00307 _fix_non_ascii(char *s, int len) 00308 { 00309 for (; len > 0; len--, s++) 00310 if (!isprint(*s)) 00311 *s = '?'; 00312 } 00313 00327 int 00328 lms_charset_conv(lms_charset_conv_t *lcc, char **p_str, unsigned int *p_len) 00329 { 00330 char *outstr; 00331 int i, outlen; 00332 00333 if (!lcc) 00334 return -1; 00335 if (!p_str) 00336 return -2; 00337 if (!p_len) 00338 return -3; 00339 if (!*p_str || !*p_len) 00340 return 0; 00341 00342 outlen = 2 * *p_len; 00343 outstr = malloc(outlen + 1); 00344 if (!outstr) { 00345 perror("malloc"); 00346 return -4; 00347 } 00348 00349 if (_check(lcc, *p_str, *p_len, outstr, outlen) == 0) { 00350 free(outstr); 00351 return 0; 00352 } 00353 00354 for (i = 0; i < lcc->size; i++) 00355 if (_conv(lcc->convs[i], p_str, p_len, outstr, outlen) == 0) 00356 return 0; 00357 00358 if (lcc->fallback == (iconv_t)-1) 00359 return -5; 00360 00361 fprintf(stderr, 00362 "WARNING: could not convert '%*s' to any charset, use fallback\n", 00363 *p_len, *p_str); 00364 i = _conv(lcc->fallback, p_str, p_len, outstr, outlen); 00365 if (i < 0) { 00366 _fix_non_ascii(*p_str, *p_len); 00367 free(outstr); 00368 } 00369 return i; 00370 } 00371 00385 int 00386 lms_charset_conv_force(lms_charset_conv_t *lcc, char **p_str, unsigned int *p_len) 00387 { 00388 char *outstr; 00389 int i, outlen; 00390 00391 if (!lcc) 00392 return -1; 00393 if (!p_str) 00394 return -2; 00395 if (!p_len) 00396 return -3; 00397 if (!*p_str || !*p_len) 00398 return 0; 00399 00400 outlen = 2 * *p_len; 00401 outstr = malloc(outlen + 1); 00402 if (!outstr) { 00403 perror("malloc"); 00404 return -4; 00405 } 00406 00407 for (i = 0; i < lcc->size; i++) 00408 if (_conv(lcc->convs[i], p_str, p_len, outstr, outlen) == 0) 00409 return 0; 00410 00411 if (lcc->fallback == (iconv_t)-1) 00412 return -5; 00413 00414 fprintf(stderr, 00415 "WARNING: could not convert '%*s' to any charset, use fallback\n", 00416 *p_len, *p_str); 00417 i = _conv(lcc->fallback, p_str, p_len, outstr, outlen); 00418 if (i < 0) { 00419 _fix_non_ascii(*p_str, *p_len); 00420 free(outstr); 00421 } 00422 return i; 00423 } 00424 00439 int 00440 lms_charset_conv_check(lms_charset_conv_t *lcc, const char *str, unsigned int len) 00441 { 00442 char *outstr; 00443 int r, outlen; 00444 00445 if (!lcc) 00446 return -1; 00447 if (!str || !len) 00448 return 0; 00449 00450 outlen = 2 * len; 00451 outstr = malloc(outlen); 00452 if (!outstr) { 00453 perror("malloc"); 00454 return -2; 00455 } 00456 00457 r = _check(lcc, str, len, outstr, outlen); 00458 free(outstr); 00459 return r; 00460 }