git.maemo.org Git - qstardict/blob - plugins/stardict/lib.cpp

   1 #ifdef HAVE_CONFIG_H
   2 #  include "config.h"
   3 #endif
   4
   5 #include <algorithm>
   6 #include <cstring>
   7 #include <cctype>
   8
   9 #include <sys/stat.h>
  10 #include <zlib.h>
  11 #include <glib/gstdio.h>
  12
  13 #include "distance.h"
  14 #include "file.hpp"
  15 #include "mapfile.hpp"
  16
  17 #include "lib.h"
  18
  19 // Notice: read src/tools/DICTFILE_FORMAT for the dictionary
  20 // file's format information!
  21
  22
  23 static inline bool bIsVowel(gchar inputchar)
  24 {
  25     gchar ch = g_ascii_toupper(inputchar);
  26     return ( ch == 'A' || ch == 'E' || ch == 'I' || ch == 'O' || ch == 'U' );
  27 }
  28
  29 static bool bIsPureEnglish(const gchar *str)
  30 {
  31     // i think this should work even when it is UTF8 string :).
  32     for (int i = 0; str[i] != 0; i++)
  33         //if(str[i]<0)
  34         //if(str[i]<32 || str[i]>126) // tab equal 9,so this is not OK.
  35         // Better use isascii() but not str[i]<0 while char is default unsigned in arm
  36         if (!isascii(str[i]))
  37             return false;
  38     return true;
  39 }
  40
  41 static inline gint stardict_strcmp(const gchar *s1, const gchar *s2)
  42 {
  43     gint a = g_ascii_strcasecmp(s1, s2);
  44     if (a == 0)
  45         return strcmp(s1, s2);
  46     else
  47         return a;
  48 }
  49
  50 bool DictInfo::load_from_ifo_file(const std::string& ifofilename,
  51                                   bool istreedict)
  52 {
  53     ifo_file_name = ifofilename;
  54     gchar *buffer;
  55     if (!g_file_get_contents(ifofilename.c_str(), &buffer, NULL, NULL))
  56         return false;
  57
  58 #define TREEDICT_MAGIC_DATA "StarDict's treedict ifo file\nversion=2.4.2\n"
  59 #define DICT_MAGIC_DATA "StarDict's dict ifo file\nversion=2.4.2\n"
  60
  61     const gchar *magic_data = istreedict ? TREEDICT_MAGIC_DATA : DICT_MAGIC_DATA;
  62     if (!g_str_has_prefix(buffer, magic_data))
  63     {
  64         g_free(buffer);
  65         return false;
  66     }
  67
  68     gchar *p1, *p2, *p3;
  69
  70     p1 = buffer + strlen(magic_data) - 1;
  71
  72     p2 = strstr(p1, "\nwordcount=");
  73     if (!p2)
  74     {
  75         g_free(buffer);
  76         return false;
  77     }
  78
  79     p3 = strchr(p2 + sizeof("\nwordcount=") - 1, '\n');
  80     gchar *tmpstr = (gchar *)g_memdup(p2 + sizeof("\nwordcount=") - 1, p3 - (p2 + sizeof("\nwordcount=") - 1) + 1);
  81     tmpstr[p3 - (p2 + sizeof("\nwordcount=") - 1)] = '\0';
  82     wordcount = atol(tmpstr);
  83     g_free(tmpstr);
  84
  85     if (istreedict)
  86     {
  87         p2 = strstr(p1, "\ntdxfilesize=");
  88         if (!p2)
  89         {
  90             g_free(buffer);
  91             return false;
  92         }
  93         p3 = strchr(p2 + sizeof("\ntdxfilesize=") - 1, '\n');
  94         tmpstr = (gchar *)g_memdup(p2 + sizeof("\ntdxfilesize=") - 1, p3 - (p2 + sizeof("\ntdxfilesize=") - 1) + 1);
  95         tmpstr[p3 - (p2 + sizeof("\ntdxfilesize=") - 1)] = '\0';
  96         index_file_size = atol(tmpstr);
  97         g_free(tmpstr);
  98     }
  99     else
 100     {
 101
 102         p2 = strstr(p1, "\nidxfilesize=");
 103         if (!p2)
 104         {
 105             g_free(buffer);
 106             return false;
 107         }
 108
 109         p3 = strchr(p2 + sizeof("\nidxfilesize=") - 1, '\n');
 110         tmpstr = (gchar *)g_memdup(p2 + sizeof("\nidxfilesize=") - 1, p3 - (p2 + sizeof("\nidxfilesize=") - 1) + 1);
 111         tmpstr[p3 - (p2 + sizeof("\nidxfilesize=") - 1)] = '\0';
 112         index_file_size = atol(tmpstr);
 113         g_free(tmpstr);
 114     }
 115
 116     p2 = strstr(p1, "\nbookname=");
 117
 118     if (!p2)
 119     {
 120         g_free(buffer);
 121         return false;
 122     }
 123
 124     p2 = p2 + sizeof("\nbookname=") - 1;
 125     p3 = strchr(p2, '\n');
 126     bookname.assign(p2, p3 - p2);
 127
 128     p2 = strstr(p1, "\nauthor=");
 129     if (p2)
 130     {
 131         p2 = p2 + sizeof("\nauthor=") - 1;
 132         p3 = strchr(p2, '\n');
 133         author.assign(p2, p3 - p2);
 134     }
 135
 136     p2 = strstr(p1, "\nemail=");
 137     if (p2)
 138     {
 139         p2 = p2 + sizeof("\nemail=") - 1;
 140         p3 = strchr(p2, '\n');
 141         email.assign(p2, p3 - p2);
 142     }
 143
 144     p2 = strstr(p1, "\nwebsite=");
 145     if (p2)
 146     {
 147         p2 = p2 + sizeof("\nwebsite=") - 1;
 148         p3 = strchr(p2, '\n');
 149         website.assign(p2, p3 - p2);
 150     }
 151
 152     p2 = strstr(p1, "\ndate=");
 153     if (p2)
 154     {
 155         p2 = p2 + sizeof("\ndate=") - 1;
 156         p3 = strchr(p2, '\n');
 157         date.assign(p2, p3 - p2);
 158     }
 159
 160     p2 = strstr(p1, "\ndescription=");
 161     if (p2)
 162     {
 163         p2 = p2 + sizeof("\ndescription=") - 1;
 164         p3 = strchr(p2, '\n');
 165         description.assign(p2, p3 - p2);
 166     }
 167
 168     p2 = strstr(p1, "\nsametypesequence=");
 169     if (p2)
 170     {
 171         p2 += sizeof("\nsametypesequence=") - 1;
 172         p3 = strchr(p2, '\n');
 173         sametypesequence.assign(p2, p3 - p2);
 174     }
 175
 176     g_free(buffer);
 177
 178     return true;
 179 }
 180 //===================================================================
 181 DictBase::DictBase()
 182 {
 183     dictfile = NULL;
 184     cache_cur = 0;
 185 }
 186
 187 DictBase::~DictBase()
 188 {
 189     if (dictfile)
 190         fclose(dictfile);
 191 }
 192
 193 gchar* DictBase::GetWordData(guint32 idxitem_offset, guint32 idxitem_size)
 194 {
 195     for (int i = 0; i < WORDDATA_CACHE_NUM; i++)
 196         if (cache[i].data && cache[i].offset == idxitem_offset)
 197             return cache[i].data;
 198
 199     if (dictfile)
 200         fseek(dictfile, idxitem_offset, SEEK_SET);
 201
 202     gchar *data;
 203     if (!sametypesequence.empty())
 204     {
 205         gchar *origin_data = (gchar *)g_malloc(idxitem_size);
 206
 207         if (dictfile)
 208             fread(origin_data, idxitem_size, 1, dictfile);
 209         else
 210             dictdzfile->read(origin_data, idxitem_offset, idxitem_size);
 211
 212         guint32 data_size;
 213         gint sametypesequence_len = sametypesequence.length();
 214         //there have sametypesequence_len char being omitted.
 215         data_size = idxitem_size + sizeof(guint32) + sametypesequence_len;
 216         //if the last item's size is determined by the end up '\0',then +=sizeof(gchar);
 217         //if the last item's size is determined by the head guint32 type data,then +=sizeof(guint32);
 218         switch (sametypesequence[sametypesequence_len - 1])
 219         {
 220         case 'm':
 221         case 't':
 222         case 'y':
 223         case 'l':
 224         case 'g':
 225         case 'x':
 226             data_size += sizeof(gchar);
 227             break;
 228         case 'W':
 229         case 'P':
 230             data_size += sizeof(guint32);
 231             break;
 232         default:
 233             if (g_ascii_isupper(sametypesequence[sametypesequence_len - 1]))
 234                 data_size += sizeof(guint32);
 235             else
 236                 data_size += sizeof(gchar);
 237             break;
 238         }
 239         data = (gchar *)g_malloc(data_size);
 240         gchar *p1, *p2;
 241         p1 = data + sizeof(guint32);
 242         p2 = origin_data;
 243         guint32 sec_size;
 244         //copy the head items.
 245         for (int i = 0; i < sametypesequence_len - 1; i++)
 246         {
 247             *p1 = sametypesequence[i];
 248             p1 += sizeof(gchar);
 249             switch (sametypesequence[i])
 250             {
 251             case 'm':
 252             case 't':
 253             case 'y':
 254             case 'l':
 255             case 'g':
 256             case 'x':
 257                 sec_size = strlen(p2) + 1;
 258                 memcpy(p1, p2, sec_size);
 259                 p1 += sec_size;
 260                 p2 += sec_size;
 261                 break;
 262             case 'W':
 263             case 'P':
 264                 sec_size = *reinterpret_cast<guint32 *>(p2);
 265                 sec_size += sizeof(guint32);
 266                 memcpy(p1, p2, sec_size);
 267                 p1 += sec_size;
 268                 p2 += sec_size;
 269                 break;
 270             default:
 271                 if (g_ascii_isupper(sametypesequence[i]))
 272                 {
 273                     sec_size = *reinterpret_cast<guint32 *>(p2);
 274                     sec_size += sizeof(guint32);
 275                 }
 276                 else
 277                 {
 278                     sec_size = strlen(p2) + 1;
 279                 }
 280                 memcpy(p1, p2, sec_size);
 281                 p1 += sec_size;
 282                 p2 += sec_size;
 283                 break;
 284             }
 285         }
 286         //calculate the last item 's size.
 287         sec_size = idxitem_size - (p2 - origin_data);
 288         *p1 = sametypesequence[sametypesequence_len - 1];
 289         p1 += sizeof(gchar);
 290         switch (sametypesequence[sametypesequence_len - 1])
 291         {
 292         case 'm':
 293         case 't':
 294         case 'y':
 295         case 'l':
 296         case 'g':
 297         case 'x':
 298             memcpy(p1, p2, sec_size);
 299             p1 += sec_size;
 300             *p1 = '\0'; //add the end up '\0';
 301             break;
 302         case 'W':
 303         case 'P':
 304             *reinterpret_cast<guint32 *>(p1) = sec_size;
 305             p1 += sizeof(guint32);
 306             memcpy(p1, p2, sec_size);
 307             break;
 308         default:
 309             if (g_ascii_isupper(sametypesequence[sametypesequence_len - 1]))
 310             {
 311                 *reinterpret_cast<guint32 *>(p1) = sec_size;
 312                 p1 += sizeof(guint32);
 313                 memcpy(p1, p2, sec_size);
 314             }
 315             else
 316             {
 317                 memcpy(p1, p2, sec_size);
 318                 p1 += sec_size;
 319                 *p1 = '\0';
 320             }
 321             break;
 322         }
 323         g_free(origin_data);
 324         *reinterpret_cast<guint32 *>(data) = data_size;
 325     }
 326     else
 327     {
 328         data = (gchar *)g_malloc(idxitem_size + sizeof(guint32));
 329         if (dictfile)
 330             fread(data + sizeof(guint32), idxitem_size, 1, dictfile);
 331         else
 332             dictdzfile->read(data + sizeof(guint32), idxitem_offset, idxitem_size);
 333         *reinterpret_cast<guint32 *>(data) = idxitem_size + sizeof(guint32);
 334     }
 335     g_free(cache[cache_cur].data);
 336
 337     cache[cache_cur].data = data;
 338     cache[cache_cur].offset = idxitem_offset;
 339     cache_cur++;
 340     if (cache_cur == WORDDATA_CACHE_NUM)
 341         cache_cur = 0;
 342     return data;
 343 }
 344
 345 inline bool DictBase::containSearchData()
 346 {
 347     if (sametypesequence.empty())
 348         return true;
 349
 350     return sametypesequence.find_first_of("mlgxty") != std::string::npos;
 351 }
 352
 353 bool DictBase::SearchData(std::vector<std::string> &SearchWords, guint32 idxitem_offset, guint32 idxitem_size, gchar *origin_data)
 354 {
 355     int nWord = SearchWords.size();
 356     std::vector<bool> WordFind(nWord, false);
 357     int nfound = 0;
 358
 359     if (dictfile)
 360         fseek(dictfile, idxitem_offset, SEEK_SET);
 361     if (dictfile)
 362         fread(origin_data, idxitem_size, 1, dictfile);
 363     else
 364         dictdzfile->read(origin_data, idxitem_offset, idxitem_size);
 365     gchar *p = origin_data;
 366     guint32 sec_size;
 367     int j;
 368     if (!sametypesequence.empty())
 369     {
 370         gint sametypesequence_len = sametypesequence.length();
 371         for (int i = 0; i < sametypesequence_len - 1; i++)
 372         {
 373             switch (sametypesequence[i])
 374             {
 375             case 'm':
 376             case 't':
 377             case 'y':
 378             case 'l':
 379             case 'g':
 380             case 'x':
 381                 for (j = 0; j < nWord; j++)
 382                     if (!WordFind[j] && strstr(p, SearchWords[j].c_str()))
 383                     {
 384                         WordFind[j] = true;
 385                         ++nfound;
 386                     }
 387
 388
 389                 if (nfound == nWord)
 390                     return true;
 391                 sec_size = strlen(p) + 1;
 392                 p += sec_size;
 393                 break;
 394             default:
 395                 if (g_ascii_isupper(sametypesequence[i]))
 396                 {
 397                     sec_size = *reinterpret_cast<guint32 *>(p);
 398                     sec_size += sizeof(guint32);
 399                 }
 400                 else
 401                 {
 402                     sec_size = strlen(p) + 1;
 403                 }
 404                 p += sec_size;
 405             }
 406         }
 407         switch (sametypesequence[sametypesequence_len - 1])
 408         {
 409         case 'm':
 410         case 't':
 411         case 'y':
 412         case 'l':
 413         case 'g':
 414         case 'x':
 415             sec_size = idxitem_size - (p - origin_data);
 416             for (j = 0; j < nWord; j++)
 417                 if (!WordFind[j] &&
 418                         g_strstr_len(p, sec_size, SearchWords[j].c_str()))
 419                 {
 420                     WordFind[j] = true;
 421                     ++nfound;
 422                 }
 423
 424
 425             if (nfound == nWord)
 426                 return true;
 427             break;
 428         }
 429     }
 430     else
 431     {
 432         while (guint32(p - origin_data) < idxitem_size)
 433         {
 434             switch (*p)
 435             {
 436             case 'm':
 437             case 't':
 438             case 'y':
 439             case 'l':
 440             case 'g':
 441             case 'x':
 442                 for (j = 0; j < nWord; j++)
 443                     if (!WordFind[j] && strstr(p, SearchWords[j].c_str()))
 444                     {
 445                         WordFind[j] = true;
 446                         ++nfound;
 447                     }
 448
 449                 if (nfound == nWord)
 450                     return true;
 451                 sec_size = strlen(p) + 1;
 452                 p += sec_size;
 453                 break;
 454             default:
 455                 if (g_ascii_isupper(*p))
 456                 {
 457                     sec_size = *reinterpret_cast<guint32 *>(p);
 458                     sec_size += sizeof(guint32);
 459                 }
 460                 else
 461                 {
 462                     sec_size = strlen(p) + 1;
 463                 }
 464                 p += sec_size;
 465             }
 466         }
 467     }
 468     return false;
 469 }
 470
 471 class offset_index : public index_file
 472 {
 473     public:
 474         offset_index() : idxfile(NULL)
 475         {}
 476         ~offset_index();
 477         bool load(const std::string& url, gulong wc, gulong fsize);
 478         const gchar *get_key(glong idx);
 479         void get_data(glong idx);
 480         const gchar *get_key_and_data(glong idx);
 481         bool lookup(const char *str, glong &idx);
 482     private:
 483         static const gint ENTR_PER_PAGE = 32;
 484         static const char *CACHE_MAGIC;
 485
 486         std::vector<guint32> wordoffset;
 487         FILE *idxfile;
 488         gulong wordcount;
 489
 490         gchar wordentry_buf[256 + sizeof(guint32)*2]; // The length of "word_str" should be less than 256. See src/tools/DICTFILE_FORMAT.
 491         struct index_entry
 492         {
 493             glong idx;
 494             std::string keystr;
 495             void assign(glong i, const std::string& str)
 496             {
 497                 idx = i;
 498                 keystr.assign(str);
 499             }
 500         };
 501         index_entry first, last, middle, real_last;
 502
 503         struct page_entry
 504         {
 505             gchar *keystr;
 506             guint32 off, size;
 507         };
 508         std::vector<gchar> page_data;
 509         struct page_t
 510         {
 511             glong idx;
 512             page_entry entries[ENTR_PER_PAGE];
 513
 514             page_t(): idx( -1)
 515             {}
 516             void fill(gchar *data, gint nent, glong idx_);
 517         }
 518         page;
 519         gulong load_page(glong page_idx);
 520         const gchar *read_first_on_page_key(glong page_idx);
 521         const gchar *get_first_on_page_key(glong page_idx);
 522         bool load_cache(const std::string& url);
 523         bool save_cache(const std::string& url);
 524         static strlist_t get_cache_variant(const std::string& url);
 525 };
 526
 527 const char *offset_index::CACHE_MAGIC = "StarDict's Cache, Version: 0.1";
 528
 529 class wordlist_index : public index_file
 530 {
 531     public:
 532         wordlist_index() : idxdatabuf(NULL)
 533         {}
 534         ~wordlist_index();
 535         bool load(const std::string& url, gulong wc, gulong fsize);
 536         const gchar *get_key(glong idx);
 537         void get_data(glong idx);
 538         const gchar *get_key_and_data(glong idx);
 539         bool lookup(const char *str, glong &idx);
 540     private:
 541         gchar *idxdatabuf;
 542         std::vector<gchar *> wordlist;
 543 };
 544
 545 void offset_index::page_t::fill(gchar *data, gint nent, glong idx_)
 546 {
 547     idx = idx_;
 548     gchar *p = data;
 549     glong len;
 550     for (gint i = 0; i < nent; ++i)
 551     {
 552         entries[i].keystr = p;
 553         len = strlen(p);
 554         p += len + 1;
 555         entries[i].off = g_ntohl(*reinterpret_cast<guint32 *>(p));
 556         p += sizeof(guint32);
 557         entries[i].size = g_ntohl(*reinterpret_cast<guint32 *>(p));
 558         p += sizeof(guint32);
 559     }
 560 }
 561
 562 offset_index::~offset_index()
 563 {
 564     if (idxfile)
 565         fclose(idxfile);
 566 }
 567
 568 inline const gchar *offset_index::read_first_on_page_key(glong page_idx)
 569 {
 570     fseek(idxfile, wordoffset[page_idx], SEEK_SET);
 571     guint page_size = wordoffset[page_idx + 1] - wordoffset[page_idx];
 572     fread(wordentry_buf, std::min<guint>(sizeof(wordentry_buf), page_size), 1, idxfile); //TODO: check returned values, deal with word entry that strlen>255.
 573     return wordentry_buf;
 574 }
 575
 576 inline const gchar *offset_index::get_first_on_page_key(glong page_idx)
 577 {
 578     if (page_idx < middle.idx)
 579     {
 580         if (page_idx == first.idx)
 581             return first.keystr.c_str();
 582         return read_first_on_page_key(page_idx);
 583     }
 584     else if (page_idx > middle.idx)
 585     {
 586         if (page_idx == last.idx)
 587             return last.keystr.c_str();
 588         return read_first_on_page_key(page_idx);
 589     }
 590     else
 591         return middle.keystr.c_str();
 592 }
 593
 594 bool offset_index::load_cache(const std::string& url)
 595 {
 596     strlist_t vars = get_cache_variant(url);
 597
 598     for (strlist_t::const_iterator it = vars.begin(); it != vars.end(); ++it)
 599     {
 600         struct stat idxstat, cachestat;
 601         if (g_stat(url.c_str(), &idxstat) != 0 ||
 602                 g_stat(it->c_str(), &cachestat) != 0)
 603             continue;
 604         if (cachestat.st_mtime < idxstat.st_mtime)
 605             continue;
 606         MapFile mf;
 607         if (!mf.open(it->c_str(), cachestat.st_size))
 608             continue;
 609         if (strncmp(mf.begin(), CACHE_MAGIC, strlen(CACHE_MAGIC)) != 0)
 610             continue;
 611         memcpy(&wordoffset[0], mf.begin() + strlen(CACHE_MAGIC), wordoffset.size()*sizeof(wordoffset[0]));
 612         return true;
 613
 614     }
 615
 616     return false;
 617 }
 618
 619 strlist_t offset_index::get_cache_variant(const std::string& url)
 620 {
 621     strlist_t res;
 622     res.push_back(url + ".oft");
 623     if (!g_file_test(g_get_user_cache_dir(), G_FILE_TEST_EXISTS) &&
 624             g_mkdir(g_get_user_cache_dir(), 0700) == -1)
 625         return res;
 626
 627     std::string cache_dir = std::string(g_get_user_cache_dir()) + G_DIR_SEPARATOR_S + "sdcv";
 628
 629     if (!g_file_test(cache_dir.c_str(), G_FILE_TEST_EXISTS))
 630     {
 631         if (g_mkdir(cache_dir.c_str(), 0700) == -1)
 632             return res;
 633     }
 634     else if (!g_file_test(cache_dir.c_str(), G_FILE_TEST_IS_DIR))
 635         return res;
 636
 637     gchar *base = g_path_get_basename(url.c_str());
 638     res.push_back(cache_dir + G_DIR_SEPARATOR_S + base + ".oft");
 639     g_free(base);
 640     return res;
 641 }
 642
 643 bool offset_index::save_cache(const std::string& url)
 644 {
 645     strlist_t vars = get_cache_variant(url);
 646     for (strlist_t::const_iterator it = vars.begin(); it != vars.end(); ++it)
 647     {
 648         FILE *out = fopen(it->c_str(), "wb");
 649         if (!out)
 650             continue;
 651         if (fwrite(CACHE_MAGIC, 1, strlen(CACHE_MAGIC), out) != strlen(CACHE_MAGIC))
 652             continue;
 653         if (fwrite(&wordoffset[0], sizeof(wordoffset[0]), wordoffset.size(), out) != wordoffset.size())
 654             continue;
 655         fclose(out);
 656         printf("save to cache %s\n", url.c_str());
 657         return true;
 658     }
 659     return false;
 660 }
 661
 662 bool offset_index::load(const std::string& url, gulong wc, gulong fsize)
 663 {
 664     wordcount = wc;
 665     gulong npages = (wc - 1) / ENTR_PER_PAGE + 2;
 666     wordoffset.resize(npages);
 667     if (!load_cache(url))
 668     { //map file will close after finish of block
 669         MapFile map_file;
 670         if (!map_file.open(url.c_str(), fsize))
 671             return false;
 672         const gchar *idxdatabuffer = map_file.begin();
 673
 674         const gchar *p1 = idxdatabuffer;
 675         gulong index_size;
 676         guint32 j = 0;
 677         for (guint32 i = 0; i < wc; i++)
 678         {
 679             index_size = strlen(p1) + 1 + 2 * sizeof(guint32);
 680             if (i % ENTR_PER_PAGE == 0)
 681             {
 682                 wordoffset[j] = p1 - idxdatabuffer;
 683                 ++j;
 684             }
 685             p1 += index_size;
 686         }
 687         wordoffset[j] = p1 - idxdatabuffer;
 688         if (!save_cache(url))
 689             fprintf(stderr, "cache update failed\n");
 690     }
 691
 692     if (!(idxfile = fopen(url.c_str(), "rb")))
 693     {
 694         wordoffset.resize(0);
 695         return false;
 696     }
 697
 698     first.assign(0, read_first_on_page_key(0));
 699     last.assign(wordoffset.size() - 2, read_first_on_page_key(wordoffset.size() - 2));
 700     middle.assign((wordoffset.size() - 2) / 2, read_first_on_page_key((wordoffset.size() - 2) / 2));
 701     real_last.assign(wc - 1, get_key(wc - 1));
 702
 703     return true;
 704 }
 705
 706 inline gulong offset_index::load_page(glong page_idx)
 707 {
 708     gulong nentr = ENTR_PER_PAGE;
 709     if (page_idx == glong(wordoffset.size() - 2))
 710         if ((nentr = wordcount % ENTR_PER_PAGE) == 0)
 711             nentr = ENTR_PER_PAGE;
 712
 713
 714     if (page_idx != page.idx)
 715     {
 716         page_data.resize(wordoffset[page_idx + 1] - wordoffset[page_idx]);
 717         fseek(idxfile, wordoffset[page_idx], SEEK_SET);
 718         fread(&page_data[0], 1, page_data.size(), idxfile);
 719         page.fill(&page_data[0], nentr, page_idx);
 720     }
 721
 722     return nentr;
 723 }
 724
 725 const gchar *offset_index::get_key(glong idx)
 726 {
 727     load_page(idx / ENTR_PER_PAGE);
 728     glong idx_in_page = idx % ENTR_PER_PAGE;
 729     wordentry_offset = page.entries[idx_in_page].off;
 730     wordentry_size = page.entries[idx_in_page].size;
 731
 732     return page.entries[idx_in_page].keystr;
 733 }
 734
 735 void offset_index::get_data(glong idx)
 736 {
 737     get_key(idx);
 738 }
 739
 740 const gchar *offset_index::get_key_and_data(glong idx)
 741 {
 742     return get_key(idx);
 743 }
 744
 745 bool offset_index::lookup(const char *str, glong &idx)
 746 {
 747     bool bFound = false;
 748     glong iFrom;
 749     glong iTo = wordoffset.size() - 2;
 750     gint cmpint;
 751     glong iThisIndex;
 752     if (stardict_strcmp(str, first.keystr.c_str()) < 0)
 753     {
 754         idx = 0;
 755         return false;
 756     }
 757     else if (stardict_strcmp(str, real_last.keystr.c_str()) > 0)
 758     {
 759         idx = INVALID_INDEX;
 760         return false;
 761     }
 762     else
 763     {
 764         iFrom = 0;
 765         iThisIndex = 0;
 766         while (iFrom <= iTo)
 767         {
 768             iThisIndex = (iFrom + iTo) / 2;
 769             cmpint = stardict_strcmp(str, get_first_on_page_key(iThisIndex));
 770             if (cmpint > 0)
 771                 iFrom = iThisIndex + 1;
 772             else if (cmpint < 0)
 773                 iTo = iThisIndex - 1;
 774             else
 775             {
 776                 bFound = true;
 777                 break;
 778             }
 779         }
 780         if (!bFound)
 781             idx = iTo;    //prev
 782         else
 783             idx = iThisIndex;
 784     }
 785     if (!bFound)
 786     {
 787         gulong netr = load_page(idx);
 788         iFrom = 1; // Needn't search the first word anymore.
 789         iTo = netr - 1;
 790         iThisIndex = 0;
 791         while (iFrom <= iTo)
 792         {
 793             iThisIndex = (iFrom + iTo) / 2;
 794             cmpint = stardict_strcmp(str, page.entries[iThisIndex].keystr);
 795             if (cmpint > 0)
 796                 iFrom = iThisIndex + 1;
 797             else if (cmpint < 0)
 798                 iTo = iThisIndex - 1;
 799             else
 800             {
 801                 bFound = true;
 802                 break;
 803             }
 804         }
 805         idx *= ENTR_PER_PAGE;
 806         if (!bFound)
 807             idx += iFrom;    //next
 808         else
 809             idx += iThisIndex;
 810     }
 811     else
 812     {
 813         idx *= ENTR_PER_PAGE;
 814     }
 815     return bFound;
 816 }
 817
 818 wordlist_index::~wordlist_index()
 819 {
 820     g_free(idxdatabuf);
 821 }
 822
 823 bool wordlist_index::load(const std::string& url, gulong wc, gulong fsize)
 824 {
 825     gzFile in = gzopen(url.c_str(), "rb");
 826     if (in == NULL)
 827         return false;
 828
 829     idxdatabuf = (gchar *)g_malloc(fsize);
 830
 831     gulong len = gzread(in, idxdatabuf, fsize);
 832     gzclose(in);
 833
 834     if (len != fsize)
 835         return false;
 836
 837     wordlist.resize(wc + 1);
 838     gchar *p1 = idxdatabuf;
 839     guint32 i;
 840     for (i = 0; i < wc; i++)
 841     {
 842         wordlist[i] = p1;
 843         p1 += strlen(p1) + 1 + 2 * sizeof(guint32);
 844     }
 845     wordlist[wc] = p1;
 846
 847     return true;
 848 }
 849
 850 const gchar *wordlist_index::get_key(glong idx)
 851 {
 852     return wordlist[idx];
 853 }
 854
 855 void wordlist_index::get_data(glong idx)
 856 {
 857     gchar *p1 = wordlist[idx] + strlen(wordlist[idx]) + sizeof(gchar);
 858     wordentry_offset = g_ntohl(*reinterpret_cast<guint32 *>(p1));
 859     p1 += sizeof(guint32);
 860     wordentry_size = g_ntohl(*reinterpret_cast<guint32 *>(p1));
 861 }
 862
 863 const gchar *wordlist_index::get_key_and_data(glong idx)
 864 {
 865     get_data(idx);
 866     return get_key(idx);
 867 }
 868
 869 bool wordlist_index::lookup(const char *str, glong &idx)
 870 {
 871     bool bFound = false;
 872     glong iTo = wordlist.size() - 2;
 873
 874     if (stardict_strcmp(str, get_key(0)) < 0)
 875     {
 876         idx = 0;
 877     }
 878     else if (stardict_strcmp(str, get_key(iTo)) > 0)
 879     {
 880         idx = INVALID_INDEX;
 881     }
 882     else
 883     {
 884         glong iThisIndex = 0;
 885         glong iFrom = 0;
 886         gint cmpint;
 887         while (iFrom <= iTo)
 888         {
 889             iThisIndex = (iFrom + iTo) / 2;
 890             cmpint = stardict_strcmp(str, get_key(iThisIndex));
 891             if (cmpint > 0)
 892                 iFrom = iThisIndex + 1;
 893             else if (cmpint < 0)
 894                 iTo = iThisIndex - 1;
 895             else
 896             {
 897                 bFound = true;
 898                 break;
 899             }
 900         }
 901         if (!bFound)
 902             idx = iFrom;    //next
 903         else
 904             idx = iThisIndex;
 905     }
 906     return bFound;
 907 }
 908
 909 //===================================================================
 910 bool Dict::load(const std::string& ifofilename)
 911 {
 912     gulong idxfilesize;
 913     if (!load_ifofile(ifofilename, idxfilesize))
 914         return false;
 915
 916     std::string fullfilename(ifofilename);
 917     fullfilename.replace(fullfilename.length() - sizeof("ifo") + 1, sizeof("ifo") - 1, "dict.dz");
 918
 919     if (g_file_test(fullfilename.c_str(), G_FILE_TEST_EXISTS))
 920     {
 921         dictdzfile.reset(new dictData);
 922         if (!dictdzfile->open(fullfilename, 0))
 923         {
 924             //g_print("open file %s failed!\n",fullfilename);
 925             return false;
 926         }
 927     }
 928     else
 929     {
 930         fullfilename.erase(fullfilename.length() - sizeof(".dz") + 1, sizeof(".dz") - 1);
 931         dictfile = fopen(fullfilename.c_str(), "rb");
 932         if (!dictfile)
 933         {
 934             //g_print("open file %s failed!\n",fullfilename);
 935             return false;
 936         }
 937     }
 938
 939     fullfilename = ifofilename;
 940     fullfilename.replace(fullfilename.length() - sizeof("ifo") + 1, sizeof("ifo") - 1, "idx.gz");
 941
 942     if (g_file_test(fullfilename.c_str(), G_FILE_TEST_EXISTS))
 943     {
 944         idx_file.reset(new wordlist_index);
 945     }
 946     else
 947     {
 948         fullfilename.erase(fullfilename.length() - sizeof(".gz") + 1, sizeof(".gz") - 1);
 949         idx_file.reset(new offset_index);
 950     }
 951
 952     if (!idx_file->load(fullfilename, wordcount, idxfilesize))
 953         return false;
 954
 955     //g_print("bookname: %s , wordcount %lu\n", bookname.c_str(), narticles());
 956     return true;
 957 }
 958
 959 bool Dict::load_ifofile(const std::string& ifofilename, gulong &idxfilesize)
 960 {
 961     DictInfo dict_info;
 962     if (!dict_info.load_from_ifo_file(ifofilename, false))
 963         return false;
 964     if (dict_info.wordcount == 0)
 965         return false;
 966
 967
 968
 969     ifo_file_name = dict_info.ifo_file_name;
 970     wordcount = dict_info.wordcount;
 971     bookname = dict_info.bookname;
 972
 973     idxfilesize = dict_info.index_file_size;
 974
 975     sametypesequence = dict_info.sametypesequence;
 976
 977     return true;
 978 }
 979
 980 bool Dict::LookupWithRule(GPatternSpec *pspec, glong *aIndex, int iBuffLen)
 981 {
 982     int iIndexCount = 0;
 983
 984     for (guint32 i = 0; i < narticles() && iIndexCount < iBuffLen - 1; i++)
 985         if (g_pattern_match_string(pspec, get_key(i)))
 986             aIndex[iIndexCount++] = i;
 987
 988     aIndex[iIndexCount] = -1; // -1 is the end.
 989
 990     return (iIndexCount > 0);
 991 }
 992
 993 //===================================================================
 994 Libs::Libs(progress_func_t f)
 995 {
 996     progress_func = f;
 997     iMaxFuzzyDistance = MAX_FUZZY_DISTANCE; //need to read from cfg.
 998 }
 999
1000 Libs::~Libs()
1001 {
1002     for (std::vector<Dict *>::iterator p = oLib.begin(); p != oLib.end(); ++p)
1003         delete *p;
1004 }
1005
1006 void Libs::load_dict(const std::string& url)
1007 {
1008     Dict *lib = new Dict;
1009     if (lib->load(url))
1010         oLib.push_back(lib);
1011     else
1012         delete lib;
1013 }
1014
1015 class DictLoader
1016 {
1017     public:
1018         DictLoader(Libs& lib_): lib(lib_)
1019         {}
1020         void operator()(const std::string& url, bool disable)
1021         {
1022             if (!disable)
1023                 lib.load_dict(url);
1024         }
1025     private:
1026         Libs& lib;
1027 };
1028
1029 void Libs::load(const strlist_t& dicts_dirs,
1030                 const strlist_t& order_list,
1031                 const strlist_t& disable_list)
1032 {
1033     for_each_file(dicts_dirs, ".ifo", order_list, disable_list,
1034                   DictLoader(*this));
1035 }
1036
1037 class DictReLoader
1038 {
1039     public:
1040         DictReLoader(std::vector<Dict *> &p, std::vector<Dict *> &f,
1041                      Libs& lib_) : prev(p), future(f), lib(lib_)
1042         {}
1043         void operator()(const std::string& url, bool disable)
1044         {
1045             if (!disable)
1046             {
1047                 Dict *dict = find(url);
1048                 if (dict)
1049                     future.push_back(dict);
1050                 else
1051                     lib.load_dict(url);
1052             }
1053         }
1054     private:
1055         std::vector<Dict *> &prev;
1056         std::vector<Dict *> &future;
1057         Libs& lib;
1058
1059         Dict *find(const std::string& url)
1060         {
1061             std::vector<Dict *>::iterator it;
1062             for (it = prev.begin(); it != prev.end(); ++it)
1063                 if ((*it)->ifofilename() == url)
1064                     break;
1065             if (it != prev.end())
1066             {
1067                 Dict *res = *it;
1068                 prev.erase(it);
1069                 return res;
1070             }
1071             return NULL;
1072         }
1073 };
1074
1075 void Libs::reload(const strlist_t& dicts_dirs,
1076                   const strlist_t& order_list,
1077                   const strlist_t& disable_list)
1078 {
1079     std::vector<Dict *> prev(oLib);
1080     oLib.clear();
1081     for_each_file(dicts_dirs, ".ifo", order_list, disable_list,
1082                   DictReLoader(prev, oLib, *this));
1083     for (std::vector<Dict *>::iterator it = prev.begin(); it != prev.end(); ++it)
1084         delete *it;
1085 }
1086
1087 const gchar *Libs::poGetCurrentWord(glong * iCurrent)
1088 {
1089     const gchar *poCurrentWord = NULL;
1090     const gchar *word;
1091     for (std::vector<Dict *>::size_type iLib = 0; iLib<oLib.size(); iLib++)
1092     {
1093         if (iCurrent[iLib] == INVALID_INDEX)
1094             continue;
1095         if ( iCurrent[iLib] >= narticles(iLib) || iCurrent[iLib] < 0)
1096             continue;
1097         if ( poCurrentWord == NULL )
1098         {
1099             poCurrentWord = poGetWord(iCurrent[iLib], iLib);
1100         }
1101         else
1102         {
1103             word = poGetWord(iCurrent[iLib], iLib);
1104
1105             if (stardict_strcmp(poCurrentWord, word) > 0 )
1106                 poCurrentWord = word;
1107         }
1108     }
1109     return poCurrentWord;
1110 }
1111
1112 const gchar *
1113 Libs::poGetNextWord(const gchar *sWord, glong *iCurrent)
1114 {
1115     // the input can be:
1116     // (word,iCurrent),read word,write iNext to iCurrent,and return next word. used by TopWin::NextCallback();
1117     // (NULL,iCurrent),read iCurrent,write iNext to iCurrent,and return next word. used by AppCore::ListWords();
1118     const gchar *poCurrentWord = NULL;
1119     std::vector<Dict *>::size_type iCurrentLib = 0;
1120     const gchar *word;
1121
1122     for (std::vector<Dict *>::size_type iLib = 0;iLib<oLib.size();iLib++)
1123     {
1124         if (sWord)
1125             oLib[iLib]->Lookup(sWord, iCurrent[iLib]);
1126         if (iCurrent[iLib] == INVALID_INDEX)
1127             continue;
1128         if (iCurrent[iLib] >= narticles(iLib) || iCurrent[iLib] < 0)
1129             continue;
1130         if (poCurrentWord == NULL )
1131         {
1132             poCurrentWord = poGetWord(iCurrent[iLib], iLib);
1133             iCurrentLib = iLib;
1134         }
1135         else
1136         {
1137             word = poGetWord(iCurrent[iLib], iLib);
1138
1139             if (stardict_strcmp(poCurrentWord, word) > 0 )
1140             {
1141                 poCurrentWord = word;
1142                 iCurrentLib = iLib;
1143             }
1144         }
1145     }
1146     if (poCurrentWord)
1147     {
1148         iCurrent[iCurrentLib]
1149         ++;
1150         for (std::vector<Dict *>::size_type iLib = 0;iLib<oLib.size();iLib++)
1151         {
1152             if (iLib == iCurrentLib)
1153                 continue;
1154             if (iCurrent[iLib] == INVALID_INDEX)
1155                 continue;
1156             if ( iCurrent[iLib] >= narticles(iLib) || iCurrent[iLib] < 0)
1157                 continue;
1158             if (strcmp(poCurrentWord, poGetWord(iCurrent[iLib], iLib)) == 0 )
1159                 iCurrent[iLib]++;
1160         }
1161         poCurrentWord = poGetCurrentWord(iCurrent);
1162     }
1163     return poCurrentWord;
1164 }
1165
1166
1167 const gchar *
1168 Libs::poGetPreWord(glong * iCurrent)
1169 {
1170     // used by TopWin::PreviousCallback(); the iCurrent is cached by AppCore::TopWinWordChange();
1171     const gchar *poCurrentWord = NULL;
1172     std::vector<Dict *>::size_type iCurrentLib = 0;
1173     const gchar *word;
1174
1175     for (std::vector<Dict *>::size_type iLib = 0;iLib<oLib.size();iLib++)
1176     {
1177         if (iCurrent[iLib] == INVALID_INDEX)
1178             iCurrent[iLib] = narticles(iLib);
1179         else
1180         {
1181             if ( iCurrent[iLib] > narticles(iLib) || iCurrent[iLib] <= 0)
1182                 continue;
1183         }
1184         if ( poCurrentWord == NULL )
1185         {
1186             poCurrentWord = poGetWord(iCurrent[iLib] - 1, iLib);
1187             iCurrentLib = iLib;
1188         }
1189         else
1190         {
1191             word = poGetWord(iCurrent[iLib] - 1, iLib);
1192             if (stardict_strcmp(poCurrentWord, word) < 0 )
1193             {
1194                 poCurrentWord = word;
1195                 iCurrentLib = iLib;
1196             }
1197         }
1198     }
1199
1200     if (poCurrentWord)
1201     {
1202         iCurrent[iCurrentLib]
1203         --;
1204         for (std::vector<Dict *>::size_type iLib = 0;iLib<oLib.size();iLib++)
1205         {
1206             if (iLib == iCurrentLib)
1207                 continue;
1208             if (iCurrent[iLib] > narticles(iLib) || iCurrent[iLib] <= 0)
1209                 continue;
1210             if (strcmp(poCurrentWord, poGetWord(iCurrent[iLib] - 1, iLib)) == 0)
1211             {
1212                 iCurrent[iLib]--;
1213             }
1214             else
1215             {
1216                 if (iCurrent[iLib] == narticles(iLib))
1217                     iCurrent[iLib] = INVALID_INDEX;
1218             }
1219         }
1220     }
1221     return poCurrentWord;
1222 }
1223
1224 bool Libs::LookupSimilarWord(const gchar* sWord, glong & iWordIndex, int iLib)
1225 {
1226     glong iIndex;
1227     bool bFound = false;
1228     gchar *casestr;
1229
1230     if (!bFound)
1231     {
1232         // to lower case.
1233         casestr = g_utf8_strdown(sWord, -1);
1234         if (strcmp(casestr, sWord))
1235         {
1236             if (oLib[iLib]->Lookup(casestr, iIndex))
1237                 bFound = true;
1238         }
1239         g_free(casestr);
1240         // to upper case.
1241         if (!bFound)
1242         {
1243             casestr = g_utf8_strup(sWord, -1);
1244             if (strcmp(casestr, sWord))
1245             {
1246                 if (oLib[iLib]->Lookup(casestr, iIndex))
1247                     bFound = true;
1248             }
1249             g_free(casestr);
1250         }
1251         // Upper the first character and lower others.
1252         if (!bFound)
1253         {
1254             gchar *nextchar = g_utf8_next_char(sWord);
1255             gchar *firstchar = g_utf8_strup(sWord, nextchar - sWord);
1256             nextchar = g_utf8_strdown(nextchar, -1);
1257             casestr = g_strdup_printf("%s%s", firstchar, nextchar);
1258             g_free(firstchar);
1259             g_free(nextchar);
1260             if (strcmp(casestr, sWord))
1261             {
1262                 if (oLib[iLib]->Lookup(casestr, iIndex))
1263                     bFound = true;
1264             }
1265             g_free(casestr);
1266         }
1267     }
1268
1269     if (bIsPureEnglish(sWord))
1270     {
1271         // If not Found , try other status of sWord.
1272         int iWordLen = strlen(sWord);
1273         bool isupcase;
1274
1275         gchar *sNewWord = (gchar *)g_malloc(iWordLen + 1);
1276
1277         //cut one char "s" or "d"
1278         if (!bFound && iWordLen > 1)
1279         {
1280             isupcase = sWord[iWordLen - 1] == 'S' || !strncmp(&sWord[iWordLen - 2], "ED", 2);
1281             if (isupcase || sWord[iWordLen - 1] == 's' || !strncmp(&sWord[iWordLen - 2], "ed", 2))
1282             {
1283                 strcpy(sNewWord, sWord);
1284                 sNewWord[iWordLen - 1] = '\0'; // cut "s" or "d"
1285                 if (oLib[iLib]->Lookup(sNewWord, iIndex))
1286                     bFound = true;
1287                 else if (isupcase || g_ascii_isupper(sWord[0]))
1288                 {
1289                     casestr = g_ascii_strdown(sNewWord, -1);
1290                     if (strcmp(casestr, sNewWord))
1291                     {
1292                         if (oLib[iLib]->Lookup(casestr, iIndex))
1293                             bFound = true;
1294                     }
1295                     g_free(casestr);
1296                 }
1297             }
1298         }
1299
1300         //cut "ly"
1301         if (!bFound && iWordLen > 2)
1302         {
1303             isupcase = !strncmp(&sWord[iWordLen - 2], "LY", 2);
1304             if (isupcase || (!strncmp(&sWord[iWordLen - 2], "ly", 2)))
1305             {
1306                 strcpy(sNewWord, sWord);
1307                 sNewWord[iWordLen - 2] = '\0';  // cut "ly"
1308                 if (iWordLen > 5 && sNewWord[iWordLen - 3] == sNewWord[iWordLen - 4]
1309                         && !bIsVowel(sNewWord[iWordLen - 4]) &&
1310                         bIsVowel(sNewWord[iWordLen - 5]))
1311                 { //doubled
1312
1313                     sNewWord[iWordLen - 3] = '\0';
1314                     if ( oLib[iLib]->Lookup(sNewWord, iIndex) )
1315                         bFound = true;
1316                     else
1317                     {
1318                         if (isupcase || g_ascii_isupper(sWord[0]))
1319                         {
1320                             casestr = g_ascii_strdown(sNewWord, -1);
1321                             if (strcmp(casestr, sNewWord))
1322                             {
1323                                 if (oLib[iLib]->Lookup(casestr, iIndex))
1324                                     bFound = true;
1325                             }
1326                             g_free(casestr);
1327                         }
1328                         if (!bFound)
1329                             sNewWord[iWordLen - 3] = sNewWord[iWordLen - 4];  //restore
1330                     }
1331                 }
1332                 if (!bFound)
1333                 {
1334                     if (oLib[iLib]->Lookup(sNewWord, iIndex))
1335                         bFound = true;
1336                     else if (isupcase || g_ascii_isupper(sWord[0]))
1337                     {
1338                         casestr = g_ascii_strdown(sNewWord, -1);
1339                         if (strcmp(casestr, sNewWord))
1340                         {
1341                             if (oLib[iLib]->Lookup(casestr, iIndex))
1342                                 bFound = true;
1343                         }
1344                         g_free(casestr);
1345                     }
1346                 }
1347             }
1348         }
1349
1350         //cut "ing"
1351         if (!bFound && iWordLen > 3)
1352         {
1353             isupcase = !strncmp(&sWord[iWordLen - 3], "ING", 3);
1354             if (isupcase || !strncmp(&sWord[iWordLen - 3], "ing", 3) )
1355             {
1356                 strcpy(sNewWord, sWord);
1357                 sNewWord[iWordLen - 3] = '\0';
1358                 if ( iWordLen > 6 && (sNewWord[iWordLen - 4] == sNewWord[iWordLen - 5])
1359                         && !bIsVowel(sNewWord[iWordLen - 5]) &&
1360                         bIsVowel(sNewWord[iWordLen - 6]))
1361                 {  //doubled
1362                     sNewWord[iWordLen - 4] = '\0';
1363                     if (oLib[iLib]->Lookup(sNewWord, iIndex))
1364                         bFound = true;
1365                     else
1366                     {
1367                         if (isupcase || g_ascii_isupper(sWord[0]))
1368                         {
1369                             casestr = g_ascii_strdown(sNewWord, -1);
1370                             if (strcmp(casestr, sNewWord))
1371                             {
1372                                 if (oLib[iLib]->Lookup(casestr, iIndex))
1373                                     bFound = true;
1374                             }
1375                             g_free(casestr);
1376                         }
1377                         if (!bFound)
1378                             sNewWord[iWordLen - 4] = sNewWord[iWordLen - 5];  //restore
1379                     }
1380                 }
1381                 if ( !bFound )
1382                 {
1383                     if (oLib[iLib]->Lookup(sNewWord, iIndex))
1384                         bFound = true;
1385                     else if (isupcase || g_ascii_isupper(sWord[0]))
1386                     {
1387                         casestr = g_ascii_strdown(sNewWord, -1);
1388                         if (strcmp(casestr, sNewWord))
1389                         {
1390                             if (oLib[iLib]->Lookup(casestr, iIndex))
1391                                 bFound = true;
1392                         }
1393                         g_free(casestr);
1394                     }
1395                 }
1396                 if (!bFound)
1397                 {
1398                     if (isupcase)
1399                         strcat(sNewWord, "E"); // add a char "E"
1400                     else
1401                         strcat(sNewWord, "e"); // add a char "e"
1402                     if (oLib[iLib]->Lookup(sNewWord, iIndex))
1403                         bFound = true;
1404                     else if (isupcase || g_ascii_isupper(sWord[0]))
1405                     {
1406                         casestr = g_ascii_strdown(sNewWord, -1);
1407                         if (strcmp(casestr, sNewWord))
1408                         {
1409                             if (oLib[iLib]->Lookup(casestr, iIndex))
1410                                 bFound = true;
1411                         }
1412                         g_free(casestr);
1413                     }
1414                 }
1415             }
1416         }
1417
1418         //cut two char "es"
1419         if (!bFound && iWordLen > 3)
1420         {
1421             isupcase = (!strncmp(&sWord[iWordLen - 2], "ES", 2) &&
1422                         (sWord[iWordLen - 3] == 'S' ||
1423                          sWord[iWordLen - 3] == 'X' ||
1424                          sWord[iWordLen - 3] == 'O' ||
1425                          (iWordLen > 4 && sWord[iWordLen - 3] == 'H' &&
1426                           (sWord[iWordLen - 4] == 'C' ||
1427                            sWord[iWordLen - 4] == 'S'))));
1428             if (isupcase ||
1429                     (!strncmp(&sWord[iWordLen - 2], "es", 2) &&
1430                      (sWord[iWordLen - 3] == 's' || sWord[iWordLen - 3] == 'x' ||
1431                       sWord[iWordLen - 3] == 'o' ||
1432                       (iWordLen > 4 && sWord[iWordLen - 3] == 'h' &&
1433                        (sWord[iWordLen - 4] == 'c' || sWord[iWordLen - 4] == 's')))))
1434             {
1435                 strcpy(sNewWord, sWord);
1436                 sNewWord[iWordLen - 2] = '\0';
1437                 if (oLib[iLib]->Lookup(sNewWord, iIndex))
1438                     bFound = true;
1439                 else if (isupcase || g_ascii_isupper(sWord[0]))
1440                 {
1441                     casestr = g_ascii_strdown(sNewWord, -1);
1442                     if (strcmp(casestr, sNewWord))
1443                     {
1444                         if (oLib[iLib]->Lookup(casestr, iIndex))
1445                             bFound = true;
1446                     }
1447                     g_free(casestr);
1448                 }
1449             }
1450         }
1451
1452         //cut "ed"
1453         if (!bFound && iWordLen > 3)
1454         {
1455             isupcase = !strncmp(&sWord[iWordLen - 2], "ED", 2);
1456             if (isupcase || !strncmp(&sWord[iWordLen - 2], "ed", 2))
1457             {
1458                 strcpy(sNewWord, sWord);
1459                 sNewWord[iWordLen - 2] = '\0';
1460                 if (iWordLen > 5 && (sNewWord[iWordLen - 3] == sNewWord[iWordLen - 4])
1461                         && !bIsVowel(sNewWord[iWordLen - 4]) &&
1462                         bIsVowel(sNewWord[iWordLen - 5]))
1463                 { //doubled
1464                     sNewWord[iWordLen - 3] = '\0';
1465                     if (oLib[iLib]->Lookup(sNewWord, iIndex))
1466                         bFound = true;
1467                     else
1468                     {
1469                         if (isupcase || g_ascii_isupper(sWord[0]))
1470                         {
1471                             casestr = g_ascii_strdown(sNewWord, -1);
1472                             if (strcmp(casestr, sNewWord))
1473                             {
1474                                 if (oLib[iLib]->Lookup(casestr, iIndex))
1475                                     bFound = true;
1476                             }
1477                             g_free(casestr);
1478                         }
1479                         if (!bFound)
1480                             sNewWord[iWordLen - 3] = sNewWord[iWordLen - 4];  //restore
1481                     }
1482                 }
1483                 if (!bFound)
1484                 {
1485                     if (oLib[iLib]->Lookup(sNewWord, iIndex))
1486                         bFound = true;
1487                     else if (isupcase || g_ascii_isupper(sWord[0]))
1488                     {
1489                         casestr = g_ascii_strdown(sNewWord, -1);
1490                         if (strcmp(casestr, sNewWord))
1491                         {
1492                             if (oLib[iLib]->Lookup(casestr, iIndex))
1493                                 bFound = true;
1494                         }
1495                         g_free(casestr);
1496                     }
1497                 }
1498             }
1499         }
1500
1501         // cut "ied" , add "y".
1502         if (!bFound && iWordLen > 3)
1503         {
1504             isupcase = !strncmp(&sWord[iWordLen - 3], "IED", 3);
1505             if (isupcase || (!strncmp(&sWord[iWordLen - 3], "ied", 3)))
1506             {
1507                 strcpy(sNewWord, sWord);
1508                 sNewWord[iWordLen - 3] = '\0';
1509                 if (isupcase)
1510                     strcat(sNewWord, "Y"); // add a char "Y"
1511                 else
1512                     strcat(sNewWord, "y"); // add a char "y"
1513                 if (oLib[iLib]->Lookup(sNewWord, iIndex))
1514                     bFound = true;
1515                 else if (isupcase || g_ascii_isupper(sWord[0]))
1516                 {
1517                     casestr = g_ascii_strdown(sNewWord, -1);
1518                     if (strcmp(casestr, sNewWord))
1519                     {
1520                         if (oLib[iLib]->Lookup(casestr, iIndex))
1521                             bFound = true;
1522                     }
1523                     g_free(casestr);
1524                 }
1525             }
1526         }
1527
1528         // cut "ies" , add "y".
1529         if (!bFound && iWordLen > 3)
1530         {
1531             isupcase = !strncmp(&sWord[iWordLen - 3], "IES", 3);
1532             if (isupcase || (!strncmp(&sWord[iWordLen - 3], "ies", 3)))
1533             {
1534                 strcpy(sNewWord, sWord);
1535                 sNewWord[iWordLen - 3] = '\0';
1536                 if (isupcase)
1537                     strcat(sNewWord, "Y"); // add a char "Y"
1538                 else
1539                     strcat(sNewWord, "y"); // add a char "y"
1540                 if (oLib[iLib]->Lookup(sNewWord, iIndex))
1541                     bFound = true;
1542                 else if (isupcase || g_ascii_isupper(sWord[0]))
1543                 {
1544                     casestr = g_ascii_strdown(sNewWord, -1);
1545                     if (strcmp(casestr, sNewWord))
1546                     {
1547                         if (oLib[iLib]->Lookup(casestr, iIndex))
1548                             bFound = true;
1549                     }
1550                     g_free(casestr);
1551                 }
1552             }
1553         }
1554
1555         // cut "er".
1556         if (!bFound && iWordLen > 2)
1557         {
1558             isupcase = !strncmp(&sWord[iWordLen - 2], "ER", 2);
1559             if (isupcase || (!strncmp(&sWord[iWordLen - 2], "er", 2)))
1560             {
1561                 strcpy(sNewWord, sWord);
1562                 sNewWord[iWordLen - 2] = '\0';
1563                 if (oLib[iLib]->Lookup(sNewWord, iIndex))
1564                     bFound = true;
1565                 else if (isupcase || g_ascii_isupper(sWord[0]))
1566                 {
1567                     casestr = g_ascii_strdown(sNewWord, -1);
1568                     if (strcmp(casestr, sNewWord))
1569                     {
1570                         if (oLib[iLib]->Lookup(casestr, iIndex))
1571                             bFound = true;
1572                     }
1573                     g_free(casestr);
1574                 }
1575             }
1576         }
1577
1578         // cut "est".
1579         if (!bFound && iWordLen > 3)
1580         {
1581             isupcase = !strncmp(&sWord[iWordLen - 3], "EST", 3);
1582             if (isupcase || (!strncmp(&sWord[iWordLen - 3], "est", 3)))
1583             {
1584                 strcpy(sNewWord, sWord);
1585                 sNewWord[iWordLen - 3] = '\0';
1586                 if (oLib[iLib]->Lookup(sNewWord, iIndex))
1587                     bFound = true;
1588                 else if (isupcase || g_ascii_isupper(sWord[0]))
1589                 {
1590                     casestr = g_ascii_strdown(sNewWord, -1);
1591                     if (strcmp(casestr, sNewWord))
1592                     {
1593                         if (oLib[iLib]->Lookup(casestr, iIndex))
1594                             bFound = true;
1595                     }
1596                     g_free(casestr);
1597                 }
1598             }
1599         }
1600
1601         g_free(sNewWord);
1602     }
1603
1604     if (bFound)
1605         iWordIndex = iIndex;
1606 #if 0
1607
1608     else
1609     {
1610         //don't change iWordIndex here.
1611         //when LookupSimilarWord all failed too, we want to use the old LookupWord index to list words.
1612         //iWordIndex = INVALID_INDEX;
1613     }
1614 #endif
1615     return bFound;
1616 }
1617
1618 bool Libs::SimpleLookupWord(const gchar* sWord, glong & iWordIndex, int iLib)
1619 {
1620     bool bFound = oLib[iLib]->Lookup(sWord, iWordIndex);
1621     if (!bFound)
1622         bFound = LookupSimilarWord(sWord, iWordIndex, iLib);
1623     return bFound;
1624 }
1625
// One slot of the fuzzy-search result list.
struct Fuzzystruct
{
    char *pMatchWord;        // matched word, or NULL while the slot is unused
    int iMatchWordDistance;  // edit distance recorded for this slot
};
1631
1632 inline bool operator<(const Fuzzystruct & lh, const Fuzzystruct & rh)
1633 {
1634     if (lh.iMatchWordDistance != rh.iMatchWordDistance)
1635         return lh.iMatchWordDistance < rh.iMatchWordDistance;
1636
1637     if (lh.pMatchWord && rh.pMatchWord)
1638         return stardict_strcmp(lh.pMatchWord, rh.pMatchWord) < 0;
1639
1640     return false;
1641 }
1642
1643 static inline void unicode_strdown(gunichar *str)
1644 {
1645     while (*str)
1646     {
1647         *str = g_unichar_tolower(*str);
1648         ++str;
1649     }
1650 }
1651
1652 bool Libs::LookupWithFuzzy(const gchar *sWord, gchar *reslist[], gint reslist_size, gint iLib)
1653 {
1654     if (sWord[0] == '\0')
1655         return false;
1656
1657     Fuzzystruct *oFuzzystruct = new Fuzzystruct[reslist_size];
1658
1659     for (int i = 0; i < reslist_size; i++)
1660     {
1661         oFuzzystruct[i].pMatchWord = NULL;
1662         oFuzzystruct[i].iMatchWordDistance = iMaxFuzzyDistance;
1663     }
1664     int iMaxDistance = iMaxFuzzyDistance;
1665     int iDistance;
1666     bool Found = false;
1667     EditDistance oEditDistance;
1668
1669     glong iCheckWordLen;
1670     const char *sCheck;
1671     gunichar *ucs4_str1, *ucs4_str2;
1672     glong ucs4_str2_len;
1673
1674     ucs4_str2 = g_utf8_to_ucs4_fast(sWord, -1, &ucs4_str2_len);
1675     unicode_strdown(ucs4_str2);
1676
1677 //    for (std::vector<Dict *>::size_type iLib = 0; iLib<oLib.size(); iLib++)
1678 //    {
1679     if (progress_func)
1680         progress_func();
1681
1682     //if (stardict_strcmp(sWord, poGetWord(0,iLib))>=0 && stardict_strcmp(sWord, poGetWord(narticles(iLib)-1,iLib))<=0) {
1683     //there are Chinese dicts and English dicts...
1684     if (TRUE)
1685     {
1686         const int iwords = narticles(iLib);
1687         for (int index = 0; index < iwords; index++)
1688         {
1689             sCheck = poGetWord(index, iLib);
1690             // tolower and skip too long or too short words
1691             iCheckWordLen = g_utf8_strlen(sCheck, -1);
1692             if (iCheckWordLen - ucs4_str2_len >= iMaxDistance ||
1693                     ucs4_str2_len - iCheckWordLen >= iMaxDistance)
1694                 continue;
1695             ucs4_str1 = g_utf8_to_ucs4_fast(sCheck, -1, NULL);
1696             if (iCheckWordLen > ucs4_str2_len)
1697                 ucs4_str1[ucs4_str2_len] = 0;
1698             unicode_strdown(ucs4_str1);
1699
1700             iDistance = oEditDistance.CalEditDistance(ucs4_str1, ucs4_str2, iMaxDistance);
1701             g_free(ucs4_str1);
1702             if (iDistance < iMaxDistance && iDistance < ucs4_str2_len)
1703             {
1704                 // when ucs4_str2_len=1,2 we need less fuzzy.
1705                 Found = true;
1706                 bool bAlreadyInList = false;
1707                 int iMaxDistanceAt = 0;
1708                 for (int j = 0; j < reslist_size; j++)
1709                 {
1710                     if (oFuzzystruct[j].pMatchWord &&
1711                             strcmp(oFuzzystruct[j].pMatchWord, sCheck) == 0 )
1712                     { //already in list
1713                         bAlreadyInList = true;
1714                         break;
1715                     }
1716                     //find the position,it will certainly be found (include the first time) as iMaxDistance is set by last time.
1717                     if (oFuzzystruct[j].iMatchWordDistance == iMaxDistance )
1718                     {
1719                         iMaxDistanceAt = j;
1720                     }
1721                 }
1722                 if (!bAlreadyInList)
1723                 {
1724                     if (oFuzzystruct[iMaxDistanceAt].pMatchWord)
1725                         g_free(oFuzzystruct[iMaxDistanceAt].pMatchWord);
1726                     oFuzzystruct[iMaxDistanceAt].pMatchWord = g_strdup(sCheck);
1727                     oFuzzystruct[iMaxDistanceAt].iMatchWordDistance = iDistance;
1728                     // calc new iMaxDistance
1729                     iMaxDistance = iDistance;
1730                     for (int j = 0; j < reslist_size; j++)
1731                     {
1732                         if (oFuzzystruct[j].iMatchWordDistance > iMaxDistance)
1733                             iMaxDistance = oFuzzystruct[j].iMatchWordDistance;
1734                     } // calc new iMaxDistance
1735                 }   // add to list
1736             }   // find one
1737         }   // each word
1738     }   // ok for search
1739 //    }   // each lib
1740     g_free(ucs4_str2);
1741
1742     if (Found) // sort with distance
1743         std::sort(oFuzzystruct, oFuzzystruct + reslist_size);
1744
1745     for (gint i = 0; i < reslist_size; ++i)
1746         reslist[i] = oFuzzystruct[i].pMatchWord;
1747
1748     delete[] oFuzzystruct;
1749
1750     return Found;
1751 }
1752
1753 inline bool less_for_compare(const char *lh, const char *rh)
1754 {
1755     return stardict_strcmp(lh, rh) < 0;
1756 }
1757
1758 gint Libs::LookupWithRule(const gchar *word, gchar **ppMatchWord)
1759 {
1760     glong aiIndex[MAX_MATCH_ITEM_PER_LIB + 1];
1761     gint iMatchCount = 0;
1762     GPatternSpec *pspec = g_pattern_spec_new(word);
1763
1764     for (std::vector<Dict *>::size_type iLib = 0; iLib<oLib.size(); iLib++)
1765     {
1766         //if(oLibs.LookdupWordsWithRule(pspec,aiIndex,MAX_MATCH_ITEM_PER_LIB+1-iMatchCount,iLib))
1767         // -iMatchCount,so save time,but may got less result and the word may repeat.
1768
1769         if (oLib[iLib]->
1770                 LookupWithRule(pspec, aiIndex, MAX_MATCH_ITEM_PER_LIB + 1))
1771         {
1772             if (progress_func)
1773                 progress_func();
1774             for (int i = 0; aiIndex[i] != -1; i++)
1775             {
1776                 const gchar * sMatchWord = poGetWord(aiIndex[i], iLib);
1777                 bool bAlreadyInList = false;
1778                 for (int j = 0; j < iMatchCount; j++)
1779                 {
1780                     if (strcmp(ppMatchWord[j], sMatchWord) == 0)
1781                     { //already in list
1782                         bAlreadyInList = true;
1783                         break;
1784                     }
1785                 }
1786                 if (!bAlreadyInList)
1787                     ppMatchWord[iMatchCount++] = g_strdup(sMatchWord);
1788             }
1789         }
1790     }
1791     g_pattern_spec_free(pspec);
1792
1793     if (iMatchCount) // sort it.
1794         std::sort(ppMatchWord, ppMatchWord + iMatchCount, less_for_compare);
1795
1796     return iMatchCount;
1797 }
1798
1799 bool Libs::LookupData(const gchar *sWord, std::vector<gchar *> *reslist)
1800 {
1801     std::vector<std::string> SearchWords;
1802     std::string SearchWord;
1803     const char *p = sWord;
1804     while (*p)
1805     {
1806         if (*p == '\\')
1807         {
1808             p++;
1809             switch (*p)
1810             {
1811             case ' ':
1812                 SearchWord += ' ';
1813                 break;
1814             case '\\':
1815                 SearchWord += '\\';
1816                 break;
1817             case 't':
1818                 SearchWord += '\t';
1819                 break;
1820             case 'n':
1821                 SearchWord += '\n';
1822                 break;
1823             default:
1824                 SearchWord += *p;
1825             }
1826         }
1827         else if (*p == ' ')
1828         {
1829             if (!SearchWord.empty())
1830             {
1831                 SearchWords.push_back(SearchWord);
1832                 SearchWord.clear();
1833             }
1834         }
1835         else
1836         {
1837             SearchWord += *p;
1838         }
1839         p++;
1840     }
1841     if (!SearchWord.empty())
1842     {
1843         SearchWords.push_back(SearchWord);
1844         SearchWord.clear();
1845     }
1846     if (SearchWords.empty())
1847         return false;
1848
1849     guint32 max_size = 0;
1850     gchar *origin_data = NULL;
1851     for (std::vector<Dict *>::size_type i = 0; i<oLib.size(); ++i)
1852     {
1853         if (!oLib[i]->
1854                 containSearchData())
1855             continue;
1856         if (progress_func)
1857             progress_func();
1858         const gulong iwords = narticles(i);
1859         const gchar *key;
1860         guint32 offset, size;
1861         for (gulong j = 0;
1862                 j < iwords;
1863                 ++j)
1864         {
1865             oLib[i]
1866             ->get_key_and_data(j, &key, &offset, &size);
1867             if (size > max_size)
1868             {
1869                 origin_data = (gchar *)g_realloc(origin_data, size);
1870                 max_size = size;
1871             }
1872             if (oLib[i]->SearchData(SearchWords, offset, size, origin_data))
1873                 reslist[i].push_back(g_strdup(key));
1874         }
1875     }
1876     g_free(origin_data);
1877
1878     std::vector<Dict *>::size_type i;
1879     for (i = 0; i<oLib.size(); ++i)
1880         if (!reslist[i].empty())
1881             break;
1882
1883     return i != oLib.size();
1884 }
1885
1886 /**************************************************/
1887 query_t analyze_query(const char *s, std::string& res)
1888 {
1889     if (!s || !*s)
1890     {
1891         res = "";
1892         return qtSIMPLE;
1893     }
1894     if (*s == '/')
1895     {
1896         res = s + 1;
1897         return qtFUZZY;
1898     }
1899
1900     if (*s == '|')
1901     {
1902         res = s + 1;
1903         return qtDATA;
1904     }
1905
1906     bool regexp = false;
1907     const char *p = s;
1908     res = "";
1909     for (; *p; res += *p, ++p)
1910     {
1911         if (*p == '\\')
1912         {
1913             ++p;
1914             if (!*p)
1915                 break;
1916             continue;
1917         }
1918         if (*p == '*' || *p == '?')
1919             regexp = true;
1920     }
1921     if (regexp)
1922         return qtREGEXP;
1923
1924     return qtSIMPLE;
1925 }