11 #include <glib/gstdio.h>
15 #include "mapfile.hpp"
19 // Notice: read src/tools/DICTFILE_FORMAT for the dictionary
20 // file's format information!
// Reports whether inputchar is one of the ASCII vowels A, E, I, O, U.
// The test is case-insensitive because the character is upper-cased first.
23 static inline bool bIsVowel(gchar inputchar)
25 gchar ch = g_ascii_toupper(inputchar);
26 return ( ch == 'A' || ch == 'E' || ch == 'I' || ch == 'O' || ch == 'U' );
// Walks str byte by byte up to the terminating NUL.
// NOTE(review): the per-byte test and the return statements are not visible in
// this excerpt; judging by the name and the comments below, the function
// presumably returns false as soon as a non-ASCII byte is seen -- confirm.
29 static bool bIsPureEnglish(const gchar *str)
31 // I think this should work even when str is a UTF-8 string :).
32 for (int i = 0; str[i] != 0; i++)
34 //if(str[i]<32 || str[i]>126) // tab equal 9,so this is not OK.
35 // Better to use isascii() than to test str[i]<0, because char is unsigned by default on ARM.
// Key comparison used for index ordering: the keys are compared with the
// ASCII case-insensitive g_ascii_strcasecmp() and with the case-sensitive
// strcmp().
// NOTE(review): the condition choosing between the two results is not visible
// here; presumably strcmp() is only used as a tie-breaker when `a` is 0 -- confirm.
41 static inline gint stardict_strcmp(const gchar *s1, const gchar *s2)
43 gint a = g_ascii_strcasecmp(s1, s2);
45 return strcmp(s1, s2);
// Parses the dictionary metadata file `ifofilename` into this DictInfo.
// The second parameter is declared on a line that is not part of this excerpt;
// it is referenced below as `istreedict` and selects which magic header the
// file must start with.
// The whole file is read into `buffer`, the magic header is verified, and each
// "key=value" line is then located with strstr() and terminated with strchr().
// NOTE(review): the error returns, NULL checks and g_free() calls are not
// visible in this excerpt. In particular confirm that every strchr() result
// (p3) is checked before it is used in the pointer arithmetic below, and that
// `buffer` and `tmpstr` are released on every path.
50 bool DictInfo::load_from_ifo_file(const std::string& ifofilename,
53 ifo_file_name = ifofilename;
// Read the complete .ifo file into a GLib-allocated, NUL-terminated buffer.
55 if (!g_file_get_contents(ifofilename.c_str(), &buffer, NULL, NULL))
// Expected leading bytes of the file for tree dictionaries and ordinary dictionaries.
58 #define TREEDICT_MAGIC_DATA "StarDict's treedict ifo file\nversion=2.4.2\n"
59 #define DICT_MAGIC_DATA "StarDict's dict ifo file\nversion=2.4.2\n"
61 const gchar *magic_data = istreedict ? TREEDICT_MAGIC_DATA : DICT_MAGIC_DATA;
62 if (!g_str_has_prefix(buffer, magic_data))
// p1 points at the final '\n' of the magic header, so the "\nkey=" patterns
// searched below can also match the very first field line.
70 p1 = buffer + strlen(magic_data) - 1;
// wordcount: copy the text between "wordcount=" and the end of the line into a
// temporary, NUL-terminate it and convert it with atol().
72 p2 = strstr(p1, "\nwordcount=");
79 p3 = strchr(p2 + sizeof("\nwordcount=") - 1, '\n');
80 gchar *tmpstr = (gchar *)g_memdup(p2 + sizeof("\nwordcount=") - 1, p3 - (p2 + sizeof("\nwordcount=") - 1) + 1);
81 tmpstr[p3 - (p2 + sizeof("\nwordcount=") - 1)] = '\0';
82 wordcount = atol(tmpstr);
// tdxfilesize: parsed the same way and stored in index_file_size.
// NOTE(review): presumably this branch is taken for tree dictionaries and the
// idxfilesize branch below for ordinary ones; the selecting condition is not visible.
87 p2 = strstr(p1, "\ntdxfilesize=");
93 p3 = strchr(p2 + sizeof("\ntdxfilesize=") - 1, '\n');
94 tmpstr = (gchar *)g_memdup(p2 + sizeof("\ntdxfilesize=") - 1, p3 - (p2 + sizeof("\ntdxfilesize=") - 1) + 1);
95 tmpstr[p3 - (p2 + sizeof("\ntdxfilesize=") - 1)] = '\0';
96 index_file_size = atol(tmpstr);
// idxfilesize: parsed the same way and stored in index_file_size.
102 p2 = strstr(p1, "\nidxfilesize=");
109 p3 = strchr(p2 + sizeof("\nidxfilesize=") - 1, '\n');
110 tmpstr = (gchar *)g_memdup(p2 + sizeof("\nidxfilesize=") - 1, p3 - (p2 + sizeof("\nidxfilesize=") - 1) + 1);
111 tmpstr[p3 - (p2 + sizeof("\nidxfilesize=") - 1)] = '\0';
112 index_file_size = atol(tmpstr);
// The remaining fields are plain strings: skip past "key=" and copy everything
// up to (not including) the next '\n' into the matching member.
116 p2 = strstr(p1, "\nbookname=");
124 p2 = p2 + sizeof("\nbookname=") - 1;
125 p3 = strchr(p2, '\n');
126 bookname.assign(p2, p3 - p2);
128 p2 = strstr(p1, "\nauthor=");
131 p2 = p2 + sizeof("\nauthor=") - 1;
132 p3 = strchr(p2, '\n');
133 author.assign(p2, p3 - p2);
136 p2 = strstr(p1, "\nemail=");
139 p2 = p2 + sizeof("\nemail=") - 1;
140 p3 = strchr(p2, '\n');
141 email.assign(p2, p3 - p2);
144 p2 = strstr(p1, "\nwebsite=");
147 p2 = p2 + sizeof("\nwebsite=") - 1;
148 p3 = strchr(p2, '\n');
149 website.assign(p2, p3 - p2);
152 p2 = strstr(p1, "\ndate=");
155 p2 = p2 + sizeof("\ndate=") - 1;
156 p3 = strchr(p2, '\n');
157 date.assign(p2, p3 - p2);
160 p2 = strstr(p1, "\ndescription=");
163 p2 = p2 + sizeof("\ndescription=") - 1;
164 p3 = strchr(p2, '\n');
165 description.assign(p2, p3 - p2);
168 p2 = strstr(p1, "\nsametypesequence=");
171 p2 += sizeof("\nsametypesequence=") - 1;
172 p3 = strchr(p2, '\n');
173 sametypesequence.assign(p2, p3 - p2);
180 //===================================================================
// Destructor of DictBase.
// NOTE(review): the body is not visible in this excerpt -- presumably it
// closes dictfile; confirm.
187 DictBase::~DictBase()
// Returns the article data of the index entry (idxitem_offset, idxitem_size).
// The returned buffer starts with a guint32 holding the total buffer size.
// Buffers are owned by `cache`: a slot's previous buffer is g_free()d when the
// slot is reused, so the returned pointer must not be freed by the caller and
// only stays valid until its slot is overwritten.
// When sametypesequence is set, the stored record omits the per-section type
// characters and the size/terminator of the last section; this function
// re-inserts them so the result has the same layout as a record from a
// dictionary without sametypesequence.
// NOTE(review): the return values of fseek()/fread() are ignored here, and the
// case labels of the switch statements are not visible in this excerpt.
193 gchar* DictBase::GetWordData(guint32 idxitem_offset, guint32 idxitem_size)
// Serve the request from the cache when an entry with the same offset exists.
195 for (int i = 0; i < WORDDATA_CACHE_NUM; i++)
196 if (cache[i].data && cache[i].offset == idxitem_offset)
197 return cache[i].data;
200 fseek(dictfile, idxitem_offset, SEEK_SET);
203 if (!sametypesequence.empty())
// Read the raw record, either from the plain file or through dictdzfile.
205 gchar *origin_data = (gchar *)g_malloc(idxitem_size);
208 fread(origin_data, idxitem_size, 1, dictfile);
210 dictdzfile->read(origin_data, idxitem_offset, idxitem_size);
213 gint sametypesequence_len = sametypesequence.length();
214 // sametypesequence_len type characters were omitted from the stored record.
215 data_size = idxitem_size + sizeof(guint32) + sametypesequence_len;
216 //if the last item's size is determined by the end up '\0',then +=sizeof(gchar);
217 //if the last item's size is determined by the head guint32 type data,then +=sizeof(guint32);
218 switch (sametypesequence[sametypesequence_len - 1])
226 data_size += sizeof(gchar);
230 data_size += sizeof(guint32);
233 if (g_ascii_isupper(sametypesequence[sametypesequence_len - 1]))
234 data_size += sizeof(guint32);
236 data_size += sizeof(gchar);
239 data = (gchar *)g_malloc(data_size);
// p1 is the write cursor; the first guint32 is reserved for the total size.
241 p1 = data + sizeof(guint32);
244 //copy the head items.
245 for (int i = 0; i < sametypesequence_len - 1; i++)
247 *p1 = sametypesequence[i];
249 switch (sametypesequence[i])
// NUL-terminated section: copy the string including its terminator.
257 sec_size = strlen(p2) + 1;
258 memcpy(p1, p2, sec_size);
// Size-prefixed section: copy the guint32 size field plus the payload.
264 sec_size = *reinterpret_cast<guint32 *>(p2);
265 sec_size += sizeof(guint32);
266 memcpy(p1, p2, sec_size);
// Other type characters: upper-case ones are handled as size-prefixed,
// the rest as NUL-terminated.
271 if (g_ascii_isupper(sametypesequence[i]))
273 sec_size = *reinterpret_cast<guint32 *>(p2);
274 sec_size += sizeof(guint32);
278 sec_size = strlen(p2) + 1;
280 memcpy(p1, p2, sec_size);
286 //calculate the last item 's size.
// The last section has no stored size or terminator: it is whatever remains.
287 sec_size = idxitem_size - (p2 - origin_data);
288 *p1 = sametypesequence[sametypesequence_len - 1];
290 switch (sametypesequence[sametypesequence_len - 1])
298 memcpy(p1, p2, sec_size);
300 *p1 = '\0'; //add the end up '\0';
304 *reinterpret_cast<guint32 *>(p1) = sec_size;
305 p1 += sizeof(guint32);
306 memcpy(p1, p2, sec_size);
309 if (g_ascii_isupper(sametypesequence[sametypesequence_len - 1]))
311 *reinterpret_cast<guint32 *>(p1) = sec_size;
312 p1 += sizeof(guint32);
313 memcpy(p1, p2, sec_size);
317 memcpy(p1, p2, sec_size);
324 *reinterpret_cast<guint32 *>(data) = data_size;
// No sametypesequence: the stored record is already type-tagged, so it is read
// directly behind the size prefix.
328 data = (gchar *)g_malloc(idxitem_size + sizeof(guint32));
330 fread(data + sizeof(guint32), idxitem_size, 1, dictfile);
332 dictdzfile->read(data + sizeof(guint32), idxitem_offset, idxitem_size);
333 *reinterpret_cast<guint32 *>(data) = idxitem_size + sizeof(guint32);
// Replace the cache slot at cache_cur with the new buffer.
335 g_free(cache[cache_cur].data);
337 cache[cache_cur].data = data;
338 cache[cache_cur].offset = idxitem_offset;
// NOTE(review): presumably cache_cur is advanced on a line not shown and reset
// to 0 here when it reaches WORDDATA_CACHE_NUM -- confirm.
340 if (cache_cur == WORDDATA_CACHE_NUM)
// Tells whether this dictionary can hold sections that a full-text search
// looks at. With a sametypesequence, that is the case when the sequence
// contains at least one of the type characters m, l, g, x, t or y.
// NOTE(review): the result for an empty sametypesequence is returned on a line
// not visible here; presumably it is true -- confirm.
345 inline bool DictBase::containSearchData()
347 if (sametypesequence.empty())
350 return sametypesequence.find_first_of("mlgxty") != std::string::npos;
// Reads the article (idxitem_offset, idxitem_size) into the caller-supplied
// buffer origin_data and searches its sections for the strings in SearchWords.
// WordFind[j] records whether SearchWords[j] has already been found, so each
// word is only searched for until its first hit.
// origin_data must provide room for at least idxitem_size bytes.
// NOTE(review): the case labels that decide which section types are searched
// and the return statements are not visible in this excerpt; presumably the
// function returns true once every search word was found -- confirm.
353 bool DictBase::SearchData(std::vector<std::string> &SearchWords, guint32 idxitem_offset, guint32 idxitem_size, gchar *origin_data)
355 int nWord = SearchWords.size();
356 std::vector<bool> WordFind(nWord, false);
// Load the raw record, either from the plain file or through dictdzfile.
360 fseek(dictfile, idxitem_offset, SEEK_SET);
362 fread(origin_data, idxitem_size, 1, dictfile);
364 dictdzfile->read(origin_data, idxitem_offset, idxitem_size);
365 gchar *p = origin_data;
// With a sametypesequence the section types come from that string ...
368 if (!sametypesequence.empty())
370 gint sametypesequence_len = sametypesequence.length();
371 for (int i = 0; i < sametypesequence_len - 1; i++)
373 switch (sametypesequence[i])
381 for (j = 0; j < nWord; j++)
382 if (!WordFind[j] && strstr(p, SearchWords[j].c_str()))
391 sec_size = strlen(p) + 1;
// Sections that are not searched are only skipped: upper-case types carry a
// guint32 size prefix, the others are NUL-terminated.
395 if (g_ascii_isupper(sametypesequence[i]))
397 sec_size = *reinterpret_cast<guint32 *>(p);
398 sec_size += sizeof(guint32);
402 sec_size = strlen(p) + 1;
// The last section has no terminator, so its length is the rest of the record
// and the length-bounded g_strstr_len() is used for it.
407 switch (sametypesequence[sametypesequence_len - 1])
415 sec_size = idxitem_size - (p - origin_data);
416 for (j = 0; j < nWord; j++)
418 g_strstr_len(p, sec_size, SearchWords[j].c_str()))
// ... otherwise each section starts with its own type character (*p).
432 while (guint32(p - origin_data) < idxitem_size)
442 for (j = 0; j < nWord; j++)
443 if (!WordFind[j] && strstr(p, SearchWords[j].c_str()))
451 sec_size = strlen(p) + 1;
455 if (g_ascii_isupper(*p))
457 sec_size = *reinterpret_cast<guint32 *>(p);
458 sec_size += sizeof(guint32);
462 sec_size = strlen(p) + 1;
// index_file implementation that keeps only the byte offset of every page of
// ENTR_PER_PAGE entries in memory (wordoffset) and reads the pages themselves
// from the index file (idxfile) on demand.
471 class offset_index : public index_file
474 offset_index() : idxfile(NULL)
477 bool load(const std::string& url, gulong wc, gulong fsize);
478 const gchar *get_key(glong idx);
479 void get_data(glong idx);
480 const gchar *get_key_and_data(glong idx);
481 bool lookup(const char *str, glong &idx);
// Number of index entries per page.
483 static const gint ENTR_PER_PAGE = 32;
// Signature written at the start of an offset cache file.
484 static const char *CACHE_MAGIC;
// Byte offset of the start of every page, plus one final element for the end
// of the index data.
486 std::vector<guint32> wordoffset;
// Scratch buffer filled by read_first_on_page_key().
490 gchar wordentry_buf[256 + sizeof(guint32)*2]; // The length of "word_str" should be less than 256. See src/tools/DICTFILE_FORMAT.
495 void assign(glong i, const std::string& str)
// Cached keys: first key of the first, last and middle page, and the very last key.
501 index_entry first, last, middle, real_last;
// Raw bytes of the page currently loaded by load_page().
508 std::vector<gchar> page_data;
512 page_entry entries[ENTR_PER_PAGE];
516 void fill(gchar *data, gint nent, glong idx_);
519 gulong load_page(glong page_idx);
520 const gchar *read_first_on_page_key(glong page_idx);
521 const gchar *get_first_on_page_key(glong page_idx);
522 bool load_cache(const std::string& url);
523 bool save_cache(const std::string& url);
524 static strlist_t get_cache_variant(const std::string& url);
// Signature at the start of every offset cache file: save_cache() writes it
// and load_cache() rejects files that do not begin with it.
527 const char *offset_index::CACHE_MAGIC = "StarDict's Cache, Version: 0.1";
// index_file implementation that holds the complete index in memory
// (idxdatabuf) together with a pointer to the start of every entry (wordlist).
529 class wordlist_index : public index_file
532 wordlist_index() : idxdatabuf(NULL)
535 bool load(const std::string& url, gulong wc, gulong fsize);
536 const gchar *get_key(glong idx);
537 void get_data(glong idx);
538 const gchar *get_key_and_data(glong idx);
539 bool lookup(const char *str, glong &idx);
// One pointer per entry into idxdatabuf; load() sizes it to wc + 1 elements.
542 std::vector<gchar *> wordlist;
// Decodes `nent` index entries from the raw page bytes in `data` into
// entries[]. keystr points into `data` (no copy is made), so `data` has to
// outlive the page. Offset and size are stored big-endian in the file and are
// converted with g_ntohl().
// NOTE(review): the statements that store idx_ and advance p past the key
// string are not visible in this excerpt.
545 void offset_index::page_t::fill(gchar *data, gint nent, glong idx_)
550 for (gint i = 0; i < nent; ++i)
552 entries[i].keystr = p;
555 entries[i].off = g_ntohl(*reinterpret_cast<guint32 *>(p));
556 p += sizeof(guint32);
557 entries[i].size = g_ntohl(*reinterpret_cast<guint32 *>(p));
558 p += sizeof(guint32);
// Destructor of offset_index.
// NOTE(review): the body is not visible in this excerpt -- presumably it
// closes idxfile; confirm.
562 offset_index::~offset_index()
// Reads the beginning of page `page_idx` from idxfile into wordentry_buf and
// returns that buffer, which therefore starts with the first key of the page.
// At most sizeof(wordentry_buf) bytes are read. The returned pointer refers to
// the member buffer and is overwritten by the next call.
// Requires page_idx + 1 to be a valid index into wordoffset.
568 inline const gchar *offset_index::read_first_on_page_key(glong page_idx)
570 fseek(idxfile, wordoffset[page_idx], SEEK_SET);
571 guint page_size = wordoffset[page_idx + 1] - wordoffset[page_idx];
572 fread(wordentry_buf, std::min<guint>(sizeof(wordentry_buf), page_size), 1, idxfile); //TODO: check returned values, deal with word entry that strlen>255.
573 return wordentry_buf;
// Returns the first key of page `page_idx`. The keys of the first, middle and
// last page are served from the cached index_entry members; every other page
// is read from the file with read_first_on_page_key().
576 inline const gchar *offset_index::get_first_on_page_key(glong page_idx)
578 if (page_idx < middle.idx)
580 if (page_idx == first.idx)
581 return first.keystr.c_str();
582 return read_first_on_page_key(page_idx);
584 else if (page_idx > middle.idx)
586 if (page_idx == last.idx)
587 return last.keystr.c_str();
588 return read_first_on_page_key(page_idx);
// page_idx == middle.idx
591 return middle.keystr.c_str();
// Tries to fill wordoffset from a previously saved offset cache of index file
// `url`. Every candidate path from get_cache_variant() is examined; a
// candidate is compared against the index file by modification time and must
// start with CACHE_MAGIC. wordoffset must already have its final size, because
// exactly wordoffset.size() elements are copied.
// NOTE(review): the continue/return statements are not visible in this
// excerpt, and no visible check guarantees that the cache file is at least
// strlen(CACHE_MAGIC) + wordoffset.size()*sizeof(guint32) bytes long before
// the memcpy() -- confirm.
594 bool offset_index::load_cache(const std::string& url)
596 strlist_t vars = get_cache_variant(url);
598 for (strlist_t::const_iterator it = vars.begin(); it != vars.end(); ++it)
600 struct stat idxstat, cachestat;
601 if (g_stat(url.c_str(), &idxstat) != 0 ||
602 g_stat(it->c_str(), &cachestat) != 0)
// A cache older than the index file is stale.
604 if (cachestat.st_mtime < idxstat.st_mtime)
607 if (!mf.open(it->c_str(), cachestat.st_size))
609 if (strncmp(mf.begin(), CACHE_MAGIC, strlen(CACHE_MAGIC)) != 0)
611 memcpy(&wordoffset[0], mf.begin() + strlen(CACHE_MAGIC), wordoffset.size()*sizeof(wordoffset[0]));
// Builds the list of candidate cache file paths for index file `url`:
// first "<url>.oft" next to the index file, then
// "<user cache dir>/sdcv/<basename of url>.oft". The user cache directory and
// its "sdcv" subdirectory are created with mode 0700 when they do not exist.
// NOTE(review): the early returns taken when a directory cannot be created or
// is not a directory are not visible in this excerpt, nor is the g_free() of
// `base` (g_path_get_basename() returns a newly allocated string) -- confirm.
619 strlist_t offset_index::get_cache_variant(const std::string& url)
622 res.push_back(url + ".oft");
623 if (!g_file_test(g_get_user_cache_dir(), G_FILE_TEST_EXISTS) &&
624 g_mkdir(g_get_user_cache_dir(), 0700) == -1)
627 std::string cache_dir = std::string(g_get_user_cache_dir()) + G_DIR_SEPARATOR_S + "sdcv";
629 if (!g_file_test(cache_dir.c_str(), G_FILE_TEST_EXISTS))
631 if (g_mkdir(cache_dir.c_str(), 0700) == -1)
634 else if (!g_file_test(cache_dir.c_str(), G_FILE_TEST_IS_DIR))
637 gchar *base = g_path_get_basename(url.c_str());
638 res.push_back(cache_dir + G_DIR_SEPARATOR_S + base + ".oft");
// Writes wordoffset to an offset cache file for index file `url`. The
// candidate paths from get_cache_variant() are tried in order; the file
// consists of CACHE_MAGIC (without terminating NUL) followed by the raw
// wordoffset array.
// NOTE(review): the continue/return statements and the fclose() are not
// visible in this excerpt. Also note that the message below prints `url`, not
// the cache path (*it) that was actually written -- confirm this is intended.
643 bool offset_index::save_cache(const std::string& url)
645 strlist_t vars = get_cache_variant(url);
646 for (strlist_t::const_iterator it = vars.begin(); it != vars.end(); ++it)
648 FILE *out = fopen(it->c_str(), "wb");
651 if (fwrite(CACHE_MAGIC, 1, strlen(CACHE_MAGIC), out) != strlen(CACHE_MAGIC))
653 if (fwrite(&wordoffset[0], sizeof(wordoffset[0]), wordoffset.size(), out) != wordoffset.size())
656 printf("save to cache %s\n", url.c_str());
// Prepares the index file `url` (wc entries, fsize bytes) for lookups.
// The page offset table is taken from a cache file when possible; otherwise
// the index is mapped once, scanned entry by entry to record the offset of
// every ENTR_PER_PAGE-th entry, and the result is saved as a new cache.
// Afterwards the file is opened for on-demand page reads and the keys used by
// lookup() (first, last, middle page and the very last entry) are cached.
// NOTE(review): `wc - 1` wraps around when wc is 0 (gulong); Dict::load_ifofile
// tests wordcount == 0 before this is reached, but the action taken there is
// not visible in this excerpt -- confirm.
662 bool offset_index::load(const std::string& url, gulong wc, gulong fsize)
// One offset per page plus one final element marking the end of the data.
665 gulong npages = (wc - 1) / ENTR_PER_PAGE + 2;
666 wordoffset.resize(npages);
667 if (!load_cache(url))
668 { //map file will close after finish of block
670 if (!map_file.open(url.c_str(), fsize))
672 const gchar *idxdatabuffer = map_file.begin();
674 const gchar *p1 = idxdatabuffer;
677 for (guint32 i = 0; i < wc; i++)
// An entry is the NUL-terminated key followed by two guint32 values.
679 index_size = strlen(p1) + 1 + 2 * sizeof(guint32);
680 if (i % ENTR_PER_PAGE == 0)
682 wordoffset[j] = p1 - idxdatabuffer;
// Final element: offset just past the last entry.
687 wordoffset[j] = p1 - idxdatabuffer;
// A failed cache update is reported but does not make load() fail here.
688 if (!save_cache(url))
689 fprintf(stderr, "cache update failed\n");
692 if (!(idxfile = fopen(url.c_str(), "rb")))
694 wordoffset.resize(0);
// wordoffset.size() - 2 is the index of the last page.
698 first.assign(0, read_first_on_page_key(0));
699 last.assign(wordoffset.size() - 2, read_first_on_page_key(wordoffset.size() - 2));
700 middle.assign((wordoffset.size() - 2) / 2, read_first_on_page_key((wordoffset.size() - 2) / 2));
701 real_last.assign(wc - 1, get_key(wc - 1));
// Makes page `page_idx` the currently decoded page and yields the number of
// entries on it: ENTR_PER_PAGE for every page except the last one, which holds
// wordcount % ENTR_PER_PAGE entries (or a full page when that remainder is 0).
// The page is only read from idxfile and decoded when it differs from the page
// that is already loaded.
// NOTE(review): the return statement is not visible; presumably nentr is
// returned -- confirm. The fread() result is not checked.
706 inline gulong offset_index::load_page(glong page_idx)
708 gulong nentr = ENTR_PER_PAGE;
709 if (page_idx == glong(wordoffset.size() - 2))
710 if ((nentr = wordcount % ENTR_PER_PAGE) == 0)
711 nentr = ENTR_PER_PAGE;
714 if (page_idx != page.idx)
716 page_data.resize(wordoffset[page_idx + 1] - wordoffset[page_idx]);
717 fseek(idxfile, wordoffset[page_idx], SEEK_SET);
718 fread(&page_data[0], 1, page_data.size(), idxfile);
719 page.fill(&page_data[0], nentr, page_idx);
// Returns the key of entry `idx` and, as a side effect, stores that entry's
// data offset and size in wordentry_offset / wordentry_size.
// The returned pointer refers to the currently loaded page and is only valid
// until another page is loaded.
725 const gchar *offset_index::get_key(glong idx)
727 load_page(idx / ENTR_PER_PAGE);
728 glong idx_in_page = idx % ENTR_PER_PAGE;
729 wordentry_offset = page.entries[idx_in_page].off;
730 wordentry_size = page.entries[idx_in_page].size;
732 return page.entries[idx_in_page].keystr;
// NOTE(review): the body is not visible in this excerpt. Presumably it
// delegates to get_key(idx), which already sets wordentry_offset and
// wordentry_size -- confirm.
735 void offset_index::get_data(glong idx)
// NOTE(review): the body is not visible in this excerpt. Presumably it returns
// get_key(idx), which also sets wordentry_offset and wordentry_size -- confirm.
740 const gchar *offset_index::get_key_and_data(glong idx)
// Searches the index for `str` using stardict_strcmp() ordering and reports a
// position through `idx`.
// The search has two stages: a binary search over the first keys of the pages
// to find the page that can contain `str`, then a binary search inside that
// page. Page-relative results are converted to entry indexes by multiplying
// the page index with ENTR_PER_PAGE.
// NOTE(review): the loop conditions, the assignments to idx in the
// out-of-range cases and the return statements are not visible in this
// excerpt; presumably the function returns true only on an exact match.
745 bool offset_index::lookup(const char *str, glong &idx)
// Index of the last page.
749 glong iTo = wordoffset.size() - 2;
// Quick rejects: str sorts before the first key or after the very last key.
752 if (stardict_strcmp(str, first.keystr.c_str()) < 0)
757 else if (stardict_strcmp(str, real_last.keystr.c_str()) > 0)
// Stage 1: binary search over pages.
768 iThisIndex = (iFrom + iTo) / 2;
769 cmpint = stardict_strcmp(str, get_first_on_page_key(iThisIndex));
771 iFrom = iThisIndex + 1;
773 iTo = iThisIndex - 1;
// Stage 2: binary search inside the selected page.
787 gulong netr = load_page(idx);
788 iFrom = 1; // Needn't search the first word anymore.
793 iThisIndex = (iFrom + iTo) / 2;
794 cmpint = stardict_strcmp(str, page.entries[iThisIndex].keystr);
796 iFrom = iThisIndex + 1;
798 iTo = iThisIndex - 1;
805 idx *= ENTR_PER_PAGE;
813 idx *= ENTR_PER_PAGE;
// Destructor of wordlist_index.
// NOTE(review): the body is not visible in this excerpt -- presumably it
// releases idxdatabuf; confirm.
818 wordlist_index::~wordlist_index()
// Loads the gzip-compressed index file `url` completely into memory.
// `fsize` is the size of the uncompressed index data and `wc` the number of
// entries. wordlist gets wc + 1 elements; the loop walks the wc entries, each
// being a NUL-terminated key followed by two guint32 values.
// NOTE(review): the error handling, gzclose(), and the assignments into
// wordlist (including the extra last element) are not visible in this excerpt.
823 bool wordlist_index::load(const std::string& url, gulong wc, gulong fsize)
825 gzFile in = gzopen(url.c_str(), "rb");
829 idxdatabuf = (gchar *)g_malloc(fsize);
831 gulong len = gzread(in, idxdatabuf, fsize);
837 wordlist.resize(wc + 1);
838 gchar *p1 = idxdatabuf;
840 for (i = 0; i < wc; i++)
843 p1 += strlen(p1) + 1 + 2 * sizeof(guint32);
// Returns the key of entry `idx`. No bounds check is made; idx must be a valid
// index into wordlist.
850 const gchar *wordlist_index::get_key(glong idx)
852 return wordlist[idx];
// Stores the data offset and size of entry `idx` in wordentry_offset and
// wordentry_size. Both values directly follow the key's terminating NUL and
// are stored big-endian, hence the g_ntohl() conversion.
855 void wordlist_index::get_data(glong idx)
857 gchar *p1 = wordlist[idx] + strlen(wordlist[idx]) + sizeof(gchar);
858 wordentry_offset = g_ntohl(*reinterpret_cast<guint32 *>(p1));
859 p1 += sizeof(guint32);
860 wordentry_size = g_ntohl(*reinterpret_cast<guint32 *>(p1));
// NOTE(review): the body is not visible in this excerpt. Presumably it calls
// get_data(idx) and returns get_key(idx) -- confirm.
863 const gchar *wordlist_index::get_key_and_data(glong idx)
// Binary search for `str` over the in-memory word list, using
// stardict_strcmp() ordering; a position is reported through `idx`.
// NOTE(review): the loop condition, the assignments to idx and the return
// statements are not visible in this excerpt; presumably the function returns
// true only on an exact match.
869 bool wordlist_index::lookup(const char *str, glong &idx)
// Index of the last real entry (wordlist has one extra trailing element).
872 glong iTo = wordlist.size() - 2;
// Quick rejects: str sorts before the first key or after the last key.
874 if (stardict_strcmp(str, get_key(0)) < 0)
878 else if (stardict_strcmp(str, get_key(iTo)) > 0)
884 glong iThisIndex = 0;
889 iThisIndex = (iFrom + iTo) / 2;
890 cmpint = stardict_strcmp(str, get_key(iThisIndex));
892 iFrom = iThisIndex + 1;
894 iTo = iThisIndex - 1;
909 //===================================================================
// Opens the dictionary described by `ifofilename`.
// The companion file names are derived by replacing the trailing "ifo" of the
// path: "<base>.dict.dz" is used when it exists, otherwise "<base>.dict";
// likewise "<base>.idx.gz" (loaded fully into memory by wordlist_index) is
// preferred over "<base>.idx" (read on demand by offset_index).
// NOTE(review): the `else` branches and the return statements are not visible
// in this excerpt; presumably every failure returns false.
910 bool Dict::load(const std::string& ifofilename)
913 if (!load_ifofile(ifofilename, idxfilesize))
916 std::string fullfilename(ifofilename);
// Replace the last three characters ("ifo") with "dict.dz".
917 fullfilename.replace(fullfilename.length() - sizeof("ifo") + 1, sizeof("ifo") - 1, "dict.dz");
919 if (g_file_test(fullfilename.c_str(), G_FILE_TEST_EXISTS))
921 dictdzfile.reset(new dictData);
922 if (!dictdzfile->open(fullfilename, 0))
924 //g_print("open file %s failed!\n",fullfilename);
// Drop the ".dz" suffix to get the name of the uncompressed data file.
930 fullfilename.erase(fullfilename.length() - sizeof(".dz") + 1, sizeof(".dz") - 1);
931 dictfile = fopen(fullfilename.c_str(), "rb");
934 //g_print("open file %s failed!\n",fullfilename);
939 fullfilename = ifofilename;
940 fullfilename.replace(fullfilename.length() - sizeof("ifo") + 1, sizeof("ifo") - 1, "idx.gz");
942 if (g_file_test(fullfilename.c_str(), G_FILE_TEST_EXISTS))
944 idx_file.reset(new wordlist_index);
// Drop the ".gz" suffix to get the name of the uncompressed index file.
948 fullfilename.erase(fullfilename.length() - sizeof(".gz") + 1, sizeof(".gz") - 1);
949 idx_file.reset(new offset_index);
952 if (!idx_file->load(fullfilename, wordcount, idxfilesize))
955 //g_print("bookname: %s , wordcount %lu\n", bookname.c_str(), narticles());
// Reads the .ifo file as an ordinary (non-tree) dictionary and copies the
// fields this Dict needs: the .ifo file name, word count, book name and
// sametypesequence. The index file size is handed back through `idxfilesize`.
// NOTE(review): the return statements are not visible in this excerpt;
// presumably a parse failure or a word count of 0 makes the function return
// false -- confirm.
959 bool Dict::load_ifofile(const std::string& ifofilename, gulong &idxfilesize)
962 if (!dict_info.load_from_ifo_file(ifofilename, false))
964 if (dict_info.wordcount == 0)
969 ifo_file_name = dict_info.ifo_file_name;
970 wordcount = dict_info.wordcount;
971 bookname = dict_info.bookname;
973 idxfilesize = dict_info.index_file_size;
975 sametypesequence = dict_info.sametypesequence;
// Collects the indexes of all keys of this dictionary that match the glob
// pattern `pspec`.
// aIndex must have room for iBuffLen elements. At most iBuffLen - 1 indexes
// are stored, and the list is always terminated with -1.
// Returns true when at least one key matched.
980 bool Dict::LookupWithRule(GPatternSpec *pspec, glong *aIndex, int iBuffLen)
984 for (guint32 i = 0; i < narticles() && iIndexCount < iBuffLen - 1; i++)
985 if (g_pattern_match_string(pspec, get_key(i)))
986 aIndex[iIndexCount++] = i;
988 aIndex[iIndexCount] = -1; // -1 is the end.
990 return (iIndexCount > 0);
993 //===================================================================
// Constructs an empty dictionary collection. The fuzzy-search distance limit
// starts at the compile-time default MAX_FUZZY_DISTANCE.
// NOTE(review): the handling of the progress callback `f` is on a line not
// visible in this excerpt.
994 Libs::Libs(progress_func_t f)
997 iMaxFuzzyDistance = MAX_FUZZY_DISTANCE; //need to read from cfg.
// Iterates over every dictionary in oLib.
// NOTE(review): neither the enclosing function's signature nor the loop body
// is visible in this excerpt; presumably this is the Libs destructor deleting
// each Dict -- confirm.
1002 for (std::vector<Dict *>::iterator p = oLib.begin(); p != oLib.end(); ++p)
// Creates a Dict for the .ifo file `url` and appends it to oLib.
// NOTE(review): the call that loads the dictionary and the handling of a
// failed load are not visible in this excerpt; presumably the Dict is only
// appended on success and deleted otherwise -- confirm.
1006 void Libs::load_dict(const std::string& url)
1008 Dict *lib = new Dict;
1010 oLib.push_back(lib);
// Callback object passed to for_each_file(); it keeps a reference to the Libs
// instance it works for and is invoked once per .ifo file.
// NOTE(review): the class header and the body of operator() are not visible in
// this excerpt; presumably it calls lib.load_dict(url) unless `disable` is
// set -- confirm.
1018 DictLoader(Libs& lib_): lib(lib_)
1020 void operator()(const std::string& url, bool disable)
// Scans the directories in dicts_dirs for ".ifo" files via for_each_file(),
// passing order_list and disable_list through to it.
// NOTE(review): the callback argument is on a line not visible in this
// excerpt; presumably it is a DictLoader bound to this object -- confirm.
1029 void Libs::load(const strlist_t& dicts_dirs,
1030 const strlist_t& order_list,
1031 const strlist_t& disable_list)
1033 for_each_file(dicts_dirs, ".ifo", order_list, disable_list,
// Callback object used by Libs::reload(). `prev` is the list of dictionaries
// loaded before the reload and `future` the list being built. For each .ifo
// file the already loaded Dict is looked up in `prev` by its .ifo file name
// and, when found, appended to `future`.
// NOTE(review): the class header, the branches of operator() and the rest of
// find() are not visible in this excerpt; presumably find() also removes the
// match from `prev`, and dictionaries not found are loaded afresh -- confirm.
1040 DictReLoader(std::vector<Dict *> &p, std::vector<Dict *> &f,
1041 Libs& lib_) : prev(p), future(f), lib(lib_)
1043 void operator()(const std::string& url, bool disable)
1047 Dict *dict = find(url);
1049 future.push_back(dict);
1055 std::vector<Dict *> &prev;
1056 std::vector<Dict *> &future;
// Returns the Dict in `prev` whose .ifo file name equals url.
1059 Dict *find(const std::string& url)
1061 std::vector<Dict *>::iterator it;
1062 for (it = prev.begin(); it != prev.end(); ++it)
1063 if ((*it)->ifofilename() == url)
1065 if (it != prev.end())
// Rebuilds oLib from the .ifo files found in dicts_dirs. A copy of the current
// list is kept in `prev` so that DictReLoader can reuse dictionaries that are
// already loaded.
// NOTE(review): the statement that empties oLib before the scan and the body
// of the final loop are not visible in this excerpt; presumably the loop
// deletes the dictionaries left over in `prev` -- confirm.
1075 void Libs::reload(const strlist_t& dicts_dirs,
1076 const strlist_t& order_list,
1077 const strlist_t& disable_list)
1079 std::vector<Dict *> prev(oLib);
1081 for_each_file(dicts_dirs, ".ifo", order_list, disable_list,
1082 DictReLoader(prev, oLib, *this));
1083 for (std::vector<Dict *>::iterator it = prev.begin(); it != prev.end(); ++it)
// Returns the smallest word (in stardict_strcmp() order) among the words the
// dictionaries currently point at. iCurrent holds one index per dictionary in
// oLib; entries that are INVALID_INDEX or out of range are tested separately.
// Returns NULL when no dictionary contributes a word.
// NOTE(review): the `continue` statements after the two range tests are not
// visible in this excerpt.
1087 const gchar *Libs::poGetCurrentWord(glong * iCurrent)
1089 const gchar *poCurrentWord = NULL;
1091 for (std::vector<Dict *>::size_type iLib = 0; iLib<oLib.size(); iLib++)
1093 if (iCurrent[iLib] == INVALID_INDEX)
1095 if ( iCurrent[iLib] >= narticles(iLib) || iCurrent[iLib] < 0)
1097 if ( poCurrentWord == NULL )
1099 poCurrentWord = poGetWord(iCurrent[iLib], iLib);
1103 word = poGetWord(iCurrent[iLib], iLib);
// Keep the smaller of the two words.
1105 if (stardict_strcmp(poCurrentWord, word) > 0 )
1106 poCurrentWord = word;
1109 return poCurrentWord;
// Advances the per-dictionary cursors in iCurrent to the word following the
// current one and returns that next word.
// When sWord is given, every dictionary is first positioned with Lookup();
// when it is NULL, the positions already stored in iCurrent are used.
// The smallest current word over all dictionaries is determined, the cursors
// of the dictionaries holding exactly that word are moved on, and the new
// smallest word is returned via poGetCurrentWord().
// NOTE(review): the return type line, several conditions and the increment
// statements are not visible in this excerpt.
1113 Libs::poGetNextWord(const gchar *sWord, glong *iCurrent)
1115 // the input can be:
1116 // (word,iCurrent),read word,write iNext to iCurrent,and return next word. used by TopWin::NextCallback();
1117 // (NULL,iCurrent),read iCurrent,write iNext to iCurrent,and return next word. used by AppCore::ListWords();
1118 const gchar *poCurrentWord = NULL;
1119 std::vector<Dict *>::size_type iCurrentLib = 0;
// Pass 1: find the smallest current word and the dictionary providing it.
1122 for (std::vector<Dict *>::size_type iLib = 0;iLib<oLib.size();iLib++)
1125 oLib[iLib]->Lookup(sWord, iCurrent[iLib]);
1126 if (iCurrent[iLib] == INVALID_INDEX)
1128 if (iCurrent[iLib] >= narticles(iLib) || iCurrent[iLib] < 0)
1130 if (poCurrentWord == NULL )
1132 poCurrentWord = poGetWord(iCurrent[iLib], iLib);
1137 word = poGetWord(iCurrent[iLib], iLib);
1139 if (stardict_strcmp(poCurrentWord, word) > 0 )
1141 poCurrentWord = word;
1148 iCurrent[iCurrentLib]
// Pass 2: handle the other dictionaries that point at the very same word.
1150 for (std::vector<Dict *>::size_type iLib = 0;iLib<oLib.size();iLib++)
1152 if (iLib == iCurrentLib)
1154 if (iCurrent[iLib] == INVALID_INDEX)
1156 if ( iCurrent[iLib] >= narticles(iLib) || iCurrent[iLib] < 0)
1158 if (strcmp(poCurrentWord, poGetWord(iCurrent[iLib], iLib)) == 0 )
1161 poCurrentWord = poGetCurrentWord(iCurrent);
1163 return poCurrentWord;
// Moves the per-dictionary cursors in iCurrent back to the word preceding the
// current one and returns that word.
// A cursor equal to INVALID_INDEX is treated as "past the end" and set to
// narticles(iLib). The candidate of each dictionary is the entry just before
// its cursor (iCurrent[iLib] - 1); the largest candidate in stardict_strcmp()
// order is selected. Cursors left at narticles(iLib) are set back to
// INVALID_INDEX at the end.
// NOTE(review): the return type line, the `continue` statements and the
// decrement statements are not visible in this excerpt.
1168 Libs::poGetPreWord(glong * iCurrent)
1170 // used by TopWin::PreviousCallback(); the iCurrent is cached by AppCore::TopWinWordChange();
1171 const gchar *poCurrentWord = NULL;
1172 std::vector<Dict *>::size_type iCurrentLib = 0;
// Pass 1: find the largest preceding word and the dictionary providing it.
1175 for (std::vector<Dict *>::size_type iLib = 0;iLib<oLib.size();iLib++)
1177 if (iCurrent[iLib] == INVALID_INDEX)
1178 iCurrent[iLib] = narticles(iLib);
1181 if ( iCurrent[iLib] > narticles(iLib) || iCurrent[iLib] <= 0)
1184 if ( poCurrentWord == NULL )
1186 poCurrentWord = poGetWord(iCurrent[iLib] - 1, iLib);
1191 word = poGetWord(iCurrent[iLib] - 1, iLib);
1192 if (stardict_strcmp(poCurrentWord, word) < 0 )
1194 poCurrentWord = word;
1202 iCurrent[iCurrentLib]
// Pass 2: handle the other dictionaries whose preceding word is the same.
1204 for (std::vector<Dict *>::size_type iLib = 0;iLib<oLib.size();iLib++)
1206 if (iLib == iCurrentLib)
1208 if (iCurrent[iLib] > narticles(iLib) || iCurrent[iLib] <= 0)
1210 if (strcmp(poCurrentWord, poGetWord(iCurrent[iLib] - 1, iLib)) == 0)
1216 if (iCurrent[iLib] == narticles(iLib))
1217 iCurrent[iLib] = INVALID_INDEX;
1221 return poCurrentWord;
// Tries to find a dictionary entry for a variant of sWord in dictionary iLib
// after the exact lookup has failed.
// Variants tried, in this order:
//   1. case variants: all lower-case, all upper-case, first letter upper-case
//      with the rest lower-case (UTF-8 aware);
//   2. for words accepted by bIsPureEnglish() only, the word with a common
//      suffix removed: "s"/"d", "ly", "ing" (also with "e" appended), "es",
//      "ed", "ied"/"ies" (replaced by "y"), "er", "est". For "ly", "ing" and
//      "ed" a doubled final consonant after a vowel is also reduced to one.
//      Where the suffix or the first letter is upper-case, the lower-cased
//      stem is tried as well.
// iWordIndex is only overwritten when a variant is found, so the caller keeps
// the position of the original lookup otherwise.
// NOTE(review): the assignments to bFound, the `else` lines, the g_free()
// calls for casestr/sNewWord/firstchar/nextchar and the return statement are
// not visible in this excerpt.
1224 bool Libs::LookupSimilarWord(const gchar* sWord, glong & iWordIndex, int iLib)
1227 bool bFound = false;
// Variant: all lower-case.
1233 casestr = g_utf8_strdown(sWord, -1);
1234 if (strcmp(casestr, sWord))
1236 if (oLib[iLib]->Lookup(casestr, iIndex))
// Variant: all upper-case.
1243 casestr = g_utf8_strup(sWord, -1);
1244 if (strcmp(casestr, sWord))
1246 if (oLib[iLib]->Lookup(casestr, iIndex))
1251 // Upper the first character and lower others.
1254 gchar *nextchar = g_utf8_next_char(sWord);
1255 gchar *firstchar = g_utf8_strup(sWord, nextchar - sWord);
1256 nextchar = g_utf8_strdown(nextchar, -1);
1257 casestr = g_strdup_printf("%s%s", firstchar, nextchar);
1260 if (strcmp(casestr, sWord))
1262 if (oLib[iLib]->Lookup(casestr, iIndex))
1269 if (bIsPureEnglish(sWord))
1271 // If not Found , try other status of sWord.
1272 int iWordLen = strlen(sWord);
// Work buffer for the stems; every stem built below is at most as long as
// sWord (suffixes are cut before a single character is appended).
1275 gchar *sNewWord = (gchar *)g_malloc(iWordLen + 1);
1277 //cut one char "s" or "d"
1278 if (!bFound && iWordLen > 1)
1280 isupcase = sWord[iWordLen - 1] == 'S' || !strncmp(&sWord[iWordLen - 2], "ED", 2);
1281 if (isupcase || sWord[iWordLen - 1] == 's' || !strncmp(&sWord[iWordLen - 2], "ed", 2))
1283 strcpy(sNewWord, sWord);
1284 sNewWord[iWordLen - 1] = '\0'; // cut "s" or "d"
1285 if (oLib[iLib]->Lookup(sNewWord, iIndex))
1287 else if (isupcase || g_ascii_isupper(sWord[0]))
1289 casestr = g_ascii_strdown(sNewWord, -1);
1290 if (strcmp(casestr, sNewWord))
1292 if (oLib[iLib]->Lookup(casestr, iIndex))
// Suffix "ly".
1301 if (!bFound && iWordLen > 2)
1303 isupcase = !strncmp(&sWord[iWordLen - 2], "LY", 2);
1304 if (isupcase || (!strncmp(&sWord[iWordLen - 2], "ly", 2)))
1306 strcpy(sNewWord, sWord);
1307 sNewWord[iWordLen - 2] = '\0'; // cut "ly"
// Doubled consonant preceded by a vowel: first try the stem with the
// duplicate letter removed.
1308 if (iWordLen > 5 && sNewWord[iWordLen - 3] == sNewWord[iWordLen - 4]
1309 && !bIsVowel(sNewWord[iWordLen - 4]) &&
1310 bIsVowel(sNewWord[iWordLen - 5]))
1313 sNewWord[iWordLen - 3] = '\0';
1314 if ( oLib[iLib]->Lookup(sNewWord, iIndex) )
1318 if (isupcase || g_ascii_isupper(sWord[0]))
1320 casestr = g_ascii_strdown(sNewWord, -1);
1321 if (strcmp(casestr, sNewWord))
1323 if (oLib[iLib]->Lookup(casestr, iIndex))
1329 sNewWord[iWordLen - 3] = sNewWord[iWordLen - 4]; //restore
1334 if (oLib[iLib]->Lookup(sNewWord, iIndex))
1336 else if (isupcase || g_ascii_isupper(sWord[0]))
1338 casestr = g_ascii_strdown(sNewWord, -1);
1339 if (strcmp(casestr, sNewWord))
1341 if (oLib[iLib]->Lookup(casestr, iIndex))
// Suffix "ing".
1351 if (!bFound && iWordLen > 3)
1353 isupcase = !strncmp(&sWord[iWordLen - 3], "ING", 3);
1354 if (isupcase || !strncmp(&sWord[iWordLen - 3], "ing", 3) )
1356 strcpy(sNewWord, sWord);
1357 sNewWord[iWordLen - 3] = '\0';
1358 if ( iWordLen > 6 && (sNewWord[iWordLen - 4] == sNewWord[iWordLen - 5])
1359 && !bIsVowel(sNewWord[iWordLen - 5]) &&
1360 bIsVowel(sNewWord[iWordLen - 6]))
1362 sNewWord[iWordLen - 4] = '\0';
1363 if (oLib[iLib]->Lookup(sNewWord, iIndex))
1367 if (isupcase || g_ascii_isupper(sWord[0]))
1369 casestr = g_ascii_strdown(sNewWord, -1);
1370 if (strcmp(casestr, sNewWord))
1372 if (oLib[iLib]->Lookup(casestr, iIndex))
1378 sNewWord[iWordLen - 4] = sNewWord[iWordLen - 5]; //restore
1383 if (oLib[iLib]->Lookup(sNewWord, iIndex))
1385 else if (isupcase || g_ascii_isupper(sWord[0]))
1387 casestr = g_ascii_strdown(sNewWord, -1);
1388 if (strcmp(casestr, sNewWord))
1390 if (oLib[iLib]->Lookup(casestr, iIndex))
// Stem of "...ing" with the dropped final "e" put back.
1399 strcat(sNewWord, "E"); // add a char "E"
1401 strcat(sNewWord, "e"); // add a char "e"
1402 if (oLib[iLib]->Lookup(sNewWord, iIndex))
1404 else if (isupcase || g_ascii_isupper(sWord[0]))
1406 casestr = g_ascii_strdown(sNewWord, -1);
1407 if (strcmp(casestr, sNewWord))
1409 if (oLib[iLib]->Lookup(casestr, iIndex))
// Suffix "es" after s, x, o, ch or sh.
1419 if (!bFound && iWordLen > 3)
1421 isupcase = (!strncmp(&sWord[iWordLen - 2], "ES", 2) &&
1422 (sWord[iWordLen - 3] == 'S' ||
1423 sWord[iWordLen - 3] == 'X' ||
1424 sWord[iWordLen - 3] == 'O' ||
1425 (iWordLen > 4 && sWord[iWordLen - 3] == 'H' &&
1426 (sWord[iWordLen - 4] == 'C' ||
1427 sWord[iWordLen - 4] == 'S'))));
1429 (!strncmp(&sWord[iWordLen - 2], "es", 2) &&
1430 (sWord[iWordLen - 3] == 's' || sWord[iWordLen - 3] == 'x' ||
1431 sWord[iWordLen - 3] == 'o' ||
1432 (iWordLen > 4 && sWord[iWordLen - 3] == 'h' &&
1433 (sWord[iWordLen - 4] == 'c' || sWord[iWordLen - 4] == 's')))))
1435 strcpy(sNewWord, sWord);
1436 sNewWord[iWordLen - 2] = '\0';
1437 if (oLib[iLib]->Lookup(sNewWord, iIndex))
1439 else if (isupcase || g_ascii_isupper(sWord[0]))
1441 casestr = g_ascii_strdown(sNewWord, -1);
1442 if (strcmp(casestr, sNewWord))
1444 if (oLib[iLib]->Lookup(casestr, iIndex))
// Suffix "ed".
1453 if (!bFound && iWordLen > 3)
1455 isupcase = !strncmp(&sWord[iWordLen - 2], "ED", 2);
1456 if (isupcase || !strncmp(&sWord[iWordLen - 2], "ed", 2))
1458 strcpy(sNewWord, sWord);
1459 sNewWord[iWordLen - 2] = '\0';
1460 if (iWordLen > 5 && (sNewWord[iWordLen - 3] == sNewWord[iWordLen - 4])
1461 && !bIsVowel(sNewWord[iWordLen - 4]) &&
1462 bIsVowel(sNewWord[iWordLen - 5]))
1464 sNewWord[iWordLen - 3] = '\0';
1465 if (oLib[iLib]->Lookup(sNewWord, iIndex))
1469 if (isupcase || g_ascii_isupper(sWord[0]))
1471 casestr = g_ascii_strdown(sNewWord, -1);
1472 if (strcmp(casestr, sNewWord))
1474 if (oLib[iLib]->Lookup(casestr, iIndex))
1480 sNewWord[iWordLen - 3] = sNewWord[iWordLen - 4]; //restore
1485 if (oLib[iLib]->Lookup(sNewWord, iIndex))
1487 else if (isupcase || g_ascii_isupper(sWord[0]))
1489 casestr = g_ascii_strdown(sNewWord, -1);
1490 if (strcmp(casestr, sNewWord))
1492 if (oLib[iLib]->Lookup(casestr, iIndex))
1501 // cut "ied" , add "y".
1502 if (!bFound && iWordLen > 3)
1504 isupcase = !strncmp(&sWord[iWordLen - 3], "IED", 3);
1505 if (isupcase || (!strncmp(&sWord[iWordLen - 3], "ied", 3)))
1507 strcpy(sNewWord, sWord);
1508 sNewWord[iWordLen - 3] = '\0';
1510 strcat(sNewWord, "Y"); // add a char "Y"
1512 strcat(sNewWord, "y"); // add a char "y"
1513 if (oLib[iLib]->Lookup(sNewWord, iIndex))
1515 else if (isupcase || g_ascii_isupper(sWord[0]))
1517 casestr = g_ascii_strdown(sNewWord, -1);
1518 if (strcmp(casestr, sNewWord))
1520 if (oLib[iLib]->Lookup(casestr, iIndex))
1528 // cut "ies" , add "y".
1529 if (!bFound && iWordLen > 3)
1531 isupcase = !strncmp(&sWord[iWordLen - 3], "IES", 3);
1532 if (isupcase || (!strncmp(&sWord[iWordLen - 3], "ies", 3)))
1534 strcpy(sNewWord, sWord);
1535 sNewWord[iWordLen - 3] = '\0';
1537 strcat(sNewWord, "Y"); // add a char "Y"
1539 strcat(sNewWord, "y"); // add a char "y"
1540 if (oLib[iLib]->Lookup(sNewWord, iIndex))
1542 else if (isupcase || g_ascii_isupper(sWord[0]))
1544 casestr = g_ascii_strdown(sNewWord, -1);
1545 if (strcmp(casestr, sNewWord))
1547 if (oLib[iLib]->Lookup(casestr, iIndex))
// Suffix "er".
1556 if (!bFound && iWordLen > 2)
1558 isupcase = !strncmp(&sWord[iWordLen - 2], "ER", 2);
1559 if (isupcase || (!strncmp(&sWord[iWordLen - 2], "er", 2)))
1561 strcpy(sNewWord, sWord);
1562 sNewWord[iWordLen - 2] = '\0';
1563 if (oLib[iLib]->Lookup(sNewWord, iIndex))
1565 else if (isupcase || g_ascii_isupper(sWord[0]))
1567 casestr = g_ascii_strdown(sNewWord, -1);
1568 if (strcmp(casestr, sNewWord))
1570 if (oLib[iLib]->Lookup(casestr, iIndex))
// Suffix "est".
1579 if (!bFound && iWordLen > 3)
1581 isupcase = !strncmp(&sWord[iWordLen - 3], "EST", 3);
1582 if (isupcase || (!strncmp(&sWord[iWordLen - 3], "est", 3)))
1584 strcpy(sNewWord, sWord);
1585 sNewWord[iWordLen - 3] = '\0';
1586 if (oLib[iLib]->Lookup(sNewWord, iIndex))
1588 else if (isupcase || g_ascii_isupper(sWord[0]))
1590 casestr = g_ascii_strdown(sNewWord, -1);
1591 if (strcmp(casestr, sNewWord))
1593 if (oLib[iLib]->Lookup(casestr, iIndex))
1605 iWordIndex = iIndex;
1610 //don't change iWordIndex here.
1611 //when LookupSimilarWord all failed too, we want to use the old LookupWord index to list words.
1612 //iWordIndex = INVALID_INDEX;
// Looks sWord up in dictionary iLib: first exactly, then through
// LookupSimilarWord().
// NOTE(review): the condition guarding the second call and the return
// statement are not visible in this excerpt; presumably the fallback only
// runs when the exact lookup failed, and bFound is returned -- confirm.
1618 bool Libs::SimpleLookupWord(const gchar* sWord, glong & iWordIndex, int iLib)
1620 bool bFound = oLib[iLib]->Lookup(sWord, iWordIndex);
1622 bFound = LookupSimilarWord(sWord, iWordIndex, iLib);
// Edit distance between the query and this candidate word; it is the primary
// sort key in operator< for Fuzzystruct.
// NOTE(review): the struct header and its pMatchWord member are on lines not
// visible in this excerpt.
1629 int iMatchWordDistance;
// Ordering for fuzzy-match results: a smaller edit distance sorts first; when
// the distances are equal and both entries hold a word, the words are compared
// with stardict_strcmp().
// NOTE(review): the result for equal distances when one of the words is NULL
// is returned on a line not visible in this excerpt.
1632 inline bool operator<(const Fuzzystruct & lh, const Fuzzystruct & rh)
1634 if (lh.iMatchWordDistance != rh.iMatchWordDistance)
1635 return lh.iMatchWordDistance < rh.iMatchWordDistance;
1637 if (lh.pMatchWord && rh.pMatchWord)
1638 return stardict_strcmp(lh.pMatchWord, rh.pMatchWord) < 0;
// Lower-cases a UCS-4 string in place, one code point at a time, with
// g_unichar_tolower().
// NOTE(review): the loop header is not visible in this excerpt; presumably it
// runs until the terminating 0 code point -- confirm.
1643 static inline void unicode_strdown(gunichar *str)
1647 *str = g_unichar_tolower(*str);
// Fuzzy search: scans every key of dictionary iLib and collects the
// reslist_size keys with the smallest edit distance to sWord.
// reslist must have room for reslist_size pointers. On return each element is
// either a g_strdup()ed word, which the caller owns, or NULL for an unused
// slot; when anything was found the results are sorted by distance.
// The comparison is case-insensitive (both sides are lower-cased as UCS-4),
// and a candidate longer than the query is truncated to the query's length
// before the distance is computed.
// NOTE(review): the return statements, the `continue` after the length filter,
// the g_free() calls for ucs4_str1/ucs4_str2 and the assignment to `Found` are
// not visible in this excerpt.
1652 bool Libs::LookupWithFuzzy(const gchar *sWord, gchar *reslist[], gint reslist_size, gint iLib)
1654 if (sWord[0] == '\0')
1657 Fuzzystruct *oFuzzystruct = new Fuzzystruct[reslist_size];
// Every slot starts empty, at the maximum allowed distance.
1659 for (int i = 0; i < reslist_size; i++)
1661 oFuzzystruct[i].pMatchWord = NULL;
1662 oFuzzystruct[i].iMatchWordDistance = iMaxFuzzyDistance;
// iMaxDistance is the largest distance currently held in the result slots; a
// candidate has to be strictly better than it to get in.
1664 int iMaxDistance = iMaxFuzzyDistance;
1667 EditDistance oEditDistance;
1669 glong iCheckWordLen;
1671 gunichar *ucs4_str1, *ucs4_str2;
1672 glong ucs4_str2_len;
// ucs4_str2 is the lower-cased query.
1674 ucs4_str2 = g_utf8_to_ucs4_fast(sWord, -1, &ucs4_str2_len);
1675 unicode_strdown(ucs4_str2);
1677 // for (std::vector<Dict *>::size_type iLib = 0; iLib<oLib.size(); iLib++)
1682 //if (stardict_strcmp(sWord, poGetWord(0,iLib))>=0 && stardict_strcmp(sWord, poGetWord(narticles(iLib)-1,iLib))<=0) {
1683 //there are Chinese dicts and English dicts...
1686 const int iwords = narticles(iLib);
1687 for (int index = 0; index < iwords; index++)
1689 sCheck = poGetWord(index, iLib);
1690 // tolower and skip too long or too short words
1691 iCheckWordLen = g_utf8_strlen(sCheck, -1);
1692 if (iCheckWordLen - ucs4_str2_len >= iMaxDistance ||
1693 ucs4_str2_len - iCheckWordLen >= iMaxDistance)
1695 ucs4_str1 = g_utf8_to_ucs4_fast(sCheck, -1, NULL);
// Compare only the leading ucs4_str2_len code points of a longer candidate.
1696 if (iCheckWordLen > ucs4_str2_len)
1697 ucs4_str1[ucs4_str2_len] = 0;
1698 unicode_strdown(ucs4_str1);
1700 iDistance = oEditDistance.CalEditDistance(ucs4_str1, ucs4_str2, iMaxDistance);
1702 if (iDistance < iMaxDistance && iDistance < ucs4_str2_len)
1704 // when ucs4_str2_len=1,2 we need less fuzzy.
1706 bool bAlreadyInList = false;
1707 int iMaxDistanceAt = 0;
// Look for a duplicate and for a slot holding the current worst distance.
1708 for (int j = 0; j < reslist_size; j++)
1710 if (oFuzzystruct[j].pMatchWord &&
1711 strcmp(oFuzzystruct[j].pMatchWord, sCheck) == 0 )
1713 bAlreadyInList = true;
1716 //find the position,it will certainly be found (include the first time) as iMaxDistance is set by last time.
1717 if (oFuzzystruct[j].iMatchWordDistance == iMaxDistance )
1722 if (!bAlreadyInList)
// Replace the worst entry with the new candidate.
1724 if (oFuzzystruct[iMaxDistanceAt].pMatchWord)
1725 g_free(oFuzzystruct[iMaxDistanceAt].pMatchWord);
1726 oFuzzystruct[iMaxDistanceAt].pMatchWord = g_strdup(sCheck);
1727 oFuzzystruct[iMaxDistanceAt].iMatchWordDistance = iDistance;
1728 // calc new iMaxDistance
1729 iMaxDistance = iDistance;
1730 for (int j = 0; j < reslist_size; j++)
1732 if (oFuzzystruct[j].iMatchWordDistance > iMaxDistance)
1733 iMaxDistance = oFuzzystruct[j].iMatchWordDistance;
1734 } // calc new iMaxDistance
1742 if (Found) // sort with distance
1743 std::sort(oFuzzystruct, oFuzzystruct + reslist_size);
// Hand the strings over to the caller; only the helper array is deleted here.
1745 for (gint i = 0; i < reslist_size; ++i)
1746 reslist[i] = oFuzzystruct[i].pMatchWord;
1748 delete[] oFuzzystruct;
// Strict-weak-ordering predicate for std::sort(): true when lh sorts before rh
// according to stardict_strcmp().
1753 inline bool less_for_compare(const char *lh, const char *rh)
1755 return stardict_strcmp(lh, rh) < 0;
// Glob search over all dictionaries: compiles `word` into a GPatternSpec, asks
// each dictionary for its matching entries and stores g_strdup()ed copies of
// the distinct matching words in ppMatchWord, sorted with stardict_strcmp().
// The caller owns the stored strings. Up to MAX_MATCH_ITEM_PER_LIB matches are
// taken from each dictionary, so ppMatchWord must be large enough for that
// many entries per dictionary.
// NOTE(review): the return statement is not visible in this excerpt;
// presumably iMatchCount is returned -- confirm.
1758 gint Libs::LookupWithRule(const gchar *word, gchar **ppMatchWord)
// Per-dictionary result buffer, terminated with -1 by Dict::LookupWithRule().
1760 glong aiIndex[MAX_MATCH_ITEM_PER_LIB + 1];
1761 gint iMatchCount = 0;
1762 GPatternSpec *pspec = g_pattern_spec_new(word);
1764 for (std::vector<Dict *>::size_type iLib = 0; iLib<oLib.size(); iLib++)
1766 //if(oLibs.LookdupWordsWithRule(pspec,aiIndex,MAX_MATCH_ITEM_PER_LIB+1-iMatchCount,iLib))
1767 // -iMatchCount,so save time,but may got less result and the word may repeat.
1770 LookupWithRule(pspec, aiIndex, MAX_MATCH_ITEM_PER_LIB + 1))
1774 for (int i = 0; aiIndex[i] != -1; i++)
1776 const gchar * sMatchWord = poGetWord(aiIndex[i], iLib);
// Skip words that an earlier dictionary has already contributed.
1777 bool bAlreadyInList = false;
1778 for (int j = 0; j < iMatchCount; j++)
1780 if (strcmp(ppMatchWord[j], sMatchWord) == 0)
1782 bAlreadyInList = true;
1786 if (!bAlreadyInList)
1787 ppMatchWord[iMatchCount++] = g_strdup(sMatchWord);
1791 g_pattern_spec_free(pspec);
1793 if (iMatchCount) // sort it.
1794 std::sort(ppMatchWord, ppMatchWord + iMatchCount, less_for_compare);
// Full-text search: splits sWord into search words and scans the article data
// of every dictionary that reports containSearchData().
// reslist is used as an array with one vector per dictionary in oLib; for each
// article accepted by Dict::SearchData() a g_strdup()ed copy of its key is
// appended to reslist[i]. The caller owns those strings.
// NOTE(review): the code that tokenizes sWord, the loop over the articles and
// the `break` in the final loop are not visible in this excerpt; with that
// `break` the function returns true when at least one dictionary produced a
// result -- confirm.
1799 bool Libs::LookupData(const gchar *sWord, std::vector<gchar *> *reslist)
1801 std::vector<std::string> SearchWords;
1802 std::string SearchWord;
1803 const char *p = sWord;
1829 if (!SearchWord.empty())
1831 SearchWords.push_back(SearchWord);
1841 if (!SearchWord.empty())
1843 SearchWords.push_back(SearchWord);
1846 if (SearchWords.empty())
// origin_data is one scratch buffer shared by all articles; it is grown with
// g_realloc() whenever an article is larger than any seen so far.
1849 guint32 max_size = 0;
1850 gchar *origin_data = NULL;
1851 for (std::vector<Dict *>::size_type i = 0; i<oLib.size(); ++i)
1854 containSearchData())
1858 const gulong iwords = narticles(i);
1860 guint32 offset, size;
1866 ->get_key_and_data(j, &key, &offset, &size);
1867 if (size > max_size)
1869 origin_data = (gchar *)g_realloc(origin_data, size);
1872 if (oLib[i]->SearchData(SearchWords, offset, size, origin_data))
1873 reslist[i].push_back(g_strdup(key));
1876 g_free(origin_data);
1878 std::vector<Dict *>::size_type i;
1879 for (i = 0; i<oLib.size(); ++i)
1880 if (!reslist[i].empty())
1883 return i != oLib.size();
1886 /**************************************************/
1887 query_t analyze_query(const char *s, std::string& res)
1906 bool regexp = false;
1909 for (; *p; res += *p, ++p)
1918 if (*p == '*' || *p == '?')