Set version to v1.1
[mverbiste] / verbiste / FrenchVerbDictionary.cpp
1 /*  $Id: FrenchVerbDictionary.cpp,v 1.51 2012/04/24 02:46:05 sarrazip Exp $
2     FrenchVerbDictionary.cpp - Dictionary of verbs and conjugation templates
3
4     verbiste - French conjugation system
5     Copyright (C) 2003-2010 Pierre Sarrazin <http://sarrazip.com/>
6
7     This program is free software; you can redistribute it and/or
8     modify it under the terms of the GNU General Public License
9     as published by the Free Software Foundation; either version 2
10     of the License, or (at your option) any later version.
11
12     This program is distributed in the hope that it will be useful,
13     but WITHOUT ANY WARRANTY; without even the implied warranty of
14     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15     GNU General Public License for more details.
16
17     You should have received a copy of the GNU General Public License
18     along with this program; if not, write to the Free Software
19     Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
20     02111-1307, USA.
21 */
22
23 #include "FrenchVerbDictionary.h"
24
25 #include <assert.h>
26 #include <iostream>
27 #include <errno.h>
28 #include <string.h>
29 #include <stdlib.h>
30 #include <sys/types.h>
31 #include <sys/stat.h>
32 #include <unistd.h>
33
34 using namespace std;
35 using namespace verbiste;
36
37
// Debug tracing switch: true when the TRACE environment variable exists
// (whatever its value), false otherwise.  Evaluated once, during static
// initialization.
static bool trace = (NULL != getenv("TRACE"));
39
40
41 class AutoDoc
42 {
43 public:
44     AutoDoc(xmlDocPtr d) : doc(d) {}
45     ~AutoDoc() { if (doc != NULL) xmlFreeDoc(doc); }
46     xmlDocPtr get() const { return doc; }
47     bool operator ! () const { return doc == NULL; }
48 private:
49     xmlDocPtr doc;
50
51     // Forbidden operations:
52     AutoDoc(const AutoDoc &);
53     AutoDoc &operator = (const AutoDoc &);
54 };
55
56
57 class AutoString
58 {
59 public:
60     AutoString(xmlChar *s) : str(s) {}
61     ~AutoString() { if (str != NULL) xmlFree(str); }
62     xmlChar *get() const { return str; }
63     bool operator ! () const { return str == NULL; }
64     size_t length() const { return str == NULL ? 0 : strlen((char *) str); }
65 private:
66     xmlChar *str;
67
68     // Forbidden operations:
69     AutoString(const AutoString &);
70     AutoString &operator = (const AutoString &);
71 };
72
73
74 inline
75 const xmlChar *
76 XMLCHAR(const char *s)
77 {
78     return (const xmlChar *) s;
79 }
80
81
82 inline
83 int
84 equal(const xmlChar *a, const char *b)
85 {
86     return xmlStrcmp(a, XMLCHAR(b)) == 0;
87 }
88
89
90 inline
91 int
92 different(const xmlChar *a, const char *b)
93 {
94     return !equal(a, b);
95 }
96
97
98 inline
99 xmlChar *
100 getProp(xmlNodePtr node, const char *propName)
101 {
102     return xmlGetProp(node, XMLCHAR(propName));
103 }
104
105
106 inline
107 xmlChar *
108 getString(xmlDocPtr doc, xmlNodePtr node)
109 {
110     return xmlNodeListGetString(doc, node, 1);
111 }
112
113
114 inline
115 string
116 operator + (const AutoString &a, const string &b)
117 {
118     return (char *) a.get() + b;
119 }
120
121
122 inline
123 string
124 operator + (const string &a, const AutoString &b)
125 {
126     return a + (char *) b.get();
127 }
128
129
130 inline
131 Mode
132 convertModeName(const xmlChar *modeName)
133 {
134     return FrenchVerbDictionary::convertModeName((char *) modeName);
135 }
136
137
138 inline
139 Tense
140 convertTenseName(const xmlChar *tenseName)
141 {
142     return FrenchVerbDictionary::convertTenseName((char *) tenseName);
143 }
144
145
// Replacement characters for Latin-1 codes 0xC0 through 0xFF, sixteen codes
// per row: entry [code - 0xC0] is the unaccented ASCII stand-in.
// A few entries are bogus translations, but those characters are not used
// in French.
//
static const char *accentRemovalTable =
            "AAAAAA_CEEEEIIII"    // 0xC0 .. 0xCF
            "DNOOOOOxOUUUUYbB"    // 0xD0 .. 0xDF
            "aaaaaa-ceeeeiiii"    // 0xE0 .. 0xEF
            "dnooooo/ouuuuyby";   // 0xF0 .. 0xFF


// Returns the table replacement for a character in the range 0xC0..0xFF;
// any character outside that range is returned unchanged.
// Only works on Latin-1 characters.
//
inline wchar_t
removeWideCharAccent(wchar_t c)
{
    const bool hasTableEntry = (c >= 0xC0 && c <= 0xFF);
    if (!hasTableEntry)
        return c;

    const unsigned char replacement =
                        (unsigned char) accentRemovalTable[c - 0xC0];
    return replacement;
}
165
166
167 string
168 FrenchVerbDictionary::removeUTF8Accents(const string &utf8String)
169 {
170     wstring result = utf8ToWide(utf8String);
171     for (size_t i = 0; i < result.length(); ++i)
172         result[i] = removeWideCharAccent(result[i]);
173     return wideToUTF8(result);
174 }
175
176
177 void
178 FrenchVerbDictionary::formUTF8UnaccentedVariants(const wstring &wideString,
179                                                 size_t index,
180                                                 vector<string> &utf8Variants)
181 {
182     for ( ; index < wideString.length(); ++index)
183     {
184         wchar_t ch = wideString[index];
185         wchar_t unacc = removeWideCharAccent(ch);
186         if (ch != unacc)
187         {
188             wstring copy = wideString;
189             copy[index] = unacc;
190             assert(copy.length() == wideString.length());
191
192             utf8Variants.push_back(wideToUTF8(copy));
193             formUTF8UnaccentedVariants(copy, index + 1, utf8Variants);
194         }
195     }
196 }
197
198
199 void
200 FrenchVerbDictionary::formUTF8UnaccentedVariants(const string &utf8String,
201                                                 size_t index,
202                                                 vector<string> &utf8Variants)
203 {
204     wstring wideString = utf8ToWide(utf8String);
205     formUTF8UnaccentedVariants(wideString, index, utf8Variants);
206 }
207
208
209 void
210 FrenchVerbDictionary::getXMLFilenames(string &conjFN, string &verbsFN, Language l)
211 {
212     const char *libdatadir = getenv("LIBDATADIR");
213     if (libdatadir == NULL)
214         libdatadir = LIBDATADIR;
215     string languageCode = getLanguageCode(l);
216     conjFN  = libdatadir + string("/") + "conjugation-" + languageCode + ".xml";
217     verbsFN = libdatadir + string("/") + "verbs-" + languageCode + ".xml";
218 }
219
220
221 //static
222 FrenchVerbDictionary::Language
223 FrenchVerbDictionary::parseLanguageCode(const std::string &twoLetterCode)
224 {
225     if (twoLetterCode == "fr")
226         return FRENCH;
227     if (twoLetterCode == "it")
228         return ITALIAN;
229     if (twoLetterCode == "el")
230         return GREEK;
231     return NO_LANGUAGE;
232 }
233
234
235 //static
236 std::string
237 FrenchVerbDictionary::getLanguageCode(Language l)
238 {
239     switch (l)
240     {
241     case NO_LANGUAGE: return "";
242     case FRENCH: return "fr";
243     case ITALIAN: return "it";
244     case GREEK: return "el";
245     }
246     return "";
247 }
248
249
250 FrenchVerbDictionary::FrenchVerbDictionary(
251                                 const string &conjugationFilename,
252                                 const string &verbsFilename,
253                                 bool includeWithoutAccents,
254                                 Language _lang)
255                                         throw (logic_error)
256   : conjugSys(),
257     knownVerbs(),
258     aspirateHVerbs(),
259     inflectionTable(),
260     wideToUTF8Conv((iconv_t) -1),
261     utf8ToWideConv((iconv_t) -1),
262     verbTrie(*this),
263     lang(_lang)
264 {
265     if (lang == NO_LANGUAGE)
266         throw logic_error("Invalid language code");
267     init(conjugationFilename, verbsFilename, includeWithoutAccents);
268 }
269
270
271 FrenchVerbDictionary::FrenchVerbDictionary(bool includeWithoutAccents)
272                                                 throw (std::logic_error)
273   : conjugSys(),
274     knownVerbs(),
275     aspirateHVerbs(),
276     inflectionTable(),
277     wideToUTF8Conv((iconv_t) -1),
278     utf8ToWideConv((iconv_t) -1),
279     verbTrie(*this),
280     lang(FRENCH)
281 {
282     string conjFN, verbsFN;
283     getXMLFilenames(conjFN, verbsFN, lang);
284
285     init(conjFN, verbsFN, includeWithoutAccents);
286 }
287
288
289 void
290 FrenchVerbDictionary::init(const string &conjugationFilename,
291                             const string &verbsFilename,
292                             bool includeWithoutAccents)
293                                         throw (logic_error)
294 {
295     wideToUTF8Conv = iconv_open("UTF-8", "WCHAR_T");
296     if (wideToUTF8Conv == (iconv_t) -1)
297         throw logic_error("conversion from wide characters to UTF-8 not supported");
298     utf8ToWideConv = iconv_open("WCHAR_T", "UTF-8");
299     if (utf8ToWideConv == (iconv_t) -1)
300         throw logic_error("conversion from UTF-8 to wide characters not supported");
301
302     #ifndef NDEBUG  // self-test for the wide character string conversions:
303     try
304     {
305         wstring w = utf8ToWide("ab");
306         assert(w.length() == 2);
307         assert(w[0] == 'a');
308         assert(w[1] == 'b');
309
310         const char u0[] = { '\xc3', '\xa2', 't', '\0' };  // 'a' with circumflex accent
311         w = utf8ToWide(u0);
312         assert(w.length() == 2);
313         assert(w[0] == 0xe2);
314         assert(w[1] == 't');
315
316         const char u1[] = { 't', '\xc3', '\xa2', '\0' };  // 'a' with circumflex accent
317         w = utf8ToWide(u1);
318         assert(w.length() == 2);
319         assert(w[0] == 't');
320         assert(w[1] == 0xe2);
321     }
322     catch (int e)
323     {
324         throw logic_error("self-test of utf8ToWide() failed");
325     }
326
327     try
328     {
329         string u = wideToUTF8(L"ab");
330         assert(u.length() == 2);
331         assert(u[0] == 'a');
332         assert(u[1] == 'b');
333     }
334     catch (int e)
335     {
336         throw logic_error("self-test of wideToUTF8() failed");
337     }
338     #endif  // ndef NDEBUG
339
340
341     {
342         for (int i = 0; i < 0xC0; i++)
343             latin1TolowerTable[i] = char(tolower(char(i)));
344         for (int i = 0xC0; i < 0xE0; i++)
345             latin1TolowerTable[i] = char(i + 0x20);
346         for (int i = 0xE0; i < 0x100; i++)
347             latin1TolowerTable[i] = char(i);
348     }
349
350     loadConjugationDatabase(conjugationFilename.c_str(), includeWithoutAccents);
351     loadVerbDatabase(verbsFilename.c_str(), includeWithoutAccents);
352
353     // Load additional verbs from $HOME/.verbiste/verbs-<lang>.xml, if present.
354     //
355     const char *home = getenv("HOME");
356     if (home != NULL)  // do nothing if $HOME not defined
357     {
358         string otherVerbsFilename = string(home) + "/.verbiste/verbs-" + getLanguageCode(lang) + ".xml";
359         struct stat statbuf;
360         if (stat(otherVerbsFilename.c_str(), &statbuf) == 0)  // if file exists
361         {
362             //cout << "otherVerbsFilename=" << otherVerbsFilename << endl;
363             loadVerbDatabase(otherVerbsFilename.c_str(), includeWithoutAccents);
364         }
365     }
366
367     if (trace)
368         cout << "FrenchVerbDictionary::init: trie takes "
369              << verbTrie.computeMemoryConsumption() << " bytes\n";
370 }
371
372
373 void
374 FrenchVerbDictionary::loadConjugationDatabase(
375                                 const char *conjugationFilename,
376                                 bool includeWithoutAccents)
377                                         throw (logic_error)
378 {
379     if (conjugationFilename == NULL)
380         throw invalid_argument("conjugationFilename");
381
382     AutoDoc conjDoc(xmlParseFile(conjugationFilename));
383     if (!conjDoc)
384         throw logic_error("could not parse " + string(conjugationFilename));
385
386     readConjugation(conjDoc.get(), includeWithoutAccents);
387 }
388
389
390 void
391 FrenchVerbDictionary::loadVerbDatabase(
392                                 const char *verbsFilename,
393                                 bool includeWithoutAccents)
394                                         throw (logic_error)
395 {
396     if (verbsFilename == NULL)
397         throw invalid_argument("verbsFilename");
398
399     AutoDoc verbsDoc(xmlParseFile(verbsFilename));
400     if (!verbsDoc)
401         throw logic_error("could not parse " + string(verbsFilename));
402
403     readVerbs(verbsDoc.get(), includeWithoutAccents);
404 }
405
406
407 void
408 FrenchVerbDictionary::readConjugation(xmlDocPtr doc, bool includeWithoutAccents) throw(logic_error)
409 {
410     const bool isItalian = (lang == ITALIAN);
411
412     xmlNodePtr rootNodePtr = xmlDocGetRootElement(doc);
413
414     if (rootNodePtr == NULL)
415         throw logic_error("empty conjugation document");
416
417     string langCode = getLanguageCode(lang);
418     if (different(rootNodePtr->name, ("conjugation-" + langCode).c_str()))
419     {
420         string msg = "wrong top node in conjugation document: got "
421                      + string((const char *) rootNodePtr->name)
422                      + ", expected conjugation-" + langCode;
423         throw logic_error(msg);
424     }
425
426     for (xmlNodePtr templ = rootNodePtr->xmlChildrenNode;
427                         templ != NULL;
428                         templ = templ->next)
429     {
430         if (different(templ->name, "template"))  // ignore junk between tags
431             continue;
432
433         string tname = getUTF8XmlProp(templ, "name");
434         if (tname.empty())
435             throw logic_error("missing template name attribute");
436
437         // The template name is the root and the termination,
438         // with a colon in between.  For example, "pla:cer".
439
440         if (tname.find(':') == string::npos)
441             throw logic_error("missing colon in template name");
442
443         // The use of the [] operator creates an empty conjugation
444         // template spec, to which we keep a reference:
445
446         TemplateSpec &theTemplateSpec = conjugSys[tname];
447
448         // Same idea:
449
450         TemplateInflectionTable &ti = inflectionTable[tname];
451
452         // For each mode (e.g., infinitive, indicative, conditional, etc):
453         for (xmlNodePtr mode = templ->xmlChildrenNode;
454                             mode != NULL;
455                             mode = mode->next)
456         {
457             if (equal(mode->name, "text") || equal(mode->name, "comment"))  // any text in this node is ignored
458                 continue;
459
460             if (trace) cout << "readConjugation: mode node: '" << mode->name << "'" << endl;
461             Mode theMode = ::convertModeName(mode->name);
462             ModeSpec &theModeSpec = theTemplateSpec[theMode];
463
464             // For each tense in the mode:
465             for (xmlNodePtr tense = mode->xmlChildrenNode;
466                             tense != NULL;
467                                 tense = tense->next)
468             {
469                 if (equal(tense->name, "text") || equal(tense->name, "comment"))
470                     continue;
471
472                 Tense theTense = ::convertTenseName(tense->name);
473                 TenseSpec &theTenseSpec = theModeSpec[theTense];
474
475                 // For each person in the tense:
476                 int personCounter = 0;
477                 for (xmlNodePtr person = tense->xmlChildrenNode;
478                                 person != NULL;
479                                 person = person->next)
480                 {
481                     if (different(person->name, "p"))
482                         continue;
483
484                     personCounter++;
485
486                     theTenseSpec.push_back(PersonSpec());
487                     PersonSpec &thePersonSpec = theTenseSpec.back();
488
489                     // For each variant for this person:
490                     // (Note that most persons of most verbs have only
491                     // one variant.)
492                     for (xmlNodePtr inf = person->xmlChildrenNode;
493                                     inf != NULL;
494                                     inf = inf->next)
495                     {
496                         string variant = getUTF8XmlNodeText(
497                                                     doc, inf->xmlChildrenNode);
498                         thePersonSpec.push_back(InflectionSpec(variant, true));
499
500                         ModeTensePersonNumber mtpn(
501                                 reinterpret_cast<const char *>(mode->name),
502                                 reinterpret_cast<const char *>(tense->name),
503                                 personCounter,
504                                 true,
505                                 isItalian);
506                         ti[variant].push_back(mtpn);
507
508                         if (includeWithoutAccents)
509                         {
510                             // Also include versions where some or all accents are missing.
511                             vector<string> unaccentedVariants;
512                             formUTF8UnaccentedVariants(variant, 0, unaccentedVariants);
513                             for (vector<string>::const_iterator it = unaccentedVariants.begin();
514                                                                 it != unaccentedVariants.end(); ++it)
515                             {
516                                 thePersonSpec.push_back(InflectionSpec(*it, false));
517                                 mtpn.correct = false;  // 'false' marks this spelling as incorrect.
518                                 ti[*it].push_back(mtpn);
519                             }
520                         }
521                     }
522                 }
523             }
524         }
525     }
526 }
527
528
529 string
530 FrenchVerbDictionary::getUTF8XmlNodeText(xmlDocPtr doc, xmlNodePtr node)
531                                                                 throw(int)
532 {
533     xmlChar *s = getString(doc, node);
534     if (s == NULL)
535         return string();
536     return reinterpret_cast<char *>(s);
537 }
538
539
540 string
541 FrenchVerbDictionary::getUTF8XmlProp(xmlNodePtr node, const char *propName)
542                                                                 throw(int)
543 {
544     xmlChar *s = getProp(node, propName);
545     if (s == NULL)
546         return string();
547     return reinterpret_cast<char *>(s);
548 }
549
550
551 // Reads the given XML document and adds data to members knownVerbs,
552 // aspirateHVerbs and verbTrie.
553 //
554 void
555 FrenchVerbDictionary::readVerbs(xmlDocPtr doc,
556                                 bool includeWithoutAccents)
557                                                 throw(logic_error)
558 {
559     if (trace)
560         cout << "readVerbs: start: includeWithoutAccents=" << includeWithoutAccents << endl;
561
562     xmlNodePtr rootNodePtr = xmlDocGetRootElement(doc);
563
564     if (rootNodePtr == NULL)
565         throw logic_error("empty verbs document");
566
567     string langCode = getLanguageCode(lang);
568     if (different(rootNodePtr->name, ("verbs-" + langCode).c_str()))
569         throw logic_error("wrong top node in verbs document");
570
571     for (xmlNodePtr v = rootNodePtr->xmlChildrenNode; v != NULL; v = v->next)
572     {
573         if (equal(v->name, "text") || equal(v->name, "comment"))
574             continue;
575
576         xmlNodePtr i = v->xmlChildrenNode;
577         if (i == NULL || i->xmlChildrenNode == NULL)
578             throw logic_error("missing <i> node");
579
580         string utf8Infinitive = getUTF8XmlNodeText(doc, i->xmlChildrenNode);
581         wstring wideInfinitive = utf8ToWide(utf8Infinitive);
582         if (wideInfinitive.empty())
583             throw logic_error("empty <i> node");
584         size_t lenInfinitive = wideInfinitive.length();
585         if (trace) cout << "utf8Infinitive='" << utf8Infinitive << "'\n";
586
587         if (i->next == NULL)
588             throw logic_error("unexpected end after <i> node");
589
590         xmlNodePtr t = i->next->next;
591         if (t == NULL)
592             throw logic_error("missing <t> node");
593
594         #if 0
595         cout << "t=" << t << ", t->xmlChildrenNode=" << t->xmlChildrenNode << "\n";
596         if (t->xmlChildrenNode == NULL)
597             cout << "  t->next=" << t->next << ", " << (t->next ? getUTF8XmlNodeText(doc, t->next->xmlChildrenNode) : 0) << endl;
598         #endif
599
600         // Get template name (e.g., "aim:er") in UTF-8.
601         string utf8TName = getUTF8XmlNodeText(doc, t->xmlChildrenNode);
602         if (utf8TName.empty())
603             throw logic_error("empty <t> node");
604         if (trace) cout << "  utf8TName='" << utf8TName << "'\n";
605
606         // Check that this template name (seen in verbs-*.xml) has been
607         // seen in conjugation-*.xml.
608         //
609         if (conjugSys.find(utf8TName) == conjugSys.end())
610             throw logic_error("unknown template name: " + utf8TName);
611
612         // Find the offset of the colon in the template name.
613         // For example: the offset is 3 in the case of "aim:er".
614         // Find this offset in a wide character string, because
615         // the offset in a UTF-8 string is in bytes, not characters.
616         //
617         wstring wideTName = utf8ToWide(utf8TName);
618         wstring::size_type posColon = wideTName.find(':');
619         if (posColon == wstring::npos)
620             throw logic_error("missing colon in <t> node");
621         assert(wideTName[posColon] == ':');
622
623
624         knownVerbs[utf8Infinitive].insert(utf8TName);
625
626         if (includeWithoutAccents)
627         {
628             // Also include versions where some of all accents are missing.
629             vector<string> unaccentedVariants;
630             formUTF8UnaccentedVariants(wideInfinitive, 0, unaccentedVariants);
631             for (vector<string>::const_iterator it = unaccentedVariants.begin();
632                                                 it != unaccentedVariants.end(); ++it)
633             {
634                 if (trace) cout << "  unaccvar: '" << *it << "'\n";
635                 knownVerbs[*it].insert(utf8TName);
636             }
637         }
638
639         // <aspirate-h>: If this verb starts with an aspirate h, remember it:
640         if (t->next != NULL && t->next->next != NULL)
641             aspirateHVerbs.insert(utf8Infinitive);
642
643         // Insert the verb in the trie.
644         // A list of template names is associated to each verb in this trie.
645
646         size_t lenTermination = wideTName.length() - posColon - 1;
647         assert(lenTermination > 0);
648         assert(lenInfinitive >= lenTermination);
649
650         wstring wideVerbRadical(wideInfinitive, 0, lenInfinitive - lenTermination);
651         string utf8VerbRadical = wideToUTF8(wideVerbRadical);
652
653         insertVerbRadicalInTrie(utf8VerbRadical, utf8TName, utf8VerbRadical);
654
655         if (includeWithoutAccents)
656         {
657             // Also include versions where some of all accents are missing.
658             vector<string> unaccentedVariants;
659             formUTF8UnaccentedVariants(wideVerbRadical, 0, unaccentedVariants);
660             for (vector<string>::const_iterator it = unaccentedVariants.begin();
661                                                 it != unaccentedVariants.end(); ++it)
662             {
663                 insertVerbRadicalInTrie(*it, utf8TName, utf8VerbRadical);  // pass correct verb radical as 3rd argument
664             }
665         }
666     }
667
668     if (trace)
669         cout << "Number of known verbs (lang " << langCode << "): " << knownVerbs.size() << endl;
670 }
671
672
673 // String parameters expected to be in UTF-8.
674 // Adds to 'verbTrie', which contains wide character strings.
675 //
676 void
677 FrenchVerbDictionary::insertVerbRadicalInTrie(
678                                     const std::string &verbRadical,
679                                     const std::string &tname,
680                                     const std::string &correctVerbRadical)
681 {
682     wstring wideVerbRadical = utf8ToWide(verbRadical);
683     if (trace)
684         cout << "insertVerbRadicalInTrie('"
685               << verbRadical << "' (len=" << wideVerbRadical.length()
686               << "), '" << tname
687               << "', '" << correctVerbRadical
688               << "')\n";
689
690     vector<TrieValue> **templateListPtr =
691                             verbTrie.getUserDataPointer(wideVerbRadical);
692     assert(templateListPtr != NULL);
693
694     // If a new entry was created for 'wideVerbRadical', then the associated
695     // user data pointer is null.  Make this pointer point to a new,
696     // empty vector of template names.
697     //
698     if (*templateListPtr == NULL)
699         *templateListPtr = new vector<TrieValue>();
700
701     // Associate the given template name to the given verb radical.
702     //
703     (*templateListPtr)->push_back(TrieValue(tname, correctVerbRadical));
704 }
705
706
707 FrenchVerbDictionary::~FrenchVerbDictionary()
708 {
709     iconv_close(utf8ToWideConv);
710     iconv_close(wideToUTF8Conv);
711 }
712
713
714 const TemplateSpec *
715 FrenchVerbDictionary::getTemplate(const string &templateName) const
716 {
717     ConjugationSystem::const_iterator it = conjugSys.find(templateName);
718     if (it == conjugSys.end())
719         return NULL;
720     return &it->second;
721 }
722
723
724 ConjugationSystem::const_iterator
725 FrenchVerbDictionary::beginConjugSys() const
726 {
727     return conjugSys.begin();
728 }
729
730
731 ConjugationSystem::const_iterator
732 FrenchVerbDictionary::endConjugSys() const
733 {
734     return conjugSys.end();
735 }
736
737
738 const std::set<std::string> &
739 FrenchVerbDictionary::getVerbTemplateSet(const char *infinitive) const
740 {
741     static const std::set<std::string> emptySet;
742     if (infinitive == NULL)
743         return emptySet;
744     VerbTable::const_iterator it = knownVerbs.find(infinitive);
745     if (it == knownVerbs.end())
746         return emptySet;
747     return it->second;
748 }
749
750
751 const std::set<std::string> &
752 FrenchVerbDictionary::getVerbTemplateSet(const string &infinitive) const
753 {
754     return getVerbTemplateSet(infinitive.c_str());
755 }
756
757
758 VerbTable::const_iterator
759 FrenchVerbDictionary::beginKnownVerbs() const
760 {
761     return knownVerbs.begin();
762 }
763
764
765 VerbTable::const_iterator
766 FrenchVerbDictionary::endKnownVerbs() const
767 {
768     return knownVerbs.end();
769 }
770
771
772 const std::vector<ModeTensePersonNumber> *
773 FrenchVerbDictionary::getMTPNForInflection(
774                                 const std::string &templateName,
775                                 const std::string &inflection) const
776 {
777     InflectionTable::const_iterator i = inflectionTable.find(templateName);
778     if (i == inflectionTable.end())
779         return NULL;
780     const TemplateInflectionTable &ti = i->second;
781     TemplateInflectionTable::const_iterator j = ti.find(inflection);
782     if (j == ti.end())
783         return NULL;
784     return &j->second;
785 }
786
787
788 /*static*/
789 Mode
790 FrenchVerbDictionary::convertModeName(const char *modeName)
791 {
792     Mode mode = INVALID_MODE;
793     if (modeName == NULL)
794         ;
795     else if (strcmp(modeName, "infinitive") == 0)
796         mode = INFINITIVE_MODE;
797     else if (strcmp(modeName, "indicative") == 0)
798         mode = INDICATIVE_MODE;
799     else if (strcmp(modeName, "conditional") == 0)
800         mode = CONDITIONAL_MODE;
801     else if (strcmp(modeName, "subjunctive") == 0)
802         mode = SUBJUNCTIVE_MODE;
803     else if (strcmp(modeName, "imperative") == 0)
804         mode = IMPERATIVE_MODE;
805     else if (strcmp(modeName, "participle") == 0)
806         mode = PARTICIPLE_MODE;
807     else if (strcmp(modeName, "gerund") == 0)
808         mode = GERUND_MODE;
809     else if (strcmp(modeName, "present-indicative") == 0)
810         mode = PRESENT_INDICATIVE;
811     else if (strcmp(modeName, "present-subjunctive") == 0)
812         mode = PRESENT_SUBJUNCTIVE;
813     else if (strcmp(modeName, "present-imperative") == 0)
814         mode = PRESENT_IMPERATIVE;
815     else if (strcmp(modeName, "present-gerund") == 0)
816         mode = PRESENT_GERUND;
817     else if (strcmp(modeName, "past-imperfect-indicative") == 0)
818         mode = PAST_IMPERFECT_INDICATIVE;
819     else if (strcmp(modeName, "past-perfect-indicative") == 0)
820         mode = PAST_PERFECT_INDICATIVE;
821     else if (strcmp(modeName, "past-perfect-subjunctive") == 0)
822         mode = PAST_PERFECT_SUBJUNCTIVE;
823     else if (strcmp(modeName, "past-perfect-imperative") == 0)
824         mode = PAST_PERFECT_IMPERATIVE;
825     else if (strcmp(modeName, "past-perfect-infinitive") == 0)
826         mode = PAST_PERFECT_INFINITIVE;
827
828     if (mode == INVALID_MODE)
829     {
830         if (trace) cout << "modeName='" << modeName << "'" << endl;
831         assert(!"Invalid mode");
832     }
833
834     return mode;
835 }
836
837
838 /*static*/
839 Tense
840 FrenchVerbDictionary::convertTenseName(const char *tenseName)
841 {
842     Tense tense = INVALID_TENSE;
843     if (tenseName == NULL)
844         ;
845     else if (strcmp(tenseName, "infinitive-present") == 0)
846         tense = PRESENT_TENSE;
847     else if (strcmp(tenseName, "present") == 0)
848         tense = PRESENT_TENSE;
849     else if (strcmp(tenseName, "imperfect") == 0)
850         tense = IMPERFECT_TENSE;
851     else if (strcmp(tenseName, "future") == 0)
852         tense = FUTURE_TENSE;
853     else if (strcmp(tenseName, "simple-past") == 0)
854         tense = PAST_TENSE;
855     else if (strcmp(tenseName, "imperative-present") == 0)
856         tense = PRESENT_TENSE;
857     else if (strcmp(tenseName, "present-participle") == 0)
858         tense = PRESENT_TENSE;
859     else if (strcmp(tenseName, "past-participle") == 0)
860         tense = PAST_TENSE;
861     else if (strcmp(tenseName, "past") == 0)
862         tense = PAST_TENSE;
863     else if (strcmp(tenseName, "present-gerund") == 0)
864         tense = PRESENT_TENSE;
865     else if (strcmp(tenseName, "active") == 0)
866         tense = ACTIVE_TENSE;
867     else if (strcmp(tenseName, "passive") == 0)
868         tense = PASSIVE_TENSE;
869     else if (strcmp(tenseName, "imp-active") == 0)
870         tense = IMPERATIVE_ACTIVE_TENSE;
871     else if (strcmp(tenseName, "imp-passive") == 0)
872         tense = IMPERATIVE_PASSIVE_TENSE;
873     else if (strcmp(tenseName, "past-perfect") == 0)
874         tense = PAST_PERFECT;
875
876     if (tense == INVALID_TENSE)
877     {
878         if (trace) cout << "tenseName='" << tenseName << "'" << endl;
879         assert(!"Invalid tense");
880     }
881
882     return tense;
883 }
884
885
886 void
887 FrenchVerbDictionary::deconjugate(const string &utf8ConjugatedVerb,
888                                 std::vector<InflectionDesc> &results)
889 {
890     verbTrie.setDestination(&results);
891
892     try
893     {
894         wstring w = utf8ToWide(utf8ConjugatedVerb);
895         (void) verbTrie.get(w);
896     }
897     catch (int e)  // exception throw by utf8towide()
898     {
899         // Wrong encoding (possibly Latin-1). Act as with unknown verb.
900     }
901
902     verbTrie.setDestination(NULL);
903 }
904
905
906 /*virtual*/
907 void
908 FrenchVerbDictionary::VerbTrie::onFoundPrefixWithUserData(
909                         const wstring &conjugatedVerb,
910                         wstring::size_type index,
911                         const vector<TrieValue> *templateList) const throw()
912 {
913     assert(templateList != NULL);
914     if (trace)
915         wcout << "VerbTrie::onFoundPrefixWithUserData: start: conjugatedVerb='"
916               << conjugatedVerb << "', index=" << index
917               << ", templateList: " << templateList->size()
918               << ", results=" << results << endl;
919
920     if (results == NULL)
921         return;
922
923     const wstring term(conjugatedVerb, index);
924     const string utf8Term = fvd.wideToUTF8(term);
925
926     if (trace)
927         cout << "  utf8Term='" << utf8Term << "'\n";
928
929     /*
930         'templateList' contains the names of conjugated templates that might
931         apply to the conjugated verb.  We check each of them to see if there
932         is one that accepts the given termination 'term'.
933     */
934     for (vector<TrieValue>::const_iterator i = templateList->begin();
935                                            i != templateList->end(); i++)
936     {
937         const TrieValue &trieValue = *i;
938         const string &tname = trieValue.templateName;
939         const TemplateInflectionTable &ti =
940                                 fvd.inflectionTable.find(tname)->second;
941         TemplateInflectionTable::const_iterator j = ti.find(utf8Term);
942         if (trace)
943             cout << "    tname='" << tname << "'\n";
944         if (j == ti.end())
945             continue;  // template 'tname' does not accept termination 'term'
946
947         // template 'tname' accepts 'term', so we produce some results.
948
949         string templateTerm(tname, tname.find(':') + 1);
950             // termination of the infinitive form
951         if (trace)
952             cout << "    templateTerm='" << templateTerm << "'\n";
953
954         const vector<ModeTensePersonNumber> &v = j->second;
955             // list of mode-tense-person combinations that can correspond
956             // to the conjugated verb's termination
957
958         for (vector<ModeTensePersonNumber>::const_iterator k = v.begin();
959                                                     k != v.end(); k++)
960         {
961             const ModeTensePersonNumber &mtpn = *k;
962
963             string infinitive = trieValue.correctVerbRadical + templateTerm;
964                 // The infinitive of the conjugated verb is formed from its
965                 // (correct) radical part and from the termination of the template name.
966                 // Correct means with the proper accents. This allows the user
967                 // to type "etaler" without the acute accent on the first "e"
968                 // and obtain the conjugation for the correct verb, which has
969                 // that accent.
970
971             if (trace)
972             {
973                 const wstring radical(conjugatedVerb, 0, index);
974                 cout << "VerbTrie::onFoundPrefixWithUserData: radical='"
975                     << fvd.wideToUTF8(radical) << "', templateTerm='" << templateTerm
976                     << "', tname='" << tname
977                     << "', correctVerbRadical='" << trieValue.correctVerbRadical
978                     << "', mtpn=("
979                     << mtpn.mode << ", "
980                     << mtpn.tense << ", "
981                     << (unsigned) mtpn.person << ", "
982                     << mtpn.plural << ", "
983                     << mtpn.correct << ")\n";
984             }
985
986             results->push_back(InflectionDesc(infinitive, tname, mtpn));
987                 // the InflectionDesc object is an analysis of the
988                 // conjugated verb
989         }
990     }
991 }
992
993
994 /*static*/
995 const char *
996 FrenchVerbDictionary::getModeName(Mode m)
997 {
998     if (int(m) < int(INFINITIVE_MODE) || int(m) > int(PAST_PERFECT_INFINITIVE))
999     {
1000         assert(!"FrenchVerbDictionary::getModeName() received invalid Mode value");
1001         return NULL;
1002     }
1003
1004     static const char *names[] =
1005     {
1006         "infinitive", "indicative", "conditional",
1007         "subjunctive", "imperative", "participle",
1008         "gerund",
1009         "present indicative",
1010         "present subjunctive",
1011         "present imperative",
1012         "present gerund",
1013         "past imperfect indicative",
1014         "past perfect indicative",
1015         "past perfect subjunctive",
1016         "past perfect imperative",
1017         "past perfect infinitive",
1018     };
1019
1020     size_t index = size_t(m) - 1;
1021     assert(index < sizeof(names) / sizeof(names[0]));
1022     return names[index];
1023 }
1024
1025
1026 /*static*/
1027 const char *
1028 FrenchVerbDictionary::getTenseName(Tense t)
1029 {
1030     if (int(t) < int(PRESENT_TENSE) || int(t) > int(PAST_PERFECT))
1031     {
1032         assert(!"FrenchVerbDictionary::getTenseName() received invalid Tense value");
1033         return NULL;
1034     }
1035
1036     static const char *names[] =
1037     {
1038         "present", "past", "imperfect", "future",
1039         "active", "passive", "active", "passive", "past perfect",
1040     };
1041
1042     size_t index = size_t(t) - 1;
1043     assert(index < sizeof(names) / sizeof(names[0]));
1044     return names[index];
1045 }
1046
1047
1048 wstring
1049 FrenchVerbDictionary::tolowerWide(const wstring &wideString) const
1050 {
1051     wstring result;
1052     for (wstring::size_type len = wideString.length(), i = 0; i < len; i++)
1053     {
1054         wchar_t c = wideString[i];
1055         if (c <= 0xFF)
1056             result += (unsigned char) latin1TolowerTable[(unsigned char) c];
1057         else
1058             result += c;
1059     }
1060     return result;
1061 }
1062
1063
1064 //static
1065 bool
1066 FrenchVerbDictionary::isWideVowel(wchar_t c)
1067 {
1068     if (strchr("aeiouyAEIOUY", (unsigned char) c) != NULL)
1069         return true;
1070     if (c < 0xc0 || c > 0xff)
1071         return false;
1072     return c != 0xc7 && c != 0xd0
1073         && c != 0xd1 && c != 0xd7 && c != 0xde
1074         && c != 0xe7
1075         && c != 0xf0 && c != 0xf1 && c != 0xf7 && c != 0xfe;
1076 }
1077
1078
1079 wstring
1080 FrenchVerbDictionary::utf8ToWide(const string &utf8String) const throw(int)
1081 {
1082     size_t inbytesleft = utf8String.length() + 1;  // number of *bytes* in UTF-8 string
1083     size_t outbytesleft = inbytesleft * sizeof(wchar_t);  // oversized for safety
1084     char *inbuf = strcpy(new char[inbytesleft], utf8String.c_str());
1085     char *outbuf = new char[outbytesleft];
1086
1087     ICONV_CONST char *in = inbuf;
1088     char *out = outbuf;
1089     size_t initNumOutBytes = outbytesleft;
1090     if (iconv(utf8ToWideConv, &in, &inbytesleft, &out, &outbytesleft) == (size_t) -1)
1091     {
1092         int e = errno;
1093         delete [] inbuf;
1094         delete [] outbuf;
1095         throw e;
1096     }
1097
1098     // iconv() has substracted the number of bytes produced
1099     // from outbytesleft. This allows the computation of the
1100     // number of wide characters in the result (excluding the
1101     // terminating null character).
1102     // See the iconv(3) man page for details.
1103     //
1104     const wchar_t *resultPtr = reinterpret_cast<wchar_t *>(outbuf);
1105     size_t resultLen = (initNumOutBytes - outbytesleft) / sizeof(wchar_t) - 1;
1106     assert(resultPtr[resultLen] == 0);
1107
1108     wstring result(resultPtr, resultLen);
1109     assert(result.length() == resultLen);
1110
1111     delete [] inbuf;
1112     delete [] outbuf;
1113     return result;
1114 }
1115
1116
1117 string
1118 FrenchVerbDictionary::wideToUTF8(const wstring &wideString) const throw(int)
1119 {
1120     size_t inbytesleft = (wideString.length() + 1) * sizeof(wchar_t);
1121     size_t outbytesleft = inbytesleft;  // UTF-8 string takes no more room than wstring
1122     char *inbuf = reinterpret_cast<char *>(memcpy(new char[inbytesleft], wideString.data(), inbytesleft));
1123     char *outbuf = new char[outbytesleft];
1124
1125     ICONV_CONST char *in = inbuf;
1126     char *out = outbuf;
1127     if (iconv(wideToUTF8Conv, &in, &inbytesleft, &out, &outbytesleft) == (size_t) -1)
1128     {
1129         int e = errno;
1130         delete [] inbuf;
1131         delete [] outbuf;
1132         throw e;
1133     }
1134
1135     string result = outbuf;
1136     delete [] inbuf;
1137     delete [] outbuf;
1138     return result;
1139 }
1140
1141
1142 /*static*/
1143 string
1144 FrenchVerbDictionary::getRadical(
1145                         const string &infinitive,
1146                         const string &templateName) throw(logic_error)
1147 {
1148     string::size_type posColon = templateName.find(':');
1149     if (posColon == string::npos)
1150         throw logic_error("no colon found in template name");
1151
1152     string::size_type lenSuffix = templateName.length() - posColon - 1;
1153     string::size_type lenInfPrefix = infinitive.length() - lenSuffix;
1154     return string(infinitive, 0, lenInfPrefix);
1155 }
1156
1157
1158 bool
1159 FrenchVerbDictionary::generateTense(const string &radical,
1160                                 const TemplateSpec &templ,
1161                                 Mode mode,
1162                                 Tense tense,
1163                                 vector< vector<string> > &dest,
1164                                 bool includePronouns,
1165                                 bool aspirateH,
1166                                 bool isItalian) const throw()
1167 {
1168     if (templ.find(mode) == templ.end())
1169         return false;
1170
1171     const ModeSpec &modeSpec = templ.find(mode)->second;
1172
1173     if (modeSpec.find(tense) == modeSpec.end())
1174         return false;
1175
1176     const TenseSpec &tenseSpec = modeSpec.find(tense)->second;
1177
1178     if (mode != INDICATIVE_MODE
1179             && mode != CONDITIONAL_MODE
1180             && mode != SUBJUNCTIVE_MODE)
1181         includePronouns = false;
1182
1183     for (TenseSpec::const_iterator p = tenseSpec.begin();
1184                                     p != tenseSpec.end(); p++)
1185     {
1186         dest.push_back(vector<string>());
1187         for (PersonSpec::const_iterator i = p->begin(); i != p->end(); i++)
1188         {
1189             // Do not return spellings that are marked incorrect.
1190             // They are in the knowledge base only to allow
1191             // error-tolerant searches.
1192             //
1193             if (!(*i).isCorrect)
1194                 continue;
1195
1196             string pronoun;  // no pronoun by default
1197
1198             string v = radical + (*i).inflection;
1199
1200             if (includePronouns)
1201             {
1202                 size_t noPers = p - tenseSpec.begin();
1203                 switch (noPers)
1204                 {
1205                 case 0:
1206                     if (isItalian)
1207                         pronoun = "io ";
1208                     else
1209                     {
1210                         bool elideJe = false;
1211                         if (!aspirateH)
1212                         {
1213                             wstring wideV = utf8ToWide(v);  // inefficient: converts all chars, only 1st needed
1214                             wchar_t init = (wideV.empty() ? '\0' : wideV[0]);
1215                             if (init == 'h' || init == 'H' || isWideVowel(init))
1216                                 elideJe = true;
1217                         }
1218                         pronoun = (elideJe ? "j'" : "je ");
1219                     }
1220                     break;
1221                 case 1: pronoun = "tu "; break;
1222                 case 2: pronoun = (isItalian ? "egli " : "il "); break;
1223                 case 3: pronoun = (isItalian ? "noi "  : "nous "); break;
1224                 case 4: pronoun = (isItalian ? "voi "  : "vous "); break;
1225                 case 5: pronoun = (isItalian ? "essi " : "ils "); break;
1226                 }
1227
1228                 if (mode == SUBJUNCTIVE_MODE)
1229                 {
1230                     const char *conj;
1231                     if (isItalian)
1232                         conj = "che ";
1233                     else if (noPers == 2 || noPers == 5)
1234                         conj = "qu'";
1235                     else
1236                         conj = "que ";
1237                     pronoun = conj + pronoun;
1238                 }
1239             }
1240
1241             dest.back().push_back(pronoun + v);
1242         }
1243     }
1244
1245     return true;
1246 }
1247
1248
1249 bool FrenchVerbDictionary::isVerbStartingWithAspirateH(
1250                                 const std::string &infinitive) const throw()
1251 {
1252     return aspirateHVerbs.find(infinitive) != aspirateHVerbs.end();
1253 }