1 /* $Id: FrenchVerbDictionary.cpp,v 1.51 2012/04/24 02:46:05 sarrazip Exp $
2 FrenchVerbDictionary.cpp - Dictionary of verbs and conjugation templates
4 verbiste - French conjugation system
5 Copyright (C) 2003-2010 Pierre Sarrazin <http://sarrazip.com/>
7 This program is free software; you can redistribute it and/or
8 modify it under the terms of the GNU General Public License
9 as published by the Free Software Foundation; either version 2
10 of the License, or (at your option) any later version.
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with this program; if not, write to the Free Software
19 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
23 #include "FrenchVerbDictionary.h"
30 #include <sys/types.h>
35 using namespace verbiste;
// Diagnostic tracing switch: true when the TRACE environment variable is
// defined (with any value) at the time this static is initialized.
38 static bool trace = getenv("TRACE") != NULL;
// Scoped holder for a libxml2 document pointer: the wrapped xmlDocPtr is
// released with xmlFreeDoc() when the holder is destroyed (if non-NULL).
44 AutoDoc(xmlDocPtr d) : doc(d) {}
45 ~AutoDoc() { if (doc != NULL) xmlFreeDoc(doc); }
// Returns the raw pointer; the holder still frees it on destruction.
46 xmlDocPtr get() const { return doc; }
// True when the wrapped pointer is NULL.
47 bool operator ! () const { return doc == NULL; }
51 // Forbidden operations:
// A copy would hold the same pointer and free it a second time; these are
// presumably declared without a definition so that no copy can be made.
52 AutoDoc(const AutoDoc &);
53 AutoDoc &operator = (const AutoDoc &);
// Scoped holder for a string allocated by libxml2: the wrapped xmlChar
// pointer is released with xmlFree() when the holder is destroyed
// (if non-NULL).
60 AutoString(xmlChar *s) : str(s) {}
61 ~AutoString() { if (str != NULL) xmlFree(str); }
// Returns the raw pointer; the holder still frees it on destruction.
62 xmlChar *get() const { return str; }
// True when the wrapped pointer is NULL.
63 bool operator ! () const { return str == NULL; }
// Length in bytes as measured by strlen() (not in characters);
// 0 when the wrapped pointer is NULL.
64 size_t length() const { return str == NULL ? 0 : strlen((char *) str); }
68 // Forbidden operations:
// A copy would hold the same pointer and free it a second time; these are
// presumably declared without a definition so that no copy can be made.
69 AutoString(const AutoString &);
70 AutoString &operator = (const AutoString &);
// Reinterprets a C string as a libxml2 string (const xmlChar *).
// This is a pointer cast only; no copy or conversion is made.
76 XMLCHAR(const char *s)
78 return (const xmlChar *) s;
// Returns true when the libxml2 string 'a' and the C string 'b' compare
// equal according to xmlStrcmp().
84 equal(const xmlChar *a, const char *b)
86 return xmlStrcmp(a, XMLCHAR(b)) == 0;
// Counterpart of equal(). The body is not visible in this excerpt;
// callers treat a true result as "the names do not match" -- presumably
// xmlStrcmp(a, XMLCHAR(b)) != 0, to be confirmed.
92 different(const xmlChar *a, const char *b)
// Thin wrapper over xmlGetProp() that accepts the attribute name as a
// plain C string. The result is whatever xmlGetProp() returns for 'node'.
100 getProp(xmlNodePtr node, const char *propName)
102 return xmlGetProp(node, XMLCHAR(propName));
// Thin wrapper over xmlNodeListGetString(), always passing 1 as the
// third ("inLine") argument.
108 getString(xmlDocPtr doc, xmlNodePtr node)
110 return xmlNodeListGetString(doc, node, 1);
// Concatenation: the C string held by 'a' followed by 'b'.
// Assumes a.get() is not NULL -- NOTE(review): confirm at call sites.
116 operator + (const AutoString &a, const string &b)
118 return (char *) a.get() + b;
// Concatenation: 'a' followed by the C string held by 'b'.
// Assumes b.get() is not NULL -- NOTE(review): confirm at call sites.
124 operator + (const string &a, const AutoString &b)
126 return a + (char *) b.get();
// File-local overload taking a libxml2 string; forwards to
// FrenchVerbDictionary::convertModeName() after a pointer cast.
132 convertModeName(const xmlChar *modeName)
134 return FrenchVerbDictionary::convertModeName((char *) modeName);
// File-local overload taking a libxml2 string; forwards to
// FrenchVerbDictionary::convertTenseName() after a pointer cast.
140 convertTenseName(const xmlChar *tenseName)
142 return FrenchVerbDictionary::convertTenseName((char *) tenseName);
146 // Latin-1 to ASCII conversion table (codes 0xC0 to 0xFF).
147 // Some characters have bogus translations, but they are not used in French.
// The table is indexed with (code - 0xC0) by removeWideCharAccent(), so
// entry 0 is the replacement for code 0xC0. The table contents are not
// visible in this excerpt.
149 static const char *accentRemovalTable =
156 // Only works on Latin-1 characters.
// Replaces a character in the range 0xC0..0xFF by its entry in
// accentRemovalTable. The visible code does not modify characters
// outside that range.
159 removeWideCharAccent(wchar_t c)
161 if (c >= 0xC0 && c <= 0xFF)
162 c = (unsigned char) accentRemovalTable[c - 0xC0];
// Returns a UTF-8 copy of 'utf8String' in which every character has been
// passed through removeWideCharAccent(). The work is done on a wide
// character copy so that each accented letter is a single element.
168 FrenchVerbDictionary::removeUTF8Accents(const string &utf8String)
170 wstring result = utf8ToWide(utf8String);
171 for (size_t i = 0; i < result.length(); ++i)
172 result[i] = removeWideCharAccent(result[i]);
173 return wideToUTF8(result);
// Appends to 'utf8Variants' the UTF-8 spellings of 'wideString' obtained by
// removing accents from position 'index' onward. For a position handled
// here, a modified copy is pushed and the function recurses on that copy
// from the next position, so combinations of removed accents are produced.
// NOTE(review): the condition guarding the copy and the statement that
// alters the copy are not visible in this excerpt -- presumably only
// positions where the unaccented character differs are processed.
178 FrenchVerbDictionary::formUTF8UnaccentedVariants(const wstring &wideString,
180 vector<string> &utf8Variants)
182 for ( ; index < wideString.length(); ++index)
184 wchar_t ch = wideString[index];
185 wchar_t unacc = removeWideCharAccent(ch);
188 wstring copy = wideString;
// Replacing one character must not change the length of the string.
190 assert(copy.length() == wideString.length());
192 utf8Variants.push_back(wideToUTF8(copy));
193 formUTF8UnaccentedVariants(copy, index + 1, utf8Variants);
// UTF-8 overload: converts 'utf8String' to wide characters and delegates
// to the wide string overload, with 'index' counted in characters.
200 FrenchVerbDictionary::formUTF8UnaccentedVariants(const string &utf8String,
202 vector<string> &utf8Variants)
204 wstring wideString = utf8ToWide(utf8String);
205 formUTF8UnaccentedVariants(wideString, index, utf8Variants);
// Computes the paths of the two XML data files for language 'l':
//   conjFN  receives <dir>/conjugation-<code>.xml
//   verbsFN receives <dir>/verbs-<code>.xml
// where <code> comes from getLanguageCode(l) and <dir> is the value of the
// LIBDATADIR environment variable if defined, otherwise the compile-time
// LIBDATADIR macro.
210 FrenchVerbDictionary::getXMLFilenames(string &conjFN, string &verbsFN, Language l)
212 const char *libdatadir = getenv("LIBDATADIR");
213 if (libdatadir == NULL)
214 libdatadir = LIBDATADIR;
215 string languageCode = getLanguageCode(l);
216 conjFN = libdatadir + string("/") + "conjugation-" + languageCode + ".xml";
217 verbsFN = libdatadir + string("/") + "verbs-" + languageCode + ".xml";
// Maps a two-letter language code to a Language value. The codes
// recognized are "fr", "it" and "el" (compared case-sensitively).
// NOTE(review): the return statements are not visible in this excerpt;
// presumably FRENCH, ITALIAN and GREEK respectively, and NO_LANGUAGE for
// any other code -- confirm.
222 FrenchVerbDictionary::Language
223 FrenchVerbDictionary::parseLanguageCode(const std::string &twoLetterCode)
225 if (twoLetterCode == "fr")
227 if (twoLetterCode == "it")
229 if (twoLetterCode == "el")
// Returns the two-letter code for a Language value: "fr" for FRENCH,
// "it" for ITALIAN, "el" for GREEK, and an empty string for NO_LANGUAGE.
237 FrenchVerbDictionary::getLanguageCode(Language l)
241 case NO_LANGUAGE: return "";
242 case FRENCH: return "fr";
243 case ITALIAN: return "it";
244 case GREEK: return "el";
// Constructs a dictionary from explicitly named XML files.
// Both iconv descriptors start as (iconv_t) -1, i.e. "not open", and are
// opened by init(). Throws logic_error when 'lang' is NO_LANGUAGE; other
// exceptions may come from init().
// (Part of the parameter list and of the initializer list is not visible
// in this excerpt.)
250 FrenchVerbDictionary::FrenchVerbDictionary(
251 const string &conjugationFilename,
252 const string &verbsFilename,
253 bool includeWithoutAccents,
260 wideToUTF8Conv((iconv_t) -1),
261 utf8ToWideConv((iconv_t) -1),
265 if (lang == NO_LANGUAGE)
266 throw logic_error("Invalid language code");
267 init(conjugationFilename, verbsFilename, includeWithoutAccents);
// Constructs a dictionary from the default XML files, whose names are
// obtained from getXMLFilenames() for the member 'lang'.
// Both iconv descriptors start as (iconv_t) -1 and are opened by init().
// The exception specification allows only std::logic_error.
271 FrenchVerbDictionary::FrenchVerbDictionary(bool includeWithoutAccents)
272 throw (std::logic_error)
277 wideToUTF8Conv((iconv_t) -1),
278 utf8ToWideConv((iconv_t) -1),
282 string conjFN, verbsFN;
283 getXMLFilenames(conjFN, verbsFN, lang);
285 init(conjFN, verbsFN, includeWithoutAccents);
// Common initialization used by the constructors:
//  1. opens the two iconv descriptors (wide <-> UTF-8), throwing
//     logic_error if either conversion is unsupported;
//  2. in builds without NDEBUG, self-tests the two conversion functions;
//  3. fills latin1TolowerTable;
//  4. loads the conjugation templates, then the verbs;
//  5. loads $HOME/.verbiste/verbs-<lang>.xml as well when HOME is defined
//     and stat() succeeds on that file.
290 FrenchVerbDictionary::init(const string &conjugationFilename,
291 const string &verbsFilename,
292 bool includeWithoutAccents)
295 wideToUTF8Conv = iconv_open("UTF-8", "WCHAR_T");
296 if (wideToUTF8Conv == (iconv_t) -1)
297 throw logic_error("conversion from wide characters to UTF-8 not supported");
298 utf8ToWideConv = iconv_open("WCHAR_T", "UTF-8");
299 if (utf8ToWideConv == (iconv_t) -1)
300 throw logic_error("conversion from UTF-8 to wide characters not supported");
302 #ifndef NDEBUG // self-test for the wide character string conversions:
305 wstring w = utf8ToWide("ab");
306 assert(w.length() == 2);
// Two UTF-8 bytes (0xC3 0xA2) must decode to the single code 0xE2.
310 const char u0[] = { '\xc3', '\xa2', 't', '\0' }; // 'a' with circumflex accent
312 assert(w.length() == 2);
313 assert(w[0] == 0xe2);
316 const char u1[] = { 't', '\xc3', '\xa2', '\0' }; // 'a' with circumflex accent
318 assert(w.length() == 2);
320 assert(w[1] == 0xe2);
324 throw logic_error("self-test of utf8ToWide() failed");
329 string u = wideToUTF8(L"ab");
330 assert(u.length() == 2);
336 throw logic_error("self-test of wideToUTF8() failed");
338 #endif // ndef NDEBUG
// Lower-case table for Latin-1 codes: codes below 0xC0 go through
// tolower(); codes 0xC0..0xDF are shifted by 0x20; codes 0xE0..0xFF map
// to themselves.
// NOTE(review): char(i) is negative for i >= 0x80 where char is signed,
// and tolower() is only defined for unsigned char values and EOF --
// confirm this is harmless on the supported platforms.
342 for (int i = 0; i < 0xC0; i++)
343 latin1TolowerTable[i] = char(tolower(char(i)));
344 for (int i = 0xC0; i < 0xE0; i++)
345 latin1TolowerTable[i] = char(i + 0x20);
346 for (int i = 0xE0; i < 0x100; i++)
347 latin1TolowerTable[i] = char(i);
// Templates are loaded first: readVerbs() rejects any verb whose template
// name is not already present in conjugSys.
350 loadConjugationDatabase(conjugationFilename.c_str(), includeWithoutAccents);
351 loadVerbDatabase(verbsFilename.c_str(), includeWithoutAccents);
353 // Load additional verbs from $HOME/.verbiste/verbs-<lang>.xml, if present.
355 const char *home = getenv("HOME");
356 if (home != NULL) // do nothing if $HOME not defined
358 string otherVerbsFilename = string(home) + "/.verbiste/verbs-" + getLanguageCode(lang) + ".xml";
360 if (stat(otherVerbsFilename.c_str(), &statbuf) == 0) // if file exists
362 //cout << "otherVerbsFilename=" << otherVerbsFilename << endl;
363 loadVerbDatabase(otherVerbsFilename.c_str(), includeWithoutAccents);
// Diagnostic output (the condition guarding it is not visible here).
368 cout << "FrenchVerbDictionary::init: trie takes "
369 << verbTrie.computeMemoryConsumption() << " bytes\n";
// Parses the XML file 'conjugationFilename' and feeds the resulting
// document to readConjugation(). The document is held in an AutoDoc, so
// it is freed when this function exits, including by exception.
// Throws invalid_argument if the filename is NULL and logic_error
// ("could not parse ...") on a parse failure.
374 FrenchVerbDictionary::loadConjugationDatabase(
375 const char *conjugationFilename,
376 bool includeWithoutAccents)
379 if (conjugationFilename == NULL)
380 throw invalid_argument("conjugationFilename");
382 AutoDoc conjDoc(xmlParseFile(conjugationFilename));
384 throw logic_error("could not parse " + string(conjugationFilename));
386 readConjugation(conjDoc.get(), includeWithoutAccents);
// Parses the XML file 'verbsFilename' and feeds the resulting document to
// readVerbs(). The document is held in an AutoDoc, so it is freed when
// this function exits, including by exception.
// Throws invalid_argument if the filename is NULL and logic_error
// ("could not parse ...") on a parse failure.
391 FrenchVerbDictionary::loadVerbDatabase(
392 const char *verbsFilename,
393 bool includeWithoutAccents)
396 if (verbsFilename == NULL)
397 throw invalid_argument("verbsFilename");
399 AutoDoc verbsDoc(xmlParseFile(verbsFilename));
401 throw logic_error("could not parse " + string(verbsFilename));
403 readVerbs(verbsDoc.get(), includeWithoutAccents);
// Reads a parsed conjugation document and fills two members:
//   conjugSys       template name -> mode -> tense -> persons -> variants
//   inflectionTable template name -> termination -> list of
//                   mode/tense/person descriptions
// The root element must be named "conjugation-<lang code>". Each
// <template> child must carry a "name" attribute containing a colon.
// When 'includeWithoutAccents' is true, every variant is also registered
// under its partially or fully unaccented spellings, flagged as incorrect.
// Throws logic_error on an empty document, a wrong root element, or a
// missing or malformed template name.
408 FrenchVerbDictionary::readConjugation(xmlDocPtr doc, bool includeWithoutAccents) throw(logic_error)
410 const bool isItalian = (lang == ITALIAN);
412 xmlNodePtr rootNodePtr = xmlDocGetRootElement(doc);
414 if (rootNodePtr == NULL)
415 throw logic_error("empty conjugation document");
417 string langCode = getLanguageCode(lang);
418 if (different(rootNodePtr->name, ("conjugation-" + langCode).c_str()))
420 string msg = "wrong top node in conjugation document: got "
421 + string((const char *) rootNodePtr->name)
422 + ", expected conjugation-" + langCode;
423 throw logic_error(msg);
426 for (xmlNodePtr templ = rootNodePtr->xmlChildrenNode;
430 if (different(templ->name, "template")) // ignore junk between tags
433 string tname = getUTF8XmlProp(templ, "name");
435 throw logic_error("missing template name attribute");
437 // The template name is the root and the termination,
438 // with a colon in between. For example, "pla:cer".
440 if (tname.find(':') == string::npos)
441 throw logic_error("missing colon in template name");
443 // The use of the [] operator creates an empty conjugation
444 // template spec, to which we keep a reference:
446 TemplateSpec &theTemplateSpec = conjugSys[tname];
// Likewise creates the inflection table entry for this template, so that
// every name in conjugSys also has an entry in inflectionTable.
450 TemplateInflectionTable &ti = inflectionTable[tname];
452 // For each mode (e.g., infinitive, indicative, conditional, etc):
453 for (xmlNodePtr mode = templ->xmlChildrenNode;
457 if (equal(mode->name, "text") || equal(mode->name, "comment")) // any text in this node is ignored
460 if (trace) cout << "readConjugation: mode node: '" << mode->name << "'" << endl;
461 Mode theMode = ::convertModeName(mode->name);
462 ModeSpec &theModeSpec = theTemplateSpec[theMode];
464 // For each tense in the mode:
465 for (xmlNodePtr tense = mode->xmlChildrenNode;
469 if (equal(tense->name, "text") || equal(tense->name, "comment"))
472 Tense theTense = ::convertTenseName(tense->name);
473 TenseSpec &theTenseSpec = theModeSpec[theTense];
475 // For each person in the tense:
476 int personCounter = 0;
477 for (xmlNodePtr person = tense->xmlChildrenNode;
479 person = person->next)
481 if (different(person->name, "p"))
486 theTenseSpec.push_back(PersonSpec());
487 PersonSpec &thePersonSpec = theTenseSpec.back();
489 // For each variant for this person:
490 // (Note that most persons of most verbs have only
492 for (xmlNodePtr inf = person->xmlChildrenNode;
496 string variant = getUTF8XmlNodeText(
497 doc, inf->xmlChildrenNode);
// 'true' marks the spelling read from the file as the correct one.
498 thePersonSpec.push_back(InflectionSpec(variant, true));
500 ModeTensePersonNumber mtpn(
501 reinterpret_cast<const char *>(mode->name),
502 reinterpret_cast<const char *>(tense->name),
506 ti[variant].push_back(mtpn);
508 if (includeWithoutAccents)
510 // Also include versions where some or all accents are missing.
511 vector<string> unaccentedVariants;
512 formUTF8UnaccentedVariants(variant, 0, unaccentedVariants);
513 for (vector<string>::const_iterator it = unaccentedVariants.begin();
514 it != unaccentedVariants.end(); ++it)
516 thePersonSpec.push_back(InflectionSpec(*it, false));
517 mtpn.correct = false; // 'false' marks this spelling as incorrect.
518 ti[*it].push_back(mtpn);
// Returns the text obtained from getString(doc, node) as a std::string.
// NOTE(review): handling of a NULL result and the release of 's' are not
// visible in this excerpt -- confirm that 's' is freed with xmlFree().
530 FrenchVerbDictionary::getUTF8XmlNodeText(xmlDocPtr doc, xmlNodePtr node)
533 xmlChar *s = getString(doc, node);
536 return reinterpret_cast<char *>(s);
// Returns the value of attribute 'propName' of 'node' as a std::string.
// NOTE(review): handling of a missing attribute (NULL) and the release of
// 's' are not visible in this excerpt -- confirm that 's' is freed with
// xmlFree().
541 FrenchVerbDictionary::getUTF8XmlProp(xmlNodePtr node, const char *propName)
544 xmlChar *s = getProp(node, propName);
547 return reinterpret_cast<char *>(s);
551 // Reads the given XML document and adds data to members knownVerbs,
552 // aspirateHVerbs and verbTrie.
// The root element must be named "verbs-<lang code>". For each verb, the
// infinitive is read from the first child node and the template name from
// the node two siblings after it. The template name must already be
// present in conjugSys, so the conjugation document has to be read first.
// Throws logic_error on any structural problem in the document.
555 FrenchVerbDictionary::readVerbs(xmlDocPtr doc,
556 bool includeWithoutAccents)
560 cout << "readVerbs: start: includeWithoutAccents=" << includeWithoutAccents << endl;
562 xmlNodePtr rootNodePtr = xmlDocGetRootElement(doc);
564 if (rootNodePtr == NULL)
565 throw logic_error("empty verbs document");
567 string langCode = getLanguageCode(lang);
568 if (different(rootNodePtr->name, ("verbs-" + langCode).c_str()))
569 throw logic_error("wrong top node in verbs document");
571 for (xmlNodePtr v = rootNodePtr->xmlChildrenNode; v != NULL; v = v->next)
573 if (equal(v->name, "text") || equal(v->name, "comment"))
576 xmlNodePtr i = v->xmlChildrenNode;
577 if (i == NULL || i->xmlChildrenNode == NULL)
578 throw logic_error("missing <i> node");
580 string utf8Infinitive = getUTF8XmlNodeText(doc, i->xmlChildrenNode);
581 wstring wideInfinitive = utf8ToWide(utf8Infinitive);
582 if (wideInfinitive.empty())
583 throw logic_error("empty <i> node");
// Length counted in characters, not in UTF-8 bytes.
584 size_t lenInfinitive = wideInfinitive.length();
585 if (trace) cout << "utf8Infinitive='" << utf8Infinitive << "'\n";
588 throw logic_error("unexpected end after <i> node");
590 xmlNodePtr t = i->next->next;
592 throw logic_error("missing <t> node");
595 cout << "t=" << t << ", t->xmlChildrenNode=" << t->xmlChildrenNode << "\n";
596 if (t->xmlChildrenNode == NULL)
// NOTE(review): when t->next is NULL the conditional yields 0, which is
// converted to a std::string built from a null pointer -- confirm that
// this trace statement cannot run in that state.
597 cout << " t->next=" << t->next << ", " << (t->next ? getUTF8XmlNodeText(doc, t->next->xmlChildrenNode) : 0) << endl;
600 // Get template name (e.g., "aim:er") in UTF-8.
601 string utf8TName = getUTF8XmlNodeText(doc, t->xmlChildrenNode);
602 if (utf8TName.empty())
603 throw logic_error("empty <t> node");
604 if (trace) cout << " utf8TName='" << utf8TName << "'\n";
606 // Check that this template name (seen in verbs-*.xml) has been
607 // seen in conjugation-*.xml.
609 if (conjugSys.find(utf8TName) == conjugSys.end())
610 throw logic_error("unknown template name: " + utf8TName);
612 // Find the offset of the colon in the template name.
613 // For example: the offset is 3 in the case of "aim:er".
614 // Find this offset in a wide character string, because
615 // the offset in a UTF-8 string is in bytes, not characters.
617 wstring wideTName = utf8ToWide(utf8TName);
618 wstring::size_type posColon = wideTName.find(':');
619 if (posColon == wstring::npos)
620 throw logic_error("missing colon in <t> node");
621 assert(wideTName[posColon] == ':');
// Register the template under the infinitive; a verb may accumulate
// several template names because the mapped value is a set.
624 knownVerbs[utf8Infinitive].insert(utf8TName);
626 if (includeWithoutAccents)
628 // Also include versions where some or all accents are missing.
629 vector<string> unaccentedVariants;
630 formUTF8UnaccentedVariants(wideInfinitive, 0, unaccentedVariants);
631 for (vector<string>::const_iterator it = unaccentedVariants.begin();
632 it != unaccentedVariants.end(); ++it)
634 if (trace) cout << " unaccvar: '" << *it << "'\n";
635 knownVerbs[*it].insert(utf8TName);
639 // <aspirate-h>: If this verb starts with an aspirate h, remember it:
// The flag is inferred only from the presence of a second sibling after
// the <t> node; the name of that node is not checked here.
640 if (t->next != NULL && t->next->next != NULL)
641 aspirateHVerbs.insert(utf8Infinitive);
643 // Insert the verb in the trie.
644 // A list of template names is associated to each verb in this trie.
// The radical is the infinitive minus as many trailing characters as the
// template has after its colon.
646 size_t lenTermination = wideTName.length() - posColon - 1;
647 assert(lenTermination > 0);
648 assert(lenInfinitive >= lenTermination);
650 wstring wideVerbRadical(wideInfinitive, 0, lenInfinitive - lenTermination);
651 string utf8VerbRadical = wideToUTF8(wideVerbRadical);
653 insertVerbRadicalInTrie(utf8VerbRadical, utf8TName, utf8VerbRadical);
655 if (includeWithoutAccents)
657 // Also include versions where some or all accents are missing.
658 vector<string> unaccentedVariants;
659 formUTF8UnaccentedVariants(wideVerbRadical, 0, unaccentedVariants);
660 for (vector<string>::const_iterator it = unaccentedVariants.begin();
661 it != unaccentedVariants.end(); ++it)
663 insertVerbRadicalInTrie(*it, utf8TName, utf8VerbRadical); // pass correct verb radical as 3rd argument
669 cout << "Number of known verbs (lang " << langCode << "): " << knownVerbs.size() << endl;
673 // String parameters expected to be in UTF-8.
674 // Adds to 'verbTrie', which contains wide character strings.
// Associates the pair (tname, correctVerbRadical) with the trie key
// 'verbRadical'. 'correctVerbRadical' is the properly accented radical;
// it differs from 'verbRadical' when the key is an unaccented variant.
677 FrenchVerbDictionary::insertVerbRadicalInTrie(
678 const std::string &verbRadical,
679 const std::string &tname,
680 const std::string &correctVerbRadical)
682 wstring wideVerbRadical = utf8ToWide(verbRadical);
684 cout << "insertVerbRadicalInTrie('"
685 << verbRadical << "' (len=" << wideVerbRadical.length()
687 << "', '" << correctVerbRadical
690 vector<TrieValue> **templateListPtr =
691 verbTrie.getUserDataPointer(wideVerbRadical);
692 assert(templateListPtr != NULL);
694 // If a new entry was created for 'wideVerbRadical', then the associated
695 // user data pointer is null. Make this pointer point to a new,
696 // empty vector of template names.
// NOTE(review): the vector is allocated with new and stored in the trie;
// its release is not visible in this excerpt -- presumably the trie owns
// it, to be confirmed.
698 if (*templateListPtr == NULL)
699 *templateListPtr = new vector<TrieValue>();
701 // Associate the given template name to the given verb radical.
703 (*templateListPtr)->push_back(TrieValue(tname, correctVerbRadical));
// Closes the two iconv descriptors opened by init().
// NOTE(review): the calls are unconditional; confirm that the destructor
// cannot run while a descriptor still holds the initial (iconv_t) -1.
707 FrenchVerbDictionary::~FrenchVerbDictionary()
709 iconv_close(utf8ToWideConv);
710 iconv_close(wideToUTF8Conv);
// Looks up 'templateName' in conjugSys.
// NOTE(review): the return statements are not visible in this excerpt;
// presumably a null result signals an unknown template -- confirm.
715 FrenchVerbDictionary::getTemplate(const string &templateName) const
717 ConjugationSystem::const_iterator it = conjugSys.find(templateName);
718 if (it == conjugSys.end())
// Returns an iterator to the first entry of conjugSys.
724 ConjugationSystem::const_iterator
725 FrenchVerbDictionary::beginConjugSys() const
727 return conjugSys.begin();
// Returns the past-the-end iterator of conjugSys.
731 ConjugationSystem::const_iterator
732 FrenchVerbDictionary::endConjugSys() const
734 return conjugSys.end();
// Returns the set of template names registered in knownVerbs for
// 'infinitive'. The result is a reference, so a function-local static
// empty set exists to be returned when there is nothing to report.
// NOTE(review): the return statements are not visible in this excerpt;
// presumably emptySet is returned for a NULL or unknown infinitive.
738 const std::set<std::string> &
739 FrenchVerbDictionary::getVerbTemplateSet(const char *infinitive) const
741 static const std::set<std::string> emptySet;
742 if (infinitive == NULL)
744 VerbTable::const_iterator it = knownVerbs.find(infinitive);
745 if (it == knownVerbs.end())
// std::string overload; forwards to the const char * overload.
751 const std::set<std::string> &
752 FrenchVerbDictionary::getVerbTemplateSet(const string &infinitive) const
754 return getVerbTemplateSet(infinitive.c_str());
// Returns an iterator to the first entry of knownVerbs.
758 VerbTable::const_iterator
759 FrenchVerbDictionary::beginKnownVerbs() const
761 return knownVerbs.begin();
// Returns the past-the-end iterator of knownVerbs.
765 VerbTable::const_iterator
766 FrenchVerbDictionary::endKnownVerbs() const
768 return knownVerbs.end();
// Two-level lookup in inflectionTable: first the template name, then the
// inflection (termination) within that template's table.
// NOTE(review): the return statements are not visible in this excerpt;
// presumably NULL is returned when either lookup fails, and otherwise a
// pointer to the stored vector, which remains owned by inflectionTable.
772 const std::vector<ModeTensePersonNumber> *
773 FrenchVerbDictionary::getMTPNForInflection(
774 const std::string &templateName,
775 const std::string &inflection) const
777 InflectionTable::const_iterator i = inflectionTable.find(templateName);
778 if (i == inflectionTable.end())
780 const TemplateInflectionTable &ti = i->second;
781 TemplateInflectionTable::const_iterator j = ti.find(inflection);
// Converts a mode name as used in the XML data into a Mode value, by
// exact, case-sensitive comparison against the names listed below.
// A name that matches nothing leaves the value at INVALID_MODE, which
// triggers the assertion at the end (active only without NDEBUG).
790 FrenchVerbDictionary::convertModeName(const char *modeName)
792 Mode mode = INVALID_MODE;
793 if (modeName == NULL)
795 else if (strcmp(modeName, "infinitive") == 0)
796 mode = INFINITIVE_MODE;
797 else if (strcmp(modeName, "indicative") == 0)
798 mode = INDICATIVE_MODE;
799 else if (strcmp(modeName, "conditional") == 0)
800 mode = CONDITIONAL_MODE;
801 else if (strcmp(modeName, "subjunctive") == 0)
802 mode = SUBJUNCTIVE_MODE;
803 else if (strcmp(modeName, "imperative") == 0)
804 mode = IMPERATIVE_MODE;
805 else if (strcmp(modeName, "participle") == 0)
806 mode = PARTICIPLE_MODE;
807 else if (strcmp(modeName, "gerund") == 0)
809 else if (strcmp(modeName, "present-indicative") == 0)
810 mode = PRESENT_INDICATIVE;
811 else if (strcmp(modeName, "present-subjunctive") == 0)
812 mode = PRESENT_SUBJUNCTIVE;
813 else if (strcmp(modeName, "present-imperative") == 0)
814 mode = PRESENT_IMPERATIVE;
815 else if (strcmp(modeName, "present-gerund") == 0)
816 mode = PRESENT_GERUND;
817 else if (strcmp(modeName, "past-imperfect-indicative") == 0)
818 mode = PAST_IMPERFECT_INDICATIVE;
819 else if (strcmp(modeName, "past-perfect-indicative") == 0)
820 mode = PAST_PERFECT_INDICATIVE;
821 else if (strcmp(modeName, "past-perfect-subjunctive") == 0)
822 mode = PAST_PERFECT_SUBJUNCTIVE;
823 else if (strcmp(modeName, "past-perfect-imperative") == 0)
824 mode = PAST_PERFECT_IMPERATIVE;
825 else if (strcmp(modeName, "past-perfect-infinitive") == 0)
826 mode = PAST_PERFECT_INFINITIVE;
828 if (mode == INVALID_MODE)
// NOTE(review): a NULL modeName also reaches this point, and a null
// char pointer would then be streamed when tracing is on -- confirm.
830 if (trace) cout << "modeName='" << modeName << "'" << endl;
831 assert(!"Invalid mode");
// Converts a tense name as used in the XML data into a Tense value, by
// exact, case-sensitive comparison. Several names map to PRESENT_TENSE
// ("infinitive-present", "present", "imperative-present",
// "present-participle", "present-gerund"). A name that matches nothing
// leaves the value at INVALID_TENSE, which triggers the assertion at the
// end (active only without NDEBUG).
840 FrenchVerbDictionary::convertTenseName(const char *tenseName)
842 Tense tense = INVALID_TENSE;
843 if (tenseName == NULL)
845 else if (strcmp(tenseName, "infinitive-present") == 0)
846 tense = PRESENT_TENSE;
847 else if (strcmp(tenseName, "present") == 0)
848 tense = PRESENT_TENSE;
849 else if (strcmp(tenseName, "imperfect") == 0)
850 tense = IMPERFECT_TENSE;
851 else if (strcmp(tenseName, "future") == 0)
852 tense = FUTURE_TENSE;
853 else if (strcmp(tenseName, "simple-past") == 0)
855 else if (strcmp(tenseName, "imperative-present") == 0)
856 tense = PRESENT_TENSE;
857 else if (strcmp(tenseName, "present-participle") == 0)
858 tense = PRESENT_TENSE;
859 else if (strcmp(tenseName, "past-participle") == 0)
861 else if (strcmp(tenseName, "past") == 0)
863 else if (strcmp(tenseName, "present-gerund") == 0)
864 tense = PRESENT_TENSE;
865 else if (strcmp(tenseName, "active") == 0)
866 tense = ACTIVE_TENSE;
867 else if (strcmp(tenseName, "passive") == 0)
868 tense = PASSIVE_TENSE;
869 else if (strcmp(tenseName, "imp-active") == 0)
870 tense = IMPERATIVE_ACTIVE_TENSE;
871 else if (strcmp(tenseName, "imp-passive") == 0)
872 tense = IMPERATIVE_PASSIVE_TENSE;
873 else if (strcmp(tenseName, "past-perfect") == 0)
874 tense = PAST_PERFECT;
876 if (tense == INVALID_TENSE)
// NOTE(review): a NULL tenseName also reaches this point, and a null
// char pointer would then be streamed when tracing is on -- confirm.
878 if (trace) cout << "tenseName='" << tenseName << "'" << endl;
879 assert(!"Invalid tense");
// Analyzes the conjugated verb 'utf8ConjugatedVerb' and appends the
// possible analyses to 'results'. The trie is pointed at 'results' for
// the duration of the lookup and detached (NULL) afterwards; the analyses
// are produced by the trie callback onFoundPrefixWithUserData().
// Input that cannot be converted from UTF-8 is treated as an unknown verb.
887 FrenchVerbDictionary::deconjugate(const string &utf8ConjugatedVerb,
888 std::vector<InflectionDesc> &results)
890 verbTrie.setDestination(&results);
894 wstring w = utf8ToWide(utf8ConjugatedVerb);
// The return value is not needed; results arrive through the callback.
895 (void) verbTrie.get(w);
897 catch (int e) // exception thrown by utf8ToWide()
899 // Wrong encoding (possibly Latin-1). Act as with unknown verb.
902 verbTrie.setDestination(NULL);
// Trie callback, given a verb, an index into it, and the template list
// stored for the prefix that ends at 'index'. The characters from 'index'
// onward are the candidate termination. For every template in
// 'templateList' whose inflection table accepts that termination, one
// InflectionDesc per matching mode/tense/person entry is appended to
// 'results'. The infinitive reported is rebuilt from the correctly
// accented radical stored in the trie, not from the text that was typed.
908 FrenchVerbDictionary::VerbTrie::onFoundPrefixWithUserData(
909 const wstring &conjugatedVerb,
910 wstring::size_type index,
911 const vector<TrieValue> *templateList) const throw()
913 assert(templateList != NULL);
915 wcout << "VerbTrie::onFoundPrefixWithUserData: start: conjugatedVerb='"
916 << conjugatedVerb << "', index=" << index
917 << ", templateList: " << templateList->size()
918 << ", results=" << results << endl;
923 const wstring term(conjugatedVerb, index);
924 const string utf8Term = fvd.wideToUTF8(term);
927 cout << " utf8Term='" << utf8Term << "'\n";
930 'templateList' contains the names of conjugated templates that might
931 apply to the conjugated verb. We check each of them to see if there
932 is one that accepts the given termination 'term'.
934 for (vector<TrieValue>::const_iterator i = templateList->begin();
935 i != templateList->end(); i++)
937 const TrieValue &trieValue = *i;
938 const string &tname = trieValue.templateName;
// find() is dereferenced without a check against end(). This relies on
// readVerbs() rejecting template names absent from conjugSys and on
// readConjugation() creating the inflectionTable entry together with the
// conjugSys entry.
939 const TemplateInflectionTable &ti =
940 fvd.inflectionTable.find(tname)->second;
941 TemplateInflectionTable::const_iterator j = ti.find(utf8Term);
943 cout << " tname='" << tname << "'\n";
945 continue; // template 'tname' does not accept termination 'term'
947 // template 'tname' accepts 'term', so we produce some results.
949 string templateTerm(tname, tname.find(':') + 1);
950 // termination of the infinitive form
952 cout << " templateTerm='" << templateTerm << "'\n";
954 const vector<ModeTensePersonNumber> &v = j->second;
955 // list of mode-tense-person combinations that can correspond
956 // to the conjugated verb's termination
958 for (vector<ModeTensePersonNumber>::const_iterator k = v.begin();
961 const ModeTensePersonNumber &mtpn = *k;
963 string infinitive = trieValue.correctVerbRadical + templateTerm;
964 // The infinitive of the conjugated verb is formed from its
965 // (correct) radical part and from the termination of the template name.
966 // Correct means with the proper accents. This allows the user
967 // to type "etaler" without the acute accent on the first "e"
968 // and obtain the conjugation for the correct verb, which has
973 const wstring radical(conjugatedVerb, 0, index);
974 cout << "VerbTrie::onFoundPrefixWithUserData: radical='"
975 << fvd.wideToUTF8(radical) << "', templateTerm='" << templateTerm
976 << "', tname='" << tname
977 << "', correctVerbRadical='" << trieValue.correctVerbRadical
980 << mtpn.tense << ", "
981 << (unsigned) mtpn.person << ", "
982 << mtpn.plural << ", "
983 << mtpn.correct << ")\n";
986 results->push_back(InflectionDesc(infinitive, tname, mtpn));
987 // the InflectionDesc object is an analysis of the
// Returns a display name for mode 'm', taken from a static table indexed
// by (m - 1). Values outside INFINITIVE_MODE..PAST_PERFECT_INFINITIVE
// trigger an assertion. The names use spaces ("present indicative"),
// unlike the hyphenated names accepted by convertModeName().
996 FrenchVerbDictionary::getModeName(Mode m)
998 if (int(m) < int(INFINITIVE_MODE) || int(m) > int(PAST_PERFECT_INFINITIVE))
1000 assert(!"FrenchVerbDictionary::getModeName() received invalid Mode value");
// The order of this table must follow the numeric order of the Mode values.
1004 static const char *names[] =
1006 "infinitive", "indicative", "conditional",
1007 "subjunctive", "imperative", "participle",
1009 "present indicative",
1010 "present subjunctive",
1011 "present imperative",
1013 "past imperfect indicative",
1014 "past perfect indicative",
1015 "past perfect subjunctive",
1016 "past perfect imperative",
1017 "past perfect infinitive",
1020 size_t index = size_t(m) - 1;
1021 assert(index < sizeof(names) / sizeof(names[0]));
1022 return names[index];
// Returns a display name for tense 't', taken from a static table indexed
// by (t - 1). Values outside PRESENT_TENSE..PAST_PERFECT trigger an
// assertion. "active" and "passive" each appear twice in the table, so
// two distinct Tense values share each of those names -- presumably the
// plain and the imperative variants, to be confirmed against the enum.
1028 FrenchVerbDictionary::getTenseName(Tense t)
1030 if (int(t) < int(PRESENT_TENSE) || int(t) > int(PAST_PERFECT))
1032 assert(!"FrenchVerbDictionary::getTenseName() received invalid Tense value");
// The order of this table must follow the numeric order of the Tense values.
1036 static const char *names[] =
1038 "present", "past", "imperfect", "future",
1039 "active", "passive", "active", "passive", "past perfect",
1042 size_t index = size_t(t) - 1;
1043 assert(index < sizeof(names) / sizeof(names[0]));
1044 return names[index];
// Builds a lower-case version of 'wideString' using latin1TolowerTable.
// NOTE(review): the table lookup truncates the character to 8 bits; the
// line that presumably restricts the lookup to codes up to 0xFF is not
// visible in this excerpt -- confirm.
1049 FrenchVerbDictionary::tolowerWide(const wstring &wideString) const
1052 for (wstring::size_type len = wideString.length(), i = 0; i < len; i++)
1054 wchar_t c = wideString[i];
1056 result += (unsigned char) latin1TolowerTable[(unsigned char) c];
// Tells whether 'c' is a vowel: one of the ASCII letters a, e, i, o, u, y
// in either case, or a Latin-1 code in 0xC0..0xFF that is not one of the
// codes excluded below.
// NOTE(review): the cast to unsigned char keeps only the low 8 bits of c,
// and strchr() also matches the terminating null of the literal, so
// c == 0 takes the first branch -- confirm this is intended
// (generateTense() passes 0 for an empty verb form).
1066 FrenchVerbDictionary::isWideVowel(wchar_t c)
1068 if (strchr("aeiouyAEIOUY", (unsigned char) c) != NULL)
1070 if (c < 0xc0 || c > 0xff)
1072 return c != 0xc7 && c != 0xd0
1073 && c != 0xd1 && c != 0xd7 && c != 0xde
1075 && c != 0xf0 && c != 0xf1 && c != 0xf7 && c != 0xfe;
// Converts a UTF-8 string to a wide character string with the iconv
// descriptor utf8ToWideConv. The terminating null byte is included in the
// conversion and excluded again from the result length.
// The exception specification allows only int; deconjugate() catches an
// int from this function and treats it as a wrong-encoding error.
// NOTE(review): both buffers come from new char[]; their release,
// including on the failure path, is not visible in this excerpt.
1080 FrenchVerbDictionary::utf8ToWide(const string &utf8String) const throw(int)
1082 size_t inbytesleft = utf8String.length() + 1; // number of *bytes* in UTF-8 string
1083 size_t outbytesleft = inbytesleft * sizeof(wchar_t); // oversized for safety
1084 char *inbuf = strcpy(new char[inbytesleft], utf8String.c_str());
1085 char *outbuf = new char[outbytesleft];
// iconv() advances the pointers passed to it, so it is given copies while
// inbuf and outbuf keep pointing at the start of the buffers.
1087 ICONV_CONST char *in = inbuf;
1089 size_t initNumOutBytes = outbytesleft;
1090 if (iconv(utf8ToWideConv, &in, &inbytesleft, &out, &outbytesleft) == (size_t) -1)
1098 // iconv() has subtracted the number of bytes produced
1099 // from outbytesleft. This allows the computation of the
1100 // number of wide characters in the result (excluding the
1101 // terminating null character).
1102 // See the iconv(3) man page for details.
1104 const wchar_t *resultPtr = reinterpret_cast<wchar_t *>(outbuf);
1105 size_t resultLen = (initNumOutBytes - outbytesleft) / sizeof(wchar_t) - 1;
1106 assert(resultPtr[resultLen] == 0);
1108 wstring result(resultPtr, resultLen);
1109 assert(result.length() == resultLen);
// Converts a wide character string to UTF-8 with the iconv descriptor
// wideToUTF8Conv. The terminating null character is converted as well, so
// the output buffer can be read back as a C string.
// The exception specification allows only int.
// NOTE(review): both buffers come from new char[]; their release,
// including on the failure path, is not visible in this excerpt.
1118 FrenchVerbDictionary::wideToUTF8(const wstring &wideString) const throw(int)
1120 size_t inbytesleft = (wideString.length() + 1) * sizeof(wchar_t);
// NOTE(review): the size assumption below holds for a 4-byte wchar_t;
// with a 2-byte wchar_t one character can need 3 UTF-8 bytes -- confirm
// for the supported platforms.
1121 size_t outbytesleft = inbytesleft; // UTF-8 string takes no more room than wstring
1122 char *inbuf = reinterpret_cast<char *>(memcpy(new char[inbytesleft], wideString.data(), inbytesleft));
1123 char *outbuf = new char[outbytesleft];
// iconv() advances the pointers passed to it, so it is given copies.
1125 ICONV_CONST char *in = inbuf;
1127 if (iconv(wideToUTF8Conv, &in, &inbytesleft, &out, &outbytesleft) == (size_t) -1)
1135 string result = outbuf;
// Returns the radical of 'infinitive': the infinitive with as many
// trailing bytes removed as 'templateName' has after its colon.
// Lengths are counted in bytes of the std::string, not in characters.
// Throws logic_error when the template name contains no colon.
// NOTE(review): if the infinitive is shorter than the template suffix,
// the unsigned subtraction wraps around and the whole infinitive is
// returned -- confirm that callers cannot pass such a pair.
1144 FrenchVerbDictionary::getRadical(
1145 const string &infinitive,
1146 const string &templateName) throw(logic_error)
1148 string::size_type posColon = templateName.find(':');
1149 if (posColon == string::npos)
1150 throw logic_error("no colon found in template name");
1152 string::size_type lenSuffix = templateName.length() - posColon - 1;
1153 string::size_type lenInfPrefix = infinitive.length() - lenSuffix;
1154 return string(infinitive, 0, lenInfPrefix);
// Generates the conjugated forms of one tense of one mode.
// For each person in the template's tense, a new vector is appended to
// 'dest', holding radical + inflection for every variant marked correct.
// Pronouns are prepended only in the indicative, conditional and
// subjunctive modes and only when 'includePronouns' is true; 'isItalian'
// selects Italian pronouns instead of French ones for persons 2 to 5.
// If the template has no entry for the mode or the tense, the visible
// code produces nothing.
// (Part of the parameter list is not visible in this excerpt.)
1159 FrenchVerbDictionary::generateTense(const string &radical,
1160 const TemplateSpec &templ,
1163 vector< vector<string> > &dest,
1164 bool includePronouns,
1166 bool isItalian) const throw()
1168 if (templ.find(mode) == templ.end())
1171 const ModeSpec &modeSpec = templ.find(mode)->second;
1173 if (modeSpec.find(tense) == modeSpec.end())
1176 const TenseSpec &tenseSpec = modeSpec.find(tense)->second;
1178 if (mode != INDICATIVE_MODE
1179 && mode != CONDITIONAL_MODE
1180 && mode != SUBJUNCTIVE_MODE)
1181 includePronouns = false;
1183 for (TenseSpec::const_iterator p = tenseSpec.begin();
1184 p != tenseSpec.end(); p++)
1186 dest.push_back(vector<string>());
1187 for (PersonSpec::const_iterator i = p->begin(); i != p->end(); i++)
1189 // Do not return spellings that are marked incorrect.
1190 // They are in the knowledge base only to allow
1191 // error-tolerant searches.
1193 if (!(*i).isCorrect)
1196 string pronoun; // no pronoun by default
1198 string v = radical + (*i).inflection;
1200 if (includePronouns)
// Zero-based position of the person within the tense (0 to 5).
1202 size_t noPers = p - tenseSpec.begin();
1210 bool elideJe = false;
// The first person pronoun may be elided to "j'" depending on the first
// character of the verb form (h or a vowel); the complete condition is
// not visible in this excerpt.
1213 wstring wideV = utf8ToWide(v); // inefficient: converts all chars, only 1st needed
1214 wchar_t init = (wideV.empty() ? '\0' : wideV[0]);
1215 if (init == 'h' || init == 'H' || isWideVowel(init))
1218 pronoun = (elideJe ? "j'" : "je ");
1221 case 1: pronoun = "tu "; break;
1222 case 2: pronoun = (isItalian ? "egli " : "il "); break;
1223 case 3: pronoun = (isItalian ? "noi " : "nous "); break;
1224 case 4: pronoun = (isItalian ? "voi " : "vous "); break;
1225 case 5: pronoun = (isItalian ? "essi " : "ils "); break;
// In the subjunctive, a conjunction 'conj' is put in front of the
// pronoun; how it is chosen is not visible in this excerpt.
1228 if (mode == SUBJUNCTIVE_MODE)
1233 else if (noPers == 2 || noPers == 5)
1237 pronoun = conj + pronoun;
1241 dest.back().push_back(pronoun + v);
// Returns true when 'infinitive' (UTF-8, exact spelling) is present in
// aspirateHVerbs, the set filled by readVerbs().
1249 bool FrenchVerbDictionary::isVerbStartingWithAspirateH(
1250 const std::string &infinitive) const throw()
1252 return aspirateHVerbs.find(infinitive) != aspirateHVerbs.end();