git.maemo.org Git - mverbiste/blob - verbiste/FrenchVerbDictionary.cpp

   1 /*  $Id: FrenchVerbDictionary.cpp,v 1.51 2012/04/24 02:46:05 sarrazip Exp $
   2     FrenchVerbDictionary.cpp - Dictionary of verbs and conjugation templates
   3
   4     verbiste - French conjugation system
   5     Copyright (C) 2003-2010 Pierre Sarrazin <http://sarrazip.com/>
   6
   7     This program is free software; you can redistribute it and/or
   8     modify it under the terms of the GNU General Public License
   9     as published by the Free Software Foundation; either version 2
  10     of the License, or (at your option) any later version.
  11
  12     This program is distributed in the hope that it will be useful,
  13     but WITHOUT ANY WARRANTY; without even the implied warranty of
  14     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15     GNU General Public License for more details.
  16
  17     You should have received a copy of the GNU General Public License
  18     along with this program; if not, write to the Free Software
  19     Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
  20     02111-1307, USA.
  21 */
  22
  23 #include "FrenchVerbDictionary.h"
  24
  25 #include <assert.h>
  26 #include <iostream>
  27 #include <errno.h>
  28 #include <string.h>
  29 #include <stdlib.h>
  30 #include <sys/types.h>
  31 #include <sys/stat.h>
  32 #include <unistd.h>
  33
  34 using namespace std;
  35 using namespace verbiste;
  36
  37
  38 static bool trace = getenv("TRACE") != NULL;
  39
  40
  41 class AutoDoc
  42 {
  43 public:
  44     AutoDoc(xmlDocPtr d) : doc(d) {}
  45     ~AutoDoc() { if (doc != NULL) xmlFreeDoc(doc); }
  46     xmlDocPtr get() const { return doc; }
  47     bool operator ! () const { return doc == NULL; }
  48 private:
  49     xmlDocPtr doc;
  50
  51     // Forbidden operations:
  52     AutoDoc(const AutoDoc &);
  53     AutoDoc &operator = (const AutoDoc &);
  54 };
  55
  56
  57 class AutoString
  58 {
  59 public:
  60     AutoString(xmlChar *s) : str(s) {}
  61     ~AutoString() { if (str != NULL) xmlFree(str); }
  62     xmlChar *get() const { return str; }
  63     bool operator ! () const { return str == NULL; }
  64     size_t length() const { return str == NULL ? 0 : strlen((char *) str); }
  65 private:
  66     xmlChar *str;
  67
  68     // Forbidden operations:
  69     AutoString(const AutoString &);
  70     AutoString &operator = (const AutoString &);
  71 };
  72
  73
  74 inline
  75 const xmlChar *
  76 XMLCHAR(const char *s)
  77 {
  78     return (const xmlChar *) s;
  79 }
  80
  81
  82 inline
  83 int
  84 equal(const xmlChar *a, const char *b)
  85 {
  86     return xmlStrcmp(a, XMLCHAR(b)) == 0;
  87 }
  88
  89
  90 inline
  91 int
  92 different(const xmlChar *a, const char *b)
  93 {
  94     return !equal(a, b);
  95 }
  96
  97
  98 inline
  99 xmlChar *
 100 getProp(xmlNodePtr node, const char *propName)
 101 {
 102     return xmlGetProp(node, XMLCHAR(propName));
 103 }
 104
 105
 106 inline
 107 xmlChar *
 108 getString(xmlDocPtr doc, xmlNodePtr node)
 109 {
 110     return xmlNodeListGetString(doc, node, 1);
 111 }
 112
 113
 114 inline
 115 string
 116 operator + (const AutoString &a, const string &b)
 117 {
 118     return (char *) a.get() + b;
 119 }
 120
 121
 122 inline
 123 string
 124 operator + (const string &a, const AutoString &b)
 125 {
 126     return a + (char *) b.get();
 127 }
 128
 129
 130 inline
 131 Mode
 132 convertModeName(const xmlChar *modeName)
 133 {
 134     return FrenchVerbDictionary::convertModeName((char *) modeName);
 135 }
 136
 137
 138 inline
 139 Tense
 140 convertTenseName(const xmlChar *tenseName)
 141 {
 142     return FrenchVerbDictionary::convertTenseName((char *) tenseName);
 143 }
 144
 145
 146 // Latin-1 to ASCII conversion table (codes 0xC0 to 0xFF).
 147 // Some characters have bogus translations, but they are not used in French.
 148 //
 149 static const char *accentRemovalTable =
 150             "AAAAAA_CEEEEIIII"
 151             "DNOOOOOxOUUUUYbB"
 152             "aaaaaa-ceeeeiiii"
 153             "dnooooo/ouuuuyby";
 154
 155
 156 // Only works on Latin-1 characters.
 157 //
 158 inline wchar_t
 159 removeWideCharAccent(wchar_t c)
 160 {
 161     if (c >= 0xC0 && c <= 0xFF)
 162         c = (unsigned char) accentRemovalTable[c - 0xC0];
 163     return c;
 164 }
 165
 166
 167 string
 168 FrenchVerbDictionary::removeUTF8Accents(const string &utf8String)
 169 {
 170     wstring result = utf8ToWide(utf8String);
 171     for (size_t i = 0; i < result.length(); ++i)
 172         result[i] = removeWideCharAccent(result[i]);
 173     return wideToUTF8(result);
 174 }
 175
 176
 177 void
 178 FrenchVerbDictionary::formUTF8UnaccentedVariants(const wstring &wideString,
 179                                                 size_t index,
 180                                                 vector<string> &utf8Variants)
 181 {
 182     for ( ; index < wideString.length(); ++index)
 183     {
 184         wchar_t ch = wideString[index];
 185         wchar_t unacc = removeWideCharAccent(ch);
 186         if (ch != unacc)
 187         {
 188             wstring copy = wideString;
 189             copy[index] = unacc;
 190             assert(copy.length() == wideString.length());
 191
 192             utf8Variants.push_back(wideToUTF8(copy));
 193             formUTF8UnaccentedVariants(copy, index + 1, utf8Variants);
 194         }
 195     }
 196 }
 197
 198
 199 void
 200 FrenchVerbDictionary::formUTF8UnaccentedVariants(const string &utf8String,
 201                                                 size_t index,
 202                                                 vector<string> &utf8Variants)
 203 {
 204     wstring wideString = utf8ToWide(utf8String);
 205     formUTF8UnaccentedVariants(wideString, index, utf8Variants);
 206 }
 207
 208
 209 void
 210 FrenchVerbDictionary::getXMLFilenames(string &conjFN, string &verbsFN, Language l)
 211 {
 212     const char *libdatadir = getenv("LIBDATADIR");
 213     if (libdatadir == NULL)
 214         libdatadir = LIBDATADIR;
 215     string languageCode = getLanguageCode(l);
 216     conjFN  = libdatadir + string("/") + "conjugation-" + languageCode + ".xml";
 217     verbsFN = libdatadir + string("/") + "verbs-" + languageCode + ".xml";
 218 }
 219
 220
 221 //static
 222 FrenchVerbDictionary::Language
 223 FrenchVerbDictionary::parseLanguageCode(const std::string &twoLetterCode)
 224 {
 225     if (twoLetterCode == "fr")
 226         return FRENCH;
 227     if (twoLetterCode == "it")
 228         return ITALIAN;
 229     if (twoLetterCode == "el")
 230         return GREEK;
 231     return NO_LANGUAGE;
 232 }
 233
 234
 235 //static
 236 std::string
 237 FrenchVerbDictionary::getLanguageCode(Language l)
 238 {
 239     switch (l)
 240     {
 241     case NO_LANGUAGE: return "";
 242     case FRENCH: return "fr";
 243     case ITALIAN: return "it";
 244     case GREEK: return "el";
 245     }
 246     return "";
 247 }
 248
 249
 250 FrenchVerbDictionary::FrenchVerbDictionary(
 251                                 const string &conjugationFilename,
 252                                 const string &verbsFilename,
 253                                 bool includeWithoutAccents,
 254                                 Language _lang)
 255                                         throw (logic_error)
 256   : conjugSys(),
 257     knownVerbs(),
 258     aspirateHVerbs(),
 259     inflectionTable(),
 260     wideToUTF8Conv((iconv_t) -1),
 261     utf8ToWideConv((iconv_t) -1),
 262     verbTrie(*this),
 263     lang(_lang)
 264 {
 265     if (lang == NO_LANGUAGE)
 266         throw logic_error("Invalid language code");
 267     init(conjugationFilename, verbsFilename, includeWithoutAccents);
 268 }
 269
 270
 271 FrenchVerbDictionary::FrenchVerbDictionary(bool includeWithoutAccents)
 272                                                 throw (std::logic_error)
 273   : conjugSys(),
 274     knownVerbs(),
 275     aspirateHVerbs(),
 276     inflectionTable(),
 277     wideToUTF8Conv((iconv_t) -1),
 278     utf8ToWideConv((iconv_t) -1),
 279     verbTrie(*this),
 280     lang(FRENCH)
 281 {
 282     string conjFN, verbsFN;
 283     getXMLFilenames(conjFN, verbsFN, lang);
 284
 285     init(conjFN, verbsFN, includeWithoutAccents);
 286 }
 287
 288
 289 void
 290 FrenchVerbDictionary::init(const string &conjugationFilename,
 291                             const string &verbsFilename,
 292                             bool includeWithoutAccents)
 293                                         throw (logic_error)
 294 {
 295     wideToUTF8Conv = iconv_open("UTF-8", "WCHAR_T");
 296     if (wideToUTF8Conv == (iconv_t) -1)
 297         throw logic_error("conversion from wide characters to UTF-8 not supported");
 298     utf8ToWideConv = iconv_open("WCHAR_T", "UTF-8");
 299     if (utf8ToWideConv == (iconv_t) -1)
 300         throw logic_error("conversion from UTF-8 to wide characters not supported");
 301
 302     #ifndef NDEBUG  // self-test for the wide character string conversions:
 303     try
 304     {
 305         wstring w = utf8ToWide("ab");
 306         assert(w.length() == 2);
 307         assert(w[0] == 'a');
 308         assert(w[1] == 'b');
 309
 310         const char u0[] = { '\xc3', '\xa2', 't', '\0' };  // 'a' with circumflex accent
 311         w = utf8ToWide(u0);
 312         assert(w.length() == 2);
 313         assert(w[0] == 0xe2);
 314         assert(w[1] == 't');
 315
 316         const char u1[] = { 't', '\xc3', '\xa2', '\0' };  // 'a' with circumflex accent
 317         w = utf8ToWide(u1);
 318         assert(w.length() == 2);
 319         assert(w[0] == 't');
 320         assert(w[1] == 0xe2);
 321     }
 322     catch (int e)
 323     {
 324         throw logic_error("self-test of utf8ToWide() failed");
 325     }
 326
 327     try
 328     {
 329         string u = wideToUTF8(L"ab");
 330         assert(u.length() == 2);
 331         assert(u[0] == 'a');
 332         assert(u[1] == 'b');
 333     }
 334     catch (int e)
 335     {
 336         throw logic_error("self-test of wideToUTF8() failed");
 337     }
 338     #endif  // ndef NDEBUG
 339
 340
 341     {
 342         for (int i = 0; i < 0xC0; i++)
 343             latin1TolowerTable[i] = char(tolower(char(i)));
 344         for (int i = 0xC0; i < 0xE0; i++)
 345             latin1TolowerTable[i] = char(i + 0x20);
 346         for (int i = 0xE0; i < 0x100; i++)
 347             latin1TolowerTable[i] = char(i);
 348     }
 349
 350     loadConjugationDatabase(conjugationFilename.c_str(), includeWithoutAccents);
 351     loadVerbDatabase(verbsFilename.c_str(), includeWithoutAccents);
 352
 353     // Load additional verbs from $HOME/.verbiste/verbs-<lang>.xml, if present.
 354     //
 355     const char *home = getenv("HOME");
 356     if (home != NULL)  // do nothing if $HOME not defined
 357     {
 358         string otherVerbsFilename = string(home) + "/.verbiste/verbs-" + getLanguageCode(lang) + ".xml";
 359         struct stat statbuf;
 360         if (stat(otherVerbsFilename.c_str(), &statbuf) == 0)  // if file exists
 361         {
 362             //cout << "otherVerbsFilename=" << otherVerbsFilename << endl;
 363             loadVerbDatabase(otherVerbsFilename.c_str(), includeWithoutAccents);
 364         }
 365     }
 366
 367     if (trace)
 368         cout << "FrenchVerbDictionary::init: trie takes "
 369              << verbTrie.computeMemoryConsumption() << " bytes\n";
 370 }
 371
 372
 373 void
 374 FrenchVerbDictionary::loadConjugationDatabase(
 375                                 const char *conjugationFilename,
 376                                 bool includeWithoutAccents)
 377                                         throw (logic_error)
 378 {
 379     if (conjugationFilename == NULL)
 380         throw invalid_argument("conjugationFilename");
 381
 382     AutoDoc conjDoc(xmlParseFile(conjugationFilename));
 383     if (!conjDoc)
 384         throw logic_error("could not parse " + string(conjugationFilename));
 385
 386     readConjugation(conjDoc.get(), includeWithoutAccents);
 387 }
 388
 389
 390 void
 391 FrenchVerbDictionary::loadVerbDatabase(
 392                                 const char *verbsFilename,
 393                                 bool includeWithoutAccents)
 394                                         throw (logic_error)
 395 {
 396     if (verbsFilename == NULL)
 397         throw invalid_argument("verbsFilename");
 398
 399     AutoDoc verbsDoc(xmlParseFile(verbsFilename));
 400     if (!verbsDoc)
 401         throw logic_error("could not parse " + string(verbsFilename));
 402
 403     readVerbs(verbsDoc.get(), includeWithoutAccents);
 404 }
 405
 406
 407 void
 408 FrenchVerbDictionary::readConjugation(xmlDocPtr doc, bool includeWithoutAccents) throw(logic_error)
 409 {
 410     const bool isItalian = (lang == ITALIAN);
 411
 412     xmlNodePtr rootNodePtr = xmlDocGetRootElement(doc);
 413
 414     if (rootNodePtr == NULL)
 415         throw logic_error("empty conjugation document");
 416
 417     string langCode = getLanguageCode(lang);
 418     if (different(rootNodePtr->name, ("conjugation-" + langCode).c_str()))
 419     {
 420         string msg = "wrong top node in conjugation document: got "
 421                      + string((const char *) rootNodePtr->name)
 422                      + ", expected conjugation-" + langCode;
 423         throw logic_error(msg);
 424     }
 425
 426     for (xmlNodePtr templ = rootNodePtr->xmlChildrenNode;
 427                         templ != NULL;
 428                         templ = templ->next)
 429     {
 430         if (different(templ->name, "template"))  // ignore junk between tags
 431             continue;
 432
 433         string tname = getUTF8XmlProp(templ, "name");
 434         if (tname.empty())
 435             throw logic_error("missing template name attribute");
 436
 437         // The template name is the root and the termination,
 438         // with a colon in between.  For example, "pla:cer".
 439
 440         if (tname.find(':') == string::npos)
 441             throw logic_error("missing colon in template name");
 442
 443         // The use of the [] operator creates an empty conjugation
 444         // template spec, to which we keep a reference:
 445
 446         TemplateSpec &theTemplateSpec = conjugSys[tname];
 447
 448         // Same idea:
 449
 450         TemplateInflectionTable &ti = inflectionTable[tname];
 451
 452         // For each mode (e.g., infinitive, indicative, conditional, etc):
 453         for (xmlNodePtr mode = templ->xmlChildrenNode;
 454                             mode != NULL;
 455                             mode = mode->next)
 456         {
 457             if (equal(mode->name, "text") || equal(mode->name, "comment"))  // any text in this node is ignored
 458                 continue;
 459
 460             if (trace) cout << "readConjugation: mode node: '" << mode->name << "'" << endl;
 461             Mode theMode = ::convertModeName(mode->name);
 462             ModeSpec &theModeSpec = theTemplateSpec[theMode];
 463
 464             // For each tense in the mode:
 465             for (xmlNodePtr tense = mode->xmlChildrenNode;
 466                             tense != NULL;
 467                                 tense = tense->next)
 468             {
 469                 if (equal(tense->name, "text") || equal(tense->name, "comment"))
 470                     continue;
 471
 472                 Tense theTense = ::convertTenseName(tense->name);
 473                 TenseSpec &theTenseSpec = theModeSpec[theTense];
 474
 475                 // For each person in the tense:
 476                 int personCounter = 0;
 477                 for (xmlNodePtr person = tense->xmlChildrenNode;
 478                                 person != NULL;
 479                                 person = person->next)
 480                 {
 481                     if (different(person->name, "p"))
 482                         continue;
 483
 484                     personCounter++;
 485
 486                     theTenseSpec.push_back(PersonSpec());
 487                     PersonSpec &thePersonSpec = theTenseSpec.back();
 488
 489                     // For each variant for this person:
 490                     // (Note that most persons of most verbs have only
 491                     // one variant.)
 492                     for (xmlNodePtr inf = person->xmlChildrenNode;
 493                                     inf != NULL;
 494                                     inf = inf->next)
 495                     {
 496                         string variant = getUTF8XmlNodeText(
 497                                                     doc, inf->xmlChildrenNode);
 498                         thePersonSpec.push_back(InflectionSpec(variant, true));
 499
 500                         ModeTensePersonNumber mtpn(
 501                                 reinterpret_cast<const char *>(mode->name),
 502                                 reinterpret_cast<const char *>(tense->name),
 503                                 personCounter,
 504                                 true,
 505                                 isItalian);
 506                         ti[variant].push_back(mtpn);
 507
 508                         if (includeWithoutAccents)
 509                         {
 510                             // Also include versions where some or all accents are missing.
 511                             vector<string> unaccentedVariants;
 512                             formUTF8UnaccentedVariants(variant, 0, unaccentedVariants);
 513                             for (vector<string>::const_iterator it = unaccentedVariants.begin();
 514                                                                 it != unaccentedVariants.end(); ++it)
 515                             {
 516                                 thePersonSpec.push_back(InflectionSpec(*it, false));
 517                                 mtpn.correct = false;  // 'false' marks this spelling as incorrect.
 518                                 ti[*it].push_back(mtpn);
 519                             }
 520                         }
 521                     }
 522                 }
 523             }
 524         }
 525     }
 526 }
 527
 528
 529 string
 530 FrenchVerbDictionary::getUTF8XmlNodeText(xmlDocPtr doc, xmlNodePtr node)
 531                                                                 throw(int)
 532 {
 533     xmlChar *s = getString(doc, node);
 534     if (s == NULL)
 535         return string();
 536     return reinterpret_cast<char *>(s);
 537 }
 538
 539
 540 string
 541 FrenchVerbDictionary::getUTF8XmlProp(xmlNodePtr node, const char *propName)
 542                                                                 throw(int)
 543 {
 544     xmlChar *s = getProp(node, propName);
 545     if (s == NULL)
 546         return string();
 547     return reinterpret_cast<char *>(s);
 548 }
 549
 550
 551 // Reads the given XML document and adds data to members knownVerbs,
 552 // aspirateHVerbs and verbTrie.
 553 //
 554 void
 555 FrenchVerbDictionary::readVerbs(xmlDocPtr doc,
 556                                 bool includeWithoutAccents)
 557                                                 throw(logic_error)
 558 {
 559     if (trace)
 560         cout << "readVerbs: start: includeWithoutAccents=" << includeWithoutAccents << endl;
 561
 562     xmlNodePtr rootNodePtr = xmlDocGetRootElement(doc);
 563
 564     if (rootNodePtr == NULL)
 565         throw logic_error("empty verbs document");
 566
 567     string langCode = getLanguageCode(lang);
 568     if (different(rootNodePtr->name, ("verbs-" + langCode).c_str()))
 569         throw logic_error("wrong top node in verbs document");
 570
 571     for (xmlNodePtr v = rootNodePtr->xmlChildrenNode; v != NULL; v = v->next)
 572     {
 573         if (equal(v->name, "text") || equal(v->name, "comment"))
 574             continue;
 575
 576         xmlNodePtr i = v->xmlChildrenNode;
 577         if (i == NULL || i->xmlChildrenNode == NULL)
 578             throw logic_error("missing <i> node");
 579
 580         string utf8Infinitive = getUTF8XmlNodeText(doc, i->xmlChildrenNode);
 581         wstring wideInfinitive = utf8ToWide(utf8Infinitive);
 582         if (wideInfinitive.empty())
 583             throw logic_error("empty <i> node");
 584         size_t lenInfinitive = wideInfinitive.length();
 585         if (trace) cout << "utf8Infinitive='" << utf8Infinitive << "'\n";
 586
 587         if (i->next == NULL)
 588             throw logic_error("unexpected end after <i> node");
 589
 590         xmlNodePtr t = i->next->next;
 591         if (t == NULL)
 592             throw logic_error("missing <t> node");
 593
 594         #if 0
 595         cout << "t=" << t << ", t->xmlChildrenNode=" << t->xmlChildrenNode << "\n";
 596         if (t->xmlChildrenNode == NULL)
 597             cout << "  t->next=" << t->next << ", " << (t->next ? getUTF8XmlNodeText(doc, t->next->xmlChildrenNode) : 0) << endl;
 598         #endif
 599
 600         // Get template name (e.g., "aim:er") in UTF-8.
 601         string utf8TName = getUTF8XmlNodeText(doc, t->xmlChildrenNode);
 602         if (utf8TName.empty())
 603             throw logic_error("empty <t> node");
 604         if (trace) cout << "  utf8TName='" << utf8TName << "'\n";
 605
 606         // Check that this template name (seen in verbs-*.xml) has been
 607         // seen in conjugation-*.xml.
 608         //
 609         if (conjugSys.find(utf8TName) == conjugSys.end())
 610             throw logic_error("unknown template name: " + utf8TName);
 611
 612         // Find the offset of the colon in the template name.
 613         // For example: the offset is 3 in the case of "aim:er".
 614         // Find this offset in a wide character string, because
 615         // the offset in a UTF-8 string is in bytes, not characters.
 616         //
 617         wstring wideTName = utf8ToWide(utf8TName);
 618         wstring::size_type posColon = wideTName.find(':');
 619         if (posColon == wstring::npos)
 620             throw logic_error("missing colon in <t> node");
 621         assert(wideTName[posColon] == ':');
 622
 623
 624         knownVerbs[utf8Infinitive].insert(utf8TName);
 625
 626         if (includeWithoutAccents)
 627         {
 628             // Also include versions where some of all accents are missing.
 629             vector<string> unaccentedVariants;
 630             formUTF8UnaccentedVariants(wideInfinitive, 0, unaccentedVariants);
 631             for (vector<string>::const_iterator it = unaccentedVariants.begin();
 632                                                 it != unaccentedVariants.end(); ++it)
 633             {
 634                 if (trace) cout << "  unaccvar: '" << *it << "'\n";
 635                 knownVerbs[*it].insert(utf8TName);
 636             }
 637         }
 638
 639         // <aspirate-h>: If this verb starts with an aspirate h, remember it:
 640         if (t->next != NULL && t->next->next != NULL)
 641             aspirateHVerbs.insert(utf8Infinitive);
 642
 643         // Insert the verb in the trie.
 644         // A list of template names is associated to each verb in this trie.
 645
 646         size_t lenTermination = wideTName.length() - posColon - 1;
 647         assert(lenTermination > 0);
 648         assert(lenInfinitive >= lenTermination);
 649
 650         wstring wideVerbRadical(wideInfinitive, 0, lenInfinitive - lenTermination);
 651         string utf8VerbRadical = wideToUTF8(wideVerbRadical);
 652
 653         insertVerbRadicalInTrie(utf8VerbRadical, utf8TName, utf8VerbRadical);
 654
 655         if (includeWithoutAccents)
 656         {
 657             // Also include versions where some of all accents are missing.
 658             vector<string> unaccentedVariants;
 659             formUTF8UnaccentedVariants(wideVerbRadical, 0, unaccentedVariants);
 660             for (vector<string>::const_iterator it = unaccentedVariants.begin();
 661                                                 it != unaccentedVariants.end(); ++it)
 662             {
 663                 insertVerbRadicalInTrie(*it, utf8TName, utf8VerbRadical);  // pass correct verb radical as 3rd argument
 664             }
 665         }
 666     }
 667
 668     if (trace)
 669         cout << "Number of known verbs (lang " << langCode << "): " << knownVerbs.size() << endl;
 670 }
 671
 672
 673 // String parameters expected to be in UTF-8.
 674 // Adds to 'verbTrie', which contains wide character strings.
 675 //
 676 void
 677 FrenchVerbDictionary::insertVerbRadicalInTrie(
 678                                     const std::string &verbRadical,
 679                                     const std::string &tname,
 680                                     const std::string &correctVerbRadical)
 681 {
 682     wstring wideVerbRadical = utf8ToWide(verbRadical);
 683     if (trace)
 684         cout << "insertVerbRadicalInTrie('"
 685               << verbRadical << "' (len=" << wideVerbRadical.length()
 686               << "), '" << tname
 687               << "', '" << correctVerbRadical
 688               << "')\n";
 689
 690     vector<TrieValue> **templateListPtr =
 691                             verbTrie.getUserDataPointer(wideVerbRadical);
 692     assert(templateListPtr != NULL);
 693
 694     // If a new entry was created for 'wideVerbRadical', then the associated
 695     // user data pointer is null.  Make this pointer point to a new,
 696     // empty vector of template names.
 697     //
 698     if (*templateListPtr == NULL)
 699         *templateListPtr = new vector<TrieValue>();
 700
 701     // Associate the given template name to the given verb radical.
 702     //
 703     (*templateListPtr)->push_back(TrieValue(tname, correctVerbRadical));
 704 }
 705
 706
 707 FrenchVerbDictionary::~FrenchVerbDictionary()
 708 {
 709     iconv_close(utf8ToWideConv);
 710     iconv_close(wideToUTF8Conv);
 711 }
 712
 713
 714 const TemplateSpec *
 715 FrenchVerbDictionary::getTemplate(const string &templateName) const
 716 {
 717     ConjugationSystem::const_iterator it = conjugSys.find(templateName);
 718     if (it == conjugSys.end())
 719         return NULL;
 720     return &it->second;
 721 }
 722
 723
 724 ConjugationSystem::const_iterator
 725 FrenchVerbDictionary::beginConjugSys() const
 726 {
 727     return conjugSys.begin();
 728 }
 729
 730
 731 ConjugationSystem::const_iterator
 732 FrenchVerbDictionary::endConjugSys() const
 733 {
 734     return conjugSys.end();
 735 }
 736
 737
 738 const std::set<std::string> &
 739 FrenchVerbDictionary::getVerbTemplateSet(const char *infinitive) const
 740 {
 741     static const std::set<std::string> emptySet;
 742     if (infinitive == NULL)
 743         return emptySet;
 744     VerbTable::const_iterator it = knownVerbs.find(infinitive);
 745     if (it == knownVerbs.end())
 746         return emptySet;
 747     return it->second;
 748 }
 749
 750
 751 const std::set<std::string> &
 752 FrenchVerbDictionary::getVerbTemplateSet(const string &infinitive) const
 753 {
 754     return getVerbTemplateSet(infinitive.c_str());
 755 }
 756
 757
 758 VerbTable::const_iterator
 759 FrenchVerbDictionary::beginKnownVerbs() const
 760 {
 761     return knownVerbs.begin();
 762 }
 763
 764
 765 VerbTable::const_iterator
 766 FrenchVerbDictionary::endKnownVerbs() const
 767 {
 768     return knownVerbs.end();
 769 }
 770
 771
 772 const std::vector<ModeTensePersonNumber> *
 773 FrenchVerbDictionary::getMTPNForInflection(
 774                                 const std::string &templateName,
 775                                 const std::string &inflection) const
 776 {
 777     InflectionTable::const_iterator i = inflectionTable.find(templateName);
 778     if (i == inflectionTable.end())
 779         return NULL;
 780     const TemplateInflectionTable &ti = i->second;
 781     TemplateInflectionTable::const_iterator j = ti.find(inflection);
 782     if (j == ti.end())
 783         return NULL;
 784     return &j->second;
 785 }
 786
 787
 788 /*static*/
 789 Mode
 790 FrenchVerbDictionary::convertModeName(const char *modeName)
 791 {
 792     Mode mode = INVALID_MODE;
 793     if (modeName == NULL)
 794         ;
 795     else if (strcmp(modeName, "infinitive") == 0)
 796         mode = INFINITIVE_MODE;
 797     else if (strcmp(modeName, "indicative") == 0)
 798         mode = INDICATIVE_MODE;
 799     else if (strcmp(modeName, "conditional") == 0)
 800         mode = CONDITIONAL_MODE;
 801     else if (strcmp(modeName, "subjunctive") == 0)
 802         mode = SUBJUNCTIVE_MODE;
 803     else if (strcmp(modeName, "imperative") == 0)
 804         mode = IMPERATIVE_MODE;
 805     else if (strcmp(modeName, "participle") == 0)
 806         mode = PARTICIPLE_MODE;
 807     else if (strcmp(modeName, "gerund") == 0)
 808         mode = GERUND_MODE;
 809     else if (strcmp(modeName, "present-indicative") == 0)
 810         mode = PRESENT_INDICATIVE;
 811     else if (strcmp(modeName, "present-subjunctive") == 0)
 812         mode = PRESENT_SUBJUNCTIVE;
 813     else if (strcmp(modeName, "present-imperative") == 0)
 814         mode = PRESENT_IMPERATIVE;
 815     else if (strcmp(modeName, "present-gerund") == 0)
 816         mode = PRESENT_GERUND;
 817     else if (strcmp(modeName, "past-imperfect-indicative") == 0)
 818         mode = PAST_IMPERFECT_INDICATIVE;
 819     else if (strcmp(modeName, "past-perfect-indicative") == 0)
 820         mode = PAST_PERFECT_INDICATIVE;
 821     else if (strcmp(modeName, "past-perfect-subjunctive") == 0)
 822         mode = PAST_PERFECT_SUBJUNCTIVE;
 823     else if (strcmp(modeName, "past-perfect-imperative") == 0)
 824         mode = PAST_PERFECT_IMPERATIVE;
 825     else if (strcmp(modeName, "past-perfect-infinitive") == 0)
 826         mode = PAST_PERFECT_INFINITIVE;
 827
 828     if (mode == INVALID_MODE)
 829     {
 830         if (trace) cout << "modeName='" << modeName << "'" << endl;
 831         assert(!"Invalid mode");
 832     }
 833
 834     return mode;
 835 }
 836
 837
 838 /*static*/
 839 Tense
 840 FrenchVerbDictionary::convertTenseName(const char *tenseName)
 841 {
 842     Tense tense = INVALID_TENSE;
 843     if (tenseName == NULL)
 844         ;
 845     else if (strcmp(tenseName, "infinitive-present") == 0)
 846         tense = PRESENT_TENSE;
 847     else if (strcmp(tenseName, "present") == 0)
 848         tense = PRESENT_TENSE;
 849     else if (strcmp(tenseName, "imperfect") == 0)
 850         tense = IMPERFECT_TENSE;
 851     else if (strcmp(tenseName, "future") == 0)
 852         tense = FUTURE_TENSE;
 853     else if (strcmp(tenseName, "simple-past") == 0)
 854         tense = PAST_TENSE;
 855     else if (strcmp(tenseName, "imperative-present") == 0)
 856         tense = PRESENT_TENSE;
 857     else if (strcmp(tenseName, "present-participle") == 0)
 858         tense = PRESENT_TENSE;
 859     else if (strcmp(tenseName, "past-participle") == 0)
 860         tense = PAST_TENSE;
 861     else if (strcmp(tenseName, "past") == 0)
 862         tense = PAST_TENSE;
 863     else if (strcmp(tenseName, "present-gerund") == 0)
 864         tense = PRESENT_TENSE;
 865     else if (strcmp(tenseName, "active") == 0)
 866         tense = ACTIVE_TENSE;
 867     else if (strcmp(tenseName, "passive") == 0)
 868         tense = PASSIVE_TENSE;
 869     else if (strcmp(tenseName, "imp-active") == 0)
 870         tense = IMPERATIVE_ACTIVE_TENSE;
 871     else if (strcmp(tenseName, "imp-passive") == 0)
 872         tense = IMPERATIVE_PASSIVE_TENSE;
 873     else if (strcmp(tenseName, "past-perfect") == 0)
 874         tense = PAST_PERFECT;
 875
 876     if (tense == INVALID_TENSE)
 877     {
 878         if (trace) cout << "tenseName='" << tenseName << "'" << endl;
 879         assert(!"Invalid tense");
 880     }
 881
 882     return tense;
 883 }
 884
 885
 886 void
 887 FrenchVerbDictionary::deconjugate(const string &utf8ConjugatedVerb,
 888                                 std::vector<InflectionDesc> &results)
 889 {
 890     verbTrie.setDestination(&results);
 891
 892     try
 893     {
 894         wstring w = utf8ToWide(utf8ConjugatedVerb);
 895         (void) verbTrie.get(w);
 896     }
 897     catch (int e)  // exception throw by utf8towide()
 898     {
 899         // Wrong encoding (possibly Latin-1). Act as with unknown verb.
 900     }
 901
 902     verbTrie.setDestination(NULL);
 903 }
 904
 905
 906 /*virtual*/
 907 void
 908 FrenchVerbDictionary::VerbTrie::onFoundPrefixWithUserData(
 909                         const wstring &conjugatedVerb,
 910                         wstring::size_type index,
 911                         const vector<TrieValue> *templateList) const throw()
 912 {
 913     assert(templateList != NULL);
 914     if (trace)
 915         wcout << "VerbTrie::onFoundPrefixWithUserData: start: conjugatedVerb='"
 916               << conjugatedVerb << "', index=" << index
 917               << ", templateList: " << templateList->size()
 918               << ", results=" << results << endl;
 919
 920     if (results == NULL)
 921         return;
 922
 923     const wstring term(conjugatedVerb, index);
 924     const string utf8Term = fvd.wideToUTF8(term);
 925
 926     if (trace)
 927         cout << "  utf8Term='" << utf8Term << "'\n";
 928
 929     /*
 930         'templateList' contains the names of conjugated templates that might
 931         apply to the conjugated verb.  We check each of them to see if there
 932         is one that accepts the given termination 'term'.
 933     */
 934     for (vector<TrieValue>::const_iterator i = templateList->begin();
 935                                            i != templateList->end(); i++)
 936     {
 937         const TrieValue &trieValue = *i;
 938         const string &tname = trieValue.templateName;
 939         const TemplateInflectionTable &ti =
 940                                 fvd.inflectionTable.find(tname)->second;
 941         TemplateInflectionTable::const_iterator j = ti.find(utf8Term);
 942         if (trace)
 943             cout << "    tname='" << tname << "'\n";
 944         if (j == ti.end())
 945             continue;  // template 'tname' does not accept termination 'term'
 946
 947         // template 'tname' accepts 'term', so we produce some results.
 948
 949         string templateTerm(tname, tname.find(':') + 1);
 950             // termination of the infinitive form
 951         if (trace)
 952             cout << "    templateTerm='" << templateTerm << "'\n";
 953
 954         const vector<ModeTensePersonNumber> &v = j->second;
 955             // list of mode-tense-person combinations that can correspond
 956             // to the conjugated verb's termination
 957
 958         for (vector<ModeTensePersonNumber>::const_iterator k = v.begin();
 959                                                     k != v.end(); k++)
 960         {
 961             const ModeTensePersonNumber &mtpn = *k;
 962
 963             string infinitive = trieValue.correctVerbRadical + templateTerm;
 964                 // The infinitive of the conjugated verb is formed from its
 965                 // (correct) radical part and from the termination of the template name.
 966                 // Correct means with the proper accents. This allows the user
 967                 // to type "etaler" without the acute accent on the first "e"
 968                 // and obtain the conjugation for the correct verb, which has
 969                 // that accent.
 970
 971             if (trace)
 972             {
 973                 const wstring radical(conjugatedVerb, 0, index);
 974                 cout << "VerbTrie::onFoundPrefixWithUserData: radical='"
 975                     << fvd.wideToUTF8(radical) << "', templateTerm='" << templateTerm
 976                     << "', tname='" << tname
 977                     << "', correctVerbRadical='" << trieValue.correctVerbRadical
 978                     << "', mtpn=("
 979                     << mtpn.mode << ", "
 980                     << mtpn.tense << ", "
 981                     << (unsigned) mtpn.person << ", "
 982                     << mtpn.plural << ", "
 983                     << mtpn.correct << ")\n";
 984             }
 985
 986             results->push_back(InflectionDesc(infinitive, tname, mtpn));
 987                 // the InflectionDesc object is an analysis of the
 988                 // conjugated verb
 989         }
 990     }
 991 }
 992
 993
 994 /*static*/
 995 const char *
 996 FrenchVerbDictionary::getModeName(Mode m)
 997 {
 998     if (int(m) < int(INFINITIVE_MODE) || int(m) > int(PAST_PERFECT_INFINITIVE))
 999     {
1000         assert(!"FrenchVerbDictionary::getModeName() received invalid Mode value");
1001         return NULL;
1002     }
1003
1004     static const char *names[] =
1005     {
1006         "infinitive", "indicative", "conditional",
1007         "subjunctive", "imperative", "participle",
1008         "gerund",
1009         "present indicative",
1010         "present subjunctive",
1011         "present imperative",
1012         "present gerund",
1013         "past imperfect indicative",
1014         "past perfect indicative",
1015         "past perfect subjunctive",
1016         "past perfect imperative",
1017         "past perfect infinitive",
1018     };
1019
1020     size_t index = size_t(m) - 1;
1021     assert(index < sizeof(names) / sizeof(names[0]));
1022     return names[index];
1023 }
1024
1025
1026 /*static*/
1027 const char *
1028 FrenchVerbDictionary::getTenseName(Tense t)
1029 {
1030     if (int(t) < int(PRESENT_TENSE) || int(t) > int(PAST_PERFECT))
1031     {
1032         assert(!"FrenchVerbDictionary::getTenseName() received invalid Tense value");
1033         return NULL;
1034     }
1035
1036     static const char *names[] =
1037     {
1038         "present", "past", "imperfect", "future",
1039         "active", "passive", "active", "passive", "past perfect",
1040     };
1041
1042     size_t index = size_t(t) - 1;
1043     assert(index < sizeof(names) / sizeof(names[0]));
1044     return names[index];
1045 }
1046
1047
1048 wstring
1049 FrenchVerbDictionary::tolowerWide(const wstring &wideString) const
1050 {
1051     wstring result;
1052     for (wstring::size_type len = wideString.length(), i = 0; i < len; i++)
1053     {
1054         wchar_t c = wideString[i];
1055         if (c <= 0xFF)
1056             result += (unsigned char) latin1TolowerTable[(unsigned char) c];
1057         else
1058             result += c;
1059     }
1060     return result;
1061 }
1062
1063
1064 //static
1065 bool
1066 FrenchVerbDictionary::isWideVowel(wchar_t c)
1067 {
1068     if (strchr("aeiouyAEIOUY", (unsigned char) c) != NULL)
1069         return true;
1070     if (c < 0xc0 || c > 0xff)
1071         return false;
1072     return c != 0xc7 && c != 0xd0
1073         && c != 0xd1 && c != 0xd7 && c != 0xde
1074         && c != 0xe7
1075         && c != 0xf0 && c != 0xf1 && c != 0xf7 && c != 0xfe;
1076 }
1077
1078
1079 wstring
1080 FrenchVerbDictionary::utf8ToWide(const string &utf8String) const throw(int)
1081 {
1082     size_t inbytesleft = utf8String.length() + 1;  // number of *bytes* in UTF-8 string
1083     size_t outbytesleft = inbytesleft * sizeof(wchar_t);  // oversized for safety
1084     char *inbuf = strcpy(new char[inbytesleft], utf8String.c_str());
1085     char *outbuf = new char[outbytesleft];
1086
1087     ICONV_CONST char *in = inbuf;
1088     char *out = outbuf;
1089     size_t initNumOutBytes = outbytesleft;
1090     if (iconv(utf8ToWideConv, &in, &inbytesleft, &out, &outbytesleft) == (size_t) -1)
1091     {
1092         int e = errno;
1093         delete [] inbuf;
1094         delete [] outbuf;
1095         throw e;
1096     }
1097
1098     // iconv() has substracted the number of bytes produced
1099     // from outbytesleft. This allows the computation of the
1100     // number of wide characters in the result (excluding the
1101     // terminating null character).
1102     // See the iconv(3) man page for details.
1103     //
1104     const wchar_t *resultPtr = reinterpret_cast<wchar_t *>(outbuf);
1105     size_t resultLen = (initNumOutBytes - outbytesleft) / sizeof(wchar_t) - 1;
1106     assert(resultPtr[resultLen] == 0);
1107
1108     wstring result(resultPtr, resultLen);
1109     assert(result.length() == resultLen);
1110
1111     delete [] inbuf;
1112     delete [] outbuf;
1113     return result;
1114 }
1115
1116
1117 string
1118 FrenchVerbDictionary::wideToUTF8(const wstring &wideString) const throw(int)
1119 {
1120     size_t inbytesleft = (wideString.length() + 1) * sizeof(wchar_t);
1121     size_t outbytesleft = inbytesleft;  // UTF-8 string takes no more room than wstring
1122     char *inbuf = reinterpret_cast<char *>(memcpy(new char[inbytesleft], wideString.data(), inbytesleft));
1123     char *outbuf = new char[outbytesleft];
1124
1125     ICONV_CONST char *in = inbuf;
1126     char *out = outbuf;
1127     if (iconv(wideToUTF8Conv, &in, &inbytesleft, &out, &outbytesleft) == (size_t) -1)
1128     {
1129         int e = errno;
1130         delete [] inbuf;
1131         delete [] outbuf;
1132         throw e;
1133     }
1134
1135     string result = outbuf;
1136     delete [] inbuf;
1137     delete [] outbuf;
1138     return result;
1139 }
1140
1141
1142 /*static*/
1143 string
1144 FrenchVerbDictionary::getRadical(
1145                         const string &infinitive,
1146                         const string &templateName) throw(logic_error)
1147 {
1148     string::size_type posColon = templateName.find(':');
1149     if (posColon == string::npos)
1150         throw logic_error("no colon found in template name");
1151
1152     string::size_type lenSuffix = templateName.length() - posColon - 1;
1153     string::size_type lenInfPrefix = infinitive.length() - lenSuffix;
1154     return string(infinitive, 0, lenInfPrefix);
1155 }
1156
1157
1158 bool
1159 FrenchVerbDictionary::generateTense(const string &radical,
1160                                 const TemplateSpec &templ,
1161                                 Mode mode,
1162                                 Tense tense,
1163                                 vector< vector<string> > &dest,
1164                                 bool includePronouns,
1165                                 bool aspirateH,
1166                                 bool isItalian) const throw()
1167 {
1168     if (templ.find(mode) == templ.end())
1169         return false;
1170
1171     const ModeSpec &modeSpec = templ.find(mode)->second;
1172
1173     if (modeSpec.find(tense) == modeSpec.end())
1174         return false;
1175
1176     const TenseSpec &tenseSpec = modeSpec.find(tense)->second;
1177
1178     if (mode != INDICATIVE_MODE
1179             && mode != CONDITIONAL_MODE
1180             && mode != SUBJUNCTIVE_MODE)
1181         includePronouns = false;
1182
1183     for (TenseSpec::const_iterator p = tenseSpec.begin();
1184                                     p != tenseSpec.end(); p++)
1185     {
1186         dest.push_back(vector<string>());
1187         for (PersonSpec::const_iterator i = p->begin(); i != p->end(); i++)
1188         {
1189             // Do not return spellings that are marked incorrect.
1190             // They are in the knowledge base only to allow
1191             // error-tolerant searches.
1192             //
1193             if (!(*i).isCorrect)
1194                 continue;
1195
1196             string pronoun;  // no pronoun by default
1197
1198             string v = radical + (*i).inflection;
1199
1200             if (includePronouns)
1201             {
1202                 size_t noPers = p - tenseSpec.begin();
1203                 switch (noPers)
1204                 {
1205                 case 0:
1206                     if (isItalian)
1207                         pronoun = "io ";
1208                     else
1209                     {
1210                         bool elideJe = false;
1211                         if (!aspirateH)
1212                         {
1213                             wstring wideV = utf8ToWide(v);  // inefficient: converts all chars, only 1st needed
1214                             wchar_t init = (wideV.empty() ? '\0' : wideV[0]);
1215                             if (init == 'h' || init == 'H' || isWideVowel(init))
1216                                 elideJe = true;
1217                         }
1218                         pronoun = (elideJe ? "j'" : "je ");
1219                     }
1220                     break;
1221                 case 1: pronoun = "tu "; break;
1222                 case 2: pronoun = (isItalian ? "egli " : "il "); break;
1223                 case 3: pronoun = (isItalian ? "noi "  : "nous "); break;
1224                 case 4: pronoun = (isItalian ? "voi "  : "vous "); break;
1225                 case 5: pronoun = (isItalian ? "essi " : "ils "); break;
1226                 }
1227
1228                 if (mode == SUBJUNCTIVE_MODE)
1229                 {
1230                     const char *conj;
1231                     if (isItalian)
1232                         conj = "che ";
1233                     else if (noPers == 2 || noPers == 5)
1234                         conj = "qu'";
1235                     else
1236                         conj = "que ";
1237                     pronoun = conj + pronoun;
1238                 }
1239             }
1240
1241             dest.back().push_back(pronoun + v);
1242         }
1243     }
1244
1245     return true;
1246 }
1247
1248
1249 bool FrenchVerbDictionary::isVerbStartingWithAspirateH(
1250                                 const std::string &infinitive) const throw()
1251 {
1252     return aspirateHVerbs.find(infinitive) != aspirateHVerbs.end();
1253 }