Initial check-in
[him-cellwriter] / src / wordfreq.c
1
2 /*
3
4 cellwriter -- a character recognition input method
5 Copyright (C) 2007 Michael Levin <risujin@risujin.org>
6
7 This program is free software; you can redistribute it and/or
8 modify it under the terms of the GNU General Public License
9 as published by the Free Software Foundation; either version 2
10 of the License, or (at your option) any later version.
11
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 GNU General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with this program; if not, write to the Free Software
19 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
20
21 */
22
23 #include "config.h"
24 #include "common.h"
25 #include "recognize.h"
26 #include <stdlib.h>
27 #include <string.h>
28
29 /* cellwidget.c */
30 const char *cell_widget_word(void);
31
32 /*
33         Word frequency engine
34 */
35
36 #ifndef DISABLE_WORDFREQ
37
38 /* TODO needs to be internationalized (wide char)
39    TODO user-made words list
40    TODO choose a list via GUI
41    FIXME the frequency list contains "n't" etc as separate endings, this
42          needs to be taken into consideration */
43
44 /* The number of word frequency entries to load */
45 #define WORDFREQS 15000
46
47 typedef struct {
48         char string[24];
49         int count;
50 } WordFreq;
51
52 int wordfreq_enable = TRUE;
53
54 static WordFreq wordfreqs[WORDFREQS + 1];
55 static int wordfreqs_len, wordfreqs_count;
56
57 void load_wordfreq(void)
58 /* Read in the word frequency file. The file format is: word\tcount\n */
59 {
60         GIOChannel *channel;
61         GError *error = NULL;
62         char buf[64], *path;
63         gsize bytes_read = 1;
64         int i;
65
66         wordfreqs[0].string[0] = 0;
67
68         /* Try to open the user's word frequency file */
69         path = g_build_filename(g_get_home_dir(), "." PACKAGE, "wordfreq",
70                                 NULL);
71         channel = g_io_channel_new_file(path, "r", &error);
72         if (error) {
73                 g_debug("User does not have a word frequency file, "
74                         "loading system file");
75                 channel = NULL;
76         }
77         error = NULL;
78         g_free(path);
79
80         /* Open the word frequency file */
81         if (!channel) {
82                 path = g_build_filename(PKGDATADIR, "wordfreq", NULL);
83                 channel = g_io_channel_new_file(path, "r", &error);
84                 if (error) {
85                         g_warning("Failed to open system word frequency file "
86                                   "'%s' for reading: %s", path, error->message);
87                         g_free(path);
88                         return;
89                 }
90                 g_free(path);
91         }
92
93         /* Read in every entry */
94         g_debug("Parsing word frequency list");
95         wordfreqs_count = 0;
96         for (i = 0; bytes_read > 0 && i < WORDFREQS; i++) {
97                 char *pbuf;
98                 int swap, len;
99
100                 /* Read a line */
101                 pbuf = buf - 1;
102                 do {
103                         g_io_channel_read_chars(channel, ++pbuf, 1,
104                                                 &bytes_read, &error);
105                 } while (bytes_read > 0 && *pbuf != '\n' &&
106                          pbuf < buf + sizeof (buf));
107                 *pbuf = 0;
108
109                 /* Parse the word */
110                 pbuf = buf;
111                 while (*pbuf && *pbuf != '\t' && *pbuf != ' ')
112                         pbuf++;
113                 if (buf == pbuf) {
114                         i--;
115                         continue;
116                 }
117                 swap = *pbuf;
118                 *pbuf = 0;
119                 len = pbuf - buf;
120                 if (len >= (int)sizeof (wordfreqs[i].string))
121                         len = sizeof (wordfreqs[i].string) - 1;
122                 memcpy(wordfreqs[i].string, buf, len);
123                 wordfreqs[i].string[len] = 0;
124
125                 /* Parse the count */
126                 *pbuf = swap;
127                 while (*pbuf == ' ' || *pbuf == '\t')
128                         pbuf++;
129                 wordfreqs_count += wordfreqs[i].count = log(atoi(pbuf));
130         }
131         wordfreqs[i].string[0] = 0;
132         wordfreqs_len = i;
133         g_io_channel_unref(channel);
134         g_debug("%d words parsed", i);
135
136         return;
137 }
138
139 void engine_wordfreq(void)
140 {
141         Sample *sample;
142         const char *pre, *post;
143         int i, pre_len, post_len, chars[128];
144
145         if (!wordfreq_enable)
146                 return;
147         pre = cell_widget_word();
148         pre_len = strlen(pre);
149         post = pre + pre_len + 1;
150         post_len = strlen(post);
151         if (!pre_len && !post_len)
152                 return;
153         memset(chars, 0, sizeof (chars));
154
155         /* Numbers follow numbers */
156         if (g_ascii_isdigit(pre[pre_len - 1])) {
157                 for (i = 0; i <= 9; i++)
158                         chars['0' + i] = 1;
159                 goto apply_table;
160         }
161
162         /* Search the databases for matches (FIXME sort/index) */
163         for (i = 0; i < wordfreqs_len; i++)
164                 if ((!pre_len ||
165                      !g_ascii_strncasecmp(pre, wordfreqs[i].string, pre_len)) &&
166                     (!post_len ||
167                      !g_ascii_strncasecmp(post, wordfreqs[i].string + pre_len +
168                                           1, post_len))) {
169                         int ch = wordfreqs[i].string[pre_len],
170                             ch_lower = ch, ch_upper = 0;
171
172                         if (ch < 32 || ch >= 127)
173                                 continue;
174
175                         /* Suggest proper case */
176                         if (g_ascii_isalpha(ch)) {
177                                 ch_lower = g_ascii_tolower(ch);
178                                 ch_upper = g_ascii_toupper(ch);
179                                 if (pre_len > 1) {
180                                         if (g_ascii_islower(pre[pre_len - 1]))
181                                                 ch_upper = 0;
182                                         else
183                                         if (g_ascii_isupper(pre[pre_len - 1]) &&
184                                             g_ascii_isupper(pre[pre_len - 2]))
185                                                 ch_lower = 0;
186                                 }
187                         }
188
189                         chars[ch_lower] += wordfreqs[i].count;
190                         chars[ch_upper] += wordfreqs[i].count;
191                 }
192
193 apply_table:
194         /* Apply characters table */
195         sampleiter_reset();
196         while ((sample = sampleiter_next()))
197                 if (sample->ch >= 32 && sample->ch < 127)
198                         sample->ratings[ENGINE_WORDFREQ] = chars[sample->ch];
199 }
200
201 #endif /* DISABLE_WORDFREQ */
202