4 cellwriter -- a character recognition input method
5 Copyright (C) 2007 Michael Levin <risujin@risujin.org>
7 This program is free software; you can redistribute it and/or
8 modify it under the terms of the GNU General Public License
9 as published by the Free Software Foundation; either version 2
10 of the License, or (at your option) any later version.
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with this program; if not, write to the Free Software
19 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
29 #include "recognize.h"
34 void engine_prep(void);
42 /* Preprocessor engine must run first */
43 { "Key-point distance", engine_prep, MAX_RANGE, TRUE, -1, 0, 0 },
45 /* Averaging engines */
46 { "Average distance", engine_average, MAX_RANGE, TRUE, -1, 0, 0 },
47 { "Average angle", NULL, MAX_RANGE, TRUE, 0, 0, 0 },
49 #ifndef DISABLE_WORDFREQ
50 /* Word frequency engine */
51 { "Word context", engine_wordfreq, MAX_RANGE / 3, FALSE, -1, 0, 0 },
55 static int engine_rating(const Sample *sample, int j)
56 /* Get the processed rating for engine j on a sample */
60 if (!engines[j].range || engines[j].max < 1)
62 value = ((int)sample->ratings[j] - engines[j].average) *
63 engines[j].range / engines[j].max;
64 if (engines[j].scale >= 0)
65 value = value * engines[j].scale / ENGINE_SCALE;
73 typedef struct SampleLink {
75 struct SampleLink *prev, *next;
78 static SampleLink *samplelink_root = NULL, *samplelink_iter = NULL;
79 static int current = 1;
81 static Sample *sample_new(void)
82 /* Allocate a link in the sample linked list */
86 link = g_malloc0(sizeof (*link));
87 link->next = samplelink_root;
89 samplelink_root->prev = link;
90 samplelink_root = link;
94 void sampleiter_reset(void)
95 /* Reset the sample linked list iterator */
97 samplelink_iter = samplelink_root;
100 Sample *sampleiter_next(void)
101 /* Get the next sample link from the sample linked list iterator */
105 if (!samplelink_iter)
107 link = samplelink_iter;
108 samplelink_iter = samplelink_iter->next;
109 return &link->sample;
112 int samples_loaded(void)
114 return samplelink_root != NULL;
121 int samples_max = 5, no_latin_alpha = FALSE;
123 void clear_sample(Sample *sample)
124 /* Free stroke data associated with a sample and reset its parameters */
128 for (i = 0; i < sample->len; i++) {
129 stroke_free(sample->strokes[i]);
130 stroke_free(sample->roughs[i]);
132 memset(sample, 0, sizeof (*sample));
135 void copy_sample(Sample *dest, const Sample *src)
136 /* Copy a sample, cloing its strokes, overwriting dest */
141 for (i = 0; i < src->len; i++) {
142 dest->strokes[i] = stroke_clone(src->strokes[i], FALSE);
143 dest->roughs[i] = stroke_clone(src->roughs[i], FALSE);
147 static void process_gluable(const Sample *sample, int stroke_num)
148 /* Calculates the lowest distance between the start or end of one stroke and any
149 other point on each other stroke in the sample */
155 /* Dots cannot be glued */
156 s1 = sample->strokes[stroke_num];
157 memset(s1->gluable_start, -1, sizeof (s1->gluable_start));
158 memset(s1->gluable_end, -1, sizeof (s1->gluable_end));
159 if (s1->spread < DOT_SPREAD)
164 point = start ? s1->points[0] : s1->points[s1->len - 1];
165 for (i = 0; i < sample->len; i++) {
168 float dist, min = GLUE_DIST;
172 s2 = sample->strokes[i];
173 if (i == stroke_num || s2->spread < DOT_SPREAD)
176 /* Check the distance to the first point */
177 vec2_set(&v, s2->points[0].x - point.x,
178 s2->points[0].y - point.y);
183 /* Find the lowest distance from the glue point to any other
184 point on the other stroke */
185 for (j = 0; j < s2->len - 1; j++) {
187 double dist, mag, dot;
189 /* Vector l is a unit vector from point j to j + 1 */
190 vec2_set(&l, s2->points[j].x - s2->points[j + 1].x,
191 s2->points[j].y - s2->points[j + 1].y);
192 mag = vec2_norm(&l, &l);
194 /* Vector w is a vector from point j to our point */
195 vec2_set(&w, s2->points[j].x - point.x,
196 s2->points[j].y - point.y);
198 /* For points that are not in between a segment,
199 get the distance from the points themselves,
200 otherwise get the distance from the segment line */
201 dot = vec2_dot(&l, &w);
202 if (dot < 0. || dot > mag) {
203 vec2_set(&v, s2->points[j + 1].x - point.x,
204 s2->points[j + 1].y - point.y);
207 dist = vec2_cross(&w, &l);
214 gluable = min * GLUABLE_MAX / GLUE_DIST;
216 s1->gluable_start[i] = gluable;
218 s1->gluable_end[i] = gluable;
226 void process_sample(Sample *sample)
227 /* Generate cached properties of a sample */
232 if (sample->processed)
234 sample->processed = TRUE;
236 /* Make sure all strokes have been processed first */
237 for (i = 0; i < sample->len; i++)
238 process_stroke(sample->strokes[i]);
240 /* Compute properties for each stroke */
241 vec2_set(&sample->center, 0., 0.);
242 for (i = 0, distance = 0.; i < sample->len; i++) {
248 stroke = sample->strokes[i];
250 /* Add the stroke center to the center vector, weighted by
252 vec2_copy(&v, &stroke->center);
253 weight = stroke->spread < DOT_SPREAD ?
254 DOT_SPREAD : stroke->distance;
255 vec2_scale(&v, &v, weight);
256 vec2_sum(&sample->center, &sample->center, &v);
259 /* Get gluing distances */
260 process_gluable(sample, i);
262 /* Create a rough-sampled version */
263 points = stroke->distance / ROUGH_RESOLUTION + 0.5;
266 sample->roughs[i] = sample_stroke(NULL, stroke, points, points);
268 vec2_scale(&sample->center, &sample->center, 1.f / distance);
269 sample->distance = distance;
272 void center_samples(Vec2 *ac_to_bc, Sample *a, Sample *b)
273 /* Adjust for the difference between two sample centers */
275 vec2_sub(ac_to_bc, &b->center, &a->center);
278 int char_disabled(int ch)
279 /* Returns TRUE if a character is not renderable or is explicity disabled by
280 a setting (not counting disabled Unicode blocks) */
282 return (no_latin_alpha && ch >= unicode_blocks[0].start &&
283 ch <= unicode_blocks[0].end && g_ascii_isalpha(ch)) ||
284 !g_unichar_isgraph(ch);
287 int sample_disqualified(const Sample *sample)
288 /* Check disqualification conditions for a sample during recognition.
289 The preprocessor engine must run before any calls to this or
290 disqualification will not work. */
292 if ((!ignore_stroke_num && sample->len != input->len) ||
295 if (sample->disqualified)
297 if (char_disabled(sample->ch))
302 int sample_valid(const Sample *sample, int used)
303 /* Check if this sample has changed since it was last referenced */
305 if (!sample || !used)
307 return sample->used == used;
310 static void sample_rating(Sample *sample)
311 /* Get the composite processed rating on a sample */
315 if (!sample->ch || sample_disqualified(sample) ||
316 sample->penalty >= 1.f) {
317 sample->rating = RATING_MIN;
320 for (i = 0, rating = 0; i < ENGINES; i++)
321 rating += engine_rating(sample, i);
322 rating *= 1.f - sample->penalty;
323 if (rating > RATING_MAX)
325 if (rating < RATING_MIN)
327 sample->rating = rating;
330 void update_enabled_samples(void)
331 /* Run through the samples list and enable samples in enabled blocks */
336 while ((sample = sampleiter_next())) {
339 sample->enabled = FALSE;
342 block = unicode_blocks;
343 while (block->name) {
344 if (sample->ch >= block->start &&
345 sample->ch <= block->end) {
346 sample->enabled = block->enabled;
354 void promote_sample(Sample *sample)
355 /* Update usage counter for a sample */
357 sample->used = current++;
360 void demote_sample(Sample *sample)
361 /* Remove the sample from our set if we can */
363 if (char_trained(sample->ch) > 1)
364 clear_sample(sample);
369 Stroke *transform_stroke(Sample *src, Transform *tfm, int i)
370 /* Create a new stroke by applying the transformation to the source */
375 stroke = stroke_new(0);
376 for (k = 0, j = 0; k < STROKES_MAX && j < src->len; k++)
377 for (j = 0; j < src->len; j++)
378 if (tfm->order[j] - 1 == i && tfm->glue[j] == k) {
379 glue_stroke(&stroke, src->strokes[j],
383 process_stroke(stroke);
388 Recognition and training
391 Sample *input = NULL;
392 int strength_sum = 0;
394 static GTimer *timer;
396 void recognize_init(void)
398 #ifndef DISABLE_WORDFREQ
401 timer = g_timer_new();
404 void recognize_sample(Sample *sample, Sample **alts, int num_alts)
407 int i, range, strength, msec;
409 g_timer_start(timer);
411 process_sample(input);
415 while ((sample = sampleiter_next())) {
416 memset(sample->ratings, 0, sizeof (sample->ratings));
421 for (i = 0, range = 0; i < ENGINES; i++) {
427 /* Compute average and maximum value */
429 engines[i].average = 0;
431 while ((sample = sampleiter_next())) {
436 if (sample->ratings[i] > value)
437 value = sample->ratings[i];
438 if (!value && engines[i].ignore_zeros)
440 if (value > engines[i].max)
441 engines[i].max = value;
442 engines[i].average += value;
447 engines[i].average /= rated;
448 if (engines[i].max > 0)
449 range += engines[i].range;
450 if (engines[i].max == engines[i].average) {
451 engines[i].average = 0;
454 engines[i].max -= engines[i].average;
457 g_timer_elapsed(timer, µsec);
458 msec = microsec / 100;
459 g_message("Recognized -- No ratings, %dms", msec);
464 /* Rank the top samples */
467 while ((sample = sampleiter_next())) {
470 sample_rating(sample);
471 if (sample->rating < 1)
474 /* Bubble-sort the new rating in */
475 for (j = 0; j < num_alts; j++)
477 if (j < num_alts - 1)
480 } else if (alts[j]->ch == sample->ch) {
481 if (alts[j]->rating >= sample->rating)
484 } else if (alts[j]->rating < sample->rating) {
487 if (j == num_alts - 1)
490 /* See if the character is in the list */
491 for (k = j + 1; k < num_alts - 1 && alts[k] &&
492 alts[k]->ch != sample->ch; k++);
494 /* Do not swallow zeroes */
495 if (!alts[k] && k < num_alts - 1)
498 memmove(alts + j + 1, alts + j,
499 sizeof (*alts) * (k - j));
507 /* Normalize the alternates' accuracies to 100 */
509 for (i = 0; i < num_alts && alts[i]; i++)
510 alts[i]->rating = alts[i]->rating * 100 / range;
512 /* Keep track of strength stat */
515 strength = alts[1] ? alts[0]->rating - alts[1]->rating :
517 strength_sum += strength;
520 g_timer_elapsed(timer, µsec);
521 msec = microsec / 100;
522 g_message("Recognized -- %d/%d (%d%%) disqualified, "
523 "%dms (%dms/symbol), %d%% strong",
524 num_disqualified, prep_examined,
525 num_disqualified * 100 / prep_examined, msec,
526 prep_examined - num_disqualified ?
527 msec / (prep_examined - num_disqualified) : -1,
530 /* Print out the top candidate scores in detail */
531 if (log_level >= G_LOG_LEVEL_DEBUG)
532 for (i = 0; i < num_alts && alts[i]; i++) {
535 len = input->len >= alts[i]->len ? input->len :
537 log_print("| '%C' (", alts[i]->ch);
538 for (j = 0; j < ENGINES; j++)
539 log_print("%4d [%5d]%s",
540 engine_rating(alts[i], j),
542 j < ENGINES - 1 ? "," : "");
543 log_print(") %3d%% [", alts[i]->rating);
544 for (j = 0; j < len; j++)
546 alts[i]->transform.order[j] - 1);
547 for (j = 0; j < len; j++)
548 log_print("%c", alts[i]->transform.reverse[j] ?
550 for (j = 0; j < len; j++)
551 log_print("%d", alts[i]->transform.glue[j]);
555 /* Select the top result */
556 input->ch = alts[0] ? alts[0]->ch : 0;
559 static void insert_sample(const Sample *new_sample, int force_overwrite)
560 /* Insert a sample into the sample chain, possibly overwriting an older
563 int last_used, count = 0;
564 Sample *sample, *overwrite = NULL, *create = NULL;
566 last_used = force_overwrite ? current + 1 : new_sample->used;
568 while ((sample = sampleiter_next())) {
573 if (sample->ch != new_sample->ch)
575 if (sample->used < last_used) {
577 last_used = sample->used;
581 if (overwrite && count >= samples_max) {
583 clear_sample(sample);
587 sample = sample_new();
588 *sample = *new_sample;
589 process_sample(sample);
592 void train_sample(const Sample *sample, int trusted)
593 /* Overwrite a blank or least-recently-used slot in the samples set */
597 /* Do not allow zero-length samples */
598 if (sample->len < 1) {
599 g_warning("Attempted to train zero length sample for '%C'",
604 copy_sample(&new_sample, sample);
605 new_sample.used = trusted ? current++ : 1;
606 new_sample.enabled = TRUE;
607 insert_sample(&new_sample, TRUE);
610 int char_trained(int ch)
611 /* Count the number of samples for this character */
617 while ((sample = sampleiter_next())) {
618 if (sample->ch != ch)
625 void untrain_char(int ch)
626 /* Delete all samples for a character */
631 while ((sample = sampleiter_next()))
632 if (sample->ch == ch)
633 clear_sample(sample);
640 void recognize_sync(void)
641 /* Sync params with the profile */
645 profile_write("recognize");
646 profile_sync_int(¤t);
647 profile_sync_int(&samples_max);
650 profile_sync_int(&no_latin_alpha);
651 for (i = 0; i < ENGINES; i++)
652 profile_sync_int(&engines[i].range);
656 void sample_read(void)
657 /* Read a sample from the profile */
662 memset(&sample, 0, sizeof (sample));
663 sample.ch = atoi(profile_read());
665 g_warning("Sample on line %d has NULL symbol", profile_line);
668 sample.used = atoi(profile_read());
669 stroke = sample.strokes[0];
674 str = profile_read();
676 if (!sample.strokes[0]) {
677 g_warning("Sample on line %d ('%C') with no "
678 "point data", profile_line,
682 insert_sample(&sample, FALSE);
686 stroke = sample.strokes[sample.len];
689 if (sample.len >= STROKES_MAX) {
690 g_warning("Sample on line %d ('%C') is oversize",
691 profile_line, sample.ch);
692 clear_sample(&sample);
696 stroke = stroke_new(0);
697 sample.strokes[sample.len++] = stroke;
699 if (stroke->len >= POINTS_MAX) {
700 g_warning("Symbol '%C' stroke %d is oversize",
701 sample.ch, sample.len);
702 clear_sample(&sample);
706 y = atoi(profile_read());
707 draw_stroke(&stroke, x, y);
711 static void sample_write(Sample *sample)
712 /* Write a sample link to the profile */
716 profile_write(va("sample %5d %5d", sample->ch, sample->used));
717 for (k = 0; k < sample->len; k++) {
718 for (l = 0; l < sample->strokes[k]->len; l++)
719 profile_write(va(" %4d %4d",
720 sample->strokes[k]->points[l].x,
721 sample->strokes[k]->points[l].y));
727 void samples_write(void)
728 /* Write all of the samples to the profile */
733 while ((sample = sampleiter_next()))
734 if (sample->ch && sample->used)
735 sample_write(sample);