33 #pragma warning(disable:4244) // Conversion warnings
34 #pragma warning(disable:4800) // int/bool warnings
76 if (!choice1)
return choice2;
77 if (!choice2)
return choice1;
92 BLOB_CHOICE_IT c_it(blob_list);
93 while (n-- > 0 && !c_it.at_last())
100 if (!blob_list)
return INVALID_UNICHAR_ID;
101 BLOB_CHOICE_IT blob_choice_it(blob_list);
102 return (blob_choice_it.data()) ? blob_choice_it.data()->unichar_id()
103 : INVALID_UNICHAR_ID;
111 BLOB_CHOICE_IT c_it(blob_list);
114 if (c_it.data()->unichar_id() == target_uid)
return pos;
115 if (c_it.at_last())
break;
133 float *certainties) {
134 int pos_str_len = strlen(pos_str);
136 if (start_pos + pos_str_len > char_choices.
length()) {
140 for (
int x = 0; x < pos_str_len; x++) {
141 int pos = pos_str[x]-
'0';
142 if (pos < 0) pos = 0;
144 tprintf(
"PosStr[%d](%d)=%c %d\n", x, pos_str_len, pos_str[x], pos);
151 if (certainties !=
NULL) certainties[x] = blob_it->
certainty();
165 for (
int i = 0; i < word_choice->
length(); i++) {
167 BLOB_CHOICE_LIST* blob_choice_list = char_choices.
get(start_pos + i);
169 if (pos < 0) pos = 0;
170 pos_str[i] = pos +
'0';
172 pos_str[word_choice->
length()] =
'\0';
182 BLOB_CHOICE_LIST *blob_choices,
185 BLOB_CHOICE_IT c_it(blob_choices);
186 for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
188 unicharset.
get_chartype(c_it.data()->unichar_id()) == target_type)
207 BLOB_CHOICE_LIST *blob_choices,
211 BLOB_CHOICE_IT c_it(blob_choices);
212 for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
214 if (c_it.data()->script_id() == 0)
continue;
215 if (c_it.data()->script_id() == target_sid) found =
true;
216 if (backup_sid > 0 && c_it.data()->script_id() == backup_sid) found =
true;
217 if (found)
return c_it.data();
219 if (secondary_sid > 0) {
220 c_it.set_to_list(blob_choices);
221 for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
222 if (c_it.data()->script_id() == 0)
continue;
223 if (c_it.data()->script_id() == secondary_sid)
233 char_choices_ =
NULL;
234 adjust_factor_ = 1.0f;
235 allow_collision_ =
false;
245 unicharset_ = &unicharset;
246 char_choices_ = &char_choices;
247 word_length_ = char_choices.
length();
248 for (
int i = 0; i < word_length_; ++i)
249 perm_state_[i] = kPosFree;
250 perm_state_[word_length_] =
'\0';
252 for (
int i = 0; i < word_length_; ++i) {
255 perm_state_[i] =
'1';
257 adjust_factor_ = default_bias;
258 allow_collision_ =
false;
268 ASSERT_HOST(start_pos + strlen(pos_str) - 1 < word_length_);
270 tprintf(
"Copy over %s -> %s @ %d ", pos_str, perm_state_, start_pos);
273 if (!allow_collision_) {
274 int len = strlen(pos_str);
275 for (
int i = 0; i < len; ++i)
278 strncpy(&perm_state_[start_pos], pos_str, strlen(pos_str));
279 adjust_factor_ *= weight;
280 if (debug_)
tprintf(
"==> %s %f\n", perm_state_, adjust_factor_);
292 tprintf(
"Set UID %d -> %s @ %d ",
293 blob_choice->
unichar_id(), perm_state_, char_pos);
297 perm_state_[char_pos] = pos +
'0';
298 adjust_factor_ *= weight;
299 if (debug_)
tprintf(
"==> %s %f\n", perm_state_, adjust_factor_);
304 float *adjust_factor) {
307 unicharset_, *char_choices_, 0, perm_state_, certainties);
308 float rating = word_choice->
rating() * adjust_factor_;
310 *adjust_factor = adjust_factor_;
337 float top_choice_rating_limit = best_choice->
rating();
366 LogNewChoice(adjust_factor, certainties,
false, result2, char_choices);
372 raw_choice, &any_alpha);
388 top_choice_rating_limit);
410 if (!word || wordseg_rating_adjust_factor_ <= 0)
return;
412 float old_rating = word->
rating();
413 float new_rating = old_rating * wordseg_rating_adjust_factor_;
416 tprintf(
"Permute segadjust %f * %f --> %f\n",
417 old_rating, wordseg_rating_adjust_factor_, new_rating);
438 const int max_dict_len = max_fixed_length_dawgs_wdlen_;
439 const int min_dict_len = 2;
443 while (anchor_pos < char_choices.
length()) {
446 int step = max_dict_len;
447 while (step >= min_dict_len) {
448 int end_pos = anchor_pos + step - 1;
449 if (end_pos < char_choices.
length()) {
454 if (part_choice->
length() == step) {
456 tprintf(
"match found at pos=%d len=%d\n%s\n", anchor_pos, step,
466 if (part_choice && step > 1) {
468 float adjust_factor = pow(0.95, 1.0 + step*2.0/char_choices.
length());
470 permuter_state->
AddPreference(anchor_pos, posstr, adjust_factor);
471 match_score += step - 1;
473 tprintf(
"Promote word rating %d-len%d\n%s\n", anchor_pos, step,
478 anchor_pos,
"0",
NULL);
480 tprintf(
"Single char %d %s\n", anchor_pos,
483 if (part_choice && part_choice->
length() > 0)
484 (*best_choice) += (*part_choice);
485 if (part_choice)
delete part_choice;
489 if (match_score > 0) {
490 float adjust_factor = pow(0.8,
491 match_score * 2.0 / char_choices.
length());
492 float adjusted_score = best_choice->
rating() * adjust_factor;
494 tprintf(
"Adjusting score %f @ %d -> %f\n",
495 best_choice->
rating(), match_score, adjusted_score);
499 tprintf(
"Found Best CJK word %f: %s\n",
513 char* pos_chartypes) {
515 const int hist_size = 128;
516 int chprop[hist_size];
518 for (x = 0; x < hist_size; x++) chprop[x] = 0;
519 for (x = 0; x < char_choices.
length(); ++x) {
522 if (pos_chartypes) pos_chartypes[x] = ctype;
523 if (ctype == 0 || ctype ==
'p')
continue;
526 if (x == 0 && ctype ==
'A')
530 for (x = 1; x < hist_size; x++)
531 if (chprop[x] >= chprop[max_prop]) max_prop = x;
532 return (chprop[max_prop] > 0) ? max_prop : 0;
551 if (word_type == 0 || word_type ==
'p')
554 tprintf(
"\n\nPermuteCharType[%c]\n", word_type);
559 BLOB_CHOICE_IT blob_choice_it;
561 bool replaced =
false;
562 int prev_unambig_type = 0;
564 for (
int x = 0; x < char_choices.
length(); ++x) {
565 BLOB_CHOICE_LIST* pos_choice = char_choices.
get(x);
567 if (unichar_id == 0) {
571 blob_choice_it.set_to_list(pos_choice);
577 bool is_ambiguous = (ambig_uids !=
NULL);
579 bool is_consistent = is_punct ||
580 unicharset.
get_chartype(unichar_id) == prev_unambig_type ||
584 tprintf(
"char[%d]:%s is_ambig %c is_punct %c is_consistent %c\n",
586 is_ambiguous?
'T':
'F', is_punct?
'T':
'F', is_consistent?
'T':
'F');
593 }
else if (is_ambiguous && !is_consistent) {
597 tprintf(
"Checking %s r%g PrevCharType %c\n",
599 first_choice->
rating(), prev_unambig_type);
609 if (c_it ==
NULL && prev_unambig_type > 0) {
627 tprintf(
"Replacing %s r%g ==> %s r%g\n",
630 tprintf(
"\n\nPermuteCharType[%c]\n", word_type);
638 }
else if (!is_ambiguous && !is_punct) {
640 prev_unambig_type = pos_chartypes[x];
645 certainties[x] = first_choice->
certainty();
652 float rating = current_word->
rating();
655 current_word->
print(
"<== permute_chartype_word **");
680 tprintf(
"\n\nPermuteScript %s\n",
687 BLOB_CHOICE_IT blob_choice_it;
688 bool replaced =
false;
689 bool prev_is_consistent =
false;
691 for (
int x = 0; x < char_choices.
length(); ++x) {
692 blob_choice_it.set_to_list(char_choices.
get(x));
699 if (unichar_id == 0) {
712 }
else if (!sid_consistent && !this_is_punct && prev_is_consistent) {
728 tprintf(
"Replacing %s r%g ==> %s r%g\n",
742 certainties[x] = first_choice->
certainty();
743 prev_is_consistent = sid_consistent;
750 float rating = current_word->
rating();
753 current_word->
print(
"<== permute_script_word **");
769 tprintf(
"\n\n\n##### Permute_Characters #######\n");
775 if (char_choices.
length() == 1 &&
779 if (this_choice && this_choice->
rating() < best_choice->
rating()) {
780 *best_choice = *this_choice;
783 best_choice->
print(
"\n**** Populate BestChoice");
784 cprintf(
"populate best_choice\n\t%s\n",
801 float rating_limit) {
807 BLOB_CHOICE_IT blob_choice_it;
812 return bad_word_choice;
817 for (x = 0; x < char_choices.
length(); ++x) {
818 blob_choice_it.set_to_list(char_choices.
get(x));
819 first_choice = blob_choice_it.data();
822 if (x > first_index) {
824 cprintf (
"Hyphenated word found\n");
826 x - 1, ¤t_word);
827 if (current_word.
rating() > rating_limit)
839 if (first_index > 0 && first_index < x &&
840 current_word.
rating() <= rating_limit) {
842 x - 1, ¤t_word);
846 return (best_choice);
872 for (x = start; x <= end; x++) {
873 if (char_choices.
get(x) !=
NULL) {
874 subchoices += char_choices.
get(x);
878 if (!subchoices.
empty()) {
883 best_choice =
permute_all(subchoices, &initial_choice, &raw_choice);
885 if (best_choice && best_choice->
length() > 0) {
886 *current_word += *best_choice;
898 cprintf (
"Subword permuted = %s, %5.2f, %5.2f\n\n",
913 for (
int x = 0; x < char_choices.
length(); x++) {
914 BLOB_CHOICE_IT blob_choice_it;
915 blob_choice_it.set_to_list(char_choices.
get(x));
920 certainties[x] = top_choice->
certainty();
922 LogNewChoice(1.0, certainties,
true, top_word, char_choices);
940 const char *first_char;
941 const char *second_char;
942 const char *third_char;
944 const char *next_char =
"";
945 const char *next_next_char =
"";
956 float first_rating = 0;
962 BLOB_CHOICE_IT blob_choice_it;
966 register const char* ch;
967 register inT8 lower_done;
968 register inT8 upper_done;
972 if (any_alpha !=
NULL)
979 for (x = 0; x < char_choices.
length(); ++x) {
980 if (x + 1 < char_choices.
length()) {
982 next_char = unichar_id != INVALID_UNICHAR_ID ?
988 if (x + 2 < char_choices.
length()) {
990 next_next_char = unichar_id != INVALID_UNICHAR_ID ?
996 blob_choice_it.set_to_list(char_choices.
get(x));
999 for (blob_choice_it.mark_cycle_pt(); !blob_choice_it.cycled_list();
1000 blob_choice_it.forward()) {
1001 temp_id = blob_choice_it.data()->unichar_id();
1003 first_choice = blob_choice_it.data();
1005 }
else if (char_choices.
length() > 1) {
1011 if (first_choice ==
NULL) {
1012 cprintf(
"Permuter found only fragments for"
1013 " character at position %d; word=%s\n",
1018 unichar_id = first_choice->unichar_id() != INVALID_UNICHAR_ID ?
1019 first_choice->unichar_id() : space;
1021 first_rating = first_choice->rating();
1023 unichar_id, 1, first_choice->rating(), first_choice->certainty());
1025 unichar_id, 1, first_choice->rating(), first_choice->certainty());
1027 unichar_id, 1, first_choice->rating(), first_choice->certainty());
1029 certainties[x] = first_choice->certainty();
1030 lower_certainties[x] = first_choice->certainty();
1031 upper_certainties[x] = first_choice->certainty();
1038 for (; !blob_choice_it.cycled_list(); blob_choice_it.forward()) {
1039 unichar_id = blob_choice_it.data()->unichar_id();
1040 if (
getUnicharset().eq(unichar_id,
"l") && !blob_choice_it.at_last() &&
1041 blob_choice_it.data_relative(1)->rating() == first_rating) {
1042 temp_id = blob_choice_it.data_relative(1)->unichar_id();
1046 blob_choice_it.forward();
1047 if (!blob_choice_it.at_last() &&
1048 blob_choice_it.data_relative(1)->rating() == first_rating) {
1049 temp_id = blob_choice_it.data_relative(1)->unichar_id();
1053 blob_choice_it.forward();
1056 ch =
choose_il1 (first_char, second_char, third_char,
1057 prev_char, next_char, next_next_char);
1058 unichar_id = (ch !=
NULL && *ch !=
'\0') ?
1060 if (strcmp(ch,
"l") != 0 &&
1068 if (unichar_id != INVALID_UNICHAR_ID) {
1075 first_choice->rating() + blob_choice_it.data()->rating());
1076 if (blob_choice_it.data()->certainty() < lower_word.
certainty()) {
1077 lower_word.
set_certainty(blob_choice_it.data()->certainty());
1079 lower_certainties[x] = blob_choice_it.data()->certainty();
1083 if (!upper_done &&
getUnicharset().get_isupper(unichar_id)) {
1086 first_choice->rating() + blob_choice_it.data()->rating());
1087 if (blob_choice_it.data()->certainty() < capital_word.
certainty()) {
1088 capital_word.
set_certainty(blob_choice_it.data()->certainty());
1090 upper_certainties[x] = blob_choice_it.data()->certainty();
1096 temp_id = !fragment ? unichar_id :
1102 if (lower_done && upper_done)
1106 if (char_alpha && any_alpha !=
NULL)
1111 tprintf(
"\n***** Aborting high-cost word: %g > limit %g\n",
1118 if (temp_id != INVALID_UNICHAR_ID) {
1125 LogNewChoice(1.0, certainties,
true, raw_choice, char_choices);
1127 float rating = word.
rating();
1130 float lower_rating = lower_word.
rating();
1134 float upper_rating = capital_word.
rating();
1139 *rating_limit = rating;
1141 best_choice = &lower_word;
1142 *rating_limit = lower_rating;
1145 best_choice = &capital_word;
1146 *rating_limit = upper_rating;
1164 const char *second_char,
1165 const char *third_char,
1166 const char *prev_char,
1167 const char *next_char,
1168 const char *next_next_char) {
1173 int first_char_length = strlen(first_char);
1174 int prev_char_length = strlen(prev_char);
1175 int next_char_length = strlen(next_char);
1176 int next_next_char_length = strlen(next_next_char);
1178 if (*first_char ==
'l' && *second_char !=
'\0') {
1179 if (*second_char ==
'I'
1180 && (((prev_char_length != 0 &&
1181 getUnicharset().get_isupper (prev_char, prev_char_length)) &&
1182 (next_char_length == 0 ||
1183 !
getUnicharset().get_islower (next_char, next_char_length)) &&
1184 (next_char_length == 0 ||
1185 !
getUnicharset().get_isdigit (next_char, next_char_length))) ||
1186 ((next_char_length != 0 &&
1187 getUnicharset().get_isupper (next_char, next_char_length)) &&
1188 (prev_char_length == 0 ||
1189 !
getUnicharset().get_islower (prev_char, prev_char_length)) &&
1190 (prev_char_length == 0 ||
1191 !
getUnicharset().get_isdigit (prev_char, prev_char_length)))))
1192 first_char = second_char;
1193 else if (*second_char ==
'1' || *third_char ==
'1') {
1194 if ((next_char_length != 0 &&
1195 getUnicharset().get_isdigit (next_char, next_char_length)) ||
1196 (prev_char_length != 0 &&
1198 || (*next_char ==
'l' &&
1199 (next_next_char_length != 0 &&
1201 next_next_char_length)))) {
1203 first_char_length = 1;
1205 else if ((prev_char_length == 0 ||
1206 !
getUnicharset().get_islower (prev_char, prev_char_length)) &&
1207 ((next_char_length == 0 ||
1208 !
getUnicharset().get_islower (next_char, next_char_length)) ||
1209 (*next_char ==
's' &&
1210 *next_next_char ==
't'))) {
1211 if (((*prev_char !=
'\'' && *prev_char !=
'`') || *next_char !=
'\0')
1212 && ((*next_char !=
'\'' && *next_char !=
'`')
1213 || *prev_char !=
'\0')) {
1215 first_char_length = 1;
1219 if (*first_char ==
'l' && *next_char !=
'\0' &&
1220 (prev_char_length == 0 ||
1221 !
getUnicharset().get_isalpha (prev_char, prev_char_length))) {
1224 if (*second_char ==
'1')
1226 else if (*second_char ==
'I')
1228 else if (*second_char ==
'l')
1233 if (*third_char ==
'1')
1235 else if (*third_char ==
'I')
1237 else if (*third_char ==
'l')
1243 if (bigram_counts[*next_char][type2] >
1244 bigram_counts[*next_char][type1]) {
1245 first_char = second_char;
1248 if (bigram_counts[*next_char][type3] >
1249 bigram_counts[*next_char][type1]) {
1250 first_char = third_char;
1284 float curr_rating,
float curr_certainty,
1286 const char *debug,
int word_ending,
1294 if (debug && (prev_fragment || this_fragment)) {
1295 cprintf(
"%s check fragments: choice=%s word_ending=%d\n", debug,
1298 if (prev_fragment) {
1301 if (this_fragment) {
1306 char_frag_info->
unichar_id = curr_unichar_id;
1307 char_frag_info->
fragment = this_fragment;
1308 char_frag_info->
rating = curr_rating;
1309 char_frag_info->
certainty = curr_certainty;
1311 if (prev_fragment && !this_fragment) {
1312 if (debug)
tprintf(
"Skip choice with incomplete fragment\n");
1315 if (this_fragment) {
1317 char_frag_info->
unichar_id = INVALID_UNICHAR_ID;
1318 if (prev_fragment) {
1320 if (debug)
tprintf(
"Non-matching fragment piece\n");
1328 tprintf(
"Built character %s from fragments\n",
1333 if (debug)
tprintf(
"Record fragment continuation\n");
1334 char_frag_info->
fragment = this_fragment;
1338 prev_char_frag_info->
rating + curr_rating;
1344 if (debug)
cprintf(
"Record fragment beginning\n");
1347 tprintf(
"Non-starting fragment piece with no prev_fragment\n");
1353 if (word_ending && char_frag_info->
fragment) {
1354 if (debug)
tprintf(
"Word can not end with a fragment\n");
1369 float rating_limit) {
1370 if (char_choices.
length() <= 1 ||
1376 float min_rating = 0.0;
1377 BLOB_CHOICE_IT blob_choice_it;
1378 for (x = 0; x < char_choices.
length(); ++x) {
1379 blob_choice_it.set_to_list(char_choices.
get(x));
1380 if (blob_choice_it.data()) {
1381 min_rating += blob_choice_it.data()->rating();
1383 if (min_rating >= rating_limit) {
1388 tprintf(
"A choice with fragment beats top choice\n");
1389 tprintf(
"Running fragment permuter...\n");
1396 for (x = 0; x < char_choices.
length(); ++x) {
1397 bool need_nonfrag_char =
true;
1398 BLOB_CHOICE_LIST *frag_choices =
new BLOB_CHOICE_LIST();
1399 BLOB_CHOICE_IT frag_choices_it;
1400 frag_choices_it.set_to_list(frag_choices);
1401 blob_choice_it.set_to_list(char_choices.
get(x));
1402 for (blob_choice_it.mark_cycle_pt(); !blob_choice_it.cycled_list();
1403 blob_choice_it.forward()) {
1405 frag_choices_it.add_after_then_move(
1407 }
else if (need_nonfrag_char) {
1408 frag_choices_it.add_after_then_move(
1410 need_nonfrag_char =
false;
1413 frag_char_choices += frag_choices;
1424 frag_char_choices, 0,
NULL, &word, certainties,
1425 &rating_limit, best_choice, &attempts_left,
NULL);
1427 frag_char_choices.delete_data_pointers();
1440 int char_choice_index,
1443 float certainties[],
1449 tprintf(
"%s permute_choices: char_choice_index=%d"
1450 " limit=%g rating=%g, certainty=%g word=%s\n",
1451 debug, char_choice_index, *limit, word->
rating(),
1454 if (char_choice_index < char_choices.
length()) {
1455 BLOB_CHOICE_IT blob_choice_it;
1456 blob_choice_it.set_to_list(char_choices.
get(char_choice_index));
1457 for (blob_choice_it.mark_cycle_pt(); !blob_choice_it.cycled_list();
1458 blob_choice_it.forward()) {
1461 char_choice_index, prev_char_frag_info, word,
1462 certainties, limit, best_choice, attempts_left, more_args);
1463 if (*attempts_left <= 0) {
1464 if (debug)
tprintf(
"permute_choices(): attempts_left is 0\n");
1483 int char_choice_index,
1486 float certainties[],
1492 (char_choice_index == char_choices.
length() - 1) ?
true :
false;
1497 blob_choice.
certainty(), prev_char_frag_info, debug,
1498 word_ending, &char_frag_info)) {
1502 if (char_frag_info.
unichar_id == INVALID_UNICHAR_ID) {
1504 &char_frag_info, word, certainties, limit,
1505 best_choice, attempts_left, more_args);
1510 float old_rating = word->
rating();
1511 float old_certainty = word->
certainty();
1520 &char_frag_info, word_ending, word, certainties,
1521 limit, best_choice, attempts_left, more_args);
1541 bool word_ending,
WERD_CHOICE *word,
float certainties[],
float *limit,
1542 WERD_CHOICE *best_choice,
int *attempts_left,
void *more_args) {
1543 if (word->
rating() < *limit) {
1546 tprintf(
"fragments_debug new choice = %s\n",
1554 prev_char_frag_info, word, certainties, limit,
1555 best_choice, attempts_left, more_args);
1559 tprintf(
"fragments_debug pruned word (%s, rating=%4.2f, limit=%4.2f)\n",