PocketSphinx 5prealpha
ngram_search_fwdflat.c
Go to the documentation of this file.
1/* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
2/* ====================================================================
3 * Copyright (c) 2008 Carnegie Mellon University. All rights
4 * reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 *
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
16 * distribution.
17 *
18 * This work was supported in part by funding from the Defense Advanced
19 * Research Projects Agency and the National Science Foundation of the
20 * United States of America, and the CMU Sphinx Speech Consortium.
21 *
22 * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
23 * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
24 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
25 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
26 * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
27 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
28 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
29 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
30 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
31 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
32 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33 *
34 * ====================================================================
35 *
36 */
37
42/* System headers. */
43#include <string.h>
44#include <assert.h>
45
46/* SphinxBase headers. */
47#include <sphinxbase/ckd_alloc.h>
48#include <sphinxbase/listelem_alloc.h>
49#include <sphinxbase/err.h>
50
51/* Local headers. */
52#include "ngram_search.h"
53#include "ps_lattice_internal.h"
54
55/* Turn this on to dump channels for debugging */
56#define __CHAN_DUMP__ 0
57#if __CHAN_DUMP__
58#define chan_v_eval(chan) hmm_dump_vit_eval(&(chan)->hmm, stderr)
59#else
60#define chan_v_eval(chan) hmm_vit_eval(&(chan)->hmm)
61#endif
62
63static void
64ngram_fwdflat_expand_all(ngram_search_t *ngs)
65{
66 int n_words, i;
67
68 /* For all "real words" (not fillers or <s>/</s>) in the dictionary,
69 *
70 * 1) Add the ones which are in the LM to the fwdflat wordlist
71 * 2) And to the expansion list (since we are expanding all)
72 */
73 ngs->n_expand_words = 0;
74 n_words = ps_search_n_words(ngs);
75 bitvec_clear_all(ngs->expand_word_flag, ps_search_n_words(ngs));
76 for (i = 0; i < n_words; ++i) {
77 if (!ngram_model_set_known_wid(ngs->lmset,
78 dict_basewid(ps_search_dict(ngs),i)))
79 continue;
80 ngs->fwdflat_wordlist[ngs->n_expand_words] = i;
81 ngs->expand_word_list[ngs->n_expand_words] = i;
82 bitvec_set(ngs->expand_word_flag, i);
83 ngs->n_expand_words++;
84 }
85 E_INFO("Utterance vocabulary contains %d words\n", ngs->n_expand_words);
86 ngs->expand_word_list[ngs->n_expand_words] = -1;
87 ngs->fwdflat_wordlist[ngs->n_expand_words] = -1;
88}
89
90static void
91ngram_fwdflat_allocate_1ph(ngram_search_t *ngs)
92{
93 dict_t *dict = ps_search_dict(ngs);
94 int n_words = ps_search_n_words(ngs);
95 int i, w;
96
97 /* Allocate single-phone words, since they won't have
98 * been allocated for us by fwdtree initialization. */
99 ngs->n_1ph_words = 0;
100 for (w = 0; w < n_words; w++) {
101 if (dict_is_single_phone(dict, w))
102 ++ngs->n_1ph_words;
103 }
104 ngs->single_phone_wid = ckd_calloc(ngs->n_1ph_words,
105 sizeof(*ngs->single_phone_wid));
106 ngs->rhmm_1ph = ckd_calloc(ngs->n_1ph_words, sizeof(*ngs->rhmm_1ph));
107 i = 0;
108 for (w = 0; w < n_words; w++) {
109 if (!dict_is_single_phone(dict, w))
110 continue;
111
112 /* DICT2PID location */
113 ngs->rhmm_1ph[i].ciphone = dict_first_phone(dict, w);
114 ngs->rhmm_1ph[i].ci2phone = bin_mdef_silphone(ps_search_acmod(ngs)->mdef);
115 hmm_init(ngs->hmmctx, &ngs->rhmm_1ph[i].hmm, TRUE,
116 /* ssid */ bin_mdef_pid2ssid(ps_search_acmod(ngs)->mdef,
117 ngs->rhmm_1ph[i].ciphone),
118 /* tmatid */ bin_mdef_pid2tmatid(ps_search_acmod(ngs)->mdef,
119 ngs->rhmm_1ph[i].ciphone));
120 ngs->rhmm_1ph[i].next = NULL;
121 ngs->word_chan[w] = (chan_t *) &(ngs->rhmm_1ph[i]);
122 ngs->single_phone_wid[i] = w;
123 i++;
124 }
125}
126
127static void
128ngram_fwdflat_free_1ph(ngram_search_t *ngs)
129{
130 int i, w;
131 int n_words = ps_search_n_words(ngs);
132
133 for (i = w = 0; w < n_words; ++w) {
134 if (!dict_is_single_phone(ps_search_dict(ngs), w))
135 continue;
136 hmm_deinit(&ngs->rhmm_1ph[i].hmm);
137 ++i;
138 }
139 ckd_free(ngs->rhmm_1ph);
140 ngs->rhmm_1ph = NULL;
141 ckd_free(ngs->single_phone_wid);
142}
143
144void
146{
147 int n_words;
148
149 n_words = ps_search_n_words(ngs);
150 ngs->fwdflat_wordlist = ckd_calloc(n_words + 1, sizeof(*ngs->fwdflat_wordlist));
151 ngs->expand_word_flag = bitvec_alloc(n_words);
152 ngs->expand_word_list = ckd_calloc(n_words + 1, sizeof(*ngs->expand_word_list));
153 ngs->frm_wordlist = ckd_calloc(ngs->n_frame_alloc, sizeof(*ngs->frm_wordlist));
154 ngs->min_ef_width = cmd_ln_int32_r(ps_search_config(ngs), "-fwdflatefwid");
155 ngs->max_sf_win = cmd_ln_int32_r(ps_search_config(ngs), "-fwdflatsfwin");
156 E_INFO("fwdflat: min_ef_width = %d, max_sf_win = %d\n",
157 ngs->min_ef_width, ngs->max_sf_win);
158
159 /* No tree-search; pre-build the expansion list, including all LM words. */
160 if (!ngs->fwdtree) {
161 /* Build full expansion list from LM words. */
162 ngram_fwdflat_expand_all(ngs);
163 /* Allocate single phone words. */
164 ngram_fwdflat_allocate_1ph(ngs);
165 }
166}
167
168void
170{
171 double n_speech = (double)ngs->n_tot_frame
172 / cmd_ln_int32_r(ps_search_config(ngs), "-frate");
173
174 E_INFO("TOTAL fwdflat %.2f CPU %.3f xRT\n",
175 ngs->fwdflat_perf.t_tot_cpu,
176 ngs->fwdflat_perf.t_tot_cpu / n_speech);
177 E_INFO("TOTAL fwdflat %.2f wall %.3f xRT\n",
178 ngs->fwdflat_perf.t_tot_elapsed,
179 ngs->fwdflat_perf.t_tot_elapsed / n_speech);
180
181 /* Free single-phone words if we allocated them. */
182 if (!ngs->fwdtree) {
183 ngram_fwdflat_free_1ph(ngs);
184 }
185 ckd_free(ngs->fwdflat_wordlist);
186 bitvec_free(ngs->expand_word_flag);
187 ckd_free(ngs->expand_word_list);
188 ckd_free(ngs->frm_wordlist);
189}
190
191int
193{
194 /* Reallocate things that depend on the number of words. */
195 int n_words;
196
197 ckd_free(ngs->fwdflat_wordlist);
198 ckd_free(ngs->expand_word_list);
199 bitvec_free(ngs->expand_word_flag);
200 n_words = ps_search_n_words(ngs);
201 ngs->fwdflat_wordlist = ckd_calloc(n_words + 1, sizeof(*ngs->fwdflat_wordlist));
202 ngs->expand_word_flag = bitvec_alloc(n_words);
203 ngs->expand_word_list = ckd_calloc(n_words + 1, sizeof(*ngs->expand_word_list));
204
205 /* No tree-search; take care of the expansion list and single phone words. */
206 if (!ngs->fwdtree) {
207 /* Free single-phone words. */
208 ngram_fwdflat_free_1ph(ngs);
209 /* Reallocate word_chan. */
210 ckd_free(ngs->word_chan);
211 ngs->word_chan = ckd_calloc(dict_size(ps_search_dict(ngs)),
212 sizeof(*ngs->word_chan));
213 /* Rebuild full expansion list from LM words. */
214 ngram_fwdflat_expand_all(ngs);
215 /* Allocate single phone words. */
216 ngram_fwdflat_allocate_1ph(ngs);
217 }
218 /* Otherwise there is nothing to do since the wordlist is
219 * generated anew every utterance. */
220 return 0;
221}
222
226static void
227build_fwdflat_wordlist(ngram_search_t *ngs)
228{
229 int32 i, f, sf, ef, wid, nwd;
230 bptbl_t *bp;
231 ps_latnode_t *node, *prevnode, *nextnode;
232
233 /* No tree-search, use statically allocated wordlist. */
234 if (!ngs->fwdtree)
235 return;
236
237 memset(ngs->frm_wordlist, 0, ngs->n_frame_alloc * sizeof(*ngs->frm_wordlist));
238
239 /* Scan the backpointer table for all active words and record
240 * their exit frames. */
241 for (i = 0, bp = ngs->bp_table; i < ngs->bpidx; i++, bp++) {
242 sf = (bp->bp < 0) ? 0 : ngs->bp_table[bp->bp].frame + 1;
243 ef = bp->frame;
244 wid = bp->wid;
245
246 /* Anything that can be transitioned to in the LM can go in
247 * the word list. */
248 if (!ngram_model_set_known_wid(ngs->lmset,
249 dict_basewid(ps_search_dict(ngs), wid)))
250 continue;
251
252 /* Look for it in the wordlist. */
253 for (node = ngs->frm_wordlist[sf]; node && (node->wid != wid);
254 node = node->next);
255
256 /* Update last end frame. */
257 if (node)
258 node->lef = ef;
259 else {
260 /* New node; link to head of list */
261 node = listelem_malloc(ngs->latnode_alloc);
262 node->wid = wid;
263 node->fef = node->lef = ef;
264
265 node->next = ngs->frm_wordlist[sf];
266 ngs->frm_wordlist[sf] = node;
267 }
268 }
269
270 /* Eliminate "unlikely" words, for which there are too few end points */
271 for (f = 0; f < ngs->n_frame; f++) {
272 prevnode = NULL;
273 for (node = ngs->frm_wordlist[f]; node; node = nextnode) {
274 nextnode = node->next;
275 /* Word has too few endpoints */
276 if ((node->lef - node->fef < ngs->min_ef_width) ||
277 /* Word is </s> and doesn't actually end in last frame */
278 ((node->wid == ps_search_finish_wid(ngs)) && (node->lef < ngs->n_frame - 1))) {
279 if (!prevnode)
280 ngs->frm_wordlist[f] = nextnode;
281 else
282 prevnode->next = nextnode;
283 listelem_free(ngs->latnode_alloc, node);
284 }
285 else
286 prevnode = node;
287 }
288 }
289
290 /* Form overall wordlist for 2nd pass */
291 nwd = 0;
292 bitvec_clear_all(ngs->word_active, ps_search_n_words(ngs));
293 for (f = 0; f < ngs->n_frame; f++) {
294 for (node = ngs->frm_wordlist[f]; node; node = node->next) {
295 if (!bitvec_is_set(ngs->word_active, node->wid)) {
296 bitvec_set(ngs->word_active, node->wid);
297 ngs->fwdflat_wordlist[nwd++] = node->wid;
298 }
299 }
300 }
301 ngs->fwdflat_wordlist[nwd] = -1;
302 E_INFO("Utterance vocabulary contains %d words\n", nwd);
303}
304
308static void
309build_fwdflat_chan(ngram_search_t *ngs)
310{
311 int32 i, wid, p;
312 root_chan_t *rhmm;
313 chan_t *hmm, *prevhmm;
314 dict_t *dict;
315 dict2pid_t *d2p;
316
317 dict = ps_search_dict(ngs);
318 d2p = ps_search_dict2pid(ngs);
319
320 /* Build word HMMs for each word in the lattice. */
321 for (i = 0; ngs->fwdflat_wordlist[i] >= 0; i++) {
322 wid = ngs->fwdflat_wordlist[i];
323
324 /* Single-phone words are permanently allocated */
325 if (dict_is_single_phone(dict, wid))
326 continue;
327
328 assert(ngs->word_chan[wid] == NULL);
329
330 /* Multiplex root HMM for first phone (one root per word, flat
331 * lexicon). diphone is irrelevant here, for the time being,
332 * at least. */
333 rhmm = listelem_malloc(ngs->root_chan_alloc);
334 rhmm->ci2phone = dict_second_phone(dict, wid);
335 rhmm->ciphone = dict_first_phone(dict, wid);
336 rhmm->next = NULL;
337 hmm_init(ngs->hmmctx, &rhmm->hmm, TRUE,
338 bin_mdef_pid2ssid(ps_search_acmod(ngs)->mdef, rhmm->ciphone),
339 bin_mdef_pid2tmatid(ps_search_acmod(ngs)->mdef, rhmm->ciphone));
340
341 /* HMMs for word-internal phones */
342 prevhmm = NULL;
343 for (p = 1; p < dict_pronlen(dict, wid) - 1; p++) {
344 hmm = listelem_malloc(ngs->chan_alloc);
345 hmm->ciphone = dict_pron(dict, wid, p);
346 hmm->info.rc_id = (p == dict_pronlen(dict, wid) - 1) ? 0 : -1;
347 hmm->next = NULL;
348 hmm_init(ngs->hmmctx, &hmm->hmm, FALSE,
349 dict2pid_internal(d2p,wid,p),
350 bin_mdef_pid2tmatid(ps_search_acmod(ngs)->mdef, hmm->ciphone));
351
352 if (prevhmm)
353 prevhmm->next = hmm;
354 else
355 rhmm->next = hmm;
356
357 prevhmm = hmm;
358 }
359
360 /* Right-context phones */
362
363 /* Link in just allocated right-context phones */
364 if (prevhmm)
365 prevhmm->next = ngs->word_chan[wid];
366 else
367 rhmm->next = ngs->word_chan[wid];
368 ngs->word_chan[wid] = (chan_t *) rhmm;
369 }
370
371}
372
373void
375{
376 root_chan_t *rhmm;
377 int i;
378
379 ptmr_reset(&ngs->fwdflat_perf);
380 ptmr_start(&ngs->fwdflat_perf);
381 build_fwdflat_wordlist(ngs);
382 build_fwdflat_chan(ngs);
383
384 ngs->bpidx = 0;
385 ngs->bss_head = 0;
386
387 for (i = 0; i < ps_search_n_words(ngs); i++)
388 ngs->word_lat_idx[i] = NO_BP;
389
390 /* Reset the permanently allocated single-phone words, since they
391 * may have junk left over in them from previous searches. */
392 for (i = 0; i < ngs->n_1ph_words; i++) {
393 int32 w = ngs->single_phone_wid[i];
394 rhmm = (root_chan_t *) ngs->word_chan[w];
395 hmm_clear(&rhmm->hmm);
396 }
397
398 /* Start search with <s>; word_chan[<s>] is permanently allocated */
399 rhmm = (root_chan_t *) ngs->word_chan[ps_search_start_wid(ngs)];
400 hmm_enter(&rhmm->hmm, 0, NO_BP, 0);
401 ngs->active_word_list[0][0] = ps_search_start_wid(ngs);
402 ngs->n_active_word[0] = 1;
403
404 ngs->best_score = 0;
405 ngs->renormalized = FALSE;
406
407 for (i = 0; i < ps_search_n_words(ngs); i++)
408 ngs->last_ltrans[i].sf = -1;
409
410 if (!ngs->fwdtree)
411 ngs->n_frame = 0;
412
413 ngs->st.n_fwdflat_chan = 0;
414 ngs->st.n_fwdflat_words = 0;
415 ngs->st.n_fwdflat_word_transition = 0;
416 ngs->st.n_senone_active_utt = 0;
417}
418
419static void
420compute_fwdflat_sen_active(ngram_search_t *ngs, int frame_idx)
421{
422 int32 i, nw, w;
423 int32 *awl;
424 root_chan_t *rhmm;
425 chan_t *hmm;
426
427 acmod_clear_active(ps_search_acmod(ngs));
428
429 nw = ngs->n_active_word[frame_idx & 0x1];
430 awl = ngs->active_word_list[frame_idx & 0x1];
431
432 for (i = 0; i < nw; i++) {
433 w = *(awl++);
434 rhmm = (root_chan_t *)ngs->word_chan[w];
435 if (hmm_frame(&rhmm->hmm) == frame_idx) {
436 acmod_activate_hmm(ps_search_acmod(ngs), &rhmm->hmm);
437 }
438
439 for (hmm = rhmm->next; hmm; hmm = hmm->next) {
440 if (hmm_frame(&hmm->hmm) == frame_idx) {
441 acmod_activate_hmm(ps_search_acmod(ngs), &hmm->hmm);
442 }
443 }
444 }
445}
446
447static void
448fwdflat_eval_chan(ngram_search_t *ngs, int frame_idx)
449{
450 int32 i, w, nw, bestscore;
451 int32 *awl;
452 root_chan_t *rhmm;
453 chan_t *hmm;
454
455 nw = ngs->n_active_word[frame_idx & 0x1];
456 awl = ngs->active_word_list[frame_idx & 0x1];
457 bestscore = WORST_SCORE;
458
459 ngs->st.n_fwdflat_words += nw;
460
461 /* Scan all active words. */
462 for (i = 0; i < nw; i++) {
463 w = *(awl++);
464 rhmm = (root_chan_t *) ngs->word_chan[w];
465 if (hmm_frame(&rhmm->hmm) == frame_idx) {
466 int32 score = chan_v_eval(rhmm);
467 if ((score BETTER_THAN bestscore) && (w != ps_search_finish_wid(ngs)))
468 bestscore = score;
469 ngs->st.n_fwdflat_chan++;
470 }
471
472 for (hmm = rhmm->next; hmm; hmm = hmm->next) {
473 if (hmm_frame(&hmm->hmm) == frame_idx) {
474 int32 score = chan_v_eval(hmm);
475 if (score BETTER_THAN bestscore)
476 bestscore = score;
477 ngs->st.n_fwdflat_chan++;
478 }
479 }
480 }
481
482 ngs->best_score = bestscore;
483}
484
485static void
486fwdflat_prune_chan(ngram_search_t *ngs, int frame_idx)
487{
488 int32 i, nw, cf, nf, w, pip, newscore, thresh, wordthresh;
489 int32 *awl;
490 root_chan_t *rhmm;
491 chan_t *hmm, *nexthmm;
492
493 cf = frame_idx;
494 nf = cf + 1;
495 nw = ngs->n_active_word[cf & 0x1];
496 awl = ngs->active_word_list[cf & 0x1];
497 bitvec_clear_all(ngs->word_active, ps_search_n_words(ngs));
498
499 thresh = ngs->best_score + ngs->fwdflatbeam;
500 wordthresh = ngs->best_score + ngs->fwdflatwbeam;
501 pip = ngs->pip;
502 E_DEBUG(3,("frame %d thresh %d wordthresh %d\n", frame_idx, thresh, wordthresh));
503
504 /* Scan all active words. */
505 for (i = 0; i < nw; i++) {
506 w = *(awl++);
507 rhmm = (root_chan_t *) ngs->word_chan[w];
508 /* Propagate active root channels */
509 if (hmm_frame(&rhmm->hmm) == cf
510 && hmm_bestscore(&rhmm->hmm) BETTER_THAN thresh) {
511 hmm_frame(&rhmm->hmm) = nf;
512 bitvec_set(ngs->word_active, w);
513
514 /* Transitions out of root channel */
515 newscore = hmm_out_score(&rhmm->hmm);
516 if (rhmm->next) {
517 assert(!dict_is_single_phone(ps_search_dict(ngs), w));
518
519 newscore += pip;
520 if (newscore BETTER_THAN thresh) {
521 hmm = rhmm->next;
522 /* Enter all right context phones */
523 if (hmm->info.rc_id >= 0) {
524 for (; hmm; hmm = hmm->next) {
525 if ((hmm_frame(&hmm->hmm) < cf)
526 || (newscore BETTER_THAN hmm_in_score(&hmm->hmm))) {
527 hmm_enter(&hmm->hmm, newscore,
528 hmm_out_history(&rhmm->hmm), nf);
529 }
530 }
531 }
532 /* Just a normal word internal phone */
533 else {
534 if ((hmm_frame(&hmm->hmm) < cf)
535 || (newscore BETTER_THAN hmm_in_score(&hmm->hmm))) {
536 hmm_enter(&hmm->hmm, newscore,
537 hmm_out_history(&rhmm->hmm), nf);
538 }
539 }
540 }
541 }
542 else {
543 assert(dict_is_single_phone(ps_search_dict(ngs), w));
544
545 /* Word exit for single-phone words (where did their
546 * whmms come from?) (either from
547 * ngram_search_fwdtree, or from
548 * ngram_fwdflat_allocate_1ph(), that's where) */
549 if (newscore BETTER_THAN wordthresh) {
550 ngram_search_save_bp(ngs, cf, w, newscore,
551 hmm_out_history(&rhmm->hmm), 0);
552 }
553 }
554 }
555
556 /* Transitions out of non-root channels. */
557 for (hmm = rhmm->next; hmm; hmm = hmm->next) {
558 if (hmm_frame(&hmm->hmm) >= cf) {
559 /* Propagate forward HMMs inside the beam. */
560 if (hmm_bestscore(&hmm->hmm) BETTER_THAN thresh) {
561 hmm_frame(&hmm->hmm) = nf;
562 bitvec_set(ngs->word_active, w);
563
564 newscore = hmm_out_score(&hmm->hmm);
565 /* Word-internal phones */
566 if (hmm->info.rc_id < 0) {
567 newscore += pip;
568 if (newscore BETTER_THAN thresh) {
569 nexthmm = hmm->next;
570 /* Enter all right-context phones. */
571 if (nexthmm->info.rc_id >= 0) {
572 for (; nexthmm; nexthmm = nexthmm->next) {
573 if ((hmm_frame(&nexthmm->hmm) < cf)
574 || (newscore BETTER_THAN
575 hmm_in_score(&nexthmm->hmm))) {
576 hmm_enter(&nexthmm->hmm,
577 newscore,
578 hmm_out_history(&hmm->hmm),
579 nf);
580 }
581 }
582 }
583 /* Enter single word-internal phone. */
584 else {
585 if ((hmm_frame(&nexthmm->hmm) < cf)
586 || (newscore BETTER_THAN
587 hmm_in_score(&nexthmm->hmm))) {
588 hmm_enter(&nexthmm->hmm, newscore,
589 hmm_out_history(&hmm->hmm), nf);
590 }
591 }
592 }
593 }
594 /* Right-context phones - apply word beam and exit. */
595 else {
596 if (newscore BETTER_THAN wordthresh) {
597 ngram_search_save_bp(ngs, cf, w, newscore,
598 hmm_out_history(&hmm->hmm),
599 hmm->info.rc_id);
600 }
601 }
602 }
603 /* Zero out inactive HMMs. */
604 else if (hmm_frame(&hmm->hmm) != nf) {
605 hmm_clear_scores(&hmm->hmm);
606 }
607 }
608 }
609 }
610}
611
612static void
613get_expand_wordlist(ngram_search_t *ngs, int32 frm, int32 win)
614{
615 int32 f, sf, ef;
616 ps_latnode_t *node;
617
618 if (!ngs->fwdtree) {
619 ngs->st.n_fwdflat_word_transition += ngs->n_expand_words;
620 return;
621 }
622
623 sf = frm - win;
624 if (sf < 0)
625 sf = 0;
626 ef = frm + win;
627 if (ef > ngs->n_frame)
628 ef = ngs->n_frame;
629
630 bitvec_clear_all(ngs->expand_word_flag, ps_search_n_words(ngs));
631 ngs->n_expand_words = 0;
632
633 for (f = sf; f < ef; f++) {
634 for (node = ngs->frm_wordlist[f]; node; node = node->next) {
635 if (!bitvec_is_set(ngs->expand_word_flag, node->wid)) {
636 ngs->expand_word_list[ngs->n_expand_words++] = node->wid;
637 bitvec_set(ngs->expand_word_flag, node->wid);
638 }
639 }
640 }
641 ngs->expand_word_list[ngs->n_expand_words] = -1;
642 ngs->st.n_fwdflat_word_transition += ngs->n_expand_words;
643}
644
645static void
646fwdflat_word_transition(ngram_search_t *ngs, int frame_idx)
647{
648 int32 cf, nf, b, thresh, pip, i, nw, w, newscore;
649 int32 best_silrc_score = 0, best_silrc_bp = 0; /* FIXME: good defaults? */
650 bptbl_t *bp;
651 int32 *rcss;
652 root_chan_t *rhmm;
653 int32 *awl;
654 float32 lwf;
655 dict_t *dict = ps_search_dict(ngs);
656 dict2pid_t *d2p = ps_search_dict2pid(ngs);
657
658 cf = frame_idx;
659 nf = cf + 1;
660 thresh = ngs->best_score + ngs->fwdflatbeam;
661 pip = ngs->pip;
662 best_silrc_score = WORST_SCORE;
663 lwf = ngs->fwdflat_fwdtree_lw_ratio;
664
665 /* Search for all words starting within a window of this frame.
666 * These are the successors for words exiting now. */
667 get_expand_wordlist(ngs, cf, ngs->max_sf_win);
668
669 /* Scan words exited in current frame */
670 for (b = ngs->bp_table_idx[cf]; b < ngs->bpidx; b++) {
671 xwdssid_t *rssid;
672 int32 silscore;
673
674 bp = ngs->bp_table + b;
675 ngs->word_lat_idx[bp->wid] = NO_BP;
676
677 if (bp->wid == ps_search_finish_wid(ngs))
678 continue;
679
680 /* DICT2PID location */
681 /* Get the mapping from right context phone ID to index in the
682 * right context table and the bscore_stack. */
683 rcss = ngs->bscore_stack + bp->s_idx;
684 if (bp->last2_phone == -1)
685 rssid = NULL;
686 else
687 rssid = dict2pid_rssid(d2p, bp->last_phone, bp->last2_phone);
688
689 /* Transition to all successor words. */
690 for (i = 0; ngs->expand_word_list[i] >= 0; i++) {
691 int32 n_used;
692
693 w = ngs->expand_word_list[i];
694
695 /* Get the exit score we recorded in save_bwd_ptr(), or
696 * something approximating it. */
697 if (rssid)
698 newscore = rcss[rssid->cimap[dict_first_phone(dict, w)]];
699 else
700 newscore = bp->score;
701 if (newscore == WORST_SCORE)
702 continue;
703 /* FIXME: Floating point... */
704 newscore += lwf
705 * (ngram_tg_score(ngs->lmset,
706 dict_basewid(dict, w),
707 bp->real_wid,
708 bp->prev_real_wid,
709 &n_used) >> SENSCR_SHIFT);
710 newscore += pip;
711
712 /* Enter the next word */
713 if (newscore BETTER_THAN thresh) {
714 rhmm = (root_chan_t *) ngs->word_chan[w];
715 if ((hmm_frame(&rhmm->hmm) < cf)
716 || (newscore BETTER_THAN hmm_in_score(&rhmm->hmm))) {
717 hmm_enter(&rhmm->hmm, newscore, b, nf);
718 /* DICT2PID: This is where mpx ssids get introduced. */
719 /* Look up the ssid to use when entering this mpx triphone. */
720 hmm_mpx_ssid(&rhmm->hmm, 0) =
721 dict2pid_ldiph_lc(d2p, rhmm->ciphone, rhmm->ci2phone,
722 dict_last_phone(dict, bp->wid));
723 assert(IS_S3SSID(hmm_mpx_ssid(&rhmm->hmm, 0)));
724 E_DEBUG(6,("ssid %d(%d,%d) = %d\n",
725 rhmm->ciphone, dict_last_phone(dict, bp->wid), rhmm->ci2phone,
726 hmm_mpx_ssid(&rhmm->hmm, 0)));
727 bitvec_set(ngs->word_active, w);
728 }
729 }
730 }
731
732 /* Get the best exit into silence. */
733 if (rssid)
734 silscore = rcss[rssid->cimap[ps_search_acmod(ngs)->mdef->sil]];
735 else
736 silscore = bp->score;
737 if (silscore BETTER_THAN best_silrc_score) {
738 best_silrc_score = silscore;
739 best_silrc_bp = b;
740 }
741 }
742
743 /* Transition to <sil> */
744 newscore = best_silrc_score + ngs->silpen + pip;
745 if ((newscore BETTER_THAN thresh) && (newscore BETTER_THAN WORST_SCORE)) {
746 w = ps_search_silence_wid(ngs);
747 rhmm = (root_chan_t *) ngs->word_chan[w];
748 if ((hmm_frame(&rhmm->hmm) < cf)
749 || (newscore BETTER_THAN hmm_in_score(&rhmm->hmm))) {
750 hmm_enter(&rhmm->hmm, newscore,
751 best_silrc_bp, nf);
752 bitvec_set(ngs->word_active, w);
753 }
754 }
755 /* Transition to noise words */
756 newscore = best_silrc_score + ngs->fillpen + pip;
757 if ((newscore BETTER_THAN thresh) && (newscore BETTER_THAN WORST_SCORE)) {
758 for (w = dict_filler_start(dict); w <= dict_filler_end(dict); w++) {
759 if (w == ps_search_silence_wid(ngs))
760 continue;
761
762 rhmm = (root_chan_t *) ngs->word_chan[w];
763 /* Noise words that aren't a single phone will have NULL here. */
764 if (rhmm == NULL)
765 continue;
766 if ((hmm_frame(&rhmm->hmm) < cf)
767 || (newscore BETTER_THAN hmm_in_score(&rhmm->hmm))) {
768 hmm_enter(&rhmm->hmm, newscore,
769 best_silrc_bp, nf);
770 bitvec_set(ngs->word_active, w);
771 }
772 }
773 }
774
775 /* Reset initial channels of words that have become inactive even after word trans. */
776 nw = ngs->n_active_word[cf & 0x1];
777 awl = ngs->active_word_list[cf & 0x1];
778 for (i = 0; i < nw; i++) {
779 w = *(awl++);
780 rhmm = (root_chan_t *) ngs->word_chan[w];
781 if (hmm_frame(&rhmm->hmm) == cf) {
782 hmm_clear_scores(&rhmm->hmm);
783 }
784 }
785}
786
787static void
788fwdflat_renormalize_scores(ngram_search_t *ngs, int frame_idx, int32 norm)
789{
790 root_chan_t *rhmm;
791 chan_t *hmm;
792 int32 i, nw, cf, w, *awl;
793
794 cf = frame_idx;
795
796 /* Renormalize individual word channels */
797 nw = ngs->n_active_word[cf & 0x1];
798 awl = ngs->active_word_list[cf & 0x1];
799 for (i = 0; i < nw; i++) {
800 w = *(awl++);
801 rhmm = (root_chan_t *) ngs->word_chan[w];
802 if (hmm_frame(&rhmm->hmm) == cf) {
803 hmm_normalize(&rhmm->hmm, norm);
804 }
805 for (hmm = rhmm->next; hmm; hmm = hmm->next) {
806 if (hmm_frame(&hmm->hmm) == cf) {
807 hmm_normalize(&hmm->hmm, norm);
808 }
809 }
810 }
811
812 ngs->renormalized = TRUE;
813}
814
815int
817{
818 int16 const *senscr;
819 int32 nf, i, j;
820 int32 *nawl;
821
822 /* Activate our HMMs for the current frame if need be. */
823 if (!ps_search_acmod(ngs)->compallsen)
824 compute_fwdflat_sen_active(ngs, frame_idx);
825
826 /* Compute GMM scores for the current frame. */
827 senscr = acmod_score(ps_search_acmod(ngs), &frame_idx);
828 ngs->st.n_senone_active_utt += ps_search_acmod(ngs)->n_senone_active;
829
830 /* Mark backpointer table for current frame. */
831 ngram_search_mark_bptable(ngs, frame_idx);
832
833 /* If the best score is equal to or worse than WORST_SCORE,
834 * recognition has failed, don't bother to keep trying. */
836 return 0;
837 /* Renormalize if necessary */
838 if (ngs->best_score + (2 * ngs->beam) WORSE_THAN WORST_SCORE) {
839 E_INFO("Renormalizing Scores at frame %d, best score %d\n",
840 frame_idx, ngs->best_score);
841 fwdflat_renormalize_scores(ngs, frame_idx, ngs->best_score);
842 }
843
844 ngs->best_score = WORST_SCORE;
845 hmm_context_set_senscore(ngs->hmmctx, senscr);
846
847 /* Evaluate HMMs */
848 fwdflat_eval_chan(ngs, frame_idx);
849 /* Prune HMMs and do phone transitions. */
850 fwdflat_prune_chan(ngs, frame_idx);
851 /* Do word transitions. */
852 fwdflat_word_transition(ngs, frame_idx);
853
854 /* Create next active word list, skip fillers */
855 nf = frame_idx + 1;
856 nawl = ngs->active_word_list[nf & 0x1];
857 for (i = 0, j = 0; ngs->fwdflat_wordlist[i] >= 0; i++) {
858 int32 wid = ngs->fwdflat_wordlist[i];
859 if (bitvec_is_set(ngs->word_active, wid) && wid < ps_search_start_wid(ngs)) {
860 *(nawl++) = wid;
861 j++;
862 }
863 }
864 /* Add fillers */
865 for (i = ps_search_start_wid(ngs); i < ps_search_n_words(ngs); i++) {
866 if (bitvec_is_set(ngs->word_active, i)) {
867 *(nawl++) = i;
868 j++;
869 }
870 }
871 if (!ngs->fwdtree)
872 ++ngs->n_frame;
873 ngs->n_active_word[nf & 0x1] = j;
874
875 /* Return the number of frames processed. */
876 return 1;
877}
878
882static void
883destroy_fwdflat_wordlist(ngram_search_t *ngs)
884{
885 ps_latnode_t *node, *tnode;
886 int32 f;
887
888 if (!ngs->fwdtree)
889 return;
890
891 for (f = 0; f < ngs->n_frame; f++) {
892 for (node = ngs->frm_wordlist[f]; node; node = tnode) {
893 tnode = node->next;
894 listelem_free(ngs->latnode_alloc, node);
895 }
896 }
897}
898
902static void
903destroy_fwdflat_chan(ngram_search_t *ngs)
904{
905 int32 i, wid;
906
907 for (i = 0; ngs->fwdflat_wordlist[i] >= 0; i++) {
908 root_chan_t *rhmm;
909 chan_t *thmm;
910 wid = ngs->fwdflat_wordlist[i];
911 if (dict_is_single_phone(ps_search_dict(ngs),wid))
912 continue;
913 assert(ngs->word_chan[wid] != NULL);
914
915 /* The first HMM in ngs->word_chan[wid] was allocated with
916 * ngs->root_chan_alloc, but this will attempt to free it
917 * using ngs->chan_alloc, which will not work. Therefore we
918 * free it manually and move the list forward before handing
919 * it off. */
920 rhmm = (root_chan_t *)ngs->word_chan[wid];
921 thmm = rhmm->next;
922 listelem_free(ngs->root_chan_alloc, rhmm);
923 ngs->word_chan[wid] = thmm;
924 ngram_search_free_all_rc(ngs, wid);
925 }
926}
927
928void
930{
931 int32 cf;
932
933 destroy_fwdflat_chan(ngs);
934 destroy_fwdflat_wordlist(ngs);
935 bitvec_clear_all(ngs->word_active, ps_search_n_words(ngs));
936
937 /* This is the number of frames processed. */
938 cf = ps_search_acmod(ngs)->output_frame;
939 /* Add a mark in the backpointer table for one past the final frame. */
941
942 ptmr_stop(&ngs->fwdflat_perf);
943 /* Print out some statistics. */
944 if (cf > 0) {
945 double n_speech = (double)(cf + 1)
946 / cmd_ln_int32_r(ps_search_config(ngs), "-frate");
947 E_INFO("%8d words recognized (%d/fr)\n",
948 ngs->bpidx, (ngs->bpidx + (cf >> 1)) / (cf + 1));
949 E_INFO("%8d senones evaluated (%d/fr)\n", ngs->st.n_senone_active_utt,
950 (ngs->st.n_senone_active_utt + (cf >> 1)) / (cf + 1));
951 E_INFO("%8d channels searched (%d/fr)\n",
952 ngs->st.n_fwdflat_chan, ngs->st.n_fwdflat_chan / (cf + 1));
953 E_INFO("%8d words searched (%d/fr)\n",
954 ngs->st.n_fwdflat_words, ngs->st.n_fwdflat_words / (cf + 1));
955 E_INFO("%8d word transitions (%d/fr)\n",
956 ngs->st.n_fwdflat_word_transition,
957 ngs->st.n_fwdflat_word_transition / (cf + 1));
958 E_INFO("fwdflat %.2f CPU %.3f xRT\n",
959 ngs->fwdflat_perf.t_cpu,
960 ngs->fwdflat_perf.t_cpu / n_speech);
961 E_INFO("fwdflat %.2f wall %.3f xRT\n",
962 ngs->fwdflat_perf.t_elapsed,
963 ngs->fwdflat_perf.t_elapsed / n_speech);
964 }
965}
void acmod_activate_hmm(acmod_t *acmod, hmm_t *hmm)
Activate senones associated with an HMM.
Definition: acmod.c:1213
int16 const * acmod_score(acmod_t *acmod, int *inout_frame_idx)
Score one frame of data.
Definition: acmod.c:1106
void acmod_clear_active(acmod_t *acmod)
Clear set of active senones.
Definition: acmod.c:1197
s3ssid_t dict2pid_internal(dict2pid_t *d2p, int32 wid, int pos)
Return the senone sequence ID for the given word position.
Definition: dict2pid.c:367
#define dict2pid_rssid(d, ci, lc)
Access macros; not designed for arbitrary use.
Definition: dict2pid.h:115
#define dict_size(d)
Packaged macro access to dictionary members.
Definition: dict.h:151
#define dict_pron(d, w, p)
The CI phones of the word w at position p.
Definition: dict.h:165
void hmm_normalize(hmm_t *h, int32 bestscr)
Renormalize the scores in this HMM based on the given best score.
Definition: hmm.c:209
#define BETTER_THAN
Is one score better than another?
Definition: hmm.h:95
#define hmm_context_set_senscore(ctx, senscr)
Change the senone score array for a context.
Definition: hmm.h:227
void hmm_enter(hmm_t *h, int32 score, int32 histid, int frame)
Enter an HMM with the given path score and history ID.
Definition: hmm.c:201
void hmm_deinit(hmm_t *hmm)
Free an HMM structure, releasing internal data (but not the HMM structure itself).
Definition: hmm.c:111
#define WORST_SCORE
Large "bad" score.
Definition: hmm.h:84
void hmm_clear_scores(hmm_t *h)
Reset the scores of the HMM.
Definition: hmm.c:170
void hmm_init(hmm_context_t *ctx, hmm_t *hmm, int mpx, int ssid, int tmatid)
Populate a previously-allocated HMM structure, allocating internal data.
Definition: hmm.c:89
#define WORSE_THAN
Is one score worse than another?
Definition: hmm.h:100
void hmm_clear(hmm_t *h)
Reset the states of the HMM to the invalid condition.
Definition: hmm.c:183
#define SENSCR_SHIFT
Shift count for senone scores.
Definition: hmm.h:73
void ngram_search_free_all_rc(ngram_search_t *ngs, int32 w)
Allocate last phone channels for all possible right contexts for word w.
Definition: ngram_search.c:647
void ngram_search_alloc_all_rc(ngram_search_t *ngs, int32 w)
Allocate last phone channels for all possible right contexts for word w.
Definition: ngram_search.c:598
int ngram_search_mark_bptable(ngram_search_t *ngs, int frame_idx)
Record the current frame's index in the backpointer table.
Definition: ngram_search.c:329
void ngram_search_save_bp(ngram_search_t *ngs, int frame_idx, int32 w, int32 score, int32 path, int32 rc)
Enter a word in the backpointer table.
Definition: ngram_search.c:383
N-Gram based multi-pass search ("FBS")
void ngram_fwdflat_start(ngram_search_t *ngs)
Start fwdflat decoding for an utterance.
void ngram_fwdflat_deinit(ngram_search_t *ngs)
Release memory associated with fwdflat decoding.
int ngram_fwdflat_reinit(ngram_search_t *ngs)
Rebuild search structures for updated language models.
void ngram_fwdflat_finish(ngram_search_t *ngs)
Finish fwdflat decoding for an utterance.
void ngram_fwdflat_init(ngram_search_t *ngs)
Initialize N-Gram search for fwdflat decoding.
int ngram_fwdflat_search(ngram_search_t *ngs, int frame_idx)
Search one frame forward in an utterance.
Word graph search implementation.
Back pointer table (forward pass lattice; actually a tree)
Definition: ngram_search.h:109
int32 wid
Word index.
Definition: ngram_search.h:113
int16 last2_phone
next-to-last phone of this word
Definition: ngram_search.h:120
int32 bp
Back Pointer.
Definition: ngram_search.h:114
int32 prev_real_wid
wid of second-last real word
Definition: ngram_search.h:118
int32 real_wid
wid of this or latest predecessor real word
Definition: ngram_search.h:117
int32 score
Score (best among all right contexts)
Definition: ngram_search.h:115
int16 last_phone
last phone of this word
Definition: ngram_search.h:119
int32 s_idx
Start of BScoreStack for various right contexts.
Definition: ngram_search.h:116
frame_idx_t frame
start or end frame
Definition: ngram_search.h:110
Lexical tree node data type.
Definition: ngram_search.h:64
struct chan_s * next
first descendant of this channel; or, in the case of the last phone of a word, the next alternative r...
Definition: ngram_search.h:68
int32 ciphone
ciphone for this node
Definition: ngram_search.h:73
hmm_t hmm
Basic HMM structure.
Definition: ngram_search.h:65
int32 rc_id
right-context id for last phone of words
Definition: ngram_search.h:79
Building composite triphone (as well as word internal triphones) with the dictionary.
Definition: dict2pid.h:84
a structure for a dictionary.
Definition: dict.h:76
N-Gram search module structure.
Definition: ngram_search.h:197
int32 * single_phone_wid
list of single-phone word ids
Definition: ngram_search.h:264
int32 best_score
Best Viterbi path score.
Definition: ngram_search.h:325
root_chan_t * rhmm_1ph
Root HMMs for single-phone words.
Definition: ngram_search.h:236
listelem_alloc_t * latnode_alloc
For latnode_t.
Definition: ngram_search.h:213
int32 n_frame_alloc
Number of frames allocated in bp_table_idx and friends.
Definition: ngram_search.h:307
int32 ** active_word_list
Array of active multi-phone words for current and next frame.
Definition: ngram_search.h:287
int32 n_frame
Number of frames actually present.
Definition: ngram_search.h:308
ngram_search_stats_t st
Various statistics for profiling.
Definition: ngram_search.h:335
listelem_alloc_t * root_chan_alloc
For root_chan_t.
Definition: ngram_search.h:212
int32 n_active_word[2]
Number entries in active_word_list.
Definition: ngram_search.h:288
ngram_model_t * lmset
Set of language models.
Definition: ngram_search.h:199
int32 * fwdflat_wordlist
List of active word IDs for utterance.
Definition: ngram_search.h:317
chan_t ** word_chan
Channels associated with a given word (only used for right contexts, single-phone words in fwdtree se...
Definition: ngram_search.h:246
int32 n_1ph_words
Number single phone words in dict (total)
Definition: ngram_search.h:265
ps_latnode_t ** frm_wordlist
List of active words in each frame.
Definition: ngram_search.h:316
listelem_alloc_t * chan_alloc
For chan_t.
Definition: ngram_search.h:211
hmm_context_t * hmmctx
HMM context.
Definition: ngram_search.h:200
bitvec_t * word_active
array of active flags for all words.
Definition: ngram_search.h:247
int32 fef
First end frame.
int32 lef
Last end frame.
struct ps_latnode_s * next
Next node in DAG (no ordering implied)
int32 wid
Dictionary word id.
Lexical tree node data type for the first phone (root) of each dynamic HMM tree structure.
Definition: ngram_search.h:90
int16 ci2phone
second ciphone of this node; one root HMM for each unique right context
Definition: ngram_search.h:102
hmm_t hmm
Basic HMM structure.
Definition: ngram_search.h:91
int16 ciphone
first ciphone of this node; all words rooted at this node begin with this ciphone
Definition: ngram_search.h:100
chan_t * next
first descendant of this channel
Definition: ngram_search.h:94
cross word triphone model structure
Definition: dict2pid.h:73
s3cipid_t * cimap
Index into ssid[] above for each ci phone.
Definition: dict2pid.h:75