From e59cac40480bcb41cbe955e3d417502ce55a8b77 Mon Sep 17 00:00:00 2001 From: nshmyrev Date: Sun, 10 Nov 2013 11:37:45 +0000 Subject: [PATCH] Greatly optimize the number of states in FSG created from JSGF by avoiding new unnecessary nodes. See for details https://sourceforge.net/p/cmusphinx/bugs/358/ Patch by Steven J. Boswell II git-svn-id: svn+ssh://svn.code.sf.net/p/cmusphinx/code/trunk/sphinxbase@12042 94700074-3cef-4d97-a70e-9c8c206c02f5 --- src/libsphinxbase/lm/jsgf.c | 176 +++++++++++++----- src/libsphinxbase/lm/jsgf_internal.h | 10 +- test/regression/test.command.fsg | 67 +++---- test/regression/test.kleene.fsg | 30 ++- test/regression/test.nestedRightRecursion.fsg | 12 +- test/regression/test.nulltest.fsg | 53 ++---- test/regression/test.rightRecursion.fsg | 26 +-- 7 files changed, 189 insertions(+), 185 deletions(-) diff --git a/src/libsphinxbase/lm/jsgf.c b/src/libsphinxbase/lm/jsgf.c index 47a26aa3..f5061a0d 100644 --- a/src/libsphinxbase/lm/jsgf.c +++ b/src/libsphinxbase/lm/jsgf.c @@ -56,6 +56,8 @@ extern int yyparse (void* scanner, jsgf_t* jsgf); * into Sphinx finite-state grammars. **/ +static int expand_rule(jsgf_t *grammar, jsgf_rule_t *rule, int rule_entry, int rule_exit); + jsgf_atom_t * jsgf_atom_new(char *name, float weight) { @@ -299,78 +301,124 @@ importname2rulename(char *importname) } } -static int expand_rule(jsgf_t *grammar, jsgf_rule_t *rule); +#define NO_NODE -1 +#define RECURSIVE_NODE -2 + +/** + * + * Expand a right-hand-side of a rule (i.e. a single alternate). + * + * @returns the FSG state at the end of this rule, NO_NODE if there's an + * error, and RECURSIVE_NODE if the right-hand-side ended in right-recursion (i.e. + * a link to an earlier FSG state). + */ static int -expand_rhs(jsgf_t *grammar, jsgf_rule_t *rule, jsgf_rhs_t *rhs) +expand_rhs(jsgf_t *grammar, jsgf_rule_t *rule, jsgf_rhs_t *rhs, + int rule_entry, int rule_exit) { gnode_t *gn; int lastnode; /* Last node expanded in this sequence. */ - lastnode = rule->entry; + lastnode = rule_entry; /* Iterate over atoms in rhs and generate links/nodes */ for (gn = rhs->atoms; gn; gn = gnode_next(gn)) { jsgf_atom_t *atom = gnode_ptr(gn); + if (jsgf_atom_is_rule(atom)) { jsgf_rule_t *subrule; char *fullname; gnode_t *subnode; - void *val; - - /* Special case for and pseudo-rules */ - if (0 == strcmp(atom->name, "")) { - /* Emit a NULL transition */ - jsgf_add_link(grammar, atom, - lastnode, grammar->nstate); - lastnode = grammar->nstate; - ++grammar->nstate; + jsgf_rule_stack_t *rule_stack_entry; + + /* Special case for and pseudo-rules + If this is the only atom in the rhs, and it's the + first rhs in the rule, then emit a null transition, + creating an exit state if needed. */ + if (0 == strcmp(atom->name, "")) { + if (gn == rhs->atoms && gnode_next(gn) == NULL) { + if (rule_exit == NO_NODE) { + jsgf_add_link(grammar, atom, + lastnode, grammar->nstate); + rule_exit = lastnode = grammar->nstate; + ++grammar->nstate; + } else { + jsgf_add_link(grammar, atom, + lastnode, rule_exit); + } + } continue; - } + } else if (0 == strcmp(atom->name, "")) { /* Make this entire RHS unspeakable */ - return -1; + return NO_NODE; } fullname = jsgf_fullname_from_rule(rule, atom->name); - if (hash_table_lookup(grammar->rules, fullname, &val) == -1) { + if (hash_table_lookup(grammar->rules, fullname, (void**)&subrule) == -1) { E_ERROR("Undefined rule in RHS: %s\n", fullname); ckd_free(fullname); - return -1; + return NO_NODE; } ckd_free(fullname); - subrule = val; - /* Look for this in the stack of expanded rules */ - for (subnode = grammar->rulestack; subnode; subnode = gnode_next(subnode)) - if (gnode_ptr(subnode) == (void *)subrule) + + /* Look for this subrule in the stack of expanded rules */ + for (subnode = grammar->rulestack; subnode; subnode = gnode_next(subnode)) { + rule_stack_entry = (jsgf_rule_stack_t *)gnode_ptr(subnode); + if (rule_stack_entry->rule == subrule) break; + } + if (subnode != NULL) { /* Allow right-recursion only. */ if (gnode_next(gn) != NULL) { E_ERROR("Only right-recursion is permitted (in %s.%s)\n", grammar->name, rule->name); - return -1; + return NO_NODE; } /* Add a link back to the beginning of this rule instance */ - E_INFO("Right recursion %s %d => %d\n", atom->name, lastnode, subrule->entry); - jsgf_add_link(grammar, atom, lastnode, subrule->entry); + E_INFO("Right recursion %s %d => %d\n", atom->name, lastnode, rule_stack_entry->entry); + jsgf_add_link(grammar, atom, lastnode, rule_stack_entry->entry); + + /* Let our caller know that this rhs didn't reach an + end state. */ + lastnode = RECURSIVE_NODE; } else { + /* If this is the last atom in this rhs, link its + expansion to the parent rule's exit state. + Otherwise, create a new exit state for it. */ + int subruleexit = NO_NODE; + if (gnode_next(gn) == NULL && rule_exit >= 0) + subruleexit = rule_exit; + /* Expand the subrule */ - if (expand_rule(grammar, subrule) == -1) - return -1; - /* Add a link into the subrule. */ - jsgf_add_link(grammar, atom, - lastnode, subrule->entry); - lastnode = subrule->exit; + lastnode = expand_rule(grammar, subrule, lastnode, subruleexit); + + if (lastnode == NO_NODE) + return NO_NODE; } } else { - /* Add a link for this token and create a new exit node. */ + /* An exit-state is created if this isn't the last atom + in the rhs, or if the containing rule doesn't have an + exit state yet. + Otherwise, the rhs's exit state becomes the containing + rule's exit state. */ + + int exitstate; + if (gnode_next(gn) == NULL && rule_exit >= 0) { + exitstate = rule_exit; + } else { + exitstate = grammar->nstate; + ++grammar->nstate; + } + + /* Add a link for this token */ jsgf_add_link(grammar, atom, - lastnode, grammar->nstate); - lastnode = grammar->nstate; - ++grammar->nstate; + lastnode, exitstate); + lastnode = exitstate; } } @@ -378,15 +426,21 @@ expand_rhs(jsgf_t *grammar, jsgf_rule_t *rule, jsgf_rhs_t *rhs) } static int -expand_rule(jsgf_t *grammar, jsgf_rule_t *rule) +expand_rule(jsgf_t *grammar, jsgf_rule_t *rule, int rule_entry, + int rule_exit) { + jsgf_rule_stack_t* rule_stack_entry; jsgf_rhs_t *rhs; float norm; /* Push this rule onto the stack */ - grammar->rulestack = glist_add_ptr(grammar->rulestack, rule); + rule_stack_entry = (jsgf_rule_stack_t*)ckd_calloc(1, sizeof (jsgf_rule_stack_t)); + rule_stack_entry->rule = rule; + rule_stack_entry->entry = rule_entry; + grammar->rulestack = glist_add_ptr(grammar->rulestack, + rule_stack_entry); - /* Normalize weights for all alternatives exiting rule->entry */ + /* Normalize weights for all alternatives exiting rule_entry */ norm = 0; for (rhs = rule->rhs; rhs; rhs = rhs->alt) { if (rhs->atoms) { @@ -395,28 +449,43 @@ expand_rule(jsgf_t *grammar, jsgf_rule_t *rule) } } - rule->entry = grammar->nstate++; - rule->exit = grammar->nstate++; if (norm == 0) norm = 1; for (rhs = rule->rhs; rhs; rhs = rhs->alt) { int lastnode; if (rhs->atoms) { jsgf_atom_t *atom = gnode_ptr(rhs->atoms); - atom->weight /= norm; - } - lastnode = expand_rhs(grammar, rule, rhs); - if (lastnode == -1) { - return -1; + atom->weight /= norm; } - else { - jsgf_add_link(grammar, NULL, lastnode, rule->exit); + + lastnode = expand_rhs(grammar, rule, rhs, + rule_entry, rule_exit); + + if (lastnode == NO_NODE) { + return NO_NODE; + } else if (lastnode == RECURSIVE_NODE) { + /* The rhs ended with right-recursion, i.e. a transition to + an earlier state. Nothing needs to happen at this level. */ + ; + } else if (rule_exit == NO_NODE) { + /* If this rule doesn't have an exit state yet, use the exit + state of its first right-hand-side. + All other right-hand-sides will use this exit state. */ + assert (lastnode >= 0); + rule_exit = lastnode; } } + /* If no exit-state was created, use the entry-state. */ + if (rule_exit == NO_NODE) { + rule_exit = rule_entry; + } + /* Pop this rule from the rule stack */ + ckd_free(gnode_ptr(grammar->rulestack)); grammar->rulestack = gnode_free(grammar->rulestack, NULL); - return rule->exit; + + return rule_exit; } jsgf_rule_iter_t * @@ -454,6 +523,7 @@ jsgf_build_fsg_internal(jsgf_t *grammar, jsgf_rule_t *rule, fsg_model_t *fsg; glist_t nulls; gnode_t *gn; + int rule_entry, rule_exit; /* Clear previous links */ for (gn = grammar->links; gn; gn = gnode_next(gn)) { @@ -461,13 +531,16 @@ jsgf_build_fsg_internal(jsgf_t *grammar, jsgf_rule_t *rule, } glist_free(grammar->links); grammar->links = NULL; - rule->entry = rule->exit = 0; grammar->nstate = 0; - expand_rule(grammar, rule); + + /* Create the top-level entry state, and expand the + top-level rule. */ + rule_entry = grammar->nstate++; + rule_exit = expand_rule(grammar, rule, rule_entry, NO_NODE); fsg = fsg_model_init(rule->name, lmath, lw, grammar->nstate); - fsg->start_state = rule->entry; - fsg->final_state = rule->exit; + fsg->start_state = rule_entry; + fsg->final_state = rule_exit; grammar->links = glist_reverse(grammar->links); for (gn = grammar->links; gn; gn = gnode_next(gn)) { jsgf_link_t *link = gnode_ptr(gn); @@ -527,7 +600,7 @@ jsgf_read_file(const char *file, logmath_t * lmath, float32 lw) itor = jsgf_rule_iter_next(itor)) { rule = jsgf_rule_iter_rule(itor); if (jsgf_rule_public(rule)) { - jsgf_rule_iter_free(itor); + jsgf_rule_iter_free(itor); break; } } @@ -557,6 +630,7 @@ jsgf_write_fsg(jsgf_t *grammar, jsgf_rule_t *rule, FILE *outfh) logmath_free(lmath); return -1; } + jsgf_rule_t * jsgf_define_rule(jsgf_t *jsgf, char *name, jsgf_rhs_t *rhs, int is_public) { diff --git a/src/libsphinxbase/lm/jsgf_internal.h b/src/libsphinxbase/lm/jsgf_internal.h index a59b6540..cda71b31 100644 --- a/src/libsphinxbase/lm/jsgf_internal.h +++ b/src/libsphinxbase/lm/jsgf_internal.h @@ -64,6 +64,7 @@ extern "C" { typedef struct jsgf_rhs_s jsgf_rhs_t; typedef struct jsgf_atom_s jsgf_atom_t; typedef struct jsgf_link_s jsgf_link_t; +typedef struct jsgf_rule_stack_s jsgf_rule_stack_t; struct jsgf_s { char *version; /**< JSGF version (from header) */ @@ -82,14 +83,17 @@ struct jsgf_s { glist_t rulestack; /**< Stack of currently expanded rules. */ }; +/* A type to keep track of the stack of rules currently being expanded. */ +struct jsgf_rule_stack_s { + jsgf_rule_t *rule; /**< The rule being expanded */ + int entry; /**< The entry-state for this expansion */ +}; + struct jsgf_rule_s { int refcnt; /**< Reference count. */ char *name; /**< Rule name (NULL for an alternation/grouping) */ int is_public; /**< Is this rule marked 'public'? */ jsgf_rhs_t *rhs; /**< Expansion */ - - int entry; /**< Entry state for current instance of this rule. */ - int exit; /**< Exit state for current instance of this rule. */ }; struct jsgf_rhs_s { diff --git a/test/regression/test.command.fsg b/test/regression/test.command.fsg index a56f2422..d9b61a24 100644 --- a/test/regression/test.command.fsg +++ b/test/regression/test.command.fsg @@ -1,49 +1,24 @@ FSG_BEGIN -NUM_STATES 33 +NUM_STATES 10 START_STATE 0 -FINAL_STATE 1 -TRANSITION 0 2 1.000000 -TRANSITION 2 4 1.000000 -TRANSITION 3 14 1.000000 -TRANSITION 4 7 0.200004 oh -TRANSITION 4 10 0.200004 could -TRANSITION 4 12 0.200004 kindly -TRANSITION 4 13 0.200004 please -TRANSITION 4 6 0.200004 -TRANSITION 5 3 1.000000 -TRANSITION 6 5 1.000000 -TRANSITION 7 8 1.000000 mighty -TRANSITION 8 9 1.000000 computer -TRANSITION 9 5 1.000000 -TRANSITION 10 11 1.000000 you -TRANSITION 11 5 1.000000 -TRANSITION 12 5 1.000000 -TRANSITION 13 5 1.000000 -TRANSITION 14 20 0.500041 -TRANSITION 14 16 0.500041 -TRANSITION 15 24 1.000000 -TRANSITION 16 18 0.500041 stop -TRANSITION 16 19 0.500041 stop -TRANSITION 17 15 1.000000 -TRANSITION 18 17 1.000000 -TRANSITION 19 16 1.000000 -TRANSITION 19 17 1.000000 -TRANSITION 20 23 0.500041 go -TRANSITION 20 22 0.500041 -TRANSITION 21 15 1.000000 -TRANSITION 22 21 1.000000 -TRANSITION 23 20 1.000000 -TRANSITION 23 21 1.000000 -TRANSITION 24 26 1.000000 -TRANSITION 25 1 1.000000 -TRANSITION 26 29 0.250016 thank -TRANSITION 26 31 0.250016 thanks -TRANSITION 26 32 0.250016 please -TRANSITION 26 28 0.250016 -TRANSITION 27 25 1.000000 -TRANSITION 28 27 1.000000 -TRANSITION 29 30 1.000000 you -TRANSITION 30 27 1.000000 -TRANSITION 31 27 1.000000 -TRANSITION 32 27 1.000000 +FINAL_STATE 8 +TRANSITION 0 1 0.200004 please +TRANSITION 0 1 0.200004 kindly +TRANSITION 0 2 0.200004 oh +TRANSITION 0 4 0.200004 could +TRANSITION 0 1 0.200004 +TRANSITION 1 5 0.500041 stop +TRANSITION 1 6 0.500041 stop +TRANSITION 1 7 0.500041 go +TRANSITION 1 5 0.500041 +TRANSITION 2 3 1.000000 mighty +TRANSITION 3 1 1.000000 computer +TRANSITION 4 1 1.000000 you +TRANSITION 5 8 0.250016 please +TRANSITION 5 8 0.250016 thanks +TRANSITION 5 9 0.250016 thank +TRANSITION 5 8 0.250016 +TRANSITION 6 1 1.000000 +TRANSITION 7 1 1.000000 +TRANSITION 9 8 1.000000 you FSG_END diff --git a/test/regression/test.kleene.fsg b/test/regression/test.kleene.fsg index b4d97e24..d839271b 100644 --- a/test/regression/test.kleene.fsg +++ b/test/regression/test.kleene.fsg @@ -1,22 +1,14 @@ FSG_BEGIN -NUM_STATES 14 +NUM_STATES 7 START_STATE 0 -FINAL_STATE 1 -TRANSITION 0 2 1.000000 -TRANSITION 2 4 0.500041 -TRANSITION 2 5 0.500041 -TRANSITION 3 12 1.000000 don't -TRANSITION 4 3 1.000000 -TRANSITION 5 7 0.333356 oh -TRANSITION 5 10 0.333356 kindly -TRANSITION 5 11 0.333356 please -TRANSITION 6 2 1.000000 -TRANSITION 6 3 1.000000 -TRANSITION 7 8 1.000000 mighty -TRANSITION 8 9 1.000000 computer -TRANSITION 9 6 1.000000 -TRANSITION 10 6 1.000000 -TRANSITION 11 6 1.000000 -TRANSITION 12 13 1.000000 crash -TRANSITION 13 1 1.000000 +FINAL_STATE 6 +TRANSITION 0 2 0.333356 oh +TRANSITION 0 4 0.333356 please +TRANSITION 0 4 0.333356 kindly +TRANSITION 0 1 0.500041 +TRANSITION 1 5 1.000000 don't +TRANSITION 2 3 1.000000 mighty +TRANSITION 3 4 1.000000 computer +TRANSITION 4 0 1.000000 +TRANSITION 5 6 1.000000 crash FSG_END diff --git a/test/regression/test.nestedRightRecursion.fsg b/test/regression/test.nestedRightRecursion.fsg index 1b6a5da8..10f3aaca 100644 --- a/test/regression/test.nestedRightRecursion.fsg +++ b/test/regression/test.nestedRightRecursion.fsg @@ -1,13 +1,7 @@ FSG_BEGIN -NUM_STATES 6 +NUM_STATES 2 START_STATE 0 FINAL_STATE 1 -TRANSITION 0 5 0.500041 something -TRANSITION 0 2 0.500041 -TRANSITION 2 4 0.500041 another -TRANSITION 2 0 0.500041 -TRANSITION 2 3 1.000000 -TRANSITION 3 1 1.000000 -TRANSITION 4 3 1.000000 -TRANSITION 5 1 1.000000 +TRANSITION 0 1 0.500041 something +TRANSITION 0 1 0.500041 another FSG_END diff --git a/test/regression/test.nulltest.fsg b/test/regression/test.nulltest.fsg index 5f0b8151..672834f5 100644 --- a/test/regression/test.nulltest.fsg +++ b/test/regression/test.nulltest.fsg @@ -1,42 +1,17 @@ FSG_BEGIN -NUM_STATES 35 +NUM_STATES 10 START_STATE 0 -FINAL_STATE 1 -TRANSITION 0 2 1.000000 -TRANSITION 2 4 1.000000 -TRANSITION 3 13 1.000000 -TRANSITION 4 5 1.000000 -TRANSITION 5 7 1.000000 one -TRANSITION 6 3 1.000000 -TRANSITION 7 8 1.000000 -TRANSITION 8 11 0.500041 and -TRANSITION 8 10 0.500041 -TRANSITION 9 12 1.000000 one -TRANSITION 10 9 1.000000 -TRANSITION 11 9 1.000000 -TRANSITION 12 6 1.000000 -TRANSITION 13 15 1.000000 -TRANSITION 14 24 1.000000 -TRANSITION 15 16 1.000000 -TRANSITION 16 18 1.000000 two -TRANSITION 17 14 1.000000 -TRANSITION 18 19 1.000000 -TRANSITION 19 22 0.500041 and -TRANSITION 19 21 0.500041 -TRANSITION 20 23 1.000000 two -TRANSITION 21 20 1.000000 -TRANSITION 22 20 1.000000 -TRANSITION 23 17 1.000000 -TRANSITION 24 26 1.000000 -TRANSITION 25 1 1.000000 -TRANSITION 26 27 1.000000 -TRANSITION 27 29 1.000000 three -TRANSITION 28 25 1.000000 -TRANSITION 29 30 1.000000 -TRANSITION 30 33 0.500041 and -TRANSITION 30 32 0.500041 -TRANSITION 31 34 1.000000 three -TRANSITION 32 31 1.000000 -TRANSITION 33 31 1.000000 -TRANSITION 34 28 1.000000 +FINAL_STATE 9 +TRANSITION 0 1 1.000000 one +TRANSITION 1 2 0.500041 and +TRANSITION 1 2 0.500041 +TRANSITION 2 3 1.000000 one +TRANSITION 3 4 1.000000 two +TRANSITION 4 5 0.500041 and +TRANSITION 4 5 0.500041 +TRANSITION 5 6 1.000000 two +TRANSITION 6 7 1.000000 three +TRANSITION 7 8 0.500041 and +TRANSITION 7 8 0.500041 +TRANSITION 8 9 1.000000 three FSG_END diff --git a/test/regression/test.rightRecursion.fsg b/test/regression/test.rightRecursion.fsg index f4301f46..90a6d0c7 100644 --- a/test/regression/test.rightRecursion.fsg +++ b/test/regression/test.rightRecursion.fsg @@ -1,21 +1,11 @@ FSG_BEGIN -NUM_STATES 13 +NUM_STATES 3 START_STATE 0 -FINAL_STATE 1 -TRANSITION 0 2 0.500041 -TRANSITION 0 9 0.500041 -TRANSITION 2 4 1.000000 -TRANSITION 3 1 1.000000 -TRANSITION 4 6 0.500041 start -TRANSITION 4 7 0.500041 stop -TRANSITION 5 8 1.000000 and -TRANSITION 6 5 1.000000 -TRANSITION 7 5 1.000000 -TRANSITION 8 0 1.000000 -TRANSITION 8 3 1.000000 -TRANSITION 9 11 0.500041 start -TRANSITION 9 12 0.500041 stop -TRANSITION 10 1 1.000000 -TRANSITION 11 10 1.000000 -TRANSITION 12 10 1.000000 +FINAL_STATE 0 +TRANSITION 0 0 0.500041 stop +TRANSITION 0 0 0.500041 start +TRANSITION 0 1 0.500041 stop +TRANSITION 0 1 0.500041 start +TRANSITION 1 2 1.000000 and +TRANSITION 2 0 1.000000 FSG_END