fix-up so single parser works for en, ru, amy #486

Merged
12 commits merged on Jan 27, 2017
4 changes: 2 additions & 2 deletions data/amy/4.0.dict
@@ -17,6 +17,6 @@ ANY-PUNCT: {@ANY-} & {@ANY+};

JUNK: {@ANY-} & {@ANY+};

MOR-PREF: PF+;
MOR-STEM: {@PF-} & LL+;
MOR-PREF: PL+;
MOR-STEM: {@PL-} & LL+;
MOR-SUFF: LL- & {@ANY-} & {@ANY+};
172 changes: 76 additions & 96 deletions link-grammar/api.c
@@ -535,6 +535,13 @@ static void post_process_lkgs(Sentence sent, Parse_Options opts)
size_t N_linkages_alloced = sent->num_linkages_alloced;
bool twopass = sent->length >= opts->twopass_length;

/* Special-case the "amy/ady" morphology handling. */
if (sent->dict->affix_table->anysplit)
{
sent->num_linkages_post_processed = sent->num_valid_linkages;
return;
}

/* (optional) First pass: just visit the linkages */
/* The purpose of the first pass is to make the post-processing
* more efficient. Because (hopefully) by the time the real work
@@ -959,11 +966,12 @@ static void wordgraph_path_free(Wordgraph_pathpos *wp, bool free_final_path)
* another alternative. This can happen due to the way in which word
* alternatives are implemented.
*
* It does so by checking that all the chosen disjuncts in a linkage (including
* null words) match, in the same order, a path in the Wordgraph.
* It does so by checking that all the chosen disjuncts in a linkage
* (including null words) match, in the same order, a path in the
* Wordgraph.
*
* An important side effect of this check is that if the linkage is good,
* its Wordgraph path is found.
* An important side effect of this check is that if the linkage is
* good, its Wordgraph path is found.
*
* Optionally (if SANEMORPHISM regex is defined in the affix file), it
* also validates that the morpheme-type sequence is permitted for the
@@ -985,9 +993,10 @@ bool sane_linkage_morphism(Sentence sent, Linkage lkg, Parse_Options opts)

Dictionary afdict = sent->dict->affix_table; /* for SANEMORPHISM */
char *const affix_types = alloca(sent->length*2 + 1); /* affix types */

affix_types[0] = '\0';

lkg->wg_path = NULL;

/* Populate the path word queue, initializing the path to NULL. */
for (next = sent->wordgraph->next; *next; next++)
{
@@ -999,7 +1008,7 @@ bool sane_linkage_morphism(Sentence sent, Linkage lkg, Parse_Options opts)
{
Disjunct *cdj; /* chosen disjunct */

lgdebug(D_SLM, "%p Word %zu: ", lkg, i);
lgdebug(D_SLM, "lkg=%p Word %zu: ", lkg, i);

if (NULL == wp_new)
{
@@ -1087,8 +1096,10 @@ bool sane_linkage_morphism(Sentence sent, Linkage lkg, Parse_Options opts)
if (match_found)
{
match_found = false;
/* Validate that there are no missing words in the linkage. It is so if
* the dummy termination word is found in the new pathpos queue. */
/* Validate that there are no missing words in the linkage.
 * This is the case if the dummy termination word is found in
 * the new pathpos queue.
 */
if (NULL != wp_new)
{
for (wpp = wp_new; NULL != wpp->word; wpp++)
@@ -1258,81 +1269,52 @@ static bool setup_linkages(Sentence sent, fast_matcher_t* mchxt,
return overflowed;
}

/**
* This fills the linkage array with morphologically-acceptable
* linkages.
*/
static void process_linkages(Sentence sent, bool overflowed, Parse_Options opts)
{
/*
* We want to pick random linkages in three special cases:
* if there's an overflow,
if more were found than were asked for,
* if randomization was explicitly asked for.
*/
bool pick_randomly = overflowed ||
(sent->num_linkages_found != (int) sent->num_linkages_alloced) ||
(0 != sent->rand_state);

Parse_info pi = sent->parse_info;

size_t N_invalid_morphism = 0;
sent->num_valid_linkages = sent->num_linkages_alloced;

for (size_t in=0; in < sent->num_linkages_alloced; in++)
{
Linkage lkg = &sent->lnkages[in];
Linkage_info * lifo = &lkg->lifo;

lifo->index = pick_randomly? -(in+1) : in;

partial_init_linkage(sent, lkg, pi->N_words);

/* The extract_links() call sets the chosen_disjuncts array */
extract_links(lkg, pi);
compute_link_names(lkg, sent->string_set);
remove_empty_words(lkg);

if (!sane_linkage_morphism(sent, lkg, opts))
{
lifo->N_violations++;
lifo->pp_violation_msg = "Invalid morphism construction.";
lifo->discarded = true;
lkg->wg_path = NULL;
sent->num_valid_linkages --;
N_invalid_morphism ++;
}
}
if (0 == sent->num_linkages_found) return;

if (verbosity_level(5))
{
prt_error("Info: sane_morphism(): %zu of %zu linkages had "
"invalid morphology construction\n",
N_invalid_morphism, sent->num_linkages_alloced);
}
}
/* Pick random linkages if we get more than what was asked for. */
bool pick_randomly = overflowed ||
(sent->num_linkages_found != (int) sent->num_linkages_alloced);

/* Special-case the "amy/ady" languages; the ones that perform
* random morphological splitting. This is due to a feature/bug
* in the parser design: not everything that it finds is valid,
* because morphemes from the wrong splits were matched up.
* For longer sentences, this can even be 999 out of every 1000
* that get mis-matched, and so we need to discard these earlier.
*/
static void fill_em_up(Sentence sent, Parse_Options opts)
{
Parse_info pi = sent->parse_info;
pi->rand_state = sent->rand_state;
sent->num_valid_linkages = 0;
size_t N_invalid_morphism = 0;

size_t itry = 0;
size_t in = 0;
int foo=0;
size_t maxtries = sent->num_linkages_alloced;

/* If we're picking randomly, then try as many as we are allowed. */
if (pick_randomly) maxtries = sent->num_linkages_found;

/* In the case of overflow, which will happen for some long
* sentences, but is particularly common for the amy/ady random
* splitters, we want to find as many morpho-acceptable linkages
* as possible, but keep the CPU usage down, as these might be
* very rare. This is due to a bug/feature in the interaction
* between the word-graph and the parser: valid morph linkages
* can be one-in-a-thousand, or worse. Search for them, but
* don't over-do it.
*/
#define MAX_TRIES 250000
if (MAX_TRIES < maxtries) maxtries = MAX_TRIES;

bool need_init = true;
while (in < sent->num_linkages_alloced)
for (itry=0; itry<maxtries; itry++)
{
Linkage lkg = &sent->lnkages[in];
Linkage_info * lifo = &lkg->lifo;

foo++;
lifo->index = -(in+1);
lkg->lifo.index = -foo;
/* Negative values tell extract-links to pick randomly; for
* reproducible-rand, the actual value is the rand seed. */
lifo->index = pick_randomly ? -(itry+1) : itry;

// printf("duude try to %d %d of %d\n", foo, in, sent->num_linkages_alloced);
if (need_init)
{
partial_init_linkage(sent, lkg, pi->N_words);
@@ -1347,27 +1329,35 @@ foo++;
need_init = true;
in++;
sent->num_valid_linkages ++;
// printf("duude foo-sane %d %d of %d\n", foo, in, sent->num_linkages_alloced);
if (in >= sent->num_linkages_alloced) break;
}
else
{
lifo->discarded = true;
lkg->wg_path = NULL;
N_invalid_morphism ++;
lkg->num_links = 0;
memset(lkg->link_array, 0, lkg->lasz * sizeof(Link));
lkg->num_words = pi->N_words;
// memset(lkg->link_array, 0, lkg->lasz * sizeof(Link));
memset(lkg->chosen_disjuncts, 0, pi->N_words * sizeof(Disjunct *));
}
in++;
need_init = true;
}
// printf("duuude unscathed %d\n", sent->num_valid_linkages);
sent->num_linkages_post_processed = sent->num_valid_linkages;
// TODO: set lifo->discarded = true; for any remaining...

/* The remainder of the array is garbage; we never filled it in.
* So just pretend that it's shorter than it is */
sent->num_linkages_alloced = sent->num_valid_linkages;

if (verbosity_level(5))
{
prt_error("Info: sane_morphism(): %zu of %zu linkages had "
"invalid morphology construction\n",
N_invalid_morphism, sent->num_linkages_alloced);
}
}

/**
* chart_parse() -- parse the given sentence.
* (Misnamed, this has nothing to do with chart parsing.)
* classic_parse() -- parse the given sentence.
* Perform parsing, using the original link-grammar parsing algorithm
* given in the original link-grammar papers.
*
* Do the parse with the minimum number of null-links within the range
* specified by opts->min_null_count and opts->max_null_count.
*
@@ -1388,7 +1378,7 @@ need_init = true;
* null_count>0. To solve that, we need to restore the original
* disjuncts of the sentence and call pp_and_power_prune() once again.
*/
static void chart_parse(Sentence sent, Parse_Options opts)
static void classic_parse(Sentence sent, Parse_Options opts)
{
fast_matcher_t * mchxt = NULL;
count_context_t * ctxt;
@@ -1473,20 +1463,10 @@ static void chart_parse(Sentence sent, Parse_Options opts)
sent->num_linkages_found = (int) total;
print_time(opts, "Counted parses");

/* Special-case the "amy/ady" morphology handling. */
if (sent->dict->affix_table->anysplit)
{
printf("duuuuuuuuuuuuuuuuuuuuuude \n");
setup_linkages(sent, mchxt, ctxt, opts);
fill_em_up(sent, opts);
}
else
{
/* Normal processing path */
bool ovfl = setup_linkages(sent, mchxt, ctxt, opts);
process_linkages(sent, ovfl, opts);
post_process_lkgs(sent, opts);
}
bool ovfl = setup_linkages(sent, mchxt, ctxt, opts);
process_linkages(sent, ovfl, opts);
post_process_lkgs(sent, opts);

if (sent->num_valid_linkages > 0) break;
if ((0 == nl) && (0 < max_null_count) && verbosity > 0)
prt_error("No complete linkages found.\n");
@@ -1544,7 +1524,7 @@ int sentence_parse(Sentence sent, Parse_Options opts)
}
else
{
chart_parse(sent, opts);
classic_parse(sent, opts);
}
print_time(opts, "Finished parse");

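For orientation, the core of the merged process_linkages() above boils down to the following accept/reject loop. This is a simplified sketch, not part of the patch: the need_init bookkeeping and the clearing of stale link arrays on rejection are omitted, and all names are taken from the diff above.

	/* Try up to maxtries candidate linkages; keep only the ones that
	 * pass the sane-morphism check, and stop once all of the
	 * allocated linkage slots are filled. */
	size_t in = 0;
	sent->num_valid_linkages = 0;
	size_t maxtries = pick_randomly ? sent->num_linkages_found
	                                : sent->num_linkages_alloced;
	if (MAX_TRIES < maxtries) maxtries = MAX_TRIES;

	for (size_t itry = 0; itry < maxtries; itry++)
	{
		Linkage lkg = &sent->lnkages[in];

		/* A negative index tells extract_links() to sample randomly. */
		lkg->lifo.index = pick_randomly ? -(int)(itry + 1) : (int)itry;

		partial_init_linkage(sent, lkg, pi->N_words);
		extract_links(lkg, pi);              /* fills chosen_disjuncts */
		compute_link_names(lkg, sent->string_set);
		remove_empty_words(lkg);

		if (sane_linkage_morphism(sent, lkg, opts))
		{
			in++;                            /* keep this slot */
			sent->num_valid_linkages++;
			if (in >= sent->num_linkages_alloced) break;
		}
		/* On rejection, the same slot is overwritten on the next try. */
	}
	sent->num_linkages_post_processed = sent->num_valid_linkages;
	sent->num_linkages_alloced = sent->num_valid_linkages;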
1 change: 0 additions & 1 deletion link-grammar/constituents.c
@@ -1064,7 +1064,6 @@ static char * do_print_flat_constituents(con_context_t *ctxt, Linkage linkage)
char * q;
Sentence sent = linkage->sent;

assert(NULL != sent->lnkages, "No linkages"); /* Sentence already free()'d */
ctxt->phrase_ss = string_set_create();
generate_misc_word_info(ctxt, linkage);

4 changes: 2 additions & 2 deletions link-grammar/extract-links.c
@@ -602,7 +602,7 @@ static void issue_links_for_choice(Linkage lkg, Parse_choice *pc)
}
}

static void list_links(Linkage lkg, Parse_set * set, int index)
static void list_links(Linkage lkg, const Parse_set * set, int index)
{
Parse_choice *pc;
s64 n;
@@ -619,7 +619,7 @@ static void list_links(Linkage lkg, Parse_set * set, int index)
list_links(lkg, pc->set[1], index / pc->set[0]->count);
}

static void list_random_links(Linkage lkg, Parse_info pi, Parse_set * set)
static void list_random_links(Linkage lkg, Parse_info pi, const Parse_set * set)
{
Parse_choice *pc;
int num_pc, new_index;
8 changes: 5 additions & 3 deletions link-grammar/linkage.c
@@ -131,6 +131,8 @@ static Gword *wordgraph_null_join(Sentence sent, Gword **start, Gword **end)

#define SUBSCRIPT_SEP SUBSCRIPT_DOT /* multiple-subscript separator */

#define PREFIX_SUPPRESS ("PL") /* prefix links start with this */
#define PREFIX_SUPPRESS_L 2 /* length of above */
#define SUFFIX_SUPPRESS ("LL") /* suffix links start with this */
#define SUFFIX_SUPPRESS_L 2 /* length of above */

@@ -141,13 +143,13 @@ static Gword *wordgraph_null_join(Sentence sent, Gword **start, Gword **end)
/* FIXME: Define an affix class MORPHOLOGY_LINKS. */
static inline bool is_morphology_link(const char *link_name)
{
return (NULL != link_name) &&
(0 == strncmp(link_name, SUFFIX_SUPPRESS, SUFFIX_SUPPRESS_L));
if (NULL == link_name) return false;
return (0 == strncmp(link_name, SUFFIX_SUPPRESS, SUFFIX_SUPPRESS_L)) ||
(0 == strncmp(link_name, PREFIX_SUPPRESS, PREFIX_SUPPRESS_L));
}

/*
* Remap the link array according to discarded links and words.
* If pp_info exists, remap it too.
*
* The remap[] elements indicate the new WordIdx of the word.
* A value which is -1 indicates a discarded word.
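The two-prefix check above is small enough to exercise stand-alone. The following self-contained copy is for illustration only (it is not the library code itself); it shows that after this change both LL (stem-suffix) and PL (prefix-stem) link names are treated as morphology links, while the retired PF name no longer matches.

	#include <stdio.h>
	#include <string.h>
	#include <stdbool.h>

	/* Stand-alone copy of the updated check, for illustration only. */
	static bool is_morphology_link(const char *link_name)
	{
		if (NULL == link_name) return false;
		return (0 == strncmp(link_name, "LL", 2)) ||
		       (0 == strncmp(link_name, "PL", 2));
	}

	int main(void)
	{
		const char *names[] = { "LL", "LLXYZ", "PL", "PF", "Wd" };
		for (size_t i = 0; i < sizeof(names)/sizeof(names[0]); i++)
			printf("%-6s -> %s\n", names[i],
			       is_morphology_link(names[i]) ? "morphology" : "regular");
		return 0;
	}

Running this prints "morphology" for LL, LLXYZ and PL, and "regular" for PF and Wd.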
3 changes: 3 additions & 0 deletions link-grammar/print.c
@@ -428,6 +428,9 @@ linkage_print_diagram_ctxt(const Linkage linkage,
char xpicture[MAX_HEIGHT][MAX_LINE];
size_t start[MAX_HEIGHT];

// Avoid pathological case and the resulting crash.
if (0 == linkage->num_words) return strdup("");

string = string_new();

/* Do we want to print the left wall? */
28 changes: 8 additions & 20 deletions link-parser/link-parser.c
@@ -425,30 +425,18 @@ static void batch_process_some_linkages(Label label,

if (there_was_an_error(label, sent, opts))
{
/* If linkages were found, print them.
* The equality check takes care of the SAT solver, which, for
* now, leaves num_linkages_found==0. In any case, if there are
* actually no linkages, linkage_create() will return NULL, and
* nothing will be processed. */
if (sentence_num_linkages_found(sent) >= 0) {
/* If we found at least one good linkage, print it. */
if (sentence_num_valid_linkages(sent) > 0) {
Linkage linkage = NULL;
/* If we found at least one good linkage, print it. */
if (sentence_num_valid_linkages(sent) > 0) {
int i;
for (i=0; i<sentence_num_linkages_post_processed(sent); i++)
int i;
for (i=0; i<sentence_num_linkages_post_processed(sent); i++)
{
if (0 == sentence_num_violations(sent, i))
{
if (0 == sentence_num_violations(sent, i))
{
linkage = linkage_create(i, sent, opts);
break;
}
linkage = linkage_create(i, sent, opts);
break;
}
}
else
{
/* This linkage will be bad; no good ones were found. */
linkage = linkage_create(0, sent, opts);
}
process_linkage(linkage, copts);
linkage_delete(linkage);
}
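Finally, a minimal caller sketch showing how the simplified selection logic above is meant to be used from the public API: parse, then take the first linkage with zero post-processing violations. This assumes the standard liblink-grammar C API and that the "amy" dictionary data is installed; the input string and error handling are illustrative only.

	#include <stdio.h>
	#include <link-grammar/link-includes.h>

	int main(void)
	{
		Dictionary dict = dictionary_create_lang("amy"); /* or "en", "ru" */
		Parse_Options opts = parse_options_create();
		Sentence sent = sentence_create("asdf qwerty zzz", dict);

		sentence_split(sent, opts);
		sentence_parse(sent, opts);

		if (sentence_num_valid_linkages(sent) > 0)
		{
			/* Pick the first linkage with no post-processing violations,
			 * mirroring the simplified loop in link-parser.c above. */
			for (int i = 0; i < sentence_num_linkages_post_processed(sent); i++)
			{
				if (0 == sentence_num_violations(sent, i))
				{
					Linkage lkg = linkage_create(i, sent, opts);
					printf("linkage %d has %d links\n", i,
					       linkage_get_num_links(lkg));
					linkage_delete(lkg);
					break;
				}
			}
		}

		sentence_delete(sent);
		dictionary_delete(dict);
		parse_options_delete(opts);
		return 0;
	}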