Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve the connector hashing and extend condesc_t #1528

Merged
merged 11 commits into from
May 20, 2024
Merged
63 changes: 33 additions & 30 deletions link-grammar/connectors.c
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ static unsigned int get_connector_length_limit(condesc_t *cd,

int short_len = opts->short_length;
bool all_short = opts->all_short;
int length_limit = cd->length_limit;
int length_limit = cd->more->length_limit;

if ((all_short && (length_limit > short_len)) || (0 == length_limit))
return short_len;
Expand Down Expand Up @@ -165,7 +165,7 @@ static void set_condesc_length_limit(Dictionary dict, const Exp *e, int length_l
if (econlist[en]->uc_num != sdesc[cn]->uc_num) continue;
restart_cn = cn;

const char *wc_str = econlist[en]->string;
const char *wc_str = econlist[en]->more->string;
char *uc_wildcard = strchr(wc_str, LENGTH_LINIT_WILD_TYPE);

for (; cn < ct->num_con; cn++)
Expand All @@ -181,11 +181,11 @@ static void set_condesc_length_limit(Dictionary dict, const Exp *e, int length_l
else
{
/* The uppercase part is a prefix. */
if (0 != strncmp(wc_str, sdesc[cn]->string, uc_wildcard - wc_str))
if (0 != strncmp(wc_str, sdesc[cn]->more->string, uc_wildcard - wc_str))
break;
}

sdesc[cn]->length_limit = length_limit;
sdesc[cn]->more->length_limit = length_limit;
}
}
}
Expand Down Expand Up @@ -221,8 +221,8 @@ static void set_all_condesc_length_limit(Dictionary dict)

for (size_t en = 0; en < ct->num_con; en++)
{
if (0 == sdesc[en]->length_limit)
sdesc[en]->length_limit = UNLIMITED_LEN;
if (0 == sdesc[en]->more->length_limit)
sdesc[en]->more->length_limit = UNLIMITED_LEN;
}
}

Expand All @@ -234,7 +234,7 @@ static void set_all_condesc_length_limit(Dictionary dict)
for (size_t n = 0; n < ct->num_con; n++)
{
prt_error("%5zu %6u %3d %s\n\\", n, ct->sdesc[n]->uc_num,
ct->sdesc[n]->length_limit, ct->sdesc[n]->string);
ct->sdesc[n]->more->length_limit, ct->sdesc[n]->more->string);
}
prt_error("\n");
}
Expand Down Expand Up @@ -277,8 +277,8 @@ static void connector_encode_lc(const char *lc_string, condesc_t *desc)
lc_string, (int)(s-lc_string), MAX_CONNECTOR_LC_LENGTH);
}

desc->lc_mask = (lc_mask << 1) + !!(desc->flags & CD_HEAD_DEPENDENT);
desc->lc_letters = (lc_value << 1) + !!(desc->flags & CD_HEAD);
desc->lc_mask = (lc_mask << 1) + !!(desc->more->flags & CD_HEAD_DEPENDENT);
desc->lc_letters = (lc_value << 1) + !!(desc->more->flags & CD_HEAD);
}

/**
Expand All @@ -288,27 +288,27 @@ static void connector_encode_lc(const char *lc_string, condesc_t *desc)
*
* Note: check_connector() has already validated the connector string.
*/
void calculate_connector_info(condesc_t * c)
void calculate_connector_info(hdesc_t *hdesc)
{
const char *s;

s = c->string;
s = hdesc->string;
if (islower((unsigned char)*s))
{
dassert((c->string[0] == 'h') || (c->string[0] == 'd'),
"\'%c\': Bad head/dependent character", c->string[0]);
dassert((hdesc->string[0] == 'h') || (hdesc->string[0] == 'd'),
"\'%hdesc\': Bad head/dependent character", hdesc->string[0]);

if ((*s == 'h') || (*s == 'd')) c->flags |= CD_HEAD_DEPENDENT;
if (*s == 'h') c->flags |= CD_HEAD;
if ((*s == 'h') || (*s == 'd')) hdesc->flags |= CD_HEAD_DEPENDENT;
if (*s == 'h') hdesc->flags |= CD_HEAD;
s++; /* Ignore head-dependent indicator. */
}

c->uc_start = (uint8_t)(s - c->string);
hdesc->uc_start = (uint8_t)(s - hdesc->string);
/* Skip the uppercase part. */
do { s++; } while (is_connector_name_char(*s));
c->uc_length = (uint8_t)(s - c->string - c->uc_start);
hdesc->uc_length = (uint8_t)(s - hdesc->string - hdesc->uc_start);

connector_encode_lc(s, c);
connector_encode_lc(s, hdesc->desc);
}

/* ================= Connector descriptor table. ====================== */
Expand Down Expand Up @@ -373,11 +373,11 @@ int condesc_by_uc_constring(const void * a, const void * b)
if (NULL == *cda) return (NULL != *cdb);
if (NULL == *cdb) return -1;

const char *sa = &(*cda)->string[(*cda)->uc_start];
const char *sb = &(*cdb)->string[(*cdb)->uc_start];
const char *sa = &(*cda)->more->string[(*cda)->more->uc_start];
const char *sb = &(*cdb)->more->string[(*cdb)->more->uc_start];

int la = (*cda)->uc_length;
int lb = (*cdb)->uc_length;
int la = (*cda)->more->uc_length;
int lb = (*cdb)->more->uc_length;

if (la == lb)
{
Expand Down Expand Up @@ -424,7 +424,7 @@ static bool sort_condesc_by_uc_constring(Dictionary dict)
condesc_t *condesc = dict->contable.hdesc[n].desc;

if (NULL == condesc) continue;
calculate_connector_info(condesc);
calculate_connector_info(&dict->contable.hdesc[n]);
sdesc[i++] = dict->contable.hdesc[n].desc;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this seems to be same as condesc

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

condesc is dict->contable.hdesc[n].desc, not dict->contable.hdesc[n].
To be clearer, these lines can be changed to (not tested):

hdesc_t *hdesc = &dict->contable.hdesc[n];

if (NULL == hdesc->desc) continue;
calculate_connector_info(hdesc);
sdesc[i++] = hdesc->desc;

}

Expand All @@ -439,17 +439,17 @@ static bool sort_condesc_by_uc_constring(Dictionary dict)
{
condesc_t **condesc = &sdesc[n];

if (condesc[0]->uc_length != condesc[-1]->uc_length)
if (condesc[0]->more->uc_length != condesc[-1]->more->uc_length)

{
/* We know that the UC part has been changed. */
uc_num++;
}
else
{
const char *uc1 = &condesc[0]->string[condesc[0]->uc_start];
const char *uc2 = &condesc[-1]->string[condesc[-1]->uc_start];
if (0 != strncmp(uc1, uc2, condesc[0]->uc_length))
const char *uc1 = &condesc[0]->more->string[condesc[0]->more->uc_start];
const char *uc2 = &condesc[-1]->more->string[condesc[-1]->more->uc_start];
if (0 != strncmp(uc1, uc2, condesc[0]->more->uc_length))
{
uc_num++;
}
Expand Down Expand Up @@ -495,7 +495,7 @@ static hdesc_t *condesc_find(ConTable *ct, const char *constring, uint32_t hash)
uint32_t i = hash & (ct->size-1);

while ((NULL != ct->hdesc[i].desc) &&
!string_set_cmp(constring, ct->hdesc[i].desc->string))
!string_set_cmp(constring, ct->hdesc[i].string))
{
i = (i + 1) & (ct->size-1);
}
Expand Down Expand Up @@ -524,7 +524,7 @@ static bool condesc_grow(ConTable *ct)
{
hdesc_t *old_h = &old_hdesc[i];
if (NULL == old_h->desc) continue;
hdesc_t *new_h = condesc_find(ct, old_h->desc->string, old_h->str_hash);
hdesc_t *new_h = condesc_find(ct, old_h->string, old_h->str_hash);

if (NULL != new_h->desc)
{
Expand All @@ -533,6 +533,7 @@ static bool condesc_grow(ConTable *ct)
return false;
}
*new_h = *old_h;
new_h->desc->more = new_h;
}

free(old_hdesc);
Expand All @@ -548,9 +549,11 @@ condesc_t *condesc_add(ConTable *ct, const char *constring)
{
lgdebug(+11, "Creating connector '%s' (%zu)\n", constring, ct->num_con);
h->desc = pool_alloc(ct->mempool);
h->desc->string = constring;
h->string = constring;
h->desc->uc_num = UINT32_MAX;
h->str_hash = hash;
h->desc->more = h;
h->desc->con_num = ct->num_con;
ct->num_con++;

if ((8 * ct->num_con) > (3 * ct->size))
Expand Down
94 changes: 52 additions & 42 deletions link-grammar/connectors.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,9 @@
*/
#define MAX_SENTENCE 254 /* Maximum number of words in a sentence */

/* Length-limits for how far connectors can reach out. */
#define UNLIMITED_LEN 255

/* Since tracon IDs are unique per sentence, for convenience NULL
* connectors (zero-length tracons) have tracon IDs equal to the word
* number on which their disjunct resides. To that end an initial block
Expand Down Expand Up @@ -76,16 +79,16 @@ static inline bool is_connector_subscript_char(unsigned char c)
}
/* End of connector string character validation. */

/* Note: If more byte-size fields are needed, to save space
* uc_length and uc_start may be moved to struct hdesc. */
struct condesc_struct
{
lc_enc_t lc_letters;
lc_enc_t lc_mask;
typedef struct condesc_struct condesc_t;

/* The connector descriptors (see below) are pointed from a hash table
* with these elements. */
typedef struct hdesc
{
condesc_t *desc;
const char *string; /* The connector name w/o the direction mark, e.g. AB */
// float *cost; /* Array of cost by connector length (cost[0]: default) */
connector_uc_hash_t uc_num; /* uc part enumeration. */
// float *cost; // Array of cost by connector length (cost[0]: default)
connector_uc_hash_t str_hash;
uint8_t length_limit; /* If not 0, it gives the limit of the length of the
* link that can be used on this connector type. The
* value UNLIMITED_LEN specifies no limit.
Expand All @@ -97,11 +100,24 @@ struct condesc_struct
/* For connector match speedup when sorting the connector table. */
uint8_t uc_length; /* uc part length */
uint8_t uc_start; /* uc start position */
};
typedef struct condesc_struct condesc_t;
} hdesc_t;

/* Length-limits for how far connectors can reach out. */
#define UNLIMITED_LEN 255
/* Each connector type has a connector descriptor. The size of this
* struct is 32 byes, to facilitate CPU memory caching during parsing.
* The "more" field points to connector information that is needed in
* other steps. The con_num field is used a lot in steps that need
* connector hashing, and it is included here to avoid extra
* redirections.
* Multi connectors are considering the same type as their non-multi
* version, so the multi indication is kept in Connector_struct. */
struct condesc_struct
{
lc_enc_t lc_letters;
lc_enc_t lc_mask;
hdesc_t *more; /* More information, for keeping small struct size. */
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So hdesc points at condesc, and condesc points at hdesc. Interesting ... the old code did not have this loop.

Copy link
Member Author

@ampli ampli May 21, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

hdesc are the slots of the connector hash table. Each one that is used, has a desc pointer to a condesc_t element. Up to here it is a standard implementation of an open hash table.
The more elements of condesc_t need to point somewhere, and it was convenient to use the hdesc slots, which are already allocated (it requires less code). However, this is memory inefficient (and indirectly CPU inefficiently due to cache trashing) because most of the hdesc elements are empty (NULL desc).

The solution is to use a memory pool for the more elements. I will eventually send a cleanup PR for that (and the sort_condesc_by_uc_constring() code).

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh! Ah! OK. I think I understand. This wasn't obvious from casual perusal, so I thought I'd point it out.

connector_uc_hash_t uc_num; /* uc part enumeration. */
uint32_t con_num; /* Connector ordinal number. */
};

typedef struct length_limit_def
{
Expand All @@ -111,12 +127,6 @@ typedef struct length_limit_def
int length_limit;
} length_limit_def_t;

typedef struct hdesc
{
condesc_t *desc;
connector_uc_hash_t str_hash;
} hdesc_t;

typedef struct
{
hdesc_t *hdesc; /* Hashed connector descriptors table */
Expand Down Expand Up @@ -176,12 +186,17 @@ void condesc_reuse(Dictionary);
* accesses connectors */
static inline const char * connector_string(const Connector *c)
{
return c->desc->string;
return c->desc->more->string;
}

static inline unsigned int connector_uc_start(const Connector *c)
{
return c->desc->uc_start;
return c->desc->more->uc_start;
}

static inline unsigned int connector_uc_length(const Connector *c)
{
return c->desc->more->uc_length;
}

static inline const condesc_t *connector_desc(const Connector *c)
Expand All @@ -199,12 +214,16 @@ static inline unsigned int connector_uc_num(const Connector * c)
return c->desc->uc_num;
}

static inline unsigned int connector_num(const Connector * c)
{
return 2 * c->desc->con_num + c->multi;
}

/* Connector utilities ... */
Connector * connector_new(Pool_desc *, const condesc_t *);
void set_connector_farthest_word(Exp *, int, int, Parse_Options);
void free_connectors(Connector *);
void calculate_connector_info(condesc_t *);
void calculate_connector_info(hdesc_t *);
int condesc_by_uc_constring(const void *, const void *);

/**
Expand Down Expand Up @@ -310,41 +329,32 @@ static inline uint32_t string_hash(const char *s)
}

typedef uint32_t connector_hash_t;
static const connector_hash_t FIBONACCI_MULT = 0x9E3779B9;

static inline connector_hash_t connector_hash(const Connector *c)
{
// The use of (c->desc->lc_mask & 1) during hashing is important;
// See pull req #1487 for details. This raises other questions
// about hashing. Two forms are attempted below. They appear to
// be equivalent, in terms of measured elapsed-time performance.
// (I did not look at the quality of the distribution.)
// The second form uses some mixing bitshifts:
// 266281 == sum of 1 8 32 4096 (256*1024) It is a prime number
// 524429 == sum of 1 4 8 128 (512*1024) and it is a prime number
#ifdef SIMPLE_HASH
return c->desc->uc_num +
(c->multi << 19) +
(((connector_hash_t)c->desc->lc_mask & 1) << 20) +
(connector_hash_t)c->desc->lc_letters;
#else
return c->desc->uc_num +
c->multi * 266281 +
(((connector_hash_t)c->desc->lc_mask & 1) * 524429) +
((connector_hash_t)c->desc->lc_letters) * 101;
#endif
// connector_num() is different for each connector string + its multi
// attribute, and it naturally depends also on its head-dependent
// attribute, if any. For the importance of considering the
// head-dependent attribute during hashing see pull req #1487;
return connector_num(c);
}

/**
* \p c is assumed to be non-NULL.
*/
// To check hash functions, enable the "N" printing in
// eliminate_duplicate_disjuncts().
#define FEEDBACK_HASH 1
static inline connector_hash_t connector_list_hash(const Connector *c)
{
connector_hash_t accum = connector_hash(c);

for (c = c->next; c != NULL; c = c->next)
#ifdef FEEDBACK_HASH
accum = (19 * accum) + (accum >> 24) + connector_hash(c);
#if FEEDBACK_HASH
accum = (accum<<7) + (accum<<14) + (accum >> 16) - connector_hash(c);
#else
// Bad.
accum = (19 * accum) + connector_hash(c);
#endif

Expand Down
12 changes: 6 additions & 6 deletions link-grammar/dict-atomese/lookup-atomese.cc
Original file line number Diff line number Diff line change
Expand Up @@ -422,8 +422,8 @@ static void update_condesc(Dictionary dict)
if (NULL == condesc) continue;
if (UINT32_MAX != condesc->uc_num) continue;

calculate_connector_info(condesc);
condesc->length_limit = UNLIMITED_LEN;
calculate_connector_info(&ct->hdesc[n]);
condesc->more->length_limit = UNLIMITED_LEN;
sdesc[i++] = condesc;
}

Expand All @@ -438,16 +438,16 @@ static void update_condesc(Dictionary dict)
{
condesc_t **condesc = &sdesc[n];

if (condesc[0]->uc_length != condesc[-1]->uc_length)
if (condesc[0]->more->uc_length != condesc[-1]->more->uc_length)
{
/* We know that the UC part has been changed. */
uc_num++;
}
else
{
const char *uc1 = &condesc[0]->string[condesc[0]->uc_start];
const char *uc2 = &condesc[-1]->string[condesc[-1]->uc_start];
if (0 != strncmp(uc1, uc2, condesc[0]->uc_length))
const char *uc1 = &condesc[0]->more->string[condesc[0]->more->uc_start];
const char *uc2 = &condesc[-1]->more->string[condesc[-1]->more->uc_start];
if (0 != strncmp(uc1, uc2, condesc[0]->more->uc_length))
{
uc_num++;
}
Expand Down
2 changes: 1 addition & 1 deletion link-grammar/dict-atomese/word-pairs.cc
Original file line number Diff line number Diff line change
Expand Up @@ -385,7 +385,7 @@ static Exp* get_sent_pair_exprs(Dictionary dict, const Handle& germ,
{
assert(CONNECTOR_type == orch->type, "unexpected expression!");

if (links.end() != links.find(orch->condesc->string))
if (links.end() != links.find(orch->condesc->more->string))
{
Exp* cpe = (Exp*) pool_alloc(pool);
*cpe = *orch;
Expand Down
Loading