Skip to content

Commit

Permalink
Postpone position assignment until the end.
Browse files Browse the repository at this point in the history
Rather than trying to assign positions to literal nodes on creation, and
then having to renumber them on the fly during AST expansion, wait until
the end and move position assignment into `tre_compute_nfl()`, which we
rename to `tre_compute_pnfl()`.
  • Loading branch information
dag-erling committed Jul 24, 2024
1 parent d77e553 commit e049fa6
Show file tree
Hide file tree
Showing 5 changed files with 51 additions and 126 deletions.
4 changes: 2 additions & 2 deletions lib/tre-ast.c
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ tre_ast_new_node(tre_mem_t mem, tre_ast_type_t type, size_t size)
}

tre_ast_node_t *
tre_ast_new_literal(tre_mem_t mem, int code_min, int code_max, int position)
tre_ast_new_literal(tre_mem_t mem, int code_min, int code_max)
{
tre_ast_node_t *node;
tre_literal_t *lit;
Expand All @@ -44,7 +44,7 @@ tre_ast_new_literal(tre_mem_t mem, int code_min, int code_max, int position)
lit = node->obj;
lit->code_min = code_min;
lit->code_max = code_max;
lit->position = position;
lit->position = -1;
node->nullable = code_min < 0 && code_min != BACKREF;

return node;
Expand Down
2 changes: 1 addition & 1 deletion lib/tre-ast.h
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ tre_ast_node_t *
tre_ast_new_node(tre_mem_t mem, tre_ast_type_t type, size_t size);

tre_ast_node_t *
tre_ast_new_literal(tre_mem_t mem, int code_min, int code_max, int position);
tre_ast_new_literal(tre_mem_t mem, int code_min, int code_max);

tre_ast_node_t *
tre_ast_new_iter(tre_mem_t mem, tre_ast_node_t *arg, int min, int max,
Expand Down
105 changes: 26 additions & 79 deletions lib/tre-compile.c
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ tre_add_tag_left(tre_mem_t mem, tre_ast_node_t *node, int tag_id)
c = tre_mem_alloc(mem, sizeof(*c));
if (c == NULL)
return REG_ESPACE;
c->left = tre_ast_new_literal(mem, TAG, tag_id, -1);
c->left = tre_ast_new_literal(mem, TAG, tag_id);
if (c->left == NULL)
return REG_ESPACE;
c->right = tre_mem_alloc(mem, sizeof(tre_ast_node_t));
Expand Down Expand Up @@ -79,7 +79,7 @@ tre_add_tag_right(tre_mem_t mem, tre_ast_node_t *node, int tag_id)
c = tre_mem_alloc(mem, sizeof(*c));
if (c == NULL)
return REG_ESPACE;
c->right = tre_ast_new_literal(mem, TAG, tag_id, -1);
c->right = tre_ast_new_literal(mem, TAG, tag_id);
if (c->right == NULL)
return REG_ESPACE;
c->left = tre_mem_alloc(mem, sizeof(tre_ast_node_t));
Expand Down Expand Up @@ -713,7 +713,7 @@ tre_copy_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast,
tag_directions[max] = TRE_TAG_MAXIMIZE;
first_tag = 0;
}
*result = tre_ast_new_literal(mem, min, max, pos);
*result = tre_ast_new_literal(mem, min, max);
if (*result == NULL)
status = REG_ESPACE;

Expand Down Expand Up @@ -800,8 +800,7 @@ typedef enum {
iteration count to a catenated sequence of copies of the node. */
static reg_errcode_t
tre_expand_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast,
int *position, tre_tag_direction_t *tag_directions,
int *max_depth)
tre_tag_direction_t *tag_directions, int *max_depth)
{
reg_errcode_t status = REG_OK;
int bottom = tre_stack_num_objects(stack);
Expand Down Expand Up @@ -978,12 +977,12 @@ tre_expand_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast,
tre_ast_node_t *tmp_l, *tmp_r, *tmp_node, *node_copy;
int *old_params;

tmp_l = tre_ast_new_literal(mem, PARAMETER, 0, -1);
tmp_l = tre_ast_new_literal(mem, PARAMETER, 0);
if (!tmp_l)
return REG_ESPACE;
((tre_literal_t *)tmp_l->obj)->u.params = iter->params;
iter->params[TRE_PARAM_DEPTH] = params_depth + 1;
tmp_r = tre_ast_new_literal(mem, PARAMETER, 0, -1);
tmp_r = tre_ast_new_literal(mem, PARAMETER, 0);
if (!tmp_r)
return REG_ESPACE;
old_params = tre_mem_alloc(mem, sizeof(*old_params)
Expand Down Expand Up @@ -1024,19 +1023,9 @@ tre_expand_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast,
}
}

*position += pos_add_total;

/* `max_pos' should never be larger than `*position' if the above
code works, but as an extra safeguard let's make sure
`*position' is set large enough so enough memory will be
allocated for the transition table. */
if (max_pos > *position)
*position = max_pos;

#ifdef TRE_DEBUG
DPRINT(("Expanded AST:\n"));
tre_ast_print(ast);
DPRINT(("*position %d, max_pos %d\n", *position, max_pos));
#endif

return status;
Expand Down Expand Up @@ -1308,10 +1297,11 @@ typedef enum {
} tre_nfl_stack_symbol_t;


/* Computes and fills in the fields `nullable', `firstpos', and `lastpos' for
the nodes of the AST `tree'. */
/* Computes and fills in the fields `position', `nullable', `firstpos',
and `lastpos' for the nodes of the AST `tree'. */
static reg_errcode_t
tre_compute_nfl(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree)
tre_compute_pnfl(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree,
int *nextpos)
{
int bottom = tre_stack_num_objects(stack);

Expand All @@ -1338,6 +1328,7 @@ tre_compute_nfl(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree)
/* Back references: nullable = false, firstpos = {i},
lastpos = {i}. */
assert(node->nullable == 0);
lit->position = (*nextpos)++;
node->firstpos = tre_set_one(mem, lit->position, 0,
TRE_CHAR_MAX, 0, NULL, -1);
if (!node->firstpos)
Expand Down Expand Up @@ -1365,6 +1356,7 @@ tre_compute_nfl(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree)
/* Literal at position i: nullable = false, firstpos = {i},
lastpos = {i}. */
assert(node->nullable == 0);
lit->position = (*nextpos)++;
node->firstpos =
tre_set_one(mem, lit->position, (int)lit->code_min,
(int)lit->code_max, 0, NULL, -1);
Expand Down Expand Up @@ -1832,44 +1824,6 @@ tre_ast_to_tnfa(tre_ast_node_t *node, tre_tnfa_transition_t *transitions,
return errcode;
}

/* Walks the AST rooted at `node' from left to right and hands out
   consecutive position numbers, starting at `*position', to every
   literal whose position is not -1.  `*position' is advanced by one
   for each number handed out.  Literals whose position is -1 are left
   untouched and consume no number. */
static void
tre_reposition_ast(tre_ast_node_t *node, int *position)
{
  for (;;)
    {
      if (node->type == LITERAL)
        {
          tre_literal_t *literal = node->obj;

          if (literal->position != -1)
            {
              literal->position = *position;
              *position += 1;
            }
          return;
        }
      else if (node->type == UNION)
        {
          tre_union_t *alt = node->obj;

          /* Left branch first, then continue with the right branch. */
          tre_reposition_ast(alt->left, position);
          node = alt->right;
        }
      else if (node->type == CATENATION)
        {
          tre_catenation_t *seq = node->obj;

          /* Left operand first, then continue with the right operand. */
          tre_reposition_ast(seq->left, position);
          node = seq->right;
        }
      else if (node->type == ITERATION)
        {
          tre_iteration_t *rep = node->obj;

          node = rep->arg;
        }
      else
        {
          /* Any other node type is left alone. */
          return;
        }
    }
}

#define ERROR_EXIT(err) \
do \
{ \
Expand All @@ -1894,6 +1848,7 @@ tre_compile(regex_t *preg, const tre_char_t *regex, size_t n, int cflags)
tre_tag_direction_t *tag_directions = NULL;
reg_errcode_t errcode;
tre_mem_t mem;
int numpos = 0;

/* Parse context. */
tre_parse_ctx_t parse_ctx;
Expand Down Expand Up @@ -2000,55 +1955,47 @@ tre_compile(regex_t *preg, const tre_char_t *regex, size_t n, int cflags)
}

/* Expand iteration nodes. */
errcode = tre_expand_ast(mem, stack, tree, &parse_ctx.position,
tag_directions, &tnfa->params_depth);
errcode = tre_expand_ast(mem, stack, tree, tag_directions,
&tnfa->params_depth);
if (errcode != REG_OK)
ERROR_EXIT(errcode);

/* XXX recompute all positions */
parse_ctx.position = 0;
tre_reposition_ast(tree, &parse_ctx.position);
#ifdef TRE_DEBUG
DPRINT(("Repositioned AST:\n"));
tre_ast_print(tree);
#endif

/* Add a dummy node for the final state.
XXX - For certain patterns this dummy node can be optimized away,
for example "a*" or "ab*". Figure out a simple way to detect
this possibility. */
tmp_ast_l = tree;
tmp_ast_r = tre_ast_new_literal(mem, 0, 0, parse_ctx.position++);
tmp_ast_r = tre_ast_new_literal(mem, 0, 0);
if (tmp_ast_r == NULL)
ERROR_EXIT(REG_ESPACE);

tree = tre_ast_new_catenation(mem, tmp_ast_l, tmp_ast_r);
if (tree == NULL)
ERROR_EXIT(REG_ESPACE);

errcode = tre_compute_pnfl(mem, stack, tree, &numpos);
if (errcode != REG_OK)
ERROR_EXIT(errcode);

#ifdef TRE_DEBUG
tre_ast_print(tree);
DPRINT(("Number of states: %d\n", parse_ctx.position));
DPRINT(("Number of states: %d\n", numpos));
#endif /* TRE_DEBUG */

errcode = tre_compute_nfl(mem, stack, tree);
if (errcode != REG_OK)
ERROR_EXIT(errcode);

counts = xmalloc(sizeof(int) * parse_ctx.position);
counts = xmalloc(sizeof(int) * numpos);
if (counts == NULL)
ERROR_EXIT(REG_ESPACE);

offs = xmalloc(sizeof(int) * parse_ctx.position);
offs = xmalloc(sizeof(int) * numpos);
if (offs == NULL)
ERROR_EXIT(REG_ESPACE);

for (i = 0; i < parse_ctx.position; i++)
for (i = 0; i < numpos; i++)
counts[i] = 0;
tre_ast_to_tnfa(tree, NULL, counts, NULL);

add = 0;
for (i = 0; i < parse_ctx.position; i++)
for (i = 0; i < numpos; i++)
{
offs[i] = add;
add += counts[i] + 1;
Expand Down Expand Up @@ -2186,7 +2133,7 @@ tre_compile(regex_t *preg, const tre_char_t *regex, size_t n, int cflags)

tnfa->num_transitions = add;
tnfa->final = transitions + offs[tree->lastpos[0].position];
tnfa->num_states = parse_ctx.position;
tnfa->num_states = numpos;
tnfa->cflags = cflags;

DPRINT(("final state %p\n", (void *)tnfa->final));
Expand Down
Loading

0 comments on commit e049fa6

Please sign in to comment.