libregexp: simplify regular expression vm implementation

Make the logic around which thread has priority over the
final match simpler by merging the priority, generation,
and match fields in a smarter way. Move the creation of
new threads up to the top of the main loop to avoid
jumping all over the place.
This commit is contained in:
spew 2017-02-02 21:21:34 -06:00
parent f94167ebee
commit 9ae083d816
4 changed files with 145 additions and 175 deletions

View file

@ -156,17 +156,6 @@ initplex(Parselex *plex, char *regstr, int lit)
return plex; return plex;
} }
static int
maxthreads(Renode *tree)
{
tree = tree->left;
if(tree->op == TCAT)
tree = tree->left;
if(tree->op == TBOL)
return 2;
return -1;
}
static Reprog* static Reprog*
regcomp1(char *regstr, int nl, int lit) regcomp1(char *regstr, int nl, int lit)
{ {
@ -187,7 +176,7 @@ regcomp1(char *regstr, int nl, int lit)
return nil; return nil;
} }
maxthr = regstrlen; maxthr = regstrlen + 1;
parsetr = node(&plex, TSUB, e0(&plex), nil); parsetr = node(&plex, TSUB, e0(&plex), nil);
// prtree(parsetr, 0, 1); // prtree(parsetr, 0, 1);
@ -304,12 +293,13 @@ Tailcall:
static Reinst* static Reinst*
compile(Renode *parsetr, Reprog *reprog, int nl) compile(Renode *parsetr, Reprog *reprog, int nl)
{ {
Reinst *reinst; Reinst *reinst, *end;
int sub; int sub;
sub = 0; sub = 0;
reinst = (Reinst*)(reprog+1); reinst = (Reinst*)(reprog+1);
compile1(parsetr, reinst, &sub, nl); end = compile1(parsetr, reinst, &sub, nl);
assert(reinst + reprog->len == end);
return reinst; return reinst;
} }

View file

@ -4,30 +4,31 @@
#include "regimpl.h" #include "regimpl.h"
typedef struct RethreadQ RethreadQ; typedef struct RethreadQ RethreadQ;
struct RethreadQ struct RethreadQ {
{
Rethread *head; Rethread *head;
Rethread **tail; Rethread **tail;
}; };
int int
regexec(Reprog *prog, char *str, Resub *sem, int msize) regexec(Reprog *p, char *str, Resub *sem, int msize)
{ {
RethreadQ lists[2], *clist, *nlist, *tmp; RethreadQ lists[2], *clist, *nlist, *tmp;
Rethread *t, *next, *pooltop, *avail; Rethread *t, *next, *pool, *avail;
Reinst *curinst; Reinst *ci;
Rune r; Rune r;
char *sp, *ep, endc; char *sp, *ep, endc;
int i, match, first, gen, matchpri, pri; int i, matchgen, gen;
if(msize > NSUBEXPM) if(msize > NSUBEXPM)
msize = NSUBEXPM; msize = NSUBEXPM;
if(prog->startinst->gen != 0) { if(p->startinst->gen != 0) {
for(curinst = prog->startinst; curinst < prog->startinst + prog->len; curinst++) for(ci = p->startinst; ci < p->startinst + p->len; ci++)
curinst->gen = 0; ci->gen = 0;
} }
memset(p->threads, 0, sizeof(Rethread)*p->nthr);
clist = lists; clist = lists;
clist->head = nil; clist->head = nil;
clist->tail = &clist->head; clist->tail = &clist->head;
@ -35,10 +36,10 @@ regexec(Reprog *prog, char *str, Resub *sem, int msize)
nlist->head = nil; nlist->head = nil;
nlist->tail = &nlist->head; nlist->tail = &nlist->head;
pooltop = prog->threads + prog->nthr; pool = p->threads;
avail = nil; avail = nil;
pri = matchpri = gen = match = 0; gen = matchgen = 0;
sp = str; sp = str;
ep = nil; ep = nil;
endc = '\0'; endc = '\0';
@ -51,109 +52,120 @@ regexec(Reprog *prog, char *str, Resub *sem, int msize)
*sem->ep = '\0'; *sem->ep = '\0';
} }
} }
r = Runemax + 1; for(r = 1; r != L'\0'; sp += i) {
for(; r != L'\0'; sp += i) {
gen++;
i = chartorune(&r, sp); i = chartorune(&r, sp);
first = 1; gen++;
if(matchgen == 0) {
if(avail == nil) {
assert(pool < p->threads + p->nthr);
t = pool++;
} else {
t = avail;
avail = avail->next;
}
t->i = p->startinst;
if(msize > 0)
memset(t->sem, 0, sizeof(Resub)*msize);
t->next = nil;
t->gen = gen;
*clist->tail = t;
clist->tail = &t->next;
}
t = clist->head; t = clist->head;
if(t == nil) if(t == nil)
goto Start; break;
curinst = t->pc; ci = t->i;
Again: Again:
if(curinst->gen == gen) if(ci->gen == gen || matchgen && t->gen > matchgen)
goto Done; goto Done;
curinst->gen = gen; ci->gen = gen;
switch(curinst->op) { switch(ci->op) {
case ORUNE: case ORUNE:
if(r != curinst->r) if(r != ci->r)
goto Done; goto Done;
case OANY: /* fallthrough */ case OANY: /* fallthrough */
next = t->next; next = t->next;
t->pc = curinst + 1; t->i = ci + 1;
t->next = nil; t->next = nil;
*nlist->tail = t; *nlist->tail = t;
nlist->tail = &t->next; nlist->tail = &t->next;
if(next == nil) if(next == nil)
break; break;
t = next; t = next;
curinst = t->pc; ci = t->i;
goto Again; goto Again;
case OCLASS: case OCLASS:
Class: Class:
if(r < curinst->r) if(r < ci->r)
goto Done; goto Done;
if(r > curinst->r1) { if(r > ci->r1) {
curinst++; ci++;
goto Class; goto Class;
} }
next = t->next; next = t->next;
t->pc = curinst->a; t->i = ci->a;
t->next = nil; t->next = nil;
*nlist->tail = t; *nlist->tail = t;
nlist->tail = &t->next; nlist->tail = &t->next;
if(next == nil) if(next == nil)
break; break;
t = next; t = next;
curinst = t->pc; ci = t->i;
goto Again; goto Again;
case ONOTNL: case ONOTNL:
if(r != L'\n') { if(r != L'\n') {
curinst++; ci++;
goto Again; goto Again;
} }
goto Done; goto Done;
case OBOL: case OBOL:
if(sp == str || sp[-1] == '\n') { if(sp == str || sp[-1] == '\n') {
curinst++; ci++;
goto Again; goto Again;
} }
goto Done; goto Done;
case OEOL: case OEOL:
if(r == L'\n' || r == L'\0' && ep == nil) { if(r == L'\n' || r == L'\0' && ep == nil) {
curinst++; ci++;
goto Again; goto Again;
} }
goto Done; goto Done;
case OJMP: case OJMP:
curinst = curinst->a; ci = ci->a;
goto Again; goto Again;
case OSPLIT: case OSPLIT:
if(avail == nil) if(avail == nil) {
next = --pooltop; assert(pool < p->threads + p->nthr);
else { next = pool++;
} else {
next = avail; next = avail;
avail = avail->next; avail = avail->next;
} }
next->pc = curinst->b; next->i = ci->b;
if(msize > 0) if(msize > 0)
memcpy(next->sem, t->sem, sizeof(Resub)*msize); memcpy(next->sem, t->sem, sizeof(Resub)*msize);
next->pri = t->pri;
next->next = t->next; next->next = t->next;
next->gen = t->gen;
t->next = next; t->next = next;
curinst = curinst->a; ci = ci->a;
goto Again; goto Again;
case OSAVE: case OSAVE:
if(curinst->sub < msize) if(ci->sub < msize)
t->sem[curinst->sub].sp = sp; t->sem[ci->sub].sp = sp;
curinst++; ci++;
goto Again; goto Again;
case OUNSAVE: case OUNSAVE:
if(curinst->sub == 0) { if(ci->sub == 0) {
/* "Highest" priority is the left-most longest. */ matchgen = t->gen;
if (t->pri > matchpri)
goto Done;
match = 1;
matchpri = t->pri;
if(sem != nil && msize > 0) { if(sem != nil && msize > 0) {
memcpy(sem, t->sem, sizeof(Resub)*msize); memcpy(sem, t->sem, sizeof(Resub)*msize);
sem->ep = sp; sem->ep = sp;
} }
goto Done; goto Done;
} }
if(curinst->sub < msize) if(ci->sub < msize)
t->sem[curinst->sub].ep = sp; t->sem[ci->sub].ep = sp;
curinst++; ci++;
goto Again; goto Again;
Done: Done:
next = t->next; next = t->next;
@ -162,30 +174,9 @@ Again:
if(next == nil) if(next == nil)
break; break;
t = next; t = next;
curinst = t->pc; ci = t->i;
goto Again; goto Again;
} }
Start:
/* Start again once if we haven't found anything. */
if(first == 1 && match == 0) {
first = 0;
if(avail == nil)
t = --pooltop;
else {
t = avail;
avail = avail->next;
}
if(msize > 0)
memset(t->sem, 0, sizeof(Resub)*msize);
/* "Lower" priority thread */
t->pri = matchpri = pri++;
t->next = nil;
curinst = prog->startinst;
goto Again;
}
/* If we have a match and no extant threads, we are done. */
if(match == 1 && nlist->head == nil)
break;
tmp = clist; tmp = clist;
clist = nlist; clist = nlist;
nlist = tmp; nlist = tmp;
@ -194,5 +185,5 @@ Start:
} }
if(ep != nil) if(ep != nil)
*ep = endc; *ep = endc;
return match; return matchgen > 0 ? 1 : 0;
} }

View file

@ -1,5 +1,4 @@
enum enum {
{
LANY = 0, LANY = 0,
LBOL, LBOL,
LCLASS, LCLASS,
@ -30,8 +29,7 @@ enum
typedef struct Parselex Parselex; typedef struct Parselex Parselex;
typedef struct Renode Renode; typedef struct Renode Renode;
struct Parselex struct Parselex {
{
/* Parse */ /* Parse */
Renode *next; Renode *next;
Renode *nodes; Renode *nodes;
@ -50,8 +48,8 @@ struct Parselex
Rune cpairs[400+2]; Rune cpairs[400+2];
int nc; int nc;
}; };
struct Renode
{ struct Renode {
int op; int op;
Renode *left; Renode *left;
Rune r; Rune r;
@ -63,15 +61,15 @@ struct Renode
}; };
int nclass; int nclass;
}; };
struct Rethread
{ struct Rethread {
Reinst *pc; Reinst *i;
Resub sem[NSUBEXPM]; Resub sem[NSUBEXPM];
int pri;
Rethread *next; Rethread *next;
int gen;
}; };
struct Reinst
{ struct Reinst {
char op; char op;
int gen; int gen;
Reinst *a; Reinst *a;

View file

@ -4,29 +4,30 @@
#include "regimpl.h" #include "regimpl.h"
typedef struct RethreadQ RethreadQ; typedef struct RethreadQ RethreadQ;
struct RethreadQ struct RethreadQ {
{
Rethread *head; Rethread *head;
Rethread **tail; Rethread **tail;
}; };
int int
rregexec(Reprog *prog, Rune *str, Resub *sem, int msize) rregexec(Reprog *p, Rune *str, Resub *sem, int msize)
{ {
RethreadQ lists[2], *clist, *nlist, *tmp; RethreadQ lists[2], *clist, *nlist, *tmp;
Rethread *t, *next, *pooltop, *avail; Rethread *t, *next, *pool, *avail;
Reinst *curinst; Reinst *ci;
Rune *rsp, *rep, endr, last; Rune *rsp, *rep, endr, r;
int match, first, gen, pri, matchpri; int matchgen, gen;
if(msize > NSUBEXPM) if(msize > NSUBEXPM)
msize = NSUBEXPM; msize = NSUBEXPM;
if(prog->startinst->gen != 0) { if(p->startinst->gen != 0) {
for(curinst = prog->startinst; curinst < prog->startinst + prog->len; curinst++) for(ci = p->startinst; ci < p->startinst + p->len; ci++)
curinst->gen = 0; ci->gen = 0;
} }
memset(p->threads, 0, sizeof(Rethread)*p->nthr);
clist = lists; clist = lists;
clist->head = nil; clist->head = nil;
clist->tail = &clist->head; clist->tail = &clist->head;
@ -34,10 +35,10 @@ rregexec(Reprog *prog, Rune *str, Resub *sem, int msize)
nlist->head = nil; nlist->head = nil;
nlist->tail = &nlist->head; nlist->tail = &nlist->head;
pooltop = prog->threads + prog->nthr; pool = p->threads;
avail = nil; avail = nil;
pri = matchpri = gen = match = 0; gen = matchgen = 0;
rsp = str; rsp = str;
rep = nil; rep = nil;
endr = L'\0'; endr = L'\0';
@ -50,109 +51,120 @@ rregexec(Reprog *prog, Rune *str, Resub *sem, int msize)
*sem->rep = '\0'; *sem->rep = '\0';
} }
} }
last = 1; for(r = 1; r != L'\0'; rsp++) {
for(; last != L'\0'; rsp++) { r = *rsp;
gen++; gen++;
last = *rsp; if(matchgen == 0) {
first = 1; if(avail == nil) {
assert(pool < p->threads + p->nthr);
t = pool++;
} else {
t = avail;
avail = avail->next;
}
t->i = p->startinst;
if(msize > 0)
memset(t->sem, 0, sizeof(Resub)*msize);
t->next = nil;
t->gen = gen;
*clist->tail = t;
clist->tail = &t->next;
}
t = clist->head; t = clist->head;
if(t == nil) if(t == nil)
goto Start; break;
curinst = t->pc; ci = t->i;
Again: Again:
if(curinst->gen == gen) if(ci->gen == gen || matchgen && t->gen > matchgen)
goto Done; goto Done;
curinst->gen = gen; ci->gen = gen;
switch(curinst->op) { switch(ci->op) {
case ORUNE: case ORUNE:
if(*rsp != curinst->r) if(r != ci->r)
goto Done; goto Done;
case OANY: /* fallthrough */ case OANY: /* fallthrough */
next = t->next; next = t->next;
t->pc = curinst + 1; t->i = ci + 1;
t->next = nil; t->next = nil;
*nlist->tail = t; *nlist->tail = t;
nlist->tail = &t->next; nlist->tail = &t->next;
if(next == nil) if(next == nil)
break; break;
t = next; t = next;
curinst = t->pc; ci = t->i;
goto Again; goto Again;
case OCLASS: case OCLASS:
Class: Class:
if(*rsp < curinst->r) if(r < ci->r)
goto Done; goto Done;
if(*rsp > curinst->r1) { if(r > ci->r1) {
curinst++; ci++;
goto Class; goto Class;
} }
next = t->next; next = t->next;
t->pc = curinst->a; t->i = ci->a;
t->next = nil; t->next = nil;
*nlist->tail = t; *nlist->tail = t;
nlist->tail = &t->next; nlist->tail = &t->next;
if(next == nil) if(next == nil)
break; break;
t = next; t = next;
curinst = t->pc; ci = t->i;
goto Again; goto Again;
case ONOTNL: case ONOTNL:
if(*rsp != L'\n') { if(r != L'\n') {
curinst++; ci++;
goto Again; goto Again;
} }
goto Done; goto Done;
case OBOL: case OBOL:
if(rsp == str || rsp[-1] == L'\n') { if(rsp == str || rsp[-1] == L'\n') {
curinst++; ci++;
goto Again; goto Again;
} }
goto Done; goto Done;
case OEOL: case OEOL:
if(*rsp == '\n' || *rsp == L'\0' && rep == nil) { if(r == L'\n' || r == L'\0' && rep == nil) {
curinst++; ci++;
goto Again; goto Again;
} }
goto Done; goto Done;
case OJMP: case OJMP:
curinst = curinst->a; ci = ci->a;
goto Again; goto Again;
case OSPLIT: case OSPLIT:
if(avail == nil) if(avail == nil) {
next = --pooltop; assert(pool < p->threads + p->nthr);
else { next = pool++;
} else {
next = avail; next = avail;
avail = avail->next; avail = avail->next;
} }
next->pc = curinst->b; next->i = ci->b;
if(msize > 0) if(msize > 0)
memcpy(next->sem, t->sem, sizeof(Resub)*msize); memcpy(next->sem, t->sem, sizeof(Resub)*msize);
next->pri = t->pri;
next->next = t->next; next->next = t->next;
next->gen = t->gen;
t->next = next; t->next = next;
curinst = curinst->a; ci = ci->a;
goto Again; goto Again;
case OSAVE: case OSAVE:
if(curinst->sub < msize) if(ci->sub < msize)
t->sem[curinst->sub].rsp = rsp; t->sem[ci->sub].rsp = rsp;
curinst++; ci++;
goto Again; goto Again;
case OUNSAVE: case OUNSAVE:
if(curinst->sub == 0) { if(ci->sub == 0) {
/* "Highest" priority is the left-most longest. */ matchgen = t->gen;
if (t->pri > matchpri)
goto Done;
match = 1;
matchpri = t->pri;
if(sem != nil && msize > 0) { if(sem != nil && msize > 0) {
memcpy(sem, t->sem, sizeof(Resub)*msize); memcpy(sem, t->sem, sizeof(Resub)*msize);
sem->rep = rsp; sem->rep = rsp;
} }
goto Done; goto Done;
} }
if(curinst->sub < msize) if(ci->sub < msize)
t->sem[curinst->sub].rep = rsp; t->sem[ci->sub].rep = rsp;
curinst++; ci++;
goto Again; goto Again;
Done: Done:
next = t->next; next = t->next;
@ -161,30 +173,9 @@ Again:
if(next == nil) if(next == nil)
break; break;
t = next; t = next;
curinst = t->pc; ci = t->i;
goto Again; goto Again;
} }
Start:
/* Start again once if we haven't found anything. */
if(first == 1 && match == 0) {
first = 0;
if(avail == nil)
t = --pooltop;
else {
t = avail;
avail = avail->next;
}
if(msize > 0)
memset(t->sem, 0, sizeof(Resub)*msize);
/* "Lower" priority thread */
t->pri = matchpri = pri++;
t->next = nil;
curinst = prog->startinst;
goto Again;
}
/* If we have a match and no extant threads, we are done. */
if(match == 1 && nlist->head == nil)
break;
tmp = clist; tmp = clist;
clist = nlist; clist = nlist;
nlist = tmp; nlist = tmp;
@ -193,5 +184,5 @@ Start:
} }
if(rep != nil) if(rep != nil)
*rep = endr; *rep = endr;
return match; return matchgen > 0 ? 1 : 0;
} }