libregexp: fix ambiguous match selection

echo SYSSYSR1 | sed 's/SYS.+/sysr1/'

was producing SYSsysr1 instead of sysr1: the match was starting at the
second "SYS" rather than at the leftmost possible position.
The bug was introduced during the overflow cleanup earlier this year.

Also bring regexec.c and rregexec.c into sync again.
Also allocate large enough lists (proglen+1 entries, sized from the compiled program) in the regexec2/rregexec2 case.
This commit is contained in:
Russ Cox 2007-12-07 15:32:45 -05:00
parent 6c6117397f
commit a7511dd43d
5 changed files with 117 additions and 113 deletions

View file

@ -23,90 +23,89 @@ _renewmatch(Resub *mp, int ms, Resublist *sp)
} }
/* /*
* Note optimization in _renewthread: * Add ip to the list [lp, elp], but only if it is not there already.
* *lp must be pending when _renewthread called; if *l has been looked * These work lists are stored and processed in increasing
* at already, the optimization is a bug. * order of sp[0], so if the ip is there already, the one that's
* there already is a more left match and takes priority.
*/ */
extern Relist* static Relist*
_renewthread(Relist *lp, /* _relist to add to */ _renewthread1(Relist *lp, /* Relist to add to */
Relist *elp, /* limit pointer for Relist */
Reinst *ip, /* instruction to add */ Reinst *ip, /* instruction to add */
int ms, int ms,
Resublist *sep) /* pointers to subexpressions */ Resublist *sep) /* pointers to subexpressions */
{ {
Relist *p; Relist *p;
for(p=lp; p->inst; p++){ for(p=lp; p->inst; p++)
if(p->inst == ip){ if(p->inst == ip)
if(sep->m[0].s.sp < p->se.m[0].s.sp){
if(ms > 1)
p->se = *sep;
else
p->se.m[0] = sep->m[0];
}
return 0; return 0;
}
} if(p == elp) /* refuse to overflow buffer */
return elp;
p->inst = ip; p->inst = ip;
if(ms > 1) if(ms > 1)
p->se = *sep; p->se = *sep;
else else
p->se.m[0] = sep->m[0]; p->se.m[0] = sep->m[0];
(++p)->inst = 0; (p+1)->inst = 0;
return p; return p;
} }
extern int
_renewthread(Relist *lp, Relist *elp, Reinst *ip, int ms, Resublist *sep)
{
Relist *ap;
ap = _renewthread1(lp, elp, ip, ms, sep);
if(ap == 0)
return 0;
if(ap == elp)
return -1;
/*
* Added ip to list at ap.
* Expand any ORs right now, so that entire
* work list ends up being sorted by increasing m[0].sp.
*/
for(; ap->inst; ap++){
if(ap->inst->type == OR){
if(_renewthread1(lp, elp, ap->inst->u1.right, ms, &ap->se) == elp)
return -1;
if(_renewthread1(lp, elp, ap->inst->u2.next, ms, &ap->se) == elp)
return -1;
}
}
return 0;
}
/* /*
* same as renewthread, but called with * same as renewthread, but called with
* initial empty start pointer. * initial empty start pointer.
*/ */
extern Relist* extern int
_renewemptythread(Relist *lp, /* _relist to add to */ _renewemptythread(Relist *lp, /* _relist to add to */
Relist *elp,
Reinst *ip, /* instruction to add */ Reinst *ip, /* instruction to add */
int ms, int ms,
char *sp) /* pointers to subexpressions */ char *sp) /* pointers to subexpressions */
{ {
Relist *p; Resublist sep;
for(p=lp; p->inst; p++){
if(p->inst == ip){
if(sp < p->se.m[0].s.sp) {
if(ms > 1)
memset(&p->se, 0, sizeof(p->se));
p->se.m[0].s.sp = sp;
}
return 0;
}
}
p->inst = ip;
if(ms > 1) if(ms > 1)
memset(&p->se, 0, sizeof(p->se)); memset(&sep, 0, sizeof sep);
p->se.m[0].s.sp = sp; sep.m[0].s.sp = sp;
(++p)->inst = 0; sep.m[0].e.ep = 0;
return p; return _renewthread(lp, elp, ip, ms, &sep);
} }
extern Relist* extern int
_rrenewemptythread(Relist *lp, /* _relist to add to */ _rrenewemptythread(Relist *lp, /* _relist to add to */
Relist *elp,
Reinst *ip, /* instruction to add */ Reinst *ip, /* instruction to add */
int ms, int ms,
Rune *rsp) /* pointers to subexpressions */ Rune *rsp) /* pointers to subexpressions */
{ {
Relist *p; return _renewemptythread(lp, elp, ip, ms, (char*)rsp);
for(p=lp; p->inst; p++){
if(p->inst == ip){
if(rsp < p->se.m[0].s.rsp) {
if(ms > 1)
memset(&p->se, 0, sizeof(p->se));
p->se.m[0].s.rsp = rsp;
}
return 0;
}
}
p->inst = ip;
if(ms > 1)
memset(&p->se, 0, sizeof(p->se));
p->se.m[0].s.rsp = rsp;
(++p)->inst = 0;
return p;
} }

View file

@ -232,7 +232,7 @@ optimize(Reprog *pp)
int size; int size;
Reprog *npp; Reprog *npp;
Reclass *cl; Reclass *cl;
int diff; int diff, proglen;
/* /*
* get rid of NOOP chains * get rid of NOOP chains
@ -249,10 +249,13 @@ optimize(Reprog *pp)
* necessary. Reallocate to the actual space used * necessary. Reallocate to the actual space used
* and then relocate the code. * and then relocate the code.
*/ */
size = sizeof(Reprog) + (freep - pp->firstinst)*sizeof(Reinst); proglen = freep - pp->firstinst;
size = sizeof(Reprog) + proglen*sizeof(Reinst);
npp = realloc(pp, size); npp = realloc(pp, size);
if(npp==0 || npp==pp) if(npp==0 || npp==pp){
pp->proglen = proglen;
return pp; return pp;
}
diff = (char *)npp - (char *)pp; diff = (char *)npp - (char *)pp;
freep = (Reinst *)((char *)freep + diff); freep = (Reinst *)((char *)freep + diff);
for(inst=npp->firstinst; inst<freep; inst++){ for(inst=npp->firstinst; inst<freep; inst++){
@ -273,6 +276,7 @@ optimize(Reprog *pp)
*(char**)(void*)&inst->u2.left += diff; *(char**)(void*)&inst->u2.left += diff;
} }
*(char**)(void*)&npp->startinst += diff; *(char**)(void*)&npp->startinst += diff;
npp->proglen = proglen;
return npp; return npp;
} }

View file

@ -68,7 +68,7 @@ struct Reljunk
Rune* reol; Rune* reol;
}; };
extern Relist* _renewthread(Relist*, Reinst*, int, Resublist*); extern int _renewthread(Relist*, Relist*, Reinst*, int, Resublist*);
extern void _renewmatch(Resub*, int, Resublist*); extern void _renewmatch(Resub*, int, Resublist*);
extern Relist* _renewemptythread(Relist*, Reinst*, int, char*); extern int _renewemptythread(Relist*, Relist*, Reinst*, int, char*);
extern Relist* _rrenewemptythread(Relist*, Reinst*, int, Rune*); extern int _rrenewemptythread(Relist*, Relist*, Reinst*, int, Rune*);

View file

@ -2,7 +2,6 @@
#include "regexp9.h" #include "regexp9.h"
#include "regcomp.h" #include "regcomp.h"
/* /*
* return 0 if no match * return 0 if no match
* >0 if a match * >0 if a match
@ -13,16 +12,14 @@ regexec1(Reprog *progp, /* program to run */
char *bol, /* string to run machine on */ char *bol, /* string to run machine on */
Resub *mp, /* subexpression elements */ Resub *mp, /* subexpression elements */
int ms, /* number of elements at mp */ int ms, /* number of elements at mp */
Reljunk *j Reljunk *j)
)
{ {
int flag=0; int flag=0;
Reinst *inst; Reinst *inst;
Relist *tlp; Relist *tlp;
char *s; char *s;
int i, checkstart; int i, checkstart, n;
Rune r, *rp, *ep; Rune r, *rp, *ep;
int n;
Relist* tl; /* This list, next list */ Relist* tl; /* This list, next list */
Relist* nl; Relist* nl;
Relist* tle; /* ends of this and next list */ Relist* tle; /* ends of this and next list */
@ -48,7 +45,7 @@ regexec1(Reprog *progp, /* program to run */
switch(j->starttype) { switch(j->starttype) {
case RUNE: case RUNE:
p = utfrune(s, j->startchar); p = utfrune(s, j->startchar);
if(p == 0 || s == j->eol) if(p == 0 || (j->eol && p >= j->eol))
return match; return match;
s = p; s = p;
break; break;
@ -56,7 +53,7 @@ regexec1(Reprog *progp, /* program to run */
if(s == bol) if(s == bol)
break; break;
p = utfrune(s, '\n'); p = utfrune(s, '\n');
if(p == 0 || s == j->eol) if(p == 0 || (j->eol && p+1 >= j->eol))
return match; return match;
s = p+1; s = p+1;
break; break;
@ -77,17 +74,16 @@ regexec1(Reprog *progp, /* program to run */
/* Add first instruction to current list */ /* Add first instruction to current list */
if(match == 0) if(match == 0)
_renewemptythread(tl, progp->startinst, ms, s); _renewemptythread(tl, tle, progp->startinst, ms, s);
/* Execute machine until current list is empty */ /* Execute machine until current list is empty */
for(tlp=tl; tlp->inst; tlp++){ /* assignment = */ for(tlp=tl; tlp->inst; tlp++){
for(inst = tlp->inst; ; inst = inst->u2.next){ for(inst = tlp->inst; ; inst = inst->u2.next){
switch(inst->type){ switch(inst->type){
case RUNE: /* regular character */ case RUNE: /* regular character */
if(inst->u1.r == r){ if(inst->u1.r == r)
if(_renewthread(nl, inst->u2.next, ms, &tlp->se)==nle) if(_renewthread(nl, nle, inst->u2.next, ms, &tlp->se) < 0)
return -1; return -1;
}
break; break;
case LBRA: case LBRA:
tlp->se.m[inst->u1.subid].s.sp = s; tlp->se.m[inst->u1.subid].s.sp = s;
@ -97,11 +93,11 @@ regexec1(Reprog *progp, /* program to run */
continue; continue;
case ANY: case ANY:
if(r != '\n') if(r != '\n')
if(_renewthread(nl, inst->u2.next, ms, &tlp->se)==nle) if(_renewthread(nl, nle, inst->u2.next, ms, &tlp->se) < 0)
return -1; return -1;
break; break;
case ANYNL: case ANYNL:
if(_renewthread(nl, inst->u2.next, ms, &tlp->se)==nle) if(_renewthread(nl, nle, inst->u2.next, ms, &tlp->se) < 0)
return -1; return -1;
break; break;
case BOL: case BOL:
@ -116,7 +112,7 @@ regexec1(Reprog *progp, /* program to run */
ep = inst->u1.cp->end; ep = inst->u1.cp->end;
for(rp = inst->u1.cp->spans; rp < ep; rp += 2) for(rp = inst->u1.cp->spans; rp < ep; rp += 2)
if(r >= rp[0] && r <= rp[1]){ if(r >= rp[0] && r <= rp[1]){
if(_renewthread(nl, inst->u2.next, ms, &tlp->se)==nle) if(_renewthread(nl, nle, inst->u2.next, ms, &tlp->se) < 0)
return -1; return -1;
break; break;
} }
@ -127,15 +123,12 @@ regexec1(Reprog *progp, /* program to run */
if(r >= rp[0] && r <= rp[1]) if(r >= rp[0] && r <= rp[1])
break; break;
if(rp == ep) if(rp == ep)
if(_renewthread(nl, inst->u2.next, ms, &tlp->se)==nle) if(_renewthread(nl, nle, inst->u2.next, ms, &tlp->se) < 0)
return -1; return -1;
break; break;
case OR: case OR:
/* evaluate right choice later */ /* expanded during renewthread; just a place holder */
if(_renewthread(tl, inst->u1.right, ms, &tlp->se) == tle) break;
return -1;
/* efficiency: advance and re-evaluate */
continue;
case END: /* Match! */ case END: /* Match! */
match = 1; match = 1;
tlp->se.m[0].e.ep = s; tlp->se.m[0].e.ep = s;
@ -165,19 +158,18 @@ regexec2(Reprog *progp, /* program to run */
int rv; int rv;
Relist *relist0, *relist1; Relist *relist0, *relist1;
/* mark space */ relist0 = malloc((progp->proglen+1)*sizeof(Relist));
relist0 = malloc(BIGLISTSIZE*sizeof(Relist));
if(relist0 == nil) if(relist0 == nil)
return -1; return -1;
relist1 = malloc(BIGLISTSIZE*sizeof(Relist)); relist1 = malloc((progp->proglen+1)*sizeof(Relist));
if(relist1 == nil){ if(relist1 == nil){
free(relist1); free(relist1);
return -1; return -1;
} }
j->relist[0] = relist0; j->relist[0] = relist0;
j->relist[1] = relist1; j->relist[1] = relist1;
j->reliste[0] = relist0 + BIGLISTSIZE - 2; j->reliste[0] = relist0 + progp->proglen;
j->reliste[1] = relist1 + BIGLISTSIZE - 2; j->reliste[1] = relist1 + progp->proglen;
rv = regexec1(progp, bol, mp, ms, j); rv = regexec1(progp, bol, mp, ms, j);
free(relist0); free(relist0);
@ -218,8 +210,8 @@ regexec(Reprog *progp, /* program to run */
/* mark space */ /* mark space */
j.relist[0] = relist0; j.relist[0] = relist0;
j.relist[1] = relist1; j.relist[1] = relist1;
j.reliste[0] = relist0 + nelem(relist0) - 2; j.reliste[0] = relist0 + nelem(relist0) - 1;
j.reliste[1] = relist1 + nelem(relist1) - 2; j.reliste[1] = relist1 + nelem(relist1) - 1;
rv = regexec1(progp, bol, mp, ms, &j); rv = regexec1(progp, bol, mp, ms, &j);
if(rv >= 0) if(rv >= 0)

View file

@ -9,9 +9,9 @@
*/ */
static int static int
rregexec1(Reprog *progp, /* program to run */ rregexec1(Reprog *progp, /* program to run */
Rune *bol, /* string to run machine on */ Rune *bol, /* string to run machine on */
Resub *mp, /* subexpression elements */ Resub *mp, /* subexpression elements */
int ms, /* number of elements at mp */ int ms, /* number of elements at mp */
Reljunk *j) Reljunk *j)
{ {
int flag=0; int flag=0;
@ -28,7 +28,7 @@ rregexec1(Reprog *progp, /* program to run */
Rune *p; Rune *p;
match = 0; match = 0;
checkstart = j->startchar; checkstart = j->starttype;
if(mp) if(mp)
for(i=0; i<ms; i++) { for(i=0; i<ms; i++) {
mp[i].s.rsp = 0; mp[i].s.rsp = 0;
@ -46,7 +46,7 @@ rregexec1(Reprog *progp, /* program to run */
switch(j->starttype) { switch(j->starttype) {
case RUNE: case RUNE:
p = runestrchr(s, j->startchar); p = runestrchr(s, j->startchar);
if(p == 0 || p == j->reol) if(p == 0 || (j->reol && p >= j->reol))
return match; return match;
s = p; s = p;
break; break;
@ -54,7 +54,7 @@ rregexec1(Reprog *progp, /* program to run */
if(s == bol) if(s == bol)
break; break;
p = runestrchr(s, '\n'); p = runestrchr(s, '\n');
if(p == 0 || s == j->reol) if(p == 0 || (j->reol && p+1 >= j->reol))
return match; return match;
s = p+1; s = p+1;
break; break;
@ -71,15 +71,16 @@ rregexec1(Reprog *progp, /* program to run */
nl->inst = 0; nl->inst = 0;
/* Add first instruction to current list */ /* Add first instruction to current list */
_rrenewemptythread(tl, progp->startinst, ms, s); if(match == 0)
_rrenewemptythread(tl, tle, progp->startinst, ms, s);
/* Execute machine until current list is empty */ /* Execute machine until current list is empty */
for(tlp=tl; tlp->inst; tlp++){ for(tlp=tl; tlp->inst; tlp++){
for(inst=tlp->inst; ; inst = inst->u2.next){ for(inst = tlp->inst; ; inst = inst->u2.next){
switch(inst->type){ switch(inst->type){
case RUNE: /* regular character */ case RUNE: /* regular character */
if(inst->u1.r == r) if(inst->u1.r == r)
if(_renewthread(nl, inst->u2.next, ms, &tlp->se)==nle) if(_renewthread(nl, nle, inst->u2.next, ms, &tlp->se) < 0)
return -1; return -1;
break; break;
case LBRA: case LBRA:
@ -90,11 +91,11 @@ rregexec1(Reprog *progp, /* program to run */
continue; continue;
case ANY: case ANY:
if(r != '\n') if(r != '\n')
if(_renewthread(nl, inst->u2.next, ms, &tlp->se)==nle) if(_renewthread(nl, nle, inst->u2.next, ms, &tlp->se) < 0)
return -1; return -1;
break; break;
case ANYNL: case ANYNL:
if(_renewthread(nl, inst->u2.next, ms, &tlp->se)==nle) if(_renewthread(nl, nle, inst->u2.next, ms, &tlp->se) < 0)
return -1; return -1;
break; break;
case BOL: case BOL:
@ -109,7 +110,7 @@ rregexec1(Reprog *progp, /* program to run */
ep = inst->u1.cp->end; ep = inst->u1.cp->end;
for(rp = inst->u1.cp->spans; rp < ep; rp += 2) for(rp = inst->u1.cp->spans; rp < ep; rp += 2)
if(r >= rp[0] && r <= rp[1]){ if(r >= rp[0] && r <= rp[1]){
if(_renewthread(nl, inst->u2.next, ms, &tlp->se)==nle) if(_renewthread(nl, nle, inst->u2.next, ms, &tlp->se) < 0)
return -1; return -1;
break; break;
} }
@ -120,15 +121,12 @@ rregexec1(Reprog *progp, /* program to run */
if(r >= rp[0] && r <= rp[1]) if(r >= rp[0] && r <= rp[1])
break; break;
if(rp == ep) if(rp == ep)
if(_renewthread(nl, inst->u2.next, ms, &tlp->se)==nle) if(_renewthread(nl, nle, inst->u2.next, ms, &tlp->se) < 0)
return -1; return -1;
break; break;
case OR: case OR:
/* evaluate right choice later */ /* expanded during renewthread; just a place holder */
if(_renewthread(tl, inst->u1.right, ms, &tlp->se) == tle) break;
return -1;
/* efficiency: advance and re-evaluate */
continue;
case END: /* Match! */ case END: /* Match! */
match = 1; match = 1;
tlp->se.m[0].e.rep = s; tlp->se.m[0].e.rep = s;
@ -141,7 +139,7 @@ rregexec1(Reprog *progp, /* program to run */
} }
if(s == j->reol) if(s == j->reol)
break; break;
checkstart = j->startchar && nl->inst==0; checkstart = j->starttype && nl->inst==0;
s++; s++;
}while(r); }while(r);
return match; return match;
@ -155,15 +153,26 @@ rregexec2(Reprog *progp, /* program to run */
Reljunk *j Reljunk *j
) )
{ {
Relist relist0[5*LISTSIZE], relist1[5*LISTSIZE]; int rv;
Relist *relist0, *relist1;
/* mark space */ relist0 = malloc((progp->proglen+1)*sizeof(Relist));
if(relist0 == nil)
return -1;
relist1 = malloc((progp->proglen+1)*sizeof(Relist));
if(relist1 == nil){
free(relist1);
return -1;
}
j->relist[0] = relist0; j->relist[0] = relist0;
j->relist[1] = relist1; j->relist[1] = relist1;
j->reliste[0] = relist0 + nelem(relist0) - 2; j->reliste[0] = relist0 + progp->proglen;
j->reliste[1] = relist1 + nelem(relist1) - 2; j->reliste[1] = relist1 + progp->proglen;
return rregexec1(progp, bol, mp, ms, j); rv = rregexec1(progp, bol, mp, ms, j);
free(relist0);
free(relist1);
return rv;
} }
extern int extern int
@ -199,8 +208,8 @@ rregexec(Reprog *progp, /* program to run */
/* mark space */ /* mark space */
j.relist[0] = relist0; j.relist[0] = relist0;
j.relist[1] = relist1; j.relist[1] = relist1;
j.reliste[0] = relist0 + nelem(relist0) - 2; j.reliste[0] = relist0 + nelem(relist0) - 1;
j.reliste[1] = relist1 + nelem(relist1) - 2; j.reliste[1] = relist1 + nelem(relist1) - 1;
rv = rregexec1(progp, bol, mp, ms, &j); rv = rregexec1(progp, bol, mp, ms, &j);
if(rv >= 0) if(rv >= 0)