Annotation of embedaddon/php/ext/ereg/regex/engine.c, revision 1.1
1.1 ! misho 1: /*
! 2: * The matching engine and friends. This file is #included by regexec.c
! 3: * after suitable #defines of a variety of macros used herein, so that
! 4: * different state representations can be used without duplicating masses
! 5: * of code.
! 6: */
! 7:
! 8: #ifdef SNAMES
! 9: #define matcher smatcher
! 10: #define fast sfast
! 11: #define slow sslow
! 12: #define dissect sdissect
! 13: #define backref sbackref
! 14: #define step sstep
! 15: #define print sprint
! 16: #define at sat
! 17: #define match smat
! 18: #endif
! 19: #ifdef LNAMES
! 20: #define matcher lmatcher
! 21: #define fast lfast
! 22: #define slow lslow
! 23: #define dissect ldissect
! 24: #define backref lbackref
! 25: #define step lstep
! 26: #define print lprint
! 27: #define at lat
! 28: #define match lmat
! 29: #endif
! 30:
! 31: /* another structure passed up and down to avoid zillions of parameters */
! 32: struct match {
! 33: struct re_guts *g;
! 34: int eflags;
! 35: regmatch_t *pmatch; /* [nsub+1] (0 element unused) */
! 36: unsigned char *offp; /* offsets work from here */
! 37: unsigned char *beginp; /* start of string -- virtual NUL precedes */
! 38: unsigned char *endp; /* end of string -- virtual NUL here */
! 39: unsigned char *coldp; /* can be no match starting before here */
! 40: unsigned char **lastpos; /* [nplus+1] */
! 41: STATEVARS;
! 42: states st; /* current states */
! 43: states fresh; /* states for a fresh start */
! 44: states tmp; /* temporary */
! 45: states empty; /* empty set of states */
! 46: };
! 47:
! 48: #include "engine.ih"
! 49:
! 50: #ifdef REDEBUG
! 51: #define SP(t, s, c) print(m, t, s, c, stdout)
! 52: #define AT(t, p1, p2, s1, s2) at(m, t, p1, p2, s1, s2)
! 53: #define NOTE(str) { if (m->eflags®_TRACE) printf("=%s\n", (str)); }
! 54: #else
! 55: #define SP(t, s, c) /* nothing */
! 56: #define AT(t, p1, p2, s1, s2) /* nothing */
! 57: #define NOTE(s) /* nothing */
! 58: #endif
! 59:
! 60: /*
! 61: - matcher - the actual matching engine
! 62: == static int matcher(register struct re_guts *g, char *string, \
! 63: == size_t nmatch, regmatch_t pmatch[], int eflags);
! 64: */
! 65: static int /* 0 success, REG_NOMATCH failure */
! 66: matcher(g, string, nmatch, pmatch, eflags)
! 67: register struct re_guts *g;
! 68: unsigned char *string;
! 69: size_t nmatch;
! 70: regmatch_t pmatch[];
! 71: int eflags;
! 72: {
! 73: register unsigned char *endp;
! 74: register size_t i;
! 75: struct match mv;
! 76: register struct match *m = &mv;
! 77: register unsigned char *dp;
! 78: const register sopno gf = g->firststate+1; /* +1 for OEND */
! 79: const register sopno gl = g->laststate;
! 80: unsigned char *start;
! 81: unsigned char *stop;
! 82:
! 83: /* simplify the situation where possible */
! 84: if (g->cflags®_NOSUB)
! 85: nmatch = 0;
! 86: if (eflags®_STARTEND) {
! 87: start = string + pmatch[0].rm_so;
! 88: stop = string + pmatch[0].rm_eo;
! 89: } else {
! 90: start = string;
! 91: stop = start + strlen(start);
! 92: }
! 93: if (stop < start)
! 94: return(REG_INVARG);
! 95:
! 96: /* prescreening; this does wonders for this rather slow code */
! 97: if (g->must != NULL) {
! 98: for (dp = start; dp < stop; dp++)
! 99: if (*dp == g->must[0] && stop - dp >= g->mlen &&
! 100: memcmp(dp, g->must, (size_t)g->mlen) == 0)
! 101: break;
! 102: if (dp == stop) /* we didn't find g->must */
! 103: return(REG_NOMATCH);
! 104: }
! 105:
! 106: /* match struct setup */
! 107: m->g = g;
! 108: m->eflags = eflags;
! 109: m->pmatch = NULL;
! 110: m->lastpos = NULL;
! 111: m->offp = string;
! 112: m->beginp = start;
! 113: m->endp = stop;
! 114: STATESETUP(m, 4);
! 115: SETUP(m->st);
! 116: SETUP(m->fresh);
! 117: SETUP(m->tmp);
! 118: SETUP(m->empty);
! 119: CLEAR(m->empty);
! 120:
! 121: /* this loop does only one repetition except for backrefs */
! 122: for (;;) {
! 123: endp = fast(m, start, stop, gf, gl);
! 124: if (endp == NULL) { /* a miss */
! 125: STATETEARDOWN(m);
! 126: return(REG_NOMATCH);
! 127: }
! 128: if (nmatch == 0 && !g->backrefs)
! 129: break; /* no further info needed */
! 130:
! 131: /* where? */
! 132: assert(m->coldp != NULL);
! 133: for (;;) {
! 134: NOTE("finding start");
! 135: endp = slow(m, m->coldp, stop, gf, gl);
! 136: if (endp != NULL)
! 137: break;
! 138: assert(m->coldp < m->endp);
! 139: m->coldp++;
! 140: }
! 141: if (nmatch == 1 && !g->backrefs)
! 142: break; /* no further info needed */
! 143:
! 144: /* oh my, he wants the subexpressions... */
! 145: if (m->pmatch == NULL)
! 146: m->pmatch = (regmatch_t *)malloc((m->g->nsub + 1) *
! 147: sizeof(regmatch_t));
! 148: if (m->pmatch == NULL) {
! 149: STATETEARDOWN(m);
! 150: return(REG_ESPACE);
! 151: }
! 152: for (i = 1; i <= m->g->nsub; i++)
! 153: m->pmatch[i].rm_so = m->pmatch[i].rm_eo = -1;
! 154: if (!g->backrefs && !(m->eflags®_BACKR)) {
! 155: NOTE("dissecting");
! 156: dp = dissect(m, m->coldp, endp, gf, gl);
! 157: } else {
! 158: if (g->nplus > 0 && m->lastpos == NULL)
! 159: m->lastpos = (unsigned char **)malloc((g->nplus+1) *
! 160: sizeof(unsigned char *));
! 161: if (g->nplus > 0 && m->lastpos == NULL) {
! 162: free((char *)m->pmatch);
! 163: STATETEARDOWN(m);
! 164: return(REG_ESPACE);
! 165: }
! 166: NOTE("backref dissect");
! 167: dp = backref(m, m->coldp, endp, gf, gl, (sopno)0);
! 168: }
! 169: if (dp != NULL)
! 170: break;
! 171:
! 172: /* uh-oh... we couldn't find a subexpression-level match */
! 173: assert(g->backrefs); /* must be back references doing it */
! 174: assert(g->nplus == 0 || m->lastpos != NULL);
! 175: for (;;) {
! 176: if (dp != NULL || endp <= m->coldp)
! 177: break; /* defeat */
! 178: NOTE("backoff");
! 179: endp = slow(m, m->coldp, endp-1, gf, gl);
! 180: if (endp == NULL)
! 181: break; /* defeat */
! 182: /* try it on a shorter possibility */
! 183: #ifndef NDEBUG
! 184: for (i = 1; i <= m->g->nsub; i++) {
! 185: assert(m->pmatch[i].rm_so == -1);
! 186: assert(m->pmatch[i].rm_eo == -1);
! 187: }
! 188: #endif
! 189: NOTE("backoff dissect");
! 190: dp = backref(m, m->coldp, endp, gf, gl, (sopno)0);
! 191: }
! 192: assert(dp == NULL || dp == endp);
! 193: if (dp != NULL) /* found a shorter one */
! 194: break;
! 195:
! 196: /* despite initial appearances, there is no match here */
! 197: NOTE("false alarm");
! 198: start = m->coldp + 1; /* recycle starting later */
! 199: assert(start <= stop);
! 200: }
! 201:
! 202: /* fill in the details if requested */
! 203: if (nmatch > 0) {
! 204: pmatch[0].rm_so = m->coldp - m->offp;
! 205: pmatch[0].rm_eo = endp - m->offp;
! 206: }
! 207: if (nmatch > 1) {
! 208: assert(m->pmatch != NULL);
! 209: for (i = 1; i < nmatch; i++)
! 210: if (i <= m->g->nsub)
! 211: pmatch[i] = m->pmatch[i];
! 212: else {
! 213: pmatch[i].rm_so = -1;
! 214: pmatch[i].rm_eo = -1;
! 215: }
! 216: }
! 217:
! 218: if (m->pmatch != NULL)
! 219: free((char *)m->pmatch);
! 220: if (m->lastpos != NULL)
! 221: free((char *)m->lastpos);
! 222: STATETEARDOWN(m);
! 223: return(0);
! 224: }
! 225:
! 226: /*
! 227: - dissect - figure out what matched what, no back references
! 228: == static unsigned char *dissect(register struct match *m, unsigned char *start, \
! 229: == unsigned char *stop, sopno startst, sopno stopst);
! 230: */
! 231: static unsigned char * /* == stop (success) always */
! 232: dissect(m, start, stop, startst, stopst)
! 233: register struct match *m;
! 234: unsigned char *start;
! 235: unsigned char *stop;
! 236: sopno startst;
! 237: sopno stopst;
! 238: {
! 239: register int i;
! 240: register sopno ss; /* start sop of current subRE */
! 241: register sopno es; /* end sop of current subRE */
! 242: register unsigned char *sp; /* start of string matched by it */
! 243: register unsigned char *stp; /* string matched by it cannot pass here */
! 244: register unsigned char *rest; /* start of rest of string */
! 245: register unsigned char *tail; /* string unmatched by rest of RE */
! 246: register sopno ssub; /* start sop of subsubRE */
! 247: register sopno esub; /* end sop of subsubRE */
! 248: register unsigned char *ssp; /* start of string matched by subsubRE */
! 249: register unsigned char *sep; /* end of string matched by subsubRE */
! 250: register unsigned char *oldssp; /* previous ssp */
! 251: register unsigned char *dp;
! 252:
! 253: AT("diss", start, stop, startst, stopst);
! 254: sp = start;
! 255: for (ss = startst; ss < stopst; ss = es) {
! 256: /* identify end of subRE */
! 257: es = ss;
! 258: switch (OP(m->g->strip[es])) {
! 259: case OPLUS_:
! 260: case OQUEST_:
! 261: es += OPND(m->g->strip[es]);
! 262: break;
! 263: case OCH_:
! 264: while (OP(m->g->strip[es]) != O_CH)
! 265: es += OPND(m->g->strip[es]);
! 266: break;
! 267: }
! 268: es++;
! 269:
! 270: /* figure out what it matched */
! 271: switch (OP(m->g->strip[ss])) {
! 272: case OEND:
! 273: assert(PHP_REGEX_NOPE);
! 274: break;
! 275: case OCHAR:
! 276: sp++;
! 277: break;
! 278: case OBOL:
! 279: case OEOL:
! 280: case OBOW:
! 281: case OEOW:
! 282: break;
! 283: case OANY:
! 284: case OANYOF:
! 285: sp++;
! 286: break;
! 287: case OBACK_:
! 288: case O_BACK:
! 289: assert(PHP_REGEX_NOPE);
! 290: break;
! 291: /* cases where length of match is hard to find */
! 292: case OQUEST_:
! 293: stp = stop;
! 294: for (;;) {
! 295: /* how long could this one be? */
! 296: rest = slow(m, sp, stp, ss, es);
! 297: assert(rest != NULL); /* it did match */
! 298: /* could the rest match the rest? */
! 299: tail = slow(m, rest, stop, es, stopst);
! 300: if (tail == stop)
! 301: break; /* yes! */
! 302: /* no -- try a shorter match for this one */
! 303: stp = rest - 1;
! 304: assert(stp >= sp); /* it did work */
! 305: }
! 306: ssub = ss + 1;
! 307: esub = es - 1;
! 308: /* did innards match? */
! 309: if (slow(m, sp, rest, ssub, esub) != NULL) {
! 310: dp = dissect(m, sp, rest, ssub, esub);
! 311: assert(dp == rest);
! 312: } else /* no */
! 313: assert(sp == rest);
! 314: sp = rest;
! 315: break;
! 316: case OPLUS_:
! 317: stp = stop;
! 318: for (;;) {
! 319: /* how long could this one be? */
! 320: rest = slow(m, sp, stp, ss, es);
! 321: assert(rest != NULL); /* it did match */
! 322: /* could the rest match the rest? */
! 323: tail = slow(m, rest, stop, es, stopst);
! 324: if (tail == stop)
! 325: break; /* yes! */
! 326: /* no -- try a shorter match for this one */
! 327: stp = rest - 1;
! 328: assert(stp >= sp); /* it did work */
! 329: }
! 330: ssub = ss + 1;
! 331: esub = es - 1;
! 332: ssp = sp;
! 333: oldssp = ssp;
! 334: for (;;) { /* find last match of innards */
! 335: sep = slow(m, ssp, rest, ssub, esub);
! 336: if (sep == NULL || sep == ssp)
! 337: break; /* failed or matched null */
! 338: oldssp = ssp; /* on to next try */
! 339: ssp = sep;
! 340: }
! 341: if (sep == NULL) {
! 342: /* last successful match */
! 343: sep = ssp;
! 344: ssp = oldssp;
! 345: }
! 346: assert(sep == rest); /* must exhaust substring */
! 347: assert(slow(m, ssp, sep, ssub, esub) == rest);
! 348: dp = dissect(m, ssp, sep, ssub, esub);
! 349: assert(dp == sep);
! 350: sp = rest;
! 351: break;
! 352: case OCH_:
! 353: stp = stop;
! 354: for (;;) {
! 355: /* how long could this one be? */
! 356: rest = slow(m, sp, stp, ss, es);
! 357: assert(rest != NULL); /* it did match */
! 358: /* could the rest match the rest? */
! 359: tail = slow(m, rest, stop, es, stopst);
! 360: if (tail == stop)
! 361: break; /* yes! */
! 362: /* no -- try a shorter match for this one */
! 363: stp = rest - 1;
! 364: assert(stp >= sp); /* it did work */
! 365: }
! 366: ssub = ss + 1;
! 367: esub = ss + OPND(m->g->strip[ss]) - 1;
! 368: assert(OP(m->g->strip[esub]) == OOR1);
! 369: for (;;) { /* find first matching branch */
! 370: if (slow(m, sp, rest, ssub, esub) == rest)
! 371: break; /* it matched all of it */
! 372: /* that one missed, try next one */
! 373: assert(OP(m->g->strip[esub]) == OOR1);
! 374: esub++;
! 375: assert(OP(m->g->strip[esub]) == OOR2);
! 376: ssub = esub + 1;
! 377: esub += OPND(m->g->strip[esub]);
! 378: if (OP(m->g->strip[esub]) == OOR2)
! 379: esub--;
! 380: else
! 381: assert(OP(m->g->strip[esub]) == O_CH);
! 382: }
! 383: dp = dissect(m, sp, rest, ssub, esub);
! 384: assert(dp == rest);
! 385: sp = rest;
! 386: break;
! 387: case O_PLUS:
! 388: case O_QUEST:
! 389: case OOR1:
! 390: case OOR2:
! 391: case O_CH:
! 392: assert(PHP_REGEX_NOPE);
! 393: break;
! 394: case OLPAREN:
! 395: i = OPND(m->g->strip[ss]);
! 396: assert(0 < i && i <= m->g->nsub);
! 397: m->pmatch[i].rm_so = sp - m->offp;
! 398: break;
! 399: case ORPAREN:
! 400: i = OPND(m->g->strip[ss]);
! 401: assert(0 < i && i <= m->g->nsub);
! 402: m->pmatch[i].rm_eo = sp - m->offp;
! 403: break;
! 404: default: /* uh oh */
! 405: assert(PHP_REGEX_NOPE);
! 406: break;
! 407: }
! 408: }
! 409:
! 410: assert(sp == stop);
! 411: return(sp);
! 412: }
! 413:
! 414: /*
! 415: - backref - figure out what matched what, figuring in back references
! 416: == static unsigned char *backref(register struct match *m, unsigned char *start, \
! 417: == unsigned char *stop, sopno startst, sopno stopst, sopno lev);
! 418: */
! 419: static unsigned char * /* == stop (success) or NULL (failure) */
! 420: backref(m, start, stop, startst, stopst, lev)
! 421: register struct match *m;
! 422: unsigned char *start;
! 423: unsigned char *stop;
! 424: sopno startst;
! 425: sopno stopst;
! 426: sopno lev; /* PLUS nesting level */
! 427: {
! 428: register int i;
! 429: register sopno ss; /* start sop of current subRE */
! 430: register unsigned char *sp; /* start of string matched by it */
! 431: register sopno ssub; /* start sop of subsubRE */
! 432: register sopno esub; /* end sop of subsubRE */
! 433: register unsigned char *ssp; /* start of string matched by subsubRE */
! 434: register unsigned char *dp;
! 435: register size_t len;
! 436: register int hard;
! 437: register sop s;
! 438: register regoff_t offsave;
! 439: register cset *cs;
! 440:
! 441: AT("back", start, stop, startst, stopst);
! 442: sp = start;
! 443:
! 444: /* get as far as we can with easy stuff */
! 445: hard = 0;
! 446: for (ss = startst; !hard && ss < stopst; ss++)
! 447: switch (OP(s = m->g->strip[ss])) {
! 448: case OCHAR:
! 449: if (sp == stop || *sp++ != (unsigned char)OPND(s))
! 450: return(NULL);
! 451: break;
! 452: case OANY:
! 453: if (sp == stop)
! 454: return(NULL);
! 455: sp++;
! 456: break;
! 457: case OANYOF:
! 458: cs = &m->g->sets[OPND(s)];
! 459: if (sp == stop || !CHIN(cs, *sp++))
! 460: return(NULL);
! 461: break;
! 462: case OBOL:
! 463: if ( (sp == m->beginp && !(m->eflags®_NOTBOL)) ||
! 464: (sp < m->endp && *(sp-1) == '\n' &&
! 465: (m->g->cflags®_NEWLINE)) )
! 466: { /* yes */ }
! 467: else
! 468: return(NULL);
! 469: break;
! 470: case OEOL:
! 471: if ( (sp == m->endp && !(m->eflags®_NOTEOL)) ||
! 472: (sp < m->endp && *sp == '\n' &&
! 473: (m->g->cflags®_NEWLINE)) )
! 474: { /* yes */ }
! 475: else
! 476: return(NULL);
! 477: break;
! 478: case OBOW:
! 479: if (( (sp == m->beginp && !(m->eflags®_NOTBOL)) ||
! 480: (sp < m->endp && *(sp-1) == '\n' &&
! 481: (m->g->cflags®_NEWLINE)) ||
! 482: (sp > m->beginp &&
! 483: !ISWORD(*(sp-1))) ) &&
! 484: (sp < m->endp && ISWORD(*sp)) )
! 485: { /* yes */ }
! 486: else
! 487: return(NULL);
! 488: break;
! 489: case OEOW:
! 490: if (( (sp == m->endp && !(m->eflags®_NOTEOL)) ||
! 491: (sp < m->endp && *sp == '\n' &&
! 492: (m->g->cflags®_NEWLINE)) ||
! 493: (sp < m->endp && !ISWORD(*sp)) ) &&
! 494: (sp > m->beginp && ISWORD(*(sp-1))) )
! 495: { /* yes */ }
! 496: else
! 497: return(NULL);
! 498: break;
! 499: case O_QUEST:
! 500: break;
! 501: case OOR1: /* matches null but needs to skip */
! 502: ss++;
! 503: s = m->g->strip[ss];
! 504: do {
! 505: assert(OP(s) == OOR2);
! 506: ss += OPND(s);
! 507: } while (OP(s = m->g->strip[ss]) != O_CH);
! 508: /* note that the ss++ gets us past the O_CH */
! 509: break;
! 510: default: /* have to make a choice */
! 511: hard = 1;
! 512: break;
! 513: }
! 514: if (!hard) { /* that was it! */
! 515: if (sp != stop)
! 516: return(NULL);
! 517: return(sp);
! 518: }
! 519: ss--; /* adjust for the for's final increment */
! 520:
! 521: /* the hard stuff */
! 522: AT("hard", sp, stop, ss, stopst);
! 523: s = m->g->strip[ss];
! 524: switch (OP(s)) {
! 525: case OBACK_: /* the vilest depths */
! 526: i = OPND(s);
! 527: assert(0 < i && i <= m->g->nsub);
! 528: if (m->pmatch[i].rm_eo == -1)
! 529: return(NULL);
! 530: assert(m->pmatch[i].rm_so != -1);
! 531: len = m->pmatch[i].rm_eo - m->pmatch[i].rm_so;
! 532: assert(stop - m->beginp >= len);
! 533: if (sp > stop - len)
! 534: return(NULL); /* not enough left to match */
! 535: ssp = m->offp + m->pmatch[i].rm_so;
! 536: if (memcmp(sp, ssp, len) != 0)
! 537: return(NULL);
! 538: while (m->g->strip[ss] != SOP(O_BACK, i))
! 539: ss++;
! 540: return(backref(m, sp+len, stop, ss+1, stopst, lev));
! 541: break;
! 542: case OQUEST_: /* to null or not */
! 543: dp = backref(m, sp, stop, ss+1, stopst, lev);
! 544: if (dp != NULL)
! 545: return(dp); /* not */
! 546: return(backref(m, sp, stop, ss+OPND(s)+1, stopst, lev));
! 547: break;
! 548: case OPLUS_:
! 549: assert(m->lastpos != NULL);
! 550: assert(lev+1 <= m->g->nplus);
! 551: m->lastpos[lev+1] = sp;
! 552: return(backref(m, sp, stop, ss+1, stopst, lev+1));
! 553: break;
! 554: case O_PLUS:
! 555: if (sp == m->lastpos[lev]) /* last pass matched null */
! 556: return(backref(m, sp, stop, ss+1, stopst, lev-1));
! 557: /* try another pass */
! 558: m->lastpos[lev] = sp;
! 559: dp = backref(m, sp, stop, ss-OPND(s)+1, stopst, lev);
! 560: if (dp == NULL)
! 561: return(backref(m, sp, stop, ss+1, stopst, lev-1));
! 562: else
! 563: return(dp);
! 564: break;
! 565: case OCH_: /* find the right one, if any */
! 566: ssub = ss + 1;
! 567: esub = ss + OPND(s) - 1;
! 568: assert(OP(m->g->strip[esub]) == OOR1);
! 569: for (;;) { /* find first matching branch */
! 570: dp = backref(m, sp, stop, ssub, esub, lev);
! 571: if (dp != NULL)
! 572: return(dp);
! 573: /* that one missed, try next one */
! 574: if (OP(m->g->strip[esub]) == O_CH)
! 575: return(NULL); /* there is none */
! 576: esub++;
! 577: assert(OP(m->g->strip[esub]) == OOR2);
! 578: ssub = esub + 1;
! 579: esub += OPND(m->g->strip[esub]);
! 580: if (OP(m->g->strip[esub]) == OOR2)
! 581: esub--;
! 582: else
! 583: assert(OP(m->g->strip[esub]) == O_CH);
! 584: }
! 585: break;
! 586: case OLPAREN: /* must undo assignment if rest fails */
! 587: i = OPND(s);
! 588: assert(0 < i && i <= m->g->nsub);
! 589: offsave = m->pmatch[i].rm_so;
! 590: m->pmatch[i].rm_so = sp - m->offp;
! 591: dp = backref(m, sp, stop, ss+1, stopst, lev);
! 592: if (dp != NULL)
! 593: return(dp);
! 594: m->pmatch[i].rm_so = offsave;
! 595: return(NULL);
! 596: break;
! 597: case ORPAREN: /* must undo assignment if rest fails */
! 598: i = OPND(s);
! 599: assert(0 < i && i <= m->g->nsub);
! 600: offsave = m->pmatch[i].rm_eo;
! 601: m->pmatch[i].rm_eo = sp - m->offp;
! 602: dp = backref(m, sp, stop, ss+1, stopst, lev);
! 603: if (dp != NULL)
! 604: return(dp);
! 605: m->pmatch[i].rm_eo = offsave;
! 606: return(NULL);
! 607: break;
! 608: default: /* uh oh */
! 609: assert(PHP_REGEX_NOPE);
! 610: break;
! 611: }
! 612:
! 613: /* "can't happen" */
! 614: assert(PHP_REGEX_NOPE);
! 615: /* NOTREACHED */
! 616: return((unsigned char *)NULL); /* dummy */
! 617: }
! 618:
! 619: /*
! 620: - fast - step through the string at top speed
! 621: == static unsigned char *fast(register struct match *m, unsigned char *start, \
! 622: == unsigned char *stop, sopno startst, sopno stopst);
! 623: */
! 624: static unsigned char * /* where tentative match ended, or NULL */
! 625: fast(m, start, stop, startst, stopst)
! 626: register struct match *m;
! 627: unsigned char *start;
! 628: unsigned char *stop;
! 629: sopno startst;
! 630: sopno stopst;
! 631: {
! 632: register states st = m->st;
! 633: register states fresh = m->fresh;
! 634: register states tmp = m->tmp;
! 635: register unsigned char *p = start;
! 636: register int c = (start == m->beginp) ? OUT : *(start-1);
! 637: register int lastc; /* previous c */
! 638: register int flagch;
! 639: register int i;
! 640: register unsigned char *coldp; /* last p after which no match was underway */
! 641:
! 642: CLEAR(st);
! 643: SET1(st, startst);
! 644: st = step(m->g, startst, stopst, st, NOTHING, st);
! 645: ASSIGN(fresh, st);
! 646: SP("start", st, *p);
! 647: coldp = NULL;
! 648: for (;;) {
! 649: /* next character */
! 650: lastc = c;
! 651: c = (p == m->endp) ? OUT : *p;
! 652: if (EQ(st, fresh))
! 653: coldp = p;
! 654:
! 655: /* is there an EOL and/or BOL between lastc and c? */
! 656: flagch = '\0';
! 657: i = 0;
! 658: if ( (lastc == '\n' && m->g->cflags®_NEWLINE) ||
! 659: (lastc == OUT && !(m->eflags®_NOTBOL)) ) {
! 660: flagch = BOL;
! 661: i = m->g->nbol;
! 662: }
! 663: if ( (c == '\n' && m->g->cflags®_NEWLINE) ||
! 664: (c == OUT && !(m->eflags®_NOTEOL)) ) {
! 665: flagch = (flagch == BOL) ? BOLEOL : EOL;
! 666: i += m->g->neol;
! 667: }
! 668: if (i != 0) {
! 669: for (; i > 0; i--)
! 670: st = step(m->g, startst, stopst, st, flagch, st);
! 671: SP("boleol", st, c);
! 672: }
! 673:
! 674: /* how about a word boundary? */
! 675: if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) &&
! 676: (c != OUT && ISWORD(c)) ) {
! 677: flagch = BOW;
! 678: }
! 679: if ( (lastc != OUT && ISWORD(lastc)) &&
! 680: (flagch == EOL || (c != OUT && !ISWORD(c))) ) {
! 681: flagch = EOW;
! 682: }
! 683: if (flagch == BOW || flagch == EOW) {
! 684: st = step(m->g, startst, stopst, st, flagch, st);
! 685: SP("boweow", st, c);
! 686: }
! 687:
! 688: /* are we done? */
! 689: if (ISSET(st, stopst) || p == stop)
! 690: break; /* NOTE BREAK OUT */
! 691:
! 692: /* no, we must deal with this character */
! 693: ASSIGN(tmp, st);
! 694: ASSIGN(st, fresh);
! 695: assert(c != OUT);
! 696: st = step(m->g, startst, stopst, tmp, c, st);
! 697: SP("aft", st, c);
! 698: assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st));
! 699: p++;
! 700: }
! 701:
! 702: assert(coldp != NULL);
! 703: m->coldp = coldp;
! 704: if (ISSET(st, stopst))
! 705: return(p+1);
! 706: else
! 707: return(NULL);
! 708: }
! 709:
! 710: /*
! 711: - slow - step through the string more deliberately
! 712: == static unsigned char *slow(register struct match *m, unsigned char *start, \
! 713: == unsigned char *stop, sopno startst, sopno stopst);
! 714: */
! 715: static unsigned char * /* where it ended */
! 716: slow(m, start, stop, startst, stopst)
! 717: register struct match *m;
! 718: unsigned char *start;
! 719: unsigned char *stop;
! 720: sopno startst;
! 721: sopno stopst;
! 722: {
! 723: register states st = m->st;
! 724: register states empty = m->empty;
! 725: register states tmp = m->tmp;
! 726: register unsigned char *p = start;
! 727: register int c = (start == m->beginp) ? OUT : *(start-1);
! 728: register int lastc; /* previous c */
! 729: register int flagch;
! 730: register int i;
! 731: register unsigned char *matchp; /* last p at which a match ended */
! 732:
! 733: AT("slow", start, stop, startst, stopst);
! 734: CLEAR(st);
! 735: SET1(st, startst);
! 736: SP("sstart", st, *p);
! 737: st = step(m->g, startst, stopst, st, NOTHING, st);
! 738: matchp = NULL;
! 739: for (;;) {
! 740: /* next character */
! 741: lastc = c;
! 742: c = (p == m->endp) ? OUT : *p;
! 743:
! 744: /* is there an EOL and/or BOL between lastc and c? */
! 745: flagch = '\0';
! 746: i = 0;
! 747: if ( (lastc == '\n' && m->g->cflags®_NEWLINE) ||
! 748: (lastc == OUT && !(m->eflags®_NOTBOL)) ) {
! 749: flagch = BOL;
! 750: i = m->g->nbol;
! 751: }
! 752: if ( (c == '\n' && m->g->cflags®_NEWLINE) ||
! 753: (c == OUT && !(m->eflags®_NOTEOL)) ) {
! 754: flagch = (flagch == BOL) ? BOLEOL : EOL;
! 755: i += m->g->neol;
! 756: }
! 757: if (i != 0) {
! 758: for (; i > 0; i--)
! 759: st = step(m->g, startst, stopst, st, flagch, st);
! 760: SP("sboleol", st, c);
! 761: }
! 762:
! 763: /* how about a word boundary? */
! 764: if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) &&
! 765: (c != OUT && ISWORD(c)) ) {
! 766: flagch = BOW;
! 767: }
! 768: if ( (lastc != OUT && ISWORD(lastc)) &&
! 769: (flagch == EOL || (c != OUT && !ISWORD(c))) ) {
! 770: flagch = EOW;
! 771: }
! 772: if (flagch == BOW || flagch == EOW) {
! 773: st = step(m->g, startst, stopst, st, flagch, st);
! 774: SP("sboweow", st, c);
! 775: }
! 776:
! 777: /* are we done? */
! 778: if (ISSET(st, stopst))
! 779: matchp = p;
! 780: if (EQ(st, empty) || p == stop)
! 781: break; /* NOTE BREAK OUT */
! 782:
! 783: /* no, we must deal with this character */
! 784: ASSIGN(tmp, st);
! 785: ASSIGN(st, empty);
! 786: assert(c != OUT);
! 787: st = step(m->g, startst, stopst, tmp, c, st);
! 788: SP("saft", st, c);
! 789: assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st));
! 790: p++;
! 791: }
! 792:
! 793: return(matchp);
! 794: }
! 795:
! 796:
! 797: /*
! 798: - step - map set of states reachable before char to set reachable after
! 799: == static states step(register struct re_guts *g, sopno start, sopno stop, \
! 800: == register states bef, int ch, register states aft);
! 801: == #define BOL (OUT+1)
! 802: == #define EOL (BOL+1)
! 803: == #define BOLEOL (BOL+2)
! 804: == #define NOTHING (BOL+3)
! 805: == #define BOW (BOL+4)
! 806: == #define EOW (BOL+5)
! 807: == #define CODEMAX (BOL+5) // highest code used
! 808: == #define NONCHAR(c) ((c) > UCHAR_MAX)
! 809: == #define NNONCHAR (CODEMAX-UCHAR_MAX)
! 810: */
! 811: static states
! 812: step(g, start, stop, bef, ch, aft)
! 813: register struct re_guts *g;
! 814: sopno start; /* start state within strip */
! 815: sopno stop; /* state after stop state within strip */
! 816: register states bef; /* states reachable before */
! 817: int ch; /* character or NONCHAR code */
! 818: register states aft; /* states already known reachable after */
! 819: {
! 820: register cset *cs;
! 821: register sop s;
! 822: register sopno pc;
! 823: register onestate here; /* note, macros know this name */
! 824: register sopno look;
! 825: register long i;
! 826:
! 827: for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) {
! 828: s = g->strip[pc];
! 829: switch (OP(s)) {
! 830: case OEND:
! 831: assert(pc == stop-1);
! 832: break;
! 833: case OCHAR:
! 834: /* only characters can match */
! 835: assert(!NONCHAR(ch) || ch != (unsigned char)OPND(s));
! 836: if (ch == (unsigned char)OPND(s))
! 837: FWD(aft, bef, 1);
! 838: break;
! 839: case OBOL:
! 840: if (ch == BOL || ch == BOLEOL)
! 841: FWD(aft, bef, 1);
! 842: break;
! 843: case OEOL:
! 844: if (ch == EOL || ch == BOLEOL)
! 845: FWD(aft, bef, 1);
! 846: break;
! 847: case OBOW:
! 848: if (ch == BOW)
! 849: FWD(aft, bef, 1);
! 850: break;
! 851: case OEOW:
! 852: if (ch == EOW)
! 853: FWD(aft, bef, 1);
! 854: break;
! 855: case OANY:
! 856: if (!NONCHAR(ch))
! 857: FWD(aft, bef, 1);
! 858: break;
! 859: case OANYOF:
! 860: cs = &g->sets[OPND(s)];
! 861: if (!NONCHAR(ch) && CHIN(cs, ch))
! 862: FWD(aft, bef, 1);
! 863: break;
! 864: case OBACK_: /* ignored here */
! 865: case O_BACK:
! 866: FWD(aft, aft, 1);
! 867: break;
! 868: case OPLUS_: /* forward, this is just an empty */
! 869: FWD(aft, aft, 1);
! 870: break;
! 871: case O_PLUS: /* both forward and back */
! 872: FWD(aft, aft, 1);
! 873: i = ISSETBACK(aft, OPND(s));
! 874: BACK(aft, aft, OPND(s));
! 875: if (!i && ISSETBACK(aft, OPND(s))) {
! 876: /* oho, must reconsider loop body */
! 877: pc -= OPND(s) + 1;
! 878: INIT(here, pc);
! 879: }
! 880: break;
! 881: case OQUEST_: /* two branches, both forward */
! 882: FWD(aft, aft, 1);
! 883: FWD(aft, aft, OPND(s));
! 884: break;
! 885: case O_QUEST: /* just an empty */
! 886: FWD(aft, aft, 1);
! 887: break;
! 888: case OLPAREN: /* not significant here */
! 889: case ORPAREN:
! 890: FWD(aft, aft, 1);
! 891: break;
! 892: case OCH_: /* mark the first two branches */
! 893: FWD(aft, aft, 1);
! 894: assert(OP(g->strip[pc+OPND(s)]) == OOR2);
! 895: FWD(aft, aft, OPND(s));
! 896: break;
! 897: case OOR1: /* done a branch, find the O_CH */
! 898: if (ISSTATEIN(aft, here)) {
! 899: for (look = 1;
! 900: OP(s = g->strip[pc+look]) != O_CH;
! 901: look += OPND(s))
! 902: assert(OP(s) == OOR2);
! 903: FWD(aft, aft, look);
! 904: }
! 905: break;
! 906: case OOR2: /* propagate OCH_'s marking */
! 907: FWD(aft, aft, 1);
! 908: if (OP(g->strip[pc+OPND(s)]) != O_CH) {
! 909: assert(OP(g->strip[pc+OPND(s)]) == OOR2);
! 910: FWD(aft, aft, OPND(s));
! 911: }
! 912: break;
! 913: case O_CH: /* just empty */
! 914: FWD(aft, aft, 1);
! 915: break;
! 916: default: /* ooooops... */
! 917: assert(PHP_REGEX_NOPE);
! 918: break;
! 919: }
! 920: }
! 921:
! 922: return(aft);
! 923: }
! 924:
! 925: #ifdef REDEBUG
! 926: /*
! 927: - print - print a set of states
! 928: == #ifdef REDEBUG
! 929: == static void print(struct match *m, unsigned char *caption, states st, \
! 930: == int ch, FILE *d);
! 931: == #endif
! 932: */
! 933: static void
! 934: print(m, caption, st, ch, d)
! 935: struct match *m;
! 936: unsigned char *caption;
! 937: states st;
! 938: int ch;
! 939: FILE *d;
! 940: {
! 941: register struct re_guts *g = m->g;
! 942: register int i;
! 943: register int first = 1;
! 944:
! 945: if (!(m->eflags®_TRACE))
! 946: return;
! 947:
! 948: fprintf(d, "%s", caption);
! 949: if (ch != '\0')
! 950: fprintf(d, " %s", pchar(ch));
! 951: for (i = 0; i < g->nstates; i++)
! 952: if (ISSET(st, i)) {
! 953: fprintf(d, "%s%d", (first) ? "\t" : ", ", i);
! 954: first = 0;
! 955: }
! 956: fprintf(d, "\n");
! 957: }
! 958:
! 959: /*
! 960: - at - print current situation
! 961: == #ifdef REDEBUG
! 962: == static void at(struct match *m, unsigned char *title, unsigned char *start, unsigned char *stop, \
! 963: == sopno startst, sopno stopst);
! 964: == #endif
! 965: */
! 966: static void
! 967: at(m, title, start, stop, startst, stopst)
! 968: struct match *m;
! 969: unsigned char *title;
! 970: unsigned char *start;
! 971: unsigned char *stop;
! 972: sopno startst;
! 973: sopno stopst;
! 974: {
! 975: if (!(m->eflags®_TRACE))
! 976: return;
! 977:
! 978: printf("%s %s-", title, pchar(*start));
! 979: printf("%s ", pchar(*stop));
! 980: printf("%ld-%ld\n", (long)startst, (long)stopst);
! 981: }
! 982:
! 983: #ifndef PCHARDONE
! 984: #define PCHARDONE /* never again */
! 985: /*
! 986: - pchar - make a character printable
! 987: == #ifdef REDEBUG
! 988: == static unsigned char *pchar(int ch);
! 989: == #endif
! 990: *
! 991: * Is this identical to regchar() over in debug.c? Well, yes. But a
! 992: * duplicate here avoids having a debugging-capable regexec.o tied to
! 993: * a matching debug.o, and this is convenient. It all disappears in
! 994: * the non-debug compilation anyway, so it doesn't matter much.
! 995: */
! 996: static unsigned char * /* -> representation */
! 997: pchar(ch)
! 998: int ch;
! 999: {
! 1000: static unsigned char pbuf[10];
! 1001:
! 1002: if (isprint(ch) || ch == ' ')
! 1003: sprintf(pbuf, "%c", ch);
! 1004: else
! 1005: sprintf(pbuf, "\\%o", ch);
! 1006: return(pbuf);
! 1007: }
! 1008: #endif
! 1009: #endif
! 1010:
! 1011: #undef matcher
! 1012: #undef fast
! 1013: #undef slow
! 1014: #undef dissect
! 1015: #undef backref
! 1016: #undef step
! 1017: #undef print
! 1018: #undef at
! 1019: #undef match
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>