embedaddon/smartmontools/regex/regex_internal.c - view

File: [ELWIX - Embedded LightWeight unIX -] / embedaddon / smartmontools / regex / regex_internal.c
Revision 1.1.1.1 (vendor branch): download - view: text, annotated - select for diffs - revision graph
Tue Feb 21 16:32:16 2012 UTC (12 years, 4 months ago) by misho
Branches: smartmontools, elwix, MAIN
CVS tags: v5_43, v5_42, HEAD

smartmontools

1: /* Extended regular expression matching and search library. 2: Copyright (C) 2002, 2003 Free Software Foundation, Inc. 3: This file is part of the GNU C Library. 4: Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>. 5: 6: The GNU C Library is free software; you can redistribute it and/or 7: modify it under the terms of the GNU Lesser General Public 8: License as published by the Free Software Foundation; either 9: version 2.1 of the License, or (at your option) any later version. 10: 11: The GNU C Library is distributed in the hope that it will be useful, 12: but WITHOUT ANY WARRANTY; without even the implied warranty of 13: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14: Lesser General Public License for more details. 15: 16: You should have received a copy of the GNU Lesser General Public 17: License along with the GNU C Library; if not, write to the Free 18: Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 19: 02111-1307 USA. */ 20: 21: static void re_string_construct_common (const char *str, int len, 22: re_string_t *pstr, 23: RE_TRANSLATE_TYPE trans, int icase); 24: #ifdef RE_ENABLE_I18N 25: static int re_string_skip_chars (re_string_t *pstr, int new_raw_idx, 26: wint_t *last_wc); 27: #endif /* RE_ENABLE_I18N */ 28: static re_dfastate_t *create_newstate_common (re_dfa_t *dfa, 29: const re_node_set *nodes, 30: unsigned int hash); 31: static reg_errcode_t register_state (re_dfa_t *dfa, re_dfastate_t *newstate, 32: unsigned int hash); 33: static re_dfastate_t *create_ci_newstate (re_dfa_t *dfa, 34: const re_node_set *nodes, 35: unsigned int hash); 36: static re_dfastate_t *create_cd_newstate (re_dfa_t *dfa, 37: const re_node_set *nodes, 38: unsigned int context, 39: unsigned int hash); 40: static unsigned int inline calc_state_hash (const re_node_set *nodes, 41: unsigned int context); 42: 43: /* Functions for string operation. */ 44: 45: /* This function allocate the buffers. It is necessary to call 46: re_string_reconstruct before using the object. */ 47: 48: static reg_errcode_t 49: re_string_allocate (pstr, str, len, init_len, trans, icase) 50: re_string_t *pstr; 51: const char *str; 52: int len, init_len, icase; 53: RE_TRANSLATE_TYPE trans; 54: { 55: reg_errcode_t ret; 56: int init_buf_len = (len + 1 < init_len) ? len + 1: init_len; 57: re_string_construct_common (str, len, pstr, trans, icase); 58: pstr->stop = pstr->len; 59: 60: ret = re_string_realloc_buffers (pstr, init_buf_len); 61: if (BE (ret != REG_NOERROR, 0)) 62: return ret; 63: 64: pstr->mbs_case = (MBS_CASE_ALLOCATED (pstr) ? pstr->mbs_case 65: : (unsigned char *) str); 66: pstr->mbs = MBS_ALLOCATED (pstr) ? pstr->mbs : pstr->mbs_case; 67: pstr->valid_len = (MBS_CASE_ALLOCATED (pstr) || MBS_ALLOCATED (pstr) 68: || MB_CUR_MAX > 1) ? pstr->valid_len : len; 69: return REG_NOERROR; 70: } 71: 72: /* This function allocate the buffers, and initialize them. */ 73: 74: static reg_errcode_t 75: re_string_construct (pstr, str, len, trans, icase) 76: re_string_t *pstr; 77: const char *str; 78: int len, icase; 79: RE_TRANSLATE_TYPE trans; 80: { 81: reg_errcode_t ret; 82: re_string_construct_common (str, len, pstr, trans, icase); 83: pstr->stop = pstr->len; 84: /* Set 0 so that this function can initialize whole buffers. */ 85: pstr->valid_len = 0; 86: 87: if (len > 0) 88: { 89: ret = re_string_realloc_buffers (pstr, len + 1); 90: if (BE (ret != REG_NOERROR, 0)) 91: return ret; 92: } 93: pstr->mbs_case = (MBS_CASE_ALLOCATED (pstr) ? pstr->mbs_case 94: : (unsigned char *) str); 95: pstr->mbs = MBS_ALLOCATED (pstr) ? pstr->mbs : pstr->mbs_case; 96: 97: if (icase) 98: { 99: #ifdef RE_ENABLE_I18N 100: if (MB_CUR_MAX > 1) 101: build_wcs_upper_buffer (pstr); 102: else 103: #endif /* RE_ENABLE_I18N */ 104: build_upper_buffer (pstr); 105: } 106: else 107: { 108: #ifdef RE_ENABLE_I18N 109: if (MB_CUR_MAX > 1) 110: build_wcs_buffer (pstr); 111: else 112: #endif /* RE_ENABLE_I18N */ 113: { 114: if (trans != NULL) 115: re_string_translate_buffer (pstr); 116: else 117: pstr->valid_len = len; 118: } 119: } 120: 121: /* Initialized whole buffers, then valid_len == bufs_len. */ 122: pstr->valid_len = pstr->bufs_len; 123: return REG_NOERROR; 124: } 125: 126: /* Helper functions for re_string_allocate, and re_string_construct. */ 127: 128: static reg_errcode_t 129: re_string_realloc_buffers (pstr, new_buf_len) 130: re_string_t *pstr; 131: int new_buf_len; 132: { 133: #ifdef RE_ENABLE_I18N 134: if (MB_CUR_MAX > 1) 135: { 136: wint_t *new_array = re_realloc (pstr->wcs, wint_t, new_buf_len); 137: if (BE (new_array == NULL, 0)) 138: return REG_ESPACE; 139: pstr->wcs = new_array; 140: } 141: #endif /* RE_ENABLE_I18N */ 142: if (MBS_ALLOCATED (pstr)) 143: { 144: unsigned char *new_array = re_realloc (pstr->mbs, unsigned char, 145: new_buf_len); 146: if (BE (new_array == NULL, 0)) 147: return REG_ESPACE; 148: pstr->mbs = new_array; 149: } 150: if (MBS_CASE_ALLOCATED (pstr)) 151: { 152: unsigned char *new_array = re_realloc (pstr->mbs_case, unsigned char, 153: new_buf_len); 154: if (BE (new_array == NULL, 0)) 155: return REG_ESPACE; 156: pstr->mbs_case = new_array; 157: if (!MBS_ALLOCATED (pstr)) 158: pstr->mbs = pstr->mbs_case; 159: } 160: pstr->bufs_len = new_buf_len; 161: return REG_NOERROR; 162: } 163: 164: 165: static void 166: re_string_construct_common (str, len, pstr, trans, icase) 167: const char *str; 168: int len; 169: re_string_t *pstr; 170: RE_TRANSLATE_TYPE trans; 171: int icase; 172: { 173: memset (pstr, '\0', sizeof (re_string_t)); 174: pstr->raw_mbs = (const unsigned char *) str; 175: pstr->len = len; 176: pstr->trans = trans; 177: pstr->icase = icase ? 1 : 0; 178: } 179: 180: #ifdef RE_ENABLE_I18N 181: 182: /* Build wide character buffer PSTR->WCS. 183: If the byte sequence of the string are: 184: <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3> 185: Then wide character buffer will be: 186: <wc1> , WEOF , <wc2> , WEOF , <wc3> 187: We use WEOF for padding, they indicate that the position isn't 188: a first byte of a multibyte character. 189: 190: Note that this function assumes PSTR->VALID_LEN elements are already 191: built and starts from PSTR->VALID_LEN. */ 192: 193: static void 194: build_wcs_buffer (pstr) 195: re_string_t *pstr; 196: { 197: mbstate_t prev_st; 198: int byte_idx, end_idx, mbclen, remain_len; 199: /* Build the buffers from pstr->valid_len to either pstr->len or 200: pstr->bufs_len. */ 201: end_idx = (pstr->bufs_len > pstr->len)? pstr->len : pstr->bufs_len; 202: for (byte_idx = pstr->valid_len; byte_idx < end_idx;) 203: { 204: wchar_t wc; 205: remain_len = end_idx - byte_idx; 206: prev_st = pstr->cur_state; 207: mbclen = mbrtowc (&wc, ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx 208: + byte_idx), remain_len, &pstr->cur_state); 209: if (BE (mbclen == (size_t) -2, 0)) 210: { 211: /* The buffer doesn't have enough space, finish to build. */ 212: pstr->cur_state = prev_st; 213: break; 214: } 215: else if (BE (mbclen == (size_t) -1 || mbclen == 0, 0)) 216: { 217: /* We treat these cases as a singlebyte character. */ 218: mbclen = 1; 219: wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]; 220: pstr->cur_state = prev_st; 221: } 222: 223: /* Apply the translateion if we need. */ 224: if (pstr->trans != NULL && mbclen == 1) 225: { 226: int ch = pstr->trans[pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]]; 227: pstr->mbs_case[byte_idx] = ch; 228: } 229: /* Write wide character and padding. */ 230: pstr->wcs[byte_idx++] = wc; 231: /* Write paddings. */ 232: for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;) 233: pstr->wcs[byte_idx++] = WEOF; 234: } 235: pstr->valid_len = byte_idx; 236: } 237: 238: /* Build wide character buffer PSTR->WCS like build_wcs_buffer, 239: but for REG_ICASE. */ 240: 241: static void 242: build_wcs_upper_buffer (pstr) 243: re_string_t *pstr; 244: { 245: mbstate_t prev_st; 246: int byte_idx, end_idx, mbclen, remain_len; 247: /* Build the buffers from pstr->valid_len to either pstr->len or 248: pstr->bufs_len. */ 249: end_idx = (pstr->bufs_len > pstr->len)? pstr->len : pstr->bufs_len; 250: for (byte_idx = pstr->valid_len; byte_idx < end_idx;) 251: { 252: wchar_t wc; 253: remain_len = end_idx - byte_idx; 254: prev_st = pstr->cur_state; 255: mbclen = mbrtowc (&wc, ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx 256: + byte_idx), remain_len, &pstr->cur_state); 257: if (BE (mbclen == (size_t) -2, 0)) 258: { 259: /* The buffer doesn't have enough space, finish to build. */ 260: pstr->cur_state = prev_st; 261: break; 262: } 263: else if (mbclen == 1 || mbclen == (size_t) -1 || mbclen == 0) 264: { 265: /* In case of a singlebyte character. */ 266: int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]; 267: /* Apply the translateion if we need. */ 268: if (pstr->trans != NULL && mbclen == 1) 269: { 270: ch = pstr->trans[ch]; 271: pstr->mbs_case[byte_idx] = ch; 272: } 273: pstr->wcs[byte_idx] = iswlower (wc) ? toupper (wc) : wc; 274: pstr->mbs[byte_idx++] = islower (ch) ? toupper (ch) : ch; 275: if (BE (mbclen == (size_t) -1, 0)) 276: pstr->cur_state = prev_st; 277: } 278: else /* mbclen > 1 */ 279: { 280: if (iswlower (wc)) 281: wcrtomb ((char *) pstr->mbs + byte_idx, towupper (wc), &prev_st); 282: else 283: memcpy (pstr->mbs + byte_idx, 284: pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen); 285: pstr->wcs[byte_idx++] = iswlower (wc) ? toupper (wc) : wc; 286: /* Write paddings. */ 287: for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;) 288: pstr->wcs[byte_idx++] = WEOF; 289: } 290: } 291: pstr->valid_len = byte_idx; 292: } 293: 294: /* Skip characters until the index becomes greater than NEW_RAW_IDX. 295: Return the index. */ 296: 297: static int 298: re_string_skip_chars (pstr, new_raw_idx, last_wc) 299: re_string_t *pstr; 300: int new_raw_idx; 301: wint_t *last_wc; 302: { 303: mbstate_t prev_st; 304: int rawbuf_idx, mbclen; 305: wchar_t wc = 0; 306: 307: /* Skip the characters which are not necessary to check. */ 308: for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_len; 309: rawbuf_idx < new_raw_idx;) 310: { 311: int remain_len; 312: remain_len = pstr->len - rawbuf_idx; 313: prev_st = pstr->cur_state; 314: mbclen = mbrtowc (&wc, (const char *) pstr->raw_mbs + rawbuf_idx, 315: remain_len, &pstr->cur_state); 316: if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0)) 317: { 318: /* We treat these cases as a singlebyte character. */ 319: mbclen = 1; 320: pstr->cur_state = prev_st; 321: } 322: /* Then proceed the next character. */ 323: rawbuf_idx += mbclen; 324: } 325: *last_wc = (wint_t) wc; 326: return rawbuf_idx; 327: } 328: #endif /* RE_ENABLE_I18N */ 329: 330: /* Build the buffer PSTR->MBS, and apply the translation if we need. 331: This function is used in case of REG_ICASE. */ 332: 333: static void 334: build_upper_buffer (pstr) 335: re_string_t *pstr; 336: { 337: int char_idx, end_idx; 338: end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len; 339: 340: for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx) 341: { 342: int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx]; 343: if (pstr->trans != NULL) 344: { 345: ch = pstr->trans[ch]; 346: pstr->mbs_case[char_idx] = ch; 347: } 348: if (islower (ch)) 349: pstr->mbs[char_idx] = toupper (ch); 350: else 351: pstr->mbs[char_idx] = ch; 352: } 353: pstr->valid_len = char_idx; 354: } 355: 356: /* Apply TRANS to the buffer in PSTR. */ 357: 358: static void 359: re_string_translate_buffer (pstr) 360: re_string_t *pstr; 361: { 362: int buf_idx, end_idx; 363: end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len; 364: 365: for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx) 366: { 367: int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx]; 368: pstr->mbs_case[buf_idx] = pstr->trans[ch]; 369: } 370: 371: pstr->valid_len = buf_idx; 372: } 373: 374: /* This function re-construct the buffers. 375: Concretely, convert to wide character in case of MB_CUR_MAX > 1, 376: convert to upper case in case of REG_ICASE, apply translation. */ 377: 378: static reg_errcode_t 379: re_string_reconstruct (pstr, idx, eflags, newline) 380: re_string_t *pstr; 381: int idx, eflags, newline; 382: { 383: int offset = idx - pstr->raw_mbs_idx; 384: if (offset < 0) 385: { 386: /* Reset buffer. */ 387: #ifdef RE_ENABLE_I18N 388: if (MB_CUR_MAX > 1) 389: memset (&pstr->cur_state, '\0', sizeof (mbstate_t)); 390: #endif /* RE_ENABLE_I18N */ 391: pstr->len += pstr->raw_mbs_idx; 392: pstr->stop += pstr->raw_mbs_idx; 393: pstr->valid_len = pstr->raw_mbs_idx = 0; 394: pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF 395: : CONTEXT_NEWLINE | CONTEXT_BEGBUF); 396: if (!MBS_CASE_ALLOCATED (pstr)) 397: pstr->mbs_case = (unsigned char *) pstr->raw_mbs; 398: if (!MBS_ALLOCATED (pstr) && !MBS_CASE_ALLOCATED (pstr)) 399: pstr->mbs = (unsigned char *) pstr->raw_mbs; 400: offset = idx; 401: } 402: 403: if (offset != 0) 404: { 405: /* Are the characters which are already checked remain? */ 406: if (offset < pstr->valid_len) 407: { 408: /* Yes, move them to the front of the buffer. */ 409: pstr->tip_context = re_string_context_at (pstr, offset - 1, eflags, 410: newline); 411: #ifdef RE_ENABLE_I18N 412: if (MB_CUR_MAX > 1) 413: memmove (pstr->wcs, pstr->wcs + offset, 414: (pstr->valid_len - offset) * sizeof (wint_t)); 415: #endif /* RE_ENABLE_I18N */ 416: if (MBS_ALLOCATED (pstr)) 417: memmove (pstr->mbs, pstr->mbs + offset, 418: pstr->valid_len - offset); 419: if (MBS_CASE_ALLOCATED (pstr)) 420: memmove (pstr->mbs_case, pstr->mbs_case + offset, 421: pstr->valid_len - offset); 422: pstr->valid_len -= offset; 423: #if DEBUG 424: assert (pstr->valid_len > 0); 425: #endif 426: } 427: else 428: { 429: /* No, skip all characters until IDX. */ 430: pstr->valid_len = 0; 431: #ifdef RE_ENABLE_I18N 432: if (MB_CUR_MAX > 1) 433: { 434: int wcs_idx; 435: wint_t wc; 436: pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx; 437: for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx) 438: pstr->wcs[wcs_idx] = WEOF; 439: if (pstr->trans && wc <= 0xff) 440: wc = pstr->trans[wc]; 441: pstr->tip_context = (IS_WIDE_WORD_CHAR (wc) ? CONTEXT_WORD 442: : ((newline && IS_WIDE_NEWLINE (wc)) 443: ? CONTEXT_NEWLINE : 0)); 444: } 445: else 446: #endif /* RE_ENABLE_I18N */ 447: { 448: int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1]; 449: if (pstr->trans) 450: c = pstr->trans[c]; 451: pstr->tip_context = (IS_WORD_CHAR (c) ? CONTEXT_WORD 452: : ((newline && IS_NEWLINE (c)) 453: ? CONTEXT_NEWLINE : 0)); 454: } 455: } 456: if (!MBS_CASE_ALLOCATED (pstr)) 457: { 458: pstr->mbs_case += offset; 459: /* In case of !MBS_ALLOCATED && !MBS_CASE_ALLOCATED. */ 460: if (!MBS_ALLOCATED (pstr)) 461: pstr->mbs += offset; 462: } 463: } 464: pstr->raw_mbs_idx = idx; 465: pstr->len -= offset; 466: pstr->stop -= offset; 467: 468: /* Then build the buffers. */ 469: #ifdef RE_ENABLE_I18N 470: if (MB_CUR_MAX > 1) 471: { 472: if (pstr->icase) 473: build_wcs_upper_buffer (pstr); 474: else 475: build_wcs_buffer (pstr); 476: } 477: else 478: #endif /* RE_ENABLE_I18N */ 479: { 480: if (pstr->icase) 481: build_upper_buffer (pstr); 482: else if (pstr->trans != NULL) 483: re_string_translate_buffer (pstr); 484: } 485: pstr->cur_idx = 0; 486: 487: return REG_NOERROR; 488: } 489: 490: static void 491: re_string_destruct (pstr) 492: re_string_t *pstr; 493: { 494: #ifdef RE_ENABLE_I18N 495: re_free (pstr->wcs); 496: #endif /* RE_ENABLE_I18N */ 497: if (MBS_ALLOCATED (pstr)) 498: re_free (pstr->mbs); 499: if (MBS_CASE_ALLOCATED (pstr)) 500: re_free (pstr->mbs_case); 501: } 502: 503: /* Return the context at IDX in INPUT. */ 504: 505: static unsigned int 506: re_string_context_at (input, idx, eflags, newline_anchor) 507: const re_string_t *input; 508: int idx, eflags, newline_anchor; 509: { 510: int c; 511: if (idx < 0 || idx == input->len) 512: { 513: if (idx < 0) 514: /* In this case, we use the value stored in input->tip_context, 515: since we can't know the character in input->mbs[-1] here. */ 516: return input->tip_context; 517: else /* (idx == input->len) */ 518: return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF 519: : CONTEXT_NEWLINE | CONTEXT_ENDBUF); 520: } 521: #ifdef RE_ENABLE_I18N 522: if (MB_CUR_MAX > 1) 523: { 524: wint_t wc; 525: int wc_idx = idx; 526: while(input->wcs[wc_idx] == WEOF) 527: { 528: #ifdef DEBUG 529: /* It must not happen. */ 530: assert (wc_idx >= 0); 531: #endif 532: --wc_idx; 533: if (wc_idx < 0) 534: return input->tip_context; 535: } 536: wc = input->wcs[wc_idx]; 537: if (IS_WIDE_WORD_CHAR (wc)) 538: return CONTEXT_WORD; 539: return (newline_anchor && IS_WIDE_NEWLINE (wc)) ? CONTEXT_NEWLINE : 0; 540: } 541: else 542: #endif 543: { 544: c = re_string_byte_at (input, idx); 545: if (IS_WORD_CHAR (c)) 546: return CONTEXT_WORD; 547: return (newline_anchor && IS_NEWLINE (c)) ? CONTEXT_NEWLINE : 0; 548: } 549: } 550: 551: /* Functions for set operation. */ 552: 553: static reg_errcode_t 554: re_node_set_alloc (set, size) 555: re_node_set *set; 556: int size; 557: { 558: set->alloc = size; 559: set->nelem = 0; 560: set->elems = re_malloc (int, size); 561: if (BE (set->elems == NULL, 0)) 562: return REG_ESPACE; 563: return REG_NOERROR; 564: } 565: 566: static reg_errcode_t 567: re_node_set_init_1 (set, elem) 568: re_node_set *set; 569: int elem; 570: { 571: set->alloc = 1; 572: set->nelem = 1; 573: set->elems = re_malloc (int, 1); 574: if (BE (set->elems == NULL, 0)) 575: { 576: set->alloc = set->nelem = 0; 577: return REG_ESPACE; 578: } 579: set->elems[0] = elem; 580: return REG_NOERROR; 581: } 582: 583: static reg_errcode_t 584: re_node_set_init_2 (set, elem1, elem2) 585: re_node_set *set; 586: int elem1, elem2; 587: { 588: set->alloc = 2; 589: set->elems = re_malloc (int, 2); 590: if (BE (set->elems == NULL, 0)) 591: return REG_ESPACE; 592: if (elem1 == elem2) 593: { 594: set->nelem = 1; 595: set->elems[0] = elem1; 596: } 597: else 598: { 599: set->nelem = 2; 600: if (elem1 < elem2) 601: { 602: set->elems[0] = elem1; 603: set->elems[1] = elem2; 604: } 605: else 606: { 607: set->elems[0] = elem2; 608: set->elems[1] = elem1; 609: } 610: } 611: return REG_NOERROR; 612: } 613: 614: static reg_errcode_t 615: re_node_set_init_copy (dest, src) 616: re_node_set *dest; 617: const re_node_set *src; 618: { 619: dest->nelem = src->nelem; 620: if (src->nelem > 0) 621: { 622: dest->alloc = dest->nelem; 623: dest->elems = re_malloc (int, dest->alloc); 624: if (BE (dest->elems == NULL, 0)) 625: { 626: dest->alloc = dest->nelem = 0; 627: return REG_ESPACE; 628: } 629: memcpy (dest->elems, src->elems, src->nelem * sizeof (int)); 630: } 631: else 632: re_node_set_init_empty (dest); 633: return REG_NOERROR; 634: } 635: 636: /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to 637: DEST. Return value indicate the error code or REG_NOERROR if succeeded. 638: Note: We assume dest->elems is NULL, when dest->alloc is 0. */ 639: 640: static reg_errcode_t 641: re_node_set_add_intersect (dest, src1, src2) 642: re_node_set *dest; 643: const re_node_set *src1, *src2; 644: { 645: int i1, i2, id; 646: if (src1->nelem > 0 && src2->nelem > 0) 647: { 648: if (src1->nelem + src2->nelem + dest->nelem > dest->alloc) 649: { 650: dest->alloc = src1->nelem + src2->nelem + dest->nelem; 651: dest->elems = re_realloc (dest->elems, int, dest->alloc); 652: if (BE (dest->elems == NULL, 0)) 653: return REG_ESPACE; 654: } 655: } 656: else 657: return REG_NOERROR; 658: 659: for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;) 660: { 661: if (src1->elems[i1] > src2->elems[i2]) 662: { 663: ++i2; 664: continue; 665: } 666: if (src1->elems[i1] == src2->elems[i2]) 667: { 668: while (id < dest->nelem && dest->elems[id] < src2->elems[i2]) 669: ++id; 670: if (id < dest->nelem && dest->elems[id] == src2->elems[i2]) 671: ++id; 672: else 673: { 674: memmove (dest->elems + id + 1, dest->elems + id, 675: sizeof (int) * (dest->nelem - id)); 676: dest->elems[id++] = src2->elems[i2++]; 677: ++dest->nelem; 678: } 679: } 680: ++i1; 681: } 682: return REG_NOERROR; 683: } 684: 685: /* Calculate the union set of the sets SRC1 and SRC2. And store it to 686: DEST. Return value indicate the error code or REG_NOERROR if succeeded. */ 687: 688: static reg_errcode_t 689: re_node_set_init_union (dest, src1, src2) 690: re_node_set *dest; 691: const re_node_set *src1, *src2; 692: { 693: int i1, i2, id; 694: if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0) 695: { 696: dest->alloc = src1->nelem + src2->nelem; 697: dest->elems = re_malloc (int, dest->alloc); 698: if (BE (dest->elems == NULL, 0)) 699: return REG_ESPACE; 700: } 701: else 702: { 703: if (src1 != NULL && src1->nelem > 0) 704: return re_node_set_init_copy (dest, src1); 705: else if (src2 != NULL && src2->nelem > 0) 706: return re_node_set_init_copy (dest, src2); 707: else 708: re_node_set_init_empty (dest); 709: return REG_NOERROR; 710: } 711: for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;) 712: { 713: if (src1->elems[i1] > src2->elems[i2]) 714: { 715: dest->elems[id++] = src2->elems[i2++]; 716: continue; 717: } 718: if (src1->elems[i1] == src2->elems[i2]) 719: ++i2; 720: dest->elems[id++] = src1->elems[i1++]; 721: } 722: if (i1 < src1->nelem) 723: { 724: memcpy (dest->elems + id, src1->elems + i1, 725: (src1->nelem - i1) * sizeof (int)); 726: id += src1->nelem - i1; 727: } 728: else if (i2 < src2->nelem) 729: { 730: memcpy (dest->elems + id, src2->elems + i2, 731: (src2->nelem - i2) * sizeof (int)); 732: id += src2->nelem - i2; 733: } 734: dest->nelem = id; 735: return REG_NOERROR; 736: } 737: 738: /* Calculate the union set of the sets DEST and SRC. And store it to 739: DEST. Return value indicate the error code or REG_NOERROR if succeeded. */ 740: 741: static reg_errcode_t 742: re_node_set_merge (dest, src) 743: re_node_set *dest; 744: const re_node_set *src; 745: { 746: int si, di; 747: if (src == NULL || src->nelem == 0) 748: return REG_NOERROR; 749: if (dest->alloc < src->nelem + dest->nelem) 750: { 751: int *new_buffer; 752: dest->alloc = 2 * (src->nelem + dest->alloc); 753: new_buffer = re_realloc (dest->elems, int, dest->alloc); 754: if (BE (new_buffer == NULL, 0)) 755: return REG_ESPACE; 756: dest->elems = new_buffer; 757: } 758: 759: for (si = 0, di = 0 ; si < src->nelem && di < dest->nelem ;) 760: { 761: int cp_from, ncp, mid, right, src_elem = src->elems[si]; 762: /* Binary search the spot we will add the new element. */ 763: right = dest->nelem; 764: while (di < right) 765: { 766: mid = (di + right) / 2; 767: if (dest->elems[mid] < src_elem) 768: di = mid + 1; 769: else 770: right = mid; 771: } 772: if (di >= dest->nelem) 773: break; 774: 775: if (dest->elems[di] == src_elem) 776: { 777: /* Skip since, DEST already has the element. */ 778: ++di; 779: ++si; 780: continue; 781: } 782: 783: /* Skip the src elements which are less than dest->elems[di]. */ 784: cp_from = si; 785: while (si < src->nelem && src->elems[si] < dest->elems[di]) 786: ++si; 787: /* Copy these src elements. */ 788: ncp = si - cp_from; 789: memmove (dest->elems + di + ncp, dest->elems + di, 790: sizeof (int) * (dest->nelem - di)); 791: memcpy (dest->elems + di, src->elems + cp_from, 792: sizeof (int) * ncp); 793: /* Update counters. */ 794: di += ncp; 795: dest->nelem += ncp; 796: } 797: 798: /* Copy remaining src elements. */ 799: if (si < src->nelem) 800: { 801: memcpy (dest->elems + di, src->elems + si, 802: sizeof (int) * (src->nelem - si)); 803: dest->nelem += src->nelem - si; 804: } 805: return REG_NOERROR; 806: } 807: 808: /* Insert the new element ELEM to the re_node_set* SET. 809: return 0 if SET already has ELEM, 810: return -1 if an error is occured, return 1 otherwise. */ 811: 812: static int 813: re_node_set_insert (set, elem) 814: re_node_set *set; 815: int elem; 816: { 817: int idx, right, mid; 818: /* In case of the set is empty. */ 819: if (set->elems == NULL || set->alloc == 0) 820: { 821: if (BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1)) 822: return 1; 823: else 824: return -1; 825: } 826: 827: /* Binary search the spot we will add the new element. */ 828: idx = 0; 829: right = set->nelem; 830: while (idx < right) 831: { 832: mid = (idx + right) / 2; 833: if (set->elems[mid] < elem) 834: idx = mid + 1; 835: else 836: right = mid; 837: } 838: 839: /* Realloc if we need. */ 840: if (set->alloc < set->nelem + 1) 841: { 842: int *new_array; 843: set->alloc = set->alloc * 2; 844: new_array = re_malloc (int, set->alloc); 845: if (BE (new_array == NULL, 0)) 846: return -1; 847: /* Copy the elements they are followed by the new element. */ 848: if (idx > 0) 849: memcpy (new_array, set->elems, sizeof (int) * (idx)); 850: /* Copy the elements which follows the new element. */ 851: if (set->nelem - idx > 0) 852: memcpy (new_array + idx + 1, set->elems + idx, 853: sizeof (int) * (set->nelem - idx)); 854: re_free (set->elems); 855: set->elems = new_array; 856: } 857: else 858: { 859: /* Move the elements which follows the new element. */ 860: if (set->nelem - idx > 0) 861: memmove (set->elems + idx + 1, set->elems + idx, 862: sizeof (int) * (set->nelem - idx)); 863: } 864: /* Insert the new element. */ 865: set->elems[idx] = elem; 866: ++set->nelem; 867: return 1; 868: } 869: 870: /* Compare two node sets SET1 and SET2. 871: return 1 if SET1 and SET2 are equivalent, retrun 0 otherwise. */ 872: 873: static int 874: re_node_set_compare (set1, set2) 875: const re_node_set *set1, *set2; 876: { 877: int i; 878: if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem) 879: return 0; 880: for (i = 0 ; i < set1->nelem ; i++) 881: if (set1->elems[i] != set2->elems[i]) 882: return 0; 883: return 1; 884: } 885: 886: /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise. */ 887: 888: static int 889: re_node_set_contains (set, elem) 890: const re_node_set *set; 891: int elem; 892: { 893: int idx, right, mid; 894: if (set->nelem <= 0) 895: return 0; 896: 897: /* Binary search the element. */ 898: idx = 0; 899: right = set->nelem - 1; 900: while (idx < right) 901: { 902: mid = (idx + right) / 2; 903: if (set->elems[mid] < elem) 904: idx = mid + 1; 905: else 906: right = mid; 907: } 908: return set->elems[idx] == elem ? idx + 1 : 0; 909: } 910: 911: static void 912: re_node_set_remove_at (set, idx) 913: re_node_set *set; 914: int idx; 915: { 916: if (idx < 0 || idx >= set->nelem) 917: return; 918: if (idx < set->nelem - 1) 919: memmove (set->elems + idx, set->elems + idx + 1, 920: sizeof (int) * (set->nelem - idx - 1)); 921: --set->nelem; 922: } 923: 924: 925: /* Add the token TOKEN to dfa->nodes, and return the index of the token. 926: Or return -1, if an error will be occured. */ 927: 928: static int 929: re_dfa_add_node (dfa, token, mode) 930: re_dfa_t *dfa; 931: re_token_t token; 932: int mode; 933: { 934: if (dfa->nodes_len >= dfa->nodes_alloc) 935: { 936: re_token_t *new_array; 937: dfa->nodes_alloc *= 2; 938: new_array = re_realloc (dfa->nodes, re_token_t, dfa->nodes_alloc); 939: if (BE (new_array == NULL, 0)) 940: return -1; 941: else 942: dfa->nodes = new_array; 943: if (mode) 944: { 945: int *new_nexts, *new_indices; 946: re_node_set *new_edests, *new_eclosures, *new_inveclosures; 947: 948: new_nexts = re_realloc (dfa->nexts, int, dfa->nodes_alloc); 949: new_indices = re_realloc (dfa->org_indices, int, dfa->nodes_alloc); 950: new_edests = re_realloc (dfa->edests, re_node_set, dfa->nodes_alloc); 951: new_eclosures = re_realloc (dfa->eclosures, re_node_set, 952: dfa->nodes_alloc); 953: new_inveclosures = re_realloc (dfa->inveclosures, re_node_set, 954: dfa->nodes_alloc); 955: if (BE (new_nexts == NULL || new_indices == NULL 956: || new_edests == NULL || new_eclosures == NULL 957: || new_inveclosures == NULL, 0)) 958: return -1; 959: dfa->nexts = new_nexts; 960: dfa->org_indices = new_indices; 961: dfa->edests = new_edests; 962: dfa->eclosures = new_eclosures; 963: dfa->inveclosures = new_inveclosures; 964: } 965: } 966: dfa->nodes[dfa->nodes_len] = token; 967: dfa->nodes[dfa->nodes_len].duplicated = 0; 968: dfa->nodes[dfa->nodes_len].constraint = 0; 969: return dfa->nodes_len++; 970: } 971: 972: static unsigned int inline 973: calc_state_hash (nodes, context) 974: const re_node_set *nodes; 975: unsigned int context; 976: { 977: unsigned int hash = nodes->nelem + context; 978: int i; 979: for (i = 0 ; i < nodes->nelem ; i++) 980: hash += nodes->elems[i]; 981: return hash; 982: } 983: 984: /* Search for the state whose node_set is equivalent to NODES. 985: Return the pointer to the state, if we found it in the DFA. 986: Otherwise create the new one and return it. In case of an error 987: return NULL and set the error code in ERR. 988: Note: - We assume NULL as the invalid state, then it is possible that 989: return value is NULL and ERR is REG_NOERROR. 990: - We never return non-NULL value in case of any errors, it is for 991: optimization. */ 992: 993: static re_dfastate_t* 994: re_acquire_state (err, dfa, nodes) 995: reg_errcode_t *err; 996: re_dfa_t *dfa; 997: const re_node_set *nodes; 998: { 999: unsigned int hash; 1000: re_dfastate_t *new_state; 1001: struct re_state_table_entry *spot; 1002: int i; 1003: if (BE (nodes->nelem == 0, 0)) 1004: { 1005: *err = REG_NOERROR; 1006: return NULL; 1007: } 1008: hash = calc_state_hash (nodes, 0); 1009: spot = dfa->state_table + (hash & dfa->state_hash_mask); 1010: 1011: for (i = 0 ; i < spot->num ; i++) 1012: { 1013: re_dfastate_t *state = spot->array[i]; 1014: if (hash != state->hash) 1015: continue; 1016: if (re_node_set_compare (&state->nodes, nodes)) 1017: return state; 1018: } 1019: 1020: /* There are no appropriate state in the dfa, create the new one. */ 1021: new_state = create_ci_newstate (dfa, nodes, hash); 1022: if (BE (new_state != NULL, 1)) 1023: return new_state; 1024: else 1025: { 1026: *err = REG_ESPACE; 1027: return NULL; 1028: } 1029: } 1030: 1031: /* Search for the state whose node_set is equivalent to NODES and 1032: whose context is equivalent to CONTEXT. 1033: Return the pointer to the state, if we found it in the DFA. 1034: Otherwise create the new one and return it. In case of an error 1035: return NULL and set the error code in ERR. 1036: Note: - We assume NULL as the invalid state, then it is possible that 1037: return value is NULL and ERR is REG_NOERROR. 1038: - We never return non-NULL value in case of any errors, it is for 1039: optimization. */ 1040: 1041: static re_dfastate_t* 1042: re_acquire_state_context (err, dfa, nodes, context) 1043: reg_errcode_t *err; 1044: re_dfa_t *dfa; 1045: const re_node_set *nodes; 1046: unsigned int context; 1047: { 1048: unsigned int hash; 1049: re_dfastate_t *new_state; 1050: struct re_state_table_entry *spot; 1051: int i; 1052: if (nodes->nelem == 0) 1053: { 1054: *err = REG_NOERROR; 1055: return NULL; 1056: } 1057: hash = calc_state_hash (nodes, context); 1058: spot = dfa->state_table + (hash & dfa->state_hash_mask); 1059: 1060: for (i = 0 ; i < spot->num ; i++) 1061: { 1062: re_dfastate_t *state = spot->array[i]; 1063: if (hash != state->hash) 1064: continue; 1065: if (re_node_set_compare (state->entrance_nodes, nodes) 1066: && state->context == context) 1067: return state; 1068: } 1069: /* There are no appropriate state in `dfa', create the new one. */ 1070: new_state = create_cd_newstate (dfa, nodes, context, hash); 1071: if (BE (new_state != NULL, 1)) 1072: return new_state; 1073: else 1074: { 1075: *err = REG_ESPACE; 1076: return NULL; 1077: } 1078: } 1079: 1080: /* Allocate memory for DFA state and initialize common properties. 1081: Return the new state if succeeded, otherwise return NULL. */ 1082: 1083: static re_dfastate_t * 1084: create_newstate_common (dfa, nodes, hash) 1085: re_dfa_t *dfa; 1086: const re_node_set *nodes; 1087: unsigned int hash; 1088: { 1089: re_dfastate_t *newstate; 1090: reg_errcode_t err; 1091: newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1); 1092: if (BE (newstate == NULL, 0)) 1093: return NULL; 1094: err = re_node_set_init_copy (&newstate->nodes, nodes); 1095: if (BE (err != REG_NOERROR, 0)) 1096: { 1097: re_free (newstate); 1098: return NULL; 1099: } 1100: newstate->trtable = NULL; 1101: newstate->trtable_search = NULL; 1102: newstate->hash = hash; 1103: return newstate; 1104: } 1105: 1106: /* Store the new state NEWSTATE whose hash value is HASH in appropriate 1107: position. Return value indicate the error code if failed. */ 1108: 1109: static reg_errcode_t 1110: register_state (dfa, newstate, hash) 1111: re_dfa_t *dfa; 1112: re_dfastate_t *newstate; 1113: unsigned int hash; 1114: { 1115: struct re_state_table_entry *spot; 1116: spot = dfa->state_table + (hash & dfa->state_hash_mask); 1117: 1118: if (spot->alloc <= spot->num) 1119: { 1120: re_dfastate_t **new_array; 1121: spot->alloc = 2 * spot->num + 2; 1122: new_array = re_realloc (spot->array, re_dfastate_t *, spot->alloc); 1123: if (BE (new_array == NULL, 0)) 1124: return REG_ESPACE; 1125: spot->array = new_array; 1126: } 1127: spot->array[spot->num++] = newstate; 1128: return REG_NOERROR; 1129: } 1130: 1131: /* Create the new state which is independ of contexts. 1132: Return the new state if succeeded, otherwise return NULL. */ 1133: 1134: static re_dfastate_t * 1135: create_ci_newstate (dfa, nodes, hash) 1136: re_dfa_t *dfa; 1137: const re_node_set *nodes; 1138: unsigned int hash; 1139: { 1140: int i; 1141: reg_errcode_t err; 1142: re_dfastate_t *newstate; 1143: newstate = create_newstate_common (dfa, nodes, hash); 1144: if (BE (newstate == NULL, 0)) 1145: return NULL; 1146: newstate->entrance_nodes = &newstate->nodes; 1147: 1148: for (i = 0 ; i < nodes->nelem ; i++) 1149: { 1150: re_token_t *node = dfa->nodes + nodes->elems[i]; 1151: re_token_type_t type = node->type; 1152: if (type == CHARACTER && !node->constraint) 1153: continue; 1154: 1155: /* If the state has the halt node, the state is a halt state. */ 1156: else if (type == END_OF_RE) 1157: newstate->halt = 1; 1158: #ifdef RE_ENABLE_I18N 1159: else if (type == COMPLEX_BRACKET 1160: || (type == OP_PERIOD && MB_CUR_MAX > 1)) 1161: newstate->accept_mb = 1; 1162: #endif /* RE_ENABLE_I18N */ 1163: else if (type == OP_BACK_REF) 1164: newstate->has_backref = 1; 1165: else if (type == ANCHOR || node->constraint) 1166: newstate->has_constraint = 1; 1167: } 1168: err = register_state (dfa, newstate, hash); 1169: if (BE (err != REG_NOERROR, 0)) 1170: { 1171: free_state (newstate); 1172: newstate = NULL; 1173: } 1174: return newstate; 1175: } 1176: 1177: /* Create the new state which is depend on the context CONTEXT. 1178: Return the new state if succeeded, otherwise return NULL. */ 1179: 1180: static re_dfastate_t * 1181: create_cd_newstate (dfa, nodes, context, hash) 1182: re_dfa_t *dfa; 1183: const re_node_set *nodes; 1184: unsigned int context, hash; 1185: { 1186: int i, nctx_nodes = 0; 1187: reg_errcode_t err; 1188: re_dfastate_t *newstate; 1189: 1190: newstate = create_newstate_common (dfa, nodes, hash); 1191: if (BE (newstate == NULL, 0)) 1192: return NULL; 1193: newstate->context = context; 1194: newstate->entrance_nodes = &newstate->nodes; 1195: 1196: for (i = 0 ; i < nodes->nelem ; i++) 1197: { 1198: unsigned int constraint = 0; 1199: re_token_t *node = dfa->nodes + nodes->elems[i]; 1200: re_token_type_t type = node->type; 1201: if (node->constraint) 1202: constraint = node->constraint; 1203: 1204: if (type == CHARACTER && !constraint) 1205: continue; 1206: /* If the state has the halt node, the state is a halt state. */ 1207: else if (type == END_OF_RE) 1208: newstate->halt = 1; 1209: #ifdef RE_ENABLE_I18N 1210: else if (type == COMPLEX_BRACKET 1211: || (type == OP_PERIOD && MB_CUR_MAX > 1)) 1212: newstate->accept_mb = 1; 1213: #endif /* RE_ENABLE_I18N */ 1214: else if (type == OP_BACK_REF) 1215: newstate->has_backref = 1; 1216: else if (type == ANCHOR) 1217: constraint = node->opr.ctx_type; 1218: 1219: if (constraint) 1220: { 1221: if (newstate->entrance_nodes == &newstate->nodes) 1222: { 1223: newstate->entrance_nodes = re_malloc (re_node_set, 1); 1224: if (BE (newstate->entrance_nodes == NULL, 0)) 1225: { 1226: free_state (newstate); 1227: return NULL; 1228: } 1229: re_node_set_init_copy (newstate->entrance_nodes, nodes); 1230: nctx_nodes = 0; 1231: newstate->has_constraint = 1; 1232: } 1233: 1234: if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context)) 1235: { 1236: re_node_set_remove_at (&newstate->nodes, i - nctx_nodes); 1237: ++nctx_nodes; 1238: } 1239: } 1240: } 1241: err = register_state (dfa, newstate, hash); 1242: if (BE (err != REG_NOERROR, 0)) 1243: { 1244: free_state (newstate); 1245: newstate = NULL; 1246: } 1247: return newstate; 1248: } 1249: 1250: static void 1251: free_state (state) 1252: re_dfastate_t *state; 1253: { 1254: if (state->entrance_nodes != &state->nodes) 1255: { 1256: re_node_set_free (state->entrance_nodes); 1257: re_free (state->entrance_nodes); 1258: } 1259: re_node_set_free (&state->nodes); 1260: re_free (state->trtable); 1261: re_free (state->trtable_search); 1262: re_free (state); 1263: }