1: /* Extended regular expression matching and search library.
2: Copyright (C) 2002, 2003 Free Software Foundation, Inc.
3: This file is part of the GNU C Library.
4: Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5:
6: The GNU C Library is free software; you can redistribute it and/or
7: modify it under the terms of the GNU Lesser General Public
8: License as published by the Free Software Foundation; either
9: version 2.1 of the License, or (at your option) any later version.
10:
11: The GNU C Library is distributed in the hope that it will be useful,
12: but WITHOUT ANY WARRANTY; without even the implied warranty of
13: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14: Lesser General Public License for more details.
15:
16: You should have received a copy of the GNU Lesser General Public
17: License along with the GNU C Library; if not, write to the Free
18: Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
19: MA 02110-1301 USA. */
20:
21: static void re_string_construct_common (const char *str, int len,
22: re_string_t *pstr,
23: RE_TRANSLATE_TYPE trans, int icase);
24: #ifdef RE_ENABLE_I18N
25: static int re_string_skip_chars (re_string_t *pstr, int new_raw_idx,
26: wint_t *last_wc);
27: #endif /* RE_ENABLE_I18N */
28: static re_dfastate_t *create_newstate_common (re_dfa_t *dfa,
29: const re_node_set *nodes,
30: unsigned int hash);
31: static reg_errcode_t register_state (re_dfa_t *dfa, re_dfastate_t *newstate,
32: unsigned int hash);
33: static re_dfastate_t *create_ci_newstate (re_dfa_t *dfa,
34: const re_node_set *nodes,
35: unsigned int hash);
36: static re_dfastate_t *create_cd_newstate (re_dfa_t *dfa,
37: const re_node_set *nodes,
38: unsigned int context,
39: unsigned int hash);
40: static unsigned int inline calc_state_hash (const re_node_set *nodes,
41: unsigned int context);
42:
43: /* Functions for string operation. */
44:
45: /* This function allocate the buffers. It is necessary to call
46: re_string_reconstruct before using the object. */
47:
48: static reg_errcode_t
49: re_string_allocate (pstr, str, len, init_len, trans, icase)
50: re_string_t *pstr;
51: const char *str;
52: int len, init_len, icase;
53: RE_TRANSLATE_TYPE trans;
54: {
55: reg_errcode_t ret;
56: int init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
57: re_string_construct_common (str, len, pstr, trans, icase);
58: pstr->stop = pstr->len;
59:
60: ret = re_string_realloc_buffers (pstr, init_buf_len);
61: if (BE (ret != REG_NOERROR, 0))
62: return ret;
63:
64: pstr->mbs_case = (MBS_CASE_ALLOCATED (pstr) ? pstr->mbs_case
65: : (unsigned char *) str);
66: pstr->mbs = MBS_ALLOCATED (pstr) ? pstr->mbs : pstr->mbs_case;
67: pstr->valid_len = (MBS_CASE_ALLOCATED (pstr) || MBS_ALLOCATED (pstr)
68: || MB_CUR_MAX > 1) ? pstr->valid_len : len;
69: return REG_NOERROR;
70: }
71:
72: /* This function allocate the buffers, and initialize them. */
73:
74: static reg_errcode_t
75: re_string_construct (pstr, str, len, trans, icase)
76: re_string_t *pstr;
77: const char *str;
78: int len, icase;
79: RE_TRANSLATE_TYPE trans;
80: {
81: reg_errcode_t ret;
82: re_string_construct_common (str, len, pstr, trans, icase);
83: pstr->stop = pstr->len;
84: /* Set 0 so that this function can initialize whole buffers. */
85: pstr->valid_len = 0;
86:
87: if (len > 0)
88: {
89: ret = re_string_realloc_buffers (pstr, len + 1);
90: if (BE (ret != REG_NOERROR, 0))
91: return ret;
92: }
93: pstr->mbs_case = (MBS_CASE_ALLOCATED (pstr) ? pstr->mbs_case
94: : (unsigned char *) str);
95: pstr->mbs = MBS_ALLOCATED (pstr) ? pstr->mbs : pstr->mbs_case;
96:
97: if (icase)
98: {
99: #ifdef RE_ENABLE_I18N
100: if (MB_CUR_MAX > 1)
101: build_wcs_upper_buffer (pstr);
102: else
103: #endif /* RE_ENABLE_I18N */
104: build_upper_buffer (pstr);
105: }
106: else
107: {
108: #ifdef RE_ENABLE_I18N
109: if (MB_CUR_MAX > 1)
110: build_wcs_buffer (pstr);
111: else
112: #endif /* RE_ENABLE_I18N */
113: {
114: if (trans != NULL)
115: re_string_translate_buffer (pstr);
116: else
117: pstr->valid_len = len;
118: }
119: }
120:
121: /* Initialized whole buffers, then valid_len == bufs_len. */
122: pstr->valid_len = pstr->bufs_len;
123: return REG_NOERROR;
124: }
125:
126: /* Helper functions for re_string_allocate, and re_string_construct. */
127:
128: static reg_errcode_t
129: re_string_realloc_buffers (pstr, new_buf_len)
130: re_string_t *pstr;
131: int new_buf_len;
132: {
133: #ifdef RE_ENABLE_I18N
134: if (MB_CUR_MAX > 1)
135: {
136: wint_t *new_array = re_realloc (pstr->wcs, wint_t, new_buf_len);
137: if (BE (new_array == NULL, 0))
138: return REG_ESPACE;
139: pstr->wcs = new_array;
140: }
141: #endif /* RE_ENABLE_I18N */
142: if (MBS_ALLOCATED (pstr))
143: {
144: unsigned char *new_array = re_realloc (pstr->mbs, unsigned char,
145: new_buf_len);
146: if (BE (new_array == NULL, 0))
147: return REG_ESPACE;
148: pstr->mbs = new_array;
149: }
150: if (MBS_CASE_ALLOCATED (pstr))
151: {
152: unsigned char *new_array = re_realloc (pstr->mbs_case, unsigned char,
153: new_buf_len);
154: if (BE (new_array == NULL, 0))
155: return REG_ESPACE;
156: pstr->mbs_case = new_array;
157: if (!MBS_ALLOCATED (pstr))
158: pstr->mbs = pstr->mbs_case;
159: }
160: pstr->bufs_len = new_buf_len;
161: return REG_NOERROR;
162: }
163:
164:
165: static void
166: re_string_construct_common (str, len, pstr, trans, icase)
167: const char *str;
168: int len;
169: re_string_t *pstr;
170: RE_TRANSLATE_TYPE trans;
171: int icase;
172: {
173: memset (pstr, '\0', sizeof (re_string_t));
174: pstr->raw_mbs = (const unsigned char *) str;
175: pstr->len = len;
176: pstr->trans = trans;
177: pstr->icase = icase ? 1 : 0;
178: }
179:
180: #ifdef RE_ENABLE_I18N
181:
182: /* Build wide character buffer PSTR->WCS.
183: If the byte sequence of the string are:
184: <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
185: Then wide character buffer will be:
186: <wc1> , WEOF , <wc2> , WEOF , <wc3>
187: We use WEOF for padding, they indicate that the position isn't
188: a first byte of a multibyte character.
189:
190: Note that this function assumes PSTR->VALID_LEN elements are already
191: built and starts from PSTR->VALID_LEN. */
192:
193: static void
194: build_wcs_buffer (pstr)
195: re_string_t *pstr;
196: {
197: mbstate_t prev_st;
198: int byte_idx, end_idx, mbclen, remain_len;
199: /* Build the buffers from pstr->valid_len to either pstr->len or
200: pstr->bufs_len. */
201: end_idx = (pstr->bufs_len > pstr->len)? pstr->len : pstr->bufs_len;
202: for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
203: {
204: wchar_t wc;
205: remain_len = end_idx - byte_idx;
206: prev_st = pstr->cur_state;
207: mbclen = mbrtowc (&wc, ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
208: + byte_idx), remain_len, &pstr->cur_state);
209: if (BE (mbclen == (size_t) -2, 0))
210: {
211: /* The buffer doesn't have enough space, finish to build. */
212: pstr->cur_state = prev_st;
213: break;
214: }
215: else if (BE (mbclen == (size_t) -1 || mbclen == 0, 0))
216: {
217: /* We treat these cases as a singlebyte character. */
218: mbclen = 1;
219: wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
220: pstr->cur_state = prev_st;
221: }
222:
223: /* Apply the translateion if we need. */
224: if (pstr->trans != NULL && mbclen == 1)
225: {
226: int ch = pstr->trans[pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]];
227: pstr->mbs_case[byte_idx] = ch;
228: }
229: /* Write wide character and padding. */
230: pstr->wcs[byte_idx++] = wc;
231: /* Write paddings. */
232: for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
233: pstr->wcs[byte_idx++] = WEOF;
234: }
235: pstr->valid_len = byte_idx;
236: }
237:
238: /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
239: but for REG_ICASE. */
240:
241: static void
242: build_wcs_upper_buffer (pstr)
243: re_string_t *pstr;
244: {
245: mbstate_t prev_st;
246: int byte_idx, end_idx, mbclen, remain_len;
247: /* Build the buffers from pstr->valid_len to either pstr->len or
248: pstr->bufs_len. */
249: end_idx = (pstr->bufs_len > pstr->len)? pstr->len : pstr->bufs_len;
250: for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
251: {
252: wchar_t wc;
253: remain_len = end_idx - byte_idx;
254: prev_st = pstr->cur_state;
255: mbclen = mbrtowc (&wc, ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
256: + byte_idx), remain_len, &pstr->cur_state);
257: if (BE (mbclen == (size_t) -2, 0))
258: {
259: /* The buffer doesn't have enough space, finish to build. */
260: pstr->cur_state = prev_st;
261: break;
262: }
263: else if (mbclen == 1 || mbclen == (size_t) -1 || mbclen == 0)
264: {
265: /* In case of a singlebyte character. */
266: int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
267: /* Apply the translateion if we need. */
268: if (pstr->trans != NULL && mbclen == 1)
269: {
270: ch = pstr->trans[ch];
271: pstr->mbs_case[byte_idx] = ch;
272: }
273: pstr->wcs[byte_idx] = iswlower (wc) ? toupper (wc) : wc;
274: pstr->mbs[byte_idx++] = islower (ch) ? toupper (ch) : ch;
275: if (BE (mbclen == (size_t) -1, 0))
276: pstr->cur_state = prev_st;
277: }
278: else /* mbclen > 1 */
279: {
280: if (iswlower (wc))
281: wcrtomb ((char *) pstr->mbs + byte_idx, towupper (wc), &prev_st);
282: else
283: memcpy (pstr->mbs + byte_idx,
284: pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
285: pstr->wcs[byte_idx++] = iswlower (wc) ? toupper (wc) : wc;
286: /* Write paddings. */
287: for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
288: pstr->wcs[byte_idx++] = WEOF;
289: }
290: }
291: pstr->valid_len = byte_idx;
292: }
293:
294: /* Skip characters until the index becomes greater than NEW_RAW_IDX.
295: Return the index. */
296:
297: static int
298: re_string_skip_chars (pstr, new_raw_idx, last_wc)
299: re_string_t *pstr;
300: int new_raw_idx;
301: wint_t *last_wc;
302: {
303: mbstate_t prev_st;
304: int rawbuf_idx, mbclen;
305: wchar_t wc = 0;
306:
307: /* Skip the characters which are not necessary to check. */
308: for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_len;
309: rawbuf_idx < new_raw_idx;)
310: {
311: int remain_len;
312: remain_len = pstr->len - rawbuf_idx;
313: prev_st = pstr->cur_state;
314: mbclen = mbrtowc (&wc, (const char *) pstr->raw_mbs + rawbuf_idx,
315: remain_len, &pstr->cur_state);
316: if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0))
317: {
318: /* We treat these cases as a singlebyte character. */
319: mbclen = 1;
320: pstr->cur_state = prev_st;
321: }
322: /* Then proceed the next character. */
323: rawbuf_idx += mbclen;
324: }
325: *last_wc = (wint_t) wc;
326: return rawbuf_idx;
327: }
328: #endif /* RE_ENABLE_I18N */
329:
330: /* Build the buffer PSTR->MBS, and apply the translation if we need.
331: This function is used in case of REG_ICASE. */
332:
333: static void
334: build_upper_buffer (pstr)
335: re_string_t *pstr;
336: {
337: int char_idx, end_idx;
338: end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
339:
340: for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
341: {
342: int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
343: if (pstr->trans != NULL)
344: {
345: ch = pstr->trans[ch];
346: pstr->mbs_case[char_idx] = ch;
347: }
348: if (islower (ch))
349: pstr->mbs[char_idx] = toupper (ch);
350: else
351: pstr->mbs[char_idx] = ch;
352: }
353: pstr->valid_len = char_idx;
354: }
355:
356: /* Apply TRANS to the buffer in PSTR. */
357:
358: static void
359: re_string_translate_buffer (pstr)
360: re_string_t *pstr;
361: {
362: int buf_idx, end_idx;
363: end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
364:
365: for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
366: {
367: int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
368: pstr->mbs_case[buf_idx] = pstr->trans[ch];
369: }
370:
371: pstr->valid_len = buf_idx;
372: }
373:
374: /* This function re-construct the buffers.
375: Concretely, convert to wide character in case of MB_CUR_MAX > 1,
376: convert to upper case in case of REG_ICASE, apply translation. */
377:
378: static reg_errcode_t
379: re_string_reconstruct (pstr, idx, eflags, newline)
380: re_string_t *pstr;
381: int idx, eflags, newline;
382: {
383: int offset = idx - pstr->raw_mbs_idx;
384: if (offset < 0)
385: {
386: /* Reset buffer. */
387: #ifdef RE_ENABLE_I18N
388: if (MB_CUR_MAX > 1)
389: memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
390: #endif /* RE_ENABLE_I18N */
391: pstr->len += pstr->raw_mbs_idx;
392: pstr->stop += pstr->raw_mbs_idx;
393: pstr->valid_len = pstr->raw_mbs_idx = 0;
394: pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
395: : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
396: if (!MBS_CASE_ALLOCATED (pstr))
397: pstr->mbs_case = (unsigned char *) pstr->raw_mbs;
398: if (!MBS_ALLOCATED (pstr) && !MBS_CASE_ALLOCATED (pstr))
399: pstr->mbs = (unsigned char *) pstr->raw_mbs;
400: offset = idx;
401: }
402:
403: if (offset != 0)
404: {
405: /* Are the characters which are already checked remain? */
406: if (offset < pstr->valid_len)
407: {
408: /* Yes, move them to the front of the buffer. */
409: pstr->tip_context = re_string_context_at (pstr, offset - 1, eflags,
410: newline);
411: #ifdef RE_ENABLE_I18N
412: if (MB_CUR_MAX > 1)
413: memmove (pstr->wcs, pstr->wcs + offset,
414: (pstr->valid_len - offset) * sizeof (wint_t));
415: #endif /* RE_ENABLE_I18N */
416: if (MBS_ALLOCATED (pstr))
417: memmove (pstr->mbs, pstr->mbs + offset,
418: pstr->valid_len - offset);
419: if (MBS_CASE_ALLOCATED (pstr))
420: memmove (pstr->mbs_case, pstr->mbs_case + offset,
421: pstr->valid_len - offset);
422: pstr->valid_len -= offset;
423: #if DEBUG
424: assert (pstr->valid_len > 0);
425: #endif
426: }
427: else
428: {
429: /* No, skip all characters until IDX. */
430: pstr->valid_len = 0;
431: #ifdef RE_ENABLE_I18N
432: if (MB_CUR_MAX > 1)
433: {
434: int wcs_idx;
435: wint_t wc;
436: pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
437: for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
438: pstr->wcs[wcs_idx] = WEOF;
439: if (pstr->trans && wc <= 0xff)
440: wc = pstr->trans[wc];
441: pstr->tip_context = (IS_WIDE_WORD_CHAR (wc) ? CONTEXT_WORD
442: : ((newline && IS_WIDE_NEWLINE (wc))
443: ? CONTEXT_NEWLINE : 0));
444: }
445: else
446: #endif /* RE_ENABLE_I18N */
447: {
448: int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
449: if (pstr->trans)
450: c = pstr->trans[c];
451: pstr->tip_context = (IS_WORD_CHAR (c) ? CONTEXT_WORD
452: : ((newline && IS_NEWLINE (c))
453: ? CONTEXT_NEWLINE : 0));
454: }
455: }
456: if (!MBS_CASE_ALLOCATED (pstr))
457: {
458: pstr->mbs_case += offset;
459: /* In case of !MBS_ALLOCATED && !MBS_CASE_ALLOCATED. */
460: if (!MBS_ALLOCATED (pstr))
461: pstr->mbs += offset;
462: }
463: }
464: pstr->raw_mbs_idx = idx;
465: pstr->len -= offset;
466: pstr->stop -= offset;
467:
468: /* Then build the buffers. */
469: #ifdef RE_ENABLE_I18N
470: if (MB_CUR_MAX > 1)
471: {
472: if (pstr->icase)
473: build_wcs_upper_buffer (pstr);
474: else
475: build_wcs_buffer (pstr);
476: }
477: else
478: #endif /* RE_ENABLE_I18N */
479: {
480: if (pstr->icase)
481: build_upper_buffer (pstr);
482: else if (pstr->trans != NULL)
483: re_string_translate_buffer (pstr);
484: }
485: pstr->cur_idx = 0;
486:
487: return REG_NOERROR;
488: }
489:
490: static void
491: re_string_destruct (pstr)
492: re_string_t *pstr;
493: {
494: #ifdef RE_ENABLE_I18N
495: re_free (pstr->wcs);
496: #endif /* RE_ENABLE_I18N */
497: if (MBS_ALLOCATED (pstr))
498: re_free (pstr->mbs);
499: if (MBS_CASE_ALLOCATED (pstr))
500: re_free (pstr->mbs_case);
501: }
502:
503: /* Return the context at IDX in INPUT. */
504:
505: static unsigned int
506: re_string_context_at (input, idx, eflags, newline_anchor)
507: const re_string_t *input;
508: int idx, eflags, newline_anchor;
509: {
510: int c;
511: if (idx < 0 || idx == input->len)
512: {
513: if (idx < 0)
514: /* In this case, we use the value stored in input->tip_context,
515: since we can't know the character in input->mbs[-1] here. */
516: return input->tip_context;
517: else /* (idx == input->len) */
518: return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
519: : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
520: }
521: #ifdef RE_ENABLE_I18N
522: if (MB_CUR_MAX > 1)
523: {
524: wint_t wc;
525: int wc_idx = idx;
526: while(input->wcs[wc_idx] == WEOF)
527: {
528: #ifdef DEBUG
529: /* It must not happen. */
530: assert (wc_idx >= 0);
531: #endif
532: --wc_idx;
533: if (wc_idx < 0)
534: return input->tip_context;
535: }
536: wc = input->wcs[wc_idx];
537: if (IS_WIDE_WORD_CHAR (wc))
538: return CONTEXT_WORD;
539: return (newline_anchor && IS_WIDE_NEWLINE (wc)) ? CONTEXT_NEWLINE : 0;
540: }
541: else
542: #endif
543: {
544: c = re_string_byte_at (input, idx);
545: if (IS_WORD_CHAR (c))
546: return CONTEXT_WORD;
547: return (newline_anchor && IS_NEWLINE (c)) ? CONTEXT_NEWLINE : 0;
548: }
549: }
550:
551: /* Functions for set operation. */
552:
553: static reg_errcode_t
554: re_node_set_alloc (set, size)
555: re_node_set *set;
556: int size;
557: {
558: set->alloc = size;
559: set->nelem = 0;
560: set->elems = re_malloc (int, size);
561: if (BE (set->elems == NULL, 0))
562: return REG_ESPACE;
563: return REG_NOERROR;
564: }
565:
566: static reg_errcode_t
567: re_node_set_init_1 (set, elem)
568: re_node_set *set;
569: int elem;
570: {
571: set->alloc = 1;
572: set->nelem = 1;
573: set->elems = re_malloc (int, 1);
574: if (BE (set->elems == NULL, 0))
575: {
576: set->alloc = set->nelem = 0;
577: return REG_ESPACE;
578: }
579: set->elems[0] = elem;
580: return REG_NOERROR;
581: }
582:
583: static reg_errcode_t
584: re_node_set_init_2 (set, elem1, elem2)
585: re_node_set *set;
586: int elem1, elem2;
587: {
588: set->alloc = 2;
589: set->elems = re_malloc (int, 2);
590: if (BE (set->elems == NULL, 0))
591: return REG_ESPACE;
592: if (elem1 == elem2)
593: {
594: set->nelem = 1;
595: set->elems[0] = elem1;
596: }
597: else
598: {
599: set->nelem = 2;
600: if (elem1 < elem2)
601: {
602: set->elems[0] = elem1;
603: set->elems[1] = elem2;
604: }
605: else
606: {
607: set->elems[0] = elem2;
608: set->elems[1] = elem1;
609: }
610: }
611: return REG_NOERROR;
612: }
613:
614: static reg_errcode_t
615: re_node_set_init_copy (dest, src)
616: re_node_set *dest;
617: const re_node_set *src;
618: {
619: dest->nelem = src->nelem;
620: if (src->nelem > 0)
621: {
622: dest->alloc = dest->nelem;
623: dest->elems = re_malloc (int, dest->alloc);
624: if (BE (dest->elems == NULL, 0))
625: {
626: dest->alloc = dest->nelem = 0;
627: return REG_ESPACE;
628: }
629: memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
630: }
631: else
632: re_node_set_init_empty (dest);
633: return REG_NOERROR;
634: }
635:
636: /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
637: DEST. Return value indicate the error code or REG_NOERROR if succeeded.
638: Note: We assume dest->elems is NULL, when dest->alloc is 0. */
639:
640: static reg_errcode_t
641: re_node_set_add_intersect (dest, src1, src2)
642: re_node_set *dest;
643: const re_node_set *src1, *src2;
644: {
645: int i1, i2, id;
646: if (src1->nelem > 0 && src2->nelem > 0)
647: {
648: if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
649: {
650: dest->alloc = src1->nelem + src2->nelem + dest->nelem;
651: dest->elems = re_realloc (dest->elems, int, dest->alloc);
652: if (BE (dest->elems == NULL, 0))
653: return REG_ESPACE;
654: }
655: }
656: else
657: return REG_NOERROR;
658:
659: for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
660: {
661: if (src1->elems[i1] > src2->elems[i2])
662: {
663: ++i2;
664: continue;
665: }
666: if (src1->elems[i1] == src2->elems[i2])
667: {
668: while (id < dest->nelem && dest->elems[id] < src2->elems[i2])
669: ++id;
670: if (id < dest->nelem && dest->elems[id] == src2->elems[i2])
671: ++id;
672: else
673: {
674: memmove (dest->elems + id + 1, dest->elems + id,
675: sizeof (int) * (dest->nelem - id));
676: dest->elems[id++] = src2->elems[i2++];
677: ++dest->nelem;
678: }
679: }
680: ++i1;
681: }
682: return REG_NOERROR;
683: }
684:
685: /* Calculate the union set of the sets SRC1 and SRC2. And store it to
686: DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
687:
688: static reg_errcode_t
689: re_node_set_init_union (dest, src1, src2)
690: re_node_set *dest;
691: const re_node_set *src1, *src2;
692: {
693: int i1, i2, id;
694: if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
695: {
696: dest->alloc = src1->nelem + src2->nelem;
697: dest->elems = re_malloc (int, dest->alloc);
698: if (BE (dest->elems == NULL, 0))
699: return REG_ESPACE;
700: }
701: else
702: {
703: if (src1 != NULL && src1->nelem > 0)
704: return re_node_set_init_copy (dest, src1);
705: else if (src2 != NULL && src2->nelem > 0)
706: return re_node_set_init_copy (dest, src2);
707: else
708: re_node_set_init_empty (dest);
709: return REG_NOERROR;
710: }
711: for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
712: {
713: if (src1->elems[i1] > src2->elems[i2])
714: {
715: dest->elems[id++] = src2->elems[i2++];
716: continue;
717: }
718: if (src1->elems[i1] == src2->elems[i2])
719: ++i2;
720: dest->elems[id++] = src1->elems[i1++];
721: }
722: if (i1 < src1->nelem)
723: {
724: memcpy (dest->elems + id, src1->elems + i1,
725: (src1->nelem - i1) * sizeof (int));
726: id += src1->nelem - i1;
727: }
728: else if (i2 < src2->nelem)
729: {
730: memcpy (dest->elems + id, src2->elems + i2,
731: (src2->nelem - i2) * sizeof (int));
732: id += src2->nelem - i2;
733: }
734: dest->nelem = id;
735: return REG_NOERROR;
736: }
737:
738: /* Calculate the union set of the sets DEST and SRC. And store it to
739: DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
740:
741: static reg_errcode_t
742: re_node_set_merge (dest, src)
743: re_node_set *dest;
744: const re_node_set *src;
745: {
746: int si, di;
747: if (src == NULL || src->nelem == 0)
748: return REG_NOERROR;
749: if (dest->alloc < src->nelem + dest->nelem)
750: {
751: int *new_buffer;
752: dest->alloc = 2 * (src->nelem + dest->alloc);
753: new_buffer = re_realloc (dest->elems, int, dest->alloc);
754: if (BE (new_buffer == NULL, 0))
755: return REG_ESPACE;
756: dest->elems = new_buffer;
757: }
758:
759: for (si = 0, di = 0 ; si < src->nelem && di < dest->nelem ;)
760: {
761: int cp_from, ncp, mid, right, src_elem = src->elems[si];
762: /* Binary search the spot we will add the new element. */
763: right = dest->nelem;
764: while (di < right)
765: {
766: mid = (di + right) / 2;
767: if (dest->elems[mid] < src_elem)
768: di = mid + 1;
769: else
770: right = mid;
771: }
772: if (di >= dest->nelem)
773: break;
774:
775: if (dest->elems[di] == src_elem)
776: {
777: /* Skip since, DEST already has the element. */
778: ++di;
779: ++si;
780: continue;
781: }
782:
783: /* Skip the src elements which are less than dest->elems[di]. */
784: cp_from = si;
785: while (si < src->nelem && src->elems[si] < dest->elems[di])
786: ++si;
787: /* Copy these src elements. */
788: ncp = si - cp_from;
789: memmove (dest->elems + di + ncp, dest->elems + di,
790: sizeof (int) * (dest->nelem - di));
791: memcpy (dest->elems + di, src->elems + cp_from,
792: sizeof (int) * ncp);
793: /* Update counters. */
794: di += ncp;
795: dest->nelem += ncp;
796: }
797:
798: /* Copy remaining src elements. */
799: if (si < src->nelem)
800: {
801: memcpy (dest->elems + di, src->elems + si,
802: sizeof (int) * (src->nelem - si));
803: dest->nelem += src->nelem - si;
804: }
805: return REG_NOERROR;
806: }
807:
808: /* Insert the new element ELEM to the re_node_set* SET.
809: return 0 if SET already has ELEM,
810: return -1 if an error is occured, return 1 otherwise. */
811:
812: static int
813: re_node_set_insert (set, elem)
814: re_node_set *set;
815: int elem;
816: {
817: int idx, right, mid;
818: /* In case of the set is empty. */
819: if (set->elems == NULL || set->alloc == 0)
820: {
821: if (BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1))
822: return 1;
823: else
824: return -1;
825: }
826:
827: /* Binary search the spot we will add the new element. */
828: idx = 0;
829: right = set->nelem;
830: while (idx < right)
831: {
832: mid = (idx + right) / 2;
833: if (set->elems[mid] < elem)
834: idx = mid + 1;
835: else
836: right = mid;
837: }
838:
839: /* Realloc if we need. */
840: if (set->alloc < set->nelem + 1)
841: {
842: int *new_array;
843: set->alloc = set->alloc * 2;
844: new_array = re_malloc (int, set->alloc);
845: if (BE (new_array == NULL, 0))
846: return -1;
847: /* Copy the elements they are followed by the new element. */
848: if (idx > 0)
849: memcpy (new_array, set->elems, sizeof (int) * (idx));
850: /* Copy the elements which follows the new element. */
851: if (set->nelem - idx > 0)
852: memcpy (new_array + idx + 1, set->elems + idx,
853: sizeof (int) * (set->nelem - idx));
854: re_free (set->elems);
855: set->elems = new_array;
856: }
857: else
858: {
859: /* Move the elements which follows the new element. */
860: if (set->nelem - idx > 0)
861: memmove (set->elems + idx + 1, set->elems + idx,
862: sizeof (int) * (set->nelem - idx));
863: }
864: /* Insert the new element. */
865: set->elems[idx] = elem;
866: ++set->nelem;
867: return 1;
868: }
869:
870: /* Compare two node sets SET1 and SET2.
871: return 1 if SET1 and SET2 are equivalent, retrun 0 otherwise. */
872:
873: static int
874: re_node_set_compare (set1, set2)
875: const re_node_set *set1, *set2;
876: {
877: int i;
878: if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
879: return 0;
880: for (i = 0 ; i < set1->nelem ; i++)
881: if (set1->elems[i] != set2->elems[i])
882: return 0;
883: return 1;
884: }
885:
886: /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise. */
887:
888: static int
889: re_node_set_contains (set, elem)
890: const re_node_set *set;
891: int elem;
892: {
893: int idx, right, mid;
894: if (set->nelem <= 0)
895: return 0;
896:
897: /* Binary search the element. */
898: idx = 0;
899: right = set->nelem - 1;
900: while (idx < right)
901: {
902: mid = (idx + right) / 2;
903: if (set->elems[mid] < elem)
904: idx = mid + 1;
905: else
906: right = mid;
907: }
908: return set->elems[idx] == elem ? idx + 1 : 0;
909: }
910:
911: static void
912: re_node_set_remove_at (set, idx)
913: re_node_set *set;
914: int idx;
915: {
916: if (idx < 0 || idx >= set->nelem)
917: return;
918: if (idx < set->nelem - 1)
919: memmove (set->elems + idx, set->elems + idx + 1,
920: sizeof (int) * (set->nelem - idx - 1));
921: --set->nelem;
922: }
923:
924:
925: /* Add the token TOKEN to dfa->nodes, and return the index of the token.
926: Or return -1, if an error will be occured. */
927:
928: static int
929: re_dfa_add_node (dfa, token, mode)
930: re_dfa_t *dfa;
931: re_token_t token;
932: int mode;
933: {
934: if (dfa->nodes_len >= dfa->nodes_alloc)
935: {
936: re_token_t *new_array;
937: dfa->nodes_alloc *= 2;
938: new_array = re_realloc (dfa->nodes, re_token_t, dfa->nodes_alloc);
939: if (BE (new_array == NULL, 0))
940: return -1;
941: else
942: dfa->nodes = new_array;
943: if (mode)
944: {
945: int *new_nexts, *new_indices;
946: re_node_set *new_edests, *new_eclosures, *new_inveclosures;
947:
948: new_nexts = re_realloc (dfa->nexts, int, dfa->nodes_alloc);
949: new_indices = re_realloc (dfa->org_indices, int, dfa->nodes_alloc);
950: new_edests = re_realloc (dfa->edests, re_node_set, dfa->nodes_alloc);
951: new_eclosures = re_realloc (dfa->eclosures, re_node_set,
952: dfa->nodes_alloc);
953: new_inveclosures = re_realloc (dfa->inveclosures, re_node_set,
954: dfa->nodes_alloc);
955: if (BE (new_nexts == NULL || new_indices == NULL
956: || new_edests == NULL || new_eclosures == NULL
957: || new_inveclosures == NULL, 0))
958: return -1;
959: dfa->nexts = new_nexts;
960: dfa->org_indices = new_indices;
961: dfa->edests = new_edests;
962: dfa->eclosures = new_eclosures;
963: dfa->inveclosures = new_inveclosures;
964: }
965: }
966: dfa->nodes[dfa->nodes_len] = token;
967: dfa->nodes[dfa->nodes_len].duplicated = 0;
968: dfa->nodes[dfa->nodes_len].constraint = 0;
969: return dfa->nodes_len++;
970: }
971:
972: static unsigned int inline
973: calc_state_hash (nodes, context)
974: const re_node_set *nodes;
975: unsigned int context;
976: {
977: unsigned int hash = nodes->nelem + context;
978: int i;
979: for (i = 0 ; i < nodes->nelem ; i++)
980: hash += nodes->elems[i];
981: return hash;
982: }
983:
984: /* Search for the state whose node_set is equivalent to NODES.
985: Return the pointer to the state, if we found it in the DFA.
986: Otherwise create the new one and return it. In case of an error
987: return NULL and set the error code in ERR.
988: Note: - We assume NULL as the invalid state, then it is possible that
989: return value is NULL and ERR is REG_NOERROR.
990: - We never return non-NULL value in case of any errors, it is for
991: optimization. */
992:
993: static re_dfastate_t*
994: re_acquire_state (err, dfa, nodes)
995: reg_errcode_t *err;
996: re_dfa_t *dfa;
997: const re_node_set *nodes;
998: {
999: unsigned int hash;
1000: re_dfastate_t *new_state;
1001: struct re_state_table_entry *spot;
1002: int i;
1003: if (BE (nodes->nelem == 0, 0))
1004: {
1005: *err = REG_NOERROR;
1006: return NULL;
1007: }
1008: hash = calc_state_hash (nodes, 0);
1009: spot = dfa->state_table + (hash & dfa->state_hash_mask);
1010:
1011: for (i = 0 ; i < spot->num ; i++)
1012: {
1013: re_dfastate_t *state = spot->array[i];
1014: if (hash != state->hash)
1015: continue;
1016: if (re_node_set_compare (&state->nodes, nodes))
1017: return state;
1018: }
1019:
1020: /* There are no appropriate state in the dfa, create the new one. */
1021: new_state = create_ci_newstate (dfa, nodes, hash);
1022: if (BE (new_state != NULL, 1))
1023: return new_state;
1024: else
1025: {
1026: *err = REG_ESPACE;
1027: return NULL;
1028: }
1029: }
1030:
1031: /* Search for the state whose node_set is equivalent to NODES and
1032: whose context is equivalent to CONTEXT.
1033: Return the pointer to the state, if we found it in the DFA.
1034: Otherwise create the new one and return it. In case of an error
1035: return NULL and set the error code in ERR.
1036: Note: - We assume NULL as the invalid state, then it is possible that
1037: return value is NULL and ERR is REG_NOERROR.
1038: - We never return non-NULL value in case of any errors, it is for
1039: optimization. */
1040:
1041: static re_dfastate_t*
1042: re_acquire_state_context (err, dfa, nodes, context)
1043: reg_errcode_t *err;
1044: re_dfa_t *dfa;
1045: const re_node_set *nodes;
1046: unsigned int context;
1047: {
1048: unsigned int hash;
1049: re_dfastate_t *new_state;
1050: struct re_state_table_entry *spot;
1051: int i;
1052: if (nodes->nelem == 0)
1053: {
1054: *err = REG_NOERROR;
1055: return NULL;
1056: }
1057: hash = calc_state_hash (nodes, context);
1058: spot = dfa->state_table + (hash & dfa->state_hash_mask);
1059:
1060: for (i = 0 ; i < spot->num ; i++)
1061: {
1062: re_dfastate_t *state = spot->array[i];
1063: if (hash != state->hash)
1064: continue;
1065: if (re_node_set_compare (state->entrance_nodes, nodes)
1066: && state->context == context)
1067: return state;
1068: }
1069: /* There are no appropriate state in `dfa', create the new one. */
1070: new_state = create_cd_newstate (dfa, nodes, context, hash);
1071: if (BE (new_state != NULL, 1))
1072: return new_state;
1073: else
1074: {
1075: *err = REG_ESPACE;
1076: return NULL;
1077: }
1078: }
1079:
1080: /* Allocate memory for DFA state and initialize common properties.
1081: Return the new state if succeeded, otherwise return NULL. */
1082:
1083: static re_dfastate_t *
1084: create_newstate_common (dfa, nodes, hash)
1085: re_dfa_t *dfa;
1086: const re_node_set *nodes;
1087: unsigned int hash;
1088: {
1089: re_dfastate_t *newstate;
1090: reg_errcode_t err;
1091: newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1092: if (BE (newstate == NULL, 0))
1093: return NULL;
1094: err = re_node_set_init_copy (&newstate->nodes, nodes);
1095: if (BE (err != REG_NOERROR, 0))
1096: {
1097: re_free (newstate);
1098: return NULL;
1099: }
1100: newstate->trtable = NULL;
1101: newstate->trtable_search = NULL;
1102: newstate->hash = hash;
1103: return newstate;
1104: }
1105:
1106: /* Store the new state NEWSTATE whose hash value is HASH in appropriate
1107: position. Return value indicate the error code if failed. */
1108:
1109: static reg_errcode_t
1110: register_state (dfa, newstate, hash)
1111: re_dfa_t *dfa;
1112: re_dfastate_t *newstate;
1113: unsigned int hash;
1114: {
1115: struct re_state_table_entry *spot;
1116: spot = dfa->state_table + (hash & dfa->state_hash_mask);
1117:
1118: if (spot->alloc <= spot->num)
1119: {
1120: re_dfastate_t **new_array;
1121: spot->alloc = 2 * spot->num + 2;
1122: new_array = re_realloc (spot->array, re_dfastate_t *, spot->alloc);
1123: if (BE (new_array == NULL, 0))
1124: return REG_ESPACE;
1125: spot->array = new_array;
1126: }
1127: spot->array[spot->num++] = newstate;
1128: return REG_NOERROR;
1129: }
1130:
1131: /* Create the new state which is independ of contexts.
1132: Return the new state if succeeded, otherwise return NULL. */
1133:
1134: static re_dfastate_t *
1135: create_ci_newstate (dfa, nodes, hash)
1136: re_dfa_t *dfa;
1137: const re_node_set *nodes;
1138: unsigned int hash;
1139: {
1140: int i;
1141: reg_errcode_t err;
1142: re_dfastate_t *newstate;
1143: newstate = create_newstate_common (dfa, nodes, hash);
1144: if (BE (newstate == NULL, 0))
1145: return NULL;
1146: newstate->entrance_nodes = &newstate->nodes;
1147:
1148: for (i = 0 ; i < nodes->nelem ; i++)
1149: {
1150: re_token_t *node = dfa->nodes + nodes->elems[i];
1151: re_token_type_t type = node->type;
1152: if (type == CHARACTER && !node->constraint)
1153: continue;
1154:
1155: /* If the state has the halt node, the state is a halt state. */
1156: else if (type == END_OF_RE)
1157: newstate->halt = 1;
1158: #ifdef RE_ENABLE_I18N
1159: else if (type == COMPLEX_BRACKET
1160: || (type == OP_PERIOD && MB_CUR_MAX > 1))
1161: newstate->accept_mb = 1;
1162: #endif /* RE_ENABLE_I18N */
1163: else if (type == OP_BACK_REF)
1164: newstate->has_backref = 1;
1165: else if (type == ANCHOR || node->constraint)
1166: newstate->has_constraint = 1;
1167: }
1168: err = register_state (dfa, newstate, hash);
1169: if (BE (err != REG_NOERROR, 0))
1170: {
1171: free_state (newstate);
1172: newstate = NULL;
1173: }
1174: return newstate;
1175: }
1176:
1177: /* Create the new state which is depend on the context CONTEXT.
1178: Return the new state if succeeded, otherwise return NULL. */
1179:
1180: static re_dfastate_t *
1181: create_cd_newstate (dfa, nodes, context, hash)
1182: re_dfa_t *dfa;
1183: const re_node_set *nodes;
1184: unsigned int context, hash;
1185: {
1186: int i, nctx_nodes = 0;
1187: reg_errcode_t err;
1188: re_dfastate_t *newstate;
1189:
1190: newstate = create_newstate_common (dfa, nodes, hash);
1191: if (BE (newstate == NULL, 0))
1192: return NULL;
1193: newstate->context = context;
1194: newstate->entrance_nodes = &newstate->nodes;
1195:
1196: for (i = 0 ; i < nodes->nelem ; i++)
1197: {
1198: unsigned int constraint = 0;
1199: re_token_t *node = dfa->nodes + nodes->elems[i];
1200: re_token_type_t type = node->type;
1201: if (node->constraint)
1202: constraint = node->constraint;
1203:
1204: if (type == CHARACTER && !constraint)
1205: continue;
1206: /* If the state has the halt node, the state is a halt state. */
1207: else if (type == END_OF_RE)
1208: newstate->halt = 1;
1209: #ifdef RE_ENABLE_I18N
1210: else if (type == COMPLEX_BRACKET
1211: || (type == OP_PERIOD && MB_CUR_MAX > 1))
1212: newstate->accept_mb = 1;
1213: #endif /* RE_ENABLE_I18N */
1214: else if (type == OP_BACK_REF)
1215: newstate->has_backref = 1;
1216: else if (type == ANCHOR)
1217: constraint = node->opr.ctx_type;
1218:
1219: if (constraint)
1220: {
1221: if (newstate->entrance_nodes == &newstate->nodes)
1222: {
1223: newstate->entrance_nodes = re_malloc (re_node_set, 1);
1224: if (BE (newstate->entrance_nodes == NULL, 0))
1225: {
1226: free_state (newstate);
1227: return NULL;
1228: }
1229: re_node_set_init_copy (newstate->entrance_nodes, nodes);
1230: nctx_nodes = 0;
1231: newstate->has_constraint = 1;
1232: }
1233:
1234: if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1235: {
1236: re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1237: ++nctx_nodes;
1238: }
1239: }
1240: }
1241: err = register_state (dfa, newstate, hash);
1242: if (BE (err != REG_NOERROR, 0))
1243: {
1244: free_state (newstate);
1245: newstate = NULL;
1246: }
1247: return newstate;
1248: }
1249:
1250: static void
1251: free_state (state)
1252: re_dfastate_t *state;
1253: {
1254: if (state->entrance_nodes != &state->nodes)
1255: {
1256: re_node_set_free (state->entrance_nodes);
1257: re_free (state->entrance_nodes);
1258: }
1259: re_node_set_free (&state->nodes);
1260: re_free (state->trtable);
1261: re_free (state->trtable_search);
1262: re_free (state);
1263: }
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>