Annotation of embedaddon/php/ext/mbstring/oniguruma/regcomp.c, revision 1.1.1.1
1.1 misho 1: /**********************************************************************
2: regcomp.c - Oniguruma (regular expression library)
3: **********************************************************************/
4: /*-
5: * Copyright (c) 2002-2007 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
6: * All rights reserved.
7: *
8: * Redistribution and use in source and binary forms, with or without
9: * modification, are permitted provided that the following conditions
10: * are met:
11: * 1. Redistributions of source code must retain the above copyright
12: * notice, this list of conditions and the following disclaimer.
13: * 2. Redistributions in binary form must reproduce the above copyright
14: * notice, this list of conditions and the following disclaimer in the
15: * documentation and/or other materials provided with the distribution.
16: *
17: * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18: * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19: * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20: * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21: * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22: * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23: * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24: * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25: * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26: * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27: * SUCH DAMAGE.
28: */
29:
30: #include "regparse.h"
31:
32: OnigAmbigType OnigDefaultAmbigFlag =
33: (ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE |
34: ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE);
35:
36: extern OnigAmbigType
37: onig_get_default_ambig_flag(void)
38: {
39: return OnigDefaultAmbigFlag;
40: }
41:
42: extern int
43: onig_set_default_ambig_flag(OnigAmbigType ambig_flag)
44: {
45: OnigDefaultAmbigFlag = ambig_flag;
46: return 0;
47: }
48:
49:
50: static UChar*
51: k_strdup(UChar* s, UChar* end)
52: {
53: int len = end - s;
54:
55: if (len > 0) {
56: UChar* r = (UChar* )xmalloc(len + 1);
57: CHECK_NULL_RETURN(r);
58: xmemcpy(r, s, len);
59: r[len] = (UChar )0;
60: return r;
61: }
62: else return NULL;
63: }
64:
65: /*
66: Caution: node should not be a string node.
67: (s and end member address break)
68: */
69: static void
70: swap_node(Node* a, Node* b)
71: {
72: Node c;
73: c = *a; *a = *b; *b = c;
74: }
75:
76: static OnigDistance
77: distance_add(OnigDistance d1, OnigDistance d2)
78: {
79: if (d1 == ONIG_INFINITE_DISTANCE || d2 == ONIG_INFINITE_DISTANCE)
80: return ONIG_INFINITE_DISTANCE;
81: else {
82: if (d1 <= ONIG_INFINITE_DISTANCE - d2) return d1 + d2;
83: else return ONIG_INFINITE_DISTANCE;
84: }
85: }
86:
87: static OnigDistance
88: distance_multiply(OnigDistance d, int m)
89: {
90: if (m == 0) return 0;
91:
92: if (d < ONIG_INFINITE_DISTANCE / m)
93: return d * m;
94: else
95: return ONIG_INFINITE_DISTANCE;
96: }
97:
98: static int
99: bitset_is_empty(BitSetRef bs)
100: {
101: int i;
102: for (i = 0; i < BITSET_SIZE; i++) {
103: if (bs[i] != 0) return 0;
104: }
105: return 1;
106: }
107:
108: #ifdef ONIG_DEBUG
109: static int
110: bitset_on_num(BitSetRef bs)
111: {
112: int i, n;
113:
114: n = 0;
115: for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
116: if (BITSET_AT(bs, i)) n++;
117: }
118: return n;
119: }
120: #endif
121:
122: extern int
123: onig_bbuf_init(BBuf* buf, int size)
124: {
125: buf->p = (UChar* )xmalloc(size);
126: if (IS_NULL(buf->p)) return(ONIGERR_MEMORY);
127:
128: buf->alloc = size;
129: buf->used = 0;
130: return 0;
131: }
132:
133:
134: #ifdef USE_SUBEXP_CALL
135:
136: static int
137: unset_addr_list_init(UnsetAddrList* uslist, int size)
138: {
139: UnsetAddr* p;
140:
141: p = (UnsetAddr* )xmalloc(sizeof(UnsetAddr)* size);
142: CHECK_NULL_RETURN_VAL(p, ONIGERR_MEMORY);
143: uslist->num = 0;
144: uslist->alloc = size;
145: uslist->us = p;
146: return 0;
147: }
148:
149: static void
150: unset_addr_list_end(UnsetAddrList* uslist)
151: {
152: if (IS_NOT_NULL(uslist->us))
153: xfree(uslist->us);
154: }
155:
156: static int
157: unset_addr_list_add(UnsetAddrList* uslist, int offset, struct _Node* node)
158: {
159: UnsetAddr* p;
160: int size;
161:
162: if (uslist->num >= uslist->alloc) {
163: size = uslist->alloc * 2;
164: p = (UnsetAddr* )xrealloc(uslist->us, sizeof(UnsetAddr) * size);
165: CHECK_NULL_RETURN_VAL(p, ONIGERR_MEMORY);
166: uslist->alloc = size;
167: uslist->us = p;
168: }
169:
170: uslist->us[uslist->num].offset = offset;
171: uslist->us[uslist->num].target = node;
172: uslist->num++;
173: return 0;
174: }
175: #endif /* USE_SUBEXP_CALL */
176:
177:
178: static int
179: add_opcode(regex_t* reg, int opcode)
180: {
181: BBUF_ADD1(reg, opcode);
182: return 0;
183: }
184:
185: #ifdef USE_COMBINATION_EXPLOSION_CHECK
186: static int
187: add_state_check_num(regex_t* reg, int num)
188: {
189: StateCheckNumType n = (StateCheckNumType )num;
190:
191: BBUF_ADD(reg, &n, SIZE_STATE_CHECK_NUM);
192: return 0;
193: }
194: #endif
195:
196: static int
197: add_rel_addr(regex_t* reg, int addr)
198: {
199: RelAddrType ra = (RelAddrType )addr;
200:
201: BBUF_ADD(reg, &ra, SIZE_RELADDR);
202: return 0;
203: }
204:
205: static int
206: add_abs_addr(regex_t* reg, int addr)
207: {
208: AbsAddrType ra = (AbsAddrType )addr;
209:
210: BBUF_ADD(reg, &ra, SIZE_ABSADDR);
211: return 0;
212: }
213:
214: static int
215: add_length(regex_t* reg, int len)
216: {
217: LengthType l = (LengthType )len;
218:
219: BBUF_ADD(reg, &l, SIZE_LENGTH);
220: return 0;
221: }
222:
223: static int
224: add_mem_num(regex_t* reg, int num)
225: {
226: MemNumType n = (MemNumType )num;
227:
228: BBUF_ADD(reg, &n, SIZE_MEMNUM);
229: return 0;
230: }
231:
232: static int
233: add_pointer(regex_t* reg, void* addr)
234: {
235: PointerType ptr = (PointerType )addr;
236:
237: BBUF_ADD(reg, &ptr, SIZE_POINTER);
238: return 0;
239: }
240:
241: static int
242: add_option(regex_t* reg, OnigOptionType option)
243: {
244: BBUF_ADD(reg, &option, SIZE_OPTION);
245: return 0;
246: }
247:
248: static int
249: add_opcode_rel_addr(regex_t* reg, int opcode, int addr)
250: {
251: int r;
252:
253: r = add_opcode(reg, opcode);
254: if (r) return r;
255: r = add_rel_addr(reg, addr);
256: return r;
257: }
258:
259: static int
260: add_bytes(regex_t* reg, UChar* bytes, int len)
261: {
262: BBUF_ADD(reg, bytes, len);
263: return 0;
264: }
265:
266: static int
267: add_bitset(regex_t* reg, BitSetRef bs)
268: {
269: BBUF_ADD(reg, bs, SIZE_BITSET);
270: return 0;
271: }
272:
273: static int
274: add_opcode_option(regex_t* reg, int opcode, OnigOptionType option)
275: {
276: int r;
277:
278: r = add_opcode(reg, opcode);
279: if (r) return r;
280: r = add_option(reg, option);
281: return r;
282: }
283:
284: static int compile_length_tree(Node* node, regex_t* reg);
285: static int compile_tree(Node* node, regex_t* reg);
286:
287:
288: #define IS_NEED_STR_LEN_OP_EXACT(op) \
289: ((op) == OP_EXACTN || (op) == OP_EXACTMB2N ||\
290: (op) == OP_EXACTMB3N || (op) == OP_EXACTMBN || (op) == OP_EXACTN_IC)
291:
292: static int
293: select_str_opcode(int mb_len, int str_len, int ignore_case)
294: {
295: int op;
296:
297: if (ignore_case) {
298: switch (str_len) {
299: case 1: op = OP_EXACT1_IC; break;
300: default: op = OP_EXACTN_IC; break;
301: }
302: }
303: else {
304: switch (mb_len) {
305: case 1:
306: switch (str_len) {
307: case 1: op = OP_EXACT1; break;
308: case 2: op = OP_EXACT2; break;
309: case 3: op = OP_EXACT3; break;
310: case 4: op = OP_EXACT4; break;
311: case 5: op = OP_EXACT5; break;
312: default: op = OP_EXACTN; break;
313: }
314: break;
315:
316: case 2:
317: switch (str_len) {
318: case 1: op = OP_EXACTMB2N1; break;
319: case 2: op = OP_EXACTMB2N2; break;
320: case 3: op = OP_EXACTMB2N3; break;
321: default: op = OP_EXACTMB2N; break;
322: }
323: break;
324:
325: case 3:
326: op = OP_EXACTMB3N;
327: break;
328:
329: default:
330: op = OP_EXACTMBN;
331: break;
332: }
333: }
334: return op;
335: }
336:
337: static int
338: compile_tree_empty_check(Node* node, regex_t* reg, int empty_info)
339: {
340: int r;
341: int saved_num_null_check = reg->num_null_check;
342:
343: if (empty_info != 0) {
344: r = add_opcode(reg, OP_NULL_CHECK_START);
345: if (r) return r;
346: r = add_mem_num(reg, reg->num_null_check); /* NULL CHECK ID */
347: if (r) return r;
348: reg->num_null_check++;
349: }
350:
351: r = compile_tree(node, reg);
352: if (r) return r;
353:
354: if (empty_info != 0) {
355: if (empty_info == NQ_TARGET_IS_EMPTY)
356: r = add_opcode(reg, OP_NULL_CHECK_END);
357: else if (empty_info == NQ_TARGET_IS_EMPTY_MEM)
358: r = add_opcode(reg, OP_NULL_CHECK_END_MEMST);
359: else if (empty_info == NQ_TARGET_IS_EMPTY_REC)
360: r = add_opcode(reg, OP_NULL_CHECK_END_MEMST_PUSH);
361:
362: if (r) return r;
363: r = add_mem_num(reg, saved_num_null_check); /* NULL CHECK ID */
364: }
365: return r;
366: }
367:
368: #ifdef USE_SUBEXP_CALL
369: static int
370: compile_call(CallNode* node, regex_t* reg)
371: {
372: int r;
373:
374: r = add_opcode(reg, OP_CALL);
375: if (r) return r;
376: r = unset_addr_list_add(node->unset_addr_list, BBUF_GET_OFFSET_POS(reg),
377: node->target);
378: if (r) return r;
379: r = add_abs_addr(reg, 0 /*dummy addr.*/);
380: return r;
381: }
382: #endif
383:
384: static int
385: compile_tree_n_times(Node* node, int n, regex_t* reg)
386: {
387: int i, r;
388:
389: for (i = 0; i < n; i++) {
390: r = compile_tree(node, reg);
391: if (r) return r;
392: }
393: return 0;
394: }
395:
396: static int
397: add_compile_string_length(UChar* s, int mb_len, int str_len,
398: regex_t* reg, int ignore_case)
399: {
400: int len;
401: int op = select_str_opcode(mb_len, str_len, ignore_case);
402:
403: len = SIZE_OPCODE;
404:
405: if (op == OP_EXACTMBN) len += SIZE_LENGTH;
406: if (IS_NEED_STR_LEN_OP_EXACT(op))
407: len += SIZE_LENGTH;
408:
409: len += mb_len * str_len;
410: return len;
411: }
412:
413: static int
414: add_compile_string(UChar* s, int mb_len, int str_len,
415: regex_t* reg, int ignore_case)
416: {
417: int op = select_str_opcode(mb_len, str_len, ignore_case);
418: add_opcode(reg, op);
419:
420: if (op == OP_EXACTMBN)
421: add_length(reg, mb_len);
422:
423: if (IS_NEED_STR_LEN_OP_EXACT(op)) {
424: if (op == OP_EXACTN_IC)
425: add_length(reg, mb_len * str_len);
426: else
427: add_length(reg, str_len);
428: }
429:
430: add_bytes(reg, s, mb_len * str_len);
431: return 0;
432: }
433:
434:
435: static int
436: compile_length_string_node(Node* node, regex_t* reg)
437: {
438: int rlen, r, len, prev_len, slen, ambig;
439: OnigEncoding enc = reg->enc;
440: UChar *p, *prev;
441: StrNode* sn;
442:
443: sn = &(NSTRING(node));
444: if (sn->end <= sn->s)
445: return 0;
446:
447: ambig = NSTRING_IS_AMBIG(node);
448:
449: p = prev = sn->s;
450: prev_len = enc_len(enc, p);
451: p += prev_len;
452: slen = 1;
453: rlen = 0;
454:
455: for (; p < sn->end; ) {
456: len = enc_len(enc, p);
457: if (len == prev_len) {
458: slen++;
459: }
460: else {
461: r = add_compile_string_length(prev, prev_len, slen, reg, ambig);
462: rlen += r;
463: prev = p;
464: slen = 1;
465: prev_len = len;
466: }
467: p += len;
468: }
469: r = add_compile_string_length(prev, prev_len, slen, reg, ambig);
470: rlen += r;
471: return rlen;
472: }
473:
474: static int
475: compile_length_string_raw_node(StrNode* sn, regex_t* reg)
476: {
477: if (sn->end <= sn->s)
478: return 0;
479:
480: return add_compile_string_length(sn->s, 1 /* sb */, sn->end - sn->s, reg, 0);
481: }
482:
483: static int
484: compile_string_node(Node* node, regex_t* reg)
485: {
486: int r, len, prev_len, slen, ambig;
487: OnigEncoding enc = reg->enc;
488: UChar *p, *prev, *end;
489: StrNode* sn;
490:
491: sn = &(NSTRING(node));
492: if (sn->end <= sn->s)
493: return 0;
494:
495: end = sn->end;
496: ambig = NSTRING_IS_AMBIG(node);
497:
498: p = prev = sn->s;
499: prev_len = enc_len(enc, p);
500: p += prev_len;
501: slen = 1;
502:
503: for (; p < end; ) {
504: len = enc_len(enc, p);
505: if (len == prev_len) {
506: slen++;
507: }
508: else {
509: r = add_compile_string(prev, prev_len, slen, reg, ambig);
510: if (r) return r;
511:
512: prev = p;
513: slen = 1;
514: prev_len = len;
515: }
516:
517: p += len;
518: }
519: return add_compile_string(prev, prev_len, slen, reg, ambig);
520: }
521:
522: static int
523: compile_string_raw_node(StrNode* sn, regex_t* reg)
524: {
525: if (sn->end <= sn->s)
526: return 0;
527:
528: return add_compile_string(sn->s, 1 /* sb */, sn->end - sn->s, reg, 0);
529: }
530:
531: static int
532: add_multi_byte_cclass(BBuf* mbuf, regex_t* reg)
533: {
534: #ifdef PLATFORM_UNALIGNED_WORD_ACCESS
535: add_length(reg, mbuf->used);
536: return add_bytes(reg, mbuf->p, mbuf->used);
537: #else
538: static unsigned char PadBuf[WORD_ALIGNMENT_SIZE];
539:
540: int r, pad_size;
541: UChar* p = BBUF_GET_ADD_ADDRESS(reg) + SIZE_LENGTH;
542:
543: GET_ALIGNMENT_PAD_SIZE(p, pad_size);
544: add_length(reg, mbuf->used + (WORD_ALIGNMENT_SIZE - 1));
545: if (pad_size != 0) add_bytes(reg, PadBuf, pad_size);
546:
547: r = add_bytes(reg, mbuf->p, mbuf->used);
548:
549: /* padding for return value from compile_length_cclass_node() to be fix. */
550: pad_size = (WORD_ALIGNMENT_SIZE - 1) - pad_size;
551: if (pad_size != 0) add_bytes(reg, PadBuf, pad_size);
552: return r;
553: #endif
554: }
555:
556: static int
557: compile_length_cclass_node(CClassNode* cc, regex_t* reg)
558: {
559: int len;
560:
561: if (IS_CCLASS_SHARE(cc)) {
562: len = SIZE_OPCODE + SIZE_POINTER;
563: return len;
564: }
565:
566: if (IS_NULL(cc->mbuf)) {
567: len = SIZE_OPCODE + SIZE_BITSET;
568: }
569: else {
570: if (ONIGENC_MBC_MINLEN(reg->enc) > 1 || bitset_is_empty(cc->bs)) {
571: len = SIZE_OPCODE;
572: }
573: else {
574: len = SIZE_OPCODE + SIZE_BITSET;
575: }
576: #ifdef PLATFORM_UNALIGNED_WORD_ACCESS
577: len += SIZE_LENGTH + cc->mbuf->used;
578: #else
579: len += SIZE_LENGTH + cc->mbuf->used + (WORD_ALIGNMENT_SIZE - 1);
580: #endif
581: }
582:
583: return len;
584: }
585:
586: static int
587: compile_cclass_node(CClassNode* cc, regex_t* reg)
588: {
589: int r;
590:
591: if (IS_CCLASS_SHARE(cc)) {
592: add_opcode(reg, OP_CCLASS_NODE);
593: r = add_pointer(reg, cc);
594: return r;
595: }
596:
597: if (IS_NULL(cc->mbuf)) {
598: if (IS_CCLASS_NOT(cc))
599: add_opcode(reg, OP_CCLASS_NOT);
600: else
601: add_opcode(reg, OP_CCLASS);
602:
603: r = add_bitset(reg, cc->bs);
604: }
605: else {
606: if (ONIGENC_MBC_MINLEN(reg->enc) > 1 || bitset_is_empty(cc->bs)) {
607: if (IS_CCLASS_NOT(cc))
608: add_opcode(reg, OP_CCLASS_MB_NOT);
609: else
610: add_opcode(reg, OP_CCLASS_MB);
611:
612: r = add_multi_byte_cclass(cc->mbuf, reg);
613: }
614: else {
615: if (IS_CCLASS_NOT(cc))
616: add_opcode(reg, OP_CCLASS_MIX_NOT);
617: else
618: add_opcode(reg, OP_CCLASS_MIX);
619:
620: r = add_bitset(reg, cc->bs);
621: if (r) return r;
622: r = add_multi_byte_cclass(cc->mbuf, reg);
623: }
624: }
625:
626: return r;
627: }
628:
629: static int
630: entry_repeat_range(regex_t* reg, int id, int lower, int upper)
631: {
632: #define REPEAT_RANGE_ALLOC 4
633:
634: OnigRepeatRange* p;
635:
636: if (reg->repeat_range_alloc == 0) {
637: p = (OnigRepeatRange* )xmalloc(sizeof(OnigRepeatRange) * REPEAT_RANGE_ALLOC);
638: CHECK_NULL_RETURN_VAL(p, ONIGERR_MEMORY);
639: reg->repeat_range = p;
640: reg->repeat_range_alloc = REPEAT_RANGE_ALLOC;
641: }
642: else if (reg->repeat_range_alloc <= id) {
643: int n;
644: n = reg->repeat_range_alloc + REPEAT_RANGE_ALLOC;
645: p = (OnigRepeatRange* )xrealloc(reg->repeat_range,
646: sizeof(OnigRepeatRange) * n);
647: CHECK_NULL_RETURN_VAL(p, ONIGERR_MEMORY);
648: reg->repeat_range = p;
649: reg->repeat_range_alloc = n;
650: }
651: else {
652: p = reg->repeat_range;
653: }
654:
655: p[id].lower = lower;
656: p[id].upper = (IS_REPEAT_INFINITE(upper) ? 0x7fffffff : upper);
657: return 0;
658: }
659:
660: static int
661: compile_range_repeat_node(QuantifierNode* qn, int target_len, int empty_info,
662: regex_t* reg)
663: {
664: int r;
665: int num_repeat = reg->num_repeat;
666:
667: r = add_opcode(reg, qn->greedy ? OP_REPEAT : OP_REPEAT_NG);
668: if (r) return r;
669: r = add_mem_num(reg, num_repeat); /* OP_REPEAT ID */
670: reg->num_repeat++;
671: if (r) return r;
672: r = add_rel_addr(reg, target_len + SIZE_OP_REPEAT_INC);
673: if (r) return r;
674:
675: r = entry_repeat_range(reg, num_repeat, qn->lower, qn->upper);
676: if (r) return r;
677:
678: r = compile_tree_empty_check(qn->target, reg, empty_info);
679: if (r) return r;
680:
681: if (
682: #ifdef USE_SUBEXP_CALL
683: reg->num_call > 0 ||
684: #endif
685: IS_QUANTIFIER_IN_REPEAT(qn)) {
686: r = add_opcode(reg, qn->greedy ? OP_REPEAT_INC_SG : OP_REPEAT_INC_NG_SG);
687: }
688: else {
689: r = add_opcode(reg, qn->greedy ? OP_REPEAT_INC : OP_REPEAT_INC_NG);
690: }
691: if (r) return r;
692: r = add_mem_num(reg, num_repeat); /* OP_REPEAT ID */
693: return r;
694: }
695:
696: static int
697: is_anychar_star_quantifier(QuantifierNode* qn)
698: {
699: if (qn->greedy && IS_REPEAT_INFINITE(qn->upper) &&
700: NTYPE(qn->target) == N_ANYCHAR)
701: return 1;
702: else
703: return 0;
704: }
705:
706: #define QUANTIFIER_EXPAND_LIMIT_SIZE 50
707: #define CKN_ON (ckn > 0)
708:
709: #ifdef USE_COMBINATION_EXPLOSION_CHECK
710:
711: static int
712: compile_length_quantifier_node(QuantifierNode* qn, regex_t* reg)
713: {
714: int len, mod_tlen, cklen;
715: int ckn;
716: int infinite = IS_REPEAT_INFINITE(qn->upper);
717: int empty_info = qn->target_empty_info;
718: int tlen = compile_length_tree(qn->target, reg);
719:
720: if (tlen < 0) return tlen;
721:
722: ckn = ((reg->num_comb_exp_check > 0) ? qn->comb_exp_check_num : 0);
723:
724: cklen = (CKN_ON ? SIZE_STATE_CHECK_NUM: 0);
725:
726: /* anychar repeat */
727: if (NTYPE(qn->target) == N_ANYCHAR) {
728: if (qn->greedy && infinite) {
729: if (IS_NOT_NULL(qn->next_head_exact) && !CKN_ON)
730: return SIZE_OP_ANYCHAR_STAR_PEEK_NEXT + tlen * qn->lower + cklen;
731: else
732: return SIZE_OP_ANYCHAR_STAR + tlen * qn->lower + cklen;
733: }
734: }
735:
736: if (empty_info != 0)
737: mod_tlen = tlen + (SIZE_OP_NULL_CHECK_START + SIZE_OP_NULL_CHECK_END);
738: else
739: mod_tlen = tlen;
740:
741: if (infinite && qn->lower <= 1) {
742: if (qn->greedy) {
743: if (qn->lower == 1)
744: len = SIZE_OP_JUMP;
745: else
746: len = 0;
747:
748: len += SIZE_OP_PUSH + cklen + mod_tlen + SIZE_OP_JUMP;
749: }
750: else {
751: if (qn->lower == 0)
752: len = SIZE_OP_JUMP;
753: else
754: len = 0;
755:
756: len += mod_tlen + SIZE_OP_PUSH + cklen;
757: }
758: }
759: else if (qn->upper == 0) {
760: if (qn->is_refered != 0) /* /(?<n>..){0}/ */
761: len = SIZE_OP_JUMP + tlen;
762: else
763: len = 0;
764: }
765: else if (qn->upper == 1 && qn->greedy) {
766: if (qn->lower == 0) {
767: if (CKN_ON) {
768: len = SIZE_OP_STATE_CHECK_PUSH + tlen;
769: }
770: else {
771: len = SIZE_OP_PUSH + tlen;
772: }
773: }
774: else {
775: len = tlen;
776: }
777: }
778: else if (!qn->greedy && qn->upper == 1 && qn->lower == 0) { /* '??' */
779: len = SIZE_OP_PUSH + cklen + SIZE_OP_JUMP + tlen;
780: }
781: else {
782: len = SIZE_OP_REPEAT_INC
783: + mod_tlen + SIZE_OPCODE + SIZE_RELADDR + SIZE_MEMNUM;
784: if (CKN_ON)
785: len += SIZE_OP_STATE_CHECK;
786: }
787:
788: return len;
789: }
790:
791: static int
792: compile_quantifier_node(QuantifierNode* qn, regex_t* reg)
793: {
794: int r, mod_tlen;
795: int ckn;
796: int infinite = IS_REPEAT_INFINITE(qn->upper);
797: int empty_info = qn->target_empty_info;
798: int tlen = compile_length_tree(qn->target, reg);
799:
800: if (tlen < 0) return tlen;
801:
802: ckn = ((reg->num_comb_exp_check > 0) ? qn->comb_exp_check_num : 0);
803:
804: if (is_anychar_star_quantifier(qn)) {
805: r = compile_tree_n_times(qn->target, qn->lower, reg);
806: if (r) return r;
807: if (IS_NOT_NULL(qn->next_head_exact) && !CKN_ON) {
808: if (IS_MULTILINE(reg->options))
809: r = add_opcode(reg, OP_ANYCHAR_ML_STAR_PEEK_NEXT);
810: else
811: r = add_opcode(reg, OP_ANYCHAR_STAR_PEEK_NEXT);
812: if (r) return r;
813: if (CKN_ON) {
814: r = add_state_check_num(reg, ckn);
815: if (r) return r;
816: }
817:
818: return add_bytes(reg, NSTRING(qn->next_head_exact).s, 1);
819: }
820: else {
821: if (IS_MULTILINE(reg->options)) {
822: r = add_opcode(reg, (CKN_ON ?
823: OP_STATE_CHECK_ANYCHAR_ML_STAR
824: : OP_ANYCHAR_ML_STAR));
825: }
826: else {
827: r = add_opcode(reg, (CKN_ON ?
828: OP_STATE_CHECK_ANYCHAR_STAR
829: : OP_ANYCHAR_STAR));
830: }
831: if (r) return r;
832: if (CKN_ON)
833: r = add_state_check_num(reg, ckn);
834:
835: return r;
836: }
837: }
838:
839: if (empty_info != 0)
840: mod_tlen = tlen + (SIZE_OP_NULL_CHECK_START + SIZE_OP_NULL_CHECK_END);
841: else
842: mod_tlen = tlen;
843:
844: if (infinite && qn->lower <= 1) {
845: if (qn->greedy) {
846: if (qn->lower == 1) {
847: r = add_opcode_rel_addr(reg, OP_JUMP,
848: (CKN_ON ? SIZE_OP_STATE_CHECK_PUSH : SIZE_OP_PUSH));
849: if (r) return r;
850: }
851:
852: if (CKN_ON) {
853: r = add_opcode(reg, OP_STATE_CHECK_PUSH);
854: if (r) return r;
855: r = add_state_check_num(reg, ckn);
856: if (r) return r;
857: r = add_rel_addr(reg, mod_tlen + SIZE_OP_JUMP);
858: }
859: else {
860: r = add_opcode_rel_addr(reg, OP_PUSH, mod_tlen + SIZE_OP_JUMP);
861: }
862: if (r) return r;
863: r = compile_tree_empty_check(qn->target, reg, empty_info);
864: if (r) return r;
865: r = add_opcode_rel_addr(reg, OP_JUMP,
866: -(mod_tlen + (int )SIZE_OP_JUMP
867: + (int )(CKN_ON ? SIZE_OP_STATE_CHECK_PUSH : SIZE_OP_PUSH)));
868: }
869: else {
870: if (qn->lower == 0) {
871: r = add_opcode_rel_addr(reg, OP_JUMP, mod_tlen);
872: if (r) return r;
873: }
874: r = compile_tree_empty_check(qn->target, reg, empty_info);
875: if (r) return r;
876: if (CKN_ON) {
877: r = add_opcode(reg, OP_STATE_CHECK_PUSH_OR_JUMP);
878: if (r) return r;
879: r = add_state_check_num(reg, ckn);
880: if (r) return r;
881: r = add_rel_addr(reg,
882: -(mod_tlen + (int )SIZE_OP_STATE_CHECK_PUSH_OR_JUMP));
883: }
884: else
885: r = add_opcode_rel_addr(reg, OP_PUSH, -(mod_tlen + (int )SIZE_OP_PUSH));
886: }
887: }
888: else if (qn->upper == 0) {
889: if (qn->is_refered != 0) { /* /(?<n>..){0}/ */
890: r = add_opcode_rel_addr(reg, OP_JUMP, tlen);
891: if (r) return r;
892: r = compile_tree(qn->target, reg);
893: }
894: else
895: r = 0;
896: }
897: else if (qn->upper == 1 && qn->greedy) {
898: if (qn->lower == 0) {
899: if (CKN_ON) {
900: r = add_opcode(reg, OP_STATE_CHECK_PUSH);
901: if (r) return r;
902: r = add_state_check_num(reg, ckn);
903: if (r) return r;
904: r = add_rel_addr(reg, tlen);
905: }
906: else {
907: r = add_opcode_rel_addr(reg, OP_PUSH, tlen);
908: }
909: if (r) return r;
910: }
911:
912: r = compile_tree(qn->target, reg);
913: }
914: else if (!qn->greedy && qn->upper == 1 && qn->lower == 0) { /* '??' */
915: if (CKN_ON) {
916: r = add_opcode(reg, OP_STATE_CHECK_PUSH);
917: if (r) return r;
918: r = add_state_check_num(reg, ckn);
919: if (r) return r;
920: r = add_rel_addr(reg, SIZE_OP_JUMP);
921: }
922: else {
923: r = add_opcode_rel_addr(reg, OP_PUSH, SIZE_OP_JUMP);
924: }
925:
926: if (r) return r;
927: r = add_opcode_rel_addr(reg, OP_JUMP, tlen);
928: if (r) return r;
929: r = compile_tree(qn->target, reg);
930: }
931: else {
932: r = compile_range_repeat_node(qn, mod_tlen, empty_info, reg);
933: if (CKN_ON) {
934: if (r) return r;
935: r = add_opcode(reg, OP_STATE_CHECK);
936: if (r) return r;
937: r = add_state_check_num(reg, ckn);
938: }
939: }
940: return r;
941: }
942:
943: #else /* USE_COMBINATION_EXPLOSION_CHECK */
944:
945: static int
946: compile_length_quantifier_node(QuantifierNode* qn, regex_t* reg)
947: {
948: int len, mod_tlen;
949: int infinite = IS_REPEAT_INFINITE(qn->upper);
950: int empty_info = qn->target_empty_info;
951: int tlen = compile_length_tree(qn->target, reg);
952:
953: if (tlen < 0) return tlen;
954:
955: /* anychar repeat */
956: if (NTYPE(qn->target) == N_ANYCHAR) {
957: if (qn->greedy && infinite) {
958: if (IS_NOT_NULL(qn->next_head_exact))
959: return SIZE_OP_ANYCHAR_STAR_PEEK_NEXT + tlen * qn->lower;
960: else
961: return SIZE_OP_ANYCHAR_STAR + tlen * qn->lower;
962: }
963: }
964:
965: if (empty_info != 0)
966: mod_tlen = tlen + (SIZE_OP_NULL_CHECK_START + SIZE_OP_NULL_CHECK_END);
967: else
968: mod_tlen = tlen;
969:
970: if (infinite &&
971: (qn->lower <= 1 || tlen * qn->lower <= QUANTIFIER_EXPAND_LIMIT_SIZE)) {
972: if (qn->lower == 1 && tlen > QUANTIFIER_EXPAND_LIMIT_SIZE) {
973: len = SIZE_OP_JUMP;
974: }
975: else {
976: len = tlen * qn->lower;
977: }
978:
979: if (qn->greedy) {
980: if (IS_NOT_NULL(qn->head_exact))
981: len += SIZE_OP_PUSH_OR_JUMP_EXACT1 + mod_tlen + SIZE_OP_JUMP;
982: else if (IS_NOT_NULL(qn->next_head_exact))
983: len += SIZE_OP_PUSH_IF_PEEK_NEXT + mod_tlen + SIZE_OP_JUMP;
984: else
985: len += SIZE_OP_PUSH + mod_tlen + SIZE_OP_JUMP;
986: }
987: else
988: len += SIZE_OP_JUMP + mod_tlen + SIZE_OP_PUSH;
989: }
990: else if (qn->upper == 0 && qn->is_refered != 0) { /* /(?<n>..){0}/ */
991: len = SIZE_OP_JUMP + tlen;
992: }
993: else if (!infinite && qn->greedy &&
994: (qn->upper == 1 || (tlen + SIZE_OP_PUSH) * qn->upper
995: <= QUANTIFIER_EXPAND_LIMIT_SIZE)) {
996: len = tlen * qn->lower;
997: len += (SIZE_OP_PUSH + tlen) * (qn->upper - qn->lower);
998: }
999: else if (!qn->greedy && qn->upper == 1 && qn->lower == 0) { /* '??' */
1000: len = SIZE_OP_PUSH + SIZE_OP_JUMP + tlen;
1001: }
1002: else {
1003: len = SIZE_OP_REPEAT_INC
1004: + mod_tlen + SIZE_OPCODE + SIZE_RELADDR + SIZE_MEMNUM;
1005: }
1006:
1007: return len;
1008: }
1009:
1010: static int
1011: compile_quantifier_node(QuantifierNode* qn, regex_t* reg)
1012: {
1013: int i, r, mod_tlen;
1014: int infinite = IS_REPEAT_INFINITE(qn->upper);
1015: int empty_info = qn->target_empty_info;
1016: int tlen = compile_length_tree(qn->target, reg);
1017:
1018: if (tlen < 0) return tlen;
1019:
1020: if (is_anychar_star_quantifier(qn)) {
1021: r = compile_tree_n_times(qn->target, qn->lower, reg);
1022: if (r) return r;
1023: if (IS_NOT_NULL(qn->next_head_exact)) {
1024: if (IS_MULTILINE(reg->options))
1025: r = add_opcode(reg, OP_ANYCHAR_ML_STAR_PEEK_NEXT);
1026: else
1027: r = add_opcode(reg, OP_ANYCHAR_STAR_PEEK_NEXT);
1028: if (r) return r;
1029: return add_bytes(reg, NSTRING(qn->next_head_exact).s, 1);
1030: }
1031: else {
1032: if (IS_MULTILINE(reg->options))
1033: return add_opcode(reg, OP_ANYCHAR_ML_STAR);
1034: else
1035: return add_opcode(reg, OP_ANYCHAR_STAR);
1036: }
1037: }
1038:
1039: if (empty_info != 0)
1040: mod_tlen = tlen + (SIZE_OP_NULL_CHECK_START + SIZE_OP_NULL_CHECK_END);
1041: else
1042: mod_tlen = tlen;
1043:
1044: if (infinite &&
1045: (qn->lower <= 1 || tlen * qn->lower <= QUANTIFIER_EXPAND_LIMIT_SIZE)) {
1046: if (qn->lower == 1 && tlen > QUANTIFIER_EXPAND_LIMIT_SIZE) {
1047: if (qn->greedy) {
1048: if (IS_NOT_NULL(qn->head_exact))
1049: r = add_opcode_rel_addr(reg, OP_JUMP, SIZE_OP_PUSH_OR_JUMP_EXACT1);
1050: else if (IS_NOT_NULL(qn->next_head_exact))
1051: r = add_opcode_rel_addr(reg, OP_JUMP, SIZE_OP_PUSH_IF_PEEK_NEXT);
1052: else
1053: r = add_opcode_rel_addr(reg, OP_JUMP, SIZE_OP_PUSH);
1054: }
1055: else {
1056: r = add_opcode_rel_addr(reg, OP_JUMP, SIZE_OP_JUMP);
1057: }
1058: if (r) return r;
1059: }
1060: else {
1061: r = compile_tree_n_times(qn->target, qn->lower, reg);
1062: if (r) return r;
1063: }
1064:
1065: if (qn->greedy) {
1066: if (IS_NOT_NULL(qn->head_exact)) {
1067: r = add_opcode_rel_addr(reg, OP_PUSH_OR_JUMP_EXACT1,
1068: mod_tlen + SIZE_OP_JUMP);
1069: if (r) return r;
1070: add_bytes(reg, NSTRING(qn->head_exact).s, 1);
1071: r = compile_tree_empty_check(qn->target, reg, empty_info);
1072: if (r) return r;
1073: r = add_opcode_rel_addr(reg, OP_JUMP,
1074: -(mod_tlen + (int )SIZE_OP_JUMP + (int )SIZE_OP_PUSH_OR_JUMP_EXACT1));
1075: }
1076: else if (IS_NOT_NULL(qn->next_head_exact)) {
1077: r = add_opcode_rel_addr(reg, OP_PUSH_IF_PEEK_NEXT,
1078: mod_tlen + SIZE_OP_JUMP);
1079: if (r) return r;
1080: add_bytes(reg, NSTRING(qn->next_head_exact).s, 1);
1081: r = compile_tree_empty_check(qn->target, reg, empty_info);
1082: if (r) return r;
1083: r = add_opcode_rel_addr(reg, OP_JUMP,
1084: -(mod_tlen + (int )SIZE_OP_JUMP + (int )SIZE_OP_PUSH_IF_PEEK_NEXT));
1085: }
1086: else {
1087: r = add_opcode_rel_addr(reg, OP_PUSH, mod_tlen + SIZE_OP_JUMP);
1088: if (r) return r;
1089: r = compile_tree_empty_check(qn->target, reg, empty_info);
1090: if (r) return r;
1091: r = add_opcode_rel_addr(reg, OP_JUMP,
1092: -(mod_tlen + (int )SIZE_OP_JUMP + (int )SIZE_OP_PUSH));
1093: }
1094: }
1095: else {
1096: r = add_opcode_rel_addr(reg, OP_JUMP, mod_tlen);
1097: if (r) return r;
1098: r = compile_tree_empty_check(qn->target, reg, empty_info);
1099: if (r) return r;
1100: r = add_opcode_rel_addr(reg, OP_PUSH, -(mod_tlen + (int )SIZE_OP_PUSH));
1101: }
1102: }
1103: else if (qn->upper == 0 && qn->is_refered != 0) { /* /(?<n>..){0}/ */
1104: r = add_opcode_rel_addr(reg, OP_JUMP, tlen);
1105: if (r) return r;
1106: r = compile_tree(qn->target, reg);
1107: }
1108: else if (!infinite && qn->greedy &&
1109: (qn->upper == 1 || (tlen + SIZE_OP_PUSH) * qn->upper
1110: <= QUANTIFIER_EXPAND_LIMIT_SIZE)) {
1111: int n = qn->upper - qn->lower;
1112:
1113: r = compile_tree_n_times(qn->target, qn->lower, reg);
1114: if (r) return r;
1115:
1116: for (i = 0; i < n; i++) {
1117: r = add_opcode_rel_addr(reg, OP_PUSH,
1118: (n - i) * tlen + (n - i - 1) * SIZE_OP_PUSH);
1119: if (r) return r;
1120: r = compile_tree(qn->target, reg);
1121: if (r) return r;
1122: }
1123: }
1124: else if (!qn->greedy && qn->upper == 1 && qn->lower == 0) { /* '??' */
1125: r = add_opcode_rel_addr(reg, OP_PUSH, SIZE_OP_JUMP);
1126: if (r) return r;
1127: r = add_opcode_rel_addr(reg, OP_JUMP, tlen);
1128: if (r) return r;
1129: r = compile_tree(qn->target, reg);
1130: }
1131: else {
1132: r = compile_range_repeat_node(qn, mod_tlen, empty_info, reg);
1133: }
1134: return r;
1135: }
1136: #endif /* USE_COMBINATION_EXPLOSION_CHECK */
1137:
1138: static int
1139: compile_length_option_node(EffectNode* node, regex_t* reg)
1140: {
1141: int tlen;
1142: OnigOptionType prev = reg->options;
1143:
1144: reg->options = node->option;
1145: tlen = compile_length_tree(node->target, reg);
1146: reg->options = prev;
1147:
1148: if (tlen < 0) return tlen;
1149:
1150: if (IS_DYNAMIC_OPTION(prev ^ node->option)) {
1151: return SIZE_OP_SET_OPTION_PUSH + SIZE_OP_SET_OPTION + SIZE_OP_FAIL
1152: + tlen + SIZE_OP_SET_OPTION;
1153: }
1154: else
1155: return tlen;
1156: }
1157:
1158: static int
1159: compile_option_node(EffectNode* node, regex_t* reg)
1160: {
1161: int r;
1162: OnigOptionType prev = reg->options;
1163:
1164: if (IS_DYNAMIC_OPTION(prev ^ node->option)) {
1165: r = add_opcode_option(reg, OP_SET_OPTION_PUSH, node->option);
1166: if (r) return r;
1167: r = add_opcode_option(reg, OP_SET_OPTION, prev);
1168: if (r) return r;
1169: r = add_opcode(reg, OP_FAIL);
1170: if (r) return r;
1171: }
1172:
1173: reg->options = node->option;
1174: r = compile_tree(node->target, reg);
1175: reg->options = prev;
1176:
1177: if (IS_DYNAMIC_OPTION(prev ^ node->option)) {
1178: if (r) return r;
1179: r = add_opcode_option(reg, OP_SET_OPTION, prev);
1180: }
1181: return r;
1182: }
1183:
1184: static int
1185: compile_length_effect_node(EffectNode* node, regex_t* reg)
1186: {
1187: int len;
1188: int tlen;
1189:
1190: if (node->type == EFFECT_OPTION)
1191: return compile_length_option_node(node, reg);
1192:
1193: if (node->target) {
1194: tlen = compile_length_tree(node->target, reg);
1195: if (tlen < 0) return tlen;
1196: }
1197: else
1198: tlen = 0;
1199:
1200: switch (node->type) {
1201: case EFFECT_MEMORY:
1202: #ifdef USE_SUBEXP_CALL
1203: if (IS_EFFECT_CALLED(node)) {
1204: len = SIZE_OP_MEMORY_START_PUSH + tlen
1205: + SIZE_OP_CALL + SIZE_OP_JUMP + SIZE_OP_RETURN;
1206: if (BIT_STATUS_AT(reg->bt_mem_end, node->regnum))
1207: len += (IS_EFFECT_RECURSION(node)
1208: ? SIZE_OP_MEMORY_END_PUSH_REC : SIZE_OP_MEMORY_END_PUSH);
1209: else
1210: len += (IS_EFFECT_RECURSION(node)
1211: ? SIZE_OP_MEMORY_END_REC : SIZE_OP_MEMORY_END);
1212: }
1213: else
1214: #endif
1215: {
1216: if (BIT_STATUS_AT(reg->bt_mem_start, node->regnum))
1217: len = SIZE_OP_MEMORY_START_PUSH;
1218: else
1219: len = SIZE_OP_MEMORY_START;
1220:
1221: len += tlen + (BIT_STATUS_AT(reg->bt_mem_end, node->regnum)
1222: ? SIZE_OP_MEMORY_END_PUSH : SIZE_OP_MEMORY_END);
1223: }
1224: break;
1225:
1226: case EFFECT_STOP_BACKTRACK:
1227: if (IS_EFFECT_STOP_BT_SIMPLE_REPEAT(node)) {
1228: QuantifierNode* qn = &NQUANTIFIER(node->target);
1229: tlen = compile_length_tree(qn->target, reg);
1230: if (tlen < 0) return tlen;
1231:
1232: len = tlen * qn->lower
1233: + SIZE_OP_PUSH + tlen + SIZE_OP_POP + SIZE_OP_JUMP;
1234: }
1235: else {
1236: len = SIZE_OP_PUSH_STOP_BT + tlen + SIZE_OP_POP_STOP_BT;
1237: }
1238: break;
1239:
1240: default:
1241: return ONIGERR_TYPE_BUG;
1242: break;
1243: }
1244:
1245: return len;
1246: }
1247:
1248: static int get_char_length_tree(Node* node, regex_t* reg, int* len);
1249:
1250: static int
1251: compile_effect_node(EffectNode* node, regex_t* reg)
1252: {
1253: int r, len;
1254:
1255: if (node->type == EFFECT_OPTION)
1256: return compile_option_node(node, reg);
1257:
1258: switch (node->type) {
1259: case EFFECT_MEMORY:
1260: #ifdef USE_SUBEXP_CALL
1261: if (IS_EFFECT_CALLED(node)) {
1262: r = add_opcode(reg, OP_CALL);
1263: if (r) return r;
1264: node->call_addr = BBUF_GET_OFFSET_POS(reg) + SIZE_ABSADDR + SIZE_OP_JUMP;
1265: node->state |= NST_ADDR_FIXED;
1266: r = add_abs_addr(reg, (int )node->call_addr);
1267: if (r) return r;
1268: len = compile_length_tree(node->target, reg);
1269: len += (SIZE_OP_MEMORY_START_PUSH + SIZE_OP_RETURN);
1270: if (BIT_STATUS_AT(reg->bt_mem_end, node->regnum))
1271: len += (IS_EFFECT_RECURSION(node)
1272: ? SIZE_OP_MEMORY_END_PUSH_REC : SIZE_OP_MEMORY_END_PUSH);
1273: else
1274: len += (IS_EFFECT_RECURSION(node)
1275: ? SIZE_OP_MEMORY_END_REC : SIZE_OP_MEMORY_END);
1276:
1277: r = add_opcode_rel_addr(reg, OP_JUMP, len);
1278: if (r) return r;
1279: }
1280: #endif
1281: if (BIT_STATUS_AT(reg->bt_mem_start, node->regnum))
1282: r = add_opcode(reg, OP_MEMORY_START_PUSH);
1283: else
1284: r = add_opcode(reg, OP_MEMORY_START);
1285: if (r) return r;
1286: r = add_mem_num(reg, node->regnum);
1287: if (r) return r;
1288: r = compile_tree(node->target, reg);
1289: if (r) return r;
1290: #ifdef USE_SUBEXP_CALL
1291: if (IS_EFFECT_CALLED(node)) {
1292: if (BIT_STATUS_AT(reg->bt_mem_end, node->regnum))
1293: r = add_opcode(reg, (IS_EFFECT_RECURSION(node)
1294: ? OP_MEMORY_END_PUSH_REC : OP_MEMORY_END_PUSH));
1295: else
1296: r = add_opcode(reg, (IS_EFFECT_RECURSION(node)
1297: ? OP_MEMORY_END_REC : OP_MEMORY_END));
1298:
1299: if (r) return r;
1300: r = add_mem_num(reg, node->regnum);
1301: if (r) return r;
1302: r = add_opcode(reg, OP_RETURN);
1303: }
1304: else
1305: #endif
1306: {
1307: if (BIT_STATUS_AT(reg->bt_mem_end, node->regnum))
1308: r = add_opcode(reg, OP_MEMORY_END_PUSH);
1309: else
1310: r = add_opcode(reg, OP_MEMORY_END);
1311: if (r) return r;
1312: r = add_mem_num(reg, node->regnum);
1313: }
1314: break;
1315:
1316: case EFFECT_STOP_BACKTRACK:
1317: if (IS_EFFECT_STOP_BT_SIMPLE_REPEAT(node)) {
1318: QuantifierNode* qn = &NQUANTIFIER(node->target);
1319: r = compile_tree_n_times(qn->target, qn->lower, reg);
1320: if (r) return r;
1321:
1322: len = compile_length_tree(qn->target, reg);
1323: if (len < 0) return len;
1324:
1325: r = add_opcode_rel_addr(reg, OP_PUSH, len + SIZE_OP_POP + SIZE_OP_JUMP);
1326: if (r) return r;
1327: r = compile_tree(qn->target, reg);
1328: if (r) return r;
1329: r = add_opcode(reg, OP_POP);
1330: if (r) return r;
1331: r = add_opcode_rel_addr(reg, OP_JUMP,
1332: -((int )SIZE_OP_PUSH + len + (int )SIZE_OP_POP + (int )SIZE_OP_JUMP));
1333: }
1334: else {
1335: r = add_opcode(reg, OP_PUSH_STOP_BT);
1336: if (r) return r;
1337: r = compile_tree(node->target, reg);
1338: if (r) return r;
1339: r = add_opcode(reg, OP_POP_STOP_BT);
1340: }
1341: break;
1342:
1343: default:
1344: return ONIGERR_TYPE_BUG;
1345: break;
1346: }
1347:
1348: return r;
1349: }
1350:
1351: static int
1352: compile_length_anchor_node(AnchorNode* node, regex_t* reg)
1353: {
1354: int len;
1355: int tlen = 0;
1356:
1357: if (node->target) {
1358: tlen = compile_length_tree(node->target, reg);
1359: if (tlen < 0) return tlen;
1360: }
1361:
1362: switch (node->type) {
1363: case ANCHOR_PREC_READ:
1364: len = SIZE_OP_PUSH_POS + tlen + SIZE_OP_POP_POS;
1365: break;
1366: case ANCHOR_PREC_READ_NOT:
1367: len = SIZE_OP_PUSH_POS_NOT + tlen + SIZE_OP_FAIL_POS;
1368: break;
1369: case ANCHOR_LOOK_BEHIND:
1370: len = SIZE_OP_LOOK_BEHIND + tlen;
1371: break;
1372: case ANCHOR_LOOK_BEHIND_NOT:
1373: len = SIZE_OP_PUSH_LOOK_BEHIND_NOT + tlen + SIZE_OP_FAIL_LOOK_BEHIND_NOT;
1374: break;
1375:
1376: default:
1377: len = SIZE_OPCODE;
1378: break;
1379: }
1380:
1381: return len;
1382: }
1383:
1384: static int
1385: compile_anchor_node(AnchorNode* node, regex_t* reg)
1386: {
1387: int r, len;
1388:
1389: switch (node->type) {
1390: case ANCHOR_BEGIN_BUF: r = add_opcode(reg, OP_BEGIN_BUF); break;
1391: case ANCHOR_END_BUF: r = add_opcode(reg, OP_END_BUF); break;
1392: case ANCHOR_BEGIN_LINE: r = add_opcode(reg, OP_BEGIN_LINE); break;
1393: case ANCHOR_END_LINE: r = add_opcode(reg, OP_END_LINE); break;
1394: case ANCHOR_SEMI_END_BUF: r = add_opcode(reg, OP_SEMI_END_BUF); break;
1395: case ANCHOR_BEGIN_POSITION: r = add_opcode(reg, OP_BEGIN_POSITION); break;
1396:
1397: case ANCHOR_WORD_BOUND: r = add_opcode(reg, OP_WORD_BOUND); break;
1398: case ANCHOR_NOT_WORD_BOUND: r = add_opcode(reg, OP_NOT_WORD_BOUND); break;
1399: #ifdef USE_WORD_BEGIN_END
1400: case ANCHOR_WORD_BEGIN: r = add_opcode(reg, OP_WORD_BEGIN); break;
1401: case ANCHOR_WORD_END: r = add_opcode(reg, OP_WORD_END); break;
1402: #endif
1403:
1404: case ANCHOR_PREC_READ:
1405: r = add_opcode(reg, OP_PUSH_POS);
1406: if (r) return r;
1407: r = compile_tree(node->target, reg);
1408: if (r) return r;
1409: r = add_opcode(reg, OP_POP_POS);
1410: break;
1411:
1412: case ANCHOR_PREC_READ_NOT:
1413: len = compile_length_tree(node->target, reg);
1414: if (len < 0) return len;
1415: r = add_opcode_rel_addr(reg, OP_PUSH_POS_NOT, len + SIZE_OP_FAIL_POS);
1416: if (r) return r;
1417: r = compile_tree(node->target, reg);
1418: if (r) return r;
1419: r = add_opcode(reg, OP_FAIL_POS);
1420: break;
1421:
1422: case ANCHOR_LOOK_BEHIND:
1423: {
1424: int n;
1425: r = add_opcode(reg, OP_LOOK_BEHIND);
1426: if (r) return r;
1427: if (node->char_len < 0) {
1428: r = get_char_length_tree(node->target, reg, &n);
1429: if (r) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
1430: }
1431: else
1432: n = node->char_len;
1433: r = add_length(reg, n);
1434: if (r) return r;
1435: r = compile_tree(node->target, reg);
1436: }
1437: break;
1438:
1439: case ANCHOR_LOOK_BEHIND_NOT:
1440: {
1441: int n;
1442: len = compile_length_tree(node->target, reg);
1443: r = add_opcode_rel_addr(reg, OP_PUSH_LOOK_BEHIND_NOT,
1444: len + SIZE_OP_FAIL_LOOK_BEHIND_NOT);
1445: if (r) return r;
1446: if (node->char_len < 0) {
1447: r = get_char_length_tree(node->target, reg, &n);
1448: if (r) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
1449: }
1450: else
1451: n = node->char_len;
1452: r = add_length(reg, n);
1453: if (r) return r;
1454: r = compile_tree(node->target, reg);
1455: if (r) return r;
1456: r = add_opcode(reg, OP_FAIL_LOOK_BEHIND_NOT);
1457: }
1458: break;
1459:
1460: default:
1461: return ONIGERR_TYPE_BUG;
1462: break;
1463: }
1464:
1465: return r;
1466: }
1467:
1468: static int
1469: compile_length_tree(Node* node, regex_t* reg)
1470: {
1471: int len, type, r;
1472:
1473: type = NTYPE(node);
1474: switch (type) {
1475: case N_LIST:
1476: len = 0;
1477: do {
1478: r = compile_length_tree(NCONS(node).left, reg);
1479: if (r < 0) return r;
1480: len += r;
1481: } while (IS_NOT_NULL(node = NCONS(node).right));
1482: r = len;
1483: break;
1484:
1485: case N_ALT:
1486: {
1487: int n;
1488:
1489: n = r = 0;
1490: do {
1491: r += compile_length_tree(NCONS(node).left, reg);
1492: n++;
1493: } while (IS_NOT_NULL(node = NCONS(node).right));
1494: r += (SIZE_OP_PUSH + SIZE_OP_JUMP) * (n - 1);
1495: }
1496: break;
1497:
1498: case N_STRING:
1499: if (NSTRING_IS_RAW(node))
1500: r = compile_length_string_raw_node(&(NSTRING(node)), reg);
1501: else
1502: r = compile_length_string_node(node, reg);
1503: break;
1504:
1505: case N_CCLASS:
1506: r = compile_length_cclass_node(&(NCCLASS(node)), reg);
1507: break;
1508:
1509: case N_CTYPE:
1510: case N_ANYCHAR:
1511: r = SIZE_OPCODE;
1512: break;
1513:
1514: case N_BACKREF:
1515: {
1516: BackrefNode* br = &(NBACKREF(node));
1517:
1518: #ifdef USE_BACKREF_AT_LEVEL
1519: if (IS_BACKREF_NEST_LEVEL(br)) {
1520: r = SIZE_OPCODE + SIZE_OPTION + SIZE_LENGTH +
1521: SIZE_LENGTH + (SIZE_MEMNUM * br->back_num);
1522: }
1523: else
1524: #endif
1525: if (br->back_num == 1) {
1526: r = ((!IS_IGNORECASE(reg->options) && br->back_static[0] <= 2)
1527: ? SIZE_OPCODE : (SIZE_OPCODE + SIZE_MEMNUM));
1528: }
1529: else {
1530: r = SIZE_OPCODE + SIZE_LENGTH + (SIZE_MEMNUM * br->back_num);
1531: }
1532: }
1533: break;
1534:
1535: #ifdef USE_SUBEXP_CALL
1536: case N_CALL:
1537: r = SIZE_OP_CALL;
1538: break;
1539: #endif
1540:
1541: case N_QUANTIFIER:
1542: r = compile_length_quantifier_node(&(NQUANTIFIER(node)), reg);
1543: break;
1544:
1545: case N_EFFECT:
1546: r = compile_length_effect_node(&NEFFECT(node), reg);
1547: break;
1548:
1549: case N_ANCHOR:
1550: r = compile_length_anchor_node(&(NANCHOR(node)), reg);
1551: break;
1552:
1553: default:
1554: return ONIGERR_TYPE_BUG;
1555: break;
1556: }
1557:
1558: return r;
1559: }
1560:
1561: static int
1562: compile_tree(Node* node, regex_t* reg)
1563: {
1564: int n, type, len, pos, r = 0;
1565:
1566: type = NTYPE(node);
1567: switch (type) {
1568: case N_LIST:
1569: do {
1570: r = compile_tree(NCONS(node).left, reg);
1571: } while (r == 0 && IS_NOT_NULL(node = NCONS(node).right));
1572: break;
1573:
1574: case N_ALT:
1575: {
1576: Node* x = node;
1577: len = 0;
1578: do {
1579: len += compile_length_tree(NCONS(x).left, reg);
1580: if (NCONS(x).right != NULL) {
1581: len += SIZE_OP_PUSH + SIZE_OP_JUMP;
1582: }
1583: } while (IS_NOT_NULL(x = NCONS(x).right));
1584: pos = reg->used + len; /* goal position */
1585:
1586: do {
1587: len = compile_length_tree(NCONS(node).left, reg);
1588: if (IS_NOT_NULL(NCONS(node).right)) {
1589: r = add_opcode_rel_addr(reg, OP_PUSH, len + SIZE_OP_JUMP);
1590: if (r) break;
1591: }
1592: r = compile_tree(NCONS(node).left, reg);
1593: if (r) break;
1594: if (IS_NOT_NULL(NCONS(node).right)) {
1595: len = pos - (reg->used + SIZE_OP_JUMP);
1596: r = add_opcode_rel_addr(reg, OP_JUMP, len);
1597: if (r) break;
1598: }
1599: } while (IS_NOT_NULL(node = NCONS(node).right));
1600: }
1601: break;
1602:
1603: case N_STRING:
1604: if (NSTRING_IS_RAW(node))
1605: r = compile_string_raw_node(&(NSTRING(node)), reg);
1606: else
1607: r = compile_string_node(node, reg);
1608: break;
1609:
1610: case N_CCLASS:
1611: r = compile_cclass_node(&(NCCLASS(node)), reg);
1612: break;
1613:
1614: case N_CTYPE:
1615: {
1616: int op;
1617:
1618: switch (NCTYPE(node).type) {
1619: case CTYPE_WORD: op = OP_WORD; break;
1620: case CTYPE_NOT_WORD: op = OP_NOT_WORD; break;
1621: default:
1622: return ONIGERR_TYPE_BUG;
1623: break;
1624: }
1625: r = add_opcode(reg, op);
1626: }
1627: break;
1628:
1629: case N_ANYCHAR:
1630: if (IS_MULTILINE(reg->options))
1631: r = add_opcode(reg, OP_ANYCHAR_ML);
1632: else
1633: r = add_opcode(reg, OP_ANYCHAR);
1634: break;
1635:
1636: case N_BACKREF:
1637: {
1638: BackrefNode* br = &(NBACKREF(node));
1639:
1640: #ifdef USE_BACKREF_AT_LEVEL
1641: if (IS_BACKREF_NEST_LEVEL(br)) {
1642: r = add_opcode(reg, OP_BACKREF_AT_LEVEL);
1643: if (r) return r;
1644: r = add_option(reg, (reg->options & ONIG_OPTION_IGNORECASE));
1645: if (r) return r;
1646: r = add_length(reg, br->nest_level);
1647: if (r) return r;
1648:
1649: goto add_bacref_mems;
1650: }
1651: else
1652: #endif
1653: if (br->back_num == 1) {
1654: n = br->back_static[0];
1655: if (IS_IGNORECASE(reg->options)) {
1656: r = add_opcode(reg, OP_BACKREFN_IC);
1657: if (r) return r;
1658: r = add_mem_num(reg, n);
1659: }
1660: else {
1661: switch (n) {
1662: case 1: r = add_opcode(reg, OP_BACKREF1); break;
1663: case 2: r = add_opcode(reg, OP_BACKREF2); break;
1664: default:
1665: r = add_opcode(reg, OP_BACKREFN);
1666: if (r) return r;
1667: r = add_mem_num(reg, n);
1668: break;
1669: }
1670: }
1671: }
1672: else {
1673: int i;
1674: int* p;
1675:
1676: if (IS_IGNORECASE(reg->options)) {
1677: r = add_opcode(reg, OP_BACKREF_MULTI_IC);
1678: }
1679: else {
1680: r = add_opcode(reg, OP_BACKREF_MULTI);
1681: }
1682: if (r) return r;
1683:
1684: #ifdef USE_BACKREF_AT_LEVEL
1685: add_bacref_mems:
1686: #endif
1687: r = add_length(reg, br->back_num);
1688: if (r) return r;
1689: p = BACKREFS_P(br);
1690: for (i = br->back_num - 1; i >= 0; i--) {
1691: r = add_mem_num(reg, p[i]);
1692: if (r) return r;
1693: }
1694: }
1695: }
1696: break;
1697:
1698: #ifdef USE_SUBEXP_CALL
1699: case N_CALL:
1700: r = compile_call(&(NCALL(node)), reg);
1701: break;
1702: #endif
1703:
1704: case N_QUANTIFIER:
1705: r = compile_quantifier_node(&(NQUANTIFIER(node)), reg);
1706: break;
1707:
1708: case N_EFFECT:
1709: r = compile_effect_node(&NEFFECT(node), reg);
1710: break;
1711:
1712: case N_ANCHOR:
1713: r = compile_anchor_node(&(NANCHOR(node)), reg);
1714: break;
1715:
1716: default:
1717: #ifdef ONIG_DEBUG
1718: fprintf(stderr, "compile_tree: undefined node type %d\n", NTYPE(node));
1719: #endif
1720: break;
1721: }
1722:
1723: return r;
1724: }
1725:
1726: #ifdef USE_NAMED_GROUP
1727:
1728: static int
1729: noname_disable_map(Node** plink, GroupNumRemap* map, int* counter)
1730: {
1731: int r = 0;
1732: Node* node = *plink;
1733:
1734: switch (NTYPE(node)) {
1735: case N_LIST:
1736: case N_ALT:
1737: do {
1738: r = noname_disable_map(&(NCONS(node).left), map, counter);
1739: } while (r == 0 && IS_NOT_NULL(node = NCONS(node).right));
1740: break;
1741:
1742: case N_QUANTIFIER:
1743: {
1744: Node** ptarget = &(NQUANTIFIER(node).target);
1745: Node* old = *ptarget;
1746: r = noname_disable_map(ptarget, map, counter);
1747: if (*ptarget != old && NTYPE(*ptarget) == N_QUANTIFIER) {
1748: onig_reduce_nested_quantifier(node, *ptarget);
1749: }
1750: }
1751: break;
1752:
1753: case N_EFFECT:
1754: {
1755: EffectNode* en = &(NEFFECT(node));
1756: if (en->type == EFFECT_MEMORY) {
1757: if (IS_EFFECT_NAMED_GROUP(en)) {
1758: (*counter)++;
1759: map[en->regnum].new_val = *counter;
1760: en->regnum = *counter;
1761: r = noname_disable_map(&(en->target), map, counter);
1762: }
1763: else {
1764: *plink = en->target;
1765: en->target = NULL_NODE;
1766: onig_node_free(node);
1767: r = noname_disable_map(plink, map, counter);
1768: }
1769: }
1770: else
1771: r = noname_disable_map(&(en->target), map, counter);
1772: }
1773: break;
1774:
1775: default:
1776: break;
1777: }
1778:
1779: return r;
1780: }
1781:
1782: static int
1783: renumber_node_backref(Node* node, GroupNumRemap* map)
1784: {
1785: int i, pos, n, old_num;
1786: int *backs;
1787: BackrefNode* bn = &(NBACKREF(node));
1788:
1789: if (! IS_BACKREF_NAME_REF(bn))
1790: return ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED;
1791:
1792: old_num = bn->back_num;
1793: if (IS_NULL(bn->back_dynamic))
1794: backs = bn->back_static;
1795: else
1796: backs = bn->back_dynamic;
1797:
1798: for (i = 0, pos = 0; i < old_num; i++) {
1799: n = map[backs[i]].new_val;
1800: if (n > 0) {
1801: backs[pos] = n;
1802: pos++;
1803: }
1804: }
1805:
1806: bn->back_num = pos;
1807: return 0;
1808: }
1809:
1810: static int
1811: renumber_by_map(Node* node, GroupNumRemap* map)
1812: {
1813: int r = 0;
1814:
1815: switch (NTYPE(node)) {
1816: case N_LIST:
1817: case N_ALT:
1818: do {
1819: r = renumber_by_map(NCONS(node).left, map);
1820: } while (r == 0 && IS_NOT_NULL(node = NCONS(node).right));
1821: break;
1822: case N_QUANTIFIER:
1823: r = renumber_by_map(NQUANTIFIER(node).target, map);
1824: break;
1825: case N_EFFECT:
1826: r = renumber_by_map(NEFFECT(node).target, map);
1827: break;
1828:
1829: case N_BACKREF:
1830: r = renumber_node_backref(node, map);
1831: break;
1832:
1833: default:
1834: break;
1835: }
1836:
1837: return r;
1838: }
1839:
1840: static int
1841: numbered_ref_check(Node* node)
1842: {
1843: int r = 0;
1844:
1845: switch (NTYPE(node)) {
1846: case N_LIST:
1847: case N_ALT:
1848: do {
1849: r = numbered_ref_check(NCONS(node).left);
1850: } while (r == 0 && IS_NOT_NULL(node = NCONS(node).right));
1851: break;
1852: case N_QUANTIFIER:
1853: r = numbered_ref_check(NQUANTIFIER(node).target);
1854: break;
1855: case N_EFFECT:
1856: r = numbered_ref_check(NEFFECT(node).target);
1857: break;
1858:
1859: case N_BACKREF:
1860: if (! IS_BACKREF_NAME_REF(&(NBACKREF(node))))
1861: return ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED;
1862: break;
1863:
1864: default:
1865: break;
1866: }
1867:
1868: return r;
1869: }
1870:
1871: static int
1872: disable_noname_group_capture(Node** root, regex_t* reg, ScanEnv* env)
1873: {
1874: int r, i, pos, counter;
1875: BitStatusType loc;
1876: GroupNumRemap* map;
1877:
1878: map = (GroupNumRemap* )xalloca(sizeof(GroupNumRemap) * (env->num_mem + 1));
1879: CHECK_NULL_RETURN_VAL(map, ONIGERR_MEMORY);
1880: for (i = 1; i <= env->num_mem; i++) {
1881: map[i].new_val = 0;
1882: }
1883: counter = 0;
1884: r = noname_disable_map(root, map, &counter);
1885: if (r != 0) return r;
1886:
1887: r = renumber_by_map(*root, map);
1888: if (r != 0) return r;
1889:
1890: for (i = 1, pos = 1; i <= env->num_mem; i++) {
1891: if (map[i].new_val > 0) {
1892: SCANENV_MEM_NODES(env)[pos] = SCANENV_MEM_NODES(env)[i];
1893: pos++;
1894: }
1895: }
1896:
1897: loc = env->capture_history;
1898: BIT_STATUS_CLEAR(env->capture_history);
1899: for (i = 1; i <= ONIG_MAX_CAPTURE_HISTORY_GROUP; i++) {
1900: if (BIT_STATUS_AT(loc, i)) {
1901: BIT_STATUS_ON_AT_SIMPLE(env->capture_history, map[i].new_val);
1902: }
1903: }
1904:
1905: env->num_mem = env->num_named;
1906: reg->num_mem = env->num_named;
1907:
1908: return onig_renumber_name_table(reg, map);
1909: }
1910: #endif /* USE_NAMED_GROUP */
1911:
1912: #ifdef USE_SUBEXP_CALL
1913: static int
1914: unset_addr_list_fix(UnsetAddrList* uslist, regex_t* reg)
1915: {
1916: int i, offset;
1917: EffectNode* en;
1918: AbsAddrType addr;
1919:
1920: for (i = 0; i < uslist->num; i++) {
1921: en = &(NEFFECT(uslist->us[i].target));
1922: if (! IS_EFFECT_ADDR_FIXED(en)) return ONIGERR_PARSER_BUG;
1923: addr = en->call_addr;
1924: offset = uslist->us[i].offset;
1925:
1926: BBUF_WRITE(reg, offset, &addr, SIZE_ABSADDR);
1927: }
1928: return 0;
1929: }
1930: #endif
1931:
1932: #ifdef USE_INFINITE_REPEAT_MONOMANIAC_MEM_STATUS_CHECK
1933: static int
1934: quantifiers_memory_node_info(Node* node)
1935: {
1936: int r = 0;
1937:
1938: switch (NTYPE(node)) {
1939: case N_LIST:
1940: case N_ALT:
1941: {
1942: int v;
1943: do {
1944: v = quantifiers_memory_node_info(NCONS(node).left);
1945: if (v > r) r = v;
1946: } while (v >= 0 && IS_NOT_NULL(node = NCONS(node).right));
1947: }
1948: break;
1949:
1950: #ifdef USE_SUBEXP_CALL
1951: case N_CALL:
1952: if (IS_CALL_RECURSION(&NCALL(node))) {
1953: return NQ_TARGET_IS_EMPTY_REC; /* tiny version */
1954: }
1955: else
1956: r = quantifiers_memory_node_info(NCALL(node).target);
1957: break;
1958: #endif
1959:
1960: case N_QUANTIFIER:
1961: {
1962: QuantifierNode* qn = &(NQUANTIFIER(node));
1963: if (qn->upper != 0) {
1964: r = quantifiers_memory_node_info(qn->target);
1965: }
1966: }
1967: break;
1968:
1969: case N_EFFECT:
1970: {
1971: EffectNode* en = &(NEFFECT(node));
1972: switch (en->type) {
1973: case EFFECT_MEMORY:
1974: return NQ_TARGET_IS_EMPTY_MEM;
1975: break;
1976:
1977: case EFFECT_OPTION:
1978: case EFFECT_STOP_BACKTRACK:
1979: r = quantifiers_memory_node_info(en->target);
1980: break;
1981: default:
1982: break;
1983: }
1984: }
1985: break;
1986:
1987: case N_BACKREF:
1988: case N_STRING:
1989: case N_CTYPE:
1990: case N_CCLASS:
1991: case N_ANYCHAR:
1992: case N_ANCHOR:
1993: default:
1994: break;
1995: }
1996:
1997: return r;
1998: }
1999: #endif /* USE_INFINITE_REPEAT_MONOMANIAC_MEM_STATUS_CHECK */
2000:
2001: static int
2002: get_min_match_length(Node* node, OnigDistance *min, ScanEnv* env)
2003: {
2004: OnigDistance tmin;
2005: int r = 0;
2006:
2007: *min = 0;
2008: switch (NTYPE(node)) {
2009: case N_BACKREF:
2010: {
2011: int i;
2012: int* backs;
2013: Node** nodes = SCANENV_MEM_NODES(env);
2014: BackrefNode* br = &(NBACKREF(node));
2015: if (br->state & NST_RECURSION) break;
2016:
2017: backs = BACKREFS_P(br);
2018: if (backs[0] > env->num_mem) return ONIGERR_INVALID_BACKREF;
2019: r = get_min_match_length(nodes[backs[0]], min, env);
2020: if (r != 0) break;
2021: for (i = 1; i < br->back_num; i++) {
2022: if (backs[i] > env->num_mem) return ONIGERR_INVALID_BACKREF;
2023: r = get_min_match_length(nodes[backs[i]], &tmin, env);
2024: if (r != 0) break;
2025: if (*min > tmin) *min = tmin;
2026: }
2027: }
2028: break;
2029:
2030: #ifdef USE_SUBEXP_CALL
2031: case N_CALL:
2032: if (IS_CALL_RECURSION(&NCALL(node))) {
2033: EffectNode* en = &(NEFFECT(NCALL(node).target));
2034: if (IS_EFFECT_MIN_FIXED(en))
2035: *min = en->min_len;
2036: }
2037: else
2038: r = get_min_match_length(NCALL(node).target, min, env);
2039: break;
2040: #endif
2041:
2042: case N_LIST:
2043: do {
2044: r = get_min_match_length(NCONS(node).left, &tmin, env);
2045: if (r == 0) *min += tmin;
2046: } while (r == 0 && IS_NOT_NULL(node = NCONS(node).right));
2047: break;
2048:
2049: case N_ALT:
2050: {
2051: Node *x, *y;
2052: y = node;
2053: do {
2054: x = NCONS(y).left;
2055: r = get_min_match_length(x, &tmin, env);
2056: if (r != 0) break;
2057: if (y == node) *min = tmin;
2058: else if (*min > tmin) *min = tmin;
2059: } while (r == 0 && IS_NOT_NULL(y = NCONS(y).right));
2060: }
2061: break;
2062:
2063: case N_STRING:
2064: {
2065: StrNode* sn = &(NSTRING(node));
2066: *min = sn->end - sn->s;
2067: }
2068: break;
2069:
2070: case N_CTYPE:
2071: switch (NCTYPE(node).type) {
2072: case CTYPE_WORD: *min = 1; break;
2073: case CTYPE_NOT_WORD: *min = 1; break;
2074: default:
2075: break;
2076: }
2077: break;
2078:
2079: case N_CCLASS:
2080: case N_ANYCHAR:
2081: *min = 1;
2082: break;
2083:
2084: case N_QUANTIFIER:
2085: {
2086: QuantifierNode* qn = &(NQUANTIFIER(node));
2087:
2088: if (qn->lower > 0) {
2089: r = get_min_match_length(qn->target, min, env);
2090: if (r == 0)
2091: *min = distance_multiply(*min, qn->lower);
2092: }
2093: }
2094: break;
2095:
2096: case N_EFFECT:
2097: {
2098: EffectNode* en = &(NEFFECT(node));
2099: switch (en->type) {
2100: case EFFECT_MEMORY:
2101: #ifdef USE_SUBEXP_CALL
2102: if (IS_EFFECT_MIN_FIXED(en))
2103: *min = en->min_len;
2104: else {
2105: r = get_min_match_length(en->target, min, env);
2106: if (r == 0) {
2107: en->min_len = *min;
2108: SET_EFFECT_STATUS(node, NST_MIN_FIXED);
2109: }
2110: }
2111: break;
2112: #endif
2113: case EFFECT_OPTION:
2114: case EFFECT_STOP_BACKTRACK:
2115: r = get_min_match_length(en->target, min, env);
2116: break;
2117: }
2118: }
2119: break;
2120:
2121: case N_ANCHOR:
2122: default:
2123: break;
2124: }
2125:
2126: return r;
2127: }
2128:
2129: static int
2130: get_max_match_length(Node* node, OnigDistance *max, ScanEnv* env)
2131: {
2132: OnigDistance tmax;
2133: int r = 0;
2134:
2135: *max = 0;
2136: switch (NTYPE(node)) {
2137: case N_LIST:
2138: do {
2139: r = get_max_match_length(NCONS(node).left, &tmax, env);
2140: if (r == 0)
2141: *max = distance_add(*max, tmax);
2142: } while (r == 0 && IS_NOT_NULL(node = NCONS(node).right));
2143: break;
2144:
2145: case N_ALT:
2146: do {
2147: r = get_max_match_length(NCONS(node).left, &tmax, env);
2148: if (r == 0 && *max < tmax) *max = tmax;
2149: } while (r == 0 && IS_NOT_NULL(node = NCONS(node).right));
2150: break;
2151:
2152: case N_STRING:
2153: {
2154: StrNode* sn = &(NSTRING(node));
2155: *max = sn->end - sn->s;
2156: }
2157: break;
2158:
2159: case N_CTYPE:
2160: switch (NCTYPE(node).type) {
2161: case CTYPE_WORD:
2162: case CTYPE_NOT_WORD:
2163: *max = ONIGENC_MBC_MAXLEN_DIST(env->enc);
2164: break;
2165:
2166: default:
2167: break;
2168: }
2169: break;
2170:
2171: case N_CCLASS:
2172: case N_ANYCHAR:
2173: *max = ONIGENC_MBC_MAXLEN_DIST(env->enc);
2174: break;
2175:
2176: case N_BACKREF:
2177: {
2178: int i;
2179: int* backs;
2180: Node** nodes = SCANENV_MEM_NODES(env);
2181: BackrefNode* br = &(NBACKREF(node));
2182: if (br->state & NST_RECURSION) {
2183: *max = ONIG_INFINITE_DISTANCE;
2184: break;
2185: }
2186: backs = BACKREFS_P(br);
2187: for (i = 0; i < br->back_num; i++) {
2188: if (backs[i] > env->num_mem) return ONIGERR_INVALID_BACKREF;
2189: r = get_max_match_length(nodes[backs[i]], &tmax, env);
2190: if (r != 0) break;
2191: if (*max < tmax) *max = tmax;
2192: }
2193: }
2194: break;
2195:
2196: #ifdef USE_SUBEXP_CALL
2197: case N_CALL:
2198: if (! IS_CALL_RECURSION(&(NCALL(node))))
2199: r = get_max_match_length(NCALL(node).target, max, env);
2200: else
2201: *max = ONIG_INFINITE_DISTANCE;
2202: break;
2203: #endif
2204:
2205: case N_QUANTIFIER:
2206: {
2207: QuantifierNode* qn = &(NQUANTIFIER(node));
2208:
2209: if (qn->upper != 0) {
2210: r = get_max_match_length(qn->target, max, env);
2211: if (r == 0 && *max != 0) {
2212: if (! IS_REPEAT_INFINITE(qn->upper))
2213: *max = distance_multiply(*max, qn->upper);
2214: else
2215: *max = ONIG_INFINITE_DISTANCE;
2216: }
2217: }
2218: }
2219: break;
2220:
2221: case N_EFFECT:
2222: {
2223: EffectNode* en = &(NEFFECT(node));
2224: switch (en->type) {
2225: case EFFECT_MEMORY:
2226: #ifdef USE_SUBEXP_CALL
2227: if (IS_EFFECT_MAX_FIXED(en))
2228: *max = en->max_len;
2229: else {
2230: r = get_max_match_length(en->target, max, env);
2231: if (r == 0) {
2232: en->max_len = *max;
2233: SET_EFFECT_STATUS(node, NST_MAX_FIXED);
2234: }
2235: }
2236: break;
2237: #endif
2238: case EFFECT_OPTION:
2239: case EFFECT_STOP_BACKTRACK:
2240: r = get_max_match_length(en->target, max, env);
2241: break;
2242: }
2243: }
2244: break;
2245:
2246: case N_ANCHOR:
2247: default:
2248: break;
2249: }
2250:
2251: return r;
2252: }
2253:
2254: #define GET_CHAR_LEN_VARLEN -1
2255: #define GET_CHAR_LEN_TOP_ALT_VARLEN -2
2256:
2257: /* fixed size pattern node only */
2258: static int
2259: get_char_length_tree1(Node* node, regex_t* reg, int* len, int level)
2260: {
2261: int tlen;
2262: int r = 0;
2263:
2264: level++;
2265: *len = 0;
2266: switch (NTYPE(node)) {
2267: case N_LIST:
2268: do {
2269: r = get_char_length_tree1(NCONS(node).left, reg, &tlen, level);
2270: if (r == 0)
2271: *len = distance_add(*len, tlen);
2272: } while (r == 0 && IS_NOT_NULL(node = NCONS(node).right));
2273: break;
2274:
2275: case N_ALT:
2276: {
2277: int tlen2;
2278: int varlen = 0;
2279:
2280: r = get_char_length_tree1(NCONS(node).left, reg, &tlen, level);
2281: while (r == 0 && IS_NOT_NULL(node = NCONS(node).right)) {
2282: r = get_char_length_tree1(NCONS(node).left, reg, &tlen2, level);
2283: if (r == 0) {
2284: if (tlen != tlen2)
2285: varlen = 1;
2286: }
2287: }
2288: if (r == 0) {
2289: if (varlen != 0) {
2290: if (level == 1)
2291: r = GET_CHAR_LEN_TOP_ALT_VARLEN;
2292: else
2293: r = GET_CHAR_LEN_VARLEN;
2294: }
2295: else
2296: *len = tlen;
2297: }
2298: }
2299: break;
2300:
2301: case N_STRING:
2302: {
2303: StrNode* sn = &(NSTRING(node));
2304: UChar *s = sn->s;
2305: while (s < sn->end) {
2306: s += enc_len(reg->enc, s);
2307: (*len)++;
2308: }
2309: }
2310: break;
2311:
2312: case N_QUANTIFIER:
2313: {
2314: QuantifierNode* qn = &(NQUANTIFIER(node));
2315: if (qn->lower == qn->upper) {
2316: r = get_char_length_tree1(qn->target, reg, &tlen, level);
2317: if (r == 0)
2318: *len = distance_multiply(tlen, qn->lower);
2319: }
2320: else
2321: r = GET_CHAR_LEN_VARLEN;
2322: }
2323: break;
2324:
2325: #ifdef USE_SUBEXP_CALL
2326: case N_CALL:
2327: if (! IS_CALL_RECURSION(&(NCALL(node))))
2328: r = get_char_length_tree1(NCALL(node).target, reg, len, level);
2329: else
2330: r = GET_CHAR_LEN_VARLEN;
2331: break;
2332: #endif
2333:
2334: case N_CTYPE:
2335: switch (NCTYPE(node).type) {
2336: case CTYPE_WORD:
2337: case CTYPE_NOT_WORD:
2338: *len = 1;
2339: break;
2340: }
2341: break;
2342:
2343: case N_CCLASS:
2344: case N_ANYCHAR:
2345: *len = 1;
2346: break;
2347:
2348: case N_EFFECT:
2349: {
2350: EffectNode* en = &(NEFFECT(node));
2351: switch (en->type) {
2352: case EFFECT_MEMORY:
2353: #ifdef USE_SUBEXP_CALL
2354: if (IS_EFFECT_CLEN_FIXED(en))
2355: *len = en->char_len;
2356: else {
2357: r = get_char_length_tree1(en->target, reg, len, level);
2358: if (r == 0) {
2359: en->char_len = *len;
2360: SET_EFFECT_STATUS(node, NST_CLEN_FIXED);
2361: }
2362: }
2363: break;
2364: #endif
2365: case EFFECT_OPTION:
2366: case EFFECT_STOP_BACKTRACK:
2367: r = get_char_length_tree1(en->target, reg, len, level);
2368: break;
2369: default:
2370: break;
2371: }
2372: }
2373: break;
2374:
2375: case N_ANCHOR:
2376: break;
2377:
2378: default:
2379: r = GET_CHAR_LEN_VARLEN;
2380: break;
2381: }
2382:
2383: return r;
2384: }
2385:
2386: static int
2387: get_char_length_tree(Node* node, regex_t* reg, int* len)
2388: {
2389: return get_char_length_tree1(node, reg, len, 0);
2390: }
2391:
2392: /* x is not included y ==> 1 : 0 */
2393: static int
2394: is_not_included(Node* x, Node* y, regex_t* reg)
2395: {
2396: int i, len;
2397: OnigCodePoint code;
2398: UChar *p, c;
2399: int ytype;
2400:
2401: retry:
2402: ytype = NTYPE(y);
2403: switch (NTYPE(x)) {
2404: case N_CTYPE:
2405: {
2406: switch (ytype) {
2407: case N_CTYPE:
2408: switch (NCTYPE(x).type) {
2409: case CTYPE_WORD:
2410: if (NCTYPE(y).type == CTYPE_NOT_WORD)
2411: return 1;
2412: else
2413: return 0;
2414: break;
2415: case CTYPE_NOT_WORD:
2416: if (NCTYPE(y).type == CTYPE_WORD)
2417: return 1;
2418: else
2419: return 0;
2420: break;
2421: default:
2422: break;
2423: }
2424: break;
2425:
2426: case N_CCLASS:
2427: swap:
2428: {
2429: Node* tmp;
2430: tmp = x; x = y; y = tmp;
2431: goto retry;
2432: }
2433: break;
2434:
2435: case N_STRING:
2436: goto swap;
2437: break;
2438:
2439: default:
2440: break;
2441: }
2442: }
2443: break;
2444:
2445: case N_CCLASS:
2446: {
2447: CClassNode* xc = &(NCCLASS(x));
2448: switch (ytype) {
2449: case N_CTYPE:
2450: switch (NCTYPE(y).type) {
2451: case CTYPE_WORD:
2452: if (IS_NULL(xc->mbuf) && !IS_CCLASS_NOT(xc)) {
2453: for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
2454: if (BITSET_AT(xc->bs, i)) {
2455: if (ONIGENC_IS_CODE_SB_WORD(reg->enc, i)) return 0;
2456: }
2457: }
2458: return 1;
2459: }
2460: return 0;
2461: break;
2462: case CTYPE_NOT_WORD:
2463: for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
2464: if (! ONIGENC_IS_CODE_SB_WORD(reg->enc, i)) {
2465: if (!IS_CCLASS_NOT(xc)) {
2466: if (BITSET_AT(xc->bs, i))
2467: return 0;
2468: }
2469: else {
2470: if (! BITSET_AT(xc->bs, i))
2471: return 0;
2472: }
2473: }
2474: }
2475: return 1;
2476: break;
2477:
2478: default:
2479: break;
2480: }
2481: break;
2482:
2483: case N_CCLASS:
2484: {
2485: int v;
2486: CClassNode* yc = &(NCCLASS(y));
2487:
2488: for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
2489: v = BITSET_AT(xc->bs, i);
2490: if ((v != 0 && !IS_CCLASS_NOT(xc)) ||
2491: (v == 0 && IS_CCLASS_NOT(xc))) {
2492: v = BITSET_AT(yc->bs, i);
2493: if ((v != 0 && !IS_CCLASS_NOT(yc)) ||
2494: (v == 0 && IS_CCLASS_NOT(yc)))
2495: return 0;
2496: }
2497: }
2498: if ((IS_NULL(xc->mbuf) && !IS_CCLASS_NOT(xc)) ||
2499: (IS_NULL(yc->mbuf) && !IS_CCLASS_NOT(yc)))
2500: return 1;
2501: return 0;
2502: }
2503: break;
2504:
2505: case N_STRING:
2506: goto swap;
2507: break;
2508:
2509: default:
2510: break;
2511: }
2512: }
2513: break;
2514:
2515: case N_STRING:
2516: {
2517: StrNode* xs = &(NSTRING(x));
2518: if (NSTRING_LEN(x) == 0)
2519: break;
2520:
2521: c = *(xs->s);
2522: switch (ytype) {
2523: case N_CTYPE:
2524: switch (NCTYPE(y).type) {
2525: case CTYPE_WORD:
2526: return (ONIGENC_IS_MBC_WORD(reg->enc, xs->s, xs->end) ? 0 : 1);
2527: break;
2528: case CTYPE_NOT_WORD:
2529: return (ONIGENC_IS_MBC_WORD(reg->enc, xs->s, xs->end) ? 1 : 0);
2530: break;
2531: default:
2532: break;
2533: }
2534: break;
2535:
2536: case N_CCLASS:
2537: {
2538: CClassNode* cc = &(NCCLASS(y));
2539:
2540: code = ONIGENC_MBC_TO_CODE(reg->enc, xs->s,
2541: xs->s + ONIGENC_MBC_MAXLEN(reg->enc));
2542: return (onig_is_code_in_cc(reg->enc, code, cc) != 0 ? 0 : 1);
2543: }
2544: break;
2545:
2546: case N_STRING:
2547: {
2548: UChar *q;
2549: StrNode* ys = &(NSTRING(y));
2550: len = NSTRING_LEN(x);
2551: if (len > NSTRING_LEN(y)) len = NSTRING_LEN(y);
2552: if (NSTRING_IS_AMBIG(x) || NSTRING_IS_AMBIG(y)) {
2553: /* tiny version */
2554: return 0;
2555: }
2556: else {
2557: for (i = 0, p = ys->s, q = xs->s; i < len; i++, p++, q++) {
2558: if (*p != *q) return 1;
2559: }
2560: }
2561: }
2562: break;
2563:
2564: default:
2565: break;
2566: }
2567: }
2568: break;
2569:
2570: default:
2571: break;
2572: }
2573:
2574: return 0;
2575: }
2576:
2577: static Node*
2578: get_head_value_node(Node* node, int exact, regex_t* reg)
2579: {
2580: Node* n = NULL_NODE;
2581:
2582: switch (NTYPE(node)) {
2583: case N_BACKREF:
2584: case N_ALT:
2585: case N_ANYCHAR:
2586: #ifdef USE_SUBEXP_CALL
2587: case N_CALL:
2588: #endif
2589: break;
2590:
2591: case N_CTYPE:
2592: case N_CCLASS:
2593: if (exact == 0) {
2594: n = node;
2595: }
2596: break;
2597:
2598: case N_LIST:
2599: n = get_head_value_node(NCONS(node).left, exact, reg);
2600: break;
2601:
2602: case N_STRING:
2603: {
2604: StrNode* sn = &(NSTRING(node));
2605:
2606: if (sn->end <= sn->s)
2607: break;
2608:
2609: if (exact != 0 &&
2610: !NSTRING_IS_RAW(node) && IS_IGNORECASE(reg->options)) {
2611: #if 0
2612: UChar* tmp = sn->s;
2613: if (! ONIGENC_IS_MBC_AMBIGUOUS(reg->enc, reg->ambig_flag,
2614: &tmp, sn->end))
2615: n = node;
2616: #endif
2617: }
2618: else {
2619: n = node;
2620: }
2621: }
2622: break;
2623:
2624: case N_QUANTIFIER:
2625: {
2626: QuantifierNode* qn = &(NQUANTIFIER(node));
2627: if (qn->lower > 0) {
2628: if (IS_NOT_NULL(qn->head_exact))
2629: n = qn->head_exact;
2630: else
2631: n = get_head_value_node(qn->target, exact, reg);
2632: }
2633: }
2634: break;
2635:
2636: case N_EFFECT:
2637: {
2638: EffectNode* en = &(NEFFECT(node));
2639: switch (en->type) {
2640: case EFFECT_OPTION:
2641: {
2642: OnigOptionType options = reg->options;
2643:
2644: reg->options = NEFFECT(node).option;
2645: n = get_head_value_node(NEFFECT(node).target, exact, reg);
2646: reg->options = options;
2647: }
2648: break;
2649:
2650: case EFFECT_MEMORY:
2651: case EFFECT_STOP_BACKTRACK:
2652: n = get_head_value_node(en->target, exact, reg);
2653: break;
2654: }
2655: }
2656: break;
2657:
2658: case N_ANCHOR:
2659: if (NANCHOR(node).type == ANCHOR_PREC_READ)
2660: n = get_head_value_node(NANCHOR(node).target, exact, reg);
2661: break;
2662:
2663: default:
2664: break;
2665: }
2666:
2667: return n;
2668: }
2669:
2670: static int
2671: check_type_tree(Node* node, int type_mask, int effect_mask, int anchor_mask)
2672: {
2673: int type, r = 0;
2674:
2675: type = NTYPE(node);
2676: if ((type & type_mask) == 0)
2677: return 1;
2678:
2679: switch (type) {
2680: case N_LIST:
2681: case N_ALT:
2682: do {
2683: r = check_type_tree(NCONS(node).left, type_mask, effect_mask, anchor_mask);
2684: } while (r == 0 && IS_NOT_NULL(node = NCONS(node).right));
2685: break;
2686:
2687: case N_QUANTIFIER:
2688: r = check_type_tree(NQUANTIFIER(node).target, type_mask, effect_mask,
2689: anchor_mask);
2690: break;
2691:
2692: case N_EFFECT:
2693: {
2694: EffectNode* en = &(NEFFECT(node));
2695: if ((en->type & effect_mask) == 0)
2696: return 1;
2697:
2698: r = check_type_tree(en->target, type_mask, effect_mask, anchor_mask);
2699: }
2700: break;
2701:
2702: case N_ANCHOR:
2703: type = NANCHOR(node).type;
2704: if ((type & anchor_mask) == 0)
2705: return 1;
2706:
2707: if (NANCHOR(node).target)
2708: r = check_type_tree(NANCHOR(node).target,
2709: type_mask, effect_mask, anchor_mask);
2710: break;
2711:
2712: default:
2713: break;
2714: }
2715: return r;
2716: }
2717:
2718: #ifdef USE_SUBEXP_CALL
2719:
2720: #define RECURSION_EXIST 1
2721: #define RECURSION_INFINITE 2
2722:
2723: static int
2724: subexp_inf_recursive_check(Node* node, ScanEnv* env, int head)
2725: {
2726: int type;
2727: int r = 0;
2728:
2729: type = NTYPE(node);
2730: switch (type) {
2731: case N_LIST:
2732: {
2733: Node *x;
2734: OnigDistance min;
2735: int ret;
2736:
2737: x = node;
2738: do {
2739: ret = subexp_inf_recursive_check(NCONS(x).left, env, head);
2740: if (ret < 0 || ret == RECURSION_INFINITE) return ret;
2741: r |= ret;
2742: if (head) {
2743: ret = get_min_match_length(NCONS(x).left, &min, env);
2744: if (ret != 0) return ret;
2745: if (min != 0) head = 0;
2746: }
2747: } while (IS_NOT_NULL(x = NCONS(x).right));
2748: }
2749: break;
2750:
2751: case N_ALT:
2752: {
2753: int ret;
2754: r = RECURSION_EXIST;
2755: do {
2756: ret = subexp_inf_recursive_check(NCONS(node).left, env, head);
2757: if (ret < 0 || ret == RECURSION_INFINITE) return ret;
2758: r &= ret;
2759: } while (IS_NOT_NULL(node = NCONS(node).right));
2760: }
2761: break;
2762:
2763: case N_QUANTIFIER:
2764: r = subexp_inf_recursive_check(NQUANTIFIER(node).target, env, head);
2765: if (r == RECURSION_EXIST) {
2766: if (NQUANTIFIER(node).lower == 0) r = 0;
2767: }
2768: break;
2769:
2770: case N_ANCHOR:
2771: {
2772: AnchorNode* an = &(NANCHOR(node));
2773: switch (an->type) {
2774: case ANCHOR_PREC_READ:
2775: case ANCHOR_PREC_READ_NOT:
2776: case ANCHOR_LOOK_BEHIND:
2777: case ANCHOR_LOOK_BEHIND_NOT:
2778: r = subexp_inf_recursive_check(an->target, env, head);
2779: break;
2780: }
2781: }
2782: break;
2783:
2784: case N_CALL:
2785: r = subexp_inf_recursive_check(NCALL(node).target, env, head);
2786: break;
2787:
2788: case N_EFFECT:
2789: if (IS_EFFECT_MARK2(&(NEFFECT(node))))
2790: return 0;
2791: else if (IS_EFFECT_MARK1(&(NEFFECT(node))))
2792: return (head == 0 ? RECURSION_EXIST : RECURSION_INFINITE);
2793: else {
2794: SET_EFFECT_STATUS(node, NST_MARK2);
2795: r = subexp_inf_recursive_check(NEFFECT(node).target, env, head);
2796: CLEAR_EFFECT_STATUS(node, NST_MARK2);
2797: }
2798: break;
2799:
2800: default:
2801: break;
2802: }
2803:
2804: return r;
2805: }
2806:
2807: static int
2808: subexp_inf_recursive_check_trav(Node* node, ScanEnv* env)
2809: {
2810: int type;
2811: int r = 0;
2812:
2813: type = NTYPE(node);
2814: switch (type) {
2815: case N_LIST:
2816: case N_ALT:
2817: do {
2818: r = subexp_inf_recursive_check_trav(NCONS(node).left, env);
2819: } while (r == 0 && IS_NOT_NULL(node = NCONS(node).right));
2820: break;
2821:
2822: case N_QUANTIFIER:
2823: r = subexp_inf_recursive_check_trav(NQUANTIFIER(node).target, env);
2824: break;
2825:
2826: case N_ANCHOR:
2827: {
2828: AnchorNode* an = &(NANCHOR(node));
2829: switch (an->type) {
2830: case ANCHOR_PREC_READ:
2831: case ANCHOR_PREC_READ_NOT:
2832: case ANCHOR_LOOK_BEHIND:
2833: case ANCHOR_LOOK_BEHIND_NOT:
2834: r = subexp_inf_recursive_check_trav(an->target, env);
2835: break;
2836: }
2837: }
2838: break;
2839:
2840: case N_EFFECT:
2841: {
2842: EffectNode* en = &(NEFFECT(node));
2843:
2844: if (IS_EFFECT_RECURSION(en)) {
2845: SET_EFFECT_STATUS(node, NST_MARK1);
2846: r = subexp_inf_recursive_check(en->target, env, 1);
2847: if (r > 0) return ONIGERR_NEVER_ENDING_RECURSION;
2848: CLEAR_EFFECT_STATUS(node, NST_MARK1);
2849: }
2850: r = subexp_inf_recursive_check_trav(en->target, env);
2851: }
2852:
2853: break;
2854:
2855: default:
2856: break;
2857: }
2858:
2859: return r;
2860: }
2861:
2862: static int
2863: subexp_recursive_check(Node* node)
2864: {
2865: int type;
2866: int r = 0;
2867:
2868: type = NTYPE(node);
2869: switch (type) {
2870: case N_LIST:
2871: case N_ALT:
2872: do {
2873: r |= subexp_recursive_check(NCONS(node).left);
2874: } while (IS_NOT_NULL(node = NCONS(node).right));
2875: break;
2876:
2877: case N_QUANTIFIER:
2878: r = subexp_recursive_check(NQUANTIFIER(node).target);
2879: break;
2880:
2881: case N_ANCHOR:
2882: {
2883: AnchorNode* an = &(NANCHOR(node));
2884: switch (an->type) {
2885: case ANCHOR_PREC_READ:
2886: case ANCHOR_PREC_READ_NOT:
2887: case ANCHOR_LOOK_BEHIND:
2888: case ANCHOR_LOOK_BEHIND_NOT:
2889: r = subexp_recursive_check(an->target);
2890: break;
2891: }
2892: }
2893: break;
2894:
2895: case N_CALL:
2896: r = subexp_recursive_check(NCALL(node).target);
2897: if (r != 0) SET_CALL_RECURSION(node);
2898: break;
2899:
2900: case N_EFFECT:
2901: if (IS_EFFECT_MARK2(&(NEFFECT(node))))
2902: return 0;
2903: else if (IS_EFFECT_MARK1(&(NEFFECT(node))))
2904: return 1; /* recursion */
2905: else {
2906: SET_EFFECT_STATUS(node, NST_MARK2);
2907: r = subexp_recursive_check(NEFFECT(node).target);
2908: CLEAR_EFFECT_STATUS(node, NST_MARK2);
2909: }
2910: break;
2911:
2912: default:
2913: break;
2914: }
2915:
2916: return r;
2917: }
2918:
2919:
2920: static int
2921: subexp_recursive_check_trav(Node* node, ScanEnv* env)
2922: {
2923: #define FOUND_CALLED_NODE 1
2924:
2925: int type;
2926: int r = 0;
2927:
2928: type = NTYPE(node);
2929: switch (type) {
2930: case N_LIST:
2931: case N_ALT:
2932: {
2933: int ret;
2934: do {
2935: ret = subexp_recursive_check_trav(NCONS(node).left, env);
2936: if (ret == FOUND_CALLED_NODE) r = FOUND_CALLED_NODE;
2937: else if (ret < 0) return ret;
2938: } while (IS_NOT_NULL(node = NCONS(node).right));
2939: }
2940: break;
2941:
2942: case N_QUANTIFIER:
2943: r = subexp_recursive_check_trav(NQUANTIFIER(node).target, env);
2944: if (NQUANTIFIER(node).upper == 0) {
2945: if (r == FOUND_CALLED_NODE)
2946: NQUANTIFIER(node).is_refered = 1;
2947: }
2948: break;
2949:
2950: case N_ANCHOR:
2951: {
2952: AnchorNode* an = &(NANCHOR(node));
2953: switch (an->type) {
2954: case ANCHOR_PREC_READ:
2955: case ANCHOR_PREC_READ_NOT:
2956: case ANCHOR_LOOK_BEHIND:
2957: case ANCHOR_LOOK_BEHIND_NOT:
2958: r = subexp_recursive_check_trav(an->target, env);
2959: break;
2960: }
2961: }
2962: break;
2963:
2964: case N_EFFECT:
2965: {
2966: EffectNode* en = &(NEFFECT(node));
2967:
2968: if (! IS_EFFECT_RECURSION(en)) {
2969: if (IS_EFFECT_CALLED(en)) {
2970: SET_EFFECT_STATUS(node, NST_MARK1);
2971: r = subexp_recursive_check(en->target);
2972: if (r != 0) SET_EFFECT_STATUS(node, NST_RECURSION);
2973: CLEAR_EFFECT_STATUS(node, NST_MARK1);
2974: }
2975: }
2976: r = subexp_recursive_check_trav(en->target, env);
2977: if (IS_EFFECT_CALLED(en))
2978: r |= FOUND_CALLED_NODE;
2979: }
2980: break;
2981:
2982: default:
2983: break;
2984: }
2985:
2986: return r;
2987: }
2988:
2989: static int
2990: setup_subexp_call(Node* node, ScanEnv* env)
2991: {
2992: int type;
2993: int r = 0;
2994:
2995: type = NTYPE(node);
2996: switch (type) {
2997: case N_LIST:
2998: do {
2999: r = setup_subexp_call(NCONS(node).left, env);
3000: } while (r == 0 && IS_NOT_NULL(node = NCONS(node).right));
3001: break;
3002:
3003: case N_ALT:
3004: do {
3005: r = setup_subexp_call(NCONS(node).left, env);
3006: } while (r == 0 && IS_NOT_NULL(node = NCONS(node).right));
3007: break;
3008:
3009: case N_QUANTIFIER:
3010: r = setup_subexp_call(NQUANTIFIER(node).target, env);
3011: break;
3012: case N_EFFECT:
3013: r = setup_subexp_call(NEFFECT(node).target, env);
3014: break;
3015:
3016: case N_CALL:
3017: {
3018: int n, num, *refs;
3019: UChar *p;
3020: CallNode* cn = &(NCALL(node));
3021: Node** nodes = SCANENV_MEM_NODES(env);
3022:
3023: #ifdef USE_NAMED_GROUP
3024: n = onig_name_to_group_numbers(env->reg, cn->name, cn->name_end, &refs);
3025: #else
3026: n = -1;
3027: #endif
3028: if (n <= 0) {
3029: /* name not found, check group number. (?*ddd) */
3030: p = cn->name;
3031: num = onig_scan_unsigned_number(&p, cn->name_end, env->enc);
3032: if (num <= 0 || p != cn->name_end) {
3033: onig_scan_env_set_error_string(env,
3034: ONIGERR_UNDEFINED_NAME_REFERENCE, cn->name, cn->name_end);
3035: return ONIGERR_UNDEFINED_NAME_REFERENCE;
3036: }
3037: #ifdef USE_NAMED_GROUP
3038: if (env->num_named > 0 &&
3039: IS_SYNTAX_BV(env->syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) &&
3040: !ONIG_IS_OPTION_ON(env->option, ONIG_OPTION_CAPTURE_GROUP)) {
3041: return ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED;
3042: }
3043: #endif
3044: if (num > env->num_mem) {
3045: onig_scan_env_set_error_string(env,
3046: ONIGERR_UNDEFINED_GROUP_REFERENCE, cn->name, cn->name_end);
3047: return ONIGERR_UNDEFINED_GROUP_REFERENCE;
3048: }
3049: cn->ref_num = num;
3050: goto set_call_attr;
3051: }
3052: else if (n > 1) {
3053: onig_scan_env_set_error_string(env,
3054: ONIGERR_MULTIPLEX_DEFINITION_NAME_CALL, cn->name, cn->name_end);
3055: return ONIGERR_MULTIPLEX_DEFINITION_NAME_CALL;
3056: }
3057: else {
3058: cn->ref_num = refs[0];
3059: set_call_attr:
3060: cn->target = nodes[cn->ref_num];
3061: if (IS_NULL(cn->target)) {
3062: onig_scan_env_set_error_string(env,
3063: ONIGERR_UNDEFINED_NAME_REFERENCE, cn->name, cn->name_end);
3064: return ONIGERR_UNDEFINED_NAME_REFERENCE;
3065: }
3066: SET_EFFECT_STATUS(cn->target, NST_CALLED);
3067: BIT_STATUS_ON_AT(env->bt_mem_start, cn->ref_num);
3068: cn->unset_addr_list = env->unset_addr_list;
3069: }
3070: }
3071: break;
3072:
3073: case N_ANCHOR:
3074: {
3075: AnchorNode* an = &(NANCHOR(node));
3076:
3077: switch (an->type) {
3078: case ANCHOR_PREC_READ:
3079: case ANCHOR_PREC_READ_NOT:
3080: case ANCHOR_LOOK_BEHIND:
3081: case ANCHOR_LOOK_BEHIND_NOT:
3082: r = setup_subexp_call(an->target, env);
3083: break;
3084: }
3085: }
3086: break;
3087:
3088: default:
3089: break;
3090: }
3091:
3092: return r;
3093: }
3094: #endif
3095:
3096: /* divide different length alternatives in look-behind.
3097: (?<=A|B) ==> (?<=A)|(?<=B)
3098: (?<!A|B) ==> (?<!A)(?<!B)
3099: */
3100: static int
3101: divide_look_behind_alternatives(Node* node)
3102: {
3103: Node tmp_node;
3104: Node *head, *np, *insert_node;
3105: AnchorNode* an = &(NANCHOR(node));
3106: int anc_type = an->type;
3107:
3108: head = an->target;
3109: np = NCONS(head).left;
3110: tmp_node = *node; *node = *head; *head = tmp_node;
3111: NCONS(node).left = head;
3112: NANCHOR(head).target = np;
3113:
3114: np = node;
3115: while ((np = NCONS(np).right) != NULL_NODE) {
3116: insert_node = onig_node_new_anchor(anc_type);
3117: CHECK_NULL_RETURN_VAL(insert_node, ONIGERR_MEMORY);
3118: NANCHOR(insert_node).target = NCONS(np).left;
3119: NCONS(np).left = insert_node;
3120: }
3121:
3122: if (anc_type == ANCHOR_LOOK_BEHIND_NOT) {
3123: np = node;
3124: do {
3125: np->type = N_LIST; /* alt -> list */
3126: } while ((np = NCONS(np).right) != NULL_NODE);
3127: }
3128: return 0;
3129: }
3130:
3131: static int
3132: setup_look_behind(Node* node, regex_t* reg, ScanEnv* env)
3133: {
3134: int r, len;
3135: AnchorNode* an = &(NANCHOR(node));
3136:
3137: r = get_char_length_tree(an->target, reg, &len);
3138: if (r == 0)
3139: an->char_len = len;
3140: else if (r == GET_CHAR_LEN_VARLEN)
3141: r = ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
3142: else if (r == GET_CHAR_LEN_TOP_ALT_VARLEN) {
3143: if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND))
3144: r = divide_look_behind_alternatives(node);
3145: else
3146: r = ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
3147: }
3148:
3149: return r;
3150: }
3151:
3152: static int
3153: next_setup(Node* node, Node* next_node, regex_t* reg)
3154: {
3155: int type;
3156:
3157: retry:
3158: type = NTYPE(node);
3159: if (type == N_QUANTIFIER) {
3160: QuantifierNode* qn = &(NQUANTIFIER(node));
3161: if (qn->greedy && IS_REPEAT_INFINITE(qn->upper)) {
3162: #ifdef USE_QUANTIFIER_PEEK_NEXT
3163: qn->next_head_exact = get_head_value_node(next_node, 1, reg);
3164: #endif
3165: /* automatic posseivation a*b ==> (?>a*)b */
3166: if (qn->lower <= 1) {
3167: int ttype = NTYPE(qn->target);
3168: if (IS_NODE_TYPE_SIMPLE(ttype)) {
3169: Node *x, *y;
3170: x = get_head_value_node(qn->target, 0, reg);
3171: if (IS_NOT_NULL(x)) {
3172: y = get_head_value_node(next_node, 0, reg);
3173: if (IS_NOT_NULL(y) && is_not_included(x, y, reg)) {
3174: Node* en = onig_node_new_effect(EFFECT_STOP_BACKTRACK);
3175: CHECK_NULL_RETURN_VAL(en, ONIGERR_MEMORY);
3176: SET_EFFECT_STATUS(en, NST_STOP_BT_SIMPLE_REPEAT);
3177: swap_node(node, en);
3178: NEFFECT(node).target = en;
3179: }
3180: }
3181: }
3182: }
3183: }
3184: }
3185: else if (type == N_EFFECT) {
3186: EffectNode* en = &(NEFFECT(node));
3187: if (en->type == EFFECT_MEMORY) {
3188: node = en->target;
3189: goto retry;
3190: }
3191: }
3192: return 0;
3193: }
3194:
3195:
3196: static int
3197: divide_ambig_string_node_sub(regex_t* reg, int prev_ambig,
3198: UChar* prev_start, UChar* prev,
3199: UChar* end, Node*** tailp, Node** root)
3200: {
3201: UChar *tmp, *wp;
3202: Node* snode;
3203:
3204: if (prev_ambig != 0) {
3205: tmp = prev_start;
3206: wp = prev_start;
3207: while (tmp < prev) {
3208: wp += ONIGENC_MBC_TO_NORMALIZE(reg->enc, reg->ambig_flag,
3209: &tmp, end, wp);
3210: }
3211: snode = onig_node_new_str(prev_start, wp);
3212: CHECK_NULL_RETURN_VAL(snode, ONIGERR_MEMORY);
3213: NSTRING_SET_AMBIG(snode);
3214: if (wp != prev) NSTRING_SET_AMBIG_REDUCE(snode);
3215: }
3216: else {
3217: snode = onig_node_new_str(prev_start, prev);
3218: CHECK_NULL_RETURN_VAL(snode, ONIGERR_MEMORY);
3219: }
3220:
3221: if (*tailp == (Node** )0) {
3222: *root = onig_node_new_list(snode, NULL);
3223: CHECK_NULL_RETURN_VAL(*root, ONIGERR_MEMORY);
3224: *tailp = &(NCONS(*root).right);
3225: }
3226: else {
3227: **tailp = onig_node_new_list(snode, NULL);
3228: CHECK_NULL_RETURN_VAL(**tailp, ONIGERR_MEMORY);
3229: *tailp = &(NCONS(**tailp).right);
3230: }
3231:
3232: return 0;
3233: }
3234:
3235: static int
3236: divide_ambig_string_node(Node* node, regex_t* reg)
3237: {
3238: StrNode* sn = &NSTRING(node);
3239: int ambig, prev_ambig;
3240: UChar *prev, *p, *end, *prev_start, *start, *tmp, *wp;
3241: Node *root = NULL_NODE;
3242: Node **tailp = (Node** )0;
3243: int r;
3244:
3245: start = prev_start = p = sn->s;
3246: end = sn->end;
3247: if (p >= end) return 0;
3248:
3249: prev_ambig = ONIGENC_IS_MBC_AMBIGUOUS(reg->enc, reg->ambig_flag, &p, end);
3250:
3251: while (p < end) {
3252: prev = p;
3253: if (prev_ambig != (ambig = ONIGENC_IS_MBC_AMBIGUOUS(reg->enc,
3254: reg->ambig_flag, &p, end))) {
3255:
3256: r = divide_ambig_string_node_sub(reg, prev_ambig, prev_start, prev,
3257: end, &tailp, &root);
3258: if (r != 0) return r;
3259:
3260: prev_ambig = ambig;
3261: prev_start = prev;
3262: }
3263: }
3264:
3265: if (prev_start == start) {
3266: if (prev_ambig != 0) {
3267: NSTRING_SET_AMBIG(node);
3268: tmp = start;
3269: wp = start;
3270: while (tmp < end) {
3271: wp += ONIGENC_MBC_TO_NORMALIZE(reg->enc, reg->ambig_flag,
3272: &tmp, end, wp);
3273: }
3274: if (wp != sn->end) NSTRING_SET_AMBIG_REDUCE(node);
3275: sn->end = wp;
3276: }
3277: }
3278: else {
3279: r = divide_ambig_string_node_sub(reg, prev_ambig, prev_start, end,
3280: end, &tailp, &root);
3281: if (r != 0) return r;
3282:
3283: swap_node(node, root);
3284: onig_node_str_clear(root); /* should be after swap! */
3285: onig_node_free(root); /* free original string node */
3286: }
3287:
3288: return 0;
3289: }
3290:
3291: #ifdef USE_COMBINATION_EXPLOSION_CHECK
3292:
3293: #define CEC_THRES_NUM_BIG_REPEAT 512
3294: #define CEC_INFINITE_NUM 0x7fffffff
3295:
3296: #define CEC_IN_INFINITE_REPEAT (1<<0)
3297: #define CEC_IN_FINITE_REPEAT (1<<1)
3298: #define CEC_CONT_BIG_REPEAT (1<<2)
3299:
3300: static int
3301: setup_comb_exp_check(Node* node, int state, ScanEnv* env)
3302: {
3303: int type;
3304: int r = state;
3305:
3306: type = NTYPE(node);
3307: switch (type) {
3308: case N_LIST:
3309: {
3310: Node* prev = NULL_NODE;
3311: do {
3312: r = setup_comb_exp_check(NCONS(node).left, r, env);
3313: prev = NCONS(node).left;
3314: } while (r >= 0 && IS_NOT_NULL(node = NCONS(node).right));
3315: }
3316: break;
3317:
3318: case N_ALT:
3319: {
3320: int ret;
3321: do {
3322: ret = setup_comb_exp_check(NCONS(node).left, state, env);
3323: r |= ret;
3324: } while (ret >= 0 && IS_NOT_NULL(node = NCONS(node).right));
3325: }
3326: break;
3327:
3328: case N_QUANTIFIER:
3329: {
3330: int child_state = state;
3331: int add_state = 0;
3332: QuantifierNode* qn = &(NQUANTIFIER(node));
3333: Node* target = qn->target;
3334: int var_num;
3335:
3336: if (! IS_REPEAT_INFINITE(qn->upper)) {
3337: if (qn->upper > 1) {
3338: /* {0,1}, {1,1} are allowed */
3339: child_state |= CEC_IN_FINITE_REPEAT;
3340:
3341: /* check (a*){n,m}, (a+){n,m} => (a*){n,n}, (a+){n,n} */
3342: if (env->backrefed_mem == 0) {
3343: if (NTYPE(qn->target) == N_EFFECT) {
3344: EffectNode* en = &(NEFFECT(qn->target));
3345: if (en->type == EFFECT_MEMORY) {
3346: if (NTYPE(en->target) == N_QUANTIFIER) {
3347: QuantifierNode* q = &(NQUANTIFIER(en->target));
3348: if (IS_REPEAT_INFINITE(q->upper)
3349: && q->greedy == qn->greedy) {
3350: qn->upper = (qn->lower == 0 ? 1 : qn->lower);
3351: if (qn->upper == 1)
3352: child_state = state;
3353: }
3354: }
3355: }
3356: }
3357: }
3358: }
3359: }
3360:
3361: if (state & CEC_IN_FINITE_REPEAT) {
3362: qn->comb_exp_check_num = -1;
3363: }
3364: else {
3365: if (IS_REPEAT_INFINITE(qn->upper)) {
3366: var_num = CEC_INFINITE_NUM;
3367: child_state |= CEC_IN_INFINITE_REPEAT;
3368: }
3369: else {
3370: var_num = qn->upper - qn->lower;
3371: }
3372:
3373: if (var_num >= CEC_THRES_NUM_BIG_REPEAT)
3374: add_state |= CEC_CONT_BIG_REPEAT;
3375:
3376: if (((state & CEC_IN_INFINITE_REPEAT) != 0 && var_num != 0) ||
3377: ((state & CEC_CONT_BIG_REPEAT) != 0 &&
3378: var_num >= CEC_THRES_NUM_BIG_REPEAT)) {
3379: if (qn->comb_exp_check_num == 0) {
3380: env->num_comb_exp_check++;
3381: qn->comb_exp_check_num = env->num_comb_exp_check;
3382: if (env->curr_max_regnum > env->comb_exp_max_regnum)
3383: env->comb_exp_max_regnum = env->curr_max_regnum;
3384: }
3385: }
3386: }
3387:
3388: r = setup_comb_exp_check(target, child_state, env);
3389: r |= add_state;
3390: }
3391: break;
3392:
3393: case N_EFFECT:
3394: {
3395: EffectNode* en = &(NEFFECT(node));
3396:
3397: switch (en->type) {
3398: case EFFECT_MEMORY:
3399: {
3400: if (env->curr_max_regnum < en->regnum)
3401: env->curr_max_regnum = en->regnum;
3402:
3403: r = setup_comb_exp_check(en->target, state, env);
3404: }
3405: break;
3406:
3407: default:
3408: r = setup_comb_exp_check(en->target, state, env);
3409: break;
3410: }
3411: }
3412: break;
3413:
3414: #ifdef USE_SUBEXP_CALL
3415: case N_CALL:
3416: if (IS_CALL_RECURSION(&(NCALL(node))))
3417: env->has_recursion = 1;
3418: else
3419: r = setup_comb_exp_check(NCALL(node).target, state, env);
3420: break;
3421: #endif
3422:
3423: default:
3424: break;
3425: }
3426:
3427: return r;
3428: }
3429: #endif
3430:
3431: #define IN_ALT (1<<0)
3432: #define IN_NOT (1<<1)
3433: #define IN_REPEAT (1<<2)
3434: #define IN_VAR_REPEAT (1<<3)
3435:
3436: /* setup_tree does the following work.
3437: 1. check empty loop. (set qn->target_empty_info)
3438: 2. expand ignore-case in char class.
3439: 3. set memory status bit flags. (reg->mem_stats)
3440: 4. set qn->head_exact for [push, exact] -> [push_or_jump_exact1, exact].
3441: 5. find invalid patterns in look-behind.
3442: 6. expand repeated string.
3443: */
3444: static int
3445: setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env)
3446: {
3447: int type;
3448: int r = 0;
3449:
3450: type = NTYPE(node);
3451: switch (type) {
3452: case N_LIST:
3453: {
3454: Node* prev = NULL_NODE;
3455: do {
3456: r = setup_tree(NCONS(node).left, reg, state, env);
3457: if (IS_NOT_NULL(prev) && r == 0) {
3458: r = next_setup(prev, NCONS(node).left, reg);
3459: }
3460: prev = NCONS(node).left;
3461: } while (r == 0 && IS_NOT_NULL(node = NCONS(node).right));
3462: }
3463: break;
3464:
3465: case N_ALT:
3466: do {
3467: r = setup_tree(NCONS(node).left, reg, (state | IN_ALT), env);
3468: } while (r == 0 && IS_NOT_NULL(node = NCONS(node).right));
3469: break;
3470:
3471: case N_CCLASS:
3472: break;
3473:
3474: case N_STRING:
3475: if (IS_IGNORECASE(reg->options) && !NSTRING_IS_RAW(node)) {
3476: r = divide_ambig_string_node(node, reg);
3477: }
3478: break;
3479:
3480: case N_CTYPE:
3481: case N_ANYCHAR:
3482: break;
3483:
3484: #ifdef USE_SUBEXP_CALL
3485: case N_CALL:
3486: break;
3487: #endif
3488:
3489: case N_BACKREF:
3490: {
3491: int i;
3492: int* p;
3493: Node** nodes = SCANENV_MEM_NODES(env);
3494: BackrefNode* br = &(NBACKREF(node));
3495: p = BACKREFS_P(br);
3496: for (i = 0; i < br->back_num; i++) {
3497: if (p[i] > env->num_mem) return ONIGERR_INVALID_BACKREF;
3498: BIT_STATUS_ON_AT(env->backrefed_mem, p[i]);
3499: BIT_STATUS_ON_AT(env->bt_mem_start, p[i]);
3500: #ifdef USE_BACKREF_AT_LEVEL
3501: if (IS_BACKREF_NEST_LEVEL(br)) {
3502: BIT_STATUS_ON_AT(env->bt_mem_end, p[i]);
3503: }
3504: #endif
3505: SET_EFFECT_STATUS(nodes[p[i]], NST_MEM_BACKREFED);
3506: }
3507: }
3508: break;
3509:
3510: case N_QUANTIFIER:
3511: {
3512: OnigDistance d;
3513: QuantifierNode* qn = &(NQUANTIFIER(node));
3514: Node* target = qn->target;
3515:
3516: if ((state & IN_REPEAT) != 0) {
3517: qn->state |= NST_IN_REPEAT;
3518: }
3519:
3520: if (IS_REPEAT_INFINITE(qn->upper) || qn->upper >= 1) {
3521: r = get_min_match_length(target, &d, env);
3522: if (r) break;
3523: if (d == 0) {
3524: qn->target_empty_info = NQ_TARGET_IS_EMPTY;
3525: #ifdef USE_INFINITE_REPEAT_MONOMANIAC_MEM_STATUS_CHECK
3526: r = quantifiers_memory_node_info(target);
3527: if (r < 0) break;
3528: if (r > 0) {
3529: qn->target_empty_info = r;
3530: }
3531: #endif
3532: #if 0
3533: r = get_max_match_length(target, &d, env);
3534: if (r == 0 && d == 0) {
3535: /* ()* ==> ()?, ()+ ==> () */
3536: qn->upper = 1;
3537: if (qn->lower > 1) qn->lower = 1;
3538: if (NTYPE(target) == N_STRING) {
3539: qn->upper = qn->lower = 0; /* /(?:)+/ ==> // */
3540: }
3541: }
3542: #endif
3543: }
3544: }
3545:
3546: state |= IN_REPEAT;
3547: if (qn->lower != qn->upper)
3548: state |= IN_VAR_REPEAT;
3549: r = setup_tree(target, reg, state, env);
3550: if (r) break;
3551:
3552: /* expand string */
3553: #define EXPAND_STRING_MAX_LENGTH 100
3554: if (NTYPE(target) == N_STRING) {
3555: if (!IS_REPEAT_INFINITE(qn->lower) && qn->lower == qn->upper &&
3556: qn->lower > 1 && qn->lower <= EXPAND_STRING_MAX_LENGTH) {
3557: int len = NSTRING_LEN(target);
3558: StrNode* sn = &(NSTRING(target));
3559:
3560: if (len * qn->lower <= EXPAND_STRING_MAX_LENGTH) {
3561: int i, n = qn->lower;
3562: onig_node_conv_to_str_node(node, NSTRING(target).flag);
3563: for (i = 0; i < n; i++) {
3564: r = onig_node_str_cat(node, sn->s, sn->end);
3565: if (r) break;
3566: }
3567: onig_node_free(target);
3568: break; /* break case N_QUANTIFIER: */
3569: }
3570: }
3571: }
3572:
3573: #ifdef USE_OP_PUSH_OR_JUMP_EXACT
3574: if (qn->greedy && (qn->target_empty_info != 0)) {
3575: if (NTYPE(target) == N_QUANTIFIER) {
3576: QuantifierNode* tqn = &(NQUANTIFIER(target));
3577: if (IS_NOT_NULL(tqn->head_exact)) {
3578: qn->head_exact = tqn->head_exact;
3579: tqn->head_exact = NULL;
3580: }
3581: }
3582: else {
3583: qn->head_exact = get_head_value_node(qn->target, 1, reg);
3584: }
3585: }
3586: #endif
3587: }
3588: break;
3589:
3590: case N_EFFECT:
3591: {
3592: EffectNode* en = &(NEFFECT(node));
3593:
3594: switch (en->type) {
3595: case EFFECT_OPTION:
3596: {
3597: OnigOptionType options = reg->options;
3598: reg->options = NEFFECT(node).option;
3599: r = setup_tree(NEFFECT(node).target, reg, state, env);
3600: reg->options = options;
3601: }
3602: break;
3603:
3604: case EFFECT_MEMORY:
3605: if ((state & (IN_ALT | IN_NOT | IN_VAR_REPEAT)) != 0) {
3606: BIT_STATUS_ON_AT(env->bt_mem_start, en->regnum);
3607: /* SET_EFFECT_STATUS(node, NST_MEM_IN_ALT_NOT); */
3608: }
3609: r = setup_tree(en->target, reg, state, env);
3610: break;
3611:
3612: case EFFECT_STOP_BACKTRACK:
3613: {
3614: Node* target = en->target;
3615: r = setup_tree(target, reg, state, env);
3616: if (NTYPE(target) == N_QUANTIFIER) {
3617: QuantifierNode* tqn = &(NQUANTIFIER(target));
3618: if (IS_REPEAT_INFINITE(tqn->upper) && tqn->lower <= 1 &&
3619: tqn->greedy != 0) { /* (?>a*), a*+ etc... */
3620: int qtype = NTYPE(tqn->target);
3621: if (IS_NODE_TYPE_SIMPLE(qtype))
3622: SET_EFFECT_STATUS(node, NST_STOP_BT_SIMPLE_REPEAT);
3623: }
3624: }
3625: }
3626: break;
3627: }
3628: }
3629: break;
3630:
3631: case N_ANCHOR:
3632: {
3633: AnchorNode* an = &(NANCHOR(node));
3634:
3635: switch (an->type) {
3636: case ANCHOR_PREC_READ:
3637: r = setup_tree(an->target, reg, state, env);
3638: break;
3639: case ANCHOR_PREC_READ_NOT:
3640: r = setup_tree(an->target, reg, (state | IN_NOT), env);
3641: break;
3642:
3643: /* allowed node types in look-behind */
3644: #define ALLOWED_TYPE_IN_LB \
3645: ( N_LIST | N_ALT | N_STRING | N_CCLASS | N_CTYPE | \
3646: N_ANYCHAR | N_ANCHOR | N_EFFECT | N_QUANTIFIER | N_CALL )
3647:
3648: #define ALLOWED_EFFECT_IN_LB ( EFFECT_MEMORY )
3649: #define ALLOWED_EFFECT_IN_LB_NOT 0
3650:
3651: #define ALLOWED_ANCHOR_IN_LB \
3652: ( ANCHOR_LOOK_BEHIND | ANCHOR_BEGIN_LINE | ANCHOR_END_LINE | ANCHOR_BEGIN_BUF | ANCHOR_BEGIN_POSITION )
3653: #define ALLOWED_ANCHOR_IN_LB_NOT \
3654: ( ANCHOR_LOOK_BEHIND | ANCHOR_LOOK_BEHIND_NOT | ANCHOR_BEGIN_LINE | ANCHOR_END_LINE | ANCHOR_BEGIN_BUF | ANCHOR_BEGIN_POSITION )
3655:
3656: case ANCHOR_LOOK_BEHIND:
3657: {
3658: r = check_type_tree(an->target, ALLOWED_TYPE_IN_LB,
3659: ALLOWED_EFFECT_IN_LB, ALLOWED_ANCHOR_IN_LB);
3660: if (r < 0) return r;
3661: if (r > 0) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
3662: r = setup_look_behind(node, reg, env);
3663: if (r != 0) return r;
3664: r = setup_tree(an->target, reg, state, env);
3665: }
3666: break;
3667:
3668: case ANCHOR_LOOK_BEHIND_NOT:
3669: {
3670: r = check_type_tree(an->target, ALLOWED_TYPE_IN_LB,
3671: ALLOWED_EFFECT_IN_LB_NOT, ALLOWED_ANCHOR_IN_LB_NOT);
3672: if (r < 0) return r;
3673: if (r > 0) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
3674: r = setup_look_behind(node, reg, env);
3675: if (r != 0) return r;
3676: r = setup_tree(an->target, reg, (state | IN_NOT), env);
3677: }
3678: break;
3679: }
3680: }
3681: break;
3682:
3683: default:
3684: break;
3685: }
3686:
3687: return r;
3688: }
3689:
3690: /* set skip map for Boyer-Moor search */
3691: static int
3692: set_bm_skip(UChar* s, UChar* end, OnigEncoding enc,
3693: UChar skip[], int** int_skip)
3694: {
3695: int i, len;
3696:
3697: len = end - s;
3698: if (len < ONIG_CHAR_TABLE_SIZE) {
3699: for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) skip[i] = len;
3700:
3701: for (i = 0; i < len - 1; i++)
3702: skip[s[i]] = len - 1 - i;
3703: }
3704: else {
3705: if (IS_NULL(*int_skip)) {
3706: *int_skip = (int* )xmalloc(sizeof(int) * ONIG_CHAR_TABLE_SIZE);
3707: if (IS_NULL(*int_skip)) return ONIGERR_MEMORY;
3708: }
3709: for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) (*int_skip)[i] = len;
3710:
3711: for (i = 0; i < len - 1; i++)
3712: (*int_skip)[s[i]] = len - 1 - i;
3713: }
3714: return 0;
3715: }
3716:
3717: #define OPT_EXACT_MAXLEN 24
3718:
3719: typedef struct {
3720: OnigDistance min; /* min byte length */
3721: OnigDistance max; /* max byte length */
3722: } MinMaxLen;
3723:
3724: typedef struct {
3725: MinMaxLen mmd;
3726: OnigEncoding enc;
3727: OnigOptionType options;
3728: OnigAmbigType ambig_flag;
3729: ScanEnv* scan_env;
3730: } OptEnv;
3731:
3732: typedef struct {
3733: int left_anchor;
3734: int right_anchor;
3735: } OptAncInfo;
3736:
3737: typedef struct {
3738: MinMaxLen mmd; /* info position */
3739: OptAncInfo anc;
3740:
3741: int reach_end;
3742: int ignore_case;
3743: int len;
3744: UChar s[OPT_EXACT_MAXLEN];
3745: } OptExactInfo;
3746:
3747: typedef struct {
3748: MinMaxLen mmd; /* info position */
3749: OptAncInfo anc;
3750:
3751: int value; /* weighted value */
3752: UChar map[ONIG_CHAR_TABLE_SIZE];
3753: } OptMapInfo;
3754:
3755: typedef struct {
3756: MinMaxLen len;
3757:
3758: OptAncInfo anc;
3759: OptExactInfo exb; /* boundary */
3760: OptExactInfo exm; /* middle */
3761: OptExactInfo expr; /* prec read (?=...) */
3762:
3763: OptMapInfo map; /* boundary */
3764: } NodeOptInfo;
3765:
3766:
3767: static int
3768: map_position_value(OnigEncoding enc, int i)
3769: {
3770: static const short int ByteValTable[] = {
3771: 5, 1, 1, 1, 1, 1, 1, 1, 1, 10, 10, 1, 1, 10, 1, 1,
3772: 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3773: 12, 4, 7, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5,
3774: 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5,
3775: 5, 6, 6, 6, 6, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
3776: 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 5, 5, 5,
3777: 5, 6, 6, 6, 6, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
3778: 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 1
3779: };
3780:
3781: if (i < sizeof(ByteValTable)/sizeof(ByteValTable[0])) {
3782: if (i == 0 && ONIGENC_MBC_MINLEN(enc) > 1)
3783: return 20;
3784: else
3785: return (int )ByteValTable[i];
3786: }
3787: else
3788: return 4; /* Take it easy. */
3789: }
3790:
3791: static int
3792: distance_value(MinMaxLen* mm)
3793: {
3794: /* 1000 / (min-max-dist + 1) */
3795: static const short int dist_vals[] = {
3796: 1000, 500, 333, 250, 200, 167, 143, 125, 111, 100,
3797: 91, 83, 77, 71, 67, 63, 59, 56, 53, 50,
3798: 48, 45, 43, 42, 40, 38, 37, 36, 34, 33,
3799: 32, 31, 30, 29, 29, 28, 27, 26, 26, 25,
3800: 24, 24, 23, 23, 22, 22, 21, 21, 20, 20,
3801: 20, 19, 19, 19, 18, 18, 18, 17, 17, 17,
3802: 16, 16, 16, 16, 15, 15, 15, 15, 14, 14,
3803: 14, 14, 14, 14, 13, 13, 13, 13, 13, 13,
3804: 12, 12, 12, 12, 12, 12, 11, 11, 11, 11,
3805: 11, 11, 11, 11, 11, 10, 10, 10, 10, 10
3806: };
3807:
3808: int d;
3809:
3810: if (mm->max == ONIG_INFINITE_DISTANCE) return 0;
3811:
3812: d = mm->max - mm->min;
3813: if (d < sizeof(dist_vals)/sizeof(dist_vals[0]))
3814: /* return dist_vals[d] * 16 / (mm->min + 12); */
3815: return (int )dist_vals[d];
3816: else
3817: return 1;
3818: }
3819:
3820: static int
3821: comp_distance_value(MinMaxLen* d1, MinMaxLen* d2, int v1, int v2)
3822: {
3823: if (v2 <= 0) return -1;
3824: if (v1 <= 0) return 1;
3825:
3826: v1 *= distance_value(d1);
3827: v2 *= distance_value(d2);
3828:
3829: if (v2 > v1) return 1;
3830: if (v2 < v1) return -1;
3831:
3832: if (d2->min < d1->min) return 1;
3833: if (d2->min > d1->min) return -1;
3834: return 0;
3835: }
3836:
3837: static int
3838: is_equal_mml(MinMaxLen* a, MinMaxLen* b)
3839: {
3840: return (a->min == b->min && a->max == b->max) ? 1 : 0;
3841: }
3842:
3843:
3844: static void
3845: set_mml(MinMaxLen* mml, OnigDistance min, OnigDistance max)
3846: {
3847: mml->min = min;
3848: mml->max = max;
3849: }
3850:
3851: static void
3852: clear_mml(MinMaxLen* mml)
3853: {
3854: mml->min = mml->max = 0;
3855: }
3856:
3857: static void
3858: copy_mml(MinMaxLen* to, MinMaxLen* from)
3859: {
3860: to->min = from->min;
3861: to->max = from->max;
3862: }
3863:
3864: static void
3865: add_mml(MinMaxLen* to, MinMaxLen* from)
3866: {
3867: to->min = distance_add(to->min, from->min);
3868: to->max = distance_add(to->max, from->max);
3869: }
3870:
3871: #if 0
3872: static void
3873: add_len_mml(MinMaxLen* to, OnigDistance len)
3874: {
3875: to->min = distance_add(to->min, len);
3876: to->max = distance_add(to->max, len);
3877: }
3878: #endif
3879:
3880: static void
3881: alt_merge_mml(MinMaxLen* to, MinMaxLen* from)
3882: {
3883: if (to->min > from->min) to->min = from->min;
3884: if (to->max < from->max) to->max = from->max;
3885: }
3886:
3887: static void
3888: copy_opt_env(OptEnv* to, OptEnv* from)
3889: {
3890: *to = *from;
3891: }
3892:
3893: static void
3894: clear_opt_anc_info(OptAncInfo* anc)
3895: {
3896: anc->left_anchor = 0;
3897: anc->right_anchor = 0;
3898: }
3899:
3900: static void
3901: copy_opt_anc_info(OptAncInfo* to, OptAncInfo* from)
3902: {
3903: *to = *from;
3904: }
3905:
3906: static void
3907: concat_opt_anc_info(OptAncInfo* to, OptAncInfo* left, OptAncInfo* right,
3908: OnigDistance left_len, OnigDistance right_len)
3909: {
3910: clear_opt_anc_info(to);
3911:
3912: to->left_anchor = left->left_anchor;
3913: if (left_len == 0) {
3914: to->left_anchor |= right->left_anchor;
3915: }
3916:
3917: to->right_anchor = right->right_anchor;
3918: if (right_len == 0) {
3919: to->right_anchor |= left->right_anchor;
3920: }
3921: }
3922:
3923: static int
3924: is_left_anchor(int anc)
3925: {
3926: if (anc == ANCHOR_END_BUF || anc == ANCHOR_SEMI_END_BUF ||
3927: anc == ANCHOR_END_LINE || anc == ANCHOR_PREC_READ ||
3928: anc == ANCHOR_PREC_READ_NOT)
3929: return 0;
3930:
3931: return 1;
3932: }
3933:
3934: static int
3935: is_set_opt_anc_info(OptAncInfo* to, int anc)
3936: {
3937: if ((to->left_anchor & anc) != 0) return 1;
3938:
3939: return ((to->right_anchor & anc) != 0 ? 1 : 0);
3940: }
3941:
3942: static void
3943: add_opt_anc_info(OptAncInfo* to, int anc)
3944: {
3945: if (is_left_anchor(anc))
3946: to->left_anchor |= anc;
3947: else
3948: to->right_anchor |= anc;
3949: }
3950:
3951: static void
3952: remove_opt_anc_info(OptAncInfo* to, int anc)
3953: {
3954: if (is_left_anchor(anc))
3955: to->left_anchor &= ~anc;
3956: else
3957: to->right_anchor &= ~anc;
3958: }
3959:
3960: static void
3961: alt_merge_opt_anc_info(OptAncInfo* to, OptAncInfo* add)
3962: {
3963: to->left_anchor &= add->left_anchor;
3964: to->right_anchor &= add->right_anchor;
3965: }
3966:
3967: static int
3968: is_full_opt_exact_info(OptExactInfo* ex)
3969: {
3970: return (ex->len >= OPT_EXACT_MAXLEN ? 1 : 0);
3971: }
3972:
3973: static void
3974: clear_opt_exact_info(OptExactInfo* ex)
3975: {
3976: clear_mml(&ex->mmd);
3977: clear_opt_anc_info(&ex->anc);
3978: ex->reach_end = 0;
3979: ex->ignore_case = 0;
3980: ex->len = 0;
3981: ex->s[0] = '\0';
3982: }
3983:
3984: static void
3985: copy_opt_exact_info(OptExactInfo* to, OptExactInfo* from)
3986: {
3987: *to = *from;
3988: }
3989:
3990: static void
3991: concat_opt_exact_info(OptExactInfo* to, OptExactInfo* add, OnigEncoding enc)
3992: {
3993: int i, j, len;
3994: UChar *p, *end;
3995: OptAncInfo tanc;
3996:
3997: if (! to->ignore_case && add->ignore_case) {
3998: if (to->len >= add->len) return ; /* avoid */
3999:
4000: to->ignore_case = 1;
4001: }
4002:
4003: p = add->s;
4004: end = p + add->len;
4005: for (i = to->len; p < end; ) {
4006: len = enc_len(enc, p);
4007: if (i + len > OPT_EXACT_MAXLEN) break;
4008: for (j = 0; j < len && p < end; j++)
4009: to->s[i++] = *p++;
4010: }
4011:
4012: to->len = i;
4013: to->reach_end = (p == end ? add->reach_end : 0);
4014:
4015: concat_opt_anc_info(&tanc, &to->anc, &add->anc, 1, 1);
4016: if (! to->reach_end) tanc.right_anchor = 0;
4017: copy_opt_anc_info(&to->anc, &tanc);
4018: }
4019:
4020: static void
4021: concat_opt_exact_info_str(OptExactInfo* to,
4022: UChar* s, UChar* end, int raw, OnigEncoding enc)
4023: {
4024: int i, j, len;
4025: UChar *p;
4026:
4027: for (i = to->len, p = s; p < end && i < OPT_EXACT_MAXLEN; ) {
4028: len = enc_len(enc, p);
4029: if (i + len > OPT_EXACT_MAXLEN) break;
4030: for (j = 0; j < len && p < end; j++)
4031: to->s[i++] = *p++;
4032: }
4033:
4034: to->len = i;
4035: }
4036:
4037: static void
4038: alt_merge_opt_exact_info(OptExactInfo* to, OptExactInfo* add, OptEnv* env)
4039: {
4040: int i, j, len;
4041:
4042: if (add->len == 0 || to->len == 0) {
4043: clear_opt_exact_info(to);
4044: return ;
4045: }
4046:
4047: if (! is_equal_mml(&to->mmd, &add->mmd)) {
4048: clear_opt_exact_info(to);
4049: return ;
4050: }
4051:
4052: for (i = 0; i < to->len && i < add->len; ) {
4053: if (to->s[i] != add->s[i]) break;
4054: len = enc_len(env->enc, to->s + i);
4055:
4056: for (j = 1; j < len; j++) {
4057: if (to->s[i+j] != add->s[i+j]) break;
4058: }
4059: if (j < len) break;
4060: i += len;
4061: }
4062:
4063: if (! add->reach_end || i < add->len || i < to->len) {
4064: to->reach_end = 0;
4065: }
4066: to->len = i;
4067: to->ignore_case |= add->ignore_case;
4068:
4069: alt_merge_opt_anc_info(&to->anc, &add->anc);
4070: if (! to->reach_end) to->anc.right_anchor = 0;
4071: }
4072:
4073: static void
4074: select_opt_exact_info(OnigEncoding enc, OptExactInfo* now, OptExactInfo* alt)
4075: {
4076: int v1, v2;
4077:
4078: v1 = now->len;
4079: v2 = alt->len;
4080:
4081: if (v2 == 0) {
4082: return ;
4083: }
4084: else if (v1 == 0) {
4085: copy_opt_exact_info(now, alt);
4086: return ;
4087: }
4088: else if (v1 <= 2 && v2 <= 2) {
4089: /* ByteValTable[x] is big value --> low price */
4090: v2 = map_position_value(enc, now->s[0]);
4091: v1 = map_position_value(enc, alt->s[0]);
4092:
4093: if (now->len > 1) v1 += 5;
4094: if (alt->len > 1) v2 += 5;
4095: }
4096:
4097: if (now->ignore_case == 0) v1 *= 2;
4098: if (alt->ignore_case == 0) v2 *= 2;
4099:
4100: if (comp_distance_value(&now->mmd, &alt->mmd, v1, v2) > 0)
4101: copy_opt_exact_info(now, alt);
4102: }
4103:
4104: static void
4105: clear_opt_map_info(OptMapInfo* map)
4106: {
4107: static const OptMapInfo clean_info = {
4108: {0, 0}, {0, 0}, 0,
4109: {
4110: 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4111: 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4112: 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4113: 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4114: 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4115: 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4116: 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4117: 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4118: 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4119: 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4120: 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4121: 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4122: 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4123: 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4124: 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4125: 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
4126: }
4127: };
4128:
4129: xmemcpy(map, &clean_info, sizeof(OptMapInfo));
4130: }
4131:
4132: static void
4133: copy_opt_map_info(OptMapInfo* to, OptMapInfo* from)
4134: {
4135: *to = *from;
4136: }
4137:
4138: static void
4139: add_char_opt_map_info(OptMapInfo* map, UChar c, OnigEncoding enc)
4140: {
4141: if (map->map[c] == 0) {
4142: map->map[c] = 1;
4143: map->value += map_position_value(enc, c);
4144: }
4145: }
4146:
4147: static int
4148: add_char_amb_opt_map_info(OptMapInfo* map, UChar* p, UChar* end,
4149: OnigEncoding enc, OnigAmbigType ambig_flag)
4150: {
4151: int i, n, len;
4152: UChar buf[ONIGENC_MBC_NORMALIZE_MAXLEN];
4153: OnigCodePoint code;
4154: const OnigPairAmbigCodes* pccs;
4155: OnigAmbigType amb;
4156:
4157: add_char_opt_map_info(map, p[0], enc);
4158: code = ONIGENC_MBC_TO_CODE(enc, p, end);
4159:
4160: for (amb = 0x01; amb <= ONIGENC_AMBIGUOUS_MATCH_LIMIT; amb <<= 1) {
4161: if ((amb & ambig_flag) == 0) continue;
4162:
4163: n = ONIGENC_GET_ALL_PAIR_AMBIG_CODES(enc, amb, &pccs);
4164: for (i = 0; i < n; i++) {
4165: if (pccs[i].from == code) {
4166: len = ONIGENC_CODE_TO_MBC(enc, pccs[i].to, buf);
4167: if (len < 0) return len;
4168: add_char_opt_map_info(map, buf[0], enc);
4169: }
4170: }
4171: }
4172: return 0;
4173: }
4174:
4175: static void
4176: select_opt_map_info(OptMapInfo* now, OptMapInfo* alt)
4177: {
4178: static int z = 1<<15; /* 32768: something big value */
4179:
4180: int v1, v2;
4181:
4182: if (alt->value == 0) return ;
4183: if (now->value == 0) {
4184: copy_opt_map_info(now, alt);
4185: return ;
4186: }
4187:
4188: v1 = z / now->value;
4189: v2 = z / alt->value;
4190: if (comp_distance_value(&now->mmd, &alt->mmd, v1, v2) > 0)
4191: copy_opt_map_info(now, alt);
4192: }
4193:
4194: static int
4195: comp_opt_exact_or_map_info(OptExactInfo* e, OptMapInfo* m)
4196: {
4197: #define COMP_EM_BASE 20
4198: int ve, vm;
4199:
4200: if (m->value <= 0) return -1;
4201:
4202: ve = COMP_EM_BASE * e->len * (e->ignore_case ? 1 : 2);
4203: vm = COMP_EM_BASE * 5 * 2 / m->value;
4204: return comp_distance_value(&e->mmd, &m->mmd, ve, vm);
4205: }
4206:
4207: static void
4208: alt_merge_opt_map_info(OnigEncoding enc, OptMapInfo* to, OptMapInfo* add)
4209: {
4210: int i, val;
4211:
4212: /* if (! is_equal_mml(&to->mmd, &add->mmd)) return ; */
4213: if (to->value == 0) return ;
4214: if (add->value == 0 || to->mmd.max < add->mmd.min) {
4215: clear_opt_map_info(to);
4216: return ;
4217: }
4218:
4219: alt_merge_mml(&to->mmd, &add->mmd);
4220:
4221: val = 0;
4222: for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) {
4223: if (add->map[i])
4224: to->map[i] = 1;
4225:
4226: if (to->map[i])
4227: val += map_position_value(enc, i);
4228: }
4229: to->value = val;
4230:
4231: alt_merge_opt_anc_info(&to->anc, &add->anc);
4232: }
4233:
4234: static void
4235: set_bound_node_opt_info(NodeOptInfo* opt, MinMaxLen* mmd)
4236: {
4237: copy_mml(&(opt->exb.mmd), mmd);
4238: copy_mml(&(opt->expr.mmd), mmd);
4239: copy_mml(&(opt->map.mmd), mmd);
4240: }
4241:
4242: static void
4243: clear_node_opt_info(NodeOptInfo* opt)
4244: {
4245: clear_mml(&opt->len);
4246: clear_opt_anc_info(&opt->anc);
4247: clear_opt_exact_info(&opt->exb);
4248: clear_opt_exact_info(&opt->exm);
4249: clear_opt_exact_info(&opt->expr);
4250: clear_opt_map_info(&opt->map);
4251: }
4252:
4253: static void
4254: copy_node_opt_info(NodeOptInfo* to, NodeOptInfo* from)
4255: {
4256: *to = *from;
4257: }
4258:
4259: static void
4260: concat_left_node_opt_info(OnigEncoding enc, NodeOptInfo* to, NodeOptInfo* add)
4261: {
4262: int exb_reach, exm_reach;
4263: OptAncInfo tanc;
4264:
4265: concat_opt_anc_info(&tanc, &to->anc, &add->anc, to->len.max, add->len.max);
4266: copy_opt_anc_info(&to->anc, &tanc);
4267:
4268: if (add->exb.len > 0 && to->len.max == 0) {
4269: concat_opt_anc_info(&tanc, &to->anc, &add->exb.anc,
4270: to->len.max, add->len.max);
4271: copy_opt_anc_info(&add->exb.anc, &tanc);
4272: }
4273:
4274: if (add->map.value > 0 && to->len.max == 0) {
4275: if (add->map.mmd.max == 0)
4276: add->map.anc.left_anchor |= to->anc.left_anchor;
4277: }
4278:
4279: exb_reach = to->exb.reach_end;
4280: exm_reach = to->exm.reach_end;
4281:
4282: if (add->len.max != 0)
4283: to->exb.reach_end = to->exm.reach_end = 0;
4284:
4285: if (add->exb.len > 0) {
4286: if (exb_reach) {
4287: concat_opt_exact_info(&to->exb, &add->exb, enc);
4288: clear_opt_exact_info(&add->exb);
4289: }
4290: else if (exm_reach) {
4291: concat_opt_exact_info(&to->exm, &add->exb, enc);
4292: clear_opt_exact_info(&add->exb);
4293: }
4294: }
4295: select_opt_exact_info(enc, &to->exm, &add->exb);
4296: select_opt_exact_info(enc, &to->exm, &add->exm);
4297:
4298: if (to->expr.len > 0) {
4299: if (add->len.max > 0) {
4300: if (to->expr.len > (int )add->len.max)
4301: to->expr.len = add->len.max;
4302:
4303: if (to->expr.mmd.max == 0)
4304: select_opt_exact_info(enc, &to->exb, &to->expr);
4305: else
4306: select_opt_exact_info(enc, &to->exm, &to->expr);
4307: }
4308: }
4309: else if (add->expr.len > 0) {
4310: copy_opt_exact_info(&to->expr, &add->expr);
4311: }
4312:
4313: select_opt_map_info(&to->map, &add->map);
4314:
4315: add_mml(&to->len, &add->len);
4316: }
4317:
4318: static void
4319: alt_merge_node_opt_info(NodeOptInfo* to, NodeOptInfo* add, OptEnv* env)
4320: {
4321: alt_merge_opt_anc_info (&to->anc, &add->anc);
4322: alt_merge_opt_exact_info(&to->exb, &add->exb, env);
4323: alt_merge_opt_exact_info(&to->exm, &add->exm, env);
4324: alt_merge_opt_exact_info(&to->expr, &add->expr, env);
4325: alt_merge_opt_map_info(env->enc, &to->map, &add->map);
4326:
4327: alt_merge_mml(&to->len, &add->len);
4328: }
4329:
4330:
4331: #define MAX_NODE_OPT_INFO_REF_COUNT 5
4332:
4333: static int
4334: optimize_node_left(Node* node, NodeOptInfo* opt, OptEnv* env)
4335: {
4336: int type;
4337: int r = 0;
4338:
4339: clear_node_opt_info(opt);
4340: set_bound_node_opt_info(opt, &env->mmd);
4341:
4342: type = NTYPE(node);
4343: switch (type) {
4344: case N_LIST:
4345: {
4346: OptEnv nenv;
4347: NodeOptInfo nopt;
4348: Node* nd = node;
4349:
4350: copy_opt_env(&nenv, env);
4351: do {
4352: r = optimize_node_left(NCONS(nd).left, &nopt, &nenv);
4353: if (r == 0) {
4354: add_mml(&nenv.mmd, &nopt.len);
4355: concat_left_node_opt_info(env->enc, opt, &nopt);
4356: }
4357: } while (r == 0 && IS_NOT_NULL(nd = NCONS(nd).right));
4358: }
4359: break;
4360:
4361: case N_ALT:
4362: {
4363: NodeOptInfo nopt;
4364: Node* nd = node;
4365:
4366: do {
4367: r = optimize_node_left(NCONS(nd).left, &nopt, env);
4368: if (r == 0) {
4369: if (nd == node) copy_node_opt_info(opt, &nopt);
4370: else alt_merge_node_opt_info(opt, &nopt, env);
4371: }
4372: } while ((r == 0) && IS_NOT_NULL(nd = NCONS(nd).right));
4373: }
4374: break;
4375:
4376: case N_STRING:
4377: {
4378: StrNode* sn = &(NSTRING(node));
4379: int slen = sn->end - sn->s;
4380: int is_raw = NSTRING_IS_RAW(node);
4381:
4382: if (! NSTRING_IS_AMBIG(node)) {
4383: concat_opt_exact_info_str(&opt->exb, sn->s, sn->end,
4384: NSTRING_IS_RAW(node), env->enc);
4385: if (slen > 0) {
4386: add_char_opt_map_info(&opt->map, *(sn->s), env->enc);
4387: }
4388: set_mml(&opt->len, slen, slen);
4389: }
4390: else {
4391: int n, max;
4392:
4393: concat_opt_exact_info_str(&opt->exb, sn->s, sn->end,
4394: is_raw, env->enc);
4395: opt->exb.ignore_case = 1;
4396:
4397: if (slen > 0) {
4398: r = add_char_amb_opt_map_info(&opt->map, sn->s, sn->end,
4399: env->enc, env->ambig_flag);
4400: if (r != 0) break;
4401: }
4402:
4403: if (NSTRING_IS_AMBIG_REDUCE(node)) {
4404: n = onigenc_strlen(env->enc, sn->s, sn->end);
4405: max = ONIGENC_MBC_MAXLEN_DIST(env->enc) * n;
4406: }
4407: else {
4408: max = slen;
4409: }
4410: set_mml(&opt->len, slen, max);
4411: }
4412:
4413: if (opt->exb.len == slen)
4414: opt->exb.reach_end = 1;
4415: }
4416: break;
4417:
4418: case N_CCLASS:
4419: {
4420: int i, z;
4421: CClassNode* cc = &(NCCLASS(node));
4422:
4423: /* no need to check ignore case. (setted in setup_tree()) */
4424:
4425: if (IS_NOT_NULL(cc->mbuf) || IS_CCLASS_NOT(cc)) {
4426: OnigDistance min = ONIGENC_MBC_MINLEN(env->enc);
4427: OnigDistance max = ONIGENC_MBC_MAXLEN_DIST(env->enc);
4428:
4429: set_mml(&opt->len, min, max);
4430: }
4431: else {
4432: for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
4433: z = BITSET_AT(cc->bs, i);
4434: if ((z && !IS_CCLASS_NOT(cc)) || (!z && IS_CCLASS_NOT(cc))) {
4435: add_char_opt_map_info(&opt->map, (UChar )i, env->enc);
4436: }
4437: }
4438: set_mml(&opt->len, 1, 1);
4439: }
4440: }
4441: break;
4442:
4443: case N_CTYPE:
4444: {
4445: int i, min, max;
4446:
4447: max = ONIGENC_MBC_MAXLEN_DIST(env->enc);
4448:
4449: if (max == 1) {
4450: min = 1;
4451:
4452: switch (NCTYPE(node).type) {
4453: case CTYPE_NOT_WORD:
4454: for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
4455: if (! ONIGENC_IS_CODE_WORD(env->enc, i)) {
4456: add_char_opt_map_info(&opt->map, (UChar )i, env->enc);
4457: }
4458: }
4459: break;
4460:
4461: case CTYPE_WORD:
4462: for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
4463: if (ONIGENC_IS_CODE_WORD(env->enc, i)) {
4464: add_char_opt_map_info(&opt->map, (UChar )i, env->enc);
4465: }
4466: }
4467: break;
4468: }
4469: }
4470: else {
4471: min = ONIGENC_MBC_MINLEN(env->enc);
4472: }
4473: set_mml(&opt->len, min, max);
4474: }
4475: break;
4476:
4477: case N_ANYCHAR:
4478: {
4479: OnigDistance min = ONIGENC_MBC_MINLEN(env->enc);
4480: OnigDistance max = ONIGENC_MBC_MAXLEN_DIST(env->enc);
4481: set_mml(&opt->len, min, max);
4482: }
4483: break;
4484:
4485: case N_ANCHOR:
4486: switch (NANCHOR(node).type) {
4487: case ANCHOR_BEGIN_BUF:
4488: case ANCHOR_BEGIN_POSITION:
4489: case ANCHOR_BEGIN_LINE:
4490: case ANCHOR_END_BUF:
4491: case ANCHOR_SEMI_END_BUF:
4492: case ANCHOR_END_LINE:
4493: add_opt_anc_info(&opt->anc, NANCHOR(node).type);
4494: break;
4495:
4496: case ANCHOR_PREC_READ:
4497: {
4498: NodeOptInfo nopt;
4499:
4500: r = optimize_node_left(NANCHOR(node).target, &nopt, env);
4501: if (r == 0) {
4502: if (nopt.exb.len > 0)
4503: copy_opt_exact_info(&opt->expr, &nopt.exb);
4504: else if (nopt.exm.len > 0)
4505: copy_opt_exact_info(&opt->expr, &nopt.exm);
4506:
4507: opt->expr.reach_end = 0;
4508:
4509: if (nopt.map.value > 0)
4510: copy_opt_map_info(&opt->map, &nopt.map);
4511: }
4512: }
4513: break;
4514:
4515: case ANCHOR_PREC_READ_NOT:
4516: case ANCHOR_LOOK_BEHIND: /* Sorry, I can't make use of it. */
4517: case ANCHOR_LOOK_BEHIND_NOT:
4518: break;
4519: }
4520: break;
4521:
4522: case N_BACKREF:
4523: {
4524: int i;
4525: int* backs;
4526: OnigDistance min, max, tmin, tmax;
4527: Node** nodes = SCANENV_MEM_NODES(env->scan_env);
4528: BackrefNode* br = &(NBACKREF(node));
4529:
4530: if (br->state & NST_RECURSION) {
4531: set_mml(&opt->len, 0, ONIG_INFINITE_DISTANCE);
4532: break;
4533: }
4534: backs = BACKREFS_P(br);
4535: r = get_min_match_length(nodes[backs[0]], &min, env->scan_env);
4536: if (r != 0) break;
4537: r = get_max_match_length(nodes[backs[0]], &max, env->scan_env);
4538: if (r != 0) break;
4539: for (i = 1; i < br->back_num; i++) {
4540: r = get_min_match_length(nodes[backs[i]], &tmin, env->scan_env);
4541: if (r != 0) break;
4542: r = get_max_match_length(nodes[backs[i]], &tmax, env->scan_env);
4543: if (r != 0) break;
4544: if (min > tmin) min = tmin;
4545: if (max < tmax) max = tmax;
4546: }
4547: if (r == 0) set_mml(&opt->len, min, max);
4548: }
4549: break;
4550:
4551: #ifdef USE_SUBEXP_CALL
4552: case N_CALL:
4553: if (IS_CALL_RECURSION(&(NCALL(node))))
4554: set_mml(&opt->len, 0, ONIG_INFINITE_DISTANCE);
4555: else {
4556: OnigOptionType save = env->options;
4557: env->options = NEFFECT(NCALL(node).target).option;
4558: r = optimize_node_left(NCALL(node).target, opt, env);
4559: env->options = save;
4560: }
4561: break;
4562: #endif
4563:
4564: case N_QUANTIFIER:
4565: {
4566: int i;
4567: OnigDistance min, max;
4568: NodeOptInfo nopt;
4569: QuantifierNode* qn = &(NQUANTIFIER(node));
4570:
4571: r = optimize_node_left(qn->target, &nopt, env);
4572: if (r) break;
4573:
4574: if (qn->lower == 0 && IS_REPEAT_INFINITE(qn->upper)) {
4575: if (env->mmd.max == 0 &&
4576: NTYPE(qn->target) == N_ANYCHAR && qn->greedy) {
4577: if (IS_MULTILINE(env->options))
4578: add_opt_anc_info(&opt->anc, ANCHOR_ANYCHAR_STAR_ML);
4579: else
4580: add_opt_anc_info(&opt->anc, ANCHOR_ANYCHAR_STAR);
4581: }
4582: }
4583: else {
4584: if (qn->lower > 0) {
4585: copy_node_opt_info(opt, &nopt);
4586: if (nopt.exb.len > 0) {
4587: if (nopt.exb.reach_end) {
4588: for (i = 2; i < qn->lower &&
4589: ! is_full_opt_exact_info(&opt->exb); i++) {
4590: concat_opt_exact_info(&opt->exb, &nopt.exb, env->enc);
4591: }
4592: if (i < qn->lower) {
4593: opt->exb.reach_end = 0;
4594: }
4595: }
4596: }
4597:
4598: if (qn->lower != qn->upper) {
4599: opt->exb.reach_end = 0;
4600: opt->exm.reach_end = 0;
4601: }
4602: if (qn->lower > 1)
4603: opt->exm.reach_end = 0;
4604: }
4605: }
4606:
4607: min = distance_multiply(nopt.len.min, qn->lower);
4608: if (IS_REPEAT_INFINITE(qn->upper))
4609: max = (nopt.len.max > 0 ? ONIG_INFINITE_DISTANCE : 0);
4610: else
4611: max = distance_multiply(nopt.len.max, qn->upper);
4612:
4613: set_mml(&opt->len, min, max);
4614: }
4615: break;
4616:
4617: case N_EFFECT:
4618: {
4619: EffectNode* en = &(NEFFECT(node));
4620:
4621: switch (en->type) {
4622: case EFFECT_OPTION:
4623: {
4624: OnigOptionType save = env->options;
4625:
4626: env->options = en->option;
4627: r = optimize_node_left(en->target, opt, env);
4628: env->options = save;
4629: }
4630: break;
4631:
4632: case EFFECT_MEMORY:
4633: #ifdef USE_SUBEXP_CALL
4634: en->opt_count++;
4635: if (en->opt_count > MAX_NODE_OPT_INFO_REF_COUNT) {
4636: OnigDistance min, max;
4637:
4638: min = 0;
4639: max = ONIG_INFINITE_DISTANCE;
4640: if (IS_EFFECT_MIN_FIXED(en)) min = en->min_len;
4641: if (IS_EFFECT_MAX_FIXED(en)) max = en->max_len;
4642: set_mml(&opt->len, min, max);
4643: }
4644: else
4645: #endif
4646: {
4647: r = optimize_node_left(en->target, opt, env);
4648:
4649: if (is_set_opt_anc_info(&opt->anc, ANCHOR_ANYCHAR_STAR_MASK)) {
4650: if (BIT_STATUS_AT(env->scan_env->backrefed_mem, en->regnum))
4651: remove_opt_anc_info(&opt->anc, ANCHOR_ANYCHAR_STAR_MASK);
4652: }
4653: }
4654: break;
4655:
4656: case EFFECT_STOP_BACKTRACK:
4657: r = optimize_node_left(en->target, opt, env);
4658: break;
4659: }
4660: }
4661: break;
4662:
4663: default:
4664: #ifdef ONIG_DEBUG
4665: fprintf(stderr, "optimize_node_left: undefined node type %d\n",
4666: NTYPE(node));
4667: #endif
4668: r = ONIGERR_TYPE_BUG;
4669: break;
4670: }
4671:
4672: return r;
4673: }
4674:
4675: static int
4676: set_optimize_exact_info(regex_t* reg, OptExactInfo* e)
4677: {
4678: int r;
4679:
4680: if (e->len == 0) return 0;
4681:
4682: if (e->ignore_case) {
4683: reg->exact = (UChar* )xmalloc(e->len);
4684: CHECK_NULL_RETURN_VAL(reg->exact, ONIGERR_MEMORY);
4685: xmemcpy(reg->exact, e->s, e->len);
4686: reg->exact_end = reg->exact + e->len;
4687: reg->optimize = ONIG_OPTIMIZE_EXACT_IC;
4688: }
4689: else {
4690: int allow_reverse;
4691:
4692: reg->exact = k_strdup(e->s, e->s + e->len);
4693: CHECK_NULL_RETURN_VAL(reg->exact, ONIGERR_MEMORY);
4694: reg->exact_end = reg->exact + e->len;
4695:
4696: allow_reverse =
4697: ONIGENC_IS_ALLOWED_REVERSE_MATCH(reg->enc, reg->exact, reg->exact_end);
4698:
4699: if (e->len >= 3 || (e->len >= 2 && allow_reverse)) {
4700: r = set_bm_skip(reg->exact, reg->exact_end, reg->enc,
4701: reg->map, &(reg->int_map));
4702: if (r) return r;
4703:
4704: reg->optimize = (allow_reverse != 0
4705: ? ONIG_OPTIMIZE_EXACT_BM : ONIG_OPTIMIZE_EXACT_BM_NOT_REV);
4706: }
4707: else {
4708: reg->optimize = ONIG_OPTIMIZE_EXACT;
4709: }
4710: }
4711:
4712: reg->dmin = e->mmd.min;
4713: reg->dmax = e->mmd.max;
4714:
4715: if (reg->dmin != ONIG_INFINITE_DISTANCE) {
4716: reg->threshold_len = reg->dmin + (reg->exact_end - reg->exact);
4717: }
4718:
4719: return 0;
4720: }
4721:
4722: static void
4723: set_optimize_map_info(regex_t* reg, OptMapInfo* m)
4724: {
4725: int i;
4726:
4727: for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++)
4728: reg->map[i] = m->map[i];
4729:
4730: reg->optimize = ONIG_OPTIMIZE_MAP;
4731: reg->dmin = m->mmd.min;
4732: reg->dmax = m->mmd.max;
4733:
4734: if (reg->dmin != ONIG_INFINITE_DISTANCE) {
4735: reg->threshold_len = reg->dmin + 1;
4736: }
4737: }
4738:
4739: static void
4740: set_sub_anchor(regex_t* reg, OptAncInfo* anc)
4741: {
4742: reg->sub_anchor |= anc->left_anchor & ANCHOR_BEGIN_LINE;
4743: reg->sub_anchor |= anc->right_anchor & ANCHOR_END_LINE;
4744: }
4745:
4746: #ifdef ONIG_DEBUG
4747: static void print_optimize_info(FILE* f, regex_t* reg);
4748: #endif
4749:
4750: static int
4751: set_optimize_info_from_tree(Node* node, regex_t* reg, ScanEnv* scan_env)
4752: {
4753:
4754: int r;
4755: NodeOptInfo opt;
4756: OptEnv env;
4757:
4758: env.enc = reg->enc;
4759: env.options = reg->options;
4760: env.ambig_flag = reg->ambig_flag;
4761: env.scan_env = scan_env;
4762: clear_mml(&env.mmd);
4763:
4764: r = optimize_node_left(node, &opt, &env);
4765: if (r) return r;
4766:
4767: reg->anchor = opt.anc.left_anchor & (ANCHOR_BEGIN_BUF |
4768: ANCHOR_BEGIN_POSITION | ANCHOR_ANYCHAR_STAR | ANCHOR_ANYCHAR_STAR_ML);
4769:
4770: reg->anchor |= opt.anc.right_anchor & (ANCHOR_END_BUF | ANCHOR_SEMI_END_BUF);
4771:
4772: if (reg->anchor & (ANCHOR_END_BUF | ANCHOR_SEMI_END_BUF)) {
4773: reg->anchor_dmin = opt.len.min;
4774: reg->anchor_dmax = opt.len.max;
4775: }
4776:
4777: if (opt.exb.len > 0 || opt.exm.len > 0) {
4778: select_opt_exact_info(reg->enc, &opt.exb, &opt.exm);
4779: if (opt.map.value > 0 &&
4780: comp_opt_exact_or_map_info(&opt.exb, &opt.map) > 0) {
4781: goto set_map;
4782: }
4783: else {
4784: r = set_optimize_exact_info(reg, &opt.exb);
4785: set_sub_anchor(reg, &opt.exb.anc);
4786: }
4787: }
4788: else if (opt.map.value > 0) {
4789: set_map:
4790: set_optimize_map_info(reg, &opt.map);
4791: set_sub_anchor(reg, &opt.map.anc);
4792: }
4793: else {
4794: reg->sub_anchor |= opt.anc.left_anchor & ANCHOR_BEGIN_LINE;
4795: if (opt.len.max == 0)
4796: reg->sub_anchor |= opt.anc.right_anchor & ANCHOR_END_LINE;
4797: }
4798:
4799: #if defined(ONIG_DEBUG_COMPILE) || defined(ONIG_DEBUG_MATCH)
4800: print_optimize_info(stderr, reg);
4801: #endif
4802: return r;
4803: }
4804:
4805: static void
4806: clear_optimize_info(regex_t* reg)
4807: {
4808: reg->optimize = ONIG_OPTIMIZE_NONE;
4809: reg->anchor = 0;
4810: reg->anchor_dmin = 0;
4811: reg->anchor_dmax = 0;
4812: reg->sub_anchor = 0;
4813: reg->exact_end = (UChar* )NULL;
4814: reg->threshold_len = 0;
4815: if (IS_NOT_NULL(reg->exact)) {
4816: xfree(reg->exact);
4817: reg->exact = (UChar* )NULL;
4818: }
4819: }
4820:
4821: #ifdef ONIG_DEBUG
4822:
4823: static void print_enc_string(FILE* fp, OnigEncoding enc,
4824: const UChar *s, const UChar *end)
4825: {
4826: fprintf(fp, "\nPATTERN: /");
4827:
4828: if (ONIGENC_MBC_MINLEN(enc) > 1) {
4829: const UChar *p;
4830: OnigCodePoint code;
4831:
4832: p = s;
4833: while (p < end) {
4834: code = ONIGENC_MBC_TO_CODE(enc, p, end);
4835: if (code >= 0x80) {
4836: fprintf(fp, " 0x%04x ", (int )code);
4837: }
4838: else {
4839: fputc((int )code, fp);
4840: }
4841:
4842: p += enc_len(enc, p);
4843: }
4844: }
4845: else {
4846: while (s < end) {
4847: fputc((int )*s, fp);
4848: s++;
4849: }
4850: }
4851:
4852: fprintf(fp, "/\n");
4853: }
4854:
4855: static void
4856: print_distance_range(FILE* f, OnigDistance a, OnigDistance b)
4857: {
4858: if (a == ONIG_INFINITE_DISTANCE)
4859: fputs("inf", f);
4860: else
4861: fprintf(f, "(%u)", a);
4862:
4863: fputs("-", f);
4864:
4865: if (b == ONIG_INFINITE_DISTANCE)
4866: fputs("inf", f);
4867: else
4868: fprintf(f, "(%u)", b);
4869: }
4870:
4871: static void
4872: print_anchor(FILE* f, int anchor)
4873: {
4874: int q = 0;
4875:
4876: fprintf(f, "[");
4877:
4878: if (anchor & ANCHOR_BEGIN_BUF) {
4879: fprintf(f, "begin-buf");
4880: q = 1;
4881: }
4882: if (anchor & ANCHOR_BEGIN_LINE) {
4883: if (q) fprintf(f, ", ");
4884: q = 1;
4885: fprintf(f, "begin-line");
4886: }
4887: if (anchor & ANCHOR_BEGIN_POSITION) {
4888: if (q) fprintf(f, ", ");
4889: q = 1;
4890: fprintf(f, "begin-pos");
4891: }
4892: if (anchor & ANCHOR_END_BUF) {
4893: if (q) fprintf(f, ", ");
4894: q = 1;
4895: fprintf(f, "end-buf");
4896: }
4897: if (anchor & ANCHOR_SEMI_END_BUF) {
4898: if (q) fprintf(f, ", ");
4899: q = 1;
4900: fprintf(f, "semi-end-buf");
4901: }
4902: if (anchor & ANCHOR_END_LINE) {
4903: if (q) fprintf(f, ", ");
4904: q = 1;
4905: fprintf(f, "end-line");
4906: }
4907: if (anchor & ANCHOR_ANYCHAR_STAR) {
4908: if (q) fprintf(f, ", ");
4909: q = 1;
4910: fprintf(f, "anychar-star");
4911: }
4912: if (anchor & ANCHOR_ANYCHAR_STAR_ML) {
4913: if (q) fprintf(f, ", ");
4914: fprintf(f, "anychar-star-pl");
4915: }
4916:
4917: fprintf(f, "]");
4918: }
4919:
4920: static void
4921: print_optimize_info(FILE* f, regex_t* reg)
4922: {
4923: static const char* on[] = { "NONE", "EXACT", "EXACT_BM", "EXACT_BM_NOT_REV",
4924: "EXACT_IC", "MAP" };
4925:
4926: fprintf(f, "optimize: %s\n", on[reg->optimize]);
4927: fprintf(f, " anchor: "); print_anchor(f, reg->anchor);
4928: if ((reg->anchor & ANCHOR_END_BUF_MASK) != 0)
4929: print_distance_range(f, reg->anchor_dmin, reg->anchor_dmax);
4930: fprintf(f, "\n");
4931:
4932: if (reg->optimize) {
4933: fprintf(f, " sub anchor: "); print_anchor(f, reg->sub_anchor);
4934: fprintf(f, "\n");
4935: }
4936: fprintf(f, "\n");
4937:
4938: if (reg->exact) {
4939: UChar *p;
4940: fprintf(f, "exact: [");
4941: for (p = reg->exact; p < reg->exact_end; p++) {
4942: fputc(*p, f);
4943: }
4944: fprintf(f, "]: length: %d\n", (reg->exact_end - reg->exact));
4945: }
4946: else if (reg->optimize & ONIG_OPTIMIZE_MAP) {
4947: int c, i, n = 0;
4948:
4949: for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++)
4950: if (reg->map[i]) n++;
4951:
4952: fprintf(f, "map: n=%d\n", n);
4953: if (n > 0) {
4954: c = 0;
4955: fputc('[', f);
4956: for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) {
4957: if (reg->map[i] != 0) {
4958: if (c > 0) fputs(", ", f);
4959: c++;
4960: if (ONIGENC_MBC_MAXLEN(reg->enc) == 1 &&
4961: ONIGENC_IS_CODE_PRINT(reg->enc, (OnigCodePoint )i))
4962: fputc(i, f);
4963: else
4964: fprintf(f, "%d", i);
4965: }
4966: }
4967: fprintf(f, "]\n");
4968: }
4969: }
4970: }
4971: #endif /* ONIG_DEBUG */
4972:
4973:
4974: static void
4975: onig_free_body(regex_t* reg)
4976: {
4977: if (IS_NOT_NULL(reg->p)) xfree(reg->p);
4978: if (IS_NOT_NULL(reg->exact)) xfree(reg->exact);
4979: if (IS_NOT_NULL(reg->int_map)) xfree(reg->int_map);
4980: if (IS_NOT_NULL(reg->int_map_backward)) xfree(reg->int_map_backward);
4981: if (IS_NOT_NULL(reg->repeat_range)) xfree(reg->repeat_range);
4982: if (IS_NOT_NULL(reg->chain)) onig_free(reg->chain);
4983:
4984: #ifdef USE_NAMED_GROUP
4985: onig_names_free(reg);
4986: #endif
4987: }
4988:
4989: extern void
4990: onig_free(regex_t* reg)
4991: {
4992: if (IS_NOT_NULL(reg)) {
4993: onig_free_body(reg);
4994: xfree(reg);
4995: }
4996: }
4997:
4998: #define REGEX_TRANSFER(to,from) do {\
4999: (to)->state = ONIG_STATE_MODIFY;\
5000: onig_free_body(to);\
5001: xmemcpy(to, from, sizeof(regex_t));\
5002: xfree(from);\
5003: } while (0)
5004:
5005: extern void
5006: onig_transfer(regex_t* to, regex_t* from)
5007: {
5008: THREAD_ATOMIC_START;
5009: REGEX_TRANSFER(to, from);
5010: THREAD_ATOMIC_END;
5011: }
5012:
5013: #define REGEX_CHAIN_HEAD(reg) do {\
5014: while (IS_NOT_NULL((reg)->chain)) {\
5015: (reg) = (reg)->chain;\
5016: }\
5017: } while (0)
5018:
5019: extern void
5020: onig_chain_link_add(regex_t* to, regex_t* add)
5021: {
5022: THREAD_ATOMIC_START;
5023: REGEX_CHAIN_HEAD(to);
5024: to->chain = add;
5025: THREAD_ATOMIC_END;
5026: }
5027:
5028: extern void
5029: onig_chain_reduce(regex_t* reg)
5030: {
5031: regex_t *head, *prev;
5032:
5033: prev = reg;
5034: head = prev->chain;
5035: if (IS_NOT_NULL(head)) {
5036: reg->state = ONIG_STATE_MODIFY;
5037: while (IS_NOT_NULL(head->chain)) {
5038: prev = head;
5039: head = head->chain;
5040: }
5041: prev->chain = (regex_t* )NULL;
5042: REGEX_TRANSFER(reg, head);
5043: }
5044: }
5045:
5046: #if 0
5047: extern int
5048: onig_clone(regex_t** to, regex_t* from)
5049: {
5050: int r, size;
5051: regex_t* reg;
5052:
5053: #ifdef USE_MULTI_THREAD_SYSTEM
5054: if (ONIG_STATE(from) >= ONIG_STATE_NORMAL) {
5055: ONIG_STATE_INC(from);
5056: if (IS_NOT_NULL(from->chain) && ONIG_STATE(reg) == ONIG_STATE_NORMAL) {
5057: onig_chain_reduce(from);
5058: ONIG_STATE_INC(from);
5059: }
5060: }
5061: else {
5062: int n = 0;
5063: while (ONIG_STATE(from) < ONIG_STATE_NORMAL) {
5064: if (++n > THREAD_PASS_LIMIT_COUNT)
5065: return ONIGERR_OVER_THREAD_PASS_LIMIT_COUNT;
5066: THREAD_PASS;
5067: }
5068: ONIG_STATE_INC(from);
5069: }
5070: #endif /* USE_MULTI_THREAD_SYSTEM */
5071:
5072: r = onig_alloc_init(®, ONIG_OPTION_NONE, ONIGENC_AMBIGUOUS_MATCH_DEFAULT,
5073: from->enc, ONIG_SYNTAX_DEFAULT);
5074: if (r != 0) {
5075: ONIG_STATE_DEC(from);
5076: return r;
5077: }
5078:
5079: xmemcpy(reg, from, sizeof(onig_t));
5080: reg->chain = (regex_t* )NULL;
5081: reg->state = ONIG_STATE_NORMAL;
5082:
5083: if (from->p) {
5084: reg->p = (UChar* )xmalloc(reg->alloc);
5085: if (IS_NULL(reg->p)) goto mem_error;
5086: xmemcpy(reg->p, from->p, reg->alloc);
5087: }
5088:
5089: if (from->exact) {
5090: reg->exact = (UChar* )xmalloc(from->exact_end - from->exact);
5091: if (IS_NULL(reg->exact)) goto mem_error;
5092: reg->exact_end = reg->exact + (from->exact_end - from->exact);
5093: xmemcpy(reg->exact, from->exact, reg->exact_end - reg->exact);
5094: }
5095:
5096: if (from->int_map) {
5097: size = sizeof(int) * ONIG_CHAR_TABLE_SIZE;
5098: reg->int_map = (int* )xmalloc(size);
5099: if (IS_NULL(reg->int_map)) goto mem_error;
5100: xmemcpy(reg->int_map, from->int_map, size);
5101: }
5102:
5103: if (from->int_map_backward) {
5104: size = sizeof(int) * ONIG_CHAR_TABLE_SIZE;
5105: reg->int_map_backward = (int* )xmalloc(size);
5106: if (IS_NULL(reg->int_map_backward)) goto mem_error;
5107: xmemcpy(reg->int_map_backward, from->int_map_backward, size);
5108: }
5109:
5110: #ifdef USE_NAMED_GROUP
5111: reg->name_table = names_clone(from); /* names_clone is not implemented */
5112: #endif
5113:
5114: ONIG_STATE_DEC(from);
5115: *to = reg;
5116: return 0;
5117:
5118: mem_error:
5119: ONIG_STATE_DEC(from);
5120: return ONIGERR_MEMORY;
5121: }
5122: #endif
5123:
5124: #ifdef ONIG_DEBUG
5125: static void print_compiled_byte_code_list P_((FILE* f, regex_t* reg));
5126: #endif
5127: #ifdef ONIG_DEBUG_PARSE_TREE
5128: static void print_tree P_((FILE* f, Node* node));
5129: #endif
5130:
5131: extern int
5132: onig_compile(regex_t* reg, const UChar* pattern, const UChar* pattern_end,
5133: OnigErrorInfo* einfo)
5134: {
5135: #define COMPILE_INIT_SIZE 20
5136:
5137: int r, init_size;
5138: Node* root;
5139: ScanEnv scan_env;
5140: #ifdef USE_SUBEXP_CALL
5141: UnsetAddrList uslist;
5142: #endif
5143:
5144: reg->state = ONIG_STATE_COMPILING;
5145:
5146: #ifdef ONIG_DEBUG
5147: print_enc_string(stderr, reg->enc, pattern, pattern_end);
5148: #endif
5149:
5150: if (reg->alloc == 0) {
5151: init_size = (pattern_end - pattern) * 2;
5152: if (init_size <= 0) init_size = COMPILE_INIT_SIZE;
5153: r = BBUF_INIT(reg, init_size);
5154: if (r != 0) goto end;
5155: }
5156: else
5157: reg->used = 0;
5158:
5159: reg->num_mem = 0;
5160: reg->num_repeat = 0;
5161: reg->num_null_check = 0;
5162: reg->repeat_range_alloc = 0;
5163: reg->repeat_range = (OnigRepeatRange* )NULL;
5164: #ifdef USE_COMBINATION_EXPLOSION_CHECK
5165: reg->num_comb_exp_check = 0;
5166: #endif
5167:
5168: r = onig_parse_make_tree(&root, pattern, pattern_end, reg, &scan_env);
5169: if (r != 0) goto err;
5170:
5171: #ifdef USE_NAMED_GROUP
5172: /* mixed use named group and no-named group */
5173: if (scan_env.num_named > 0 &&
5174: IS_SYNTAX_BV(scan_env.syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) &&
5175: !ONIG_IS_OPTION_ON(reg->options, ONIG_OPTION_CAPTURE_GROUP)) {
5176: if (scan_env.num_named != scan_env.num_mem)
5177: r = disable_noname_group_capture(&root, reg, &scan_env);
5178: else
5179: r = numbered_ref_check(root);
5180:
5181: if (r != 0) goto err;
5182: }
5183: #endif
5184:
5185: #ifdef ONIG_DEBUG_PARSE_TREE
5186: print_tree(stderr, root);
5187: #endif
5188:
5189: #ifdef USE_SUBEXP_CALL
5190: if (scan_env.num_call > 0) {
5191: r = unset_addr_list_init(&uslist, scan_env.num_call);
5192: if (r != 0) goto err;
5193: scan_env.unset_addr_list = &uslist;
5194: r = setup_subexp_call(root, &scan_env);
5195: if (r != 0) goto err_unset;
5196: r = subexp_recursive_check_trav(root, &scan_env);
5197: if (r < 0) goto err_unset;
5198: r = subexp_inf_recursive_check_trav(root, &scan_env);
5199: if (r != 0) goto err_unset;
5200:
5201: reg->num_call = scan_env.num_call;
5202: }
5203: else
5204: reg->num_call = 0;
5205: #endif
5206:
5207: r = setup_tree(root, reg, 0, &scan_env);
5208: if (r != 0) goto err_unset;
5209:
5210: reg->capture_history = scan_env.capture_history;
5211: reg->bt_mem_start = scan_env.bt_mem_start;
5212: reg->bt_mem_start |= reg->capture_history;
5213: if (IS_FIND_CONDITION(reg->options))
5214: BIT_STATUS_ON_ALL(reg->bt_mem_end);
5215: else {
5216: reg->bt_mem_end = scan_env.bt_mem_end;
5217: reg->bt_mem_end |= reg->capture_history;
5218: }
5219:
5220: #ifdef USE_COMBINATION_EXPLOSION_CHECK
5221: if (scan_env.backrefed_mem == 0
5222: #ifdef USE_SUBEXP_CALL
5223: || scan_env.num_call == 0
5224: #endif
5225: ) {
5226: setup_comb_exp_check(root, 0, &scan_env);
5227: #ifdef USE_SUBEXP_CALL
5228: if (scan_env.has_recursion != 0) {
5229: scan_env.num_comb_exp_check = 0;
5230: }
5231: else
5232: #endif
5233: if (scan_env.comb_exp_max_regnum > 0) {
5234: int i;
5235: for (i = 1; i <= scan_env.comb_exp_max_regnum; i++) {
5236: if (BIT_STATUS_AT(scan_env.backrefed_mem, i) != 0) {
5237: scan_env.num_comb_exp_check = 0;
5238: break;
5239: }
5240: }
5241: }
5242: }
5243:
5244: reg->num_comb_exp_check = scan_env.num_comb_exp_check;
5245: #endif
5246:
5247: clear_optimize_info(reg);
5248: #ifndef ONIG_DONT_OPTIMIZE
5249: r = set_optimize_info_from_tree(root, reg, &scan_env);
5250: if (r != 0) goto err_unset;
5251: #endif
5252:
5253: if (IS_NOT_NULL(scan_env.mem_nodes_dynamic)) {
5254: xfree(scan_env.mem_nodes_dynamic);
5255: scan_env.mem_nodes_dynamic = (Node** )NULL;
5256: }
5257:
5258: r = compile_tree(root, reg);
5259: if (r == 0) {
5260: r = add_opcode(reg, OP_END);
5261: #ifdef USE_SUBEXP_CALL
5262: if (scan_env.num_call > 0) {
5263: r = unset_addr_list_fix(&uslist, reg);
5264: unset_addr_list_end(&uslist);
5265: if (r) goto err;
5266: }
5267: #endif
5268:
5269: if ((reg->num_repeat != 0) || (reg->bt_mem_end != 0))
5270: reg->stack_pop_level = STACK_POP_LEVEL_ALL;
5271: else {
5272: if (reg->bt_mem_start != 0)
5273: reg->stack_pop_level = STACK_POP_LEVEL_MEM_START;
5274: else
5275: reg->stack_pop_level = STACK_POP_LEVEL_FREE;
5276: }
5277: }
5278: #ifdef USE_SUBEXP_CALL
5279: else if (scan_env.num_call > 0) {
5280: unset_addr_list_end(&uslist);
5281: }
5282: #endif
5283: onig_node_free(root);
5284:
5285: #ifdef ONIG_DEBUG_COMPILE
5286: #ifdef USE_NAMED_GROUP
5287: onig_print_names(stderr, reg);
5288: #endif
5289: print_compiled_byte_code_list(stderr, reg);
5290: #endif
5291:
5292: end:
5293: reg->state = ONIG_STATE_NORMAL;
5294: return r;
5295:
5296: err_unset:
5297: #ifdef USE_SUBEXP_CALL
5298: if (scan_env.num_call > 0) {
5299: unset_addr_list_end(&uslist);
5300: }
5301: #endif
5302: err:
5303: if (IS_NOT_NULL(scan_env.error)) {
5304: if (IS_NOT_NULL(einfo)) {
5305: einfo->enc = scan_env.enc;
5306: einfo->par = scan_env.error;
5307: einfo->par_end = scan_env.error_end;
5308: }
5309: }
5310:
5311: if (IS_NOT_NULL(root)) onig_node_free(root);
5312: if (IS_NOT_NULL(scan_env.mem_nodes_dynamic))
5313: xfree(scan_env.mem_nodes_dynamic);
5314: return r;
5315: }
5316:
5317: #ifdef USE_RECOMPILE_API
5318: extern int
5319: onig_recompile(regex_t* reg, const UChar* pattern, const UChar* pattern_end,
5320: OnigOptionType option, OnigEncoding enc, OnigSyntaxType* syntax,
5321: OnigErrorInfo* einfo)
5322: {
5323: int r;
5324: regex_t *new_reg;
5325:
5326: r = onig_new(&new_reg, pattern, pattern_end, option, enc, syntax, einfo);
5327: if (r) return r;
5328: if (ONIG_STATE(reg) == ONIG_STATE_NORMAL) {
5329: onig_transfer(reg, new_reg);
5330: }
5331: else {
5332: onig_chain_link_add(reg, new_reg);
5333: }
5334: return 0;
5335: }
5336: #endif
5337:
5338: static int onig_inited = 0;
5339:
5340: extern int
5341: onig_alloc_init(regex_t** reg, OnigOptionType option, OnigAmbigType ambig_flag,
5342: OnigEncoding enc, OnigSyntaxType* syntax)
5343: {
5344: if (! onig_inited)
5345: onig_init();
5346:
5347: if (ONIGENC_IS_UNDEF(enc))
5348: return ONIGERR_DEFAULT_ENCODING_IS_NOT_SETTED;
5349:
5350: if ((option & (ONIG_OPTION_DONT_CAPTURE_GROUP|ONIG_OPTION_CAPTURE_GROUP))
5351: == (ONIG_OPTION_DONT_CAPTURE_GROUP|ONIG_OPTION_CAPTURE_GROUP)) {
5352: return ONIGERR_INVALID_COMBINATION_OF_OPTIONS;
5353: }
5354:
5355: *reg = (regex_t* )xmalloc(sizeof(regex_t));
5356: if (IS_NULL(*reg)) return ONIGERR_MEMORY;
5357: (*reg)->state = ONIG_STATE_MODIFY;
5358:
5359: if ((option & ONIG_OPTION_NEGATE_SINGLELINE) != 0) {
5360: option |= syntax->options;
5361: option &= ~ONIG_OPTION_SINGLELINE;
5362: }
5363: else
5364: option |= syntax->options;
5365:
5366: (*reg)->enc = enc;
5367: (*reg)->options = option;
5368: (*reg)->syntax = syntax;
5369: (*reg)->optimize = 0;
5370: (*reg)->exact = (UChar* )NULL;
5371: (*reg)->int_map = (int* )NULL;
5372: (*reg)->int_map_backward = (int* )NULL;
5373: (*reg)->chain = (regex_t* )NULL;
5374:
5375: (*reg)->p = (UChar* )NULL;
5376: (*reg)->alloc = 0;
5377: (*reg)->used = 0;
5378: (*reg)->name_table = (void* )NULL;
5379:
5380: (*reg)->ambig_flag = ambig_flag;
5381: (*reg)->ambig_flag &= ONIGENC_SUPPORT_AMBIG_FLAG(enc);
5382:
5383: return 0;
5384: }
5385:
5386: extern int
5387: onig_new(regex_t** reg, const UChar* pattern, const UChar* pattern_end,
5388: OnigOptionType option, OnigEncoding enc, OnigSyntaxType* syntax,
5389: OnigErrorInfo* einfo)
5390: {
5391: int r;
5392:
5393: if (IS_NOT_NULL(einfo)) einfo->par = (UChar* )NULL;
5394:
5395: r = onig_alloc_init(reg, option, ONIGENC_AMBIGUOUS_MATCH_DEFAULT,
5396: enc, syntax);
5397: if (r) return r;
5398:
5399: r = onig_compile(*reg, pattern, pattern_end, einfo);
5400: if (r) {
5401: onig_free(*reg);
5402: *reg = NULL;
5403: }
5404: return r;
5405: }
5406:
5407: extern int
5408: onig_init(void)
5409: {
5410: if (onig_inited != 0)
5411: return 0;
5412:
5413: onig_inited = 1;
5414:
5415: THREAD_SYSTEM_INIT;
5416: THREAD_ATOMIC_START;
5417:
5418: onigenc_init();
5419: onigenc_set_default_caseconv_table((UChar* )0);
5420:
5421: #ifdef ONIG_DEBUG_STATISTICS
5422: onig_statistics_init();
5423: #endif
5424:
5425: THREAD_ATOMIC_END;
5426: return 0;
5427: }
5428:
5429:
5430: extern int
5431: onig_end(void)
5432: {
5433: extern int onig_free_shared_cclass_table(void);
5434:
5435: THREAD_ATOMIC_START;
5436:
5437: #ifdef ONIG_DEBUG_STATISTICS
5438: onig_print_statistics(stderr);
5439: #endif
5440:
5441: #ifdef USE_SHARED_CCLASS_TABLE
5442: onig_free_shared_cclass_table();
5443: #endif
5444:
5445: #ifdef USE_RECYCLE_NODE
5446: onig_free_node_list();
5447: #endif
5448:
5449: onig_inited = 0;
5450:
5451: THREAD_ATOMIC_END;
5452: THREAD_SYSTEM_END;
5453: return 0;
5454: }
5455:
5456:
5457: #ifdef ONIG_DEBUG
5458:
5459: /* arguments type */
5460: #define ARG_SPECIAL -1
5461: #define ARG_NON 0
5462: #define ARG_RELADDR 1
5463: #define ARG_ABSADDR 2
5464: #define ARG_LENGTH 3
5465: #define ARG_MEMNUM 4
5466: #define ARG_OPTION 5
5467: #define ARG_STATE_CHECK 6
5468:
5469: OnigOpInfoType OnigOpInfo[] = {
5470: { OP_FINISH, "finish", ARG_NON },
5471: { OP_END, "end", ARG_NON },
5472: { OP_EXACT1, "exact1", ARG_SPECIAL },
5473: { OP_EXACT2, "exact2", ARG_SPECIAL },
5474: { OP_EXACT3, "exact3", ARG_SPECIAL },
5475: { OP_EXACT4, "exact4", ARG_SPECIAL },
5476: { OP_EXACT5, "exact5", ARG_SPECIAL },
5477: { OP_EXACTN, "exactn", ARG_SPECIAL },
5478: { OP_EXACTMB2N1, "exactmb2-n1", ARG_SPECIAL },
5479: { OP_EXACTMB2N2, "exactmb2-n2", ARG_SPECIAL },
5480: { OP_EXACTMB2N3, "exactmb2-n3", ARG_SPECIAL },
5481: { OP_EXACTMB2N, "exactmb2-n", ARG_SPECIAL },
5482: { OP_EXACTMB3N, "exactmb3n" , ARG_SPECIAL },
5483: { OP_EXACTMBN, "exactmbn", ARG_SPECIAL },
5484: { OP_EXACT1_IC, "exact1-ic", ARG_SPECIAL },
5485: { OP_EXACTN_IC, "exactn-ic", ARG_SPECIAL },
5486: { OP_CCLASS, "cclass", ARG_SPECIAL },
5487: { OP_CCLASS_MB, "cclass-mb", ARG_SPECIAL },
5488: { OP_CCLASS_MIX, "cclass-mix", ARG_SPECIAL },
5489: { OP_CCLASS_NOT, "cclass-not", ARG_SPECIAL },
5490: { OP_CCLASS_MB_NOT, "cclass-mb-not", ARG_SPECIAL },
5491: { OP_CCLASS_MIX_NOT, "cclass-mix-not", ARG_SPECIAL },
5492: { OP_CCLASS_NODE, "cclass-node", ARG_SPECIAL },
5493: { OP_ANYCHAR, "anychar", ARG_NON },
5494: { OP_ANYCHAR_ML, "anychar-ml", ARG_NON },
5495: { OP_ANYCHAR_STAR, "anychar*", ARG_NON },
5496: { OP_ANYCHAR_ML_STAR, "anychar-ml*", ARG_NON },
5497: { OP_ANYCHAR_STAR_PEEK_NEXT, "anychar*-peek-next", ARG_SPECIAL },
5498: { OP_ANYCHAR_ML_STAR_PEEK_NEXT, "anychar-ml*-peek-next", ARG_SPECIAL },
5499: { OP_WORD, "word", ARG_NON },
5500: { OP_NOT_WORD, "not-word", ARG_NON },
5501: { OP_WORD_BOUND, "word-bound", ARG_NON },
5502: { OP_NOT_WORD_BOUND, "not-word-bound", ARG_NON },
5503: { OP_WORD_BEGIN, "word-begin", ARG_NON },
5504: { OP_WORD_END, "word-end", ARG_NON },
5505: { OP_BEGIN_BUF, "begin-buf", ARG_NON },
5506: { OP_END_BUF, "end-buf", ARG_NON },
5507: { OP_BEGIN_LINE, "begin-line", ARG_NON },
5508: { OP_END_LINE, "end-line", ARG_NON },
5509: { OP_SEMI_END_BUF, "semi-end-buf", ARG_NON },
5510: { OP_BEGIN_POSITION, "begin-position", ARG_NON },
5511: { OP_BACKREF1, "backref1", ARG_NON },
5512: { OP_BACKREF2, "backref2", ARG_NON },
5513: { OP_BACKREFN, "backrefn", ARG_MEMNUM },
5514: { OP_BACKREFN_IC, "backrefn-ic", ARG_SPECIAL },
5515: { OP_BACKREF_MULTI, "backref_multi", ARG_SPECIAL },
5516: { OP_BACKREF_MULTI_IC, "backref_multi-ic", ARG_SPECIAL },
5517: { OP_BACKREF_AT_LEVEL, "backref_at_level", ARG_SPECIAL },
5518: { OP_MEMORY_START_PUSH, "mem-start-push", ARG_MEMNUM },
5519: { OP_MEMORY_START, "mem-start", ARG_MEMNUM },
5520: { OP_MEMORY_END_PUSH, "mem-end-push", ARG_MEMNUM },
5521: { OP_MEMORY_END_PUSH_REC, "mem-end-push-rec", ARG_MEMNUM },
5522: { OP_MEMORY_END, "mem-end", ARG_MEMNUM },
5523: { OP_MEMORY_END_REC, "mem-end-rec", ARG_MEMNUM },
5524: { OP_SET_OPTION_PUSH, "set-option-push", ARG_OPTION },
5525: { OP_SET_OPTION, "set-option", ARG_OPTION },
5526: { OP_FAIL, "fail", ARG_NON },
5527: { OP_JUMP, "jump", ARG_RELADDR },
5528: { OP_PUSH, "push", ARG_RELADDR },
5529: { OP_POP, "pop", ARG_NON },
5530: { OP_PUSH_OR_JUMP_EXACT1, "push-or-jump-e1", ARG_SPECIAL },
5531: { OP_PUSH_IF_PEEK_NEXT, "push-if-peek-next", ARG_SPECIAL },
5532: { OP_REPEAT, "repeat", ARG_SPECIAL },
5533: { OP_REPEAT_NG, "repeat-ng", ARG_SPECIAL },
5534: { OP_REPEAT_INC, "repeat-inc", ARG_MEMNUM },
5535: { OP_REPEAT_INC_NG, "repeat-inc-ng", ARG_MEMNUM },
5536: { OP_REPEAT_INC_SG, "repeat-inc-sg", ARG_MEMNUM },
5537: { OP_REPEAT_INC_NG_SG, "repeat-inc-ng-sg", ARG_MEMNUM },
5538: { OP_NULL_CHECK_START, "null-check-start", ARG_MEMNUM },
5539: { OP_NULL_CHECK_END, "null-check-end", ARG_MEMNUM },
5540: { OP_NULL_CHECK_END_MEMST,"null-check-end-memst", ARG_MEMNUM },
5541: { OP_NULL_CHECK_END_MEMST_PUSH,"null-check-end-memst-push", ARG_MEMNUM },
5542: { OP_PUSH_POS, "push-pos", ARG_NON },
5543: { OP_POP_POS, "pop-pos", ARG_NON },
5544: { OP_PUSH_POS_NOT, "push-pos-not", ARG_RELADDR },
5545: { OP_FAIL_POS, "fail-pos", ARG_NON },
5546: { OP_PUSH_STOP_BT, "push-stop-bt", ARG_NON },
5547: { OP_POP_STOP_BT, "pop-stop-bt", ARG_NON },
5548: { OP_LOOK_BEHIND, "look-behind", ARG_SPECIAL },
5549: { OP_PUSH_LOOK_BEHIND_NOT, "push-look-behind-not", ARG_SPECIAL },
5550: { OP_FAIL_LOOK_BEHIND_NOT, "fail-look-behind-not", ARG_NON },
5551: { OP_CALL, "call", ARG_ABSADDR },
5552: { OP_RETURN, "return", ARG_NON },
5553: { OP_STATE_CHECK_PUSH, "state-check-push", ARG_SPECIAL },
5554: { OP_STATE_CHECK_PUSH_OR_JUMP, "state-check-push-or-jump", ARG_SPECIAL },
5555: { OP_STATE_CHECK, "state-check", ARG_STATE_CHECK },
5556: { OP_STATE_CHECK_ANYCHAR_STAR, "state-check-anychar*", ARG_STATE_CHECK },
5557: { OP_STATE_CHECK_ANYCHAR_ML_STAR,
5558: "state-check-anychar-ml*", ARG_STATE_CHECK },
5559: { -1, "", ARG_NON }
5560: };
5561:
5562: static char*
5563: op2name(int opcode)
5564: {
5565: int i;
5566:
5567: for (i = 0; OnigOpInfo[i].opcode >= 0; i++) {
5568: if (opcode == OnigOpInfo[i].opcode)
5569: return OnigOpInfo[i].name;
5570: }
5571: return "";
5572: }
5573:
5574: static int
5575: op2arg_type(int opcode)
5576: {
5577: int i;
5578:
5579: for (i = 0; OnigOpInfo[i].opcode >= 0; i++) {
5580: if (opcode == OnigOpInfo[i].opcode)
5581: return OnigOpInfo[i].arg_type;
5582: }
5583: return ARG_SPECIAL;
5584: }
5585:
5586: static void
5587: Indent(FILE* f, int indent)
5588: {
5589: int i;
5590: for (i = 0; i < indent; i++) putc(' ', f);
5591: }
5592:
5593: static void
5594: p_string(FILE* f, int len, UChar* s)
5595: {
5596: fputs(":", f);
5597: while (len-- > 0) { fputc(*s++, f); }
5598: }
5599:
5600: static void
5601: p_len_string(FILE* f, LengthType len, int mb_len, UChar* s)
5602: {
5603: int x = len * mb_len;
5604:
5605: fprintf(f, ":%d:", len);
5606: while (x-- > 0) { fputc(*s++, f); }
5607: }
5608:
5609: extern void
5610: onig_print_compiled_byte_code(FILE* f, UChar* bp, UChar** nextp,
5611: OnigEncoding enc)
5612: {
5613: int i, n, arg_type;
5614: RelAddrType addr;
5615: LengthType len;
5616: MemNumType mem;
5617: StateCheckNumType scn;
5618: OnigCodePoint code;
5619: UChar *q;
5620:
5621: fprintf(f, "[%s", op2name(*bp));
5622: arg_type = op2arg_type(*bp);
5623: if (arg_type != ARG_SPECIAL) {
5624: bp++;
5625: switch (arg_type) {
5626: case ARG_NON:
5627: break;
5628: case ARG_RELADDR:
5629: GET_RELADDR_INC(addr, bp);
5630: fprintf(f, ":(%d)", addr);
5631: break;
5632: case ARG_ABSADDR:
5633: GET_ABSADDR_INC(addr, bp);
5634: fprintf(f, ":(%d)", addr);
5635: break;
5636: case ARG_LENGTH:
5637: GET_LENGTH_INC(len, bp);
5638: fprintf(f, ":%d", len);
5639: break;
5640: case ARG_MEMNUM:
5641: mem = *((MemNumType* )bp);
5642: bp += SIZE_MEMNUM;
5643: fprintf(f, ":%d", mem);
5644: break;
5645: case ARG_OPTION:
5646: {
5647: OnigOptionType option = *((OnigOptionType* )bp);
5648: bp += SIZE_OPTION;
5649: fprintf(f, ":%d", option);
5650: }
5651: break;
5652:
5653: case ARG_STATE_CHECK:
5654: scn = *((StateCheckNumType* )bp);
5655: bp += SIZE_STATE_CHECK_NUM;
5656: fprintf(f, ":%d", scn);
5657: break;
5658: }
5659: }
5660: else {
5661: switch (*bp++) {
5662: case OP_EXACT1:
5663: case OP_ANYCHAR_STAR_PEEK_NEXT:
5664: case OP_ANYCHAR_ML_STAR_PEEK_NEXT:
5665: p_string(f, 1, bp++); break;
5666: case OP_EXACT2:
5667: p_string(f, 2, bp); bp += 2; break;
5668: case OP_EXACT3:
5669: p_string(f, 3, bp); bp += 3; break;
5670: case OP_EXACT4:
5671: p_string(f, 4, bp); bp += 4; break;
5672: case OP_EXACT5:
5673: p_string(f, 5, bp); bp += 5; break;
5674: case OP_EXACTN:
5675: GET_LENGTH_INC(len, bp);
5676: p_len_string(f, len, 1, bp);
5677: bp += len;
5678: break;
5679:
5680: case OP_EXACTMB2N1:
5681: p_string(f, 2, bp); bp += 2; break;
5682: case OP_EXACTMB2N2:
5683: p_string(f, 4, bp); bp += 4; break;
5684: case OP_EXACTMB2N3:
5685: p_string(f, 6, bp); bp += 6; break;
5686: case OP_EXACTMB2N:
5687: GET_LENGTH_INC(len, bp);
5688: p_len_string(f, len, 2, bp);
5689: bp += len * 2;
5690: break;
5691: case OP_EXACTMB3N:
5692: GET_LENGTH_INC(len, bp);
5693: p_len_string(f, len, 3, bp);
5694: bp += len * 3;
5695: break;
5696: case OP_EXACTMBN:
5697: {
5698: int mb_len;
5699:
5700: GET_LENGTH_INC(mb_len, bp);
5701: GET_LENGTH_INC(len, bp);
5702: fprintf(f, ":%d:%d:", mb_len, len);
5703: n = len * mb_len;
5704: while (n-- > 0) { fputc(*bp++, f); }
5705: }
5706: break;
5707:
5708: case OP_EXACT1_IC:
5709: len = enc_len(enc, bp);
5710: p_string(f, len, bp);
5711: bp += len;
5712: break;
5713: case OP_EXACTN_IC:
5714: GET_LENGTH_INC(len, bp);
5715: p_len_string(f, len, 1, bp);
5716: bp += len;
5717: break;
5718:
5719: case OP_CCLASS:
5720: n = bitset_on_num((BitSetRef )bp);
5721: bp += SIZE_BITSET;
5722: fprintf(f, ":%d", n);
5723: break;
5724:
5725: case OP_CCLASS_NOT:
5726: n = bitset_on_num((BitSetRef )bp);
5727: bp += SIZE_BITSET;
5728: fprintf(f, ":%d", n);
5729: break;
5730:
5731: case OP_CCLASS_MB:
5732: case OP_CCLASS_MB_NOT:
5733: GET_LENGTH_INC(len, bp);
5734: q = bp;
5735: #ifndef PLATFORM_UNALIGNED_WORD_ACCESS
5736: ALIGNMENT_RIGHT(q);
5737: #endif
5738: GET_CODE_POINT(code, q);
5739: bp += len;
5740: fprintf(f, ":%d:%d", (int )code, len);
5741: break;
5742:
5743: case OP_CCLASS_MIX:
5744: case OP_CCLASS_MIX_NOT:
5745: n = bitset_on_num((BitSetRef )bp);
5746: bp += SIZE_BITSET;
5747: GET_LENGTH_INC(len, bp);
5748: q = bp;
5749: #ifndef PLATFORM_UNALIGNED_WORD_ACCESS
5750: ALIGNMENT_RIGHT(q);
5751: #endif
5752: GET_CODE_POINT(code, q);
5753: bp += len;
5754: fprintf(f, ":%d:%d:%d", n, (int )code, len);
5755: break;
5756:
5757: case OP_CCLASS_NODE:
5758: {
5759: CClassNode *cc;
5760:
5761: GET_POINTER_INC(cc, bp);
5762: n = bitset_on_num(cc->bs);
5763: fprintf(f, ":%u:%d", (unsigned int )cc, n);
5764: }
5765: break;
5766:
5767: case OP_BACKREFN_IC:
5768: mem = *((MemNumType* )bp);
5769: bp += SIZE_MEMNUM;
5770: fprintf(f, ":%d", mem);
5771: break;
5772:
5773: case OP_BACKREF_MULTI_IC:
5774: case OP_BACKREF_MULTI:
5775: fputs(" ", f);
5776: GET_LENGTH_INC(len, bp);
5777: for (i = 0; i < len; i++) {
5778: GET_MEMNUM_INC(mem, bp);
5779: if (i > 0) fputs(", ", f);
5780: fprintf(f, "%d", mem);
5781: }
5782: break;
5783:
5784: case OP_BACKREF_AT_LEVEL:
5785: {
5786: OnigOptionType option;
5787: LengthType level;
5788:
5789: GET_OPTION_INC(option, bp);
5790: fprintf(f, ":%d", option);
5791: GET_LENGTH_INC(level, bp);
5792: fprintf(f, ":%d", level);
5793:
5794: fputs(" ", f);
5795: GET_LENGTH_INC(len, bp);
5796: for (i = 0; i < len; i++) {
5797: GET_MEMNUM_INC(mem, bp);
5798: if (i > 0) fputs(", ", f);
5799: fprintf(f, "%d", mem);
5800: }
5801: }
5802: break;
5803:
5804: case OP_REPEAT:
5805: case OP_REPEAT_NG:
5806: {
5807: mem = *((MemNumType* )bp);
5808: bp += SIZE_MEMNUM;
5809: addr = *((RelAddrType* )bp);
5810: bp += SIZE_RELADDR;
5811: fprintf(f, ":%d:%d", mem, addr);
5812: }
5813: break;
5814:
5815: case OP_PUSH_OR_JUMP_EXACT1:
5816: case OP_PUSH_IF_PEEK_NEXT:
5817: addr = *((RelAddrType* )bp);
5818: bp += SIZE_RELADDR;
5819: fprintf(f, ":(%d)", addr);
5820: p_string(f, 1, bp);
5821: bp += 1;
5822: break;
5823:
5824: case OP_LOOK_BEHIND:
5825: GET_LENGTH_INC(len, bp);
5826: fprintf(f, ":%d", len);
5827: break;
5828:
5829: case OP_PUSH_LOOK_BEHIND_NOT:
5830: GET_RELADDR_INC(addr, bp);
5831: GET_LENGTH_INC(len, bp);
5832: fprintf(f, ":%d:(%d)", len, addr);
5833: break;
5834:
5835: case OP_STATE_CHECK_PUSH:
5836: case OP_STATE_CHECK_PUSH_OR_JUMP:
5837: scn = *((StateCheckNumType* )bp);
5838: bp += SIZE_STATE_CHECK_NUM;
5839: addr = *((RelAddrType* )bp);
5840: bp += SIZE_RELADDR;
5841: fprintf(f, ":%d:(%d)", scn, addr);
5842: break;
5843:
5844: default:
5845: fprintf(stderr, "onig_print_compiled_byte_code: undefined code %d\n",
5846: *--bp);
5847: }
5848: }
5849: fputs("]", f);
5850: if (nextp) *nextp = bp;
5851: }
5852:
5853: static void
5854: print_compiled_byte_code_list(FILE* f, regex_t* reg)
5855: {
5856: int ncode;
5857: UChar* bp = reg->p;
5858: UChar* end = reg->p + reg->used;
5859:
5860: fprintf(f, "code length: %d\n", reg->used);
5861:
5862: ncode = 0;
5863: while (bp < end) {
5864: ncode++;
5865: if (bp > reg->p) {
5866: if (ncode % 5 == 0)
5867: fprintf(f, "\n");
5868: else
5869: fputs(" ", f);
5870: }
5871: onig_print_compiled_byte_code(f, bp, &bp, reg->enc);
5872: }
5873:
5874: fprintf(f, "\n");
5875: }
5876:
5877: static void
5878: print_indent_tree(FILE* f, Node* node, int indent)
5879: {
5880: int i, type;
5881: int add = 3;
5882: UChar* p;
5883:
5884: Indent(f, indent);
5885: if (IS_NULL(node)) {
5886: fprintf(f, "ERROR: null node!!!\n");
5887: exit (0);
5888: }
5889:
5890: type = NTYPE(node);
5891: switch (type) {
5892: case N_LIST:
5893: case N_ALT:
5894: if (NTYPE(node) == N_LIST)
5895: fprintf(f, "<list:%x>\n", (int )node);
5896: else
5897: fprintf(f, "<alt:%x>\n", (int )node);
5898:
5899: print_indent_tree(f, NCONS(node).left, indent + add);
5900: while (IS_NOT_NULL(node = NCONS(node).right)) {
5901: if (NTYPE(node) != type) {
5902: fprintf(f, "ERROR: list/alt right is not a cons. %d\n", NTYPE(node));
5903: exit(0);
5904: }
5905: print_indent_tree(f, NCONS(node).left, indent + add);
5906: }
5907: break;
5908:
5909: case N_STRING:
5910: fprintf(f, "<string%s:%x>",
5911: (NSTRING_IS_RAW(node) ? "-raw" : ""), (int )node);
5912: for (p = NSTRING(node).s; p < NSTRING(node).end; p++) {
5913: if (*p >= 0x20 && *p < 0x7f)
5914: fputc(*p, f);
5915: else {
5916: fprintf(f, " 0x%02x", *p);
5917: }
5918: }
5919: break;
5920:
5921: case N_CCLASS:
5922: fprintf(f, "<cclass:%x>", (int )node);
5923: if (IS_CCLASS_NOT(&NCCLASS(node))) fputs(" not", f);
5924: if (NCCLASS(node).mbuf) {
5925: BBuf* bbuf = NCCLASS(node).mbuf;
5926: for (i = 0; i < bbuf->used; i++) {
5927: if (i > 0) fprintf(f, ",");
5928: fprintf(f, "%0x", bbuf->p[i]);
5929: }
5930: }
5931: break;
5932:
5933: case N_CTYPE:
5934: fprintf(f, "<ctype:%x> ", (int )node);
5935: switch (NCTYPE(node).type) {
5936: case CTYPE_WORD: fputs("word", f); break;
5937: case CTYPE_NOT_WORD: fputs("not word", f); break;
5938: default:
5939: fprintf(f, "ERROR: undefined ctype.\n");
5940: exit(0);
5941: }
5942: break;
5943:
5944: case N_ANYCHAR:
5945: fprintf(f, "<anychar:%x>", (int )node);
5946: break;
5947:
5948: case N_ANCHOR:
5949: fprintf(f, "<anchor:%x> ", (int )node);
5950: switch (NANCHOR(node).type) {
5951: case ANCHOR_BEGIN_BUF: fputs("begin buf", f); break;
5952: case ANCHOR_END_BUF: fputs("end buf", f); break;
5953: case ANCHOR_BEGIN_LINE: fputs("begin line", f); break;
5954: case ANCHOR_END_LINE: fputs("end line", f); break;
5955: case ANCHOR_SEMI_END_BUF: fputs("semi end buf", f); break;
5956: case ANCHOR_BEGIN_POSITION: fputs("begin position", f); break;
5957:
5958: case ANCHOR_WORD_BOUND: fputs("word bound", f); break;
5959: case ANCHOR_NOT_WORD_BOUND: fputs("not word bound", f); break;
5960: #ifdef USE_WORD_BEGIN_END
5961: case ANCHOR_WORD_BEGIN: fputs("word begin", f); break;
5962: case ANCHOR_WORD_END: fputs("word end", f); break;
5963: #endif
5964: case ANCHOR_PREC_READ: fputs("prec read", f); break;
5965: case ANCHOR_PREC_READ_NOT: fputs("prec read not", f); break;
5966: case ANCHOR_LOOK_BEHIND: fputs("look_behind", f); break;
5967: case ANCHOR_LOOK_BEHIND_NOT: fputs("look_behind_not",f); break;
5968:
5969: default:
5970: fprintf(f, "ERROR: undefined anchor type.\n");
5971: break;
5972: }
5973: break;
5974:
5975: case N_BACKREF:
5976: {
5977: int* p;
5978: BackrefNode* br = &(NBACKREF(node));
5979: p = BACKREFS_P(br);
5980: fprintf(f, "<backref:%x>", (int )node);
5981: for (i = 0; i < br->back_num; i++) {
5982: if (i > 0) fputs(", ", f);
5983: fprintf(f, "%d", p[i]);
5984: }
5985: }
5986: break;
5987:
5988: #ifdef USE_SUBEXP_CALL
5989: case N_CALL:
5990: {
5991: CallNode* cn = &(NCALL(node));
5992: fprintf(f, "<call:%x>", (int )node);
5993: p_string(f, cn->name_end - cn->name, cn->name);
5994: }
5995: break;
5996: #endif
5997:
5998: case N_QUANTIFIER:
5999: fprintf(f, "<quantifier:%x>{%d,%d}%s\n", (int )node,
6000: NQUANTIFIER(node).lower, NQUANTIFIER(node).upper,
6001: (NQUANTIFIER(node).greedy ? "" : "?"));
6002: print_indent_tree(f, NQUANTIFIER(node).target, indent + add);
6003: break;
6004:
6005: case N_EFFECT:
6006: fprintf(f, "<effect:%x> ", (int )node);
6007: switch (NEFFECT(node).type) {
6008: case EFFECT_OPTION:
6009: fprintf(f, "option:%d\n", NEFFECT(node).option);
6010: print_indent_tree(f, NEFFECT(node).target, indent + add);
6011: break;
6012: case EFFECT_MEMORY:
6013: fprintf(f, "memory:%d", NEFFECT(node).regnum);
6014: break;
6015: case EFFECT_STOP_BACKTRACK:
6016: fprintf(f, "stop-bt");
6017: break;
6018:
6019: default:
6020: break;
6021: }
6022: fprintf(f, "\n");
6023: print_indent_tree(f, NEFFECT(node).target, indent + add);
6024: break;
6025:
6026: default:
6027: fprintf(f, "print_indent_tree: undefined node type %d\n", NTYPE(node));
6028: break;
6029: }
6030:
6031: if (type != N_LIST && type != N_ALT && type != N_QUANTIFIER &&
6032: type != N_EFFECT)
6033: fprintf(f, "\n");
6034: fflush(f);
6035: }
6036: #endif /* ONIG_DEBUG */
6037:
6038: #ifdef ONIG_DEBUG_PARSE_TREE
6039: static void
6040: print_tree(FILE* f, Node* node)
6041: {
6042: print_indent_tree(f, node, 0);
6043: }
6044: #endif
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>