Annotation of embedaddon/php/ext/mbstring/oniguruma/regparse.c, revision 1.1.1.1
1.1 misho 1: /**********************************************************************
2: regparse.c - Oniguruma (regular expression library)
3: **********************************************************************/
4: /*-
5: * Copyright (c) 2002-2007 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
6: * All rights reserved.
7: *
8: * Redistribution and use in source and binary forms, with or without
9: * modification, are permitted provided that the following conditions
10: * are met:
11: * 1. Redistributions of source code must retain the above copyright
12: * notice, this list of conditions and the following disclaimer.
13: * 2. Redistributions in binary form must reproduce the above copyright
14: * notice, this list of conditions and the following disclaimer in the
15: * documentation and/or other materials provided with the distribution.
16: *
17: * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18: * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19: * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20: * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21: * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22: * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23: * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24: * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25: * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26: * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27: * SUCH DAMAGE.
28: */
29:
30: #include "regparse.h"
31:
32: #define WARN_BUFSIZE 256
33:
/*
 * Syntax description for the Ruby flavour.
 *
 * The initializer supplies four values, in order:
 *   1. an operator mask: SYN_GNU_REGEX_OP plus additional ONIG_SYN_OP_*
 *      bits, with ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END masked out;
 *   2. a second operator mask made of ONIG_SYN_OP2_* bits;
 *   3. a behaviour mask: SYN_GNU_REGEX_BV plus ONIG_SYN_* bits;
 *   4. ONIG_OPTION_NONE.
 */
OnigSyntaxType OnigSyntaxRuby = {
  (( SYN_GNU_REGEX_OP | ONIG_SYN_OP_QMARK_NON_GREEDY |
     ONIG_SYN_OP_ESC_OCTAL3 | ONIG_SYN_OP_ESC_X_HEX2 |
     ONIG_SYN_OP_ESC_X_BRACE_HEX8 | ONIG_SYN_OP_ESC_CONTROL_CHARS |
     ONIG_SYN_OP_ESC_C_CONTROL )
   & ~ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END )
  , ( ONIG_SYN_OP2_QMARK_GROUP_EFFECT |
      ONIG_SYN_OP2_OPTION_RUBY |
      ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP | ONIG_SYN_OP2_ESC_K_NAMED_BACKREF |
      ONIG_SYN_OP2_ESC_G_SUBEXP_CALL |
      ONIG_SYN_OP2_PLUS_POSSESSIVE_REPEAT  |
      ONIG_SYN_OP2_CCLASS_SET_OP | ONIG_SYN_OP2_ESC_CAPITAL_C_BAR_CONTROL |
      ONIG_SYN_OP2_ESC_CAPITAL_M_BAR_META | ONIG_SYN_OP2_ESC_V_VTAB |
      ONIG_SYN_OP2_ESC_H_XDIGIT )
  , ( SYN_GNU_REGEX_BV |
      ONIG_SYN_ALLOW_INTERVAL_LOW_ABBREV |
      ONIG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND |
      ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP |
      ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME |
      ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY |
      ONIG_SYN_WARN_CC_OP_NOT_ESCAPED |
      ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT )
  , ONIG_OPTION_NONE
};

/* Initialised to ONIG_SYNTAX_RUBY (defined outside this file; presumably
   the address of OnigSyntaxRuby -- TODO confirm against the header). */
OnigSyntaxType*  OnigDefaultSyntax = ONIG_SYNTAX_RUBY;
60:
/* Warning callback that ignores its message.  It is the initial value of
   onig_warn / onig_verb_warn when no default function is configured. */
extern void
onig_null_warn(const char* s)
{
  (void )s;
}
62:
63: #ifdef RUBY_PLATFORM
/* Forward a warning message to rb_warn(), passing it as a "%s" argument
   so that the message is never interpreted as a format string. */
extern void
onig_rb_warn(const char* s)
{
  const char* message = s;

  rb_warn("%s", message);
}
69:
/* Forward a warning message to rb_warning(), passing it as a "%s"
   argument so that the message is never interpreted as a format string. */
extern void
onig_rb_warning(const char* s)
{
  const char* message = s;

  rb_warning("%s", message);
}
75: #endif
76:
/* Current warning callback.  Starts as DEFAULT_WARN_FUNCTION when that
   macro is defined at build time, otherwise as the no-op onig_null_warn.
   Replaced through onig_set_warn_func(). */
#ifdef DEFAULT_WARN_FUNCTION
static OnigWarnFunc onig_warn = (OnigWarnFunc )DEFAULT_WARN_FUNCTION;
#else
static OnigWarnFunc onig_warn = onig_null_warn;
#endif

/* Current "verbose" warning callback.  Starts as
   DEFAULT_VERB_WARN_FUNCTION when defined, otherwise as onig_null_warn.
   Replaced through onig_set_verb_warn_func(). */
#ifdef DEFAULT_VERB_WARN_FUNCTION
static OnigWarnFunc onig_verb_warn = (OnigWarnFunc )DEFAULT_VERB_WARN_FUNCTION;
#else
static OnigWarnFunc onig_verb_warn = onig_null_warn;
#endif
88:
89: extern void onig_set_warn_func(OnigWarnFunc f)
90: {
91: onig_warn = f;
92: }
93:
94: extern void onig_set_verb_warn_func(OnigWarnFunc f)
95: {
96: onig_verb_warn = f;
97: }
98:
99: static void
100: bbuf_free(BBuf* bbuf)
101: {
102: if (IS_NOT_NULL(bbuf)) {
103: if (IS_NOT_NULL(bbuf->p)) xfree(bbuf->p);
104: xfree(bbuf);
105: }
106: }
107:
108: static int
109: bbuf_clone(BBuf** rto, BBuf* from)
110: {
111: int r;
112: BBuf *to;
113:
114: *rto = to = (BBuf* )xmalloc(sizeof(BBuf));
115: CHECK_NULL_RETURN_VAL(to, ONIGERR_MEMORY);
116: r = BBUF_INIT(to, from->alloc);
117: if (r != 0) return r;
118: to->used = from->used;
119: xmemcpy(to->p, from->p, from->used);
120: return 0;
121: }
122:
/* Clear the bits f in v when `negative` is non-zero, otherwise set them.
   The whole expansion is parenthesized so the macro can be used safely
   inside a larger expression; its value is the updated v. */
#define ONOFF(v,f,negative)    ((negative) ? ((v) &= ~(f)) : ((v) |= (f)))
124:
/* First code point of the "multi-byte" range for enc: 0 when the
   encoding's minimum character length is greater than one byte,
   otherwise 0x80. */
#define MBCODE_START_POS(enc) \
  (OnigCodePoint )(ONIGENC_MBC_MINLEN(enc) > 1 ? 0 : 0x80)

/* Add the range [MBCODE_START_POS(enc), ~0] to pbuf via
   add_code_range_to_buf(); evaluates to that call's result. */
#define SET_ALL_MULTI_BYTE_RANGE(enc, pbuf) \
  add_code_range_to_buf(pbuf, MBCODE_START_POS(enc), ~((OnigCodePoint )0))

/* Statement macro: for encodings that are not single-byte, add the whole
   multi-byte range to mbuf.  Requires an `int r` in the enclosing scope
   and RETURNS from the enclosing function with r when the add fails. */
#define ADD_ALL_MULTI_BYTE_RANGE(enc, mbuf) do {\
  if (! ONIGENC_IS_SINGLEBYTE(enc)) {\
    r = SET_ALL_MULTI_BYTE_RANGE(enc, &(mbuf));\
    if (r) return r;\
  }\
} while (0)
137:
138:
/* Statement macro: store 1 in `empty` when every word of the bit set bs
   is zero, 0 otherwise.  Declares its own loop counter `i`, which shadows
   any `i` of the caller inside the macro body. */
#define BITSET_IS_EMPTY(bs,empty) do {\
  int i;\
  empty = 1;\
  for (i = 0; i < BITSET_SIZE; i++) {\
    if ((bs)[i] != 0) {\
      empty = 0; break;\
    }\
  }\
} while (0)
148:
149: static void
150: bitset_set_range(BitSetRef bs, int from, int to)
151: {
152: int i;
153: for (i = from; i <= to && i < SINGLE_BYTE_SIZE; i++) {
154: BITSET_SET_BIT(bs, i);
155: }
156: }
157:
/* Compiled out with #if 0: would set every bit of bs by filling each
   word with all-ones. */
#if 0
static void
bitset_set_all(BitSetRef bs)
{
  int i;
  for (i = 0; i < BITSET_SIZE; i++) {
    bs[i] = ~((Bits )0);
  }
}
#endif
168:
169: static void
170: bitset_invert(BitSetRef bs)
171: {
172: int i;
173: for (i = 0; i < BITSET_SIZE; i++) {
174: bs[i] = ~(bs[i]);
175: }
176: }
177:
178: static void
179: bitset_invert_to(BitSetRef from, BitSetRef to)
180: {
181: int i;
182: for (i = 0; i < BITSET_SIZE; i++) {
183: to[i] = ~(from[i]);
184: }
185: }
186:
187: static void
188: bitset_and(BitSetRef dest, BitSetRef bs)
189: {
190: int i;
191: for (i = 0; i < BITSET_SIZE; i++) {
192: dest[i] &= bs[i];
193: }
194: }
195:
196: static void
197: bitset_or(BitSetRef dest, BitSetRef bs)
198: {
199: int i;
200: for (i = 0; i < BITSET_SIZE; i++) {
201: dest[i] |= bs[i];
202: }
203: }
204:
205: static void
206: bitset_copy(BitSetRef dest, BitSetRef bs)
207: {
208: int i;
209: for (i = 0; i < BITSET_SIZE; i++) {
210: dest[i] = bs[i];
211: }
212: }
213:
214: extern int
215: onig_strncmp(const UChar* s1, const UChar* s2, int n)
216: {
217: int x;
218:
219: while (n-- > 0) {
220: x = *s2++ - *s1++;
221: if (x) return x;
222: }
223: return 0;
224: }
225:
226: static void
227: k_strcpy(UChar* dest, const UChar* src, const UChar* end)
228: {
229: int len = end - src;
230: if (len > 0) {
231: xmemcpy(dest, src, len);
232: dest[len] = (UChar )0;
233: }
234: }
235:
236: static UChar*
237: strdup_with_null(OnigEncoding enc, UChar* s, UChar* end)
238: {
239: int slen, term_len, i;
240: UChar *r;
241:
242: slen = end - s;
243: term_len = ONIGENC_MBC_MINLEN(enc);
244:
245: r = (UChar* )xmalloc(slen + term_len);
246: CHECK_NULL_RETURN(r);
247: xmemcpy(r, s, slen);
248:
249: for (i = 0; i < term_len; i++)
250: r[slen + i] = (UChar )0;
251:
252: return r;
253: }
254:
255:
256: /* scan pattern methods */
/* Value produced by PPEEK when the cursor has reached the end. */
#define PEND_VALUE   0

/* The macros below expect locals named `p` (cursor), `end` and `enc` in
   the using function, plus the `pfetch_prev` declared by PFETCH_READY. */

/* Declares the local that remembers the position before the last fetch. */
#define PFETCH_READY  UChar* pfetch_prev
/* 1 when the cursor is at or past `end`, 0 otherwise. */
#define PEND         (p < end ?  0 : 1)
/* Step the cursor back to where it was before the last PINC/PFETCH. */
#define PUNFETCH     p = pfetch_prev
/* Skip one character: remember the position, then advance by its
   encoded length. */
#define PINC       do { \
  pfetch_prev = p; \
  p += ONIGENC_MBC_ENC_LEN(enc, p); \
} while (0)
/* Read the code point at the cursor into c, remember the position, then
   advance past the character. */
#define PFETCH(c)  do { \
  c = ONIGENC_MBC_TO_CODE(enc, p, end); \
  pfetch_prev = p; \
  p += ONIGENC_MBC_ENC_LEN(enc, p); \
} while (0)

/* Code point at the cursor without advancing; PEND_VALUE at the end. */
#define PPEEK        (p < end ? ONIGENC_MBC_TO_CODE(enc, p, end) : PEND_VALUE)
/* Non-zero when the code point at the cursor equals c. */
#define PPEEK_IS(c)  (PPEEK == (OnigCodePoint )c)
274:
275: static UChar*
276: k_strcat_capa(UChar* dest, UChar* dest_end, const UChar* src, const UChar* src_end,
277: int capa)
278: {
279: UChar* r;
280:
281: if (dest)
282: r = (UChar* )xrealloc(dest, capa + 1);
283: else
284: r = (UChar* )xmalloc(capa + 1);
285:
286: CHECK_NULL_RETURN(r);
287: k_strcpy(r + (dest_end - dest), src, src_end);
288: return r;
289: }
290:
291: /* dest on static area */
292: static UChar*
293: strcat_capa_from_static(UChar* dest, UChar* dest_end,
294: const UChar* src, const UChar* src_end, int capa)
295: {
296: UChar* r;
297:
298: r = (UChar* )xmalloc(capa + 1);
299: CHECK_NULL_RETURN(r);
300: k_strcpy(r, dest, dest_end);
301: k_strcpy(r + (dest_end - dest), src, src_end);
302: return r;
303: }
304:
305: #ifdef USE_NAMED_GROUP
306:
/* Initial element count of NameEntry.back_refs (see name_add). */
#define INIT_NAME_BACKREFS_ALLOC_NUM   8

/* One named group: its name and the group number(s) defined under it. */
typedef struct {
  UChar* name;        /* heap copy of the name, made by strdup_with_null */
  int    name_len;   /* byte length */
  int    back_num;   /* number of backrefs */
  int    back_alloc; /* element capacity of back_refs */
  int    back_ref1;  /* the group number while back_num == 1 */
  int*   back_refs;  /* group numbers once back_num >= 2, else NULL */
} NameEntry;
317:
318: #ifdef USE_ST_HASH_TABLE
319:
320: #include "st.h"
321:
/* Hash key describing the byte range [s, end); no terminator is needed. */
typedef struct {
  unsigned char* s;    /* first byte of the key */
  unsigned char* end;  /* one past the last byte of the key */
} st_strend_key;

static int strend_cmp(st_strend_key*, st_strend_key*);
static int strend_hash(st_strend_key*);

/* Comparison and hash callbacks used for tables keyed by st_strend_key. */
static struct st_hash_type type_strend_hash = {
  strend_cmp,
  strend_hash,
};
334:
335: static st_table*
336: onig_st_init_strend_table_with_size(int size)
337: {
338: return onig_st_init_table_with_size(&type_strend_hash, size);
339: }
340:
341: static int
342: onig_st_lookup_strend(st_table *table, const UChar* str_key, const UChar* end_key, st_data_t *value)
343: {
344: st_strend_key key;
345:
346: key.s = (unsigned char* )str_key;
347: key.end = (unsigned char* )end_key;
348:
349: return onig_st_lookup(table, (st_data_t )(&key), value);
350: }
351:
352: static int
353: onig_st_insert_strend(st_table *table, const UChar* str_key, const UChar* end_key, st_data_t value)
354: {
355: st_strend_key* key;
356: int result;
357:
358: key = (st_strend_key* )xmalloc(sizeof(st_strend_key));
359: key->s = (unsigned char* )str_key;
360: key->end = (unsigned char* )end_key;
361: result = onig_st_insert(table, (st_data_t )key, value);
362: if (result) {
363: xfree(key);
364: }
365: return result;
366: }
367:
368: static int
369: strend_cmp(st_strend_key* x, st_strend_key* y)
370: {
371: unsigned char *p, *q;
372: int c;
373:
374: if ((x->end - x->s) != (y->end - y->s))
375: return 1;
376:
377: p = x->s;
378: q = y->s;
379: while (p < x->end) {
380: c = (int )*p - (int )*q;
381: if (c != 0) return c;
382:
383: p++; q++;
384: }
385:
386: return 0;
387: }
388:
389: static int
390: strend_hash(st_strend_key* x)
391: {
392: int val;
393: unsigned char *p;
394:
395: val = 0;
396: p = x->s;
397: while (p < x->end) {
398: val = val * 997 + (int )*p++;
399: }
400:
401: return val + (val >> 5);
402: }
403:
/* With USE_ST_HASH_TABLE the name table is simply an st hash table. */
typedef st_table  NameTable;
typedef st_data_t HashDataType;   /* 1.6 st.h doesn't define st_data_t type */

/* NOTE(review): not referenced in the part of the file visible here;
   NAMEBUF_SIZE_1 is NAMEBUF_SIZE + 1. */
#define NAMEBUF_SIZE    24
#define NAMEBUF_SIZE_1  25
409:
410: #ifdef ONIG_DEBUG
411: static int
412: i_print_name_entry(UChar* key, NameEntry* e, void* arg)
413: {
414: int i;
415: FILE* fp = (FILE* )arg;
416:
417: fprintf(fp, "%s: ", e->name);
418: if (e->back_num == 0)
419: fputs("-", fp);
420: else if (e->back_num == 1)
421: fprintf(fp, "%d", e->back_ref1);
422: else {
423: for (i = 0; i < e->back_num; i++) {
424: if (i > 0) fprintf(fp, ", ");
425: fprintf(fp, "%d", e->back_refs[i]);
426: }
427: }
428: fputs("\n", fp);
429: return ST_CONTINUE;
430: }
431:
432: extern int
433: onig_print_names(FILE* fp, regex_t* reg)
434: {
435: NameTable* t = (NameTable* )reg->name_table;
436:
437: if (IS_NOT_NULL(t)) {
438: fprintf(fp, "name table\n");
439: onig_st_foreach(t, i_print_name_entry, (HashDataType )fp);
440: fputs("\n", fp);
441: }
442: return 0;
443: }
444: #endif
445:
446: static int
447: i_free_name_entry(UChar* key, NameEntry* e, void* arg)
448: {
449: xfree(e->name);
450: if (IS_NOT_NULL(e->back_refs)) xfree(e->back_refs);
451: xfree(key);
452: xfree(e);
453: return ST_DELETE;
454: }
455:
456: static int
457: names_clear(regex_t* reg)
458: {
459: NameTable* t = (NameTable* )reg->name_table;
460:
461: if (IS_NOT_NULL(t)) {
462: onig_st_foreach(t, i_free_name_entry, 0);
463: }
464: return 0;
465: }
466:
467: extern int
468: onig_names_free(regex_t* reg)
469: {
470: int r;
471: NameTable* t;
472:
473: r = names_clear(reg);
474: if (r) return r;
475:
476: t = (NameTable* )reg->name_table;
477: if (IS_NOT_NULL(t)) onig_st_free_table(t);
478: reg->name_table = (void* )NULL;
479: return 0;
480: }
481:
482: static NameEntry*
483: name_find(regex_t* reg, const UChar* name, const UChar* name_end)
484: {
485: NameEntry* e;
486: NameTable* t = (NameTable* )reg->name_table;
487:
488: e = (NameEntry* )NULL;
489: if (IS_NOT_NULL(t)) {
490: onig_st_lookup_strend(t, name, name_end, (HashDataType* )((void* )(&e)));
491: }
492: return e;
493: }
494:
/* Context handed to i_names() while onig_foreach_name() walks the table. */
typedef struct {
  /* user callback: (name, name_end, group count, group numbers, reg, arg) */
  int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*);
  regex_t* reg;      /* regex being enumerated, forwarded to func */
  void* arg;         /* opaque user argument, forwarded to func */
  int ret;           /* first non-zero value returned by func, else 0 */
  OnigEncoding enc;  /* set by onig_foreach_name; only referenced from a
                        commented-out line in i_names */
} INamesArg;
502:
503: static int
504: i_names(UChar* key, NameEntry* e, INamesArg* arg)
505: {
506: int r = (*(arg->func))(e->name,
507: /*e->name + onigenc_str_bytelen_null(arg->enc, e->name), */
508: e->name + e->name_len,
509: e->back_num,
510: (e->back_num > 1 ? e->back_refs : &(e->back_ref1)),
511: arg->reg, arg->arg);
512: if (r != 0) {
513: arg->ret = r;
514: return ST_STOP;
515: }
516: return ST_CONTINUE;
517: }
518:
519: extern int
520: onig_foreach_name(regex_t* reg,
521: int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*),
522: void* arg)
523: {
524: INamesArg narg;
525: NameTable* t = (NameTable* )reg->name_table;
526:
527: narg.ret = 0;
528: if (IS_NOT_NULL(t)) {
529: narg.func = func;
530: narg.reg = reg;
531: narg.arg = arg;
532: narg.enc = reg->enc; /* should be pattern encoding. */
533: onig_st_foreach(t, i_names, (HashDataType )&narg);
534: }
535: return narg.ret;
536: }
537:
538: static int
539: i_renumber_name(UChar* key, NameEntry* e, GroupNumRemap* map)
540: {
541: int i;
542:
543: if (e->back_num > 1) {
544: for (i = 0; i < e->back_num; i++) {
545: e->back_refs[i] = map[e->back_refs[i]].new_val;
546: }
547: }
548: else if (e->back_num == 1) {
549: e->back_ref1 = map[e->back_ref1].new_val;
550: }
551:
552: return ST_CONTINUE;
553: }
554:
555: extern int
556: onig_renumber_name_table(regex_t* reg, GroupNumRemap* map)
557: {
558: NameTable* t = (NameTable* )reg->name_table;
559:
560: if (IS_NOT_NULL(t)) {
561: onig_st_foreach(t, i_renumber_name, (HashDataType )map);
562: }
563: return 0;
564: }
565:
566:
567: extern int
568: onig_number_of_names(regex_t* reg)
569: {
570: NameTable* t = (NameTable* )reg->name_table;
571:
572: if (IS_NOT_NULL(t))
573: return t->num_entries;
574: else
575: return 0;
576: }
577:
578: #else /* USE_ST_HASH_TABLE */
579:
/* Initial number of slots allocated for NameTable.e (see name_add). */
#define INIT_NAMES_ALLOC_NUM    8

/* Name table used without the st hash table: a growable array that is
   searched linearly by name_find. */
typedef struct {
  NameEntry* e;      /* slot array */
  int        num;    /* slots in use */
  int        alloc;  /* slots allocated */
} NameTable;
587:
588:
589: #ifdef ONIG_DEBUG
590: extern int
591: onig_print_names(FILE* fp, regex_t* reg)
592: {
593: int i, j;
594: NameEntry* e;
595: NameTable* t = (NameTable* )reg->name_table;
596:
597: if (IS_NOT_NULL(t) && t->num > 0) {
598: fprintf(fp, "name table\n");
599: for (i = 0; i < t->num; i++) {
600: e = &(t->e[i]);
601: fprintf(fp, "%s: ", e->name);
602: if (e->back_num == 0) {
603: fputs("-", fp);
604: }
605: else if (e->back_num == 1) {
606: fprintf(fp, "%d", e->back_ref1);
607: }
608: else {
609: for (j = 0; j < e->back_num; j++) {
610: if (j > 0) fprintf(fp, ", ");
611: fprintf(fp, "%d", e->back_refs[j]);
612: }
613: }
614: fputs("\n", fp);
615: }
616: fputs("\n", fp);
617: }
618: return 0;
619: }
620: #endif
621:
622: static int
623: names_clear(regex_t* reg)
624: {
625: int i;
626: NameEntry* e;
627: NameTable* t = (NameTable* )reg->name_table;
628:
629: if (IS_NOT_NULL(t)) {
630: for (i = 0; i < t->num; i++) {
631: e = &(t->e[i]);
632: if (IS_NOT_NULL(e->name)) {
633: xfree(e->name);
634: e->name = NULL;
635: e->name_len = 0;
636: e->back_num = 0;
637: e->back_alloc = 0;
638: if (IS_NOT_NULL(e->back_refs)) xfree(e->back_refs);
639: e->back_refs = (int* )NULL;
640: }
641: }
642: if (IS_NOT_NULL(t->e)) {
643: xfree(t->e);
644: t->e = NULL;
645: }
646: t->num = 0;
647: }
648: return 0;
649: }
650:
651: extern int
652: onig_names_free(regex_t* reg)
653: {
654: int r;
655: NameTable* t;
656:
657: r = names_clear(reg);
658: if (r) return r;
659:
660: t = (NameTable* )reg->name_table;
661: if (IS_NOT_NULL(t)) xfree(t);
662: reg->name_table = NULL;
663: return 0;
664: }
665:
666: static NameEntry*
667: name_find(regex_t* reg, UChar* name, UChar* name_end)
668: {
669: int i, len;
670: NameEntry* e;
671: NameTable* t = (NameTable* )reg->name_table;
672:
673: if (IS_NOT_NULL(t)) {
674: len = name_end - name;
675: for (i = 0; i < t->num; i++) {
676: e = &(t->e[i]);
677: if (len == e->name_len && onig_strncmp(name, e->name, len) == 0)
678: return e;
679: }
680: }
681: return (NameEntry* )NULL;
682: }
683:
684: extern int
685: onig_foreach_name(regex_t* reg,
686: int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*),
687: void* arg)
688: {
689: int i, r;
690: NameEntry* e;
691: NameTable* t = (NameTable* )reg->name_table;
692:
693: if (IS_NOT_NULL(t)) {
694: for (i = 0; i < t->num; i++) {
695: e = &(t->e[i]);
696: r = (*func)(e->name, e->name + e->name_len, e->back_num,
697: (e->back_num > 1 ? e->back_refs : &(e->back_ref1)),
698: reg, arg);
699: if (r != 0) return r;
700: }
701: }
702: return 0;
703: }
704:
705: extern int
706: onig_number_of_names(regex_t* reg)
707: {
708: NameTable* t = (NameTable* )reg->name_table;
709:
710: if (IS_NOT_NULL(t))
711: return t->num;
712: else
713: return 0;
714: }
715:
716: #endif /* else USE_ST_HASH_TABLE */
717:
718: static int
719: name_add(regex_t* reg, UChar* name, UChar* name_end, int backref, ScanEnv* env)
720: {
721: int alloc;
722: NameEntry* e;
723: NameTable* t = (NameTable* )reg->name_table;
724:
725: if (name_end - name <= 0)
726: return ONIGERR_EMPTY_GROUP_NAME;
727:
728: e = name_find(reg, name, name_end);
729: if (IS_NULL(e)) {
730: #ifdef USE_ST_HASH_TABLE
731: if (IS_NULL(t)) {
732: t = onig_st_init_strend_table_with_size(5);
733: reg->name_table = (void* )t;
734: }
735: e = (NameEntry* )xmalloc(sizeof(NameEntry));
736: CHECK_NULL_RETURN_VAL(e, ONIGERR_MEMORY);
737:
738: e->name = strdup_with_null(reg->enc, name, name_end);
739: if (IS_NULL(e->name)) return ONIGERR_MEMORY;
740: onig_st_insert_strend(t, e->name, (e->name + (name_end - name)),
741: (HashDataType )e);
742:
743: e->name_len = name_end - name;
744: e->back_num = 0;
745: e->back_alloc = 0;
746: e->back_refs = (int* )NULL;
747:
748: #else
749:
750: if (IS_NULL(t)) {
751: alloc = INIT_NAMES_ALLOC_NUM;
752: t = (NameTable* )xmalloc(sizeof(NameTable));
753: CHECK_NULL_RETURN_VAL(t, ONIGERR_MEMORY);
754: t->e = NULL;
755: t->alloc = 0;
756: t->num = 0;
757:
758: t->e = (NameEntry* )xmalloc(sizeof(NameEntry) * alloc);
759: if (IS_NULL(t->e)) {
760: xfree(t);
761: return ONIGERR_MEMORY;
762: }
763: t->alloc = alloc;
764: reg->name_table = t;
765: goto clear;
766: }
767: else if (t->num == t->alloc) {
768: int i;
769:
770: alloc = t->alloc * 2;
771: t->e = (NameEntry* )xrealloc(t->e, sizeof(NameEntry) * alloc);
772: CHECK_NULL_RETURN_VAL(t->e, ONIGERR_MEMORY);
773: t->alloc = alloc;
774:
775: clear:
776: for (i = t->num; i < t->alloc; i++) {
777: t->e[i].name = NULL;
778: t->e[i].name_len = 0;
779: t->e[i].back_num = 0;
780: t->e[i].back_alloc = 0;
781: t->e[i].back_refs = (int* )NULL;
782: }
783: }
784: e = &(t->e[t->num]);
785: t->num++;
786: e->name = strdup_with_null(reg->enc, name, name_end);
787: e->name_len = name_end - name;
788: #endif
789: }
790:
791: if (e->back_num >= 1 &&
792: ! IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME)) {
793: onig_scan_env_set_error_string(env, ONIGERR_MULTIPLEX_DEFINED_NAME,
794: name, name_end);
795: return ONIGERR_MULTIPLEX_DEFINED_NAME;
796: }
797:
798: e->back_num++;
799: if (e->back_num == 1) {
800: e->back_ref1 = backref;
801: }
802: else {
803: if (e->back_num == 2) {
804: alloc = INIT_NAME_BACKREFS_ALLOC_NUM;
805: e->back_refs = (int* )xmalloc(sizeof(int) * alloc);
806: CHECK_NULL_RETURN_VAL(e->back_refs, ONIGERR_MEMORY);
807: e->back_alloc = alloc;
808: e->back_refs[0] = e->back_ref1;
809: e->back_refs[1] = backref;
810: }
811: else {
812: if (e->back_num > e->back_alloc) {
813: alloc = e->back_alloc * 2;
814: e->back_refs = (int* )xrealloc(e->back_refs, sizeof(int) * alloc);
815: CHECK_NULL_RETURN_VAL(e->back_refs, ONIGERR_MEMORY);
816: e->back_alloc = alloc;
817: }
818: e->back_refs[e->back_num - 1] = backref;
819: }
820: }
821:
822: return 0;
823: }
824:
825: extern int
826: onig_name_to_group_numbers(regex_t* reg, const UChar* name,
827: const UChar* name_end, int** nums)
828: {
829: NameEntry* e;
830:
831: e = name_find(reg, name, name_end);
832: if (IS_NULL(e)) return ONIGERR_UNDEFINED_NAME_REFERENCE;
833:
834: switch (e->back_num) {
835: case 0:
836: break;
837: case 1:
838: *nums = &(e->back_ref1);
839: break;
840: default:
841: *nums = e->back_refs;
842: break;
843: }
844: return e->back_num;
845: }
846:
847: extern int
848: onig_name_to_backref_number(regex_t* reg, const UChar* name,
849: const UChar* name_end, OnigRegion *region)
850: {
851: int i, n, *nums;
852:
853: n = onig_name_to_group_numbers(reg, name, name_end, &nums);
854: if (n < 0)
855: return n;
856: else if (n == 0)
857: return ONIGERR_PARSER_BUG;
858: else if (n == 1)
859: return nums[0];
860: else {
861: if (IS_NOT_NULL(region)) {
862: for (i = n - 1; i >= 0; i--) {
863: if (region->beg[nums[i]] != ONIG_REGION_NOTPOS)
864: return nums[i];
865: }
866: }
867: return nums[n - 1];
868: }
869: }
870:
871: #else /* USE_NAMED_GROUP */
872:
873: extern int
874: onig_name_to_group_numbers(regex_t* reg, const UChar* name,
875: const UChar* name_end, int** nums)
876: {
877: return ONIG_NO_SUPPORT_CONFIG;
878: }
879:
880: extern int
881: onig_name_to_backref_number(regex_t* reg, const UChar* name,
882: const UChar* name_end, OnigRegion* region)
883: {
884: return ONIG_NO_SUPPORT_CONFIG;
885: }
886:
887: extern int
888: onig_foreach_name(regex_t* reg,
889: int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*),
890: void* arg)
891: {
892: return ONIG_NO_SUPPORT_CONFIG;
893: }
894:
895: extern int
896: onig_number_of_names(regex_t* reg)
897: {
898: return 0;
899: }
900: #endif /* else USE_NAMED_GROUP */
901:
902: extern int
903: onig_noname_group_capture_is_active(regex_t* reg)
904: {
905: if (ONIG_IS_OPTION_ON(reg->options, ONIG_OPTION_DONT_CAPTURE_GROUP))
906: return 0;
907:
908: #ifdef USE_NAMED_GROUP
909: if (onig_number_of_names(reg) > 0 &&
910: IS_SYNTAX_BV(reg->syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) &&
911: !ONIG_IS_OPTION_ON(reg->options, ONIG_OPTION_CAPTURE_GROUP)) {
912: return 0;
913: }
914: #endif
915:
916: return 1;
917: }
918:
919:
920: #define INIT_SCANENV_MEMNODES_ALLOC_SIZE 16
921:
922: static void
923: scan_env_clear(ScanEnv* env)
924: {
925: int i;
926:
927: BIT_STATUS_CLEAR(env->capture_history);
928: BIT_STATUS_CLEAR(env->bt_mem_start);
929: BIT_STATUS_CLEAR(env->bt_mem_end);
930: BIT_STATUS_CLEAR(env->backrefed_mem);
931: env->error = (UChar* )NULL;
932: env->error_end = (UChar* )NULL;
933: env->num_call = 0;
934: env->num_mem = 0;
935: #ifdef USE_NAMED_GROUP
936: env->num_named = 0;
937: #endif
938: env->mem_alloc = 0;
939: env->mem_nodes_dynamic = (Node** )NULL;
940:
941: for (i = 0; i < SCANENV_MEMNODES_SIZE; i++)
942: env->mem_nodes_static[i] = NULL_NODE;
943:
944: #ifdef USE_COMBINATION_EXPLOSION_CHECK
945: env->num_comb_exp_check = 0;
946: env->comb_exp_max_regnum = 0;
947: env->curr_max_regnum = 0;
948: env->has_recursion = 0;
949: #endif
950: }
951:
952: static int
953: scan_env_add_mem_entry(ScanEnv* env)
954: {
955: int i, need, alloc;
956: Node** p;
957:
958: need = env->num_mem + 1;
959: if (need >= SCANENV_MEMNODES_SIZE) {
960: if (env->mem_alloc <= need) {
961: if (IS_NULL(env->mem_nodes_dynamic)) {
962: alloc = INIT_SCANENV_MEMNODES_ALLOC_SIZE;
963: p = (Node** )xmalloc(sizeof(Node*) * alloc);
964: xmemcpy(p, env->mem_nodes_static,
965: sizeof(Node*) * SCANENV_MEMNODES_SIZE);
966: }
967: else {
968: alloc = env->mem_alloc * 2;
969: p = (Node** )xrealloc(env->mem_nodes_dynamic, sizeof(Node*) * alloc);
970: }
971: CHECK_NULL_RETURN_VAL(p, ONIGERR_MEMORY);
972:
973: for (i = env->num_mem + 1; i < alloc; i++)
974: p[i] = NULL_NODE;
975:
976: env->mem_nodes_dynamic = p;
977: env->mem_alloc = alloc;
978: }
979: }
980:
981: env->num_mem++;
982: return env->num_mem;
983: }
984:
985: static int
986: scan_env_set_mem_node(ScanEnv* env, int num, Node* node)
987: {
988: if (env->num_mem >= num)
989: SCANENV_MEM_NODES(env)[num] = node;
990: else
991: return ONIGERR_PARSER_BUG;
992: return 0;
993: }
994:
995:
996: #ifdef USE_RECYCLE_NODE
/* Link overlaid on a released Node so that it can be chained into the
   recycle list instead of being returned to the allocator. */
typedef struct _FreeNode {
  struct _FreeNode* next;  /* next recycled node, or NULL */
} FreeNode;

/* Head of the list of recycled nodes; node_new() takes nodes from here
   and onig_node_free() puts them back. */
static FreeNode* FreeNodeList = (FreeNode* )NULL;
1002: #endif
1003:
1004: extern void
1005: onig_node_free(Node* node)
1006: {
1007: start:
1008: if (IS_NULL(node)) return ;
1009:
1010: switch (NTYPE(node)) {
1011: case N_STRING:
1012: if (IS_NOT_NULL(NSTRING(node).s) && NSTRING(node).s != NSTRING(node).buf) {
1013: xfree(NSTRING(node).s);
1014: }
1015: break;
1016:
1017: case N_LIST:
1018: case N_ALT:
1019: onig_node_free(NCONS(node).left);
1020: /* onig_node_free(NCONS(node).right); */
1021: {
1022: Node* next_node = NCONS(node).right;
1023:
1024: #ifdef USE_RECYCLE_NODE
1025: {
1026: FreeNode* n = (FreeNode* )node;
1027:
1028: THREAD_ATOMIC_START;
1029: n->next = FreeNodeList;
1030: FreeNodeList = n;
1031: THREAD_ATOMIC_END;
1032: }
1033: #else
1034: xfree(node);
1035: #endif
1036:
1037: node = next_node;
1038: goto start;
1039: }
1040: break;
1041:
1042: case N_CCLASS:
1043: {
1044: CClassNode* cc = &(NCCLASS(node));
1045:
1046: if (IS_CCLASS_SHARE(cc))
1047: return ;
1048:
1049: if (cc->mbuf)
1050: bbuf_free(cc->mbuf);
1051: }
1052: break;
1053:
1054: case N_QUANTIFIER:
1055: if (NQUANTIFIER(node).target)
1056: onig_node_free(NQUANTIFIER(node).target);
1057: break;
1058:
1059: case N_EFFECT:
1060: if (NEFFECT(node).target)
1061: onig_node_free(NEFFECT(node).target);
1062: break;
1063:
1064: case N_BACKREF:
1065: if (IS_NOT_NULL(NBACKREF(node).back_dynamic))
1066: xfree(NBACKREF(node).back_dynamic);
1067: break;
1068:
1069: case N_ANCHOR:
1070: if (NANCHOR(node).target)
1071: onig_node_free(NANCHOR(node).target);
1072: break;
1073: }
1074:
1075: #ifdef USE_RECYCLE_NODE
1076: {
1077: FreeNode* n = (FreeNode* )node;
1078:
1079: THREAD_ATOMIC_START;
1080: n->next = FreeNodeList;
1081: FreeNodeList = n;
1082: THREAD_ATOMIC_END;
1083: }
1084: #else
1085: xfree(node);
1086: #endif
1087: }
1088:
1089: #ifdef USE_RECYCLE_NODE
1090: extern int
1091: onig_free_node_list(void)
1092: {
1093: FreeNode* n;
1094:
1095: /* THREAD_ATOMIC_START; */
1096: while (IS_NOT_NULL(FreeNodeList)) {
1097: n = FreeNodeList;
1098: FreeNodeList = FreeNodeList->next;
1099: xfree(n);
1100: }
1101: /* THREAD_ATOMIC_END; */
1102: return 0;
1103: }
1104: #endif
1105:
1106: static Node*
1107: node_new(void)
1108: {
1109: Node* node;
1110:
1111: #ifdef USE_RECYCLE_NODE
1112: THREAD_ATOMIC_START;
1113: if (IS_NOT_NULL(FreeNodeList)) {
1114: node = (Node* )FreeNodeList;
1115: FreeNodeList = FreeNodeList->next;
1116: THREAD_ATOMIC_END;
1117: return node;
1118: }
1119: THREAD_ATOMIC_END;
1120: #endif
1121:
1122: node = (Node* )xmalloc(sizeof(Node));
1123: return node;
1124: }
1125:
1126:
1127: static void
1128: initialize_cclass(CClassNode* cc)
1129: {
1130: BITSET_CLEAR(cc->bs);
1131: cc->flags = 0;
1132: cc->mbuf = NULL;
1133: }
1134:
1135: static Node*
1136: node_new_cclass(void)
1137: {
1138: Node* node = node_new();
1139: CHECK_NULL_RETURN(node);
1140: node->type = N_CCLASS;
1141:
1142: initialize_cclass(&(NCCLASS(node)));
1143: return node;
1144: }
1145:
1146: static Node*
1147: node_new_cclass_by_codepoint_range(int not,
1148: const OnigCodePoint sbr[], const OnigCodePoint mbr[])
1149: {
1150: CClassNode* cc;
1151: int n, i, j;
1152:
1153: Node* node = node_new();
1154: CHECK_NULL_RETURN(node);
1155: node->type = N_CCLASS;
1156:
1157: cc = &(NCCLASS(node));
1158: cc->flags = 0;
1159: if (not != 0) CCLASS_SET_NOT(cc);
1160:
1161: BITSET_CLEAR(cc->bs);
1162: if (IS_NOT_NULL(sbr)) {
1163: n = ONIGENC_CODE_RANGE_NUM(sbr);
1164: for (i = 0; i < n; i++) {
1165: for (j = ONIGENC_CODE_RANGE_FROM(sbr, i);
1166: j <= (int )ONIGENC_CODE_RANGE_TO(sbr, i); j++) {
1167: BITSET_SET_BIT(cc->bs, j);
1168: }
1169: }
1170: }
1171:
1172: if (IS_NULL(mbr)) {
1173: is_null:
1174: cc->mbuf = NULL;
1175: }
1176: else {
1177: BBuf* bbuf;
1178:
1179: n = ONIGENC_CODE_RANGE_NUM(mbr);
1180: if (n == 0) goto is_null;
1181:
1182: bbuf = (BBuf* )xmalloc(sizeof(BBuf));
1183: CHECK_NULL_RETURN_VAL(bbuf, NULL);
1184: bbuf->alloc = n + 1;
1185: bbuf->used = n + 1;
1186: bbuf->p = (UChar* )((void* )mbr);
1187:
1188: cc->mbuf = bbuf;
1189: }
1190:
1191: return node;
1192: }
1193:
1194: static Node*
1195: node_new_ctype(int type)
1196: {
1197: Node* node = node_new();
1198: CHECK_NULL_RETURN(node);
1199: node->type = N_CTYPE;
1200: NCTYPE(node).type = type;
1201: return node;
1202: }
1203:
1204: static Node*
1205: node_new_anychar(void)
1206: {
1207: Node* node = node_new();
1208: CHECK_NULL_RETURN(node);
1209: node->type = N_ANYCHAR;
1210: return node;
1211: }
1212:
1213: static Node*
1214: node_new_list(Node* left, Node* right)
1215: {
1216: Node* node = node_new();
1217: CHECK_NULL_RETURN(node);
1218: node->type = N_LIST;
1219: NCONS(node).left = left;
1220: NCONS(node).right = right;
1221: return node;
1222: }
1223:
1224: extern Node*
1225: onig_node_new_list(Node* left, Node* right)
1226: {
1227: return node_new_list(left, right);
1228: }
1229:
1230: static Node*
1231: node_new_alt(Node* left, Node* right)
1232: {
1233: Node* node = node_new();
1234: CHECK_NULL_RETURN(node);
1235: node->type = N_ALT;
1236: NCONS(node).left = left;
1237: NCONS(node).right = right;
1238: return node;
1239: }
1240:
1241: extern Node*
1242: onig_node_new_anchor(int type)
1243: {
1244: Node* node = node_new();
1245: CHECK_NULL_RETURN(node);
1246: node->type = N_ANCHOR;
1247: NANCHOR(node).type = type;
1248: NANCHOR(node).target = NULL;
1249: NANCHOR(node).char_len = -1;
1250: return node;
1251: }
1252:
1253: static Node*
1254: node_new_backref(int back_num, int* backrefs, int by_name,
1255: #ifdef USE_BACKREF_AT_LEVEL
1256: int exist_level, int nest_level,
1257: #endif
1258: ScanEnv* env)
1259: {
1260: int i;
1261: Node* node = node_new();
1262:
1263: CHECK_NULL_RETURN(node);
1264: node->type = N_BACKREF;
1265: NBACKREF(node).state = 0;
1266: NBACKREF(node).back_num = back_num;
1267: NBACKREF(node).back_dynamic = (int* )NULL;
1268: if (by_name != 0)
1269: NBACKREF(node).state |= NST_NAME_REF;
1270:
1271: #ifdef USE_BACKREF_AT_LEVEL
1272: if (exist_level != 0) {
1273: NBACKREF(node).state |= NST_NEST_LEVEL;
1274: NBACKREF(node).nest_level = nest_level;
1275: }
1276: #endif
1277:
1278: for (i = 0; i < back_num; i++) {
1279: if (backrefs[i] <= env->num_mem &&
1280: IS_NULL(SCANENV_MEM_NODES(env)[backrefs[i]])) {
1281: NBACKREF(node).state |= NST_RECURSION; /* /...(\1).../ */
1282: break;
1283: }
1284: }
1285:
1286: if (back_num <= NODE_BACKREFS_SIZE) {
1287: for (i = 0; i < back_num; i++)
1288: NBACKREF(node).back_static[i] = backrefs[i];
1289: }
1290: else {
1291: int* p = (int* )xmalloc(sizeof(int) * back_num);
1292: if (IS_NULL(p)) {
1293: onig_node_free(node);
1294: return NULL;
1295: }
1296: NBACKREF(node).back_dynamic = p;
1297: for (i = 0; i < back_num; i++)
1298: p[i] = backrefs[i];
1299: }
1300: return node;
1301: }
1302:
1303: #ifdef USE_SUBEXP_CALL
1304: static Node*
1305: node_new_call(UChar* name, UChar* name_end)
1306: {
1307: Node* node = node_new();
1308: CHECK_NULL_RETURN(node);
1309:
1310: node->type = N_CALL;
1311: NCALL(node).state = 0;
1312: NCALL(node).ref_num = CALLNODE_REFNUM_UNDEF;
1313: NCALL(node).target = NULL_NODE;
1314: NCALL(node).name = name;
1315: NCALL(node).name_end = name_end;
1316: return node;
1317: }
1318: #endif
1319:
1320: static Node*
1321: node_new_quantifier(int lower, int upper, int by_number)
1322: {
1323: Node* node = node_new();
1324: CHECK_NULL_RETURN(node);
1325: node->type = N_QUANTIFIER;
1326: NQUANTIFIER(node).state = 0;
1327: NQUANTIFIER(node).target = NULL;
1328: NQUANTIFIER(node).lower = lower;
1329: NQUANTIFIER(node).upper = upper;
1330: NQUANTIFIER(node).greedy = 1;
1331: NQUANTIFIER(node).target_empty_info = NQ_TARGET_ISNOT_EMPTY;
1332: NQUANTIFIER(node).head_exact = NULL_NODE;
1333: NQUANTIFIER(node).next_head_exact = NULL_NODE;
1334: NQUANTIFIER(node).is_refered = 0;
1335: if (by_number != 0)
1336: NQUANTIFIER(node).state |= NST_BY_NUMBER;
1337:
1338: #ifdef USE_COMBINATION_EXPLOSION_CHECK
1339: NQUANTIFIER(node).comb_exp_check_num = 0;
1340: #endif
1341:
1342: return node;
1343: }
1344:
1345: static Node*
1346: node_new_effect(int type)
1347: {
1348: Node* node = node_new();
1349: CHECK_NULL_RETURN(node);
1350: node->type = N_EFFECT;
1351: NEFFECT(node).type = type;
1352: NEFFECT(node).state = 0;
1353: NEFFECT(node).regnum = 0;
1354: NEFFECT(node).option = 0;
1355: NEFFECT(node).target = NULL;
1356: NEFFECT(node).call_addr = -1;
1357: NEFFECT(node).opt_count = 0;
1358: return node;
1359: }
1360:
1361: extern Node*
1362: onig_node_new_effect(int type)
1363: {
1364: return node_new_effect(type);
1365: }
1366:
1367: static Node*
1368: node_new_effect_memory(OnigOptionType option, int is_named)
1369: {
1370: Node* node = node_new_effect(EFFECT_MEMORY);
1371: CHECK_NULL_RETURN(node);
1372: if (is_named != 0)
1373: SET_EFFECT_STATUS(node, NST_NAMED_GROUP);
1374:
1375: #ifdef USE_SUBEXP_CALL
1376: NEFFECT(node).option = option;
1377: #endif
1378: return node;
1379: }
1380:
1381: static Node*
1382: node_new_option(OnigOptionType option)
1383: {
1384: Node* node = node_new_effect(EFFECT_OPTION);
1385: CHECK_NULL_RETURN(node);
1386: NEFFECT(node).option = option;
1387: return node;
1388: }
1389:
1390: extern int
1391: onig_node_str_cat(Node* node, const UChar* s, const UChar* end)
1392: {
1393: int addlen = end - s;
1394:
1395: if (addlen > 0) {
1396: int len = NSTRING(node).end - NSTRING(node).s;
1397:
1398: if (NSTRING(node).capa > 0 || (len + addlen > NODE_STR_BUF_SIZE - 1)) {
1399: UChar* p;
1400: int capa = len + addlen + NODE_STR_MARGIN;
1401:
1402: if (capa <= NSTRING(node).capa) {
1403: k_strcpy(NSTRING(node).s + len, s, end);
1404: }
1405: else {
1406: if (NSTRING(node).s == NSTRING(node).buf)
1407: p = strcat_capa_from_static(NSTRING(node).s, NSTRING(node).end,
1408: s, end, capa);
1409: else
1410: p = k_strcat_capa(NSTRING(node).s, NSTRING(node).end, s, end, capa);
1411:
1412: CHECK_NULL_RETURN_VAL(p, ONIGERR_MEMORY);
1413: NSTRING(node).s = p;
1414: NSTRING(node).capa = capa;
1415: }
1416: }
1417: else {
1418: k_strcpy(NSTRING(node).s + len, s, end);
1419: }
1420: NSTRING(node).end = NSTRING(node).s + len + addlen;
1421: }
1422:
1423: return 0;
1424: }
1425:
1426: static int
1427: node_str_cat_char(Node* node, UChar c)
1428: {
1429: UChar s[1];
1430:
1431: s[0] = c;
1432: return onig_node_str_cat(node, s, s + 1);
1433: }
1434:
1435: extern void
1436: onig_node_conv_to_str_node(Node* node, int flag)
1437: {
1438: node->type = N_STRING;
1439:
1440: NSTRING(node).flag = flag;
1441: NSTRING(node).capa = 0;
1442: NSTRING(node).s = NSTRING(node).buf;
1443: NSTRING(node).end = NSTRING(node).buf;
1444: }
1445:
1446: extern void
1447: onig_node_str_clear(Node* node)
1448: {
1449: if (NSTRING(node).capa != 0 &&
1450: IS_NOT_NULL(NSTRING(node).s) && NSTRING(node).s != NSTRING(node).buf) {
1451: xfree(NSTRING(node).s);
1452: }
1453:
1454: NSTRING(node).capa = 0;
1455: NSTRING(node).flag = 0;
1456: NSTRING(node).s = NSTRING(node).buf;
1457: NSTRING(node).end = NSTRING(node).buf;
1458: }
1459:
1460: static Node*
1461: node_new_str(const UChar* s, const UChar* end)
1462: {
1463: Node* node = node_new();
1464: CHECK_NULL_RETURN(node);
1465:
1466: node->type = N_STRING;
1467: NSTRING(node).capa = 0;
1468: NSTRING(node).flag = 0;
1469: NSTRING(node).s = NSTRING(node).buf;
1470: NSTRING(node).end = NSTRING(node).buf;
1471: if (onig_node_str_cat(node, s, end)) {
1472: onig_node_free(node);
1473: return NULL;
1474: }
1475: return node;
1476: }
1477:
1478: extern Node*
1479: onig_node_new_str(const UChar* s, const UChar* end)
1480: {
1481: return node_new_str(s, end);
1482: }
1483:
1484: #ifdef NUMBERED_CHAR_IS_NOT_CASE_AMBIG
1485: static Node*
1486: node_new_str_raw(UChar* s, UChar* end)
1487: {
1488: Node* node = node_new_str(s, end);
1489: NSTRING_SET_RAW(node);
1490: return node;
1491: }
1492: #endif
1493:
1494: static Node*
1495: node_new_empty(void)
1496: {
1497: return node_new_str(NULL, NULL);
1498: }
1499:
1500: static Node*
1501: node_new_str_char(UChar c)
1502: {
1503: UChar p[1];
1504:
1505: p[0] = c;
1506: return node_new_str(p, p + 1);
1507: }
1508:
1509: static Node*
1510: str_node_split_last_char(StrNode* sn, OnigEncoding enc)
1511: {
1512: const UChar *p;
1513: Node* n = NULL_NODE;
1514:
1515: if (sn->end > sn->s) {
1516: p = onigenc_get_prev_char_head(enc, sn->s, sn->end);
1517: if (p && p > sn->s) { /* can be splitted. */
1518: n = node_new_str(p, sn->end);
1519: if ((sn->flag & NSTR_RAW) != 0)
1520: NSTRING_SET_RAW(n);
1521: sn->end = (UChar* )p;
1522: }
1523: }
1524: return n;
1525: }
1526:
1527: static int
1528: str_node_can_be_split(StrNode* sn, OnigEncoding enc)
1529: {
1530: if (sn->end > sn->s) {
1531: return ((enc_len(enc, sn->s) < sn->end - sn->s) ? 1 : 0);
1532: }
1533: return 0;
1534: }
1535:
1536: #ifdef USE_PAD_TO_SHORT_BYTE_CHAR
1537: static int
1538: node_str_head_pad(StrNode* sn, int num, UChar val)
1539: {
1540: UChar buf[NODE_STR_BUF_SIZE];
1541: int i, len;
1542:
1543: len = sn->end - sn->s;
1544: onig_strcpy(buf, sn->s, sn->end);
1545: onig_strcpy(&(sn->s[num]), buf, buf + len);
1546: sn->end += num;
1547:
1548: for (i = 0; i < num; i++) {
1549: sn->s[i] = val;
1550: }
1551: }
1552: #endif
1553:
1554: extern int
1555: onig_scan_unsigned_number(UChar** src, const UChar* end, OnigEncoding enc)
1556: {
1557: unsigned int num, val;
1558: OnigCodePoint c;
1559: UChar* p = *src;
1560: PFETCH_READY;
1561:
1562: num = 0;
1563: while (!PEND) {
1564: PFETCH(c);
1565: if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
1566: val = (unsigned int )DIGITVAL(c);
1567: if ((INT_MAX_LIMIT - val) / 10UL < num)
1568: return -1; /* overflow */
1569:
1570: num = num * 10 + val;
1571: }
1572: else {
1573: PUNFETCH;
1574: break;
1575: }
1576: }
1577: *src = p;
1578: return num;
1579: }
1580:
1581: static int
1582: scan_unsigned_hexadecimal_number(UChar** src, UChar* end, int maxlen,
1583: OnigEncoding enc)
1584: {
1585: OnigCodePoint c;
1586: unsigned int num, val;
1587: UChar* p = *src;
1588: PFETCH_READY;
1589:
1590: num = 0;
1591: while (!PEND && maxlen-- != 0) {
1592: PFETCH(c);
1593: if (ONIGENC_IS_CODE_XDIGIT(enc, c)) {
1594: val = (unsigned int )XDIGITVAL(enc,c);
1595: if ((INT_MAX_LIMIT - val) / 16UL < num)
1596: return -1; /* overflow */
1597:
1598: num = (num << 4) + XDIGITVAL(enc,c);
1599: }
1600: else {
1601: PUNFETCH;
1602: break;
1603: }
1604: }
1605: *src = p;
1606: return num;
1607: }
1608:
1609: static int
1610: scan_unsigned_octal_number(UChar** src, UChar* end, int maxlen,
1611: OnigEncoding enc)
1612: {
1613: OnigCodePoint c;
1614: unsigned int num, val;
1615: UChar* p = *src;
1616: PFETCH_READY;
1617:
1618: num = 0;
1619: while (!PEND && maxlen-- != 0) {
1620: PFETCH(c);
1621: if (ONIGENC_IS_CODE_DIGIT(enc, c) && c < '8') {
1622: val = ODIGITVAL(c);
1623: if ((INT_MAX_LIMIT - val) / 8UL < num)
1624: return -1; /* overflow */
1625:
1626: num = (num << 3) + val;
1627: }
1628: else {
1629: PUNFETCH;
1630: break;
1631: }
1632: }
1633: *src = p;
1634: return num;
1635: }
1636:
1637:
/* Write one code point (SIZE_CODE_POINT bytes) into `bbuf` at byte offset
   `pos`.  `code` must be an lvalue because its address is taken. */
#define BBUF_WRITE_CODE_POINT(bbuf,pos,code) \
    BBUF_WRITE((bbuf), (pos), &(code), SIZE_CODE_POINT)
1640:
1641: /* data format:
1642: [n][from-1][to-1][from-2][to-2] ... [from-n][to-n]
1643: (all data size is OnigCodePoint)
1644: */
1645: static int
1646: new_code_range(BBuf** pbuf)
1647: {
1648: #define INIT_MULTI_BYTE_RANGE_SIZE (SIZE_CODE_POINT * 5)
1649: int r;
1650: OnigCodePoint n;
1651: BBuf* bbuf;
1652:
1653: bbuf = *pbuf = (BBuf* )xmalloc(sizeof(BBuf));
1654: CHECK_NULL_RETURN_VAL(*pbuf, ONIGERR_MEMORY);
1655: r = BBUF_INIT(*pbuf, INIT_MULTI_BYTE_RANGE_SIZE);
1656: if (r) return r;
1657:
1658: n = 0;
1659: BBUF_WRITE_CODE_POINT(bbuf, 0, n);
1660: return 0;
1661: }
1662:
1663: static int
1664: add_code_range_to_buf(BBuf** pbuf, OnigCodePoint from, OnigCodePoint to)
1665: {
1666: int r, inc_n, pos;
1667: int low, high, bound, x;
1668: OnigCodePoint n, *data;
1669: BBuf* bbuf;
1670:
1671: if (from > to) {
1672: n = from; from = to; to = n;
1673: }
1674:
1675: if (IS_NULL(*pbuf)) {
1676: r = new_code_range(pbuf);
1677: if (r) return r;
1678: bbuf = *pbuf;
1679: n = 0;
1680: }
1681: else {
1682: bbuf = *pbuf;
1683: GET_CODE_POINT(n, bbuf->p);
1684: }
1685: data = (OnigCodePoint* )(bbuf->p);
1686: data++;
1687:
1688: for (low = 0, bound = n; low < bound; ) {
1689: x = (low + bound) >> 1;
1690: if (from > data[x*2 + 1])
1691: low = x + 1;
1692: else
1693: bound = x;
1694: }
1695:
1696: for (high = low, bound = n; high < bound; ) {
1697: x = (high + bound) >> 1;
1698: if (to >= data[x*2] - 1)
1699: high = x + 1;
1700: else
1701: bound = x;
1702: }
1703:
1704: inc_n = low + 1 - high;
1705: if (n + inc_n > ONIG_MAX_MULTI_BYTE_RANGES_NUM)
1706: return ONIGERR_TOO_MANY_MULTI_BYTE_RANGES;
1707:
1708: if (inc_n != 1) {
1709: if (from > data[low*2])
1710: from = data[low*2];
1711: if (to < data[(high - 1)*2 + 1])
1712: to = data[(high - 1)*2 + 1];
1713: }
1714:
1715: if (inc_n != 0 && (OnigCodePoint )high < n) {
1716: int from_pos = SIZE_CODE_POINT * (1 + high * 2);
1717: int to_pos = SIZE_CODE_POINT * (1 + (low + 1) * 2);
1718: int size = (n - high) * 2 * SIZE_CODE_POINT;
1719:
1720: if (inc_n > 0) {
1721: BBUF_MOVE_RIGHT(bbuf, from_pos, to_pos, size);
1722: }
1723: else {
1724: BBUF_MOVE_LEFT_REDUCE(bbuf, from_pos, to_pos);
1725: }
1726: }
1727:
1728: pos = SIZE_CODE_POINT * (1 + low * 2);
1729: BBUF_ENSURE_SIZE(bbuf, pos + SIZE_CODE_POINT * 2);
1730: BBUF_WRITE_CODE_POINT(bbuf, pos, from);
1731: BBUF_WRITE_CODE_POINT(bbuf, pos + SIZE_CODE_POINT, to);
1732: n += inc_n;
1733: BBUF_WRITE_CODE_POINT(bbuf, 0, n);
1734:
1735: return 0;
1736: }
1737:
1738: static int
1739: add_code_range(BBuf** pbuf, ScanEnv* env, OnigCodePoint from, OnigCodePoint to)
1740: {
1741: if (from > to) {
1742: if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC))
1743: return 0;
1744: else
1745: return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS;
1746: }
1747:
1748: return add_code_range_to_buf(pbuf, from, to);
1749: }
1750:
1751: static int
1752: not_code_range_buf(OnigEncoding enc, BBuf* bbuf, BBuf** pbuf)
1753: {
1754: int r, i, n;
1755: OnigCodePoint pre, from, *data, to = 0;
1756:
1757: *pbuf = (BBuf* )NULL;
1758: if (IS_NULL(bbuf)) {
1759: set_all:
1760: return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf);
1761: }
1762:
1763: data = (OnigCodePoint* )(bbuf->p);
1764: GET_CODE_POINT(n, data);
1765: data++;
1766: if (n <= 0) goto set_all;
1767:
1768: r = 0;
1769: pre = MBCODE_START_POS(enc);
1770: for (i = 0; i < n; i++) {
1771: from = data[i*2];
1772: to = data[i*2+1];
1773: if (pre <= from - 1) {
1774: r = add_code_range_to_buf(pbuf, pre, from - 1);
1775: if (r != 0) return r;
1776: }
1777: if (to == ~((OnigCodePoint )0)) break;
1778: pre = to + 1;
1779: }
1780: if (to < ~((OnigCodePoint )0)) {
1781: r = add_code_range_to_buf(pbuf, to + 1, ~((OnigCodePoint )0));
1782: }
1783: return r;
1784: }
1785:
/* Exchange two (buffer, negation flag) pairs in place.  All four
   arguments must be assignable lvalues. */
#define SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2) do {\
  int   swap_not_ = not1;  \
  BBuf *swap_buf_ = bbuf1; \
  not1  = not2;  not2  = swap_not_; \
  bbuf1 = bbuf2; bbuf2 = swap_buf_; \
} while (0)
1792:
1793: static int
1794: or_code_range_buf(OnigEncoding enc, BBuf* bbuf1, int not1,
1795: BBuf* bbuf2, int not2, BBuf** pbuf)
1796: {
1797: int r;
1798: OnigCodePoint i, n1, *data1;
1799: OnigCodePoint from, to;
1800:
1801: *pbuf = (BBuf* )NULL;
1802: if (IS_NULL(bbuf1) && IS_NULL(bbuf2)) {
1803: if (not1 != 0 || not2 != 0)
1804: return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf);
1805: return 0;
1806: }
1807:
1808: r = 0;
1809: if (IS_NULL(bbuf2))
1810: SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2);
1811:
1812: if (IS_NULL(bbuf1)) {
1813: if (not1 != 0) {
1814: return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf);
1815: }
1816: else {
1817: if (not2 == 0) {
1818: return bbuf_clone(pbuf, bbuf2);
1819: }
1820: else {
1821: return not_code_range_buf(enc, bbuf2, pbuf);
1822: }
1823: }
1824: }
1825:
1826: if (not1 != 0)
1827: SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2);
1828:
1829: data1 = (OnigCodePoint* )(bbuf1->p);
1830: GET_CODE_POINT(n1, data1);
1831: data1++;
1832:
1833: if (not2 == 0 && not1 == 0) { /* 1 OR 2 */
1834: r = bbuf_clone(pbuf, bbuf2);
1835: }
1836: else if (not1 == 0) { /* 1 OR (not 2) */
1837: r = not_code_range_buf(enc, bbuf2, pbuf);
1838: }
1839: if (r != 0) return r;
1840:
1841: for (i = 0; i < n1; i++) {
1842: from = data1[i*2];
1843: to = data1[i*2+1];
1844: r = add_code_range_to_buf(pbuf, from, to);
1845: if (r != 0) return r;
1846: }
1847: return 0;
1848: }
1849:
1850: static int
1851: and_code_range1(BBuf** pbuf, OnigCodePoint from1, OnigCodePoint to1,
1852: OnigCodePoint* data, int n)
1853: {
1854: int i, r;
1855: OnigCodePoint from2, to2;
1856:
1857: for (i = 0; i < n; i++) {
1858: from2 = data[i*2];
1859: to2 = data[i*2+1];
1860: if (from2 < from1) {
1861: if (to2 < from1) continue;
1862: else {
1863: from1 = to2 + 1;
1864: }
1865: }
1866: else if (from2 <= to1) {
1867: if (to2 < to1) {
1868: if (from1 <= from2 - 1) {
1869: r = add_code_range_to_buf(pbuf, from1, from2-1);
1870: if (r != 0) return r;
1871: }
1872: from1 = to2 + 1;
1873: }
1874: else {
1875: to1 = from2 - 1;
1876: }
1877: }
1878: else {
1879: from1 = from2;
1880: }
1881: if (from1 > to1) break;
1882: }
1883: if (from1 <= to1) {
1884: r = add_code_range_to_buf(pbuf, from1, to1);
1885: if (r != 0) return r;
1886: }
1887: return 0;
1888: }
1889:
1890: static int
1891: and_code_range_buf(BBuf* bbuf1, int not1, BBuf* bbuf2, int not2, BBuf** pbuf)
1892: {
1893: int r;
1894: OnigCodePoint i, j, n1, n2, *data1, *data2;
1895: OnigCodePoint from, to, from1, to1, from2, to2;
1896:
1897: *pbuf = (BBuf* )NULL;
1898: if (IS_NULL(bbuf1)) {
1899: if (not1 != 0 && IS_NOT_NULL(bbuf2)) /* not1 != 0 -> not2 == 0 */
1900: return bbuf_clone(pbuf, bbuf2);
1901: return 0;
1902: }
1903: else if (IS_NULL(bbuf2)) {
1904: if (not2 != 0)
1905: return bbuf_clone(pbuf, bbuf1);
1906: return 0;
1907: }
1908:
1909: if (not1 != 0)
1910: SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2);
1911:
1912: data1 = (OnigCodePoint* )(bbuf1->p);
1913: data2 = (OnigCodePoint* )(bbuf2->p);
1914: GET_CODE_POINT(n1, data1);
1915: GET_CODE_POINT(n2, data2);
1916: data1++;
1917: data2++;
1918:
1919: if (not2 == 0 && not1 == 0) { /* 1 AND 2 */
1920: for (i = 0; i < n1; i++) {
1921: from1 = data1[i*2];
1922: to1 = data1[i*2+1];
1923: for (j = 0; j < n2; j++) {
1924: from2 = data2[j*2];
1925: to2 = data2[j*2+1];
1926: if (from2 > to1) break;
1927: if (to2 < from1) continue;
1928: from = MAX(from1, from2);
1929: to = MIN(to1, to2);
1930: r = add_code_range_to_buf(pbuf, from, to);
1931: if (r != 0) return r;
1932: }
1933: }
1934: }
1935: else if (not1 == 0) { /* 1 AND (not 2) */
1936: for (i = 0; i < n1; i++) {
1937: from1 = data1[i*2];
1938: to1 = data1[i*2+1];
1939: r = and_code_range1(pbuf, from1, to1, data2, n2);
1940: if (r != 0) return r;
1941: }
1942: }
1943:
1944: return 0;
1945: }
1946:
1947: static int
1948: and_cclass(CClassNode* dest, CClassNode* cc, OnigEncoding enc)
1949: {
1950: int r, not1, not2;
1951: BBuf *buf1, *buf2, *pbuf;
1952: BitSetRef bsr1, bsr2;
1953: BitSet bs1, bs2;
1954:
1955: not1 = IS_CCLASS_NOT(dest);
1956: bsr1 = dest->bs;
1957: buf1 = dest->mbuf;
1958: not2 = IS_CCLASS_NOT(cc);
1959: bsr2 = cc->bs;
1960: buf2 = cc->mbuf;
1961:
1962: if (not1 != 0) {
1963: bitset_invert_to(bsr1, bs1);
1964: bsr1 = bs1;
1965: }
1966: if (not2 != 0) {
1967: bitset_invert_to(bsr2, bs2);
1968: bsr2 = bs2;
1969: }
1970: bitset_and(bsr1, bsr2);
1971: if (bsr1 != dest->bs) {
1972: bitset_copy(dest->bs, bsr1);
1973: bsr1 = dest->bs;
1974: }
1975: if (not1 != 0) {
1976: bitset_invert(dest->bs);
1977: }
1978:
1979: if (! ONIGENC_IS_SINGLEBYTE(enc)) {
1980: if (not1 != 0 && not2 != 0) {
1981: r = or_code_range_buf(enc, buf1, 0, buf2, 0, &pbuf);
1982: }
1983: else {
1984: r = and_code_range_buf(buf1, not1, buf2, not2, &pbuf);
1985: if (r == 0 && not1 != 0) {
1986: BBuf *tbuf;
1987: r = not_code_range_buf(enc, pbuf, &tbuf);
1988: if (r != 0) {
1989: bbuf_free(pbuf);
1990: return r;
1991: }
1992: bbuf_free(pbuf);
1993: pbuf = tbuf;
1994: }
1995: }
1996: if (r != 0) return r;
1997:
1998: dest->mbuf = pbuf;
1999: bbuf_free(buf1);
2000: return r;
2001: }
2002: return 0;
2003: }
2004:
2005: static int
2006: or_cclass(CClassNode* dest, CClassNode* cc, OnigEncoding enc)
2007: {
2008: int r, not1, not2;
2009: BBuf *buf1, *buf2, *pbuf;
2010: BitSetRef bsr1, bsr2;
2011: BitSet bs1, bs2;
2012:
2013: not1 = IS_CCLASS_NOT(dest);
2014: bsr1 = dest->bs;
2015: buf1 = dest->mbuf;
2016: not2 = IS_CCLASS_NOT(cc);
2017: bsr2 = cc->bs;
2018: buf2 = cc->mbuf;
2019:
2020: if (not1 != 0) {
2021: bitset_invert_to(bsr1, bs1);
2022: bsr1 = bs1;
2023: }
2024: if (not2 != 0) {
2025: bitset_invert_to(bsr2, bs2);
2026: bsr2 = bs2;
2027: }
2028: bitset_or(bsr1, bsr2);
2029: if (bsr1 != dest->bs) {
2030: bitset_copy(dest->bs, bsr1);
2031: bsr1 = dest->bs;
2032: }
2033: if (not1 != 0) {
2034: bitset_invert(dest->bs);
2035: }
2036:
2037: if (! ONIGENC_IS_SINGLEBYTE(enc)) {
2038: if (not1 != 0 && not2 != 0) {
2039: r = and_code_range_buf(buf1, 0, buf2, 0, &pbuf);
2040: }
2041: else {
2042: r = or_code_range_buf(enc, buf1, not1, buf2, not2, &pbuf);
2043: if (r == 0 && not1 != 0) {
2044: BBuf *tbuf;
2045: r = not_code_range_buf(enc, pbuf, &tbuf);
2046: if (r != 0) {
2047: bbuf_free(pbuf);
2048: return r;
2049: }
2050: bbuf_free(pbuf);
2051: pbuf = tbuf;
2052: }
2053: }
2054: if (r != 0) return r;
2055:
2056: dest->mbuf = pbuf;
2057: bbuf_free(buf1);
2058: return r;
2059: }
2060: else
2061: return 0;
2062: }
2063:
2064: static int
2065: conv_backslash_value(int c, ScanEnv* env)
2066: {
2067: if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_CONTROL_CHARS)) {
2068: switch (c) {
2069: case 'n': return '\n';
2070: case 't': return '\t';
2071: case 'r': return '\r';
2072: case 'f': return '\f';
2073: case 'a': return '\007';
2074: case 'b': return '\010';
2075: case 'e': return '\033';
2076: case 'v':
2077: if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ESC_V_VTAB))
2078: return '\v';
2079: break;
2080:
2081: default:
2082: break;
2083: }
2084: }
2085: return c;
2086: }
2087:
2088: static int
2089: is_invalid_quantifier_target(Node* node)
2090: {
2091: switch (NTYPE(node)) {
2092: case N_ANCHOR:
2093: return 1;
2094: break;
2095:
2096: case N_EFFECT:
2097: if (NEFFECT(node).type == EFFECT_OPTION)
2098: return is_invalid_quantifier_target(NEFFECT(node).target);
2099: break;
2100:
2101: case N_LIST: /* ex. (?:\G\A)* */
2102: do {
2103: if (! is_invalid_quantifier_target(NCONS(node).left)) return 0;
2104: } while (IS_NOT_NULL(node = NCONS(node).right));
2105: return 0;
2106: break;
2107:
2108: case N_ALT: /* ex. (?:abc|\A)* */
2109: do {
2110: if (is_invalid_quantifier_target(NCONS(node).left)) return 1;
2111: } while (IS_NOT_NULL(node = NCONS(node).right));
2112: break;
2113:
2114: default:
2115: break;
2116: }
2117: return 0;
2118: }
2119:
2120: /* ?:0, *:1, +:2, ??:3, *?:4, +?:5 */
2121: static int
2122: popular_quantifier_num(QuantifierNode* qf)
2123: {
2124: if (qf->greedy) {
2125: if (qf->lower == 0) {
2126: if (qf->upper == 1) return 0;
2127: else if (IS_REPEAT_INFINITE(qf->upper)) return 1;
2128: }
2129: else if (qf->lower == 1) {
2130: if (IS_REPEAT_INFINITE(qf->upper)) return 2;
2131: }
2132: }
2133: else {
2134: if (qf->lower == 0) {
2135: if (qf->upper == 1) return 3;
2136: else if (IS_REPEAT_INFINITE(qf->upper)) return 4;
2137: }
2138: else if (qf->lower == 1) {
2139: if (IS_REPEAT_INFINITE(qf->upper)) return 5;
2140: }
2141: }
2142: return -1;
2143: }
2144:
2145:
/* Actions for collapsing a quantifier applied directly to another
   quantifier; see onig_reduce_nested_quantifier(). */
enum ReduceType {
  RQ_ASIS = 0, /* as is */
  RQ_DEL  = 1, /* delete parent */
  RQ_A    = 2, /* to '*'    */
  RQ_AQ   = 3, /* to '*?'   */
  RQ_QQ   = 4, /* to '??'   */
  RQ_P_QQ = 5, /* to '+)??' */
  RQ_PQ_Q = 6  /* to '+?)?' */
};
2155:
2156: static enum ReduceType ReduceTypeTable[6][6] = {
2157: {RQ_DEL, RQ_A, RQ_A, RQ_QQ, RQ_AQ, RQ_ASIS}, /* '?' */
2158: {RQ_DEL, RQ_DEL, RQ_DEL, RQ_P_QQ, RQ_P_QQ, RQ_DEL}, /* '*' */
2159: {RQ_A, RQ_A, RQ_DEL, RQ_ASIS, RQ_P_QQ, RQ_DEL}, /* '+' */
2160: {RQ_DEL, RQ_AQ, RQ_AQ, RQ_DEL, RQ_AQ, RQ_AQ}, /* '??' */
2161: {RQ_DEL, RQ_DEL, RQ_DEL, RQ_DEL, RQ_DEL, RQ_DEL}, /* '*?' */
2162: {RQ_ASIS, RQ_PQ_Q, RQ_DEL, RQ_AQ, RQ_AQ, RQ_DEL} /* '+?' */
2163: };
2164:
2165: extern void
2166: onig_reduce_nested_quantifier(Node* pnode, Node* cnode)
2167: {
2168: int pnum, cnum;
2169: QuantifierNode *p, *c;
2170:
2171: p = &(NQUANTIFIER(pnode));
2172: c = &(NQUANTIFIER(cnode));
2173: pnum = popular_quantifier_num(p);
2174: cnum = popular_quantifier_num(c);
2175:
2176: switch(ReduceTypeTable[cnum][pnum]) {
2177: case RQ_DEL:
2178: *p = *c;
2179: break;
2180: case RQ_A:
2181: p->target = c->target;
2182: p->lower = 0; p->upper = REPEAT_INFINITE; p->greedy = 1;
2183: break;
2184: case RQ_AQ:
2185: p->target = c->target;
2186: p->lower = 0; p->upper = REPEAT_INFINITE; p->greedy = 0;
2187: break;
2188: case RQ_QQ:
2189: p->target = c->target;
2190: p->lower = 0; p->upper = 1; p->greedy = 0;
2191: break;
2192: case RQ_P_QQ:
2193: p->target = cnode;
2194: p->lower = 0; p->upper = 1; p->greedy = 0;
2195: c->lower = 1; c->upper = REPEAT_INFINITE; c->greedy = 1;
2196: return ;
2197: break;
2198: case RQ_PQ_Q:
2199: p->target = cnode;
2200: p->lower = 0; p->upper = 1; p->greedy = 1;
2201: c->lower = 1; c->upper = REPEAT_INFINITE; c->greedy = 0;
2202: return ;
2203: break;
2204: case RQ_ASIS:
2205: p->target = cnode;
2206: return ;
2207: break;
2208: }
2209:
2210: c->target = NULL_NODE;
2211: onig_node_free(cnode);
2212: }
2213:
2214:
/* Token kinds stored in OnigToken.type. */
enum TokenSyms {
  TK_EOT      = 0,   /* end of token */
  TK_RAW_BYTE = 1,
  TK_CHAR,
  TK_STRING,
  TK_CODE_POINT,
  TK_ANYCHAR,
  TK_CHAR_TYPE,
  TK_BACKREF,
  TK_CALL,
  TK_ANCHOR,
  TK_OP_REPEAT,
  TK_INTERVAL,
  TK_ANYCHAR_ANYTIME,  /* SQL '%' == .* */
  TK_ALT,
  TK_SUBEXP_OPEN,
  TK_SUBEXP_CLOSE,
  TK_CC_OPEN,
  TK_QUOTE_OPEN,
  TK_CHAR_PROPERTY,    /* \p{...}, \P{...} */

  /* tokens that occur inside a character class */
  TK_CC_CLOSE,
  TK_CC_RANGE,
  TK_POSIX_BRACKET_OPEN,
  TK_CC_AND,             /* && */
  TK_CC_CC_OPEN          /* [ */
};
2242:
2243: typedef struct {
2244: enum TokenSyms type;
2245: int escaped;
2246: int base; /* is number: 8, 16 (used in [....]) */
2247: UChar* backp;
2248: union {
2249: UChar* s;
2250: int c;
2251: OnigCodePoint code;
2252: int anchor;
2253: int subtype;
2254: struct {
2255: int lower;
2256: int upper;
2257: int greedy;
2258: int possessive;
2259: } repeat;
2260: struct {
2261: int num;
2262: int ref1;
2263: int* refs;
2264: int by_name;
2265: #ifdef USE_BACKREF_AT_LEVEL
2266: int exist_level;
2267: int level; /* \k<name+n> */
2268: #endif
2269: } backref;
2270: struct {
2271: UChar* name;
2272: UChar* name_end;
2273: } call;
2274: struct {
2275: int not;
2276: } prop;
2277: } u;
2278: } OnigToken;
2279:
2280:
2281: static int
2282: fetch_range_quantifier(UChar** src, UChar* end, OnigToken* tok, ScanEnv* env)
2283: {
2284: int low, up, syn_allow, non_low = 0;
2285: int r = 0;
2286: OnigCodePoint c;
2287: OnigEncoding enc = env->enc;
2288: UChar* p = *src;
2289: PFETCH_READY;
2290:
2291: syn_allow = IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_INVALID_INTERVAL);
2292:
2293: if (PEND) {
2294: if (syn_allow)
2295: return 1; /* "....{" : OK! */
2296: else
2297: return ONIGERR_END_PATTERN_AT_LEFT_BRACE; /* "....{" syntax error */
2298: }
2299:
2300: if (! syn_allow) {
2301: c = PPEEK;
2302: if (c == ')' || c == '(' || c == '|') {
2303: return ONIGERR_END_PATTERN_AT_LEFT_BRACE;
2304: }
2305: }
2306:
2307: low = onig_scan_unsigned_number(&p, end, env->enc);
2308: if (low < 0) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
2309: if (low > ONIG_MAX_REPEAT_NUM)
2310: return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
2311:
2312: if (p == *src) { /* can't read low */
2313: if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_INTERVAL_LOW_ABBREV)) {
2314: /* allow {,n} as {0,n} */
2315: low = 0;
2316: non_low = 1;
2317: }
2318: else
2319: goto invalid;
2320: }
2321:
2322: if (PEND) goto invalid;
2323: PFETCH(c);
2324: if (c == ',') {
2325: UChar* prev = p;
2326: up = onig_scan_unsigned_number(&p, end, env->enc);
2327: if (up < 0) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
2328: if (up > ONIG_MAX_REPEAT_NUM)
2329: return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
2330:
2331: if (p == prev) {
2332: if (non_low != 0)
2333: goto invalid;
2334: up = REPEAT_INFINITE; /* {n,} : {n,infinite} */
2335: }
2336: }
2337: else {
2338: if (non_low != 0)
2339: goto invalid;
2340:
2341: PUNFETCH;
2342: up = low; /* {n} : exact n times */
2343: r = 2; /* fixed */
2344: }
2345:
2346: if (PEND) goto invalid;
2347: PFETCH(c);
2348: if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_BRACE_INTERVAL)) {
2349: if (c != MC_ESC(enc)) goto invalid;
2350: PFETCH(c);
2351: }
2352: if (c != '}') goto invalid;
2353:
2354: if (!IS_REPEAT_INFINITE(up) && low > up) {
2355: return ONIGERR_UPPER_SMALLER_THAN_LOWER_IN_REPEAT_RANGE;
2356: }
2357:
2358: tok->type = TK_INTERVAL;
2359: tok->u.repeat.lower = low;
2360: tok->u.repeat.upper = up;
2361: *src = p;
2362: return r; /* 0: normal {n,m}, 2: fixed {n} */
2363:
2364: invalid:
2365: if (syn_allow)
2366: return 1; /* OK */
2367: else
2368: return ONIGERR_INVALID_REPEAT_RANGE_PATTERN;
2369: }
2370:
2371: /* \M-, \C-, \c, or \... */
2372: static int
2373: fetch_escaped_value(UChar** src, UChar* end, ScanEnv* env)
2374: {
2375: int v;
2376: OnigCodePoint c;
2377: OnigEncoding enc = env->enc;
2378: UChar* p = *src;
2379: PFETCH_READY;
2380:
2381: if (PEND) return ONIGERR_END_PATTERN_AT_ESCAPE;
2382:
2383: PFETCH(c);
2384: switch (c) {
2385: case 'M':
2386: if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ESC_CAPITAL_M_BAR_META)) {
2387: if (PEND) return ONIGERR_END_PATTERN_AT_META;
2388: PFETCH(c);
2389: if (c != '-') return ONIGERR_META_CODE_SYNTAX;
2390: if (PEND) return ONIGERR_END_PATTERN_AT_META;
2391: PFETCH(c);
2392: if (c == MC_ESC(enc)) {
2393: v = fetch_escaped_value(&p, end, env);
2394: if (v < 0) return v;
2395: c = (OnigCodePoint )v;
2396: }
2397: c = ((c & 0xff) | 0x80);
2398: }
2399: else
2400: goto backslash;
2401: break;
2402:
2403: case 'C':
2404: if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ESC_CAPITAL_C_BAR_CONTROL)) {
2405: if (PEND) return ONIGERR_END_PATTERN_AT_CONTROL;
2406: PFETCH(c);
2407: if (c != '-') return ONIGERR_CONTROL_CODE_SYNTAX;
2408: goto control;
2409: }
2410: else
2411: goto backslash;
2412:
2413: case 'c':
2414: if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_C_CONTROL)) {
2415: control:
2416: if (PEND) return ONIGERR_END_PATTERN_AT_CONTROL;
2417: PFETCH(c);
2418: if (c == '?') {
2419: c = 0177;
2420: }
2421: else {
2422: if (c == MC_ESC(enc)) {
2423: v = fetch_escaped_value(&p, end, env);
2424: if (v < 0) return v;
2425: c = (OnigCodePoint )v;
2426: }
2427: c &= 0x9f;
2428: }
2429: break;
2430: }
2431: /* fall through */
2432:
2433: default:
2434: {
2435: backslash:
2436: c = conv_backslash_value(c, env);
2437: }
2438: break;
2439: }
2440:
2441: *src = p;
2442: return c;
2443: }
2444:
2445: static int fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env);
2446:
2447: #ifdef USE_NAMED_GROUP
2448: #ifdef USE_BACKREF_AT_LEVEL
2449: /*
2450: \k<name+n>, \k<name-n>
2451: */
2452: static int
2453: fetch_name_with_level(UChar** src, UChar* end, UChar** rname_end
2454: , ScanEnv* env, int* level)
2455: {
2456: int r, exist_level = 0;
2457: OnigCodePoint c = 0;
2458: OnigCodePoint first_code;
2459: OnigEncoding enc = env->enc;
2460: UChar *name_end;
2461: UChar *p = *src;
2462: PFETCH_READY;
2463:
2464: name_end = end;
2465: r = 0;
2466: if (PEND) {
2467: return ONIGERR_EMPTY_GROUP_NAME;
2468: }
2469: else {
2470: PFETCH(c);
2471: first_code = c;
2472: if (c == '>')
2473: return ONIGERR_EMPTY_GROUP_NAME;
2474:
2475: if (!ONIGENC_IS_CODE_WORD(enc, c)) {
2476: r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2477: }
2478: }
2479:
2480: while (!PEND) {
2481: name_end = p;
2482: PFETCH(c);
2483: if (c == '>' || c == ')' || c == '+' || c == '-') break;
2484:
2485: if (!ONIGENC_IS_CODE_WORD(enc, c)) {
2486: r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2487: }
2488: }
2489:
2490: if (c != '>') {
2491: if (c == '+' || c == '-') {
2492: int num;
2493: int flag = (c == '-' ? -1 : 1);
2494:
2495: PFETCH(c);
2496: if (! ONIGENC_IS_CODE_DIGIT(enc, c)) goto err;
2497: PUNFETCH;
2498: num = onig_scan_unsigned_number(&p, end, enc);
2499: if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
2500: *level = (num * flag);
2501: exist_level = 1;
2502:
2503: PFETCH(c);
2504: if (c == '>')
2505: goto first_check;
2506: }
2507:
2508: err:
2509: r = ONIGERR_INVALID_GROUP_NAME;
2510: name_end = end;
2511: }
2512: else {
2513: first_check:
2514: if (ONIGENC_IS_CODE_ASCII(first_code) &&
2515: ONIGENC_IS_CODE_UPPER(enc, first_code))
2516: r = ONIGERR_INVALID_GROUP_NAME;
2517: }
2518:
2519: if (r == 0) {
2520: *rname_end = name_end;
2521: *src = p;
2522: return (exist_level ? 1 : 0);
2523: }
2524: else {
2525: onig_scan_env_set_error_string(env, r, *src, name_end);
2526: return r;
2527: }
2528: }
2529: #endif /* USE_BACKREF_AT_LEVEL */
2530:
2531: /*
2532: def: 0 -> define name (don't allow number name)
2533: 1 -> reference name (allow number name)
2534: */
2535: static int
2536: fetch_name(UChar** src, UChar* end, UChar** rname_end, ScanEnv* env, int ref)
2537: {
2538: int r, is_num;
2539: OnigCodePoint c = 0;
2540: OnigCodePoint first_code;
2541: OnigEncoding enc = env->enc;
2542: UChar *name_end;
2543: UChar *p = *src;
2544: PFETCH_READY;
2545:
2546: name_end = end;
2547: r = 0;
2548: is_num = 0;
2549: if (PEND) {
2550: return ONIGERR_EMPTY_GROUP_NAME;
2551: }
2552: else {
2553: PFETCH(c);
2554: first_code = c;
2555: if (c == '>')
2556: return ONIGERR_EMPTY_GROUP_NAME;
2557:
2558: if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
2559: if (ref == 1)
2560: is_num = 1;
2561: else {
2562: r = ONIGERR_INVALID_GROUP_NAME;
2563: }
2564: }
2565: else if (!ONIGENC_IS_CODE_WORD(enc, c)) {
2566: r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2567: }
2568: }
2569:
2570: while (!PEND) {
2571: name_end = p;
2572: PFETCH(c);
2573: if (c == '>' || c == ')') break;
2574:
2575: if (is_num == 1) {
2576: if (! ONIGENC_IS_CODE_DIGIT(enc, c)) {
2577: if (!ONIGENC_IS_CODE_WORD(enc, c))
2578: r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2579: else
2580: r = ONIGERR_INVALID_GROUP_NAME;
2581: }
2582: }
2583: else {
2584: if (!ONIGENC_IS_CODE_WORD(enc, c)) {
2585: r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2586: }
2587: }
2588: }
2589:
2590: if (c != '>') {
2591: r = ONIGERR_INVALID_GROUP_NAME;
2592: name_end = end;
2593: }
2594: else {
2595: if (ONIGENC_IS_CODE_ASCII(first_code) &&
2596: ONIGENC_IS_CODE_UPPER(enc, first_code))
2597: r = ONIGERR_INVALID_GROUP_NAME;
2598: }
2599:
2600: if (r == 0) {
2601: *rname_end = name_end;
2602: *src = p;
2603: return 0;
2604: }
2605: else {
2606: onig_scan_env_set_error_string(env, r, *src, name_end);
2607: return r;
2608: }
2609: }
2610: #else
2611: static int
2612: fetch_name(UChar** src, UChar* end, UChar** rname_end, ScanEnv* env, int ref)
2613: {
2614: int r, len;
2615: OnigCodePoint c = 0;
2616: UChar *name_end;
2617: OnigEncoding enc = env->enc;
2618: UChar *p = *src;
2619: PFETCH_READY;
2620:
2621: r = 0;
2622: while (!PEND) {
2623: name_end = p;
2624: if (enc_len(enc, p) > 1)
2625: r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2626:
2627: PFETCH(c);
2628: if (c == '>' || c == ')') break;
2629: if (! ONIGENC_IS_CODE_DIGIT(enc, c))
2630: r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2631: }
2632: if (c != '>') {
2633: r = ONIGERR_INVALID_GROUP_NAME;
2634: name_end = end;
2635: }
2636:
2637: if (r == 0) {
2638: *rname_end = name_end;
2639: *src = p;
2640: return 0;
2641: }
2642: else {
2643: err:
2644: onig_scan_env_set_error_string(env, r, *src, name_end);
2645: return r;
2646: }
2647: }
2648: #endif
2649:
2650: static void
2651: CC_ESC_WARN(ScanEnv* env, UChar *c)
2652: {
2653: if (onig_warn == onig_null_warn) return ;
2654:
2655: if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_WARN_CC_OP_NOT_ESCAPED) &&
2656: IS_SYNTAX_BV(env->syntax, ONIG_SYN_BACKSLASH_ESCAPE_IN_CC)) {
2657: UChar buf[WARN_BUFSIZE];
2658: onig_snprintf_with_pattern(buf, WARN_BUFSIZE, env->enc,
2659: env->pattern, env->pattern_end,
2660: (UChar* )"character class has '%s' without escape", c);
2661: (*onig_warn)((char* )buf);
2662: }
2663: }
2664:
2665: static void
2666: CCEND_ESC_WARN(ScanEnv* env, UChar* c)
2667: {
2668: if (onig_warn == onig_null_warn) return ;
2669:
2670: if (IS_SYNTAX_BV((env)->syntax, ONIG_SYN_WARN_CC_OP_NOT_ESCAPED)) {
2671: UChar buf[WARN_BUFSIZE];
2672: onig_snprintf_with_pattern(buf, WARN_BUFSIZE, (env)->enc,
2673: (env)->pattern, (env)->pattern_end,
2674: (UChar* )"regular expression has '%s' without escape", c);
2675: (*onig_warn)((char* )buf);
2676: }
2677: }
2678:
2679: static UChar*
2680: find_str_position(OnigCodePoint s[], int n, UChar* from, UChar* to,
2681: UChar **next, OnigEncoding enc)
2682: {
2683: int i;
2684: OnigCodePoint x;
2685: UChar *q;
2686: UChar *p = from;
2687:
2688: while (p < to) {
2689: x = ONIGENC_MBC_TO_CODE(enc, p, to);
2690: q = p + enc_len(enc, p);
2691: if (x == s[0]) {
2692: for (i = 1; i < n && q < to; i++) {
2693: x = ONIGENC_MBC_TO_CODE(enc, q, to);
2694: if (x != s[i]) break;
2695: q += enc_len(enc, q);
2696: }
2697: if (i >= n) {
2698: if (IS_NOT_NULL(next))
2699: *next = q;
2700: return p;
2701: }
2702: }
2703: p = q;
2704: }
2705: return NULL_UCHARP;
2706: }
2707:
2708: static int
2709: str_exist_check_with_esc(OnigCodePoint s[], int n, UChar* from, UChar* to,
2710: OnigCodePoint bad, OnigEncoding enc)
2711: {
2712: int i, in_esc;
2713: OnigCodePoint x;
2714: UChar *q;
2715: UChar *p = from;
2716:
2717: in_esc = 0;
2718: while (p < to) {
2719: if (in_esc) {
2720: in_esc = 0;
2721: p += enc_len(enc, p);
2722: }
2723: else {
2724: x = ONIGENC_MBC_TO_CODE(enc, p, to);
2725: q = p + enc_len(enc, p);
2726: if (x == s[0]) {
2727: for (i = 1; i < n && q < to; i++) {
2728: x = ONIGENC_MBC_TO_CODE(enc, q, to);
2729: if (x != s[i]) break;
2730: q += enc_len(enc, q);
2731: }
2732: if (i >= n) return 1;
2733: p += enc_len(enc, p);
2734: }
2735: else {
2736: x = ONIGENC_MBC_TO_CODE(enc, p, to);
2737: if (x == bad) return 0;
2738: else if (x == MC_ESC(enc)) in_esc = 1;
2739: p = q;
2740: }
2741: }
2742: }
2743: return 0;
2744: }
2745:
2746: static int
2747: fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
2748: {
2749: int num;
2750: OnigCodePoint c, c2;
2751: OnigSyntaxType* syn = env->syntax;
2752: OnigEncoding enc = env->enc;
2753: UChar* prev;
2754: UChar* p = *src;
2755: PFETCH_READY;
2756:
2757: if (PEND) {
2758: tok->type = TK_EOT;
2759: return tok->type;
2760: }
2761:
2762: PFETCH(c);
2763: tok->type = TK_CHAR;
2764: tok->base = 0;
2765: tok->u.c = c;
2766: tok->escaped = 0;
2767:
2768: if (c == ']') {
2769: tok->type = TK_CC_CLOSE;
2770: }
2771: else if (c == '-') {
2772: tok->type = TK_CC_RANGE;
2773: }
2774: else if (c == MC_ESC(enc)) {
2775: if (! IS_SYNTAX_BV(syn, ONIG_SYN_BACKSLASH_ESCAPE_IN_CC))
2776: goto end;
2777:
2778: if (PEND) return ONIGERR_END_PATTERN_AT_ESCAPE;
2779:
2780: PFETCH(c);
2781: tok->escaped = 1;
2782: tok->u.c = c;
2783: switch (c) {
2784: case 'w':
2785: tok->type = TK_CHAR_TYPE;
2786: tok->u.subtype = CTYPE_WORD;
2787: break;
2788: case 'W':
2789: tok->type = TK_CHAR_TYPE;
2790: tok->u.subtype = CTYPE_NOT_WORD;
2791: break;
2792: case 'd':
2793: tok->type = TK_CHAR_TYPE;
2794: tok->u.subtype = CTYPE_DIGIT;
2795: break;
2796: case 'D':
2797: tok->type = TK_CHAR_TYPE;
2798: tok->u.subtype = CTYPE_NOT_DIGIT;
2799: break;
2800: case 's':
2801: tok->type = TK_CHAR_TYPE;
2802: tok->u.subtype = CTYPE_WHITE_SPACE;
2803: break;
2804: case 'S':
2805: tok->type = TK_CHAR_TYPE;
2806: tok->u.subtype = CTYPE_NOT_WHITE_SPACE;
2807: break;
2808: case 'h':
2809: if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
2810: tok->type = TK_CHAR_TYPE;
2811: tok->u.subtype = CTYPE_XDIGIT;
2812: break;
2813: case 'H':
2814: if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
2815: tok->type = TK_CHAR_TYPE;
2816: tok->u.subtype = CTYPE_NOT_XDIGIT;
2817: break;
2818:
2819: case 'p':
2820: case 'P':
2821: c2 = PPEEK;
2822: if (c2 == '{' &&
2823: IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY)) {
2824: PINC;
2825: tok->type = TK_CHAR_PROPERTY;
2826: tok->u.prop.not = (c == 'P' ? 1 : 0);
2827:
2828: if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT)) {
2829: PFETCH(c2);
2830: if (c2 == '^') {
2831: tok->u.prop.not = (tok->u.prop.not == 0 ? 1 : 0);
2832: }
2833: else
2834: PUNFETCH;
2835: }
2836: }
2837: break;
2838:
2839: case 'x':
2840: if (PEND) break;
2841:
2842: prev = p;
2843: if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_BRACE_HEX8)) {
2844: PINC;
2845: num = scan_unsigned_hexadecimal_number(&p, end, 8, enc);
2846: if (num < 0) return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE;
2847: if (!PEND) {
2848: c2 = PPEEK;
2849: if (ONIGENC_IS_CODE_XDIGIT(enc, c2))
2850: return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE;
2851: }
2852:
2853: if (p > prev + enc_len(enc, prev) && !PEND && (PPEEK_IS('}'))) {
2854: PINC;
2855: tok->type = TK_CODE_POINT;
2856: tok->base = 16;
2857: tok->u.code = (OnigCodePoint )num;
2858: }
2859: else {
2860: /* can't read nothing or invalid format */
2861: p = prev;
2862: }
2863: }
2864: else if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_HEX2)) {
2865: num = scan_unsigned_hexadecimal_number(&p, end, 2, enc);
2866: if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
2867: if (p == prev) { /* can't read nothing. */
2868: num = 0; /* but, it's not error */
2869: }
2870: tok->type = TK_RAW_BYTE;
2871: tok->base = 16;
2872: tok->u.c = num;
2873: }
2874: break;
2875:
2876: case 'u':
2877: if (PEND) break;
2878:
2879: prev = p;
2880: if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_U_HEX4)) {
2881: num = scan_unsigned_hexadecimal_number(&p, end, 4, enc);
2882: if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
2883: if (p == prev) { /* can't read nothing. */
2884: num = 0; /* but, it's not error */
2885: }
2886: tok->type = TK_CODE_POINT;
2887: tok->base = 16;
2888: tok->u.code = (OnigCodePoint )num;
2889: }
2890: break;
2891:
2892: case '0':
2893: case '1': case '2': case '3': case '4': case '5': case '6': case '7':
2894: if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_OCTAL3)) {
2895: PUNFETCH;
2896: prev = p;
2897: num = scan_unsigned_octal_number(&p, end, 3, enc);
2898: if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
2899: if (p == prev) { /* can't read nothing. */
2900: num = 0; /* but, it's not error */
2901: }
2902: tok->type = TK_RAW_BYTE;
2903: tok->base = 8;
2904: tok->u.c = num;
2905: }
2906: break;
2907:
2908: default:
2909: PUNFETCH;
2910: num = fetch_escaped_value(&p, end, env);
2911: if (num < 0) return num;
2912: if (tok->u.c != num) {
2913: tok->u.code = (OnigCodePoint )num;
2914: tok->type = TK_CODE_POINT;
2915: }
2916: break;
2917: }
2918: }
2919: else if (c == '[') {
2920: if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_POSIX_BRACKET) && (PPEEK_IS(':'))) {
2921: OnigCodePoint send[] = { (OnigCodePoint )':', (OnigCodePoint )']' };
2922: tok->backp = p; /* point at '[' is readed */
2923: PINC;
2924: if (str_exist_check_with_esc(send, 2, p, end,
2925: (OnigCodePoint )']', enc)) {
2926: tok->type = TK_POSIX_BRACKET_OPEN;
2927: }
2928: else {
2929: PUNFETCH;
2930: goto cc_in_cc;
2931: }
2932: }
2933: else {
2934: cc_in_cc:
2935: if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_CCLASS_SET_OP)) {
2936: tok->type = TK_CC_CC_OPEN;
2937: }
2938: else {
2939: CC_ESC_WARN(env, (UChar* )"[");
2940: }
2941: }
2942: }
2943: else if (c == '&') {
2944: if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_CCLASS_SET_OP) &&
2945: !PEND && (PPEEK_IS('&'))) {
2946: PINC;
2947: tok->type = TK_CC_AND;
2948: }
2949: }
2950:
2951: end:
2952: *src = p;
2953: return tok->type;
2954: }
2955:
2956: static int
2957: fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
2958: {
2959: int r, num;
2960: OnigCodePoint c;
2961: OnigEncoding enc = env->enc;
2962: OnigSyntaxType* syn = env->syntax;
2963: UChar* prev;
2964: UChar* p = *src;
2965: PFETCH_READY;
2966:
2967: start:
2968: if (PEND) {
2969: tok->type = TK_EOT;
2970: return tok->type;
2971: }
2972:
2973: tok->type = TK_STRING;
2974: tok->base = 0;
2975: tok->backp = p;
2976:
2977: PFETCH(c);
2978: if (IS_MC_ESC_CODE(c, enc, syn)) {
2979: if (PEND) return ONIGERR_END_PATTERN_AT_ESCAPE;
2980:
2981: tok->backp = p;
2982: PFETCH(c);
2983:
2984: tok->u.c = c;
2985: tok->escaped = 1;
2986: switch (c) {
2987: case '*':
2988: if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_ASTERISK_ZERO_INF)) break;
2989: tok->type = TK_OP_REPEAT;
2990: tok->u.repeat.lower = 0;
2991: tok->u.repeat.upper = REPEAT_INFINITE;
2992: goto greedy_check;
2993: break;
2994:
2995: case '+':
2996: if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_PLUS_ONE_INF)) break;
2997: tok->type = TK_OP_REPEAT;
2998: tok->u.repeat.lower = 1;
2999: tok->u.repeat.upper = REPEAT_INFINITE;
3000: goto greedy_check;
3001: break;
3002:
3003: case '?':
3004: if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_QMARK_ZERO_ONE)) break;
3005: tok->type = TK_OP_REPEAT;
3006: tok->u.repeat.lower = 0;
3007: tok->u.repeat.upper = 1;
3008: greedy_check:
3009: if (!PEND && PPEEK_IS('?') &&
3010: IS_SYNTAX_OP(syn, ONIG_SYN_OP_QMARK_NON_GREEDY)) {
3011: PFETCH(c);
3012: tok->u.repeat.greedy = 0;
3013: tok->u.repeat.possessive = 0;
3014: }
3015: else {
3016: possessive_check:
3017: if (!PEND && PPEEK_IS('+') &&
3018: ((IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_PLUS_POSSESSIVE_REPEAT) &&
3019: tok->type != TK_INTERVAL) ||
3020: (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_PLUS_POSSESSIVE_INTERVAL) &&
3021: tok->type == TK_INTERVAL))) {
3022: PFETCH(c);
3023: tok->u.repeat.greedy = 1;
3024: tok->u.repeat.possessive = 1;
3025: }
3026: else {
3027: tok->u.repeat.greedy = 1;
3028: tok->u.repeat.possessive = 0;
3029: }
3030: }
3031: break;
3032:
3033: case '{':
3034: if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_BRACE_INTERVAL)) break;
3035: r = fetch_range_quantifier(&p, end, tok, env);
3036: if (r < 0) return r; /* error */
3037: if (r == 0) goto greedy_check;
3038: else if (r == 2) { /* {n} */
3039: if (IS_SYNTAX_BV(syn, ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY))
3040: goto possessive_check;
3041:
3042: goto greedy_check;
3043: }
3044: /* r == 1 : normal char */
3045: break;
3046:
3047: case '|':
3048: if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_VBAR_ALT)) break;
3049: tok->type = TK_ALT;
3050: break;
3051:
3052: case '(':
3053: if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LPAREN_SUBEXP)) break;
3054: tok->type = TK_SUBEXP_OPEN;
3055: break;
3056:
3057: case ')':
3058: if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LPAREN_SUBEXP)) break;
3059: tok->type = TK_SUBEXP_CLOSE;
3060: break;
3061:
3062: case 'w':
3063: if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_W_WORD)) break;
3064: tok->type = TK_CHAR_TYPE;
3065: tok->u.subtype = CTYPE_WORD;
3066: break;
3067:
3068: case 'W':
3069: if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_W_WORD)) break;
3070: tok->type = TK_CHAR_TYPE;
3071: tok->u.subtype = CTYPE_NOT_WORD;
3072: break;
3073:
3074: case 'b':
3075: if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_B_WORD_BOUND)) break;
3076: tok->type = TK_ANCHOR;
3077: tok->u.anchor = ANCHOR_WORD_BOUND;
3078: break;
3079:
3080: case 'B':
3081: if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_B_WORD_BOUND)) break;
3082: tok->type = TK_ANCHOR;
3083: tok->u.anchor = ANCHOR_NOT_WORD_BOUND;
3084: break;
3085:
3086: #ifdef USE_WORD_BEGIN_END
3087: case '<':
3088: if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END)) break;
3089: tok->type = TK_ANCHOR;
3090: tok->u.anchor = ANCHOR_WORD_BEGIN;
3091: break;
3092:
3093: case '>':
3094: if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END)) break;
3095: tok->type = TK_ANCHOR;
3096: tok->u.anchor = ANCHOR_WORD_END;
3097: break;
3098: #endif
3099:
3100: case 's':
3101: if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_S_WHITE_SPACE)) break;
3102: tok->type = TK_CHAR_TYPE;
3103: tok->u.subtype = CTYPE_WHITE_SPACE;
3104: break;
3105:
3106: case 'S':
3107: if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_S_WHITE_SPACE)) break;
3108: tok->type = TK_CHAR_TYPE;
3109: tok->u.subtype = CTYPE_NOT_WHITE_SPACE;
3110: break;
3111:
3112: case 'd':
3113: if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_D_DIGIT)) break;
3114: tok->type = TK_CHAR_TYPE;
3115: tok->u.subtype = CTYPE_DIGIT;
3116: break;
3117:
3118: case 'D':
3119: if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_D_DIGIT)) break;
3120: tok->type = TK_CHAR_TYPE;
3121: tok->u.subtype = CTYPE_NOT_DIGIT;
3122: break;
3123:
3124: case 'h':
3125: if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
3126: tok->type = TK_CHAR_TYPE;
3127: tok->u.subtype = CTYPE_XDIGIT;
3128: break;
3129:
3130: case 'H':
3131: if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
3132: tok->type = TK_CHAR_TYPE;
3133: tok->u.subtype = CTYPE_NOT_XDIGIT;
3134: break;
3135:
3136: case 'A':
3137: if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break;
3138: begin_buf:
3139: tok->type = TK_ANCHOR;
3140: tok->u.subtype = ANCHOR_BEGIN_BUF;
3141: break;
3142:
3143: case 'Z':
3144: if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break;
3145: tok->type = TK_ANCHOR;
3146: tok->u.subtype = ANCHOR_SEMI_END_BUF;
3147: break;
3148:
3149: case 'z':
3150: if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break;
3151: end_buf:
3152: tok->type = TK_ANCHOR;
3153: tok->u.subtype = ANCHOR_END_BUF;
3154: break;
3155:
3156: case 'G':
3157: if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_CAPITAL_G_BEGIN_ANCHOR)) break;
3158: tok->type = TK_ANCHOR;
3159: tok->u.subtype = ANCHOR_BEGIN_POSITION;
3160: break;
3161:
3162: case '`':
3163: if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_GNU_BUF_ANCHOR)) break;
3164: goto begin_buf;
3165: break;
3166:
3167: case '\'':
3168: if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_GNU_BUF_ANCHOR)) break;
3169: goto end_buf;
3170: break;
3171:
3172: case 'x':
3173: if (PEND) break;
3174:
3175: prev = p;
3176: if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_BRACE_HEX8)) {
3177: PINC;
3178: num = scan_unsigned_hexadecimal_number(&p, end, 8, enc);
3179: if (num < 0) return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE;
3180: if (!PEND) {
3181: if (ONIGENC_IS_CODE_XDIGIT(enc, PPEEK))
3182: return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE;
3183: }
3184:
3185: if ((p > prev + enc_len(enc, prev)) && !PEND && PPEEK_IS('}')) {
3186: PINC;
3187: tok->type = TK_CODE_POINT;
3188: tok->u.code = (OnigCodePoint )num;
3189: }
3190: else {
3191: /* can't read nothing or invalid format */
3192: p = prev;
3193: }
3194: }
3195: else if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_HEX2)) {
3196: num = scan_unsigned_hexadecimal_number(&p, end, 2, enc);
3197: if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
3198: if (p == prev) { /* can't read nothing. */
3199: num = 0; /* but, it's not error */
3200: }
3201: tok->type = TK_RAW_BYTE;
3202: tok->base = 16;
3203: tok->u.c = num;
3204: }
3205: break;
3206:
3207: case 'u':
3208: if (PEND) break;
3209:
3210: prev = p;
3211: if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_U_HEX4)) {
3212: num = scan_unsigned_hexadecimal_number(&p, end, 4, enc);
3213: if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
3214: if (p == prev) { /* can't read nothing. */
3215: num = 0; /* but, it's not error */
3216: }
3217: tok->type = TK_CODE_POINT;
3218: tok->base = 16;
3219: tok->u.code = (OnigCodePoint )num;
3220: }
3221: break;
3222:
3223: case '1': case '2': case '3': case '4':
3224: case '5': case '6': case '7': case '8': case '9':
3225: PUNFETCH;
3226: prev = p;
3227: num = onig_scan_unsigned_number(&p, end, enc);
3228: if (num < 0 || num > ONIG_MAX_BACKREF_NUM) {
3229: goto skip_backref;
3230: }
3231:
3232: if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_DECIMAL_BACKREF) &&
3233: (num <= env->num_mem || num <= 9)) { /* This spec. from GNU regex */
3234: if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) {
3235: if (num > env->num_mem || IS_NULL(SCANENV_MEM_NODES(env)[num]))
3236: return ONIGERR_INVALID_BACKREF;
3237: }
3238:
3239: tok->type = TK_BACKREF;
3240: tok->u.backref.num = 1;
3241: tok->u.backref.ref1 = num;
3242: tok->u.backref.by_name = 0;
3243: #ifdef USE_BACKREF_AT_LEVEL
3244: tok->u.backref.exist_level = 0;
3245: #endif
3246: break;
3247: }
3248:
3249: skip_backref:
3250: if (c == '8' || c == '9') {
3251: /* normal char */
3252: p = prev; PINC;
3253: break;
3254: }
3255:
3256: p = prev;
3257: /* fall through */
3258: case '0':
3259: if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_OCTAL3)) {
3260: prev = p;
3261: num = scan_unsigned_octal_number(&p, end, (c == '0' ? 2:3), enc);
3262: if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
3263: if (p == prev) { /* can't read nothing. */
3264: num = 0; /* but, it's not error */
3265: }
3266: tok->type = TK_RAW_BYTE;
3267: tok->base = 8;
3268: tok->u.c = num;
3269: }
3270: else if (c != '0') {
3271: PINC;
3272: }
3273: break;
3274:
3275: #ifdef USE_NAMED_GROUP
3276: case 'k':
3277: if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_K_NAMED_BACKREF)) {
3278: PFETCH(c);
3279: if (c == '<') {
3280: UChar* name_end;
3281: int* backs;
3282:
3283: prev = p;
3284:
3285: #ifdef USE_BACKREF_AT_LEVEL
3286: name_end = NULL_UCHARP; /* no need. escape gcc warning. */
3287: r = fetch_name_with_level(&p, end, &name_end, env, &tok->u.backref.level);
3288: if (r == 1) tok->u.backref.exist_level = 1;
3289: else tok->u.backref.exist_level = 0;
3290: #else
3291: r = fetch_name(&p, end, &name_end, env, 1);
3292: #endif
3293: if (r < 0) return r;
3294:
3295: num = onig_name_to_group_numbers(env->reg, prev, name_end, &backs);
3296: if (num <= 0) {
3297: onig_scan_env_set_error_string(env,
3298: ONIGERR_UNDEFINED_NAME_REFERENCE, prev, name_end);
3299: return ONIGERR_UNDEFINED_NAME_REFERENCE;
3300: }
3301: if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) {
3302: int i;
3303: for (i = 0; i < num; i++) {
3304: if (backs[i] > env->num_mem ||
3305: IS_NULL(SCANENV_MEM_NODES(env)[backs[i]]))
3306: return ONIGERR_INVALID_BACKREF;
3307: }
3308: }
3309:
3310: tok->type = TK_BACKREF;
3311: tok->u.backref.by_name = 1;
3312: if (num == 1) {
3313: tok->u.backref.num = 1;
3314: tok->u.backref.ref1 = backs[0];
3315: }
3316: else {
3317: tok->u.backref.num = num;
3318: tok->u.backref.refs = backs;
3319: }
3320: }
3321: else
3322: PUNFETCH;
3323: }
3324: break;
3325: #endif
3326:
3327: #ifdef USE_SUBEXP_CALL
3328: case 'g':
3329: if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_G_SUBEXP_CALL)) {
3330: PFETCH(c);
3331: if (c == '<') {
3332: UChar* name_end;
3333:
3334: prev = p;
3335: r = fetch_name(&p, end, &name_end, env, 1);
3336: if (r < 0) return r;
3337:
3338: tok->type = TK_CALL;
3339: tok->u.call.name = prev;
3340: tok->u.call.name_end = name_end;
3341: }
3342: else
3343: PUNFETCH;
3344: }
3345: break;
3346: #endif
3347:
3348: case 'Q':
3349: if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_CAPITAL_Q_QUOTE)) {
3350: tok->type = TK_QUOTE_OPEN;
3351: }
3352: break;
3353:
3354: case 'p':
3355: case 'P':
3356: if (PPEEK_IS('{') &&
3357: IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY)) {
3358: PINC;
3359: tok->type = TK_CHAR_PROPERTY;
3360: tok->u.prop.not = (c == 'P' ? 1 : 0);
3361:
3362: if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT)) {
3363: PFETCH(c);
3364: if (c == '^') {
3365: tok->u.prop.not = (tok->u.prop.not == 0 ? 1 : 0);
3366: }
3367: else
3368: PUNFETCH;
3369: }
3370: }
3371: break;
3372:
3373: default:
3374: PUNFETCH;
3375: num = fetch_escaped_value(&p, end, env);
3376: if (num < 0) return num;
3377: /* set_raw: */
3378: if (tok->u.c != num) {
3379: tok->type = TK_CODE_POINT;
3380: tok->u.code = (OnigCodePoint )num;
3381: }
3382: else { /* string */
3383: p = tok->backp + enc_len(enc, tok->backp);
3384: }
3385: break;
3386: }
3387: }
3388: else {
3389: tok->u.c = c;
3390: tok->escaped = 0;
3391:
3392: #ifdef USE_VARIABLE_META_CHARS
3393: if ((c != ONIG_INEFFECTIVE_META_CHAR) &&
3394: IS_SYNTAX_OP(syn, ONIG_SYN_OP_VARIABLE_META_CHARACTERS)) {
3395: if (c == MC_ANYCHAR(enc))
3396: goto any_char;
3397: else if (c == MC_ANYTIME(enc))
3398: goto anytime;
3399: else if (c == MC_ZERO_OR_ONE_TIME(enc))
3400: goto zero_or_one_time;
3401: else if (c == MC_ONE_OR_MORE_TIME(enc))
3402: goto one_or_more_time;
3403: else if (c == MC_ANYCHAR_ANYTIME(enc)) {
3404: tok->type = TK_ANYCHAR_ANYTIME;
3405: goto out;
3406: }
3407: }
3408: #endif
3409:
3410: switch (c) {
3411: case '.':
3412: if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_DOT_ANYCHAR)) break;
3413: #ifdef USE_VARIABLE_META_CHARS
3414: any_char:
3415: #endif
3416: tok->type = TK_ANYCHAR;
3417: break;
3418:
3419: case '*':
3420: if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ASTERISK_ZERO_INF)) break;
3421: #ifdef USE_VARIABLE_META_CHARS
3422: anytime:
3423: #endif
3424: tok->type = TK_OP_REPEAT;
3425: tok->u.repeat.lower = 0;
3426: tok->u.repeat.upper = REPEAT_INFINITE;
3427: goto greedy_check;
3428: break;
3429:
3430: case '+':
3431: if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_PLUS_ONE_INF)) break;
3432: #ifdef USE_VARIABLE_META_CHARS
3433: one_or_more_time:
3434: #endif
3435: tok->type = TK_OP_REPEAT;
3436: tok->u.repeat.lower = 1;
3437: tok->u.repeat.upper = REPEAT_INFINITE;
3438: goto greedy_check;
3439: break;
3440:
3441: case '?':
3442: if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_QMARK_ZERO_ONE)) break;
3443: #ifdef USE_VARIABLE_META_CHARS
3444: zero_or_one_time:
3445: #endif
3446: tok->type = TK_OP_REPEAT;
3447: tok->u.repeat.lower = 0;
3448: tok->u.repeat.upper = 1;
3449: goto greedy_check;
3450: break;
3451:
3452: case '{':
3453: if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_BRACE_INTERVAL)) break;
3454: r = fetch_range_quantifier(&p, end, tok, env);
3455: if (r < 0) return r; /* error */
3456: if (r == 0) goto greedy_check;
3457: else if (r == 2) { /* {n} */
3458: if (IS_SYNTAX_BV(syn, ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY))
3459: goto possessive_check;
3460:
3461: goto greedy_check;
3462: }
3463: /* r == 1 : normal char */
3464: break;
3465:
3466: case '|':
3467: if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_VBAR_ALT)) break;
3468: tok->type = TK_ALT;
3469: break;
3470:
3471: case '(':
3472: if (PPEEK_IS('?') &&
3473: IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_QMARK_GROUP_EFFECT)) {
3474: PINC;
3475: if (PPEEK_IS('#')) {
3476: PFETCH(c);
3477: while (1) {
3478: if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
3479: PFETCH(c);
3480: if (c == MC_ESC(enc)) {
3481: if (!PEND) PFETCH(c);
3482: }
3483: else {
3484: if (c == ')') break;
3485: }
3486: }
3487: goto start;
3488: }
3489: PUNFETCH;
3490: }
3491:
3492: if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LPAREN_SUBEXP)) break;
3493: tok->type = TK_SUBEXP_OPEN;
3494: break;
3495:
3496: case ')':
3497: if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LPAREN_SUBEXP)) break;
3498: tok->type = TK_SUBEXP_CLOSE;
3499: break;
3500:
3501: case '^':
3502: if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LINE_ANCHOR)) break;
3503: tok->type = TK_ANCHOR;
3504: tok->u.subtype = (IS_SINGLELINE(env->option)
3505: ? ANCHOR_BEGIN_BUF : ANCHOR_BEGIN_LINE);
3506: break;
3507:
3508: case '$':
3509: if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LINE_ANCHOR)) break;
3510: tok->type = TK_ANCHOR;
3511: tok->u.subtype = (IS_SINGLELINE(env->option)
3512: ? ANCHOR_SEMI_END_BUF : ANCHOR_END_LINE);
3513: break;
3514:
3515: case '[':
3516: if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_BRACKET_CC)) break;
3517: tok->type = TK_CC_OPEN;
3518: break;
3519:
3520: case ']':
3521: if (*src > env->pattern) /* /].../ is allowed. */
3522: CCEND_ESC_WARN(env, (UChar* )"]");
3523: break;
3524:
3525: case '#':
3526: if (IS_EXTEND(env->option)) {
3527: while (!PEND) {
3528: PFETCH(c);
3529: if (ONIGENC_IS_CODE_NEWLINE(enc, c))
3530: break;
3531: }
3532: goto start;
3533: break;
3534: }
3535: break;
3536:
3537: case ' ': case '\t': case '\n': case '\r': case '\f':
3538: if (IS_EXTEND(env->option))
3539: goto start;
3540: break;
3541:
3542: default:
3543: /* string */
3544: break;
3545: }
3546: }
3547:
3548: #ifdef USE_VARIABLE_META_CHARS
3549: out:
3550: #endif
3551: *src = p;
3552: return tok->type;
3553: }
3554:
3555: static int
3556: add_ctype_to_cc_by_range(CClassNode* cc, int ctype, int not, OnigEncoding enc,
3557: const OnigCodePoint sbr[], const OnigCodePoint mbr[])
3558: {
3559: int i, r;
3560: OnigCodePoint j;
3561:
3562: int nsb = ONIGENC_CODE_RANGE_NUM(sbr);
3563: int nmb = ONIGENC_CODE_RANGE_NUM(mbr);
3564:
3565: if (not == 0) {
3566: for (i = 0; i < nsb; i++) {
3567: for (j = ONIGENC_CODE_RANGE_FROM(sbr, i);
3568: j <= ONIGENC_CODE_RANGE_TO(sbr, i); j++) {
3569: BITSET_SET_BIT(cc->bs, j);
3570: }
3571: }
3572:
3573: for (i = 0; i < nmb; i++) {
3574: r = add_code_range_to_buf(&(cc->mbuf),
3575: ONIGENC_CODE_RANGE_FROM(mbr, i),
3576: ONIGENC_CODE_RANGE_TO(mbr, i));
3577: if (r != 0) return r;
3578: }
3579: }
3580: else {
3581: OnigCodePoint prev = 0;
3582:
3583: if (ONIGENC_MBC_MINLEN(enc) == 1) {
3584: for (i = 0; i < nsb; i++) {
3585: for (j = prev;
3586: j < ONIGENC_CODE_RANGE_FROM(sbr, i); j++) {
3587: BITSET_SET_BIT(cc->bs, j);
3588: }
3589: prev = ONIGENC_CODE_RANGE_TO(sbr, i) + 1;
3590: }
3591: if (prev < 0x7f) {
3592: for (j = prev; j < 0x7f; j++) {
3593: BITSET_SET_BIT(cc->bs, j);
3594: }
3595: }
3596:
3597: prev = 0x80;
3598: }
3599:
3600: for (i = 0; i < nmb; i++) {
3601: if (prev < ONIGENC_CODE_RANGE_FROM(mbr, i)) {
3602: r = add_code_range_to_buf(&(cc->mbuf), prev,
3603: ONIGENC_CODE_RANGE_FROM(mbr, i) - 1);
3604: if (r != 0) return r;
3605: }
3606: prev = ONIGENC_CODE_RANGE_TO(mbr, i) + 1;
3607: }
3608: if (prev < 0x7fffffff) {
3609: r = add_code_range_to_buf(&(cc->mbuf), prev, 0x7fffffff);
3610: if (r != 0) return r;
3611: }
3612: }
3613:
3614: return 0;
3615: }
3616:
3617: static int
3618: add_ctype_to_cc(CClassNode* cc, int ctype, int not, ScanEnv* env)
3619: {
3620: int c, r;
3621: const OnigCodePoint *sbr, *mbr;
3622: OnigEncoding enc = env->enc;
3623:
3624: r = ONIGENC_GET_CTYPE_CODE_RANGE(enc, ctype, &sbr, &mbr);
3625: if (r == 0) {
3626: return add_ctype_to_cc_by_range(cc, ctype, not, env->enc, sbr, mbr);
3627: }
3628: else if (r != ONIG_NO_SUPPORT_CONFIG) {
3629: return r;
3630: }
3631:
3632: r = 0;
3633: switch (ctype) {
3634: case ONIGENC_CTYPE_ALPHA:
3635: case ONIGENC_CTYPE_BLANK:
3636: case ONIGENC_CTYPE_CNTRL:
3637: case ONIGENC_CTYPE_DIGIT:
3638: case ONIGENC_CTYPE_LOWER:
3639: case ONIGENC_CTYPE_PUNCT:
3640: case ONIGENC_CTYPE_SPACE:
3641: case ONIGENC_CTYPE_UPPER:
3642: case ONIGENC_CTYPE_XDIGIT:
3643: case ONIGENC_CTYPE_ASCII:
3644: case ONIGENC_CTYPE_ALNUM:
3645: if (not != 0) {
3646: for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
3647: if (! ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype))
3648: BITSET_SET_BIT(cc->bs, c);
3649: }
3650: ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf);
3651: }
3652: else {
3653: for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
3654: if (ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype))
3655: BITSET_SET_BIT(cc->bs, c);
3656: }
3657: }
3658: break;
3659:
3660: case ONIGENC_CTYPE_GRAPH:
3661: case ONIGENC_CTYPE_PRINT:
3662: if (not != 0) {
3663: for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
3664: if (! ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype))
3665: BITSET_SET_BIT(cc->bs, c);
3666: }
3667: }
3668: else {
3669: for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
3670: if (ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype))
3671: BITSET_SET_BIT(cc->bs, c);
3672: }
3673: ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf);
3674: }
3675: break;
3676:
3677: case ONIGENC_CTYPE_WORD:
3678: if (not == 0) {
3679: for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
3680: if (ONIGENC_IS_CODE_SB_WORD(enc, c)) BITSET_SET_BIT(cc->bs, c);
3681: }
3682: ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf);
3683: }
3684: else {
3685: for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
3686: if ((ONIGENC_CODE_TO_MBCLEN(enc, c) > 0) /* 0: invalid code point */
3687: && ! ONIGENC_IS_CODE_WORD(enc, c))
3688: BITSET_SET_BIT(cc->bs, c);
3689: }
3690: }
3691: break;
3692:
3693: default:
3694: return ONIGERR_PARSER_BUG;
3695: break;
3696: }
3697:
3698: return r;
3699: }
3700:
3701: static int
3702: parse_ctype_to_enc_ctype(int pctype, int* not)
3703: {
3704: int ctype;
3705:
3706: switch (pctype) {
3707: case CTYPE_WORD:
3708: ctype = ONIGENC_CTYPE_WORD;
3709: *not = 0;
3710: break;
3711: case CTYPE_NOT_WORD:
3712: ctype = ONIGENC_CTYPE_WORD;
3713: *not = 1;
3714: break;
3715: case CTYPE_WHITE_SPACE:
3716: ctype = ONIGENC_CTYPE_SPACE;
3717: *not = 0;
3718: break;
3719: case CTYPE_NOT_WHITE_SPACE:
3720: ctype = ONIGENC_CTYPE_SPACE;
3721: *not = 1;
3722: break;
3723: case CTYPE_DIGIT:
3724: ctype = ONIGENC_CTYPE_DIGIT;
3725: *not = 0;
3726: break;
3727: case CTYPE_NOT_DIGIT:
3728: ctype = ONIGENC_CTYPE_DIGIT;
3729: *not = 1;
3730: break;
3731: case CTYPE_XDIGIT:
3732: ctype = ONIGENC_CTYPE_XDIGIT;
3733: *not = 0;
3734: break;
3735: case CTYPE_NOT_XDIGIT:
3736: ctype = ONIGENC_CTYPE_XDIGIT;
3737: *not = 1;
3738: break;
3739: default:
3740: return ONIGERR_PARSER_BUG;
3741: break;
3742: }
3743: return ctype;
3744: }
3745:
3746: typedef struct {
3747: UChar *name;
3748: int ctype;
3749: short int len;
3750: } PosixBracketEntryType;
3751:
3752: static int
3753: parse_posix_bracket(CClassNode* cc, UChar** src, UChar* end, ScanEnv* env)
3754: {
3755: #define POSIX_BRACKET_CHECK_LIMIT_LENGTH 20
3756: #define POSIX_BRACKET_NAME_MAX_LEN 6
3757:
3758: static PosixBracketEntryType PBS[] = {
3759: { (UChar* )"alnum", ONIGENC_CTYPE_ALNUM, 5 },
3760: { (UChar* )"alpha", ONIGENC_CTYPE_ALPHA, 5 },
3761: { (UChar* )"blank", ONIGENC_CTYPE_BLANK, 5 },
3762: { (UChar* )"cntrl", ONIGENC_CTYPE_CNTRL, 5 },
3763: { (UChar* )"digit", ONIGENC_CTYPE_DIGIT, 5 },
3764: { (UChar* )"graph", ONIGENC_CTYPE_GRAPH, 5 },
3765: { (UChar* )"lower", ONIGENC_CTYPE_LOWER, 5 },
3766: { (UChar* )"print", ONIGENC_CTYPE_PRINT, 5 },
3767: { (UChar* )"punct", ONIGENC_CTYPE_PUNCT, 5 },
3768: { (UChar* )"space", ONIGENC_CTYPE_SPACE, 5 },
3769: { (UChar* )"upper", ONIGENC_CTYPE_UPPER, 5 },
3770: { (UChar* )"xdigit", ONIGENC_CTYPE_XDIGIT, 6 },
3771: { (UChar* )"ascii", ONIGENC_CTYPE_ASCII, 5 },
3772: { (UChar* )NULL, -1, 0 }
3773: };
3774:
3775: PosixBracketEntryType *pb;
3776: int not, i, r;
3777: OnigCodePoint c;
3778: OnigEncoding enc = env->enc;
3779: UChar *p = *src;
3780: PFETCH_READY;
3781:
3782: if (PPEEK_IS('^')) {
3783: PINC;
3784: not = 1;
3785: }
3786: else
3787: not = 0;
3788:
3789: if (onigenc_strlen(enc, p, end) < POSIX_BRACKET_NAME_MAX_LEN + 2)
3790: goto not_posix_bracket;
3791:
3792: for (pb = PBS; IS_NOT_NULL(pb->name); pb++) {
3793: if (onigenc_with_ascii_strncmp(enc, p, end, pb->name, pb->len) == 0) {
3794: p = (UChar* )onigenc_step(enc, p, end, pb->len);
3795: if (onigenc_with_ascii_strncmp(enc, p, end, (UChar* )":]", 2) != 0)
3796: return ONIGERR_INVALID_POSIX_BRACKET_TYPE;
3797:
3798: r = add_ctype_to_cc(cc, pb->ctype, not, env);
3799: if (r != 0) return r;
3800:
3801: PINC; PINC;
3802: *src = p;
3803: return 0;
3804: }
3805: }
3806:
3807: not_posix_bracket:
3808: c = 0;
3809: i = 0;
3810: while (!PEND && ((c = PPEEK) != ':') && c != ']') {
3811: PINC;
3812: if (++i > POSIX_BRACKET_CHECK_LIMIT_LENGTH) break;
3813: }
3814: if (c == ':' && ! PEND) {
3815: PINC;
3816: if (! PEND) {
3817: PFETCH(c);
3818: if (c == ']')
3819: return ONIGERR_INVALID_POSIX_BRACKET_TYPE;
3820: }
3821: }
3822:
3823: return 1; /* 1: is not POSIX bracket, but no error. */
3824: }
3825:
3826: static int
3827: property_name_to_ctype(UChar* p, UChar* end, OnigEncoding enc)
3828: {
3829: static PosixBracketEntryType PBS[] = {
3830: { (UChar* )"Alnum", ONIGENC_CTYPE_ALNUM, 5 },
3831: { (UChar* )"Alpha", ONIGENC_CTYPE_ALPHA, 5 },
3832: { (UChar* )"Blank", ONIGENC_CTYPE_BLANK, 5 },
3833: { (UChar* )"Cntrl", ONIGENC_CTYPE_CNTRL, 5 },
3834: { (UChar* )"Digit", ONIGENC_CTYPE_DIGIT, 5 },
3835: { (UChar* )"Graph", ONIGENC_CTYPE_GRAPH, 5 },
3836: { (UChar* )"Lower", ONIGENC_CTYPE_LOWER, 5 },
3837: { (UChar* )"Print", ONIGENC_CTYPE_PRINT, 5 },
3838: { (UChar* )"Punct", ONIGENC_CTYPE_PUNCT, 5 },
3839: { (UChar* )"Space", ONIGENC_CTYPE_SPACE, 5 },
3840: { (UChar* )"Upper", ONIGENC_CTYPE_UPPER, 5 },
3841: { (UChar* )"XDigit", ONIGENC_CTYPE_XDIGIT, 6 },
3842: { (UChar* )"ASCII", ONIGENC_CTYPE_ASCII, 5 },
3843: { (UChar* )NULL, -1, 0 }
3844: };
3845:
3846: PosixBracketEntryType *pb;
3847: int len;
3848:
3849: len = onigenc_strlen(enc, p, end);
3850: for (pb = PBS; IS_NOT_NULL(pb->name); pb++) {
3851: if (len == pb->len &&
3852: onigenc_with_ascii_strncmp(enc, p, end, pb->name, pb->len) == 0)
3853: return pb->ctype;
3854: }
3855:
3856: return -1;
3857: }
3858:
3859: static int
3860: fetch_char_property_to_ctype(UChar** src, UChar* end, ScanEnv* env)
3861: {
3862: int ctype;
3863: OnigCodePoint c;
3864: OnigEncoding enc = env->enc;
3865: UChar *prev, *start, *p = *src;
3866: PFETCH_READY;
3867:
3868: /* 'IsXXXX' => 'XXXX' */
3869: if (!PEND &&
3870: IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_CHAR_PROPERTY_PREFIX_IS)) {
3871: c = PPEEK;
3872: if (c == 'I') {
3873: PINC;
3874: if (! PEND) {
3875: c = PPEEK;
3876: if (c == 's')
3877: PINC;
3878: else
3879: PUNFETCH;
3880: }
3881: }
3882: }
3883:
3884: start = prev = p;
3885:
3886: while (!PEND) {
3887: prev = p;
3888: PFETCH(c);
3889: if (c == '}') {
3890: ctype = property_name_to_ctype(start, prev, enc);
3891: if (ctype < 0) break;
3892:
3893: *src = p;
3894: return ctype;
3895: }
3896: else if (c == '(' || c == ')' || c == '{' || c == '|')
3897: break;
3898: }
3899:
3900: onig_scan_env_set_error_string(env, ONIGERR_INVALID_CHAR_PROPERTY_NAME,
3901: *src, prev);
3902: return ONIGERR_INVALID_CHAR_PROPERTY_NAME;
3903: }
3904:
3905: static int
3906: parse_char_property(Node** np, OnigToken* tok, UChar** src, UChar* end,
3907: ScanEnv* env)
3908: {
3909: int r, ctype;
3910: CClassNode* cc;
3911:
3912: ctype = fetch_char_property_to_ctype(src, end, env);
3913: if (ctype < 0) return ctype;
3914:
3915: *np = node_new_cclass();
3916: CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY);
3917: cc = &(NCCLASS(*np));
3918: r = add_ctype_to_cc(cc, ctype, 0, env);
3919: if (r != 0) return r;
3920: if (tok->u.prop.not != 0) CCLASS_SET_NOT(cc);
3921:
3922: return 0;
3923: }
3924:
3925:
/* State of the character-class parser (see next_state_val(),
 * next_state_class() and parse_char_class()). */
3926: enum CCSTATE {
3927: CCS_VALUE, /* a value or class has been read; a plain value is still pending */
3928: CCS_RANGE, /* a range operator followed a value; the range end is expected */
3929: CCS_COMPLETE, /* a range has just been completed */
3930: CCS_START /* nothing read yet (initial state, or right after "&&") */
3931: };
3932:
/* Kind of the most recently read class element. */
3933: enum CCVALTYPE {
3934: CCV_SB, /* value recorded in the class bitset (cc->bs) */
3935: CCV_CODE_POINT, /* value recorded through add_code_range() into cc->mbuf */
3936: CCV_CLASS /* a ctype class; there is no single pending value */
3937: };
3938:
3939: static int
3940: next_state_class(CClassNode* cc, OnigCodePoint* vs, enum CCVALTYPE* type,
3941: enum CCSTATE* state, ScanEnv* env)
3942: {
3943: int r;
3944:
3945: if (*state == CCS_RANGE)
3946: return ONIGERR_CHAR_CLASS_VALUE_AT_END_OF_RANGE;
3947:
3948: if (*state == CCS_VALUE && *type != CCV_CLASS) {
3949: if (*type == CCV_SB)
3950: BITSET_SET_BIT(cc->bs, (int )(*vs));
3951: else if (*type == CCV_CODE_POINT) {
3952: r = add_code_range(&(cc->mbuf), env, *vs, *vs);
3953: if (r < 0) return r;
3954: }
3955: }
3956:
3957: *state = CCS_VALUE;
3958: *type = CCV_CLASS;
3959: return 0;
3960: }
3961:
3962: static int
3963: next_state_val(CClassNode* cc, OnigCodePoint *vs, OnigCodePoint v,
3964: int* vs_israw, int v_israw,
3965: enum CCVALTYPE intype, enum CCVALTYPE* type,
3966: enum CCSTATE* state, ScanEnv* env)
3967: {
3968: int r;
3969:
3970: switch (*state) {
3971: case CCS_VALUE:
3972: if (*type == CCV_SB)
3973: BITSET_SET_BIT(cc->bs, (int )(*vs));
3974: else if (*type == CCV_CODE_POINT) {
3975: r = add_code_range(&(cc->mbuf), env, *vs, *vs);
3976: if (r < 0) return r;
3977: }
3978: break;
3979:
3980: case CCS_RANGE:
3981: if (intype == *type) {
3982: if (intype == CCV_SB) {
3983: if (*vs > 0xff || v > 0xff)
3984: return ONIGERR_INVALID_WIDE_CHAR_VALUE;
3985:
3986: if (*vs > v) {
3987: if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC))
3988: goto ccs_range_end;
3989: else
3990: return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS;
3991: }
3992: bitset_set_range(cc->bs, (int )*vs, (int )v);
3993: }
3994: else {
3995: r = add_code_range(&(cc->mbuf), env, *vs, v);
3996: if (r < 0) return r;
3997: }
3998: }
3999: else {
4000: #if 0
4001: if (intype == CCV_CODE_POINT && *type == CCV_SB) {
4002: #endif
4003: if (*vs > v) {
4004: if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC))
4005: goto ccs_range_end;
4006: else
4007: return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS;
4008: }
4009: bitset_set_range(cc->bs, (int )*vs, (int )(v < 0xff ? v : 0xff));
4010: r = add_code_range(&(cc->mbuf), env, (OnigCodePoint )*vs, v);
4011: if (r < 0) return r;
4012: #if 0
4013: }
4014: else
4015: return ONIGERR_MISMATCH_CODE_LENGTH_IN_CLASS_RANGE;
4016: #endif
4017: }
4018: ccs_range_end:
4019: *state = CCS_COMPLETE;
4020: break;
4021:
4022: case CCS_COMPLETE:
4023: case CCS_START:
4024: *state = CCS_VALUE;
4025: break;
4026:
4027: default:
4028: break;
4029: }
4030:
4031: *vs_israw = v_israw;
4032: *vs = v;
4033: *type = intype;
4034: return 0;
4035: }
4036:
4037: static int
4038: code_exist_check(OnigCodePoint c, UChar* from, UChar* end, int ignore_escaped,
4039: OnigEncoding enc)
4040: {
4041: int in_esc;
4042: OnigCodePoint code;
4043: UChar* p = from;
4044: PFETCH_READY;
4045:
4046: in_esc = 0;
4047: while (! PEND) {
4048: if (ignore_escaped && in_esc) {
4049: in_esc = 0;
4050: }
4051: else {
4052: PFETCH(code);
4053: if (code == c) return 1;
4054: if (code == MC_ESC(enc)) in_esc = 1;
4055: }
4056: }
4057: return 0;
4058: }
4059:
4060: static int
4061: parse_char_class(Node** np, OnigToken* tok, UChar** src, UChar* end,
4062: ScanEnv* env)
4063: {
4064: int r, neg, len, fetched, and_start;
4065: OnigCodePoint v, vs;
4066: UChar *p;
4067: Node* node;
4068: CClassNode *cc, *prev_cc;
4069: CClassNode work_cc;
4070:
4071: enum CCSTATE state;
4072: enum CCVALTYPE val_type, in_type;
4073: int val_israw, in_israw;
4074:
4075: prev_cc = (CClassNode* )NULL;
4076: *np = NULL_NODE;
4077: r = fetch_token_in_cc(tok, src, end, env);
4078: if (r == TK_CHAR && tok->u.c == '^' && tok->escaped == 0) {
4079: neg = 1;
4080: r = fetch_token_in_cc(tok, src, end, env);
4081: }
4082: else {
4083: neg = 0;
4084: }
4085:
4086: if (r < 0) return r;
4087: if (r == TK_CC_CLOSE) {
4088: if (! code_exist_check((OnigCodePoint )']',
4089: *src, env->pattern_end, 1, env->enc))
4090: return ONIGERR_EMPTY_CHAR_CLASS;
4091:
4092: CC_ESC_WARN(env, (UChar* )"]");
4093: r = tok->type = TK_CHAR; /* allow []...] */
4094: }
4095:
4096: *np = node = node_new_cclass();
4097: CHECK_NULL_RETURN_VAL(node, ONIGERR_MEMORY);
4098: cc = &(NCCLASS(node));
4099:
4100: and_start = 0;
4101: state = CCS_START;
4102: p = *src;
4103: while (r != TK_CC_CLOSE) {
4104: fetched = 0;
4105: switch (r) {
4106: case TK_CHAR:
4107: len = ONIGENC_CODE_TO_MBCLEN(env->enc, tok->u.c);
4108: if (len > 1) {
4109: in_type = CCV_CODE_POINT;
4110: }
4111: else {
4112: sb_char:
4113: in_type = CCV_SB;
4114: }
4115: v = (OnigCodePoint )tok->u.c;
4116: in_israw = 0;
4117: goto val_entry2;
4118: break;
4119:
4120: case TK_RAW_BYTE:
4121: /* tok->base != 0 : octal or hexadec. */
4122: if (! ONIGENC_IS_SINGLEBYTE(env->enc) && tok->base != 0) {
4123: UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
4124: UChar* bufe = buf + ONIGENC_CODE_TO_MBC_MAXLEN;
4125: UChar* psave = p;
4126: int i, base = tok->base;
4127:
4128: buf[0] = tok->u.c;
4129: for (i = 1; i < ONIGENC_MBC_MAXLEN(env->enc); i++) {
4130: r = fetch_token_in_cc(tok, &p, end, env);
4131: if (r < 0) goto err;
4132: if (r != TK_RAW_BYTE || tok->base != base) {
4133: fetched = 1;
4134: break;
4135: }
4136: buf[i] = tok->u.c;
4137: }
4138:
4139: if (i < ONIGENC_MBC_MINLEN(env->enc)) {
4140: r = ONIGERR_TOO_SHORT_MULTI_BYTE_STRING;
4141: goto err;
4142: }
4143:
4144: len = enc_len(env->enc, buf);
4145: if (i < len) {
4146: r = ONIGERR_TOO_SHORT_MULTI_BYTE_STRING;
4147: goto err;
4148: }
4149: else if (i > len) { /* fetch back */
4150: p = psave;
4151: for (i = 1; i < len; i++) {
4152: r = fetch_token_in_cc(tok, &p, end, env);
4153: }
4154: fetched = 0;
4155: }
4156:
4157: if (i == 1) {
4158: v = (OnigCodePoint )buf[0];
4159: goto raw_single;
4160: }
4161: else {
4162: v = ONIGENC_MBC_TO_CODE(env->enc, buf, bufe);
4163: in_type = CCV_CODE_POINT;
4164: }
4165: }
4166: else {
4167: v = (OnigCodePoint )tok->u.c;
4168: raw_single:
4169: in_type = CCV_SB;
4170: }
4171: in_israw = 1;
4172: goto val_entry2;
4173: break;
4174:
4175: case TK_CODE_POINT:
4176: v = tok->u.code;
4177: in_israw = 1;
4178: val_entry:
4179: len = ONIGENC_CODE_TO_MBCLEN(env->enc, v);
4180: if (len < 0) {
4181: r = len;
4182: goto err;
4183: }
4184: in_type = (len == 1 ? CCV_SB : CCV_CODE_POINT);
4185: val_entry2:
4186: r = next_state_val(cc, &vs, v, &val_israw, in_israw, in_type, &val_type,
4187: &state, env);
4188: if (r != 0) goto err;
4189: break;
4190:
4191: case TK_POSIX_BRACKET_OPEN:
4192: r = parse_posix_bracket(cc, &p, end, env);
4193: if (r < 0) goto err;
4194: if (r == 1) { /* is not POSIX bracket */
4195: CC_ESC_WARN(env, (UChar* )"[");
4196: p = tok->backp;
4197: v = (OnigCodePoint )tok->u.c;
4198: in_israw = 0;
4199: goto val_entry;
4200: }
4201: goto next_class;
4202: break;
4203:
4204: case TK_CHAR_TYPE:
4205: {
4206: int ctype, not;
4207: ctype = parse_ctype_to_enc_ctype(tok->u.subtype, ¬);
4208: r = add_ctype_to_cc(cc, ctype, not, env);
4209: if (r != 0) return r;
4210: }
4211:
4212: next_class:
4213: r = next_state_class(cc, &vs, &val_type, &state, env);
4214: if (r != 0) goto err;
4215: break;
4216:
4217: case TK_CHAR_PROPERTY:
4218: {
4219: int ctype;
4220:
4221: ctype = fetch_char_property_to_ctype(&p, end, env);
4222: if (ctype < 0) return ctype;
4223: r = add_ctype_to_cc(cc, ctype, tok->u.prop.not, env);
4224: if (r != 0) return r;
4225: goto next_class;
4226: }
4227: break;
4228:
4229: case TK_CC_RANGE:
4230: if (state == CCS_VALUE) {
4231: r = fetch_token_in_cc(tok, &p, end, env);
4232: if (r < 0) goto err;
4233: fetched = 1;
4234: if (r == TK_CC_CLOSE) { /* allow [x-] */
4235: range_end_val:
4236: v = (OnigCodePoint )'-';
4237: in_israw = 0;
4238: goto val_entry;
4239: }
4240: else if (r == TK_CC_AND) {
4241: CC_ESC_WARN(env, (UChar* )"-");
4242: goto range_end_val;
4243: }
4244: state = CCS_RANGE;
4245: }
4246: else if (state == CCS_START) {
4247: /* [-xa] is allowed */
4248: v = (OnigCodePoint )tok->u.c;
4249: in_israw = 0;
4250:
4251: r = fetch_token_in_cc(tok, &p, end, env);
4252: if (r < 0) goto err;
4253: fetched = 1;
4254: /* [--x] or [a&&-x] is warned. */
4255: if (r == TK_CC_RANGE || and_start != 0)
4256: CC_ESC_WARN(env, (UChar* )"-");
4257:
4258: goto val_entry;
4259: }
4260: else if (state == CCS_RANGE) {
4261: CC_ESC_WARN(env, (UChar* )"-");
4262: goto sb_char; /* [!--x] is allowed */
4263: }
4264: else { /* CCS_COMPLETE */
4265: r = fetch_token_in_cc(tok, &p, end, env);
4266: if (r < 0) goto err;
4267: fetched = 1;
4268: if (r == TK_CC_CLOSE) goto range_end_val; /* allow [a-b-] */
4269: else if (r == TK_CC_AND) {
4270: CC_ESC_WARN(env, (UChar* )"-");
4271: goto range_end_val;
4272: }
4273:
4274: if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_DOUBLE_RANGE_OP_IN_CC)) {
4275: CC_ESC_WARN(env, (UChar* )"-");
4276: goto sb_char; /* [0-9-a] is allowed as [0-9\-a] */
4277: }
4278: r = ONIGERR_UNMATCHED_RANGE_SPECIFIER_IN_CHAR_CLASS;
4279: goto err;
4280: }
4281: break;
4282:
4283: case TK_CC_CC_OPEN: /* [ */
4284: {
4285: Node *anode;
4286: CClassNode* acc;
4287:
4288: r = parse_char_class(&anode, tok, &p, end, env);
4289: if (r != 0) goto cc_open_err;
4290: acc = &(NCCLASS(anode));
4291: r = or_cclass(cc, acc, env->enc);
4292:
4293: onig_node_free(anode);
4294: cc_open_err:
4295: if (r != 0) goto err;
4296: }
4297: break;
4298:
4299: case TK_CC_AND: /* && */
4300: {
4301: if (state == CCS_VALUE) {
4302: r = next_state_val(cc, &vs, 0, &val_israw, 0, val_type,
4303: &val_type, &state, env);
4304: if (r != 0) goto err;
4305: }
4306: /* initialize local variables */
4307: and_start = 1;
4308: state = CCS_START;
4309:
4310: if (IS_NOT_NULL(prev_cc)) {
4311: r = and_cclass(prev_cc, cc, env->enc);
4312: if (r != 0) goto err;
4313: bbuf_free(cc->mbuf);
4314: }
4315: else {
4316: prev_cc = cc;
4317: cc = &work_cc;
4318: }
4319: initialize_cclass(cc);
4320: }
4321: break;
4322:
4323: case TK_EOT:
4324: r = ONIGERR_PREMATURE_END_OF_CHAR_CLASS;
4325: goto err;
4326: break;
4327: default:
4328: r = ONIGERR_PARSER_BUG;
4329: goto err;
4330: break;
4331: }
4332:
4333: if (fetched)
4334: r = tok->type;
4335: else {
4336: r = fetch_token_in_cc(tok, &p, end, env);
4337: if (r < 0) goto err;
4338: }
4339: }
4340:
4341: if (state == CCS_VALUE) {
4342: r = next_state_val(cc, &vs, 0, &val_israw, 0, val_type,
4343: &val_type, &state, env);
4344: if (r != 0) goto err;
4345: }
4346:
4347: if (IS_NOT_NULL(prev_cc)) {
4348: r = and_cclass(prev_cc, cc, env->enc);
4349: if (r != 0) goto err;
4350: bbuf_free(cc->mbuf);
4351: cc = prev_cc;
4352: }
4353:
4354: if (neg != 0)
4355: CCLASS_SET_NOT(cc);
4356: else
4357: CCLASS_CLEAR_NOT(cc);
4358: if (IS_CCLASS_NOT(cc) &&
4359: IS_SYNTAX_BV(env->syntax, ONIG_SYN_NOT_NEWLINE_IN_NEGATIVE_CC)) {
4360: int is_empty;
4361:
4362: is_empty = (IS_NULL(cc->mbuf) ? 1 : 0);
4363: if (is_empty != 0)
4364: BITSET_IS_EMPTY(cc->bs, is_empty);
4365:
4366: if (is_empty == 0) {
4367: #define NEWLINE_CODE 0x0a
4368:
4369: if (ONIGENC_IS_CODE_NEWLINE(env->enc, NEWLINE_CODE)) {
4370: if (ONIGENC_CODE_TO_MBCLEN(env->enc, NEWLINE_CODE) == 1)
4371: BITSET_SET_BIT(cc->bs, NEWLINE_CODE);
4372: else
4373: add_code_range(&(cc->mbuf), env, NEWLINE_CODE, NEWLINE_CODE);
4374: }
4375: }
4376: }
4377: *src = p;
4378: return 0;
4379:
4380: err:
4381: if (cc != &(NCCLASS(*np)))
4382: bbuf_free(cc->mbuf);
4383: onig_node_free(*np);
4384: return r;
4385: }
4386:
4387: static int parse_subexp(Node** top, OnigToken* tok, int term,
4388: UChar** src, UChar* end, ScanEnv* env);
4389:
4390: static int
4391: parse_effect(Node** np, OnigToken* tok, int term, UChar** src, UChar* end,
4392: ScanEnv* env)
4393: {
4394: int r, num;
4395: int list_capture;
4396: Node *target;
4397: OnigOptionType option;
4398: OnigEncoding enc = env->enc;
4399: OnigCodePoint c;
4400: UChar* p = *src;
4401: PFETCH_READY;
4402:
4403: *np = NULL;
4404: if (PEND) return ONIGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS;
4405:
4406: option = env->option;
4407: if (PPEEK_IS('?') &&
4408: IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_GROUP_EFFECT)) {
4409: PINC;
4410: if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
4411:
4412: PFETCH(c);
4413: switch (c) {
4414: case ':': /* (?:...) grouping only */
4415: group:
4416: r = fetch_token(tok, &p, end, env);
4417: if (r < 0) return r;
4418: r = parse_subexp(np, tok, term, &p, end, env);
4419: if (r < 0) return r;
4420: *src = p;
4421: return 1; /* group */
4422: break;
4423:
4424: case '=':
4425: *np = onig_node_new_anchor(ANCHOR_PREC_READ);
4426: break;
4427: case '!': /* preceding read */
4428: *np = onig_node_new_anchor(ANCHOR_PREC_READ_NOT);
4429: break;
4430: case '>': /* (?>...) stop backtrack */
4431: *np = node_new_effect(EFFECT_STOP_BACKTRACK);
4432: break;
4433:
4434: case '<': /* look behind (?<=...), (?<!...) */
4435: PFETCH(c);
4436: if (c == '=')
4437: *np = onig_node_new_anchor(ANCHOR_LOOK_BEHIND);
4438: else if (c == '!')
4439: *np = onig_node_new_anchor(ANCHOR_LOOK_BEHIND_NOT);
4440: #ifdef USE_NAMED_GROUP
4441: else if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) {
4442: UChar *name;
4443: UChar *name_end;
4444:
4445: PUNFETCH;
4446: list_capture = 0;
4447:
4448: named_group:
4449: name = p;
4450: r = fetch_name(&p, end, &name_end, env, 0);
4451: if (r < 0) return r;
4452:
4453: num = scan_env_add_mem_entry(env);
4454: if (num < 0) return num;
4455: if (list_capture != 0 && num >= BIT_STATUS_BITS_NUM)
4456: return ONIGERR_GROUP_NUMBER_OVER_FOR_CAPTURE_HISTORY;
4457:
4458: r = name_add(env->reg, name, name_end, num, env);
4459: if (r != 0) return r;
4460: *np = node_new_effect_memory(env->option, 1);
4461: CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY);
4462: NEFFECT(*np).regnum = num;
4463: if (list_capture != 0)
4464: BIT_STATUS_ON_AT_SIMPLE(env->capture_history, num);
4465: env->num_named++;
4466: }
4467: #endif
4468: else
4469: return ONIGERR_UNDEFINED_GROUP_OPTION;
4470: break;
4471:
4472: case '@':
4473: if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ATMARK_CAPTURE_HISTORY)) {
4474: #ifdef USE_NAMED_GROUP
4475: if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) {
4476: PFETCH(c);
4477: if (c == '<') {
4478: list_capture = 1;
4479: goto named_group; /* (?@<name>...) */
4480: }
4481: PUNFETCH;
4482: }
4483: #endif
4484: *np = node_new_effect_memory(env->option, 0);
4485: CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY);
4486: num = scan_env_add_mem_entry(env);
4487: if (num < 0) {
4488: onig_node_free(*np);
4489: return num;
4490: }
4491: else if (num >= BIT_STATUS_BITS_NUM) {
4492: onig_node_free(*np);
4493: return ONIGERR_GROUP_NUMBER_OVER_FOR_CAPTURE_HISTORY;
4494: }
4495: NEFFECT(*np).regnum = num;
4496: BIT_STATUS_ON_AT_SIMPLE(env->capture_history, num);
4497: }
4498: else {
4499: return ONIGERR_UNDEFINED_GROUP_OPTION;
4500: }
4501: break;
4502:
4503: #ifdef USE_POSIXLINE_OPTION
4504: case 'p':
4505: #endif
4506: case '-': case 'i': case 'm': case 's': case 'x':
4507: {
4508: int neg = 0;
4509:
4510: while (1) {
4511: switch (c) {
4512: case ':':
4513: case ')':
4514: break;
4515:
4516: case '-': neg = 1; break;
4517: case 'x': ONOFF(option, ONIG_OPTION_EXTEND, neg); break;
4518: case 'i': ONOFF(option, ONIG_OPTION_IGNORECASE, neg); break;
4519: case 's':
4520: if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL)) {
4521: ONOFF(option, ONIG_OPTION_MULTILINE, neg);
4522: }
4523: else
4524: return ONIGERR_UNDEFINED_GROUP_OPTION;
4525: break;
4526:
4527: case 'm':
4528: if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL)) {
4529: ONOFF(option, ONIG_OPTION_SINGLELINE, (neg == 0 ? 1 : 0));
4530: }
4531: else if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_RUBY)) {
4532: ONOFF(option, ONIG_OPTION_MULTILINE, neg);
4533: }
4534: else
4535: return ONIGERR_UNDEFINED_GROUP_OPTION;
4536: break;
4537: #ifdef USE_POSIXLINE_OPTION
4538: case 'p':
4539: ONOFF(option, ONIG_OPTION_MULTILINE|ONIG_OPTION_SINGLELINE, neg);
4540: break;
4541: #endif
4542: default:
4543: return ONIGERR_UNDEFINED_GROUP_OPTION;
4544: }
4545:
4546: if (c == ')') {
4547: *np = node_new_option(option);
4548: CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY);
4549: *src = p;
4550: return 2; /* option only */
4551: }
4552: else if (c == ':') {
4553: OnigOptionType prev = env->option;
4554:
4555: env->option = option;
4556: r = fetch_token(tok, &p, end, env);
4557: if (r < 0) return r;
4558: r = parse_subexp(&target, tok, term, &p, end, env);
4559: env->option = prev;
4560: if (r < 0) return r;
4561: *np = node_new_option(option);
4562: CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY);
4563: NEFFECT(*np).target = target;
4564: *src = p;
4565: return 0;
4566: }
4567:
4568: if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
4569: PFETCH(c);
4570: }
4571: }
4572: break;
4573:
4574: default:
4575: return ONIGERR_UNDEFINED_GROUP_OPTION;
4576: }
4577: }
4578: else {
4579: if (ONIG_IS_OPTION_ON(env->option, ONIG_OPTION_DONT_CAPTURE_GROUP))
4580: goto group;
4581:
4582: *np = node_new_effect_memory(env->option, 0);
4583: CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY);
4584: num = scan_env_add_mem_entry(env);
4585: if (num < 0) return num;
4586: NEFFECT(*np).regnum = num;
4587: }
4588:
4589: CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY);
4590: r = fetch_token(tok, &p, end, env);
4591: if (r < 0) return r;
4592: r = parse_subexp(&target, tok, term, &p, end, env);
4593: if (r < 0) return r;
4594:
4595: if (NTYPE(*np) == N_ANCHOR)
4596: NANCHOR(*np).target = target;
4597: else {
4598: NEFFECT(*np).target = target;
4599: if (NEFFECT(*np).type == EFFECT_MEMORY) {
4600: /* Don't move this to previous of parse_subexp() */
4601: r = scan_env_set_mem_node(env, NEFFECT(*np).regnum, *np);
4602: if (r != 0) return r;
4603: }
4604: }
4605:
4606: *src = p;
4607: return 0;
4608: }
4609:
/* Display strings used in the nested-repeat warning of set_quantifier();
 * indexed by the value returned from popular_quantifier_num(). */
4610: static const char* PopularQStr[] = {
4611: "?", "*", "+", "??", "*?", "+?"
4612: };
4613:
/* Display strings for the replacement quantifier in the same warning;
 * indexed by the ReduceTypeTable[][] entry for the quantifier pair. */
4614: static const char* ReduceQStr[] = {
4615: "", "", "*", "*?", "??", "+ and ??", "+? and ?"
4616: };
4617:
/*
 * Attach target to the quantifier node qnode.
 *
 * qnode  : quantifier node whose target is being set.
 * target : node the quantifier applies to.
 * group  : non-zero suppresses the string-splitting step below.
 * env    : scan environment (encoding, syntax, pattern for warnings).
 *
 * Returns 1 without attaching anything when the quantifier is exactly
 * {1,1}.  Returns 2 when target is a string node that was split so that
 * only its last character became the quantifier's target.  Returns 0
 * otherwise; in that case either target was attached as-is or a nested
 * quantifier pair was merged by onig_reduce_nested_quantifier().
 */
4618: static int
4619: set_quantifier(Node* qnode, Node* target, int group, ScanEnv* env)
4620: {
4621: QuantifierNode* qn;
4622:
4623: qn = &(NQUANTIFIER(qnode));
4624: if (qn->lower == 1 && qn->upper == 1) {
4625: return 1;
4626: }
4627:
4628: switch (NTYPE(target)) {
4629: case N_STRING:
/* Outside a group the quantifier binds to the last character only. */
4630: if (! group) {
4631: StrNode* sn = &(NSTRING(target));
4632: if (str_node_can_be_split(sn, env->enc)) {
4633: Node* n = str_node_split_last_char(sn, env->enc);
4634: if (IS_NOT_NULL(n)) {
4635: qn->target = n;
4636: return 2;
4637: }
4638: }
4639: }
4640: break;
4641:
4642: case N_QUANTIFIER:
4643: { /* check redundant double repeat. */
4644: /* verbose warn (?:.?)? etc... but not warn (.?)? etc... */
4645: QuantifierNode* qnt = &(NQUANTIFIER(target));
4646: int nestq_num = popular_quantifier_num(qn);
4647: int targetq_num = popular_quantifier_num(qnt);
4648:
4649: #ifdef USE_WARNING_REDUNDANT_NESTED_REPEAT_OPERATOR
/* Warn only when neither quantifier was written in {n,m} form and the
 * syntax asks for the warning. */
4650: if (!IS_QUANTIFIER_BY_NUMBER(qn) && !IS_QUANTIFIER_BY_NUMBER(qnt) &&
4651: IS_SYNTAX_BV(env->syntax, ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT)) {
4652: UChar buf[WARN_BUFSIZE];
4653:
4654: switch(ReduceTypeTable[targetq_num][nestq_num]) {
4655: case RQ_ASIS:
4656: break;
4657:
4658: case RQ_DEL:
4659: if (onig_verb_warn != onig_null_warn) {
4660: onig_snprintf_with_pattern(buf, WARN_BUFSIZE, env->enc,
4661: env->pattern, env->pattern_end,
4662: (UChar* )"redundant nested repeat operator");
4663: (*onig_verb_warn)((char* )buf);
4664: }
4665: goto warn_exit;
4666: break;
4667:
4668: default:
4669: if (onig_verb_warn != onig_null_warn) {
4670: onig_snprintf_with_pattern(buf, WARN_BUFSIZE, env->enc,
4671: env->pattern, env->pattern_end,
4672: (UChar* )"nested repeat operator %s and %s was replaced with '%s'",
4673: PopularQStr[targetq_num], PopularQStr[nestq_num],
4674: ReduceQStr[ReduceTypeTable[targetq_num][nestq_num]]);
4675: (*onig_verb_warn)((char* )buf);
4676: }
4677: goto warn_exit;
4678: break;
4679: }
4680: }
4681:
4682: warn_exit:
4683: #endif
/* Both quantifiers are "popular" ones: merge them into one node and skip
 * the plain target assignment at the bottom. */
4684: if (targetq_num >= 0) {
4685: if (nestq_num >= 0) {
4686: onig_reduce_nested_quantifier(qnode, target);
4687: goto q_exit;
4688: }
4689: else if (targetq_num == 1 || targetq_num == 2) { /* * or + */
4690: /* (?:a*){n,m}, (?:a+){n,m} => (?:a*){n,n}, (?:a+){n,n} */
4691: if (! IS_REPEAT_INFINITE(qn->upper) && qn->upper > 1 && qn->greedy) {
4692: qn->upper = (qn->lower == 0 ? 1 : qn->lower);
4693: }
4694: }
4695: }
4696: }
4697: break;
4698:
4699: default:
4700: break;
4701: }
4702:
4703: qn->target = target;
4704: q_exit:
4705: return 0;
4706: }
4707:
4708: #ifdef USE_SHARED_CCLASS_TABLE
4709:
4710: #define THRESHOLD_RANGE_NUM_FOR_SHARE_CCLASS 8
4711:
4712: /* for ctype node hash table */
4713:
/* Key of the shared ctype character-class table: one entry per
 * (encoding, negation flag, ctype) combination. */
4714: typedef struct {
4715: OnigEncoding enc; /* encoding the class was built for */
4716: int not; /* non-zero for the negated class */
4717: int type; /* ctype value the class represents */
4718: } type_cclass_key;
4719:
4720: static int type_cclass_cmp(type_cclass_key* x, type_cclass_key* y)
4721: {
4722: if (x->type != y->type) return 1;
4723: if (x->enc != y->enc) return 1;
4724: if (x->not != y->not) return 1;
4725: return 0;
4726: }
4727:
4728: static int type_cclass_hash(type_cclass_key* key)
4729: {
4730: int i, val;
4731: unsigned char *p;
4732:
4733: val = 0;
4734:
4735: p = (unsigned char* )&(key->enc);
4736: for (i = 0; i < sizeof(key->enc); i++) {
4737: val = val * 997 + (int )*p++;
4738: }
4739:
4740: p = (unsigned char* )(&key->type);
4741: for (i = 0; i < sizeof(key->type); i++) {
4742: val = val * 997 + (int )*p++;
4743: }
4744:
4745: val += key->not;
4746: return val + (val >> 5);
4747: }
4748:
/* Comparison and hash callbacks handed to onig_st_init_table_with_size()
 * when the shared cclass table is created. */
4749: static struct st_hash_type type_type_cclass_hash = {
4750: type_cclass_cmp,
4751: type_cclass_hash,
4752: };
4753:
/* Shared cclass table, keyed by type_cclass_key.  Created on first use in
 * parse_exp() and released by onig_free_shared_cclass_table(). */
4754: static st_table* OnigTypeCClassTable;
4755:
4756:
4757: static int
4758: i_free_shared_class(type_cclass_key* key, Node* node, void* arg)
4759: {
4760: if (IS_NOT_NULL(node)) {
4761: CClassNode* cc = &(NCCLASS(node));
4762: if (IS_NOT_NULL(cc->mbuf)) xfree(cc->mbuf);
4763: xfree(node);
4764: }
4765:
4766: if (IS_NOT_NULL(key)) xfree(key);
4767: return ST_DELETE;
4768: }
4769:
4770: extern int
4771: onig_free_shared_cclass_table(void)
4772: {
4773: if (IS_NOT_NULL(OnigTypeCClassTable)) {
4774: onig_st_foreach(OnigTypeCClassTable, i_free_shared_class, 0);
4775: onig_st_free_table(OnigTypeCClassTable);
4776: OnigTypeCClassTable = NULL;
4777: }
4778:
4779: return 0;
4780: }
4781:
4782: #endif /* USE_SHARED_CCLASS_TABLE */
4783:
4784:
4785: static int
4786: parse_exp(Node** np, OnigToken* tok, int term,
4787: UChar** src, UChar* end, ScanEnv* env)
4788: {
4789: int r, len, group = 0;
4790: Node* qn;
4791: Node** targetp;
4792:
4793: *np = NULL;
4794: if (tok->type == term)
4795: goto end_of_token;
4796:
4797: switch (tok->type) {
4798: case TK_ALT:
4799: case TK_EOT:
4800: end_of_token:
4801: *np = node_new_empty();
4802: return tok->type;
4803: break;
4804:
4805: case TK_SUBEXP_OPEN:
4806: r = parse_effect(np, tok, TK_SUBEXP_CLOSE, src, end, env);
4807: if (r < 0) return r;
4808: if (r == 1) group = 1;
4809: else if (r == 2) { /* option only */
4810: Node* target;
4811: OnigOptionType prev = env->option;
4812:
4813: env->option = NEFFECT(*np).option;
4814: r = fetch_token(tok, src, end, env);
4815: if (r < 0) return r;
4816: r = parse_subexp(&target, tok, term, src, end, env);
4817: env->option = prev;
4818: if (r < 0) return r;
4819: NEFFECT(*np).target = target;
4820: return tok->type;
4821: }
4822: break;
4823:
4824: case TK_SUBEXP_CLOSE:
4825: if (! IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_UNMATCHED_CLOSE_SUBEXP))
4826: return ONIGERR_UNMATCHED_CLOSE_PARENTHESIS;
4827:
4828: if (tok->escaped) goto tk_raw_byte;
4829: else goto tk_byte;
4830: break;
4831:
4832: case TK_STRING:
4833: tk_byte:
4834: {
4835: *np = node_new_str(tok->backp, *src);
4836: CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY);
4837:
4838: while (1) {
4839: r = fetch_token(tok, src, end, env);
4840: if (r < 0) return r;
4841: if (r != TK_STRING) break;
4842:
4843: r = onig_node_str_cat(*np, tok->backp, *src);
4844: if (r < 0) return r;
4845: }
4846:
4847: string_end:
4848: targetp = np;
4849: goto repeat;
4850: }
4851: break;
4852:
4853: case TK_RAW_BYTE:
4854: tk_raw_byte:
4855: {
4856: *np = node_new_str_char((UChar )tok->u.c);
4857: CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY);
4858: len = 1;
4859: while (1) {
4860: if (len >= ONIGENC_MBC_MINLEN(env->enc)) {
4861: if (len == enc_len(env->enc, NSTRING(*np).s)) {
4862: r = fetch_token(tok, src, end, env);
4863: goto string_end;
4864: }
4865: }
4866:
4867: r = fetch_token(tok, src, end, env);
4868: if (r < 0) return r;
4869: if (r != TK_RAW_BYTE) {
4870: #ifdef USE_PAD_TO_SHORT_BYTE_CHAR
4871: int rem;
4872: if (len < ONIGENC_MBC_MINLEN(env->enc)) {
4873: rem = ONIGENC_MBC_MINLEN(env->enc) - len;
4874: (void )node_str_head_pad(&NSTRING(*np), rem, (UChar )0);
4875: if (len + rem == enc_len(env->enc, NSTRING(*np).s)) {
4876: goto string_end;
4877: }
4878: }
4879: #endif
4880: return ONIGERR_TOO_SHORT_MULTI_BYTE_STRING;
4881: }
4882:
4883: r = node_str_cat_char(*np, (UChar )tok->u.c);
4884: if (r < 0) return r;
4885:
4886: len++;
4887: }
4888: }
4889: break;
4890:
4891: case TK_CODE_POINT:
4892: {
4893: UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
4894: int num = ONIGENC_CODE_TO_MBC(env->enc, tok->u.code, buf);
4895: if (num < 0) return num;
4896: #ifdef NUMBERED_CHAR_IS_NOT_CASE_AMBIG
4897: *np = node_new_str_raw(buf, buf + num);
4898: #else
4899: *np = node_new_str(buf, buf + num);
4900: #endif
4901: CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY);
4902: }
4903: break;
4904:
4905: case TK_QUOTE_OPEN:
4906: {
4907: OnigCodePoint end_op[2];
4908: UChar *qstart, *qend, *nextp;
4909:
4910: end_op[0] = (OnigCodePoint )MC_ESC(env->enc);
4911: end_op[1] = (OnigCodePoint )'E';
4912: qstart = *src;
4913: qend = find_str_position(end_op, 2, qstart, end, &nextp, env->enc);
4914: if (IS_NULL(qend)) {
4915: nextp = qend = end;
4916: }
4917: *np = node_new_str(qstart, qend);
4918: CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY);
4919: *src = nextp;
4920: }
4921: break;
4922:
4923: case TK_CHAR_TYPE:
4924: {
4925: switch (tok->u.subtype) {
4926: case CTYPE_WORD:
4927: case CTYPE_NOT_WORD:
4928: *np = node_new_ctype(tok->u.subtype);
4929: CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY);
4930: break;
4931:
4932: case CTYPE_WHITE_SPACE:
4933: case CTYPE_NOT_WHITE_SPACE:
4934: case CTYPE_DIGIT:
4935: case CTYPE_NOT_DIGIT:
4936: case CTYPE_XDIGIT:
4937: case CTYPE_NOT_XDIGIT:
4938: {
4939: CClassNode* cc;
4940: int ctype, not;
4941:
4942: #ifdef USE_SHARED_CCLASS_TABLE
4943: const OnigCodePoint *sbr, *mbr;
4944:
4945: ctype = parse_ctype_to_enc_ctype(tok->u.subtype, ¬);
4946: r = ONIGENC_GET_CTYPE_CODE_RANGE(env->enc, ctype, &sbr, &mbr);
4947: if (r == 0 &&
4948: ONIGENC_CODE_RANGE_NUM(mbr)
4949: >= THRESHOLD_RANGE_NUM_FOR_SHARE_CCLASS) {
4950: type_cclass_key key;
4951: type_cclass_key* new_key;
4952:
4953: key.enc = env->enc;
4954: key.not = not;
4955: key.type = ctype;
4956:
4957: THREAD_ATOMIC_START;
4958:
4959: if (IS_NULL(OnigTypeCClassTable)) {
4960: OnigTypeCClassTable
4961: = onig_st_init_table_with_size(&type_type_cclass_hash, 10);
4962: if (IS_NULL(OnigTypeCClassTable)) {
4963: THREAD_ATOMIC_END;
4964: return ONIGERR_MEMORY;
4965: }
4966: }
4967: else {
4968: if (onig_st_lookup(OnigTypeCClassTable, (st_data_t )&key,
4969: (st_data_t* )np)) {
4970: THREAD_ATOMIC_END;
4971: break;
4972: }
4973: }
4974:
4975: *np = node_new_cclass_by_codepoint_range(not, sbr, mbr);
4976: if (IS_NULL(*np)) {
4977: THREAD_ATOMIC_END;
4978: return ONIGERR_MEMORY;
4979: }
4980:
4981: CCLASS_SET_SHARE(&(NCCLASS(*np)));
4982: new_key = (type_cclass_key* )xmalloc(sizeof(type_cclass_key));
4983: xmemcpy(new_key, &key, sizeof(type_cclass_key));
4984: onig_st_add_direct(OnigTypeCClassTable, (st_data_t )new_key,
4985: (st_data_t )*np);
4986:
4987: THREAD_ATOMIC_END;
4988: }
4989: else {
4990: #endif
4991: ctype = parse_ctype_to_enc_ctype(tok->u.subtype, ¬);
4992: *np = node_new_cclass();
4993: CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY);
4994: cc = &(NCCLASS(*np));
4995: add_ctype_to_cc(cc, ctype, 0, env);
4996: if (not != 0) CCLASS_SET_NOT(cc);
4997: #ifdef USE_SHARED_CCLASS_TABLE
4998: }
4999: #endif
5000: }
5001: break;
5002:
5003: default:
5004: return ONIGERR_PARSER_BUG;
5005: break;
5006: }
5007: }
5008: break;
5009:
5010: case TK_CHAR_PROPERTY:
5011: r = parse_char_property(np, tok, src, end, env);
5012: if (r != 0) return r;
5013: break;
5014:
5015: case TK_CC_OPEN:
5016: {
5017: CClassNode* cc;
5018:
5019: r = parse_char_class(np, tok, src, end, env);
5020: if (r != 0) return r;
5021:
5022: cc = &(NCCLASS(*np));
5023:
5024: if (IS_IGNORECASE(env->option)) {
5025: int i, n, in_cc;
5026: const OnigPairAmbigCodes* ccs;
5027: BitSetRef bs = cc->bs;
5028: OnigAmbigType amb;
5029:
5030: for (amb = 0x01; amb <= ONIGENC_AMBIGUOUS_MATCH_LIMIT; amb <<= 1) {
5031: if ((amb & env->ambig_flag) == 0) continue;
5032:
5033: n = ONIGENC_GET_ALL_PAIR_AMBIG_CODES(env->enc, amb, &ccs);
5034: for (i = 0; i < n; i++) {
5035: in_cc = onig_is_code_in_cc(env->enc, ccs[i].from, cc);
5036:
5037: if ((in_cc != 0 && !IS_CCLASS_NOT(cc)) ||
5038: (in_cc == 0 && IS_CCLASS_NOT(cc))) {
5039: if (ONIGENC_MBC_MINLEN(env->enc) > 1 ||
5040: ccs[i].from >= SINGLE_BYTE_SIZE) {
5041: /* if (cc->not) clear_not_flag_cclass(cc, env->enc); */
5042: add_code_range(&(cc->mbuf), env, ccs[i].to, ccs[i].to);
5043: }
5044: else {
5045: if (BITSET_AT(bs, ccs[i].from)) {
5046: /* /(?i:[^A-C])/.match("a") ==> fail. */
5047: BITSET_SET_BIT(bs, ccs[i].to);
5048: }
5049: if (BITSET_AT(bs, ccs[i].to)) {
5050: BITSET_SET_BIT(bs, ccs[i].from);
5051: }
5052: }
5053: }
5054: }
5055: }
5056: }
5057: }
5058: break;
5059:
5060: case TK_ANYCHAR:
5061: *np = node_new_anychar();
5062: CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY);
5063: break;
5064:
5065: case TK_ANYCHAR_ANYTIME:
5066: *np = node_new_anychar();
5067: CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY);
5068: qn = node_new_quantifier(0, REPEAT_INFINITE, 0);
5069: CHECK_NULL_RETURN_VAL(qn, ONIGERR_MEMORY);
5070: NQUANTIFIER(qn).target = *np;
5071: *np = qn;
5072: break;
5073:
5074: case TK_BACKREF:
5075: len = tok->u.backref.num;
5076: *np = node_new_backref(len,
5077: (len > 1 ? tok->u.backref.refs : &(tok->u.backref.ref1)),
5078: tok->u.backref.by_name,
5079: #ifdef USE_BACKREF_AT_LEVEL
5080: tok->u.backref.exist_level,
5081: tok->u.backref.level,
5082: #endif
5083: env);
5084: CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY);
5085: break;
5086:
5087: #ifdef USE_SUBEXP_CALL
5088: case TK_CALL:
5089: *np = node_new_call(tok->u.call.name, tok->u.call.name_end);
5090: CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY);
5091: env->num_call++;
5092: break;
5093: #endif
5094:
5095: case TK_ANCHOR:
5096: *np = onig_node_new_anchor(tok->u.anchor);
5097: break;
5098:
5099: case TK_OP_REPEAT:
5100: case TK_INTERVAL:
5101: if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_CONTEXT_INDEP_REPEAT_OPS)) {
5102: if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_CONTEXT_INVALID_REPEAT_OPS))
5103: return ONIGERR_TARGET_OF_REPEAT_OPERATOR_NOT_SPECIFIED;
5104: else
5105: *np = node_new_empty();
5106: }
5107: else {
5108: goto tk_byte;
5109: }
5110: break;
5111:
5112: default:
5113: return ONIGERR_PARSER_BUG;
5114: break;
5115: }
5116:
5117: {
5118: targetp = np;
5119:
5120: re_entry:
5121: r = fetch_token(tok, src, end, env);
5122: if (r < 0) return r;
5123:
5124: repeat:
5125: if (r == TK_OP_REPEAT || r == TK_INTERVAL) {
5126: if (is_invalid_quantifier_target(*targetp))
5127: return ONIGERR_TARGET_OF_REPEAT_OPERATOR_INVALID;
5128:
5129: qn = node_new_quantifier(tok->u.repeat.lower, tok->u.repeat.upper,
5130: (r == TK_INTERVAL ? 1 : 0));
5131: CHECK_NULL_RETURN_VAL(qn, ONIGERR_MEMORY);
5132: NQUANTIFIER(qn).greedy = tok->u.repeat.greedy;
5133: r = set_quantifier(qn, *targetp, group, env);
5134: if (r < 0) return r;
5135:
5136: if (tok->u.repeat.possessive != 0) {
5137: Node* en;
5138: en = node_new_effect(EFFECT_STOP_BACKTRACK);
5139: CHECK_NULL_RETURN_VAL(en, ONIGERR_MEMORY);
5140: NEFFECT(en).target = qn;
5141: qn = en;
5142: }
5143:
5144: if (r == 0) {
5145: *targetp = qn;
5146: }
5147: else if (r == 2) { /* split case: /abc+/ */
5148: Node *tmp;
5149:
5150: *targetp = node_new_list(*targetp, NULL);
5151: CHECK_NULL_RETURN_VAL(*targetp, ONIGERR_MEMORY);
5152: tmp = NCONS(*targetp).right = node_new_list(qn, NULL);
5153: CHECK_NULL_RETURN_VAL(tmp, ONIGERR_MEMORY);
5154: targetp = &(NCONS(tmp).left);
5155: }
5156: goto re_entry;
5157: }
5158: }
5159:
5160: return r;
5161: }
5162:
5163: static int
5164: parse_branch(Node** top, OnigToken* tok, int term,
5165: UChar** src, UChar* end, ScanEnv* env)
5166: {
5167: int r;
5168: Node *node, **headp;
5169:
5170: *top = NULL;
5171: r = parse_exp(&node, tok, term, src, end, env);
5172: if (r < 0) return r;
5173:
5174: if (r == TK_EOT || r == term || r == TK_ALT) {
5175: *top = node;
5176: }
5177: else {
5178: *top = node_new_list(node, NULL);
5179: headp = &(NCONS(*top).right);
5180: while (r != TK_EOT && r != term && r != TK_ALT) {
5181: r = parse_exp(&node, tok, term, src, end, env);
5182: if (r < 0) return r;
5183:
5184: if (NTYPE(node) == N_LIST) {
5185: *headp = node;
5186: while (IS_NOT_NULL(NCONS(node).right)) node = NCONS(node).right;
5187: headp = &(NCONS(node).right);
5188: }
5189: else {
5190: *headp = node_new_list(node, NULL);
5191: headp = &(NCONS(*headp).right);
5192: }
5193: }
5194: }
5195:
5196: return r;
5197: }
5198:
5199: /* term_tok: TK_EOT or TK_SUBEXP_CLOSE */
5200: static int
5201: parse_subexp(Node** top, OnigToken* tok, int term,
5202: UChar** src, UChar* end, ScanEnv* env)
5203: {
5204: int r;
5205: Node *node, **headp;
5206:
5207: *top = NULL;
5208: r = parse_branch(&node, tok, term, src, end, env);
5209: if (r < 0) {
5210: onig_node_free(node);
5211: return r;
5212: }
5213:
5214: if (r == term) {
5215: *top = node;
5216: }
5217: else if (r == TK_ALT) {
5218: *top = node_new_alt(node, NULL);
5219: headp = &(NCONS(*top).right);
5220: while (r == TK_ALT) {
5221: r = fetch_token(tok, src, end, env);
5222: if (r < 0) return r;
5223: r = parse_branch(&node, tok, term, src, end, env);
5224: if (r < 0) return r;
5225:
5226: *headp = node_new_alt(node, NULL);
5227: headp = &(NCONS(*headp).right);
5228: }
5229:
5230: if (tok->type != term)
5231: goto err;
5232: }
5233: else {
5234: err:
5235: if (term == TK_SUBEXP_CLOSE)
5236: return ONIGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS;
5237: else
5238: return ONIGERR_PARSER_BUG;
5239: }
5240:
5241: return r;
5242: }
5243:
5244: static int
5245: parse_regexp(Node** top, UChar** src, UChar* end, ScanEnv* env)
5246: {
5247: int r;
5248: OnigToken tok;
5249:
5250: r = fetch_token(&tok, src, end, env);
5251: if (r < 0) return r;
5252: r = parse_subexp(top, &tok, TK_EOT, src, end, env);
5253: if (r < 0) return r;
5254: return 0;
5255: }
5256:
5257: extern int
5258: onig_parse_make_tree(Node** root, const UChar* pattern, const UChar* end, regex_t* reg,
5259: ScanEnv* env)
5260: {
5261: int r;
5262: UChar* p;
5263:
5264: #ifdef USE_NAMED_GROUP
5265: names_clear(reg);
5266: #endif
5267:
5268: scan_env_clear(env);
5269: env->option = reg->options;
5270: env->ambig_flag = reg->ambig_flag;
5271: env->enc = reg->enc;
5272: env->syntax = reg->syntax;
5273: env->pattern = (UChar* )pattern;
5274: env->pattern_end = (UChar* )end;
5275: env->reg = reg;
5276:
5277: *root = NULL;
5278: p = (UChar* )pattern;
5279: r = parse_regexp(root, &p, (UChar* )end, env);
5280: reg->num_mem = env->num_mem;
5281: return r;
5282: }
5283:
5284: extern void
5285: onig_scan_env_set_error_string(ScanEnv* env, int ecode,
5286: UChar* arg, UChar* arg_end)
5287: {
5288: env->error = arg;
5289: env->error_end = arg_end;
5290: }
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>