File:  [ELWIX - Embedded LightWeight unIX -] / embedaddon / php / ext / pcre / php_pcre.c
Revision 1.1.1.4 (vendor branch): download - view: text, annotated - select for diffs - revision graph
Sun Jun 15 20:03:52 2014 UTC (10 years, 1 month ago) by misho
Branches: php, MAIN
CVS tags: v5_4_29, HEAD
php 5.4.29

    1: /*
    2:    +----------------------------------------------------------------------+
    3:    | PHP Version 5                                                        |
    4:    +----------------------------------------------------------------------+
    5:    | Copyright (c) 1997-2014 The PHP Group                                |
    6:    +----------------------------------------------------------------------+
    7:    | This source file is subject to version 3.01 of the PHP license,      |
    8:    | that is bundled with this package in the file LICENSE, and is        |
    9:    | available through the world-wide-web at the following url:           |
   10:    | http://www.php.net/license/3_01.txt                                  |
   11:    | If you did not receive a copy of the PHP license and are unable to   |
   12:    | obtain it through the world-wide-web, please send a note to          |
   13:    | license@php.net so we can mail you a copy immediately.               |
   14:    +----------------------------------------------------------------------+
   15:    | Author: Andrei Zmievski <andrei@php.net>                             |
   16:    +----------------------------------------------------------------------+
   17:  */
   18: 
   19: /* $Id: php_pcre.c,v 1.1.1.4 2014/06/15 20:03:52 misho Exp $ */
   20: 
   21: #include "php.h"
   22: #include "php_ini.h"
   23: #include "php_globals.h"
   24: #include "php_pcre.h"
   25: #include "ext/standard/info.h"
   26: #include "ext/standard/php_smart_str.h"
   27: 
   28: #if HAVE_PCRE || HAVE_BUNDLED_PCRE
   29: 
   30: #include "ext/standard/php_string.h"
   31: 
/* Result-ordering modes for global matching; carried in the low byte of the flags argument. */
#define PREG_PATTERN_ORDER			1
#define PREG_SET_ORDER				2
/* Flag bit: each reported match becomes a (string, offset) pair. */
#define PREG_OFFSET_CAPTURE			(1<<8)

/* Flag bits exported to userland as the PREG_SPLIT_* constants. */
#define	PREG_SPLIT_NO_EMPTY			(1<<0)
#define PREG_SPLIT_DELIM_CAPTURE	(1<<1)
#define PREG_SPLIT_OFFSET_CAPTURE	(1<<2)

/* Internal option bit recorded in a cache entry when the pattern carries the 'e' modifier. */
#define PREG_REPLACE_EVAL			(1<<0)

/* Flag bit exported to userland as PREG_GREP_INVERT. */
#define PREG_GREP_INVERT			(1<<0)

/* Number of compiled patterns the cache may hold before part of it is evicted. */
#define PCRE_CACHE_SIZE 4096
   45: 
/*
 * Error codes stored in PCRE_G(error_code) and exported to userland as the
 * PREG_*_ERROR constants. The numeric values are part of the userland
 * interface, so the order of the enumerators must not change.
 */
enum {
	PHP_PCRE_NO_ERROR = 0,
	PHP_PCRE_INTERNAL_ERROR,
	PHP_PCRE_BACKTRACK_LIMIT_ERROR,
	PHP_PCRE_RECURSION_LIMIT_ERROR,
	PHP_PCRE_BAD_UTF8_ERROR,
	PHP_PCRE_BAD_UTF8_OFFSET_ERROR
};
   54: 
   55: 
/* Module globals (pattern cache, limits, last error code); accessed through PCRE_G(). */
ZEND_DECLARE_MODULE_GLOBALS(pcre)
   57: 
   58: 
   59: static void pcre_handle_exec_error(int pcre_code TSRMLS_DC) /* {{{ */
   60: {
   61: 	int preg_code = 0;
   62: 
   63: 	switch (pcre_code) {
   64: 		case PCRE_ERROR_MATCHLIMIT:
   65: 			preg_code = PHP_PCRE_BACKTRACK_LIMIT_ERROR;
   66: 			break;
   67: 
   68: 		case PCRE_ERROR_RECURSIONLIMIT:
   69: 			preg_code = PHP_PCRE_RECURSION_LIMIT_ERROR;
   70: 			break;
   71: 
   72: 		case PCRE_ERROR_BADUTF8:
   73: 			preg_code = PHP_PCRE_BAD_UTF8_ERROR;
   74: 			break;
   75: 
   76: 		case PCRE_ERROR_BADUTF8_OFFSET:
   77: 			preg_code = PHP_PCRE_BAD_UTF8_OFFSET_ERROR;
   78: 			break;
   79: 
   80: 		default:
   81: 			preg_code = PHP_PCRE_INTERNAL_ERROR;
   82: 			break;
   83: 	}
   84: 
   85: 	PCRE_G(error_code) = preg_code;
   86: }
   87: /* }}} */
   88: 
   89: static void php_free_pcre_cache(void *data) /* {{{ */
   90: {
   91: 	pcre_cache_entry *pce = (pcre_cache_entry *) data;
   92: 	if (!pce) return;
   93: 	pefree(pce->re, 1);
   94: 	if (pce->extra) pefree(pce->extra, 1);
   95: #if HAVE_SETLOCALE
   96: 	if ((void*)pce->tables) pefree((void*)pce->tables, 1);
   97: 	pefree(pce->locale, 1);
   98: #endif
   99: }
  100: /* }}} */
  101: 
  102: static PHP_GINIT_FUNCTION(pcre) /* {{{ */
  103: {
  104: 	zend_hash_init(&pcre_globals->pcre_cache, 0, NULL, php_free_pcre_cache, 1);
  105: 	pcre_globals->backtrack_limit = 0;
  106: 	pcre_globals->recursion_limit = 0;
  107: 	pcre_globals->error_code      = PHP_PCRE_NO_ERROR;
  108: }
  109: /* }}} */
  110: 
/* Globals destructor: tears down the pattern cache created in the GINIT function. */
static PHP_GSHUTDOWN_FUNCTION(pcre) /* {{{ */
{
	/* Each remaining entry is released through the table's destructor, php_free_pcre_cache(). */
	zend_hash_destroy(&pcre_globals->pcre_cache);
}
/* }}} */
  116: 
/*
 * INI settings. Both values are stored as longs in the module globals and are
 * copied into the pcre_extra match limits before each pcre_exec() call.
 */
PHP_INI_BEGIN()
	STD_PHP_INI_ENTRY("pcre.backtrack_limit", "1000000", PHP_INI_ALL, OnUpdateLong, backtrack_limit, zend_pcre_globals, pcre_globals)
	STD_PHP_INI_ENTRY("pcre.recursion_limit", "100000", PHP_INI_ALL, OnUpdateLong, recursion_limit, zend_pcre_globals, pcre_globals)
PHP_INI_END()
  121: 
  122: 
/* {{{ PHP_MINFO_FUNCTION(pcre)
 * Prints the extension's info table (support status and the PCRE library
 * version reported by pcre_version()), followed by its INI entries.
 */
static PHP_MINFO_FUNCTION(pcre)
{
	php_info_print_table_start();
	php_info_print_table_row(2, "PCRE (Perl Compatible Regular Expressions) Support", "enabled" );
	php_info_print_table_row(2, "PCRE Library Version", pcre_version() );
	php_info_print_table_end();

	DISPLAY_INI_ENTRIES();
}
/* }}} */
  134: 
/* {{{ PHP_MINIT_FUNCTION(pcre)
 * Module startup: registers the INI entries and the userland PREG_* and
 * PCRE_VERSION constants. Always returns SUCCESS.
 */
static PHP_MINIT_FUNCTION(pcre)
{
	REGISTER_INI_ENTRIES();
	
	/* Matching / splitting / grep flag constants. */
	REGISTER_LONG_CONSTANT("PREG_PATTERN_ORDER", PREG_PATTERN_ORDER, CONST_CS | CONST_PERSISTENT);
	REGISTER_LONG_CONSTANT("PREG_SET_ORDER", PREG_SET_ORDER, CONST_CS | CONST_PERSISTENT);
	REGISTER_LONG_CONSTANT("PREG_OFFSET_CAPTURE", PREG_OFFSET_CAPTURE, CONST_CS | CONST_PERSISTENT);
	REGISTER_LONG_CONSTANT("PREG_SPLIT_NO_EMPTY", PREG_SPLIT_NO_EMPTY, CONST_CS | CONST_PERSISTENT);
	REGISTER_LONG_CONSTANT("PREG_SPLIT_DELIM_CAPTURE", PREG_SPLIT_DELIM_CAPTURE, CONST_CS | CONST_PERSISTENT);
	REGISTER_LONG_CONSTANT("PREG_SPLIT_OFFSET_CAPTURE", PREG_SPLIT_OFFSET_CAPTURE, CONST_CS | CONST_PERSISTENT);
	REGISTER_LONG_CONSTANT("PREG_GREP_INVERT", PREG_GREP_INVERT, CONST_CS | CONST_PERSISTENT);

	/* Error codes; these mirror the PHP_PCRE_* enumerators one to one. */
	REGISTER_LONG_CONSTANT("PREG_NO_ERROR", PHP_PCRE_NO_ERROR, CONST_CS | CONST_PERSISTENT);
	REGISTER_LONG_CONSTANT("PREG_INTERNAL_ERROR", PHP_PCRE_INTERNAL_ERROR, CONST_CS | CONST_PERSISTENT);
	REGISTER_LONG_CONSTANT("PREG_BACKTRACK_LIMIT_ERROR", PHP_PCRE_BACKTRACK_LIMIT_ERROR, CONST_CS | CONST_PERSISTENT);
	REGISTER_LONG_CONSTANT("PREG_RECURSION_LIMIT_ERROR", PHP_PCRE_RECURSION_LIMIT_ERROR, CONST_CS | CONST_PERSISTENT);
	REGISTER_LONG_CONSTANT("PREG_BAD_UTF8_ERROR", PHP_PCRE_BAD_UTF8_ERROR, CONST_CS | CONST_PERSISTENT);
	REGISTER_LONG_CONSTANT("PREG_BAD_UTF8_OFFSET_ERROR", PHP_PCRE_BAD_UTF8_OFFSET_ERROR, CONST_CS | CONST_PERSISTENT);
	/* Version string of the PCRE library in use, as reported by pcre_version(). */
	REGISTER_STRING_CONSTANT("PCRE_VERSION", (char *)pcre_version(), CONST_CS | CONST_PERSISTENT);

	return SUCCESS;
}
/* }}} */
  159: 
/* {{{ PHP_MSHUTDOWN_FUNCTION(pcre)
 * Module shutdown: removes the INI entries registered at startup.
 * Always returns SUCCESS.
 */
static PHP_MSHUTDOWN_FUNCTION(pcre)
{
	UNREGISTER_INI_ENTRIES();

	return SUCCESS;
}
/* }}} */
  168: 
  169: /* {{{ static pcre_clean_cache */
  170: static int pcre_clean_cache(void *data, void *arg TSRMLS_DC)
  171: {
  172: 	int *num_clean = (int *)arg;
  173: 
  174: 	if (*num_clean > 0) {
  175: 		(*num_clean)--;
  176: 		return 1;
  177: 	} else {
  178: 		return 0;
  179: 	}
  180: }
  181: /* }}} */
  182: 
  183: /* {{{ static make_subpats_table */
  184: static char **make_subpats_table(int num_subpats, pcre_cache_entry *pce TSRMLS_DC)
  185: {
  186: 	pcre_extra *extra = pce->extra;
  187: 	int name_cnt = 0, name_size, ni = 0;
  188: 	int rc;
  189: 	char *name_table;
  190: 	unsigned short name_idx;
  191: 	char **subpat_names = (char **)ecalloc(num_subpats, sizeof(char *));
  192: 
  193: 	rc = pcre_fullinfo(pce->re, extra, PCRE_INFO_NAMECOUNT, &name_cnt);
  194: 	if (rc < 0) {
  195: 		php_error_docref(NULL TSRMLS_CC, E_WARNING, "Internal pcre_fullinfo() error %d", rc);
  196: 		efree(subpat_names);
  197: 		return NULL;
  198: 	}
  199: 	if (name_cnt > 0) {
  200: 		int rc1, rc2;
  201: 
  202: 		rc1 = pcre_fullinfo(pce->re, extra, PCRE_INFO_NAMETABLE, &name_table);
  203: 		rc2 = pcre_fullinfo(pce->re, extra, PCRE_INFO_NAMEENTRYSIZE, &name_size);
  204: 		rc = rc2 ? rc2 : rc1;
  205: 		if (rc < 0) {
  206: 			php_error_docref(NULL TSRMLS_CC, E_WARNING, "Internal pcre_fullinfo() error %d", rc);
  207: 			efree(subpat_names);
  208: 			return NULL;
  209: 		}
  210: 
  211: 		while (ni++ < name_cnt) {
  212: 			name_idx = 0xff * (unsigned char)name_table[0] + (unsigned char)name_table[1];
  213: 			subpat_names[name_idx] = name_table + 2;
  214: 			if (is_numeric_string(subpat_names[name_idx], strlen(subpat_names[name_idx]), NULL, NULL, 0) > 0) {
  215: 				php_error_docref(NULL TSRMLS_CC, E_WARNING, "Numeric named subpatterns are not allowed");
  216: 				efree(subpat_names);
  217: 				return NULL;
  218: 			}
  219: 			name_table += name_size;
  220: 		}
  221: 	}
  222: 
  223: 	return subpat_names;
  224: }
  225: /* }}} */
  226: 
/* {{{ pcre_get_compiled_regex_cache
 *
 * Return the cache entry for the delimited pattern string `regex`
 * (delimiters and trailing modifiers included; `regex_len` is its length),
 * compiling the pattern and storing it in the cache first when no usable
 * entry exists. Returns NULL, after raising an E_WARNING, when the string
 * is empty, contains a NUL byte, has a bad or unterminated delimiter, uses
 * an unknown modifier, or fails to compile.
 */
PHPAPI pcre_cache_entry* pcre_get_compiled_regex_cache(char *regex, int regex_len TSRMLS_DC)
{
	pcre				*re = NULL;
	pcre_extra			*extra;
	int					 coptions = 0;
	int					 soptions = 0;
	const char			*error;
	int					 erroffset;
	char				 delimiter;
	char				 start_delimiter;
	char				 end_delimiter;
	char				*p, *pp;
	char				*pattern;
	int					 do_study = 0;
	int					 poptions = 0;
	int				count = 0;
	unsigned const char *tables = NULL;
#if HAVE_SETLOCALE
	char				*locale;
#endif
	pcre_cache_entry	*pce;
	pcre_cache_entry	 new_entry;
	char                *tmp = NULL;

#if HAVE_SETLOCALE
# if defined(PHP_WIN32) && defined(ZTS)
	_configthreadlocale(_ENABLE_PER_THREAD_LOCALE);
# endif
	/* Query (not change) the current LC_CTYPE locale. */
	locale = setlocale(LC_CTYPE, NULL);
#endif

	/* Try to lookup the cached regex entry, and if successful, just pass
	   back the compiled pattern, otherwise go on and compile it. */
	if (zend_hash_find(&PCRE_G(pcre_cache), regex, regex_len+1, (void **)&pce) == SUCCESS) {
		/*
		 * We use a quick pcre_fullinfo() check to see whether cache is corrupted, and if it
		 * is, we flush it and compile the pattern from scratch.
		 */
		if (pcre_fullinfo(pce->re, NULL, PCRE_INFO_CAPTURECOUNT, &count) == PCRE_ERROR_BADMAGIC) {
			zend_hash_clean(&PCRE_G(pcre_cache));
		} else {
#if HAVE_SETLOCALE
			/* A cached entry is reused only if it was compiled under the current locale;
			   otherwise fall through and recompile, replacing the entry below. */
			if (!strcmp(pce->locale, locale)) {
#endif
				return pce;
#if HAVE_SETLOCALE
			}
#endif
		}
	}
	
	p = regex;
	
	/* Parse through the leading whitespace, and display a warning if we
	   get to the end without encountering a delimiter. */
	while (isspace((int)*(unsigned char *)p)) p++;
	if (*p == 0) {
		/* A NUL before regex + regex_len is an embedded NUL byte, not the end of the string. */
		php_error_docref(NULL TSRMLS_CC, E_WARNING, 
						 p < regex + regex_len ? "Null byte in regex" : "Empty regular expression");
		return NULL;
	}
	
	/* Get the delimiter and display a warning if it is alphanumeric
	   or a backslash. */
	delimiter = *p++;
	if (isalnum((int)*(unsigned char *)&delimiter) || delimiter == '\\') {
		php_error_docref(NULL TSRMLS_CC,E_WARNING, "Delimiter must not be alphanumeric or backslash");
		return NULL;
	}

	start_delimiter = delimiter;
	/* Lookup table: the character five positions after a match is its closing
	   counterpart, so '(' '[' '{' '<' map to ')' ']' '}' '>' while a space or a
	   closing bracket maps to itself. */
	if ((pp = strchr("([{< )]}> )]}>", delimiter)))
		delimiter = pp[5];
	end_delimiter = delimiter;

	pp = p;

	if (start_delimiter == end_delimiter) {
		/* We need to iterate through the pattern, searching for the ending delimiter,
		   but skipping the backslashed delimiters.  If the ending delimiter is not
		   found, display a warning. */
		while (*pp != 0) {
			if (*pp == '\\' && pp[1] != 0) pp++;
			else if (*pp == delimiter)
				break;
			pp++;
		}
	} else {
		/* We iterate through the pattern, searching for the matching ending
		 * delimiter. For each matching starting delimiter, we increment nesting
		 * level, and decrement it for each matching ending delimiter. If we
		 * reach the end of the pattern without matching, display a warning.
		 */
		int brackets = 1; 	/* brackets nesting level */
		while (*pp != 0) {
			if (*pp == '\\' && pp[1] != 0) pp++;
			else if (*pp == end_delimiter && --brackets <= 0)
				break;
			else if (*pp == start_delimiter)
				brackets++;
			pp++;
		}
	}

	/* pp stopped on a NUL: either an embedded NUL byte or a missing end delimiter. */
	if (*pp == 0) {
		if (pp < regex + regex_len) {
			php_error_docref(NULL TSRMLS_CC,E_WARNING, "Null byte in regex");
		} else if (start_delimiter == end_delimiter) {
			php_error_docref(NULL TSRMLS_CC,E_WARNING, "No ending delimiter '%c' found", delimiter);
		} else {
			php_error_docref(NULL TSRMLS_CC,E_WARNING, "No ending matching delimiter '%c' found", delimiter);
		}
		return NULL;
	}
	
	/* Make a copy of the actual pattern. */
	pattern = estrndup(p, pp-p);

	/* Move on to the options */
	pp++;

	/* Parse through the options, setting appropriate flags.  Display
	   a warning if we encounter an unknown modifier. */	
	while (pp < regex + regex_len) {
		switch (*pp++) {
			/* Perl compatible options */
			case 'i':	coptions |= PCRE_CASELESS;		break;
			case 'm':	coptions |= PCRE_MULTILINE;		break;
			case 's':	coptions |= PCRE_DOTALL;		break;
			case 'x':	coptions |= PCRE_EXTENDED;		break;
			
			/* PCRE specific options */
			case 'A':	coptions |= PCRE_ANCHORED;		break;
			case 'D':	coptions |= PCRE_DOLLAR_ENDONLY;break;
			case 'S':	do_study  = 1;					break;
			case 'U':	coptions |= PCRE_UNGREEDY;		break;
			case 'X':	coptions |= PCRE_EXTRA;			break;
			case 'u':	coptions |= PCRE_UTF8;
	/* In  PCRE,  by  default, \d, \D, \s, \S, \w, and \W recognize only ASCII
       characters, even in UTF-8 mode. However, this can be changed by setting
       the PCRE_UCP option. */
#ifdef PCRE_UCP
						coptions |= PCRE_UCP;
#endif			
				break;

			/* Custom preg options */
			case 'e':	poptions |= PREG_REPLACE_EVAL;	break;
			
			/* Spaces and newlines among the modifiers are ignored. */
			case ' ':
			case '\n':
				break;

			default:
				/* pp was already advanced, so pp[-1] is the offending character. */
				if (pp[-1]) {
					php_error_docref(NULL TSRMLS_CC,E_WARNING, "Unknown modifier '%c'", pp[-1]);
				} else {
					php_error_docref(NULL TSRMLS_CC,E_WARNING, "Null byte in regex");
				}
				efree(pattern);
				return NULL;
		}
	}

#if HAVE_SETLOCALE
	/* Locale-specific character tables are built only when the locale is not "C". */
	if (strcmp(locale, "C"))
		tables = pcre_maketables();
#endif

	/* Compile pattern and display a warning if compilation failed. */
	re = pcre_compile(pattern,
					  coptions,
					  &error,
					  &erroffset,
					  tables);

	if (re == NULL) {
		php_error_docref(NULL TSRMLS_CC,E_WARNING, "Compilation failed: %s at offset %d", error, erroffset);
		efree(pattern);
		if (tables) {
			pefree((void*)tables, 1);
		}
		return NULL;
	}

	/* If study option was specified, study the pattern and
	   store the result in extra for passing to pcre_exec. */
	if (do_study) {
		extra = pcre_study(re, soptions, &error);
		if (extra) {
			extra->flags |= PCRE_EXTRA_MATCH_LIMIT | PCRE_EXTRA_MATCH_LIMIT_RECURSION;
		}
		/* A study error is only warned about; the compiled pattern is still cached and returned. */
		if (error != NULL) {
			php_error_docref(NULL TSRMLS_CC, E_WARNING, "Error while studying pattern");
		}
	} else {
		extra = NULL;
	}

	efree(pattern);

	/*
	 * If we reached cache limit, clean out the items from the head of the list;
	 * these are supposedly the oldest ones (but not necessarily the least used
	 * ones).
	 */
	if (zend_hash_num_elements(&PCRE_G(pcre_cache)) == PCRE_CACHE_SIZE) {
		int num_clean = PCRE_CACHE_SIZE / 8;
		zend_hash_apply_with_argument(&PCRE_G(pcre_cache), pcre_clean_cache, &num_clean TSRMLS_CC);
	}

	/* Store the compiled pattern and extra info in the cache. */
	new_entry.re = re;
	new_entry.extra = extra;
	new_entry.preg_options = poptions;
	new_entry.compile_options = coptions;
#if HAVE_SETLOCALE
	new_entry.locale = pestrdup(locale, 1);
	new_entry.tables = tables;
#endif

	/*
	 * Interned strings are not duplicated when stored in HashTable,
	 * but all the interned strings created during HTTP request are removed
	 * at end of request. However PCRE_G(pcre_cache) must be consistent
	 * on the next request as well. So we disable usage of interned strings
	 * as hash keys especially for this table.
	 * See bug #63180 
	 */
	if (IS_INTERNED(regex)) {
		regex = tmp = estrndup(regex, regex_len);
	}

	/* The entry is copied into the table; pce receives the address of the stored copy. */
	zend_hash_update(&PCRE_G(pcre_cache), regex, regex_len+1, (void *)&new_entry,
						sizeof(pcre_cache_entry), (void**)&pce);

	if (tmp) {
		efree(tmp);
	}

	return pce;
}
/* }}} */
  472: 
  473: /* {{{ pcre_get_compiled_regex
  474:  */
  475: PHPAPI pcre* pcre_get_compiled_regex(char *regex, pcre_extra **extra, int *preg_options TSRMLS_DC)
  476: {
  477: 	pcre_cache_entry * pce = pcre_get_compiled_regex_cache(regex, strlen(regex) TSRMLS_CC);
  478: 
  479: 	if (extra) {
  480: 		*extra = pce ? pce->extra : NULL;
  481: 	}
  482: 	if (preg_options) {
  483: 		*preg_options = pce ? pce->preg_options : 0;
  484: 	}
  485: 	
  486: 	return pce ? pce->re : NULL;
  487: }
  488: /* }}} */
  489: 
  490: /* {{{ pcre_get_compiled_regex_ex
  491:  */
  492: PHPAPI pcre* pcre_get_compiled_regex_ex(char *regex, pcre_extra **extra, int *preg_options, int *compile_options TSRMLS_DC)
  493: {
  494: 	pcre_cache_entry * pce = pcre_get_compiled_regex_cache(regex, strlen(regex) TSRMLS_CC);
  495: 	
  496: 	if (extra) {
  497: 		*extra = pce ? pce->extra : NULL;
  498: 	}
  499: 	if (preg_options) {
  500: 		*preg_options = pce ? pce->preg_options : 0;
  501: 	}
  502: 	if (compile_options) {
  503: 		*compile_options = pce ? pce->compile_options : 0;
  504: 	}
  505: 	
  506: 	return pce ? pce->re : NULL;
  507: }
  508: /* }}} */
  509: 
  510: /* {{{ add_offset_pair */
  511: static inline void add_offset_pair(zval *result, char *str, int len, int offset, char *name)
  512: {
  513: 	zval *match_pair;
  514: 
  515: 	ALLOC_ZVAL(match_pair);
  516: 	array_init(match_pair);
  517: 	INIT_PZVAL(match_pair);
  518: 
  519: 	/* Add (match, offset) to the return value */
  520: 	add_next_index_stringl(match_pair, str, len, 1);
  521: 	add_next_index_long(match_pair, offset);
  522: 	
  523: 	if (name) {
  524: 		zval_add_ref(&match_pair);
  525: 		zend_hash_update(Z_ARRVAL_P(result), name, strlen(name)+1, &match_pair, sizeof(zval *), NULL);
  526: 	}
  527: 	zend_hash_next_index_insert(Z_ARRVAL_P(result), &match_pair, sizeof(zval *), NULL);
  528: }
  529: /* }}} */
  530: 
  531: static void php_do_pcre_match(INTERNAL_FUNCTION_PARAMETERS, int global) /* {{{ */
  532: {
  533: 	/* parameters */
  534: 	char			 *regex;			/* Regular expression */
  535: 	char			 *subject;			/* String to match against */
  536: 	int				  regex_len;
  537: 	int				  subject_len;
  538: 	pcre_cache_entry *pce;				/* Compiled regular expression */
  539: 	zval			 *subpats = NULL;	/* Array for subpatterns */
  540: 	long			  flags = 0;		/* Match control flags */
  541: 	long			  start_offset = 0;	/* Where the new search starts */
  542: 
  543: 	if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ss|zll", &regex, &regex_len,
  544: 							  &subject, &subject_len, &subpats, &flags, &start_offset) == FAILURE) {
  545: 		RETURN_FALSE;
  546: 	}
  547: 	
  548: 	/* Compile regex or get it from cache. */
  549: 	if ((pce = pcre_get_compiled_regex_cache(regex, regex_len TSRMLS_CC)) == NULL) {
  550: 		RETURN_FALSE;
  551: 	}
  552: 
  553: 	php_pcre_match_impl(pce, subject, subject_len, return_value, subpats, 
  554: 		global, ZEND_NUM_ARGS() >= 4, flags, start_offset TSRMLS_CC);
  555: }
  556: /* }}} */
  557: 
  558: /* {{{ php_pcre_match_impl() */
  559: PHPAPI void php_pcre_match_impl(pcre_cache_entry *pce, char *subject, int subject_len, zval *return_value,
  560: 	zval *subpats, int global, int use_flags, long flags, long start_offset TSRMLS_DC)
  561: {
  562: 	zval			*result_set,		/* Holds a set of subpatterns after
  563: 										   a global match */
  564: 				   **match_sets = NULL;	/* An array of sets of matches for each
  565: 										   subpattern after a global match */
  566: 	pcre_extra		*extra = pce->extra;/* Holds results of studying */
  567: 	pcre_extra		 extra_data;		/* Used locally for exec options */
  568: 	int				 exoptions = 0;		/* Execution options */
  569: 	int				 count = 0;			/* Count of matched subpatterns */
  570: 	int				*offsets;			/* Array of subpattern offsets */
  571: 	int				 num_subpats;		/* Number of captured subpatterns */
  572: 	int				 size_offsets;		/* Size of the offsets array */
  573: 	int				 matched;			/* Has anything matched */
  574: 	int				 g_notempty = 0;	/* If the match should not be empty */
  575: 	const char	   **stringlist;		/* Holds list of subpatterns */
  576: 	char 		   **subpat_names;		/* Array for named subpatterns */
  577: 	int				 i, rc;
  578: 	int				 subpats_order;		/* Order of subpattern matches */
  579: 	int				 offset_capture;    /* Capture match offsets: yes/no */
  580: 
  581: 	/* Overwrite the passed-in value for subpatterns with an empty array. */
  582: 	if (subpats != NULL) {
  583: 		zval_dtor(subpats);
  584: 		array_init(subpats);
  585: 	}
  586: 
  587: 	subpats_order = global ? PREG_PATTERN_ORDER : 0;
  588: 
  589: 	if (use_flags) {
  590: 		offset_capture = flags & PREG_OFFSET_CAPTURE;
  591: 
  592: 		/*
  593: 		 * subpats_order is pre-set to pattern mode so we change it only if
  594: 		 * necessary.
  595: 		 */
  596: 		if (flags & 0xff) {
  597: 			subpats_order = flags & 0xff;
  598: 		}
  599: 		if ((global && (subpats_order < PREG_PATTERN_ORDER || subpats_order > PREG_SET_ORDER)) ||
  600: 			(!global && subpats_order != 0)) {
  601: 			php_error_docref(NULL TSRMLS_CC, E_WARNING, "Invalid flags specified");
  602: 			return;
  603: 		}
  604: 	} else {
  605: 		offset_capture = 0;
  606: 	}
  607: 
  608: 	/* Negative offset counts from the end of the string. */
  609: 	if (start_offset < 0) {
  610: 		start_offset = subject_len + start_offset;
  611: 		if (start_offset < 0) {
  612: 			start_offset = 0;
  613: 		}
  614: 	}
  615: 
  616: 	if (extra == NULL) {
  617: 		extra_data.flags = PCRE_EXTRA_MATCH_LIMIT | PCRE_EXTRA_MATCH_LIMIT_RECURSION;
  618: 		extra = &extra_data;
  619: 	}
  620: 	extra->match_limit = PCRE_G(backtrack_limit);
  621: 	extra->match_limit_recursion = PCRE_G(recursion_limit);
  622: 
  623: 	/* Calculate the size of the offsets array, and allocate memory for it. */
  624: 	rc = pcre_fullinfo(pce->re, extra, PCRE_INFO_CAPTURECOUNT, &num_subpats);
  625: 	if (rc < 0) {
  626: 		php_error_docref(NULL TSRMLS_CC, E_WARNING, "Internal pcre_fullinfo() error %d", rc);
  627: 		RETURN_FALSE;
  628: 	}
  629: 	num_subpats++;
  630: 	size_offsets = num_subpats * 3;
  631: 
  632: 	/*
  633: 	 * Build a mapping from subpattern numbers to their names. We will always
  634: 	 * allocate the table, even though there may be no named subpatterns. This
  635: 	 * avoids somewhat more complicated logic in the inner loops.
  636: 	 */
  637: 	subpat_names = make_subpats_table(num_subpats, pce TSRMLS_CC);
  638: 	if (!subpat_names) {
  639: 		RETURN_FALSE;
  640: 	}
  641: 
  642: 	offsets = (int *)safe_emalloc(size_offsets, sizeof(int), 0);
  643: 
  644: 	/* Allocate match sets array and initialize the values. */
  645: 	if (global && subpats && subpats_order == PREG_PATTERN_ORDER) {
  646: 		match_sets = (zval **)safe_emalloc(num_subpats, sizeof(zval *), 0);
  647: 		for (i=0; i<num_subpats; i++) {
  648: 			ALLOC_ZVAL(match_sets[i]);
  649: 			array_init(match_sets[i]);
  650: 			INIT_PZVAL(match_sets[i]);
  651: 		}
  652: 	}
  653: 
  654: 	matched = 0;
  655: 	PCRE_G(error_code) = PHP_PCRE_NO_ERROR;
  656: 	
  657: 	do {
  658: 		/* Execute the regular expression. */
  659: 		count = pcre_exec(pce->re, extra, subject, subject_len, start_offset,
  660: 						  exoptions|g_notempty, offsets, size_offsets);
  661: 
  662: 		/* the string was already proved to be valid UTF-8 */
  663: 		exoptions |= PCRE_NO_UTF8_CHECK;
  664: 
  665: 		/* Check for too many substrings condition. */
  666: 		if (count == 0) {
  667: 			php_error_docref(NULL TSRMLS_CC, E_NOTICE, "Matched, but too many substrings");
  668: 			count = size_offsets/3;
  669: 		}
  670: 
  671: 		/* If something has matched */
  672: 		if (count > 0) {
  673: 			matched++;
  674: 
  675: 			/* If subpatterns array has been passed, fill it in with values. */
  676: 			if (subpats != NULL) {
  677: 				/* Try to get the list of substrings and display a warning if failed. */
  678: 				if (pcre_get_substring_list(subject, offsets, count, &stringlist) < 0) {
  679: 					efree(subpat_names);
  680: 					efree(offsets);
  681: 					if (match_sets) efree(match_sets);
  682: 					php_error_docref(NULL TSRMLS_CC, E_WARNING, "Get subpatterns list failed");
  683: 					RETURN_FALSE;
  684: 				}
  685: 
  686: 				if (global) {	/* global pattern matching */
  687: 					if (subpats && subpats_order == PREG_PATTERN_ORDER) {
  688: 						/* For each subpattern, insert it into the appropriate array. */
  689: 						for (i = 0; i < count; i++) {
  690: 							if (offset_capture) {
  691: 								add_offset_pair(match_sets[i], (char *)stringlist[i],
  692: 												offsets[(i<<1)+1] - offsets[i<<1], offsets[i<<1], NULL);
  693: 							} else {
  694: 								add_next_index_stringl(match_sets[i], (char *)stringlist[i],
  695: 													   offsets[(i<<1)+1] - offsets[i<<1], 1);
  696: 							}
  697: 						}
  698: 						/*
  699: 						 * If the number of captured subpatterns on this run is
  700: 						 * less than the total possible number, pad the result
  701: 						 * arrays with empty strings.
  702: 						 */
  703: 						if (count < num_subpats) {
  704: 							for (; i < num_subpats; i++) {
  705: 								add_next_index_string(match_sets[i], "", 1);
  706: 							}
  707: 						}
  708: 					} else {
  709: 						/* Allocate the result set array */
  710: 						ALLOC_ZVAL(result_set);
  711: 						array_init(result_set);
  712: 						INIT_PZVAL(result_set);
  713: 						
  714: 						/* Add all the subpatterns to it */
  715: 						for (i = 0; i < count; i++) {
  716: 							if (offset_capture) {
  717: 								add_offset_pair(result_set, (char *)stringlist[i],
  718: 												offsets[(i<<1)+1] - offsets[i<<1], offsets[i<<1], subpat_names[i]);
  719: 							} else {
  720: 								if (subpat_names[i]) {
  721: 									add_assoc_stringl(result_set, subpat_names[i], (char *)stringlist[i],
  722: 														   offsets[(i<<1)+1] - offsets[i<<1], 1);
  723: 								}
  724: 								add_next_index_stringl(result_set, (char *)stringlist[i],
  725: 													   offsets[(i<<1)+1] - offsets[i<<1], 1);
  726: 							}
  727: 						}
  728: 						/* And add it to the output array */
  729: 						zend_hash_next_index_insert(Z_ARRVAL_P(subpats), &result_set, sizeof(zval *), NULL);
  730: 					}
  731: 				} else {			/* single pattern matching */
  732: 					/* For each subpattern, insert it into the subpatterns array. */
  733: 					for (i = 0; i < count; i++) {
  734: 						if (offset_capture) {
  735: 							add_offset_pair(subpats, (char *)stringlist[i],
  736: 											offsets[(i<<1)+1] - offsets[i<<1],
  737: 											offsets[i<<1], subpat_names[i]);
  738: 						} else {
  739: 							if (subpat_names[i]) {
  740: 								add_assoc_stringl(subpats, subpat_names[i], (char *)stringlist[i],
  741: 												  offsets[(i<<1)+1] - offsets[i<<1], 1);
  742: 							}
  743: 							add_next_index_stringl(subpats, (char *)stringlist[i],
  744: 												   offsets[(i<<1)+1] - offsets[i<<1], 1);
  745: 						}
  746: 					}
  747: 				}
  748: 
  749: 				pcre_free((void *) stringlist);
  750: 			}
  751: 		} else if (count == PCRE_ERROR_NOMATCH) {
  752: 			/* If we previously set PCRE_NOTEMPTY after a null match,
  753: 			   this is not necessarily the end. We need to advance
  754: 			   the start offset, and continue. Fudge the offset values
  755: 			   to achieve this, unless we're already at the end of the string. */
  756: 			if (g_notempty != 0 && start_offset < subject_len) {
  757: 				offsets[0] = start_offset;
  758: 				offsets[1] = start_offset + 1;
  759: 			} else
  760: 				break;
  761: 		} else {
  762: 			pcre_handle_exec_error(count TSRMLS_CC);
  763: 			break;
  764: 		}
  765: 		
  766: 		/* If we have matched an empty string, mimic what Perl's /g options does.
  767: 		   This turns out to be rather cunning. First we set PCRE_NOTEMPTY and try
  768: 		   the match again at the same point. If this fails (picked up above) we
  769: 		   advance to the next character. */
  770: 		g_notempty = (offsets[1] == offsets[0])? PCRE_NOTEMPTY | PCRE_ANCHORED : 0;
  771: 		
  772: 		/* Advance to the position right after the last full match */
  773: 		start_offset = offsets[1];
  774: 	} while (global);
  775: 
  776: 	/* Add the match sets to the output array and clean up */
  777: 	if (global && subpats && subpats_order == PREG_PATTERN_ORDER) {
  778: 		for (i = 0; i < num_subpats; i++) {
  779: 			if (subpat_names[i]) {
  780: 				zend_hash_update(Z_ARRVAL_P(subpats), subpat_names[i],
  781: 								 strlen(subpat_names[i])+1, &match_sets[i], sizeof(zval *), NULL);
  782: 				Z_ADDREF_P(match_sets[i]);
  783: 			}
  784: 			zend_hash_next_index_insert(Z_ARRVAL_P(subpats), &match_sets[i], sizeof(zval *), NULL);
  785: 		}
  786: 		efree(match_sets);
  787: 	}
  788: 	
  789: 	efree(offsets);
  790: 	efree(subpat_names);
  791: 
  792: 	/* Did we encounter an error? */
  793: 	if (PCRE_G(error_code) == PHP_PCRE_NO_ERROR) {
  794: 		RETVAL_LONG(matched);
  795: 	} else {
  796: 		RETVAL_FALSE;
  797: 	}
  798: }
  799: /* }}} */
  800: 
/* {{{ proto int preg_match(string pattern, string subject [, array &subpatterns [, int flags [, int offset]]])
   Perform a Perl-style regular expression match */
static PHP_FUNCTION(preg_match)
{
	/* global = 0: the match loop runs once. */
	php_do_pcre_match(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0);
}
/* }}} */
  808: 
/* {{{ proto int preg_match_all(string pattern, string subject [, array &subpatterns [, int flags [, int offset]]])
   Perform a Perl-style global regular expression match */
static PHP_FUNCTION(preg_match_all)
{
	/* global = 1: keep matching until no further match is found or an error occurs. */
	php_do_pcre_match(INTERNAL_FUNCTION_PARAM_PASSTHRU, 1);
}
/* }}} */
  816: 
  /* {{{ preg_get_backref
 *
 * Tries to parse a back-reference starting at **str. The caller positions
 * *str on the introducing character ('\\' or '$'). Accepted forms are the
 * introducer followed by one or two decimal digits, or "${" + one or two
 * digits + "}".
 *
 * On success the reference number is stored in *backref, *str is advanced
 * past the whole reference and 1 is returned. On failure 0 is returned and
 * *str is left untouched (*backref may already have been written).
 */
static int preg_get_backref(char **str, int *backref)
{
	char *cursor = *str;
	int   braced;

	/* An introducer that is the last character cannot start a reference. */
	if (cursor[1] == '\0') {
		return 0;
	}

	/* "${" opens the braced form; skip the introducer (and the brace). */
	braced = (cursor[0] == '$' && cursor[1] == '{');
	cursor += braced ? 2 : 1;

	/* First digit is mandatory. */
	if (*cursor < '0' || *cursor > '9') {
		return 0;
	}
	*backref = *cursor - '0';
	cursor++;

	/* Optional second digit. */
	if (*cursor >= '0' && *cursor <= '9') {
		*backref = *backref * 10 + *cursor - '0';
		cursor++;
	}

	/* The braced form must be closed right after the digits. */
	if (braced) {
		if (*cursor != '}') {
			return 0;
		}
		cursor++;
	}

	*str = cursor;
	return 1;
}
/* }}} */
  855: 
  856: /* {{{ preg_do_repl_func
  857:  */
  858: static int preg_do_repl_func(zval *function, char *subject, int *offsets, char **subpat_names, int count, char **result TSRMLS_DC)
  859: {
  860: 	zval		*retval_ptr;		/* Function return value */
  861: 	zval	   **args[1];			/* Argument to pass to function */
  862: 	zval		*subpats;			/* Captured subpatterns */ 
  863: 	int			 result_len;		/* Return value length */
  864: 	int			 i;
  865: 
  866: 	MAKE_STD_ZVAL(subpats);
  867: 	array_init(subpats);
  868: 	for (i = 0; i < count; i++) {
  869: 		if (subpat_names[i]) {
  870: 			add_assoc_stringl(subpats, subpat_names[i], &subject[offsets[i<<1]] , offsets[(i<<1)+1] - offsets[i<<1], 1);
  871: 		}
  872: 		add_next_index_stringl(subpats, &subject[offsets[i<<1]], offsets[(i<<1)+1] - offsets[i<<1], 1);
  873: 	}
  874: 	args[0] = &subpats;
  875: 
  876: 	if (call_user_function_ex(EG(function_table), NULL, function, &retval_ptr, 1, args, 0, NULL TSRMLS_CC) == SUCCESS && retval_ptr) {
  877: 		convert_to_string_ex(&retval_ptr);
  878: 		*result = estrndup(Z_STRVAL_P(retval_ptr), Z_STRLEN_P(retval_ptr));
  879: 		result_len = Z_STRLEN_P(retval_ptr);
  880: 		zval_ptr_dtor(&retval_ptr);
  881: 	} else {
  882: 		if (!EG(exception)) {
  883: 			php_error_docref(NULL TSRMLS_CC, E_WARNING, "Unable to call custom replacement function");
  884: 		}
  885: 		result_len = offsets[1] - offsets[0];
  886: 		*result = estrndup(&subject[offsets[0]], result_len);
  887: 	}
  888: 
  889: 	zval_ptr_dtor(&subpats);
  890: 
  891: 	return result_len;
  892: }
  893: /* }}} */
  894: 
  895: /* {{{ preg_do_eval
       *
       * Builds a code string from eval_str (eval_str_len bytes) in which every
       * back-reference recognised by preg_get_backref() is replaced by the
       * corresponding captured text from subject/offsets, passed through
       * php_addslashes(). A '\\' or '$' that is directly preceded by a backslash
       * is taken literally (the backslash is dropped). References to groups
       * >= count expand to nothing.
       *
       * The assembled code is run with zend_eval_stringl(); the value it yields
       * is converted to string, an estrndup()'ed copy is stored in *result and
       * its length is returned. An evaluation failure raises E_ERROR.
       *
       * NOTE(review): the evaluated code contains text taken from subject, so
       * its safety rests entirely on the php_addslashes() escaping and on how
       * the pattern author quotes the references -- confirm callers only use
       * this with trusted replacement strings.
  896:  */
  897: static int preg_do_eval(char *eval_str, int eval_str_len, char *subject,
  898: 						int *offsets, int count, char **result TSRMLS_DC)
  899: {
  900: 	zval		 retval;			/* Return value from evaluation */
  901: 	char		*eval_str_end,		/* End of eval string */
  902: 				*match,				/* Current match for a backref */
  903: 				*esc_match,			/* Quote-escaped match */
  904: 				*walk,				/* Used to walk the code string */
  905: 				*segment,			/* Start of segment to append while walking */
  906: 				 walk_last;			/* Last walked character */
  907: 	int			 match_len;			/* Length of the match */
  908: 	int			 esc_match_len;		/* Length of the quote-escaped match */
  909: 	int			 result_len;		/* Length of the result of the evaluation */
  910: 	int			 backref;			/* Current backref */
  911: 	char        *compiled_string_description;	/* Description string passed to zend_eval_stringl() */
  912: 	smart_str    code = {0};	/* Code string being assembled */
  913: 	
  914: 	eval_str_end = eval_str + eval_str_len;
  915: 	walk = segment = eval_str;
  916: 	walk_last = 0;
  917: 	
  918: 	while (walk < eval_str_end) {
  919: 		/* If found a backreference.. */
  920: 		if ('\\' == *walk || '$' == *walk) {
        			/* Flush the literal text collected since the last flush. */
  921: 			smart_str_appendl(&code, segment, walk - segment);
  922: 			if (walk_last == '\\') {
        				/* Escaped '\\' or '$': the backslash was just appended as the
        				   last byte of code, so overwrite it with the literal character. */
  923: 				code.c[code.len-1] = *walk++;
  924: 				segment = walk;
  925: 				walk_last = 0;
  926: 				continue;
  927: 			}
  928: 			segment = walk;
  929: 			if (preg_get_backref(&walk, &backref)) {
  930: 				if (backref < count) {
  931: 					/* Find the corresponding string match and substitute it
  932: 					   in instead of the backref */
  933: 					match = subject + offsets[backref<<1];
  934: 					match_len = offsets[(backref<<1)+1] - offsets[backref<<1];
  935: 					if (match_len) {
  936: 						esc_match = php_addslashes(match, match_len, &esc_match_len, 0 TSRMLS_CC);
  937: 					} else {
        						/* Empty capture: nothing to escape; esc_match_len of 0
        						   also keeps the efree() below from running. */
  938: 						esc_match = match;
  939: 						esc_match_len = 0;
  940: 					}
  941: 				} else {
        					/* Group number not available for this match: expand to nothing. */
  942: 					esc_match = "";
  943: 					esc_match_len = 0;
  944: 				}
  945: 				smart_str_appendl(&code, esc_match, esc_match_len);
  946: 
        				/* preg_get_backref() moved walk past the reference. */
  947: 				segment = walk;
  948: 
  949: 				/* Clean up and reassign */
  950: 				if (esc_match_len)
  951: 					efree(esc_match);
  952: 				continue;
  953: 			}
  954: 		}
  955: 		walk++;
  956: 		walk_last = walk[-1];
  957: 	}
        	/* Append whatever literal text follows the last reference and terminate. */
  958: 	smart_str_appendl(&code, segment, walk - segment);
  959: 	smart_str_0(&code);
  960: 
  961: 	compiled_string_description = zend_make_compiled_string_description("regexp code" TSRMLS_CC);
  962: 	/* Run the code */
  963: 	if (zend_eval_stringl(code.c, code.len, &retval, compiled_string_description TSRMLS_CC) == FAILURE) {
  964: 		efree(compiled_string_description);
  965: 		php_error_docref(NULL TSRMLS_CC,E_ERROR, "Failed evaluating code: %s%s", PHP_EOL, code.c);
  966: 		/* zend_error() does not return in this case */
  967: 	}
  968: 	efree(compiled_string_description);
  969: 	convert_to_string(&retval);
  970: 	
  971: 	/* Save the return value and its length */
  972: 	*result = estrndup(Z_STRVAL(retval), Z_STRLEN(retval));
  973: 	result_len = Z_STRLEN(retval);
  974: 	
  975: 	/* Clean up */
  976: 	zval_dtor(&retval);
  977: 	smart_str_free(&code);
  978: 	
  979: 	return result_len;
  980: }
  981: /* }}} */
  982: 
  983: /* {{{ php_pcre_replace
  984:  */
  985: PHPAPI char *php_pcre_replace(char *regex,   int regex_len,
  986: 							  char *subject, int subject_len,
  987: 							  zval *replace_val, int is_callable_replace,
  988: 							  int *result_len, int limit, int *replace_count TSRMLS_DC)
  989: {
  990: 	pcre_cache_entry	*pce;			    /* Compiled regular expression */
  991: 
  992: 	/* Compile regex or get it from cache. */
  993: 	if ((pce = pcre_get_compiled_regex_cache(regex, regex_len TSRMLS_CC)) == NULL) {
  994: 		return NULL;
  995: 	}
  996: 
  997: 	return php_pcre_replace_impl(pce, subject, subject_len, replace_val, 
  998: 		is_callable_replace, result_len, limit, replace_count TSRMLS_CC);
  999: }
 1000: /* }}} */
 1001: 
 1002: /* {{{ php_pcre_replace_impl() */
       /*
        * Applies the compiled pattern `pce` to `subject` and builds a new string in
        * which each match is replaced.
        *
        * The replacement text comes from one of three sources:
        *   - the pattern carries PREG_REPLACE_EVAL: replace_val's string is expanded
        *     and evaluated by preg_do_eval();
        *   - is_callable_replace is non-zero: replace_val is invoked through
        *     preg_do_repl_func();
        *   - otherwise replace_val's string is copied with back-references
        *     (as parsed by preg_get_backref()) expanded to the captured text.
        *
        * limit is the maximum number of replacements, -1 meaning unlimited.
        * *replace_count, when replace_count is non-NULL, is incremented once per
        * replacement. *result_len receives the length of the returned string.
        *
        * Returns an emalloc'ed, NUL-terminated buffer, or NULL on error (/e combined
        * with a callback, pcre_fullinfo() failure, make_subpats_table() failure, or a
        * pcre_exec() error handled by pcre_handle_exec_error()).
        *
        * Side effect: writes the configured backtrack/recursion limits into the
        * pcre_extra used for matching, which is pce->extra when that is non-NULL.
        */
 1003: PHPAPI char *php_pcre_replace_impl(pcre_cache_entry *pce, char *subject, int subject_len, zval *replace_val, 
 1004: 	int is_callable_replace, int *result_len, int limit, int *replace_count TSRMLS_DC)
 1005: {
 1006: 	pcre_extra		*extra = pce->extra;/* Holds results of studying */
 1007: 	pcre_extra		 extra_data;		/* Used locally for exec options */
 1008: 	int				 exoptions = 0;		/* Execution options */
 1009: 	int				 count = 0;			/* Count of matched subpatterns */
 1010: 	int				*offsets;			/* Array of subpattern offsets */
 1011: 	char 			**subpat_names;		/* Array for named subpatterns */
 1012: 	int				 num_subpats;		/* Number of captured subpatterns */
 1013: 	int				 size_offsets;		/* Size of the offsets array */
 1014: 	int				 new_len;			/* Length of needed storage */
 1015: 	int				 alloc_len;			/* Actual allocated length */
 1016: 	int				 eval_result_len=0;	/* Length of the eval'ed or
 1017: 										   function-returned string */
 1018: 	int				 match_len;			/* Length of the current match */
 1019: 	int				 backref;			/* Backreference number */
 1020: 	int				 eval;				/* If the replacement string should be eval'ed */
 1021: 	int				 start_offset;		/* Where the new search starts */
 1022: 	int				 g_notempty=0;		/* If the match should not be empty */
 1023: 	int				 replace_len=0;		/* Length of replacement string */
 1024: 	char			*result,			/* Result of replacement */
 1025: 					*replace=NULL,		/* Replacement string */
 1026: 					*new_buf,			/* Temporary buffer for re-allocation */
 1027: 					*walkbuf,			/* Location of current replacement in the result */
 1028: 					*walk,				/* Used to walk the replacement string */
 1029: 					*match,				/* The current match */
 1030: 					*piece,				/* The current piece of subject */
 1031: 					*replace_end=NULL,	/* End of replacement string */
 1032: 					*eval_result,		/* Result of eval or custom function */
 1033: 					 walk_last;			/* Last walked character */
 1034: 	int				 rc;
 1035: 
       	/* Without study data, use a local pcre_extra that only carries the limits. */
 1036: 	if (extra == NULL) {
 1037: 		extra_data.flags = PCRE_EXTRA_MATCH_LIMIT | PCRE_EXTRA_MATCH_LIMIT_RECURSION;
 1038: 		extra = &extra_data;
 1039: 	}
 1040: 	extra->match_limit = PCRE_G(backtrack_limit);
 1041: 	extra->match_limit_recursion = PCRE_G(recursion_limit);
 1042: 
 1043: 	eval = pce->preg_options & PREG_REPLACE_EVAL;
 1044: 	if (is_callable_replace) {
 1045: 		if (eval) {
 1046: 			php_error_docref(NULL TSRMLS_CC, E_WARNING, "Modifier /e cannot be used with replacement callback");
 1047: 			return NULL;
 1048: 		}
 1049: 	} else {
       		/* String replacement: replace_val is read as a string zval here. */
 1050: 		replace = Z_STRVAL_P(replace_val);
 1051: 		replace_len = Z_STRLEN_P(replace_val);
 1052: 		replace_end = replace + replace_len;
 1053: 	}
 1054: 
 1055: 	/* Calculate the size of the offsets array, and allocate memory for it. */
 1056: 	rc = pcre_fullinfo(pce->re, extra, PCRE_INFO_CAPTURECOUNT, &num_subpats);
 1057: 	if (rc < 0) {
 1058: 		php_error_docref(NULL TSRMLS_CC, E_WARNING, "Internal pcre_fullinfo() error %d", rc);
 1059: 		return NULL;
 1060: 	}
       	/* One extra slot for the whole match; three ints per slot are handed to pcre_exec(). */
 1061: 	num_subpats++;
 1062: 	size_offsets = num_subpats * 3;
 1063: 
 1064: 	/*
 1065: 	 * Build a mapping from subpattern numbers to their names. We will always
 1066: 	 * allocate the table, even though there may be no named subpatterns. This
 1067: 	 * avoids somewhat more complicated logic in the inner loops.
 1068: 	 */
 1069: 	subpat_names = make_subpats_table(num_subpats, pce TSRMLS_CC);
 1070: 	if (!subpat_names) {
 1071: 		return NULL;
 1072: 	}
 1073: 
 1074: 	offsets = (int *)safe_emalloc(size_offsets, sizeof(int), 0);
 1075: 	
       	/* Initial guess for the output size; the buffer is regrown below when needed. */
 1076: 	alloc_len = 2 * subject_len + 1;
 1077: 	result = safe_emalloc(alloc_len, sizeof(char), 0);
 1078: 
 1079: 	/* Initialize */
 1080: 	match = NULL;
 1081: 	*result_len = 0;
 1082: 	start_offset = 0;
 1083: 	PCRE_G(error_code) = PHP_PCRE_NO_ERROR;
 1084: 	
 1085: 	while (1) {
 1086: 		/* Execute the regular expression. */
 1087: 		count = pcre_exec(pce->re, extra, subject, subject_len, start_offset,
 1088: 						  exoptions|g_notempty, offsets, size_offsets);
 1089: 
 1090: 		/* the string was already proved to be valid UTF-8 */
 1091: 		exoptions |= PCRE_NO_UTF8_CHECK;
 1092: 
 1093: 		/* Check for too many substrings condition. */
 1094: 		if (count == 0) {
 1095: 			php_error_docref(NULL TSRMLS_CC,E_NOTICE, "Matched, but too many substrings");
 1096: 			count = size_offsets/3;
 1097: 		}
 1098: 
       		/* Unconsumed part of subject starts here. */
 1099: 		piece = subject + start_offset;
 1100: 
 1101: 		if (count > 0 && (limit == -1 || limit > 0)) {
 1102: 			if (replace_count) {
 1103: 				++*replace_count;
 1104: 			}
 1105: 			/* Set the match location in subject */
 1106: 			match = subject + offsets[0];
 1107: 
 1108: 			new_len = *result_len + offsets[0] - start_offset; /* part before the match */
 1109: 			
 1110: 			/* If evaluating, do it and add the return string's length */
 1111: 			if (eval) {
 1112: 				eval_result_len = preg_do_eval(replace, replace_len, subject,
 1113: 											   offsets, count, &eval_result TSRMLS_CC);
 1114: 				new_len += eval_result_len;
 1115: 			} else if (is_callable_replace) {
 1116: 				/* Use custom function to get replacement string and its length. */
 1117: 				eval_result_len = preg_do_repl_func(replace_val, subject, offsets, subpat_names, count, &eval_result TSRMLS_CC);
 1118: 				new_len += eval_result_len;
 1119: 			} else { /* do regular substitution */
       				/* First pass over the replacement string: only measure how many
       				   bytes the expanded replacement needs. */
 1120: 				walk = replace;
 1121: 				walk_last = 0;
 1122: 				while (walk < replace_end) {
 1123: 					if ('\\' == *walk || '$' == *walk) {
 1124: 						if (walk_last == '\\') {
       							/* Escaped character: it takes the place of the backslash
       							   that was already counted, so the length is unchanged. */
 1125: 							walk++;
 1126: 							walk_last = 0;
 1127: 							continue;
 1128: 						}
 1129: 						if (preg_get_backref(&walk, &backref)) {
 1130: 							if (backref < count)
 1131: 								new_len += offsets[(backref<<1)+1] - offsets[backref<<1];
 1132: 							continue;
 1133: 						}
 1134: 					}
 1135: 					new_len++;
 1136: 					walk++;
 1137: 					walk_last = walk[-1];
 1138: 				}
 1139: 			}
 1140: 
       			/* Grow the output buffer (with head-room) if this replacement plus
       			   a terminating NUL would not fit. */
 1141: 			if (new_len + 1 > alloc_len) {
 1142: 				alloc_len = 1 + alloc_len + 2 * new_len;
 1143: 				new_buf = emalloc(alloc_len);
 1144: 				memcpy(new_buf, result, *result_len);
 1145: 				efree(result);
 1146: 				result = new_buf;
 1147: 			}
 1148: 			/* copy the part of the string before the match */
 1149: 			memcpy(&result[*result_len], piece, match-piece);
 1150: 			*result_len += match-piece;
 1151: 
 1152: 			/* copy replacement and backrefs */
 1153: 			walkbuf = result + *result_len;
 1154: 			
 1155: 			/* If evaluating or using custom function, copy result to the buffer
 1156: 			 * and clean up. */
 1157: 			if (eval || is_callable_replace) {
 1158: 				memcpy(walkbuf, eval_result, eval_result_len);
 1159: 				*result_len += eval_result_len;
 1160: 				STR_FREE(eval_result);
 1161: 			} else { /* do regular backreference copying */
       				/* Second pass: same walk as the measuring pass above, but now
       				   writing the bytes into the output. */
 1162: 				walk = replace;
 1163: 				walk_last = 0;
 1164: 				while (walk < replace_end) {
 1165: 					if ('\\' == *walk || '$' == *walk) {
 1166: 						if (walk_last == '\\') {
       							/* Overwrite the backslash copied in the previous step. */
 1167: 							*(walkbuf-1) = *walk++;
 1168: 							walk_last = 0;
 1169: 							continue;
 1170: 						}
 1171: 						if (preg_get_backref(&walk, &backref)) {
 1172: 							if (backref < count) {
 1173: 								match_len = offsets[(backref<<1)+1] - offsets[backref<<1];
 1174: 								memcpy(walkbuf, subject + offsets[backref<<1], match_len);
 1175: 								walkbuf += match_len;
 1176: 							}
 1177: 							continue;
 1178: 						}
 1179: 					}
 1180: 					*walkbuf++ = *walk++;
 1181: 					walk_last = walk[-1];
 1182: 				}
 1183: 				*walkbuf = '\0';
 1184: 				/* increment the result length by how much we've added to the string */
 1185: 				*result_len += walkbuf - (result + *result_len);
 1186: 			}
 1187: 
 1188: 			if (limit != -1)
 1189: 				limit--;
 1190: 
 1191: 		} else if (count == PCRE_ERROR_NOMATCH || limit == 0) {
 1192: 			/* If we previously set PCRE_NOTEMPTY after a null match,
 1193: 			   this is not necessarily the end. We need to advance
 1194: 			   the start offset, and continue. Fudge the offset values
 1195: 			   to achieve this, unless we're already at the end of the string. */
 1196: 			if (g_notempty != 0 && start_offset < subject_len) {
       				/* NOTE(review): this steps forward by exactly one byte and copies
       				   it without a capacity check. php_pcre_split_impl() treats
       				   PCRE_UTF8 patterns specially at the equivalent point -- confirm
       				   whether multi-byte characters need the same care here. */
 1197: 				offsets[0] = start_offset;
 1198: 				offsets[1] = start_offset + 1;
 1199: 				memcpy(&result[*result_len], piece, 1);
 1200: 				(*result_len)++;
 1201: 			} else {
       				/* No further replacements: append the rest of subject and stop. */
 1202: 				new_len = *result_len + subject_len - start_offset;
 1203: 				if (new_len + 1 > alloc_len) {
 1204: 					alloc_len = new_len + 1; /* now we know exactly how long it is */
 1205: 					new_buf = safe_emalloc(alloc_len, sizeof(char), 0);
 1206: 					memcpy(new_buf, result, *result_len);
 1207: 					efree(result);
 1208: 					result = new_buf;
 1209: 				}
 1210: 				/* stick that last bit of string on our output */
 1211: 				memcpy(&result[*result_len], piece, subject_len - start_offset);
 1212: 				*result_len += subject_len - start_offset;
 1213: 				result[*result_len] = '\0';
 1214: 				break;
 1215: 			}
 1216: 		} else {
       			/* Any other negative pcre_exec() result: record the error and
       			   discard the partial output. */
 1217: 			pcre_handle_exec_error(count TSRMLS_CC);
 1218: 			efree(result);
 1219: 			result = NULL;
 1220: 			break;
 1221: 		}
 1222: 			
 1223: 		/* If we have matched an empty string, mimic what Perl's /g options does.
 1224: 		   This turns out to be rather cunning. First we set PCRE_NOTEMPTY and try
 1225: 		   the match again at the same point. If this fails (picked up above) we
 1226: 		   advance to the next character. */
 1227: 		g_notempty = (offsets[1] == offsets[0])? PCRE_NOTEMPTY | PCRE_ANCHORED : 0;
 1228: 		
 1229: 		/* Advance to the next piece. */
 1230: 		start_offset = offsets[1];
 1231: 	}
 1232: 
 1233: 	efree(offsets);
 1234: 	efree(subpat_names);
 1235: 
 1236: 	return result;
 1237: }
 1238: /* }}} */
 1239: 
 1240: /* {{{ php_replace_in_subject
 1241:  */
 1242: static char *php_replace_in_subject(zval *regex, zval *replace, zval **subject, int *result_len, int limit, int is_callable_replace, int *replace_count TSRMLS_DC)
 1243: {
 1244: 	zval		**regex_entry,
 1245: 				**replace_entry = NULL,
 1246: 				 *replace_value,
 1247: 				  empty_replace;
 1248: 	char		*subject_value,
 1249: 				*result;
 1250: 	int			 subject_len;
 1251: 
 1252: 	/* Make sure we're dealing with strings. */	
 1253: 	convert_to_string_ex(subject);
 1254: 	/* FIXME: This might need to be changed to STR_EMPTY_ALLOC(). Check if this zval could be dtor()'ed somehow */
 1255: 	ZVAL_STRINGL(&empty_replace, "", 0, 0);
 1256: 	
 1257: 	/* If regex is an array */
 1258: 	if (Z_TYPE_P(regex) == IS_ARRAY) {
 1259: 		/* Duplicate subject string for repeated replacement */
 1260: 		subject_value = estrndup(Z_STRVAL_PP(subject), Z_STRLEN_PP(subject));
 1261: 		subject_len = Z_STRLEN_PP(subject);
 1262: 		*result_len = subject_len;
 1263: 		
 1264: 		zend_hash_internal_pointer_reset(Z_ARRVAL_P(regex));
 1265: 
 1266: 		replace_value = replace;
 1267: 		if (Z_TYPE_P(replace) == IS_ARRAY && !is_callable_replace)
 1268: 			zend_hash_internal_pointer_reset(Z_ARRVAL_P(replace));
 1269: 
 1270: 		/* For each entry in the regex array, get the entry */
 1271: 		while (zend_hash_get_current_data(Z_ARRVAL_P(regex), (void **)&regex_entry) == SUCCESS) {
 1272: 			/* Make sure we're dealing with strings. */	
 1273: 			convert_to_string_ex(regex_entry);
 1274: 		
 1275: 			/* If replace is an array and not a callable construct */
 1276: 			if (Z_TYPE_P(replace) == IS_ARRAY && !is_callable_replace) {
 1277: 				/* Get current entry */
 1278: 				if (zend_hash_get_current_data(Z_ARRVAL_P(replace), (void **)&replace_entry) == SUCCESS) {
 1279: 					if (!is_callable_replace) {
 1280: 						convert_to_string_ex(replace_entry);
 1281: 					}
 1282: 					replace_value = *replace_entry;
 1283: 					zend_hash_move_forward(Z_ARRVAL_P(replace));
 1284: 				} else {
 1285: 					/* We've run out of replacement strings, so use an empty one */
 1286: 					replace_value = &empty_replace;
 1287: 				}
 1288: 			}
 1289: 			
 1290: 			/* Do the actual replacement and put the result back into subject_value
 1291: 			   for further replacements. */
 1292: 			if ((result = php_pcre_replace(Z_STRVAL_PP(regex_entry),
 1293: 										   Z_STRLEN_PP(regex_entry),
 1294: 										   subject_value,
 1295: 										   subject_len,
 1296: 										   replace_value,
 1297: 										   is_callable_replace,
 1298: 										   result_len,
 1299: 										   limit,
 1300: 										   replace_count TSRMLS_CC)) != NULL) {
 1301: 				efree(subject_value);
 1302: 				subject_value = result;
 1303: 				subject_len = *result_len;
 1304: 			} else {
 1305: 				efree(subject_value);
 1306: 				return NULL;
 1307: 			}
 1308: 
 1309: 			zend_hash_move_forward(Z_ARRVAL_P(regex));
 1310: 		}
 1311: 
 1312: 		return subject_value;
 1313: 	} else {
 1314: 		result = php_pcre_replace(Z_STRVAL_P(regex),
 1315: 								  Z_STRLEN_P(regex),
 1316: 								  Z_STRVAL_PP(subject),
 1317: 								  Z_STRLEN_PP(subject),
 1318: 								  replace,
 1319: 								  is_callable_replace,
 1320: 								  result_len,
 1321: 								  limit,
 1322: 								  replace_count TSRMLS_CC);
 1323: 		return result;
 1324: 	}
 1325: }
 1326: /* }}} */
 1327: 
 1328: /* {{{ preg_replace_impl
 1329:  */
 1330: static void preg_replace_impl(INTERNAL_FUNCTION_PARAMETERS, int is_callable_replace, int is_filter)
 1331: {
 1332: 	zval		   **regex,
 1333: 				   **replace,
 1334: 				   **subject,
 1335: 				   **subject_entry,
 1336: 				   **zcount = NULL;
 1337: 	char			*result;
 1338: 	int				 result_len;
 1339: 	int				 limit_val = -1;
 1340: 	long			limit = -1;
 1341: 	char			*string_key;
 1342: 	ulong			 num_key;
 1343: 	char			*callback_name;
 1344: 	int				 replace_count=0, old_replace_count;
 1345: 	
 1346: 	/* Get function parameters and do error-checking. */
 1347: 	if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ZZZ|lZ", &regex, &replace, &subject, &limit, &zcount) == FAILURE) {
 1348: 		return;
 1349: 	}
 1350: 	
 1351: 	if (!is_callable_replace && Z_TYPE_PP(replace) == IS_ARRAY && Z_TYPE_PP(regex) != IS_ARRAY) {
 1352: 		php_error_docref(NULL TSRMLS_CC, E_WARNING, "Parameter mismatch, pattern is a string while replacement is an array");
 1353: 		RETURN_FALSE;
 1354: 	}
 1355: 
 1356: 	SEPARATE_ZVAL(replace);
 1357: 	if (Z_TYPE_PP(replace) != IS_ARRAY && (Z_TYPE_PP(replace) != IS_OBJECT || !is_callable_replace)) {
 1358: 		convert_to_string_ex(replace);
 1359: 	}
 1360: 	if (is_callable_replace) {
 1361: 		if (!zend_is_callable(*replace, 0, &callback_name TSRMLS_CC)) {
 1362: 			php_error_docref(NULL TSRMLS_CC, E_WARNING, "Requires argument 2, '%s', to be a valid callback", callback_name);
 1363: 			efree(callback_name);
 1364: 			MAKE_COPY_ZVAL(subject, return_value);
 1365: 			return;
 1366: 		}
 1367: 		efree(callback_name);
 1368: 	}
 1369: 
 1370: 	SEPARATE_ZVAL(regex);
 1371: 	SEPARATE_ZVAL(subject);
 1372: 
 1373: 	if (ZEND_NUM_ARGS() > 3) {
 1374: 		limit_val = limit;
 1375: 	}
 1376: 		
 1377: 	if (Z_TYPE_PP(regex) != IS_ARRAY)
 1378: 		convert_to_string_ex(regex);
 1379: 	
 1380: 	/* if subject is an array */
 1381: 	if (Z_TYPE_PP(subject) == IS_ARRAY) {
 1382: 		array_init(return_value);
 1383: 		zend_hash_internal_pointer_reset(Z_ARRVAL_PP(subject));
 1384: 
 1385: 		/* For each subject entry, convert it to string, then perform replacement
 1386: 		   and add the result to the return_value array. */
 1387: 		while (zend_hash_get_current_data(Z_ARRVAL_PP(subject), (void **)&subject_entry) == SUCCESS) {
 1388: 			SEPARATE_ZVAL(subject_entry);
 1389: 			old_replace_count = replace_count;
 1390: 			if ((result = php_replace_in_subject(*regex, *replace, subject_entry, &result_len, limit_val, is_callable_replace, &replace_count TSRMLS_CC)) != NULL) {
 1391: 				if (!is_filter || replace_count > old_replace_count) {
 1392: 					/* Add to return array */
 1393: 					switch(zend_hash_get_current_key(Z_ARRVAL_PP(subject), &string_key, &num_key, 0))
 1394: 					{
 1395: 					case HASH_KEY_IS_STRING:
 1396: 						add_assoc_stringl(return_value, string_key, result, result_len, 0);
 1397: 						break;
 1398: 
 1399: 					case HASH_KEY_IS_LONG:
 1400: 						add_index_stringl(return_value, num_key, result, result_len, 0);
 1401: 						break;
 1402: 					}
 1403: 				} else {
 1404: 					efree(result);
 1405: 				}
 1406: 			}
 1407: 		
 1408: 			zend_hash_move_forward(Z_ARRVAL_PP(subject));
 1409: 		}
 1410: 	} else {	/* if subject is not an array */
 1411: 		old_replace_count = replace_count;
 1412: 		if ((result = php_replace_in_subject(*regex, *replace, subject, &result_len, limit_val, is_callable_replace, &replace_count TSRMLS_CC)) != NULL) {
 1413: 			if (!is_filter || replace_count > old_replace_count) {
 1414: 				RETVAL_STRINGL(result, result_len, 0);
 1415: 			} else {
 1416: 				efree(result);
 1417: 			}
 1418: 		}
 1419: 	}
 1420: 	if (ZEND_NUM_ARGS() > 4) {
 1421: 		zval_dtor(*zcount);
 1422: 		ZVAL_LONG(*zcount, replace_count);
 1423: 	}
 1424:     	
 1425: }
 1426: /* }}} */
 1427: 
 1428: /* {{{ proto mixed preg_replace(mixed regex, mixed replace, mixed subject [, int limit [, int &count]])
 1429:    Perform Perl-style regular expression replacement. */
 1430: static PHP_FUNCTION(preg_replace)
 1431: {
 1432: 	preg_replace_impl(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0, 0);
 1433: }
 1434: /* }}} */
 1435: 
 1436: /* {{{ proto mixed preg_replace_callback(mixed regex, mixed callback, mixed subject [, int limit [, int &count]])
 1437:    Perform Perl-style regular expression replacement using replacement callback. */
 1438: static PHP_FUNCTION(preg_replace_callback)
 1439: {
 1440: 	preg_replace_impl(INTERNAL_FUNCTION_PARAM_PASSTHRU, 1, 0);
 1441: }
 1442: /* }}} */
 1443: 
 1444: /* {{{ proto mixed preg_filter(mixed regex, mixed replace, mixed subject [, int limit [, int &count]])
 1445:    Perform Perl-style regular expression replacement and only return matches. */
 1446: static PHP_FUNCTION(preg_filter)
 1447: {
 1448: 	preg_replace_impl(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0, 1);
 1449: }
 1450: /* }}} */
 1451: 
 1452: /* {{{ proto array preg_split(string pattern, string subject [, int limit [, int flags]]) 
 1453:    Split string into an array using a perl-style regular expression as a delimiter */
 1454: static PHP_FUNCTION(preg_split)
 1455: {
 1456: 	char				*regex;			/* Regular expression */
 1457: 	char				*subject;		/* String to match against */
 1458: 	int					 regex_len;
 1459: 	int					 subject_len;
 1460: 	long				 limit_val = -1;/* Integer value of limit */
 1461: 	long				 flags = 0;		/* Match control flags */
 1462: 	pcre_cache_entry	*pce;			/* Compiled regular expression */
 1463: 
 1464: 	/* Get function parameters and do error checking */	
 1465: 	if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ss|ll", &regex, &regex_len,
 1466: 							  &subject, &subject_len, &limit_val, &flags) == FAILURE) {
 1467: 		RETURN_FALSE;
 1468: 	}
 1469: 	
 1470: 	/* Compile regex or get it from cache. */
 1471: 	if ((pce = pcre_get_compiled_regex_cache(regex, regex_len TSRMLS_CC)) == NULL) {
 1472: 		RETURN_FALSE;
 1473: 	}
 1474: 
 1475: 	php_pcre_split_impl(pce, subject, subject_len, return_value, limit_val, flags TSRMLS_CC);
 1476: }
 1477: /* }}} */
 1478: 
 1479: /* {{{ php_pcre_split
 1480:  */
 1481: PHPAPI void php_pcre_split_impl(pcre_cache_entry *pce, char *subject, int subject_len, zval *return_value,
 1482: 	long limit_val, long flags TSRMLS_DC)
 1483: {
 1484: 	pcre_extra		*extra = NULL;		/* Holds results of studying */
 1485: 	pcre			*re_bump = NULL;	/* Regex instance for empty matches */
 1486: 	pcre_extra		*extra_bump = NULL;	/* Almost dummy */
 1487: 	pcre_extra		 extra_data;		/* Used locally for exec options */
 1488: 	int				*offsets;			/* Array of subpattern offsets */
 1489: 	int				 size_offsets;		/* Size of the offsets array */
 1490: 	int				 exoptions = 0;		/* Execution options */
 1491: 	int				 count = 0;			/* Count of matched subpatterns */
 1492: 	int				 start_offset;		/* Where the new search starts */
 1493: 	int				 next_offset;		/* End of the last delimiter match + 1 */
 1494: 	int				 g_notempty = 0;	/* If the match should not be empty */
 1495: 	char			*last_match;		/* Location of last match */
 1496: 	int				 rc;
 1497: 	int				 no_empty;			/* If NO_EMPTY flag is set */
 1498: 	int				 delim_capture; 	/* If delimiters should be captured */
 1499: 	int				 offset_capture;	/* If offsets should be captured */
 1500: 
 1501: 	no_empty = flags & PREG_SPLIT_NO_EMPTY;
 1502: 	delim_capture = flags & PREG_SPLIT_DELIM_CAPTURE;
 1503: 	offset_capture = flags & PREG_SPLIT_OFFSET_CAPTURE;
 1504: 	
 1505: 	if (limit_val == 0) {
 1506: 		limit_val = -1;
 1507: 	}
 1508: 
 1509: 	if (extra == NULL) {
 1510: 		extra_data.flags = PCRE_EXTRA_MATCH_LIMIT | PCRE_EXTRA_MATCH_LIMIT_RECURSION;
 1511: 		extra = &extra_data;
 1512: 	}
 1513: 	extra->match_limit = PCRE_G(backtrack_limit);
 1514: 	extra->match_limit_recursion = PCRE_G(recursion_limit);
 1515: 	
 1516: 	/* Initialize return value */
 1517: 	array_init(return_value);
 1518: 
 1519: 	/* Calculate the size of the offsets array, and allocate memory for it. */
 1520: 	rc = pcre_fullinfo(pce->re, extra, PCRE_INFO_CAPTURECOUNT, &size_offsets);
 1521: 	if (rc < 0) {
 1522: 		php_error_docref(NULL TSRMLS_CC, E_WARNING, "Internal pcre_fullinfo() error %d", rc);
 1523: 		RETURN_FALSE;
 1524: 	}
 1525: 	size_offsets = (size_offsets + 1) * 3;
 1526: 	offsets = (int *)safe_emalloc(size_offsets, sizeof(int), 0);
 1527: 	
 1528: 	/* Start at the beginning of the string */
 1529: 	start_offset = 0;
 1530: 	next_offset = 0;
 1531: 	last_match = subject;
 1532: 	PCRE_G(error_code) = PHP_PCRE_NO_ERROR;
 1533: 	
 1534: 	/* Get next piece if no limit or limit not yet reached and something matched*/
 1535: 	while ((limit_val == -1 || limit_val > 1)) {
 1536: 		count = pcre_exec(pce->re, extra, subject,
 1537: 						  subject_len, start_offset,
 1538: 						  exoptions|g_notempty, offsets, size_offsets);
 1539: 
 1540: 		/* the string was already proved to be valid UTF-8 */
 1541: 		exoptions |= PCRE_NO_UTF8_CHECK;
 1542: 
 1543: 		/* Check for too many substrings condition. */
 1544: 		if (count == 0) {
 1545: 			php_error_docref(NULL TSRMLS_CC,E_NOTICE, "Matched, but too many substrings");
 1546: 			count = size_offsets/3;
 1547: 		}
 1548: 				
 1549: 		/* If something matched */
 1550: 		if (count > 0) {
 1551: 			if (!no_empty || &subject[offsets[0]] != last_match) {
 1552: 
 1553: 				if (offset_capture) {
 1554: 					/* Add (match, offset) pair to the return value */
 1555: 					add_offset_pair(return_value, last_match, &subject[offsets[0]]-last_match, next_offset, NULL);
 1556: 				} else {
 1557: 					/* Add the piece to the return value */
 1558: 					add_next_index_stringl(return_value, last_match,
 1559: 								   	   &subject[offsets[0]]-last_match, 1);
 1560: 				}
 1561: 
 1562: 				/* One less left to do */
 1563: 				if (limit_val != -1)
 1564: 					limit_val--;
 1565: 			}
 1566: 			
 1567: 			last_match = &subject[offsets[1]];
 1568: 			next_offset = offsets[1];
 1569: 
 1570: 			if (delim_capture) {
 1571: 				int i, match_len;
 1572: 				for (i = 1; i < count; i++) {
 1573: 					match_len = offsets[(i<<1)+1] - offsets[i<<1];
 1574: 					/* If we have matched a delimiter */
 1575: 					if (!no_empty || match_len > 0) {
 1576: 						if (offset_capture) {
 1577: 							add_offset_pair(return_value, &subject[offsets[i<<1]], match_len, offsets[i<<1], NULL);
 1578: 						} else {
 1579: 							add_next_index_stringl(return_value,
 1580: 												   &subject[offsets[i<<1]],
 1581: 												   match_len, 1);
 1582: 						}
 1583: 					}
 1584: 				}
 1585: 			}
 1586: 		} else if (count == PCRE_ERROR_NOMATCH) {
 1587: 			/* If we previously set PCRE_NOTEMPTY after a null match,
 1588: 			   this is not necessarily the end. We need to advance
 1589: 			   the start offset, and continue. Fudge the offset values
 1590: 			   to achieve this, unless we're already at the end of the string. */
 1591: 			if (g_notempty != 0 && start_offset < subject_len) {
 1592: 				if (pce->compile_options & PCRE_UTF8) {
 1593: 					if (re_bump == NULL) {
 1594: 						int dummy;
 1595: 
 1596: 						if ((re_bump = pcre_get_compiled_regex("/./us", &extra_bump, &dummy TSRMLS_CC)) == NULL) {
 1597: 							RETURN_FALSE;
 1598: 						}
 1599: 					}
 1600: 					count = pcre_exec(re_bump, extra_bump, subject,
 1601: 							  subject_len, start_offset,
 1602: 							  exoptions, offsets, size_offsets);
 1603: 					if (count < 1) {
 1604: 						php_error_docref(NULL TSRMLS_CC, E_WARNING, "Unknown error");
 1605: 						RETURN_FALSE;
 1606: 					}
 1607: 				} else {
 1608: 					offsets[0] = start_offset;
 1609: 					offsets[1] = start_offset + 1;
 1610: 				}
 1611: 			} else
 1612: 				break;
 1613: 		} else {
 1614: 			pcre_handle_exec_error(count TSRMLS_CC);
 1615: 			break;
 1616: 		}
 1617: 
 1618: 		/* If we have matched an empty string, mimic what Perl's /g options does.
 1619: 		   This turns out to be rather cunning. First we set PCRE_NOTEMPTY and try
 1620: 		   the match again at the same point. If this fails (picked up above) we
 1621: 		   advance to the next character. */
 1622: 		g_notempty = (offsets[1] == offsets[0])? PCRE_NOTEMPTY | PCRE_ANCHORED : 0;
 1623: 		
 1624: 		/* Advance to the position right after the last full match */
 1625: 		start_offset = offsets[1];
 1626: 	}
 1627: 
 1628: 
 1629: 	start_offset = last_match - subject; /* the offset might have been incremented, but without further successful matches */
 1630: 
 1631: 	if (!no_empty || start_offset < subject_len)
 1632: 	{
 1633: 		if (offset_capture) {
 1634: 			/* Add the last (match, offset) pair to the return value */
 1635: 			add_offset_pair(return_value, &subject[start_offset], subject_len - start_offset, start_offset, NULL);
 1636: 		} else {
 1637: 			/* Add the last piece to the return value */
 1638: 			add_next_index_stringl(return_value, last_match, subject + subject_len - last_match, 1);
 1639: 		}
 1640: 	}
 1641: 
 1642: 	
 1643: 	/* Clean up */
 1644: 	efree(offsets);
 1645: }
 1646: /* }}} */
 1647: 
 1648: /* {{{ proto string preg_quote(string str [, string delim_char])
 1649:    Quote regular expression characters plus an optional character */
 1650: static PHP_FUNCTION(preg_quote)
 1651: {
 1652: 	int		 in_str_len;
 1653: 	char	*in_str;		/* Input string argument */
 1654: 	char	*in_str_end;    /* End of the input string */
 1655: 	int		 delim_len = 0;
 1656: 	char	*delim = NULL;	/* Additional delimiter argument */
 1657: 	char	*out_str,		/* Output string with quoted characters */
 1658: 		 	*p,				/* Iterator for input string */
 1659: 			*q,				/* Iterator for output string */
 1660: 			 delim_char=0,	/* Delimiter character to be quoted */
 1661: 			 c;				/* Current character */
 1662: 	zend_bool quote_delim = 0; /* Whether to quote additional delim char */
 1663: 	
 1664: 	/* Get the arguments and check for errors */
 1665: 	if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s|s", &in_str, &in_str_len,
 1666: 							  &delim, &delim_len) == FAILURE) {
 1667: 		return;
 1668: 	}
 1669: 	
 1670: 	in_str_end = in_str + in_str_len;
 1671: 
 1672: 	/* Nothing to do if we got an empty string */
 1673: 	if (in_str == in_str_end) {
 1674: 		RETURN_EMPTY_STRING();
 1675: 	}
 1676: 
 1677: 	if (delim && *delim) {
 1678: 		delim_char = delim[0];
 1679: 		quote_delim = 1;
 1680: 	}
 1681: 	
 1682: 	/* Allocate enough memory so that even if each character
 1683: 	   is quoted, we won't run out of room */
 1684: 	out_str = safe_emalloc(4, in_str_len, 1);
 1685: 	
 1686: 	/* Go through the string and quote necessary characters */
 1687: 	for(p = in_str, q = out_str; p != in_str_end; p++) {
 1688: 		c = *p;
 1689: 		switch(c) {
 1690: 			case '.':
 1691: 			case '\\':
 1692: 			case '+':
 1693: 			case '*':
 1694: 			case '?':
 1695: 			case '[':
 1696: 			case '^':
 1697: 			case ']':
 1698: 			case '$':
 1699: 			case '(':
 1700: 			case ')':
 1701: 			case '{':
 1702: 			case '}':
 1703: 			case '=':
 1704: 			case '!':
 1705: 			case '>':
 1706: 			case '<':
 1707: 			case '|':
 1708: 			case ':':
 1709: 			case '-':
 1710: 				*q++ = '\\';
 1711: 				*q++ = c;
 1712: 				break;
 1713: 
 1714: 			case '\0':
 1715: 				*q++ = '\\';
 1716: 				*q++ = '0';
 1717: 				*q++ = '0';
 1718: 				*q++ = '0';
 1719: 				break;
 1720: 
 1721: 			default:
 1722: 				if (quote_delim && c == delim_char)
 1723: 					*q++ = '\\';
 1724: 				*q++ = c;
 1725: 				break;
 1726: 		}
 1727: 	}
 1728: 	*q = '\0';
 1729: 	
 1730: 	/* Reallocate string and return it */
 1731: 	RETVAL_STRINGL(erealloc(out_str, q - out_str + 1), q - out_str, 0);
 1732: }
 1733: /* }}} */
 1734: 
 1735: /* {{{ proto array preg_grep(string regex, array input [, int flags])
 1736:    Searches array and returns entries which match regex */
 1737: static PHP_FUNCTION(preg_grep)
 1738: {
 1739: 	char				*regex;			/* Regular expression */
 1740: 	int				 	 regex_len;
 1741: 	zval				*input;			/* Input array */
 1742: 	long				 flags = 0;		/* Match control flags */
 1743: 	pcre_cache_entry	*pce;			/* Compiled regular expression */
 1744: 
 1745: 	/* Get arguments and do error checking */
 1746: 	if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "sa|l", &regex, &regex_len,
 1747: 							  &input, &flags) == FAILURE) {
 1748: 		return;
 1749: 	}
 1750: 	
 1751: 	/* Compile regex or get it from cache. */
 1752: 	if ((pce = pcre_get_compiled_regex_cache(regex, regex_len TSRMLS_CC)) == NULL) {
 1753: 		RETURN_FALSE;
 1754: 	}
 1755: 	
 1756: 	php_pcre_grep_impl(pce, input, return_value, flags TSRMLS_CC);
 1757: }
 1758: /* }}} */
 1759: 
 1760: PHPAPI void  php_pcre_grep_impl(pcre_cache_entry *pce, zval *input, zval *return_value, long flags TSRMLS_DC) /* {{{ */
 1761: {
 1762: 	zval		   **entry;				/* An entry in the input array */
 1763: 	pcre_extra		*extra = pce->extra;/* Holds results of studying */
 1764: 	pcre_extra		 extra_data;		/* Used locally for exec options */
 1765: 	int				*offsets;			/* Array of subpattern offsets */
 1766: 	int				 size_offsets;		/* Size of the offsets array */
 1767: 	int				 count = 0;			/* Count of matched subpatterns */
 1768: 	char			*string_key;
 1769: 	ulong			 num_key;
 1770: 	zend_bool		 invert;			/* Whether to return non-matching
 1771: 										   entries */
 1772: 	int				 rc;
 1773: 	
 1774: 	invert = flags & PREG_GREP_INVERT ? 1 : 0;
 1775: 	
 1776: 	if (extra == NULL) {
 1777: 		extra_data.flags = PCRE_EXTRA_MATCH_LIMIT | PCRE_EXTRA_MATCH_LIMIT_RECURSION;
 1778: 		extra = &extra_data;
 1779: 	}
 1780: 	extra->match_limit = PCRE_G(backtrack_limit);
 1781: 	extra->match_limit_recursion = PCRE_G(recursion_limit);
 1782: 
 1783: 	/* Calculate the size of the offsets array, and allocate memory for it. */
 1784: 	rc = pcre_fullinfo(pce->re, extra, PCRE_INFO_CAPTURECOUNT, &size_offsets);
 1785: 	if (rc < 0) {
 1786: 		php_error_docref(NULL TSRMLS_CC, E_WARNING, "Internal pcre_fullinfo() error %d", rc);
 1787: 		RETURN_FALSE;
 1788: 	}
 1789: 	size_offsets = (size_offsets + 1) * 3;
 1790: 	offsets = (int *)safe_emalloc(size_offsets, sizeof(int), 0);
 1791: 	
 1792: 	/* Initialize return array */
 1793: 	array_init(return_value);
 1794: 
 1795: 	PCRE_G(error_code) = PHP_PCRE_NO_ERROR;
 1796: 
 1797: 	/* Go through the input array */
 1798: 	zend_hash_internal_pointer_reset(Z_ARRVAL_P(input));
 1799: 	while (zend_hash_get_current_data(Z_ARRVAL_P(input), (void **)&entry) == SUCCESS) {
 1800: 		zval subject = **entry;
 1801: 
 1802: 		if (Z_TYPE_PP(entry) != IS_STRING) {
 1803: 			zval_copy_ctor(&subject);
 1804: 			convert_to_string(&subject);
 1805: 		}
 1806: 
 1807: 		/* Perform the match */
 1808: 		count = pcre_exec(pce->re, extra, Z_STRVAL(subject),
 1809: 						  Z_STRLEN(subject), 0,
 1810: 						  0, offsets, size_offsets);
 1811: 
 1812: 		/* Check for too many substrings condition. */
 1813: 		if (count == 0) {
 1814: 			php_error_docref(NULL TSRMLS_CC, E_NOTICE, "Matched, but too many substrings");
 1815: 			count = size_offsets/3;
 1816: 		} else if (count < 0 && count != PCRE_ERROR_NOMATCH) {
 1817: 			pcre_handle_exec_error(count TSRMLS_CC);
 1818: 			break;
 1819: 		}
 1820: 
 1821: 		/* If the entry fits our requirements */
 1822: 		if ((count > 0 && !invert) || (count == PCRE_ERROR_NOMATCH && invert)) {
 1823: 
 1824: 			Z_ADDREF_PP(entry);
 1825: 
 1826: 			/* Add to return array */
 1827: 			switch (zend_hash_get_current_key(Z_ARRVAL_P(input), &string_key, &num_key, 0))
 1828: 			{
 1829: 				case HASH_KEY_IS_STRING:
 1830: 					zend_hash_update(Z_ARRVAL_P(return_value), string_key,
 1831: 									 strlen(string_key)+1, entry, sizeof(zval *), NULL);
 1832: 					break;
 1833: 
 1834: 				case HASH_KEY_IS_LONG:
 1835: 					zend_hash_index_update(Z_ARRVAL_P(return_value), num_key, entry,
 1836: 										   sizeof(zval *), NULL);
 1837: 					break;
 1838: 			}
 1839: 		}
 1840: 
 1841: 		if (Z_TYPE_PP(entry) != IS_STRING) {
 1842: 			zval_dtor(&subject);
 1843: 		}
 1844: 
 1845: 		zend_hash_move_forward(Z_ARRVAL_P(input));
 1846: 	}
 1847: 	zend_hash_internal_pointer_reset(Z_ARRVAL_P(input));
 1848: 	/* Clean up */
 1849: 	efree(offsets);
 1850: }
 1851: /* }}} */
 1852: 
 1853: /* {{{ proto int preg_last_error()
 1854:    Returns the error code of the last regexp execution. */
 1855: static PHP_FUNCTION(preg_last_error)
 1856: {
 1857: 	if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "") == FAILURE) {
 1858: 		return;
 1859: 	}
 1860: 
 1861: 	RETURN_LONG(PCRE_G(error_code));
 1862: }
 1863: /* }}} */
 1864: 
 1865: /* {{{ module definition structures */
 1866: 
 1867: /* {{{ arginfo: argument metadata for the preg_* functions; the last number on each BEGIN line is the count of required arguments, and ZEND_ARG_INFO(1, ...) marks a by-reference parameter */
 1868: ZEND_BEGIN_ARG_INFO_EX(arginfo_preg_match, 0, 0, 2) /* pattern and subject required */
 1869:     ZEND_ARG_INFO(0, pattern)
 1870:     ZEND_ARG_INFO(0, subject)
 1871:     ZEND_ARG_INFO(1, subpatterns) /* array */
 1872:     ZEND_ARG_INFO(0, flags)
 1873:     ZEND_ARG_INFO(0, offset)
 1874: ZEND_END_ARG_INFO()
 1875: 
 1876: ZEND_BEGIN_ARG_INFO_EX(arginfo_preg_match_all, 0, 0, 2) /* same parameter list as preg_match */
 1877:     ZEND_ARG_INFO(0, pattern)
 1878:     ZEND_ARG_INFO(0, subject)
 1879:     ZEND_ARG_INFO(1, subpatterns) /* array */
 1880:     ZEND_ARG_INFO(0, flags)
 1881:     ZEND_ARG_INFO(0, offset)
 1882: ZEND_END_ARG_INFO()
 1883: 
 1884: ZEND_BEGIN_ARG_INFO_EX(arginfo_preg_replace, 0, 0, 3) /* regex, replace and subject required; count is by-reference */
 1885:     ZEND_ARG_INFO(0, regex)
 1886:     ZEND_ARG_INFO(0, replace)
 1887:     ZEND_ARG_INFO(0, subject)
 1888:     ZEND_ARG_INFO(0, limit)
 1889:     ZEND_ARG_INFO(1, count)
 1890: ZEND_END_ARG_INFO()
 1891: 
 1892: ZEND_BEGIN_ARG_INFO_EX(arginfo_preg_replace_callback, 0, 0, 3) /* regex, callback and subject required; count is by-reference */
 1893:     ZEND_ARG_INFO(0, regex)
 1894:     ZEND_ARG_INFO(0, callback)
 1895:     ZEND_ARG_INFO(0, subject)
 1896:     ZEND_ARG_INFO(0, limit)
 1897:     ZEND_ARG_INFO(1, count)
 1898: ZEND_END_ARG_INFO()
 1899: 
 1900: ZEND_BEGIN_ARG_INFO_EX(arginfo_preg_split, 0, 0, 2) /* pattern and subject required */
 1901:     ZEND_ARG_INFO(0, pattern)
 1902:     ZEND_ARG_INFO(0, subject)
 1903:     ZEND_ARG_INFO(0, limit)
 1904:     ZEND_ARG_INFO(0, flags) 
 1905: ZEND_END_ARG_INFO()
 1906: 
 1907: ZEND_BEGIN_ARG_INFO_EX(arginfo_preg_quote, 0, 0, 1) /* only str is required */
 1908:     ZEND_ARG_INFO(0, str)
 1909:     ZEND_ARG_INFO(0, delim_char)
 1910: ZEND_END_ARG_INFO()
 1911: 
 1912: ZEND_BEGIN_ARG_INFO_EX(arginfo_preg_grep, 0, 0, 2) /* regex and input required */
 1913:     ZEND_ARG_INFO(0, regex)
 1914:     ZEND_ARG_INFO(0, input) /* array */
 1915:     ZEND_ARG_INFO(0, flags)
 1916: ZEND_END_ARG_INFO()
 1917: 
 1918: ZEND_BEGIN_ARG_INFO(arginfo_preg_last_error, 0) /* no parameters */
 1919: ZEND_END_ARG_INFO()
 1920: /* }}} */
 1921: 
 1922: static const zend_function_entry pcre_functions[] = { /* userland functions of this extension, each paired with its arginfo table */
 1923: 	PHP_FE(preg_match,				arginfo_preg_match)
 1924: 	PHP_FE(preg_match_all,			arginfo_preg_match_all)
 1925: 	PHP_FE(preg_replace,			arginfo_preg_replace)
 1926: 	PHP_FE(preg_replace_callback,	arginfo_preg_replace_callback)
 1927: 	PHP_FE(preg_filter,				arginfo_preg_replace) /* reuses preg_replace's argument info */
 1928: 	PHP_FE(preg_split,				arginfo_preg_split)
 1929: 	PHP_FE(preg_quote,				arginfo_preg_quote)
 1930: 	PHP_FE(preg_grep,				arginfo_preg_grep)
 1931: 	PHP_FE(preg_last_error,			arginfo_preg_last_error)
 1932: 	PHP_FE_END /* terminates the table */
 1933: };
 1934: 
 1935: zend_module_entry pcre_module_entry = { /* module descriptor for the pcre extension; slot meanings below follow the zend_module_entry field order */
 1936: 	STANDARD_MODULE_HEADER,
 1937:    "pcre", /* extension name */
 1938: 	pcre_functions, /* function table defined above */
 1939: 	PHP_MINIT(pcre), /* module startup handler */
 1940: 	PHP_MSHUTDOWN(pcre), /* module shutdown handler */
 1941: 	NULL, /* request startup slot: unused */
 1942: 	NULL, /* request shutdown slot: unused */
 1943: 	PHP_MINFO(pcre), /* module info handler */
 1944: 	NO_VERSION_YET, /* version slot */
 1945: 	PHP_MODULE_GLOBALS(pcre), /* module globals */
 1946: 	PHP_GINIT(pcre), /* globals initializer */
 1947: 	PHP_GSHUTDOWN(pcre), /* globals shutdown handler */
 1948: 	NULL, /* post-deactivate slot: unused */
 1949: 	STANDARD_MODULE_PROPERTIES_EX
 1950: };
 1951: 
 1952: #ifdef COMPILE_DL_PCRE /* NOTE(review): presumably defined only for a dynamically loadable build -- confirm against the build config */
 1953: ZEND_GET_MODULE(pcre)
 1954: #endif
 1955: 
 1956: /* }}} */
 1957: 
 1958: #endif /* HAVE_PCRE || HAVE_BUNDLED_PCRE */
 1959: 
 1960: /*
 1961:  * Local variables:
 1962:  * tab-width: 4
 1963:  * c-basic-offset: 4
 1964:  * End:
 1965:  * vim600: sw=4 ts=4 fdm=marker
 1966:  * vim<600: sw=4 ts=4
 1967:  */

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>