File:  [ELWIX - Embedded LightWeight unIX -] / embedaddon / php / ext / standard / metaphone.c
Revision 1.1.1.4 (vendor branch): download - view: text, annotated - select for diffs - revision graph
Sun Jun 15 20:03:57 2014 UTC (10 years, 1 month ago) by misho
Branches: php, MAIN
CVS tags: v5_4_29, HEAD
php 5.4.29

    1: /*
    2:    +----------------------------------------------------------------------+
    3:    | PHP Version 5                                                        |
    4:    +----------------------------------------------------------------------+
    5:    | Copyright (c) 1997-2014 The PHP Group                                |
    6:    +----------------------------------------------------------------------+
    7:    | This source file is subject to version 3.01 of the PHP license,      |
    8:    | that is bundled with this package in the file LICENSE, and is        |
    9:    | available through the world-wide-web at the following url:           |
   10:    | http://www.php.net/license/3_01.txt                                  |
   11:    | If you did not receive a copy of the PHP license and are unable to   |
   12:    | obtain it through the world-wide-web, please send a note to          |
   13:    | license@php.net so we can mail you a copy immediately.               |
   14:    +----------------------------------------------------------------------+
   15:    | Author: Thies C. Arntzen <thies@thieso.net>                          |
   16:    +----------------------------------------------------------------------+
   17: */
   18: 
   19: /* $Id: metaphone.c,v 1.1.1.4 2014/06/15 20:03:57 misho Exp $ */
   20: 
   21: /*
   22: 	Based on CPANs "Text-Metaphone-1.96" by Michael G Schwern <schwern@pobox.com> 
   23: */
   24: 
   25: #include "php.h"
   26: #include "php_metaphone.h"
   27: 
   28: static int metaphone(unsigned char *word, int word_len, long max_phonemes, char **phoned_word, int traditional);
   29: 
   30: /* {{{ proto string metaphone(string text[, int phones])
   31:    Break english phrases down into their phonemes */
   32: PHP_FUNCTION(metaphone)
   33: {
   34: 	char *str;
   35: 	char *result = 0;
   36: 	int str_len;
   37: 	long phones = 0;
   38: 
   39: 	if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s|l", &str, &str_len,
   40: 							  &phones) == FAILURE) {
   41: 		return;
   42: 	}
   43: 
   44: 	if (metaphone((unsigned char *)str, str_len, phones, &result, 1) == 0) {
   45: 		RETVAL_STRING(result, 0);
   46: 	} else {
   47: 		if (result) {
   48: 			efree(result);
   49: 		}
   50: 		RETURN_FALSE;
   51: 	}
   52: }
   53: /* }}} */
   54: 
   55: /* 
   56:    this is now the original code by Michael G Schwern:
   57:    I've changed it only slightly (use emalloc, 
   58:    get rid of includes etc) 
   59: 	- thies - 13.09.1999
   60: */
   61: 
   62: /*-----------------------------  */
   63: /* this used to be "metaphone.h" */
   64: /*-----------------------------  */
   65: 
/* Special encodings: single output characters that stand for a sound which
 * has no letter of its own in the generated key. */
/* 'X' is emitted for the "sh" sound */
#define  SH 	'X'
/* '0' (the digit zero) is emitted for the "th" sound */
#define  TH		'0'
   69: 
   70: /*-----------------------------  */
   71: /* end of "metaphone.h"          */
   72: /*-----------------------------  */
   73: 
   74: /*----------------------------- */
   75: /* this used to be "metachar.h" */
   76: /*----------------------------- */
   77: 
   78: /* Metachar.h ... little bits about characters for metaphone */
   79: /*-- Character encoding array & accessing macros --*/
   80: /* Stolen directly out of the book... */
   81: char _codes[26] =
   82: {
   83: 	1, 16, 4, 16, 9, 2, 4, 16, 9, 2, 0, 2, 2, 2, 1, 4, 0, 2, 4, 4, 1, 0, 0, 0, 8, 0
   84: /*  a  b c  d e f g  h i j k l m n o p q r s t u v w x y z */
   85: };
   86: 
   87: 
   88: #define ENCODE(c) (isalpha(c) ? _codes[((toupper(c)) - 'A')] : 0)
   89: 
   90: #define isvowel(c)  (ENCODE(c) & 1)		/* AEIOU */
   91: 
   92: /* These letters are passed through unchanged */
   93: #define NOCHANGE(c) (ENCODE(c) & 2)		/* FJMNR */
   94: 
   95: /* These form dipthongs when preceding H */
   96: #define AFFECTH(c)  (ENCODE(c) & 4)		/* CGPST */
   97: 
   98: /* These make C and G soft */
   99: #define MAKESOFT(c) (ENCODE(c) & 8)		/* EIY */
  100: 
  101: /* These prevent GH from becoming F */
  102: #define NOGHTOF(c)  (ENCODE(c) & 16)	/* BDH */
  103: 
  104: /*----------------------------- */
  105: /* end of "metachar.h"          */
  106: /*----------------------------- */
  107: 
  108: /* I suppose I could have been using a character pointer instead of
  109:  * accesssing the array directly... */
  110: 
  111: /* Look at the next letter in the word */
  112: #define Next_Letter (toupper(word[w_idx+1]))
  113: /* Look at the current letter in the word */
  114: #define Curr_Letter (toupper(word[w_idx]))
  115: /* Go N letters back. */
  116: #define Look_Back_Letter(n)	(w_idx >= n ? toupper(word[w_idx-n]) : '\0')
  117: /* Previous letter.  I dunno, should this return null on failure? */
  118: #define Prev_Letter (Look_Back_Letter(1))
  119: /* Look two letters down.  It makes sure you don't walk off the string. */
  120: #define After_Next_Letter	(Next_Letter != '\0' ? toupper(word[w_idx+2]) \
  121: 											     : '\0')
  122: #define Look_Ahead_Letter(n) (toupper(Lookahead(word+w_idx, n)))
  123: 
  124: 
  125: /* Allows us to safely look ahead an arbitrary # of letters */
  126: /* I probably could have just used strlen... */
  127: static char Lookahead(char *word, int how_far)
  128: {
  129: 	char letter_ahead = '\0';	/* null by default */
  130: 	int idx;
  131: 	for (idx = 0; word[idx] != '\0' && idx < how_far; idx++);
  132: 	/* Edge forward in the string... */
  133: 
  134: 	letter_ahead = word[idx];	/* idx will be either == to how_far or
  135: 								 * at the end of the string
  136: 								 */
  137: 	return letter_ahead;
  138: }
  139: 
  140: 
  141: /* phonize one letter
  142:  * We don't know the buffers size in advance. On way to solve this is to just
  143:  * re-allocate the buffer size. We're using an extra of 2 characters (this
  144:  * could be one though; or more too). */
  145: #define Phonize(c)	{ \
  146: 						if (p_idx >= max_buffer_len) { \
  147: 							*phoned_word = safe_erealloc(*phoned_word, 2, sizeof(char), max_buffer_len); \
  148: 							max_buffer_len += 2; \
  149: 						} \
  150: 						(*phoned_word)[p_idx++] = c; \
  151: 					}
  152: /* Slap a null character on the end of the phoned word */
  153: #define End_Phoned_Word	{ \
  154: 							if (p_idx == max_buffer_len) { \
  155: 								*phoned_word = safe_erealloc(*phoned_word, 1, sizeof(char), max_buffer_len); \
  156: 							} \
  157: 							(*phoned_word)[p_idx] = '\0'; \
  158: 						}
  159: /* How long is the phoned word? */
  160: #define Phone_Len	(p_idx)
  161: 
  162: /* Note is a letter is a 'break' in the word */
  163: #define Isbreak(c)  (!isalpha(c))
  164: 
  165: /* {{{ metaphone
  166:  */
  167: static int metaphone(unsigned char *word, int word_len, long max_phonemes, char **phoned_word, int traditional)
  168: {
  169: 	int w_idx = 0;				/* point in the phonization we're at. */
  170: 	int p_idx = 0;				/* end of the phoned phrase */
  171: 	int max_buffer_len = 0;		/* maximum length of the destination buffer */
  172: 
  173: /*-- Parameter checks --*/
  174: 	/* Negative phoneme length is meaningless */
  175: 
  176: 	if (max_phonemes < 0)
  177: 		return -1;
  178: 
  179: 	/* Empty/null string is meaningless */
  180: 	/* Overly paranoid */
  181: 	/* assert(word != NULL && word[0] != '\0'); */
  182: 
  183: 	if (word == NULL)
  184: 		return -1;
  185: 
  186: /*-- Allocate memory for our phoned_phrase --*/
  187: 	if (max_phonemes == 0) {	/* Assume largest possible */
  188: 		max_buffer_len = word_len;
  189: 		*phoned_word = safe_emalloc(sizeof(char), word_len, 1);
  190: 	} else {
  191: 		max_buffer_len = max_phonemes;
  192: 		*phoned_word = safe_emalloc(sizeof(char), max_phonemes, 1);
  193: 	}
  194: 
  195: 
  196: /*-- The first phoneme has to be processed specially. --*/
  197: 	/* Find our first letter */
  198: 	for (; !isalpha(Curr_Letter); w_idx++) {
  199: 		/* On the off chance we were given nothing but crap... */
  200: 		if (Curr_Letter == '\0') {
  201: 			End_Phoned_Word
  202: 				return SUCCESS;	/* For testing */
  203: 		}
  204: 	}
  205: 
  206: 	switch (Curr_Letter) {
  207: 		/* AE becomes E */
  208: 	case 'A':
  209: 		if (Next_Letter == 'E') {
  210: 			Phonize('E');
  211: 			w_idx += 2;
  212: 		}
  213: 		/* Remember, preserve vowels at the beginning */
  214: 		else {
  215: 			Phonize('A');
  216: 			w_idx++;
  217: 		}
  218: 		break;
  219: 		/* [GKP]N becomes N */
  220: 	case 'G':
  221: 	case 'K':
  222: 	case 'P':
  223: 		if (Next_Letter == 'N') {
  224: 			Phonize('N');
  225: 			w_idx += 2;
  226: 		}
  227: 		break;
  228: 		/* WH becomes W, 
  229: 		   WR becomes R 
  230: 		   W if followed by a vowel */
  231: 	case 'W':
  232: 		if (Next_Letter == 'R') {
  233: 			Phonize(Next_Letter);
  234: 			w_idx += 2;
  235: 		} else if (Next_Letter == 'H' || isvowel(Next_Letter)) {
  236: 			Phonize('W');
  237: 			w_idx += 2;
  238: 		}
  239: 		/* else ignore */
  240: 		break;
  241: 		/* X becomes S */
  242: 	case 'X':
  243: 		Phonize('S');
  244: 		w_idx++;
  245: 		break;
  246: 		/* Vowels are kept */
  247: 		/* We did A already
  248: 		   case 'A':
  249: 		   case 'a':
  250: 		 */
  251: 	case 'E':
  252: 	case 'I':
  253: 	case 'O':
  254: 	case 'U':
  255: 		Phonize(Curr_Letter);
  256: 		w_idx++;
  257: 		break;
  258: 	default:
  259: 		/* do nothing */
  260: 		break;
  261: 	}
  262: 
  263: 
  264: 
  265: 	/* On to the metaphoning */
  266: 	for (; Curr_Letter != '\0' &&
  267: 		 (max_phonemes == 0 || Phone_Len < max_phonemes);
  268: 		 w_idx++) {
  269: 		/* How many letters to skip because an eariler encoding handled     
  270: 		 * multiple letters */
  271: 		unsigned short int skip_letter = 0;
  272: 
  273: 
  274: 		/* THOUGHT:  It would be nice if, rather than having things like...
  275: 		 * well, SCI.  For SCI you encode the S, then have to remember
  276: 		 * to skip the C.  So the phonome SCI invades both S and C.  It would
  277: 		 * be better, IMHO, to skip the C from the S part of the encoding.
  278: 		 * Hell, I'm trying it.
  279: 		 */
  280: 
  281: 		/* Ignore non-alphas */
  282: 		if (!isalpha(Curr_Letter))
  283: 			continue;
  284: 
  285: 		/* Drop duplicates, except CC */
  286: 		if (Curr_Letter == Prev_Letter &&
  287: 			Curr_Letter != 'C')
  288: 			continue;
  289: 
  290: 		switch (Curr_Letter) {
  291: 			/* B -> B unless in MB */
  292: 		case 'B':
  293: 			if (Prev_Letter != 'M')
  294: 				Phonize('B');
  295: 			break;
  296: 			/* 'sh' if -CIA- or -CH, but not SCH, except SCHW.
  297: 			 * (SCHW is handled in S)
  298: 			 *  S if -CI-, -CE- or -CY-
  299: 			 *  dropped if -SCI-, SCE-, -SCY- (handed in S)
  300: 			 *  else K
  301: 			 */
  302: 		case 'C':
  303: 			if (MAKESOFT(Next_Letter)) {	/* C[IEY] */
  304: 				if (After_Next_Letter == 'A' &&
  305: 					Next_Letter == 'I') {	/* CIA */
  306: 					Phonize(SH);
  307: 				}
  308: 				/* SC[IEY] */
  309: 				else if (Prev_Letter == 'S') {
  310: 					/* Dropped */
  311: 				} else {
  312: 					Phonize('S');
  313: 				}
  314: 			} else if (Next_Letter == 'H') {
  315: 				if ((!traditional) && (After_Next_Letter == 'R' || Prev_Letter == 'S')) {	/* Christ, School */
  316: 					Phonize('K');
  317: 				} else {
  318: 					Phonize(SH);
  319: 				}
  320: 				skip_letter++;
  321: 			} else {
  322: 				Phonize('K');
  323: 			}
  324: 			break;
  325: 			/* J if in -DGE-, -DGI- or -DGY-
  326: 			 * else T
  327: 			 */
  328: 		case 'D':
  329: 			if (Next_Letter == 'G' &&
  330: 				MAKESOFT(After_Next_Letter)) {
  331: 				Phonize('J');
  332: 				skip_letter++;
  333: 			} else
  334: 				Phonize('T');
  335: 			break;
  336: 			/* F if in -GH and not B--GH, D--GH, -H--GH, -H---GH
  337: 			 * else dropped if -GNED, -GN, 
  338: 			 * else dropped if -DGE-, -DGI- or -DGY- (handled in D)
  339: 			 * else J if in -GE-, -GI, -GY and not GG
  340: 			 * else K
  341: 			 */
  342: 		case 'G':
  343: 			if (Next_Letter == 'H') {
  344: 				if (!(NOGHTOF(Look_Back_Letter(3)) ||
  345: 					  Look_Back_Letter(4) == 'H')) {
  346: 					Phonize('F');
  347: 					skip_letter++;
  348: 				} else {
  349: 					/* silent */
  350: 				}
  351: 			} else if (Next_Letter == 'N') {
  352: 				if (Isbreak(After_Next_Letter) ||
  353: 					(After_Next_Letter == 'E' &&
  354: 					 Look_Ahead_Letter(3) == 'D')) {
  355: 					/* dropped */
  356: 				} else
  357: 					Phonize('K');
  358: 			} else if (MAKESOFT(Next_Letter) &&
  359: 					   Prev_Letter != 'G') {
  360: 				Phonize('J');
  361: 			} else {
  362: 				Phonize('K');
  363: 			}
  364: 			break;
  365: 			/* H if before a vowel and not after C,G,P,S,T */
  366: 		case 'H':
  367: 			if (isvowel(Next_Letter) &&
  368: 				!AFFECTH(Prev_Letter))
  369: 				Phonize('H');
  370: 			break;
  371: 			/* dropped if after C
  372: 			 * else K
  373: 			 */
  374: 		case 'K':
  375: 			if (Prev_Letter != 'C')
  376: 				Phonize('K');
  377: 			break;
  378: 			/* F if before H
  379: 			 * else P
  380: 			 */
  381: 		case 'P':
  382: 			if (Next_Letter == 'H') {
  383: 				Phonize('F');
  384: 			} else {
  385: 				Phonize('P');
  386: 			}
  387: 			break;
  388: 			/* K
  389: 			 */
  390: 		case 'Q':
  391: 			Phonize('K');
  392: 			break;
  393: 			/* 'sh' in -SH-, -SIO- or -SIA- or -SCHW-
  394: 			 * else S
  395: 			 */
  396: 		case 'S':
  397: 			if (Next_Letter == 'I' &&
  398: 				(After_Next_Letter == 'O' ||
  399: 				 After_Next_Letter == 'A')) {
  400: 				Phonize(SH);
  401: 			} else if (Next_Letter == 'H') {
  402: 				Phonize(SH);
  403: 				skip_letter++;
  404: 			} else if ((!traditional) && (Next_Letter == 'C' && Look_Ahead_Letter(2) == 'H' && Look_Ahead_Letter(3) == 'W')) {
  405: 				Phonize(SH);
  406: 				skip_letter += 2;
  407: 			} else {
  408: 				Phonize('S');
  409: 			}
  410: 			break;
  411: 			/* 'sh' in -TIA- or -TIO-
  412: 			 * else 'th' before H
  413: 			 * else T
  414: 			 */
  415: 		case 'T':
  416: 			if (Next_Letter == 'I' &&
  417: 				(After_Next_Letter == 'O' ||
  418: 				 After_Next_Letter == 'A')) {
  419: 				Phonize(SH);
  420: 			} else if (Next_Letter == 'H') {
  421: 				Phonize(TH);
  422: 				skip_letter++;
  423: 			} else if (!(Next_Letter == 'C' && After_Next_Letter == 'H')) {
  424: 				Phonize('T');
  425: 			}
  426: 			break;
  427: 			/* F */
  428: 		case 'V':
  429: 			Phonize('F');
  430: 			break;
  431: 			/* W before a vowel, else dropped */
  432: 		case 'W':
  433: 			if (isvowel(Next_Letter))
  434: 				Phonize('W');
  435: 			break;
  436: 			/* KS */
  437: 		case 'X':
  438: 			Phonize('K');
  439: 			Phonize('S');
  440: 			break;
  441: 			/* Y if followed by a vowel */
  442: 		case 'Y':
  443: 			if (isvowel(Next_Letter))
  444: 				Phonize('Y');
  445: 			break;
  446: 			/* S */
  447: 		case 'Z':
  448: 			Phonize('S');
  449: 			break;
  450: 			/* No transformation */
  451: 		case 'F':
  452: 		case 'J':
  453: 		case 'L':
  454: 		case 'M':
  455: 		case 'N':
  456: 		case 'R':
  457: 			Phonize(Curr_Letter);
  458: 			break;
  459: 		default:
  460: 			/* nothing */
  461: 			break;
  462: 		}						/* END SWITCH */
  463: 
  464: 		w_idx += skip_letter;
  465: 	}							/* END FOR */
  466: 
  467: 	End_Phoned_Word;
  468: 
  469: 	return 0;
  470: }								/* END metaphone */
  471: /* }}} */
  472: 
  473: /*
  474:  * Local variables:
  475:  * tab-width: 4
  476:  * c-basic-offset: 4
  477:  * End:
  478:  * vim600: sw=4 ts=4 fdm=marker
  479:  * vim<600: sw=4 ts=4
  480:  */

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>