embedaddon/php/ext/standard/metaphone.c - view

File: [ELWIX - Embedded LightWeight unIX -] / embedaddon / php / ext / standard / metaphone.c
Revision 1.1.1.4 (vendor branch): download - view: text, annotated - select for diffs - revision graph
Sun Jun 15 20:03:57 2014 UTC (10 years, 1 month ago) by misho
Branches: php, MAIN
CVS tags: v5_4_29, HEAD

php 5.4.29

1: /* 2: +----------------------------------------------------------------------+ 3: | PHP Version 5 | 4: +----------------------------------------------------------------------+ 5: | Copyright (c) 1997-2014 The PHP Group | 6: +----------------------------------------------------------------------+ 7: | This source file is subject to version 3.01 of the PHP license, | 8: | that is bundled with this package in the file LICENSE, and is | 9: | available through the world-wide-web at the following url: | 10: | http://www.php.net/license/3_01.txt | 11: | If you did not receive a copy of the PHP license and are unable to | 12: | obtain it through the world-wide-web, please send a note to | 13: | license@php.net so we can mail you a copy immediately. | 14: +----------------------------------------------------------------------+ 15: | Author: Thies C. Arntzen <thies@thieso.net> | 16: +----------------------------------------------------------------------+ 17: */ 18: 19: /* $Id: metaphone.c,v 1.1.1.4 2014/06/15 20:03:57 misho Exp $ */ 20: 21: /* 22: Based on CPANs "Text-Metaphone-1.96" by Michael G Schwern <schwern@pobox.com> 23: */ 24: 25: #include "php.h" 26: #include "php_metaphone.h" 27: 28: static int metaphone(unsigned char *word, int word_len, long max_phonemes, char **phoned_word, int traditional); 29: 30: /* {{{ proto string metaphone(string text[, int phones]) 31: Break english phrases down into their phonemes */ 32: PHP_FUNCTION(metaphone) 33: { 34: char *str; 35: char *result = 0; 36: int str_len; 37: long phones = 0; 38: 39: if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s|l", &str, &str_len, 40: &phones) == FAILURE) { 41: return; 42: } 43: 44: if (metaphone((unsigned char *)str, str_len, phones, &result, 1) == 0) { 45: RETVAL_STRING(result, 0); 46: } else { 47: if (result) { 48: efree(result); 49: } 50: RETURN_FALSE; 51: } 52: } 53: /* }}} */ 54: 55: /* 56: this is now the original code by Michael G Schwern: 57: i've changed it just a slightly bit (use emalloc, 58: get rid of includes etc) 59: - thies - 13.09.1999 60: */ 61: 62: /*----------------------------- */ 63: /* this used to be "metaphone.h" */ 64: /*----------------------------- */ 65: 66: /* Special encodings */ 67: #define SH 'X' 68: #define TH '0' 69: 70: /*----------------------------- */ 71: /* end of "metaphone.h" */ 72: /*----------------------------- */ 73: 74: /*----------------------------- */ 75: /* this used to be "metachar.h" */ 76: /*----------------------------- */ 77: 78: /* Metachar.h ... little bits about characters for metaphone */ 79: /*-- Character encoding array & accessing macros --*/ 80: /* Stolen directly out of the book... */ 81: char _codes[26] = 82: { 83: 1, 16, 4, 16, 9, 2, 4, 16, 9, 2, 0, 2, 2, 2, 1, 4, 0, 2, 4, 4, 1, 0, 0, 0, 8, 0 84: /* a b c d e f g h i j k l m n o p q r s t u v w x y z */ 85: }; 86: 87: 88: #define ENCODE(c) (isalpha(c) ? _codes[((toupper(c)) - 'A')] : 0) 89: 90: #define isvowel(c) (ENCODE(c) & 1) /* AEIOU */ 91: 92: /* These letters are passed through unchanged */ 93: #define NOCHANGE(c) (ENCODE(c) & 2) /* FJMNR */ 94: 95: /* These form dipthongs when preceding H */ 96: #define AFFECTH(c) (ENCODE(c) & 4) /* CGPST */ 97: 98: /* These make C and G soft */ 99: #define MAKESOFT(c) (ENCODE(c) & 8) /* EIY */ 100: 101: /* These prevent GH from becoming F */ 102: #define NOGHTOF(c) (ENCODE(c) & 16) /* BDH */ 103: 104: /*----------------------------- */ 105: /* end of "metachar.h" */ 106: /*----------------------------- */ 107: 108: /* I suppose I could have been using a character pointer instead of 109: * accesssing the array directly... */ 110: 111: /* Look at the next letter in the word */ 112: #define Next_Letter (toupper(word[w_idx+1])) 113: /* Look at the current letter in the word */ 114: #define Curr_Letter (toupper(word[w_idx])) 115: /* Go N letters back. */ 116: #define Look_Back_Letter(n) (w_idx >= n ? toupper(word[w_idx-n]) : '\0') 117: /* Previous letter. I dunno, should this return null on failure? */ 118: #define Prev_Letter (Look_Back_Letter(1)) 119: /* Look two letters down. It makes sure you don't walk off the string. */ 120: #define After_Next_Letter (Next_Letter != '\0' ? toupper(word[w_idx+2]) \ 121: : '\0') 122: #define Look_Ahead_Letter(n) (toupper(Lookahead(word+w_idx, n))) 123: 124: 125: /* Allows us to safely look ahead an arbitrary # of letters */ 126: /* I probably could have just used strlen... */ 127: static char Lookahead(char *word, int how_far) 128: { 129: char letter_ahead = '\0'; /* null by default */ 130: int idx; 131: for (idx = 0; word[idx] != '\0' && idx < how_far; idx++); 132: /* Edge forward in the string... */ 133: 134: letter_ahead = word[idx]; /* idx will be either == to how_far or 135: * at the end of the string 136: */ 137: return letter_ahead; 138: } 139: 140: 141: /* phonize one letter 142: * We don't know the buffers size in advance. On way to solve this is to just 143: * re-allocate the buffer size. We're using an extra of 2 characters (this 144: * could be one though; or more too). */ 145: #define Phonize(c) { \ 146: if (p_idx >= max_buffer_len) { \ 147: *phoned_word = safe_erealloc(*phoned_word, 2, sizeof(char), max_buffer_len); \ 148: max_buffer_len += 2; \ 149: } \ 150: (*phoned_word)[p_idx++] = c; \ 151: } 152: /* Slap a null character on the end of the phoned word */ 153: #define End_Phoned_Word { \ 154: if (p_idx == max_buffer_len) { \ 155: *phoned_word = safe_erealloc(*phoned_word, 1, sizeof(char), max_buffer_len); \ 156: } \ 157: (*phoned_word)[p_idx] = '\0'; \ 158: } 159: /* How long is the phoned word? */ 160: #define Phone_Len (p_idx) 161: 162: /* Note is a letter is a 'break' in the word */ 163: #define Isbreak(c) (!isalpha(c)) 164: 165: /* {{{ metaphone 166: */ 167: static int metaphone(unsigned char *word, int word_len, long max_phonemes, char **phoned_word, int traditional) 168: { 169: int w_idx = 0; /* point in the phonization we're at. */ 170: int p_idx = 0; /* end of the phoned phrase */ 171: int max_buffer_len = 0; /* maximum length of the destination buffer */ 172: 173: /*-- Parameter checks --*/ 174: /* Negative phoneme length is meaningless */ 175: 176: if (max_phonemes < 0) 177: return -1; 178: 179: /* Empty/null string is meaningless */ 180: /* Overly paranoid */ 181: /* assert(word != NULL && word[0] != '\0'); */ 182: 183: if (word == NULL) 184: return -1; 185: 186: /*-- Allocate memory for our phoned_phrase --*/ 187: if (max_phonemes == 0) { /* Assume largest possible */ 188: max_buffer_len = word_len; 189: *phoned_word = safe_emalloc(sizeof(char), word_len, 1); 190: } else { 191: max_buffer_len = max_phonemes; 192: *phoned_word = safe_emalloc(sizeof(char), max_phonemes, 1); 193: } 194: 195: 196: /*-- The first phoneme has to be processed specially. --*/ 197: /* Find our first letter */ 198: for (; !isalpha(Curr_Letter); w_idx++) { 199: /* On the off chance we were given nothing but crap... */ 200: if (Curr_Letter == '\0') { 201: End_Phoned_Word 202: return SUCCESS; /* For testing */ 203: } 204: } 205: 206: switch (Curr_Letter) { 207: /* AE becomes E */ 208: case 'A': 209: if (Next_Letter == 'E') { 210: Phonize('E'); 211: w_idx += 2; 212: } 213: /* Remember, preserve vowels at the beginning */ 214: else { 215: Phonize('A'); 216: w_idx++; 217: } 218: break; 219: /* [GKP]N becomes N */ 220: case 'G': 221: case 'K': 222: case 'P': 223: if (Next_Letter == 'N') { 224: Phonize('N'); 225: w_idx += 2; 226: } 227: break; 228: /* WH becomes W, 229: WR becomes R 230: W if followed by a vowel */ 231: case 'W': 232: if (Next_Letter == 'R') { 233: Phonize(Next_Letter); 234: w_idx += 2; 235: } else if (Next_Letter == 'H' || isvowel(Next_Letter)) { 236: Phonize('W'); 237: w_idx += 2; 238: } 239: /* else ignore */ 240: break; 241: /* X becomes S */ 242: case 'X': 243: Phonize('S'); 244: w_idx++; 245: break; 246: /* Vowels are kept */ 247: /* We did A already 248: case 'A': 249: case 'a': 250: */ 251: case 'E': 252: case 'I': 253: case 'O': 254: case 'U': 255: Phonize(Curr_Letter); 256: w_idx++; 257: break; 258: default: 259: /* do nothing */ 260: break; 261: } 262: 263: 264: 265: /* On to the metaphoning */ 266: for (; Curr_Letter != '\0' && 267: (max_phonemes == 0 || Phone_Len < max_phonemes); 268: w_idx++) { 269: /* How many letters to skip because an eariler encoding handled 270: * multiple letters */ 271: unsigned short int skip_letter = 0; 272: 273: 274: /* THOUGHT: It would be nice if, rather than having things like... 275: * well, SCI. For SCI you encode the S, then have to remember 276: * to skip the C. So the phonome SCI invades both S and C. It would 277: * be better, IMHO, to skip the C from the S part of the encoding. 278: * Hell, I'm trying it. 279: */ 280: 281: /* Ignore non-alphas */ 282: if (!isalpha(Curr_Letter)) 283: continue; 284: 285: /* Drop duplicates, except CC */ 286: if (Curr_Letter == Prev_Letter && 287: Curr_Letter != 'C') 288: continue; 289: 290: switch (Curr_Letter) { 291: /* B -> B unless in MB */ 292: case 'B': 293: if (Prev_Letter != 'M') 294: Phonize('B'); 295: break; 296: /* 'sh' if -CIA- or -CH, but not SCH, except SCHW. 297: * (SCHW is handled in S) 298: * S if -CI-, -CE- or -CY- 299: * dropped if -SCI-, SCE-, -SCY- (handed in S) 300: * else K 301: */ 302: case 'C': 303: if (MAKESOFT(Next_Letter)) { /* C[IEY] */ 304: if (After_Next_Letter == 'A' && 305: Next_Letter == 'I') { /* CIA */ 306: Phonize(SH); 307: } 308: /* SC[IEY] */ 309: else if (Prev_Letter == 'S') { 310: /* Dropped */ 311: } else { 312: Phonize('S'); 313: } 314: } else if (Next_Letter == 'H') { 315: if ((!traditional) && (After_Next_Letter == 'R' || Prev_Letter == 'S')) { /* Christ, School */ 316: Phonize('K'); 317: } else { 318: Phonize(SH); 319: } 320: skip_letter++; 321: } else { 322: Phonize('K'); 323: } 324: break; 325: /* J if in -DGE-, -DGI- or -DGY- 326: * else T 327: */ 328: case 'D': 329: if (Next_Letter == 'G' && 330: MAKESOFT(After_Next_Letter)) { 331: Phonize('J'); 332: skip_letter++; 333: } else 334: Phonize('T'); 335: break; 336: /* F if in -GH and not B--GH, D--GH, -H--GH, -H---GH 337: * else dropped if -GNED, -GN, 338: * else dropped if -DGE-, -DGI- or -DGY- (handled in D) 339: * else J if in -GE-, -GI, -GY and not GG 340: * else K 341: */ 342: case 'G': 343: if (Next_Letter == 'H') { 344: if (!(NOGHTOF(Look_Back_Letter(3)) || 345: Look_Back_Letter(4) == 'H')) { 346: Phonize('F'); 347: skip_letter++; 348: } else { 349: /* silent */ 350: } 351: } else if (Next_Letter == 'N') { 352: if (Isbreak(After_Next_Letter) || 353: (After_Next_Letter == 'E' && 354: Look_Ahead_Letter(3) == 'D')) { 355: /* dropped */ 356: } else 357: Phonize('K'); 358: } else if (MAKESOFT(Next_Letter) && 359: Prev_Letter != 'G') { 360: Phonize('J'); 361: } else { 362: Phonize('K'); 363: } 364: break; 365: /* H if before a vowel and not after C,G,P,S,T */ 366: case 'H': 367: if (isvowel(Next_Letter) && 368: !AFFECTH(Prev_Letter)) 369: Phonize('H'); 370: break; 371: /* dropped if after C 372: * else K 373: */ 374: case 'K': 375: if (Prev_Letter != 'C') 376: Phonize('K'); 377: break; 378: /* F if before H 379: * else P 380: */ 381: case 'P': 382: if (Next_Letter == 'H') { 383: Phonize('F'); 384: } else { 385: Phonize('P'); 386: } 387: break; 388: /* K 389: */ 390: case 'Q': 391: Phonize('K'); 392: break; 393: /* 'sh' in -SH-, -SIO- or -SIA- or -SCHW- 394: * else S 395: */ 396: case 'S': 397: if (Next_Letter == 'I' && 398: (After_Next_Letter == 'O' || 399: After_Next_Letter == 'A')) { 400: Phonize(SH); 401: } else if (Next_Letter == 'H') { 402: Phonize(SH); 403: skip_letter++; 404: } else if ((!traditional) && (Next_Letter == 'C' && Look_Ahead_Letter(2) == 'H' && Look_Ahead_Letter(3) == 'W')) { 405: Phonize(SH); 406: skip_letter += 2; 407: } else { 408: Phonize('S'); 409: } 410: break; 411: /* 'sh' in -TIA- or -TIO- 412: * else 'th' before H 413: * else T 414: */ 415: case 'T': 416: if (Next_Letter == 'I' && 417: (After_Next_Letter == 'O' || 418: After_Next_Letter == 'A')) { 419: Phonize(SH); 420: } else if (Next_Letter == 'H') { 421: Phonize(TH); 422: skip_letter++; 423: } else if (!(Next_Letter == 'C' && After_Next_Letter == 'H')) { 424: Phonize('T'); 425: } 426: break; 427: /* F */ 428: case 'V': 429: Phonize('F'); 430: break; 431: /* W before a vowel, else dropped */ 432: case 'W': 433: if (isvowel(Next_Letter)) 434: Phonize('W'); 435: break; 436: /* KS */ 437: case 'X': 438: Phonize('K'); 439: Phonize('S'); 440: break; 441: /* Y if followed by a vowel */ 442: case 'Y': 443: if (isvowel(Next_Letter)) 444: Phonize('Y'); 445: break; 446: /* S */ 447: case 'Z': 448: Phonize('S'); 449: break; 450: /* No transformation */ 451: case 'F': 452: case 'J': 453: case 'L': 454: case 'M': 455: case 'N': 456: case 'R': 457: Phonize(Curr_Letter); 458: break; 459: default: 460: /* nothing */ 461: break; 462: } /* END SWITCH */ 463: 464: w_idx += skip_letter; 465: } /* END FOR */ 466: 467: End_Phoned_Word; 468: 469: return 0; 470: } /* END metaphone */ 471: /* }}} */ 472: 473: /* 474: * Local variables: 475: * tab-width: 4 476: * c-basic-offset: 4 477: * End: 478: * vim600: sw=4 ts=4 fdm=marker 479: * vim<600: sw=4 ts=4 480: */