Return to metaphone.c CVS log | Up to [ELWIX - Embedded LightWeight unIX -] / embedaddon / php / ext / standard |
1.1 ! misho 1: /* ! 2: +----------------------------------------------------------------------+ ! 3: | PHP Version 5 | ! 4: +----------------------------------------------------------------------+ ! 5: | Copyright (c) 1997-2012 The PHP Group | ! 6: +----------------------------------------------------------------------+ ! 7: | This source file is subject to version 3.01 of the PHP license, | ! 8: | that is bundled with this package in the file LICENSE, and is | ! 9: | available through the world-wide-web at the following url: | ! 10: | http://www.php.net/license/3_01.txt | ! 11: | If you did not receive a copy of the PHP license and are unable to | ! 12: | obtain it through the world-wide-web, please send a note to | ! 13: | license@php.net so we can mail you a copy immediately. | ! 14: +----------------------------------------------------------------------+ ! 15: | Author: Thies C. Arntzen <thies@thieso.net> | ! 16: +----------------------------------------------------------------------+ ! 17: */ ! 18: ! 19: /* $Id: metaphone.c 321634 2012-01-01 13:15:04Z felipe $ */ ! 20: ! 21: /* ! 22: Based on CPANs "Text-Metaphone-1.96" by Michael G Schwern <schwern@pobox.com> ! 23: */ ! 24: ! 25: #include "php.h" ! 26: #include "php_metaphone.h" ! 27: ! 28: static int metaphone(unsigned char *word, int word_len, long max_phonemes, char **phoned_word, int traditional); ! 29: ! 30: /* {{{ proto string metaphone(string text[, int phones]) ! 31: Break english phrases down into their phonemes */ ! 32: PHP_FUNCTION(metaphone) ! 33: { ! 34: char *str; ! 35: char *result = 0; ! 36: int str_len; ! 37: long phones = 0; ! 38: ! 39: if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s|l", &str, &str_len, ! 40: &phones) == FAILURE) { ! 41: return; ! 42: } ! 43: ! 44: if (metaphone((unsigned char *)str, str_len, phones, &result, 1) == 0) { ! 45: RETVAL_STRING(result, 0); ! 46: } else { ! 47: if (result) { ! 48: efree(result); ! 49: } ! 50: RETURN_FALSE; ! 51: } ! 52: } ! 53: /* }}} */ ! 

/*
   What follows is the original code by Michael G Schwern,
   changed only slightly (use emalloc, get rid of includes etc)
	- thies - 13.09.1999
*/

/*-----------------------------  */
/* this used to be "metaphone.h" */
/*-----------------------------  */

/* Special encodings: the symbols emitted for the "sh" and "th" sounds */
#define SH 'X'
#define TH '0'

/*-----------------------------  */
/* end of "metaphone.h"          */
/*-----------------------------  */

/*-----------------------------  */
/* this used to be "metachar.h"  */
/*-----------------------------  */

/* Metachar.h ... little bits about characters for metaphone */
/*-- Character encoding array & accessing macros --*/
/* Stolen directly out of the book...
 *
 * One bit-set per letter, indexed by (uppercase letter - 'A'):
 *    1  vowel                                  (AEIOU)
 *    2  passed through unchanged               (FJLMNR)
 *    4  forms a diphthong when preceding H     (CGPST)
 *    8  makes C and G soft                     (EIY)
 *   16  prevents GH from becoming F            (BDH)
 *
 * The table is read-only and private to this file.
 */
static const char _codes[26] =
{
	1, 16, 4, 16, 9, 2, 4, 16, 9, 2, 0, 2, 2, 2, 1, 4, 0, 2, 4, 4, 1, 0, 0, 0, 8, 0
/*  a   b  c   d  e  f  g   h  i  j  k  l  m  n  o  p  q  r  s  t  u  v  w  x  y  z */
};

/* Return the class bits for character c, or 0 when c has no table entry.
 *
 * c must be representable as unsigned char (the cursor macros below only
 * ever pass toupper() results or '\0').  isalpha() is locale dependent and
 * may accept letters outside A-Z; those would index past the 26-entry
 * table, so the upper-cased value is range-checked before the lookup. */
static int metaphone_encode(int c)
{
	int upper;

	if (!isalpha(c)) {
		return 0;
	}

	upper = toupper(c);
	if (upper < 'A' || upper > 'Z') {
		return 0;
	}

	return _codes[upper - 'A'];
}

#define ENCODE(c) (metaphone_encode(c))

#define isvowel(c)  (ENCODE(c) & 1)		/* AEIOU */

/* These letters are passed through unchanged */
#define NOCHANGE(c) (ENCODE(c) & 2)		/* FJLMNR */

/* These form diphthongs when preceding H */
#define AFFECTH(c)  (ENCODE(c) & 4)		/* CGPST */

/* These make C and G soft */
#define MAKESOFT(c) (ENCODE(c) & 8)		/* EIY */

/* These prevent GH from becoming F */
#define NOGHTOF(c)  (ENCODE(c) & 16)	/* BDH */

/*-----------------------------  */
/* end of "metachar.h"           */
/*-----------------------------  */

/* I suppose I could have been using a character pointer instead of
 * accessing the array directly... */

/* The cursor macros below expect `word` (the input string) and `w_idx`
 * (the current position in it) to be in scope at the point of use. */

/* Look at the next letter in the word */
#define Next_Letter (toupper(word[w_idx+1]))
/* Look at the current letter in the word */
#define Curr_Letter (toupper(word[w_idx]))
/* Go N letters back; yields '\0' when that would fall before the start. */
#define Look_Back_Letter(n) (w_idx >= (n) ? toupper(word[w_idx-(n)]) : '\0')
/* Previous letter, or '\0' at the start of the word. */
#define Prev_Letter (Look_Back_Letter(1))
/* Look two letters down.  It makes sure you don't walk off the string. */
#define After_Next_Letter (Next_Letter != '\0' ? toupper(word[w_idx+2]) \
											   : '\0')
/* Look N letters down.  The casts keep the pointer type matching
 * Lookahead()'s parameter and keep the value handed to toupper()
 * non-negative. */
#define Look_Ahead_Letter(n) (toupper((unsigned char) Lookahead((char *) (word+w_idx), (n))))


/* Allows us to safely look ahead an arbitrary # of letters.
 *
 * Returns the character how_far positions past word, or the terminating
 * '\0' if the string ends before that. */
/* I probably could have just used strlen... */
static char Lookahead(char *word, int how_far)
{
	int idx = 0;

	/* Edge forward in the string, never stepping past the terminator. */
	while (idx < how_far && word[idx] != '\0') {
		idx++;
	}

	/* idx is now either == how_far or at the end of the string */
	return word[idx];
}


/* phonize one letter
 * We don't know the buffer's size in advance.  One way to solve this is to
 * just re-allocate the buffer.  We grow by an extra 2 characters at a time
 * (this could be one though; or more too).
 *
 * Expects `phoned_word`, `p_idx` and `max_buffer_len` in scope.  Kept as a
 * bare brace block (not do/while(0)) so it stays valid whether or not the
 * call site adds a trailing semicolon. */
#define Phonize(c)  { \
						if (p_idx >= max_buffer_len) { \
							*phoned_word = safe_erealloc(*phoned_word, 2, sizeof(char), max_buffer_len); \
							max_buffer_len += 2; \
						} \
						(*phoned_word)[p_idx++] = (c); \
					}
/* Slap a null character on the end of the phoned word, growing the buffer
 * by one byte first when it is exactly full. */
#define End_Phoned_Word	{ \
							if (p_idx == max_buffer_len) { \
								*phoned_word = safe_erealloc(*phoned_word, 1, sizeof(char), max_buffer_len); \
							} \
							(*phoned_word)[p_idx] = '\0'; \
						}
/* How long is the phoned word? */
#define Phone_Len	(p_idx)

162: /* Note is a letter is a 'break' in the word */ ! 163: #define Isbreak(c) (!isalpha(c)) ! 164: ! 165: /* {{{ metaphone ! 166: */ ! 167: static int metaphone(unsigned char *word, int word_len, long max_phonemes, char **phoned_word, int traditional) ! 168: { ! 169: int w_idx = 0; /* point in the phonization we're at. */ ! 170: int p_idx = 0; /* end of the phoned phrase */ ! 171: int max_buffer_len = 0; /* maximum length of the destination buffer */ ! 172: ! 173: /*-- Parameter checks --*/ ! 174: /* Negative phoneme length is meaningless */ ! 175: ! 176: if (max_phonemes < 0) ! 177: return -1; ! 178: ! 179: /* Empty/null string is meaningless */ ! 180: /* Overly paranoid */ ! 181: /* assert(word != NULL && word[0] != '\0'); */ ! 182: ! 183: if (word == NULL) ! 184: return -1; ! 185: ! 186: /*-- Allocate memory for our phoned_phrase --*/ ! 187: if (max_phonemes == 0) { /* Assume largest possible */ ! 188: max_buffer_len = word_len; ! 189: *phoned_word = safe_emalloc(sizeof(char), word_len, 1); ! 190: } else { ! 191: max_buffer_len = max_phonemes; ! 192: *phoned_word = safe_emalloc(sizeof(char), max_phonemes, 1); ! 193: } ! 194: ! 195: ! 196: /*-- The first phoneme has to be processed specially. --*/ ! 197: /* Find our first letter */ ! 198: for (; !isalpha(Curr_Letter); w_idx++) { ! 199: /* On the off chance we were given nothing but crap... */ ! 200: if (Curr_Letter == '\0') { ! 201: End_Phoned_Word ! 202: return SUCCESS; /* For testing */ ! 203: } ! 204: } ! 205: ! 206: switch (Curr_Letter) { ! 207: /* AE becomes E */ ! 208: case 'A': ! 209: if (Next_Letter == 'E') { ! 210: Phonize('E'); ! 211: w_idx += 2; ! 212: } ! 213: /* Remember, preserve vowels at the beginning */ ! 214: else { ! 215: Phonize('A'); ! 216: w_idx++; ! 217: } ! 218: break; ! 219: /* [GKP]N becomes N */ ! 220: case 'G': ! 221: case 'K': ! 222: case 'P': ! 223: if (Next_Letter == 'N') { ! 224: Phonize('N'); ! 225: w_idx += 2; ! 226: } ! 227: break; ! 228: /* WH becomes W, ! 229: WR becomes R ! 
230: W if followed by a vowel */ ! 231: case 'W': ! 232: if (Next_Letter == 'R') { ! 233: Phonize(Next_Letter); ! 234: w_idx += 2; ! 235: } else if (Next_Letter == 'H' || isvowel(Next_Letter)) { ! 236: Phonize('W'); ! 237: w_idx += 2; ! 238: } ! 239: /* else ignore */ ! 240: break; ! 241: /* X becomes S */ ! 242: case 'X': ! 243: Phonize('S'); ! 244: w_idx++; ! 245: break; ! 246: /* Vowels are kept */ ! 247: /* We did A already ! 248: case 'A': ! 249: case 'a': ! 250: */ ! 251: case 'E': ! 252: case 'I': ! 253: case 'O': ! 254: case 'U': ! 255: Phonize(Curr_Letter); ! 256: w_idx++; ! 257: break; ! 258: default: ! 259: /* do nothing */ ! 260: break; ! 261: } ! 262: ! 263: ! 264: ! 265: /* On to the metaphoning */ ! 266: for (; Curr_Letter != '\0' && ! 267: (max_phonemes == 0 || Phone_Len < max_phonemes); ! 268: w_idx++) { ! 269: /* How many letters to skip because an eariler encoding handled ! 270: * multiple letters */ ! 271: unsigned short int skip_letter = 0; ! 272: ! 273: ! 274: /* THOUGHT: It would be nice if, rather than having things like... ! 275: * well, SCI. For SCI you encode the S, then have to remember ! 276: * to skip the C. So the phonome SCI invades both S and C. It would ! 277: * be better, IMHO, to skip the C from the S part of the encoding. ! 278: * Hell, I'm trying it. ! 279: */ ! 280: ! 281: /* Ignore non-alphas */ ! 282: if (!isalpha(Curr_Letter)) ! 283: continue; ! 284: ! 285: /* Drop duplicates, except CC */ ! 286: if (Curr_Letter == Prev_Letter && ! 287: Curr_Letter != 'C') ! 288: continue; ! 289: ! 290: switch (Curr_Letter) { ! 291: /* B -> B unless in MB */ ! 292: case 'B': ! 293: if (Prev_Letter != 'M') ! 294: Phonize('B'); ! 295: break; ! 296: /* 'sh' if -CIA- or -CH, but not SCH, except SCHW. ! 297: * (SCHW is handled in S) ! 298: * S if -CI-, -CE- or -CY- ! 299: * dropped if -SCI-, SCE-, -SCY- (handed in S) ! 300: * else K ! 301: */ ! 302: case 'C': ! 303: if (MAKESOFT(Next_Letter)) { /* C[IEY] */ ! 
304: if (After_Next_Letter == 'A' && ! 305: Next_Letter == 'I') { /* CIA */ ! 306: Phonize(SH); ! 307: } ! 308: /* SC[IEY] */ ! 309: else if (Prev_Letter == 'S') { ! 310: /* Dropped */ ! 311: } else { ! 312: Phonize('S'); ! 313: } ! 314: } else if (Next_Letter == 'H') { ! 315: if ((!traditional) && (After_Next_Letter == 'R' || Prev_Letter == 'S')) { /* Christ, School */ ! 316: Phonize('K'); ! 317: } else { ! 318: Phonize(SH); ! 319: } ! 320: skip_letter++; ! 321: } else { ! 322: Phonize('K'); ! 323: } ! 324: break; ! 325: /* J if in -DGE-, -DGI- or -DGY- ! 326: * else T ! 327: */ ! 328: case 'D': ! 329: if (Next_Letter == 'G' && ! 330: MAKESOFT(After_Next_Letter)) { ! 331: Phonize('J'); ! 332: skip_letter++; ! 333: } else ! 334: Phonize('T'); ! 335: break; ! 336: /* F if in -GH and not B--GH, D--GH, -H--GH, -H---GH ! 337: * else dropped if -GNED, -GN, ! 338: * else dropped if -DGE-, -DGI- or -DGY- (handled in D) ! 339: * else J if in -GE-, -GI, -GY and not GG ! 340: * else K ! 341: */ ! 342: case 'G': ! 343: if (Next_Letter == 'H') { ! 344: if (!(NOGHTOF(Look_Back_Letter(3)) || ! 345: Look_Back_Letter(4) == 'H')) { ! 346: Phonize('F'); ! 347: skip_letter++; ! 348: } else { ! 349: /* silent */ ! 350: } ! 351: } else if (Next_Letter == 'N') { ! 352: if (Isbreak(After_Next_Letter) || ! 353: (After_Next_Letter == 'E' && ! 354: Look_Ahead_Letter(3) == 'D')) { ! 355: /* dropped */ ! 356: } else ! 357: Phonize('K'); ! 358: } else if (MAKESOFT(Next_Letter) && ! 359: Prev_Letter != 'G') { ! 360: Phonize('J'); ! 361: } else { ! 362: Phonize('K'); ! 363: } ! 364: break; ! 365: /* H if before a vowel and not after C,G,P,S,T */ ! 366: case 'H': ! 367: if (isvowel(Next_Letter) && ! 368: !AFFECTH(Prev_Letter)) ! 369: Phonize('H'); ! 370: break; ! 371: /* dropped if after C ! 372: * else K ! 373: */ ! 374: case 'K': ! 375: if (Prev_Letter != 'C') ! 376: Phonize('K'); ! 377: break; ! 378: /* F if before H ! 379: * else P ! 380: */ ! 381: case 'P': ! 382: if (Next_Letter == 'H') { ! 
383: Phonize('F'); ! 384: } else { ! 385: Phonize('P'); ! 386: } ! 387: break; ! 388: /* K ! 389: */ ! 390: case 'Q': ! 391: Phonize('K'); ! 392: break; ! 393: /* 'sh' in -SH-, -SIO- or -SIA- or -SCHW- ! 394: * else S ! 395: */ ! 396: case 'S': ! 397: if (Next_Letter == 'I' && ! 398: (After_Next_Letter == 'O' || ! 399: After_Next_Letter == 'A')) { ! 400: Phonize(SH); ! 401: } else if (Next_Letter == 'H') { ! 402: Phonize(SH); ! 403: skip_letter++; ! 404: } else if ((!traditional) && (Next_Letter == 'C' && Look_Ahead_Letter(2) == 'H' && Look_Ahead_Letter(3) == 'W')) { ! 405: Phonize(SH); ! 406: skip_letter += 2; ! 407: } else { ! 408: Phonize('S'); ! 409: } ! 410: break; ! 411: /* 'sh' in -TIA- or -TIO- ! 412: * else 'th' before H ! 413: * else T ! 414: */ ! 415: case 'T': ! 416: if (Next_Letter == 'I' && ! 417: (After_Next_Letter == 'O' || ! 418: After_Next_Letter == 'A')) { ! 419: Phonize(SH); ! 420: } else if (Next_Letter == 'H') { ! 421: Phonize(TH); ! 422: skip_letter++; ! 423: } else if (!(Next_Letter == 'C' && After_Next_Letter == 'H')) { ! 424: Phonize('T'); ! 425: } ! 426: break; ! 427: /* F */ ! 428: case 'V': ! 429: Phonize('F'); ! 430: break; ! 431: /* W before a vowel, else dropped */ ! 432: case 'W': ! 433: if (isvowel(Next_Letter)) ! 434: Phonize('W'); ! 435: break; ! 436: /* KS */ ! 437: case 'X': ! 438: Phonize('K'); ! 439: Phonize('S'); ! 440: break; ! 441: /* Y if followed by a vowel */ ! 442: case 'Y': ! 443: if (isvowel(Next_Letter)) ! 444: Phonize('Y'); ! 445: break; ! 446: /* S */ ! 447: case 'Z': ! 448: Phonize('S'); ! 449: break; ! 450: /* No transformation */ ! 451: case 'F': ! 452: case 'J': ! 453: case 'L': ! 454: case 'M': ! 455: case 'N': ! 456: case 'R': ! 457: Phonize(Curr_Letter); ! 458: break; ! 459: default: ! 460: /* nothing */ ! 461: break; ! 462: } /* END SWITCH */ ! 463: ! 464: w_idx += skip_letter; ! 465: } /* END FOR */ ! 466: ! 467: End_Phoned_Word; ! 468: ! 469: return 0; ! 470: } /* END metaphone */ ! 471: /* }}} */ ! 
472: ! 473: /* ! 474: * Local variables: ! 475: * tab-width: 4 ! 476: * c-basic-offset: 4 ! 477: * End: ! 478: * vim600: sw=4 ts=4 fdm=marker ! 479: * vim<600: sw=4 ts=4 ! 480: */