Annotation of embedaddon/php/ext/standard/metaphone.c, revision 1.1
1.1 ! misho 1: /*
! 2: +----------------------------------------------------------------------+
! 3: | PHP Version 5 |
! 4: +----------------------------------------------------------------------+
! 5: | Copyright (c) 1997-2012 The PHP Group |
! 6: +----------------------------------------------------------------------+
! 7: | This source file is subject to version 3.01 of the PHP license, |
! 8: | that is bundled with this package in the file LICENSE, and is |
! 9: | available through the world-wide-web at the following url: |
! 10: | http://www.php.net/license/3_01.txt |
! 11: | If you did not receive a copy of the PHP license and are unable to |
! 12: | obtain it through the world-wide-web, please send a note to |
! 13: | license@php.net so we can mail you a copy immediately. |
! 14: +----------------------------------------------------------------------+
! 15: | Author: Thies C. Arntzen <thies@thieso.net> |
! 16: +----------------------------------------------------------------------+
! 17: */
! 18:
! 19: /* $Id: metaphone.c 321634 2012-01-01 13:15:04Z felipe $ */
! 20:
! 21: /*
! 22: Based on CPANs "Text-Metaphone-1.96" by Michael G Schwern <schwern@pobox.com>
! 23: */
! 24:
! 25: #include "php.h"
! 26: #include "php_metaphone.h"
! 27:
/* Core encoder, defined further down in this file.  Returns -1 when
 * max_phonemes is negative or word is NULL; otherwise stores an allocated,
 * NUL-terminated key in *phoned_word and returns 0. */
static int metaphone(unsigned char *word, int word_len, long max_phonemes, char **phoned_word, int traditional);
! 29:
! 30: /* {{{ proto string metaphone(string text[, int phones])
! 31: Break english phrases down into their phonemes */
! 32: PHP_FUNCTION(metaphone)
! 33: {
! 34: char *str;
! 35: char *result = 0;
! 36: int str_len;
! 37: long phones = 0;
! 38:
! 39: if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s|l", &str, &str_len,
! 40: &phones) == FAILURE) {
! 41: return;
! 42: }
! 43:
! 44: if (metaphone((unsigned char *)str, str_len, phones, &result, 1) == 0) {
! 45: RETVAL_STRING(result, 0);
! 46: } else {
! 47: if (result) {
! 48: efree(result);
! 49: }
! 50: RETURN_FALSE;
! 51: }
! 52: }
! 53: /* }}} */
! 54:
! 55: /*
! 56: What follows is the original code by Michael G Schwern.
! 57: I have changed it only slightly (use emalloc,
! 58: get rid of includes, etc.)
! 59: - thies - 13.09.1999
! 60: */
! 61:
! 62: /*----------------------------- */
! 63: /* this used to be "metaphone.h" */
! 64: /*----------------------------- */
! 65:
/* Special encodings: output codes for the two-letter sounds */
#define SH 'X'	/* emitted where the rules below produce the "sh" sound */
#define TH '0'	/* emitted for T followed by H; note this is the digit zero */
! 69:
! 70: /*----------------------------- */
! 71: /* end of "metaphone.h" */
! 72: /*----------------------------- */
! 73:
! 74: /*----------------------------- */
! 75: /* this used to be "metachar.h" */
! 76: /*----------------------------- */
! 77:
! 78: /* Metachar.h ... little bits about characters for metaphone */
! 79: /*-- Character encoding array & accessing macros --*/
! 80: /* Stolen directly out of the book... */
/* Class bitmask for each letter, indexed by (uppercase letter - 'A').
 * Bit values:
 *    1  vowel (A E I O U)
 *    2  passed through unchanged (F J L M N R)
 *    4  forms a diphthong when preceding H (C G P S T)
 *    8  makes a preceding C or G soft (E I Y)
 *   16  prevents GH from becoming F (B D H)
 * The table is read-only and private to this file. */
static const char _codes[26] =
{
	1, 16, 4, 16, 9, 2, 4, 16, 9, 2, 0, 2, 2, 2, 1, 4, 0, 2, 4, 4, 1, 0, 0, 0, 8, 0
/*  a   b  c   d  e  f  g   h  i  j  k  l  m  n  o  p  q  r  s  t  u  v  w  x  y  z */
};

/* Return the class bitmask for c, or 0 when c is not an ASCII letter.
 *
 * The range is tested explicitly instead of with isalpha()/toupper(): those
 * are locale dependent, and a locale that treats bytes above 127 as letters
 * would otherwise produce an index outside the 26-entry table. */
static int metaphone_char_class(int c)
{
	if (c >= 'a' && c <= 'z') {
		c -= 'a' - 'A';
	}
	if (c < 'A' || c > 'Z') {
		return 0;
	}
	return _codes[c - 'A'];
}

#define ENCODE(c) (metaphone_char_class(c))

#define isvowel(c)  (ENCODE(c) & 1)		/* AEIOU */

/* These letters are passed through unchanged */
#define NOCHANGE(c) (ENCODE(c) & 2)		/* FJLMNR */

/* These form diphthongs when preceding H */
#define AFFECTH(c)  (ENCODE(c) & 4)		/* CGPST */

/* These make C and G soft */
#define MAKESOFT(c) (ENCODE(c) & 8)		/* EIY */

/* These prevent GH from becoming F */
#define NOGHTOF(c)  (ENCODE(c) & 16)	/* BDH */
! 103:
! 104: /*----------------------------- */
! 105: /* end of "metachar.h" */
! 106: /*----------------------------- */
! 107:
! 108: /* I suppose I could have been using a character pointer instead of
! 109: * accessing the array directly... */
! 110:
/* These macros expect `word` (the input string) and `w_idx` (the current
 * position in it) to be in scope at the point of use.  Every byte is cast to
 * unsigned char before it reaches toupper(), which is undefined for negative
 * values other than EOF. */

/* Look at the next letter in the word */
#define Next_Letter (toupper((unsigned char) word[w_idx+1]))
/* Look at the current letter in the word */
#define Curr_Letter (toupper((unsigned char) word[w_idx]))
/* Go n letters back; yields '\0' when that would land before the start. */
#define Look_Back_Letter(n) (w_idx >= (n) ? toupper((unsigned char) word[w_idx-(n)]) : '\0')
/* Previous letter, or '\0' at the start of the word. */
#define Prev_Letter (Look_Back_Letter(1))
/* Look two letters down.  It makes sure you don't walk off the string. */
#define After_Next_Letter (Next_Letter != '\0' ? toupper((unsigned char) word[w_idx+2]) \
											   : '\0')
/* Look n letters down via Lookahead(), which stops at the terminating NUL. */
#define Look_Ahead_Letter(n) (toupper((unsigned char) Lookahead((char *) word + w_idx, (n))))
! 123:
! 124:
/* Safely look ahead an arbitrary number of letters.
 *
 * Returns the character how_far positions past word[0].  The scan stops at
 * the terminating NUL, so asking for a position beyond the end of the string
 * yields '\0' instead of reading past it.  A how_far of zero or less yields
 * word[0].  The string is only read, never modified. */
static char Lookahead(const char *word, int how_far)
{
	int idx = 0;

	/* Edge forward in the string, never stepping over the terminator. */
	while (idx < how_far && word[idx] != '\0') {
		idx++;
	}

	/* idx is now either how_far or the index of the terminating NUL. */
	return word[idx];
}
! 139:
! 140:
/* The macros below expect `phoned_word` (char **), `p_idx` and
 * `max_buffer_len` to be in scope at the point of use. */

/* Append one phoneme to the output.
 * The final length is not known in advance, so a full buffer is enlarged
 * before the write.  Two extra characters are requested each time (one would
 * do, as would more); the request is 2 * sizeof(char) + max_buffer_len,
 * assuming safe_erealloc's usual nmemb * size + offset contract. */
#define Phonize(c) do { \
		if (max_buffer_len <= p_idx) { \
			*phoned_word = safe_erealloc(*phoned_word, 2, sizeof(char), max_buffer_len); \
			max_buffer_len += 2; \
		} \
		(*phoned_word)[p_idx] = (c); \
		p_idx++; \
	} while (0)

/* Terminate the phoned word with a NUL, making room for it first when the
 * buffer is exactly full.  This stays a plain brace block because it is also
 * used without a trailing semicolon. */
#define End_Phoned_Word { \
		if (max_buffer_len == p_idx) { \
			*phoned_word = safe_erealloc(*phoned_word, 1, sizeof(char), max_buffer_len); \
		} \
		(*phoned_word)[p_idx] = '\0'; \
	}

/* Number of phonemes written so far */
#define Phone_Len (p_idx)

/* True when c is a 'break' in the word, i.e. anything that is not a letter */
#define Isbreak(c) (isalpha(c) == 0)
! 164:
! 165: /* {{{ metaphone
! 166: */
! 167: static int metaphone(unsigned char *word, int word_len, long max_phonemes, char **phoned_word, int traditional)
! 168: {
! 169: int w_idx = 0; /* point in the phonization we're at. */
! 170: int p_idx = 0; /* end of the phoned phrase */
! 171: int max_buffer_len = 0; /* maximum length of the destination buffer */
! 172:
! 173: /*-- Parameter checks --*/
! 174: /* Negative phoneme length is meaningless */
! 175:
! 176: if (max_phonemes < 0)
! 177: return -1;
! 178:
! 179: /* Empty/null string is meaningless */
! 180: /* Overly paranoid */
! 181: /* assert(word != NULL && word[0] != '\0'); */
! 182:
! 183: if (word == NULL)
! 184: return -1;
! 185:
! 186: /*-- Allocate memory for our phoned_phrase --*/
! 187: if (max_phonemes == 0) { /* Assume largest possible */
! 188: max_buffer_len = word_len;
! 189: *phoned_word = safe_emalloc(sizeof(char), word_len, 1);
! 190: } else {
! 191: max_buffer_len = max_phonemes;
! 192: *phoned_word = safe_emalloc(sizeof(char), max_phonemes, 1);
! 193: }
! 194:
! 195:
! 196: /*-- The first phoneme has to be processed specially. --*/
! 197: /* Find our first letter */
! 198: for (; !isalpha(Curr_Letter); w_idx++) {
! 199: /* On the off chance we were given nothing but crap... */
! 200: if (Curr_Letter == '\0') {
! 201: End_Phoned_Word
! 202: return SUCCESS; /* For testing */
! 203: }
! 204: }
! 205:
! 206: switch (Curr_Letter) {
! 207: /* AE becomes E */
! 208: case 'A':
! 209: if (Next_Letter == 'E') {
! 210: Phonize('E');
! 211: w_idx += 2;
! 212: }
! 213: /* Remember, preserve vowels at the beginning */
! 214: else {
! 215: Phonize('A');
! 216: w_idx++;
! 217: }
! 218: break;
! 219: /* [GKP]N becomes N */
! 220: case 'G':
! 221: case 'K':
! 222: case 'P':
! 223: if (Next_Letter == 'N') {
! 224: Phonize('N');
! 225: w_idx += 2;
! 226: }
! 227: break;
! 228: /* WH becomes W,
! 229: WR becomes R
! 230: W if followed by a vowel */
! 231: case 'W':
! 232: if (Next_Letter == 'R') {
! 233: Phonize(Next_Letter);
! 234: w_idx += 2;
! 235: } else if (Next_Letter == 'H' || isvowel(Next_Letter)) {
! 236: Phonize('W');
! 237: w_idx += 2;
! 238: }
! 239: /* else ignore */
! 240: break;
! 241: /* X becomes S */
! 242: case 'X':
! 243: Phonize('S');
! 244: w_idx++;
! 245: break;
! 246: /* Vowels are kept */
! 247: /* We did A already
! 248: case 'A':
! 249: case 'a':
! 250: */
! 251: case 'E':
! 252: case 'I':
! 253: case 'O':
! 254: case 'U':
! 255: Phonize(Curr_Letter);
! 256: w_idx++;
! 257: break;
! 258: default:
! 259: /* do nothing */
! 260: break;
! 261: }
! 262:
! 263:
! 264:
! 265: /* On to the metaphoning */
! 266: for (; Curr_Letter != '\0' &&
! 267: (max_phonemes == 0 || Phone_Len < max_phonemes);
! 268: w_idx++) {
! 269: /* How many letters to skip because an eariler encoding handled
! 270: * multiple letters */
! 271: unsigned short int skip_letter = 0;
! 272:
! 273:
! 274: /* THOUGHT: It would be nice if, rather than having things like...
! 275: * well, SCI. For SCI you encode the S, then have to remember
! 276: * to skip the C. So the phonome SCI invades both S and C. It would
! 277: * be better, IMHO, to skip the C from the S part of the encoding.
! 278: * Hell, I'm trying it.
! 279: */
! 280:
! 281: /* Ignore non-alphas */
! 282: if (!isalpha(Curr_Letter))
! 283: continue;
! 284:
! 285: /* Drop duplicates, except CC */
! 286: if (Curr_Letter == Prev_Letter &&
! 287: Curr_Letter != 'C')
! 288: continue;
! 289:
! 290: switch (Curr_Letter) {
! 291: /* B -> B unless in MB */
! 292: case 'B':
! 293: if (Prev_Letter != 'M')
! 294: Phonize('B');
! 295: break;
! 296: /* 'sh' if -CIA- or -CH, but not SCH, except SCHW.
! 297: * (SCHW is handled in S)
! 298: * S if -CI-, -CE- or -CY-
! 299: * dropped if -SCI-, SCE-, -SCY- (handed in S)
! 300: * else K
! 301: */
! 302: case 'C':
! 303: if (MAKESOFT(Next_Letter)) { /* C[IEY] */
! 304: if (After_Next_Letter == 'A' &&
! 305: Next_Letter == 'I') { /* CIA */
! 306: Phonize(SH);
! 307: }
! 308: /* SC[IEY] */
! 309: else if (Prev_Letter == 'S') {
! 310: /* Dropped */
! 311: } else {
! 312: Phonize('S');
! 313: }
! 314: } else if (Next_Letter == 'H') {
! 315: if ((!traditional) && (After_Next_Letter == 'R' || Prev_Letter == 'S')) { /* Christ, School */
! 316: Phonize('K');
! 317: } else {
! 318: Phonize(SH);
! 319: }
! 320: skip_letter++;
! 321: } else {
! 322: Phonize('K');
! 323: }
! 324: break;
! 325: /* J if in -DGE-, -DGI- or -DGY-
! 326: * else T
! 327: */
! 328: case 'D':
! 329: if (Next_Letter == 'G' &&
! 330: MAKESOFT(After_Next_Letter)) {
! 331: Phonize('J');
! 332: skip_letter++;
! 333: } else
! 334: Phonize('T');
! 335: break;
! 336: /* F if in -GH and not B--GH, D--GH, -H--GH, -H---GH
! 337: * else dropped if -GNED, -GN,
! 338: * else dropped if -DGE-, -DGI- or -DGY- (handled in D)
! 339: * else J if in -GE-, -GI, -GY and not GG
! 340: * else K
! 341: */
! 342: case 'G':
! 343: if (Next_Letter == 'H') {
! 344: if (!(NOGHTOF(Look_Back_Letter(3)) ||
! 345: Look_Back_Letter(4) == 'H')) {
! 346: Phonize('F');
! 347: skip_letter++;
! 348: } else {
! 349: /* silent */
! 350: }
! 351: } else if (Next_Letter == 'N') {
! 352: if (Isbreak(After_Next_Letter) ||
! 353: (After_Next_Letter == 'E' &&
! 354: Look_Ahead_Letter(3) == 'D')) {
! 355: /* dropped */
! 356: } else
! 357: Phonize('K');
! 358: } else if (MAKESOFT(Next_Letter) &&
! 359: Prev_Letter != 'G') {
! 360: Phonize('J');
! 361: } else {
! 362: Phonize('K');
! 363: }
! 364: break;
! 365: /* H if before a vowel and not after C,G,P,S,T */
! 366: case 'H':
! 367: if (isvowel(Next_Letter) &&
! 368: !AFFECTH(Prev_Letter))
! 369: Phonize('H');
! 370: break;
! 371: /* dropped if after C
! 372: * else K
! 373: */
! 374: case 'K':
! 375: if (Prev_Letter != 'C')
! 376: Phonize('K');
! 377: break;
! 378: /* F if before H
! 379: * else P
! 380: */
! 381: case 'P':
! 382: if (Next_Letter == 'H') {
! 383: Phonize('F');
! 384: } else {
! 385: Phonize('P');
! 386: }
! 387: break;
! 388: /* K
! 389: */
! 390: case 'Q':
! 391: Phonize('K');
! 392: break;
! 393: /* 'sh' in -SH-, -SIO- or -SIA- or -SCHW-
! 394: * else S
! 395: */
! 396: case 'S':
! 397: if (Next_Letter == 'I' &&
! 398: (After_Next_Letter == 'O' ||
! 399: After_Next_Letter == 'A')) {
! 400: Phonize(SH);
! 401: } else if (Next_Letter == 'H') {
! 402: Phonize(SH);
! 403: skip_letter++;
! 404: } else if ((!traditional) && (Next_Letter == 'C' && Look_Ahead_Letter(2) == 'H' && Look_Ahead_Letter(3) == 'W')) {
! 405: Phonize(SH);
! 406: skip_letter += 2;
! 407: } else {
! 408: Phonize('S');
! 409: }
! 410: break;
! 411: /* 'sh' in -TIA- or -TIO-
! 412: * else 'th' before H
! 413: * else T
! 414: */
! 415: case 'T':
! 416: if (Next_Letter == 'I' &&
! 417: (After_Next_Letter == 'O' ||
! 418: After_Next_Letter == 'A')) {
! 419: Phonize(SH);
! 420: } else if (Next_Letter == 'H') {
! 421: Phonize(TH);
! 422: skip_letter++;
! 423: } else if (!(Next_Letter == 'C' && After_Next_Letter == 'H')) {
! 424: Phonize('T');
! 425: }
! 426: break;
! 427: /* F */
! 428: case 'V':
! 429: Phonize('F');
! 430: break;
! 431: /* W before a vowel, else dropped */
! 432: case 'W':
! 433: if (isvowel(Next_Letter))
! 434: Phonize('W');
! 435: break;
! 436: /* KS */
! 437: case 'X':
! 438: Phonize('K');
! 439: Phonize('S');
! 440: break;
! 441: /* Y if followed by a vowel */
! 442: case 'Y':
! 443: if (isvowel(Next_Letter))
! 444: Phonize('Y');
! 445: break;
! 446: /* S */
! 447: case 'Z':
! 448: Phonize('S');
! 449: break;
! 450: /* No transformation */
! 451: case 'F':
! 452: case 'J':
! 453: case 'L':
! 454: case 'M':
! 455: case 'N':
! 456: case 'R':
! 457: Phonize(Curr_Letter);
! 458: break;
! 459: default:
! 460: /* nothing */
! 461: break;
! 462: } /* END SWITCH */
! 463:
! 464: w_idx += skip_letter;
! 465: } /* END FOR */
! 466:
! 467: End_Phoned_Word;
! 468:
! 469: return 0;
! 470: } /* END metaphone */
! 471: /* }}} */
! 472:
! 473: /*
! 474: * Local variables:
! 475: * tab-width: 4
! 476: * c-basic-offset: 4
! 477: * End:
! 478: * vim600: sw=4 ts=4 fdm=marker
! 479: * vim<600: sw=4 ts=4
! 480: */
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>