/* Retrieved from the ELWIX (Embedded LightWeight unIX) CVS web view:
   embedaddon / php / ext / standard / metaphone.c */
/*
   +----------------------------------------------------------------------+
   | PHP Version 5                                                        |
   +----------------------------------------------------------------------+
   | Copyright (c) 1997-2013 The PHP Group                                |
   +----------------------------------------------------------------------+
   | This source file is subject to version 3.01 of the PHP license,      |
   | that is bundled with this package in the file LICENSE, and is        |
   | available through the world-wide-web at the following url:           |
   | http://www.php.net/license/3_01.txt                                  |
   | If you did not receive a copy of the PHP license and are unable to   |
   | obtain it through the world-wide-web, please send a note to          |
   | license@php.net so we can mail you a copy immediately.               |
   +----------------------------------------------------------------------+
   | Author: Thies C. Arntzen <thies@thieso.net>                          |
   +----------------------------------------------------------------------+
 */

/* $Id$ */

/*
	Based on CPANs "Text-Metaphone-1.96" by Michael G Schwern <schwern@pobox.com>
*/

25: #include "php.h"
26: #include "php_metaphone.h"
27:
28: static int metaphone(unsigned char *word, int word_len, long max_phonemes, char **phoned_word, int traditional);
29:
30: /* {{{ proto string metaphone(string text[, int phones])
31: Break english phrases down into their phonemes */
32: PHP_FUNCTION(metaphone)
33: {
34: char *str;
35: char *result = 0;
36: int str_len;
37: long phones = 0;
38:
39: if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s|l", &str, &str_len,
40: &phones) == FAILURE) {
41: return;
42: }
43:
44: if (metaphone((unsigned char *)str, str_len, phones, &result, 1) == 0) {
45: RETVAL_STRING(result, 0);
46: } else {
47: if (result) {
48: efree(result);
49: }
50: RETURN_FALSE;
51: }
52: }
53: /* }}} */
54:
/*
   this is now the original code by Michael G Schwern:
   I've changed it just slightly (use emalloc,
   get rid of includes etc)
	- thies - 13.09.1999
*/

/*-----------------------------  */
/* this used to be "metaphone.h" */
/*-----------------------------  */

/* Special encodings: single output characters standing in for sounds that
 * have no letter of their own in the phonetic key. */
#define SH 'X'	/* emitted for the 'sh' sound (SH, -SIO-, -TIA-, CH, ...) */
#define TH '0'	/* emitted for the 'th' sound (T followed by H); digit zero */

/*-----------------------------  */
/* end of "metaphone.h"          */
/*-----------------------------  */

/*----------------------------- */
/* this used to be "metachar.h" */
/*----------------------------- */

/* Metachar.h ... little bits about characters for metaphone */
/*-- Character encoding array & accessing macros --*/
/* Stolen directly out of the book... */

/*
 * Per-letter property bits, indexed by (uppercase letter - 'A'):
 *    1  vowel                          (AEIOU)
 *    2  passed through unchanged       (FJLMNR)
 *    4  forms a diphthong before H     (CGPST)
 *    8  makes a preceding C/G soft     (EIY)
 *   16  prevents GH from becoming F    (BDH)
 * The table is read-only and only used through the macros below.
 */
static const char _codes[26] =
{
	1, 16, 4, 16, 9, 2, 4, 16, 9, 2, 0, 2, 2, 2, 1, 4, 0, 2, 4, 4, 1, 0, 0, 0, 8, 0
/*  a   b  c   d  e  f  g   h  i  j  k  l  m  n  o  p  q  r  s  t  u  v  w  x  y  z */
};


/*
 * Property bits for character c, or 0 when c does not uppercase to a plain
 * 'A'..'Z'.  The explicit range check (rather than isalpha()) keeps the index
 * inside _codes[] even when the current locale classifies other byte values
 * as letters.  c is evaluated more than once: pass side-effect-free
 * expressions only.
 */
#define ENCODE(c) ((toupper(c) >= 'A' && toupper(c) <= 'Z') ? _codes[toupper(c) - 'A'] : 0)

#define isvowel(c)  (ENCODE(c) & 1)		/* AEIOU */

/* These letters are passed through unchanged */
#define NOCHANGE(c) (ENCODE(c) & 2)		/* FJLMNR */

/* These form diphthongs when preceding H */
#define AFFECTH(c)  (ENCODE(c) & 4)		/* CGPST */

/* These make C and G soft */
#define MAKESOFT(c) (ENCODE(c) & 8)		/* EIY */

/* These prevent GH from becoming F */
#define NOGHTOF(c)  (ENCODE(c) & 16)	/* BDH */

/*----------------------------- */
/* end of "metachar.h"          */
/*----------------------------- */

/* I suppose I could have been using a character pointer instead of
 * accessing the array directly... */

/*
 * All of the macros below expect two names to be in scope at the point of
 * use: `word` (the NUL-terminated input, as unsigned char) and `w_idx` (the
 * current index into it).  Each one yields the upper-cased letter, or '\0'
 * where noted.
 */

/* Look at the next letter in the word */
#define Next_Letter (toupper(word[w_idx+1]))
/* Look at the current letter in the word */
#define Curr_Letter (toupper(word[w_idx]))
/* Go N letters back; '\0' when that would be before the start of the word. */
#define Look_Back_Letter(n) (w_idx >= (n) ? toupper(word[w_idx-(n)]) : '\0')
/* Previous letter, or '\0' at the start of the word. */
#define Prev_Letter (Look_Back_Letter(1))
/* Look two letters down.  It makes sure you don't walk off the string. */
#define After_Next_Letter (Next_Letter != '\0' ? toupper(word[w_idx+2]) \
											   : '\0')
/* Look N letters down without walking past the terminating NUL. */
#define Look_Ahead_Letter(n) (toupper(Lookahead(word+w_idx, (n))))


/* Allows us to safely look ahead an arbitrary # of letters.
 *
 * Returns word[how_far], or '\0' if the string ends before that position.
 * The bytes are handled as unsigned char so the result can be passed
 * straight to the <ctype.h> functions, and so the function matches the
 * unsigned char buffer it is called with. */
static unsigned char Lookahead(const unsigned char *word, int how_far)
{
	int idx = 0;

	/* Edge forward in the string, stopping early at the terminator. */
	while (idx < how_far && word[idx] != '\0') {
		idx++;
	}

	/* idx is now either == how_far or the index of the terminating NUL. */
	return word[idx];
}


/* phonize one letter
 * We don't know the buffer's size in advance. One way to solve this is to
 * just re-allocate the buffer. We're using an extra of 2 characters (this
 * could be one though; or more too).
 *
 * Expects `phoned_word` (char **), `p_idx` and `max_buffer_len` to be in
 * scope.  Wrapped in do { } while (0) so that `Phonize(x);` is exactly one
 * statement and is safe inside an unbraced if/else.
 * NOTE(review): safe_erealloc(ptr, n, size, offset) is assumed to resize to
 * n * size + offset bytes - confirm against the Zend allocator API. */
#define Phonize(c)	do { \
						if (p_idx >= max_buffer_len) { \
							*phoned_word = safe_erealloc(*phoned_word, 2, sizeof(char), max_buffer_len); \
							max_buffer_len += 2; \
						} \
						(*phoned_word)[p_idx++] = (c); \
					} while (0)
/* Slap a null character on the end of the phoned word.
 * Grows the buffer by one byte first when the key fills it completely.
 * Deliberately left as a plain brace block: it is also used without a
 * trailing semicolon. */
#define End_Phoned_Word	{ \
							if (p_idx == max_buffer_len) { \
								*phoned_word = safe_erealloc(*phoned_word, 1, sizeof(char), max_buffer_len); \
							} \
							(*phoned_word)[p_idx] = '\0'; \
						}
/* How long is the phoned word? */
#define Phone_Len	(p_idx)

/* Note if a letter is a 'break' in the word: anything that is not
 * alphabetic (including the terminating NUL) counts as a break. */
#define Isbreak(c)  (!isalpha(c))

165: /* {{{ metaphone
166: */
167: static int metaphone(unsigned char *word, int word_len, long max_phonemes, char **phoned_word, int traditional)
168: {
169: int w_idx = 0; /* point in the phonization we're at. */
170: int p_idx = 0; /* end of the phoned phrase */
171: int max_buffer_len = 0; /* maximum length of the destination buffer */
172:
173: /*-- Parameter checks --*/
174: /* Negative phoneme length is meaningless */
175:
176: if (max_phonemes < 0)
177: return -1;
178:
179: /* Empty/null string is meaningless */
180: /* Overly paranoid */
181: /* assert(word != NULL && word[0] != '\0'); */
182:
183: if (word == NULL)
184: return -1;
185:
186: /*-- Allocate memory for our phoned_phrase --*/
187: if (max_phonemes == 0) { /* Assume largest possible */
188: max_buffer_len = word_len;
189: *phoned_word = safe_emalloc(sizeof(char), word_len, 1);
190: } else {
191: max_buffer_len = max_phonemes;
192: *phoned_word = safe_emalloc(sizeof(char), max_phonemes, 1);
193: }
194:
195:
196: /*-- The first phoneme has to be processed specially. --*/
197: /* Find our first letter */
198: for (; !isalpha(Curr_Letter); w_idx++) {
199: /* On the off chance we were given nothing but crap... */
200: if (Curr_Letter == '\0') {
201: End_Phoned_Word
202: return SUCCESS; /* For testing */
203: }
204: }
205:
206: switch (Curr_Letter) {
207: /* AE becomes E */
208: case 'A':
209: if (Next_Letter == 'E') {
210: Phonize('E');
211: w_idx += 2;
212: }
213: /* Remember, preserve vowels at the beginning */
214: else {
215: Phonize('A');
216: w_idx++;
217: }
218: break;
219: /* [GKP]N becomes N */
220: case 'G':
221: case 'K':
222: case 'P':
223: if (Next_Letter == 'N') {
224: Phonize('N');
225: w_idx += 2;
226: }
227: break;
228: /* WH becomes W,
229: WR becomes R
230: W if followed by a vowel */
231: case 'W':
232: if (Next_Letter == 'R') {
233: Phonize(Next_Letter);
234: w_idx += 2;
235: } else if (Next_Letter == 'H' || isvowel(Next_Letter)) {
236: Phonize('W');
237: w_idx += 2;
238: }
239: /* else ignore */
240: break;
241: /* X becomes S */
242: case 'X':
243: Phonize('S');
244: w_idx++;
245: break;
246: /* Vowels are kept */
247: /* We did A already
248: case 'A':
249: case 'a':
250: */
251: case 'E':
252: case 'I':
253: case 'O':
254: case 'U':
255: Phonize(Curr_Letter);
256: w_idx++;
257: break;
258: default:
259: /* do nothing */
260: break;
261: }
262:
263:
264:
265: /* On to the metaphoning */
266: for (; Curr_Letter != '\0' &&
267: (max_phonemes == 0 || Phone_Len < max_phonemes);
268: w_idx++) {
269: /* How many letters to skip because an eariler encoding handled
270: * multiple letters */
271: unsigned short int skip_letter = 0;
272:
273:
274: /* THOUGHT: It would be nice if, rather than having things like...
275: * well, SCI. For SCI you encode the S, then have to remember
276: * to skip the C. So the phonome SCI invades both S and C. It would
277: * be better, IMHO, to skip the C from the S part of the encoding.
278: * Hell, I'm trying it.
279: */
280:
281: /* Ignore non-alphas */
282: if (!isalpha(Curr_Letter))
283: continue;
284:
285: /* Drop duplicates, except CC */
286: if (Curr_Letter == Prev_Letter &&
287: Curr_Letter != 'C')
288: continue;
289:
290: switch (Curr_Letter) {
291: /* B -> B unless in MB */
292: case 'B':
293: if (Prev_Letter != 'M')
294: Phonize('B');
295: break;
296: /* 'sh' if -CIA- or -CH, but not SCH, except SCHW.
297: * (SCHW is handled in S)
298: * S if -CI-, -CE- or -CY-
299: * dropped if -SCI-, SCE-, -SCY- (handed in S)
300: * else K
301: */
302: case 'C':
303: if (MAKESOFT(Next_Letter)) { /* C[IEY] */
304: if (After_Next_Letter == 'A' &&
305: Next_Letter == 'I') { /* CIA */
306: Phonize(SH);
307: }
308: /* SC[IEY] */
309: else if (Prev_Letter == 'S') {
310: /* Dropped */
311: } else {
312: Phonize('S');
313: }
314: } else if (Next_Letter == 'H') {
315: if ((!traditional) && (After_Next_Letter == 'R' || Prev_Letter == 'S')) { /* Christ, School */
316: Phonize('K');
317: } else {
318: Phonize(SH);
319: }
320: skip_letter++;
321: } else {
322: Phonize('K');
323: }
324: break;
325: /* J if in -DGE-, -DGI- or -DGY-
326: * else T
327: */
328: case 'D':
329: if (Next_Letter == 'G' &&
330: MAKESOFT(After_Next_Letter)) {
331: Phonize('J');
332: skip_letter++;
333: } else
334: Phonize('T');
335: break;
336: /* F if in -GH and not B--GH, D--GH, -H--GH, -H---GH
337: * else dropped if -GNED, -GN,
338: * else dropped if -DGE-, -DGI- or -DGY- (handled in D)
339: * else J if in -GE-, -GI, -GY and not GG
340: * else K
341: */
342: case 'G':
343: if (Next_Letter == 'H') {
344: if (!(NOGHTOF(Look_Back_Letter(3)) ||
345: Look_Back_Letter(4) == 'H')) {
346: Phonize('F');
347: skip_letter++;
348: } else {
349: /* silent */
350: }
351: } else if (Next_Letter == 'N') {
352: if (Isbreak(After_Next_Letter) ||
353: (After_Next_Letter == 'E' &&
354: Look_Ahead_Letter(3) == 'D')) {
355: /* dropped */
356: } else
357: Phonize('K');
358: } else if (MAKESOFT(Next_Letter) &&
359: Prev_Letter != 'G') {
360: Phonize('J');
361: } else {
362: Phonize('K');
363: }
364: break;
365: /* H if before a vowel and not after C,G,P,S,T */
366: case 'H':
367: if (isvowel(Next_Letter) &&
368: !AFFECTH(Prev_Letter))
369: Phonize('H');
370: break;
371: /* dropped if after C
372: * else K
373: */
374: case 'K':
375: if (Prev_Letter != 'C')
376: Phonize('K');
377: break;
378: /* F if before H
379: * else P
380: */
381: case 'P':
382: if (Next_Letter == 'H') {
383: Phonize('F');
384: } else {
385: Phonize('P');
386: }
387: break;
388: /* K
389: */
390: case 'Q':
391: Phonize('K');
392: break;
393: /* 'sh' in -SH-, -SIO- or -SIA- or -SCHW-
394: * else S
395: */
396: case 'S':
397: if (Next_Letter == 'I' &&
398: (After_Next_Letter == 'O' ||
399: After_Next_Letter == 'A')) {
400: Phonize(SH);
401: } else if (Next_Letter == 'H') {
402: Phonize(SH);
403: skip_letter++;
404: } else if ((!traditional) && (Next_Letter == 'C' && Look_Ahead_Letter(2) == 'H' && Look_Ahead_Letter(3) == 'W')) {
405: Phonize(SH);
406: skip_letter += 2;
407: } else {
408: Phonize('S');
409: }
410: break;
411: /* 'sh' in -TIA- or -TIO-
412: * else 'th' before H
413: * else T
414: */
415: case 'T':
416: if (Next_Letter == 'I' &&
417: (After_Next_Letter == 'O' ||
418: After_Next_Letter == 'A')) {
419: Phonize(SH);
420: } else if (Next_Letter == 'H') {
421: Phonize(TH);
422: skip_letter++;
423: } else if (!(Next_Letter == 'C' && After_Next_Letter == 'H')) {
424: Phonize('T');
425: }
426: break;
427: /* F */
428: case 'V':
429: Phonize('F');
430: break;
431: /* W before a vowel, else dropped */
432: case 'W':
433: if (isvowel(Next_Letter))
434: Phonize('W');
435: break;
436: /* KS */
437: case 'X':
438: Phonize('K');
439: Phonize('S');
440: break;
441: /* Y if followed by a vowel */
442: case 'Y':
443: if (isvowel(Next_Letter))
444: Phonize('Y');
445: break;
446: /* S */
447: case 'Z':
448: Phonize('S');
449: break;
450: /* No transformation */
451: case 'F':
452: case 'J':
453: case 'L':
454: case 'M':
455: case 'N':
456: case 'R':
457: Phonize(Curr_Letter);
458: break;
459: default:
460: /* nothing */
461: break;
462: } /* END SWITCH */
463:
464: w_idx += skip_letter;
465: } /* END FOR */
466:
467: End_Phoned_Word;
468:
469: return 0;
470: } /* END metaphone */
471: /* }}} */
472:
/*
 * Local variables:
 * tab-width: 4
 * c-basic-offset: 4
 * End:
 * vim600: sw=4 ts=4 fdm=marker
 * vim<600: sw=4 ts=4
 */