1: /*
2: * This file defines the string_tokenize interface
3: * Time-stamp: "2010-07-17 10:40:26 bkorb"
4: *
5: * This file is part of AutoOpts, a companion to AutoGen.
6: * AutoOpts is free software.
7: * AutoOpts is Copyright (c) 1992-2011 by Bruce Korb - all rights reserved
8: *
9: * AutoOpts is available under any one of two licenses. The license
10: * in use must be one of these two and the choice is under the control
11: * of the user of the license.
12: *
13: * The GNU Lesser General Public License, version 3 or later
14: * See the files "COPYING.lgplv3" and "COPYING.gplv3"
15: *
16: * The Modified Berkeley Software Distribution License
17: * See the file "COPYING.mbsd"
18: *
19: * These files have the following md5sums:
20: *
21: * 43b91e8ca915626ed3818ffb1b71248b pkg/libopts/COPYING.gplv3
22: * 06a1a2e4760c90ea5e1dad8dfaac4d39 pkg/libopts/COPYING.lgplv3
23: * 66a5cedaf62c4b2637025f049f9b826f pkg/libopts/COPYING.mbsd
24: */
25:
26: #include <errno.h>
27: #include <stdlib.h>
28:
29: #define cc_t const unsigned char
30: #define ch_t unsigned char
31:
32: /* = = = START-STATIC-FORWARD = = = */
33: static void
34: copy_cooked(ch_t** ppDest, char const ** ppSrc);
35:
36: static void
37: copy_raw(ch_t** ppDest, char const ** ppSrc);
38:
39: static token_list_t *
40: alloc_token_list(char const * str);
41: /* = = = END-STATIC-FORWARD = = = */
42:
43: static void
44: copy_cooked(ch_t** ppDest, char const ** ppSrc)
45: {
46: ch_t* pDest = (ch_t*)*ppDest;
47: const ch_t* pSrc = (const ch_t*)(*ppSrc + 1);
48:
49: for (;;) {
50: ch_t ch = *(pSrc++);
51: switch (ch) {
52: case NUL: *ppSrc = NULL; return;
53: case '"': goto done;
54: case '\\':
55: pSrc += ao_string_cook_escape_char((char*)pSrc, (char*)&ch, 0x7F);
56: if (ch == 0x7F)
57: break;
58: /* FALLTHROUGH */
59:
60: default:
61: *(pDest++) = ch;
62: }
63: }
64:
65: done:
66: *ppDest = (ch_t*)pDest; /* next spot for storing character */
67: *ppSrc = (char const *)pSrc; /* char following closing quote */
68: }
69:
70:
71: static void
72: copy_raw(ch_t** ppDest, char const ** ppSrc)
73: {
74: ch_t* pDest = *ppDest;
75: cc_t* pSrc = (cc_t*) (*ppSrc + 1);
76:
77: for (;;) {
78: ch_t ch = *(pSrc++);
79: switch (ch) {
80: case NUL: *ppSrc = NULL; return;
81: case '\'': goto done;
82: case '\\':
83: /*
84: * *Four* escapes are handled: newline removal, escape char
85: * quoting and apostrophe quoting
86: */
87: switch (*pSrc) {
88: case NUL: *ppSrc = NULL; return;
89: case '\r':
90: if (*(++pSrc) == '\n')
91: ++pSrc;
92: continue;
93:
94: case '\n':
95: ++pSrc;
96: continue;
97:
98: case '\'':
99: ch = '\'';
100: /* FALLTHROUGH */
101:
102: case '\\':
103: ++pSrc;
104: break;
105: }
106: /* FALLTHROUGH */
107:
108: default:
109: *(pDest++) = ch;
110: }
111: }
112:
113: done:
114: *ppDest = pDest; /* next spot for storing character */
115: *ppSrc = (char const *) pSrc; /* char following closing quote */
116: }
117:
118: static token_list_t *
119: alloc_token_list(char const * str)
120: {
121: token_list_t * res;
122:
123: int max_token_ct = 2; /* allow for trailing NULL pointer & NUL on string */
124:
125: if (str == NULL) goto enoent_res;
126:
127: /*
128: * Trim leading white space. Use "ENOENT" and a NULL return to indicate
129: * an empty string was passed.
130: */
131: while (IS_WHITESPACE_CHAR(*str)) str++;
132: if (*str == NUL) goto enoent_res;
133:
134: /*
135: * Take an approximate count of tokens. If no quoted strings are used,
136: * it will be accurate. If quoted strings are used, it will be a little
137: * high and we'll squander the space for a few extra pointers.
138: */
139: {
140: cc_t* pz = (cc_t*)str;
141:
142: do {
143: max_token_ct++;
144: while (! IS_WHITESPACE_CHAR(*++pz))
145: if (*pz == NUL) goto found_nul;
146: while (IS_WHITESPACE_CHAR(*pz)) pz++;
147: } while (*pz != NUL);
148:
149: found_nul:
150: res = malloc(sizeof(*res) + (pz - (cc_t*)str)
151: + (max_token_ct * sizeof(ch_t*)));
152: }
153:
154: if (res == NULL)
155: errno = ENOMEM;
156: else res->tkn_list[0] = (ch_t*)(res->tkn_list + (max_token_ct - 1));
157:
158: return res;
159:
160: enoent_res:
161:
162: errno = ENOENT;
163: return NULL;
164: }
165:
166: /*=export_func ao_string_tokenize
167: *
168: * what: tokenize an input string
169: *
170: * arg: + char const* + string + string to be tokenized +
171: *
172: * ret_type: token_list_t*
173: * ret_desc: pointer to a structure that lists each token
174: *
175: * doc:
176: *
177: * This function will convert one input string into a list of strings.
178: * The list of strings is derived by separating the input based on
179: * white space separation. However, if the input contains either single
180: * or double quote characters, then the text after that character up to
181: * a matching quote will become the string in the list.
182: *
183: * The returned pointer should be deallocated with @code{free(3C)} when
184: * you are done using the data. The data are placed in a single block of
185: * allocated memory. Do not deallocate individual token/strings.
186: *
187: * The structure pointed to will contain at least these two fields:
188: * @table @samp
189: * @item tkn_ct
190: * The number of tokens found in the input string.
191: * @item tkn_list
192: * An array of @code{tkn_ct + 1} pointers to substring tokens, with
193: * the last pointer set to NULL.
194: * @end table
195: *
196: * There are two types of quoted strings: single quoted (@code{'}) and
197: * double quoted (@code{"}). Singly quoted strings are fairly raw in that
198: * escape characters (@code{\\}) are simply another character, except when
199: * preceding the following characters:
200: * @example
201: * @code{\\} double backslashes reduce to one
202: * @code{'} incorporates the single quote into the string
203: * @code{\n} suppresses both the backslash and newline character
204: * @end example
205: *
206: * Double quote strings are formed according to the rules of string
207: * constants in ANSI-C programs.
208: *
209: * example:
210: * @example
211: * #include <stdlib.h>
212: * int ix;
213: * token_list_t* ptl = ao_string_tokenize(some_string);
214: * for (ix = 0; ix < ptl->tkn_ct; ix++)
215: * do_something_with_tkn(ptl->tkn_list[ix]);
216: * free(ptl);
217: * @end example
218: * Note that everything is freed with the one call to @code{free(3C)}.
219: *
220: * err:
221: * NULL is returned and @code{errno} will be set to indicate the problem:
222: * @itemize @bullet
223: * @item
224: * @code{EINVAL} - There was an unterminated quoted string.
225: * @item
226: * @code{ENOENT} - The input string was empty.
227: * @item
228: * @code{ENOMEM} - There is not enough memory.
229: * @end itemize
230: =*/
231: token_list_t*
232: ao_string_tokenize(char const* str)
233: {
234: token_list_t* res = alloc_token_list(str);
235: ch_t* pzDest;
236:
237: /*
238: * Now copy each token into the output buffer.
239: */
240: if (res == NULL)
241: return res;
242:
243: pzDest = (ch_t*)(res->tkn_list[0]);
244: res->tkn_ct = 0;
245:
246: do {
247: res->tkn_list[ res->tkn_ct++ ] = pzDest;
248: for (;;) {
249: int ch = (ch_t)*str;
250: if (IS_WHITESPACE_CHAR(ch)) {
251: found_white_space:
252: while (IS_WHITESPACE_CHAR(*++str)) ;
253: break;
254: }
255:
256: switch (ch) {
257: case '"':
258: copy_cooked(&pzDest, &str);
259: if (str == NULL) {
260: free(res);
261: errno = EINVAL;
262: return NULL;
263: }
264: if (IS_WHITESPACE_CHAR(*str))
265: goto found_white_space;
266: break;
267:
268: case '\'':
269: copy_raw(&pzDest, &str);
270: if (str == NULL) {
271: free(res);
272: errno = EINVAL;
273: return NULL;
274: }
275: if (IS_WHITESPACE_CHAR(*str))
276: goto found_white_space;
277: break;
278:
279: case NUL:
280: goto copy_done;
281:
282: default:
283: str++;
284: *(pzDest++) = ch;
285: }
286: } copy_done:;
287:
288: /*
289: * NUL terminate the last token and see if we have any more tokens.
290: */
291: *(pzDest++) = NUL;
292: } while (*str != NUL);
293:
294: res->tkn_list[ res->tkn_ct ] = NULL;
295:
296: return res;
297: }
298:
299: #ifdef TEST
300: #include <stdio.h>
301: #include <string.h>
302:
303: int
304: main(int argc, char** argv)
305: {
306: if (argc == 1) {
307: printf("USAGE: %s arg [ ... ]\n", *argv);
308: return 1;
309: }
310: while (--argc > 0) {
311: char* arg = *(++argv);
312: token_list_t* p = ao_string_tokenize(arg);
313: if (p == NULL) {
314: printf("Parsing string ``%s'' failed:\n\terrno %d (%s)\n",
315: arg, errno, strerror(errno));
316: } else {
317: int ix = 0;
318: printf("Parsed string ``%s''\ninto %d tokens:\n", arg, p->tkn_ct);
319: do {
320: printf(" %3d: ``%s''\n", ix+1, p->tkn_list[ix]);
321: } while (++ix < p->tkn_ct);
322: free(p);
323: }
324: }
325: return 0;
326: }
327: #endif
328:
329: /*
330: * Local Variables:
331: * mode: C
332: * c-file-style: "stroustrup"
333: * indent-tabs-mode: nil
334: * End:
335: * end of autoopts/tokenize.c */
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>