1 | #include <stddef.h> |
---|
2 | #include <string.h> |
---|
3 | #include <stdio.h> |
---|
4 | |
---|
5 | #include "mad_mem.h" |
---|
6 | #include "mad_regex.h" |
---|
7 | |
---|
8 | // private types |
---|
9 | |
---|
10 | struct reg_token |
---|
11 | { |
---|
12 | int type; /* 0: empty, 1: char, 2: simple, 3: list, 4: dot */ |
---|
13 | int rep; /* 0 no, 1 yes (including 0 repetitions) */ |
---|
14 | int rep_cnt; /* no. of repitions at current match */ |
---|
15 | int rep_max; /* max. no. of repitions at current match */ |
---|
16 | int invert; /* 0 no, 1 yes (for type 3 only) */ |
---|
17 | int match_end; /* 0 no, 1 yes (only last token for '$' */ |
---|
18 | char c; /* for type = 1 */ |
---|
19 | struct r_char_array* simple; |
---|
20 | struct r_char_array* list; |
---|
21 | struct reg_token* next; |
---|
22 | struct reg_token* previous; |
---|
23 | }; |
---|
24 | |
---|
25 | // note: very similar to char_array from mad_array.h |
---|
26 | |
---|
27 | struct r_char_array /* dynamic array of char for regex */ |
---|
28 | { |
---|
29 | int max, /* max. array size */ |
---|
30 | curr; /* current occupation */ |
---|
31 | char* chars; |
---|
32 | }; |
---|
33 | |
---|
34 | // forward declarations |
---|
35 | |
---|
36 | // static struct reg_token* make_list(struct reg_token*, char*, int, int); |
---|
37 | |
---|
38 | // private funtions (utils) |
---|
39 | |
---|
40 | static int |
---|
41 | char_count(char c, char* s) |
---|
42 | { |
---|
43 | int n = 0, i = 0; |
---|
44 | while (*s != '\0') |
---|
45 | { |
---|
46 | if (*s++ == c) |
---|
47 | { |
---|
48 | i++; |
---|
49 | if (n < i) n = i; |
---|
50 | } |
---|
51 | else i = 0; |
---|
52 | } |
---|
53 | return n; |
---|
54 | } |
---|
55 | |
---|
56 | #if 0 // not used |
---|
57 | static void |
---|
58 | dump_tokens(struct reg_token* tk) |
---|
59 | { |
---|
60 | while (tk) |
---|
61 | { |
---|
62 | printf("type: %d rep = %d invert = %d max = %d dollar = %d\n", |
---|
63 | tk->type, tk->rep, tk->invert, tk->rep_max, tk->match_end); |
---|
64 | tk = tk->next; |
---|
65 | } |
---|
66 | } |
---|
67 | #endif |
---|
68 | |
---|
69 | static void |
---|
70 | grow_r_char_array(struct r_char_array* a) |
---|
71 | { |
---|
72 | char rout_name[] = "grow_r_char_array"; |
---|
73 | char* loc = a->chars; |
---|
74 | a->max *= 2; |
---|
75 | a->chars = (char*) mymalloc(rout_name, a->max); |
---|
76 | strncpy(a->chars, loc, a->curr); |
---|
77 | myfree(rout_name, loc); |
---|
78 | } |
---|
79 | |
---|
80 | static struct reg_token* |
---|
81 | make_dot(struct reg_token* rt) |
---|
82 | { |
---|
83 | char rout_name[] = "make_dot"; |
---|
84 | struct reg_token *rn = rt; |
---|
85 | if (rn->type != 0) rn = (struct reg_token*) mycalloc(rout_name, 1, sizeof(struct reg_token)); |
---|
86 | rn->type = 4; |
---|
87 | if (rn != rt) |
---|
88 | { |
---|
89 | rt->next = rn; rn->previous = rt; |
---|
90 | } |
---|
91 | return rn; |
---|
92 | } |
---|
93 | |
---|
94 | static void |
---|
95 | fill_list(char s, char e, struct r_char_array* a) |
---|
96 | { |
---|
97 | int i; |
---|
98 | while ((int)s <= (int)e) |
---|
99 | { |
---|
100 | if (a->curr == a->max) grow_r_char_array(a); |
---|
101 | a->chars[a->curr++] = s; |
---|
102 | i = (int)s + 1; s = (char) i; |
---|
103 | } |
---|
104 | } |
---|
105 | |
---|
106 | static struct reg_token* |
---|
107 | make_list(struct reg_token* rt, char* pattern, |
---|
108 | int is, int ie) |
---|
109 | { |
---|
110 | char rout_name[] = "make_list"; |
---|
111 | int i; |
---|
112 | struct reg_token *rn = rt; |
---|
113 | if (rn->type != 0) |
---|
114 | { |
---|
115 | rn = (struct reg_token*) mycalloc(rout_name, 1, sizeof(struct reg_token)); |
---|
116 | rt->next = rn; rn->previous = rt; |
---|
117 | } |
---|
118 | rn->type = 3; |
---|
119 | rn->list = (struct r_char_array*) mycalloc(rout_name, 1, sizeof(struct r_char_array)); |
---|
120 | rn->list->chars = (char*) mymalloc(rout_name, 100); |
---|
121 | rn->list->max = 100; |
---|
122 | if (pattern[is] == '^') /* invert list */ |
---|
123 | { |
---|
124 | rn->invert = 1; is++; |
---|
125 | } |
---|
126 | for (i = is; i <= ie; i++) |
---|
127 | { |
---|
128 | if (i < ie && pattern[i+1] == '-') |
---|
129 | { |
---|
130 | fill_list(pattern[i], pattern[i+2], rn->list); |
---|
131 | i += 2; |
---|
132 | } |
---|
133 | else fill_list(pattern[i], pattern[i], rn->list); |
---|
134 | } |
---|
135 | rn->list->chars[rn->list->curr] = '\0'; |
---|
136 | return rn; |
---|
137 | } |
---|
138 | |
---|
139 | static int |
---|
140 | list_count(struct r_char_array* list, int invert, char* s) |
---|
141 | { |
---|
142 | char* p; |
---|
143 | int n = 0, i = 0; |
---|
144 | while (*s != '\0') |
---|
145 | { |
---|
146 | p = strchr(list->chars, *s++); |
---|
147 | if ((p != NULL && invert == 0) || (p == NULL && invert != 0)) |
---|
148 | { |
---|
149 | i++; |
---|
150 | if (n < i) n = i; |
---|
151 | } |
---|
152 | else i = 0; |
---|
153 | } |
---|
154 | return n; |
---|
155 | } |
---|
156 | |
---|
157 | static int |
---|
158 | new_comb(struct reg_token* start) |
---|
159 | { |
---|
160 | struct reg_token* rt = start; |
---|
161 | while (rt) |
---|
162 | { |
---|
163 | if (rt->rep_cnt++ < rt->rep_max) return 1; |
---|
164 | rt->rep_cnt = 0; |
---|
165 | rt = rt->next; |
---|
166 | } |
---|
167 | return 0; |
---|
168 | } |
---|
169 | |
---|
170 | // private funtions (regex) |
---|
171 | |
---|
172 | static struct reg_token* |
---|
173 | flag(struct reg_token* rt, int* err) |
---|
174 | { |
---|
175 | char rout_name[] = "flag"; |
---|
176 | struct reg_token *rn = rt; |
---|
177 | if (rt == NULL || rt->type == 0) |
---|
178 | { |
---|
179 | *err = 3; return rt; /* illegal '*' */ |
---|
180 | } |
---|
181 | if (rt->rep != 0) |
---|
182 | { |
---|
183 | *err = 4; return rt; /* double '*' */ |
---|
184 | } |
---|
185 | *err = 0; |
---|
186 | if (rt->type == 2) |
---|
187 | { |
---|
188 | if (rt->simple->curr > 1) |
---|
189 | { |
---|
190 | rn = (struct reg_token*) mycalloc(rout_name, 1, sizeof(struct reg_token)); |
---|
191 | rn->c = rt->simple->chars[--rt->simple->curr]; |
---|
192 | rt->simple->chars[rt->simple->curr] = '\0'; |
---|
193 | rn->type = 1; |
---|
194 | rt->next = rn; |
---|
195 | } |
---|
196 | else |
---|
197 | { |
---|
198 | rt->type = 1; |
---|
199 | rt->c = rt->simple->chars[0]; |
---|
200 | myfree(rout_name, rt->simple->chars); |
---|
201 | myfree(rout_name, rt->simple); |
---|
202 | rt->simple = NULL; |
---|
203 | } |
---|
204 | } |
---|
205 | rn->rep = 1; |
---|
206 | return rn; |
---|
207 | } |
---|
208 | |
---|
209 | static struct reg_token* |
---|
210 | add_tok(char c, struct reg_token* rt) |
---|
211 | { |
---|
212 | char rout_name[] = "add_tok"; |
---|
213 | struct reg_token *rn = rt; |
---|
214 | if (rn->type == 1 || rn->type == 3 || rn->type == 4 || rn->rep == 1) |
---|
215 | { |
---|
216 | rn = (struct reg_token*) mycalloc(rout_name, 1, sizeof(struct reg_token)); |
---|
217 | rt->next = rn; rn->previous = rt; |
---|
218 | } |
---|
219 | if (rn->type == 0) |
---|
220 | { |
---|
221 | rn->type = 2; |
---|
222 | rn->simple = (struct r_char_array*) mycalloc(rout_name, 1, |
---|
223 | sizeof(struct r_char_array)); |
---|
224 | rn->simple->chars = (char*) mymalloc(rout_name, 100); |
---|
225 | rn->simple->max = 100; |
---|
226 | } |
---|
227 | if (rn->simple->curr == rn->simple->max) grow_r_char_array(rn->simple); |
---|
228 | rn->simple->chars[rn->simple->curr++] = c; |
---|
229 | rn->simple->chars[rn->simple->curr] = '\0'; |
---|
230 | return rn; |
---|
231 | } |
---|
232 | |
---|
233 | static struct reg_token* |
---|
234 | convert_pattern(char* pattern, int dollar, int* error) |
---|
235 | { |
---|
236 | char rout_name[] = "convert_pattern"; |
---|
237 | struct reg_token *rt, *first_token; |
---|
238 | int i, j, k, toggle = 0, last = strlen(pattern); |
---|
239 | *error = 0; |
---|
240 | first_token = rt = (struct reg_token*) mycalloc(rout_name, 1, sizeof(struct reg_token)); |
---|
241 | if (pattern[0] == '*') {*error = 1; return NULL;} |
---|
242 | for (i = 0; i < last; i++) |
---|
243 | { |
---|
244 | if (pattern[i] == '\\') toggle = 1; |
---|
245 | else if (toggle == 0 && pattern[i] == '[') |
---|
246 | { |
---|
247 | k = 0; |
---|
248 | for (j = i+2; j < last; j++) |
---|
249 | { |
---|
250 | if (pattern[j] == ']') |
---|
251 | { |
---|
252 | k = j; break; |
---|
253 | } |
---|
254 | } |
---|
255 | if (k == 0) {*error = 2; return NULL;} /* no closing right ']' */ |
---|
256 | rt = make_list(rt, pattern, i+1, j-1); |
---|
257 | i = j; |
---|
258 | } |
---|
259 | else if (toggle == 0 && pattern[i] == '*') |
---|
260 | { |
---|
261 | rt = flag(rt, error); if (*error != 0) return NULL; |
---|
262 | } |
---|
263 | else if (toggle == 0 && pattern[i] == '.') rt = make_dot(rt); |
---|
264 | else |
---|
265 | { |
---|
266 | rt = add_tok(pattern[i], rt); |
---|
267 | toggle = 0; |
---|
268 | } |
---|
269 | } |
---|
270 | rt->match_end = dollar; |
---|
271 | return first_token; |
---|
272 | } |
---|
273 | |
---|
274 | static void |
---|
275 | myregend(char* mypat, struct reg_token* start) |
---|
276 | { |
---|
277 | char rout_name[] = "myregend"; |
---|
278 | struct reg_token *rp, *aux; |
---|
279 | if (mypat != NULL) myfree(rout_name, mypat); mypat = NULL; |
---|
280 | rp = start; |
---|
281 | while (rp != NULL) |
---|
282 | { |
---|
283 | if (rp->simple != NULL) |
---|
284 | { |
---|
285 | if (rp->simple->chars != NULL) myfree(rout_name, rp->simple->chars); |
---|
286 | myfree(rout_name, rp->simple); |
---|
287 | } |
---|
288 | if (rp->list != NULL) |
---|
289 | { |
---|
290 | if (rp->list->chars != NULL) myfree(rout_name, rp->list->chars); |
---|
291 | myfree(rout_name, rp->list); |
---|
292 | } |
---|
293 | aux = rp; |
---|
294 | rp = rp->next; |
---|
295 | myfree(rout_name, aux); |
---|
296 | } |
---|
297 | } |
---|
298 | |
---|
299 | static void |
---|
300 | edit_tokens(struct reg_token* start, char* pattern, char* string, int dollar) |
---|
301 | { |
---|
302 | struct reg_token *tk, *tp; |
---|
303 | (void)pattern; |
---|
304 | tk = tp = start; |
---|
305 | while (tk) |
---|
306 | { |
---|
307 | if (tk->rep) |
---|
308 | { |
---|
309 | if (tk->type == 1) tk->rep_max = char_count(tk->c, string); |
---|
310 | else if (tk->type == 3) |
---|
311 | tk->rep_max = list_count(tk->list, tk->invert, string); |
---|
312 | else if (tk->type == 4) tk->rep_max = strlen(string); |
---|
313 | } |
---|
314 | tp = tk; |
---|
315 | tk = tk->next; |
---|
316 | } |
---|
317 | tp->match_end = dollar; |
---|
318 | } |
---|
319 | |
---|
320 | static char* |
---|
321 | match_token(struct reg_token* rt, char* p) |
---|
322 | { |
---|
323 | char *q; |
---|
324 | int j, n; |
---|
325 | if (*p == '\0') return NULL; |
---|
326 | switch (rt->type) |
---|
327 | { |
---|
328 | case 1: /* simple character match - exact repetition */ |
---|
329 | for (j = 0; j < rt->rep_cnt; j++) if (*p++ != rt->c) return NULL; |
---|
330 | return p; |
---|
331 | case 2: /* simple string */ |
---|
332 | if (strncmp(rt->simple->chars, p, rt->simple->curr) == 0) |
---|
333 | return &p[rt->simple->curr]; |
---|
334 | else return NULL; |
---|
335 | case 3: /* match character from list */ |
---|
336 | if (rt->rep == 0) n = 1; |
---|
337 | else n = rt->rep_cnt; |
---|
338 | for (j = 0; j < n; j++) |
---|
339 | { |
---|
340 | q = strchr(rt->list->chars, *p++); |
---|
341 | if ((q == NULL && rt->invert == 0) |
---|
342 | || (q != NULL && rt->invert != 0)) return NULL; |
---|
343 | } |
---|
344 | return p; |
---|
345 | case 4: /* dot */ |
---|
346 | if (rt->rep == 0) n = 1; |
---|
347 | else n = rt->rep_cnt; |
---|
348 | for (j = 0; j < n; j++) if (*p++ == '\0') return NULL; |
---|
349 | return p; |
---|
350 | default: |
---|
351 | return NULL; |
---|
352 | } |
---|
353 | } |
---|
354 | |
---|
355 | static int |
---|
356 | (match_all)(struct reg_token* start, char* string) |
---|
357 | { |
---|
358 | struct reg_token* rt; |
---|
359 | char *p; |
---|
360 | if (string[0] == '\0') return 0; |
---|
361 | restart: |
---|
362 | rt = start; p = string; |
---|
363 | while (rt) |
---|
364 | { |
---|
365 | if (((p = match_token(rt, p)) == NULL)) break; |
---|
366 | else if (rt->match_end && *p != '\0') |
---|
367 | { |
---|
368 | if (rt->rep == 0) break; |
---|
369 | else if (rt->rep_cnt != 0) break; |
---|
370 | else if (rt == start) break; |
---|
371 | } |
---|
372 | rt = rt->next; |
---|
373 | } |
---|
374 | if (rt == NULL) return 0; |
---|
375 | if (new_comb(start)) goto restart; |
---|
376 | return 1; |
---|
377 | } |
---|
378 | |
---|
379 | // public interface |
---|
380 | |
---|
381 | int |
---|
382 | myregex(char* pattern, char* string) |
---|
383 | { |
---|
384 | /* returns 0 if pattern = regex matches string, else 1 */ |
---|
385 | char rout_name[] = "myregex"; |
---|
386 | char *mypat; |
---|
387 | struct reg_token *first_token; |
---|
388 | int error, j, l, dollar = 0, res = 0; |
---|
389 | if (pattern == NULL || (l = strlen(pattern)) == 0) return 0; |
---|
390 | mypat = (char*) mymalloc(rout_name, strlen(pattern)+5); |
---|
391 | strcpy(mypat, pattern); |
---|
392 | /* $ at end ? */ |
---|
393 | if (mypat[l-1] == '$') |
---|
394 | { |
---|
395 | if (l == 1) return 0; |
---|
396 | else if (l > 2 && strncmp(&mypat[l-3], ".*", 2) == 0) |
---|
397 | { |
---|
398 | if (l == 3) return 0; |
---|
399 | if (mypat[l-4] != '\\') mypat[l-3] = '\0'; |
---|
400 | } |
---|
401 | else |
---|
402 | { |
---|
403 | mypat[l-1] = '\0'; |
---|
404 | dollar = 1; |
---|
405 | } |
---|
406 | } |
---|
407 | else if (l > 1 && strncmp(&mypat[l-2], ".*", 2) == 0) |
---|
408 | { |
---|
409 | if (l == 2) return 0; |
---|
410 | if (mypat[l-3] != '\\') mypat[l-2] = '\0'; |
---|
411 | } |
---|
412 | l = strlen(mypat); |
---|
413 | /* ^ at start ? */ |
---|
414 | if (mypat[0] == '^') |
---|
415 | { |
---|
416 | if (l == 0) return 0; |
---|
417 | else |
---|
418 | { |
---|
419 | for (j = 0; j < l; j++) mypat[j] = mypat[j+1]; |
---|
420 | } |
---|
421 | } |
---|
422 | else if (strncmp(mypat, ".*", 2) != 0) |
---|
423 | { |
---|
424 | for (j = l; j >= 0; j--) mypat[j+2] = mypat[j]; |
---|
425 | mypat[0] = '.'; mypat[1] = '*'; |
---|
426 | } |
---|
427 | l = strlen(mypat); |
---|
428 | /* printf("mypat: %s\n", mypat); */ |
---|
429 | first_token = convert_pattern(mypat, dollar, &error); |
---|
430 | if (error) |
---|
431 | { |
---|
432 | if (error == 1) puts("+++ illegal '*' in pattern"); |
---|
433 | if (error == 2) puts("+++ missing ']' in pattern"); |
---|
434 | if (error == 3) puts("+++ illegal '*' in pattern"); |
---|
435 | if (error == 4) puts("+++ double '*' in pattern"); |
---|
436 | myregend(mypat, NULL); |
---|
437 | return 1; |
---|
438 | } |
---|
439 | edit_tokens(first_token, mypat, string, dollar); |
---|
440 | /* dump_tokens(first_token); */ |
---|
441 | res = match_all(first_token, string); |
---|
442 | myregend(mypat, first_token); |
---|
443 | return res; |
---|
444 | } |
---|
445 | |
---|