source: CMT/HEAD/source/cmt_regexp.cxx@ 581

Last change on this file since 581 was 581, checked in by rybkin, 14 years ago

See C.L. 459

  • Property svn:eol-style set to native
File size: 36.4 KB
Line 
1//-----------------------------------------------------------
2// Copyright Christian Arnault LAL-Orsay CNRS
3// arnault@lal.in2p3.fr
4// See the complete license in cmt_license.txt "http://www.cecill.info".
5//-----------------------------------------------------------
6
7#include "cmt_std.h"
8#include "cmt_regexp.h"
9#include "cmt_vector.h"
10#include "cmt_system.h"
11
12//----------------------------------------------------------
13//
14// Declarations
15//
16//----------------------------------------------------------
17
18static int tab_level = 0;
19
20static void tab ()
21{
22 for (int i = 0; i < tab_level; i++)
23 {
24 cerr << " ";
25 }
26}
27
28//----------------------------------------------------------
29class cmt_regexp_node
30{
31public:
32 static cmt_regexp_node& null ();
33 static int node_count ();
34
35 cmt_regexp_node ();
36 virtual ~cmt_regexp_node ();
37
38 virtual const cmt_regexp::iterator match (const cmt_string& text,
39 int pos) const;
40 virtual bool is_char () const;
41 virtual bool is_many_node () const;
42
43 virtual void dump () const;
44
45private:
46 static int _node_count;
47};
48//----------------------------------------------------------
49
50//----------------------------------------------------------
51class cmt_char_node : public cmt_regexp_node
52{
53public:
54 cmt_char_node (char c);
55
56 const cmt_regexp::iterator match (const cmt_string& text, int pos) const;
57
58 bool is_char () const;
59 operator char ();
60
61 void dump () const;
62
63private:
64 char _c;
65};
66//----------------------------------------------------------
67
68//----------------------------------------------------------
69class cmt_string_node : public cmt_regexp_node
70{
71public:
72 cmt_string_node (const cmt_string& s);
73
74 const cmt_regexp::iterator match (const cmt_string& text, int pos) const;
75
76 void dump () const;
77
78private:
79 cmt_string _s;
80};
81//----------------------------------------------------------
82
83//----------------------------------------------------------
84class cmt_char_list_node : public cmt_regexp_node
85{
86public:
87 cmt_char_list_node (cmt_string list);
88
89 const cmt_regexp::iterator match (const cmt_string& text, int pos) const;
90
91 void dump () const;
92
93protected:
94 cmt_string _list;
95 cmt_string _choices;
96};
97//----------------------------------------------------------
98
99//----------------------------------------------------------
100class cmt_not_char_list_node : public cmt_char_list_node
101{
102public:
103 cmt_not_char_list_node (cmt_string list);
104
105 const cmt_regexp::iterator match (const cmt_string& text, int pos) const;
106
107 void dump () const;
108};
109//----------------------------------------------------------
110
111//----------------------------------------------------------
112class cmt_any_node : public cmt_regexp_node
113{
114public:
115 const cmt_regexp::iterator match (const cmt_string& text, int pos) const;
116
117 void dump () const;
118};
119//----------------------------------------------------------
120
121//----------------------------------------------------------
122class cmt_zero_one : public cmt_regexp_node
123{
124public:
125 cmt_zero_one (cmt_regexp_node* n);
126 ~cmt_zero_one ();
127
128 const cmt_regexp::iterator match (const cmt_string& text, int pos) const;
129
130 void dump () const;
131
132protected:
133 cmt_regexp_node* _node;
134};
135//----------------------------------------------------------
136
137
138
139//----------------------------------------------------------
140class cmt_begin_node : public cmt_regexp_node
141{
142public:
143 const cmt_regexp::iterator match (const cmt_string& text, int pos) const;
144
145 void dump () const;
146};
147//----------------------------------------------------------
148
149//----------------------------------------------------------
150class cmt_end_node : public cmt_regexp_node
151{
152public:
153 const cmt_regexp::iterator match (const cmt_string& text, int pos) const;
154
155 void dump () const;
156};
157//----------------------------------------------------------
158
159//----------------------------------------------------------
160class cmt_regexp_node_set : public cmt_regexp_node
161{
162public:
163 cmt_regexp_node_set ();
164 cmt_regexp_node_set (cmt_regexp_node_set* father);
165 ~cmt_regexp_node_set ();
166
167 cmt_regexp_node_set* father ();
168 void clear ();
169 void push (cmt_regexp_node* n);
170 cmt_regexp_node* pop ();
171 cmt_regexp_node* top () const;
172 int nodes () const;
173 const cmt_regexp_node* nodeAt (int index) const;
174 bool parentheses () const;
175 void set_parentheses (bool value);
176 virtual void reduce ();
177
178 virtual void dump () const;
179 void dump (const cmt_string& title) const;
180
181protected:
182 cmt_regexp_node_set* _father;
183 cmt_vector<cmt_regexp_node*> _nodes;
184 bool _parentheses;
185};
186//----------------------------------------------------------
187
188//----------------------------------------------------------
189class cmt_and_node : public cmt_regexp_node_set
190{
191public:
192 cmt_and_node ();
193 cmt_and_node (cmt_regexp_node_set* father);
194
195 const cmt_regexp::iterator match (const cmt_string& text, int pos) const;
196 void reduce ();
197 void fill (cmt_and_node& other, int start_index);
198
199 void dump () const;
200};
201//----------------------------------------------------------
202
203//----------------------------------------------------------
204class cmt_or_node : public cmt_regexp_node_set
205{
206public:
207 cmt_or_node (cmt_regexp_node_set* father);
208
209 const cmt_regexp::iterator match (const cmt_string& text, int pos) const;
210
211 void dump () const;
212};
213//----------------------------------------------------------
214
215//----------------------------------------------------------
216cmt_regexp_node& cmt_regexp_node::null ()
217{
218 static cmt_regexp_node null_instance;
219
220 return (null_instance);
221}
222//----------------------------------------------------------
223
224//----------------------------------------------------------
225class cmt_many_node : public cmt_regexp_node
226{
227public:
228 bool is_many_node () const;
229 void install (cmt_and_node& other, int index);
230 void reduce ();
231
232 void dump () const;
233
234protected:
235 cmt_many_node (cmt_regexp_node* n);
236 virtual ~cmt_many_node ();
237 cmt_regexp_node* _node;
238 cmt_and_node _follower;
239};
240//----------------------------------------------------------
241
242//----------------------------------------------------------
243class cmt_zero_more : public cmt_many_node
244{
245public:
246 cmt_zero_more (cmt_regexp_node* n);
247
248 const cmt_regexp::iterator match (const cmt_string& text, int pos) const;
249
250 void dump () const;
251};
252//----------------------------------------------------------
253
254//----------------------------------------------------------
255class cmt_one_more : public cmt_many_node
256{
257public:
258 cmt_one_more (cmt_regexp_node* n);
259
260 const cmt_regexp::iterator match (const cmt_string& text, int pos) const;
261
262 void dump () const;
263};
264//----------------------------------------------------------
265
266
267//----------------------------------------------------------
268//
269// Implementation
270//
271//----------------------------------------------------------
272
273//----------------------------------------------------------
274cmt_regexp_node::cmt_regexp_node ()
275{
276 _node_count++;
277}
278
279cmt_regexp_node::~cmt_regexp_node ()
280{
281 _node_count--;
282}
283
284int cmt_regexp_node::node_count ()
285{
286 return (_node_count);
287}
288
289const cmt_regexp::iterator cmt_regexp_node::match (const cmt_string& /*text*/,
290 int /*pos*/) const
291{
292 return (cmt_regexp::iterator::null());
293}
294
295bool cmt_regexp_node::is_char () const
296{
297 return (false);
298}
299
300bool cmt_regexp_node::is_many_node () const
301{
302 return (false);
303}
304
305void cmt_regexp_node::dump () const
306{
307}
308
309int cmt_regexp_node::_node_count = 0;
310//----------------------------------------------------------
311
312//----------------------------------------------------------
313cmt_char_node::cmt_char_node (char c)
314{
315 _c = c;
316}
317
318const cmt_regexp::iterator cmt_char_node::match (const cmt_string& text,
319 int pos) const
320{
321 if ((pos < 0) || (pos > text.size ()))
322 {
323 return (cmt_regexp::iterator::null ());
324 }
325
326 char c = text[pos];
327
328 if (c == _c)
329 {
330 return (cmt_regexp::iterator (pos, 1));
331 }
332
333 return (cmt_regexp::iterator::null ());
334}
335
336bool cmt_char_node::is_char () const
337{
338 return (true);
339}
340
341cmt_char_node::operator char ()
342{
343 return (_c);
344}
345
346void cmt_char_node::dump () const
347{
348 tab (); cerr << "char>(" << this << ") c=" << _c << endl;
349}
350
351//----------------------------------------------------------
352
353//----------------------------------------------------------
354cmt_string_node::cmt_string_node (const cmt_string& s)
355{
356 _s = s;
357}
358
359const cmt_regexp::iterator cmt_string_node::match (const cmt_string& text,
360 int pos) const
361{
362 if ((pos < 0) || (pos > text.size ()))
363 {
364 return (cmt_regexp::iterator::null ());
365 }
366
367 int length = _s.size ();
368
369 cmt_string s = text.substr (pos, length);
370
371 if ((length == 0) || (s == _s))
372 {
373 return (cmt_regexp::iterator (pos, length));
374 }
375
376 return (cmt_regexp::iterator::null ());
377}
378
379void cmt_string_node::dump () const
380{
381 tab (); cerr << "string (" << this << ") s=[" << _s << "]" << endl;
382}
383
384//----------------------------------------------------------
385
386//----------------------------------------------------------
387cmt_char_list_node::cmt_char_list_node (cmt_string list)
388{
389 _list = list;
390
391 _choices = "";
392
393 char c;
394 int i;
395
396 for (i = 0; i < list.size (); i++)
397 {
398 c = list[i];
399
400 switch (c)
401 {
402 case '-':
403 i++;
404 {
405 char c1 = _choices[_choices.size () - 1];
406 char c2 = list[i];
407 int j;
408 int j0 = (c1 < c2) ? c1 : c2;
409 int j1 = (c1 > c2) ? c1 : c2;
410 for (j = j0; j <= j1; j++)
411 {
412 _choices += j;
413 }
414 }
415 break;
416 case '\\':
417 i++;
418 c = list[i];
419 switch (c)
420 {
421 case '[':
422 case ']':
423 case '(':
424 case ')':
425 case '.':
426 case '*':
427 case '?':
428 case '^':
429 case '$':
430 case '\\':
431 c = '\\';
432 break;
433 case 'r':
434 c = '\r';
435 break;
436 case 't':
437 c = '\t';
438 break;
439 case 'n':
440 c = '\n';
441 break;
442 default:
443 break;
444 }
445 default:
446 _choices += c;
447 break;
448 }
449 }
450}
451
452const cmt_regexp::iterator cmt_char_list_node::match (const cmt_string& text,
453 int pos) const
454{
455 if ((pos < 0) || (pos > text.size ()))
456 {
457 return (cmt_regexp::iterator::null ());
458 }
459
460 char c = text[pos];
461
462 int i;
463
464 for (i = 0; i < _choices.size (); i++)
465 {
466 if (c == _choices[i]) return (cmt_regexp::iterator (pos, 1));
467 }
468
469 return (cmt_regexp::iterator::null ());
470}
471
472void cmt_char_list_node::dump () const
473{
474 tab (); cerr << "char_list (" << this << ") list=[" << _list << "] choices=[" << _choices << "]" << endl;
475}
476
477//----------------------------------------------------------
478
479//----------------------------------------------------------
480cmt_not_char_list_node::cmt_not_char_list_node (cmt_string list) :
481 cmt_char_list_node (list)
482{
483}
484
485const cmt_regexp::iterator cmt_not_char_list_node::match (const cmt_string& text,
486 int pos) const
487{
488 if ((pos < 0) || (pos > text.size ()))
489 {
490 return (cmt_regexp::iterator::null ());
491 }
492
493 char c = text[pos];
494
495 int i;
496
497 for (i = 0; i < _choices.size (); i++)
498 {
499 if (c == _choices[i]) return (cmt_regexp::iterator::null ());
500 }
501
502 return (cmt_regexp::iterator (pos, 1));
503}
504
505void cmt_not_char_list_node::dump () const
506{
507 tab (); cerr << "not_char_list (" << this << ") list=[" << _list << "] choices=[" << _choices << "]" << endl;
508}
509
510
511//----------------------------------------------------------
512
513//----------------------------------------------------------
514const cmt_regexp::iterator cmt_any_node::match (const cmt_string& text,
515 int pos) const
516{
517 if ((pos < 0) | (pos >= text.size ()))
518 {
519 return (cmt_regexp::iterator::null ());
520 }
521
522 return (cmt_regexp::iterator (pos, 1));
523}
524
525void cmt_any_node::dump () const
526{
527 tab (); cerr << "any (" << this << ") " << endl;
528}
529
530//----------------------------------------------------------
531
532//----------------------------------------------------------
533cmt_zero_one::cmt_zero_one (cmt_regexp_node* n) : _node (n)
534{
535}
536
537cmt_zero_one::~cmt_zero_one ()
538{
539 delete _node;
540}
541
542const cmt_regexp::iterator cmt_zero_one::match (const cmt_string& text,
543 int pos) const
544{
545 if ((pos < 0) || (pos > text.size ()))
546 {
547 return (cmt_regexp::iterator::null ());
548 }
549
550 int total = 0;
551
552 if (pos < text.size ())
553 {
554 const cmt_regexp::iterator it = _node->match (text, pos);
555 if (it != cmt_regexp::iterator::null ())
556 {
557 total += it._length;
558 pos += it._length;
559 }
560 }
561
562 return (cmt_regexp::iterator (pos, total));
563}
564
565void cmt_zero_one::dump () const
566{
567 tab (); cerr << "zero_one (" << this << ") " << endl;
568
569 if (_node != 0)
570 {
571 tab_level++;
572 _node->dump ();
573 tab_level--;
574 }
575}
576
577//----------------------------------------------------------
578
579//----------------------------------------------------------
580cmt_many_node::cmt_many_node (cmt_regexp_node* n) : _node (n)
581{
582}
583
584bool cmt_many_node::is_many_node () const
585{
586 return (true);
587}
588
589cmt_many_node::~cmt_many_node ()
590{
591 delete _node;
592}
593
594void cmt_many_node::install (cmt_and_node& other, int start_index)
595{
596 _follower.fill (other, start_index);
597}
598
599void cmt_many_node::reduce ()
600{
601 _follower.reduce ();
602}
603
604void cmt_many_node::dump () const
605{
606 tab (); cerr << "many (" << this << ") " << endl;
607
608 if (_node != 0)
609 {
610 tab_level++;
611 _node->dump ();
612 tab_level--;
613 }
614
615 tab_level++;
616 _follower.dump ();
617 tab_level--;
618}
619
620//----------------------------------------------------------
621
622
623
624//----------------------------------------------------------
625cmt_zero_more::cmt_zero_more (cmt_regexp_node* n) : cmt_many_node (n)
626{
627}
628
629const cmt_regexp::iterator cmt_zero_more::match (const cmt_string& text,
630 int pos) const
631{
632 if ((pos < 0) || (pos > text.size ()))
633 {
634 return (cmt_regexp::iterator::null ());
635 }
636
637 int total = 0;
638
639 //
640 // we are at : x*y
641 //
642
643 int saved_pos = -1;
644 int saved_total = -1;
645
646 do
647 {
648 const cmt_regexp::iterator itx = _node->match (text, pos);
649 const cmt_regexp::iterator ity = _follower.match (text, pos);
650
651 if ((itx == cmt_regexp::iterator::null ()) &&
652 (ity == cmt_regexp::iterator::null ()))
653 {
654 //
655 // There is neither x nor y. We move back to the last
656 // succesful match for y.
657 //
658 if (saved_pos >= 0)
659 {
660 //
661 // We had once a y.
662 //
663 pos = saved_pos;
664 total = saved_total;
665 }
666 else
667 {
668 //
669 // We never had any y !
670 //
671 return (cmt_regexp::iterator::null ());
672 }
673
674 break;
675 }
676
677 if (itx == cmt_regexp::iterator::null ())
678 {
679 //
680 // There is a y but no x anymore, fine, we can quit.
681 //
682 total += ity._length;
683 pos += ity._length;
684 break;
685 }
686
687 if (ity != cmt_regexp::iterator::null ())
688 {
689 //
690 // We have both x and y. We save the current pos and total,
691 // and then skip this x.
692 //
693 saved_total = total + ity._length;
694 saved_pos = pos + ity._length;
695 }
696 total += itx._length;
697 pos += itx._length;
698 } while (true);
699
700 return (cmt_regexp::iterator (pos, total));
701}
702
703void cmt_zero_more::dump () const
704{
705 tab (); cerr << "zero_more (" << this << ") " << endl;
706
707 if (_node != 0)
708 {
709 tab_level++;
710 _node->dump ();
711 tab_level--;
712 }
713
714 _follower.dump ();
715}
716
717//----------------------------------------------------------
718
719//----------------------------------------------------------
720cmt_one_more::cmt_one_more (cmt_regexp_node* n) : cmt_many_node (n)
721{
722}
723
724const cmt_regexp::iterator cmt_one_more::match (const cmt_string& text,
725 int pos) const
726{
727 if ((pos < 0) || (pos > text.size ()))
728 {
729 return (cmt_regexp::iterator::null ());
730 }
731
732 int total = 0;
733
734 //
735 // we are at : x+y
736 //
737
738 int saved_pos = -1;
739 int saved_total = -1;
740 bool at_least_one = false;
741
742 do
743 {
744 const cmt_regexp::iterator itx = _node->match (text, pos);
745 const cmt_regexp::iterator ity = _follower.match (text, pos);
746
747 if ((itx == cmt_regexp::iterator::null ()) &&
748 (ity == cmt_regexp::iterator::null ()))
749 {
750 //
751 // There is neither x nor y. We move back to the last
752 // succesful match for y.
753 //
754 if (saved_pos >= 0)
755 {
756 //
757 // We had once a y.
758 //
759 pos = saved_pos;
760 total = saved_total;
761 }
762 else
763 {
764 //
765 // We never had any y !
766 //
767 return (cmt_regexp::iterator::null ());
768 }
769
770 break;
771 }
772
773 if (itx == cmt_regexp::iterator::null ())
774 {
775 //
776 // There is a y but no x anymore, fine, we can quit.
777 //
778 total += ity._length;
779 pos += ity._length;
780 break;
781 }
782
783 if (ity != cmt_regexp::iterator::null ())
784 {
785 //
786 // We have both x and y. We save the current pos and total,
787 // and then skip this x.
788 //
789 saved_total = total + ity._length;
790 saved_pos = pos + ity._length;
791 }
792
793 total += itx._length;
794 pos += itx._length;
795
796 at_least_one = true;
797 } while (true);
798
799 if (!at_least_one) return (cmt_regexp::iterator::null ());
800
801 return (cmt_regexp::iterator (pos, total));
802}
803
804void cmt_one_more::dump () const
805{
806 tab (); cerr << "one_more (" << this << ") " << endl;
807
808 if (_node != 0)
809 {
810 tab_level++;
811 _node->dump ();
812 tab_level--;
813 }
814
815 tab_level++;
816 _follower.dump ();
817 tab_level--;
818}
819
820//----------------------------------------------------------
821
822
823
824//----------------------------------------------------------
825const cmt_regexp::iterator cmt_begin_node::match (const cmt_string& /*text*/,
826 int pos) const
827{
828 if (pos == 0) return (cmt_regexp::iterator (pos, 0));
829 return (cmt_regexp::iterator::null ());
830}
831
832void cmt_begin_node::dump () const
833{
834 tab (); cerr << "begin (" << this << ") " << endl;
835}
836//----------------------------------------------------------
837
838//----------------------------------------------------------
839const cmt_regexp::iterator cmt_end_node::match (const cmt_string& text,
840 int pos) const
841{
842 if (pos == text.size ()) return (cmt_regexp::iterator (pos, 0));
843 return (cmt_regexp::iterator::null ());
844}
845
846void cmt_end_node::dump () const
847{
848 tab (); cerr << "end (" << this << ") " << endl;
849}
850//----------------------------------------------------------
851
852//----------------------------------------------------------
853cmt_regexp_node_set::cmt_regexp_node_set () : _father (0)
854{
855 _parentheses = false;
856}
857
858cmt_regexp_node_set::cmt_regexp_node_set (cmt_regexp_node_set* father) : _father (father)
859{
860 if (father != 0) father->push (this);
861 _parentheses = false;
862}
863
864cmt_regexp_node_set::~cmt_regexp_node_set ()
865{
866 clear ();
867}
868
869cmt_regexp_node_set* cmt_regexp_node_set::father ()
870{
871 return (_father);
872}
873
874void cmt_regexp_node_set::clear ()
875{
876 int i;
877
878 for (i = 0; i < _nodes.size (); i++)
879 {
880 cmt_regexp_node* n = _nodes[i];
881 delete n;
882 }
883 _nodes.clear ();
884}
885
886void cmt_regexp_node_set::push (cmt_regexp_node* n)
887{
888 _nodes.push_back (n);
889}
890
891cmt_regexp_node* cmt_regexp_node_set::pop ()
892{
893 if (_nodes.size () == 0) return (&cmt_regexp_node::null ());
894
895 int index = _nodes.size () - 1;
896
897 cmt_regexp_node* n = _nodes[index];
898 _nodes.erase (index);
899
900 return (n);
901}
902
903cmt_regexp_node* cmt_regexp_node_set::top () const
904{
905 if (_nodes.size () == 0) return (&cmt_regexp_node::null ());
906
907 int index = _nodes.size () - 1;
908
909 cmt_regexp_node* n = _nodes[index];
910
911 return (n);
912}
913
914int cmt_regexp_node_set::nodes () const
915{
916 return (_nodes.size ());
917}
918
919const cmt_regexp_node* cmt_regexp_node_set::nodeAt (int index) const
920{
921 return (_nodes[index]);
922}
923
924bool cmt_regexp_node_set::parentheses () const
925{
926 return (_parentheses);
927}
928
929void cmt_regexp_node_set::set_parentheses (bool value)
930{
931 _parentheses = value;
932}
933
934void cmt_regexp_node_set::reduce ()
935{
936}
937
938void cmt_regexp_node_set::dump () const
939{
940 tab (); cerr << "regexp_node_set (" << this << ") " << endl;
941}
942
943void cmt_regexp_node_set::dump (const cmt_string& title) const
944{
945 tab (); cerr << "Set (" << this << ") father=" << _father << " pars=" << _parentheses << endl;
946
947 for (int i = 0; i < _nodes.size (); i++)
948 {
949 cmt_regexp_node* n = _nodes[i];
950 if (n != 0)
951 {
952 if (i > 0)
953 {
954 tab (); cerr << title << endl;
955 }
956 tab_level++;
957 n->dump ();
958 tab_level--;
959 }
960 }
961
962 tab (); cerr << "EndSet (" << this << ")" << endl;
963}
964//----------------------------------------------------------
965
966//----------------------------------------------------------
967cmt_and_node::cmt_and_node () : cmt_regexp_node_set ()
968{
969}
970
971cmt_and_node::cmt_and_node (cmt_regexp_node_set* father) : cmt_regexp_node_set (father)
972{
973}
974
975const cmt_regexp::iterator cmt_and_node::match (const cmt_string& text,
976 int pos) const
977{
978 if ((pos < 0) || (pos > text.size ()))
979 {
980 return (cmt_regexp::iterator::null ());
981 }
982
983 if (_nodes.size () == 0) return (cmt_regexp::iterator (pos, 0));
984
985 int i;
986 int total = 0;
987 int p = pos;
988
989 bool dbg = CmtSystem::testenv ("CMTTESTREGEXP");
990
991 if (dbg) {tab (); cerr << "match and (" << this << ") pos=" << pos << endl;}
992
993 for (i = 0; i < _nodes.size (); i++)
994 {
995 cmt_regexp_node* n = _nodes[i];
996
997 if (dbg) tab_level++;
998 const cmt_regexp::iterator it = n->match (text, p);
999 if (dbg) tab_level--;
1000
1001 if (dbg) {tab (); cerr << " -> it(" << n << ") p=" << it._pos << " l=" << it._length << endl;}
1002
1003 if (it == cmt_regexp::iterator::null ()) return (it);
1004
1005 total += it._length;
1006 p += it._length;
1007 }
1008
1009 // All nodes match
1010
1011 return (cmt_regexp::iterator (pos, total));
1012}
1013
1014void cmt_and_node::reduce ()
1015{
1016 if (_nodes.size () < 2) return;
1017
1018 char c = ' ';
1019 cmt_string s = "";
1020 cmt_vector<cmt_regexp_node*> new_nodes;
1021
1022 //
1023 // We loop once too much in order to finish the possibly accumulated
1024 // string at the end.
1025 //
1026 for (int i = 0; i <= _nodes.size (); i++)
1027 {
1028 cmt_regexp_node* n = 0;
1029
1030 if (i < _nodes.size ()) n = _nodes[i];
1031
1032 if ((i >= _nodes.size ()) || (!n->is_char ()))
1033 {
1034 if (s.size () == 1)
1035 {
1036 //
1037 // Too bad there was only one char node to consider
1038 // let's put it back as a char node !
1039 //
1040 new_nodes.push_back (new cmt_char_node (c));
1041 s = "";
1042 }
1043 else if (s.size () > 1)
1044 {
1045 //
1046 // We have real reduction here sonce there was several
1047 // consecutive char nodes.
1048 //
1049 new_nodes.push_back (new cmt_string_node (s));
1050 s = "";
1051 }
1052
1053 if (i >= _nodes.size ()) break;
1054 }
1055
1056 if (n->is_char ())
1057 {
1058 //
1059 // We are now trying to compact those char nodes.
1060 //
1061 cmt_char_node& cn = *((cmt_char_node*) n);
1062 c = (char) cn;
1063 s += c;
1064 delete n;
1065 _nodes[i] = 0;
1066 }
1067 else if (n->is_many_node ())
1068 {
1069 cmt_many_node& mn = *((cmt_many_node*) n);
1070 mn.install (*this, i + 1);
1071 mn.reduce ();
1072 new_nodes.push_back (n);
1073 break;
1074 }
1075 else
1076 {
1077 new_nodes.push_back (n);
1078 }
1079 }
1080
1081 _nodes = new_nodes;
1082}
1083
1084void cmt_and_node::fill (cmt_and_node& other, int start_index)
1085{
1086 if ((start_index < 0) || (start_index > other.nodes ())) return;
1087
1088 for (int i = start_index; i < other.nodes (); i++)
1089 {
1090 cmt_regexp_node* n = other._nodes[i];
1091 push (n);
1092 }
1093}
1094
1095void cmt_and_node::dump () const
1096{
1097 cmt_regexp_node_set::dump ("and");
1098}
1099
1100//----------------------------------------------------------
1101
1102//----------------------------------------------------------
1103cmt_or_node::cmt_or_node (cmt_regexp_node_set* father) : cmt_regexp_node_set (father)
1104{
1105}
1106
1107const cmt_regexp::iterator cmt_or_node::match (const cmt_string& text,
1108 int pos) const
1109{
1110 if ((pos < 0) || (pos >= text.size ()))
1111 {
1112 return (cmt_regexp::iterator::null ());
1113 }
1114
1115 if (_nodes.size () == 0) return (cmt_regexp::iterator (pos, 0));
1116
1117 bool dbg = CmtSystem::testenv ("CMTTESTREGEXP");
1118
1119 if (dbg) {tab (); cerr << "match or (" << this << ") pos=" << pos << endl;}
1120
1121 int i;
1122
1123 int longest = 0;
1124
1125 cmt_regexp::iterator result = cmt_regexp::iterator::null ();
1126
1127 for (i = 0; i < _nodes.size (); i++)
1128 {
1129 const cmt_regexp_node* n = _nodes[i];
1130
1131 if (dbg) tab_level++;
1132 const cmt_regexp::iterator it = n->match (text, pos);
1133 if (dbg) tab_level--;
1134
1135 if (it._length > longest)
1136 {
1137 longest = it._length;
1138 result = it;
1139 }
1140
1141 // at least one or-ed expression matches
1142 // if (it != cmt_regexp::iterator::null ()) return (it);
1143 }
1144
1145 return (result);
1146}
1147
1148void cmt_or_node::dump () const
1149{
1150 cmt_regexp_node_set::dump ("or");
1151}
1152
1153//----------------------------------------------------------
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167//----------------------------------------------------------
1168cmt_regexp::cmt_regexp ()
1169{
1170 _root = 0;
1171}
1172
1173//----------------------------------------------------------
1174cmt_regexp::cmt_regexp (const cmt_string& expression)
1175{
1176 _root = 0;
1177 set (expression);
1178}
1179
1180//----------------------------------------------------------
1181void cmt_regexp::set (const cmt_string& expression)
1182{
1183 if (_root != 0)
1184 {
1185 delete _root;
1186 _root = 0;
1187 }
1188
1189 //
1190 // The root is the cmt_or_node which will be returned. It is
1191 // the top of the hierarchy.
1192 //
1193 // top is the running cmt_and_node.
1194 //
1195 cmt_regexp_node_set* or_root = 0;
1196 cmt_regexp_node_set* top_and = 0;
1197
1198 // abcd
1199 // ab|cd
1200 // a|b|cd
1201 // a|b*|cd
1202 // a|b*|cd?e
1203 //
1204 // exp : and
1205 // | exp '|' and
1206 //
1207 // and : unary
1208 // | unary and
1209 //
1210 // unary : primary '*'
1211 // | primary '?'
1212 //
1213 // primary : '[' opt_begin opt_chars opt_end ']'
1214 // | '^'
1215 // | '$'
1216 // | char
1217 // | '(' exp ')'
1218 //
1219
1220 {
1221 //
1222 // First we build an cmt_or_node (corresponding to the
1223 // first grammatical rule)
1224 //
1225 // Then cmt_and_nodes are pushed into it.
1226 // and standard nodes are pushed into the running (top_and) cmt_and_node
1227 //
1228 or_root = new cmt_or_node (0);
1229 top_and = new cmt_and_node (or_root);
1230 }
1231
1232 int i;
1233
1234 for (i = 0; i < expression.size (); i++)
1235 {
1236 char c = expression[i];
1237 switch (c)
1238 {
1239 case '[':
1240 {
1241 //
1242 // The case is
1243 //
1244 // exp : '[' char ... ']'
1245 // exp : '[' '^' char ... ']'
1246 //
1247
1248 if (i >= expression.size ())
1249 {
1250 // syntax error : unbalanced '['
1251 delete or_root;
1252 return;
1253 }
1254 i++;
1255
1256 int i0 = i;
1257
1258 bool done = false;
1259 bool has_not = false;
1260
1261 cmt_string choices = "";
1262
1263 for (; i < expression.size (); i++)
1264 {
1265 c = expression[i];
1266 switch (c)
1267 {
1268 case ']':
1269 done = true;
1270 break;
1271 case '^':
1272 if (i == i0) has_not = true;
1273 else choices += c;
1274 break;
1275 case '\\':
1276 choices += c;
1277 if (i >= expression.size ())
1278 {
1279 // syntax error : unbalanced '[' and unfinished
1280 // escape sequence
1281 delete or_root;
1282 return;
1283 }
1284 i++;
1285 c = expression[i];
1286 choices += c;
1287 break;
1288 default:
1289 choices += c;
1290 break;
1291 }
1292 if (done) break;
1293 }
1294
1295 if (!done)
1296 {
1297 // syntax error : unbalanced '['
1298 delete or_root;
1299 return;
1300 }
1301 if (has_not)
1302 top_and->push (new cmt_not_char_list_node (choices));
1303 else
1304 top_and->push (new cmt_char_list_node (choices));
1305 }
1306 break;
1307 case '*':
1308 {
1309 //
1310 // exp : exp '*'
1311 //
1312 if (top_and->nodes () == 0)
1313 {
1314 // Syntax error : '*' is not preceded by an expression
1315 delete or_root;
1316 return;
1317 }
1318
1319 cmt_regexp_node* n = top_and->pop ();
1320 top_and->push (new cmt_zero_more (n));
1321 }
1322 break;
1323 case '+':
1324 {
1325 //
1326 // exp : exp '+'
1327 //
1328 if (top_and->nodes () == 0)
1329 {
1330 // Syntax error : '+' is not preceded by an expression
1331 delete or_root;
1332 return;
1333 }
1334
1335 cmt_regexp_node* n = top_and->pop ();
1336 top_and->push (new cmt_one_more (n));
1337 }
1338 break;
1339 case '?':
1340 {
1341 //
1342 // exp : exp '?'
1343 //
1344 if (top_and->nodes () == 0)
1345 {
1346 // Syntax error : '?' is not preceded by an expression
1347 delete or_root;
1348 return;
1349 }
1350
1351 cmt_regexp_node* n = top_and->pop ();
1352 top_and->push (new cmt_zero_one (n));
1353 }
1354 break;
1355 case '.':
1356 //
1357 // exp : '.'
1358 //
1359 top_and->push (new cmt_any_node ());
1360 break;
1361 case '(':
1362 {
1363 //
1364 // exp : '(' exp ')'
1365 //
1366 if (top_and->parentheses ())
1367 {
1368 // This should never happen.
1369 delete or_root;
1370 return;
1371 }
1372
1373 top_and->set_parentheses (true);
1374
1375 //
1376 // A new complete expression is started.
1377 // -> do as for top_and parsing.
1378 //
1379
1380 top_and = new cmt_and_node (new cmt_or_node (top_and));
1381 }
1382 break;
1383 case ')':
1384 {
1385 //
1386 // exp : '(' exp ')'
1387 //
1388
1389 // top_and is the cmt_and_node into which new nodes are pushed.
1390 cmt_regexp_node_set* or_node = top_and->father ();
1391 if (or_node == 0)
1392 {
1393 // This should never happen : top_and should always be
1394 // at least an cmt_and_node hanging at an cmt_or_node
1395 delete or_root;
1396 return;
1397 }
1398
1399 //
1400 // The last cmt_and_node was empty, thus we had either '()' or '(...|)'
1401 //
1402
1403 if (top_and->nodes () == 0)
1404 {
1405 delete (or_node->pop ());
1406 }
1407 else
1408 {
1409 top_and->reduce ();
1410 }
1411
1412 top_and = or_node->father ();
1413
1414 if (top_and == 0)
1415 {
1416 // Syntax error : too many ')'
1417 delete or_root;
1418 return;
1419 }
1420
1421 //
1422 // top_and is now the previous running cmt_and_node where the '('
1423 // was originally met its top_and node contains the parenthesized
1424 // sub expression If this one is empty, (due to an empty '()'
1425 // expression) then it may simply be discarded.
1426 //
1427
1428 if (!top_and->parentheses ())
1429 {
1430 // Syntax error : too many ')'
1431 delete or_root;
1432 return;
1433 }
1434
1435 top_and->set_parentheses (false);
1436
1437 cmt_regexp_node* unique = 0;
1438 if (or_node->nodes () == 1)
1439 {
1440 cmt_regexp_node_set* and_node = (cmt_regexp_node_set*) or_node->top ();
1441 if (and_node->nodes () == 1)
1442 {
1443 unique = and_node->pop ();
1444 delete (or_node->pop ());
1445 }
1446 else if (and_node->nodes () == 0)
1447 {
1448 delete (or_node->pop ());
1449 }
1450 }
1451
1452 if (or_node->nodes () == 0) delete (top_and->pop ());
1453 if (unique != 0) top_and->push (unique);
1454 }
1455
1456 break;
1457 case '|':
1458 {
1459 //
1460 // exp : exp '|' exp
1461 //
1462
1463 cmt_regexp_node_set* or_node = top_and->father ();
1464
1465 top_and->reduce ();
1466
1467 //
1468 // or is the father cmt_or_node, which only contains cmt_and_nodes
1469 //
1470
1471 const cmt_regexp_node_set* and_node = (cmt_regexp_node_set*) or_node->top ();
1472 if (and_node->nodes () == 0)
1473 {
1474 // the previous node was empty.
1475 // we may discard it
1476 or_node->pop ();
1477 }
1478
1479 top_and = new cmt_and_node (or_node);
1480 }
1481 break;
1482 case '^':
1483 //
1484 // exp : '^'
1485 //
1486 top_and->push (new cmt_begin_node ());
1487 break;
1488 case '$':
1489 //
1490 // exp : '$'
1491 //
1492 top_and->push (new cmt_end_node ());
1493 break;
1494 case '\\':
1495 if (i >= expression.size ())
1496 {
1497 delete or_root;
1498 return;
1499 }
1500 i++;
1501 c = expression[i];
1502 switch (c)
1503 {
1504 case '[':
1505 case ']':
1506 case '(':
1507 case ')':
1508 case '.':
1509 case '*':
1510 case '?':
1511 case '^':
1512 case '$':
1513 case '\\':
1514 break;
1515 case 'r':
1516 c = '\r';
1517 break;
1518 case 't':
1519 c = '\t';
1520 break;
1521 case 'n':
1522 c = '\n';
1523 break;
1524 default:
1525 break;
1526 }
1527 default:
1528 top_and->push (new cmt_char_node (c));
1529 break;
1530 }
1531 }
1532
1533 if (or_root != 0)
1534 {
1535 cmt_regexp_node_set* and_node = (cmt_regexp_node_set*) or_root->top ();
1536
1537 if (or_root->nodes () == 1)
1538 {
1539 //
1540 // Check whether there is at least one non-empty
1541 // cmt_and_node
1542 //
1543 if (and_node->nodes () == 0)
1544 {
1545 delete or_root;
1546 return;
1547 }
1548 }
1549
1550 if (and_node != 0)
1551 {
1552 and_node->reduce ();
1553
1554 if (and_node->parentheses ())
1555 {
1556 delete or_root;
1557 return;
1558 }
1559 }
1560 }
1561
1562 _root = or_root;
1563
1564 bool dbg = CmtSystem::testenv ("CMTTESTREGEXP");
1565
1566 if (dbg)
1567 {
1568 if (_root != 0)
1569 {
1570 _root->dump ();
1571 }
1572 }
1573}
1574
1575cmt_regexp::~cmt_regexp ()
1576{
1577 if (_root != 0)
1578 {
1579 delete _root;
1580 }
1581}
1582
1583//----------------------------------------------------------
1584void cmt_regexp::clear ()
1585{
1586 if (_root != 0)
1587 {
1588 delete _root;
1589 _root = 0;
1590 }
1591}
1592
1593//----------------------------------------------------------
1594bool cmt_regexp::is_valid () const
1595{
1596 if (_root != 0) return (true);
1597 else return (false);
1598}
1599
1600cmt_regexp::iterator cmt_regexp::begin (const cmt_string& text, int pos)
1601{
1602 if (_root != 0)
1603 {
1604 int i;
1605
1606 for (i = pos; i < text.size (); i++)
1607 {
1608 cmt_regexp::iterator it = _root->match (text, i);
1609 if (it != end ()) return (it);
1610 }
1611 }
1612
1613 return (end ());
1614}
1615
1616cmt_regexp::iterator cmt_regexp::end ()
1617{
1618 return (cmt_regexp::iterator::null ());
1619}
1620
1621cmt_regexp::iterator cmt_regexp::begin (const cmt_string& text, int pos) const
1622{
1623 if (_root != 0)
1624 {
1625 int i;
1626
1627 for (i = pos; i < text.size (); i++)
1628 {
1629 cmt_regexp::iterator it = _root->match (text, i);
1630 if (it != end ()) return (it);
1631 }
1632 }
1633
1634 return (end ());
1635}
1636
1637cmt_regexp::iterator cmt_regexp::end () const
1638{
1639 return (cmt_regexp::iterator::null ());
1640}
1641
1642bool cmt_regexp::match (const cmt_string& text) const
1643{
1644 iterator it = begin (text);
1645 if (it == end ()) return (false);
1646 else return (true);
1647}
1648//----------------------------------------------------------
1649
1650//----------------------------------------------------------
1651const cmt_regexp::iterator cmt_regexp::iterator::null ()
1652{
1653 static const iterator null_instance (-1, -1);
1654
1655 return (null_instance);
1656}
1657
1658cmt_regexp::iterator::iterator ()
1659{
1660 _pos = 0;
1661 _length = 0;
1662}
1663
1664cmt_regexp::iterator::iterator (int pos, int length)
1665{
1666 _pos = pos;
1667 _length = length;
1668}
1669
1670cmt_regexp::iterator::iterator (const iterator& other)
1671{
1672 _pos = other._pos;
1673 _length = other._length;
1674}
1675
1676int cmt_regexp::iterator::operator != (const iterator& other) const
1677{
1678 return ((this->_pos != other._pos) ||
1679 (this->_length != other._length));
1680}
1681
1682int cmt_regexp::iterator::operator == (const iterator& other) const
1683{
1684 return ((this->_pos == other._pos) &&
1685 (this->_length == other._length));
1686}
1687
1688int cmt_regexp::iterator::operator < (const iterator& other) const
1689{
1690 if (_pos == -1) return (0);
1691 if (other._pos == -1) return (0);
1692
1693 return (_pos < other._pos);
1694}
1695
1696cmt_string cmt_regexp::iterator::operator () (const cmt_string& text) const
1697{
1698 if (_pos == -1) return ("");
1699 if (_length <= 0) return ("");
1700
1701 return (text.substr (_pos, _length));
1702}
1703
1704//----------------------------------------------------------
1705
Note: See TracBrowser for help on using the repository browser.