Changeset 400 for CMT/HEAD/source/cmt_regexp.cxx
- Timestamp:
- Apr 23, 2007, 11:10:18 AM (17 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
CMT/HEAD/source/cmt_regexp.cxx
r11 r400 17 17 18 18 static int tab_level = 0; 19 19 20 static void tab () 20 21 { 21 22 for (int i = 0; i < tab_level; i++) 22 23 { 23 c out<< " ";24 cerr << " "; 24 25 } 25 26 } … … 287 288 288 289 const cmt_regexp::iterator cmt_regexp_node::match (const cmt_string& /*text*/, 289 290 int /*pos*/) const 290 291 { 291 292 return (cmt_regexp::iterator::null()); … … 345 346 void cmt_char_node::dump () const 346 347 { 347 tab (); c out<< "char>(" << this << ") c=" << _c << endl;348 tab (); cerr << "char>(" << this << ") c=" << _c << endl; 348 349 } 349 350 … … 357 358 358 359 const cmt_regexp::iterator cmt_string_node::match (const cmt_string& text, 359 360 int pos) const 360 361 { 361 362 if ((pos < 0) || (pos > text.size ())) … … 378 379 void cmt_string_node::dump () const 379 380 { 380 tab (); c out<< "string (" << this << ") s=[" << _s << "]" << endl;381 tab (); cerr << "string (" << this << ") s=[" << _s << "]" << endl; 381 382 } 382 383 … … 399 400 switch (c) 400 401 { 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 402 case '-': 403 i++; 404 { 405 char c1 = _choices[_choices.size () - 1]; 406 char c2 = list[i]; 407 int j; 408 int j0 = (c1 < c2) ? c1 : c2; 409 int j1 = (c1 > c2) ? c1 : c2; 410 for (j = j0; j <= j1; j++) 411 { 412 _choices += j; 413 } 414 } 415 break; 416 case '\\': 417 i++; 418 c = list[i]; 419 switch (c) 420 { 421 case '[': 422 case ']': 423 case '(': 424 case ')': 425 case '.': 426 case '*': 427 case '?': 428 case '^': 429 case '$': 430 case '\\': 431 c = '\\'; 432 break; 433 case 'r': 434 c = '\r'; 435 break; 436 case 't': 437 c = '\t'; 438 break; 439 case 'n': 440 c = '\n'; 441 break; 442 default: 443 break; 444 } 445 default: 446 _choices += c; 447 break; 447 448 } 448 449 } … … 450 451 451 452 const cmt_regexp::iterator cmt_char_list_node::match (const cmt_string& text, 452 453 int pos) const 453 454 { 454 455 if ((pos < 0) || (pos > text.size ())) … … 471 472 void cmt_char_list_node::dump () const 472 473 { 473 tab (); c out<< "char_list (" << this << ") list=[" << _list << "] choices=[" << _choices << "]" << endl;474 tab (); cerr << "char_list (" << this << ") list=[" << _list << "] choices=[" << _choices << "]" << endl; 474 475 } 475 476 … … 478 479 //---------------------------------------------------------- 479 480 cmt_not_char_list_node::cmt_not_char_list_node (cmt_string list) : 480 481 cmt_char_list_node (list) 481 482 { 482 483 } 483 484 484 485 const cmt_regexp::iterator cmt_not_char_list_node::match (const cmt_string& text, 485 486 int pos) const 486 487 { 487 488 if ((pos < 0) || (pos > text.size ())) … … 504 505 void cmt_not_char_list_node::dump () const 505 506 { 506 tab (); c out<< "not_char_list (" << this << ") list=[" << _list << "] choices=[" << _choices << "]" << endl;507 tab (); cerr << "not_char_list (" << this << ") list=[" << _list << "] choices=[" << _choices << "]" << endl; 507 508 } 508 509 … … 512 513 //---------------------------------------------------------- 513 514 const cmt_regexp::iterator cmt_any_node::match (const cmt_string& text, 514 515 int pos) const 515 516 { 516 517 if ((pos < 0) | (pos >= text.size ())) … … 524 525 void cmt_any_node::dump () const 525 526 { 526 tab (); c out<< "any (" << this << ") " << endl;527 tab (); cerr << "any (" << this << ") " << endl; 527 528 } 528 529 … … 540 541 541 542 const cmt_regexp::iterator cmt_zero_one::match (const cmt_string& text, 542 543 int pos) const 543 544 { 544 545 if ((pos < 0) || (pos > text.size ())) … … 564 565 void cmt_zero_one::dump () const 565 566 { 566 tab (); cout << "zero_one (" << this << ") " << endl; 567 tab (); cerr << "zero_one (" << this << ") " << endl; 568 567 569 if (_node != 0) 568 570 { … … 602 604 void cmt_many_node::dump () const 603 605 { 604 tab (); cout << "many (" << this << ") " << endl; 606 tab (); cerr << "many (" << this << ") " << endl; 607 605 608 if (_node != 0) 606 609 { … … 609 612 tab_level--; 610 613 } 614 611 615 tab_level++; 612 616 _follower.dump (); … … 624 628 625 629 const cmt_regexp::iterator cmt_zero_more::match (const cmt_string& text, 626 630 int pos) const 627 631 { 628 632 if ((pos < 0) || (pos > text.size ())) … … 699 703 void cmt_zero_more::dump () const 700 704 { 701 tab (); cout << "zero_more (" << this << ") " << endl; 705 tab (); cerr << "zero_more (" << this << ") " << endl; 706 702 707 if (_node != 0) 703 708 { … … 706 711 tab_level--; 707 712 } 713 708 714 _follower.dump (); 709 715 } … … 717 723 718 724 const cmt_regexp::iterator cmt_one_more::match (const cmt_string& text, 719 725 int pos) const 720 726 { 721 727 if ((pos < 0) || (pos > text.size ())) … … 798 804 void cmt_one_more::dump () const 799 805 { 800 tab (); cout << "one_more (" << this << ") " << endl; 806 tab (); cerr << "one_more (" << this << ") " << endl; 807 801 808 if (_node != 0) 802 809 { … … 805 812 tab_level--; 806 813 } 814 807 815 tab_level++; 808 816 _follower.dump (); … … 816 824 //---------------------------------------------------------- 817 825 const cmt_regexp::iterator cmt_begin_node::match (const cmt_string& /*text*/, 818 826 int pos) const 819 827 { 820 828 if (pos == 0) return (cmt_regexp::iterator (pos, 0)); … … 824 832 void cmt_begin_node::dump () const 825 833 { 826 tab (); c out<< "begin (" << this << ") " << endl;834 tab (); cerr << "begin (" << this << ") " << endl; 827 835 } 828 836 //---------------------------------------------------------- … … 838 846 void cmt_end_node::dump () const 839 847 { 840 tab (); c out<< "end (" << this << ") " << endl;848 tab (); cerr << "end (" << this << ") " << endl; 841 849 } 842 850 //---------------------------------------------------------- … … 930 938 void cmt_regexp_node_set::dump () const 931 939 { 932 tab (); c out<< "regexp_node_set (" << this << ") " << endl;940 tab (); cerr << "regexp_node_set (" << this << ") " << endl; 933 941 } 934 942 935 943 void cmt_regexp_node_set::dump (const cmt_string& title) const 936 944 { 937 tab (); cout << "Set (" << this << ") father=" << _father << " pars=" << _parentheses << endl; 945 tab (); cerr << "Set (" << this << ") father=" << _father << " pars=" << _parentheses << endl; 946 938 947 for (int i = 0; i < _nodes.size (); i++) 939 948 { … … 943 952 if (i > 0) 944 953 { 945 tab (); c out<< title << endl;954 tab (); cerr << title << endl; 946 955 } 947 956 tab_level++; … … 950 959 } 951 960 } 952 tab (); cout << "EndSet (" << this << ")" << endl; 961 962 tab (); cerr << "EndSet (" << this << ")" << endl; 953 963 } 954 964 //---------------------------------------------------------- … … 978 988 979 989 bool dbg = CmtSystem::testenv ("CMTTESTREGEXP"); 980 if (dbg) {tab (); cout << "match and (" << this << ") pos=" << pos << endl;} 990 991 if (dbg) {tab (); cerr << "match and (" << this << ") pos=" << pos << endl;} 981 992 982 993 for (i = 0; i < _nodes.size (); i++) … … 988 999 if (dbg) tab_level--; 989 1000 990 if (dbg) {tab (); c out<< " -> it(" << n << ") p=" << it._pos << " l=" << it._length << endl;}1001 if (dbg) {tab (); cerr << " -> it(" << n << ") p=" << it._pos << " l=" << it._length << endl;} 991 1002 992 1003 if (it == cmt_regexp::iterator::null ()) return (it); … … 996 1007 } 997 1008 998 1009 // All nodes match 999 1010 1000 1011 return (cmt_regexp::iterator (pos, total)); … … 1105 1116 1106 1117 bool dbg = CmtSystem::testenv ("CMTTESTREGEXP"); 1107 if (dbg) {tab (); cout << "match or (" << this << ") pos=" << pos << endl;} 1118 1119 if (dbg) {tab (); cerr << "match or (" << this << ") pos=" << pos << endl;} 1108 1120 1109 1121 int i; … … 1127 1139 } 1128 1140 1129 1141 // at least one or-ed expression matches 1130 1142 // if (it != cmt_regexp::iterator::null ()) return (it); 1131 1143 } … … 1175 1187 } 1176 1188 1177 1178 1179 1180 1181 1182 1189 // 1190 // The root is the cmt_or_node which will be returned. It is 1191 // the top of the hierarchy. 1192 // 1193 // top is the running cmt_and_node. 1194 // 1183 1195 cmt_regexp_node_set* or_root = 0; 1184 1196 cmt_regexp_node_set* top_and = 0; 1185 1197 1186 // abcd 1187 // ab|cd 1188 // a|b|cd 1189 // a|b*|cd 1190 // a|b*|cd?e 1198 // abcd 1199 // ab|cd 1200 // a|b|cd 1201 // a|b*|cd 1202 // a|b*|cd?e 1203 // 1204 // exp : and 1205 // | exp '|' and 1206 // 1207 // and : unary 1208 // | unary and 1209 // 1210 // unary : primary '*' 1211 // | primary '?' 1212 // 1213 // primary : '[' opt_begin opt_chars opt_end ']' 1214 // | '^' 1215 // | '$' 1216 // | char 1217 // | '(' exp ')' 1218 // 1219 1220 { 1191 1221 // 1192 // exp : and1193 // | exp '|' and1222 // First we build an cmt_or_node (corresponding to the 1223 // first grammatical rule) 1194 1224 // 1195 // and : unary1196 // | unary and1225 // Then cmt_and_nodes are pushed into it. 1226 // and standard nodes are pushed into the running (top_and) cmt_and_node 1197 1227 // 1198 // unary : primary '*'1199 // | primary '?'1200 //1201 // primary : '[' opt_begin opt_chars opt_end ']'1202 // | '^'1203 // | '$'1204 // | char1205 // | '(' exp ')'1206 //1207 1208 {1209 //1210 // First we build an cmt_or_node (corresponding to the1211 // first grammatical rule)1212 //1213 // Then cmt_and_nodes are pushed into it.1214 // and standard nodes are pushed into the running (top_and) cmt_and_node1215 //1216 1228 or_root = new cmt_or_node (0); 1217 1229 top_and = new cmt_and_node (or_root); … … 1225 1237 switch (c) 1226 1238 { 1227 1239 case '[': 1228 1240 { 1229 1230 1231 1232 1233 1234 1241 // 1242 // The case is 1243 // 1244 // exp : '[' char ... ']' 1245 // exp : '[' '^' char ... ']' 1246 // 1235 1247 1236 1248 if (i >= expression.size ()) 1237 1249 { 1238 1250 // syntax error : unbalanced '[' 1239 1251 delete or_root; 1240 1252 return; … … 1254 1266 switch (c) 1255 1267 { 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1268 case ']': 1269 done = true; 1270 break; 1271 case '^': 1272 if (i == i0) has_not = true; 1273 else choices += c; 1274 break; 1275 case '\\': 1276 choices += c; 1277 if (i >= expression.size ()) 1278 { 1279 // syntax error : unbalanced '[' and unfinished 1280 // escape sequence 1281 delete or_root; 1282 return; 1283 } 1284 i++; 1285 c = expression[i]; 1286 choices += c; 1287 break; 1288 default: 1289 choices += c; 1290 break; 1279 1291 } 1280 1292 if (done) break; … … 1283 1295 if (!done) 1284 1296 { 1285 1297 // syntax error : unbalanced '[' 1286 1298 delete or_root; 1287 1299 return; … … 1293 1305 } 1294 1306 break; 1295 1307 case '*': 1296 1308 { 1297 1298 1299 1309 // 1310 // exp : exp '*' 1311 // 1300 1312 if (top_and->nodes () == 0) 1301 1313 { 1302 1314 // Syntax error : '*' is not preceded by an expression 1303 1315 delete or_root; 1304 1316 return; … … 1309 1321 } 1310 1322 break; 1311 1323 case '+': 1312 1324 { 1313 1314 1315 1325 // 1326 // exp : exp '+' 1327 // 1316 1328 if (top_and->nodes () == 0) 1317 1329 { 1318 1330 // Syntax error : '+' is not preceded by an expression 1319 1331 delete or_root; 1320 1332 return; … … 1325 1337 } 1326 1338 break; 1327 1339 case '?': 1328 1340 { 1329 1330 1331 1341 // 1342 // exp : exp '?' 1343 // 1332 1344 if (top_and->nodes () == 0) 1333 1345 { 1334 1346 // Syntax error : '?' is not preceded by an expression 1335 1347 delete or_root; 1336 1348 return; … … 1341 1353 } 1342 1354 break; 1343 1344 1345 1346 1347 1348 1349 1355 case '.': 1356 // 1357 // exp : '.' 1358 // 1359 top_and->push (new cmt_any_node ()); 1360 break; 1361 case '(': 1350 1362 { 1351 1352 1353 1363 // 1364 // exp : '(' exp ')' 1365 // 1354 1366 if (top_and->parentheses ()) 1355 1367 { 1356 1368 // This should never happen. 1357 1369 delete or_root; 1358 1370 return; … … 1361 1373 top_and->set_parentheses (true); 1362 1374 1363 1364 1365 1366 1375 // 1376 // A new complete expression is started. 1377 // -> do as for top_and parsing. 1378 // 1367 1379 1368 1380 top_and = new cmt_and_node (new cmt_or_node (top_and)); 1369 1381 } 1370 1382 break; 1371 1383 case ')': 1372 1384 { 1373 1374 1375 1376 1377 1385 // 1386 // exp : '(' exp ')' 1387 // 1388 1389 // top_and is the cmt_and_node into which new nodes are pushed. 1378 1390 cmt_regexp_node_set* or_node = top_and->father (); 1379 1391 if (or_node == 0) 1380 1392 { 1381 1382 1393 // This should never happen : top_and should always be 1394 // at least an cmt_and_node hanging at an cmt_or_node 1383 1395 delete or_root; 1384 1396 return; 1385 1397 } 1386 1398 1387 1388 1389 1399 // 1400 // The last cmt_and_node was empty, thus we had either '()' or '(...|)' 1401 // 1390 1402 1391 1403 if (top_and->nodes () == 0) … … 1402 1414 if (top_and == 0) 1403 1415 { 1404 1416 // Syntax error : too many ')' 1405 1417 delete or_root; 1406 1418 return; 1407 1419 } 1408 1420 1409 1410 1411 1412 1413 1414 1421 // 1422 // top_and is now the previous running cmt_and_node where the '(' 1423 // was originally met its top_and node contains the parenthesized 1424 // sub expression If this one is empty, (due to an empty '()' 1425 // expression) then it may simply be discarded. 1426 // 1415 1427 1416 1428 if (!top_and->parentheses ()) 1417 1429 { 1418 1430 // Syntax error : too many ')' 1419 1431 delete or_root; 1420 1432 return; … … 1443 1455 1444 1456 break; 1445 1457 case '|': 1446 1458 { 1447 1448 1449 1459 // 1460 // exp : exp '|' exp 1461 // 1450 1462 1451 1463 cmt_regexp_node_set* or_node = top_and->father (); … … 1453 1465 top_and->reduce (); 1454 1466 1455 1456 1457 1467 // 1468 // or is the father cmt_or_node, which only contains cmt_and_nodes 1469 // 1458 1470 1459 1471 const cmt_regexp_node_set* and_node = (cmt_regexp_node_set*) or_node->top (); 1460 1472 if (and_node->nodes () == 0) 1461 1473 { 1462 1463 1474 // the previous node was empty. 1475 // we may discard it 1464 1476 or_node->pop (); 1465 1477 } … … 1468 1480 } 1469 1481 break; 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1482 case '^': 1483 // 1484 // exp : '^' 1485 // 1486 top_and->push (new cmt_begin_node ()); 1487 break; 1488 case '$': 1489 // 1490 // exp : '$' 1491 // 1492 top_and->push (new cmt_end_node ()); 1493 break; 1494 case '\\': 1495 if (i >= expression.size ()) 1496 { 1497 delete or_root; 1498 return; 1499 } 1500 i++; 1501 c = expression[i]; 1502 switch (c) 1503 { 1504 case '[': 1505 case ']': 1506 case '(': 1507 case ')': 1508 case '.': 1509 case '*': 1510 case '?': 1511 case '^': 1512 case '$': 1513 case '\\': 1514 break; 1515 case 'r': 1516 c = '\r'; 1517 break; 1518 case 't': 1519 c = '\t'; 1520 break; 1521 case 'n': 1522 c = '\n'; 1523 break; 1524 default: 1525 break; 1526 } 1527 default: 1528 top_and->push (new cmt_char_node (c)); 1529 break; 1518 1530 } 1519 1531 } … … 1525 1537 if (or_root->nodes () == 1) 1526 1538 { 1527 1528 1529 1530 1539 // 1540 // Check whether there is at least one non-empty 1541 // cmt_and_node 1542 // 1531 1543 if (and_node->nodes () == 0) 1532 1544 {
Note: See TracChangeset
for help on using the changeset viewer.