1: <?php
2:
3: 4: 5: 6: 7: 8: 9: 10: 11:
12:
// Abort immediately when this file is requested without the framework having
// defined the CON_FRAMEWORK constant first.
if (!defined('CON_FRAMEWORK')) {
    die('Illegal call: Missing framework initialization - request aborted.');
}
14:
15: 16: 17: 18: 19: 20: 21: 22: 23: 24: 25: 26: 27: 28: 29: 30: 31: 32: 33:
/**
 * Forward-only, pull-style HTML tokenizer.
 *
 * Each call to parse() advances through the HTML text to the next node
 * (text, element, end element or comment) and exposes that node through the
 * getNode*() accessors. parse() returns false once the text is exhausted.
 *
 * Known limitations kept for backward compatibility:
 *  - a ">" always ends the current tag, even inside a quoted attribute value;
 *  - a comment is only recognised when "<!--" is directly followed by a blank
 *    or by ">" (the tag name is read up to the first blank or ">");
 *  - a "/" separated from the tag name by blanks (e.g. "<br />") is stored as
 *    an attribute named "/".
 */
class HtmlParser {

    /**
     * Node type: opening element tag.
     */
    const NODE_TYPE_ELEMENT = 1;

    /**
     * Node type: closing element tag.
     */
    const NODE_TYPE_ENDELEMENT = 2;

    /**
     * Node type: text between tags (also used for malformed tag starts).
     */
    const NODE_TYPE_TEXT = 3;

    /**
     * Node type: HTML comment.
     */
    const NODE_TYPE_COMMENT = 4;

    /**
     * Node type: end of input reached.
     */
    const NODE_TYPE_DONE = 5;

    /**
     * Type of the current node (one of the NODE_TYPE_* constants).
     *
     * @var int
     */
    protected $_NodeType;

    /**
     * Name of the current node: the tag name for elements and end elements,
     * "Text" for text nodes, "Comment" for comments.
     *
     * @var string
     */
    protected $_NodeName = "";

    /**
     * Value of the current node; filled for text and comment nodes and set
     * to an empty string for elements and end elements.
     *
     * @var string
     */
    protected $_NodeValue = "";

    /**
     * Attributes of the current tag, keyed by lower-cased attribute name.
     *
     * @var array
     */
    protected $_NodeAttributes = array();

    /**
     * The HTML text being parsed.
     *
     * @var string
     */
    protected $_HtmlText = '';

    /**
     * Length of the HTML text in bytes.
     *
     * @var int
     */
    protected $_HtmlTextLength;

    /**
     * Current read position inside the HTML text.
     *
     * @var int
     */
    protected $_HtmlTextIndex = 0;

    /**
     * Creates a parser for the given HTML text.
     *
     * @param string $HtmlText
     *         HTML text to parse
     */
    public function __construct($HtmlText) {
        $this->setHtmlText($HtmlText);
        $this->setHtmlTextLength(strlen($HtmlText));
    }

    /**
     * Old-style constructor, kept only for backward compatibility.
     *
     * @deprecated Use __construct() instead.
     * @param string $HtmlText
     *         HTML text to parse
     * @return mixed
     *         whatever __construct() returns (nothing)
     */
    public function HtmlParser($HtmlText) {
        cDeprecated('This method is deprecated and is not needed any longer. Please use __construct() as constructor function.');
        return $this->__construct($HtmlText);
    }

    /**
     * Sets the HTML text. Neither the stored length nor the read index is
     * touched; callers have to keep them in sync themselves.
     *
     * @param string $HtmlText
     * @return string
     *         the value that was set
     */
    public function setHtmlText($HtmlText) {
        return $this->_HtmlText = $HtmlText;
    }

    /**
     * Sets the length used as the end-of-input boundary.
     *
     * @param int $HtmlTextLength
     * @return int
     *         the value that was set
     */
    public function setHtmlTextLength($HtmlTextLength) {
        return $this->_HtmlTextLength = $HtmlTextLength;
    }

    /**
     * Sets the current read position.
     *
     * @param int $HtmlTextIndex
     * @return int
     *         the value that was set
     */
    public function setHtmlTextIndex($HtmlTextIndex) {
        return $this->_HtmlTextIndex = $HtmlTextIndex;
    }

    /**
     * Replaces the attribute list of the current node.
     *
     * @param array $NodeAttributes
     * @return array|bool
     *         the array that was set, or false if the argument is no array
     */
    public function _setNodeAttributes($NodeAttributes) {
        if (!is_array($NodeAttributes)) {
            return false;
        }

        return $this->_NodeAttributes = $NodeAttributes;
    }

    /**
     * Returns the HTML text.
     *
     * @return string
     */
    public function getHtmlText() {
        return $this->_HtmlText;
    }

    /**
     * Returns the stored length of the HTML text.
     *
     * @return int
     */
    public function getHtmlTextLength() {
        return $this->_HtmlTextLength;
    }

    /**
     * Returns the type of the current node (NODE_TYPE_* constant).
     *
     * @return int
     */
    public function getNodeType() {
        return $this->_NodeType;
    }

    /**
     * Returns the name of the current node.
     *
     * @return string
     */
    public function getNodeName() {
        return $this->_NodeName;
    }

    /**
     * Returns the value of the current node (text or comment content, empty
     * string for elements and end elements).
     *
     * @return string
     */
    public function getNodeValue() {
        return $this->_NodeValue;
    }

    /**
     * Returns all attributes of the current node.
     *
     * @return array
     */
    public function getNodeAttributesArray() {
        return $this->_NodeAttributes;
    }

    /**
     * Returns the value of a single attribute of the current node.
     *
     * The key is used as given; attribute names collected by the parser are
     * stored lower-cased.
     *
     * @param string $attribute
     *         attribute name
     * @return string|null
     *         the attribute value, or null if the attribute is not set
     */
    public function getNodeAttributes($attribute) {
        // Explicit check avoids an "undefined index/array key" notice while
        // still yielding null for unknown attributes.
        return isset($this->_NodeAttributes[$attribute]) ? $this->_NodeAttributes[$attribute] : null;
    }

    /**
     * Returns the current read position.
     *
     * @return int
     */
    public function getHtmlTextIndex() {
        return $this->_HtmlTextIndex;
    }

    /**
     * Advances the read position by one.
     *
     * @return int
     *         the read position before the increment
     */
    protected function increaseHtmlTextIndex() {
        return $this->_HtmlTextIndex++;
    }

    /**
     * Advances to the next node.
     *
     * Text in front of the next "<" is reported as a text node; otherwise the
     * tag at the current position is read.
     *
     * @return bool
     *         true if a node was read, false if the end of input was reached
     */
    public function parse() {
        $text = $this->_skipToElement();
        if ($text !== "") {
            $this->_NodeType = self::NODE_TYPE_TEXT;
            $this->_NodeName = "Text";
            $this->_NodeValue = $text;
            return true;
        }
        return $this->_readTag();
    }

    /**
     * Empties the attribute list of the current node.
     *
     * @return array
     *         the (empty) attribute list
     */
    protected function _clearAttributes() {
        return $this->_NodeAttributes = array();
    }

    /**
     * Reads the tag starting at the current position and fills node type,
     * name, value and attributes accordingly.
     *
     * @return bool
     *         false (node type DONE) if the current character is not "<",
     *         true otherwise
     */
    protected function _readTag() {
        if ($this->_currentChar() !== "<") {
            $this->_NodeType = self::NODE_TYPE_DONE;
            return false;
        }

        // Pass an array: _skipInTag() works on a list of characters.
        $this->_skipInTag(array("<"));
        $this->_clearAttributes();
        $name = $this->_skipToBlanksInTag();

        if (strpos($name, "/") === 0) {
            // Closing tag: strip the leading slash from the name.
            $this->_NodeType = self::NODE_TYPE_ENDELEMENT;
            $this->_NodeName = substr($name, 1);
            $this->_NodeValue = "";
        } elseif (!$this->_isValidTagIdentifier($name)) {
            // Not a tag name: either a comment or a stray "<" reported as text.
            $isComment = false;
            if ($name == "!--") {
                $rest = $this->_skipToStringInTag("-->");
                if ($rest != "") {
                    $this->_NodeType = self::NODE_TYPE_COMMENT;
                    $this->_NodeName = "Comment";
                    $this->_NodeValue = "<" . $name . $rest;
                    $isComment = true;
                }
            }
            if (!$isComment) {
                $this->_NodeType = self::NODE_TYPE_TEXT;
                $this->_NodeName = "Text";
                $this->_NodeValue = "<" . $name;
            }
            return true;
        } else {
            $this->_NodeType = self::NODE_TYPE_ELEMENT;
            $this->_NodeValue = "";
            // "<br/>" style: drop the trailing slash from the name.
            $this->_NodeName = (substr($name, -1) === "/") ? substr($name, 0, -1) : $name;
        }

        // Attributes must be separated from the tag name and from each other
        // by blanks. $hasSeparator records whether such blanks were consumed,
        // no matter which of the skip calls below consumed them; otherwise a
        // valueless attribute would swallow the separator and end the loop.
        $hasSeparator = $this->_skipBlanksInTag();
        while ($hasSeparator) {
            $attrName = $this->_skipToBlanksOrEqualsInTag();
            if ($attrName === "") {
                break;
            }

            $value = "";
            $hasSeparator = $this->_skipBlanksInTag();
            if ($this->_currentChar() === "=") {
                $this->_skipEqualsInTag();
                $this->_skipBlanksInTag();
                $value = $this->_readValueInTag();
                $hasSeparator = $this->_skipBlanksInTag();
            }

            $attributes = $this->getNodeAttributesArray();
            $attributes[strtolower($attrName)] = $value;
            $this->_setNodeAttributes($attributes);
        }

        $this->_skipEndOfTag();
        return true;
    }

    /**
     * Checks whether the given name is accepted as a tag name.
     *
     * The pattern is not anchored: any name containing at least one ASCII
     * letter or digit is accepted.
     *
     * @param string $name
     * @return int|bool
     *         result of preg_match(): 1 if accepted, 0 if not, false on error
     */
    protected function _isValidTagIdentifier($name) {
        return preg_match('/[A-Za-z0-9]+/', $name);
    }

    /**
     * Skips blanks (space, tab, CR, LF) inside a tag.
     *
     * @return bool
     *         true if at least one blank was skipped
     */
    protected function _skipBlanksInTag() {
        return $this->_skipInTag(array(" ", "\t", "\r", "\n")) !== "";
    }

    /**
     * Reads up to the next blank, "=" or ">" inside a tag.
     *
     * @return string
     *         the characters read
     */
    protected function _skipToBlanksOrEqualsInTag() {
        return $this->_skipToInTag(array(" ", "\t", "\r", "\n", "="));
    }

    /**
     * Reads up to the next blank or ">" inside a tag.
     *
     * @return string
     *         the characters read
     */
    protected function _skipToBlanksInTag() {
        return $this->_skipToInTag(array(" ", "\t", "\r", "\n"));
    }

    /**
     * Skips "=" characters inside a tag.
     *
     * @return string
     *         the characters skipped
     */
    protected function _skipEqualsInTag() {
        return $this->_skipInTag(array("="));
    }

    /**
     * Reads an attribute value at the current position.
     *
     * A value starting with a double or single quote is read up to the
     * matching quote (or ">" / end of input); any other value is read up to
     * the next blank or ">".
     *
     * @return string
     *         the attribute value without surrounding quotes
     */
    protected function _readValueInTag() {
        $quote = $this->_currentChar();
        if ($quote !== "\"" && $quote !== "'") {
            return $this->_skipToBlanksInTag();
        }

        // Consume exactly one opening quote so that an empty value ("")
        // is not mistaken for two opening quotes.
        $this->_moveNext();
        $value = $this->_skipToInTag(array($quote));
        if ($this->_currentChar() === $quote) {
            $this->_moveNext();
        }

        return $value;
    }

    /**
     * Returns the character at the current read position.
     *
     * @return string|int
     *         the character, or the integer -1 at the end of input
     */
    protected function _currentChar() {
        $index = $this->getHtmlTextIndex();
        if ($index >= $this->getHtmlTextLength()) {
            return -1;
        }
        $HtmlText = $this->getHtmlText();
        return $HtmlText[$index];
    }

    /**
     * Moves the read position one character forward.
     *
     * @return bool
     *         false if the end of input was already reached
     */
    protected function _moveNext() {
        if ($this->getHtmlTextIndex() >= $this->getHtmlTextLength()) {
            return false;
        }
        $this->increaseHtmlTextIndex();
        return true;
    }

    /**
     * Consumes the closing ">" of a tag if it is the current character.
     *
     * @return string
     *         ">" if it was consumed, an empty string otherwise
     */
    protected function _skipEndOfTag() {
        if ($this->_currentChar() === ">") {
            $this->_moveNext();
            return ">";
        }
        return "";
    }

    /**
     * Skips consecutive characters contained in $chars; stops at ">", at the
     * first other character or at the end of input.
     *
     * @param array|string $chars
     *         list of single characters to skip (a single character may be
     *         given as plain string)
     * @return string
     *         the characters skipped
     */
    protected function _skipInTag($chars) {
        $chars = (array) $chars;
        $skipped = "";
        while (($ch = $this->_currentChar()) !== -1) {
            if ($ch === ">" || !in_array($ch, $chars, true)) {
                break;
            }
            $skipped .= $ch;
            $this->_moveNext();
        }
        return $skipped;
    }

    /**
     * Reads characters until one contained in $chars, a ">" or the end of
     * input is reached. The terminating character is not consumed.
     *
     * @param array|string $chars
     *         list of single characters to stop at (a single character may
     *         be given as plain string)
     * @return string
     *         the characters read
     */
    protected function _skipToInTag($chars) {
        $chars = (array) $chars;
        $read = "";
        while (($ch = $this->_currentChar()) !== -1) {
            if ($ch === ">" || in_array($ch, $chars, true)) {
                break;
            }
            $read .= $ch;
            $this->_moveNext();
        }
        return $read;
    }

    /**
     * Reads characters up to the next "<" or the end of input. The "<" is not
     * consumed.
     *
     * @return string
     *         the text read
     */
    protected function _skipToElement() {
        $text = "";
        while (($ch = $this->_currentChar()) !== -1) {
            if ($ch === "<") {
                break;
            }
            $text .= $ch;
            $this->_moveNext();
        }
        return $text;
    }

    /**
     * Reads from the current position up to and including the next
     * occurrence of $needle and moves the read position behind it.
     *
     * @param string $needle
     *         string to search for
     * @return string
     *         the text read including $needle, or an empty string (read
     *         position unchanged) if $needle does not occur
     */
    protected function _skipToStringInTag($needle) {
        $start = $this->getHtmlTextIndex();
        $pos = strpos($this->getHtmlText(), $needle, $start);
        if ($pos === false) {
            return "";
        }
        $end = $pos + strlen($needle);
        $read = substr($this->getHtmlText(), $start, $end - $start);
        $this->setHtmlTextIndex($end);
        return $read;
    }
}