1: <?php
2:
3: 4: 5: 6: 7: 8: 9: 10: 11:
12:
// Abort immediately when this file is requested without the framework bootstrap
// having defined the CON_FRAMEWORK constant.
if (!defined('CON_FRAMEWORK')) {
    die('Illegal call: Missing framework initialization - request aborted.');
}
14:
15: 16: 17: 18: 19: 20: 21: 22: 23: 24: 25: 26: 27: 28: 29: 30: 31: 32: 33:
/**
 * Minimal pull-style HTML tokenizer.
 *
 * Each call to parse() advances through the HTML text given to the
 * constructor and exposes exactly one node (element, end element, text or
 * comment) through the getNode*() accessors. parse() returns false and the
 * node type becomes NODE_TYPE_DONE once no further node can be read.
 *
 * All string handling goes through the project's cString helper.
 */
class HtmlParser
{
    /**
     * Node type of an opening (or self-closing) element, e.g. "<a href=...>".
     */
    const NODE_TYPE_ELEMENT = 1;

    /**
     * Node type of a closing element, e.g. "</a>".
     */
    const NODE_TYPE_ENDELEMENT = 2;

    /**
     * Node type of plain text between tags (also used for "<..." sequences
     * that are neither a valid tag nor a complete comment).
     */
    const NODE_TYPE_TEXT = 3;

    /**
     * Node type of an HTML comment ("<!-- ... -->").
     */
    const NODE_TYPE_COMMENT = 4;

    /**
     * Node type set when parse() could not read another node.
     */
    const NODE_TYPE_DONE = 5;

    /**
     * Type of the current node, one of the NODE_TYPE_* constants.
     * Unset (null) until parse() has been called.
     *
     * @var int|null
     */
    protected $_NodeType;

    /**
     * Name of the current node: the tag name for elements and end elements,
     * "Text" for text nodes, "Comment" for comment nodes.
     *
     * @var string
     */
    protected $_NodeName = "";

    /**
     * Value of the current node: the text for text nodes, the complete
     * comment markup for comment nodes, empty for elements.
     *
     * @var string
     */
    protected $_NodeValue = "";

    /**
     * Attributes of the current element, keyed by lower-cased attribute name.
     * Attributes without a value are stored with an empty string.
     *
     * @var array
     */
    protected $_NodeAttributes = [];

    /**
     * The HTML text being parsed.
     *
     * @var string
     */
    protected $_HtmlText = '';

    /**
     * Length of the HTML text as reported by cString::getStringLength().
     *
     * @var int
     */
    protected $_HtmlTextLength;

    /**
     * Current read position inside the HTML text.
     *
     * @var int
     */
    protected $_HtmlTextIndex = 0;

    /**
     * Creates a parser for the given HTML text and stores its length.
     *
     * @param string $HtmlText
     *         The HTML text to parse.
     */
    public function __construct($HtmlText)
    {
        $this->setHtmlText($HtmlText);
        $this->setHtmlTextLength(cString::getStringLength($HtmlText));
    }

    /**
     * Sets the HTML text. Does not touch the stored length or read index.
     *
     * @param string $htmlText
     * @return string
     *         The value that was set.
     */
    public function setHtmlText($htmlText)
    {
        return $this->_HtmlText = $htmlText;
    }

    /**
     * Sets the length of the HTML text.
     *
     * @param int $htmlTextLength
     * @return int
     *         The value that was set.
     */
    public function setHtmlTextLength($htmlTextLength)
    {
        return $this->_HtmlTextLength = $htmlTextLength;
    }

    /**
     * Sets the current read position.
     *
     * @param int $HtmlTextIndex
     * @return int
     *         The value that was set.
     */
    public function setHtmlTextIndex($HtmlTextIndex)
    {
        return $this->_HtmlTextIndex = $HtmlTextIndex;
    }

    /**
     * Replaces the attribute list of the current node.
     *
     * @param array $NodeAttributes
     * @return array|bool
     *         The attributes that were set, or false if no array was given.
     */
    public function _setNodeAttributes($NodeAttributes)
    {
        if (!is_array($NodeAttributes)) {
            return false;
        }

        return $this->_NodeAttributes = $NodeAttributes;
    }

    /**
     * Returns the HTML text.
     *
     * @return string
     */
    public function getHtmlText()
    {
        return $this->_HtmlText;
    }

    /**
     * Returns the stored length of the HTML text.
     *
     * @return int
     */
    public function getHtmlTextLength()
    {
        return $this->_HtmlTextLength;
    }

    /**
     * Returns the type of the current node (one of the NODE_TYPE_* constants).
     *
     * @return int|null
     */
    public function getNodeType()
    {
        return $this->_NodeType;
    }

    /**
     * Returns the name of the current node.
     *
     * @return string
     */
    public function getNodeName()
    {
        return $this->_NodeName;
    }

    /**
     * Returns the value of the current node (text content or comment markup).
     *
     * @return string
     */
    public function getNodeValue()
    {
        return $this->_NodeValue;
    }

    /**
     * Returns all attributes of the current node.
     *
     * @return array
     */
    public function getNodeAttributesArray()
    {
        return $this->_NodeAttributes;
    }

    /**
     * Returns the value of a single attribute of the current node.
     *
     * @param string $attribute
     *         Attribute name; names are stored lower-cased by the parser.
     * @return string
     *         The attribute value, or an empty string if it is not set.
     */
    public function getNodeAttributes($attribute)
    {
        return isset($this->_NodeAttributes[$attribute]) ? $this->_NodeAttributes[$attribute] : '';
    }

    /**
     * Returns the current read position.
     *
     * @return int
     */
    public function getHtmlTextIndex()
    {
        return $this->_HtmlTextIndex;
    }

    /**
     * Advances the read position by one.
     *
     * @return int
     *         The read position before it was increased.
     */
    protected function increaseHtmlTextIndex()
    {
        return $this->_HtmlTextIndex++;
    }

    /**
     * Reads the next node. Text up to the next "<" is returned as a text
     * node; otherwise the tag at the current position is read.
     *
     * @return bool
     *         True if a node was read, false if no further node is available.
     */
    public function parse()
    {
        $text = $this->_skipToElement();
        if ($text !== "") {
            $this->_NodeType = self::NODE_TYPE_TEXT;
            $this->_NodeName = "Text";
            $this->_NodeValue = $text;

            return true;
        }

        return $this->_readTag();
    }

    /**
     * Empties the attribute list of the current node.
     *
     * @return array
     *         The (empty) attribute list.
     */
    protected function _clearAttributes()
    {
        return $this->_NodeAttributes = [];
    }

    /**
     * Reads the tag starting at the current position and fills node type,
     * name, value and attributes accordingly.
     *
     * @return bool
     *         False (and node type NODE_TYPE_DONE) if the current character
     *         is not "<", true otherwise.
     */
    protected function _readTag()
    {
        if ($this->_currentChar() !== "<") {
            $this->_NodeType = self::NODE_TYPE_DONE;

            return false;
        }

        $this->_skipInTag(["<"]);
        $this->_clearAttributes();
        $name = $this->_skipToBlanksInTag();

        if (cString::findFirstPos($name, "/") === 0) {
            // Leading slash: closing tag, the name follows the slash.
            $this->_NodeType = self::NODE_TYPE_ENDELEMENT;
            $this->_NodeName = cString::getPartOfString($name, 1);
            $this->_NodeValue = "";
        } elseif (!$this->_isValidTagIdentifier($name)) {
            // Not a tag name: either a complete comment or literal text.
            $rest = ($name == "!--") ? $this->_skipToStringInTag("-->") : "";
            if ($rest != "") {
                $this->_NodeType = self::NODE_TYPE_COMMENT;
                $this->_NodeName = "Comment";
                $this->_NodeValue = "<" . $name . $rest;
            } else {
                $this->_NodeType = self::NODE_TYPE_TEXT;
                $this->_NodeName = "Text";
                $this->_NodeValue = "<" . $name;
            }

            return true;
        } else {
            $this->_NodeType = self::NODE_TYPE_ELEMENT;
            $this->_NodeValue = "";
            $nameLength = cString::getStringLength($name);
            // A trailing slash ("<br/>") is not part of the element name.
            if ($nameLength > 0 && cString::getPartOfString($name, $nameLength - 1, 1) == "/") {
                $this->_NodeName = cString::getPartOfString($name, 0, $nameLength - 1);
            } else {
                $this->_NodeName = $name;
            }
        }

        // Every attribute must be preceded by at least one blank.
        while ($this->_skipBlanksInTag()) {
            $attrName = $this->_skipToBlanksOrEqualsInTag();
            if ($attrName === "") {
                continue;
            }

            $this->_skipBlanksInTag();

            // Attributes without "=" are stored with an empty value.
            $value = "";
            if ($this->_currentChar() === "=") {
                $this->_skipEqualsInTag();
                $this->_skipBlanksInTag();
                $value = $this->_readValueInTag();
            }

            $NodeAttributes = $this->getNodeAttributesArray();
            $NodeAttributes[cString::toLowerCase($attrName)] = $value;
            $this->_setNodeAttributes($NodeAttributes);
        }

        $this->_skipEndOfTag();

        return true;
    }

    /**
     * Checks whether the given name contains at least one ASCII letter or
     * digit (the pattern is not anchored).
     *
     * @param string $name
     * @return int|false
     *         Result of preg_match(): 1 on match, 0 otherwise, false on error.
     */
    protected function _isValidTagIdentifier($name)
    {
        return preg_match('/[A-Za-z0-9]+/', $name);
    }

    /**
     * Skips blanks (space, tab, CR, LF) at the current position.
     *
     * @return bool
     *         True if at least one blank was skipped.
     */
    protected function _skipBlanksInTag()
    {
        return "" != ($this->_skipInTag([" ", "\t", "\r", "\n"]));
    }

    /**
     * Reads up to the next blank, "=" or ">".
     *
     * @return string
     *         The characters that were read.
     */
    protected function _skipToBlanksOrEqualsInTag()
    {
        return $this->_skipToInTag([" ", "\t", "\r", "\n", "="]);
    }

    /**
     * Reads up to the next blank or ">".
     *
     * @return string
     *         The characters that were read.
     */
    protected function _skipToBlanksInTag()
    {
        return $this->_skipToInTag([" ", "\t", "\r", "\n"]);
    }

    /**
     * Skips "=" characters at the current position.
     *
     * @return string
     *         The characters that were skipped.
     */
    protected function _skipEqualsInTag()
    {
        return $this->_skipInTag(["="]);
    }

    /**
     * Reads an attribute value at the current position. Values enclosed in
     * double or single quotes are returned without the quotes; unquoted
     * values end at the next blank. In every case reading stops at ">".
     *
     * @return string
     *         The attribute value.
     */
    protected function _readValueInTag()
    {
        $quote = $this->_currentChar();

        if ($quote === "\"" || $quote === "'") {
            // Consume exactly one opening quote so that an empty value
            // (e.g. alt="") is not mistaken for a run of opening quotes.
            $this->_moveNext();
            $value = $this->_skipToInTag([$quote]);
            // Consume the closing quote if the value was properly terminated.
            if ($this->_currentChar() === $quote) {
                $this->_moveNext();
            }

            return $value;
        }

        return $this->_skipToBlanksInTag();
    }

    /**
     * Returns the character at the current read position.
     *
     * @return string|int
     *         The character, or the integer -1 at the end of the text.
     */
    protected function _currentChar()
    {
        if ($this->getHtmlTextIndex() >= $this->getHtmlTextLength()) {
            return -1;
        }
        $HtmlText = $this->getHtmlText();

        return cString::getPartOfString($HtmlText, $this->getHtmlTextIndex(), 1);
    }

    /**
     * Moves the read position one character forward unless the end of the
     * text has been reached.
     *
     * @return bool
     *         True if the position was moved.
     */
    protected function _moveNext()
    {
        if ($this->getHtmlTextIndex() < $this->getHtmlTextLength()) {
            $this->increaseHtmlTextIndex();

            return true;
        }

        return false;
    }

    /**
     * Consumes a single ">" at the current position, if present.
     *
     * @return string
     *         ">" if it was consumed, an empty string otherwise.
     */
    protected function _skipEndOfTag()
    {
        if ($this->_currentChar() === ">") {
            $this->_moveNext();

            return ">";
        }

        return "";
    }

    /**
     * Consumes consecutive characters contained in $chars. Stops at ">",
     * at the first character not in $chars, or at the end of the text.
     *
     * @param array $chars
     *         Characters to skip.
     * @return string
     *         The characters that were skipped.
     */
    protected function _skipInTag($chars)
    {
        $sb = "";
        while (($ch = $this->_currentChar()) !== -1) {
            if ($ch === ">" || !in_array($ch, $chars, true)) {
                break;
            }
            $sb .= $ch;
            $this->_moveNext();
        }

        return $sb;
    }

    /**
     * Consumes characters until ">", one of $chars, or the end of the text
     * is reached. The terminating character itself is not consumed.
     *
     * @param array $chars
     *         Characters that terminate reading.
     * @return string
     *         The characters that were read.
     */
    protected function _skipToInTag($chars)
    {
        $sb = "";
        while (($ch = $this->_currentChar()) !== -1) {
            if ($ch === ">" || in_array($ch, $chars, true)) {
                break;
            }
            $sb .= $ch;
            $this->_moveNext();
        }

        return $sb;
    }

    /**
     * Consumes characters until "<" or the end of the text is reached.
     *
     * @return string
     *         The characters that were read.
     */
    protected function _skipToElement()
    {
        $sb = "";
        while (($ch = $this->_currentChar()) !== -1) {
            if ($ch === "<") {
                break;
            }
            $sb .= $ch;
            $this->_moveNext();
        }

        return $sb;
    }

    /**
     * Searches for $needle from the current position onwards. If found, the
     * read position is moved behind the needle and everything from the old
     * position up to and including the needle is returned.
     *
     * @param string $needle
     *         The string to search for.
     * @return string
     *         The consumed text including the needle, or an empty string
     *         (read position unchanged) if the needle was not found.
     */
    protected function _skipToStringInTag($needle)
    {
        $start = $this->getHtmlTextIndex();
        $pos = cString::findFirstPos($this->getHtmlText(), $needle, $start);
        if ($pos === false) {
            return "";
        }

        $top = $pos + cString::getStringLength($needle);
        $retvalue = cString::getPartOfString($this->getHtmlText(), $start, $top - $start);
        $this->setHtmlTextIndex($top);

        return $retvalue;
    }
}