<?php

// Refuse to run unless the CON_FRAMEWORK constant has already been defined;
// when it is missing the request is aborted with the message below.
defined('CON_FRAMEWORK') || die('Illegal call: Missing framework initialization - request aborted.');

/**
 * Minimal forward-only HTML tokenizer.
 *
 * Usage: construct with the HTML text, then call parse() repeatedly. After
 * every call that returns true the public properties iNodeType, iNodeName,
 * iNodeValue and iNodeAttributes describe the node that was just read.
 * parse() returns false (and sets iNodeType to NODE_TYPE_DONE) once the
 * input is exhausted.
 *
 * Known limitations of the scanner (kept as-is):
 * - a ">" inside a quoted attribute value terminates the tag;
 * - a comment is only recognised when "<!--" is followed by whitespace
 *   or ">".
 */
class HtmlParser {

    /**
     * Node type: opening (or self-closing) element such as <a href="...">.
     */
    const NODE_TYPE_ELEMENT = 1;

    /**
     * Node type: closing element such as </a>.
     */
    const NODE_TYPE_ENDELEMENT = 2;

    /**
     * Node type: plain text between tags (also used for malformed tags).
     */
    const NODE_TYPE_TEXT = 3;

    /**
     * Node type: HTML comment (<!-- ... -->).
     */
    const NODE_TYPE_COMMENT = 4;

    /**
     * Node type: end of input reached, nothing more to read.
     */
    const NODE_TYPE_DONE = 5;

    /**
     * Type of the node read last; one of the NODE_TYPE_* constants.
     *
     * @var int
     */
    public $iNodeType;

    /**
     * Name of the node read last: the tag name for elements and end
     * elements, "Text" for text nodes, "Comment" for comments.
     *
     * @var string
     */
    public $iNodeName = "";

    /**
     * Value of the node read last: the text for text nodes, the complete
     * comment markup for comments, "" for elements and end elements.
     *
     * @var string
     */
    public $iNodeValue = "";

    /**
     * Attributes of the tag read last, keyed by lower-cased attribute name.
     * Only reset when a tag is read, so it keeps the previous tag's
     * attributes while a text node is current.
     *
     * @var array
     */
    public $iNodeAttributes;

    /**
     * The HTML text being parsed.
     *
     * @var string
     */
    public $iHtmlText;

    /**
     * Length of the HTML text in bytes.
     *
     * @var int
     */
    public $iHtmlTextLength;

    /**
     * Current read position (byte offset) inside the HTML text.
     *
     * @var int
     */
    public $iHtmlTextIndex = 0;

    /**
     * Creates a parser for the given HTML text.
     *
     * @param string $aHtmlText
     *         HTML text to tokenize
     */
    public function __construct($aHtmlText) {
        $this->iHtmlText = (string) $aHtmlText;
        $this->iHtmlTextLength = strlen($this->iHtmlText);
        $this->iHtmlTextIndex = 0;
    }

    /**
     * Old PHP 4 style constructor, kept so that existing code calling it
     * explicitly keeps working. PHP 8 no longer treats a method named like
     * the class as a constructor, hence the real work lives in
     * __construct().
     *
     * @deprecated use __construct()
     * @param string $aHtmlText
     *         HTML text to tokenize
     */
    public function HtmlParser($aHtmlText) {
        $this->__construct($aHtmlText);
    }

    /**
     * Reads the next node from the input.
     *
     * Text up to the next "<" is reported as a text node; if there is no
     * such text the tag at the current position is read instead.
     *
     * @return bool
     *         true if a node was read, false if the input is exhausted
     */
    public function parse() {
        $text = $this->skipToElement();
        if ($text !== "") {
            $this->iNodeType = self::NODE_TYPE_TEXT;
            $this->iNodeName = "Text";
            $this->iNodeValue = $text;
            return true;
        }

        return $this->readTag();
    }

    /**
     * Resets the attribute list to an empty array.
     */
    public function clearAttributes() {
        $this->iNodeAttributes = array();
    }

    /**
     * Reads the tag starting at the current position and fills the node
     * properties accordingly.
     *
     * @return bool
     *         false (with node type NODE_TYPE_DONE) if the current
     *         character is not "<", true otherwise
     */
    public function readTag() {
        if ($this->currentChar() !== "<") {
            $this->iNodeType = self::NODE_TYPE_DONE;
            return false;
        }

        // Consume the opening "<" (and any directly repeated "<").
        $this->skipInTag(array("<"));
        $this->clearAttributes();
        $name = $this->skipToBlanksInTag();

        if (strpos($name, "/") === 0) {
            // "</name": closing tag
            $this->iNodeType = self::NODE_TYPE_ENDELEMENT;
            $this->iNodeName = substr($name, 1);
            $this->iNodeValue = "";
        } elseif (!$this->isValidTagIdentifier($name)) {
            // Not a usable tag name: either a comment or stray markup that
            // is handed back as text.
            if ($name === "!--") {
                $rest = $this->skipToStringInTag("-->");
                if ($rest !== "") {
                    $this->iNodeType = self::NODE_TYPE_COMMENT;
                    $this->iNodeName = "Comment";
                    $this->iNodeValue = "<" . $name . $rest;
                    return true;
                }
            }

            $this->iNodeType = self::NODE_TYPE_TEXT;
            $this->iNodeName = "Text";
            $this->iNodeValue = "<" . $name;
            return true;
        } else {
            $this->iNodeType = self::NODE_TYPE_ELEMENT;
            $this->iNodeValue = "";
            // "<br/>" yields the raw name "br/": strip the trailing slash.
            if (substr($name, -1) === "/") {
                $this->iNodeName = substr($name, 0, -1);
            } else {
                $this->iNodeName = $name;
            }
        }

        // Attributes: each one must be preceded by at least one blank.
        while ($this->skipBlanksInTag()) {
            $attrName = $this->skipToBlanksOrEqualsInTag();
            if ($attrName === "") {
                continue;
            }

            $key = strtolower($attrName);
            $this->skipBlanksInTag();
            if ($this->currentChar() === "=") {
                $this->skipEqualsInTag();
                $this->skipBlanksInTag();
                $this->iNodeAttributes[$key] = $this->readValueInTag();
            } else {
                // Attribute without a value, e.g. "disabled".
                $this->iNodeAttributes[$key] = "";
            }
        }

        $this->skipEndOfTag();
        return true;
    }

    /**
     * Checks whether the given tag name is usable as an element name.
     *
     * The pattern is not anchored, so any name containing at least one
     * ASCII letter or digit is accepted.
     *
     * @param string $name
     *         raw tag name as read from the input
     * @return int|false
     *         result of preg_match(): 1 on match, 0 otherwise
     */
    public function isValidTagIdentifier($name) {
        return preg_match('/[A-Za-z0-9]+/', $name);
    }

    /**
     * Skips blanks (space, tab, CR, LF) at the current position.
     *
     * @return bool
     *         true if at least one blank was skipped
     */
    public function skipBlanksInTag() {
        return $this->skipInTag(array(" ", "\t", "\r", "\n")) !== "";
    }

    /**
     * Reads up to the next blank, "=" or ">" (exclusive).
     *
     * @return string
     *         the characters that were read
     */
    public function skipToBlanksOrEqualsInTag() {
        return $this->skipToInTag(array(" ", "\t", "\r", "\n", "="));
    }

    /**
     * Reads up to the next blank or ">" (exclusive).
     *
     * @return string
     *         the characters that were read
     */
    public function skipToBlanksInTag() {
        return $this->skipToInTag(array(" ", "\t", "\r", "\n"));
    }

    /**
     * Skips "=" characters at the current position.
     *
     * @return string
     *         the characters that were skipped
     */
    public function skipEqualsInTag() {
        return $this->skipInTag(array("="));
    }

    /**
     * Reads an attribute value at the current position.
     *
     * Values may be enclosed in double or single quotes, or unquoted (then
     * they end at the next blank or ">"). Exactly one opening and one
     * closing quote are consumed, so empty values ('' or "") are read
     * correctly.
     *
     * @return string
     *         the attribute value without surrounding quotes
     */
    public function readValueInTag() {
        $quote = $this->currentChar();

        if ($quote !== "\"" && $quote !== "'") {
            return $this->skipToBlanksInTag();
        }

        // Step over the opening quote only; skipInTag() would also swallow
        // the closing quote of an empty value.
        $this->moveNext();
        $value = $this->skipToInTag(array($quote));
        if ($this->currentChar() === $quote) {
            $this->moveNext();
        }

        return $value;
    }

    /**
     * Returns the character at the current position.
     *
     * @return string|int
     *         the character, or integer -1 when the end of the input has
     *         been reached
     */
    public function currentChar() {
        if ($this->iHtmlTextIndex >= $this->iHtmlTextLength) {
            return -1;
        }

        return $this->iHtmlText[$this->iHtmlTextIndex];
    }

    /**
     * Advances the read position by one character.
     *
     * @return bool
     *         false if the position was already at the end of the input
     */
    public function moveNext() {
        if ($this->iHtmlTextIndex >= $this->iHtmlTextLength) {
            return false;
        }

        $this->iHtmlTextIndex++;
        return true;
    }

    /**
     * Consumes the closing ">" of a tag if it is the current character.
     *
     * @return string
     *         ">" if it was consumed, "" otherwise
     */
    public function skipEndOfTag() {
        if ($this->currentChar() !== ">") {
            return "";
        }

        $this->moveNext();
        return ">";
    }

    /**
     * Skips a run of the given characters, never moving past ">".
     *
     * @param array|string $chars
     *         characters to skip (a single character may be passed as
     *         string)
     * @return string
     *         the characters that were skipped
     */
    public function skipInTag($chars) {
        if (!is_array($chars)) {
            $chars = array($chars);
        }

        $skipped = "";
        while (($ch = $this->currentChar()) !== -1) {
            if ($ch === ">" || !in_array($ch, $chars, true)) {
                break;
            }
            $skipped .= $ch;
            $this->moveNext();
        }

        return $skipped;
    }

    /**
     * Reads characters until one of the given characters or ">" is found.
     * The terminating character is not consumed.
     *
     * @param array|string $chars
     *         characters to stop at (a single character may be passed as
     *         string)
     * @return string
     *         the characters that were read
     */
    public function skipToInTag($chars) {
        if (!is_array($chars)) {
            $chars = array($chars);
        }

        $read = "";
        while (($ch = $this->currentChar()) !== -1) {
            if ($ch === ">" || in_array($ch, $chars, true)) {
                break;
            }
            $read .= $ch;
            $this->moveNext();
        }

        return $read;
    }

    /**
     * Reads text up to the next "<" (exclusive) or the end of the input.
     *
     * @return string
     *         the text that was read, "" if the current character is "<"
     *         or the input is exhausted
     */
    public function skipToElement() {
        if ($this->iHtmlTextIndex >= $this->iHtmlTextLength) {
            return "";
        }

        $pos = strpos($this->iHtmlText, "<", $this->iHtmlTextIndex);
        $end = ($pos === false) ? $this->iHtmlTextLength : $pos;

        $text = (string) substr($this->iHtmlText, $this->iHtmlTextIndex, $end - $this->iHtmlTextIndex);
        $this->iHtmlTextIndex = $end;

        return $text;
    }

    /**
     * Reads everything from the current position up to and including the
     * next occurrence of $needle and moves the read position behind it.
     *
     * @param string $needle
     *         string to search for
     * @return string
     *         the text read including $needle, or "" (position unchanged)
     *         if $needle does not occur
     */
    public function skipToStringInTag($needle) {
        $pos = strpos($this->iHtmlText, $needle, $this->iHtmlTextIndex);
        if ($pos === false) {
            return "";
        }

        $top = $pos + strlen($needle);
        $retvalue = substr($this->iHtmlText, $this->iHtmlTextIndex, $top - $this->iHtmlTextIndex);
        $this->iHtmlTextIndex = $top;

        return $retvalue;
    }
}