1: <?php
  2:   3:   4:   5:   6:   7:   8:   9:  10:  11:  12: 
 13: 
 14: defined('CON_FRAMEWORK') || die('Illegal call: Missing framework initialization - request aborted.');
 15: 
 16:  17:  18:  19:  20:  21:  22:  23:  24:  25:  26:  27:  28:  29:  30:  31: 
 32: class HtmlParser {
 33: 
 34:      35:  36:  37:  38: 
 39:     const NODE_TYPE_ELEMENT = 1;
 40: 
 41:      42:  43:  44:  45: 
 46:     const NODE_TYPE_ENDELEMENT = 2;
 47: 
 48:      49:  50:  51:  52: 
 53:     const NODE_TYPE_TEXT = 3;
 54: 
 55:      56:  57:  58:  59: 
 60:     const  = 4;
 61: 
 62:      63:  64:  65:  66: 
 67:     const NODE_TYPE_DONE = 5;
 68: 
 69:      70:  71:  72:  73:  74: 
 75:     var $iNodeType;
 76: 
 77:      78:  79:  80:  81:  82: 
 83:     var $iNodeName = "";
 84: 
 85:      86:  87:  88:  89:  90: 
 91:     var $iNodeValue = "";
 92: 
 93:      94:  95:  96:  97:  98:  99: 
100:     var $iNodeAttributes;
101: 
102:     103: 104: 105: 106: 
107:     var $iHtmlText;
108: 
109:     110: 111: 112: 113: 
114:     var $iHtmlTextLength;
115: 
116:     117: 118: 119: 120: 
121:     var $iHtmlTextIndex = 0;
122: 
123:     124: 125: 126: 127: 128: 129: 
130:     function HtmlParser($aHtmlText) {
131:         $this->iHtmlText = $aHtmlText;
132:         $this->iHtmlTextLength = strlen($aHtmlText);
133:     }
134: 
135:     136: 137: 138: 139: 140: 141: 
142:     function parse() {
143:         $text = $this->skipToElement();
144:         if ($text != "") {
145:             $this->iNodeType = self::NODE_TYPE_TEXT;
146:             $this->iNodeName = "Text";
147:             $this->iNodeValue = $text;
148:             return true;
149:         }
150:         return $this->readTag();
151:     }
152: 
153:     154: 
155:     function clearAttributes() {
156:         $this->iNodeAttributes = array();
157:     }
158: 
159:     160: 161: 162: 
163:     function readTag() {
164:         if ($this->currentChar() != "<") {
165:             $this->iNodeType = self::NODE_TYPE_DONE;
166:             return false;
167:         }
168: 
169:         $this->skipInTag("<");
170:         $this->clearAttributes();
171:         $name = $this->skipToBlanksInTag();
172:         $pos = strpos($name, "/");
173: 
174:         if ($pos === 0) {
175:             $this->iNodeType = self::NODE_TYPE_ENDELEMENT;
176:             $this->iNodeName = substr($name, 1);
177:             $this->iNodeValue = "";
178:         } else {
179:             if (!$this->isValidTagIdentifier($name)) {
180:                 $comment = false;
181:                 if ($name == "!--") {
182:                     $rest = $this->skipToStringInTag("-->");
183:                     if ($rest != "") {
184:                         $this->iNodeType = self::NODE_TYPE_COMMENT;
185:                         $this->iNodeName = "Comment";
186:                         $this->iNodeValue = "<" . $name . $rest;
187:                         $comment = true;
188:                     }
189:                 }
190:                 if (!$comment) {
191:                     $this->iNodeType = self::NODE_TYPE_TEXT;
192:                     $this->iNodeName = "Text";
193:                     $this->iNodeValue = "<" . $name;
194:                 }
195:                 return true;
196:             } else {
197:                 $this->iNodeType = self::NODE_TYPE_ELEMENT;
198:                 $this->iNodeValue = "";
199:                 $nameLength = strlen($name);
200:                 if ($nameLength > 0 && substr($name, $nameLength - 1, 1) == "/") {
201:                     $this->iNodeName = substr($name, 0, $nameLength - 1);
202:                 } else {
203:                     $this->iNodeName = $name;
204:                 }
205:             }
206:         }
207: 
208:         while ($this->skipBlanksInTag()) {
209:             $attrName = $this->skipToBlanksOrEqualsInTag();
210:             if ($attrName != "") {
211:                 $this->skipBlanksInTag();
212:                 if ($this->currentChar() == "=") {
213:                     $this->skipEqualsInTag();
214:                     $this->skipBlanksInTag();
215:                     $value = $this->readValueInTag();
216:                     $this->iNodeAttributes[strtolower($attrName)] = $value;
217:                 } else {
218:                     $this->iNodeAttributes[strtolower($attrName)] = "";
219:                 }
220:             }
221:         }
222:         $this->skipEndOfTag();
223:         return true;
224:     }
225: 
226:     227: 228: 229: 230: 
231:     function isValidTagIdentifier($name) {
232:         return preg_match('/[A-Za-z0-9]+/', $name);
233:     }
234: 
235:     236: 237: 238: 
239:     function skipBlanksInTag() {
240:         return "" != ($this->skipInTag(array(
241:             " ",
242:             "\t",
243:             "\r",
244:             "\n"
245:         )));
246:     }
247: 
248:     249: 250: 251: 
252:     function skipToBlanksOrEqualsInTag() {
253:         return $this->skipToInTag(array(
254:             " ",
255:             "\t",
256:             "\r",
257:             "\n",
258:             "="
259:         ));
260:     }
261: 
262:     263: 264: 265: 
266:     function skipToBlanksInTag() {
267:         return $this->skipToInTag(array(
268:             " ",
269:             "\t",
270:             "\r",
271:             "\n"
272:         ));
273:     }
274: 
275:     276: 277: 278: 
279:     function skipEqualsInTag() {
280:         return $this->skipInTag(array(
281:             "="
282:         ));
283:     }
284: 
285:     286: 287: 288: 
289:     function readValueInTag() {
290:         $ch = $this->currentChar();
291:         $value = "";
292: 
293:         if ($ch == "\"") {
294:             $this->skipInTag(array(
295:                 "\""
296:             ));
297:             $value = $this->skipToInTag(array(
298:                 "\""
299:             ));
300:             $this->skipInTag(array(
301:                 "\""
302:             ));
303:         } else if ($ch == "\'") {
304:             $this->skipInTag(array(
305:                 "\'"
306:             ));
307:             $value = $this->skipToInTag(array(
308:                 "\'"
309:             ));
310:             $this->skipInTag(array(
311:                 "\'"
312:             ));
313:         } else {
314:             $value = $this->skipToBlanksInTag();
315:         }
316: 
317:         return $value;
318:     }
319: 
320:     321: 322: 323: 
324:     function currentChar() {
325:         if ($this->iHtmlTextIndex >= $this->iHtmlTextLength) {
326:             return -1;
327:         }
328:         return $this->iHtmlText{$this->iHtmlTextIndex};
329:     }
330: 
331:     332: 333: 334: 
335:     function moveNext() {
336:         if ($this->iHtmlTextIndex < $this->iHtmlTextLength) {
337:             $this->iHtmlTextIndex++;
338:             return true;
339:         } else {
340:             return false;
341:         }
342:     }
343: 
344:     345: 346: 347: 
348:     function skipEndOfTag() {
349:         $sb = "";
350:         if (($ch = $this->currentChar()) !== -1) {
351:             $match = ($ch == ">");
352:             if (!$match) {
353:                 return $sb;
354:             }
355:             $sb .= $ch;
356:             $this->moveNext();
357:         }
358:         return $sb;
359:     }
360: 
361:     362: 363: 364: 365: 
366:     function skipInTag($chars) {
367:         $sb = "";
368:         while (($ch = $this->currentChar()) !== -1) {
369:             if ($ch == ">") {
370:                 return $sb;
371:             } else {
372:                 $match = false;
373:                 for ($idx = 0; $idx < count($chars); $idx++) {
374:                     if ($ch == $chars[$idx]) {
375:                         $match = true;
376:                         break;
377:                     }
378:                 }
379:                 if (!$match) {
380:                     return $sb;
381:                 }
382:                 $sb .= $ch;
383:                 $this->moveNext();
384:             }
385:         }
386:         return $sb;
387:     }
388: 
389:     390: 391: 392: 393: 
394:     function skipToInTag($chars) {
395:         $sb = "";
396:         while (($ch = $this->currentChar()) !== -1) {
397:             $match = $ch == ">";
398:             if (!$match) {
399:                 for ($idx = 0; $idx < count($chars); $idx++) {
400:                     if ($ch == $chars[$idx]) {
401:                         $match = true;
402:                         break;
403:                     }
404:                 }
405:             }
406:             if ($match) {
407:                 return $sb;
408:             }
409:             $sb .= $ch;
410:             $this->moveNext();
411:         }
412:         return $sb;
413:     }
414: 
415:     416: 417: 418: 
419:     function skipToElement() {
420:         $sb = "";
421:         while (($ch = $this->currentChar()) !== -1) {
422:             if ($ch == "<") {
423:                 return $sb;
424:             }
425:             $sb .= $ch;
426:             $this->moveNext();
427:         }
428:         return $sb;
429:     }
430: 
431:     432: 433: 434: 435: 436: 437: 438: 439: 440: 
441:     function skipToStringInTag($needle) {
442:         $pos = strpos($this->iHtmlText, $needle, $this->iHtmlTextIndex);
443:         if ($pos === false) {
444:             return "";
445:         }
446:         $top = $pos + strlen($needle);
447:         $retvalue = substr($this->iHtmlText, $this->iHtmlTextIndex, $top - $this->iHtmlTextIndex);
448:         $this->iHtmlTextIndex = $top;
449:         return $retvalue;
450:     }
451: }
452: