Overview

Packages

  • CONTENIDO
  • Core
    • Authentication
    • Backend
    • Cache
    • CEC
    • Chain
    • ContentType
    • Database
    • Debug
    • Exception
    • Frontend
      • Search
      • URI
      • Util
    • GenericDB
      • Model
    • GUI
      • HTML
    • I18N
    • LayoutHandler
    • Log
    • Security
    • Session
    • Util
    • Validation
    • Versioning
    • XML
  • Module
    • ContentSitemapHtml
    • ContentSitemapXml
    • ContentUserForum
    • NavigationTop
    • ScriptCookieDirective
  • mpAutoloaderClassMap
  • None
  • PHP
  • Plugin
    • ContentAllocation
    • CronjobOverview
    • FormAssistant
    • FrontendLogic
    • FrontendUsers
    • Linkchecker
    • ModRewrite
    • Newsletter
    • Repository
      • FrontendNavigation
      • KeywordDensity
    • SmartyWrapper
    • UrlShortener
    • UserForum
    • Workflow
  • PluginManager
  • Setup
    • Form
    • GUI
    • Helper
      • Environment
      • Filesystem
      • MySQL
      • PHP
    • UpgradeJob

Classes

  • cSearch
  • cSearchBaseAbstract
  • cSearchIndex
  • cSearchResult
  • Overview
  • Package
  • Class
  • Tree
  • Deprecated
  • Todo
  1: <?php
  2: 
  3: /**
  4:  * This file contains the base class for building search indices.
  5:  *
  6:  * @package Core
  7:  * @subpackage Frontend_Search
  8:  * @author Willi Man
  9:  * @copyright four for business AG <www.4fb.de>
 10:  * @license http://www.contenido.org/license/LIZENZ.txt
 11:  * @link http://www.4fb.de
 12:  * @link http://www.contenido.org
 13:  */
 14: 
 15: defined('CON_FRAMEWORK') || die('Illegal call: Missing framework initialization - request aborted.');
 16: 
 17: cInclude('includes', 'functions.encoding.php');
 18: 
 19: /**
 20:  * CONTENIDO API - Search Index Object.
 21:  *
 22:  * This object creates an index of an article.
 23:  *
 24:  * Create object where $db is the global CONTENIDO database object.
 25:  *
 26:  * $oIndex = new SearchIndex($db);
 27:  *
 28:  * Start indexing where $aContent is the complete content of an article
 29:  * specified by its content types.
 30:  *
 31:  * $oIndex->start($idart, $aContent);
 32:  *
 33:  * It looks like:
 34:  * Array (
 35:  *      [CMS_HTMLHEAD] => Array (
 36:  *          [1] => Herzlich Willkommen...
 37:  *          [2] => ...auf Ihrer Website!
 38:  *      )
 39:  *      [CMS_HTML] => Array (
 40:  *          [1] => Die Inhalte auf dieser Website ...
 41:  *
 42:  * The index for keyword 'willkommen' would look like
 43:  * '&12=1(CMS_HTMLHEAD-1)' which means the keyword 'willkommen' occurs
 44:  * 1 times in article with articleId 12 and content type CMS_HTMLHEAD[1].
 45:  *
 46:  * TODO: The basic idea of the indexing process is to take the complete
 47:  * content of an article and to generate normalized index terms from the
 48:  * content and to store a specific index structure in the relation
 49:  * 'con_keywords'.
 50:  *
 51:  * To take the complete content is not very flexible. It would be better
 52:  * to differentiate by specific content types or by any content.
 53:  *
 54:  * The &, =, () and - seperated string is not easy to parse to compute
 55:  * the search result set.
 56:  *
 57:  * It would be a better idea (and a lot of work) to extend the relation
 58:  * 'con_keywords' to store keywords by articleId (or content source
 59:  * identifier) and content type.
 60:  *
 61:  * The functions removeSpecialChars, setStopwords, setContentTypes and
 62:  * setCmsOptions should be sourced out into a new helper-class.
 63:  *
 64:  * Keep in mind that class Search and SearchResult uses an instance of
 65:  * object Index.
 66:  *
 67:  * @package Core
 68:  * @subpackage Frontend_Search
 69:  */
 70: class cSearchIndex extends cSearchBaseAbstract {
 71: 
 72:     /**
 73:      * content of the cms-types of an article
 74:      *
 75:      * @var array
 76:      */
 77:     protected $_keycode = array();
 78: 
 79:     /**
 80:      * list of keywords of an article
 81:      *
 82:      * @var array
 83:      */
 84:     protected $_keywords = array();
 85: 
 86:     /**
 87:      * words, which should not be indexed
 88:      *
 89:      * @var array
 90:      */
 91:     protected $_stopwords = array();
 92: 
 93:     /**
 94:      * keywords of an article stored in the DB
 95:      *
 96:      * @var array
 97:      */
 98:     protected $_keywordsOld = array();
 99: 
100:     /**
101:      * keywords to be deleted
102:      *
103:      * @var array
104:      */
105:     protected $_keywordsDel = array();
106: 
107:     /**
108:      * 'auto' or 'self'
109:      *
110:      * The field 'auto' in table con_keywords is used for automatic indexing.
111:      * The value is a string like "&12=2(CMS_HTMLHEAD-1,CMS_HTML-1)",
112:      * which means a keyword occurs 2 times in article with $idart 12
113:      * and can be found in CMS_HTMLHEAD[1] and CMS_HTML[1].
114:      *
115:      * The field 'self' can be used in the article properties to index
116:      * the article manually.
117:      *
118:      * @var string
119:      */
120:     protected $_place;
121: 
122:     /**
123:      * array of cms types
124:      *
125:      * @var array
126:      */
127:     protected $_cmsOptions = array();
128: 
129:     /**
130:      * array of all available cms types
131:      *
132:      * htmlhead - HTML Headline
133:      * html - HTML Text
134:      * head - Headline (no HTML)
135:      * text - Text (no HTML)
136:      * img - Upload id of the element
137:      * imgdescr - Image description
138:      * link - Link (URL)
139:      * linktarget - Linktarget (_self, _blank, _top ...)
140:      * linkdescr - Linkdescription
141:      * swf - Upload id of the element
142:      * etc.
143:      *
144:      * @var array
145:      */
146:     protected $_cmsType = array();
147: 
148:     /**
149:      * suffix of all available cms types
150:      *
151:      * @var array
152:      */
153:     protected $_cmsTypeSuffix = array();
154: 
155:     /**
156:      *
157:      * @var int
158:      */
159:     protected $idart;
160: 
161:     /**
162:      * Constructor to create an instance of this class.
163:      *
164:      * Set object properties.
165:      *
166:      * @param cDb $db [optional]
167:      *                CONTENIDO database object
168:      *
169:      * @throws cDbException
170:      * @throws cInvalidArgumentException
171:      */
172:     public function __construct($db = NULL) {
173:         parent::__construct($db);
174: 
175:         $this->setContentTypes();
176:     }
177: 
178:     /**
179:      * Start indexing the article.
180:      *
181:      * @param int    $idart
182:      *                            Article Id
183:      * @param array  $aContent
184:      *                            The complete content of an article specified by its content types.
185:      *                            It looks like:
186:      *                            Array (
187:      *                            [CMS_HTMLHEAD] => Array (
188:      *                            [1] => Herzlich Willkommen...
189:      *                            [2] => ...auf Ihrer Website!
190:      *                            )
191:      *                            [CMS_HTML] => Array (
192:      *                            [1] => Die Inhalte auf dieser Website ...
193:      *                            )
194:      *                            )
195:      * @param string $place       [optional]
196:      *                            The field where to store the index information in db.
197:      * @param array  $cms_options [optional]
198:      *                            One can specify explicitly cms types which should not be indexed.
199:      * @param array  $aStopwords  [optional]
200:      *                            Array with words which should not be indexed.
201:      *
202:      * @throws cInvalidArgumentException
203:      * @throws cDbException
204:      */
205:     public function start($idart, $aContent, $place = 'auto', $cms_options = array(), $aStopwords = array()) {
206:         if (!is_int((int) $idart) || $idart < 0) {
207:             return;
208:         } else {
209:             $this->idart = $idart;
210:         }
211: 
212:         $this->_place = $place;
213:         $this->_keycode = $aContent;
214:         $this->setStopwords($aStopwords);
215:         $this->setCmsOptions($cms_options);
216: 
217:         $this->createKeywords();
218: 
219:         $this->getKeywords();
220: 
221:         $this->saveKeywords();
222: 
223:         $new_keys = array_keys($this->_keywords);
224:         $old_keys = array_keys($this->_keywordsOld);
225: 
226:         $this->_keywordsDel = array_diff($old_keys, $new_keys);
227: 
228:         if (count($this->_keywordsDel) > 0) {
229:             $this->deleteKeywords();
230:         }
231:     }
232: 
233:     /**
234:      * For each cms-type create index structure.
235:      *
236:      * It looks like:
237:      * Array (
238:      *     [die] => CMS_HTML-1
239:      *     [inhalte] => CMS_HTML-1
240:      *     [auf] => CMS_HTML-1 CMS_HTMLHEAD-2
241:      *     [dieser] => CMS_HTML-1
242:      *     [website] => CMS_HTML-1 CMS_HTML-1 CMS_HTMLHEAD-2
243:      * )
244:      *
245:      * @throws cInvalidArgumentException
246:      */
247:     public function createKeywords() {
248:         $tmp_keys = array();
249: 
250:         // Only create keycodes, if some are available
251:         if (is_array($this->_keycode)) {
252:             foreach ($this->_keycode as $idtype => $data) {
253:                 if ($this->checkCmsType($idtype)) {
254:                     foreach ($data as $typeid => $code) {
255:                         $this->_debug('code', $code);
256: 
257:                         // remove backslash
258:                         $code = stripslashes($code);
259:                         // replace HTML line breaks with newlines
260:                         $code = str_ireplace(array(
261:                             '<br>',
262:                             '<br />'
263:                         ), "\n", $code);
264:                         // remove html tags
265:                         $code = strip_tags($code);
266:                         if (cString::getStringLength($code) > 0) {
267:                             $code = conHtmlEntityDecode($code);
268:                         }
269:                         $this->_debug('code', $code);
270: 
271:                         // split content by any number of commas, space
272:                         // characters or hyphens
273:                         $tmp_keys = mb_split('[\s,-]+', trim($code));
274:                         $this->_debug('tmp_keys', $tmp_keys);
275: 
276:                         foreach ($tmp_keys as $value) {
277:                             // index terms are stored with lower case
278:                             $value = conHtmlentities($value);
279:                             $value = trim(cString::toLowerCase($value));
280:                             $value = conHtmlEntityDecode($value);
281: 
282:                             if (!in_array($value, $this->_stopwords)) {
283:                                 // eliminate stopwords
284:                                 $value = $this->removeSpecialChars($value);
285: 
286:                                 if (cString::getStringLength($value) > 1) {
287:                                     // do not index single characters
288:                                     $this->_keywords[$value] = $this->_keywords[$value] . $idtype . '-' . $typeid . ' ';
289:                                 }
290:                             }
291:                         }
292:                     }
293:                 }
294: 
295:                 unset($tmp_keys);
296:             }
297:         }
298: 
299:         $this->_debug('keywords', $this->_keywords);
300:     }
301: 
302:     /**
303:      * Generate index_string from index structure and save keywords.
304:      * The index_string looks like "&12=2(CMS_HTMLHEAD-1,CMS_HTML-1)".
305:      *
306:      * @throws cInvalidArgumentException
307:      * @throws cDbException
308:      */
309:     public function saveKeywords() {
310:         $tmp_count = array();
311: 
312:         foreach ($this->_keywords as $keyword => $count) {
313:             $tmp_count = preg_split('/[\s]/', trim($count));
314:             $this->_debug('tmp_count', $tmp_count);
315: 
316:             $occurrence = count($tmp_count);
317:             $tmp_count = array_unique($tmp_count);
318:             $cms_types = implode(',', $tmp_count);
319:             $index_string = '&' . $this->idart . '=' . $occurrence . '(' . $cms_types . ')';
320: 
321:             if (!array_key_exists($keyword, $this->_keywordsOld)) {
322:                 // if keyword is new, save index information
323:                 // $nextid = $this->db->nextid($this->cfg['tab']['keywords']);
324:                 $sql = "INSERT INTO " . $this->cfg['tab']['keywords'] . "
325:                             (keyword, " . $this->_place . ", idlang)
326:                         VALUES
327:                             ('" . $this->db->escape($keyword) . "', '" . $this->db->escape($index_string) . "', " . cSecurity::toInteger($this->lang) . ")";
328:             } else {
329:                 // if keyword allready exists, create new index_string
330:                 if (preg_match("/&$this->idart=/", $this->_keywordsOld[$keyword])) {
331:                     $index_string = preg_replace("/&$this->idart=[0-9]+\([\w-,]+\)/", $index_string, $this->_keywordsOld[$keyword]);
332:                 } else {
333:                     $index_string = $this->_keywordsOld[$keyword] . $index_string;
334:                 }
335: 
336:                 $sql = "UPDATE " . $this->cfg['tab']['keywords'] . "
337:                         SET " . $this->_place . " = '" . $index_string . "'
338:                         WHERE idlang='" . cSecurity::toInteger($this->lang) . "' AND keyword='" . $this->db->escape($keyword) . "'";
339:             }
340:             $this->_debug('sql', $sql);
341:             $this->db->query($sql);
342:         }
343:     }
344: 
345:     /**
346:      * If keywords don't occur in the article anymore,
347:      * update index_string and delete keyword if necessary.
348:      *
349:      * @throws cInvalidArgumentException
350:      * @throws cDbException
351:      */
352:     public function deleteKeywords() {
353:         foreach ($this->_keywordsDel as $key_del) {
354:             $index_string = preg_replace("/&$this->idart=[0-9]+\([\w-,]+\)/", "", $this->_keywordsOld[$key_del]);
355: 
356:             if (cString::getStringLength($index_string) == 0) {
357:                 // keyword is not referenced by any article
358:                 $sql = "DELETE FROM " . $this->cfg['tab']['keywords'] . "
359:                     WHERE idlang = " . cSecurity::toInteger($this->lang) . " AND keyword = '" . $this->db->escape($key_del) . "'";
360:             } else {
361:                 $sql = "UPDATE " . $this->cfg['tab']['keywords'] . "
362:                     SET " . $this->_place . " = '" . $index_string . "'
363:                     WHERE idlang = " . cSecurity::toInteger($this->lang) . " AND keyword = '" . $this->db->escape($key_del) . "'";
364:             }
365:             $this->_debug('sql', $sql);
366:             $this->db->query($sql);
367:         }
368:     }
369: 
370:     /**
371:      * Get the keywords of an article.
372:      *
373:      * @throws cInvalidArgumentException
374:      * @throws cDbException
375:      */
376:     public function getKeywords() {
377:         $keys = implode("','", array_keys($this->_keywords));
378: 
379:         $sql = "SELECT
380:                     keyword, auto, self
381:                 FROM
382:                     " . $this->cfg['tab']['keywords'] . "
383:                 WHERE
384:                     idlang=" . cSecurity::toInteger($this->lang) . "  AND
385:                     (keyword IN ('" . $keys . "')  OR " . $this->_place . " REGEXP '&" . cSecurity::toInteger($this->idart) . "=')";
386: 
387:         $this->_debug('sql', $sql);
388: 
389:         $this->db->query($sql);
390: 
391:         $place = $this->_place;
392: 
393:         while ($this->db->nextRecord()) {
394:             $this->_keywordsOld[$this->db->f('keyword')] = $this->db->f($place);
395:         }
396:     }
397: 
398:     /**
399:      * Remove special characters from index term.
400:      *
401:      * @param string $key
402:      *         Keyword
403:      * @return mixed
404:      */
405:     public function removeSpecialChars($key) {
406:         $aSpecialChars = array(
407:             /*"-",*/
408:             "_",
409:             "'",
410:             ".",
411:             "!",
412:             "\"",
413:             "#",
414:             "$",
415:             "%",
416:             "&",
417:             "(",
418:             ")",
419:             "*",
420:             "+",
421:             ",",
422:             "/",
423:             ":",
424:             ";",
425:             "<",
426:             "=",
427:             ">",
428:             "?",
429:             "@",
430:             "[",
431:             "\\",
432:             "]",
433:             "^",
434:             "`",
435:             "{",
436:             "|",
437:             "}",
438:             "~",
439:             "„"
440:         );
441: 
442:         // for ($i = 127; $i < 192; $i++) {
443:         // some other special characters
444:         // $aSpecialChars[] = chr($i);
445:         // }
446: 
447:         // TODO: The transformation of accented characters must depend
448:         // on the selected encoding of the language of a client and
449:         // should not be treated in this method.
450:         // modified 2007-10-01, H. Librenz - added as hotfix for encoding
451:         // problems (doesn't find any words with umlaut vowels in it
452:         // since you turn on UTF-8 as language encoding)
453:         $sEncoding = cRegistry::getEncoding();
454: 
455:         if (cString::toLowerCase($sEncoding) != 'iso-8859-2') {
456:             $key = conHtmlentities($key, NULL, $sEncoding);
457:         } else {
458:             $key = htmlentities_iso88592($key);
459:         }
460: 
461:         // $aUmlautMap = array(
462:         // '&Uuml;' => 'ue',
463:         // '&uuml;' => 'ue',
464:         // '&Auml;' => 'ae',
465:         // '&auml;' => 'ae',
466:         // '&Ouml;' => 'oe',
467:         // '&ouml;' => 'oe',
468:         // '&szlig;' => 'ss'
469:         // );
470: 
471:         // foreach ($aUmlautMap as $sUmlaut => $sMapped) {
472:         // $key = str_replace($sUmlaut, $sMapped, $key);
473:         // }
474: 
475:         $key = conHtmlEntityDecode($key);
476:         $key = str_replace($aSpecialChars, '', $key);
477: 
478:         return $key;
479:     }
480: 
481:     /**
482:      *
483:      * @param string $key
484:      *         Keyword
485:      * @return string
486:      */
487:     public function addSpecialUmlauts($key) {
488:         $key = conHtmlentities($key, NULL, cRegistry::getEncoding());
489:         $aUmlautMap = array(
490:             'Ue' => '&Uuml;',
491:             'ue' => '&uuml;',
492:             'Ae' => '&Auml;',
493:             'ae' => '&auml;',
494:             'Oe' => '&Ouml;',
495:             'oe' => '&ouml;',
496:             'ss' => '&szlig;'
497:         );
498: 
499:         foreach ($aUmlautMap as $sUmlaut => $sMapped) {
500:             $key = str_replace($sUmlaut, $sMapped, $key);
501:         }
502: 
503:         $key = conHtmlEntityDecode($key);
504:         return $key;
505:     }
506: 
507:     /**
508:      * Set the array of stopwords which should not be indexed.
509:      *
510:      * @param array $aStopwords
511:      */
512:     public function setStopwords($aStopwords) {
513:         if (is_array($aStopwords) && count($aStopwords) > 0) {
514:             $this->_stopwords = $aStopwords;
515:         }
516:     }
517: 
518:     /**
519:      * Set the cms types.
520:      *
521:      * @throws cInvalidArgumentException
522:      * @throws cDbException
523:      */
524:     public function setContentTypes() {
525:         $sql = "SELECT type, idtype FROM " . $this->cfg['tab']['type'] . ' ';
526:         $this->_debug('sql', $sql);
527:         $this->db->query($sql);
528:         while ($this->db->nextRecord()) {
529:             $this->_cmsType[$this->db->f('type')] = $this->db->f('idtype');
530:             $this->_cmsTypeSuffix[$this->db->f('idtype')] = cString::getPartOfString($this->db->f('type'), 4, cString::getStringLength($this->db->f('type')));
531:         }
532:     }
533: 
534:     /**
535:      * Set the cms_options array of cms types which should be treated
536:      * special.
537:      *
538:      * @param mixed $cms_options
539:      */
540:     public function setCmsOptions($cms_options) {
541:         if (is_array($cms_options) && count($cms_options) > 0) {
542:             foreach ($cms_options as $opt) {
543:                 $opt = cString::toUpperCase($opt);
544: 
545:                 if (cString::getStringLength($opt) > 0) {
546:                     if (!cString::findFirstOccurrenceCI($opt, 'cms_')) {
547:                         if (in_array($opt, $this->_cmsTypeSuffix)) {
548:                             $this->_cmsOptions[$opt] = 'CMS_' . $opt;
549:                         }
550:                     } else {
551:                         if (array_key_exists($opt, $this->_cmsType)) {
552:                             $this->_cmsOptions[$opt] = $opt;
553:                         }
554:                     }
555:                 }
556:             }
557:         } else {
558:             $this->_cmsOptions = array();
559:         }
560:     }
561: 
562:     /**
563:      * Check if the requested content type should be indexed (false) or
564:      * not (true).
565:      *
566:      * @param string $idtype
567:      * @return bool
568:      */
569:     public function checkCmsType($idtype) {
570:         $idtype = cString::toUpperCase($idtype);
571: 
572:         // Do not index CMS_RAW
573:         if ($idtype == "CMS_RAW") {
574:             return true;
575:         }
576: 
577:         return (count($this->_cmsOptions) === 0 || in_array($idtype, $this->_cmsOptions)) ? false : true;
578:     }
579: 
580:     /**
581:      * Returns the property _cmsType.
582:      *
583:      * @return array
584:      */
585:     public function getCmsType() {
586:         return $this->_cmsType;
587:     }
588: 
589:     /**
590:      * Returns the property _cmsTypeSuffix.
591:      *
592:      * @return array
593:      */
594:     public function getCmsTypeSuffix() {
595:         return $this->_cmsTypeSuffix;
596:     }
597: }
598: 
CMS CONTENIDO 4.10.0 API documentation generated by ApiGen 2.8.0