Overview

Packages

  • CONTENIDO
  • Core
    • Authentication
    • Backend
    • Cache
    • CEC
    • Chain
    • ContentType
    • Database
    • Debug
    • Exception
    • Frontend
      • Search
      • URI
      • Util
    • GenericDB
      • Model
    • GUI
      • HTML
    • I18N
    • LayoutHandler
    • Log
    • Security
    • Session
    • Util
    • Validation
    • Versioning
    • XML
  • Module
    • ContentSitemapHtml
    • ContentSitemapXml
    • ContentUserForum
    • NavigationTop
    • ScriptCookieDirective
  • mpAutoloaderClassMap
  • None
  • PHP
  • Plugin
    • ContentAllocation
    • CronjobOverview
    • FormAssistant
    • FrontendLogic
    • FrontendUsers
    • Linkchecker
    • ModRewrite
    • Newsletter
    • Repository
      • FrontendNavigation
      • KeywordDensity
    • SIWECOS
    • SmartyWrapper
    • UrlShortener
    • UserForum
    • Workflow
  • PluginManager
  • Setup
    • Form
    • GUI
    • Helper
      • Environment
      • Filesystem
      • MySQL
      • PHP
    • UpgradeJob

Classes

  • cSearch
  • cSearchBaseAbstract
  • cSearchIndex
  • cSearchResult
  • Overview
  • Package
  • Class
  • Tree
  • Deprecated
  • Todo
  1: <?php
  2: 
  3: /**
  4:  * This file contains the base class for building search indices.
  5:  *
  6:  * @package Core
  7:  * @subpackage Frontend_Search
  8:  * @author Willi Man
  9:  * @copyright four for business AG <www.4fb.de>
 10:  * @license http://www.contenido.org/license/LIZENZ.txt
 11:  * @link http://www.4fb.de
 12:  * @link http://www.contenido.org
 13:  */
 14: 
 15: defined('CON_FRAMEWORK') || die('Illegal call: Missing framework initialization - request aborted.');
 16: 
 17: cInclude('includes', 'functions.encoding.php');
 18: 
 19: /**
 20:  * CONTENIDO API - Search Index Object.
 21:  *
 22:  * This object creates an index of an article.
 23:  *
 24:  * Create object where $db is the global CONTENIDO database object.
 25:  *
 26:  * $oIndex = new cSearchIndex($db);
 27:  *
 28:  * Start indexing where $aContent is the complete content of an article
 29:  * specified by its content types.
 30:  *
 31:  * $oIndex->start($idart, $aContent);
 32:  *
 33:  * It looks like:
 34:  * Array (
 35:  *      [CMS_HTMLHEAD] => Array (
 36:  *          [1] => Herzlich Willkommen...
 37:  *          [2] => ...auf Ihrer Website!
 38:  *      )
 39:  *      [CMS_HTML] => Array (
 40:  *          [1] => Die Inhalte auf dieser Website ...
 41:  *
 42:  * The index for keyword 'willkommen' would look like
 43:  * '&12=1(CMS_HTMLHEAD-1)' which means the keyword 'willkommen' occurs
 44:  * 1 times in article with articleId 12 and content type CMS_HTMLHEAD[1].
 45:  *
 46:  * TODO: The basic idea of the indexing process is to take the complete
 47:  * content of an article and to generate normalized index terms from the
 48:  * content and to store a specific index structure in the relation
 49:  * 'con_keywords'.
 50:  *
 51:  * To take the complete content is not very flexible. It would be better
 52:  * to differentiate by specific content types or by any content.
 53:  *
 54:  * The &, =, () and - separated string is not easy to parse to compute
 55:  * the search result set.
 56:  *
 57:  * It would be a better idea (and a lot of work) to extend the relation
 58:  * 'con_keywords' to store keywords by articleId (or content source
 59:  * identifier) and content type.
 60:  *
 61:  * The functions removeSpecialChars, setStopwords, setContentTypes and
 62:  * setCmsOptions should be sourced out into a new helper-class.
 63:  *
 64:  * Keep in mind that class Search and SearchResult uses an instance of
 65:  * object Index.
 66:  *
 67:  * @package Core
 68:  * @subpackage Frontend_Search
 69:  */
 70: class cSearchIndex extends cSearchBaseAbstract {
 71: 
 72:     /**
 73:      * content of the cms-types of an article
 74:      *
 75:      * @var array
 76:      */
 77:     protected $_keycode = array();
 78: 
 79:     /**
 80:      * list of keywords of an article
 81:      *
 82:      * @var array
 83:      */
 84:     protected $_keywords = array();
 85: 
 86:     /**
 87:      * words, which should not be indexed
 88:      *
 89:      * @var array
 90:      */
 91:     protected $_stopwords = array();
 92: 
 93:     /**
 94:      * keywords of an article stored in the DB
 95:      *
 96:      * @var array
 97:      */
 98:     protected $_keywordsOld = array();
 99: 
100:     /**
101:      * keywords to be deleted
102:      *
103:      * @var array
104:      */
105:     protected $_keywordsDel = array();
106: 
107:     /**
108:      * 'auto' or 'self'
109:      *
110:      * The field 'auto' in table con_keywords is used for automatic indexing.
111:      * The value is a string like "&12=2(CMS_HTMLHEAD-1,CMS_HTML-1)",
112:      * which means a keyword occurs 2 times in article with $idart 12
113:      * and can be found in CMS_HTMLHEAD[1] and CMS_HTML[1].
114:      *
115:      * The field 'self' can be used in the article properties to index
116:      * the article manually.
117:      *
118:      * @var string
119:      */
120:     protected $_place;
121: 
122:     /**
123:      * array of cms types
124:      *
125:      * @var array
126:      */
127:     protected $_cmsOptions = array();
128: 
129:     /**
130:      * array of all available cms types
131:      *
132:      * htmlhead - HTML Headline
133:      * html - HTML Text
134:      * head - Headline (no HTML)
135:      * text - Text (no HTML)
136:      * img - Upload id of the element
137:      * imgdescr - Image description
138:      * link - Link (URL)
139:      * linktarget - Linktarget (_self, _blank, _top ...)
140:      * linkdescr - Linkdescription
141:      * swf - Upload id of the element
142:      * etc.
143:      *
144:      * @var array
145:      */
146:     protected $_cmsType = array();
147: 
148:     /**
149:      * suffix of all available cms types
150:      *
151:      * @var array
152:      */
153:     protected $_cmsTypeSuffix = array();
154: 
155:     /**
156:      * Id of the article that is currently being indexed (set by start()).
157:      * @var int
158:      */
159:     protected $idart;
160: 
161:     /**
162:      * Constructor to create an instance of this class.
163:      *
164:      * Set object properties.
165:      *
166:      * @param cDb $db [optional]
167:      *                CONTENIDO database object
168:      *
169:      * @throws cDbException
170:      * @throws cInvalidArgumentException
171:      */
172:     public function __construct($db = NULL) {
173:         parent::__construct($db);
174: 
175:         $this->setContentTypes();
176:     }
177: 
178:     /**
179:      * Start indexing the article.
180:      *
181:      * @param int    $idart             Article Id
182:      * @param array  $aContent          The complete content of an article specified by its content types.
183:      *                                  It looks like:
184:      *                                  [
185:      *                                  [CMS_HTMLHEAD] => [
186:      *                                  [1] => Herzlich Willkommen...
187:      *                                  [2] => ...auf Ihrer Website!
188:      *                                  ]
189:      *                                  [CMS_HTML] => [
190:      *                                  [1] => Die Inhalte auf dieser Website ...
191:      *                                  ]
192:      *                                  ]
193:      * @param string $place             [optional] The field where to store the index information in db.
194:      * @param array  $cms_options       [optional] One can specify explicitly cms types which should not be indexed.
195:      * @param array  $aStopwords        [optional] Array with words which should not be indexed.
196:      *
197:      * @throws cInvalidArgumentException
198:      * @throws cDbException
199:      */
200:     public function start($idart, $aContent, $place = 'auto', $cms_options = array(), $aStopwords = array()) {
201:         if (!is_int((int) $idart) || $idart < 0) {
202:             return;
203:         } else {
204:             $this->idart = $idart;
205:         }
206: 
207:         $this->_place = $place;
208:         $this->_keycode = $aContent;
209:         $this->setStopwords($aStopwords);
210:         $this->setCmsOptions($cms_options);
211: 
212:         $this->createKeywords();
213: 
214:         $this->getKeywords();
215: 
216:         $this->saveKeywords();
217: 
218:         $new_keys = array_keys($this->_keywords);
219:         $old_keys = array_keys($this->_keywordsOld);
220: 
221:         $this->_keywordsDel = array_diff($old_keys, $new_keys);
222: 
223:         if (count($this->_keywordsDel) > 0) {
224:             $this->deleteKeywords();
225:         }
226:     }
227: 
228:     /**
229:      * For each cms-type create index structure.
230:      *
231:      * It looks like:
232:      * Array (
233:      *     [die] => CMS_HTML-1
234:      *     [inhalte] => CMS_HTML-1
235:      *     [auf] => CMS_HTML-1 CMS_HTMLHEAD-2
236:      *     [dieser] => CMS_HTML-1
237:      *     [website] => CMS_HTML-1 CMS_HTML-1 CMS_HTMLHEAD-2
238:      * )
239:      *
240:      * @throws cInvalidArgumentException
241:      */
242:     public function createKeywords() {
243:         $tmp_keys = array();
244: 
245:         // Only create keycodes, if some are available
246:         if (is_array($this->_keycode)) {
247:             foreach ($this->_keycode as $idtype => $data) {
248:                 if ($this->checkCmsType($idtype)) {
249:                     foreach ($data as $typeid => $code) {
250:                         $this->_debug('code', $code);
251: 
252:                         // remove backslash
253:                         $code = stripslashes($code);
254:                         // replace HTML line breaks with newlines
255:                         $code = str_ireplace(array(
256:                             '<br>',
257:                             '<br />'
258:                         ), "\n", $code);
259:                         // remove html tags
260:                         $code = strip_tags($code);
261:                         if (cString::getStringLength($code) > 0) {
262:                             $code = conHtmlEntityDecode($code);
263:                         }
264:                         $this->_debug('code', $code);
265: 
266:                         // split content by any number of commas, space
267:                         // characters or hyphens
268:                         $tmp_keys = mb_split('[\s,-]+', trim($code));
269:                         $this->_debug('tmp_keys', $tmp_keys);
270: 
271:                         foreach ($tmp_keys as $value) {
272:                             // index terms are stored with lower case
273:                             $value = conHtmlentities($value);
274:                             $value = trim(cString::toLowerCase($value));
275:                             $value = conHtmlEntityDecode($value);
276: 
277:                             if (!in_array($value, $this->_stopwords)) {
278:                                 // eliminate stopwords
279:                                 $value = $this->removeSpecialChars($value);
280: 
281:                                 if (cString::getStringLength($value) > 1) {
282:                                     // do not index single characters
283:                                     $this->_keywords[$value] = $this->_keywords[$value] . $idtype . '-' . $typeid . ' ';
284:                                 }
285:                             }
286:                         }
287:                     }
288:                 }
289: 
290:                 unset($tmp_keys);
291:             }
292:         }
293: 
294:         $this->_debug('keywords', $this->_keywords);
295:     }
296: 
297:     /**
298:      * Generate index_string from index structure and save keywords.
299:      * The index_string looks like "&12=2(CMS_HTMLHEAD-1,CMS_HTML-1)".
300:      *
301:      * @throws cInvalidArgumentException
302:      * @throws cDbException
303:      */
304:     public function saveKeywords() {
305: 
306:         foreach ($this->_keywords as $keyword => $count) {
307:             $tmp_count = preg_split('/[\s]/', trim($count));
308:             $this->_debug('tmp_count', $tmp_count);
309: 
310:             $occurrence = count($tmp_count);
311:             $tmp_count = array_unique($tmp_count);
312:             $cms_types = implode(',', $tmp_count);
313:             $index_string = '&' . $this->idart . '=' . $occurrence . '(' . $cms_types . ')';
314: 
315:             if (!array_key_exists($keyword, $this->_keywordsOld)) {
316:                 // if keyword is new, save index information
317:                 // $nextid = $this->db->nextid($this->cfg['tab']['keywords']);
318:                 $iLang = cSecurity::toInteger($this->lang);
319:                 $sql   = "INSERT INTO {$this->cfg['tab']['keywords']}";
320:                 $sql  .= "(keyword, {$this->_place}, idlang) ";
321:                 $sql  .= "VALUES";
322:                 $sql  .= "('{$this->db->escape($keyword)}', '{$this->db->escape($index_string)}', {$iLang})";
323:             } else {
324:                 // if keyword allready exists, create new index_string
325:                 if (preg_match("/&$this->idart=/", $this->_keywordsOld[$keyword])) {
326:                     $index_string = preg_replace("/&$this->idart=[0-9]+\([\w\-,]+\)/", $index_string, $this->_keywordsOld[$keyword]);
327:                 } else {
328:                     $index_string = $this->_keywordsOld[$keyword] . $index_string;
329:                 }
330: 
331:                 $sql = "UPDATE " . $this->cfg['tab']['keywords'] . "
332:                         SET " . $this->_place . " = '" . $index_string . "'
333:                         WHERE idlang='" . cSecurity::toInteger($this->lang) . "' AND keyword='" . $this->db->escape($keyword) . "'";
334:             }
335:             $this->_debug('sql', $sql);
336:             $this->db->query($sql);
337:         }
338:     }
339: 
340:     /**
341:      * If keywords don't occur in the article anymore,
342:      * update index_string and delete keyword if necessary.
343:      *
344:      * @throws cInvalidArgumentException
345:      * @throws cDbException
346:      */
347:     public function deleteKeywords() {
348:         foreach ($this->_keywordsDel as $key_del) {
349:             $index_string = preg_replace("/&$this->idart=[0-9]+\([\w\-,]+\)/", "", $this->_keywordsOld[$key_del]);
350: 
351:             if (cString::getStringLength($index_string) == 0) {
352:                 // keyword is not referenced by any article
353:                 $sql = "DELETE FROM " . $this->cfg['tab']['keywords'] . "
354:                     WHERE idlang = " . cSecurity::toInteger($this->lang) . " AND keyword = '" . $this->db->escape($key_del) . "'";
355:             } else {
356:                 $sql = "UPDATE " . $this->cfg['tab']['keywords'] . "
357:                     SET " . $this->_place . " = '" . $index_string . "'
358:                     WHERE idlang = " . cSecurity::toInteger($this->lang) . " AND keyword = '" . $this->db->escape($key_del) . "'";
359:             }
360:             $this->_debug('sql', $sql);
361:             $this->db->query($sql);
362:         }
363:     }
364: 
365:     /**
366:      * Get the keywords of an article.
367:      *
368:      * @throws cInvalidArgumentException
369:      * @throws cDbException
370:      */
371:     public function getKeywords() {
372:         $keys = implode("','", array_keys($this->_keywords));
373: 
374:         $sql = "SELECT
375:                     keyword, auto, self
376:                 FROM
377:                     " . $this->cfg['tab']['keywords'] . "
378:                 WHERE
379:                     idlang=" . cSecurity::toInteger($this->lang) . "  AND
380:                     (keyword IN ('" . $keys . "')  OR " . $this->_place . " REGEXP '&" . cSecurity::toInteger($this->idart) . "=')";
381: 
382:         $this->_debug('sql', $sql);
383: 
384:         $this->db->query($sql);
385: 
386:         $place = $this->_place;
387: 
388:         while ($this->db->nextRecord()) {
389:             $this->_keywordsOld[$this->db->f('keyword')] = $this->db->f($place);
390:         }
391:     }
392: 
393:     /**
394:      * Remove special characters from index term.
395:      *
396:      * @param string $key
397:      *         Keyword
398:      * @return mixed
399:      */
400:     public function removeSpecialChars($key) {
401:         $aSpecialChars = array(
402:             /*"-",*/
403:             "_",
404:             "'",
405:             ".",
406:             "!",
407:             "\"",
408:             "#",
409:             "$",
410:             "%",
411:             "&",
412:             "(",
413:             ")",
414:             "*",
415:             "+",
416:             ",",
417:             "/",
418:             ":",
419:             ";",
420:             "<",
421:             "=",
422:             ">",
423:             "?",
424:             "@",
425:             "[",
426:             "\\",
427:             "]",
428:             "^",
429:             "`",
430:             "{",
431:             "|",
432:             "}",
433:             "~",
434:             "„"
435:         );
436: 
437:         // for ($i = 127; $i < 192; $i++) {
438:         // some other special characters
439:         // $aSpecialChars[] = chr($i);
440:         // }
441: 
442:         // TODO: The transformation of accented characters must depend
443:         // on the selected encoding of the language of a client and
444:         // should not be treated in this method.
445:         // modified 2007-10-01, H. Librenz - added as hotfix for encoding
446:         // problems (doesn't find any words with umlaut vowels in it
447:         // since you turn on UTF-8 as language encoding)
448:         $sEncoding = cRegistry::getEncoding();
449: 
450:         if (cString::toLowerCase($sEncoding) != 'iso-8859-2') {
451:             $key = conHtmlentities($key, NULL, $sEncoding);
452:         } else {
453:             $key = htmlentities_iso88592($key);
454:         }
455: 
456:         // $aUmlautMap = array(
457:         // '&Uuml;' => 'ue',
458:         // '&uuml;' => 'ue',
459:         // '&Auml;' => 'ae',
460:         // '&auml;' => 'ae',
461:         // '&Ouml;' => 'oe',
462:         // '&ouml;' => 'oe',
463:         // '&szlig;' => 'ss'
464:         // );
465: 
466:         // foreach ($aUmlautMap as $sUmlaut => $sMapped) {
467:         // $key = str_replace($sUmlaut, $sMapped, $key);
468:         // }
469: 
470:         $key = conHtmlEntityDecode($key);
471:         $key = str_replace($aSpecialChars, '', $key);
472: 
473:         return $key;
474:     }
475: 
476:     /**
477:      *
478:      * @param string $key
479:      *         Keyword
480:      * @return string
481:      */
482:     public function addSpecialUmlauts($key) {
483:         $key = conHtmlentities($key, NULL, cRegistry::getEncoding());
484:         $aUmlautMap = array(
485:             'Ue' => '&Uuml;',
486:             'ue' => '&uuml;',
487:             'Ae' => '&Auml;',
488:             'ae' => '&auml;',
489:             'Oe' => '&Ouml;',
490:             'oe' => '&ouml;',
491:             'ss' => '&szlig;'
492:         );
493: 
494:         foreach ($aUmlautMap as $sUmlaut => $sMapped) {
495:             $key = str_replace($sUmlaut, $sMapped, $key);
496:         }
497: 
498:         $key = conHtmlEntityDecode($key);
499:         return $key;
500:     }
501: 
502:     /**
503:      * Set the array of stopwords which should not be indexed.
504:      *
505:      * @param array $aStopwords
506:      */
507:     public function setStopwords($aStopwords) {
508:         if (is_array($aStopwords) && count($aStopwords) > 0) {
509:             $this->_stopwords = $aStopwords;
510:         }
511:     }
512: 
513:     /**
514:      * Set the cms types.
515:      *
516:      * @throws cInvalidArgumentException
517:      * @throws cDbException
518:      */
519:     public function setContentTypes() {
520:         $sql = "SELECT type, idtype FROM " . $this->cfg['tab']['type'] . ' ';
521:         $this->_debug('sql', $sql);
522:         $this->db->query($sql);
523:         while ($this->db->nextRecord()) {
524:             $this->_cmsType[$this->db->f('type')] = $this->db->f('idtype');
525:             $this->_cmsTypeSuffix[$this->db->f('idtype')] = cString::getPartOfString($this->db->f('type'), 4, cString::getStringLength($this->db->f('type')));
526:         }
527:     }
528: 
529:     /**
530:      * Set the cms_options array of cms types which should be treated
531:      * special.
532:      *
533:      * @param mixed $cms_options
534:      */
535:     public function setCmsOptions($cms_options) {
536:         if (is_array($cms_options) && count($cms_options) > 0) {
537:             foreach ($cms_options as $opt) {
538:                 $opt = cString::toUpperCase($opt);
539: 
540:                 if (cString::getStringLength($opt) > 0) {
541:                     if (!cString::findFirstOccurrenceCI($opt, 'cms_')) {
542:                         if (in_array($opt, $this->_cmsTypeSuffix)) {
543:                             $this->_cmsOptions[$opt] = 'CMS_' . $opt;
544:                         }
545:                     } else {
546:                         if (array_key_exists($opt, $this->_cmsType)) {
547:                             $this->_cmsOptions[$opt] = $opt;
548:                         }
549:                     }
550:                 }
551:             }
552:         } else {
553:             $this->_cmsOptions = array();
554:         }
555:     }
556: 
557:     /**
558:      * Check if the requested content type should be indexed (false) or
559:      * not (true).
560:      *
561:      * @param string $idtype
562:      * @return bool
563:      */
564:     public function checkCmsType($idtype) {
565:         $idtype = cString::toUpperCase($idtype);
566: 
567:         // Do not index CMS_RAW
568:         if ($idtype == "CMS_RAW") {
569:             return true;
570:         }
571: 
572:         return (count($this->_cmsOptions) === 0 || in_array($idtype, $this->_cmsOptions)) ? false : true;
573:     }
574: 
575:     /**
576:      * Returns the property _cmsType.
577:      *
578:      * @return array
579:      */
580:     public function getCmsType() {
581:         return $this->_cmsType;
582:     }
583: 
584:     /**
585:      * Returns the property _cmsTypeSuffix.
586:      *
587:      * @return array
588:      */
589:     public function getCmsTypeSuffix() {
590:         return $this->_cmsTypeSuffix;
591:     }
592: }
593: 
CMS CONTENIDO 4.10.1 API documentation generated by ApiGen 2.8.0