Overview

Packages

  • Core
    • Authentication
    • Backend
    • Cache
    • CEC
    • Chain
    • ContentType
    • Database
    • Datatype
    • Debug
    • Exception
    • Frontend
      • Search
      • URI
      • Util
    • GenericDB
      • Model
    • GUI
      • HTML
    • I18N
    • LayoutHandler
    • Log
    • Security
    • Session
    • Util
    • Validation
    • Versioning
    • XML
  • Module
    • ContentSitemapHtml
    • ContentSitemapXml
    • ContentUserForum
    • NavigationTop
  • mpAutoloaderClassMap
  • None
  • Plugin
    • ContentAllocation
    • CronjobOverview
    • FormAssistant
    • FrontendLogic
    • FrontendUsers
    • Linkchecker
    • ModRewrite
    • Newsletter
    • Repository
      • FrontendNavigation
      • KeywordDensity
    • SearchSolr
    • SmartyWrapper
    • UrlShortener
    • UserForum
    • Workflow
  • PluginManager
  • Setup
    • Form
    • GUI
    • Helper
      • Environment
      • Filesystem
      • MySQL
      • PHP
    • UpgradeJob

Classes

  • cSearch
  • cSearchBaseAbstract
  • cSearchIndex
  • cSearchResult
  • Overview
  • Package
  • Class
  • Tree
  • Deprecated
  • Todo
  1: <?php
  2: /**
  3:  * This file contains the base class for building search indices
  4:  *
  5:  * @package Core
  6:  * @subpackage Frontend_Search
  7:  * @version SVN Revision $Rev:$
  8:  *
  9:  * @author Willi Man
 10:  * @copyright four for business AG <www.4fb.de>
 11:  * @license http://www.contenido.org/license/LIZENZ.txt
 12:  * @link http://www.4fb.de
 13:  * @link http://www.contenido.org
 14:  */
 15: defined('CON_FRAMEWORK') || die('Illegal call: Missing framework initialization - request aborted.');
 16: 
 17: cInclude('includes', 'functions.encoding.php');
 18: 
 19: /**
 20:  * CONTENIDO API - Search Index Object
 21:  *
 22:  * This object creates an index of an article
 23:  *
 24:  * Create object with
 25:  * $oIndex = new cSearchIndex($db); # where $db is the global CONTENIDO database
 26:  * object.
 27:  * Start indexing with
 28:  * $oIndex->start($idart, $aContent);
 29:  * where $aContent is the complete content of an article specified by its
 30:  * content types.
 31:  * It looks like
 32:  * Array (
 33:  * [CMS_HTMLHEAD] => Array (
 34:  * [1] => Herzlich Willkommen...
 35:  * [2] => ...auf Ihrer Website!
 36:  * )
 37:  * [CMS_HTML] => Array (
 38:  * [1] => Die Inhalte auf dieser Website ...
 39:  *
 40:  * The index for keyword 'willkommen' would look like '&12=1(CMS_HTMLHEAD-1)'
 41:  * which means the keyword 'willkommen' occurs 1 times in article with articleId
 42:  * 12 and content type CMS_HTMLHEAD[1].
 43:  *
 44:  * TODO: The basic idea of the indexing process is to take the complete content
 45:  * of an article and to generate normalized index terms
 46:  * from the content and to store a specific index structure in the relation
 47:  * 'con_keywords'.
 48:  * To take the complete content is not very flexible. It would be better to
 49:  * differentiate by specific content types or by any content.
 50:  * The &, =, () and - separated string is not easy to parse to compute the
 51:  * search result set.
 52:  * It would be a better idea (and a lot of work) to extend the relation
 53:  * 'con_keywords' to store keywords by articleId (or content source identifier)
 54:  * and content type.
 55:  * The functions removeSpecialChars, setStopwords, setContentTypes and
 56:  * setCmsOptions should be sourced out into a new helper-class.
 57:  * Keep in mind that the classes cSearch and cSearchResult use an instance of
 58:  * this index class.
 59:  *
 60:  * @package Core
 61:  * @subpackage Frontend_Search
 62:  */
 63: class cSearchIndex extends cSearchBaseAbstract {
 64: 
 65:     /**
 66:      * the content of the cms-types of an article
 67:      *
 68:      * @var array
 69:      */
 70:     protected $_keycode = array();
 71: 
 72:     /**
 73:      * the list of keywords of an article
 74:      *
 75:      * @var array
 76:      */
 77:     protected $_keywords = array();
 78: 
 79:     /**
 80:      * the words, which should not be indexed
 81:      *
 82:      * @var array
 83:      */
 84:     protected $_stopwords = array();
 85: 
 86:     /**
 87:      * the keywords of an article stored in the DB
 88:      *
 89:      * @var array
 90:      */
 91:     protected $_keywordsOld = array();
 92: 
 93:     /**
 94:      * the keywords to be deleted
 95:      *
 96:      * @var array
 97:      */
 98:     protected $_keywordsDel = array();
 99: 
100:     /**
101:      * 'auto' or 'self'
102:      * The field 'auto' in table con_keywords is used for automatic indexing.
103:      * The value is a string like "&12=2(CMS_HTMLHEAD-1,CMS_HTML-1)", which
104:      * means a keyword occurs 2 times in article with $idart 12
105:      * and can be found in CMS_HTMLHEAD[1] and CMS_HTML[1].
106:      * The field 'self' can be used in the article properties to index the
107:      * article manually.
108:      *
109:      * @var string
110:      */
111:     protected $_place;
112: 
113:     /**
114:      * array of cms types
115:      *
116:      * @var array
117:      */
118:     protected $_cmsOptions = array();
119: 
120:     /**
121:      * array of all available cms types
122:      *
123:      * htmlhead - HTML Headline
124:      * html - HTML Text
125:      * head - Headline (no HTML)
126:      * text - Text (no HTML)
127:      * img - Upload id of the element
128:      * imgdescr - Image description
129:      * link - Link (URL)
130:      * linktarget - Linktarget (_self, _blank, _top ...)
131:      * linkdescr - Linkdescription
132:      * swf - Upload id of the element
133:      * etc.
134:      *
135:      * @var array
136:      */
137:     protected $_cmsType = array();
138: 
139:     /**
140:      * the suffix of all available cms types
141:      *
142:      * @var array
143:      */
144:     protected $_cmsTypeSuffix = array();
145: 
146:     /**
147:      * @var int
148:      */
149:     protected $idart;
150: 
151:     /**
152:      * Constructor, set object properties
153:      *
154:      * @param cDb $db CONTENIDO Database object
155:      */
156:     public function __construct($db = NULL) {
157:         parent::__construct($db);
158: 
159:         $this->setContentTypes();
160:     }
161: 
162:     /**
163:      * Start indexing the article.
164:      *
165:      * @param int $idart Article Id
166:      * @param array $aContent The complete content of an article specified by
167:      *        its content types.
168:      *        It looks like
169:      *        Array (
170:      *        [CMS_HTMLHEAD] => Array (
171:      *        [1] => Herzlich Willkommen...
172:      *        [2] => ...auf Ihrer Website!
173:      *        )
174:      *        [CMS_HTML] => Array (
175:      *        [1] => Die Inhalte auf dieser Website ...
176:      *
177:      * @param string $place The field where to store the index information in
178:      *        db.
179:      * @param array $cms_options One can specify explicitly cms types which
180:      *        should not be indexed.
181:      * @param array $aStopwords Array with words which should not be indexed.
182:      */
183:     public function start($idart, $aContent, $place = 'auto', $cms_options = array(), $aStopwords = array()) {
184:         if (!is_int((int) $idart) || $idart < 0) {
185:             return;
186:         } else {
187:             $this->idart = $idart;
188:         }
189: 
190:         $this->_place = $place;
191:         $this->_keycode = $aContent;
192:         $this->setStopwords($aStopwords);
193:         $this->setCmsOptions($cms_options);
194: 
195:         $this->createKeywords();
196: 
197:         $this->getKeywords();
198: 
199:         $this->saveKeywords();
200: 
201:         $new_keys = array_keys($this->_keywords);
202:         $old_keys = array_keys($this->_keywordsOld);
203: 
204:         $this->_keywordsDel = array_diff($old_keys, $new_keys);
205: 
206:         if (count($this->_keywordsDel) > 0) {
207:             $this->deleteKeywords();
208:         }
209:     }
210: 
211:     /**
212:      * for each cms-type create index structure.
213:      * it looks like
214:      * Array (
215:      * [die] => CMS_HTML-1
216:      * [inhalte] => CMS_HTML-1
217:      * [auf] => CMS_HTML-1 CMS_HTMLHEAD-2
218:      * [dieser] => CMS_HTML-1
219:      * [website] => CMS_HTML-1 CMS_HTML-1 CMS_HTMLHEAD-2
220:      * )
221:      */
222:     public function createKeywords() {
223:         $tmp_keys = array();
224: 
225:         // Only create keycodes, if some are available
226:         if (is_array($this->_keycode)) {
227:             foreach ($this->_keycode as $idtype => $data) {
228:                 if ($this->checkCmsType($idtype)) {
229:                     foreach ($data as $typeid => $code) {
230:                         $this->_debug('code', $code);
231: 
232:                         // remove backslash
233:                         $code = stripslashes($code);
234:                         // replace HTML line breaks with newlines
235:                         $code = str_ireplace(array(
236:                             '<br>',
237:                             '<br />'
238:                         ), "\n", $code);
239:                         // remove html tags
240:                         $code = strip_tags($code);
241:                         if (strlen($code) > 0) {
242:                             $code = conHtmlEntityDecode($code);
243:                         }
244:                         $this->_debug('code', $code);
245: 
246:                         // split content by any number of commas or space
247:                         // characters
248:                         $tmp_keys = mb_split('[\s,]+', trim($code));
249:                         $this->_debug('tmp_keys', $tmp_keys);
250: 
251:                         foreach ($tmp_keys as $value) {
252:                             // index terms are stored with lower case
253:                             // $value = strtolower($value);
254: 
255:                             $value = conHtmlentities($value);
256:                             $value = trim(strtolower($value));
257:                             $value = conHtmlEntityDecode($value);
258: 
259:                             if (!in_array($value, $this->_stopwords)) {
260:                                 // eliminate stopwords
261:                                 $value = $this->removeSpecialChars($value);
262: 
263:                                 if (strlen($value) > 1) {
264:                                     // do not index single characters
265:                                     $this->_keywords[$value] = $this->_keywords[$value] . $idtype . '-' . $typeid . ' ';
266:                                 }
267:                             }
268:                         }
269:                     }
270:                 }
271: 
272:                 unset($tmp_keys);
273:             }
274:         }
275: 
276:         $this->_debug('keywords', $this->_keywords);
277:     }
278: 
279:     /**
280:      * generate index_string from index structure and save keywords
281:      * The index_string looks like "&12=2(CMS_HTMLHEAD-1,CMS_HTML-1)"
282:      */
283:     public function saveKeywords() {
284:         $tmp_count = array();
285: 
286:         foreach ($this->_keywords as $keyword => $count) {
287:             $tmp_count = preg_split('/[\s]/', trim($count));
288:             $this->_debug('tmp_count', $tmp_count);
289: 
290:             $occurrence = count($tmp_count);
291:             $tmp_count = array_unique($tmp_count);
292:             $cms_types = implode(',', $tmp_count);
293:             $index_string = '&' . $this->idart . '=' . $occurrence . '(' . $cms_types . ')';
294: 
295:             if (!array_key_exists($keyword, $this->_keywordsOld)) {
296:                 // if keyword is new, save index information
297:                 // $nextid = $this->db->nextid($this->cfg['tab']['keywords']);
298:                 $sql = "INSERT INTO " . $this->cfg['tab']['keywords'] . "
299:                             (keyword, " . $this->_place . ", idlang)
300:                         VALUES
301:                             ('" . $this->db->escape($keyword) . "', '" . $this->db->escape($index_string) . "', " . cSecurity::toInteger($this->lang) . ")";
302:             } else {
303:                 // if keyword allready exists, create new index_string
304:                 if (preg_match("/&$this->idart=/", $this->_keywordsOld[$keyword])) {
305:                     $index_string = preg_replace("/&$this->idart=[0-9]+\([\w-,]+\)/", $index_string, $this->_keywordsOld[$keyword]);
306:                 } else {
307:                     $index_string = $this->_keywordsOld[$keyword] . $index_string;
308:                 }
309: 
310:                 $sql = "UPDATE " . $this->cfg['tab']['keywords'] . "
311:                         SET " . $this->_place . " = '" . $index_string . "'
312:                         WHERE idlang='" . cSecurity::toInteger($this->lang) . "' AND keyword='" . $this->db->escape($keyword) . "'";
313:             }
314:             $this->_debug('sql', $sql);
315:             $this->db->query($sql);
316:         }
317:     }
318: 
319:     /**
320:      * if keywords don't occur in the article anymore, update index_string and
321:      * delete keyword if necessary
322:      */
323:     public function deleteKeywords() {
324:         foreach ($this->_keywordsDel as $key_del) {
325:             $index_string = preg_replace("/&$this->idart=[0-9]+\([\w-,]+\)/", "", $this->_keywordsOld[$key_del]);
326: 
327:             if (strlen($index_string) == 0) {
328:                 // keyword is not referenced by any article
329:                 $sql = "DELETE FROM " . $this->cfg['tab']['keywords'] . "
330:                     WHERE idlang = " . cSecurity::toInteger($this->lang) . " AND keyword = '" . $this->db->escape($key_del) . "'";
331:             } else {
332:                 $sql = "UPDATE " . $this->cfg['tab']['keywords'] . "
333:                     SET " . $this->_place . " = '" . $index_string . "'
334:                     WHERE idlang = " . cSecurity::toInteger($this->lang) . " AND keyword = '" . $this->db->escape($key_del) . "'";
335:             }
336:             $this->_debug('sql', $sql);
337:             $this->db->query($sql);
338:         }
339:     }
340: 
341:     /**
342:      * get the keywords of an article
343:      */
344:     public function getKeywords() {
345:         $keys = implode("','", array_keys($this->_keywords));
346: 
347:         $sql = "SELECT
348:                     keyword, auto, self
349:                 FROM
350:                     " . $this->cfg['tab']['keywords'] . "
351:                 WHERE
352:                     idlang=" . cSecurity::toInteger($this->lang) . "  AND
353:                     (keyword IN ('" . $keys . "')  OR " . $this->_place . " REGEXP '&" . cSecurity::toInteger($this->idart) . "=')";
354: 
355:         $this->_debug('sql', $sql);
356: 
357:         $this->db->query($sql);
358: 
359:         $place = $this->_place;
360: 
361:         while ($this->db->nextRecord()) {
362:             $this->_keywordsOld[$this->db->f('keyword')] = $this->db->f($place);
363:         }
364:     }
365: 
366:     /**
367:      * remove special characters from index term
368:      *
369:      * @param string $key Keyword
370:      * @return mixed
371:      */
372:     public function removeSpecialChars($key) {
373:         $aSpecialChars = array(
374:             /*"-",*/
375:             "_",
376:             "'",
377:             ".",
378:             "!",
379:             "\"",
380:             "#",
381:             "$",
382:             "%",
383:             "&",
384:             "(",
385:             ")",
386:             "*",
387:             "+",
388:             ",",
389:             "/",
390:             ":",
391:             ";",
392:             "<",
393:             "=",
394:             ">",
395:             "?",
396:             "@",
397:             "[",
398:             "\\",
399:             "]",
400:             "^",
401:             "`",
402:             "{",
403:             "|",
404:             "}",
405:             "~",
406:             "„"
407:         );
408: 
409:         // for ($i = 127; $i < 192; $i++) {
410:         // some other special characters
411:         // $aSpecialChars[] = chr($i);
412:         // }
413: 
414:         // TODO: The transformation of accented characters must depend on the
415:         // selected encoding of the language of
416:         // a client and should not be treated in this method.
417:         // modified 2007-10-01, H. Librenz - added as hotfix for encoding
418:         // problems (doesn't find any words with
419:         // umlaut vowels in it since you turn on UTF-8 as language encoding)
420:         $sEncoding = getEncodingByLanguage($this->db, $this->lang);
421: 
422:         if (strtolower($sEncoding) != 'iso-8859-2') {
423:             $key = conHtmlentities($key, NULL, $sEncoding);
424:         } else {
425:             $key = htmlentities_iso88592($key);
426:         }
427: 
428:         // $aUmlautMap = array(
429:         // '&Uuml;' => 'ue',
430:         // '&uuml;' => 'ue',
431:         // '&Auml;' => 'ae',
432:         // '&auml;' => 'ae',
433:         // '&Ouml;' => 'oe',
434:         // '&ouml;' => 'oe',
435:         // '&szlig;' => 'ss'
436:         // );
437: 
438:         // foreach ($aUmlautMap as $sUmlaut => $sMapped) {
439:         // $key = str_replace($sUmlaut, $sMapped, $key);
440:         // }
441: 
442:         $key = conHtmlEntityDecode($key);
443:         $key = str_replace($aSpecialChars, '', $key);
444: 
445:         return $key;
446:     }
447: 
448:     /**
449:      *
450:      * @param string $key Keyword
451:      * @return string
452:      */
453:     public function addSpecialUmlauts($key) {
454:         $key = conHtmlentities($key, NULL, getEncodingByLanguage($this->db, $this->lang));
455:         $aUmlautMap = array(
456:             'Ue' => '&Uuml;',
457:             'ue' => '&uuml;',
458:             'Ae' => '&Auml;',
459:             'ae' => '&auml;',
460:             'Oe' => '&Ouml;',
461:             'oe' => '&ouml;',
462:             'ss' => '&szlig;'
463:         );
464: 
465:         foreach ($aUmlautMap as $sUmlaut => $sMapped) {
466:             $key = str_replace($sUmlaut, $sMapped, $key);
467:         }
468: 
469:         $key = conHtmlEntityDecode($key);
470:         return $key;
471:     }
472: 
473:     /**
474:      * set the array of stopwords which should not be indexed
475:      *
476:      * @param array $aStopwords
477:      */
478:     public function setStopwords($aStopwords) {
479:         if (is_array($aStopwords) && count($aStopwords) > 0) {
480:             $this->_stopwords = $aStopwords;
481:         }
482:     }
483: 
484:     /**
485:      * set the cms types
486:      */
487:     public function setContentTypes() {
488:         $sql = "SELECT type, idtype FROM " . $this->cfg['tab']['type'] . ' ';
489:         $this->_debug('sql', $sql);
490:         $this->db->query($sql);
491:         while ($this->db->nextRecord()) {
492:             $this->_cmsType[$this->db->f('type')] = $this->db->f('idtype');
493:             $this->_cmsTypeSuffix[$this->db->f('idtype')] = substr($this->db->f('type'), 4, strlen($this->db->f('type')));
494:         }
495:     }
496: 
497:     /**
498:      * set the cms_options array of cms types which should be treated special
499:      *
500:      * @param mixed $cms_options
501:      */
502:     public function setCmsOptions($cms_options) {
503:         if (is_array($cms_options) && count($cms_options) > 0) {
504:             foreach ($cms_options as $opt) {
505:                 $opt = strtoupper($opt);
506: 
507:                 if (strlen($opt) > 0) {
508:                     if (!stristr($opt, 'cms_')) {
509:                         if (in_array($opt, $this->_cmsTypeSuffix)) {
510:                             $this->_cmsOptions[$opt] = 'CMS_' . $opt;
511:                         }
512:                     } else {
513:                         if (array_key_exists($opt, $this->_cmsType)) {
514:                             $this->_cmsOptions[$opt] = $opt;
515:                         }
516:                     }
517:                 }
518:             }
519:         } else {
520:             $this->_cmsOptions = array();
521:         }
522:     }
523: 
524:     /**
525:      * Check if the requested content type should be indexed (false) or not (true)
526:      *
527:      * @param string $idtype
528:      * @return boolean
529:      */
530:     public function checkCmsType($idtype) {
531:         $idtype = strtoupper($idtype);
532:         
533:         // Do not index CMS_RAW
534:         if ($idtype == "CMS_RAW") {
535:             return true;
536:         }
537: 
538:         return (count($this->_cmsOptions) === 0 || in_array($idtype, $this->_cmsOptions)) ? false : true;
539:     }
540: 
541:     /**
542:      *
543:      * @return array the _cmsType property
544:      */
545:     public function getCmsType() {
546:         return $this->_cmsType;
547:     }
548: 
549:     /**
550:      *
551:      * @return array the _cmsTypeSuffix property
552:      */
553:     public function getCmsTypeSuffix() {
554:         return $this->_cmsTypeSuffix;
555:     }
556: }
557: 
CMS CONTENIDO 4.9.5 API documentation generated by ApiGen 2.8.0