Overview

Packages

  • CONTENIDO
  • Core
    • Authentication
    • Backend
    • Cache
    • CEC
    • Chain
    • ContentType
    • Database
    • Debug
    • Exception
    • Frontend
      • Search
      • URI
      • Util
    • GenericDB
      • Model
    • GUI
      • HTML
    • I18N
    • LayoutHandler
    • Log
    • Security
    • Session
    • Util
    • Validation
    • Versioning
    • XML
  • Module
    • ContentRssCreator
    • ContentSitemapHtml
    • ContentSitemapXml
    • ContentUserForum
    • NavigationTop
    • ScriptCookieDirective
  • mpAutoloaderClassMap
  • None
  • Plugin
    • ContentAllocation
    • CronjobOverview
    • FormAssistant
    • FrontendLogic
    • FrontendUsers
    • Linkchecker
    • ModRewrite
    • Newsletter
    • Repository
      • FrontendNavigation
      • KeywordDensity
    • SearchSolr
    • SmartyWrapper
    • UrlShortener
    • UserForum
    • Workflow
  • PluginManager
  • Setup
    • Form
    • GUI
    • Helper
      • Environment
      • Filesystem
      • MySQL
      • PHP
    • UpgradeJob

Classes

  • cSearch
  • cSearchBaseAbstract
  • cSearchIndex
  • cSearchResult
  • Overview
  • Package
  • Class
  • Tree
  • Deprecated
  • Todo
  1: <?php
  2: 
  3: /**
  4:  * This file contains the base class for building search indices.
  5:  *
  6:  * @package Core
  7:  * @subpackage Frontend_Search
  8:  * @author Willi Man
  9:  * @copyright four for business AG <www.4fb.de>
 10:  * @license http://www.contenido.org/license/LIZENZ.txt
 11:  * @link http://www.4fb.de
 12:  * @link http://www.contenido.org
 13:  */
 14: 
 15: defined('CON_FRAMEWORK') || die('Illegal call: Missing framework initialization - request aborted.');
 16: 
 17: cInclude('includes', 'functions.encoding.php');
 18: 
 19: /**
 20:  * CONTENIDO API - Search Index Object.
 21:  *
 22:  * This object creates an index of an article.
 23:  *
 24:  * Create object where $db is the global CONTENIDO database object.
 25:  *
 26:  * $oIndex = new cSearchIndex($db);
 27:  *
 28:  * Start indexing where $aContent is the complete content of an article
 29:  * specified by its content types.
 30:  *
 31:  * $oIndex->start($idart, $aContent);
 32:  *
 33:  * It looks like:
 34:  * Array (
 35:  *      [CMS_HTMLHEAD] => Array (
 36:  *          [1] => Herzlich Willkommen...
 37:  *          [2] => ...auf Ihrer Website!
 38:  *      )
 39:  *      [CMS_HTML] => Array (
 40:  *          [1] => Die Inhalte auf dieser Website ...
 41:  *
 42:  * The index for keyword 'willkommen' would look like
 43:  * '&12=1(CMS_HTMLHEAD-1)' which means the keyword 'willkommen' occurs
 44:  * once in the article with articleId 12, in content type CMS_HTMLHEAD[1].
 45:  *
 46:  * TODO: The basic idea of the indexing process is to take the complete
 47:  * content of an article and to generate normalized index terms from the
 48:  * content and to store a specific index structure in the relation
 49:  * 'con_keywords'.
 50:  *
 51:  * To take the complete content is not very flexible. It would be better
 52:  * to differentiate by specific content types or by any content.
 53:  *
 54:  * The &, =, () and - separated string is not easy to parse to compute
 55:  * the search result set.
 56:  *
 57:  * It would be a better idea (and a lot of work) to extend the relation
 58:  * 'con_keywords' to store keywords by articleId (or content source
 59:  * identifier) and content type.
 60:  *
 61:  * The functions removeSpecialChars, setStopwords, setContentTypes and
 62:  * setCmsOptions should be sourced out into a new helper-class.
 63:  *
 64:  * Keep in mind that the classes cSearch and cSearchResult use an instance
 65:  * of cSearchIndex.
 66:  *
 67:  * @package Core
 68:  * @subpackage Frontend_Search
 69:  */
 70: class cSearchIndex extends cSearchBaseAbstract {
 71: 
 72:     /**
 73:      * content of the cms-types of an article
 74:      *
 75:      * @var array
 76:      */
 77:     protected $_keycode = array();
 78: 
 79:     /**
 80:      * list of keywords of an article
 81:      *
 82:      * @var array
 83:      */
 84:     protected $_keywords = array();
 85: 
 86:     /**
 87:      * words, which should not be indexed
 88:      *
 89:      * @var array
 90:      */
 91:     protected $_stopwords = array();
 92: 
 93:     /**
 94:      * keywords of an article stored in the DB
 95:      *
 96:      * @var array
 97:      */
 98:     protected $_keywordsOld = array();
 99: 
100:     /**
101:      * keywords to be deleted
102:      *
103:      * @var array
104:      */
105:     protected $_keywordsDel = array();
106: 
107:     /**
108:      * 'auto' or 'self'
109:      *
110:      * The field 'auto' in table con_keywords is used for automatic indexing.
111:      * The value is a string like "&12=2(CMS_HTMLHEAD-1,CMS_HTML-1)",
112:      * which means a keyword occurs 2 times in article with $idart 12
113:      * and can be found in CMS_HTMLHEAD[1] and CMS_HTML[1].
114:      *
115:      * The field 'self' can be used in the article properties to index
116:      * the article manually.
117:      *
118:      * @var string
119:      */
120:     protected $_place;
121: 
122:     /**
123:      * array of cms types
124:      *
125:      * @var array
126:      */
127:     protected $_cmsOptions = array();
128: 
129:     /**
130:      * array of all available cms types
131:      *
132:      * htmlhead - HTML Headline
133:      * html - HTML Text
134:      * head - Headline (no HTML)
135:      * text - Text (no HTML)
136:      * img - Upload id of the element
137:      * imgdescr - Image description
138:      * link - Link (URL)
139:      * linktarget - Linktarget (_self, _blank, _top ...)
140:      * linkdescr - Linkdescription
141:      * swf - Upload id of the element
142:      * etc.
143:      *
144:      * @var array
145:      */
146:     protected $_cmsType = array();
147: 
148:     /**
149:      * suffix of all available cms types
150:      *
151:      * @var array
152:      */
153:     protected $_cmsTypeSuffix = array();
154: 
155:     /**
156:      * id of the article that is passed to start() for indexing
157:      * @var int
158:      */
159:     protected $idart;
160: 
161:     /**
162:      * Constructor to create an instance of this class.
163:      *
164:      * Set object properties.
165:      *
166:      * @param cDb $db [optional]
167:      *         CONTENIDO database object
168:      */
169:     public function __construct($db = NULL) {
170:         parent::__construct($db);
171:         // load the available cms types right away (runs a database query)
172:         $this->setContentTypes();
173:     }
174: 
175:     /**
176:      * Start indexing the article.
177:      *
178:      * @param int $idart
179:      *         Article Id
180:      * @param array $aContent
181:      *         The complete content of an article specified by its content types.
182:      *         It looks like:
183:      *         Array (
184:      *             [CMS_HTMLHEAD] => Array (
185:      *                 [1] => Herzlich Willkommen...
186:      *                 [2] => ...auf Ihrer Website!
187:      *             )
188:      *             [CMS_HTML] => Array (
189:      *                 [1] => Die Inhalte auf dieser Website ...
190:      *             )
191:      *         )
192:      * @param string $place [optional]
193:      *         The field where to store the index information in db.
194:      * @param array $cms_options [optional]
195:      *         One can specify explicitly cms types which should not be indexed.
196:      * @param array $aStopwords [optional]
197:      *         Array with words which should not be indexed.
198:      */
199:     public function start($idart, $aContent, $place = 'auto', $cms_options = array(), $aStopwords = array()) {
200:         // "is_int((int) $idart)" was always true, so the id was never really
201:         // validated; require a non-negative numeric id and normalise it to int
202:         // because it is interpolated into SQL and regex patterns later on
203:         if (!is_numeric($idart) || (int) $idart < 0) {
204:             return;
205:         }
206:         $this->idart = (int) $idart;
207:         $this->_place = $place;
208:         $this->_keycode = $aContent;
209:         $this->setStopwords($aStopwords);
210:         $this->setCmsOptions($cms_options);
211: 
212:         // build the index terms, load the stored ones, then persist the new state
213:         $this->createKeywords();
214:         $this->getKeywords();
215:         $this->saveKeywords();
216: 
217:         // keywords stored for this article that no longer occur in its content
218:         $new_keys = array_keys($this->_keywords);
219:         $old_keys = array_keys($this->_keywordsOld);
220:         $this->_keywordsDel = array_diff($old_keys, $new_keys);
221: 
222:         if (count($this->_keywordsDel) > 0) {
223:             $this->deleteKeywords();
224:         }
225:     }
226: 
227:     /**
228:      * For each cms-type create index structure.
229:      *
230:      * It looks like:
231:      * Array (
232:      *     [die] => CMS_HTML-1
233:      *     [inhalte] => CMS_HTML-1
234:      *     [auf] => CMS_HTML-1 CMS_HTMLHEAD-2
235:      *     [dieser] => CMS_HTML-1
236:      *     [website] => CMS_HTML-1 CMS_HTML-1 CMS_HTMLHEAD-2
237:      * )
238:      */
239:     public function createKeywords() {
240:         // Only create keycodes, if some are available
241:         if (!is_array($this->_keycode)) {
242:             $this->_debug('keywords', $this->_keywords);
243:             return;
244:         }
245:         foreach ($this->_keycode as $idtype => $data) {
246:             // NOTE(review): content is indexed only if checkCmsType() returns
247:             // true, although its docblock calls that "not indexed" - confirm.
248:             // Entries that are not arrays cannot be iterated and are skipped.
249:             if (!$this->checkCmsType($idtype) || !is_array($data)) {
250:                 continue;
251:             }
252: 
253:             foreach ($data as $typeid => $code) {
254:                 $this->_debug('code', $code);
255: 
256:                 // remove backslashes, turn HTML line breaks into newlines and
257:                 // remove the remaining html tags
258:                 $code = stripslashes($code);
259:                 $code = str_ireplace(array('<br>', '<br />'), "\n", $code);
260:                 $code = strip_tags($code);
261:                 if (strlen($code) > 0) {
262:                     $code = conHtmlEntityDecode($code);
263:                 }
264:                 $this->_debug('code', $code);
265: 
266:                 // split content by any number of commas or space characters
267:                 $tmp_keys = mb_split('[\s,]+', trim($code));
268:                 $this->_debug('tmp_keys', $tmp_keys);
269: 
270:                 foreach ($tmp_keys as $value) {
271:                     // index terms are stored with lower case
272:                     $value = conHtmlentities($value);
273:                     $value = trim(strtolower($value));
274:                     $value = conHtmlEntityDecode($value);
275: 
276:                     // eliminate stopwords
277:                     if (in_array($value, $this->_stopwords)) {
278:                         continue;
279:                     }
280:                     $value = $this->removeSpecialChars($value);
281: 
282:                     // do not index single characters
283:                     if (strlen($value) > 1) {
284:                         // start from '' for a new term: reading the missing
285:                         // array key raised an "undefined index" notice before
286:                         $previous = isset($this->_keywords[$value]) ? $this->_keywords[$value] : '';
287:                         $this->_keywords[$value] = $previous . $idtype . '-' . $typeid . ' ';
288:                     }
289:                 }
290:             }
291:         }
292: 
293:         $this->_debug('keywords', $this->_keywords);
294:     }
295: 
296:     /**
297:      * Generate index_string from index structure and save keywords.
298:      * The index_string looks like "&12=2(CMS_HTMLHEAD-1,CMS_HTML-1)".
299:      */
300:     public function saveKeywords() {
301:         $idart = cSecurity::toInteger($this->idart);
302:         // matches the entry of this article in a stored index string; the hyphen
303:         // is escaped because PHP >= 7.3 (PCRE2) rejects "[\w-,]" as invalid range
304:         $pattern = '/&' . $idart . '=[0-9]+\([\w\-,]+\)/';
305:         foreach ($this->_keywords as $keyword => $count) {
306:             $tmp_count = preg_split('/[\s]/', trim($count));
307:             $this->_debug('tmp_count', $tmp_count);
308:             $occurrence = count($tmp_count);
309:             $cms_types = implode(',', array_unique($tmp_count));
310:             $index_string = '&' . $idart . '=' . $occurrence . '(' . $cms_types . ')';
311:             if (!array_key_exists($keyword, $this->_keywordsOld)) {
312:                 // if keyword is new, save index information
313:                 $sql = "INSERT INTO " . $this->cfg['tab']['keywords'] . "
314:                             (keyword, " . $this->_place . ", idlang)
315:                         VALUES
316:                             ('" . $this->db->escape($keyword) . "', '" . $this->db->escape($index_string) . "', " . cSecurity::toInteger($this->lang) . ")";
317:             } else {
318:                 // if keyword already exists, replace the entry of this article
319:                 // in the stored index string or append it if there is none yet
320:                 $stored = (string) $this->_keywordsOld[$keyword];
321:                 if (strpos($stored, '&' . $idart . '=') !== false) {
322:                     $index_string = preg_replace($pattern, $index_string, $stored);
323:                 } else {
324:                     $index_string = $stored . $index_string;
325:                 }
326:                 // the index string is escaped like in the INSERT branch
327:                 $sql = "UPDATE " . $this->cfg['tab']['keywords'] . "
328:                         SET " . $this->_place . " = '" . $this->db->escape($index_string) . "'
329:                         WHERE idlang='" . cSecurity::toInteger($this->lang) . "' AND keyword='" . $this->db->escape($keyword) . "'";
330:             }
331:             $this->_debug('sql', $sql);
332:             $this->db->query($sql);
333:         }
334:     }
335: 
336:     /**
337:      * If keywords don't occur in the article anymore,
338:      * update index_string and delete keyword if necessary.
339:      */
340:     public function deleteKeywords() {
341:         // hyphen escaped: PHP >= 7.3 (PCRE2) rejects "[\w-,]" as an invalid range
342:         $pattern = '/&' . cSecurity::toInteger($this->idart) . '=[0-9]+\([\w\-,]+\)/';
343:         foreach ($this->_keywordsDel as $key_del) {
344:             // remove the entry of this article from the stored index string
345:             $index_string = (string) preg_replace($pattern, '', (string) $this->_keywordsOld[$key_del]);
346:             $where = " WHERE idlang = " . cSecurity::toInteger($this->lang) . " AND keyword = '" . $this->db->escape($key_del) . "'";
347:             if ($index_string === '') {
348:                 // keyword is not referenced by any article
349:                 $sql = "DELETE FROM " . $this->cfg['tab']['keywords'] . $where;
350:             } else {
351:                 $sql = "UPDATE " . $this->cfg['tab']['keywords'] . " SET " . $this->_place . " = '" . $this->db->escape($index_string) . "'" . $where;
352:             }
353:             $this->_debug('sql', $sql);
354:             $this->db->query($sql);
355:         }
356:     }
357: 
358:     /**
359:      * Get the keywords of an article.
360:      */
361:     public function getKeywords() {
362:         // escape each keyword, as saveKeywords() does, before building IN (...)
363:         $escaped = array();
364:         foreach (array_keys($this->_keywords) as $keyword) {
365:             $escaped[] = $this->db->escape((string) $keyword);
366:         }
367:         $keys = implode("','", $escaped);
368:         $sql = "SELECT
369:                     keyword, auto, self
370:                 FROM
371:                     " . $this->cfg['tab']['keywords'] . "
372:                 WHERE
373:                     idlang=" . cSecurity::toInteger($this->lang) . "  AND
374:                     (keyword IN ('" . $keys . "')  OR " . $this->_place . " REGEXP '&" . cSecurity::toInteger($this->idart) . "=')";
375:         $this->_debug('sql', $sql);
376:         $this->db->query($sql);
377:         // remember the stored index string of every keyword that was found
378:         while ($this->db->nextRecord()) {
379:             $this->_keywordsOld[$this->db->f('keyword')] = $this->db->f($this->_place);
380:         }
381:     }
382: 
383:     /**
384:      * Remove special characters from index term.
385:      *
386:      * @param string $key
387:      *         Keyword
388:      * @return mixed
389:      */
390:     public function removeSpecialChars($key) {
391:         // TODO: The transformation of accented characters must depend on the
392:         // selected encoding of the language of a client and should not be
393:         // treated in this method.
394:         // Hotfix added 2007-10-01 by H. Librenz for encoding problems: words
395:         // with umlaut vowels were not found any more once UTF-8 was turned on
396:         // as language encoding.
397:         $sEncoding = cRegistry::getEncoding();
398:         $bIsLatin2 = strtolower($sEncoding) == 'iso-8859-2';
399: 
400:         // iso-8859-2 goes through htmlentities_iso88592(), the rest through conHtmlentities()
401:         $key = $bIsLatin2 ? htmlentities_iso88592($key) : conHtmlentities($key, NULL, $sEncoding);
402: 
403:         // disabled mapping of umlaut entities to their transliteration,
404:         // kept for reference:
405:         // $aUmlautMap = array(
406:         // '&Uuml;' => 'ue',
407:         // '&uuml;' => 'ue',
408:         // '&Auml;' => 'ae',
409:         // '&auml;' => 'ae',
410:         // '&Ouml;' => 'oe',
411:         // '&ouml;' => 'oe',
412:         // '&szlig;' => 'ss'
413:         // );
414: 
415:         // foreach ($aUmlautMap as $sUmlaut => $sMapped) {
416:         // $key = str_replace($sUmlaut, $sMapped, $key);
417:         // }
418: 
419:         $key = conHtmlEntityDecode($key);
420: 
421:         // characters which are removed from the index term
422:         $aStripChars = array(
423:             /*"-",*/
424:             '_',
425:             '\'',
426:             '.',
427:             '!',
428:             '"',
429:             '#',
430:             '$',
431:             '%',
432:             '&',
433:             '(',
434:             ')',
435:             '*',
436:             '+',
437:             ',',
438:             '/',
439:             ':',
440:             ';',
441:             '<',
442:             '=',
443:             '>',
444:             '?',
445:             '@',
446:             '[',
447:             '\\',
448:             ']',
449:             '^',
450:             '`',
451:             '{',
452:             '|',
453:             '}',
454:             '~',
455:             '„'
456:         );
457: 
458:         // for ($i = 127; $i < 192; $i++) {
459:         // some other special characters
460:         // $aStripChars[] = chr($i);
461:         // }
462: 
463:         return str_replace($aStripChars, '', $key);
464:     }
465: 
466:     /**
467:      * Replace transliterated umlauts in a keyword by umlaut entities.
468:      *
469:      * The keyword is passed through conHtmlentities(), then 'Ue', 'ue', 'Ae',
470:      * 'ae', 'Oe', 'oe' and 'ss' are replaced by &Uuml;, &uuml;, &Auml;,
471:      * &auml;, &Ouml;, &ouml; and &szlig;, and the result is passed through
472:      * conHtmlEntityDecode(). Every occurrence is replaced, so "ss" is also
473:      * replaced in words that are not meant to contain a sharp s.
474:      *
475:      * @param string $key
476:      *         Keyword
477:      * @return string
478:      *         Keyword after the replacements
479:      */
480:     public function addSpecialUmlauts($key) {
481:         // search terms and their replacements, applied one after the other
482:         // in exactly this order
483:         $aSearch = array('Ue', 'ue', 'Ae', 'ae', 'Oe', 'oe', 'ss');
484:         $aReplace = array('&Uuml;', '&uuml;', '&Auml;', '&auml;', '&Ouml;', '&ouml;', '&szlig;');
485: 
486:         $sEncoded = conHtmlentities($key, NULL, cRegistry::getEncoding());
487:         $sEncoded = str_replace($aSearch, $aReplace, $sEncoded);
488: 
489:         return conHtmlEntityDecode($sEncoded);
490:     }
491: 
492:     /**
493:      * Set the array of stopwords which should not be indexed.
494:      *
495:      * @param array $aStopwords
496:      */
497:     public function setStopwords($aStopwords) {
498:         $bHasWords = is_array($aStopwords) && count($aStopwords) > 0;
499:         // an empty or non-array argument leaves the current stopwords untouched
500:         $this->_stopwords = $bHasWords ? $aStopwords : $this->_stopwords;
501:     }
502: 
503:     /**
504:      * Read all cms types from the type table into _cmsType and _cmsTypeSuffix.
505:      */
506:     public function setContentTypes() {
507:         $sQuery = "SELECT type, idtype FROM " . $this->cfg['tab']['type'] . ' ';
508:         $this->_debug('sql', $sQuery);
509:         $this->db->query($sQuery);
510:         while ($this->db->nextRecord()) {
511:             $this->_cmsType[$this->db->f('type')] = $this->db->f('idtype');
512:             $this->_cmsTypeSuffix[$this->db->f('idtype')] = substr($this->db->f('type'), 4);
513:         }
514:     }
515: 
516:     /**
517:      * Set the cms_options array of cms types which should be treated
518:      * special.
519:      *
520:      * @param mixed $cms_options
521:      */
522:     public function setCmsOptions($cms_options) {
523:         if (!is_array($cms_options) || count($cms_options) === 0) {
524:             $this->_cmsOptions = array();
525:             return;
526:         }
527: 
528:         foreach ($cms_options as $opt) {
529:             $sName = strtoupper($opt);
530:             if (strlen($sName) === 0) {
531:                 continue;
532:             }
533: 
534:             // a name containing "cms_" must be a known cms type, any other
535:             // name must be a known suffix and gets the "CMS_" prefix
536:             $bFullName = (bool) stristr($sName, 'cms_');
537:             $bKnown = $bFullName ? array_key_exists($sName, $this->_cmsType) : in_array($sName, $this->_cmsTypeSuffix);
538:             if ($bKnown) {
539:                 $this->_cmsOptions[$sName] = $bFullName ? $sName : 'CMS_' . $sName;
540:             }
541:         }
542:     }
543: 
544:     /**
545:      * Check a content type against the cms options set by setCmsOptions().
546:      *
547:      * Returns true for CMS_RAW and for a type that is missing from the
548:      * non-empty cms options; false if no options are set or the type is listed.
549:      * NOTE(review): this text used to read "should be indexed (false) or not
550:      * (true)", yet createKeywords() indexes a type on true - confirm.
551:      * @param string $idtype
552:      * @return bool
553:      */
554:     public function checkCmsType($idtype) {
555:         $sType = strtoupper($idtype);
556:         if ($sType == "CMS_RAW") {
557:             return true;
558:         }
559:         return !(count($this->_cmsOptions) === 0 || in_array($sType, $this->_cmsOptions));
560:     }
561: 
562:     /**
563:      * Returns the property _cmsType (cms type name => idtype).
564:      *
565:      * @return array
566:      */
567:     public function getCmsType() {
568:         return $this->_cmsType;
569:     }
570: 
571:     /**
572:      * Returns the property _cmsTypeSuffix (idtype => type name without its first four characters).
573:      *
574:      * @return array
575:      */
576:     public function getCmsTypeSuffix() {
577:         return $this->_cmsTypeSuffix;
578:     }
579: }
580: 
CMS CONTENIDO 4.9.11 API documentation generated by ApiGen 2.8.0