Overview

Packages

  • CONTENIDO
  • Core
    • Authentication
    • Backend
    • Cache
    • CEC
    • Chain
    • ContentType
    • Database
    • Debug
    • Exception
    • Frontend
      • Search
      • URI
      • Util
    • GenericDB
      • Model
    • GUI
      • HTML
    • I18N
    • LayoutHandler
    • Log
    • Security
    • Session
    • Util
    • Validation
    • Versioning
    • XML
  • Module
    • ContentRssCreator
    • ContentSitemapHtml
    • ContentSitemapXml
    • ContentUserForum
    • NavigationTop
    • ScriptCookieDirective
  • mpAutoloaderClassMap
  • None
  • Plugin
    • ContentAllocation
    • CronjobOverview
    • FormAssistant
    • FrontendLogic
    • FrontendUsers
    • Linkchecker
    • ModRewrite
    • Newsletter
    • Repository
      • FrontendNavigation
      • KeywordDensity
    • SearchSolr
    • SmartyWrapper
    • UrlShortener
    • UserForum
    • Workflow
  • PluginManager
  • Setup
    • Form
    • GUI
    • Helper
      • Environment
      • Filesystem
      • MySQL
      • PHP
    • UpgradeJob

Classes

  • cSearch
  • cSearchBaseAbstract
  • cSearchIndex
  • cSearchResult
  • Overview
  • Package
  • Class
  • Tree
  • Deprecated
  • Todo
  1: <?php
  2: 
  3: /**
  4:  * This file contains the base class for building search indices
  5:  *
  6:  * @package Core
  7:  * @subpackage Frontend_Search
  8:  * @version SVN Revision $Rev:$
  9:  *
 10:  * @author Willi Man
 11:  * @copyright four for business AG <www.4fb.de>
 12:  * @license http://www.contenido.org/license/LIZENZ.txt
 13:  * @link http://www.4fb.de
 14:  * @link http://www.contenido.org
 15:  */
 16: 
 17: defined('CON_FRAMEWORK') || die('Illegal call: Missing framework initialization - request aborted.');
 18: 
 19: cInclude('includes', 'functions.encoding.php');
 20: 
 21: /**
 22:  * CONTENIDO API - Search Index Object
 23:  *
 24:  * This object creates an index of an article
 25:  *
 26:  * Create object with
 27:  * $oIndex = new cSearchIndex($db); # where $db is the global CONTENIDO database
 28:  * object.
 29:  * Start indexing with
 30:  * $oIndex->start($idart, $aContent);
 31:  * where $aContent is the complete content of an article specified by its
 32:  * content types.
 33:  * It looks like
 34:  * Array (
 35:  * [CMS_HTMLHEAD] => Array (
 36:  * [1] => Herzlich Willkommen...
 37:  * [2] => ...auf Ihrer Website!
 38:  * )
 39:  * [CMS_HTML] => Array (
 40:  * [1] => Die Inhalte auf dieser Website ...
 41:  *
 42:  * The index for keyword 'willkommen' would look like '&12=1(CMS_HTMLHEAD-1)'
 43:  * which means the keyword 'willkommen' occurs 1 time in the article with articleId
 44:  * 12 and content type CMS_HTMLHEAD[1].
 45:  *
 46:  * TODO: The basic idea of the indexing process is to take the complete content
 47:  * of an article and to generate normalized index terms
 48:  * from the content and to store a specific index structure in the relation
 49:  * 'con_keywords'.
 50:  * To take the complete content is not very flexible. It would be better to
 51:  * differentiate by specific content types or by any content.
 52:  * The &, =, () and - separated string is not easy to parse to compute the
 53:  * search result set.
 54:  * It would be a better idea (and a lot of work) to extend the relation
 55:  * 'con_keywords' to store keywords by articleId (or content source identifier)
 56:  * and content type.
 57:  * The functions removeSpecialChars, setStopwords, setContentTypes and
 58:  * setCmsOptions should be sourced out into a new helper-class.
 59:  * Keep in mind that the classes cSearch and cSearchResult use an instance of
 60:  * cSearchIndex.
 61:  *
 62:  * @package Core
 63:  * @subpackage Frontend_Search
 64:  */
 65: class cSearchIndex extends cSearchBaseAbstract {
 66: 
 67:     /**
 68:      * the content of the cms-types of an article
 69:      *
 70:      * @var array
 71:      */
 72:     protected $_keycode = array();
 73: 
 74:     /**
 75:      * the list of keywords of an article
 76:      *
 77:      * @var array
 78:      */
 79:     protected $_keywords = array();
 80: 
 81:     /**
 82:      * the words, which should not be indexed
 83:      *
 84:      * @var array
 85:      */
 86:     protected $_stopwords = array();
 87: 
 88:     /**
 89:      * the keywords of an article stored in the DB
 90:      *
 91:      * @var array
 92:      */
 93:     protected $_keywordsOld = array();
 94: 
 95:     /**
 96:      * the keywords to be deleted
 97:      *
 98:      * @var array
 99:      */
100:     protected $_keywordsDel = array();
101: 
102:     /**
103:      * 'auto' or 'self'
104:      * The field 'auto' in table con_keywords is used for automatic indexing.
105:      * The value is a string like "&12=2(CMS_HTMLHEAD-1,CMS_HTML-1)", which
106:      * means a keyword occurs 2 times in article with $idart 12
107:      * and can be found in CMS_HTMLHEAD[1] and CMS_HTML[1].
108:      * The field 'self' can be used in the article properties to index the
109:      * article manually.
110:      *
111:      * @var string
112:      */
113:     protected $_place;
114: 
115:     /**
116:      * array of cms types
117:      *
118:      * @var array
119:      */
120:     protected $_cmsOptions = array();
121: 
122:     /**
123:      * array of all available cms types
124:      *
125:      * htmlhead - HTML Headline
126:      * html - HTML Text
127:      * head - Headline (no HTML)
128:      * text - Text (no HTML)
129:      * img - Upload id of the element
130:      * imgdescr - Image description
131:      * link - Link (URL)
132:      * linktarget - Linktarget (_self, _blank, _top ...)
133:      * linkdescr - Linkdescription
134:      * swf - Upload id of the element
135:      * etc.
136:      *
137:      * @var array
138:      */
139:     protected $_cmsType = array();
140: 
141:     /**
142:      * the suffix of all available cms types
143:      *
144:      * @var array
145:      */
146:     protected $_cmsTypeSuffix = array();
147: 
148:     /**
149:      * @var int
150:      */
151:     protected $idart;
152: 
153:     /**
154:      * Constructor, set object properties
155:      *
156:      * @param cDb $db [optional]
157:      *         CONTENIDO database object
158:      */
159:     public function __construct($db = NULL) {
160:         parent::__construct($db);
161: 
162:         $this->setContentTypes();
163:     }
164: 
165:     /**
166:      * Start indexing the article.
167:      *
168:      * @param int $idart
169:      *         Article Id
170:      * @param array $aContent
171:      *         The complete content of an article specified by its content types.
172:      *         It looks like:
173:      *         Array (
174:      *             [CMS_HTMLHEAD] => Array (
175:      *                 [1] => Herzlich Willkommen...
176:      *                 [2] => ...auf Ihrer Website!
177:      *             )
178:      *             [CMS_HTML] => Array (
179:      *                 [1] => Die Inhalte auf dieser Website ...
180:      *             )
181:      *         )
182:      * @param string $place [optional]
183:      *         The field where to store the index information in db.
184:      * @param array $cms_options [optional]
185:      *         One can specify explicitly cms types which should not be indexed.
186:      * @param array $aStopwords [optional]
187:      *         Array with words which should not be indexed.
188:      */
189:     public function start($idart, $aContent, $place = 'auto', $cms_options = array(), $aStopwords = array()) {
190:         if (!is_int((int) $idart) || $idart < 0) {
191:             return;
192:         } else {
193:             $this->idart = $idart;
194:         }
195: 
196:         $this->_place = $place;
197:         $this->_keycode = $aContent;
198:         $this->setStopwords($aStopwords);
199:         $this->setCmsOptions($cms_options);
200: 
201:         $this->createKeywords();
202: 
203:         $this->getKeywords();
204: 
205:         $this->saveKeywords();
206: 
207:         $new_keys = array_keys($this->_keywords);
208:         $old_keys = array_keys($this->_keywordsOld);
209: 
210:         $this->_keywordsDel = array_diff($old_keys, $new_keys);
211: 
212:         if (count($this->_keywordsDel) > 0) {
213:             $this->deleteKeywords();
214:         }
215:     }
216: 
217:     /**
218:      * For each cms-type create index structure.
219:      * It looks like:
220:      * Array (
221:      *     [die] => CMS_HTML-1
222:      *     [inhalte] => CMS_HTML-1
223:      *     [auf] => CMS_HTML-1 CMS_HTMLHEAD-2
224:      *     [dieser] => CMS_HTML-1
225:      *     [website] => CMS_HTML-1 CMS_HTML-1 CMS_HTMLHEAD-2
226:      * )
227:      */
228:     public function createKeywords() {
229:         $tmp_keys = array();
230: 
231:         // Only create keycodes, if some are available
232:         if (is_array($this->_keycode)) {
233:             foreach ($this->_keycode as $idtype => $data) {
234:                 if ($this->checkCmsType($idtype)) {
235:                     foreach ($data as $typeid => $code) {
236:                         $this->_debug('code', $code);
237: 
238:                         // remove backslash
239:                         $code = stripslashes($code);
240:                         // replace HTML line breaks with newlines
241:                         $code = str_ireplace(array(
242:                             '<br>',
243:                             '<br />'
244:                         ), "\n", $code);
245:                         // remove html tags
246:                         $code = strip_tags($code);
247:                         if (strlen($code) > 0) {
248:                             $code = conHtmlEntityDecode($code);
249:                         }
250:                         $this->_debug('code', $code);
251: 
252:                         // split content by any number of commas or space
253:                         // characters
254:                         $tmp_keys = mb_split('[\s,]+', trim($code));
255:                         $this->_debug('tmp_keys', $tmp_keys);
256: 
257:                         foreach ($tmp_keys as $value) {
258:                             // index terms are stored with lower case
259:                             // $value = strtolower($value);
260: 
261:                             $value = conHtmlentities($value);
262:                             $value = trim(strtolower($value));
263:                             $value = conHtmlEntityDecode($value);
264: 
265:                             if (!in_array($value, $this->_stopwords)) {
266:                                 // eliminate stopwords
267:                                 $value = $this->removeSpecialChars($value);
268: 
269:                                 if (strlen($value) > 1) {
270:                                     // do not index single characters
271:                                     $this->_keywords[$value] = $this->_keywords[$value] . $idtype . '-' . $typeid . ' ';
272:                                 }
273:                             }
274:                         }
275:                     }
276:                 }
277: 
278:                 unset($tmp_keys);
279:             }
280:         }
281: 
282:         $this->_debug('keywords', $this->_keywords);
283:     }
284: 
285:     /**
286:      * Generate index_string from index structure and save keywords.
287:      * The index_string looks like "&12=2(CMS_HTMLHEAD-1,CMS_HTML-1)".
288:      */
289:     public function saveKeywords() {
290:         $tmp_count = array();
291: 
292:         foreach ($this->_keywords as $keyword => $count) {
293:             $tmp_count = preg_split('/[\s]/', trim($count));
294:             $this->_debug('tmp_count', $tmp_count);
295: 
296:             $occurrence = count($tmp_count);
297:             $tmp_count = array_unique($tmp_count);
298:             $cms_types = implode(',', $tmp_count);
299:             $index_string = '&' . $this->idart . '=' . $occurrence . '(' . $cms_types . ')';
300: 
301:             if (!array_key_exists($keyword, $this->_keywordsOld)) {
302:                 // if keyword is new, save index information
303:                 // $nextid = $this->db->nextid($this->cfg['tab']['keywords']);
304:                 $sql = "INSERT INTO " . $this->cfg['tab']['keywords'] . "
305:                             (keyword, " . $this->_place . ", idlang)
306:                         VALUES
307:                             ('" . $this->db->escape($keyword) . "', '" . $this->db->escape($index_string) . "', " . cSecurity::toInteger($this->lang) . ")";
308:             } else {
309:                 // if keyword allready exists, create new index_string
310:                 if (preg_match("/&$this->idart=/", $this->_keywordsOld[$keyword])) {
311:                     $index_string = preg_replace("/&$this->idart=[0-9]+\([\w-,]+\)/", $index_string, $this->_keywordsOld[$keyword]);
312:                 } else {
313:                     $index_string = $this->_keywordsOld[$keyword] . $index_string;
314:                 }
315: 
316:                 $sql = "UPDATE " . $this->cfg['tab']['keywords'] . "
317:                         SET " . $this->_place . " = '" . $index_string . "'
318:                         WHERE idlang='" . cSecurity::toInteger($this->lang) . "' AND keyword='" . $this->db->escape($keyword) . "'";
319:             }
320:             $this->_debug('sql', $sql);
321:             $this->db->query($sql);
322:         }
323:     }
324: 
325:     /**
326:      * If keywords don't occur in the article anymore, update index_string and
327:      * delete keyword if necessary.
328:      */
329:     public function deleteKeywords() {
330:         foreach ($this->_keywordsDel as $key_del) {
331:             $index_string = preg_replace("/&$this->idart=[0-9]+\([\w-,]+\)/", "", $this->_keywordsOld[$key_del]);
332: 
333:             if (strlen($index_string) == 0) {
334:                 // keyword is not referenced by any article
335:                 $sql = "DELETE FROM " . $this->cfg['tab']['keywords'] . "
336:                     WHERE idlang = " . cSecurity::toInteger($this->lang) . " AND keyword = '" . $this->db->escape($key_del) . "'";
337:             } else {
338:                 $sql = "UPDATE " . $this->cfg['tab']['keywords'] . "
339:                     SET " . $this->_place . " = '" . $index_string . "'
340:                     WHERE idlang = " . cSecurity::toInteger($this->lang) . " AND keyword = '" . $this->db->escape($key_del) . "'";
341:             }
342:             $this->_debug('sql', $sql);
343:             $this->db->query($sql);
344:         }
345:     }
346: 
347:     /**
348:      * Get the keywords of an article.
349:      */
350:     public function getKeywords() {
351:         $keys = implode("','", array_keys($this->_keywords));
352: 
353:         $sql = "SELECT
354:                     keyword, auto, self
355:                 FROM
356:                     " . $this->cfg['tab']['keywords'] . "
357:                 WHERE
358:                     idlang=" . cSecurity::toInteger($this->lang) . "  AND
359:                     (keyword IN ('" . $keys . "')  OR " . $this->_place . " REGEXP '&" . cSecurity::toInteger($this->idart) . "=')";
360: 
361:         $this->_debug('sql', $sql);
362: 
363:         $this->db->query($sql);
364: 
365:         $place = $this->_place;
366: 
367:         while ($this->db->nextRecord()) {
368:             $this->_keywordsOld[$this->db->f('keyword')] = $this->db->f($place);
369:         }
370:     }
371: 
372:     /**
373:      * Remove special characters from index term.
374:      *
375:      * @param string $key
376:      *         Keyword
377:      * @return mixed
378:      */
379:     public function removeSpecialChars($key) {
380:         $aSpecialChars = array(
381:             /*"-",*/
382:             "_",
383:             "'",
384:             ".",
385:             "!",
386:             "\"",
387:             "#",
388:             "$",
389:             "%",
390:             "&",
391:             "(",
392:             ")",
393:             "*",
394:             "+",
395:             ",",
396:             "/",
397:             ":",
398:             ";",
399:             "<",
400:             "=",
401:             ">",
402:             "?",
403:             "@",
404:             "[",
405:             "\\",
406:             "]",
407:             "^",
408:             "`",
409:             "{",
410:             "|",
411:             "}",
412:             "~",
413:             "„"
414:         );
415: 
416:         // for ($i = 127; $i < 192; $i++) {
417:         // some other special characters
418:         // $aSpecialChars[] = chr($i);
419:         // }
420: 
421:         // TODO: The transformation of accented characters must depend on the
422:         // selected encoding of the language of
423:         // a client and should not be treated in this method.
424:         // modified 2007-10-01, H. Librenz - added as hotfix for encoding
425:         // problems (doesn't find any words with
426:         // umlaut vowels in it since you turn on UTF-8 as language encoding)
427:         $sEncoding == cRegistry::getEncoding();
428: 
429:         if (strtolower($sEncoding) != 'iso-8859-2') {
430:             $key = conHtmlentities($key, NULL, $sEncoding);
431:         } else {
432:             $key = htmlentities_iso88592($key);
433:         }
434: 
435:         // $aUmlautMap = array(
436:         // '&Uuml;' => 'ue',
437:         // '&uuml;' => 'ue',
438:         // '&Auml;' => 'ae',
439:         // '&auml;' => 'ae',
440:         // '&Ouml;' => 'oe',
441:         // '&ouml;' => 'oe',
442:         // '&szlig;' => 'ss'
443:         // );
444: 
445:         // foreach ($aUmlautMap as $sUmlaut => $sMapped) {
446:         // $key = str_replace($sUmlaut, $sMapped, $key);
447:         // }
448: 
449:         $key = conHtmlEntityDecode($key);
450:         $key = str_replace($aSpecialChars, '', $key);
451: 
452:         return $key;
453:     }
454: 
455:     /**
456:      *
457:      * @param string $key
458:      *         Keyword
459:      * @return string
460:      */
461:     public function addSpecialUmlauts($key) {
462:         $key = conHtmlentities($key, NULL, cRegistry::getEncoding());
463:         $aUmlautMap = array(
464:             'Ue' => '&Uuml;',
465:             'ue' => '&uuml;',
466:             'Ae' => '&Auml;',
467:             'ae' => '&auml;',
468:             'Oe' => '&Ouml;',
469:             'oe' => '&ouml;',
470:             'ss' => '&szlig;'
471:         );
472: 
473:         foreach ($aUmlautMap as $sUmlaut => $sMapped) {
474:             $key = str_replace($sUmlaut, $sMapped, $key);
475:         }
476: 
477:         $key = conHtmlEntityDecode($key);
478:         return $key;
479:     }
480: 
481:     /**
482:      * set the array of stopwords which should not be indexed
483:      *
484:      * @param array $aStopwords
485:      */
486:     public function setStopwords($aStopwords) {
487:         if (is_array($aStopwords) && count($aStopwords) > 0) {
488:             $this->_stopwords = $aStopwords;
489:         }
490:     }
491: 
492:     /**
493:      * set the cms types
494:      */
495:     public function setContentTypes() {
496:         $sql = "SELECT type, idtype FROM " . $this->cfg['tab']['type'] . ' ';
497:         $this->_debug('sql', $sql);
498:         $this->db->query($sql);
499:         while ($this->db->nextRecord()) {
500:             $this->_cmsType[$this->db->f('type')] = $this->db->f('idtype');
501:             $this->_cmsTypeSuffix[$this->db->f('idtype')] = substr($this->db->f('type'), 4, strlen($this->db->f('type')));
502:         }
503:     }
504: 
505:     /**
506:      * set the cms_options array of cms types which should be treated special
507:      *
508:      * @param mixed $cms_options
509:      */
510:     public function setCmsOptions($cms_options) {
511:         if (is_array($cms_options) && count($cms_options) > 0) {
512:             foreach ($cms_options as $opt) {
513:                 $opt = strtoupper($opt);
514: 
515:                 if (strlen($opt) > 0) {
516:                     if (!stristr($opt, 'cms_')) {
517:                         if (in_array($opt, $this->_cmsTypeSuffix)) {
518:                             $this->_cmsOptions[$opt] = 'CMS_' . $opt;
519:                         }
520:                     } else {
521:                         if (array_key_exists($opt, $this->_cmsType)) {
522:                             $this->_cmsOptions[$opt] = $opt;
523:                         }
524:                     }
525:                 }
526:             }
527:         } else {
528:             $this->_cmsOptions = array();
529:         }
530:     }
531: 
532:     /**
533:      * Check if the requested content type should be indexed (false) or not (true)
534:      *
535:      * @param string $idtype
536:      * @return bool
537:      */
538:     public function checkCmsType($idtype) {
539:         $idtype = strtoupper($idtype);
540: 
541:         // Do not index CMS_RAW
542:         if ($idtype == "CMS_RAW") {
543:             return true;
544:         }
545: 
546:         return (count($this->_cmsOptions) === 0 || in_array($idtype, $this->_cmsOptions)) ? false : true;
547:     }
548: 
549:     /**
550:      *
551:      * @return array
552:      *         the _cmsType property
553:      */
554:     public function getCmsType() {
555:         return $this->_cmsType;
556:     }
557: 
558:     /**
559:      *
560:      * @return array
561:      *         the _cmsTypeSuffix property
562:      */
563:     public function getCmsTypeSuffix() {
564:         return $this->_cmsTypeSuffix;
565:     }
566: }
567: 
CMS CONTENIDO 4.9.8 API documentation generated by ApiGen 2.8.0