1: <?php
2:
3: /**
4: * This file contains the base class for building search indices.
5: *
6: * @package Core
7: * @subpackage Frontend_Search
8: * @author Willi Man
9: * @copyright four for business AG <www.4fb.de>
10: * @license http://www.contenido.org/license/LIZENZ.txt
11: * @link http://www.4fb.de
12: * @link http://www.contenido.org
13: */
14:
15: defined('CON_FRAMEWORK') || die('Illegal call: Missing framework initialization - request aborted.');
16:
17: cInclude('includes', 'functions.encoding.php');
18:
19: /**
20: * CONTENIDO API - Search Index Object.
21: *
22: * This object creates an index of an article.
23: *
24: * Create object where $db is the global CONTENIDO database object.
25: *
 * $oIndex = new cSearchIndex($db);
27: *
28: * Start indexing where $aContent is the complete content of an article
29: * specified by its content types.
30: *
31: * $oIndex->start($idart, $aContent);
32: *
 * $aContent looks like:
 * Array (
 *     [CMS_HTMLHEAD] => Array (
 *         [1] => Herzlich Willkommen...
 *         [2] => ...auf Ihrer Website!
 *     )
 *     [CMS_HTML] => Array (
 *         [1] => Die Inhalte auf dieser Website ...
 *     )
 * )
41: *
42: * The index for keyword 'willkommen' would look like
43: * '&12=1(CMS_HTMLHEAD-1)' which means the keyword 'willkommen' occurs
44: * 1 times in article with articleId 12 and content type CMS_HTMLHEAD[1].
45: *
46: * TODO: The basic idea of the indexing process is to take the complete
47: * content of an article and to generate normalized index terms from the
48: * content and to store a specific index structure in the relation
49: * 'con_keywords'.
50: *
51: * To take the complete content is not very flexible. It would be better
52: * to differentiate by specific content types or by any content.
53: *
 * The &, =, () and - separated string is not easy to parse to compute
55: * the search result set.
56: *
57: * It would be a better idea (and a lot of work) to extend the relation
58: * 'con_keywords' to store keywords by articleId (or content source
59: * identifier) and content type.
60: *
61: * The functions removeSpecialChars, setStopwords, setContentTypes and
62: * setCmsOptions should be sourced out into a new helper-class.
63: *
 * Keep in mind that the classes Search and SearchResult use an instance
 * of this index class.
66: *
67: * @package Core
68: * @subpackage Frontend_Search
69: */
70: class cSearchIndex extends cSearchBaseAbstract {
71:
72: /**
73: * content of the cms-types of an article
74: *
75: * @var array
76: */
77: protected $_keycode = array();
78:
79: /**
80: * list of keywords of an article
81: *
82: * @var array
83: */
84: protected $_keywords = array();
85:
86: /**
87: * words, which should not be indexed
88: *
89: * @var array
90: */
91: protected $_stopwords = array();
92:
93: /**
94: * keywords of an article stored in the DB
95: *
96: * @var array
97: */
98: protected $_keywordsOld = array();
99:
100: /**
101: * keywords to be deleted
102: *
103: * @var array
104: */
105: protected $_keywordsDel = array();
106:
107: /**
108: * 'auto' or 'self'
109: *
110: * The field 'auto' in table con_keywords is used for automatic indexing.
111: * The value is a string like "&12=2(CMS_HTMLHEAD-1,CMS_HTML-1)",
112: * which means a keyword occurs 2 times in article with $idart 12
113: * and can be found in CMS_HTMLHEAD[1] and CMS_HTML[1].
114: *
115: * The field 'self' can be used in the article properties to index
116: * the article manually.
117: *
118: * @var string
119: */
120: protected $_place;
121:
122: /**
123: * array of cms types
124: *
125: * @var array
126: */
127: protected $_cmsOptions = array();
128:
129: /**
130: * array of all available cms types
131: *
132: * htmlhead - HTML Headline
133: * html - HTML Text
134: * head - Headline (no HTML)
135: * text - Text (no HTML)
136: * img - Upload id of the element
137: * imgdescr - Image description
138: * link - Link (URL)
139: * linktarget - Linktarget (_self, _blank, _top ...)
140: * linkdescr - Linkdescription
141: * swf - Upload id of the element
142: * etc.
143: *
144: * @var array
145: */
146: protected $_cmsType = array();
147:
148: /**
149: * suffix of all available cms types
150: *
151: * @var array
152: */
153: protected $_cmsTypeSuffix = array();
154:
155: /**
 * Id of the article that is currently being indexed (set by start()).
 *
157: * @var int
158: */
159: protected $idart;
160:
/**
 * Creates the index object and loads the known content types.
 *
 * The database object is handed to the parent constructor; afterwards
 * setContentTypes() fills the content type lookup tables of this instance.
 *
 * @param cDb $db [optional]
 *         CONTENIDO database object
 *
 * @throws cDbException
 * @throws cInvalidArgumentException
 */
public function __construct($db = null) {
    parent::__construct($db);

    // Populate $_cmsType and $_cmsTypeSuffix right away.
    $this->setContentTypes();
}
177:
/**
 * Start indexing the article.
 *
 * Steps, in order: build the keyword list from $aContent, load the index
 * entries that are already stored for these keywords or for this article,
 * write the new index strings and finally remove the article from all
 * stored keywords that no longer occur in its content.
 *
 * @param int $idart Article Id. Indexing is silently skipped when the
 *         value is not numeric or is negative.
 * @param array $aContent The complete content of an article specified by its content types.
 *         It looks like:
 *         [
 *             [CMS_HTMLHEAD] => [
 *                 [1] => Herzlich Willkommen...
 *                 [2] => ...auf Ihrer Website!
 *             ]
 *             [CMS_HTML] => [
 *                 [1] => Die Inhalte auf dieser Website ...
 *             ]
 *         ]
 * @param string $place [optional] The field where to store the index information in db.
 *         The value is used as a column name in the generated SQL.
 * @param array $cms_options [optional] One can specify explicitly cms types which should not be indexed.
 * @param array $aStopwords [optional] Array with words which should not be indexed.
 *
 * @throws cInvalidArgumentException
 * @throws cDbException
 */
public function start($idart, $aContent, $place = 'auto', $cms_options = array(), $aStopwords = array()) {
    // The former check is_int((int) $idart) was always true, so validate the
    // raw value instead and store a real integer.
    if (!is_numeric($idart) || $idart < 0) {
        return;
    }
    $this->idart = (int) $idart;

    // Drop the state of a previously indexed article; otherwise its keywords
    // would be written into the index of the current article when the same
    // object is used more than once.
    $this->_keywords = array();
    $this->_keywordsOld = array();
    $this->_keywordsDel = array();

    $this->_place = $place;
    $this->_keycode = $aContent;
    $this->setStopwords($aStopwords);
    $this->setCmsOptions($cms_options);

    // Collect the keywords of the new content ...
    $this->createKeywords();

    // ... load what is stored in the database so far ...
    $this->getKeywords();

    // ... and write the new index information.
    $this->saveKeywords();

    // Stored keywords that are not part of the new content have to lose
    // their reference to this article.
    $new_keys = array_keys($this->_keywords);
    $old_keys = array_keys($this->_keywordsOld);

    $this->_keywordsDel = array_diff($old_keys, $new_keys);

    if (count($this->_keywordsDel) > 0) {
        $this->deleteKeywords();
    }
}
227:
/**
 * For each cms-type create index structure.
 *
 * Every content entry is stripped of backslashes and HTML, split into
 * words at whitespace, commas and hyphens, lower-cased and cleaned by
 * removeSpecialChars(). Stopwords and terms shorter than two characters
 * are ignored. For every remaining term the string "<type>-<typeid> " is
 * appended to its entry in $_keywords, once per occurrence.
 *
 * Content types for which checkCmsType() returns false are skipped.
 *
 * The result looks like:
 * Array (
 *     [die] => CMS_HTML-1
 *     [inhalte] => CMS_HTML-1
 *     [auf] => CMS_HTML-1 CMS_HTMLHEAD-2
 *     [dieser] => CMS_HTML-1
 *     [website] => CMS_HTML-1 CMS_HTML-1 CMS_HTMLHEAD-2
 * )
 *
 * @throws cInvalidArgumentException
 */
public function createKeywords() {
    // Only create keycodes, if some are available
    if (is_array($this->_keycode)) {
        foreach ($this->_keycode as $idtype => $data) {
            // A content type without a list of entries cannot be iterated.
            if (!is_array($data) || !$this->checkCmsType($idtype)) {
                continue;
            }

            foreach ($data as $typeid => $code) {
                $this->_debug('code', $code);

                // remove backslash
                $code = stripslashes($code);
                // replace HTML line breaks with newlines, so that the words
                // around them are not glued together by strip_tags()
                $code = str_ireplace(array(
                    '<br>',
                    '<br />',
                    '<br/>'
                ), "\n", $code);
                // remove html tags
                $code = strip_tags($code);
                if (cString::getStringLength($code) > 0) {
                    $code = conHtmlEntityDecode($code);
                }
                $this->_debug('code', $code);

                // split content by any number of commas, space
                // characters or hyphens
                $tmp_keys = mb_split('[\s,-]+', trim($code));
                $this->_debug('tmp_keys', $tmp_keys);

                // mb_split() returns false on failure
                if (!is_array($tmp_keys)) {
                    continue;
                }

                foreach ($tmp_keys as $value) {
                    // index terms are stored with lower case
                    $value = conHtmlentities($value);
                    $value = trim(cString::toLowerCase($value));
                    $value = conHtmlEntityDecode($value);

                    // eliminate stopwords
                    if (in_array($value, $this->_stopwords)) {
                        continue;
                    }

                    $value = $this->removeSpecialChars($value);

                    // do not index single characters
                    if (cString::getStringLength($value) > 1) {
                        $entry = $idtype . '-' . $typeid . ' ';

                        // The first occurrence creates the entry; reading a
                        // missing key would raise an "undefined array key"
                        // warning.
                        if (isset($this->_keywords[$value])) {
                            $this->_keywords[$value] .= $entry;
                        } else {
                            $this->_keywords[$value] = $entry;
                        }
                    }
                }
            }
        }
    }

    $this->_debug('keywords', $this->_keywords);
}
296:
/**
 * Generate index_string from index structure and save keywords.
 * The index_string looks like "&12=2(CMS_HTMLHEAD-1,CMS_HTML-1)".
 *
 * For a keyword that is not contained in $_keywordsOld a new row is
 * inserted. For a known keyword the stored index string is updated: the
 * entry of the current article is replaced if there is one, otherwise the
 * new entry is appended.
 *
 * @throws cInvalidArgumentException
 * @throws cDbException
 */
public function saveKeywords() {
    // Integer-cast once, as getKeywords() does, so that the value is safe
    // inside the regular expressions and the SQL below.
    $idart = cSecurity::toInteger($this->idart);
    $idlang = cSecurity::toInteger($this->lang);
    $table = $this->cfg['tab']['keywords'];

    foreach ($this->_keywords as $keyword => $count) {
        // $count is a space separated list with one "<type>-<typeid>" item
        // per occurrence of the keyword.
        $tmp_count = preg_split('/[\s]/', trim($count));
        $this->_debug('tmp_count', $tmp_count);

        $occurrence = count($tmp_count);
        $cms_types = implode(',', array_unique($tmp_count));
        $index_string = '&' . $idart . '=' . $occurrence . '(' . $cms_types . ')';

        if (!array_key_exists($keyword, $this->_keywordsOld)) {
            // if keyword is new, save index information
            $sql = "INSERT INTO " . $table
                . "(keyword, " . $this->_place . ", idlang) "
                . "VALUES"
                . "('" . $this->db->escape($keyword) . "', '" . $this->db->escape($index_string) . "', " . $idlang . ")";
        } else {
            // if keyword already exists, create new index_string
            $stored = (string) $this->_keywordsOld[$keyword];

            if (preg_match('/&' . $idart . '=/', $stored)) {
                $index_string = preg_replace('/&' . $idart . '=[0-9]+\([\w\-,]+\)/', $index_string, $stored);
            } else {
                $index_string = $stored . $index_string;
            }

            // The index string is escaped here exactly like in the INSERT
            // branch above.
            $sql = "UPDATE " . $table . "
                SET " . $this->_place . " = '" . $this->db->escape($index_string) . "'
                WHERE idlang = " . $idlang . " AND keyword = '" . $this->db->escape($keyword) . "'";
        }

        $this->_debug('sql', $sql);
        $this->db->query($sql);
    }
}
339:
/**
 * If keywords don't occur in the article anymore,
 * update index_string and delete keyword if necessary.
 *
 * For every keyword in $_keywordsDel the entry of the current article is
 * cut out of the stored index string. If nothing is left, the keyword row
 * is deleted, otherwise the shortened index string is written back.
 *
 * @throws cInvalidArgumentException
 * @throws cDbException
 */
public function deleteKeywords() {
    // Integer-cast once, as getKeywords() does, so that the value is safe
    // inside the regular expression and the SQL below.
    $idart = cSecurity::toInteger($this->idart);
    $idlang = cSecurity::toInteger($this->lang);
    $table = $this->cfg['tab']['keywords'];

    foreach ($this->_keywordsDel as $key_del) {
        $index_string = preg_replace(
            '/&' . $idart . '=[0-9]+\([\w\-,]+\)/',
            '',
            (string) $this->_keywordsOld[$key_del]
        );

        $where = "WHERE idlang = " . $idlang . " AND keyword = '" . $this->db->escape($key_del) . "'";

        if (cString::getStringLength($index_string) == 0) {
            // keyword is not referenced by any article
            $sql = "DELETE FROM " . $table . "
                " . $where;
        } else {
            // other articles still use the keyword; the remaining index
            // string is escaped like every other value in the statement
            $sql = "UPDATE " . $table . "
                SET " . $this->_place . " = '" . $this->db->escape($index_string) . "'
                " . $where;
        }

        $this->_debug('sql', $sql);
        $this->db->query($sql);
    }
}
364:
/**
 * Get the keywords of an article.
 *
 * Reads all rows of the keyword table (for the current language) whose
 * keyword is contained in $_keywords or whose index field already
 * references the current article, and stores the value of the index
 * field in $_keywordsOld, keyed by keyword.
 *
 * @throws cInvalidArgumentException
 * @throws cDbException
 */
public function getKeywords() {
    // Escape every keyword before it becomes part of the IN (...) list,
    // just like the keyword is escaped in the other statements of this class.
    // The cast is needed because PHP turns numeric array keys into integers.
    $escapedKeys = array();
    foreach (array_keys($this->_keywords) as $keyword) {
        $escapedKeys[] = $this->db->escape((string) $keyword);
    }
    $keys = implode("','", $escapedKeys);

    $sql = "SELECT
                keyword, auto, self
            FROM
                " . $this->cfg['tab']['keywords'] . "
            WHERE
                idlang=" . cSecurity::toInteger($this->lang) . " AND
                (keyword IN ('" . $keys . "') OR " . $this->_place . " REGEXP '&" . cSecurity::toInteger($this->idart) . "=')";

    $this->_debug('sql', $sql);

    $this->db->query($sql);

    $place = $this->_place;

    while ($this->db->nextRecord()) {
        $this->_keywordsOld[$this->db->f('keyword')] = $this->db->f($place);
    }
}
392:
/**
 * Remove special characters from index term.
 *
 * The term is first converted to HTML entities (with a dedicated helper
 * for ISO-8859-2), decoded again and then freed from the punctuation
 * characters listed in the method body. The hyphen is deliberately not
 * part of that list.
 *
 * TODO: The transformation of accented characters must depend on the
 * selected encoding of the language of a client and should not be
 * treated in this method. The entity round trip was added 2007-10-01 by
 * H. Librenz as a hotfix for encoding problems (words with umlauts were
 * not found once UTF-8 was used as language encoding).
 *
 * @param string $key
 *         Keyword
 * @return mixed
 *         The cleaned keyword
 */
public function removeSpecialChars($key) {
    $sEncoding = cRegistry::getEncoding();

    // Entity round trip, see the TODO in the method description.
    if (cString::toLowerCase($sEncoding) == 'iso-8859-2') {
        $sEntities = htmlentities_iso88592($key);
    } else {
        $sEntities = conHtmlentities($key, null, $sEncoding);
    }
    $sDecoded = conHtmlEntityDecode($sEntities);

    // Characters that must not be part of an index term.
    $aStrip = array(
        '_', '\'', '.', '!', '"', '#', '$', '%',
        '&', '(', ')', '*', '+', ',', '/', ':',
        ';', '<', '=', '>', '?', '@', '[', '\\',
        ']', '^', '`', '{', '|', '}', '~', '„'
    );

    return str_replace($aStrip, '', $sDecoded);
}
475:
/**
 * Replaces the transliterations Ue, ue, Ae, ae, Oe, oe and ss by the
 * characters Ü, ü, Ä, ä, Ö, ö and ß.
 *
 * The keyword is converted to HTML entities before the replacement and
 * decoded again afterwards. The replacements are applied one after the
 * other in the order given above.
 *
 * @param string $key
 *         Keyword
 * @return string
 */
public function addSpecialUmlauts($key) {
    $aSearch = array('Ue', 'ue', 'Ae', 'ae', 'Oe', 'oe', 'ss');
    $aReplace = array('Ü', 'ü', 'Ä', 'ä', 'Ö', 'ö', 'ß');

    $sEntities = conHtmlentities($key, null, cRegistry::getEncoding());

    // str_replace() with two arrays works through the pairs sequentially.
    $sMapped = str_replace($aSearch, $aReplace, $sEntities);

    return conHtmlEntityDecode($sMapped);
}
501:
/**
 * Set the array of stopwords which should not be indexed.
 *
 * Anything that is not a non-empty array is ignored, i.e. the stopwords
 * that are currently set stay in place.
 *
 * @param array $aStopwords
 */
public function setStopwords($aStopwords) {
    if (!is_array($aStopwords) || count($aStopwords) === 0) {
        return;
    }

    $this->_stopwords = $aStopwords;
}
512:
/**
 * Set the cms types.
 *
 * Reads all rows of the content type table and fills two lookup tables:
 * $_cmsType maps the type name to its id, $_cmsTypeSuffix maps the id to
 * the type name without its first four characters.
 *
 * @throws cInvalidArgumentException
 * @throws cDbException
 */
public function setContentTypes() {
    $sql = sprintf('SELECT type, idtype FROM %s ', $this->cfg['tab']['type']);
    $this->_debug('sql', $sql);
    $this->db->query($sql);

    while ($this->db->nextRecord()) {
        $sType = $this->db->f('type');
        $iIdType = $this->db->f('idtype');

        $this->_cmsType[$sType] = $iIdType;
        $this->_cmsTypeSuffix[$iIdType] = cString::getPartOfString($sType, 4, cString::getStringLength($sType));
    }
}
528:
/**
 * Set the cms_options array of cms types which should be treated
 * special.
 *
 * Each option may be given with or without the "CMS_" prefix and in any
 * case ("html", "CMS_HTML"). Options that do not match a content type
 * loaded by setContentTypes() are ignored. The list set by a previous
 * call is always replaced; anything but a non-empty array clears it.
 *
 * @param mixed $cms_options
 */
public function setCmsOptions($cms_options) {
    // Start from an empty list so that options do not pile up when the
    // method is called more than once on the same object.
    $this->_cmsOptions = array();

    if (!is_array($cms_options)) {
        return;
    }

    foreach ($cms_options as $opt) {
        $opt = cString::toUpperCase($opt);

        if (cString::getStringLength($opt) > 0) {
            // A match at position 0 is a valid hit, so "not found" has to be
            // compared strictly against false (assumes stripos() semantics
            // of findFirstOccurrenceCI). The former "!" check sent names
            // starting with "CMS_" into the suffix branch, where they were
            // dropped.
            if (cString::findFirstOccurrenceCI($opt, 'cms_') === false) {
                // short form, e.g. "HTML"
                if (in_array($opt, $this->_cmsTypeSuffix)) {
                    $this->_cmsOptions[$opt] = 'CMS_' . $opt;
                }
            } else {
                // full type name, e.g. "CMS_HTML"
                if (array_key_exists($opt, $this->_cmsType)) {
                    $this->_cmsOptions[$opt] = $opt;
                }
            }
        }
    }
}
556:
/**
 * Checks a content type against the configured cms options.
 *
 * The type name is compared in upper case. The result is
 * - true for CMS_RAW,
 * - false if no cms options are set or if the type is one of them,
 * - true in every other case.
 *
 * NOTE(review): the original description reads "should be indexed
 * (false) or not (true)", whereas createKeywords() processes a content
 * type only when this method returns true. The intended polarity should
 * be confirmed.
 *
 * @param string $idtype
 * @return bool
 */
public function checkCmsType($idtype) {
    $sType = cString::toUpperCase($idtype);

    // CMS_RAW is handled independently of the configured options
    if ($sType == "CMS_RAW") {
        return true;
    }

    if (count($this->_cmsOptions) === 0) {
        return false;
    }

    return !in_array($sType, $this->_cmsOptions);
}
574:
/**
 * Returns the content type lookup table filled by setContentTypes().
 *
 * @return array
 *         Content type ids, keyed by content type name
 */
public function getCmsType() {
    $aCmsType = $this->_cmsType;

    return $aCmsType;
}
583:
/**
 * Returns the content type suffix table filled by setContentTypes().
 *
 * @return array
 *         Content type names without their first four characters, keyed
 *         by content type id
 */
public function getCmsTypeSuffix() {
    $aCmsTypeSuffix = $this->_cmsTypeSuffix;

    return $aCmsTypeSuffix;
}
592: }
593: