1: <?php
2:
3: /**
4: * This file contains the base class for building search indices.
5: *
6: * @package Core
7: * @subpackage Frontend_Search
8: * @author Willi Man
9: * @copyright four for business AG <www.4fb.de>
10: * @license http://www.contenido.org/license/LIZENZ.txt
11: * @link http://www.4fb.de
12: * @link http://www.contenido.org
13: */
14:
15: defined('CON_FRAMEWORK') || die('Illegal call: Missing framework initialization - request aborted.');
16:
17: cInclude('includes', 'functions.encoding.php');
18:
/**
 * CONTENIDO API - Search Index Object.
 *
 * This object creates the search index of an article.
 *
 * Create the object, where $db is the global CONTENIDO database object:
 *
 *     $oIndex = new cSearchIndex($db);
 *
 * Start indexing, where $aContent is the complete content of an article
 * grouped by its content types:
 *
 *     $oIndex->start($idart, $aContent);
 *
 * $aContent looks like:
 *
 *     Array (
 *         [CMS_HTMLHEAD] => Array (
 *             [1] => Herzlich Willkommen...
 *             [2] => ...auf Ihrer Website!
 *         )
 *         [CMS_HTML] => Array (
 *             [1] => Die Inhalte auf dieser Website ...
 *         )
 *     )
 *
 * The index for the keyword 'willkommen' would look like
 * '&12=1(CMS_HTMLHEAD-1)', which means the keyword 'willkommen' occurs
 * once in the article with id 12, in content type CMS_HTMLHEAD[1].
 *
 * TODO: The basic idea of the indexing process is to take the complete
 * content of an article, to generate normalized index terms from the
 * content and to store a specific index structure in the relation
 * 'con_keywords'.
 *
 * Taking the complete content is not very flexible. It would be better
 * to differentiate by specific content types or by any content.
 *
 * The string separated by &, =, () and - is not easy to parse when
 * computing the search result set.
 *
 * It would be a better idea (and a lot of work) to extend the relation
 * 'con_keywords' to store keywords by article id (or content source
 * identifier) and content type.
 *
 * The functions removeSpecialChars, setStopwords, setContentTypes and
 * setCmsOptions should be moved into a new helper class.
 *
 * Keep in mind that the classes Search and SearchResult use an instance
 * of this index object.
 *
 * @package Core
 * @subpackage Frontend_Search
 */
class cSearchIndex extends cSearchBaseAbstract {

    /**
     * content of the cms-types of an article
     *
     * @var array
     */
    protected $_keycode = array();

    /**
     * list of keywords of an article
     *
     * Maps each keyword to a space separated list of "TYPE-typeid"
     * occurrences, see createKeywords().
     *
     * @var array
     */
    protected $_keywords = array();

    /**
     * words, which should not be indexed
     *
     * @var array
     */
    protected $_stopwords = array();

    /**
     * keywords of an article stored in the DB
     *
     * Maps each keyword to the index string read from the column named
     * by $_place, see getKeywords().
     *
     * @var array
     */
    protected $_keywordsOld = array();

    /**
     * keywords to be deleted
     *
     * @var array
     */
    protected $_keywordsDel = array();

    /**
     * 'auto' or 'self'
     *
     * The field 'auto' in table con_keywords is used for automatic indexing.
     * The value is a string like "&12=2(CMS_HTMLHEAD-1,CMS_HTML-1)",
     * which means a keyword occurs 2 times in article with $idart 12
     * and can be found in CMS_HTMLHEAD[1] and CMS_HTML[1].
     *
     * The field 'self' can be used in the article properties to index
     * the article manually.
     *
     * @var string
     */
    protected $_place;

    /**
     * array of cms types set by setCmsOptions()
     *
     * Keys are the options as passed in (upper-cased), values are the
     * full type names (e.g. 'CMS_HTML').
     *
     * @var array
     */
    protected $_cmsOptions = array();

    /**
     * array of all available cms types (type name => idtype)
     *
     * htmlhead - HTML Headline
     * html - HTML Text
     * head - Headline (no HTML)
     * text - Text (no HTML)
     * img - Upload id of the element
     * imgdescr - Image description
     * link - Link (URL)
     * linktarget - Linktarget (_self, _blank, _top ...)
     * linkdescr - Linkdescription
     * swf - Upload id of the element
     * etc.
     *
     * @var array
     */
    protected $_cmsType = array();

    /**
     * suffix of all available cms types (idtype => type name without
     * its first four characters)
     *
     * @var array
     */
    protected $_cmsTypeSuffix = array();

    /**
     * id of the article being indexed
     *
     * @var int
     */
    protected $idart;

    /**
     * Constructor to create an instance of this class.
     *
     * Passes $db on to the parent constructor and loads the available
     * content types.
     *
     * @param cDb $db [optional]
     *         CONTENIDO database object
     *
     * @throws cDbException
     * @throws cInvalidArgumentException
     */
    public function __construct($db = null) {
        parent::__construct($db);

        $this->setContentTypes();
    }

    /**
     * Start indexing the article.
     *
     * Returns without doing anything if $idart is not a non-negative
     * number or if $place is neither 'auto' nor 'self'.
     *
     * @param int $idart
     *         Article Id
     * @param array $aContent
     *         The complete content of an article specified by its content types.
     *         It looks like:
     *         Array (
     *             [CMS_HTMLHEAD] => Array (
     *                 [1] => Herzlich Willkommen...
     *                 [2] => ...auf Ihrer Website!
     *             )
     *             [CMS_HTML] => Array (
     *                 [1] => Die Inhalte auf dieser Website ...
     *             )
     *         )
     * @param string $place [optional]
     *         The field where to store the index information in db
     *         ('auto' or 'self').
     * @param array $cms_options [optional]
     *         One can specify explicitly cms types which should not be indexed.
     * @param array $aStopwords [optional]
     *         Array with words which should not be indexed.
     *
     * @throws cInvalidArgumentException
     * @throws cDbException
     */
    public function start($idart, $aContent, $place = 'auto', $cms_options = array(), $aStopwords = array()) {
        // is_int((int) $idart) is always true, so validate the raw value.
        if (!is_numeric($idart) || (int) $idart < 0) {
            return;
        }

        // $place ends up as a column name in the SQL statements below and
        // getKeywords() only selects these two columns.
        if (!in_array($place, array('auto', 'self'), true)) {
            return;
        }

        // Store the id as integer: it is interpolated into regular
        // expressions and into the index string.
        $this->idart = (int) $idart;
        $this->_place = $place;
        $this->_keycode = $aContent;

        // Reset the per-run state so a reused instance does not attribute
        // keywords of a previously indexed article to this one.
        $this->_keywords = array();
        $this->_keywordsOld = array();
        $this->_keywordsDel = array();

        $this->setStopwords($aStopwords);
        $this->setCmsOptions($cms_options);

        $this->createKeywords();
        $this->getKeywords();
        $this->saveKeywords();

        // Keywords known to the DB for this article that no longer occur
        // in its content.
        $this->_keywordsDel = array_diff(array_keys($this->_keywordsOld), array_keys($this->_keywords));

        if (count($this->_keywordsDel) > 0) {
            $this->deleteKeywords();
        }
    }

    /**
     * For each cms-type create index structure.
     *
     * It looks like:
     * Array (
     *     [die] => CMS_HTML-1
     *     [inhalte] => CMS_HTML-1
     *     [auf] => CMS_HTML-1 CMS_HTMLHEAD-2
     *     [dieser] => CMS_HTML-1
     *     [website] => CMS_HTML-1 CMS_HTML-1 CMS_HTMLHEAD-2
     * )
     *
     * A content type is indexed if no cms options are set or if
     * checkCmsType() returns true for it.
     *
     * @throws cInvalidArgumentException
     */
    public function createKeywords() {
        // Only create keycodes, if some are available
        if (is_array($this->_keycode)) {
            // Without any excluded types every content type is indexed.
            // checkCmsType() alone returns false for all types (except
            // CMS_RAW) in that case, which would leave the index empty.
            $indexAllTypes = count($this->_cmsOptions) === 0;

            foreach ($this->_keycode as $idtype => $data) {
                if (!is_array($data) && !($data instanceof Traversable)) {
                    continue;
                }

                if (!$indexAllTypes && !$this->checkCmsType($idtype)) {
                    continue;
                }

                foreach ($data as $typeid => $code) {
                    $this->_debug('code', $code);

                    // remove backslash
                    $code = stripslashes($code);
                    // replace HTML line breaks with newlines
                    $code = str_ireplace(array(
                        '<br>',
                        '<br />'
                    ), "\n", $code);
                    // remove html tags
                    $code = strip_tags($code);
                    if (cString::getStringLength($code) > 0) {
                        $code = conHtmlEntityDecode($code);
                    }
                    $this->_debug('code', $code);

                    // split content by any number of commas, space
                    // characters or hyphens
                    $tmp_keys = mb_split('[\s,-]+', trim($code));
                    $this->_debug('tmp_keys', $tmp_keys);

                    // mb_split() returns false on failure
                    if (!is_array($tmp_keys)) {
                        continue;
                    }

                    foreach ($tmp_keys as $value) {
                        // index terms are stored with lower case
                        $value = conHtmlentities($value);
                        $value = trim(cString::toLowerCase($value));
                        $value = conHtmlEntityDecode($value);

                        // eliminate stopwords
                        if (in_array($value, $this->_stopwords)) {
                            continue;
                        }

                        $value = $this->removeSpecialChars($value);

                        // do not index single characters
                        if (cString::getStringLength($value) > 1) {
                            if (!isset($this->_keywords[$value])) {
                                $this->_keywords[$value] = '';
                            }
                            $this->_keywords[$value] .= $idtype . '-' . $typeid . ' ';
                        }
                    }
                }
            }
        }

        $this->_debug('keywords', $this->_keywords);
    }

    /**
     * Generate index_string from index structure and save keywords.
     * The index_string looks like "&12=2(CMS_HTMLHEAD-1,CMS_HTML-1)".
     *
     * Keywords not present in $_keywordsOld are inserted; for all others
     * the entry of the current article is replaced in (or appended to)
     * the stored index string.
     *
     * @throws cInvalidArgumentException
     * @throws cDbException
     */
    public function saveKeywords() {
        $entryPattern = $this->_getArticleEntryPattern();
        $idart = cSecurity::toInteger($this->idart);

        foreach ($this->_keywords as $keyword => $count) {
            $tmp_count = preg_split('/[\s]/', trim($count));
            $this->_debug('tmp_count', $tmp_count);

            // every list entry is one occurrence, the distinct entries
            // are the places where the keyword was found
            $occurrence = count($tmp_count);
            $cms_types = implode(',', array_unique($tmp_count));
            $index_string = '&' . $this->idart . '=' . $occurrence . '(' . $cms_types . ')';

            if (!array_key_exists($keyword, $this->_keywordsOld)) {
                // if keyword is new, save index information
                $sql = "INSERT INTO " . $this->cfg['tab']['keywords'] . "
                        (keyword, " . $this->_place . ", idlang)
                        VALUES
                        ('" . $this->db->escape($keyword) . "', '" . $this->db->escape($index_string) . "', " . cSecurity::toInteger($this->lang) . ")";
            } else {
                // if keyword already exists, create new index_string
                $old_index_string = (string) $this->_keywordsOld[$keyword];

                if (preg_match('/&' . $idart . '=/', $old_index_string)) {
                    $index_string = preg_replace($entryPattern, $index_string, $old_index_string);
                } else {
                    $index_string = $old_index_string . $index_string;
                }

                // preg_replace() returns null on error; never overwrite
                // the stored index with an empty value in that case
                if ($index_string === null) {
                    continue;
                }

                $sql = "UPDATE " . $this->cfg['tab']['keywords'] . "
                        SET " . $this->_place . " = '" . $this->db->escape($index_string) . "'
                        WHERE idlang='" . cSecurity::toInteger($this->lang) . "' AND keyword='" . $this->db->escape($keyword) . "'";
            }
            $this->_debug('sql', $sql);
            $this->db->query($sql);
        }
    }

    /**
     * If keywords don't occur in the article anymore,
     * update index_string and delete keyword if necessary.
     *
     * NOTE(review): the row is deleted as soon as the column named by
     * $_place becomes empty; the other index column is not looked at
     * here - confirm this is intended.
     *
     * @throws cInvalidArgumentException
     * @throws cDbException
     */
    public function deleteKeywords() {
        $entryPattern = $this->_getArticleEntryPattern();

        foreach ($this->_keywordsDel as $key_del) {
            $old_index_string = isset($this->_keywordsOld[$key_del]) ? (string) $this->_keywordsOld[$key_del] : '';
            $index_string = preg_replace($entryPattern, '', $old_index_string);

            // preg_replace() returns null on error; do not mistake that
            // for "keyword is no longer referenced"
            if ($index_string === null) {
                continue;
            }

            if (cString::getStringLength($index_string) == 0) {
                // keyword is not referenced by any article
                $sql = "DELETE FROM " . $this->cfg['tab']['keywords'] . "
                        WHERE idlang = " . cSecurity::toInteger($this->lang) . " AND keyword = '" . $this->db->escape($key_del) . "'";
            } else {
                $sql = "UPDATE " . $this->cfg['tab']['keywords'] . "
                        SET " . $this->_place . " = '" . $this->db->escape($index_string) . "'
                        WHERE idlang = " . cSecurity::toInteger($this->lang) . " AND keyword = '" . $this->db->escape($key_del) . "'";
            }
            $this->_debug('sql', $sql);
            $this->db->query($sql);
        }
    }

    /**
     * Get the keywords of an article.
     *
     * Reads all rows of the current language whose keyword is one of the
     * freshly created keywords or whose index column references the
     * current article, and stores them in $_keywordsOld.
     *
     * @throws cInvalidArgumentException
     * @throws cDbException
     */
    public function getKeywords() {
        $escapedKeys = array();
        foreach (array_keys($this->_keywords) as $keyword) {
            $escapedKeys[] = $this->db->escape($keyword);
        }
        $keys = implode("','", $escapedKeys);

        $sql = "SELECT
                    keyword, auto, self
                FROM
                    " . $this->cfg['tab']['keywords'] . "
                WHERE
                    idlang=" . cSecurity::toInteger($this->lang) . " AND
                    (keyword IN ('" . $keys . "') OR " . $this->_place . " REGEXP '&" . cSecurity::toInteger($this->idart) . "=')";

        $this->_debug('sql', $sql);

        $this->db->query($sql);

        $place = $this->_place;

        while ($this->db->nextRecord()) {
            $this->_keywordsOld[$this->db->f('keyword')] = $this->db->f($place);
        }
    }

    /**
     * Remove special characters from index term.
     *
     * @param string $key
     *         Keyword
     * @return string
     */
    public function removeSpecialChars($key) {
        // the hyphen is deliberately not part of this list
        $aSpecialChars = array(
            "_",
            "'",
            ".",
            "!",
            "\"",
            "#",
            "$",
            "%",
            "&",
            "(",
            ")",
            "*",
            "+",
            ",",
            "/",
            ":",
            ";",
            "<",
            "=",
            ">",
            "?",
            "@",
            "[",
            "\\",
            "]",
            "^",
            "`",
            "{",
            "|",
            "}",
            "~",
            "„"
        );

        // TODO: The transformation of accented characters must depend
        // on the selected encoding of the language of a client and
        // should not be treated in this method.
        // modified 2007-10-01, H. Librenz - added as hotfix for encoding
        // problems (doesn't find any words with umlaut vowels in it
        // since you turn on UTF-8 as language encoding)
        $sEncoding = cRegistry::getEncoding();

        if (cString::toLowerCase($sEncoding) != 'iso-8859-2') {
            $key = conHtmlentities($key, null, $sEncoding);
        } else {
            $key = htmlentities_iso88592($key);
        }

        $key = conHtmlEntityDecode($key);
        $key = str_replace($aSpecialChars, '', $key);

        return $key;
    }

    /**
     * Replace the transliterations 'ue', 'ae', 'oe' (also capitalized)
     * and 'ss' in a keyword by the corresponding umlaut characters.
     *
     * @param string $key
     *         Keyword
     * @return string
     */
    public function addSpecialUmlauts($key) {
        $key = conHtmlentities($key, null, cRegistry::getEncoding());

        $aUmlautMap = array(
            'Ue' => 'Ü',
            'ue' => 'ü',
            'Ae' => 'Ä',
            'ae' => 'ä',
            'Oe' => 'Ö',
            'oe' => 'ö',
            'ss' => 'ß'
        );

        // str_replace() applies the pairs one after another, in map order
        $key = str_replace(array_keys($aUmlautMap), array_values($aUmlautMap), $key);

        return conHtmlEntityDecode($key);
    }

    /**
     * Set the array of stopwords which should not be indexed.
     *
     * Anything but a non-empty array leaves the current stopwords
     * untouched.
     *
     * @param array $aStopwords
     */
    public function setStopwords($aStopwords) {
        if (is_array($aStopwords) && count($aStopwords) > 0) {
            $this->_stopwords = $aStopwords;
        }
    }

    /**
     * Set the cms types.
     *
     * Reads all types from the type table and fills $_cmsType
     * (type => idtype) and $_cmsTypeSuffix (idtype => type without its
     * first four characters).
     *
     * @throws cInvalidArgumentException
     * @throws cDbException
     */
    public function setContentTypes() {
        $sql = "SELECT type, idtype FROM " . $this->cfg['tab']['type'] . ' ';
        $this->_debug('sql', $sql);
        $this->db->query($sql);

        while ($this->db->nextRecord()) {
            $type = $this->db->f('type');
            $idtype = $this->db->f('idtype');

            $this->_cmsType[$type] = $idtype;
            $this->_cmsTypeSuffix[$idtype] = cString::getPartOfString($type, 4, cString::getStringLength($type));
        }
    }

    /**
     * Set the cms_options array of cms types which should be treated
     * special.
     *
     * Options may be given with or without the 'CMS_' prefix and in any
     * case; unknown types are ignored. Every call replaces the options
     * of a previous call.
     *
     * @param mixed $cms_options
     */
    public function setCmsOptions($cms_options) {
        $this->_cmsOptions = array();

        if (!is_array($cms_options)) {
            return;
        }

        foreach ($cms_options as $opt) {
            $opt = cString::toUpperCase($opt);

            if (cString::getStringLength($opt) > 0) {
                // A prefix at the very start is reported as position 0,
                // which must not be mistaken for "not found".
                // Assumes the helper returns an int position when found
                // and a non-int (false) otherwise - TODO confirm.
                $prefixPosition = cString::findFirstOccurrenceCI($opt, 'cms_');

                if (!is_int($prefixPosition)) {
                    if (in_array($opt, $this->_cmsTypeSuffix)) {
                        $this->_cmsOptions[$opt] = 'CMS_' . $opt;
                    }
                } elseif (array_key_exists($opt, $this->_cmsType)) {
                    $this->_cmsOptions[$opt] = $opt;
                }
            }
        }
    }

    /**
     * Check the requested content type against the cms options.
     *
     * Returns true for 'CMS_RAW' and for every type that is not listed
     * in a non-empty option list. Returns false if no options are set
     * or if the type is listed in the options.
     *
     * createKeywords() treats a return value of true as "index this
     * type".
     *
     * NOTE(review): the inline comment on CMS_RAW and the use in
     * createKeywords() disagree about what true means; the behaviour is
     * left unchanged because other classes may call this method -
     * confirm against the callers.
     *
     * @param string $idtype
     * @return bool
     */
    public function checkCmsType($idtype) {
        $idtype = cString::toUpperCase($idtype);

        // Do not index CMS_RAW
        if ($idtype == "CMS_RAW") {
            return true;
        }

        if (count($this->_cmsOptions) === 0) {
            return false;
        }

        return !in_array($idtype, $this->_cmsOptions);
    }

    /**
     * Returns the property _cmsType.
     *
     * @return array
     */
    public function getCmsType() {
        return $this->_cmsType;
    }

    /**
     * Returns the property _cmsTypeSuffix.
     *
     * @return array
     */
    public function getCmsTypeSuffix() {
        return $this->_cmsTypeSuffix;
    }

    /**
     * Build the regular expression matching the entry of the current
     * article inside an index string, e.g. "&12=2(CMS_HTMLHEAD-1,CMS_HTML-1)".
     *
     * The hyphen is placed last in the character class: PCRE2 rejects
     * "[\w-,]" as an invalid range.
     *
     * @return string
     */
    protected function _getArticleEntryPattern() {
        return '/&' . cSecurity::toInteger($this->idart) . '=[0-9]+\([\w,-]+\)/';
    }
}
598: