1: <?php
2:
3: /**
4: * This file contains the base class for building search indices.
5: *
6: * @package Core
7: * @subpackage Frontend_Search
8: * @author Willi Man
9: * @copyright four for business AG <www.4fb.de>
10: * @license http://www.contenido.org/license/LIZENZ.txt
11: * @link http://www.4fb.de
12: * @link http://www.contenido.org
13: */
14:
15: defined('CON_FRAMEWORK') || die('Illegal call: Missing framework initialization - request aborted.');
16:
17: cInclude('includes', 'functions.encoding.php');
18:
19: /**
20: * CONTENIDO API - Search Index Object.
21: *
22: * This object creates an index of an article.
23: *
24: * Create object where $db is the global CONTENIDO database object.
25: *
 * $oIndex = new cSearchIndex($db);
27: *
28: * Start indexing where $aContent is the complete content of an article
29: * specified by its content types.
30: *
31: * $oIndex->start($idart, $aContent);
32: *
33: * It looks like:
34: * Array (
35: * [CMS_HTMLHEAD] => Array (
36: * [1] => Herzlich Willkommen...
37: * [2] => ...auf Ihrer Website!
38: * )
39: * [CMS_HTML] => Array (
40: * [1] => Die Inhalte auf dieser Website ...
41: *
 * The index for the keyword 'willkommen' would look like
 * '&12=1(CMS_HTMLHEAD-1)', which means that the keyword 'willkommen'
 * occurs once in the article with id 12, in content type CMS_HTMLHEAD[1].
45: *
46: * TODO: The basic idea of the indexing process is to take the complete
47: * content of an article and to generate normalized index terms from the
48: * content and to store a specific index structure in the relation
49: * 'con_keywords'.
50: *
51: * To take the complete content is not very flexible. It would be better
52: * to differentiate by specific content types or by any content.
53: *
 * The string separated by &, =, () and - is not easy to parse when
 * computing the search result set.
56: *
57: * It would be a better idea (and a lot of work) to extend the relation
58: * 'con_keywords' to store keywords by articleId (or content source
59: * identifier) and content type.
60: *
61: * The functions removeSpecialChars, setStopwords, setContentTypes and
62: * setCmsOptions should be sourced out into a new helper-class.
63: *
 * Keep in mind that the classes Search and SearchResult use an instance
 * of this index object.
66: *
67: * @package Core
68: * @subpackage Frontend_Search
69: */
class cSearchIndex extends cSearchBaseAbstract {

    /**
     * Content of the article to be indexed, as handed to start():
     * cms type => (type id => content).
     *
     * @var array
     */
    protected $_keycode = array();

    /**
     * Keywords found in the article content.
     *
     * Maps each keyword to a space separated list of "TYPE-typeid"
     * entries, one entry per occurrence of the keyword.
     *
     * @var array
     */
    protected $_keywords = array();

    /**
     * Words which should not be indexed.
     *
     * @var array
     */
    protected $_stopwords = array();

    /**
     * Index strings already stored in the DB, keyed by keyword.
     *
     * @var array
     */
    protected $_keywordsOld = array();

    /**
     * Keywords which are stored in the DB for the article but do not
     * occur in its content anymore.
     *
     * @var array
     */
    protected $_keywordsDel = array();

    /**
     * 'auto' or 'self'
     *
     * The field 'auto' in table con_keywords is used for automatic indexing.
     * The value is a string like "&12=2(CMS_HTMLHEAD-1,CMS_HTML-1)",
     * which means a keyword occurs 2 times in article with $idart 12
     * and can be found in CMS_HTMLHEAD[1] and CMS_HTML[1].
     *
     * The field 'self' can be used in the article properties to index
     * the article manually.
     *
     * @var string
     */
    protected $_place;

    /**
     * Validated cms type options, see setCmsOptions().
     *
     * @var array
     */
    protected $_cmsOptions = array();

    /**
     * All available cms types (type name => idtype), e.g.
     *
     * htmlhead - HTML Headline
     * html - HTML Text
     * head - Headline (no HTML)
     * text - Text (no HTML)
     * img - Upload id of the element
     * imgdescr - Image description
     * link - Link (URL)
     * linktarget - Linktarget (_self, _blank, _top ...)
     * linkdescr - Linkdescription
     * swf - Upload id of the element
     * etc.
     *
     * @var array
     */
    protected $_cmsType = array();

    /**
     * Suffixes of all available cms types (idtype => type name without
     * its first four characters).
     *
     * @var array
     */
    protected $_cmsTypeSuffix = array();

    /**
     * Id of the article which is currently indexed.
     *
     * @var int
     */
    protected $idart;

    /**
     * Constructor to create an instance of this class.
     *
     * Loads the available content types from the database.
     *
     * @param cDb $db [optional]
     *         CONTENIDO database object
     */
    public function __construct($db = NULL) {
        parent::__construct($db);

        $this->setContentTypes();
    }

    /**
     * Start indexing the article.
     *
     * Nothing is indexed if $idart is not a non-negative number or if
     * $place is neither 'auto' nor 'self'.
     *
     * @param int $idart
     *         Article Id
     * @param array $aContent
     *         The complete content of an article specified by its content types.
     *         It looks like:
     *         Array (
     *             [CMS_HTMLHEAD] => Array (
     *                 [1] => Herzlich Willkommen...
     *                 [2] => ...auf Ihrer Website!
     *             )
     *             [CMS_HTML] => Array (
     *                 [1] => Die Inhalte auf dieser Website ...
     *             )
     *         )
     * @param string $place [optional]
     *         The field where to store the index information in db
     *         ('auto' or 'self').
     * @param array $cms_options [optional]
     *         One can specify explicitly cms types which should not be indexed.
     * @param array $aStopwords [optional]
     *         Array with words which should not be indexed. An empty array
     *         keeps the stopwords set before.
     */
    public function start($idart, $aContent, $place = 'auto', $cms_options = array(), $aStopwords = array()) {
        // The former check "!is_int((int) $idart)" could never fail, so
        // non-numeric ids slipped through.
        if (!is_numeric($idart) || (int) $idart < 0) {
            return;
        }

        // $place is used as a column name in the SQL statements below and
        // getKeywords() only reads the columns 'auto' and 'self'.
        if ($place !== 'auto' && $place !== 'self') {
            return;
        }

        $this->idart = (int) $idart;
        $this->_place = $place;
        $this->_keycode = $aContent;

        // Forget the state of a previously indexed article, otherwise its
        // keywords would be stored for this article, too.
        $this->_keywords = array();
        $this->_keywordsOld = array();
        $this->_keywordsDel = array();

        $this->setStopwords($aStopwords);
        $this->setCmsOptions($cms_options);

        $this->createKeywords();
        $this->getKeywords();
        $this->saveKeywords();

        // Everything which is stored for the article but does not occur in
        // its content anymore has to be removed from the index.
        $this->_keywordsDel = array_diff(array_keys($this->_keywordsOld), array_keys($this->_keywords));

        if (count($this->_keywordsDel) > 0) {
            $this->deleteKeywords();
        }
    }

    /**
     * For each cms-type create index structure.
     *
     * It looks like:
     * Array (
     *     [die] => CMS_HTML-1
     *     [inhalte] => CMS_HTML-1
     *     [auf] => CMS_HTML-1 CMS_HTMLHEAD-2
     *     [dieser] => CMS_HTML-1
     *     [website] => CMS_HTML-1 CMS_HTML-1 CMS_HTMLHEAD-2
     * )
     */
    public function createKeywords() {
        // Only create keywords if some content is available
        if (is_array($this->_keycode)) {
            foreach ($this->_keycode as $idtype => $data) {
                if (!is_array($data) || !$this->_isIndexableType($idtype)) {
                    continue;
                }

                foreach ($data as $typeid => $code) {
                    $this->_debug('code', $code);

                    if (is_array($code) || is_object($code)) {
                        // only plain text content can be tokenized
                        continue;
                    }

                    // remove backslash
                    $code = stripslashes((string) $code);
                    // replace HTML line breaks with newlines, strip_tags()
                    // would glue the neighbouring words together otherwise
                    $code = str_ireplace(array('<br>', '<br/>', '<br />'), "\n", $code);
                    // remove html tags
                    $code = strip_tags($code);
                    if (strlen($code) > 0) {
                        $code = conHtmlEntityDecode($code);
                    }
                    $this->_debug('code', $code);

                    // split content by any number of commas or space
                    // characters
                    $tmp_keys = mb_split('[\s,]+', trim($code));
                    $this->_debug('tmp_keys', $tmp_keys);

                    if (!is_array($tmp_keys)) {
                        continue;
                    }

                    foreach ($tmp_keys as $value) {
                        // Index terms are stored in lower case. The term is
                        // lower-cased in its entity encoded form, presumably
                        // to handle non-ASCII characters - TODO confirm.
                        $value = conHtmlentities($value);
                        $value = trim(strtolower($value));
                        $value = conHtmlEntityDecode($value);

                        // eliminate stopwords
                        if (in_array($value, $this->_stopwords)) {
                            continue;
                        }

                        $value = $this->removeSpecialChars($value);

                        // do not index empty terms and single characters
                        if (strlen($value) <= 1) {
                            continue;
                        }

                        if (!isset($this->_keywords[$value])) {
                            $this->_keywords[$value] = '';
                        }
                        $this->_keywords[$value] .= $idtype . '-' . $typeid . ' ';
                    }
                }
            }
        }

        $this->_debug('keywords', $this->_keywords);
    }

    /**
     * Generate index_string from index structure and save keywords.
     * The index_string looks like "&12=2(CMS_HTMLHEAD-1,CMS_HTML-1)".
     */
    public function saveKeywords() {
        $table = $this->cfg['tab']['keywords'];
        $idlang = cSecurity::toInteger($this->lang);
        $entryPattern = $this->_getArticleEntryPattern();
        $articleMarker = '&' . $this->idart . '=';

        foreach ($this->_keywords as $keyword => $count) {
            // numeric looking keywords are turned into integer array keys
            $keyword = (string) $keyword;

            $tmp_count = preg_split('/[\s]/', trim($count));
            $this->_debug('tmp_count', $tmp_count);

            $occurrence = count($tmp_count);
            $cms_types = implode(',', array_unique($tmp_count));
            $index_string = $articleMarker . $occurrence . '(' . $cms_types . ')';

            if (!array_key_exists($keyword, $this->_keywordsOld)) {
                // if keyword is new, save index information
                $sql = "INSERT INTO " . $table . "
                            (keyword, " . $this->_place . ", idlang)
                        VALUES
                            ('" . $this->db->escape($keyword) . "', '" . $this->db->escape($index_string) . "', " . $idlang . ")";
            } else {
                // if keyword already exists, create new index_string
                $old_index_string = (string) $this->_keywordsOld[$keyword];

                if (strpos($old_index_string, $articleMarker) !== false) {
                    // replace the entry of this article, keep all others
                    $index_string = preg_replace($entryPattern, $index_string, $old_index_string);

                    if ($index_string === NULL) {
                        // never overwrite the stored index with an empty
                        // value because of a regex error
                        $this->_debug('preg_replace failed for keyword', $keyword);
                        continue;
                    }
                } else {
                    $index_string = $old_index_string . $index_string;
                }

                $sql = "UPDATE " . $table . "
                        SET " . $this->_place . " = '" . $this->db->escape($index_string) . "'
                        WHERE idlang = " . $idlang . " AND keyword = '" . $this->db->escape($keyword) . "'";
            }

            $this->_debug('sql', $sql);
            $this->db->query($sql);
        }
    }

    /**
     * If keywords don't occur in the article anymore,
     * update index_string and delete keyword if necessary.
     *
     * NOTE(review): the keyword row is deleted as soon as the current
     * index field is empty, the other field ('auto' / 'self') is not
     * checked here - confirm that this is intended.
     */
    public function deleteKeywords() {
        $table = $this->cfg['tab']['keywords'];
        $idlang = cSecurity::toInteger($this->lang);
        $entryPattern = $this->_getArticleEntryPattern();

        foreach ($this->_keywordsDel as $key_del) {
            // remove the entry of this article from the stored index string
            $index_string = preg_replace($entryPattern, '', (string) $this->_keywordsOld[$key_del]);

            if ($index_string === NULL) {
                // a regex error must not be mistaken for an empty index
                // string, this would delete the whole keyword
                $this->_debug('preg_replace failed for keyword', $key_del);
                continue;
            }

            $where = "WHERE idlang = " . $idlang . " AND keyword = '" . $this->db->escape((string) $key_del) . "'";

            if (strlen($index_string) == 0) {
                // keyword is not referenced by any article
                $sql = "DELETE FROM " . $table . "
                        " . $where;
            } else {
                $sql = "UPDATE " . $table . "
                        SET " . $this->_place . " = '" . $this->db->escape($index_string) . "'
                        " . $where;
            }

            $this->_debug('sql', $sql);
            $this->db->query($sql);
        }
    }

    /**
     * Get the keywords of an article.
     *
     * Reads the stored index strings of all keywords which either occur
     * in the current content or already reference the article and adds
     * them to $_keywordsOld.
     */
    public function getKeywords() {
        // keywords originate from article content, so escape each of them
        $escapedKeys = array();
        foreach (array_keys($this->_keywords) as $keyword) {
            $escapedKeys[] = $this->db->escape((string) $keyword);
        }
        $keys = implode("','", $escapedKeys);

        $sql = "SELECT
                    keyword, auto, self
                FROM
                    " . $this->cfg['tab']['keywords'] . "
                WHERE
                    idlang=" . cSecurity::toInteger($this->lang) . " AND
                    (keyword IN ('" . $keys . "') OR " . $this->_place . " REGEXP '&" . cSecurity::toInteger($this->idart) . "=')";

        $this->_debug('sql', $sql);

        $this->db->query($sql);

        $place = $this->_place;

        while ($this->db->nextRecord()) {
            $this->_keywordsOld[$this->db->f('keyword')] = $this->db->f($place);
        }
    }

    /**
     * Remove special characters from index term.
     *
     * The hyphen is deliberately not removed.
     *
     * @param string $key
     *         Keyword
     * @return mixed
     */
    public function removeSpecialChars($key) {
        $aSpecialChars = array(
            "_",
            "'",
            ".",
            "!",
            "\"",
            "#",
            "$",
            "%",
            "&",
            "(",
            ")",
            "*",
            "+",
            ",",
            "/",
            ":",
            ";",
            "<",
            "=",
            ">",
            "?",
            "@",
            "[",
            "\\",
            "]",
            "^",
            "`",
            "{",
            "|",
            "}",
            "~",
            "„"
        );

        // TODO: The transformation of accented characters must depend
        // on the selected encoding of the language of a client and
        // should not be treated in this method.
        // modified 2007-10-01, H. Librenz - added as hotfix for encoding
        // problems (doesn't find any words with umlaut vowels in it
        // since you turn on UTF-8 as language encoding)
        $sEncoding = cRegistry::getEncoding();

        if (strtolower($sEncoding) != 'iso-8859-2') {
            $key = conHtmlentities($key, NULL, $sEncoding);
        } else {
            $key = htmlentities_iso88592($key);
        }

        $key = conHtmlEntityDecode($key);

        return str_replace($aSpecialChars, '', $key);
    }

    /**
     * Replace the transcriptions ue, ae, oe and ss by the corresponding
     * umlauts / sharp s.
     *
     * Every occurrence is replaced, no matter whether it really is a
     * transcribed umlaut.
     *
     * @param string $key
     *         Keyword
     * @return string
     */
    public function addSpecialUmlauts($key) {
        $key = conHtmlentities($key, NULL, cRegistry::getEncoding());

        $aUmlautMap = array(
            'Ue' => 'Ü',
            'ue' => 'ü',
            'Ae' => 'Ä',
            'ae' => 'ä',
            'Oe' => 'Ö',
            'oe' => 'ö',
            'ss' => 'ß'
        );

        // str_replace() applies the pairs one after another in array order
        $key = str_replace(array_keys($aUmlautMap), array_values($aUmlautMap), $key);

        return conHtmlEntityDecode($key);
    }

    /**
     * Set the array of stopwords which should not be indexed.
     *
     * Anything but a non-empty array is ignored, i.e. the stopwords set
     * before are kept.
     *
     * @param array $aStopwords
     */
    public function setStopwords($aStopwords) {
        if (is_array($aStopwords) && count($aStopwords) > 0) {
            $this->_stopwords = $aStopwords;
        }
    }

    /**
     * Set the cms types.
     *
     * Reads all content types from the type table and fills $_cmsType
     * (type name => idtype) and $_cmsTypeSuffix (idtype => type name
     * without its first four characters).
     */
    public function setContentTypes() {
        $sql = "SELECT type, idtype FROM " . $this->cfg['tab']['type'] . ' ';
        $this->_debug('sql', $sql);
        $this->db->query($sql);

        while ($this->db->nextRecord()) {
            $type = $this->db->f('type');
            $idtype = $this->db->f('idtype');

            $this->_cmsType[$type] = $idtype;
            $this->_cmsTypeSuffix[$idtype] = substr($type, 4);
        }
    }

    /**
     * Set the cms_options array of cms types which should be treated
     * special.
     *
     * Options may be given with or without the prefix 'cms_' in any case,
     * unknown types are ignored. Options set before are always replaced.
     *
     * @param mixed $cms_options
     */
    public function setCmsOptions($cms_options) {
        // always start from scratch, a non-empty list was formerly merged
        // into the options of previous calls
        $this->_cmsOptions = array();

        if (!is_array($cms_options)) {
            return;
        }

        foreach ($cms_options as $opt) {
            $opt = strtoupper($opt);

            if (strlen($opt) === 0) {
                continue;
            }

            if (stripos($opt, 'cms_') === false) {
                // option is given as suffix, e.g. 'html'
                if (in_array($opt, $this->_cmsTypeSuffix)) {
                    $this->_cmsOptions[$opt] = 'CMS_' . $opt;
                }
            } elseif (array_key_exists($opt, $this->_cmsType)) {
                // option is given as complete type name, e.g. 'cms_html'
                $this->_cmsOptions[$opt] = $opt;
            }
        }
    }

    /**
     * Check the requested content type against the cms options.
     *
     * Returns true for CMS_RAW. For all other types it returns false if
     * no cms options are set or if the type is one of the cms options,
     * and true otherwise.
     *
     * When indexing, createKeywords() treats the cms options as the list
     * of types which are not indexed and handles CMS_RAW and an empty
     * option list on its own.
     *
     * @param string $idtype
     * @return bool
     */
    public function checkCmsType($idtype) {
        $idtype = strtoupper($idtype);

        if ($idtype == "CMS_RAW") {
            return true;
        }

        return !(count($this->_cmsOptions) === 0 || in_array($idtype, $this->_cmsOptions));
    }

    /**
     * Returns the property _cmsType.
     *
     * @return array
     */
    public function getCmsType() {
        return $this->_cmsType;
    }

    /**
     * Returns the property _cmsTypeSuffix.
     *
     * @return array
     */
    public function getCmsTypeSuffix() {
        return $this->_cmsTypeSuffix;
    }

    /**
     * Decide whether the content of the given cms type is indexed.
     *
     * CMS_RAW is never indexed. Without cms options every other type is
     * indexed, otherwise all types but the ones listed in the options.
     *
     * @param string $idtype
     * @return bool
     */
    protected function _isIndexableType($idtype) {
        if (strtoupper((string) $idtype) === 'CMS_RAW') {
            return false;
        }

        if (count($this->_cmsOptions) === 0) {
            // no type is excluded
            return true;
        }

        // with options set, checkCmsType() is true for all types which are
        // not part of the options
        return $this->checkCmsType($idtype);
    }

    /**
     * Build the regular expression which matches the entry of the current
     * article in an index string, e.g. "&12=2(CMS_HTMLHEAD-1,CMS_HTML-1)".
     *
     * The hyphen is the last character of the character class, so it is
     * always a literal and never read as a range after \w.
     *
     * @return string
     */
    protected function _getArticleEntryPattern() {
        return '/&' . preg_quote((string) $this->idart, '/') . '=[0-9]+\([\w,-]+\)/';
    }
}
580: