1: <?php
2:
3: /**
4: * This file contains the base class for building search indices
5: *
6: * @package Core
7: * @subpackage Frontend_Search
8: * @version SVN Revision $Rev:$
9: *
10: * @author Willi Man
11: * @copyright four for business AG <www.4fb.de>
12: * @license http://www.contenido.org/license/LIZENZ.txt
13: * @link http://www.4fb.de
14: * @link http://www.contenido.org
15: */
16:
// Abort the request when the CON_FRAMEWORK constant is not defined, i.e. when
// this file is executed without the framework having been initialized first.
defined('CON_FRAMEWORK') || die('Illegal call: Missing framework initialization - request aborted.');

// Pull in functions.encoding.php from the 'includes' area via the framework's
// include helper.
cInclude('includes', 'functions.encoding.php');
20:
21: /**
22: * CONTENIDO API - Search Index Object
23: *
24: * This object creates an index of an article
25: *
26: * Create object with
* $oIndex = new cSearchIndex($db); # where $db is the global CONTENIDO database
* object.
29: * Start indexing with
30: * $oIndex->start($idart, $aContent);
31: * where $aContent is the complete content of an article specified by its
32: * content types.
33: * It looks like
34: * Array (
35: * [CMS_HTMLHEAD] => Array (
36: * [1] => Herzlich Willkommen...
37: * [2] => ...auf Ihrer Website!
38: * )
39: * [CMS_HTML] => Array (
40: * [1] => Die Inhalte auf dieser Website ...
41: *
* The index for keyword 'willkommen' would look like '&12=1(CMS_HTMLHEAD-1)'
* which means the keyword 'willkommen' occurs once in the article with
* articleId 12, in content type CMS_HTMLHEAD[1].
45: *
46: * TODO: The basic idea of the indexing process is to take the complete content
47: * of an article and to generate normalized index terms
48: * from the content and to store a specific index structure in the relation
49: * 'con_keywords'.
50: * To take the complete content is not very flexible. It would be better to
51: * differentiate by specific content types or by any content.
* The string separated by &, =, () and - is not easy to parse when computing
* the search result set.
54: * It would be a better idea (and a lot of work) to extend the relation
55: * 'con_keywords' to store keywords by articleId (or content source identifier)
56: * and content type.
57: * The functions removeSpecialChars, setStopwords, setContentTypes and
58: * setCmsOptions should be sourced out into a new helper-class.
59: * Keep in mind that class Search and SearchResult uses an instance of object
60: * Index.
61: *
62: * @package Core
63: * @subpackage Frontend_Search
64: */
class cSearchIndex extends cSearchBaseAbstract {

    /**
     * The content of the cms types of an article, as handed to start().
     *
     * @var array
     */
    protected $_keycode = array();

    /**
     * The keywords of the article currently being indexed.
     * Maps keyword => space separated list of "<CMS_TYPE>-<typeid>" references.
     *
     * @var array
     */
    protected $_keywords = array();

    /**
     * The words which should not be indexed.
     *
     * @var array
     */
    protected $_stopwords = array();

    /**
     * The keywords of an article stored in the DB.
     * Maps keyword => stored index string of the column named by $_place.
     *
     * @var array
     */
    protected $_keywordsOld = array();

    /**
     * The keywords to be deleted.
     *
     * @var array
     */
    protected $_keywordsDel = array();

    /**
     * 'auto' or 'self'
     * The field 'auto' in table con_keywords is used for automatic indexing.
     * The value is a string like "&12=2(CMS_HTMLHEAD-1,CMS_HTML-1)", which
     * means a keyword occurs 2 times in article with $idart 12
     * and can be found in CMS_HTMLHEAD[1] and CMS_HTML[1].
     * The field 'self' can be used in the article properties to index the
     * article manually.
     *
     * @var string
     */
    protected $_place;

    /**
     * Array of cms types selected through setCmsOptions().
     *
     * @var array
     */
    protected $_cmsOptions = array();

    /**
     * Array of all available cms types (type name => idtype).
     *
     * htmlhead - HTML Headline
     * html - HTML Text
     * head - Headline (no HTML)
     * text - Text (no HTML)
     * img - Upload id of the element
     * imgdescr - Image description
     * link - Link (URL)
     * linktarget - Linktarget (_self, _blank, _top ...)
     * linkdescr - Linkdescription
     * swf - Upload id of the element
     * etc.
     *
     * @var array
     */
    protected $_cmsType = array();

    /**
     * The suffix of all available cms types (idtype => type name without its
     * first four characters).
     *
     * @var array
     */
    protected $_cmsTypeSuffix = array();

    /**
     * Id of the article currently being indexed.
     *
     * @var int
     */
    protected $idart;

    /**
     * Constructor, set object properties.
     *
     * Note: $this->db, $this->cfg, $this->lang and _debug() are not declared
     * in this class; they are presumably provided by cSearchBaseAbstract.
     *
     * @param cDb $db [optional]
     *         CONTENIDO database object
     */
    public function __construct($db = NULL) {
        parent::__construct($db);

        $this->setContentTypes();
    }

    /**
     * Start indexing the article.
     *
     * Builds the keyword list from the content, loads the stored keywords,
     * writes new/changed keywords and finally removes this article from all
     * stored keywords that no longer occur in it.
     *
     * @param int $idart
     *         Article Id. Non-numeric or negative values abort indexing.
     * @param array $aContent
     *         The complete content of an article specified by its content types.
     *         It looks like:
     *         Array (
     *             [CMS_HTMLHEAD] => Array (
     *                 [1] => Herzlich Willkommen...
     *                 [2] => ...auf Ihrer Website!
     *             )
     *             [CMS_HTML] => Array (
     *                 [1] => Die Inhalte auf dieser Website ...
     *             )
     *         )
     * @param string $place [optional]
     *         The field where to store the index information in db
     *         ('auto' or 'self'). The value is used as a column name in the
     *         generated SQL, so it must never come from untrusted input.
     * @param array $cms_options [optional]
     *         One can specify explicitly cms types which should not be indexed.
     * @param array $aStopwords [optional]
     *         Array with words which should not be indexed.
     */
    public function start($idart, $aContent, $place = 'auto', $cms_options = array(), $aStopwords = array()) {
        // The article id is embedded into SQL statements and regular
        // expressions, so only accept non-negative numbers and store an int.
        if (!is_numeric($idart) || (int) $idart < 0) {
            return;
        }
        $this->idart = (int) $idart;

        // Drop the state of a previous run, otherwise keywords of the article
        // indexed before would be attributed to this article as well.
        $this->_keywords = array();
        $this->_keywordsOld = array();
        $this->_keywordsDel = array();

        $this->_place = $place;
        $this->_keycode = $aContent;
        $this->setStopwords($aStopwords);
        $this->setCmsOptions($cms_options);

        $this->createKeywords();

        $this->getKeywords();

        $this->saveKeywords();

        // Stored keywords which reference the article but are not part of its
        // current content have to lose their reference to the article.
        $new_keys = array_keys($this->_keywords);
        $old_keys = array_keys($this->_keywordsOld);

        $this->_keywordsDel = array_diff($old_keys, $new_keys);

        if (count($this->_keywordsDel) > 0) {
            $this->deleteKeywords();
        }
    }

    /**
     * For each cms-type create index structure.
     * It looks like:
     * Array (
     *     [die] => CMS_HTML-1
     *     [inhalte] => CMS_HTML-1
     *     [auf] => CMS_HTML-1 CMS_HTMLHEAD-2
     *     [dieser] => CMS_HTML-1
     *     [website] => CMS_HTML-1 CMS_HTML-1 CMS_HTMLHEAD-2
     * )
     *
     * Content types are processed only if checkCmsType() returns true for
     * them.
     */
    public function createKeywords() {
        // Only create keywords, if some content is available
        if (is_array($this->_keycode)) {
            foreach ($this->_keycode as $idtype => $data) {
                if (!is_array($data) || !$this->checkCmsType($idtype)) {
                    continue;
                }

                foreach ($data as $typeid => $code) {
                    $this->_debug('code', $code);

                    // Values which can not be treated as text are skipped
                    if (!is_scalar($code)) {
                        continue;
                    }

                    // remove backslash
                    $code = stripslashes((string) $code);
                    // replace HTML line breaks with newlines, so that words
                    // around a line break do not get glued together
                    $code = str_ireplace(array(
                        '<br>',
                        '<br />'
                    ), "\n", $code);
                    // remove html tags
                    $code = strip_tags($code);
                    if (strlen($code) > 0) {
                        $code = conHtmlEntityDecode($code);
                    }
                    $this->_debug('code', $code);

                    // split content by any number of commas or space
                    // characters
                    $tmp_keys = mb_split('[\s,]+', trim($code));
                    $this->_debug('tmp_keys', $tmp_keys);

                    if (!is_array($tmp_keys)) {
                        // mb_split failed, nothing to index for this entry
                        continue;
                    }

                    foreach ($tmp_keys as $value) {
                        // Index terms are stored in lower case. Encoding to
                        // entities first lets strtolower() also lower the
                        // entity names of non-ASCII characters.
                        $value = conHtmlentities($value);
                        $value = trim(strtolower($value));
                        $value = conHtmlEntityDecode($value);

                        // eliminate stopwords
                        if (in_array($value, $this->_stopwords, true)) {
                            continue;
                        }

                        $value = $this->removeSpecialChars($value);

                        // do not index single characters, and eliminate
                        // stopwords which only match after the special
                        // characters have been removed (e.g. "the.")
                        if (strlen($value) <= 1 || in_array($value, $this->_stopwords, true)) {
                            continue;
                        }

                        $reference = $idtype . '-' . $typeid . ' ';
                        if (isset($this->_keywords[$value])) {
                            $this->_keywords[$value] .= $reference;
                        } else {
                            $this->_keywords[$value] = $reference;
                        }
                    }
                }
            }
        }

        $this->_debug('keywords', $this->_keywords);
    }

    /**
     * Generate index_string from index structure and save keywords.
     * The index_string looks like "&12=2(CMS_HTMLHEAD-1,CMS_HTML-1)".
     *
     * Keywords which are not stored yet are inserted. For stored keywords the
     * entry of the current article is replaced, or appended if the article
     * was not referenced before.
     */
    public function saveKeywords() {
        $idart = (int) $this->idart;
        $idlang = cSecurity::toInteger($this->lang);
        $pattern = $this->_getArticleIndexPattern();

        foreach ($this->_keywords as $keyword => $count) {
            // numeric keywords are converted to int array keys by PHP
            $keyword = (string) $keyword;

            $tmp_count = preg_split('/[\s]/', trim($count));
            $this->_debug('tmp_count', $tmp_count);

            // every reference counts as one occurrence, the list of cms types
            // itself is stored without duplicates
            $occurrence = count($tmp_count);
            $cms_types = implode(',', array_unique($tmp_count));
            $index_string = '&' . $idart . '=' . $occurrence . '(' . $cms_types . ')';

            if (!array_key_exists($keyword, $this->_keywordsOld)) {
                // if keyword is new, save index information
                $sql = "INSERT INTO " . $this->cfg['tab']['keywords'] . "
                        (keyword, " . $this->_place . ", idlang)
                        VALUES
                        ('" . $this->db->escape($keyword) . "', '" . $this->db->escape($index_string) . "', " . $idlang . ")";
            } else {
                // if keyword already exists, create new index_string
                $stored = (string) $this->_keywordsOld[$keyword];

                if (strpos($stored, '&' . $idart . '=') !== false) {
                    $index_string = (string) preg_replace($pattern, $index_string, $stored);
                } else {
                    $index_string = $stored . $index_string;
                }

                $sql = "UPDATE " . $this->cfg['tab']['keywords'] . "
                        SET " . $this->_place . " = '" . $this->db->escape($index_string) . "'
                        WHERE idlang = " . $idlang . " AND keyword = '" . $this->db->escape($keyword) . "'";
            }
            $this->_debug('sql', $sql);
            $this->db->query($sql);
        }
    }

    /**
     * If keywords don't occur in the article anymore, update index_string and
     * delete keyword if necessary.
     *
     * NOTE(review): a keyword row is deleted as soon as the column named by
     * $_place becomes empty; the other index column is not inspected here.
     * Confirm that this is intended.
     */
    public function deleteKeywords() {
        $idlang = cSecurity::toInteger($this->lang);
        $pattern = $this->_getArticleIndexPattern();

        foreach ($this->_keywordsDel as $key_del) {
            $key_del = (string) $key_del;

            // cut the entry of the current article out of the stored string
            $index_string = (string) preg_replace($pattern, '', (string) $this->_keywordsOld[$key_del]);

            if (strlen($index_string) === 0) {
                // keyword is not referenced by any article
                $sql = "DELETE FROM " . $this->cfg['tab']['keywords'] . "
                        WHERE idlang = " . $idlang . " AND keyword = '" . $this->db->escape($key_del) . "'";
            } else {
                $sql = "UPDATE " . $this->cfg['tab']['keywords'] . "
                        SET " . $this->_place . " = '" . $this->db->escape($index_string) . "'
                        WHERE idlang = " . $idlang . " AND keyword = '" . $this->db->escape($key_del) . "'";
            }
            $this->_debug('sql', $sql);
            $this->db->query($sql);
        }
    }

    /**
     * Get the keywords of an article.
     *
     * Loads all stored keywords of the current language which are either part
     * of the freshly created keyword list or which reference the current
     * article, and stores their index strings in $_keywordsOld.
     */
    public function getKeywords() {
        $escapedKeys = array();
        foreach (array_keys($this->_keywords) as $keyword) {
            $escapedKeys[] = $this->db->escape((string) $keyword);
        }
        $keys = implode("','", $escapedKeys);

        $sql = "SELECT
                    keyword, auto, self
                FROM
                    " . $this->cfg['tab']['keywords'] . "
                WHERE
                    idlang=" . cSecurity::toInteger($this->lang) . " AND
                    (keyword IN ('" . $keys . "') OR " . $this->_place . " REGEXP '&" . cSecurity::toInteger($this->idart) . "=')";

        $this->_debug('sql', $sql);

        $this->db->query($sql);

        $place = $this->_place;

        while ($this->db->nextRecord()) {
            $this->_keywordsOld[$this->db->f('keyword')] = $this->db->f($place);
        }
    }

    /**
     * Remove special characters from index term.
     *
     * @param string $key
     *         Keyword
     * @return mixed
     *         The keyword without the special characters listed below.
     */
    public function removeSpecialChars($key) {
        $aSpecialChars = array(
            /*"-",*/
            "_",
            "'",
            ".",
            "!",
            "\"",
            "#",
            "$",
            "%",
            "&",
            "(",
            ")",
            "*",
            "+",
            ",",
            "/",
            ":",
            ";",
            "<",
            "=",
            ">",
            "?",
            "@",
            "[",
            "\\",
            "]",
            "^",
            "`",
            "{",
            "|",
            "}",
            "~",
            "„"
        );

        // TODO: The transformation of accented characters must depend on the
        // selected encoding of the language of
        // a client and should not be treated in this method.
        // modified 2007-10-01, H. Librenz - added as hotfix for encoding
        // problems (doesn't find any words with
        // umlaut vowels in it since you turn on UTF-8 as language encoding)
        $sEncoding = cRegistry::getEncoding();

        // Round trip through HTML entities using the configured encoding;
        // iso-8859-2 is handled by its own encoding function.
        if (strtolower((string) $sEncoding) != 'iso-8859-2') {
            $key = conHtmlentities($key, NULL, $sEncoding);
        } else {
            $key = htmlentities_iso88592($key);
        }

        $key = conHtmlEntityDecode($key);
        $key = str_replace($aSpecialChars, '', $key);

        return $key;
    }

    /**
     * Replace the transcriptions 'ue', 'ae', 'oe' (also capitalized) and 'ss'
     * in a keyword with the corresponding umlaut characters.
     *
     * @param string $key
     *         Keyword
     * @return string
     */
    public function addSpecialUmlauts($key) {
        $key = conHtmlentities($key, NULL, cRegistry::getEncoding());
        $aUmlautMap = array(
            'Ue' => 'Ü',
            'ue' => 'ü',
            'Ae' => 'Ä',
            'ae' => 'ä',
            'Oe' => 'Ö',
            'oe' => 'ö',
            'ss' => 'ß'
        );

        // str_replace() applies the pairs one after another in array order
        $key = str_replace(array_keys($aUmlautMap), array_values($aUmlautMap), $key);

        return conHtmlEntityDecode($key);
    }

    /**
     * Set the array of stopwords which should not be indexed.
     * An empty array or a non-array value leaves the current stopwords
     * untouched.
     *
     * @param array $aStopwords
     */
    public function setStopwords($aStopwords) {
        if (is_array($aStopwords) && count($aStopwords) > 0) {
            $this->_stopwords = $aStopwords;
        }
    }

    /**
     * Set the cms types.
     * Reads all types from the type table and fills $_cmsType and
     * $_cmsTypeSuffix.
     */
    public function setContentTypes() {
        $sql = "SELECT type, idtype FROM " . $this->cfg['tab']['type'] . ' ';
        $this->_debug('sql', $sql);
        $this->db->query($sql);
        while ($this->db->nextRecord()) {
            $type = $this->db->f('type');
            $idtype = $this->db->f('idtype');

            $this->_cmsType[$type] = $idtype;
            // the suffix is the type name without its first four characters
            $this->_cmsTypeSuffix[$idtype] = substr($type, 4);
        }
    }

    /**
     * Set the cms_options array of cms types which should be treated special.
     *
     * Options may be given with or without the "CMS_" prefix and in any case;
     * unknown types are ignored. Every call replaces the options of earlier
     * calls.
     *
     * @param mixed $cms_options
     */
    public function setCmsOptions($cms_options) {
        // always start from scratch, so options do not pile up over calls
        $this->_cmsOptions = array();

        if (!is_array($cms_options)) {
            return;
        }

        foreach ($cms_options as $opt) {
            $opt = strtoupper($opt);

            if (strlen($opt) === 0) {
                continue;
            }

            if (stripos($opt, 'cms_') === false) {
                // option was given as suffix, e.g. "HTML"
                if (in_array($opt, $this->_cmsTypeSuffix, true)) {
                    $this->_cmsOptions[$opt] = 'CMS_' . $opt;
                }
            } elseif (array_key_exists($opt, $this->_cmsType)) {
                // option was given as full type name, e.g. "CMS_HTML"
                $this->_cmsOptions[$opt] = $opt;
            }
        }
    }

    /**
     * Check a content type against the configured cms options.
     *
     * Returns true for CMS_RAW and for every type which is not part of a
     * non-empty $_cmsOptions list; returns false if $_cmsOptions is empty or
     * contains the type.
     *
     * NOTE(review): createKeywords() indexes a content type when this method
     * returns true, while the former description of this method stated that
     * true means "do not index" (and the CMS_RAW branch is commented with
     * "Do not index CMS_RAW"). The behaviour is left unchanged because other
     * callers may rely on it - confirm the intended meaning.
     *
     * @param string $idtype
     * @return bool
     */
    public function checkCmsType($idtype) {
        $idtype = strtoupper($idtype);

        // Do not index CMS_RAW
        if ($idtype == "CMS_RAW") {
            return true;
        }

        if (count($this->_cmsOptions) === 0) {
            return false;
        }

        return !in_array($idtype, $this->_cmsOptions);
    }

    /**
     *
     * @return array
     *         the _cmsType property
     */
    public function getCmsType() {
        return $this->_cmsType;
    }

    /**
     *
     * @return array
     *         the _cmsTypeSuffix property
     */
    public function getCmsTypeSuffix() {
        return $this->_cmsTypeSuffix;
    }

    /**
     * Build the regular expression matching the entry of the current article
     * inside a stored index string, e.g. "&12=2(CMS_HTMLHEAD-1,CMS_HTML-1)".
     *
     * The hyphen is the last character of the character class on purpose:
     * PCRE2 (PHP >= 7.3) rejects "[\w-,]" as an invalid range.
     *
     * @return string
     */
    protected function _getArticleIndexPattern() {
        return '/&' . (int) $this->idart . '=[0-9]+\([\w,-]+\)/';
    }
}
567: