1: <?php
2: /**
3: * This file contains the base class for building search indices
4: *
5: * @package Core
6: * @subpackage Frontend_Search
7: * @version SVN Revision $Rev:$
8: *
9: * @author Willi Man
10: * @copyright four for business AG <www.4fb.de>
11: * @license http://www.contenido.org/license/LIZENZ.txt
12: * @link http://www.4fb.de
13: * @link http://www.contenido.org
14: */
15: defined('CON_FRAMEWORK') || die('Illegal call: Missing framework initialization - request aborted.');
16:
17: cInclude('includes', 'functions.encoding.php');
18:
/**
 * CONTENIDO API - Search Index Object
 *
 * This object creates an index of an article.
 *
 * Create object with
 * $oIndex = new cSearchIndex($db); # where $db is the global CONTENIDO database
 * object.
 * Start indexing with
 * $oIndex->start($idart, $aContent);
 * where $aContent is the complete content of an article specified by its
 * content types.
 * It looks like
 * Array (
 *     [CMS_HTMLHEAD] => Array (
 *         [1] => Herzlich Willkommen...
 *         [2] => ...auf Ihrer Website!
 *     )
 *     [CMS_HTML] => Array (
 *         [1] => Die Inhalte auf dieser Website ...
 *
 * The index for keyword 'willkommen' would look like '&12=1(CMS_HTMLHEAD-1)'
 * which means the keyword 'willkommen' occurs once in the article with
 * articleId 12 and content type CMS_HTMLHEAD[1].
 *
 * TODO: The basic idea of the indexing process is to take the complete content
 * of an article and to generate normalized index terms
 * from the content and to store a specific index structure in the relation
 * 'con_keywords'.
 * To take the complete content is not very flexible. It would be better to
 * differentiate by specific content types or by any content.
 * The &, =, () and - separated string is not easy to parse to compute the
 * search result set.
 * It would be a better idea (and a lot of work) to extend the relation
 * 'con_keywords' to store keywords by articleId (or content source identifier)
 * and content type.
 * The functions removeSpecialChars, setStopwords, setContentTypes and
 * setCmsOptions should be sourced out into a new helper-class.
 * Keep in mind that the classes Search and SearchResult use an instance of
 * object Index.
 *
 * @package Core
 * @subpackage Frontend_Search
 */
class cSearchIndex extends cSearchBaseAbstract {

    /**
     * the content of the cms-types of an article
     *
     * @var array
     */
    protected $_keycode = array();

    /**
     * the list of keywords of an article
     *
     * Maps each keyword to a space separated list of "TYPE-typeid"
     * occurrences, one entry per occurrence.
     *
     * @var array
     */
    protected $_keywords = array();

    /**
     * the words, which should not be indexed
     *
     * @var array
     */
    protected $_stopwords = array();

    /**
     * the keywords of an article stored in the DB
     *
     * Maps each keyword to the index string stored in the column named by
     * $_place.
     *
     * @var array
     */
    protected $_keywordsOld = array();

    /**
     * the keywords to be deleted
     *
     * @var array
     */
    protected $_keywordsDel = array();

    /**
     * 'auto' or 'self'
     * The field 'auto' in table con_keywords is used for automatic indexing.
     * The value is a string like "&12=2(CMS_HTMLHEAD-1,CMS_HTML-1)", which
     * means a keyword occurs 2 times in article with $idart 12
     * and can be found in CMS_HTMLHEAD[1] and CMS_HTML[1].
     * The field 'self' can be used in the article properties to index the
     * article manually.
     *
     * @var string
     */
    protected $_place;

    /**
     * array of cms types
     *
     * @var array
     */
    protected $_cmsOptions = array();

    /**
     * array of all available cms types
     *
     * htmlhead - HTML Headline
     * html - HTML Text
     * head - Headline (no HTML)
     * text - Text (no HTML)
     * img - Upload id of the element
     * imgdescr - Image description
     * link - Link (URL)
     * linktarget - Linktarget (_self, _blank, _top ...)
     * linkdescr - Linkdescription
     * swf - Upload id of the element
     * etc.
     *
     * @var array
     */
    protected $_cmsType = array();

    /**
     * the suffix of all available cms types
     *
     * @var array
     */
    protected $_cmsTypeSuffix = array();

    /**
     * id of the article which is currently indexed
     *
     * @var int
     */
    protected $idart;

    /**
     * Constructor, set object properties
     *
     * Loads the available content types right away (see setContentTypes()).
     *
     * @param cDb $db CONTENIDO Database object
     */
    public function __construct($db = NULL) {
        parent::__construct($db);

        $this->setContentTypes();
    }

    /**
     * Start indexing the article.
     *
     * Builds the keyword list from the given content, loads the index
     * entries already stored, writes new/changed entries and finally removes
     * the article from keywords it no longer contains.
     *
     * Nothing happens if $idart is not a non-negative number.
     *
     * @param int $idart Article Id
     * @param array $aContent The complete content of an article specified by
     *        its content types.
     *        It looks like
     *        Array (
     *            [CMS_HTMLHEAD] => Array (
     *                [1] => Herzlich Willkommen...
     *                [2] => ...auf Ihrer Website!
     *            )
     *            [CMS_HTML] => Array (
     *                [1] => Die Inhalte auf dieser Website ...
     *
     * @param string $place The field where to store the index information in
     *        db.
     * @param array $cms_options One can specify explicitly cms types which
     *        should not be indexed.
     * @param array $aStopwords Array with words which should not be indexed.
     */
    public function start($idart, $aContent, $place = 'auto', $cms_options = array(), $aStopwords = array()) {
        // The former guard "!is_int((int) $idart)" could never fail, so
        // non-numeric ids ended up in the SQL statements and patterns below.
        if (!is_numeric($idart) || (int) $idart < 0) {
            return;
        }
        $this->idart = (int) $idart;

        // Forget the state of a previous run, otherwise keywords of the
        // article indexed before would be stored for this article as well.
        $this->_keywords = array();
        $this->_keywordsOld = array();
        $this->_keywordsDel = array();

        $this->_place = $place;
        $this->_keycode = $aContent;
        $this->setStopwords($aStopwords);
        $this->setCmsOptions($cms_options);

        $this->createKeywords();

        $this->getKeywords();

        $this->saveKeywords();

        // Keywords which are stored for the article but do not occur in its
        // content anymore.
        $new_keys = array_keys($this->_keywords);
        $old_keys = array_keys($this->_keywordsOld);

        $this->_keywordsDel = array_diff($old_keys, $new_keys);

        if (count($this->_keywordsDel) > 0) {
            $this->deleteKeywords();
        }
    }

    /**
     * for each cms-type create index structure.
     * it looks like
     * Array (
     *     [die] => CMS_HTML-1
     *     [inhalte] => CMS_HTML-1
     *     [auf] => CMS_HTML-1 CMS_HTMLHEAD-2
     *     [dieser] => CMS_HTML-1
     *     [website] => CMS_HTML-1 CMS_HTML-1 CMS_HTMLHEAD-2
     * )
     */
    public function createKeywords() {
        // Only create keycodes, if some are available
        if (is_array($this->_keycode)) {
            // checkCmsType() answers false for every type while no cms
            // options are set, which made the documented default call
            // start($idart, $aContent) index nothing at all. An empty list
            // of excluded types excludes nothing.
            $indexAllTypes = count($this->_cmsOptions) === 0;

            foreach ($this->_keycode as $idtype => $data) {
                if (!is_array($data)) {
                    continue;
                }
                if (!$indexAllTypes && !$this->checkCmsType($idtype)) {
                    continue;
                }

                foreach ($data as $typeid => $code) {
                    $this->_debug('code', $code);

                    if (!is_scalar($code)) {
                        // nothing to tokenize
                        continue;
                    }

                    // remove backslash
                    $code = stripslashes($code);
                    // replace HTML line breaks with newlines
                    $code = str_ireplace(array(
                        '<br>',
                        '<br />'
                    ), "\n", $code);
                    // remove html tags
                    $code = strip_tags($code);
                    if (strlen($code) > 0) {
                        $code = conHtmlEntityDecode($code);
                    }
                    $this->_debug('code', $code);

                    // split content by any number of commas or space
                    // characters
                    $tmp_keys = mb_split('[\s,]+', trim($code));
                    $this->_debug('tmp_keys', $tmp_keys);

                    if (!is_array($tmp_keys)) {
                        // mb_split() failed
                        continue;
                    }

                    foreach ($tmp_keys as $value) {
                        // index terms are stored with lower case; the
                        // entity round trip lets strtolower() work on
                        // entity names instead of raw accented characters
                        $value = conHtmlentities($value);
                        $value = trim(strtolower($value));
                        $value = conHtmlEntityDecode($value);

                        // eliminate stopwords
                        if (in_array($value, $this->_stopwords)) {
                            continue;
                        }

                        $value = $this->removeSpecialChars($value);

                        // do not index single characters
                        if (strlen($value) > 1) {
                            if (!isset($this->_keywords[$value])) {
                                $this->_keywords[$value] = '';
                            }
                            $this->_keywords[$value] .= $idtype . '-' . $typeid . ' ';
                        }
                    }
                }
            }
        }

        $this->_debug('keywords', $this->_keywords);
    }

    /**
     * generate index_string from index structure and save keywords
     * The index_string looks like "&12=2(CMS_HTMLHEAD-1,CMS_HTML-1)"
     *
     * Keywords unknown to the DB (see getKeywords()) are inserted, for known
     * keywords the entry of the current article is replaced or appended.
     */
    public function saveKeywords() {
        $entryPattern = $this->_getIndexEntryPattern();

        foreach ($this->_keywords as $keyword => $count) {
            $tmp_count = preg_split('/[\s]/', trim($count));
            $this->_debug('tmp_count', $tmp_count);

            $occurrence = count($tmp_count);
            $cms_types = implode(',', array_unique($tmp_count));
            $index_string = '&' . $this->idart . '=' . $occurrence . '(' . $cms_types . ')';

            // numeric keywords are integer array keys
            $escapedKeyword = $this->db->escape((string) $keyword);

            if (!array_key_exists($keyword, $this->_keywordsOld)) {
                // if keyword is new, save index information
                $sql = "INSERT INTO " . $this->cfg['tab']['keywords'] . "
                            (keyword, " . $this->_place . ", idlang)
                        VALUES
                            ('" . $escapedKeyword . "', '" . $this->db->escape($index_string) . "', " . cSecurity::toInteger($this->lang) . ")";
            } else {
                // if keyword already exists, create new index_string
                $old_string = (string) $this->_keywordsOld[$keyword];

                if (preg_match('/&' . preg_quote((string) $this->idart, '/') . '=/', $old_string)) {
                    $index_string = preg_replace($entryPattern, $index_string, $old_string);
                    if ($index_string === NULL) {
                        // never overwrite the stored index with an empty
                        // value because of a regex error
                        $this->_debug('preg_last_error', preg_last_error());
                        continue;
                    }
                } else {
                    $index_string = $old_string . $index_string;
                }

                $sql = "UPDATE " . $this->cfg['tab']['keywords'] . "
                        SET " . $this->_place . " = '" . $this->db->escape($index_string) . "'
                        WHERE idlang = " . cSecurity::toInteger($this->lang) . " AND keyword = '" . $escapedKeyword . "'";
            }
            $this->_debug('sql', $sql);
            $this->db->query($sql);
        }
    }

    /**
     * if keywords don't occur in the article anymore, update index_string and
     * delete keyword if necessary
     */
    public function deleteKeywords() {
        $entryPattern = $this->_getIndexEntryPattern();

        foreach ($this->_keywordsDel as $key_del) {
            if (!array_key_exists($key_del, $this->_keywordsOld)) {
                continue;
            }

            $index_string = preg_replace($entryPattern, '', (string) $this->_keywordsOld[$key_del]);
            if ($index_string === NULL) {
                // A regex error must not be mistaken for "no article left",
                // that would delete keywords other articles still use.
                $this->_debug('preg_last_error', preg_last_error());
                continue;
            }

            $escapedKeyword = $this->db->escape((string) $key_del);

            if (strlen($index_string) == 0) {
                // keyword is not referenced by any article
                $sql = "DELETE FROM " . $this->cfg['tab']['keywords'] . "
                        WHERE idlang = " . cSecurity::toInteger($this->lang) . " AND keyword = '" . $escapedKeyword . "'";
            } else {
                $sql = "UPDATE " . $this->cfg['tab']['keywords'] . "
                        SET " . $this->_place . " = '" . $this->db->escape($index_string) . "'
                        WHERE idlang = " . cSecurity::toInteger($this->lang) . " AND keyword = '" . $escapedKeyword . "'";
            }
            $this->_debug('sql', $sql);
            $this->db->query($sql);
        }
    }

    /**
     * get the keywords of an article
     *
     * Loads into $_keywordsOld the stored index strings of all keywords
     * which either occur in the current content or reference the current
     * article in the column named by $_place.
     */
    public function getKeywords() {
        $escapedKeys = array();
        foreach (array_keys($this->_keywords) as $keyword) {
            $escapedKeys[] = $this->db->escape((string) $keyword);
        }
        $keys = implode("','", $escapedKeys);

        $sql = "SELECT
                    keyword, auto, self
                FROM
                    " . $this->cfg['tab']['keywords'] . "
                WHERE
                    idlang=" . cSecurity::toInteger($this->lang) . " AND
                    (keyword IN ('" . $keys . "') OR " . $this->_place . " REGEXP '&" . cSecurity::toInteger($this->idart) . "=')";

        $this->_debug('sql', $sql);

        $this->db->query($sql);

        $place = $this->_place;

        while ($this->db->nextRecord()) {
            $this->_keywordsOld[$this->db->f('keyword')] = $this->db->f($place);
        }
    }

    /**
     * remove special characters from index term
     *
     * @param string $key Keyword
     * @return mixed
     */
    public function removeSpecialChars($key) {
        // the hyphen "-" is intentionally not part of this list
        $aSpecialChars = array(
            "_",
            "'",
            ".",
            "!",
            "\"",
            "#",
            "$",
            "%",
            "&",
            "(",
            ")",
            "*",
            "+",
            ",",
            "/",
            ":",
            ";",
            "<",
            "=",
            ">",
            "?",
            "@",
            "[",
            "\\",
            "]",
            "^",
            "`",
            "{",
            "|",
            "}",
            "~",
            "„"
        );

        // TODO: The transformation of accented characters must depend on the
        // selected encoding of the language of
        // a client and should not be treated in this method.
        // modified 2007-10-01, H. Librenz - added as hotfix for encoding
        // problems (doesn't find any words with
        // umlaut vowels in it since you turn on UTF-8 as language encoding)
        $sEncoding = getEncodingByLanguage($this->db, $this->lang);

        if (strtolower($sEncoding) != 'iso-8859-2') {
            $key = conHtmlentities($key, NULL, $sEncoding);
        } else {
            $key = htmlentities_iso88592($key);
        }

        $key = conHtmlEntityDecode($key);
        $key = str_replace($aSpecialChars, '', $key);

        return $key;
    }

    /**
     * Replaces the transcriptions ue, ae, oe (also capitalized) and ss by
     * the corresponding umlaut respectively sharp s.
     *
     * @param string $key Keyword
     * @return string
     */
    public function addSpecialUmlauts($key) {
        $key = conHtmlentities($key, NULL, getEncodingByLanguage($this->db, $this->lang));
        $aUmlautMap = array(
            'Ue' => 'Ü',
            'ue' => 'ü',
            'Ae' => 'Ä',
            'ae' => 'ä',
            'Oe' => 'Ö',
            'oe' => 'ö',
            'ss' => 'ß'
        );

        // str_replace() applies the pairs one after another in array order
        $key = str_replace(array_keys($aUmlautMap), array_values($aUmlautMap), $key);

        $key = conHtmlEntityDecode($key);
        return $key;
    }

    /**
     * set the array of stopwords which should not be indexed
     *
     * An empty or non-array value leaves the current stopwords untouched.
     *
     * @param array $aStopwords
     */
    public function setStopwords($aStopwords) {
        if (is_array($aStopwords) && count($aStopwords) > 0) {
            $this->_stopwords = $aStopwords;
        }
    }

    /**
     * set the cms types
     *
     * Reads all content types from the type table. $_cmsType maps the type
     * name to its id, $_cmsTypeSuffix maps the id to the type name without
     * its first four characters (the "CMS_" prefix).
     */
    public function setContentTypes() {
        $sql = "SELECT type, idtype FROM " . $this->cfg['tab']['type'] . ' ';
        $this->_debug('sql', $sql);
        $this->db->query($sql);
        while ($this->db->nextRecord()) {
            $type = $this->db->f('type');
            $idtype = $this->db->f('idtype');

            $this->_cmsType[$type] = $idtype;
            $this->_cmsTypeSuffix[$idtype] = substr($type, 4, strlen($type));
        }
    }

    /**
     * set the cms_options array of cms types which should be treated special
     *
     * Each option may be given with or without the "cms_" prefix, in any
     * case. Unknown types are ignored. The given list always replaces the
     * options set before.
     *
     * @param mixed $cms_options
     */
    public function setCmsOptions($cms_options) {
        // Always start from scratch. A non-empty list used to be merged into
        // the options of an earlier call whereas an empty one reset them.
        $this->_cmsOptions = array();

        if (!is_array($cms_options)) {
            return;
        }

        foreach ($cms_options as $opt) {
            $opt = strtoupper($opt);

            if (strlen($opt) == 0) {
                continue;
            }

            if (!stristr($opt, 'cms_')) {
                // given as suffix only, e.g. "HTML"
                if (in_array($opt, $this->_cmsTypeSuffix)) {
                    $this->_cmsOptions[$opt] = 'CMS_' . $opt;
                }
            } elseif (array_key_exists($opt, $this->_cmsType)) {
                // given as complete type name, e.g. "CMS_HTML"
                $this->_cmsOptions[$opt] = $opt;
            }
        }
    }

    /**
     * Check a content type against the configured cms options.
     *
     * Returns true for CMS_RAW and for every type which is not listed
     * while cms options are set. Returns false if no cms options are set or
     * if the type is one of them.
     *
     * NOTE(review): createKeywords() indexes a type when this method returns
     * true (or when no cms options are set), so CMS_RAW is currently indexed
     * although the remark below says otherwise - confirm the intended
     * behaviour before changing it, the method is public.
     *
     * @param string $idtype
     * @return boolean
     */
    public function checkCmsType($idtype) {
        $idtype = strtoupper($idtype);

        // Do not index CMS_RAW
        if ($idtype == "CMS_RAW") {
            return true;
        }

        return !(count($this->_cmsOptions) === 0 || in_array($idtype, $this->_cmsOptions));
    }

    /**
     *
     * @return array the _cmsType property
     */
    public function getCmsType() {
        return $this->_cmsType;
    }

    /**
     *
     * @return array the _cmsTypeSuffix property
     */
    public function getCmsTypeSuffix() {
        return $this->_cmsTypeSuffix;
    }

    /**
     * Builds the pattern which matches the index entry of the current
     * article inside a stored index string, e.g. "&12=2(CMS_HTML-1)".
     *
     * The hyphen is placed last in the character class: "[\w-,]" is
     * rejected by PCRE2 (PHP >= 7.3) as an invalid range.
     *
     * @return string
     */
    protected function _getIndexEntryPattern() {
        return '/&' . preg_quote((string) $this->idart, '/') . '=[0-9]+\([\w,-]+\)/';
    }
}
557: