1: <?php
2:
3: /**
4: * This file contains the class for content search.
5: *
6: * @package Core
7: * @subpackage Frontend_Search
8: * @author Willi Man
9: * @copyright four for business AG <www.4fb.de>
10: * @license http://www.contenido.org/license/LIZENZ.txt
11: * @link http://www.4fb.de
12: * @link http://www.contenido.org
13: */
14:
15: defined('CON_FRAMEWORK') || die('Illegal call: Missing framework initialization - request aborted.');
16:
17: cInclude('includes', 'functions.encoding.php');
18:
19: /**
20: * CONTENIDO API - Search Object
21: *
 * This object starts an indexed fulltext search.
23: *
24: * TODO:
 * - The way to set the search options could be done much better!
26: * - The computation of the set of searchable articles should not be
27: * treated in this class.
28: * - It is better to compute the array of searchable articles from the
29: * outside and to pass the array of searchable articles as parameter.
30: * - Avoid foreach loops.
31: *
32: * Use object with
33: *
34: * $options = array(
35: * // use db function regexp
36: * 'db' => 'regexp',
37: * // combine searchwords with or
38: * 'combine' => 'or'
39: * );
40: *
41: * The range of searchable articles is by default the complete content
42: * which is online and not protected.
43: *
44: * With option 'searchable_articles' you can define your own set of
45: * searchable articles.
46: *
47: * If parameter 'searchable_articles' is set, the options 'cat_tree',
48: * 'categories', 'articles', 'exclude', 'artspecs', 'protected' and
49: * 'dontshowofflinearticles' won't have any effect.
50: *
51: * $options = array(
52: * // use db function regexp
53: * 'db' => 'regexp',
54: * // combine searchwords with or
55: * 'combine' => 'or',
56: * 'searchable_articles' => array(5, 6, 9, 13)
57: * );
58: *
59: * One can define the range of searchable articles by setting the
60: * parameter 'exclude' to false which means the range of categories
61: * defined by parameter 'cat_tree' or 'categories' and the range of
62: * articles defined by parameter 'articles' is included.
63: *
64: * $options = array(
65: * // use db function regexp
66: * 'db' => 'regexp',
67: * // combine searchwords with or
68: * 'combine' => 'or',
69: * // searchrange specified in 'cat_tree', 'categories' and
70: * // 'articles' is included
71: * 'exclude' => false,
72: * // tree with root 12 included
73: * 'cat_tree' => array(12),
74: * // categories 100, 111 included
75: * 'categories' => array(100,111),
76: * // article 33 included
77: * 'articles' => array(33),
78: * // array of article specifications => search only articles with
79: * // these artspecs
80: * 'artspecs' => array(2, 3),
81: * // results per page
82: * 'res_per_page' => 2,
83: * // do not search articles or articles in categories which are
84: * // offline or protected
85: * 'protected' => true,
86: * // search offline articles or articles in categories which are
87: * // offline
88: * 'dontshowofflinearticles' => false
89: * );
90: *
91: * You can build the complement of the range of searchable articles by
92: * setting the parameter 'exclude' to true which means the range of
93: * categories defined by parameter 'cat_tree' or 'categories' and the
94: * range of articles defined by parameter 'articles' is excluded from
95: * search.
96: *
97: * $options = array(
98: * // use db function regexp
99: * 'db' => 'regexp',
100: * // combine searchwords with or
101: * 'combine' => 'or',
102: * // searchrange specified in 'cat_tree', 'categories' and
103: * // 'articles' is excluded
104: * 'exclude' => true,
105: * // tree with root 12 excluded
106: * 'cat_tree' => array(12),
107: * // categories 100, 111 excluded
108: * 'categories' => array(100,111),
109: * // article 33 excluded
110: * 'articles' => array(33),
111: * // array of article specifications => search only articles with
112: * // these artspecs
113: * 'artspecs' => array(2, 3),
114: * // results per page
115: * 'res_per_page' => 2,
116: * // do not search articles or articles in categories which are
117: * // offline or protected
118: * 'protected' => true,
119: * // search offline articles or articles in categories which are
120: * // offline
121: * 'dontshowofflinearticles' => false
122: * );
123: *
124: * $search = new Search($options);
125: *
126: * // search only in these cms-types
127: * $search->setCmsOptions(array(
128: * "htmlhead",
129: * "html",
130: * "head",
131: * "text",
132: * "imgdescr",
133: * "link",
134: * "linkdescr"
135: * ));
136: *
137: * // start search
138: * $search_result = $search->searchIndex($searchword, $searchwordex);
139: *
140: * The search result structure has following form
141: * Array (
142: * [20] => Array (
143: * [CMS_HTML] => Array (
144: * [0] => 1
145: * [1] => 1
146: * [2] => 1
147: * )
148: * [keyword] => Array (
149: * [0] => content
150: * [1] => contenido
151: * [2] => wwwcontenidoorg
152: * )
153: * [search] => Array (
154: * [0] => con
155: * [1] => con
156: * [2] => con
157: * )
158: * [occurence] => Array (
159: * [0] => 1
160: * [1] => 5
161: * [2] => 1
162: * )
163: * [similarity] => 60
164: * )
165: * )
166: *
167: * The keys of the array are the article ID's found by search.
168: *
169: * Searching 'con' matches keywords 'content', 'contenido' and
170: * 'wwwcontenidoorg' in article with ID 20 in content type CMS_HTML[1].
171: * The search term occurs 7 times.
172: * The maximum similarity between searchterm and matching keyword is 60%.
173: *
174: * // rank and display the results
175: * $oSearchResults = new cSearchResult($search_result, 10);
176: *
177: * @package Core
178: * @subpackage Frontend_Search
179: */
180: class cSearch extends cSearchBaseAbstract {
181:
182: /**
183: * Instance of class Index
184: *
185: * @var object
186: */
187: protected $_index;
188:
189: /**
190: * search words
191: *
192: * @var array
193: */
194: protected $_searchWords = array();
195:
196: /**
197: * words which should be excluded from search
198: *
199: * @var array
200: */
201: protected $_searchWordsExclude = array();
202:
203: /**
204: * type of db search
205: *
206: * like => 'sql like'
207: * regexp => 'sql regexp'
208: *
209: * @var string
210: */
211: protected $_searchOption;
212:
213: /**
214: * logical combination of searchwords (and, or)
215: *
216: * @var string
217: */
218: protected $_searchCombination;
219:
220: /**
221: * array of searchable articles
222: *
223: * @var array
224: */
225: protected $_searchableArts = array();
226:
227: /**
228: * article specifications
229: *
230: * @var array
231: */
232: protected $_articleSpecs = array();
233:
234: /**
 * If $protected = true => do not search articles which are offline
 * or articles in categories which are offline (protected) unless
 * the user has access to them.
238: *
239: * @var bool
240: */
241: protected $_protected;
242:
243: /**
244: * If $dontshowofflinearticles = false => search offline articles or
245: * articles in categories which are offline.
246: *
247: * @var bool
248: */
249: protected $_dontshowofflinearticles;
250:
251: /**
252: * If $exclude = true => the specified search range is excluded from
253: * search, otherwise included.
254: *
255: * @var bool
256: */
257: protected $_exclude;
258:
259: /**
260: * Array of article id's with information about cms-types, occurence
261: * of keyword/searchword, similarity.
262: *
263: * @var array
264: */
265: protected $_searchResult = array();
266:
/**
 * Creates the search object and determines the set of searchable
 * articles.
 *
 * Options that are not given fall back to these defaults:
 * db 'regexp', combine 'or', protected true,
 * dontshowofflinearticles true, exclude true, artspecs empty.
 *
 * @param array $options
 *         $options['db']
 *             'regexp' => DB search with REGEXP
 *             'like' => DB search with LIKE
 *             'exact' => exact match
 *         $options['combine']
 *             'and', 'or' Combination of search words with AND, OR
 *         $options['exclude']
 *             true => search range specified in 'cat_tree',
 *             'categories' and 'articles' is excluded;
 *             false => search range specified in 'cat_tree',
 *             'categories' and 'articles' is included
 *         $options['cat_tree']
 *             e.g. array(8) => the complete tree with root 8 is
 *             in/excluded from search
 *         $options['categories']
 *             e.g. array(10, 12) => categories 10, 12 in/excluded
 *         $options['articles']
 *             e.g. array(33) => article 33 in/excluded
 *         $options['artspecs']
 *             e.g. array(2, 3) => search only articles with certain
 *             article specifications
 *         $options['protected']
 *             true => do not search articles which are offline
 *             (locked) or articles in categories which are offline
 *             (protected)
 *         $options['dontshowofflinearticles']
 *             false => search offline articles or articles in
 *             categories which are offline
 *         $options['searchable_articles']
 *             array of article IDs which should be searchable; when
 *             given as array it is used as is and the range options
 *             above are not evaluated
 * @param cDb $db [optional]
 *         database instance
 */
public function __construct($options, $db = NULL) {
    parent::__construct($db);

    $this->_index = new cSearchIndex($db);

    // kind of DB comparison and logical combination, both lower-cased
    $dbMode = array_key_exists('db', $options) ? $options['db'] : 'regexp';
    $this->_searchOption = strtolower($dbMode);

    $combination = array_key_exists('combine', $options) ? $options['combine'] : 'or';
    $this->_searchCombination = strtolower($combination);

    // flags which restrict the range of searchable articles
    $this->_protected = true;
    if (array_key_exists('protected', $options)) {
        $this->_protected = $options['protected'];
    }

    $this->_dontshowofflinearticles = true;
    if (array_key_exists('dontshowofflinearticles', $options)) {
        $this->_dontshowofflinearticles = $options['dontshowofflinearticles'];
    }

    $this->_exclude = true;
    if (array_key_exists('exclude', $options)) {
        $this->_exclude = $options['exclude'];
    }

    $this->_articleSpecs = array();
    if (array_key_exists('artspecs', $options) && is_array($options['artspecs'])) {
        $this->_articleSpecs = $options['artspecs'];
    }

    // The flags above have to be set at this point because
    // getSearchableArticles() reads them.
    $hasExplicitArticles = array_key_exists('searchable_articles', $options)
        && is_array($options['searchable_articles']);
    if ($hasExplicitArticles) {
        $this->_searchableArts = $options['searchable_articles'];
    } else {
        $this->_searchableArts = $this->getSearchableArticles($options);
    }

    // minimum similarity between searchword and keyword in percent
    $this->intMinimumSimilarity = 50;
}
324:
/**
 * Runs the indexed fulltext search.
 *
 * The keyword table is queried for the current language with all
 * search words (and all exclude words). Each matching keyword's index
 * string is parsed and every searchable article found there is added
 * to the result, provided the best similarity between the keyword and
 * one of the words reaches $this->intMinimumSimilarity.
 *
 * Afterwards the result is reduced according to the 'and' combination
 * (all search words have to be recorded for an article) and the
 * exclude words (none of them may be recorded for an article).
 *
 * Every call starts with an empty result, so an instance can be used
 * for several searches.
 *
 * @param string $searchwords
 *         The search words
 * @param string $searchwords_exclude [optional]
 *         The words, which should be excluded from search
 * @return bool|array
 *         false if no usable search word was given, otherwise the
 *         result array keyed by article ID
 */
public function searchIndex($searchwords, $searchwords_exclude = '') {
    if (strlen(trim((string) $searchwords)) == 0) {
        return false;
    }

    // Words of one character are dropped by stripWords(); without any
    // remaining word the keyword condition would be invalid SQL.
    $requiredWords = $this->stripWords($searchwords);
    if (count($requiredWords) == 0) {
        return false;
    }

    // reset state of a previous search on this instance
    $this->_searchWords = $requiredWords;
    $this->_searchWordsExclude = array();
    $this->_searchResult = array();

    if (strlen(trim((string) $searchwords_exclude)) > 0) {
        $this->_searchWordsExclude = $this->stripWords($searchwords_exclude);
    }

    // Exclude words are searched too, otherwise it could not be
    // detected that an article contains one of them.
    foreach ($this->_searchWordsExclude as $word) {
        $this->_searchWords[] = $word;
    }

    // The keyword condition is put in parentheses: AND binds stronger
    // than OR, without them only the first word would be restricted
    // to the current language.
    $sql = "SELECT keyword, auto FROM " . $this->cfg['tab']['keywords']
        . " WHERE idlang=" . cSecurity::toInteger($this->lang)
        . " AND (" . $this->_buildKeywordCondition($this->_searchWords) . ") ";
    $this->_debug('sql', $sql);
    $this->db->query($sql);

    while ($this->db->nextRecord()) {
        $keyword = $this->db->f('keyword');
        $auto = (string) $this->db->f('auto');

        $this->_debug('index', $auto);

        // index entries are separated by '&'; '=', '(' and ')' separate
        // article id, occurrence and cms places inside one entry
        $tmp_index = array();
        foreach (preg_split('/&/', $auto, -1, PREG_SPLIT_NO_EMPTY) as $string) {
            $tmp_string = preg_replace('/[=\(\)]/', ' ', $string);
            $tmp_index[] = preg_split('/\s/', $tmp_string, -1, PREG_SPLIT_NO_EMPTY);
        }
        $this->_debug('tmp_index', $tmp_index);

        // Best similarity (in percent) between the keyword and any of
        // the words. It depends on the keyword only, so it is computed
        // once per record instead of once per article.
        $percent = 0;
        $similarity = 0;
        $searchword = '';
        foreach ($this->_searchWords as $word) {
            similar_text($word, $keyword, $percent);
            if ($percent > $similarity) {
                $similarity = $percent;
                $searchword = $word;
            }
        }

        foreach ($tmp_index as $string) {
            // expected parts: article id, occurrence, cms places
            if (count($string) < 3) {
                continue;
            }

            $artid = $string[0];

            // filter nonsearchable articles
            if (!in_array($artid, $this->_searchableArts)) {
                continue;
            }

            $tmp_cmstype = preg_split('/[,]/', $string[2], -1, PREG_SPLIT_NO_EMPTY);
            $this->_debug('tmp_cmstype', $tmp_cmstype);

            $tmp_cmstype2 = array();
            foreach ($tmp_cmstype as $type) {
                $tmp_cmstype2[] = preg_split('/-/', $type, -1, PREG_SPLIT_NO_EMPTY);
            }
            $this->_debug('tmp_cmstype2', $tmp_cmstype2);

            foreach ($tmp_cmstype2 as $type) {
                if (!isset($type[0])) {
                    continue;
                }

                // search for specified cms-types
                if ($this->_index->checkCmsType($type[0])) {
                    continue;
                }

                // include article into searchresult set only if
                // similarity between searchword and keyword is big
                // enough
                if ($similarity < $this->intMinimumSimilarity) {
                    continue;
                }

                $this->_searchResult[$artid][$type[0]][] = isset($type[1]) ? $type[1] : NULL;
                $this->_searchResult[$artid]['keyword'][] = $keyword;
                $this->_searchResult[$artid]['search'][] = $searchword;
                $this->_searchResult[$artid]['occurence'][] = $string[1];
                $this->_searchResult[$artid]['debug_similarity'][] = $percent;

                if (!isset($this->_searchResult[$artid]['similarity'])
                    || $similarity > $this->_searchResult[$artid]['similarity']) {
                    $this->_searchResult[$artid]['similarity'] = $similarity;
                }
            }
        }
    }

    if ($this->_searchCombination == 'and') {
        // All search words must appear in the article. Only the real
        // search words are checked here: $this->_searchWords contains
        // the exclude words as well and demanding those would remove
        // every article.
        foreach ($this->_searchResult as $article => $val) {
            if (count(array_diff($requiredWords, $val['search'])) > 0) {
                unset($this->_searchResult[$article]);
            }
        }
    }

    if (count($this->_searchWordsExclude) > 0) {
        // search words to be excluded must not appear in article
        foreach ($this->_searchResult as $article => $val) {
            if (count(array_intersect($this->_searchWordsExclude, $val['search'])) > 0) {
                unset($this->_searchResult[$article]);
            }
        }
    }

    $this->_debug('$this->search_result', $this->_searchResult);
    $this->_debug('$this->searchable_arts', $this->_searchableArts);

    $searchTracking = new cApiSearchTrackingCollection();
    $searchTracking->trackSearch($searchwords, count($this->_searchResult));

    return $this->_searchResult;
}

/**
 * Builds the SQL condition on column "keyword" for the given words
 * according to the configured type of db search.
 *
 * An unknown search option is treated like 'regexp', which is also
 * the default of the constructor.
 *
 * @param array $words
 *         stripped words, must not be empty
 * @return string
 *         condition without surrounding parentheses
 */
protected function _buildKeywordCondition(array $words) {
    $terms = array();
    foreach ($words as $word) {
        $terms[] = cSecurity::escapeDB($word, $this->db);
    }

    if ($this->_searchOption == 'like') {
        return "keyword LIKE '%" . implode("%' OR keyword LIKE '%", $terms) . "%'";
    }

    if ($this->_searchOption == 'exact') {
        // plain comparison for every word, LIKE would interpret '%'
        // and '_' inside a word as wildcards
        return "keyword = '" . implode("' OR keyword = '", $terms) . "'";
    }

    return "keyword REGEXP '" . implode('|', $terms) . "'";
}
478:
/**
 * Passes the cms-types to search in on to the index object.
 *
 * Anything but a non-empty array is ignored.
 *
 * @param mixed $cms_options
 *         The cms-types (htmlhead, html, ...) which should explicitly be
 *         searched.
 */
public function setCmsOptions($cms_options) {
    if (!is_array($cms_options) || count($cms_options) == 0) {
        return;
    }

    $this->_index->setCmsOptions($cms_options);
}
490:
/**
 * Splits a search phrase into normalized search words.
 *
 * Backslashes and HTML tags are removed, the phrase is split at
 * whitespace and commas, each word is lower-cased and words with a
 * byte length below 2 are dropped.
 *
 * @param string $searchwords
 *         The search-words
 * @return array
 *         of stripped search-words without duplicates; the keys are
 *         those left by array_unique() and may have gaps
 */
public function stripWords($searchwords) {
    // remove backslash and html tags
    $phrase = trim(strip_tags(stripslashes($searchwords)));

    $stripped = array();

    // split the phrase by any number of commas or space characters
    foreach (mb_split('[\s,]+', $phrase) as $token) {
        // Lower-casing takes place on the entity-encoded word, so that
        // characters which have a named entity (e.g. umlauts) are
        // lower-cased by the byte-wise strtolower() too.
        $encoded = htmlentities($token, ENT_COMPAT, 'UTF-8');
        $lowered = trim(strtolower($encoded));
        $normalized = html_entity_decode($lowered, ENT_COMPAT, 'UTF-8');

        if (strlen($normalized) <= 1) {
            continue;
        }

        $stripped[] = $normalized;
    }

    return array_unique($stripped);
}
521:
/**
 * Returns the IDs of the category tree below (and including) the
 * given root category.
 *
 * All categories of the current client and language are read ordered
 * by idtree. A category is taken over if its parent has already been
 * taken over, so descendants are only found when their parent row
 * comes first in that order.
 *
 * @todo This is not the job for search, should be outsourced ...
 * @param int $cat_start
 *         Root of a category tree
 * @return array
 *         Category Tree, the first element is $cat_start itself
 */
public function getSubTree($cat_start) {
    $idlang = cSecurity::toInteger($this->lang);
    $idclient = cSecurity::toInteger($this->client);

    $sql = "SELECT
                B.idcat, B.parentid
            FROM
                " . $this->cfg['tab']['cat_tree'] . " AS A,
                " . $this->cfg['tab']['cat'] . " AS B,
                " . $this->cfg['tab']['cat_lang'] . " AS C
            WHERE
                A.idcat = B.idcat AND
                B.idcat = C.idcat AND
                C.idlang = '" . $idlang . "' AND
                B.idclient = '" . $idclient . "'
            ORDER BY
                idtree";
    $this->_debug('sql', $sql);
    $this->db->query($sql);

    $aDescendants = array($cat_start);

    while ($this->db->nextRecord()) {
        $parentId = $this->db->f('parentid');
        $catId = $this->db->f('idcat');

        // Take the category over if it is the child of a recognized
        // category and not yet recognized itself (the latter happens
        // with $cat_start).
        if (in_array($parentId, $aDescendants) && !in_array($catId, $aDescendants)) {
            $aDescendants[] = $catId;
        }
    }

    return $aDescendants;
}
582:
/**
 * Returns list of searchable article ids in given search range.
 *
 * The range is built from 'cat_tree' (category trees, resolved with
 * getSubTree()), 'categories' and 'articles'. Depending on
 * $this->_exclude the range is excluded from or included into the
 * search. Include mode selects articles which are in one of the
 * categories OR in the list of articles, which makes exclude mode
 * (not in the categories AND not in the articles) its exact
 * complement.
 *
 * Only articles flagged as searchable in the current language are
 * returned, optionally restricted to $this->_articleSpecs. If
 * $this->_protected or $this->_dontshowofflinearticles is set, only
 * online articles in visible categories are returned. If
 * $this->_protected is set, articles in non-public categories are
 * only returned if chain 'Contenido.Frontend.CategoryAccess' grants
 * access to the category.
 *
 * An article assigned to several categories may be contained more
 * than once.
 *
 * @param array $search_range
 *         array with optional keys 'cat_tree', 'categories' and
 *         'articles', each an array of IDs
 * @return array
 *         article IDs as delivered by the database
 */
public function getSearchableArticles($search_range) {
    global $auth;

    if (!is_array($search_range)) {
        $search_range = array();
    }

    // collect categories of the range
    $aCatRange = array();
    if (isset($search_range['cat_tree']) && is_array($search_range['cat_tree'])) {
        foreach ($search_range['cat_tree'] as $cat) {
            $aCatRange = array_merge($aCatRange, $this->getSubTree($cat));
        }
    }
    if (isset($search_range['categories']) && is_array($search_range['categories'])) {
        $aCatRange = array_merge($aCatRange, $search_range['categories']);
    }

    // All IDs are integers; casting them keeps arbitrary option values
    // out of the SQL statement.
    $sCatRange = implode("','", array_unique(array_map('intval', $aCatRange)));

    // stays empty if no articles are given
    $sArtRange = '';
    if (isset($search_range['articles']) && is_array($search_range['articles'])) {
        $sArtRange = implode("','", array_unique(array_map('intval', $search_range['articles'])));
    }

    if ($this->_protected == true || $this->_dontshowofflinearticles == true) {
        // for protected categories access will be checked later
        $sProtected = " C.visible = 1 AND B.online = 1 ";
    } else {
        $sProtected = " 1 ";
    }

    if ($this->_exclude == true) {
        // exclude searchrange
        $sSearchRange = " A.idcat NOT IN ('" . $sCatRange . "') AND B.idart NOT IN ('" . $sArtRange . "') AND ";
    } elseif (strlen($sArtRange) > 0) {
        // include searchrange: categories as well as single articles
        $sSearchRange = " (A.idcat IN ('" . $sCatRange . "') OR B.idart IN ('" . $sArtRange . "')) AND ";
    } else {
        // include searchrange: categories only
        $sSearchRange = " A.idcat IN ('" . $sCatRange . "') AND ";
    }

    if (count($this->_articleSpecs) > 0) {
        $sArtSpecs = " B.artspec IN ('" . implode("','", array_map('intval', $this->_articleSpecs)) . "') AND ";
    } else {
        $sArtSpecs = '';
    }

    $idlang = cSecurity::toInteger($this->lang);

    $sql = "SELECT
                A.idart,
                A.idcat,
                C.public
            FROM
                " . $this->cfg["tab"]["cat_art"] . " as A,
                " . $this->cfg["tab"]["art_lang"] . " as B,
                " . $this->cfg["tab"]["cat_lang"] . " as C
            WHERE
                " . $sSearchRange . "
                B.idlang = '" . $idlang . "' AND
                C.idlang = '" . $idlang . "' AND
                A.idart = B.idart AND
                B.searchable = 1 AND
                A.idcat = C.idcat AND
                " . $sArtSpecs . "
                " . $sProtected . " ";
    $this->_debug('sql', $sql);
    $this->db->query($sql);

    $uid = (is_object($auth) && isset($auth->auth['uid'])) ? $auth->auth['uid'] : NULL;

    // Result of the access check per category, so the chain runs once
    // per category and not once per article.
    // NOTE(review): assumes the chain answers identically for the same
    // category within one request - confirm for custom chain functions.
    $aCatAccess = array();

    $aIdArts = array();
    while ($this->db->nextRecord()) {
        $idcat = $this->db->f("idcat");

        if ($idcat != "" && $this->_protected && $this->db->f("public") == "0") {
            if (!isset($aCatAccess[$idcat])) {
                // CEC to check category access
                // break at 'true', default value 'false'
                cApiCecHook::setBreakCondition(true, false);
                $allow = cApiCecHook::executeWhileBreakCondition('Contenido.Frontend.CategoryAccess', $this->lang, $idcat, $uid);
                $aCatAccess[$idcat] = (bool) $allow;
            }

            if (!$aCatAccess[$idcat]) {
                continue;
            }
        }

        $aIdArts[] = $this->db->f('idart');
    }

    return $aIdArts;
}
685:
/**
 * Reads the IDs of all online article specifications of the current
 * client and language.
 *
 * @return array
 *         Array of article specification Ids
 */
public function getArticleSpecifications() {
    $idclient = cSecurity::toInteger($this->client);
    $idlang = cSecurity::toInteger($this->lang);

    $sql = "SELECT
                idartspec
            FROM
                " . $this->cfg['tab']['art_spec'] . "
            WHERE
                client = " . $idclient . " AND
                lang = " . $idlang . " AND
                online = 1 ";
    $this->_debug('sql', $sql);
    $this->db->query($sql);

    $aIds = array();
    while ($this->db->nextRecord()) {
        array_push($aIds, $this->db->f('idartspec'));
    }

    return $aIds;
}
709:
/**
 * Adds an article specification to the list of article
 * specifications.
 *
 * The list is evaluated by getSearchableArticles() only. The
 * constructor has already computed the searchable articles, so an ID
 * added here takes effect when getSearchableArticles() is called
 * again.
 *
 * @param int $iArtspecID
 */
public function setArticleSpecification($iArtspecID) {
    array_push($this->_articleSpecs, $iArtspecID);
}
718:
/**
 * Add all article specifications matching name of article
 * specification (client dependent but language independent).
 *
 * The IDs are appended to the list which is evaluated by
 * getSearchableArticles().
 *
 * @param string $sArtSpecName
 * @return bool
 *         false if no name was given, otherwise true (even if no
 *         article specification with this name exists)
 */
public function addArticleSpecificationsByName($sArtSpecName) {
    if (!isset($sArtSpecName) || strlen($sArtSpecName) == 0) {
        return false;
    }

    $sql = "SELECT
                idartspec
            FROM
                " . $this->cfg['tab']['art_spec'] . "
            WHERE
                client = " . cSecurity::toInteger($this->client) . " AND
                artspec = '" . $this->db->escape($sArtSpecName) . "' ";
    $this->_debug('sql', $sql);
    $this->db->query($sql);
    while ($this->db->nextRecord()) {
        $this->_articleSpecs[] = $this->db->f('idartspec');
    }

    // the documented return type is bool, so success is reported
    // explicitly instead of implicitly returning NULL
    return true;
}
744: }
745: