1: <?php
2:
3: /**
4: * This file contains the class for content search.
5: *
6: * @package Core
7: * @subpackage Frontend_Search
8: * @version SVN Revision $Rev:$
9: *
10: * @author Willi Man
11: * @copyright four for business AG <www.4fb.de>
12: * @license http://www.contenido.org/license/LIZENZ.txt
13: * @link http://www.4fb.de
14: * @link http://www.contenido.org
15: */
16:
17: defined('CON_FRAMEWORK') || die('Illegal call: Missing framework initialization - request aborted.');
18:
19: cInclude('includes', 'functions.encoding.php');
20:
21: /**
22: * CONTENIDO API - Search Object
23: *
* This object starts an indexed fulltext search.
25: *
26: * TODO:
* The way the search options are set could be improved considerably.
28: * The computation of the set of searchable articles should not be treated in
29: * this class.
30: * It is better to compute the array of searchable articles from the outside and
31: * to pass the array of searchable articles as parameter.
32: * Avoid foreach loops.
33: *
34: * Use object with
35: *
36: * $options = array('db' => 'regexp', // use db function regexp
37: * 'combine' => 'or'); // combine searchwords with or
38: *
39: * The range of searchable articles is by default the complete content which is
40: * online and not protected.
41: *
42: * With option 'searchable_articles' you can define your own set of searchable
43: * articles.
44: * If parameter 'searchable_articles' is set the options 'cat_tree',
45: * 'categories', 'articles', 'exclude', 'artspecs',
46: * 'protected', 'dontshowofflinearticles' don't have any effect.
47: *
48: * $options = array('db' => 'regexp', // use db function regexp
49: * 'combine' => 'or', // combine searchwords with or
50: * 'searchable_articles' => array(5, 6, 9, 13));
51: *
52: * One can define the range of searchable articles by setting the parameter
53: * 'exclude' to false which means the range of categories
54: * defined by parameter 'cat_tree' or 'categories' and the range of articles
55: * defined by parameter 'articles' is included.
56: *
57: * $options = array('db' => 'regexp', // use db function regexp
58: * 'combine' => 'or', // combine searchwords with or
59: * 'exclude' => false, // => searchrange specified in 'cat_tree', 'categories'
60: * and 'articles' is included
61: * 'cat_tree' => array(12), // tree with root 12 included
62: * 'categories' => array(100,111), // categories 100, 111 included
63: * 'articles' => array(33), // article 33 included
64: * 'artspecs' => array(2, 3), // array of article specifications => search only
65: * articles with these artspecs
66: * 'res_per_page' => 2, // results per page
67: * 'protected' => true); // => do not search articles or articles in categories
68: * which are offline or protected
69: * 'dontshowofflinearticles' => false); // => search offline articles or
70: * articles in categories which are offline
71: *
72: * You can build the complement of the range of searchable articles by setting
73: * the parameter 'exclude' to true which means the range of categories
74: * defined by parameter 'cat_tree' or 'categories' and the range of articles
75: * defined by parameter 'articles' is excluded from search.
76: *
77: * $options = array('db' => 'regexp', // use db function regexp
78: * 'combine' => 'or', // combine searchwords with or
79: * 'exclude' => true, // => searchrange specified in 'cat_tree', 'categories'
80: * and 'articles' is excluded
81: * 'cat_tree' => array(12), // tree with root 12 excluded
82: * 'categories' => array(100,111), // categories 100, 111 excluded
83: * 'articles' => array(33), // article 33 excluded
84: * 'artspecs' => array(2, 3), // array of article specifications => search only
85: * articles with these artspecs
86: * 'res_per_page' => 2, // results per page
87: * 'protected' => true); // => do not search articles or articles in categories
88: * which are offline or protected
89: * 'dontshowofflinearticles' => false); // => search offline articles or
90: * articles in categories which are offline
91: *
92: * $search = new Search($options);
93: *
94: * $cms_options = array("htmlhead", "html", "head", "text", "imgdescr", "link",
95: * "linkdescr");
96: * search only in these cms-types
97: * $search->setCmsOptions($cms_options);
98: *
99: * $search_result = $search->searchIndex($searchword, $searchwordex); // start
100: * search
101: *
102: * The search result structure has following form
103: * Array (
104: * [20] => Array (
105: * [CMS_HTML] => Array (
106: * [0] => 1
107: * [1] => 1
108: * [2] => 1
109: * )
110: * [keyword] => Array (
111: * [0] => content
112: * [1] => contenido
113: * [2] => wwwcontenidoorg
114: * )
115: * [search] => Array (
116: * [0] => con
117: * [1] => con
118: * [2] => con
119: * )
120: * [occurence] => Array (
121: * [0] => 1
122: * [1] => 5
123: * [2] => 1
124: * )
125: * [similarity] => 60
126: * )
127: * )
128: *
129: * The keys of the array are the article ID's found by search.
130: *
131: * Searching 'con' matches keywords 'content', 'contenido' and 'wwwcontenidoorg'
132: * in article with ID 20 in content type CMS_HTML[1].
133: * The search term occurs 7 times.
134: * The maximum similarity between searchterm and matching keyword is 60%.
135: *
136: * with $oSearchResults = new cSearchResult($search_result, 10);
137: * one can rank and display the results
138: *
139: * @package Core
140: * @subpackage Frontend_Search
141: */
142: class cSearch extends cSearchBaseAbstract {
143:
/**
 * Search index helper; created in the constructor and used to check and
 * to configure the CMS types that take part in the search.
 *
 * @var cSearchIndex
 */
protected $_index;

/**
 * The search words of the current search.
 *
 * searchIndex() also adds the words to be excluded to this list.
 *
 * @var array
 */
protected $_searchWords = array();

/**
 * The words which should be excluded from search.
 *
 * @var array
 */
protected $_searchWordsExclude = array();

/**
 * Type of DB search:
 * 'regexp' => SQL REGEXP, 'like' => SQL LIKE, 'exact' => exact match.
 *
 * @var string
 */
protected $_searchOption;

/**
 * Logical combination of search words ('and', 'or').
 *
 * @var string
 */
protected $_searchCombination;

/**
 * Ids of the articles which may appear in the search result.
 *
 * @var array
 */
protected $_searchableArts = array();

/**
 * Article specifications the searchable articles are restricted to.
 *
 * @var array
 */
protected $_articleSpecs = array();

/**
 * If true, only online articles in visible categories are searchable and
 * articles in non-public categories are only included if the category
 * access check allows it.
 *
 * @var bool
 */
protected $_protected;

/**
 * If false (and $_protected is false as well), offline articles and
 * articles in categories which are offline are searched too.
 *
 * @var bool
 */
protected $_dontshowofflinearticles;

/**
 * If true, the specified search range is excluded from search,
 * otherwise it is included.
 *
 * @var bool
 */
protected $_exclude;

/**
 * Search result: array keyed by article id with information about
 * cms-types, matching keywords, matching search words, occurrences
 * (stored under the key 'occurence') and similarity.
 *
 * @var array
 */
protected $_searchResult = array();

/**
 * Minimum similarity in percent between a search word and a keyword for a
 * hit to be accepted.
 *
 * Declared explicitly because the constructor assigns it; creating it as
 * a dynamic property is deprecated as of PHP 8.2. It is public because
 * that is the visibility the former dynamic property had.
 *
 * @var int
 */
public $intMinimumSimilarity = 50;
227:
/**
 * Creates the search object, stores the search options and determines the
 * set of searchable articles.
 *
 * @param array $options
 *         $options['db']
 *             'regexp' => DB search with REGEXP (default)
 *             'like' => DB search with LIKE
 *             'exact' => exact match
 *         $options['combine']
 *             'and', 'or' => combination of search words with AND, OR
 *             (default 'or')
 *         $options['exclude']
 *             true => search range specified in 'cat_tree', 'categories'
 *             and 'articles' is excluded (default);
 *             false => search range specified in 'cat_tree', 'categories'
 *             and 'articles' is included
 *         $options['cat_tree']
 *             e.g. array(8) => the complete tree with root 8 is
 *             in/excluded from search
 *         $options['categories']
 *             e.g. array(10, 12) => categories 10, 12 in/excluded
 *         $options['articles']
 *             e.g. array(23) => article 23 in/excluded
 *         $options['artspecs']
 *             e.g. array(2, 3) => search only articles with certain
 *             article specifications
 *         $options['protected']
 *             true => do not search articles which are offline (locked)
 *             or articles in categories which are offline (protected);
 *             this is the default
 *         $options['dontshowofflinearticles']
 *             false => search offline articles or articles in categories
 *             which are offline (default true)
 *         $options['searchable_articles']
 *             array of article ids which should be searchable; if given,
 *             the search range is not computed from the other options
 * @param cDb $db [optional]
 *         database instance
 */
public function __construct($options, $db = NULL) {
    parent::__construct($db);

    $this->_index = new cSearchIndex($db);

    // type of db search, lower-cased
    $dbMode = 'regexp';
    if (array_key_exists('db', $options)) {
        $dbMode = strtolower($options['db']);
    }
    $this->_searchOption = $dbMode;

    // combination of the search words, lower-cased
    $combination = 'or';
    if (array_key_exists('combine', $options)) {
        $combination = strtolower($options['combine']);
    }
    $this->_searchCombination = $combination;

    // flags which default to true when the caller does not pass them
    $this->_protected = true;
    if (array_key_exists('protected', $options)) {
        $this->_protected = $options['protected'];
    }

    $this->_dontshowofflinearticles = true;
    if (array_key_exists('dontshowofflinearticles', $options)) {
        $this->_dontshowofflinearticles = $options['dontshowofflinearticles'];
    }

    $this->_exclude = true;
    if (array_key_exists('exclude', $options)) {
        $this->_exclude = $options['exclude'];
    }

    // article specifications are only accepted as array
    $this->_articleSpecs = array();
    if (array_key_exists('artspecs', $options) && is_array($options['artspecs'])) {
        $this->_articleSpecs = $options['artspecs'];
    }

    // an explicitly given list of articles wins over the computed range;
    // the computation needs the flags set above
    $hasOwnArticleList = array_key_exists('searchable_articles', $options) && is_array($options['searchable_articles']);
    if ($hasOwnArticleList) {
        $this->_searchableArts = $options['searchable_articles'];
    } else {
        $this->_searchableArts = $this->getSearchableArticles($options);
    }

    // minimum similarity between searchword and keyword in percent
    $this->intMinimumSimilarity = 50;
}
285:
/**
 * Runs the indexed fulltext search.
 *
 * The keywords matching the search words (and the words to be excluded)
 * are read from the keyword table of the current language. Every index
 * entry of a matching keyword which belongs to a searchable article is
 * added to the result if the similarity between keyword and best matching
 * search word reaches the minimum similarity. Afterwards the result is
 * filtered by the 'and' combination and by the excluded words.
 *
 * Each call starts with an empty result; results of former calls are not
 * kept. The search is tracked via cApiSearchTrackingCollection.
 *
 * @param string $searchwords
 *         The search words
 * @param string $searchwords_exclude [optional]
 *         The words, which should be excluded from search
 * @return bool|array
 *         false if no usable search word was given (empty input or only
 *         words which are removed by stripWords()), otherwise the search
 *         result keyed by article id
 */
public function searchIndex($searchwords, $searchwords_exclude = '') {
    if (strlen(trim($searchwords)) == 0) {
        return false;
    }

    $includeWords = $this->stripWords($searchwords);
    if (count($includeWords) == 0) {
        // nothing left after stripping (e.g. only one-character words);
        // an empty word list would lead to an invalid keyword condition
        return false;
    }

    // start from a clean state, so a former call can not leak its
    // excluded words or its hits into this search
    $this->_searchResult = array();
    $this->_searchWordsExclude = array();
    if (strlen(trim($searchwords_exclude)) > 0) {
        $this->_searchWordsExclude = $this->stripWords($searchwords_exclude);
    }

    // The excluded words have to be looked up too: only then an article
    // containing one of them gets that word into its 'search' list and
    // can be removed below.
    $this->_searchWords = array_merge($includeWords, $this->_searchWordsExclude);

    $isLike = ($this->_searchOption == 'like');
    $isExact = ($this->_searchOption == 'exact');

    $sqlWords = array();
    foreach ($this->_searchWords as $word) {
        $wordEscaped = cSecurity::escapeDB($word, $this->db);
        if ($isLike) {
            $wordEscaped = "'%" . $wordEscaped . "%'";
        } elseif ($isExact) {
            $wordEscaped = "'" . $wordEscaped . "'";
        }
        $sqlWords[] = $wordEscaped;
    }

    if ($isLike) {
        // like search
        $kwSql = "keyword LIKE " . implode(" OR keyword LIKE ", $sqlWords);
    } elseif ($isExact) {
        // exact match
        $kwSql = "keyword = " . implode(" OR keyword = ", $sqlWords);
    } else {
        // regexp search; also used as fallback for an unknown option,
        // 'regexp' being the default of the constructor
        // NOTE(review): the words are SQL-escaped but not regexp-quoted,
        // so regexp meta characters in a search word are interpreted by
        // the database - confirm whether that is intended
        $kwSql = "keyword REGEXP '" . implode('|', $sqlWords) . "'";
    }

    // the parentheses keep the language condition valid for every word,
    // AND binds stronger than OR
    $sql = "SELECT keyword, auto FROM " . $this->cfg['tab']['keywords'] . " WHERE idlang=" . cSecurity::toInteger($this->lang) . " AND (" . $kwSql . ") ";
    $this->_debug('sql', $sql);
    $this->db->query($sql);

    while ($this->db->nextRecord()) {
        $keyword = $this->db->f('keyword');
        $auto = $this->db->f('auto');

        $this->_debug('index', $auto);

        // split the index string into one entry per article, each entry
        // consisting of article id, occurrence and list of cms-types
        $tmp_index = array();
        foreach (preg_split('/&/', $auto, -1, PREG_SPLIT_NO_EMPTY) as $entry) {
            $entry = preg_replace('/[=\(\)]/', ' ', $entry);
            $tmp_index[] = preg_split('/\s/', $entry, -1, PREG_SPLIT_NO_EMPTY);
        }
        $this->_debug('tmp_index', $tmp_index);

        // similarity in percent between the keyword and its best matching
        // search word; it only depends on the keyword, so it is computed
        // once per record
        $percent = 0;
        $similarity = 0;
        $searchword = '';
        foreach ($this->_searchWords as $word) {
            similar_text($word, $keyword, $percent);
            if ($percent > $similarity) {
                $similarity = $percent;
                $searchword = $word;
            }
        }

        foreach ($tmp_index as $parts) {
            if (count($parts) < 3) {
                // malformed index entry
                continue;
            }
            list($artid, $occurrence, $cms_place) = $parts;

            // filter nonsearchable articles
            if (!in_array($artid, $this->_searchableArts)) {
                continue;
            }

            $tmp_cmstype = preg_split('/[,]/', $cms_place, -1, PREG_SPLIT_NO_EMPTY);
            $this->_debug('tmp_cmstype', $tmp_cmstype);

            $tmp_cmstype2 = array();
            foreach ($tmp_cmstype as $type) {
                $tmp_cmstype2[] = preg_split('/-/', $type, -1, PREG_SPLIT_NO_EMPTY);
            }
            $this->_debug('tmp_cmstype2', $tmp_cmstype2);

            foreach ($tmp_cmstype2 as $type) {
                if (!isset($type[0])) {
                    continue;
                }

                // search for specified cms-types
                if ($this->_index->checkCmsType($type[0])) {
                    continue;
                }

                // include article into searchresult set only if
                // similarity between searchword and keyword is big enough
                if ($similarity < $this->intMinimumSimilarity) {
                    continue;
                }

                $this->_searchResult[$artid][$type[0]][] = isset($type[1]) ? $type[1] : NULL;
                $this->_searchResult[$artid]['keyword'][] = $keyword;
                $this->_searchResult[$artid]['search'][] = $searchword;
                $this->_searchResult[$artid]['occurence'][] = $occurrence;
                $this->_searchResult[$artid]['debug_similarity'][] = $percent;
                if (!isset($this->_searchResult[$artid]['similarity']) || $similarity > $this->_searchResult[$artid]['similarity']) {
                    $this->_searchResult[$artid]['similarity'] = $similarity;
                }
            }
        }
    }

    if ($this->_searchCombination == 'and') {
        // all search words must appear in the article; the excluded words
        // are deliberately not part of this check, otherwise every
        // article would be dropped either here or by the exclusion below
        foreach ($this->_searchResult as $article => $val) {
            if (count(array_diff($includeWords, $val['search'])) > 0) {
                unset($this->_searchResult[$article]);
            }
        }
    }

    if (count($this->_searchWordsExclude) > 0) {
        // search words to be excluded must not appear in article
        foreach ($this->_searchResult as $article => $val) {
            if (count(array_intersect($this->_searchWordsExclude, $val['search'])) > 0) {
                unset($this->_searchResult[$article]);
            }
        }
    }

    $this->_debug('$this->search_result', $this->_searchResult);
    $this->_debug('$this->searchable_arts', $this->_searchableArts);

    $searchTracking = new cApiSearchTrackingCollection();
    $searchTracking->trackSearch($searchwords, count($this->_searchResult));

    return $this->_searchResult;
}
439:
/**
 * Passes the cms-types which should explicitly be searched on to the
 * search index.
 *
 * Anything but a non-empty array is ignored.
 *
 * @param mixed $cms_options
 *         The cms-types (htmlhead, html, ...) which should explicitly be
 *         searched.
 */
public function setCmsOptions($cms_options) {
    if (!is_array($cms_options) || count($cms_options) == 0) {
        return;
    }

    $this->_index->setCmsOptions($cms_options);
}
451:
/**
 * Splits a search phrase into its normalized search words.
 *
 * Backslashes and html tags are removed, the phrase is split at commas
 * and whitespace, every word is lower-cased and words shorter than two
 * bytes are dropped. Duplicates are removed, the keys of the remaining
 * words are kept.
 *
 * @param string $searchwords
 *         The search-words
 * @return array
 *         of stripped search-words
 */
public function stripWords($searchwords) {
    // remove backslash and html tags
    $phrase = stripslashes($searchwords);
    $phrase = strip_tags($phrase);
    $phrase = trim($phrase);

    $words = array();

    // split the phrase by any number of commas or space characters
    foreach (mb_split('[\s,]+', $phrase) as $rawWord) {
        // the word is lower-cased in its entity-encoded form and decoded
        // again afterwards
        $encoded = htmlentities($rawWord, ENT_COMPAT, 'UTF-8');
        $lowered = trim(strtolower($encoded));
        $word = html_entity_decode($lowered, ENT_COMPAT, 'UTF-8');

        if (strlen($word) <= 1) {
            continue;
        }

        $words[] = $word;
    }

    return array_unique($words);
}
482:
/**
 * Returns the ids of a category and of all its descendants.
 *
 * All categories of the current client and language are read ordered by
 * idtree. A category is added if its parent is already part of the
 * collected list, so a child is only recognized if its parent came earlier
 * in that order.
 *
 * @todo This is not the job for search, should be outsourced ...
 * @param int $cat_start
 *         Root of a category tree
 * @return array
 *         Category tree: $cat_start followed by the ids of its descendants
 */
public function getSubTree($cat_start) {
    $treeTable = $this->cfg['tab']['cat_tree'];
    $catTable = $this->cfg['tab']['cat'];
    $catLangTable = $this->cfg['tab']['cat_lang'];
    $idlang = cSecurity::toInteger($this->lang);
    $idclient = cSecurity::toInteger($this->client);

    $sql = "SELECT
B.idcat, B.parentid
FROM
" . $treeTable . " AS A,
" . $catTable . " AS B,
" . $catLangTable . " AS C
WHERE
A.idcat = B.idcat AND
B.idcat = C.idcat AND
C.idlang = '" . $idlang . "' AND
B.idclient = '" . $idclient . "'
ORDER BY
idtree";
    $this->_debug('sql', $sql);
    $this->db->query($sql);

    // the root itself is always part of the tree
    $descendants = array(
        $cat_start
    );

    while ($this->db->nextRecord()) {
        // only children of an already recognized category are of interest
        if (!in_array($this->db->f('parentid'), $descendants)) {
            continue;
        }

        // add the category unless it is already recognized
        // (happens with $cat_start)
        $idcat = $this->db->f('idcat');
        if (!in_array($idcat, $descendants)) {
            $descendants[] = $idcat;
        }
    }

    return $descendants;
}
543:
/**
 * Returns list of searchable article ids.
 *
 * The search range is built from the category trees ('cat_tree'), the
 * categories ('categories') and the articles ('articles') given in
 * $search_range and is excluded or included depending on the 'exclude'
 * option of the constructor. Only articles of the current language which
 * are marked as searchable are returned, optionally restricted to the
 * configured article specifications and to online articles in visible
 * categories. If the 'protected' option is set, articles in non-public
 * categories are only returned if the chain
 * 'Contenido.Frontend.CategoryAccess' grants access to the current user.
 *
 * All ids are cast to integers before they are used in the query.
 *
 * @param array $search_range
 *         May contain the keys 'cat_tree', 'categories' and 'articles',
 *         each holding an array of ids
 * @return array
 *         Articles in specified search range
 */
public function getSearchableArticles($search_range) {
    global $auth;

    // categories: complete sub trees first, then the single categories
    $aCatRange = array();
    if (array_key_exists('cat_tree', $search_range) && is_array($search_range['cat_tree'])) {
        foreach ($search_range['cat_tree'] as $cat) {
            $aCatRange = array_merge($aCatRange, $this->getSubTree($cat));
        }
    }

    if (array_key_exists('categories', $search_range) && is_array($search_range['categories'])) {
        $aCatRange = array_merge($aCatRange, $search_range['categories']);
    }

    // the ids end up in the query, so they are forced to integers
    $aCatRange = array_unique(array_map('intval', $aCatRange));
    $sCatRange = implode("','", $aCatRange);

    // stays empty if no articles are given, the exclude branch below uses
    // it in any case
    $sArtRange = '';
    if (array_key_exists('articles', $search_range) && is_array($search_range['articles'])) {
        $sArtRange = implode("','", array_map('intval', $search_range['articles']));
    }

    if ($this->_protected == true || $this->_dontshowofflinearticles == true) {
        // category access of protected categories will be checked later
        $sProtected = " C.visible = 1 AND B.online = 1 ";
    } else {
        $sProtected = " 1 ";
    }

    if ($this->_exclude == true) {
        // exclude searchrange
        $sSearchRange = " A.idcat NOT IN ('" . $sCatRange . "') AND B.idart NOT IN ('" . $sArtRange . "') AND ";
    } else {
        // include searchrange
        // NOTE(review): categories and articles are combined with AND, so
        // given articles only match if they are inside the given
        // categories - confirm that this is intended
        if (strlen($sArtRange) > 0) {
            $sSearchRange = " A.idcat IN ('" . $sCatRange . "') AND B.idart IN ('" . $sArtRange . "') AND ";
        } else {
            $sSearchRange = " A.idcat IN ('" . $sCatRange . "') AND ";
        }
    }

    if (count($this->_articleSpecs) > 0) {
        $sArtSpecs = " B.artspec IN ('" . implode("','", array_map('intval', $this->_articleSpecs)) . "') AND ";
    } else {
        $sArtSpecs = '';
    }

    $sql = "SELECT
                A.idart,
                A.idcat,
                C.public
            FROM
                " . $this->cfg["tab"]["cat_art"] . " as A,
                " . $this->cfg["tab"]["art_lang"] . " as B,
                " . $this->cfg["tab"]["cat_lang"] . " as C
            WHERE
                " . $sSearchRange . "
                B.idlang = '" . cSecurity::toInteger($this->lang) . "' AND
                C.idlang = '" . cSecurity::toInteger($this->lang) . "' AND
                A.idart = B.idart AND
                B.searchable = 1 AND
                A.idcat = C.idcat AND
                " . $sArtSpecs . "
                " . $sProtected . " ";
    $this->_debug('sql', $sql);
    $this->db->query($sql);

    $aIdArts = array();
    while ($this->db->nextRecord()) {
        if ($this->db->f("idcat") != "" && $this->_protected) {
            if ($this->db->f("public") == "0") {
                // CEC to check category access
                // break at 'true', default value 'false'
                cApiCecHook::setBreakCondition(true, false);
                $allow = cApiCecHook::executeWhileBreakCondition('Contenido.Frontend.CategoryAccess', $this->lang, $this->db->f("idcat"), $auth->auth['uid']);
                if (!$allow) {
                    continue;
                }
            }
        }

        $aIdArts[] = $this->db->f('idart');
    }

    return $aIdArts;
}
647:
/**
 * Reads the ids of all article specifications of the current client and
 * language which are online.
 *
 * @return array
 *         Array of article specification ids
 */
public function getArticleSpecifications() {
    $specTable = $this->cfg['tab']['art_spec'];
    $idclient = cSecurity::toInteger($this->client);
    $idlang = cSecurity::toInteger($this->lang);

    $sql = "SELECT
idartspec
FROM
" . $specTable . "
WHERE
client = " . $idclient . " AND
lang = " . $idlang . " AND
online = 1 ";
    $this->_debug('sql', $sql);
    $this->db->query($sql);

    // collect the id of every record found
    $specIds = array();
    while ($this->db->nextRecord()) {
        array_push($specIds, $this->db->f('idartspec'));
    }

    return $specIds;
}
671:
/**
 * Adds an article specification to the list of article specifications.
 *
 * The id is appended; specifications which are already set are kept.
 *
 * @param int $iArtspecID
 *         Id of the article specification
 */
public function setArticleSpecification($iArtspecID) {
    array_push($this->_articleSpecs, $iArtspecID);
}
680:
/**
 * Add all article specifications matching name of article specification
 * (client dependent but language independent).
 *
 * The ids found are appended to the list of article specifications.
 *
 * @param string $sArtSpecName
 *         Name of the article specification
 * @return bool
 *         false if no name was given, true if the article specifications
 *         were looked up (even if none matched)
 */
public function addArticleSpecificationsByName($sArtSpecName) {
    if (!isset($sArtSpecName) || strlen($sArtSpecName) == 0) {
        return false;
    }

    $sql = "SELECT
                idartspec
            FROM
                " . $this->cfg['tab']['art_spec'] . "
            WHERE
                client = " . cSecurity::toInteger($this->client) . " AND
                artspec = '" . $this->db->escape($sArtSpecName) . "' ";
    $this->_debug('sql', $sql);
    $this->db->query($sql);
    while ($this->db->nextRecord()) {
        $this->_articleSpecs[] = $this->db->f('idartspec');
    }

    // the documented return type is bool; formerly nothing was returned
    // on this path
    return true;
}
706: }
707: