1: <?php
2: /**
3: * This file contains the class for content search.
4: *
5: * @package Core
6: * @subpackage Frontend_Search
7: * @version SVN Revision $Rev:$
8: *
9: * @author Willi Man
10: * @copyright four for business AG <www.4fb.de>
11: * @license http://www.contenido.org/license/LIZENZ.txt
12: * @link http://www.4fb.de
13: * @link http://www.contenido.org
14: */
15: defined('CON_FRAMEWORK') || die('Illegal call: Missing framework initialization - request aborted.');
16:
17: cInclude('includes', 'functions.encoding.php');
18:
19: /**
20: * CONTENIDO API - Search Object
21: *
 * This object starts an indexed fulltext search
23: *
24: * TODO:
 * The way to set the search options could be done much better!
26: * The computation of the set of searchable articles should not be treated in
27: * this class.
28: * It is better to compute the array of searchable articles from the outside and
29: * to pass the array of searchable articles as parameter.
30: * Avoid foreach loops.
31: *
32: * Use object with
33: *
34: * $options = array('db' => 'regexp', // use db function regexp
35: * 'combine' => 'or'); // combine searchwords with or
36: *
37: * The range of searchable articles is by default the complete content which is
38: * online and not protected.
39: *
40: * With option 'searchable_articles' you can define your own set of searchable
41: * articles.
42: * If parameter 'searchable_articles' is set the options 'cat_tree',
43: * 'categories', 'articles', 'exclude', 'artspecs',
44: * 'protected', 'dontshowofflinearticles' don't have any effect.
45: *
46: * $options = array('db' => 'regexp', // use db function regexp
47: * 'combine' => 'or', // combine searchwords with or
48: * 'searchable_articles' => array(5, 6, 9, 13));
49: *
50: * One can define the range of searchable articles by setting the parameter
51: * 'exclude' to false which means the range of categories
52: * defined by parameter 'cat_tree' or 'categories' and the range of articles
53: * defined by parameter 'articles' is included.
54: *
55: * $options = array('db' => 'regexp', // use db function regexp
56: * 'combine' => 'or', // combine searchwords with or
57: * 'exclude' => false, // => searchrange specified in 'cat_tree', 'categories'
58: * and 'articles' is included
59: * 'cat_tree' => array(12), // tree with root 12 included
60: * 'categories' => array(100,111), // categories 100, 111 included
61: * 'articles' => array(33), // article 33 included
62: * 'artspecs' => array(2, 3), // array of article specifications => search only
63: * articles with these artspecs
64: * 'res_per_page' => 2, // results per page
 * 'protected' => true, // => do not search articles or articles in categories
66: * which are offline or protected
67: * 'dontshowofflinearticles' => false); // => search offline articles or
68: * articles in categories which are offline
69: *
70: * You can build the complement of the range of searchable articles by setting
71: * the parameter 'exclude' to true which means the range of categories
72: * defined by parameter 'cat_tree' or 'categories' and the range of articles
73: * defined by parameter 'articles' is excluded from search.
74: *
75: * $options = array('db' => 'regexp', // use db function regexp
76: * 'combine' => 'or', // combine searchwords with or
77: * 'exclude' => true, // => searchrange specified in 'cat_tree', 'categories'
78: * and 'articles' is excluded
79: * 'cat_tree' => array(12), // tree with root 12 excluded
80: * 'categories' => array(100,111), // categories 100, 111 excluded
81: * 'articles' => array(33), // article 33 excluded
82: * 'artspecs' => array(2, 3), // array of article specifications => search only
83: * articles with these artspecs
84: * 'res_per_page' => 2, // results per page
 * 'protected' => true, // => do not search articles or articles in categories
86: * which are offline or protected
87: * 'dontshowofflinearticles' => false); // => search offline articles or
88: * articles in categories which are offline
89: *
 * $search = new cSearch($options);
91: *
92: * $cms_options = array("htmlhead", "html", "head", "text", "imgdescr", "link",
93: * "linkdescr");
94: * search only in these cms-types
95: * $search->setCmsOptions($cms_options);
96: *
97: * $search_result = $search->searchIndex($searchword, $searchwordex); // start
98: * search
99: *
100: * The search result structure has following form
101: * Array (
102: * [20] => Array (
103: * [CMS_HTML] => Array (
104: * [0] => 1
105: * [1] => 1
106: * [2] => 1
107: * )
108: * [keyword] => Array (
109: * [0] => content
110: * [1] => contenido
111: * [2] => wwwcontenidoorg
112: * )
113: * [search] => Array (
114: * [0] => con
115: * [1] => con
116: * [2] => con
117: * )
118: * [occurence] => Array (
119: * [0] => 1
120: * [1] => 5
121: * [2] => 1
122: * )
123: * [similarity] => 60
124: * )
125: * )
126: *
127: * The keys of the array are the article ID's found by search.
128: *
129: * Searching 'con' matches keywords 'content', 'contenido' and 'wwwcontenidoorg'
130: * in article with ID 20 in content type CMS_HTML[1].
131: * The search term occurs 7 times.
132: * The maximum similarity between searchterm and matching keyword is 60%.
133: *
134: * with $oSearchResults = new cSearchResult($search_result, 10);
135: * one can rank and display the results
136: *
137: * @package Core
138: * @subpackage Frontend_Search
139: */
class cSearch extends cSearchBaseAbstract {

    /**
     * Search index helper. In this class it is used to pass on the CMS type
     * options and to check whether a CMS type takes part in the search.
     *
     * @var cSearchIndex
     */
    protected $_index;

    /**
     * The search words of the current search (without the exclude words).
     *
     * @var array
     */
    protected $_searchWords = array();

    /**
     * The words which must not match in a found article.
     *
     * @var array
     */
    protected $_searchWordsExclude = array();

    /**
     * Type of keyword comparison in the database:
     * 'like' => SQL LIKE, 'exact' => SQL equality, anything else (default
     * 'regexp') => SQL REGEXP.
     *
     * @var string
     */
    protected $_searchOption;

    /**
     * Logical combination of the search words ('and', 'or').
     *
     * @var string
     */
    protected $_searchCombination;

    /**
     * IDs of the articles which may appear in a search result.
     *
     * @var array
     */
    protected $_searchableArts = array();

    /**
     * Article specification IDs the searchable articles are restricted to.
     *
     * @var array
     */
    protected $_articleSpecs = array();

    /**
     * If true, only online articles in visible categories are searchable and
     * articles in non-public categories are only searchable when the
     * 'Contenido.Frontend.CategoryAccess' chain grants access.
     *
     * @var boolean
     */
    protected $_protected;

    /**
     * If false (and $_protected is false as well), offline articles and
     * articles in invisible categories are searchable too.
     *
     * @var boolean
     */
    protected $_dontshowofflinearticles;

    /**
     * If true the specified search range is excluded from search, otherwise
     * it is included.
     *
     * @var boolean
     */
    protected $_exclude;

    /**
     * Search result: article IDs mapped to information about CMS types,
     * matching keywords, search words, occurrences and similarity.
     *
     * @var array
     */
    protected $_searchResult = array();

    /**
     * Minimum similarity (percentage computed by similar_text()) between a
     * search word and a keyword for the keyword to count as a hit.
     *
     * Declared explicitly because creating properties dynamically is
     * deprecated as of PHP 8.2. It keeps its historical name and public
     * visibility so code that reads or adjusts it keeps working.
     *
     * @var int
     */
    public $intMinimumSimilarity = 50;

    /**
     * Constructor
     *
     * @param array $options
     *        $options['db'] 'regexp' => DB search with REGEXP (default);
     *        'like' => DB search with LIKE; 'exact' => exact match
     *        $options['combine'] 'and', 'or' (default) combination of the
     *        search words
     *        $options['exclude'] true (default) => search range specified in
     *        'cat_tree', 'categories' and 'articles' is excluded; false =>
     *        that search range is included
     *        $options['cat_tree'] e.g. array(8) => the complete tree with
     *        root 8 is in/excluded from search
     *        $options['categories'] e.g. array(10, 12) => categories 10, 12
     *        in/excluded
     *        $options['articles'] e.g. array(33) => article 33 in/excluded
     *        $options['artspecs'] e.g. array(2, 3) => search only articles
     *        with these article specifications
     *        $options['protected'] true (default) => do not search offline
     *        articles, articles in invisible categories or articles in
     *        protected categories the user has no access to
     *        $options['dontshowofflinearticles'] false => search offline
     *        articles and articles in invisible categories (only evaluated
     *        when 'protected' is false)
     *        $options['searchable_articles'] array of article IDs which
     *        should be searchable; bypasses all range options above
     * @param cDb $db Optional database instance
     */
    public function __construct($options, $db = NULL) {
        parent::__construct($db);

        // array_key_exists() fails on non-arrays; treat them as "no options".
        if (!is_array($options)) {
            $options = array();
        }

        $this->_index = new cSearchIndex($db);

        $this->_searchOption = strtolower((string) $this->_getOption($options, 'db', 'regexp'));
        $this->_searchCombination = strtolower((string) $this->_getOption($options, 'combine', 'or'));
        $this->_protected = $this->_getOption($options, 'protected', true);
        $this->_dontshowofflinearticles = $this->_getOption($options, 'dontshowofflinearticles', true);
        $this->_exclude = $this->_getOption($options, 'exclude', true);

        $artSpecs = $this->_getOption($options, 'artspecs', array());
        $this->_articleSpecs = is_array($artSpecs) ? $artSpecs : array();

        // The flags above must be set before the searchable articles are
        // computed, because getSearchableArticles() reads them.
        $searchableArticles = $this->_getOption($options, 'searchable_articles', NULL);
        if (is_array($searchableArticles)) {
            $this->_searchableArts = $searchableArticles;
        } else {
            $this->_searchableArts = $this->getSearchableArticles($options);
        }

        // minimum similarity between searchword and keyword in percent
        $this->intMinimumSimilarity = 50;
    }

    /**
     * Indexed fulltext search.
     *
     * Looks up all keywords of the current language which match one of the
     * given words, collects the searchable articles they occur in and
     * finally applies the 'and' combination and the exclude words.
     * Every call starts with an empty result, so the instance can be reused.
     *
     * @param string $searchwords The search words
     * @param string $searchwords_exclude The words which must not match in a
     *        found article
     * @return array|bool false if $searchwords is empty, otherwise the search
     *         result indexed by article ID (see class documentation)
     */
    public function searchIndex($searchwords, $searchwords_exclude = '') {
        if (strlen(trim((string) $searchwords)) == 0) {
            return false;
        }

        // Reset the per-search state; otherwise hits and exclude words of an
        // earlier call would leak into this one.
        $this->_searchResult = array();
        $this->_searchWordsExclude = array();

        $this->_searchWords = $this->stripWords($searchwords);
        if (strlen(trim((string) $searchwords_exclude)) > 0) {
            $this->_searchWordsExclude = $this->stripWords($searchwords_exclude);
        }

        // stripWords() drops one-character words, so the list can be empty
        // although the input was not. Querying with no words would produce
        // invalid SQL, therefore the lookup is skipped in that case.
        if (count($this->_searchWords) > 0) {
            // The exclude words have to be looked up too, because an article
            // can only be rejected if its matching keywords are known. They
            // are kept out of $_searchWords so that the 'and' check below
            // only demands the real search words.
            $matchWords = array_merge($this->_searchWords, $this->_searchWordsExclude);
            $this->_collectMatches($matchWords);

            if ($this->_searchCombination == 'and') {
                // all search words must appear in the article
                foreach ($this->_searchResult as $article => $val) {
                    if (count(array_diff($this->_searchWords, $val['search'])) > 0) {
                        unset($this->_searchResult[$article]);
                    }
                }
            }

            if (count($this->_searchWordsExclude) > 0) {
                // search words to be excluded must not appear in article
                foreach ($this->_searchResult as $article => $val) {
                    if (count(array_intersect($this->_searchWordsExclude, $val['search'])) > 0) {
                        unset($this->_searchResult[$article]);
                    }
                }
            }
        }

        $this->_debug('$this->search_result', $this->_searchResult);
        $this->_debug('$this->searchable_arts', $this->_searchableArts);

        $searchTracking = new cApiSearchTrackingCollection();
        $searchTracking->trackSearch($searchwords, count($this->_searchResult));

        return $this->_searchResult;
    }

    /**
     * Passes the CMS types which should explicitly be searched on to the
     * search index. Anything but a non-empty array is ignored.
     *
     * @param mixed $cms_options The cms-types (htmlhead, html, ...) which
     *        should explicitly be searched
     */
    public function setCmsOptions($cms_options) {
        if (!is_array($cms_options) || count($cms_options) == 0) {
            return;
        }

        $this->_index->setCmsOptions($cms_options);
    }

    /**
     * Splits a search phrase into normalized search words.
     *
     * Backslashes and HTML tags are removed, the phrase is split at
     * whitespace and commas, every word is lowercased and words shorter than
     * two bytes are dropped.
     *
     * @param string $searchwords The search-words
     * @return array List of unique stripped search-words
     */
    public function stripWords($searchwords) {
        // remove backslash and html tags
        $searchwords = trim(strip_tags(stripslashes((string) $searchwords)));

        // split the phrase by any number of commas or space characters
        $tmp_words = mb_split('[\s,]+', $searchwords);
        if (!is_array($tmp_words)) {
            // mb_split() returns false on failure; fall back to a byte-wise
            // split instead of iterating over false.
            $tmp_words = preg_split('/[\s,]+/', $searchwords);
            if (!is_array($tmp_words)) {
                $tmp_words = array();
            }
        }

        $tmp_searchwords = array();
        foreach ($tmp_words as $word) {
            // Lowercasing the entity-encoded form also lowercases characters
            // which have a named entity (e.g. &Auml; => &auml;) without
            // applying the byte-wise strtolower() to their UTF-8 bytes.
            $word = htmlentities($word, ENT_COMPAT, 'UTF-8');
            $word = trim(strtolower($word));
            $word = html_entity_decode($word, ENT_COMPAT, 'UTF-8');

            if (strlen($word) > 1) {
                $tmp_searchwords[] = $word;
            }
        }

        // array_unique() keeps the original keys; re-index to return a list
        return array_values(array_unique($tmp_searchwords));
    }

    /**
     * Returns the IDs of a category and all of its descendants.
     *
     * Relies on the rows being ordered by idtree so that a category is
     * always read after its parent.
     *
     * @param int $cat_start Root of a category tree
     * @return array Category IDs, starting with $cat_start
     * @todo This is not the job for search, should be outsourced ...
     */
    public function getSubTree($cat_start) {
        $sql = "SELECT
                    B.idcat, B.parentid
                FROM
                    " . $this->cfg['tab']['cat_tree'] . " AS A,
                    " . $this->cfg['tab']['cat'] . " AS B,
                    " . $this->cfg['tab']['cat_lang'] . " AS C
                WHERE
                    A.idcat = B.idcat AND
                    B.idcat = C.idcat AND
                    C.idlang = '" . cSecurity::toInteger($this->lang) . "' AND
                    B.idclient = '" . cSecurity::toInteger($this->client) . "'
                ORDER BY
                    idtree";
        $this->_debug('sql', $sql);
        $this->db->query($sql);

        $aSubCats = array(
            $cat_start
        );
        // IDs already recognized as part of the tree, used as hash set so
        // that the membership checks do not scan the whole list per row
        $aKnown = array(
            (string) $cat_start => true
        );

        while ($this->db->nextRecord()) {
            $idcat = $this->db->f('idcat');

            // omit if cat is no child of any recognized descendant
            if (!isset($aKnown[(string) $this->db->f('parentid')])) {
                continue;
            }
            // omit if cat is already recognized (happens with $cat_start)
            if (isset($aKnown[(string) $idcat])) {
                continue;
            }

            // add cat as recognized descendant
            $aKnown[(string) $idcat] = true;
            $aSubCats[] = $idcat;
        }

        return $aSubCats;
    }

    /**
     * Returns list of searchable article ids.
     *
     * Evaluates the range options 'cat_tree', 'categories' and 'articles'
     * together with the flags and article specifications stored in this
     * object. With the range included, articles must be in one of the
     * categories and, if 'articles' is not empty, also in that article list.
     * If $_protected is set, articles in non-public categories are only
     * returned when the 'Contenido.Frontend.CategoryAccess' chain grants
     * access for the current user.
     *
     * @param array $search_range Options as described for the constructor
     * @return array Articles in specified search range
     */
    public function getSearchableArticles($search_range) {
        global $auth;

        if (!is_array($search_range)) {
            $search_range = array();
        }

        $aCatRange = array();
        if (array_key_exists('cat_tree', $search_range) && is_array($search_range['cat_tree'])) {
            foreach ($search_range['cat_tree'] as $cat) {
                $aCatRange = array_merge($aCatRange, $this->getSubTree($cat));
            }
        }
        if (array_key_exists('categories', $search_range) && is_array($search_range['categories'])) {
            $aCatRange = array_merge($aCatRange, $search_range['categories']);
        }
        $sCatRange = $this->_buildIdList(array_unique($aCatRange));

        // Always defined, also when the 'articles' option is missing
        $sArtRange = '';
        if (array_key_exists('articles', $search_range) && is_array($search_range['articles'])) {
            $sArtRange = $this->_buildIdList($search_range['articles']);
        }

        if ($this->_protected == true || $this->_dontshowofflinearticles == true) {
            // category access of protected categories is checked below
            $sProtected = " C.visible = 1 AND B.online = 1 ";
        } else {
            $sProtected = " 1 ";
        }

        if ($this->_exclude == true) {
            // exclude searchrange
            $sSearchRange = " A.idcat NOT IN ('" . $sCatRange . "') AND B.idart NOT IN ('" . $sArtRange . "') AND ";
        } elseif (strlen($sArtRange) > 0) {
            // include searchrange, restricted to the given articles
            $sSearchRange = " A.idcat IN ('" . $sCatRange . "') AND B.idart IN ('" . $sArtRange . "') AND ";
        } else {
            // include searchrange
            $sSearchRange = " A.idcat IN ('" . $sCatRange . "') AND ";
        }

        $sArtSpecs = '';
        if (count($this->_articleSpecs) > 0) {
            $sArtSpecs = " B.artspec IN ('" . $this->_buildIdList($this->_articleSpecs) . "') AND ";
        }

        $sql = "SELECT
                    A.idart,
                    A.idcat,
                    C.public
                FROM
                    " . $this->cfg["tab"]["cat_art"] . " as A,
                    " . $this->cfg["tab"]["art_lang"] . " as B,
                    " . $this->cfg["tab"]["cat_lang"] . " as C
                WHERE
                    " . $sSearchRange . "
                    B.idlang = '" . cSecurity::toInteger($this->lang) . "' AND
                    C.idlang = '" . cSecurity::toInteger($this->lang) . "' AND
                    A.idart = B.idart AND
                    B.searchable = 1 AND
                    A.idcat = C.idcat AND
                    " . $sArtSpecs . "
                    " . $sProtected . " ";
        $this->_debug('sql', $sql);
        $this->db->query($sql);

        // Do not fail when no authentication object is available
        $uid = (is_object($auth) && isset($auth->auth['uid'])) ? $auth->auth['uid'] : NULL;

        $aIdArts = array();
        while ($this->db->nextRecord()) {
            $idcat = $this->db->f("idcat");

            if ($idcat != "" && $this->_protected && $this->db->f("public") == "0") {
                // CEC to check category access
                // break at 'true', default value 'false'
                cApiCecHook::setBreakCondition(true, false);
                $allow = cApiCecHook::executeWhileBreakCondition('Contenido.Frontend.CategoryAccess', $this->lang, $idcat, $uid);
                if (!$allow) {
                    continue;
                }
            }

            $aIdArts[] = $this->db->f('idart');
        }

        return $aIdArts;
    }

    /**
     * Fetch all article specifications of the current client and language
     * which are online.
     *
     * @return array Array of article specification Ids
     */
    public function getArticleSpecifications() {
        $sql = "SELECT
                    idartspec
                FROM
                    " . $this->cfg['tab']['art_spec'] . "
                WHERE
                    client = " . cSecurity::toInteger($this->client) . " AND
                    lang = " . cSecurity::toInteger($this->lang) . " AND
                    online = 1 ";
        $this->_debug('sql', $sql);
        $this->db->query($sql);

        $aArtspec = array();
        while ($this->db->nextRecord()) {
            $aArtspec[] = $this->db->f('idartspec');
        }

        return $aArtspec;
    }

    /**
     * Adds an article specification to the list the searchable articles are
     * restricted to.
     *
     * NOTE: the list of searchable articles is computed in the constructor,
     * so a specification added here only takes effect for later calls of
     * getSearchableArticles().
     *
     * @param int $iArtspecID
     */
    public function setArticleSpecification($iArtspecID) {
        $this->_articleSpecs[] = $iArtspecID;
    }

    /**
     * Add all article specifications matching name of article specification
     * (client dependent but language independent).
     *
     * NOTE: the list of searchable articles is computed in the constructor,
     * so specifications added here only take effect for later calls of
     * getSearchableArticles().
     *
     * @param string $sArtSpecName
     * @return boolean false if no name was given, otherwise true
     */
    public function addArticleSpecificationsByName($sArtSpecName) {
        if (!isset($sArtSpecName) || strlen($sArtSpecName) == 0) {
            return false;
        }

        $sql = "SELECT
                    idartspec
                FROM
                    " . $this->cfg['tab']['art_spec'] . "
                WHERE
                    client = " . cSecurity::toInteger($this->client) . " AND
                    artspec = '" . $this->db->escape($sArtSpecName) . "' ";
        $this->_debug('sql', $sql);
        $this->db->query($sql);

        while ($this->db->nextRecord()) {
            $this->_articleSpecs[] = $this->db->f('idartspec');
        }

        return true;
    }

    /**
     * Returns the value of an option or the default if the option is not set.
     *
     * @param array $options
     * @param string $key
     * @param mixed $default
     * @return mixed
     */
    private function _getOption(array $options, $key, $default) {
        return array_key_exists($key, $options) ? $options[$key] : $default;
    }

    /**
     * Builds the content of a quoted SQL IN list ("1','2','3") with every
     * value cast to integer. Returns an empty string for an empty list.
     *
     * @param array $ids
     * @return string
     */
    private function _buildIdList(array $ids) {
        $clean = array();
        foreach ($ids as $id) {
            $clean[] = cSecurity::toInteger($id);
        }

        return implode("','", $clean);
    }

    /**
     * Builds the SQL condition matching the keyword column against the given
     * words according to the configured search option. Unknown search
     * options are treated like 'regexp'.
     *
     * @param array $words Non-empty list of words
     * @return string SQL condition (without surrounding parentheses)
     */
    private function _buildKeywordCondition(array $words) {
        $escaped = array();
        foreach ($words as $word) {
            $escaped[] = cSecurity::escapeDB($word, $this->db);
        }

        switch ($this->_searchOption) {
            case 'like':
                return "keyword LIKE '%" . implode("%' OR keyword LIKE '%", $escaped) . "%'";
            case 'exact':
                return "keyword = '" . implode("' OR keyword = '", $escaped) . "'";
            default:
                // NOTE(review): the words are used as regular expression
                // without escaping regex meta characters - confirm whether
                // that is intended.
                return "keyword REGEXP '" . implode('|', $escaped) . "'";
        }
    }

    /**
     * Determines which of the given words is most similar to a keyword.
     *
     * @param string $keyword
     * @param array $words
     * @return array 'word' => most similar word, 'similarity' => its
     *         similarity in percent, 'lastPercent' => similarity of the last
     *         compared word
     */
    private function _findBestMatch($keyword, array $words) {
        $match = array(
            'word' => '',
            'similarity' => 0,
            'lastPercent' => 0
        );

        foreach ($words as $word) {
            $percent = 0;
            similar_text((string) $word, (string) $keyword, $percent);
            if ($percent > $match['similarity']) {
                $match['similarity'] = $percent;
                $match['word'] = $word;
            }
            $match['lastPercent'] = $percent;
        }

        return $match;
    }

    /**
     * Queries the keyword index for the given words and adds every hit in a
     * searchable article to $_searchResult.
     *
     * Judging by the parsing below, the 'auto' column is expected to hold
     * entries of the form "idart=occurrence(TYPE-typeid,TYPE-typeid)"
     * separated by '&' - TODO confirm against the index writer.
     *
     * @param array $words Non-empty list of search and exclude words
     */
    private function _collectMatches(array $words) {
        // The keyword condition is put in parentheses, otherwise only its
        // first OR-part would be restricted to the language.
        $sql = "SELECT keyword, auto FROM " . $this->cfg['tab']['keywords'] . " WHERE idlang=" . cSecurity::toInteger($this->lang) . " AND (" . $this->_buildKeywordCondition($words) . ") ";
        $this->_debug('sql', $sql);
        $this->db->query($sql);

        // hash set of the searchable article IDs for cheap lookups
        $searchable = array();
        foreach ($this->_searchableArts as $idart) {
            $searchable[(string) $idart] = true;
        }

        while ($this->db->nextRecord()) {
            $keyword = $this->db->f('keyword');
            $auto = (string) $this->db->f('auto');
            $this->_debug('index', $auto);

            $entries = array();
            foreach (preg_split('/&/', $auto, -1, PREG_SPLIT_NO_EMPTY) as $entry) {
                $entry = preg_replace('/[=\(\)]/', ' ', $entry);
                $entries[] = preg_split('/\s/', $entry, -1, PREG_SPLIT_NO_EMPTY);
            }
            $this->_debug('tmp_index', $entries);

            // The similarity only depends on the keyword, so it is computed
            // once per keyword. Keywords which are not similar enough to any
            // word never contribute to the result.
            $match = $this->_findBestMatch($keyword, $words);
            if ($match['similarity'] < $this->intMinimumSimilarity) {
                continue;
            }

            foreach ($entries as $entry) {
                // skip malformed entries and nonsearchable articles
                if (count($entry) < 3 || !isset($searchable[(string) $entry[0]])) {
                    continue;
                }

                $this->_addHit($entry[0], $entry[1], $entry[2], $keyword, $match);
            }
        }
    }

    /**
     * Adds the hits of one keyword in one article to $_searchResult, one hit
     * per CMS type which passes the check of the search index.
     *
     * @param string $artid Article ID
     * @param string $occurrence Number of occurrences as stored in the index
     * @param string $cmsPlaces Comma separated list of "TYPE-typeid"
     * @param string $keyword The matching keyword
     * @param array $match Result of _findBestMatch() for the keyword
     */
    private function _addHit($artid, $occurrence, $cmsPlaces, $keyword, array $match) {
        $places = preg_split('/[,]/', $cmsPlaces, -1, PREG_SPLIT_NO_EMPTY);
        $this->_debug('tmp_cmstype', $places);

        $pairs = array();
        foreach ($places as $place) {
            $pairs[] = preg_split('/-/', $place, -1, PREG_SPLIT_NO_EMPTY);
        }
        $this->_debug('tmp_cmstype2', $pairs);

        foreach ($pairs as $pair) {
            // only types for which checkCmsType() returns false are recorded
            if (count($pair) == 0 || $this->_index->checkCmsType($pair[0])) {
                continue;
            }

            $this->_searchResult[$artid][$pair[0]][] = isset($pair[1]) ? $pair[1] : NULL;
            $this->_searchResult[$artid]['keyword'][] = $keyword;
            $this->_searchResult[$artid]['search'][] = $match['word'];
            $this->_searchResult[$artid]['occurence'][] = $occurrence;
            $this->_searchResult[$artid]['debug_similarity'][] = $match['lastPercent'];

            if (!isset($this->_searchResult[$artid]['similarity']) || $match['similarity'] > $this->_searchResult[$artid]['similarity']) {
                $this->_searchResult[$artid]['similarity'] = $match['similarity'];
            }
        }
    }
}
689: