1: <?php
2:
3: /**
4: *
5: * @package Plugin
6: * @subpackage SearchSolr
7: * @version SVN Revision $Rev:$
8: * @author marcus.gnass
9: * @copyright four for business AG
10: * @link http://www.4fb.de
11: */
12:
13: // assert CONTENIDO framework
14: defined('CON_FRAMEWORK') || die('Illegal call: Missing framework initialization - request aborted.');
15:
16: /**
17: * This class sends update requests to a Solr core.
18: * If the request failed an
19: * exception is thrown. This class allows handling of more than one article at
20: * once.
21: *
22: * <server>:8080/solr/admin/cores?action=STATUS
23: * <server>:8080/solr/admin/cores?action=RENAME&core=collection1&other=contenido
24: * <server>:8080/solr/admin/cores?action=RELOAD&core=contenido
25: *
26: * @author marcus.gnass
27: */
28: class SolrIndexer {
29:
30: /**
31: * @var bool
32: */
33: const DBG = false;
34:
35: /**
* Prefix to be used for the Solr <uniqueKey> in order to distinguish documents
* from different sources.
38: *
39: * @var string
40: */
41: const ID_PREFIX = 'contenido_article_';
42:
43: /**
44: *
45: * @var array of SolrClient
46: */
47: private $_solrClients = NULL;
48:
49: /**
50: * IDs of articles to be updated / added / deleted.
51: *
52: * @var array
53: */
54: private $_articleIds = array();
55:
/**
 * CEC chain function for updating an article in the Solr core (index).
 *
 * This function is intended to be called after storing an article. It
 * resolves all IDs describing the stored article language (client,
 * language, category, category language, article and article language) and
 * hands them to handleStoringOfContentEntry(), which removes the article
 * from the index and adds it again if it is indexable.
 *
 * If the stored article is the article currently known to the registry, the
 * IDs are taken from cRegistry. Otherwise they are read from the database;
 * IDs that cannot be resolved are passed on as NULL.
 *
 * @param array $newData data of the stored article; the keys 'idartlang'
 *        and (for articles other than the current one) 'idart' are read
 * @param array $oldData data of the article before storing (not used)
 */
public static function handleStoringOfArticle(array $newData, array $oldData) {

    if (cRegistry::getArticleLanguageId() == $newData['idartlang']) {
        // quite easy if the given article is the current article
        $idclient = cRegistry::getClientId();
        $idlang = cRegistry::getLanguageId();
        $idcat = cRegistry::getCategoryId();
        $idart = cRegistry::getArticleId();
        $idcatlang = cRegistry::getCategoryLanguageId();
        $idartlang = cRegistry::getArticleLanguageId();
    } else {
        // for other articles these infos have to be read from the DB;
        // everything that cannot be resolved stays NULL
        $idclient = NULL;
        $idlang = NULL;
        $idcat = NULL;
        $idcatlang = NULL;

        // the IDs of the article itself are part of the given data
        $idart = $newData['idart'];
        $idartlang = $newData['idartlang'];

        // get idclient by idart
        $article = new cApiArticle($idart);
        if ($article->isLoaded()) {
            $idclient = $article->get('idclient');
        }

        // get idlang by idartlang
        $articleLanguage = new cApiArticleLanguage($idartlang);
        if ($articleLanguage->isLoaded()) {
            $idlang = $articleLanguage->get('idlang');
        }

        // get first idcat by idart (array_shift() needs a real variable as
        // its argument is passed by reference)
        $coll = new cApiCategoryArticleCollection();
        $idcats = $coll->getCategoryIdsByArticleId($idart);
        if (is_array($idcats) && 0 < count($idcats)) {
            $idcat = array_shift($idcats);
        }

        // get idcatlang by idcat & idlang
        $categoryLanguage = new cApiCategoryLanguage();
        $categoryLanguage->loadByCategoryIdAndLanguageId($idcat, $idlang);
        if ($categoryLanguage->isLoaded()) {
            $idcatlang = $categoryLanguage->get('idcatlang');
        }
    }

    self::handleStoringOfContentEntry(array(
        'idclient' => $idclient,
        'idlang' => $idlang,
        'idcat' => $idcat,
        'idcatlang' => $idcatlang,
        'idart' => $idart,
        'idartlang' => $idartlang
    ));
}
112:
/**
 * CEC chain function for updating a single article in the Solr core
 * (index).
 *
 * An indexer is created for the given article and asked to update it, i.e.
 * to delete it from the index and to add it again if it is indexable.
 * A cException raised while updating is not propagated but displayed as a
 * GUI notification: as a warning if it is a SolrWarning, as an error
 * otherwise.
 *
 * @param array $articleIds ID set of the article to be updated (keys
 *        idclient, idlang, idcat, idcatlang, idart and idartlang)
 */
public static function handleStoringOfContentEntry(array $articleIds) {
    try {
        // the indexer expects a list of ID sets, so wrap the single set
        $solrIndexer = new self(array(
            $articleIds
        ));
        $solrIndexer->updateArticles();
    } catch (cException $exception) {
        if ($exception instanceof SolrWarning) {
            $level = cGuiNotification::LEVEL_WARNING;
        } else {
            $level = cGuiNotification::LEVEL_ERROR;
        }
        $notification = new cGuiNotification();
        $notification->displayNotification($level, $exception->getMessage());
    }

    // drop the indexer to free memory
    unset($solrIndexer);
}
143:
/**
 * Stores the ID sets of the articles this indexer should handle.
 *
 * No Solr client is created here; clients are created on demand by
 * _getSolrClient() when a request is about to be sent.
 *
 * @param array $articleIds list of ID sets, one per article; each set is
 *        an array from which the keys idclient, idlang, idcat, idcatlang,
 *        idart and idartlang are read
 */
public function __construct(array $articleIds) {
    $this->_articleIds = $articleIds;
}
152:
/**
 * Releases all aggregated Solr clients.
 *
 * The references to the clients are removed so that the clients can be
 * destroyed and their memory freed.
 */
public function __destruct() {
    // $_solrClients is NULL until the first client has been requested, so
    // it must not be iterated unconditionally
    if (!is_array($this->_solrClients)) {
        return;
    }

    foreach (array_keys($this->_solrClients) as $idclient) {
        unset($this->_solrClients[$idclient]);
    }
}
164:
/**
 * Returns the Solr client for the given client & language.
 *
 * Clients are created on first use and cached per client & language, so
 * repeated calls with the same IDs return the same instance. The client
 * options are read via Solr::getClientOptions() and passed through
 * Solr::validateClientOptions() before the client is created (presumably
 * the validation throws on invalid options - verify against the Solr
 * class).
 *
 * @param int $idclient
 * @param int $idlang
 * @return SolrClient
 */
private function _getSolrClient($idclient, $idlang) {

    // reuse a client that was already created for this client & language
    if (isset($this->_solrClients[$idclient][$idlang])) {
        return $this->_solrClients[$idclient][$idlang];
    }

    $options = Solr::getClientOptions($idclient, $idlang);
    Solr::validateClientOptions($options);

    $solrClient = new SolrClient($options);
    $this->_solrClients[$idclient][$idlang] = $solrClient;

    return $solrClient;
}
181:
/**
 * Adds all indexable articles of this indexer to the Solr index.
 *
 * For each aggregated article that is indexable (see _isIndexable()) a
 * Solr input document is created and filled with the article's URL, its
 * IDs and its content. The documents are grouped by language and sent to
 * the Solr client of the respective client & language.
 *
 * If the DBG constant is set, the requests are only written to the error
 * log and nothing is sent to Solr.
 *
 * @throws cException if Solr add request failed
 */
public function addArticles() {

    // documents to be added, grouped by language ID
    $toAdd = array();
    foreach ($this->_articleIds as $articleIds) {

        // skip if article should not be indexed
        if (!$this->_isIndexable($articleIds['idartlang'])) {
            continue;
        }

        $idlang = $articleIds['idlang'];
        if (!isset($toAdd[$idlang])) {
            $toAdd[$idlang] = array(
                'idclient' => $articleIds['idclient'],
                'documents' => array()
            );
        }

        // get article content to be indexed
        $articleContent = $this->_getContent($articleIds['idartlang']);

        // create input document
        $solrInputDocument = new SolrInputDocument();
        $solrInputDocument->addField('source', 'contenido_article');
        $solrInputDocument->addField('url', cUri::getInstance()->build(array(
            'idart' => $articleIds['idart'],
            'lang' => $idlang
        )));
        $solrInputDocument->addField('id', self::ID_PREFIX . $articleIds['idartlang']);

        // add IDs
        $solrInputDocument->addField('id_client', $articleIds['idclient']);
        $solrInputDocument->addField('id_lang', $idlang);
        $solrInputDocument->addField('id_cat', $articleIds['idcat']);
        $solrInputDocument->addField('id_art', $articleIds['idart']);
        $solrInputDocument->addField('id_cat_lang', $articleIds['idcatlang']);
        $solrInputDocument->addField('id_art_lang', $articleIds['idartlang']);

        // add content one by one
        foreach ($articleContent as $type => $typeContent) {

            // field names in Solr should always be lowercase!
            $type = strtolower($type);

            // sort content of a certain content type by their typeids
            // This is important so that the most prominent headline can be
            // displayed first.
            ksort($typeContent);

            // add each non-empty content entry separately (content type
            // fields are defined as multiValued)
            foreach ($typeContent as $contentEntry) {
                $contentEntry = trim($contentEntry);
                if (0 < strlen($contentEntry)) {
                    $solrInputDocument->addField($type, $contentEntry);
                }
            }
        }

        // the values of CMS_IMGEDITOR entries are upload IDs; add the URL
        // of each upload that can be loaded as an 'images' field
        if (isset($articleContent['CMS_IMGEDITOR'])) {
            foreach ($articleContent['CMS_IMGEDITOR'] as $idupl) {
                if (0 == strlen($idupl)) {
                    continue;
                }
                $image = $this->_getImageUrlByIdupl($idupl);
                if (false === $image) {
                    continue;
                }
                $solrInputDocument->addField('images', $image);
            }
        }

        $toAdd[$idlang]['documents'][] = $solrInputDocument;
    }

    // send the documents of each language to its Solr client; neither a
    // commit nor an optimize is triggered here
    foreach ($toAdd as $idlang => $data) {
        try {
            $solrClient = $this->_getSolrClient($data['idclient'], $idlang);
            if (self::DBG) {
                error_log('# addArticles #');
                error_log('idclient: ' . $data['idclient']);
                error_log('idlang: ' . $idlang);
                error_log('config: ' . print_r($solrClient->getOptions(), 1));
                error_log('#documents: ' . count($data['documents']));
            } else {
                // warnings are suppressed, failures are reported by
                // exceptions which are handled below
                @$solrClient->addDocuments($data['documents']);
            }
        } catch (Exception $e) {
            // log exception
            Solr::log($e);
            // rethrow as cException
            throw new cException('article could not be added to index', 0, $e);
        }
    }
}
290:
/**
 * Builds the URL of an upload from its upload ID.
 *
 * The URL is the concatenation of the upload HTML path from the
 * configuration of the client stored with the upload, the upload's
 * directory name and its file name.
 *
 * @param int $idupl ID of the upload
 * @return string|bool URL of the upload or false if the upload could not
 *         be loaded
 */
private function _getImageUrlByIdupl($idupl) {
    $upload = new cApiUpload($idupl);
    if (false === $upload->isLoaded()) {
        return false;
    }

    $clientConfig = cRegistry::getClientConfig($upload->get('idclient'));

    return $clientConfig['upl']['htmlpath'] . $upload->get('dirname') . $upload->get('filename');
}
309:
/**
 * Deletes the documents of all articles aggregated in $this->_articleIds
 * from the Solr index.
 *
 * The document keys (ID_PREFIX followed by the idartlang) are grouped by
 * language and deleted using the Solr client of the respective client &
 * language. If the DBG constant is set, the requests are only written to
 * the error log and nothing is sent to Solr.
 *
 * @throws cException if Solr delete request failed
 */
public function deleteArticles() {

    // collect the Solr document keys per language
    $keysByLanguage = array();
    foreach ($this->_articleIds as $ids) {
        $language = $ids['idlang'];
        if (!isset($keysByLanguage[$language])) {
            $keysByLanguage[$language] = array(
                'idclient' => $ids['idclient'],
                'idartlangs' => array()
            );
        }
        $keysByLanguage[$language]['idartlangs'][] = self::ID_PREFIX . strval($ids['idartlang']);
    }

    // send one delete request per language
    foreach ($keysByLanguage as $idlang => $group) {
        try {
            $client = $this->_getSolrClient($group['idclient'], $idlang);
            if (!self::DBG) {
                $client->deleteByIds($group['idartlangs']);
            } else {
                error_log('# deleteArticles #');
                error_log('idclient: ' . $group['idclient']);
                error_log('idlang: ' . $idlang);
                error_log('config: ' . print_r($client->getOptions(), true));
                error_log('#idartlangs: ' . count($group['idartlangs']));
                error_log('idartlangs: ' . print_r($group['idartlangs'], true));
            }
        } catch (Exception $exception) {
            // log the original exception, then rethrow it as cException
            Solr::log($exception);
            throw new cException('article could not be deleted from index', 0, $exception);
        }
    }
}
350:
/**
 * Updates the aggregated articles in the Solr index by deleting them and
 * adding them again.
 *
 * Deletion is always attempted, as an article that is no longer indexable
 * might have been indexed before. A failed deletion does not prevent the
 * articles from being added.
 *
 * @throws cException if Solr add request failed
 */
public function updateArticles() {

    try {
        $this->deleteArticles();
    } catch (cException $ignored) {
        // Deliberately ignored so that the articles can be indexed
        // nonetheless; deleteArticles() has already logged the failure.
    }

    // articles that are not indexable are skipped by addArticles()
    $this->addArticles();
}
372:
/**
 * Tells whether an article language should be indexed.
 *
 * This is the case if it can be loaded and is flagged as both online and
 * searchable. Time management is not considered. Whether the article's
 * category is protected is not checked here either, so articles in
 * protected categories are indexable; the searcher is responsible for
 * displaying them to privileged users only.
 *
 * @param int $idartlang of article to be checked
 * @return bool
 */
private function _isIndexable($idartlang) {
    $candidate = new cApiArticleLanguage($idartlang);

    // the flags are compared loosely as they may be numeric strings
    return $candidate->isLoaded()
        && 1 == $candidate->get('online')
        && 1 == $candidate->get('searchable');
}
397:
/**
 * Reads the content of an article language that should be indexed.
 *
 * Only content entries of the content types listed in this method are
 * read. From each value HTML tags are stripped, HTML entities are decoded
 * (as UTF-8) and surrounding whitespace is trimmed.
 *
 * @param int $idartlang of article to be read
 * @return array content values indexed by content type name (e.g.
 *         'CMS_HTML') and then by typeid
 */
private function _getContent($idartlang) {

    // the ID is embedded in the SQL statement below, so make sure it is
    // nothing but an integer
    $idartlang = (int) $idartlang;

    // Content types to be indexed. Not included are:
    // 'CMS_IMG', 'CMS_LINK', 'CMS_LINKTARGET', 'CMS_SWF'
    $cms = "'CMS_HTMLHEAD','CMS_HTML','CMS_TEXT','CMS_IMGDESCR',"
        . "'CMS_LINKDESCR','CMS_HEAD','CMS_LINKTITLE','CMS_LINKEDIT',"
        . "'CMS_RAWLINK','CMS_IMGEDIT','CMS_IMGTITLE','CMS_SIMPLELINKEDIT',"
        . "'CMS_HTMLTEXT','CMS_EASYIMGEDIT','CMS_DATE','CMS_TEASER',"
        . "'CMS_FILELIST','CMS_IMGEDITOR','CMS_LINKEDITOR','CMS_PIFAFORM'";

    // NOTE(review): the table names are hard-coded with the prefix 'con_';
    // confirm that installations with another table prefix need not be
    // supported
    $db = cRegistry::getDb();
    $db->query("-- SolrIndexer->_getContent()
        SELECT
            con_type.type
            , con_content.typeid
            , con_content.value
        FROM
            con_content
        INNER JOIN
            con_type
        ON
            con_content.idtype = con_type.idtype
        WHERE
            con_content.idartlang = $idartlang
            AND con_type.type IN ($cms)
        ORDER BY
            con_content.idtype
            , con_content.typeid
        ;");

    $content = array();
    while (false !== $db->nextRecord()) {
        // reduce the stored value to plain text
        $value = $db->f('value');
        $value = strip_tags($value);
        $value = html_entity_decode($value, ENT_QUOTES, 'UTF-8');
        $value = trim($value);

        $content[$db->f('type')][$db->f('typeid')] = $value;
    }

    // TODO check alternatives for reading the content:
    // conGetContentFromArticle() from functions.con.php or
    // cApiArticleLanguage::getContent()

    return $content;
}
458:
/**
 * Checks whether a Solr request was successful.
 *
 * The request is considered as failed if the response does not report
 * success or if the status in the response is not 0.
 *
 * @param SolrResponse $solrResponse response to be checked
 * @param string $msg message of the exception thrown on failure
 * @throws cException if Solr update request failed
 */
private function _checkResponse(SolrResponse $solrResponse, $msg = 'Solr update request failed') {

    if (!$solrResponse->success()) {
        throw new cException($msg);
    }

    $response = $solrResponse->getResponse();

    // Solr reports the status as part of the response header; a status
    // found directly in the response is still honoured
    if (isset($response->responseHeader->status)) {
        $status = $response->responseHeader->status;
    } else if (isset($response->status)) {
        $status = $response->status;
    } else {
        $status = 0;
    }

    if (0 != $status) {
        throw new cException($msg);
    }
}
491: }
492: