1: <?php
2:
3: /**
4: *
5: * @package Plugin
6: * @subpackage SearchSolr
7: * @author Marcus Gnaß <marcus.gnass@4fb.de>
8: * @copyright four for business AG
9: * @link http://www.4fb.de
10: */
11:
// Abort immediately unless this file is loaded from within an initialized
// CONTENIDO framework (signalled by the CON_FRAMEWORK constant).
if (!defined('CON_FRAMEWORK')) {
    die('Illegal call: Missing framework initialization - request aborted.');
}
14:
/**
 * This class sends update requests to a Solr core. If a request fails, an
 * exception is thrown. This class allows handling of more than one article
 * at once.
 *
 * Solr core admin URLs (status / rename / reload):
 * <server>:8080/solr/admin/cores?action=STATUS
 * <server>:8080/solr/admin/cores?action=RENAME&core=collection1&other=contenido
 * <server>:8080/solr/admin/cores?action=RELOAD&core=contenido
 *
 * @author Marcus Gnaß <marcus.gnass@4fb.de>
 */
class SolrIndexer {

    /**
     * Debug switch. When true, add and delete requests are NOT sent to Solr;
     * their parameters are written to the PHP error log instead.
     *
     * @var bool
     */
    const DBG = false;

    /**
     * Prefix to be used for Solr <uniqueKey> in order to distinguish documents
     * from different sources.
     *
     * @var string
     */
    const ID_PREFIX = 'contenido_article_';

    /**
     * Solr clients created so far, indexed by client ID and language ID
     * ($_solrClients[$idclient][$idlang]). Filled lazily by _getSolrClient().
     *
     * @var array of SolrClient
     */
    private $_solrClients = array();

    /**
     * IDs of articles to be updated / added / deleted.
     *
     * Each entry is an associative array with the keys idclient, idlang,
     * idcat, idcatlang, idart and idartlang.
     *
     * @var array
     */
    private $_articleIds = array();

    /**
     * CEC chain function for updating an article in the Solr core (index).
     *
     * This function is intended to be called after storing an article.
     * It resolves all IDs belonging to the stored article language and then
     * delegates to handleStoringOfContentEntry(), which deletes the article
     * from the index and re-adds it if it is indexable.
     *
     * @param array $newData data of the stored article; the keys 'idart' and
     *        'idartlang' are read
     * @param array $oldData previous data of the article (not used here)
     */
    public static function handleStoringOfArticle(array $newData, array $oldData) {

        // Defaults, so that every ID is defined even if a lookup below fails.
        $idclient = null;
        $idlang = null;
        $idcat = null;
        $idcatlang = null;
        $idart = null;
        $idartlang = null;

        if (cRegistry::getArticleLanguageId() == $newData['idartlang']) {
            // quite easy if the given article is the current article
            $idclient = cRegistry::getClientId();
            $idlang = cRegistry::getLanguageId();
            $idcat = cRegistry::getCategoryId();
            $idart = cRegistry::getArticleId();
            $idcatlang = cRegistry::getCategoryLanguageId();
            $idartlang = cRegistry::getArticleLanguageId();
        } else {
            // for other articles these infos have to be read from the DB
            $idart = $newData['idart'];
            $idartlang = $newData['idartlang'];

            // get idclient by idart
            $article = new cApiArticle($idart);
            if ($article->isLoaded()) {
                $idclient = $article->get('idclient');
            }

            // get idlang by idartlang
            $articleLanguage = new cApiArticleLanguage($idartlang);
            if ($articleLanguage->isLoaded()) {
                $idlang = $articleLanguage->get('idlang');
            }

            // get first idcat by idart; array_shift() needs a real variable
            // because it takes its argument by reference
            $coll = new cApiCategoryArticleCollection();
            $idcats = $coll->getCategoryIdsByArticleId($idart);
            $idcat = array_shift($idcats);

            // get idcatlang by idcat & idlang
            $categoryLanguage = new cApiCategoryLanguage();
            $categoryLanguage->loadByCategoryIdAndLanguageId($idcat, $idlang);
            if ($categoryLanguage->isLoaded()) {
                $idcatlang = $categoryLanguage->get('idcatlang');
            }
        }

        self::handleStoringOfContentEntry(array(
            'idclient' => $idclient,
            'idlang' => $idlang,
            'idcat' => $idcat,
            'idcatlang' => $idcatlang,
            'idart' => $idart,
            'idartlang' => $idartlang
        ));
    }

    /**
     * CEC chain function for updating an article in the Solr core (index).
     *
     * This function is intended to be called after storing a content entry.
     * This function will delete and eventually add the given article from/to
     * the Solr index. Adding will only be performed when the article should be
     * indexed. Removal will always be performed, even when the article is not
     * indexable, as it might have been indexed before.
     *
     * Any cException raised while indexing is not propagated but displayed as
     * a GUI notification (as a warning for SolrWarning, otherwise as an error).
     *
     * @param array $articleIds IDs of the article to be updated (keys:
     *        idclient, idlang, idcat, idcatlang, idart, idartlang)
     */
    public static function handleStoringOfContentEntry(array $articleIds) {
        try {
            // build indexer instance for this single article
            $indexer = new self(array(
                $articleIds
            ));
            // update given articles
            $indexer->updateArticles();
        } catch (cException $e) {
            $lvl = $e instanceof SolrWarning? cGuiNotification::LEVEL_WARNING : cGuiNotification::LEVEL_ERROR;
            $note = new cGuiNotification();
            $note->displayNotification($lvl, $e->getMessage());
        }

        // destroy indexer to free mem
        unset($indexer);
    }

    /**
     * Stores the IDs of the articles to be handled. Solr clients are not
     * created here but lazily on first use (see _getSolrClient()).
     *
     * @param array $articleIds list of ID sets of the articles to be handled;
     *        each entry has the keys idclient, idlang, idcat, idcatlang,
     *        idart and idartlang
     */
    public function __construct(array $articleIds) {
        $this->_articleIds = $articleIds;
    }

    /**
     * Releases all aggregated Solr client instances.
     */
    public function __destruct() {
        $this->_solrClients = array();
    }

    /**
     * Returns the Solr client for the given client and language, creating
     * and caching it on first request.
     *
     * @param int $idclient
     * @param int $idlang
     * @return SolrClient
     */
    private function _getSolrClient($idclient, $idlang) {

        if (!isset($this->_solrClients[$idclient][$idlang])) {
            $opt = Solr::getClientOptions($idclient, $idlang);
            Solr::validateClientOptions($opt);
            $this->_solrClients[$idclient][$idlang] = new SolrClient($opt);
        }

        return $this->_solrClients[$idclient][$idlang];
    }

    /**
     * For each of the current articles that is indexable a new index
     * document is created, filled with the article's content and eventually
     * added to the index. Documents are grouped by language so that one add
     * request is sent per language.
     *
     * @throws cException if Solr add request failed
     */
    public function addArticles() {

        // documents to add, grouped by language ID
        $toAdd = array();
        foreach ($this->_articleIds as $articleIds) {

            // skip if article should not be indexed
            if (!$this->_isIndexable($articleIds['idartlang'])) {
                continue;
            }

            $idlang = $articleIds['idlang'];
            if (!isset($toAdd[$idlang])) {
                $toAdd[$idlang] = array(
                    'idclient' => $articleIds['idclient'],
                    'documents' => array()
                );
            }

            $toAdd[$idlang]['documents'][] = $this->_createInputDocument($articleIds);
        }

        // Add documents. commit() and optimize() are not called here (they
        // were already disabled in this code); presumably the Solr core is
        // configured to commit on its own - TODO confirm.
        foreach ($toAdd as $idlang => $data) {
            try {
                $solrClient = $this->_getSolrClient($data['idclient'], $idlang);
                if (self::DBG) {
                    error_log('# addArticles #');
                    error_log('idclient: ' . $data['idclient']);
                    error_log('idlang: ' . $idlang);
                    error_log('config: ' . print_r($solrClient->getOptions(), 1));
                    error_log('#documents: ' . count($data['documents']));
                } else {
                    @$solrClient->addDocuments($data['documents']);
                }
            } catch (Exception $e) {
                // log exception
                Solr::log($e);
                // rethrow as cException
                throw new cException('article could not be added to index', 0, $e);
            }
        }

    }

    /**
     * Builds the Solr input document for a single article.
     *
     * @param array $articleIds ID set of the article (keys: idclient, idlang,
     *        idcat, idcatlang, idart, idartlang)
     * @return SolrInputDocument
     */
    private function _createInputDocument(array $articleIds) {

        // get article content to be indexed
        $articleContent = $this->_getContent($articleIds['idartlang']);

        // create input document
        $solrInputDocument = new SolrInputDocument();
        $solrInputDocument->addField('source', 'contenido_article');
        $solrInputDocument->addField('url', cUri::getInstance()->build(array(
            'idart' => $articleIds['idart'],
            'lang' => $articleIds['idlang']
        )));
        $solrInputDocument->addField('id', self::ID_PREFIX . $articleIds['idartlang']);

        // add IDs
        $solrInputDocument->addField('id_client', $articleIds['idclient']);
        $solrInputDocument->addField('id_lang', $articleIds['idlang']);
        $solrInputDocument->addField('id_cat', $articleIds['idcat']);
        $solrInputDocument->addField('id_art', $articleIds['idart']);
        $solrInputDocument->addField('id_cat_lang', $articleIds['idcatlang']);
        $solrInputDocument->addField('id_art_lang', $articleIds['idartlang']);

        // add content one by one
        foreach ($articleContent as $type => $typeContent) {

            // field names in Solr should always be lowercase!
            $type = strtolower($type);

            // Sort content of a certain content type by their typeids.
            // This is important so that the most prominent headline can be
            // displayed first.
            ksort($typeContent);

            // add each non-empty content entry separately (content type
            // fields are defined as multiValued)
            foreach ($typeContent as $contentEntry) {
                $contentEntry = trim($contentEntry);
                if (0 < strlen($contentEntry)) {
                    $solrInputDocument->addField($type, $contentEntry);
                }
            }
        }

        // values of CMS_IMGEDITOR entries are upload IDs; additionally index
        // the URL of each referenced upload that can be loaded
        if (isset($articleContent['CMS_IMGEDITOR'])) {
            foreach ($articleContent['CMS_IMGEDITOR'] as $idupl) {
                if (0 == strlen($idupl)) {
                    continue;
                }
                $image = $this->_getImageUrlByIdupl($idupl);
                if (false === $image) {
                    continue;
                }
                $solrInputDocument->addField('images', $image);
            }
        }

        return $solrInputDocument;
    }

    /**
     * Gets the URL of an upload, built from the upload client's configured
     * HTML upload path, the upload's directory name and its file name.
     *
     * @param int $idupl
     * @return string|bool URL of the upload or false if the upload could not
     *         be loaded
     */
    private function _getImageUrlByIdupl($idupl) {
        $upload = new cApiUpload($idupl);

        if (false === $upload->isLoaded()) {
            return false;
        }

        $idclient = $upload->get('idclient');
        $dirname = $upload->get('dirname');
        $filename = $upload->get('filename');

        $clientConfig = cRegistry::getClientConfig($idclient);

        return $clientConfig['upl']['htmlpath'] . $dirname . $filename;
    }

    /**
     * Delete all CONTENIDO article documents that are aggregated as
     * $this->_articleIds. Document IDs are grouped by language so that one
     * delete request is sent per language.
     *
     * @throws cException if Solr delete request failed
     */
    public function deleteArticles() {

        // document IDs to delete, grouped by language ID
        $toDelete = array();
        foreach ($this->_articleIds as $articleIds) {
            $idlang = $articleIds['idlang'];
            if (!isset($toDelete[$idlang])) {
                $toDelete[$idlang] = array(
                    'idclient' => $articleIds['idclient'],
                    'idartlangs' => array()
                );
            }
            $toDelete[$idlang]['idartlangs'][] = self::ID_PREFIX . strval($articleIds['idartlang']);
        }

        foreach ($toDelete as $idlang => $data) {
            try {
                $solrClient = $this->_getSolrClient($data['idclient'], $idlang);
                if (self::DBG) {
                    error_log('# deleteArticles #');
                    error_log('idclient: ' . $data['idclient']);
                    error_log('idlang: ' . $idlang);
                    error_log('config: ' . print_r($solrClient->getOptions(), 1));
                    error_log('#idartlangs: ' . count($data['idartlangs']));
                    error_log('idartlangs: ' . print_r($data['idartlangs'], 1));
                } else {
                    // no explicit commit() is issued here (see addArticles())
                    $solrClient->deleteByIds($data['idartlangs']);
                }
            } catch (Exception $e) {
                // log exception
                Solr::log($e);
                // rethrow as cException
                throw new cException('article could not be deleted from index', 0, $e);
            }
        }
    }

    /**
     * Removes the current articles from the index and re-adds those that are
     * indexable.
     *
     * @throws cException if Solr add request failed
     */
    public function updateArticles() {

        // Always delete articles from index: even if an article should not
        // be indexed now, it might have been indexed before.
        try {
            $this->deleteArticles();
        } catch (cException $e) {
            // Deliberately ignored so that articles can be indexed
            // nonetheless; the failure was already logged by
            // deleteArticles().
        }

        // add articles to index
        // will be skipped if article is not indexable
        $this->addArticles();
    }

    /**
     * An article is indexable if it is online and searchable.
     *
     * Articles that are hidden due to a protected category are indexable. The
     * searcher is responsible for making sure these articles are only
     * displayed to privileged users.
     *
     * Note: time management of articles is not considered here.
     *
     * @param int $idartlang of article to be checked
     * @return bool
     */
    private function _isIndexable($idartlang) {

        $articleLanguage = new cApiArticleLanguage($idartlang);
        if (!$articleLanguage->isLoaded()) {
            return false;
        }

        return 1 == $articleLanguage->get('online')
            && 1 == $articleLanguage->get('searchable');
    }

    /**
     * Reads the content of the given article language that is to be indexed.
     *
     * Only content of the whitelisted content types is read. Each value has
     * its tags stripped, its HTML entities decoded (as UTF-8) and is trimmed.
     *
     * @param int $idartlang of article to be read
     * @return array content values indexed by content type name and typeid
     *         ($content[$type][$typeid] = $value)
     */
    private function _getContent($idartlang) {

        // Content types to be indexed. Not included are e.g.
        // 'CMS_IMG', 'CMS_LINK', 'CMS_LINKTARGET', 'CMS_SWF'
        // (like in conMakeArticleIndex & conGenerateKeywords).
        $cms = "'CMS_HTMLHEAD','CMS_HTML','CMS_TEXT','CMS_IMGDESCR',"
            . "'CMS_LINKDESCR','CMS_HEAD','CMS_LINKTITLE','CMS_LINKEDIT',"
            . "'CMS_RAWLINK','CMS_IMGEDIT','CMS_IMGTITLE','CMS_SIMPLELINKEDIT',"
            . "'CMS_HTMLTEXT','CMS_EASYIMGEDIT','CMS_DATE','CMS_TEASER',"
            . "'CMS_FILELIST','CMS_IMGEDITOR','CMS_LINKEDITOR','CMS_PIFAFORM'";

        // the ID is interpolated into the SQL, so force it to be an integer
        $idartlang = (int) $idartlang;

        $db = cRegistry::getDb();
        $db->query("-- SolrIndexer->_getContent()
            SELECT
                con_type.type
                , con_content.typeid
                , con_content.value
            FROM
                con_content
            INNER JOIN
                con_type
            ON
                con_content.idtype = con_type.idtype
            WHERE
                con_content.idartlang = $idartlang
                AND con_type.type IN ($cms)
            ORDER BY
                con_content.idtype
                , con_content.typeid
            ;");

        $content = array();
        while ($db->nextRecord()) {
            $value = $db->f('value');
            $value = strip_tags($value);
            $value = html_entity_decode($value, ENT_QUOTES, 'UTF-8');
            $value = trim($value);

            $content[$db->f('type')][$db->f('typeid')] = $value;
        }

        // TODO check first alternative: conGetContentFromArticle() from
        // functions.con.php
        // TODO check second alternative: cApiArticleLanguage::getContent()

        return $content;
    }

    /**
     * Checks the status of a Solr response.
     *
     * NOTE(review): this reads the property "status" directly from the
     * object returned by SolrResponse::getResponse() - confirm that this is
     * where the status is found. Further details of a response are available
     * via SolrResponse::getHttpStatus(), getHttpStatusMessage(),
     * getRawRequest(), getRawResponse(), getRequestUrl() and success().
     *
     * @param SolrResponse $solrResponse
     * @param string $msg message of the exception to be thrown
     * @throws cException if the response status is not 0
     */
    private function _checkResponse(SolrResponse $solrResponse, $msg = 'Solr update request failed') {
        $response = $solrResponse->getResponse();

        if (0 != $response->status) {
            throw new cException($msg);
        }
    }
}
494: