File plugins/repository/keyword_density.php

  1: <?php
  2: /**
  3:  * This file includes the "keyword density" sub plugin from the old plugin repository.
  4:  *
  5:  * @package    Plugin
  6:  * @subpackage Repository_KeywordDensity
  7:  * @author     Unknown
  8:  * @copyright  four for business AG <www.4fb.de>
  9:  * @license    http://www.contenido.org/license/LIZENZ.txt
 10:  * @link       http://www.4fb.de
 11:  * @link       http://www.contenido.org
 12:  */
 13: 
 14: defined('CON_FRAMEWORK') || die('Illegal call: Missing framework initialization - request aborted.');
 15: 
 16: /**
 17:  * @param     $singlewordcounter
 18:  * @param     $string
 19:  * @param int $quantifier
 20:  *
 21:  * @return mixed
 22:  */
 23: function calcDensity($singlewordcounter, $string, $quantifier = 1) {
 24:     $minLen = 3;
 25: 
 26:     //check if the current language is german
 27:     //
 28:     // in later versions it is possible to manage most used words for every language in the dB.
 29:     if (cRegistry::getLanguageId() == 1)
 30:         //most used german words
 31:         $blacklist = array(
 32:             'in',
 33:             'der',
 34:             'und',
 35:             'zu',
 36:             'den',
 37:             'das',
 38:             'nicht',
 39:             'von',
 40:             'sie',
 41:             'ist',
 42:             'des',
 43:             'sich',
 44:             'mit',
 45:             'sorgt',
 46:             'dem',
 47:             'dass',
 48:             'er',
 49:             'es',
 50:             'ein',
 51:             'ich',
 52:             'auf',
 53:             'so',
 54:             'eine',
 55:             'auch',
 56:             'als',
 57:             'an',
 58:             'nach',
 59:             'wie',
 60:             'im',
 61:             'für',
 62:             'man',
 63:             'aber',
 64:             'aus',
 65:             'durch',
 66:             'wenn',
 67:             'nur',
 68:             'war',
 69:             'noch',
 70:             'werden',
 71:             'bei',
 72:             'hat',
 73:             'wir',
 74:             'was',
 75:             'wird',
 76:             'sein',
 77:             'einen',
 78:             'welche',
 79:             'sind',
 80:             'oder',
 81:             'zur',
 82:             'um',
 83:             'haben',
 84:             'einer',
 85:             'mir',
 86:             'über',
 87:             'ihm',
 88:             'diese',
 89:             'einem',
 90:             'ihr',
 91:             'uns',
 92:             'da',
 93:             'zum',
 94:             'kann',
 95:             'doch',
 96:             'vor',
 97:             'dieser',
 98:             'mich',
 99:             'ihn',
100:             'du',
101:             'hatte',
102:             'seine',
103:             'mehr',
104:             'am',
105:             'denn',
106:             'nun',
107:             'unter',
108:             'sehr',
109:             'selbst',
110:             'schon',
111:             'hier',
112:             'bis',
113:             'habe',
114:             'ihre',
115:             'dann',
116:             'ihnen',
117:             'seiner',
118:             'alle',
119:             'wieder',
120:             'meine',
121:             'Zeit',
122:             'gegen',
123:             'vom',
124:             'ganz',
125:             'einzelnen',
126:             'wo',
127:             'muss',
128:             'ohne',
129:             'eines',
130:             'können',
131:             'sei',
132:             'ja',
133:             'wurde',
134:             'jetzt',
135:             'immer',
136:             'seinen',
137:             'wohl',
138:             'dieses',
139:             'ihren',
140:             'würde',
141:             'diesen',
142:             'sondern',
143:             'weil',
144:             'welcher',
145:             'nichts',
146:             'diesem',
147:             'alles',
148:             'waren',
149:             'will',
150:             'Herr',
151:             'viel',
152:             'mein',
153:             'also',
154:             'soll',
155:             'worden',
156:             'lassen',
157:             'dies',
158:             'machen',
159:             'ihrer',
160:             'weiter',
161:             'Leben',
162:             'recht',
163:             'etwas',
164:             'keine',
165:             'seinem',
166:             'ob',
167:             'dir',
168:             'allen',
169:             'großen',
170:             'die',
171:             'Jahre',
172:             'Weise',
173:             'müssen',
174:             'welches',
175:             'wäre',
176:             'erst',
177:             'einmal',
178:             'Mann',
179:             'hätte',
180:             'zwei',
181:             'dich',
182:             'allein',
183:             'Herren',
184:             'während',
185:             'Paragraph',
186:             'anders',
187:             'Liebe',
188:             'kein',
189:             'damit',
190:             'gar',
191:             'Hand',
192:             'Herrn',
193:             'euch',
194:             'sollte',
195:             'konnte',
196:             'ersten',
197:             'deren',
198:             'zwischen',
199:             'wollen',
200:             'denen',
201:             'dessen',
202:             'sagen',
203:             'bin',
204:             'Menschen',
205:             'gut',
206:             'darauf',
207:             'wurden',
208:             'weiß',
209:             'gewesen',
210:             'Seite',
211:             'bald',
212:             'weit',
213:             'große',
214:             'solche',
215:             'hatten',
216:             'eben',
217:             'andern',
218:             'beiden',
219:             'macht',
220:             'sehen',
221:             'ganze',
222:             'anderen',
223:             'lange',
224:             'wer',
225:             'ihrem',
226:             'zwar',
227:             'gemacht',
228:             'dort',
229:             'kommen',
230:             'Welt',
231:             'heute',
232:             'Frau',
233:             'werde',
234:             'derselben',
235:             'ganzen',
236:             'deutschen',
237:             'lässt',
238:             'vielleicht',
239:             'meiner',
240:             'bereits',
241:             'späteren',
242:             'möglich',
243:             'sowie'
244:         );
245:     else {
246:         $blacklist = array();
247:         $minLen = 5;
248:     }
249: 
250:     // all blacklistentries to lowercase and trim ' ' at front.
251:     for ($i = 0; $i < count($blacklist); $i++) {
252:         $blacklist[$i] = ltrim(cString::toLowerCase($blacklist[$i]), '');
253:     }
254: 
255:     $tmp = explode(' ', $string);
256:     $tmp_size = sizeof($tmp);
257: 
258:     for ($i = 0; $i < $tmp_size; $i++) {
259:         if (cString::getStringLength($tmp[$i]) < $minLen) {
260:             continue;
261:         }
262: 
263:         // replace punctuation marks
264:         $patterns = array(
265:             '/[.,:]/'
266:         );
267:         $replaces = array(
268:             ''
269:         );
270:         $tmp[$i] = preg_replace($patterns, $replaces, $tmp[$i]);
271: 
272:         //trim last char if '-' e.g open-source-
273:         $tmp[$i] = rtrim($tmp[$i], '-');
274: 
275:         // hole word in upper cases ?
276:         (!ctype_upper($tmp[$i])) ? $tmp[$i] = cString::toLowerCase(addslashes($tmp[$i])) : $tmp[$i] = addslashes(preg_replace($patterns, $replaces, $tmp[$i]));
277: 
278:         if (!array_search($tmp[$i], $blacklist)) {
279:             // if hole string in upper cases add additional quantifier else
280:             // use only the string length
281:             if (ctype_upper($tmp[$i])) {
282:                 if (empty($singlewordcounter[cString::toLowerCase($tmp[$i])])) {
283:                     $singlewordcounter[cString::toLowerCase($tmp[$i])] = 0;
284:                 }
285:                 $singlewordcounter[cString::toLowerCase($tmp[$i])] += cString::getStringLength($tmp[$i]) + 10000;
286:             } else {
287:                 if (empty( $singlewordcounter[$tmp[$i]])) {
288:                     $singlewordcounter[$tmp[$i]] = 0;
289:                 }
290:                 $singlewordcounter[$tmp[$i]] += cString::getStringLength($tmp[$i]);
291:             }
292:         }
293:     }
294: 
295:     return $singlewordcounter;
296: }
297: 
/**
 * Comparison callback that orders values from large to small.
 *
 * @param mixed $a First value.
 * @param mixed $b Second value.
 *
 * @return int -1 if $a is greater than $b, 0 if both are equal, otherwise 1.
 */
function __cmp($a, $b) {
    if ($a > $b) {
        return -1;
    }

    if ($a == $b) {
        return 0;
    }

    return 1;
}
309: 
/**
 * Reduces a word => weight counter to a list of at most $maxKeywords keywords.
 *
 * Words with a weight of 1 or less are dropped. If more candidates remain than
 * allowed, the words with the highest weights are kept; among words sharing
 * the same weight the ones that come first in $singlewordcounter win.
 * The returned keywords keep the order they have in $singlewordcounter.
 *
 * @param array $singlewordcounter Counter, word => weight.
 * @param int   $maxKeywords       Upper limit for the number of returned keywords.
 *
 * @return array List of keywords.
 */
function stripCount($singlewordcounter, $maxKeywords = 15) {
    // strip all entries with a weight of 1 or less
    $candidates = array();
    foreach ($singlewordcounter as $word => $weight) {
        if ($weight > 1) {
            $candidates[$word] = $weight;
        }
    }

    if (count($candidates) <= $maxKeywords) {
        return array_keys($candidates);
    }

    // number of candidates per weight; every occurrence counts, including
    // the first one
    $groupSizes = array();
    foreach ($candidates as $weight) {
        if (!isset($groupSizes[$weight])) {
            $groupSizes[$weight] = 0;
        }
        $groupSizes[$weight]++;
    }

    // highest weights first
    krsort($groupSizes, SORT_NUMERIC);

    // decide how many words of each weight may be used until the limit is hit
    $remaining = $maxKeywords;
    $quota = array();
    foreach ($groupSizes as $weight => $size) {
        if ($remaining <= 0) {
            break;
        }
        $take = min($size, $remaining);
        $quota[$weight] = $take;
        $remaining -= $take;
    }

    // run all candidates in their original order and select by quota
    $result = array();
    foreach ($candidates as $word => $weight) {
        if (!empty($quota[$weight])) {
            $result[] = $word;
            $quota[$weight]--;
        }
    }

    return $result;
}
373: 
/**
 * Builds a comma separated keyword list from a headline and a text.
 *
 * Both inputs are stripped of tags, decoded with conHtmlEntityDecode(),
 * freed from remaining named entities (e.g. "&foo;") and have every run of
 * whitespace collapsed into a single space, because calcDensity() splits on
 * single spaces only. The words of the text and of the headline are then
 * weighted with calcDensity() and reduced with stripCount().
 *
 * @param string $headline Headline, may contain markup.
 * @param string $text     Text, may contain markup.
 *
 * @return bool|string Keywords joined by ", "; false if stripCount() does not
 *                     deliver an array.
 */
function keywordDensity($headline, $text) {
    // remove all named entities that were not converted
    // collapse all double/more whitespace characters
    $patterns = array(
        '#&[a-z]+\;#i',
        '#\s+#'
    );
    $replaces = array(
        '',
        ' '
    );

    $text = preg_replace($patterns, $replaces, conHtmlEntityDecode(strip_tags($text)));
    // the headline gets the same clean-up as the text so both are tokenized alike
    $headline = preg_replace($patterns, $replaces, conHtmlEntityDecode(strip_tags($headline)));

    // calc for text
    $singlewordcounter = calcDensity(array(), $text);

    // calc for headline
    $singlewordcounter = calcDensity($singlewordcounter, $headline, 2);

    // heaviest words first, then reduce to the final keyword list
    arsort($singlewordcounter, SORT_NUMERIC);
    $singlewordcounter = stripCount($singlewordcounter);

    if (!is_array($singlewordcounter)) {
        return false;
    }

    return implode(', ', $singlewordcounter);
}
425: 
426: ?>
Packages

Functions