File plugins/repository/keyword_density.php

  1: <?php
  2: /**
  3:  * This file includes the "keyword density" sub plugin from the old plugin repository.
  4:  *
  5:  * @package    Plugin
  6:  * @subpackage Repository_KeywordDensity
  7:  * @version    SVN Revision $Rev:$
  8:  *
  9:  * @author     Unknown
 10:  * @copyright  four for business AG <www.4fb.de>
 11:  * @license    http://www.contenido.org/license/LIZENZ.txt
 12:  * @link       http://www.4fb.de
 13:  * @link       http://www.contenido.org
 14:  */
 15: 
 16: defined('CON_FRAMEWORK') || die('Illegal call: Missing framework initialization - request aborted.');
 17: 
 18: function calcDensity($singlewordcounter, $string, $quantifier = 1) {
 19:     $minLen = 3;
 20: 
 21:     //check if the current language is german
 22:     //
 23:     // in later versions it is possible to manage most used words for every language in the dB.
 24:     if (cRegistry::getLanguageId() == 1)
 25:         //most used german words
 26:         $blacklist = array(
 27:             'in',
 28:             'der',
 29:             'und',
 30:             'zu',
 31:             'den',
 32:             'das',
 33:             'nicht',
 34:             'von',
 35:             'sie',
 36:             'ist',
 37:             'des',
 38:             'sich',
 39:             'mit',
 40:             'sorgt',
 41:             'dem',
 42:             'dass',
 43:             'er',
 44:             'es',
 45:             'ein',
 46:             'ich',
 47:             'auf',
 48:             'so',
 49:             'eine',
 50:             'auch',
 51:             'als',
 52:             'an',
 53:             'nach',
 54:             'wie',
 55:             'im',
 56:             'für',
 57:             'man',
 58:             'aber',
 59:             'aus',
 60:             'durch',
 61:             'wenn',
 62:             'nur',
 63:             'war',
 64:             'noch',
 65:             'werden',
 66:             'bei',
 67:             'hat',
 68:             'wir',
 69:             'was',
 70:             'wird',
 71:             'sein',
 72:             'einen',
 73:             'welche',
 74:             'sind',
 75:             'oder',
 76:             'zur',
 77:             'um',
 78:             'haben',
 79:             'einer',
 80:             'mir',
 81:             'über',
 82:             'ihm',
 83:             'diese',
 84:             'einem',
 85:             'ihr',
 86:             'uns',
 87:             'da',
 88:             'zum',
 89:             'kann',
 90:             'doch',
 91:             'vor',
 92:             'dieser',
 93:             'mich',
 94:             'ihn',
 95:             'du',
 96:             'hatte',
 97:             'seine',
 98:             'mehr',
 99:             'am',
100:             'denn',
101:             'nun',
102:             'unter',
103:             'sehr',
104:             'selbst',
105:             'schon',
106:             'hier',
107:             'bis',
108:             'habe',
109:             'ihre',
110:             'dann',
111:             'ihnen',
112:             'seiner',
113:             'alle',
114:             'wieder',
115:             'meine',
116:             'Zeit',
117:             'gegen',
118:             'vom',
119:             'ganz',
120:             'einzelnen',
121:             'wo',
122:             'muss',
123:             'ohne',
124:             'eines',
125:             'können',
126:             'sei',
127:             'ja',
128:             'wurde',
129:             'jetzt',
130:             'immer',
131:             'seinen',
132:             'wohl',
133:             'dieses',
134:             'ihren',
135:             'würde',
136:             'diesen',
137:             'sondern',
138:             'weil',
139:             'welcher',
140:             'nichts',
141:             'diesem',
142:             'alles',
143:             'waren',
144:             'will',
145:             'Herr',
146:             'viel',
147:             'mein',
148:             'also',
149:             'soll',
150:             'worden',
151:             'lassen',
152:             'dies',
153:             'machen',
154:             'ihrer',
155:             'weiter',
156:             'Leben',
157:             'recht',
158:             'etwas',
159:             'keine',
160:             'seinem',
161:             'ob',
162:             'dir',
163:             'allen',
164:             'großen',
165:             'die',
166:             'Jahre',
167:             'Weise',
168:             'müssen',
169:             'welches',
170:             'wäre',
171:             'erst',
172:             'einmal',
173:             'Mann',
174:             'hätte',
175:             'zwei',
176:             'dich',
177:             'allein',
178:             'Herren',
179:             'während',
180:             'Paragraph',
181:             'anders',
182:             'Liebe',
183:             'kein',
184:             'damit',
185:             'gar',
186:             'Hand',
187:             'Herrn',
188:             'euch',
189:             'sollte',
190:             'konnte',
191:             'ersten',
192:             'deren',
193:             'zwischen',
194:             'wollen',
195:             'denen',
196:             'dessen',
197:             'sagen',
198:             'bin',
199:             'Menschen',
200:             'gut',
201:             'darauf',
202:             'wurden',
203:             'weiß',
204:             'gewesen',
205:             'Seite',
206:             'bald',
207:             'weit',
208:             'große',
209:             'solche',
210:             'hatten',
211:             'eben',
212:             'andern',
213:             'beiden',
214:             'macht',
215:             'sehen',
216:             'ganze',
217:             'anderen',
218:             'lange',
219:             'wer',
220:             'ihrem',
221:             'zwar',
222:             'gemacht',
223:             'dort',
224:             'kommen',
225:             'Welt',
226:             'heute',
227:             'Frau',
228:             'werde',
229:             'derselben',
230:             'ganzen',
231:             'deutschen',
232:             'lässt',
233:             'vielleicht',
234:             'meiner',
235:             'bereits',
236:             'späteren',
237:             'möglich',
238:             'sowie'
239:         );
240:     else {
241:         $blacklist = array();
242:         $minLen = 5;
243:     }
244: 
245:     //all blacklistentries to lowercase and trim ' ' at front.
246:     for ($i = 0; $i < count($blacklist); $i++) {
247:         $blacklist[$i] = ltrim(mb_strtolower($blacklist[$i]), '');
248:     }
249:     $tmp = array();
250:     $tmp = explode(' ', $string);
251:     $tmp_size = sizeof($tmp);
252: 
253:     for ($i = 0; $i <= $tmp_size; $i++) {
254:         if (strlen($tmp[$i]) < $minLen) {
255:             continue;
256:         }
257: 
258:         // replace punctuation marks
259:         $patterns = array(
260:             '/[.,:]/'
261:         );
262:         $replaces = array(
263:             ''
264:         );
265:         $tmp[$i] = preg_replace($patterns, $replaces, $tmp[$i]);
266: 
267:         //trim last char if '-' e.g open-source-
268:         $tmp[$i] = rtrim($tmp[$i], '-');
269: 
270:         // hole word in upper cases ?
271:         (!ctype_upper($tmp[$i])) ? $tmp[$i] = mb_strtolower(addslashes($tmp[$i])) : $tmp[$i] = addslashes(preg_replace($patterns, $replaces, $tmp[$i]));
272: 
273:         // using mb_strtolower because of umlauts
274:         if (!array_search($tmp[$i], $blacklist)) {
275:             // if hole string in upper casses add additional quantifiert else
276:             // use only the string length
277:             (ctype_upper($tmp[$i])) ? $singlewordcounter[mb_strtolower($tmp[$i])] += strlen($tmp[$i]) + 10000 : $singlewordcounter[$tmp[$i]] += strlen($tmp[$i]);
278:         }
279:     }
280: 
281:     return $singlewordcounter;
282: }
283: 
/**
 * Comparison callback that orders values descending.
 *
 * @param mixed $a First value.
 * @param mixed $b Second value.
 *
 * @return int 0 when both values are equal, -1 when $a is greater than $b, otherwise 1.
 */
function __cmp($a, $b) {
    if ($a != $b) {
        if ($a > $b) {
            return -1;
        }
        return 1;
    }
    return 0;
}
289: 
/**
 * Reduces a word counter to a list of at most $maxKeywords keywords.
 *
 * Words with a weight of 1 or less are dropped. If the remaining words fit
 * into the limit, all of them are returned. Otherwise the words are grouped
 * by weight and complete groups are accepted from the highest weight
 * downwards for as long as the total stays within the limit; the first group
 * that would exceed the limit ends the selection. As a consequence the result
 * is empty when the heaviest group alone contains more than $maxKeywords
 * words.
 *
 * The order of the returned words follows the order of $singlewordcounter.
 *
 * @param array $singlewordcounter Counter (word => weight).
 * @param int   $maxKeywords       Maximum number of keywords to return.
 *
 * @return array List of selected words.
 */
function stripCount($singlewordcounter, $maxKeywords = 15) {
    // Keep only the words whose weight is greater than 1.
    $candidates = array();
    foreach ($singlewordcounter as $word => $weight) {
        if ($weight > 1) {
            $candidates[$word] = $weight;
        }
    }

    if (count($candidates) <= $maxKeywords) {
        return array_keys($candidates);
    }

    // Histogram: weight => number of words carrying that weight.
    $dist = array();
    foreach ($candidates as $weight) {
        $dist[$weight] = isset($dist[$weight]) ? $dist[$weight] + 1 : 1;
    }

    // Highest weights first.
    krsort($dist);

    // Accept complete weight groups while they still fit into the limit.
    $count = 0;
    $useQuantity = array();
    foreach ($dist as $weight => $numberOfWords) {
        if ($count + $numberOfWords > $maxKeywords) {
            break;
        }
        $count += $numberOfWords;
        $useQuantity[$weight] = true;
    }

    // Run over all keywords and select those whose weight was accepted.
    $result = array();
    foreach ($candidates as $word => $weight) {
        if (isset($useQuantity[$weight])) {
            $result[] = $word;
        }
    }

    return $result;
}
343: 
/**
 * Builds a comma separated keyword list from a headline and a text.
 *
 * Both inputs are stripped of HTML tags, passed through conHtmlEntityDecode(),
 * freed from remaining named entities (e.g. "&nbsp;") and have every run of
 * whitespace collapsed into a single space. The words of both are then
 * weighted with calcDensity(), sorted by weight in descending order and
 * reduced with stripCount().
 *
 * @param string $headline Headline of the content, may contain HTML.
 * @param string $text     Body text of the content, may contain HTML.
 *
 * @return string Selected keywords joined by ", " (empty string if there are none).
 */
function keywordDensity($headline, $text) {
    // 1. remove entities that are still present after decoding
    // 2. collapse all runs of whitespace into one space, because
    //    calcDensity() splits words on single spaces only
    $patterns = array(
        '#&[a-z]+\;#i',
        '#\s+#'
    );
    $replaces = array(
        '',
        ' '
    );

    // The headline is normalised exactly like the text so that headline words
    // separated by line breaks or tabs are not glued together.
    $text = preg_replace($patterns, $replaces, conHtmlEntityDecode(strip_tags($text)));
    $headline = preg_replace($patterns, $replaces, conHtmlEntityDecode(strip_tags($headline)));

    $singlewordcounter = array();

    // calc for text
    $singlewordcounter = calcDensity($singlewordcounter, $text);

    // calc for headline
    $singlewordcounter = calcDensity($singlewordcounter, $headline, 2);

    // Weighting of the URL path is currently disabled.

    // Highest weights first; stripCount() keeps this order in its result.
    arsort($singlewordcounter, SORT_NUMERIC);

    return implode(', ', stripCount($singlewordcounter));
}
389: ?>
390:
Packages

Functions