����JFIF��x�x����'
| Server IP : 78.140.185.180 / Your IP : 216.73.216.170 Web Server : LiteSpeed System : Linux cpanel13.v.fozzy.com 4.18.0-513.11.1.lve.el8.x86_64 #1 SMP Thu Jan 18 16:21:02 UTC 2024 x86_64 User : builderbox ( 1072) PHP Version : 7.3.33 Disable Function : NONE MySQL : OFF | cURL : ON | WGET : ON | Perl : ON | Python : ON | Sudo : OFF | Pkexec : OFF Directory : /home/builderbox/./././www/vendor/teamtnt/tntsearch/src/KeywordExtraction/ |
Upload File : |
<?php
namespace TeamTNT\TNTSearch\KeywordExtraction;
/**
 * RAKE-style keyword extraction.
 *
 * The text is tokenized, split into candidate phrases at stop words and
 * punctuation, every word is scored as degree / frequency, and every phrase
 * is scored as the sum of the scores of its words.
 */
class Rake
{
    /**
     * Stop words that delimit candidate phrases.
     *
     * Declared explicitly (it used to be created dynamically in the
     * constructor) and kept public so existing code that reads or replaces
     * the list keeps working.
     *
     * @var string[]
     */
    public $stopwords = [];

    /**
     * Loads the stop word list "<language>.json" from the sibling Stopwords directory.
     *
     * When the file is missing, unreadable or does not decode to an array, a
     * warning is raised and the extractor continues with an empty stop word
     * list instead of leaving the property in an unusable state.
     *
     * @param string $language Base name (without extension) of the stop word file.
     */
    public function __construct($language = "english")
    {
        $path    = __DIR__."/../Stopwords/".$language.".json";
        $decoded = null;

        if (is_file($path) && is_readable($path)) {
            $contents = file_get_contents($path);
            if ($contents !== false) {
                $decoded = json_decode($contents, true);
            }
        }

        if (!is_array($decoded)) {
            trigger_error(
                sprintf('Rake: unable to load a stop word list from "%s"; continuing without stop words.', $path),
                E_USER_WARNING
            );
            $decoded = [];
        }

        $this->stopwords = array_values($decoded);
    }

    /**
     * Extracts the best scoring candidate phrases from a text.
     *
     * Only the top ceil(n / 3) + 1 of the n distinct candidate phrases are kept.
     *
     * @param string $text          Text to analyse.
     * @param bool   $includeScores True to return a phrase => score map,
     *                              false to return only the phrases.
     *
     * @return array Phrase => score map ordered by descending score, or a
     *               list of phrase strings. Note that PHP stores purely
     *               numeric phrases (e.g. "2024") as integer keys in the map.
     */
    public function extractKeywords($text, $includeScores = true)
    {
        $phraseList   = $this->generateCandidateKeywords($text);
        $wordScores   = $this->calculateWordScores($phraseList);
        $phraseScores = $this->calculatePhraseScores($phraseList, $wordScores);

        arsort($phraseScores);

        // ceil() returns a float; array_slice() expects an integer length.
        $limit = (int) ceil(count($phraseScores) / 3) + 1;

        // preserve_keys must be true: otherwise array_slice() renumbers
        // integer keys and a numeric phrase such as "2024" would turn into 0.
        $phraseScores = array_slice($phraseScores, 0, $limit, true);

        if ($includeScores) {
            return $phraseScores;
        }

        // Numeric phrases come back from array_keys() as integers; hand out strings.
        return array_map('strval', array_keys($phraseScores));
    }

    /**
     * Splits a text into candidate phrases.
     *
     * A phrase is a maximal run of consecutive tokens that contains neither a
     * stop word nor a punctuation token.
     *
     * @param string $text Text to analyse.
     *
     * @return array[] List of phrases, each phrase being a list of lower-cased words.
     */
    public function generateCandidateKeywords($text)
    {
        // Hash lookup instead of a linear in_array() scan per token. It also
        // compares exactly, so "1e3" no longer matches a stop word "1000".
        $stopLookup = [];
        foreach ((array) $this->stopwords as $stopword) {
            if (is_scalar($stopword)) {
                $stopLookup[(string) $stopword] = true;
            }
        }

        $phraseList = [];
        $phrase     = [];

        foreach ($this->tokenize($text) as $word) {
            if (isset($stopLookup[$word]) || $this->isPunctuation($word)) {
                if (count($phrase) > 0) {
                    $phraseList[] = $phrase;
                    $phrase       = [];
                }
                continue;
            }

            $phrase[] = $word;
        }

        // Flush the phrase that was still open when the tokens ran out.
        if (count($phrase) > 0) {
            $phraseList[] = $phrase;
        }

        return $phraseList;
    }

    /**
     * Scores every phrase as the sum of the scores of its words.
     *
     * @param array[] $phraseList List of phrases (lists of words).
     * @param array   $wordScores Word => score map; words without an entry count as 0.
     *
     * @return array Phrase (words joined by a single space) => score.
     */
    public function calculatePhraseScores($phraseList, $wordScores)
    {
        $result = [];

        foreach ($phraseList as $phrase) {
            $phraseScore = 0;
            foreach ($phrase as $word) {
                if (isset($wordScores[$word])) {
                    $phraseScore += $wordScores[$word];
                }
            }
            $result[implode(" ", $phrase)] = $phraseScore;
        }

        return $result;
    }

    /**
     * Scores every distinct word as degree / frequency.
     *
     * Degree is the summed length of the phrases a word occurs in (counted
     * once per occurrence); frequency is its number of occurrences. Both are
     * accumulated in a single pass over the phrase list rather than by
     * rescanning the whole list for every word occurrence.
     *
     * @param array[] $phraseList List of phrases (lists of words).
     *
     * @return array Word => score.
     */
    public function calculateWordScores($phraseList)
    {
        $degree    = [];
        $frequency = [];

        foreach ($phraseList as $phrase) {
            $length = count($phrase);
            foreach ($phrase as $word) {
                if (!isset($frequency[$word])) {
                    $frequency[$word] = 0;
                    $degree[$word]    = 0;
                }
                $frequency[$word]++;
                $degree[$word] += $length;
            }
        }

        $result = [];
        foreach ($frequency as $word => $occurrences) {
            // $occurrences is at least 1 here, so the division is always safe.
            $result[$word] = $degree[$word] / $occurrences;
        }

        return $result;
    }

    /**
     * Sums the lengths of the phrases containing the word, once per occurrence.
     *
     * @param string  $word       Word to look up.
     * @param array[] $phraseList List of phrases (lists of words).
     *
     * @return int
     */
    public function wordDegree($word, $phraseList)
    {
        $needle = (string) $word;
        $count  = 0;

        foreach ($phraseList as $phrase) {
            $length = count($phrase);
            foreach ($phrase as $p) {
                // Exact string comparison: "==" would treat "1e3" and "1000" as equal.
                if ((string) $p === $needle) {
                    $count += $length;
                }
            }
        }

        return $count;
    }

    /**
     * Counts the occurrences of the word across all phrases.
     *
     * @param string  $word       Word to look up.
     * @param array[] $phraseList List of phrases (lists of words).
     *
     * @return int
     */
    public function wordFrequency($word, $phraseList)
    {
        $needle = (string) $word;
        $count  = 0;

        foreach ($phraseList as $phrase) {
            foreach ($phrase as $p) {
                if ((string) $p === $needle) {
                    $count++;
                }
            }
        }

        return $count;
    }

    /**
     * Joins the words of every phrase with a single space.
     *
     * The misspelled method name is kept because it is part of the public interface.
     *
     * @param array[] $phraseList List of phrases (lists of words).
     *
     * @return string[] One string per phrase, in the original order.
     */
    public function returnFormatedPharaseList($phraseList)
    {
        $formatedList = [];

        foreach ($phraseList as $phrase) {
            $formatedList[] = implode(" ", $phrase);
        }

        return $formatedList;
    }

    /**
     * Lower-cases a string and splits it into word and punctuation tokens.
     *
     * Runs of characters that are neither punctuation, separator nor "other"
     * form one token each; punctuation characters become one token apiece;
     * separators are dropped.
     *
     * @param string $str Text to tokenize; the "u" pattern modifier means it is processed as UTF-8.
     *
     * @return string[] Tokens in order of appearance; empty when nothing could be matched.
     */
    public function tokenize($str)
    {
        // The pattern below runs in UTF-8 mode, so lower-case in UTF-8 as well
        // instead of relying on the configured internal encoding.
        $str = mb_strtolower((string) $str, 'UTF-8');

        // A string made only of separators would otherwise backtrack into
        // matching a single separator as a "token".
        if (preg_match('/^[\pZ\pC]*$/u', $str) === 1) {
            return [];
        }

        $arr = [];
        // for the character classes
        // see http://php.net/manual/en/regexp.reference.unicode.php
        $pat = '/
            ([\pZ\pC]*)           # match any separator or other
                                  # in sequence
            (
                [^\pP\pZ\pC]+ |   # match a sequence of characters
                                  # that are not punctuation,
                                  # separator or other
                .                 # match punctuations one by one
            )
            ([\pZ\pC]*)           # match a sequence of separators
                                  # that follows
        /xu';

        // preg_match_all() returns false on failure; never read $arr[2] blindly.
        if (!preg_match_all($pat, $str, $arr) || !isset($arr[2])) {
            return [];
        }

        return $arr[2];
    }

    /**
     * Tells whether a token consists solely of punctuation.
     *
     * ctype_punct() only recognises ASCII, so Unicode punctuation (dashes,
     * typographic quotes, ellipsis, ...) is checked with \pP as well.
     *
     * @param string $word Token produced by tokenize().
     *
     * @return bool
     */
    private function isPunctuation($word)
    {
        return ctype_punct($word) || preg_match('/^\pP+$/u', $word) === 1;
    }
}