����JFIF��x�x����'
| Server IP : 78.140.185.180 / Your IP : 216.73.216.170 Web Server : LiteSpeed System : Linux cpanel13.v.fozzy.com 4.18.0-513.11.1.lve.el8.x86_64 #1 SMP Thu Jan 18 16:21:02 UTC 2024 x86_64 User : builderbox ( 1072) PHP Version : 7.3.33 Disable Function : NONE MySQL : OFF | cURL : ON | WGET : ON | Perl : ON | Python : ON | Sudo : OFF | Pkexec : OFF Directory : /home/builderbox/./././www/vendor/teamtnt/tntsearch/src/Indexer/ |
Upload File : |
<?php
namespace TeamTNT\TNTSearch\Indexer;
use Exception;
use PDO;
use RecursiveDirectoryIterator;
use RecursiveIteratorIterator;
use TeamTNT\TNTSearch\Connectors\FileSystemConnector;
use TeamTNT\TNTSearch\Connectors\MySqlConnector;
use TeamTNT\TNTSearch\Connectors\PostgresConnector;
use TeamTNT\TNTSearch\Connectors\SQLiteConnector;
use TeamTNT\TNTSearch\Connectors\SqlServerConnector;
use TeamTNT\TNTSearch\FileReaders\TextFileReader;
use TeamTNT\TNTSearch\Stemmer\CroatianStemmer;
use TeamTNT\TNTSearch\Stemmer\PorterStemmer;
use TeamTNT\TNTSearch\Support\Collection;
use TeamTNT\TNTSearch\Support\Tokenizer;
use TeamTNT\TNTSearch\Support\TokenizerInterface;
class TNTIndexer
{
/** @var PDO|null Handle to the SQLite index database (set by createIndex() or setIndex()). */
protected $index = null;
/** @var PDO|mixed|null Handle to the source database that run() queries for documents. */
protected $dbh = null;
/** @var string|null Name of the primary-key column; getPrimaryKey() falls back to 'id' when unset. */
protected $primaryKey = null;
/** @var bool When true, processDocument() drops the primary-key column before stemming. */
protected $excludePrimaryKey = true;
/** @var object|null Stemmer applied to every token in stemText(). */
public $stemmer = null;
/** @var object|null Tokenizer used by breakIntoTokens(). */
public $tokenizer = null;
/** @var array Stop words handed to the tokenizer. */
public $stopWords = [];
/** @var object|null Reader used by readDocumentsFromFileSystem() to load file contents. */
public $filereader = null;
/** @var array Configuration as normalised by loadConfig(). */
public $config = [];
/** @var string SQL executed by run() against the source database. */
protected $query = "";
/** @var array Not referenced by the methods visible in this class. */
protected $wordlist = [];
/** @var array Cache of term => wordlist id, filled by saveWordlist() while $inMemory is true. */
protected $inMemoryTerms = [];
/** @var bool When true, breakIntoTokens() runs html_entity_decode() before tokenizing. */
protected $decodeHTMLEntities = false;
/** @var bool When true, info() prints nothing. */
public $disableOutput = false;
/** @var bool When true, saveWordlist() resolves ids of known terms from $inMemoryTerms. */
public $inMemory = true;
/** @var int run() reports progress every $steps rows. */
public $steps = 1000;
/** @var string Index file name passed to createIndex(). */
public $indexName = "";
/** @var bool Whether prepareStatementsForIndex() has already prepared the wordlist statements. */
public $statementsPrepared = false;
// The four statement handles below used to be created on the fly as dynamic
// properties (deprecated as of PHP 8.2). They are declared public because that
// is the visibility dynamic properties had.
/** @var \PDOStatement|null INSERT into wordlist; prepared by prepareStatementsForIndex(). */
public $insertWordlistStmt = null;
/** @var \PDOStatement|null SELECT from wordlist by term; prepared by prepareStatementsForIndex(). */
public $selectWordlistStmt = null;
/** @var \PDOStatement|null UPDATE of wordlist counters; prepared by prepareStatementsForIndex(). */
public $updateWordlistStmt = null;
/** @var \PDOStatement|null Last statement prepared by updateInfoTable(). */
public $updateInfoTableStmt = null;
/**
 * Install the default collaborators: a plain-text file reader, the standard
 * tokenizer and the Porter stemmer.
 */
public function __construct()
{
    $this->filereader = new TextFileReader();
    $this->tokenizer  = new Tokenizer();
    $this->stemmer    = new PorterStemmer();
}
/**
 * Swap the tokenizer and store its class name under the 'tokenizer' key of
 * the index info table.
 *
 * @param TokenizerInterface $tokenizer
 */
public function setTokenizer(TokenizerInterface $tokenizer)
{
    $tokenizerClass  = get_class($tokenizer);
    $this->tokenizer = $tokenizer;
    $this->updateInfoTable('tokenizer', $tokenizerClass);
}
/**
 * Replace the list of stop words passed to the tokenizer.
 *
 * @param array $stopWords
 */
public function setStopWords(array $stopWords)
{
    $this->stopWords = $stopWords;
}
/**
 * Store the configuration after normalising it: the storage path gets exactly
 * one trailing slash, and 'driver' / 'wal' receive defaults when not set.
 *
 * @param array $config
 */
public function loadConfig(array $config)
{
    $config['storage'] = rtrim($config['storage'], '/').'/';
    foreach (['driver' => "", 'wal' => true] as $option => $default) {
        if (!isset($config[$option])) {
            $config[$option] = $default;
        }
    }
    $this->config = $config;
}
/**
 * Directory in which index files are stored, as kept in the 'storage'
 * configuration entry.
 *
 * @return string
 */
public function getStoragePath()
{
    $storagePath = $this->config['storage'];

    return $storagePath;
}
/**
 * Stemmer currently used for indexing.
 *
 * @return object|null
 */
public function getStemmer()
{
    $activeStemmer = $this->stemmer;

    return $activeStemmer;
}
/**
 * Name of the primary-key column, defaulting to 'id' when none was set.
 *
 * @return string
 */
public function getPrimaryKey()
{
    return isset($this->primaryKey) ? $this->primaryKey : 'id';
}
/**
 * Choose which column identifies a document.
 *
 * @param string $primaryKey
 */
public function setPrimaryKey($primaryKey)
{
    $this->primaryKey = $primaryKey;
}
/**
 * Leave the primary-key column out of the indexed content.
 */
public function excludePrimaryKey()
{
    $this->excludePrimaryKey = true;
}
/**
 * Index the primary-key column together with the other columns.
 */
public function includePrimaryKey()
{
    $this->excludePrimaryKey = false;
}
/**
 * Swap the stemmer and store its class name under the 'stemmer' key of the
 * index info table.
 *
 * @param object $stemmer
 */
public function setStemmer($stemmer)
{
    $stemmerClass  = get_class($stemmer);
    $this->stemmer = $stemmer;
    $this->updateInfoTable('stemmer', $stemmerClass);
}
/**
 * Shortcut for installing the Croatian stemmer.
 */
public function setCroatianStemmer()
{
    $croatian = new CroatianStemmer();
    $this->setStemmer($croatian);
}
/**
 * Install the stemmer for a language. The class name is derived from the
 * language: lower-cased, first letter upper-cased, suffixed with "Stemmer",
 * inside the TeamTNT\TNTSearch\Stemmer namespace.
 *
 * @param string $language - one of: arabic, croatian, german, italian, porter, russian, ukrainian
 */
public function setLanguage($language = 'porter')
{
    $stemmerName  = ucfirst(strtolower($language));
    $stemmerClass = 'TeamTNT\\TNTSearch\\Stemmer\\'.$stemmerName.'Stemmer';
    $this->setStemmer(new $stemmerClass);
}
/**
 * Use an already opened index database.
 *
 * @param PDO $index
 */
public function setIndex($index)
{
    $this->index = $index;
}
/**
 * Replace the reader used to load file contents when indexing a directory.
 *
 * @param object $filereader
 */
public function setFileReader($filereader)
{
    $this->filereader = $filereader;
}
/**
 * Prepare the three wordlist statements (insert, select by term, update
 * counters) once and remember that this was done.
 *
 * The select compares with "=" rather than LIKE: with LIKE, a term containing
 * "_" or "%" acts as a pattern and can return the row of a different term.
 * "=" still honours the NOCASE collation declared on the term column by
 * createIndex(), i.e. the same rule the UNIQUE constraint uses.
 */
public function prepareStatementsForIndex()
{
    if ($this->statementsPrepared) {
        return;
    }
    $this->insertWordlistStmt = $this->index->prepare("INSERT INTO wordlist (term, num_hits, num_docs) VALUES (:keyword, :hits, :docs)");
    $this->selectWordlistStmt = $this->index->prepare("SELECT * FROM wordlist WHERE term = :keyword LIMIT 1");
    $this->updateWordlistStmt = $this->index->prepare("UPDATE wordlist SET num_docs = num_docs + :docs, num_hits = num_hits + :hits WHERE term = :keyword");
    $this->statementsPrepared = true;
}
/**
 * Create a fresh SQLite index file in the configured storage directory.
 *
 * An existing file with the same name is deleted first. The method then
 * creates the wordlist, doclist, fields, hitlist and info tables plus their
 * indexes, seeds the info table, applies the 'stemmer' / 'tokenizer' classes
 * from the configuration when present, and connects to the source database
 * unless a handle was already supplied.
 *
 * The info table is seeded with the classes of the stemmer and tokenizer that
 * are actually installed on this indexer, so that the metadata describes how
 * the documents were processed. The previous hard-coded defaults are used
 * only when no object is installed.
 *
 * @param string $indexName
 *
 * @return TNTIndexer
 * @throws Exception when no usable source-database driver is configured
 */
public function createIndex($indexName)
{
    $this->indexName = $indexName;
    $indexPath       = $this->config['storage'].$indexName;
    if (file_exists($indexPath)) {
        unlink($indexPath);
    }
    $this->index = new PDO('sqlite:'.$indexPath);
    $this->index->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_EXCEPTION);
    // Statements and cached term ids belong to the previous index handle.
    $this->statementsPrepared = false;
    $this->inMemoryTerms      = [];
    if ($this->config['wal']) {
        $this->index->exec("PRAGMA journal_mode=wal;");
    }
    $this->index->exec("CREATE TABLE IF NOT EXISTS wordlist (
        id INTEGER PRIMARY KEY,
        term TEXT UNIQUE COLLATE nocase,
        num_hits INTEGER,
        num_docs INTEGER)");
    $this->index->exec("CREATE UNIQUE INDEX 'main'.'index' ON wordlist ('term');");
    $this->index->exec("CREATE TABLE IF NOT EXISTS doclist (
        term_id INTEGER,
        doc_id INTEGER,
        hit_count INTEGER)");
    $this->index->exec("CREATE TABLE IF NOT EXISTS fields (
        id INTEGER PRIMARY KEY,
        name TEXT)");
    $this->index->exec("CREATE TABLE IF NOT EXISTS hitlist (
        term_id INTEGER,
        doc_id INTEGER,
        field_id INTEGER,
        position INTEGER,
        hit_count INTEGER)");
    $this->index->exec("CREATE TABLE IF NOT EXISTS info (
        key TEXT,
        value INTEGER)");
    $infoDefaults = [
        'total_documents' => 0,
        'stemmer'         => is_object($this->stemmer)
            ? get_class($this->stemmer)
            : 'TeamTNT\TNTSearch\Stemmer\NoStemmer',
        'tokenizer'       => is_object($this->tokenizer)
            ? get_class($this->tokenizer)
            : 'TeamTNT\TNTSearch\Support\Tokenizer',
    ];
    $insertInfo = $this->index->prepare("INSERT INTO info (key, value) VALUES (:key, :value)");
    foreach ($infoDefaults as $infoKey => $infoValue) {
        $insertInfo->bindValue(':key', $infoKey);
        $insertInfo->bindValue(':value', $infoValue);
        $insertInfo->execute();
    }
    $this->index->exec("CREATE INDEX IF NOT EXISTS 'main'.'term_id_index' ON doclist ('term_id' COLLATE BINARY);");
    $this->index->exec("CREATE INDEX IF NOT EXISTS 'main'.'doc_id_index' ON doclist ('doc_id');");
    // Configured classes override the installed ones; the setters also update the info table.
    if (isset($this->config['stemmer'])) {
        $this->setStemmer(new $this->config['stemmer']);
    }
    if (isset($this->config['tokenizer'])) {
        $this->setTokenizer(new $this->config['tokenizer']);
    }
    if (!$this->dbh) {
        $connector = $this->createConnector($this->config);
        $this->dbh = $connector->connect($this->config);
    }
    return $this;
}
/**
 * Open a transaction on the index database.
 */
public function indexBeginTransaction()
{
    $indexHandle = $this->index;
    $indexHandle->beginTransaction();
}
/**
 * Commit the transaction currently open on the index database.
 */
public function indexEndTransaction()
{
    $indexHandle = $this->index;
    $indexHandle->commit();
}
/**
 * Build the connector matching the configured driver name
 * (mysql, pgsql, sqlite, sqlsrv or filesystem).
 *
 * @param array $config
 *
 * @return FileSystemConnector|MySqlConnector|PostgresConnector|SQLiteConnector|SqlServerConnector
 * @throws Exception when the driver is missing or not one of the supported names
 */
public function createConnector(array $config)
{
    if (!isset($config['driver'])) {
        throw new Exception('A driver must be specified.');
    }
    $driver = $config['driver'];
    // Loose comparison on purpose: it mirrors what a switch statement does.
    if ($driver == 'mysql') {
        return new MySqlConnector;
    }
    if ($driver == 'pgsql') {
        return new PostgresConnector;
    }
    if ($driver == 'sqlite') {
        return new SQLiteConnector;
    }
    if ($driver == 'sqlsrv') {
        return new SqlServerConnector;
    }
    if ($driver == 'filesystem') {
        return new FileSystemConnector;
    }
    throw new Exception("Unsupported driver [{$config['driver']}]");
}
/**
 * Use an existing connection as the document source. For MySQL connections
 * buffered queries are switched off.
 *
 * @param PDO $dbh
 */
public function setDatabaseHandle(PDO $dbh)
{
    $this->dbh  = $dbh;
    $driverName = $this->dbh->getAttribute(PDO::ATTR_DRIVER_NAME);
    if ($driverName == 'mysql') {
        $this->dbh->setAttribute(PDO::MYSQL_ATTR_USE_BUFFERED_QUERY, false);
    }
}
/**
 * Remember the SQL that run() executes against the source database.
 *
 * @param string $query
 */
public function query($query)
{
    $this->query = $query;
}
/**
 * Index every row returned by the stored query.
 *
 * With the "filesystem" driver the work is delegated to
 * readDocumentsFromFileSystem(). Otherwise rows are processed inside a
 * transaction that is committed and reopened every 10000 rows, progress is
 * reported every $steps rows, and the final row count is written to the
 * info table.
 *
 * @return mixed result of readDocumentsFromFileSystem() for the filesystem driver, otherwise nothing
 */
public function run()
{
    if ($this->config['driver'] == "filesystem") {
        return $this->readDocumentsFromFileSystem();
    }
    $rows    = $this->dbh->query($this->query);
    $counter = 0;
    $this->index->beginTransaction();
    while ($record = $rows->fetch(PDO::FETCH_ASSOC)) {
        ++$counter;
        $this->processDocument(new Collection($record));
        $reachedStep  = ($counter % $this->steps == 0);
        $reachedBatch = ($counter % 10000 == 0);
        if ($reachedStep) {
            $this->info("Processed $counter rows");
        }
        if ($reachedBatch) {
            $this->index->commit();
            $this->index->beginTransaction();
            $this->info("Committed");
        }
    }
    $this->index->commit();
    $this->updateInfoTable('total_documents', $counter);
    $this->info("Total rows $counter");
}
/**
 * Index the files found under the configured 'location' directory.
 *
 * A file is indexed when it passes the 'extension' rule (a callable receiving
 * the file object, a list of extensions, or a suffix string) and its relative
 * name is not listed in 'exclude'. Each indexed file gets a sequential id,
 * which is stored together with its path in the filemap table.
 *
 * The document total is written by updating the existing 'total_documents'
 * row. createIndex() already seeds that row with 0, so inserting another row
 * left two rows for the same key, and readers that fetch the first one saw 0.
 * A row is inserted only when the index has none.
 */
public function readDocumentsFromFileSystem()
{
    $exclude   = isset($this->config['exclude']) ? $this->config['exclude'] : [];
    $extension = $this->config['extension'];
    $this->index->exec("CREATE TABLE IF NOT EXISTS filemap (
        id INTEGER PRIMARY KEY,
        path TEXT)");
    $path    = realpath($this->config['location']);
    $objects = new RecursiveIteratorIterator(new RecursiveDirectoryIterator($path), RecursiveIteratorIterator::SELF_FIRST);
    // Prepared once; the id is bound instead of being interpolated for every file.
    $filemapStmt = $this->index->prepare("INSERT INTO filemap (id, path) VALUES (:id, :path)");
    $this->index->beginTransaction();
    $counter = 0;
    foreach ($objects as $name => $object) {
        $name = str_replace($path.'/', '', $name);
        if (is_callable($extension)) {
            $includeFile = $extension($object);
        } elseif (is_array($extension)) {
            $includeFile = in_array($object->getExtension(), $extension);
        } else {
            $includeFile = stringEndsWith($name, $extension);
        }
        if (!$includeFile || in_array($name, $exclude)) {
            continue;
        }
        $counter++;
        $fileCollection = new Collection([
            'id'      => $counter,
            'name'    => $name,
            'content' => $this->filereader->read($object),
        ]);
        // Optional hooks exposed by the file reader.
        if (property_exists($this->filereader, 'fileFilterCallback')
            && is_callable($this->filereader->fileFilterCallback)) {
            $fileCollection = $fileCollection->filter($this->filereader->fileFilterCallback);
        }
        if (property_exists($this->filereader, 'fileMapCallback')
            && is_callable($this->filereader->fileMapCallback)) {
            $fileCollection = $fileCollection->map($this->filereader->fileMapCallback);
        }
        $this->processDocument($fileCollection);
        $filePath = (string) $object;
        $filemapStmt->bindValue(':id', $counter, PDO::PARAM_INT);
        $filemapStmt->bindValue(':path', $filePath);
        $filemapStmt->execute();
        $this->info("Processed $counter $filePath");
    }
    $this->index->commit();
    $updated = $this->prepareAndExecuteStatement(
        "UPDATE info SET value = :value WHERE key = 'total_documents'",
        [['key' => ':value', 'value' => $counter]]
    );
    if ($updated->rowCount() === 0) {
        $this->prepareAndExecuteStatement(
            "INSERT INTO info (key, value) VALUES ('total_documents', :value)",
            [['key' => ':value', 'value' => $counter]]
        );
    }
    $this->index->exec("INSERT INTO info ( 'key', 'value') values ( 'driver', 'filesystem')");
    $this->info("Total rows $counter");
    $this->info("Index created: {$this->config['storage']}");
}
/**
 * Stem every column of one document and store the result in the index.
 *
 * The document id is read from the primary-key column; that column is removed
 * from the row before stemming when primary-key exclusion is on.
 *
 * @param Collection $row
 */
public function processDocument($row)
{
    $primaryKey = $this->getPrimaryKey();
    $documentId = $row->get($primaryKey);
    if ($this->excludePrimaryKey) {
        $row->forget($this->getPrimaryKey());
    }
    $stemmedColumns = $row->map(function ($columnContent) {
        return $this->stemText($columnContent);
    });
    $this->saveToIndex($stemmedColumns, $documentId);
}
/**
 * Add one document to the index and raise the stored document total by one.
 *
 * @param array $document
 */
public function insert($document)
{
    $row = new Collection($document);
    $this->processDocument($row);
    $newTotal = $this->totalDocumentsInCollection() + 1;
    $this->updateInfoTable('total_documents', $newTotal);
}
/**
 * Re-index a document: remove the entry stored under $id, then insert the
 * new version.
 *
 * @param mixed $id
 * @param array $document
 */
public function update($id, $document)
{
    $this->delete($id);

    $this->insert($document);
}
/**
 * Remove a document from the index.
 *
 * The wordlist counters of every term the document contained are reduced, the
 * document's doclist rows are deleted, and terms left without hits are
 * dropped. The stored document total is decremented when the document was
 * present in the doclist.
 *
 * The total used to be decremented only when the wordlist cleanup removed a
 * row, so deleting a document whose terms all occur elsewhere left it
 * unchanged, and every update() then inflated it by one.
 *
 * @param mixed $documentId
 */
public function delete($documentId)
{
    $rows = $this->prepareAndExecuteStatement("SELECT * FROM doclist WHERE doc_id = :documentId;", [
        ['key' => ':documentId', 'value' => $documentId]
    ])->fetchAll(PDO::FETCH_ASSOC);
    $updateStmt = $this->index->prepare("UPDATE wordlist SET num_docs = num_docs - 1, num_hits = num_hits - :hits WHERE id = :term_id");
    foreach ($rows as $document) {
        $updateStmt->bindValue(":hits", $document['hit_count']);
        $updateStmt->bindValue(":term_id", $document['term_id']);
        $updateStmt->execute();
    }
    $this->prepareAndExecuteStatement("DELETE FROM doclist WHERE doc_id = :documentId;", [
        ['key' => ':documentId', 'value' => $documentId]
    ]);
    $this->prepareAndExecuteStatement("DELETE FROM wordlist WHERE num_hits = 0");
    if (count($rows) > 0) {
        // Never let the stored total go below zero.
        $total = max(0, $this->totalDocumentsInCollection() - 1);
        $this->updateInfoTable('total_documents', $total);
    }
}
/**
 * Overwrite the value stored under $key in the info table. Rows are only
 * updated, never created.
 *
 * @param string $key
 * @param mixed  $value
 */
public function updateInfoTable($key, $value)
{
    $statement = $this->index->prepare("UPDATE info SET value = :value WHERE key = :key");
    // Keep the handle on the instance, as callers may inspect it.
    $this->updateInfoTableStmt = $statement;
    $statement->bindValue(':key', $key);
    $statement->bindValue(':value', $value);
    $statement->execute();
}
/**
 * Tokenize a text and return the stem of every token, in order.
 *
 * @param string $text
 *
 * @return array
 */
public function stemText($text)
{
    $activeStemmer = $this->getStemmer();
    $stemmed       = [];
    foreach ($this->breakIntoTokens($text) as $token) {
        $stemmed[] = $activeStemmer->stem($token);
    }
    return $stemmed;
}
/**
 * Split a text into tokens with the configured tokenizer and stop words.
 * HTML entities are decoded first when that option is switched on.
 *
 * @param string $text
 *
 * @return array
 */
public function breakIntoTokens($text)
{
    $input = $this->decodeHTMLEntities ? html_entity_decode($text) : $text;

    return $this->tokenizer->tokenize($input, $this->stopWords);
}
/**
 * Switch HTML-entity decoding before tokenizing on or off.
 *
 * @param bool $value
 */
public function decodeHtmlEntities($value = true)
{
    $this->decodeHTMLEntities = $value;
}
/**
 * Persist the stems of one document: update the wordlist, link the terms to
 * the document in the doclist, then hand over to saveHitList().
 *
 * @param Collection $stems
 * @param mixed      $docId
 */
public function saveToIndex($stems, $docId)
{
    $this->prepareStatementsForIndex();

    $savedTerms = $this->saveWordlist($stems);

    $this->saveDoclist($savedTerms, $docId);
    $this->saveHitList($stems, $docId, $savedTerms);
}
/**
 * Count the terms of one document and merge them into the wordlist table.
 *
 * Each distinct term is inserted. When the insert hits the unique constraint
 * (error code 23000) the existing row's counters are increased instead and
 * its id is looked up: from the in-memory cache when enabled and populated,
 * otherwise from the database. The database fallback covers terms that were
 * not added through this instance, or that match an existing row only under
 * the column's case-insensitive collation; those previously yielded an
 * undefined cache entry and a null id.
 *
 * @param $stems
 *
 * @return array term => ['hits' => int, 'docs' => 1, 'id' => wordlist id]
 */
public function saveWordlist($stems)
{
    $terms = [];
    $stems->map(function ($column, $key) use (&$terms) {
        foreach ($column as $term) {
            if (array_key_exists($term, $terms)) {
                $terms[$term]['hits']++;
            } else {
                $terms[$term] = [
                    'hits' => 1,
                    'docs' => 1,
                    'id'   => 0
                ];
            }
        }
    });
    foreach ($terms as $key => $term) {
        try {
            $this->insertWordlistStmt->bindValue(":keyword", $key);
            $this->insertWordlistStmt->bindValue(":hits", $term['hits']);
            $this->insertWordlistStmt->bindValue(":docs", $term['docs']);
            $this->insertWordlistStmt->execute();
            $terms[$key]['id'] = $this->index->lastInsertId();
            if ($this->inMemory) {
                $this->inMemoryTerms[$key] = $terms[$key]['id'];
            }
        } catch (\Exception $e) {
            if ($e->getCode() == 23000) {
                $this->updateWordlistStmt->bindValue(':docs', $term['docs']);
                $this->updateWordlistStmt->bindValue(':hits', $term['hits']);
                $this->updateWordlistStmt->bindValue(':keyword', $key);
                $this->updateWordlistStmt->execute();
                if ($this->inMemory && isset($this->inMemoryTerms[$key])) {
                    $terms[$key]['id'] = $this->inMemoryTerms[$key];
                } else {
                    $this->selectWordlistStmt->bindValue(':keyword', $key);
                    $this->selectWordlistStmt->execute();
                    $res = $this->selectWordlistStmt->fetch(PDO::FETCH_ASSOC);
                    if ($res !== false) {
                        $terms[$key]['id'] = $res['id'];
                        if ($this->inMemory) {
                            $this->inMemoryTerms[$key] = $res['id'];
                        }
                    }
                }
            } else {
                echo "Error while saving wordlist: ".$e->getMessage()."\n";
            }
            // Statements must be refreshed, because in this state they have error attached to them.
            $this->statementsPrepared = false;
            $this->prepareStatementsForIndex();
        }
    }
    return $terms;
}
/**
 * Link every term of a document to that document in the doclist table,
 * storing the per-document hit count. A failing insert is reported by
 * printing the exception message and does not stop the loop.
 *
 * @param array $terms term data as returned by saveWordlist()
 * @param mixed $docId
 */
public function saveDoclist($terms, $docId)
{
    $statement = $this->index->prepare(
        "INSERT INTO doclist (term_id, doc_id, hit_count) VALUES (:id, :doc, :hits)"
    );
    foreach ($terms as $termData) {
        $statement->bindValue(':id', $termData['id']);
        $statement->bindValue(':doc', $docId);
        $statement->bindValue(':hits', $termData['hits']);
        try {
            $statement->execute();
        } catch (\Exception $exception) {
            //we have a duplicate
            echo $exception->getMessage();
        }
    }
}
/**
 * Currently a no-op: the method returns immediately, so nothing is written
 * to the hitlist table.
 *
 * The statements after the return are unreachable. They would insert one
 * hitlist row per term occurrence, recording the term id, document id, a
 * running field counter, the position within the field and the number of
 * times the term occurs in that field.
 *
 * @param mixed $stems     stems per field, iterated as field => terms
 * @param mixed $docId
 * @param array $termsList term => data holding at least an 'id' entry
 */
public function saveHitList($stems, $docId, $termsList)
{
// Hit-list storage is switched off; everything below is kept but never runs.
return;
$fieldCounter = 0;
$fields = [];
$insert = "INSERT INTO hitlist (term_id, doc_id, field_id, position, hit_count)
VALUES (:term_id, :doc_id, :field_id, :position, :hit_count)";
$stmt = $this->index->prepare($insert);
foreach ($stems as $field => $terms) {
$fields[$fieldCounter] = $field;
$positionCounter = 0;
$termCounts = array_count_values($terms);
foreach ($terms as $term) {
if (isset($termsList[$term])) {
$stmt->bindValue(':term_id', $termsList[$term]['id']);
$stmt->bindValue(':doc_id', $docId);
$stmt->bindValue(':field_id', $fieldCounter);
$stmt->bindValue(':position', $positionCounter);
$stmt->bindValue(':hit_count', $termCounts[$term]);
$stmt->execute();
}
$positionCounter++;
}
$fieldCounter++;
}
}
/**
 * Fetch the first wordlist row whose term matches $word. The comparison uses
 * SQL LIKE, so "%" and "_" in $word act as wildcards.
 *
 * @param string $word
 *
 * @return array|false the row as an associative array, or false when nothing matches
 */
public function getWordFromWordList($word)
{
    $lookup = $this->index->prepare("SELECT * FROM wordlist WHERE term like :keyword LIMIT 1");
    $lookup->bindValue(':keyword', $word);
    $lookup->execute();
    $row = $lookup->fetch(PDO::FETCH_ASSOC);

    return $row;
}
/**
 * Total number of hits recorded for a word, or 0 when the word is unknown.
 *
 * @param $word
 *
 * @return int
 */
public function countWordInWordList($word)
{
    $entry = $this->getWordFromWordList($word);

    return $entry ? $entry['num_hits'] : 0;
}
/**
 * Number of documents recorded as containing a word, or 0 when the word is
 * unknown.
 *
 * @param $word
 *
 * @return int
 */
public function countDocHitsInWordList($word)
{
    $entry = $this->getWordFromWordList($word);

    return $entry ? $entry['num_docs'] : 0;
}
/**
 * Write the wordlist to a file, most frequent terms first, one term per line
 * with optional tab-separated hit and document counts.
 *
 * When $count is positive, output stops after that many terms and the last
 * line has no trailing newline; otherwise every line ends with a newline.
 *
 * @param string $filename target file, written with an exclusive lock
 * @param int    $count    maximum number of terms; values <= 0 mean no limit
 * @param bool   $hits     append the hit count to each line
 * @param bool   $docs     append the document count to each line
 */
public function buildDictionary($filename, $count = -1, $hits = true, $docs = false)
{
    $wordRows = $this->index->prepare("SELECT * FROM wordlist ORDER BY num_hits DESC;");
    $wordRows->execute();
    $lines        = [];
    $limitReached = false;
    while ($row = $wordRows->fetch(PDO::FETCH_ASSOC)) {
        $columns = [$row['term']];
        if ($hits) {
            $columns[] = $row['num_hits'];
        }
        if ($docs) {
            $columns[] = $row['num_docs'];
        }
        $lines[] = implode("\t", $columns);
        if ($count > 0 && count($lines) >= $count) {
            $limitReached = true;
            break;
        }
    }
    $dictionary = implode("\n", $lines);
    if (!$limitReached && count($lines) > 0) {
        $dictionary .= "\n";
    }
    file_put_contents($filename, $dictionary, LOCK_EX);
}
/**
 * Document total as stored under 'total_documents' in the info table (the
 * first matching row is used).
 *
 * @return int
 */
public function totalDocumentsInCollection()
{
    $result = $this->index->query("SELECT * FROM info WHERE key = 'total_documents'");
    $row    = $result->fetch(PDO::FETCH_ASSOC);

    return $row['value'];
}
/**
 * Split a keyword into overlapping three-character groups, after padding it
 * with two underscores on each side, and return them separated by spaces.
 * Example: "ab" gives "__a _ab ab_ b__".
 *
 * The number of trigrams is derived from the character length (mb_strlen) so
 * that it agrees with the character-based mb_substr slicing. Counting bytes
 * produced extra, truncated trigrams for keywords with multibyte characters.
 *
 * @param $keyword
 *
 * @return string
 */
public function buildTrigrams($keyword)
{
    $padded   = "__".$keyword."__";
    $last     = mb_strlen($padded) - 2;
    $trigrams = [];
    for ($i = 0; $i < $last; $i++) {
        $trigrams[] = mb_substr($padded, $i, 3);
    }
    return trim(implode(" ", $trigrams));
}
/**
 * Prepare a statement on the index database, bind the given parameters and
 * execute it.
 *
 * @param string $query
 * @param array  $params list of ['key' => placeholder, 'value' => value] pairs
 *
 * @return \PDOStatement the executed statement
 */
public function prepareAndExecuteStatement($query, $params = [])
{
    $statement = $this->index->prepare($query);
    foreach ($params as $binding) {
        $statement->bindParam($binding['key'], $binding['value']);
    }
    $statement->execute();

    return $statement;
}
/**
 * Print a progress message followed by a line break, unless output is
 * disabled.
 *
 * @param string $text
 */
public function info($text)
{
    if ($this->disableOutput) {
        return;
    }
    echo $text.PHP_EOL;
}
}