����JFIF��x�x����'
| Server IP : 78.140.185.180 / Your IP : 216.73.216.170 Web Server : LiteSpeed System : Linux cpanel13.v.fozzy.com 4.18.0-513.11.1.lve.el8.x86_64 #1 SMP Thu Jan 18 16:21:02 UTC 2024 x86_64 User : builderbox ( 1072) PHP Version : 7.3.33 Disable Function : NONE MySQL : OFF | cURL : ON | WGET : ON | Perl : ON | Python : ON | Sudo : OFF | Pkexec : OFF Directory : /home/builderbox/./././www/vendor/teamtnt/tntsearch/src/Stemmer/ |
Upload File : |
<?php
namespace TeamTNT\TNTSearch\Stemmer;
/**
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
/**
* This is a reimplementation of the Porter Stemmer Algorithm for Portuguese.
* This script is based on the implementation found on <https://github.com/wamania/php-stemmer>
* and has been rewritten to work with TNTSearch by Lucas Padilha <https://github.com/LucasPadilha>
*
* Takes a word and reduces it to its Portuguese stem using the Porter stemmer algorithm.
*
* References:
* - http://snowball.tartarus.org/algorithms/porter/stemmer.html
* - http://snowball.tartarus.org/algorithms/portuguese/stemmer.html
*
* Usage:
* $stem = PortugueseStemmer::stem($word);
*
* @author Lucas Padilha <https://github.com/LucasPadilha>
*/
class PortugueseStemmer implements Stemmer
{
/**
 * UTF-8 case lookup table
 *
 * Maps the code point of a lower case letter (array key) to the code point
 * of its corresponding upper case letter (array value). strtolower() flips
 * this table to obtain the upper-to-lower mapping used when the mbstring
 * fast path is not taken.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 */
private static $utf8_lower_to_upper = array(
0x0061=>0x0041, 0x03C6=>0x03A6, 0x0163=>0x0162, 0x00E5=>0x00C5, 0x0062=>0x0042,
0x013A=>0x0139, 0x00E1=>0x00C1, 0x0142=>0x0141, 0x03CD=>0x038E, 0x0101=>0x0100,
0x0491=>0x0490, 0x03B4=>0x0394, 0x015B=>0x015A, 0x0064=>0x0044, 0x03B3=>0x0393,
0x00F4=>0x00D4, 0x044A=>0x042A, 0x0439=>0x0419, 0x0113=>0x0112, 0x043C=>0x041C,
0x015F=>0x015E, 0x0144=>0x0143, 0x00EE=>0x00CE, 0x045E=>0x040E, 0x044F=>0x042F,
0x03BA=>0x039A, 0x0155=>0x0154, 0x0069=>0x0049, 0x0073=>0x0053, 0x1E1F=>0x1E1E,
0x0135=>0x0134, 0x0447=>0x0427, 0x03C0=>0x03A0, 0x0438=>0x0418, 0x00F3=>0x00D3,
0x0440=>0x0420, 0x0454=>0x0404, 0x0435=>0x0415, 0x0449=>0x0429, 0x014B=>0x014A,
0x0431=>0x0411, 0x0459=>0x0409, 0x1E03=>0x1E02, 0x00F6=>0x00D6, 0x00F9=>0x00D9,
0x006E=>0x004E, 0x0451=>0x0401, 0x03C4=>0x03A4, 0x0443=>0x0423, 0x015D=>0x015C,
0x0453=>0x0403, 0x03C8=>0x03A8, 0x0159=>0x0158, 0x0067=>0x0047, 0x00E4=>0x00C4,
0x03AC=>0x0386, 0x03AE=>0x0389, 0x0167=>0x0166, 0x03BE=>0x039E, 0x0165=>0x0164,
0x0117=>0x0116, 0x0109=>0x0108, 0x0076=>0x0056, 0x00FE=>0x00DE, 0x0157=>0x0156,
0x00FA=>0x00DA, 0x1E61=>0x1E60, 0x1E83=>0x1E82, 0x00E2=>0x00C2, 0x0119=>0x0118,
0x0146=>0x0145, 0x0070=>0x0050, 0x0151=>0x0150, 0x044E=>0x042E, 0x0129=>0x0128,
0x03C7=>0x03A7, 0x013E=>0x013D, 0x0442=>0x0422, 0x007A=>0x005A, 0x0448=>0x0428,
0x03C1=>0x03A1, 0x1E81=>0x1E80, 0x016D=>0x016C, 0x00F5=>0x00D5, 0x0075=>0x0055,
0x0177=>0x0176, 0x00FC=>0x00DC, 0x1E57=>0x1E56, 0x03C3=>0x03A3, 0x043A=>0x041A,
0x006D=>0x004D, 0x016B=>0x016A, 0x0171=>0x0170, 0x0444=>0x0424, 0x00EC=>0x00CC,
0x0169=>0x0168, 0x03BF=>0x039F, 0x006B=>0x004B, 0x00F2=>0x00D2, 0x00E0=>0x00C0,
0x0434=>0x0414, 0x03C9=>0x03A9, 0x1E6B=>0x1E6A, 0x00E3=>0x00C3, 0x044D=>0x042D,
0x0436=>0x0416, 0x01A1=>0x01A0, 0x010D=>0x010C, 0x011D=>0x011C, 0x00F0=>0x00D0,
0x013C=>0x013B, 0x045F=>0x040F, 0x045A=>0x040A, 0x00E8=>0x00C8, 0x03C5=>0x03A5,
0x0066=>0x0046, 0x00FD=>0x00DD, 0x0063=>0x0043, 0x021B=>0x021A, 0x00EA=>0x00CA,
0x03B9=>0x0399, 0x017A=>0x0179, 0x00EF=>0x00CF, 0x01B0=>0x01AF, 0x0065=>0x0045,
0x03BB=>0x039B, 0x03B8=>0x0398, 0x03BC=>0x039C, 0x045C=>0x040C, 0x043F=>0x041F,
0x044C=>0x042C, 0x00FE=>0x00DE, 0x00F0=>0x00D0, 0x1EF3=>0x1EF2, 0x0068=>0x0048,
0x00EB=>0x00CB, 0x0111=>0x0110, 0x0433=>0x0413, 0x012F=>0x012E, 0x00E6=>0x00C6,
0x0078=>0x0058, 0x0161=>0x0160, 0x016F=>0x016E, 0x03B1=>0x0391, 0x0457=>0x0407,
0x0173=>0x0172, 0x00FF=>0x0178, 0x006F=>0x004F, 0x043B=>0x041B, 0x03B5=>0x0395,
0x0445=>0x0425, 0x0121=>0x0120, 0x017E=>0x017D, 0x017C=>0x017B, 0x03B6=>0x0396,
0x03B2=>0x0392, 0x03AD=>0x0388, 0x1E85=>0x1E84, 0x0175=>0x0174, 0x0071=>0x0051,
0x0437=>0x0417, 0x1E0B=>0x1E0A, 0x0148=>0x0147, 0x0105=>0x0104, 0x0458=>0x0408,
0x014D=>0x014C, 0x00ED=>0x00CD, 0x0079=>0x0059, 0x010B=>0x010A, 0x03CE=>0x038F,
0x0072=>0x0052, 0x0430=>0x0410, 0x0455=>0x0405, 0x0452=>0x0402, 0x0127=>0x0126,
0x0137=>0x0136, 0x012B=>0x012A, 0x03AF=>0x038A, 0x044B=>0x042B, 0x006C=>0x004C,
0x03B7=>0x0397, 0x0125=>0x0124, 0x0219=>0x0218, 0x00FB=>0x00DB, 0x011F=>0x011E,
0x043E=>0x041E, 0x1E41=>0x1E40, 0x03BD=>0x039D, 0x0107=>0x0106, 0x03CB=>0x03AB,
0x0446=>0x0426, 0x00FE=>0x00DE, 0x00E7=>0x00C7, 0x03CA=>0x03AA, 0x0441=>0x0421,
0x0432=>0x0412, 0x010F=>0x010E, 0x00F8=>0x00D8, 0x0077=>0x0057, 0x011B=>0x011A,
0x0074=>0x0054, 0x006A=>0x004A, 0x045B=>0x040B, 0x0456=>0x0406, 0x0103=>0x0102,
0x03BB=>0x039B, 0x00F1=>0x00D1, 0x043D=>0x041D, 0x03CC=>0x038C, 0x00E9=>0x00C9,
0x00F0=>0x00D0, 0x0457=>0x0407, 0x0123=>0x0122,
);
/**
 * Letters treated as vowels when rx() and rv() compute the R1, R2 and RV
 * regions of a word. Every other character counts as a non-vowel.
 */
private static $vowels = array('a', 'e', 'i', 'o', 'u', 'á', 'é', 'í', 'ó', 'ú', 'â', 'ê', 'ô');
/**
 * Reduces a word to its Portuguese stem.
 *
 * The word is lower-cased, "ã"/"õ" are rewritten to "a~"/"o~", the RV, R1
 * and R2 regions are located and the suffix-removal steps are applied:
 * step 1 always; step 2 only if step 1 left the word unchanged; step 3 if
 * step 1 or 2 changed the word, otherwise step 4; step 5 always. Finally
 * finish() restores "ã"/"õ".
 *
 * @param string $word UTF-8 encoded word
 *
 * @return string the stem
 *
 * @throws \Exception when $word is not valid UTF-8
 */
public static function stem($word)
{
    // we do ALL in UTF-8
    if (!self::check($word)) {
        throw new \Exception('Word must be in UTF-8');
    }

    $word = self::strtolower($word);
    $word = self::str_replace(array('ã', 'õ'), array('a~', 'o~'), $word);

    // Each region defaults to "empty, starting at the end of the word".
    // The indexes must be integers: they are used as search offsets, and a
    // helper that returns early would otherwise leave a non-integer behind.
    $length = self::strlen($word);

    $rv = '';
    $rvIndex = $length;
    self::rv($word, $rv, $rvIndex);

    $r1 = '';
    $r1Index = $length;
    self::r1($word, $r1, $r1Index);

    $r2 = '';
    $r2Index = $length;
    self::r2($r1, $r1Index, $r2, $r2Index);

    $initialWord = $word;

    self::step1($word, $r1Index, $r2Index, $rvIndex);

    // Strict comparison: loose "==" compares numeric-looking strings by value.
    if ($initialWord === $word) {
        self::step2($word, $rvIndex);
    }

    if ($initialWord !== $word) {
        self::step3($word, $rvIndex);
    } else {
        self::step4($word, $rvIndex);
    }

    self::step5($word, $rvIndex);
    self::finish($word);

    return $word;
}
/**
 * Computes R1: the region after the first non-vowel following a vowel, or
 * the end of the word if there is no such non-vowel.
 *
 * @param string $word     word to analyse
 * @param string $r1       receives the text of R1
 * @param int    $r1Index  receives the character index at which R1 starts
 *
 * @return bool always true
 */
private static function r1($word, &$r1, &$r1Index)
{
    $region = self::rx($word);

    $r1Index = $region[0];
    $r1 = $region[1];

    return true;
}
/**
 * Computes R2: the region after the first non-vowel following a vowel in R1,
 * or the end of the word if there is no such non-vowel.
 *
 * @param string $r1       text of R1
 * @param int    $r1Index  index at which R1 starts in the word
 * @param string $r2       receives the text of R2
 * @param int    $r2Index  receives the index at which R2 starts in the word
 *
 * @return bool always true
 */
private static function r2($r1, $r1Index, &$r2, &$r2Index)
{
    $region = self::rx($r1);

    // rx() reports an index relative to R1; shift it so it is relative to the word.
    $r2Index = $r1Index + $region[0];
    $r2 = $region[1];

    return true;
}
/**
 * Common region search used for R1 and R2.
 *
 * Finds the region after the first non-vowel following a vowel in $in, or
 * the end of the string if there is no such non-vowel.
 * R1 : $in is the whole word
 * R2 : $in is R1
 *
 * @param string $in UTF-8 string to analyse
 *
 * @return array array($index, $value): $index is the character offset at
 *               which the region starts inside $in (the length of $in when
 *               no region exists) and $value is the text of the region
 */
private static function rx($in)
{
    // Split into characters once instead of re-scanning the string for
    // every single letter that is inspected.
    $letters = preg_split('//u', $in, -1, PREG_SPLIT_NO_EMPTY);
    if ($letters === false) {
        $letters = array();
    }
    $length = count($letters);

    // Stop one short of the end: a vowel that is the last letter has no
    // following non-vowel, so the end of the string must not count as one.
    for ($i = 0; $i + 1 < $length; $i++) {
        $isVowel = in_array($letters[$i], static::$vowels, true);
        $nextIsVowel = in_array($letters[$i + 1], static::$vowels, true);

        if ($isVowel && !$nextIsVowel) {
            $index = $i + 2;

            return array($index, implode('', array_slice($letters, $index)));
        }
    }

    // No vowel/non-vowel pair: the region is empty and sits at the end.
    return array($length, '');
}
/**
 * Computes the RV region.
 *
 * Used by spanish, italian, portuguese, etc (but not by french)
 *
 * If the second letter is a consonant, RV is the region after the next following vowel,
 * or if the first two letters are vowels, RV is the region after the next consonant,
 * and otherwise (consonant-vowel case) RV is the region after the third letter.
 * But RV is the end of the word if these positions cannot be found.
 *
 * @param string $word     word to analyse
 * @param string $rv       receives the text of RV ('' when RV is empty)
 * @param int    $rvIndex  receives the index at which RV starts (the word
 *                         length when RV is empty)
 *
 * @return bool false when the word has at least three letters but none of
 *              the rules located a position, true otherwise
 */
private static function rv($word, &$rv, &$rvIndex)
{
    $length = self::strlen($word);

    // "RV is the end of the word if these positions cannot be found":
    // always hand back an integer index, because callers use it as a
    // search offset and compare positions against it.
    $rv = '';
    $rvIndex = $length;

    if ($length < 3) {
        return true;
    }

    $first = self::substr($word, 0, 1);
    $second = self::substr($word, 1, 1);
    $firstIsVowel = in_array($first, static::$vowels, true);
    $secondIsVowel = in_array($second, static::$vowels, true);

    // If the second letter is a consonant, RV is the region after the next following vowel,
    if (!$secondIsVowel) {
        for ($i = 2; $i < $length; $i++) {
            $letter = self::substr($word, $i, 1);
            if (in_array($letter, static::$vowels, true)) {
                $rv = self::substr($word, ($i + 1));
                $rvIndex = $i + 1;
                return true;
            }
        }
    }

    // or if the first two letters are vowels, RV is the region after the next consonant,
    if ($firstIsVowel && $secondIsVowel) {
        for ($i = 2; $i < $length; $i++) {
            $letter = self::substr($word, $i, 1);
            if (!in_array($letter, static::$vowels, true)) {
                $rv = self::substr($word, ($i + 1));
                $rvIndex = $i + 1;
                return true;
            }
        }
    }

    // and otherwise (consonant-vowel case) RV is the region after the third letter.
    if (!$firstIsVowel && $secondIsVowel) {
        $rv = self::substr($word, 3);
        $rvIndex = 3;
        return true;
    }

    return false;
}
/**
 * Tells whether a character position lies inside RV.
 *
 * @param int $position character index to test
 * @param int $rvIndex  index at which RV starts
 *
 * @return bool true when $position is at or after $rvIndex
 */
private static function inRv($position, $rvIndex)
{
    return $rvIndex <= $position;
}
/**
 * Tells whether a character position lies inside R1.
 *
 * @param int $position character index to test
 * @param int $r1Index  index at which R1 starts
 *
 * @return bool true when $position is at or after $r1Index
 */
private static function inR1($position, $r1Index)
{
    return $r1Index <= $position;
}
/**
 * Tells whether a character position lies inside R2.
 *
 * @param int $position character index to test
 * @param int $r2Index  index at which R2 starts
 *
 * @return bool true when $position is at or after $r2Index
 */
private static function inR2($position, $r2Index)
{
    return $r2Index <= $position;
}
/**
 * Looks for one of $suffixes at the end of $word, restricted to RV.
 *
 * Delegates to search(), passing the start of RV as the search offset.
 *
 * @param string   $word     word to inspect
 * @param string[] $suffixes candidate suffixes, tried in the given order
 * @param int      $rvIndex  index at which RV starts
 *
 * @return int|false position of the matching suffix, or false
 */
private static function searchIfInRv($word, $suffixes, $rvIndex)
{
    $position = self::search($word, $suffixes, $rvIndex);

    return $position;
}
/**
 * Looks for one of $suffixes at the end of $word, restricted to R2.
 *
 * Delegates to search(), passing the start of R2 as the search offset.
 *
 * @param string   $word     word to inspect
 * @param string[] $suffixes candidate suffixes, tried in the given order
 * @param int      $r2Index  index at which R2 starts
 *
 * @return int|false position of the matching suffix, or false
 */
private static function searchIfInR2($word, $suffixes, $r2Index)
{
    $position = self::search($word, $suffixes, $r2Index);

    return $position;
}
/**
 * Finds the first of $suffixes that terminates $word.
 *
 * Suffixes are tried in the order given and the first one that ends the
 * word wins, so callers list longer suffixes before the shorter ones they
 * end with. A suffix only counts when it starts at or after $offset.
 *
 * @param string   $word     word to inspect
 * @param string[] $suffixes candidate suffixes
 * @param int      $offset   lowest character index at which a suffix may start
 *
 * @return int|false character index at which the suffix starts, or false
 *                   when none matches or $offset lies beyond the word
 */
private static function search($word, $suffixes, $offset = 0)
{
    $wordLength = self::strlen($word);

    if ($offset > $wordLength) {
        return false;
    }

    foreach ($suffixes as $candidate) {
        $found = self::strrpos($word, $candidate, $offset);
        if ($found === false) {
            continue;
        }

        // The last occurrence only counts if it runs to the very end of the word.
        if (($found + self::strlen($candidate)) == $wordLength) {
            return $found;
        }
    }

    return false;
}
/**
 * Step 1: Standard suffix removal
 *
 * Tries each suffix group in turn; the first group with a suffix that ends
 * the word decides the outcome, whether or not the word is then modified.
 *
 * Note that stem() rewrites "ã"/"õ" to "a~"/"o~" before this step runs, so
 * suffixes containing those letters are spelled with the "~" form here.
 *
 * @param string $word     word being stemmed, modified in place
 * @param int    $r1Index  index at which R1 starts
 * @param int    $r2Index  index at which R2 starts
 * @param int    $rvIndex  index at which RV starts
 *
 * @return bool true when the word ends with one of the step 1 suffixes
 */
private static function step1(&$word, $r1Index, $r2Index, $rvIndex)
{
    // delete if in R2
    if (($position = self::search($word, array('amentos', 'imentos', 'adoras', 'adores', 'amento', 'imento', 'adora', 'istas', 'ismos', 'antes', 'ância', 'ezas', 'eza', 'icos', 'icas', 'ismo', 'ável', 'ível', 'ista', 'oso', 'osos', 'osas', 'osa', 'ico', 'ica', 'ador', 'aça~o', 'aço~es' , 'ante'))) !== false) {
        if (self::inR2($position, $r2Index)) {
            $word = self::substr($word, 0, $position);
        }
        return true;
    }

    // replace with log if in R2
    // (Portuguese spelling "logia"; the accented "logía" is the Spanish form)
    if (($position = self::search($word, array('logias', 'logia'))) !== false) {
        if (self::inR2($position, $r2Index)) {
            $word = self::substr($word, 0, $position) . 'log';
        }
        return true;
    }

    // replace with u if in R2
    // (Portuguese "uções"/"ução", which arrive here as "uço~es"/"uça~o")
    if (($position = self::search($word, array('uço~es', 'uça~o'))) !== false) {
        if (self::inR2($position, $r2Index)) {
            $word = self::substr($word, 0, $position) . 'u';
        }
        return true;
    }

    // replace with ente if in R2
    if (($position = self::search($word, array('ências', 'ência'))) !== false) {
        if (self::inR2($position, $r2Index)) {
            $word = self::substr($word, 0, $position) . 'ente';
        }
        return true;
    }

    // delete if in R1
    // if preceded by iv, delete if in R2 (and if further preceded by at, delete if in R2), otherwise,
    // if preceded by os, ic or ad, delete if in R2
    if (($position = self::search($word, array('amente'))) !== false) {
        // delete if in R1
        if (self::inR1($position, $r1Index)) {
            $word = self::substr($word, 0, $position);
        }

        // if preceded by iv, delete if in R2 (and if further preceded by at, delete if in R2), otherwise,
        if (($position2 = self::searchIfInR2($word, array('iv'), $r2Index)) !== false) {
            $word = self::substr($word, 0, $position2);
            if (($position3 = self::searchIfInR2($word, array('at'), $r2Index)) !== false) {
                $word = self::substr($word, 0, $position3);
            }
        // if preceded by os, ic or ad, delete if in R2
        } elseif (($position4 = self::searchIfInR2($word, array('os', 'ic', 'ad'), $r2Index)) !== false) {
            $word = self::substr($word, 0, $position4);
        }
        return true;
    }

    // delete if in R2
    // if preceded by ante, avel or ível, delete if in R2
    if (($position = self::search($word, array('mente'))) !== false) {
        // delete if in R2
        if (self::inR2($position, $r2Index)) {
            $word = self::substr($word, 0, $position);
        }

        // if preceded by ante, avel or ível, delete if in R2
        // (strict check: a position is an integer and must never be confused with false)
        if (($position2 = self::searchIfInR2($word, array('ante', 'avel', 'ível'), $r2Index)) !== false) {
            $word = self::substr($word, 0, $position2);
        }
        return true;
    }

    // delete if in R2
    // if preceded by abil, ic or iv, delete if in R2
    if (($position = self::search($word, array('idades', 'idade'))) !== false) {
        // delete if in R2
        if (self::inR2($position, $r2Index)) {
            $word = self::substr($word, 0, $position);
        }

        // if preceded by abil, ic or iv, delete if in R2
        if (($position2 = self::searchIfInR2($word, array('abil', 'ic', 'iv'), $r2Index)) !== false) {
            $word = self::substr($word, 0, $position2);
        }
        return true;
    }

    // delete if in R2
    // if preceded by at, delete if in R2
    if (($position = self::search($word, array('ivas', 'ivos', 'iva', 'ivo'))) !== false) {
        // delete if in R2
        if (self::inR2($position, $r2Index)) {
            $word = self::substr($word, 0, $position);
        }

        // if preceded by at, delete if in R2
        if (($position2 = self::searchIfInR2($word, array('at'), $r2Index)) !== false) {
            $word = self::substr($word, 0, $position2);
        }
        return true;
    }

    // replace with ir if in RV and preceded by e
    if (($position = self::search($word, array('iras', 'ira'))) !== false) {
        if (self::inRv($position, $rvIndex)) {
            $letter = self::substr($word, $position - 1, 1);
            if ($letter === 'e') {
                $word = self::substr($word, 0, $position) . 'ir';
            }
        }
        return true;
    }

    return false;
}
/**
 * Step 2: Verb suffixes
 *
 * Removes a verb ending from the word when that ending lies in RV.
 *
 * @param string $word     word being stemmed, modified in place
 * @param int    $rvIndex  index at which RV starts
 *
 * @return bool true when a suffix was removed, false otherwise
 */
private static function step2(&$word, $rvIndex)
{
    // The first listed suffix that ends the word (inside RV) is the one
    // removed, which is why longer endings are listed ahead of shorter ones.
    $verbSuffixes = array('aríamos', 'eríamos', 'iríamos', 'ássemos', 'êssemos', 'íssemos', 'aríeis', 'eríeis', 'iríeis', 'ásseis', 'ésseis', 'ísseis', 'áramos', 'éramos', 'íramos', 'ávamos', 'aremos', 'eremos', 'iremos', 'ariam', 'eriam', 'iriam', 'assem', 'essem', 'issem', 'arias', 'erias', 'irias', 'ardes', 'erdes', 'irdes', 'asses', 'esses', 'isses', 'astes', 'estes', 'istes', 'áreis', 'areis', 'éreis', 'ereis', 'íreis', 'ireis', 'áveis', 'íamos', 'armos', 'ermos', 'irmos', 'aria', 'eria', 'iria', 'asse', 'esse', 'isse', 'aste', 'este', 'iste', 'arei', 'erei', 'irei', 'adas', 'idas', 'aram', 'eram', 'iram', 'avam', 'arem', 'erem', 'irem', 'ando', 'endo', 'indo', 'ara~o', 'era~o', 'ira~o', 'arás', 'aras', 'erás', 'eras', 'irás', 'avas', 'ares', 'eres', 'ires', 'íeis', 'ados', 'idos', 'ámos', 'amos', 'emos', 'imos', 'iras', 'ada', 'ida', 'ará', 'ara', 'erá', 'era', 'irá', 'ava', 'iam', 'ado', 'ido', 'ias', 'ais', 'eis', 'ira', 'ia', 'ei', 'am', 'em', 'ar', 'er', 'ir', 'as', 'es', 'is', 'eu', 'iu', 'ou');

    $cut = self::searchIfInRv($word, $verbSuffixes, $rvIndex);
    if ($cut === false) {
        return false;
    }

    $word = self::substr($word, 0, $cut);

    return true;
}
/**
 * Step 3: delete a final "i" that is in RV and preceded by "c".
 *
 * @param string $word     word being stemmed, modified in place
 * @param int    $rvIndex  index at which RV starts
 *
 * @return bool true when the word ends with "i" inside RV (even if the
 *              letter is kept because no "c" precedes it), false otherwise
 */
private static function step3(&$word, $rvIndex)
{
    if (self::searchIfInRv($word, array('i'), $rvIndex) === false) {
        return false;
    }

    // Inspect the letter just before the final "i".
    $previous = self::substr($word, -2, 1);
    if ($previous == 'c') {
        $word = self::substr($word, 0, -1);
    }

    return true;
}
/**
 * Step 4: residual suffix removal.
 *
 * If the word ends with one of the suffixes "os a i o á í ó" in RV, the
 * suffix is deleted.
 *
 * @param string $word     word being stemmed, modified in place
 * @param int    $rvIndex  index at which RV starts
 *
 * @return bool true when a suffix was removed, false otherwise
 */
private static function step4(&$word, $rvIndex)
{
    $residualSuffixes = array('os', 'a', 'i', 'o','á', 'í', 'ó');

    $cut = self::searchIfInRv($word, $residualSuffixes, $rvIndex);
    if ($cut === false) {
        return false;
    }

    $word = self::substr($word, 0, $cut);

    return true;
}
/**
 * Step 5: final "e" and cedilla handling.
 *
 * If the word ends with one of "e é ê" in RV, that letter is deleted; if
 * what remains ends with "gu" (or "ci") with the "u" (or "i") in RV, the
 * "u" (or "i") is deleted too. Otherwise, if the word ends with "ç", the
 * cedilla is removed.
 *
 * @param string $word     word being stemmed, modified in place
 * @param int    $rvIndex  index at which RV starts
 *
 * @return bool true when one of the two rules applied, false otherwise
 */
private static function step5(&$word, $rvIndex)
{
    $endsWithE = self::searchIfInRv($word, array('e', 'é', 'ê'), $rvIndex) !== false;

    if ($endsWithE) {
        $word = self::substr($word, 0, -1);

        $guOrCi = self::search($word, array('gu', 'ci'));
        // $guOrCi + 1 is the position of the "u" / "i" itself.
        if ($guOrCi !== false && self::inRv(($guOrCi + 1), $rvIndex)) {
            $word = self::substr($word, 0, -1);
        }

        return true;
    }

    if (self::search($word, array('ç')) !== false) {
        $word = preg_replace('#(ç)$#u', 'c', $word);

        return true;
    }

    return false;
}
/**
 * Restores the nasal vowels: every "a~" / "o~" left in the word is turned
 * back into "ã" / "õ".
 *
 * @param string $word word being stemmed, modified in place
 *
 * @return void
 */
private static function finish(&$word)
{
    $placeholders = array('a~', 'o~');
    $nasalVowels = array('ã', 'õ');

    $word = self::str_replace($placeholders, $nasalVowels, $word);
}
/**
 * Checks whether a string is valid UTF-8.
 *
 * Validation is delegated to PCRE's "u" modifier, i.e. the same validator
 * the regex based helpers of this class rely on, so a string accepted here
 * will not make them fail later. In particular overlong encodings and the
 * obsolete 5- and 6-byte forms are rejected.
 *
 * @param string $str string to validate
 *
 * @return bool true when $str is valid UTF-8 (the empty string is valid)
 */
private static function check($str)
{
    // An empty pattern matches any subject; with /u the match fails
    // (returns false) only when the subject is not valid UTF-8.
    return preg_match('//u', (string) $str) === 1;
}
/**
 * Unicode aware replacement for strlen()
 *
 * Counts characters (code points) rather than bytes. Uses the mbstring
 * extension if available and falls back to counting regex matches;
 * utf8_decode() is avoided because it is deprecated as of PHP 8.2.
 *
 * @param string $string UTF-8 string
 *
 * @return int number of characters in $string
 *
 * @see strlen()
 */
private static function strlen($string)
{
    if (!defined('UTF8_NOMBSTRING') && function_exists('mb_strlen')) {
        return mb_strlen($string, 'utf-8');
    }

    // "s" lets the dot match newlines so that every character is counted.
    return (int) preg_match_all('/./us', $string);
}
/**
 * Unicode aware replacement for substr()
 *
 * Offsets and lengths are expressed in characters, not bytes. Negative
 * values count from the end of the string, as with substr(). Uses the
 * mbstring extension if available.
 *
 * @param string   $str    UTF-8 string
 * @param int      $start  index of the first character to return
 * @param int|null $length number of characters to return; null means "up
 *                         to the end", 0 yields an empty string
 *
 * @return string the requested part of $str
 *
 * @author lmak at NOSPAM dot iti dot gr
 * @link http://www.php.net/manual/en/function.substr.php
 * @see substr()
 */
private static function substr($str,$start,$length=null)
{
    if (!defined('UTF8_NOMBSTRING') && function_exists('mb_substr')) {
        return mb_substr($str, $start, $length, 'utf-8');
    }

    $ar = array();
    // "s" lets the dot match newlines so that no character is dropped.
    preg_match_all("/./us", $str, $ar);
    $chars = isset($ar[0]) ? $ar[0] : array();

    // array_slice() treats a null length as "to the end" and 0 as "nothing",
    // so a length of 0 is no longer mistaken for a missing length.
    return join("", array_slice($chars, $start, $length));
}
/**
 * Unicode aware replacement for str_replace()
 *
 * Each search string is quoted and turned into a UTF-8 regular expression,
 * then the replacement is carried out by preg_replace(). The replacement
 * text is handed to preg_replace() as is.
 *
 * @param string|string[] $s   string(s) to look for
 * @param string|string[] $r   replacement(s)
 * @param string          $str subject
 *
 * @return string|null the result of preg_replace()
 *
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see str_replace()
 */
private static function str_replace($s,$r,$str)
{
    if (is_array($s)) {
        $patterns = array();
        foreach ($s as $key => $needle) {
            $patterns[$key] = '!'.preg_quote($needle).'!u';
        }
    } else {
        $patterns = '!'.preg_quote($s,'!').'!u';
    }

    return preg_replace($patterns,$r,$str);
}
/**
 * This is a unicode aware replacement for strtolower()
 *
 * Uses mb_string extension if available. Otherwise the string is decoded
 * into code points, every code point that has an entry in the flipped case
 * lookup table is replaced by its lower case counterpart, and the result
 * is encoded back to UTF-8. Code points without an entry are kept as is.
 *
 * @param string $string UTF-8 string
 *
 * @return string lower-cased string
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see strtolower()
 */
private static function strtolower($string)
{
    if (!defined('UTF8_NOMBSTRING') && function_exists('mb_strtolower')) {
        return mb_strtolower($string,'utf-8');
    }

    // The table is stored lower => upper; flip it to look up by upper case.
    $utf8_upper_to_lower = array_flip(self::$utf8_lower_to_upper);
    $uni = self::utf8_to_unicode($string);

    foreach ($uni as $i => $codepoint) {
        // isset() instead of a bare array read: most code points (digits,
        // spaces, letters that are already lower case) have no entry.
        if (isset($utf8_upper_to_lower[$codepoint])) {
            $uni[$i] = $utf8_upper_to_lower[$codepoint];
        }
    }

    return self::unicode_to_utf8($uni);
}
/**
 * This function returns any UTF-8 encoded text as a list of
 * Unicode values (code points).
 *
 * Handles one to four byte sequences. The input is expected to be valid
 * UTF-8; it is not validated here.
 *
 * @param string $str UTF-8 string (taken by reference, not modified)
 *
 * @return int[] list of code points
 *
 * @author Scott Michael Reynen <scott@randomchaos.com>
 * @link http://www.randomchaos.com/document.php?source=php_and_unicode
 * @see unicode_to_utf8()
 */
private static function utf8_to_unicode( &$str )
{
    $unicode = array();
    $values = array();
    $looking_for = 1;
    $length = strlen( $str );

    for ( $i = 0; $i < $length; $i++ ) {
        $this_value = ord( $str[ $i ] );

        if ( $this_value < 128 ) {
            $unicode[] = $this_value;
            continue;
        }

        // The lead byte tells how many bytes make up the sequence.
        if ( count( $values ) == 0 ) {
            if ( $this_value < 224 ) {
                $looking_for = 2;
            } elseif ( $this_value < 240 ) {
                $looking_for = 3;
            } else {
                $looking_for = 4;
            }
        }

        $values[] = $this_value;
        if ( count( $values ) < $looking_for ) {
            continue;
        }

        if ( $looking_for == 4 ) {
            $number = ( ( $values[0] % 8 ) * 262144 ) + ( ( $values[1] % 64 ) * 4096 )
                + ( ( $values[2] % 64 ) * 64 ) + ( $values[3] % 64 );
        } elseif ( $looking_for == 3 ) {
            $number = ( ( $values[0] % 16 ) * 4096 ) + ( ( $values[1] % 64 ) * 64 ) + ( $values[2] % 64 );
        } else {
            $number = ( ( $values[0] % 32 ) * 64 ) + ( $values[1] % 64 );
        }

        $unicode[] = $number;
        $values = array();
        $looking_for = 1;
    }

    return $unicode;
}
/**
 * This function converts a Unicode array back to its UTF-8 representation
 *
 * Code points below U+0080 take one byte, below U+0800 two bytes, below
 * U+10000 three bytes and everything above four bytes.
 *
 * @param int[] $str list of code points (taken by reference, not modified)
 *
 * @return string UTF-8 string, or '' when $str is not an array
 *
 * @author Scott Michael Reynen <scott@randomchaos.com>
 * @link http://www.randomchaos.com/document.php?source=php_and_unicode
 * @see utf8_to_unicode()
 */
private static function unicode_to_utf8( &$str )
{
    if (!is_array($str)) return '';

    $utf8 = '';
    foreach( $str as $unicode ) {
        if ( $unicode < 128 ) {
            $utf8.= chr( $unicode );
        } elseif ( $unicode < 2048 ) {
            $utf8.= chr( 0xC0 | ( $unicode >> 6 ) );
            $utf8.= chr( 0x80 | ( $unicode & 0x3F ) );
        } elseif ( $unicode < 65536 ) {
            $utf8.= chr( 0xE0 | ( $unicode >> 12 ) );
            $utf8.= chr( 0x80 | ( ( $unicode >> 6 ) & 0x3F ) );
            $utf8.= chr( 0x80 | ( $unicode & 0x3F ) );
        } else {
            // Supplementary planes need four bytes; three would overflow the lead byte.
            $utf8.= chr( 0xF0 | ( $unicode >> 18 ) );
            $utf8.= chr( 0x80 | ( ( $unicode >> 12 ) & 0x3F ) );
            $utf8.= chr( 0x80 | ( ( $unicode >> 6 ) & 0x3F ) );
            $utf8.= chr( 0x80 | ( $unicode & 0x3F ) );
        }
    }

    return $utf8;
}
/**
 * This is an Unicode aware replacement for strrpos
 *
 * Uses mb_string extension if available. Otherwise the haystack is split
 * on the needle and the position of the last occurrence is derived from
 * the length of the final piece; a non-zero offset is handled by searching
 * the part of the haystack that starts at that offset.
 *
 * @param string $haystack string to search in
 * @param string $needle   string to search for
 * @param int    $offset   character index at which the search starts
 *
 * @return int|false character index of the last occurrence, or false
 *
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see strrpos()
 */
private static function strrpos($haystack, $needle, $offset=0)
{
    if (!defined('UTF8_NOMBSTRING') && function_exists('mb_strrpos')) {
        return mb_strrpos($haystack, $needle, $offset, 'utf-8');
    }

    if ($offset) {
        if (!is_int($offset)) {
            trigger_error('Offset must be an integer', E_USER_WARNING);
            return false;
        }

        // Search the tail only, then translate the hit back to haystack coordinates.
        $tail = self::substr($haystack, $offset);
        $tailPosition = self::strrpos($tail, $needle);

        return ($tailPosition !== false) ? $tailPosition + $offset : false;
    }

    $pieces = self::explode($needle, $haystack);
    $pieceCount = count($pieces);
    if ($pieceCount > 1) {
        // Everything except the last piece and the needle in front of it
        // precedes the last occurrence.
        return self::strlen($haystack) - self::strlen($pieces[($pieceCount - 1)]) - self::strlen($needle);
    }

    return false;
}
/**
 * Unicode aware replacement for explode
 *
 * Splits $str on every occurrence of $sep using a UTF-8 regular expression
 * built from the quoted separator.
 *
 * @param string $sep separator
 * @param string $str string to split
 *
 * @return array|false the pieces, or FALSE (after raising an E_USER_WARNING)
 *                     when the separator is empty
 *
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see explode();
 */
private static function explode($sep, $str)
{
    if ( $sep != '' ) {
        $pattern = '!'.preg_quote($sep,'!').'!u';

        return preg_split($pattern,$str);
    }

    trigger_error('Empty delimiter',E_USER_WARNING);
    return FALSE;
}
}