Skip to content

Commit

Permalink
Removing dependency on voku/portable-utf8
Browse files Browse the repository at this point in the history
  • Loading branch information
Hackwar committed Dec 11, 2024
1 parent adf8578 commit e2c48aa
Show file tree
Hide file tree
Showing 20 changed files with 619 additions and 435 deletions.
4 changes: 2 additions & 2 deletions composer.json
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,8 @@
}
],
"require": {
"php": ">=7.3",
"voku/portable-utf8": "^5.4|^6.0"
"php": ">=8.1",
"joomla/string": ">=3.0.1"
},
"require-dev":{
"phpunit/phpunit": "^9.0"
Expand Down
25 changes: 10 additions & 15 deletions src/Stemmer/Catalan.php
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

namespace Wamania\Snowball\Stemmer;

use voku\helper\UTF8;
use Joomla\String\StringHelper;

/**
*
Expand Down Expand Up @@ -86,12 +86,7 @@ class Catalan extends Stem
*/
public function stem($word)
{
// we do ALL in UTF-8
if (!UTF8::is_utf8($word)) {
throw new \Exception('Word must be in UTF-8');
}

$this->word = UTF8::strtolower($word);
$this->word = StringHelper::strtolower($word);

// Catalan stemmer does not use Rv
$this->r1();
Expand Down Expand Up @@ -127,7 +122,7 @@ private function step0()
{
if (($position = $this->search(static::$attached_pronoun)) !== false) {
if ($this->inR1($position)) {
$this->word = UTF8::substr($this->word, 0, $position);
$this->word = StringHelper::substr($this->word, 0, $position);
return true;
}
}
Expand All @@ -146,7 +141,7 @@ private function step1a()
// delete if in R2
if (($position = $this->search(['acions', 'ada', 'ades'])) !== false) {
if ($this->inR2($position)) {
$this->word = UTF8::substr($this->word, 0, $position);
$this->word = StringHelper::substr($this->word, 0, $position);
}
return true;
}
Expand All @@ -162,11 +157,11 @@ private function step1a()
// atius atives ativa ativitat ativitats ible ibles assa asses assos ent ents íssim íssima íssims íssimes
// ìssem ìsseu ìssin ims ima imes isme ista ismes istes inia inies íinia ínies ita ites triu trius oses osos
// ient otes ots
//
//
// delete if in R1
if (($position = $this->search(self::$standard_suffix_1a)) !== false) {
if ($this->inR1($position)) {
$this->word = UTF8::substr($this->word, 0, $position);
$this->word = StringHelper::substr($this->word, 0, $position);
}
return true;
}
Expand Down Expand Up @@ -241,7 +236,7 @@ private function step1b()
// delete if in R1
if (($position = $this->search(static::$verb_suffixes)) !== false) {
if ($this->inR1($position)) {
$this->word = UTF8::substr($this->word, 0, $position);
$this->word = StringHelper::substr($this->word, 0, $position);
}
return true;
}
Expand All @@ -251,7 +246,7 @@ private function step1b()
// delete if in R2
if (($position = $this->search(['ando'])) !== false) {
if ($this->inR2($position)) {
$this->word = UTF8::substr($this->word, 0, $position);
$this->word = StringHelper::substr($this->word, 0, $position);
}
return true;
}
Expand All @@ -270,7 +265,7 @@ private function step2()
// delete if in R1
if (($position = $this->search(static::$residual_suffixes)) !== false) {
if ($this->inR1($position)) {
$this->word = UTF8::substr($this->word, 0, $position);
$this->word = StringHelper::substr($this->word, 0, $position);
}
return true;
}
Expand All @@ -294,7 +289,7 @@ private function step2()
*/
private function finish()
{
$this->word = UTF8::str_replace(
$this->word = str_replace(
['á', 'é', 'í', 'ó', 'ú', 'à', 'è', 'ì', 'ò', 'ï', 'ü', '·'],
['a', 'e', 'i', 'o', 'u', 'a', 'e', 'i', 'o', 'i', 'u', '.'],
$this->word
Expand Down
33 changes: 14 additions & 19 deletions src/Stemmer/Danish.php
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

namespace Wamania\Snowball\Stemmer;

use voku\helper\UTF8;
use Joomla\String\StringHelper;

/**
*
Expand All @@ -22,20 +22,15 @@ class Danish extends Stem
*/
public function stem($word): string
{
// we do ALL in UTF-8
if (!UTF8::is_utf8($word)) {
throw new \Exception('Word must be in UTF-8');
}

$this->word = UTF8::strtolower($word);
$this->word = StringHelper::strtolower($word);

// R2 is not used: R1 is defined in the same way as in the German stemmer
$this->r1();

// then R1 is adjusted so that the region before it contains at least 3 letters.
if ($this->r1Index < 3) {
$this->r1Index = 3;
$this->r1 = UTF8::substr($this->word, 3);
$this->r1 = StringHelper::substr($this->word, 3);
}

// Do each of steps 1, 2, 3 and 4.
Expand All @@ -56,7 +51,7 @@ public function stem($word): string
*/
private function hasValidSEnding($word)
{
$lastLetter = UTF8::substr($word, -1, 1);
$lastLetter = StringHelper::substr($word, -1, 1);
return in_array($lastLetter, array('a', 'b', 'c', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 't', 'v', 'y', 'z', 'å'));
}

Expand All @@ -74,14 +69,14 @@ private function step1()
'erens', 'ered', 'ende', 'erne', 'eres', 'eren', 'eret', 'erer', 'enes', 'heds',
'ens', 'ene', 'ere', 'ers', 'ets', 'hed', 'es', 'et', 'er', 'en', 'e'
))) !== false) {
$this->word = UTF8::substr($this->word, 0, $position);
$this->word = StringHelper::substr($this->word, 0, $position);
return true;
}

// s
// delete if preceded by a valid s-ending
if ( ($position = $this->searchIfInR1(array('s'))) !== false) {
$word = UTF8::substr($this->word, 0, $position);
$word = StringHelper::substr($this->word, 0, $position);
if ($this->hasValidSEnding($word)) {
$this->word = $word;
}
Expand All @@ -97,7 +92,7 @@ private function step1()
private function step2()
{
if ($this->searchIfInR1(array('gd', 'dt', 'gt', 'kt')) !== false) {
$this->word = UTF8::substr($this->word, 0, -1);
$this->word = StringHelper::substr($this->word, 0, -1);
}
}

Expand All @@ -108,22 +103,22 @@ private function step3()
{
// If the word ends igst, remove the final st.
if ($this->search(array('igst')) !== false) {
$this->word = UTF8::substr($this->word, 0, -2);
$this->word = StringHelper::substr($this->word, 0, -2);
}

// Search for the longest among the following suffixes in R1, and perform the action indicated.
// ig lig elig els
// delete, and then repeat step 2
if ( ($position = $this->searchIfInR1(array('elig', 'lig', 'ig', 'els'))) !== false) {
$this->word = UTF8::substr($this->word, 0, $position);
$this->word = StringHelper::substr($this->word, 0, $position);
$this->step2();
return true;
}

// løst
// replace with løs
if ($this->searchIfInR1(array('løst')) !== false) {
$this->word = UTF8::substr($this->word, 0, -1);
$this->word = StringHelper::substr($this->word, 0, -1);
}
}

Expand All @@ -133,19 +128,19 @@ private function step3()
*/
private function step4()
{
$length = UTF8::strlen($this->word);
$length = StringHelper::strlen($this->word);
if (!$this->inR1(($length-1))) {
return false;
}

$lastLetter = UTF8::substr($this->word, -1, 1);
$lastLetter = StringHelper::substr($this->word, -1, 1);
if (in_array($lastLetter, self::$vowels)) {
return false;
}
$beforeLastLetter = UTF8::substr($this->word, -2, 1);
$beforeLastLetter = StringHelper::substr($this->word, -2, 1);

if ($lastLetter == $beforeLastLetter) {
$this->word = UTF8::substr($this->word, 0, -1);
$this->word = StringHelper::substr($this->word, 0, -1);
}
return true;
}
Expand Down
Loading

0 comments on commit e2c48aa

Please sign in to comment.