Skip to content

Commit

Permalink
Refactor Tokenizer class and update method signatures
Browse files Browse the repository at this point in the history
Refactors the Tokenizer class: the constructor now accepts its configuration as promoted constructor parameters (each with a default), and the 'tokenize' and 'tokenizeBySentences' signatures declare '$minWordLength' as explicitly nullable ('?int'). Redundant comments were removed and the class members were reorganized.
  • Loading branch information
deligoez committed Jan 2, 2024
1 parent 2ad391a commit 98ca5af
Showing 1 changed file with 16 additions and 31 deletions.
47 changes: 16 additions & 31 deletions src/Tokenizer.php
Original file line number Diff line number Diff line change
Expand Up @@ -8,29 +8,22 @@

class Tokenizer
{
// region Attributes

/** @var array<TokenizerFilter> */
protected array $wordFilters;

/** @var array<string> */
protected array $wordSeparationPatterns;

/** @var array<string> */
public array $sentenceSeparationPatterns;

protected bool $toLowercase;

// endregion

// region Public Methods

public function __construct()
{
$this->wordFilters = [];
$this->wordSeparationPatterns = [];
$this->sentenceSeparationPatterns = [];
$this->toLowercase = false;
/**
* Creates a tokenizer, optionally pre-configured through its arguments.
*
* Every argument is a promoted constructor property, so the given value is stored
* on the instance as-is. All arguments are optional and default to an empty
* configuration (no filters, no patterns, no lowercasing).
*
* @param array<\Phonyland\NGram\TokenizerFilter> $wordFilters Array of word filters; defaults to none.
* @param array<string> $wordSeparationPatterns Array of word separation patterns; defaults to none.
*                      tokenize() throws a RuntimeException while this array is empty.
* @param array<string> $sentenceSeparationPatterns Array of sentence separation patterns; defaults to none.
*                      Declared public, unlike the other three properties, which are protected.
* @param bool $toLowercase Whether tokens are converted to lowercase; defaults to false.
*/
public function __construct(
protected array $wordFilters = [],
protected array $wordSeparationPatterns = [],
public array $sentenceSeparationPatterns = [],
protected bool $toLowercase = false,
) {
}

/**
Expand All @@ -39,7 +32,7 @@ public function __construct()
*
* @return array<string>
*/
public function tokenize(string $text, int $minWordLength = null): array
public function tokenize(string $text, ?int $minWordLength = null): array
{
if ($this->wordSeparationPatterns === []) {
throw new RuntimeException('No word separation pattern given!');
Expand Down Expand Up @@ -91,7 +84,7 @@ public function sentences(string $text): array
*
* @return array<array<string>>
*/
public function tokenizeBySentences(string $text, int $minWordLength = null): array
public function tokenizeBySentences(string $text, ?int $minWordLength = null): array
{
$sentences = $this->sentences($text);

Expand Down Expand Up @@ -172,10 +165,6 @@ public function addWordFilterRule(string|TokenizerFilterType $searchRegex, strin

/**
* Adds a separator pattern for splitting the given text.
*
*
* @param string|\Phonyland\NGram\TokenizerFilterType $wordSeparationPattern
* @return \Phonyland\NGram\Tokenizer
*/
public function addWordSeparatorPattern(string|TokenizerFilterType $wordSeparationPattern): self
{
Expand All @@ -192,7 +181,6 @@ public function addWordSeparatorPattern(string|TokenizerFilterType $wordSeparati
* Adds a separator pattern for splitting the text into sentences.
*
* @param string|array<string> $sentenceSeparationPattern
* @return \Phonyland\NGram\Tokenizer
*/
public function addSentenceSeparatorPattern(string|array $sentenceSeparationPattern): self
{
Expand All @@ -210,9 +198,6 @@ public function addSentenceSeparatorPattern(string|array $sentenceSeparationPatt

/**
* Converts all tokens to lowercase.
*
*
* @return \Phonyland\NGram\Tokenizer
*/
public function toLowercase(bool $toLowercase = true): self
{
Expand Down

0 comments on commit 98ca5af

Please sign in to comment.