diff --git a/README.md b/README.md index cd2ba68..be38c2e 100644 --- a/README.md +++ b/README.md @@ -46,6 +46,22 @@ echo String::truncate($text, strlen($text)); // "Lorem ipsum dolorem si amet, lo ``` +**Truncate HTML** + +Truncate an HTML string to word closest to a certain length + +```php +use Coduo\PHPHumanizer\String; + +$text = '

HyperText Markup Language, commonly referred to as HTML, is the standard markup language used to create web pages.[1] Web browsers can read HTML files and render them into visible or audible web pages. HTML describes the structure of a website semantically along with cues for presentation, making it a markup language, rather than a programming language.

'; + +echo String::truncateHtml($text, 3); // "HyperText" +echo String::truncateHtml($text, 12, ''); // "HyperText Markup" +echo String::truncateHtml($text, 50, '', '...'); // "HyperText Markup Language, commonly referred to as..." +echo String::truncateHtml($text, 75, '', '...'); // 'HyperText Markup Language, commonly referred to as HTML, is the standard markup...' + +``` + ## Number **Ordinalize** diff --git a/spec/Coduo/PHPHumanizer/String/WordBreakpointSpec.php b/spec/Coduo/PHPHumanizer/String/WordBreakpointSpec.php new file mode 100644 index 0000000..f608032 --- /dev/null +++ b/spec/Coduo/PHPHumanizer/String/WordBreakpointSpec.php @@ -0,0 +1,36 @@ +calculatePosition('Lorem ipsum dolorem', 2)->shouldReturn(5); + $this->calculatePosition('Lorem ipsum dolorem', 4)->shouldReturn(5); + $this->calculatePosition('Lorem ipsum dolorem', 5)->shouldReturn(5); + $this->calculatePosition('Lorem ipsum dolorem', 10)->shouldReturn(11); + $this->calculatePosition('Lorem ipsum dolorem', -2)->shouldReturn(19); + $this->calculatePosition('Lorem ipsum dolorem', 0)->shouldReturn(5); + } + + function it_calculate_breakpoint_position_when_sentence_is_shorter_than_characters_count() + { + $this->calculatePosition('Lorem ipsum dolorem', 20)->shouldReturn(19); + } + + function it_calculate_breakpoint_position_when_characters_count_ends_in_last_word() + { + $this->calculatePosition('Lorem ipsum', 7)->shouldReturn(11); + } + + function it_calculate_breakpoint_position_when_characters_count_ends_in_last_space() + { + $this->calculatePosition('Lorem ipsum', 5)->shouldReturn(5); + } +} diff --git a/src/Coduo/PHPHumanizer/String.php b/src/Coduo/PHPHumanizer/String.php index 3eed429..70983d3 100644 --- a/src/Coduo/PHPHumanizer/String.php +++ b/src/Coduo/PHPHumanizer/String.php @@ -3,7 +3,9 @@ namespace Coduo\PHPHumanizer; use Coduo\PHPHumanizer\String\Humanize; -use Coduo\PHPHumanizer\String\Truncate; +use Coduo\PHPHumanizer\String\TextTruncate; +use Coduo\PHPHumanizer\String\HtmlTruncate; 
+use Coduo\PHPHumanizer\String\WordBreakpoint; class String { @@ -12,6 +14,7 @@ class String * @param bool|true $capitalize * @param string $separator * @param array $forbiddenWords + * * @return string */ public static function humanize($text, $capitalize = true, $separator = '_', array $forbiddenWords = array()) @@ -23,10 +26,28 @@ public static function humanize($text, $capitalize = true, $separator = '_', arr * @param $text * @param $charactersCount * @param string $append + * * @return string */ public static function truncate($text, $charactersCount, $append = '') { - return (string) new Truncate($text, $charactersCount, $append); + $truncate = new TextTruncate(new WordBreakpoint(), $append); + + return $truncate->truncate($text, $charactersCount); + } + + /** + * @param $text + * @param $charactersCount + * @param string $allowedTags + * @param string $append + * + * @return string + */ + public static function truncateHtml($text, $charactersCount, $allowedTags = '', $append = '') + { + $truncate = new HtmlTruncate(new WordBreakpoint(), $allowedTags, $append); + + return $truncate->truncate($text, $charactersCount); } } diff --git a/src/Coduo/PHPHumanizer/String/Breakpoint.php b/src/Coduo/PHPHumanizer/String/Breakpoint.php new file mode 100644 index 0000000..48a61e9 --- /dev/null +++ b/src/Coduo/PHPHumanizer/String/Breakpoint.php @@ -0,0 +1,17 @@ +breakpoint = $breakpoint; + $this->append = $append; + $this->allowedTags = $allowedTags; + } + + /** + * @return string + */ + public function truncate($text, $charactersCount) + { + $strippedText = strip_tags($text, $this->allowedTags); + + return $this->truncateHtml($strippedText, $charactersCount); + } + + /** + * Truncates a string to the given length. HTML tags are not counted + * against the limit and each HTML entity counts as a single character.
+ * + * Adapted from FuelPHP Str::truncate (https://github.com/fuelphp/common/blob/master/src/Str.php) + * + * @param string $string + * @param int $charactersCount + * + * @return string the truncated string + */ + private function truncateHtml($string, $charactersCount) + { + $limit = $charactersCount; + $offset = 0; + $tags = array(); + + // Handle special characters. + preg_match_all('/&[a-z]+;/i', strip_tags($string), $matches, PREG_OFFSET_CAPTURE | PREG_SET_ORDER); + foreach ($matches as $match) { + if ($match[0][1] >= $limit) { + break; + } + $limit += (mb_strlen($match[0][0]) - 1); + } + + // Handle all the html tags. + preg_match_all('/<[^>]+>([^<]*)/', $string, $matches, PREG_OFFSET_CAPTURE | PREG_SET_ORDER); + foreach ($matches as $match) { + if ($match[0][1] - $offset >= $limit) { + break; + } + + $tag = mb_substr(strtok($match[0][0], " \t\n\r\0\x0B>"), 1); + if ($tag[0] != '/') { + $tags[] = $tag; + } elseif (end($tags) == mb_substr($tag, 1)) { + array_pop($tags); + } + + $offset += $match[1][1] - $match[0][1]; + } + + $newString = mb_substr($string, 0, $limit = min(mb_strlen($string), $this->breakpoint->calculatePosition($string, $limit + $offset))); + $newString .= (mb_strlen($string) > $limit ? $this->append : ''); + $newString .= (count($tags = array_reverse($tags)) ? 
'' : ''); + + return $newString; + } +} diff --git a/src/Coduo/PHPHumanizer/String/TextTruncate.php b/src/Coduo/PHPHumanizer/String/TextTruncate.php new file mode 100644 index 0000000..dd99238 --- /dev/null +++ b/src/Coduo/PHPHumanizer/String/TextTruncate.php @@ -0,0 +1,42 @@ +breakpoint = $breakpoint; + $this->append = $append; + } + + /** + * @param string $text + * @param int $charactersCount + * @return string + */ + public function truncate($text, $charactersCount) + { + if ($charactersCount < 0 || mb_strlen($text) <= $charactersCount) { + return $text; + } + + $truncatedText = rtrim(mb_substr($text, 0, $this->breakpoint->calculatePosition($text, $charactersCount))); + + return ($truncatedText === $text) ? $truncatedText : $truncatedText . $this->append; + } +} diff --git a/src/Coduo/PHPHumanizer/String/Truncate.php b/src/Coduo/PHPHumanizer/String/Truncate.php index f207142..fa815fa 100644 --- a/src/Coduo/PHPHumanizer/String/Truncate.php +++ b/src/Coduo/PHPHumanizer/String/Truncate.php @@ -2,6 +2,9 @@ namespace Coduo\PHPHumanizer\String; +/** + * @deprecated since 1.0 use Coduo\PHPHumanizer\String\TextTruncate or Coduo\PHPHumanizer\String\HtmlTruncate instead + */ class Truncate { /** @@ -44,4 +47,4 @@ public function __toString() return rtrim(mb_substr($this->text, 0, $length)).$this->append; } -} +} \ No newline at end of file diff --git a/src/Coduo/PHPHumanizer/String/TruncateInterface.php b/src/Coduo/PHPHumanizer/String/TruncateInterface.php new file mode 100644 index 0000000..a6e1bb7 --- /dev/null +++ b/src/Coduo/PHPHumanizer/String/TruncateInterface.php @@ -0,0 +1,13 @@ + mb_strlen($text)) { + return mb_strlen($text); + } + + $breakpoint = mb_strpos($text, ' ', $charactersCount); + + if (false === $breakpoint) { + return mb_strlen($text); + } + + return $breakpoint; + } +} diff --git a/tests/Coduo/PHPHumanizer/Tests/StringTest.php b/tests/Coduo/PHPHumanizer/Tests/StringTest.php index eef471a..74331c1 100644 --- 
a/tests/Coduo/PHPHumanizer/Tests/StringTest.php +++ b/tests/Coduo/PHPHumanizer/Tests/StringTest.php @@ -34,6 +34,11 @@ function test_truncate_string_to_word_closest_to_a_certain_number_of_characters( $this->assertEquals($expected, String::truncate($text, $charactersCount, $append)); } + function it_truncate_string_to_word_closest_to_a_certain_number_of_characters_with_html_tags($text, $charactersCount, $allowedTags, $expected, $append = '') + { + $this->assertEquals($expected, String::truncateHtml($text, $charactersCount, $allowedTags, $append)); + } + /** * * @return array @@ -71,11 +76,32 @@ public function truncateStringProvider() array($shortText, "Short...", 3, '...'), array($shortText, "Short...", 4, '...'), array($shortText, "Short...", 5, '...'), - array($shortText, "Short...", 6, '...'), + array($shortText, "Short text", 6, '...'), array($shortText, "Short text", 7, '...'), array($shortText, "Short text", 8, '...'), array($shortText, "Short text", 9, '...'), array($shortText, "Short text", 10, '...') ); } + + public function truncateHtmlStringProvider() + { + $text = '

HyperText Markup Language, commonly referred to as HTML, is the standard markup language used to create web pages.[1] Web browsers can read HTML files and render them into visible or audible web pages. HTML describes the structure of a website semantically along with cues for presentation, making it a markup language, rather than a programming language.

'; + + return array( + array($text, 3, '', "HyperText"), + array($text, 12, '', "HyperText Markup"), + array($text, 30, '', "HyperText Markup Language, commonly"), + array($text, 50, '', "HyperText Markup Language, commonly referred to as"), + array($text, 75, '', 'HyperText Markup Language, commonly referred to as HTML, is the standard markup'), + array($text, 100,'', 'HyperText Markup Language, commonly referred to as HTML, is the standard markup language used to create'), + array($text, 3 , '', "HyperText"), + array($text, 12 , '', "HyperText Markup"), + array($text, 50 , '', "HyperText Markup Language, commonly referred to as"), + array($text, 75 , '', "HyperText Markup Language, commonly referred to as HTML, is the standard markup"), + array($text, 100, '', "HyperText Markup Language, commonly referred to as HTML, is the standard markup language used to create"), + array($text, 50, '', "HyperText Markup Language, commonly referred to as...", '...'), + array($text, 75, '', 'HyperText Markup Language, commonly referred to as HTML, is the standard markup...', '...') + ); + } } \ No newline at end of file