Skip to content

Commit

Permalink
If there is an encoding description in the response header or document, it is given priority.
Browse files Browse the repository at this point in the history
  • Loading branch information
osapon committed Nov 3, 2024
1 parent 85dccc6 commit d6ac21f
Show file tree
Hide file tree
Showing 7 changed files with 2,607 additions and 1 deletion.
2 changes: 2 additions & 0 deletions .gitattributes
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,5 @@
.php_cs.dist export-ignore
.travis.yml export-ignore
phpunit.xml.dist export-ignore
/tests/cache/4pda.to.2022-12-04-406834-sostoyalsya_reliz_clown_of_duty_parodii_na_call_of_duty.php working-tree-encoding=windows-1251 diff=windows-1251
/tests/cache/www.itmedia.co.jp.news-articles-2410-28-news159.html.php working-tree-encoding=sjis diff=sjis
13 changes: 12 additions & 1 deletion src/Document.php
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,18 @@ public function __construct(Extractor $extractor)
$html = str_replace('<br>', "\n<br>", $html);
$html = str_replace('<br ', "\n<br ", $html);

$this->document = !empty($html) ? Parser::parse($html) : new DOMDocument();
$encoding = null;
$contentType = $extractor->getResponse()->getHeaderLine('content-type');
preg_match('/charset="?(.*?)(?=$|\s|;|")/i', $contentType, $match);
if (!empty($match[1])) {
$encoding = $match[1];
} elseif (!empty($html)) {
preg_match('/charset="?(.*?)(?=$|\s|;|")/i', $html, $match);
if (!empty($match[1])) {
$encoding = $match[1];
}
}
$this->document = !empty($html) ? Parser::parse($html, $encoding) : new DOMDocument();
$this->initXPath();
}

Expand Down
2 changes: 2 additions & 0 deletions tests/PagesTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,8 @@ public function urlDataProvider(): array
['http://www.ustream.tv/channel/red-shoes-billiards-60803-camera-1'],
['http://www.viddler.com/v/bdce8c7'],
['http://www.wired.com/?p=2064839'],
['https://www.itmedia.co.jp/news/articles/2410/28/news159.html'],
['https://4pda.to/2022/12/04/406834/sostoyalsya_reliz_clown_of_duty_parodii_na_call_of_duty/'],
];
}

Expand Down

Large diffs are not rendered by default.

Loading

0 comments on commit d6ac21f

Please sign in to comment.