-
Notifications
You must be signed in to change notification settings - Fork 2
/
Parser.php
95 lines (71 loc) · 2.42 KB
/
Parser.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
<?php
declare(strict_types = 1);
namespace HtmlParser;
use Exception;
use DOMNode;
use DOMDocument;
use DOMDocumentFragment;
use SimpleXMLElement;
use DOMXPath;
/**
 * Static helpers for turning HTML strings into DOM trees and serialising
 * DOM nodes back to HTML.
 */
class Parser
{
    /**
     * Serialises a DOM node (or a whole document) to its HTML markup.
     *
     * @param DOMNode $node Node to serialise; a DOMDocument is dumped as a whole,
     *                      any other node is dumped through its owner document.
     *
     * @return string HTML markup of the node.
     */
    public static function stringify(DOMNode $node): string
    {
        $document = $node instanceof DOMDocument ? $node : $node->ownerDocument;

        return $document->saveHTML($node);
    }

    /**
     * Parses a full HTML document.
     *
     * Non-ASCII characters are rewritten as numeric entities before parsing.
     * When the document declares `<meta charset>` and/or `<meta http-equiv>`,
     * those tags are injected right after the opening `<head>` tag and the
     * markup is parsed a second time.
     *
     * @param string      $html     HTML source.
     * @param string|null $encoding Source encoding; detected with mb_detect_encoding() when null.
     *
     * @return DOMDocument The parsed document (empty when $html is an empty string).
     */
    public static function parse(string $html, ?string $encoding = null): DOMDocument
    {
        // DOMDocument::loadHTML() rejects an empty source (ValueError on PHP 8,
        // warning before), so an empty input simply yields an empty document.
        if ($html === '') {
            return new DOMDocument();
        }

        $detected = $encoding ?? mb_detect_encoding($html);

        if ($detected) {
            // Encode every code point from 0x80 upwards as a numeric entity.
            $html = mb_encode_numericentity($html, [0x80, 0xFFFFFF, 0, -1], $detected);
        }

        $document = self::createDOMDocument($html);
        $xpath = new DOMXPath($document);
        $charset = $xpath->query('.//meta[@charset]')->item(0);
        $httpEquiv = $xpath->query('.//meta[@http-equiv]')->item(0);

        if ($charset === null && $httpEquiv === null) {
            return $document;
        }

        $metaTags = $charset !== null ? self::stringify($charset) : '';

        // A single <meta> may carry both attributes; do not inject it twice.
        if ($httpEquiv !== null && ($charset === null || !$httpEquiv->isSameNode($charset))) {
            $metaTags .= self::stringify($httpEquiv);
        }

        // - The lookahead keeps <header ...> (and any other tag merely starting
        //   with "head") from being mistaken for the <head> tag.
        // - A callback is used so that "$1" / "\1" sequences inside the meta
        //   markup are not interpreted as backreferences.
        // - Only the first <head> tag is rewritten.
        $reordered = preg_replace_callback(
            '/<head(?=[\s\/>])[^>]*>/i',
            static function () use ($metaTags): string {
                return '<head>' . $metaTags;
            },
            $html,
            1
        );

        // preg_replace_callback() returns null on a PCRE failure; keep the
        // document that was already parsed instead of failing on the re-parse.
        if ($reordered === null) {
            return $document;
        }

        return self::createDOMDocument($reordered);
    }

    /**
     * Parses an HTML snippet and returns its nodes as a document fragment.
     *
     * The snippet is wrapped in a minimal html/head/body skeleton, parsed with
     * parse(), and the resulting children of <body> are moved into the fragment.
     *
     * @param string      $html     HTML snippet.
     * @param string|null $encoding Source encoding, forwarded to parse().
     *
     * @return DOMDocumentFragment Fragment holding the parsed nodes.
     */
    public static function parseFragment(string $html, ?string $encoding = null): DOMDocumentFragment
    {
        $document = static::parse("<html><head></head><body>{$html}</body></html>", $encoding);
        $fragment = $document->createDocumentFragment();
        $body = $document->getElementsByTagName('body')->item(0);

        if ($body === null) {
            return $fragment;
        }

        // Snapshot the live child list first: appendChild() moves nodes out of
        // <body>, which would otherwise disturb the iteration.
        foreach (iterator_to_array($body->childNodes) as $node) {
            $fragment->appendChild($node);
        }

        return $fragment;
    }

    /**
     * Loads HTML into a new DOMDocument while collecting libxml errors internally.
     *
     * The previous libxml error-handling mode (and, before PHP 8, the previous
     * entity-loader state) is restored even if loading throws.
     *
     * @param string $code HTML source.
     *
     * @return DOMDocument The loaded document.
     */
    private static function createDOMDocument(string $code): DOMDocument
    {
        // libxml_disable_entity_loader() is deprecated as of PHP 8.0, where
        // external entity loading is already off by default, so it is only
        // toggled on older versions.
        $legacyEntityLoader = PHP_MAJOR_VERSION < 8;
        $previousLoaderState = false;

        $previousErrorMode = libxml_use_internal_errors(true);

        if ($legacyEntityLoader) {
            $previousLoaderState = libxml_disable_entity_loader(true);
        }

        try {
            $document = new DOMDocument();
            $document->loadHTML($code);

            return $document;
        } finally {
            libxml_use_internal_errors($previousErrorMode);

            if ($legacyEntityLoader) {
                libxml_disable_entity_loader($previousLoaderState);
            }
        }
    }
}