-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathJustHTML.php
More file actions
114 lines (94 loc) · 3.54 KB
/
JustHTML.php
File metadata and controls
114 lines (94 loc) · 3.54 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
<?php
declare(strict_types=1);
namespace JustHTML;
/**
 * Exception carrying a single parse error.
 *
 * JustHTML throws this from its constructor when the `strict` option is
 * enabled and at least one error was recorded during parsing. The exception
 * message is the string form of the wrapped error.
 */
final class StrictModeError extends \RuntimeException
{
    /** The parse error wrapped by this exception. */
    public ParseError $error;

    /**
     * @param ParseError $error Error to wrap; its string cast becomes the exception message.
     */
    public function __construct(ParseError $error)
    {
        $message = (string) $error;
        parent::__construct($message);
        $this->error = $error;
    }
}
/**
 * Entry point that parses an HTML string (or raw bytes) into a node tree.
 *
 * All work happens in the constructor: the input is optionally decoded, a
 * Tokenizer is run over it feeding a TreeBuilder, and the resulting root node
 * plus any collected errors are stored on the instance. The remaining methods
 * delegate to the root node.
 */
final class JustHTML
{
    /** Value of the `debug` option, cast to bool. */
    public bool $debug;

    /** Encoding reported by Encoding::decodeHtml() when `bytes` is set; null otherwise. */
    public ?string $encoding;

    /** @var array<int, ParseError> Tokenizer errors followed by tree-builder errors. */
    public array $errors;

    /** Value of the `fragment_context` option, or null when none was given. */
    public ?FragmentContext $fragment_context;

    /** Root node returned by TreeBuilder::finish(). */
    public SimpleDomNode $root;

    public Tokenizer $tokenizer;

    public TreeBuilder $tree_builder;

    /**
     * Parse the given HTML.
     *
     * Recognised option keys (all optional):
     *  - `debug` (bool): stored on $this->debug.
     *  - `fragment_context` (FragmentContext|null): stored on the instance and
     *    handed to the default TreeBuilder; also selects the tokenizer's
     *    initial state for some context elements.
     *  - `collect_errors` (bool): ask tokenizer and tree builder to collect errors.
     *  - `strict` (bool): implies error collection and throws on the first error.
     *  - `bytes` (bool): treat $html as raw bytes and decode it via
     *    Encoding::decodeHtml().
     *  - `encoding` (string|null): passed to Encoding::decodeHtml(); only
     *    consulted when `bytes` is true.
     *  - `iframe_srcdoc` (bool): forwarded to the default TreeBuilder.
     *  - `tokenizer_opts` (TokenizerOpts): base tokenizer options; any value
     *    that is not a TokenizerOpts instance is ignored. The supplied object
     *    is copied, never modified.
     *  - `tree_builder` (TreeBuilder): used instead of constructing a default one.
     *
     * @param mixed                $html    Markup to parse; null is treated as an
     *                                      empty document, anything else is cast to string.
     * @param array<string, mixed> $options
     *
     * @throws StrictModeError When `strict` is enabled and any error was recorded.
     */
    public function __construct($html, array $options = [])
    {
        $this->debug = (bool)($options['debug'] ?? false);
        $this->fragment_context = $options['fragment_context'] ?? null;
        $this->encoding = null;

        $strict = (bool)($options['strict'] ?? false);
        // Strict mode needs the errors in order to throw, so it forces collection.
        $should_collect = (bool)($options['collect_errors'] ?? false) || $strict;

        if ($html === null) {
            $html_str = '';
        } elseif ((bool)($options['bytes'] ?? false)) {
            [$html_str, $chosen] = Encoding::decodeHtml((string)$html, $options['encoding'] ?? null);
            $this->encoding = $chosen;
        } else {
            $html_str = (string)$html;
        }

        $this->tree_builder = $options['tree_builder'] ?? new TreeBuilder(
            $this->fragment_context,
            (bool)($options['iframe_srcdoc'] ?? false),
            $should_collect
        );

        $opts = $this->resolveTokenizerOpts($options['tokenizer_opts'] ?? null);

        $this->tokenizer = new Tokenizer($this->tree_builder, $opts, $should_collect);
        // The tree builder needs a back-reference to the tokenizer driving it.
        $this->tree_builder->tokenizer = $this->tokenizer;
        $this->tokenizer->run($html_str);
        $this->root = $this->tree_builder->finish();

        $this->errors = array_merge($this->tokenizer->errors, $this->tree_builder->errors);
        if ($strict && $this->errors !== []) {
            throw new StrictModeError($this->errors[0]);
        }
    }

    /**
     * Build the tokenizer options for this parse.
     *
     * A caller-supplied TokenizerOpts is cloned before any fragment-specific
     * override is written, so an options object shared between several parses
     * is not left with a stale initial state.
     *
     * @param mixed $tokenizer_opts Raw `tokenizer_opts` option value.
     */
    private function resolveTokenizerOpts($tokenizer_opts): TokenizerOpts
    {
        $opts = $tokenizer_opts instanceof TokenizerOpts ? clone $tokenizer_opts : new TokenizerOpts();

        $context = $this->fragment_context;
        if ($context === null || $context->namespace !== null) {
            return $opts;
        }

        // NOTE(review): the HTML fragment algorithm uses RCDATA for
        // title/textarea and script-data for script; confirm the mapping to
        // RAWTEXT / PLAINTEXT below is intentional for this tokenizer.
        $tag_name = strtolower($context->tagName);
        if (in_array($tag_name, ['textarea', 'title', 'style'], true)) {
            $opts->initialState = Tokenizer::RAWTEXT;
            $opts->initialRawtextTag = $tag_name;
        } elseif (in_array($tag_name, ['plaintext', 'script'], true)) {
            $opts->initialState = Tokenizer::PLAINTEXT;
        }

        return $opts;
    }

    /**
     * Serialize the parsed tree by delegating to the root node's toHtml().
     *
     * @param bool $pretty      Forwarded to the root node.
     * @param int  $indent_size Forwarded to the root node.
     */
    public function toHtml(bool $pretty = true, int $indent_size = 2): string
    {
        // First argument is fixed at 0 — presumably the starting indent level; confirm in SimpleDomNode.
        return $this->root->toHtml(0, $indent_size, $pretty);
    }

    /**
     * Return the root node's text via its toText().
     *
     * @param string $separator Forwarded to the root node.
     * @param bool   $strip     Forwarded to the root node.
     */
    public function toText(string $separator = ' ', bool $strip = true): string
    {
        return $this->root->toText($separator, $strip);
    }

    /** Return the root node's toMarkdown() output. */
    public function toMarkdown(): string
    {
        return $this->root->toMarkdown();
    }

    /**
     * Run a selector query against the root node.
     *
     * @return array<int, mixed>
     */
    public function query(string $selector): array
    {
        return $this->root->query($selector);
    }

    /**
     * Return whatever the root node's queryFirst() yields for the selector.
     *
     * @return mixed
     */
    public function queryFirst(string $selector)
    {
        return $this->root->queryFirst($selector);
    }

    /** Return the root node's toTestFormat() output, called with 0. */
    public function toTestFormat(): string
    {
        return $this->root->toTestFormat(0);
    }
}