Skip to content

Commit 12d3f8c

Browse files
[DomCrawler] Use the native HTML5 parser on PHP 8.4
1 parent b223c40 commit 12d3f8c

File tree

4 files changed

+174
-17
lines changed

4 files changed

+174
-17
lines changed

src/Symfony/Component/DomCrawler/Crawler.php

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -62,10 +62,10 @@ public function __construct(
6262
\DOMNodeList|\DOMNode|array|string|null $node = null,
6363
protected ?string $uri = null,
6464
?string $baseHref = null,
65-
bool $useHtml5Parser = true,
65+
private bool $useHtml5Parser = true,
6666
) {
6767
$this->baseHref = $baseHref ?: $uri;
68-
$this->html5Parser = $useHtml5Parser ? new HTML5(['disable_html_ns' => true]) : null;
68+
$this->html5Parser = \PHP_VERSION_ID < 80400 && $useHtml5Parser ? new HTML5(['disable_html_ns' => true]) : null;
6969
$this->cachedNamespaces = new \ArrayObject();
7070

7171
$this->add($node);
@@ -1081,6 +1081,22 @@ private function supportsEncoding(string $encoding): bool
10811081

10821082
private function parseXhtml(string $htmlContent, string $charset = 'UTF-8'): \DOMDocument
10831083
{
1084+
if (\PHP_VERSION_ID >= 80400 && $this->useHtml5Parser) {
1085+
$document = @\Dom\HTMLDocument::createFromString($htmlContent, \Dom\HTML_NO_DEFAULT_NS, $charset);
1086+
$htmlContent = $document->saveXml();
1087+
$charset = $document->inputEncoding;
1088+
1089+
$dom = new \DOMDocument('1.0', $charset);
1090+
$dom->loadXML($htmlContent);
1091+
1092+
// Register id attributes as ID attributes for getElementById to work
1093+
foreach ((new \DOMXPath($dom))->query('//*[@id]') as $element) {
1094+
$element->setIdAttribute('id', true);
1095+
}
1096+
1097+
return $dom;
1098+
}
1099+
10841100
if ('UTF-8' === $charset && preg_match('//u', $htmlContent)) {
10851101
$htmlContent = '<?xml encoding="UTF-8">'.$htmlContent;
10861102
} else {
@@ -1216,7 +1232,7 @@ private function canParseHtml5String(string $content): bool
12161232
return false;
12171233
}
12181234

1219-
if (false === ($pos = stripos($content, '<!doctype html>'))) {
1235+
if (false === $pos = stripos($content, '<!doctype html>')) {
12201236
return false;
12211237
}
12221238

src/Symfony/Component/DomCrawler/Tests/AbstractCrawlerTestCase.php

Lines changed: 20 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -24,17 +24,17 @@ abstract class AbstractCrawlerTestCase extends TestCase
2424
{
2525
abstract public static function getDoctype(): string;
2626

27-
protected function createCrawler($node = null, ?string $uri = null, ?string $baseHref = null, bool $useHtml5Parser = true)
27+
protected function createCrawler($node = null, ?string $uri = null, ?string $baseHref = null)
2828
{
29-
return new Crawler($node, $uri, $baseHref, $useHtml5Parser);
29+
return new Crawler($node, $uri, $baseHref, false);
3030
}
3131

3232
public function testConstructor()
3333
{
3434
$crawler = $this->createCrawler();
3535
$this->assertCount(0, $crawler, '__construct() returns an empty crawler');
3636

37-
$doc = new \DOMDocument();
37+
$doc = $this->createDomDocument();
3838
$node = $doc->createElement('test');
3939

4040
$crawler = $this->createCrawler($node);
@@ -236,7 +236,7 @@ public function testAddNode()
236236

237237
public function testClear()
238238
{
239-
$doc = new \DOMDocument();
239+
$doc = $this->createDomDocument();
240240
$node = $doc->createElement('test');
241241

242242
$crawler = $this->createCrawler($node);
@@ -421,9 +421,9 @@ public function testHtml()
421421

422422
public function testEmojis()
423423
{
424-
$crawler = $this->createCrawler('<body><p>Hey 👋</p></body>');
424+
$crawler = $this->createCrawler('<head></head><body><p>Hey 👋</p></body>');
425425

426-
$this->assertSame('<body><p>Hey 👋</p></body>', $crawler->html());
426+
$this->assertSame('<head></head><body><p>Hey 👋</p></body>', $crawler->html());
427427
}
428428

429429
public function testExtract()
@@ -530,6 +530,16 @@ public function testFilterXPathWithAnUrl()
530530
$this->assertSame('Music', $crawler->text());
531531
}
532532

533+
public function testCaseSentivity()
534+
{
535+
$crawler = $this->createTestXmlCrawler();
536+
537+
$crawler = $crawler->filterXPath('//*[local-name() = "CaseSensitiveTag"]');
538+
$this->assertCount(1, $crawler);
539+
$this->assertSame('Some Content', $crawler->text());
540+
$this->assertSame('CaseSensitiveTag', $crawler->nodeName());
541+
}
542+
533543
public function testFilterXPathWithFakeRoot()
534544
{
535545
$crawler = $this->createTestCrawler();
@@ -1292,8 +1302,7 @@ public function testAddHtmlContentUnsupportedCharset()
12921302

12931303
public function createTestCrawler($uri = null)
12941304
{
1295-
$dom = new \DOMDocument();
1296-
$dom->loadHTML($this->getDoctype().'
1305+
$html = $this->getDoctype().'
12971306
<html>
12981307
<body>
12991308
<a href="foo">Foo</a>
@@ -1352,9 +1361,9 @@ public function createTestCrawler($uri = null)
13521361
</div>
13531362
</body>
13541363
</html>
1355-
');
1364+
';
13561365

1357-
return $this->createCrawler($dom, $uri);
1366+
return $this->createCrawler($html, $uri);
13581367
}
13591368

13601369
protected function createTestXmlCrawler($uri = null)
@@ -1369,6 +1378,7 @@ protected function createTestXmlCrawler($uri = null)
13691378
<yt:aspectRatio>widescreen</yt:aspectRatio>
13701379
</media:group>
13711380
<media:category label="Music" scheme="http://gdata.youtube.com/schemas/2007/categories.cat">Music</media:category>
1381+
<CaseSensitiveTag>Some Content</CaseSensitiveTag>
13721382
</entry>';
13731383

13741384
return $this->createCrawler($xml, $uri);

src/Symfony/Component/DomCrawler/Tests/Html5ParserCrawlerTest.php renamed to src/Symfony/Component/DomCrawler/Tests/LegacyHtml5ParserCrawlerTest.php

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,9 +12,12 @@
1212
namespace Symfony\Component\DomCrawler\Tests;
1313

1414
use PHPUnit\Framework\Attributes\DataProvider;
15+
use PHPUnit\Framework\Attributes\RequiresPhp;
1516
use PHPUnit\Framework\Attributes\TestWith;
17+
use Symfony\Component\DomCrawler\Crawler;
1618

17-
class Html5ParserCrawlerTest extends AbstractCrawlerTestCase
19+
#[RequiresPhp('<8.4')]
20+
class LegacyHtml5ParserCrawlerTest extends AbstractCrawlerTestCase
1821
{
1922
public static function getDoctype(): string
2023
{
@@ -54,10 +57,10 @@ public function testHtml5ParserNotSameAsNativeParserForSpecificHtml()
5457
// Html who create a bug specific to the DOM extension (see https://github.com/symfony/symfony/issues/28596)
5558
$html = $this->getDoctype().'<html><body><h1><p>Foo</p></h1></body></html>';
5659

57-
$html5Crawler = $this->createCrawler(null, null, null, true);
60+
$html5Crawler = $this->createCrawler();
5861
$html5Crawler->add($html);
5962

60-
$nativeCrawler = $this->createCrawler(null, null, null, false);
63+
$nativeCrawler = parent::createCrawler();
6164
$nativeCrawler->add($html);
6265

6366
$this->assertNotEquals($nativeCrawler->filterXPath('//h1')->text(), $html5Crawler->filterXPath('//h1')->text(), 'Native parser and Html5 parser must be different');
@@ -67,7 +70,7 @@ public function testHtml5ParserNotSameAsNativeParserForSpecificHtml()
6770
#[TestWith([false])]
6871
public function testHasHtml5Parser(bool $useHtml5Parser)
6972
{
70-
$crawler = $this->createCrawler(null, null, null, $useHtml5Parser);
73+
$crawler = $useHtml5Parser ? $this->createCrawler() : parent::createCrawler();
7174

7275
$r = new \ReflectionProperty($crawler::class, 'html5Parser');
7376
$html5Parser = $r->getValue($crawler);
@@ -99,4 +102,9 @@ public static function invalidHtml5Provider(): iterable
99102
yield 'Text' => ['hello world'.$html];
100103
yield 'Text between comments' => ['<!--c--> test <!--cc-->'.$html];
101104
}
105+
106+
protected function createCrawler($node = null, ?string $uri = null, ?string $baseHref = null)
107+
{
108+
return new Crawler($node, $uri, $baseHref, true);
109+
}
102110
}
src/Symfony/Component/DomCrawler/Tests/NativeHtml5ParserCrawlerTest.php

Lines changed: 123 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,123 @@
1+
<?php
2+
3+
/*
4+
* This file is part of the Symfony package.
5+
*
6+
* (c) Fabien Potencier <fabien@symfony.com>
7+
*
8+
* For the full copyright and license information, please view the LICENSE
9+
* file that was distributed with this source code.
10+
*/
11+
12+
namespace Symfony\Component\DomCrawler\Tests;
13+
14+
use PHPUnit\Framework\Attributes\DataProvider;
15+
use PHPUnit\Framework\Attributes\RequiresPhp;
16+
use Symfony\Component\DomCrawler\Crawler;
17+
18+
#[RequiresPhp('8.4')]
19+
class NativeHtml5ParserCrawlerTest extends AbstractCrawlerTestCase
20+
{
21+
public static function getDoctype(): string
22+
{
23+
return '<!DOCTYPE html>';
24+
}
25+
26+
public function testIteration()
27+
{
28+
$crawler = $this->createTestCrawler()->filterXPath('//li');
29+
30+
$this->assertInstanceOf(\Traversable::class, $crawler);
31+
$this->assertContainsOnlyInstancesOf('DOMElement', iterator_to_array($crawler), 'Iterating a Crawler gives DOMElement instances');
32+
}
33+
34+
public function testHtml()
35+
{
36+
$this->assertEquals('<img alt="Bar">', $this->createTestCrawler()->filterXPath('//a[5]')->html());
37+
$this->assertEquals('<input type="text" value="TextValue" name="TextName"><input type="submit" value="FooValue" name="FooName" id="FooId"><input type="button" value="BarValue" name="BarName" id="BarId"><button value="ButtonValue" name="ButtonName" id="ButtonId"><input type="submit" value="FooBarValue" name="FooBarName" form="FooFormId"><input type="text" value="FooTextValue" name="FooTextName" form="FooFormId"><input type="image" alt="ImageAlt" form="FooFormId"></button>', trim(preg_replace('~>\s+<~', '><', $this->createTestCrawler()->filterXPath('//form[@id="FooFormId"]')->html())));
38+
39+
try {
40+
$this->createTestCrawler()->filterXPath('//ol')->html();
41+
$this->fail('->html() throws an \InvalidArgumentException if the node list is empty');
42+
} catch (\InvalidArgumentException $e) {
43+
$this->assertTrue(true, '->html() throws an \InvalidArgumentException if the node list is empty');
44+
}
45+
46+
$this->assertSame('my value', $this->createTestCrawler(null)->filterXPath('//ol')->html('my value'));
47+
}
48+
49+
public function testFilterXpathComplexQueries()
50+
{
51+
$crawler = $this->createTestCrawler()->filterXPath('//body');
52+
53+
$this->assertCount(0, $crawler->filterXPath('/input'));
54+
$this->assertCount(0, $crawler->filterXPath('/body'));
55+
$this->assertCount(1, $crawler->filterXPath('./body'));
56+
$this->assertCount(1, $crawler->filterXPath('.//body'));
57+
$this->assertCount(6, $crawler->filterXPath('.//input'));
58+
$this->assertCount(7, $crawler->filterXPath('//form')->filterXPath('//button | //input'));
59+
$this->assertCount(1, $crawler->filterXPath('body'));
60+
$this->assertCount(8, $crawler->filterXPath('//button | //input'));
61+
$this->assertCount(1, $crawler->filterXPath('//body'));
62+
$this->assertCount(1, $crawler->filterXPath('descendant-or-self::body'));
63+
$this->assertCount(1, $crawler->filterXPath('//div[@id="parent"]')->filterXPath('./div'), 'A child selection finds only the current div');
64+
$this->assertCount(3, $crawler->filterXPath('//div[@id="parent"]')->filterXPath('descendant::div'), 'A descendant selector matches the current div and its child');
65+
$this->assertCount(3, $crawler->filterXPath('//div[@id="parent"]')->filterXPath('//div'), 'A descendant selector matches the current div and its child');
66+
$this->assertCount(5, $crawler->filterXPath('(//a | //div)//img'));
67+
$this->assertCount(7, $crawler->filterXPath('((//a | //div)//img | //ul)'));
68+
$this->assertCount(7, $crawler->filterXPath('( ( //a | //div )//img | //ul )'));
69+
$this->assertCount(1, $crawler->filterXPath("//a[./@href][((./@id = 'Klausi|Claudiu' or normalize-space(string(.)) = 'Klausi|Claudiu' or ./@title = 'Klausi|Claudiu' or ./@rel = 'Klausi|Claudiu') or .//img[./@alt = 'Klausi|Claudiu'])]"));
70+
}
71+
72+
public function testAddHtml5()
73+
{
74+
// Ensure a bug specific to the DOM extension is fixed (see https://github.com/symfony/symfony/issues/28596)
75+
$crawler = $this->createCrawler();
76+
$crawler->add($this->getDoctype().'<html><body><h1><p>Foo</p></h1></body></html>');
77+
$this->assertEquals('Foo', $crawler->filterXPath('//h1')->text(), '->add() adds nodes from a string');
78+
}
79+
80+
#[DataProvider('validHtml5Provider')]
81+
public function testHtml5ParserParseContentStartingWithValidHeading(string $content)
82+
{
83+
$crawler = $this->createCrawler();
84+
$crawler->addHtmlContent($content);
85+
self::assertEquals(
86+
'Foo',
87+
$crawler->filterXPath('//h1')->text(),
88+
'->addHtmlContent() parses valid HTML with comment before doctype'
89+
);
90+
}
91+
92+
public static function validHtml5Provider(): iterable
93+
{
94+
$html = self::getDoctype().'<html><body><h1><p>Foo</p></h1></body></html>';
95+
$BOM = \chr(0xEF).\chr(0xBB).\chr(0xBF);
96+
97+
yield 'BOM first' => [$BOM.$html];
98+
yield 'Single comment' => ['<!-- comment -->'.$html];
99+
yield 'Multiline comment' => ["<!-- \n multiline comment \n -->".$html];
100+
yield 'Several comments' => ['<!--c--> <!--cc-->'.$html];
101+
yield 'Whitespaces' => [' '.$html];
102+
yield 'All together' => [$BOM.' <!--c-->'.$html];
103+
}
104+
105+
public function testHtml5ParserNotSameAsNativeParserForSpecificHtml()
106+
{
107+
// Html who create a bug specific to the DOM extension (see https://github.com/symfony/symfony/issues/28596)
108+
$html = $this->getDoctype().'<html><body><h1><p>Foo</p></h1></body></html>';
109+
110+
$html5Crawler = $this->createCrawler();
111+
$html5Crawler->add($html);
112+
113+
$nativeCrawler = parent::createCrawler();
114+
$nativeCrawler->add($html);
115+
116+
$this->assertNotEquals($nativeCrawler->filterXPath('//h1')->text(), $html5Crawler->filterXPath('//h1')->text(), 'Native parser and Html5 parser must be different');
117+
}
118+
119+
protected function createCrawler($node = null, ?string $uri = null, ?string $baseHref = null)
120+
{
121+
return new Crawler($node, $uri, $baseHref, true);
122+
}
123+
}

0 commit comments

Comments
 (0)