Skip to content

Commit 2d2e052

Browse files
committed
Use Nokogiri::HTML5::Inference for parsing HTML fragments
1 parent 90b3af3 commit 2d2e052

5 files changed

Lines changed: 48 additions & 60 deletions

File tree

gem/Gemfile.lock

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ PATH
2121
deface (~> 1.9)
2222
html_press (~> 0.8.2)
2323
nokogiri (~> 1.0)
24+
nokogiri-html5-inference (~> 0.2)
2425
phlex (~> 1.6)
2526
phlex-rails (>= 0.9, < 2.0)
2627
syntax_tree (~> 6.0)
@@ -105,6 +106,8 @@ GEM
105106
racc (~> 1.4)
106107
nokogiri (1.16.2-x86_64-linux)
107108
racc (~> 1.4)
109+
nokogiri-html5-inference (0.2.0)
110+
nokogiri (~> 1.14)
108111
parallel (1.22.1)
109112
parser (3.2.0.0)
110113
ast (~> 2.4.1)

gem/lib/phlexing/parser.rb

Lines changed: 2 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,32 +1,15 @@
11
# frozen_string_literal: true
22

33
require "nokogiri"
4+
require "nokogiri/html5/inference"
45

56
module Phlexing
67
class Parser
78
def self.call(source)
89
source = ERBTransformer.call(source)
910
source = Minifier.call(source)
1011

11-
# Credit:
12-
# https://github.com/spree/deface/blob/6bf18df76715ee3eb3d0cd1b6eda822817ace91c/lib/deface/parser.rb#L105-L111
13-
#
14-
15-
html_tag = /<html(( .*?(?:(?!>)[\s\S])*>)|>)/i
16-
head_tag = /<head(( .*?(?:(?!>)[\s\S])*>)|>)/i
17-
body_tag = /<body(( .*?(?:(?!>)[\s\S])*>)|>)/i
18-
19-
if source =~ html_tag
20-
Nokogiri::HTML::Document.parse(source)
21-
elsif source =~ head_tag && source =~ body_tag
22-
Nokogiri::HTML::Document.parse(source).css("html").first
23-
elsif source =~ head_tag
24-
Nokogiri::HTML::Document.parse(source).css("head").first
25-
elsif source =~ body_tag
26-
Nokogiri::HTML::Document.parse(source).css("body").first
27-
else
28-
Nokogiri::HTML::DocumentFragment.parse(source)
29-
end
12+
Nokogiri::HTML5::Inference.parse(source)
3013
end
3114
end
3215
end

gem/phlexing.gemspec

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ Gem::Specification.new do |spec|
3232
spec.add_dependency "deface", "~> 1.9"
3333
spec.add_dependency "html_press", "~> 0.8.2"
3434
spec.add_dependency "nokogiri", "~> 1.0"
35+
spec.add_dependency "nokogiri-html5-inference", "~> 0.2"
3536
spec.add_dependency "phlex", "~> 1.6"
3637
spec.add_dependency "phlex-rails", ">= 0.9", "< 2.0"
3738
spec.add_dependency "syntax_tree", "~> 6.0"

gem/test/phlexing/converter/uppercase_tags_test.rb

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,6 @@ class Phlexing::Converter::UppercaseTagsTest < Minitest::Spec
7676

7777
expected = <<~PHLEX.strip
7878
html do
79-
whitespace
8079
head
8180
whitespace
8281
body

gem/test/phlexing/parser_test.rb

Lines changed: 42 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -2,22 +2,24 @@
22

33
require_relative "../test_helper"
44

5-
def assert_dom_equal(expected, actual)
6-
assert_equal expected, actual.gsub(%(<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML 4.0 Transitional//EN\" \"http://www.w3.org/TR/REC-html40/loose.dtd\">), "").squish
7-
end
8-
95
module Phlexing
106
class ParserTest < Minitest::Spec
117
before(:each) do
128
@nodes = []
139
end
1410

1511
def extract_children(node)
16-
@nodes << node.name
12+
@nodes << node.name if node.respond_to?(:name)
1713

18-
if node&.children
19-
node.children.each do |node|
20-
extract_children(node)
14+
if node.is_a?(Nokogiri::XML::NodeSet)
15+
node.each do |n|
16+
extract_children(n)
17+
end
18+
else
19+
if node&.children
20+
node.children.each do |node|
21+
extract_children(node)
22+
end
2123
end
2224
end
2325

@@ -28,88 +30,88 @@ def extract_children(node)
2830
parser = Parser.call(nil)
2931

3032
assert_equal "#document-fragment", extract_children(parser).join(",")
31-
assert_dom_equal "", parser.to_xml
33+
assert_equal "", parser.to_html
3234
assert_equal "#document-fragment", parser.name
33-
assert_equal Nokogiri::HTML4::DocumentFragment, parser.class
35+
assert_equal Nokogiri::HTML5::DocumentFragment, parser.class
3436
end
3537

3638
it "should handle empty string" do
3739
parser = Parser.call("")
3840

3941
assert_equal "#document-fragment", extract_children(parser).join(",")
40-
assert_dom_equal "", parser.to_xml
42+
assert_equal "", parser.to_html
4143
assert_equal "#document-fragment", parser.name
42-
assert_equal Nokogiri::HTML4::DocumentFragment, parser.class
44+
assert_equal Nokogiri::HTML5::DocumentFragment, parser.class
4345
end
4446

4547
it "should handle simple div" do
4648
parser = Parser.call("<div></div>")
4749

4850
assert_equal "#document-fragment,div", extract_children(parser).join(",")
49-
assert_dom_equal %(<div></div>), parser.to_html
51+
assert_equal %(<div></div>), parser.to_html
5052
assert_equal "#document-fragment", parser.name
51-
assert_equal Nokogiri::HTML4::DocumentFragment, parser.class
53+
assert_equal Nokogiri::HTML5::DocumentFragment, parser.class
5254
end
5355

5456
it "should handle ERB" do
5557
parser = Parser.call("<div><%= some_method %></div>")
5658

5759
assert_equal "#document-fragment,div,erb,text", extract_children(parser).join(",")
58-
assert_dom_equal %(<div> <erb loud=""> some_method </erb> </div>), parser.to_xml
60+
assert_equal %(<div><erb loud=""> some_method </erb></div>), parser.to_html
5961
assert_equal "#document-fragment", parser.name
60-
assert_equal Nokogiri::HTML4::DocumentFragment, parser.class
62+
assert_equal Nokogiri::HTML5::DocumentFragment, parser.class
6163
end
6264

6365
it "should handle html" do
6466
parser = Parser.call("<html></html>")
6567

66-
assert_equal "document,html,html", extract_children(parser).join(",")
67-
assert_dom_equal %(<html></html>), parser.to_xml
68+
assert_equal "document,html,head,body", extract_children(parser).join(",")
69+
assert_equal %(<html></html>), parser.to_html
6870
assert_equal "document", parser.name
69-
assert_equal Nokogiri::HTML4::Document, parser.class
71+
assert_equal Nokogiri::HTML5::DocumentFragment, parser.class
7072
end
7173

7274
it "should handle html, head and body" do
7375
parser = Parser.call("<html><head><title>Title</title></head><body><h1>Hello</h1></body></html>")
7476

75-
assert_equal "document,html,html,head,title,text,body,h1,text", extract_children(parser).join(",")
76-
assert_dom_equal %(<html> <head> <meta http-equiv="Content-Type" content="text/html; charset=UTF-8"> <title>Title</title> </head> <body><h1>Hello</h1></body> </html>), parser.to_xml
77+
assert_equal "document,html,head,title,text,body,h1,text", extract_children(parser).join(",")
78+
assert_equal %(<html><head><title>Title</title></head><body><h1>Hello</h1></body></html>), parser.to_html
7779
assert_equal "document", parser.name
78-
assert_equal Nokogiri::HTML4::Document, parser.class
80+
assert_equal Nokogiri::HTML5::Document, parser.class
7981
end
8082

8183
it "should handle html and head" do
8284
parser = Parser.call("<html><head><title>Title</title></head></html>")
8385

84-
assert_equal "document,html,html,head,title,text", extract_children(parser).join(",")
85-
assert_dom_equal %(<html><head> <meta http-equiv="Content-Type" content="text/html; charset=UTF-8"> <title>Title</title> </head></html>), parser.to_xml
86+
assert_equal "document,html,head,title,text,body", extract_children(parser).join(",")
87+
assert_equal %(<html><head><title>Title</title></head></html>), parser.to_html
8688
assert_equal "document", parser.name
87-
assert_equal Nokogiri::HTML4::Document, parser.class
89+
assert_equal Nokogiri::HTML5::Document, parser.class
8890
end
8991

9092
it "should handle html and body" do
9193
parser = Parser.call("<html><body><h1>Hello</h1></body></html>")
9294

93-
assert_equal "document,html,html,body,h1,text", extract_children(parser).join(",")
94-
assert_dom_equal %(<html><body><h1>Hello</h1></body></html>), parser.to_xml
95+
assert_equal "document,html,body,h1,text", extract_children(parser).join(",")
96+
assert_equal %(<html><body><h1>Hello</h1></body></html>), parser.to_html
9597
assert_equal "document", parser.name
96-
assert_equal Nokogiri::HTML4::Document, parser.class
98+
assert_equal Nokogiri::HTML5::Document, parser.class
9799
end
98100

99101
it "should handle head and body" do
100102
parser = Parser.call("<head><title>Title</title></head><body><h1>Hello</h1></body>")
101103

102-
assert_equal "html,head,title,text,body,h1,text", extract_children(parser).join(",")
103-
assert_dom_equal %(<html> <head> <title>Title</title> </head> <body> <h1>Hello</h1> </body> </html>), parser.to_xml
104-
assert_equal "html", parser.name
105-
assert_equal Nokogiri::XML::Element, parser.class
104+
assert_equal "head,title,text,body,h1,text", extract_children(parser).join(",")
105+
assert_equal %(<head><title>Title</title></head><body><h1>Hello</h1></body>), parser.to_html
106+
assert_equal false, parser.respond_to?(:name)
107+
assert_equal Nokogiri::XML::NodeSet, parser.class
106108
end
107109

108110
it "should handle head with title" do
109111
parser = Parser.call("<head><title>Title</title></head>")
110112

111113
assert_equal "head,title,text", extract_children(parser).join(",")
112-
assert_dom_equal %(<head> <title>Title</title> </head>), parser.to_xml
114+
assert_equal %(<head><title>Title</title></head>), parser.to_html
113115
assert_equal "head", parser.name
114116
assert_equal Nokogiri::XML::Element, parser.class
115117
end
@@ -118,7 +120,7 @@ def extract_children(node)
118120
parser = Parser.call("<head></head>")
119121

120122
assert_equal "head", extract_children(parser).join(",")
121-
assert_dom_equal %(<head><meta http-equiv="Content-Type" content="text/html; charset=UTF-8"></head>), parser.to_html
123+
assert_equal %(<head><meta http-equiv="Content-Type" content="text/html; charset=UTF-8"></head>), parser.to_html
122124
assert_equal "head", parser.name
123125
assert_equal Nokogiri::XML::Element, parser.class
124126
end
@@ -127,18 +129,18 @@ def extract_children(node)
127129
parser = Parser.call("<body><h1>Hello</h1></body>")
128130

129131
assert_equal "body,h1,text", extract_children(parser).join(",")
130-
assert_dom_equal %(<body> <h1>Hello</h1> </body>), parser.to_xml
131-
assert_equal "body", parser.name
132-
assert_equal Nokogiri::XML::Element, parser.class
132+
assert_equal %(<body><h1>Hello</h1></body>), parser.to_html
133+
# assert_equal "body", parser.name
134+
assert_equal Nokogiri::XML::NodeSet, parser.class
133135
end
134136

135137
it "should handle body" do
136138
parser = Parser.call("<body></body>")
137139

138140
assert_equal "body", extract_children(parser).join(",")
139-
assert_dom_equal %(<body></body>), parser.to_html
140-
assert_equal "body", parser.name
141-
assert_equal Nokogiri::XML::Element, parser.class
141+
assert_equal %(<body></body>), parser.to_html
142+
# assert_equal "body", parser.name
143+
assert_equal Nokogiri::XML::NodeSet, parser.class
142144
end
143145
end
144146
end

0 commit comments

Comments
 (0)