-
Notifications
You must be signed in to change notification settings - Fork 91
Split tests/test_parser.py into tests/test_modest_parser.py, tests/test_lexbor_parser.py #202
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Closed
+306
−108
Closed
Changes from all commits
Commits
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,267 @@ | ||
| import threading | ||
| from difflib import SequenceMatcher | ||
|
|
||
| import pytest | ||
|
|
||
| from selectolax.lexbor import LexborHTMLParser, LexborNode, SelectolaxError, create_tag | ||
|
|
||
| """ | ||
| We're testing only our own code. | ||
| Much of the functionality is already tested in the Lexbor engine, so there is no reason to test every case. | ||
| """ | ||
|
|
||
|
|
||
def test_parser():
    """Build a parser from a string and expect non-string arguments to fail."""
    parser = LexborHTMLParser("")
    assert isinstance(parser, LexborHTMLParser)

    # A non-string document is expected to be refused by the constructor.
    with pytest.raises(TypeError):
        LexborHTMLParser(123)

    # A non-string selector is expected to be refused by css().
    with pytest.raises(TypeError):
        LexborHTMLParser("asd").css(123)
|
|
||
|
|
||
def test_malformed_data():
    """Feed malformed documents to the parser and expect a graceful outcome.

    For every input the serialized document must be ``None`` or a ``str``,
    unless construction or serialization raises one of the exceptions this
    test tolerates for malformed input.
    """
    tolerated_errors = (ValueError, RuntimeError, UnicodeDecodeError)
    broken_documents = (
        b"\x00\x01\x02\x03",
        "<div><p><span></div>",
        "<" + "a" * 1000 + ">",
    )

    for document in broken_documents:
        try:
            parser = LexborHTMLParser(document)
            serialized = parser.html
        except tolerated_errors:
            # Rejecting malformed input with one of these is acceptable.
            continue
        # No crash happened: the output may be missing, but must not be of
        # any other type.
        assert serialized is None or isinstance(serialized, str)
|
|
||
|
|
||
def test_properties():
    """Read each top-level parser property; the test passes if none raises."""
    parser = LexborHTMLParser("<div><p>test</p></div>")

    for attribute in ("root", "head", "body", "html"):
        getattr(parser, attribute)
|
|
||
|
|
||
def test_unicode_handling():
    """Check that non-ASCII text survives parsing and text extraction.

    Each sample is wrapped in a ``<div>``, parsed, located with
    ``css_first("div")`` and read back with ``text()``; the sample must be
    contained in the extracted text.

    A ``UnicodeEncodeError`` raised while parsing or extracting is tolerated
    and skips that sample.
    """
    samples = (
        "Hello 世界",
        "🚀🌟💫",
        "Café résumé naïve",
    )

    for sample in samples:
        try:
            parser = LexborHTMLParser(f"<div>{sample}</div>")
            node = parser.css_first("div")
            # Assert the match explicitly so that a missing node fails the
            # test instead of silently skipping the text check below.
            assert node is not None, f"no <div> found for sample {sample!r}"
            extracted = node.text()
        except UnicodeEncodeError:
            # Some encoding issues might be expected
            continue
        assert sample in extracted
|
|
||
|
|
||
def test_tag_name_validation():
    """Check that ``tags()`` refuses empty and over-long tag names."""
    parser = LexborHTMLParser("<div></div>")

    # An empty name is expected to be refused with a dedicated message.
    with pytest.raises(ValueError, match="Tag name cannot be empty"):
        parser.tags("")

    # 101 characters: one more than the 100-character limit this test targets.
    oversized_name = "a" * 101
    with pytest.raises(ValueError, match="Tag name is too long"):
        parser.tags(oversized_name)
|
|
||
|
|
||
def test_nodes():
    """Check node types and that the serialized output stays close to the input."""
    source = (
        '<div><p id="p1"></p><p id="p2"></p><p id="p3"><a>link</a></p>'
        '<p id="p4"></p><p id="p5">text</p><p id="p6"></p></div>'
    )
    parser = LexborHTMLParser(source)

    assert isinstance(parser.root, LexborNode)
    assert isinstance(parser.body, LexborNode)

    serialized = parser.html
    # The serialized document must be at least as long as the input fragment
    # and still be mostly the same text.
    assert len(serialized) >= len(source)
    similarity = SequenceMatcher(None, source, serialized).ratio()
    assert similarity > 0.8
|
|
||
|
|
||
def test_root_css():
    """Query the root node for a tag that does not occur in the document."""
    parser = LexborHTMLParser("test")
    matches = parser.root.css("data")
    assert len(matches) == 0
|
|
||
|
|
||
def test_strip_tags_from_root():
    """Strip tags through the root node, then expect an integer argument to fail."""
    parser = LexborHTMLParser("<body><div></div><script></script></body>")
    unwanted_tags = ["div", "script"]
    parser.root.strip_tags(unwanted_tags)

    expected = "<html><head></head><body></body></html>"
    assert parser.html == expected

    # Passing an integer instead of a list of tag names must raise.
    with pytest.raises(TypeError):
        parser.strip_tags(1)
|
|
||
|
|
||
def test_clone():
    """Check that a clone keeps its content after the source is changed and deleted."""
    source = LexborHTMLParser("<h1>Welcome</h1>")
    duplicate = source.clone()

    # Remove the heading from the source tree and drop the source name; the
    # call is kept as one chain so no node reference outlives the `del`.
    source.root.css_first("h1").decompose()
    del source

    expected = "<html><head></head><body><h1>Welcome</h1></body></html>"
    assert duplicate.html == expected
|
|
||
|
|
||
def test_tags():
    """Count every ``div`` element in the document, nested ones included."""
    markup = """
<div><span><span></span></span></div>
<div><span></span></div>
<div><div></div></div>
<span></span>
<div></div>
"""
    parser = LexborHTMLParser(markup)
    div_nodes = parser.tags("div")
    assert len(div_nodes) == 5
|
|
||
|
|
||
def test_preserves_doctype():
    """Check that the doctype declaration is present in the serialized document."""
    markup = """
<!DOCTYPE html>
<html>
<head><title>Test</title></head>
<body><p>Hello World</p></body>
</html>
"""
    parser = LexborHTMLParser(markup)
    serialized = parser.html
    assert "<!DOCTYPE html>" in serialized
|
|
||
|
|
||
def test_invalid_input_types():
    """Expect a ``TypeError`` mentioning the expected type for non-string input."""
    # An int, a list and None, tried in that order.
    for bad_document in (123, [], None):
        with pytest.raises(TypeError, match="Expected a string"):
            LexborHTMLParser(bad_document)
|
|
||
|
|
||
def test_clone_handling():
    """Check that both a parser and its clone can be serialized after cloning."""
    source = LexborHTMLParser("<div>test</div>")
    duplicate = source.clone()

    # The clone is serialized first, then the parser it was cloned from.
    assert duplicate.html is not None
    assert source.html is not None
|
|
||
|
|
||
def test_concurrent_parsing():
    """Test that concurrent parsing doesn't cause race conditions."""
    collected_texts = []
    failures = []
    guard = threading.Lock()

    def worker(markup):
        """Parse one document and record its body text or the raised error."""
        try:
            parser = LexborHTMLParser(markup)
            text = parser.body.text()
        except Exception as exc:
            with guard:
                failures.append(exc)
            return
        # Only non-empty text is recorded, so an empty extraction shows up
        # as a missing entry in the final count.
        if text:
            with guard:
                collected_texts.append(text)

    template = "<div>Content {}</div>"
    workers = [
        threading.Thread(target=worker, args=(template.format(index),))
        for index in range(50)
    ]

    # Start every thread before joining any, so the parses can overlap.
    for thread in workers:
        thread.start()
    for thread in workers:
        thread.join()

    assert len(failures) == 0
    assert len(collected_texts) == 50
|
|
||
|
|
||
def test_css_selector_error_handling():
    """Exercise ``css()`` with wrongly typed and questionable selectors.

    Non-string selectors must raise ``TypeError``. Each questionable selector
    string must either return a list or raise ``SelectolaxError``.
    """
    parser = LexborHTMLParser("<div class='test'>content</div>")

    # Invalid selector types should raise TypeError
    for bad_selector in (123, None):
        with pytest.raises(TypeError):
            parser.css(bad_selector)

    questionable_selectors = (
        ":::",
        "[[[",
        "div{color:red}",
        'h3:contains("some substring")',
    )

    for selector in questionable_selectors:
        try:
            matches = parser.css(selector)
        except SelectolaxError:
            # Specific parsing errors are acceptable
            continue
        # Otherwise the call must have produced a list.
        assert isinstance(matches, list)
|
|
||
|
|
||
def test_null_pointer_safety():
    """Test that NULL pointer checks prevent crashes."""
    # Inputs chosen because they might result in NULL pointers internally.
    edge_case_documents = (
        "",  # Empty HTML
        "<>",  # Empty tag
        "<!>",  # Empty declaration
        "<html></html>",  # Minimal valid HTML
    )
    property_names = ("root", "head", "body", "html")

    for document in edge_case_documents:
        parser = LexborHTMLParser(document)
        # Reading each property is the whole check: it must not raise.
        for name in property_names:
            getattr(parser, name)
|
|
||
|
|
||
def test_decompose_root_node():
    """Expect ``SelectolaxError`` when trying to decompose the root node."""
    parser = LexborHTMLParser("<div><p>test</p></div>")

    with pytest.raises(SelectolaxError):
        parser.root.decompose()
|
|
||
|
|
||
def test_empty_attribute_lexbor():
    """Check that an attribute set to ``None`` is serialized without a value."""
    element = create_tag("div")
    element.attrs["hidden"] = None

    expected = "<div hidden></div>"
    assert element.html == expected
|
|
||
|
|
||
def test_pseudo_class_contains():
    """Select a paragraph by its text with ``:lexbor-contains`` and the ``i`` flag."""
    markup = "<div><p>hello world</p><p id='main'>AwesOme t3xt</p></div>"
    tree = LexborHTMLParser(markup)

    # The needle is lower-case while the document text is mixed-case.
    matches = tree.css('p:lexbor-contains("awesome" i)')
    assert len(matches) == 1

    only_match = matches[0]
    assert only_match.text() == "AwesOme t3xt"
|
|
||
|
|
||
def test_css_matches_returns_bool():
    """Check that ``css_matches`` yields the ``True`` singleton for a matching selector."""
    parser = LexborHTMLParser("<div>test</div>")
    matched = parser.css_matches("div")

    # Both the type and the identity are checked, not just truthiness.
    assert isinstance(matched, bool)
    assert matched is True
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Corrected spelling of 'We'are' to 'We're'.