From 88864d7afa639d69cc9d2a60d6d7598d7187f74d Mon Sep 17 00:00:00 2001 From: Dylan Lukes Date: Thu, 23 Jan 2020 17:15:38 -0800 Subject: [PATCH 1/6] replace re => regex --- baron/grouper.py | 2 +- baron/tokenizer.py | 2 +- baron/utils.py | 2 +- requirements.txt | 1 + setup.py | 2 +- 5 files changed, 5 insertions(+), 4 deletions(-) diff --git a/baron/grouper.py b/baron/grouper.py index 70ef033e..39289a5d 100644 --- a/baron/grouper.py +++ b/baron/grouper.py @@ -1,6 +1,6 @@ # encoding: utf-8 -import re +import regex as re from .utils import FlexibleIterator to_group = ( diff --git a/baron/tokenizer.py b/baron/tokenizer.py index f4bb60b7..1eacd7bf 100644 --- a/baron/tokenizer.py +++ b/baron/tokenizer.py @@ -1,4 +1,4 @@ -import re +import regex as re from .utils import BaronError diff --git a/baron/utils.py b/baron/utils.py index f90b630b..bd4512ed 100644 --- a/baron/utils.py +++ b/baron/utils.py @@ -1,5 +1,5 @@ import sys -import re +import regex as re python_version = sys.version_info[0] diff --git a/requirements.txt b/requirements.txt index 31ad5809..e84fabe6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1,2 @@ rply +regex \ No newline at end of file diff --git a/setup.py b/setup.py index a9740c02..fa2ad351 100644 --- a/setup.py +++ b/setup.py @@ -18,7 +18,7 @@ long_description=read_md("README.md") + "\n\n" + open("CHANGELOG", "r").read(), author_email='cortex@worlddomination.be', url='https://github.com/PyCQA/baron', - install_requires=['rply'], + install_requires=['rply', 'regex'], packages=['baron'], license='lgplv3+', scripts=[], From 99514e97fd71c0a1eed62d0906f24b7016282f39 Mon Sep 17 00:00:00 2001 From: Dylan Lukes Date: Thu, 23 Jan 2020 17:19:54 -0800 Subject: [PATCH 2/6] NAME regex "[a-zA-Z_]\w*" => "[\p{XID_Start}_]\p{XID_Continue}*" --- baron/tokenizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/baron/tokenizer.py b/baron/tokenizer.py index 1eacd7bf..9c770e90 100644 --- a/baron/tokenizer.py +++ b/baron/tokenizer.py @@ -9,7 +9,7 @@ class UnknowItem(BaronError): KEYWORDS = ("and", "as", "assert", "break", "class", "continue", "def", "del", "elif", "else", "except", "exec", "finally", "for", "from", "global", "nonlocal", "if", "import", "in", "is", "lambda", "not", "or", "pass", "print", "raise", "return", "try", "while", "with", "yield") TOKENS = ( - (r'[a-zA-Z_]\w*', 'NAME'), + (r'[\p{XID_Start}_]\p{XID_Continue}*', 'NAME'), (r'0', 'INT'), (r'[-+]?\d+[eE][-+]?\d+[jJ]', 'FLOAT_EXPONANT_COMPLEX'), (r'[-+]?\d+.\d?[eE][-+]?\d+[jJ]', 'FLOAT_EXPONANT_COMPLEX'), From 9517d7e34ed6b78f2ae58f187945a0a5769a62cd Mon Sep 17 00:00:00 2001 From: Dylan Lukes Date: Thu, 23 Jan 2020 17:21:05 -0800 Subject: [PATCH 3/6] add test for unicode name --- tests/test_tokenizer.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py index ae865420..01d03809 100644 --- a/tests/test_tokenizer.py +++ b/tests/test_tokenizer.py @@ -22,6 +22,11 @@ def test_name__(): match('_a', 'NAME') +def test_name_unicode(): + match('β', 'NAME') + match('가사', 'NAME') + + def test_name_number(): match('a123', 'NAME') @@ -551,6 +556,7 @@ def test_exponant_complex(): match("-1.1E+1J", "FLOAT_EXPONANT_COMPLEX") match("-.1E+1J", "FLOAT_EXPONANT_COMPLEX") + # TODO 1.1e1j From 19d61efd0ec208b5155c96b1aa02f47937e0d2ba Mon Sep 17 00:00:00 2001 From: Dylan Lukes Date: Thu, 23 Jan 2020 22:12:38 -0800 Subject: [PATCH 4/6] support for splitting non-ASCII --- baron/spliter.py | 6 +++++- baron/utils.py | 41 +++++++++++++++++++++++++++++------------ tests/test_spliter.py | 12 ++++++++++-- 3 files changed, 44 insertions(+), 15 deletions(-) diff --git a/baron/spliter.py b/baron/spliter.py index e833d40e..956d8f19 100644 --- a/baron/spliter.py +++ b/baron/spliter.py @@ -1,5 +1,5 @@ import string -from .utils import FlexibleIterator, BaronError +from .utils import FlexibleIterator, BaronError, is_xid_start, is_xid_continue def split(sequence): @@ -58,6 +58,10 @@ def split_generator(sequence): not_found = False yield iterator.grab(lambda iterator: iterator.show_next() in section) + if iterator.next_is(is_xid_start) or iterator.next_is(is_xid_continue): + not_found = False + yield iterator.grab(lambda iterator: iterator.next_is(is_xid_start) or iterator.next_is(is_xid_continue)) + for one in "@,.;()=*:+-/^%&<>|\r\n~[]{}!``\\": if iterator.next_in(one): not_found = False diff --git a/baron/utils.py b/baron/utils.py index bd4512ed..cc7d709d 100644 --- a/baron/utils.py +++ b/baron/utils.py @@ -1,6 +1,7 @@ import sys -import regex as re +import unicodedata +import regex as re python_version = sys.version_info[0] python_subversion = sys.version_info[1] @@ -33,9 +34,12 @@ def next_starts_with(self, sentence): return self.sequence[self.position + 1: self.position + 1 + size_of_choice] == sentence def next_in(self, choice): + return self.next_is(lambda item: item in choice) + + def next_is(self, predicate): if self.position + 1 >= len(self.sequence): return False - return self.sequence[self.position + 1] in choice + return predicate(self.sequence[self.position + 1]) def show_next(self, at=1): if self.position + at >= len(self.sequence): @@ -106,6 +110,20 @@ def split_on_newlines(text): yield text[current_position:] +xid_start_regex = re.compile(r"\p{XID_Start}") + + +def is_xid_start(char): + return xid_start_regex.match(char) + + +xid_continue_regex = re.compile(r"\p{XID_Continue}") + + +def is_xid_continue(char): + return xid_continue_regex.match(char) + + # Thanks to # https://github.com/nvie/rq/commit/282f4be9316d608ebbacd6114aab1203591e8f95 if python_version >= 3 or python_subversion >= 7: @@ -115,26 +133,25 @@ def total_ordering(cls): """Class decorator that fills in missing ordering methods""" convert = { '__lt__': [('__gt__', lambda self, other: other < self), - ('__le__', lambda self, other: not other < self), - ('__ge__', lambda self, other: not self < other)], + ('__le__', lambda self, other: not other < self), + ('__ge__', lambda self, other: not self < other)], '__le__': [('__ge__', lambda self, other: other <= self), - ('__lt__', lambda self, other: not other <= self), - ('__gt__', lambda self, other: not self <= other)], + ('__lt__', lambda self, other: not other <= self), + ('__gt__', lambda self, other: not self <= other)], '__gt__': [('__lt__', lambda self, other: other > self), - ('__ge__', lambda self, other: not other > self), - ('__le__', lambda self, other: not self > other)], + ('__ge__', lambda self, other: not other > self), + ('__le__', lambda self, other: not self > other)], '__ge__': [('__le__', lambda self, other: other >= self), - ('__gt__', lambda self, other: not other >= self), - ('__lt__', lambda self, other: not self >= other)] + ('__gt__', lambda self, other: not other >= self), + ('__lt__', lambda self, other: not self >= other)] } roots = set(dir(cls)) & set(convert) if not roots: raise ValueError('must define at least one ordering operation: < > <= >=') # noqa - root = max(roots) # prefer __lt__ to __le__ to __gt__ to __ge__ + root = max(roots) # prefer __lt__ to __le__ to __gt__ to __ge__ for opname, opfunc in convert[root]: if opname not in roots: opfunc.__name__ = opname opfunc.__doc__ = getattr(int, opname).__doc__ setattr(cls, opname, opfunc) return cls - diff --git a/tests/test_spliter.py b/tests/test_spliter.py index 34402aa0..87fac56b 100644 --- a/tests/test_spliter.py +++ b/tests/test_spliter.py @@ -91,6 +91,10 @@ def test_assign(): assert split("a = b") == ["a", " ", "=", " ", "b"] +def test_assign_unicode(): + assert split("α = β") == ["α", " ", "=", " ", "β"] + + def test_call(): assert split("function()") == ["function", "(", ")"] @@ -247,7 +251,9 @@ def test_if(): def test_if_elif_else(): - assert split("if a:\n pass\nelif b:\n pass\nelse: \n pass") == ["if", " ", "a", ":", "\n", " ", "pass", "\n", "elif", " ", "b", ":", "\n", " ", "pass", "\n", "else", ":", " ", "\n", " ", "pass"] + assert split("if a:\n pass\nelif b:\n pass\nelse: \n pass") == ["if", " ", "a", ":", "\n", " ", "pass", "\n", + "elif", " ", "b", ":", "\n", " ", "pass", "\n", + "else", ":", " ", "\n", " ", "pass"] def test_while(): @@ -365,7 +371,9 @@ def test_backslash_in_comment(): def test_regression(): - assert split("(r'[\"\\'](.|\n|\r)*[\"\\']', 'STRING'),") == ["(", "r", "'[\"\\'](.|\n|\r)*[\"\\']'", ",", " ", "'STRING'", ")", ","] + assert split("(r'[\"\\'](.|\n|\r)*[\"\\']', 'STRING'),") == ["(", "r", "'[\"\\'](.|\n|\r)*[\"\\']'", ",", " ", + "'STRING'", ")", ","] + # TODO: make this test pass in python3 also # requires to remove dependency on ast.py From e6628bcdc58e9c80bd65fb01ddc5f0baa76d1307 Mon Sep 17 00:00:00 2001 From: Dylan Lukes Date: Fri, 24 Jan 2020 16:03:17 -0800 Subject: [PATCH 5/6] minor cleanup --- baron/spliter.py | 2 +- baron/utils.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/baron/spliter.py b/baron/spliter.py index 956d8f19..5f3dcc48 100644 --- a/baron/spliter.py +++ b/baron/spliter.py @@ -67,7 +67,7 @@ def split_generator(sequence): not_found = False yield next(iterator) - if iterator.show_next().__repr__().startswith("'\\x"): + if iterator.show_next().__repr__().startswith(r"'\x"): # guys, seriously, how do you manage to put this shit in your code? # I mean, I don't even know how this is possible! # example of guilty file: ve/lib/python2.7/site-packages/tests/test_oauth.py diff --git a/baron/utils.py b/baron/utils.py index cc7d709d..c4ce7c49 100644 --- a/baron/utils.py +++ b/baron/utils.py @@ -1,5 +1,4 @@ import sys -import unicodedata import regex as re From 4c202094412938e21dab6c9d108ea8edaac049a2 Mon Sep 17 00:00:00 2001 From: Dylan Lukes Date: Fri, 24 Jan 2020 16:28:41 -0800 Subject: [PATCH 6/6] add six, make unicode tests conditional on py version --- baron/render.py | 4 +++- baron/utils.py | 3 ++- requirements.txt | 3 ++- setup.py | 2 +- tests/test_path.py | 7 ++++--- tests/test_spliter.py | 9 +++++---- tests/test_tokenizer.py | 9 +++++---- 7 files changed, 22 insertions(+), 15 deletions(-) diff --git a/baron/render.py b/baron/render.py index 7addb110..6234a2cf 100644 --- a/baron/render.py +++ b/baron/render.py @@ -1,6 +1,8 @@ import sys import json +import six + def render(node, strict=False): """Recipe to render a given FST node. @@ -69,7 +71,7 @@ def render_node(node, strict=False): if key_type == "key": assert isinstance(node[render_key], (dict, type(None))), "Key '%s' is expected to have type of 'key' (dict/None) but has type of '%s' instead" % (render_key, type(node[render_key])) elif key_type == "string": - assert isinstance(node[render_key], str), "Key '%s' is expected to have type of 'string' but has type of '%s' instead" % (render_key, type(node[render_key])) + assert isinstance(node[render_key], six.string_types), "Key '%s' is expected to have type of 'string' but has type of '%s' instead" % (render_key, type(node[render_key])) elif key_type in ("list", "formatting"): assert isinstance(node[render_key], list), "Key '%s' is expected to have type of 'list' but has type of '%s' instead" % (render_key, type(node[render_key])) elif key_type == "constant": diff --git a/baron/utils.py b/baron/utils.py index c4ce7c49..ed3df74e 100644 --- a/baron/utils.py +++ b/baron/utils.py @@ -1,10 +1,11 @@ import sys import regex as re +import six python_version = sys.version_info[0] python_subversion = sys.version_info[1] -string_instance = str if python_version == 3 else basestring +string_instance = six.string_types # alias, for isinstance usage in redbaron class BaronError(Exception): diff --git a/requirements.txt b/requirements.txt index e84fabe6..92268f84 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,3 @@ rply -regex \ No newline at end of file +regex +six \ No newline at end of file diff --git a/setup.py b/setup.py index fa2ad351..5c96d51d 100644 --- a/setup.py +++ b/setup.py @@ -18,7 +18,7 @@ long_description=read_md("README.md") + "\n\n" + open("CHANGELOG", "r").read(), author_email='cortex@worlddomination.be', url='https://github.com/PyCQA/baron', - install_requires=['rply', 'regex'], + install_requires=['rply', 'regex', 'six'], packages=['baron'], license='lgplv3+', scripts=[], diff --git a/tests/test_path.py b/tests/test_path.py index 99063978..2e80d06d 100644 --- a/tests/test_path.py +++ b/tests/test_path.py @@ -1,8 +1,9 @@ +import six + from baron.baron import parse from baron.path import PathWalker, Position, BoundingBox -from baron.path import position_to_path, path_to_node, position_to_node from baron.path import path_to_bounding_box, node_to_bounding_box -from baron.utils import string_instance +from baron.path import position_to_path, path_to_node, position_to_node def test_position(): @@ -145,7 +146,7 @@ def check_path(code, positions, target_path): return node = path_to_node(tree, path) - assert isinstance(node, string_instance) + assert isinstance(node, six.string_types) assert position_to_node(tree, position) is node diff --git a/tests/test_spliter.py b/tests/test_spliter.py index 87fac56b..2f9bde6f 100644 --- a/tests/test_spliter.py +++ b/tests/test_spliter.py @@ -1,6 +1,6 @@ #!/usr/bin/python # -*- coding:Utf-8 -*- - +import six from baron.spliter import split, UntreatedError from baron.utils import python_version @@ -91,8 +91,9 @@ def test_assign(): assert split("a = b") == ["a", " ", "=", " ", "b"] -def test_assign_unicode(): - assert split("α = β") == ["α", " ", "=", " ", "β"] +if six.PY3: + def test_assign_unicode(): + assert split("α = β") == ["α", " ", "=", " ", "β"] def test_call(): @@ -377,6 +378,6 @@ def test_regression(): # TODO: make this test pass in python3 also # requires to remove dependency on ast.py -if python_version == 2: +if six.PY2: def test_remove_crap(): assert split("\x0c\xef\xbb\xbf") == [] diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py index 01d03809..da5ad037 100644 --- a/tests/test_tokenizer.py +++ b/tests/test_tokenizer.py @@ -1,6 +1,6 @@ #!/usr/bin/python # -*- coding:Utf-8 -*- - +import six from baron.tokenizer import tokenize, KEYWORDS @@ -22,9 +22,10 @@ def test_name__(): match('_a', 'NAME') -def test_name_unicode(): - match('β', 'NAME') - match('가사', 'NAME') +if six.PY3: + def test_name_unicode(): + match('β', 'NAME') + match('가사', 'NAME') def test_name_number():