diff --git a/src/__init__.py b/src/__init__.py
index b3afeca..f0c726e 100644
--- a/src/__init__.py
+++ b/src/__init__.py
@@ -1,2 +1,5 @@
 from .tokenizer import Tokenizer
 from .parser import Parser
+from .token import Token
+from .fqn import FQN
+from .scope import Scope
diff --git a/src/fqn.py b/src/fqn.py
index 06ce3c3..f7abba0 100644
--- a/src/fqn.py
+++ b/src/fqn.py
@@ -6,6 +6,19 @@
 @dataclass
 class FQN:
+    """
+    Represents a fully qualified name (FQN) of a function, method, or symbol.
+
+    Attributes:
+        name (str): The simple (unqualified) name.
+        full_name (str): The fully qualified name (e.g., including namespaces or modules).
+        return_type (Optional[str]): The return type of the symbol, if known.
+        args (Optional[List[str]]): The list of argument types or names.
+        scopes (Optional[List[Scope]]): The lexical or semantic scopes the symbol belongs to.
+        template (Optional[str]): Template/generic type information, if applicable.
+        constant (bool): Whether the symbol is const-qualified.
+        volatile (bool): Whether the symbol is volatile-qualified (e.g., a C++ `volatile` member function).
+    """
     name: str
     full_name: str
     return_type: Optional[str] = None
@@ -16,6 +29,15 @@ class FQN:
     volatile: bool = False
 
     def __eq__(self, other: object) -> bool:
+        """
+        Compare this FQN instance with another for structural equality.
+
+        Args:
+            other (object): Another object to compare with.
+
+        Returns:
+            bool: True if `other` is an FQN with equal attributes, False otherwise.
+        """
         return (isinstance(other, FQN) and
                 self.name == other.name and
                 self.full_name == other.full_name and
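For context, a minimal sketch of how the newly documented FQN dataclass compares; the field values here are hypothetical, not taken from this repository's tests:

    from src import FQN, Scope

    a = FQN(name="at", full_name="std::vector<int>::at(size_t) const",
            args=["size_t"], constant=True,
            scopes=[Scope("std"), Scope("vector", "<int>")])
    b = FQN(name="at", full_name="std::vector<int>::at(size_t) const",
            args=["size_t"], constant=True,
            scopes=[Scope("std"), Scope("vector", "<int>")])
    assert a == b                                        # structural equality via __eq__
    assert a != FQN(name="at", full_name="at(size_t)")   # any differing attribute breaks equality
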
+ """ + token: Optional[Token] = self._peek() if token is None: raise SyntaxError(f"Unexpected end of input, expected: '{expected_type}'") if expected_type and token.type_ != expected_type: raise SyntaxError(f"Expected token type '{token.type_}' with value '{token.value}'. " f"Expected type: '{expected_type}'") - self.cursor -= 1 + self.__cursor -= 1 return token - def match(self, token_type: str) -> bool: - token = self.peek() + def _match(self, token_type: str) -> bool: + """ + Checks if the current token matches a given type. + + Args: + token_type (str): The token type to check. + + Returns: + bool: True if the current token matches the type, False otherwise. + """ + token = self._peek() return token is not None and token.type_ == token_type def parse(self) -> FQN: - fqn_qualifiers: Dict[str, bool] = self.parse_qualifiers() - fqn_args: Optional[List[str]] = self.parse_args() - fqn_template: Optional[str] = self.parse_template() - fqn_name: str = self.parse_name() - fqn_scopes: Optional[List[Scope]] = self.parse_scopes() - fqn_return_type: Optional[str] = self.parse_return_type() + """ + Parses the entire input string into an FQN object. + + Returns: + FQN: The parsed fully qualified name structure. + """ + fqn_qualifiers: Dict[str, bool] = self._parse_qualifiers() + fqn_args: Optional[List[str]] = self._parse_args() + fqn_template: Optional[str] = self._parse_template() + fqn_name: str = self._parse_name() + fqn_scopes: Optional[List[Scope]] = self._parse_scopes() + fqn_return_type: Optional[str] = self._parse_return_type() return FQN(name=fqn_name, full_name=self.string, return_type=fqn_return_type, @@ -46,11 +101,17 @@ def parse(self) -> FQN: constant=fqn_qualifiers["constant"], volatile=fqn_qualifiers["volatile"]) - def parse_qualifiers(self) -> Dict[str, bool]: - if not self.match("MEMBER"): + def _parse_qualifiers(self) -> Dict[str, bool]: + """ + Parses trailing qualifiers like 'const' and 'volatile'. + + Returns: + Dict[str, bool]: A dictionary with boolean flags: {'constant': bool, 'volatile': bool} + """ + if not self._match("MEMBER"): return {"constant": False, "volatile": False} - token: Token = self.consume("MEMBER") + token: Token = self._consume("MEMBER") constant: bool = token.value == "const" volatile: bool = token.value == "volatile" @@ -58,40 +119,46 @@ def parse_qualifiers(self) -> Dict[str, bool]: raise SyntaxError("FQN has no arguments. 
" f"Last token is '{token.value}' but should be 'const', 'volatile' or ')'.") - if not self.match("WHITESPACE"): - _temp: Optional[Token] = self.peek() + if not self._match("WHITESPACE"): + _temp: Optional[Token] = self._peek() raise SyntaxError(f"Expected WHITESPACE, found '{_temp.type_ if _temp else None}'") - self.consume("WHITESPACE") + self._consume("WHITESPACE") - if not self.match("MEMBER"): + if not self._match("MEMBER"): return {"constant": constant, "volatile": volatile} - token = self.consume("MEMBER") + token = self._consume("MEMBER") constant = token.value == "const" if not constant else constant volatile = token.value == "volatile" if not volatile else volatile - if not self.match("WHITESPACE"): - _temp = self.peek() + if not self._match("WHITESPACE"): + _temp = self._peek() raise SyntaxError(f"Expected WHITESPACE, found '{_temp.type_ if _temp else None}'") - self.consume("WHITESPACE") + self._consume("WHITESPACE") return {"constant": constant, "volatile": volatile} - def parse_args(self) -> Optional[List[str]]: - if not self.match("PARENTHESIS_END"): - _temp: Optional[Token] = self.peek() + def _parse_args(self) -> Optional[List[str]]: + """ + Parses function arguments inside parentheses. + + Returns: + Optional[List[str]]: The list of argument strings, or None if no arguments found. + """ + if not self._match("PARENTHESIS_END"): + _temp: Optional[Token] = self._peek() raise SyntaxError(f"Expected ')', but found {_temp.value if _temp else None}") - self.consume("PARENTHESIS_END") + self._consume("PARENTHESIS_END") args_list: List[List[str]] = [[]] counter: int = 0 - while not self.match("PARENTHESIS_START"): - if self.match("SEPARATOR"): + while not self._match("PARENTHESIS_START"): + if self._match("SEPARATOR"): counter += 1 args_list.append([]) - self.consume("SEPARATOR") - args_list[counter].append(self.consume().value) - self.consume("PARENTHESIS_START") + self._consume("SEPARATOR") + args_list[counter].append(self._consume().value) + self._consume("PARENTHESIS_START") args: List[str] = [''.join(arg[::-1]) for arg in args_list] @@ -100,37 +167,61 @@ def parse_args(self) -> Optional[List[str]]: return args[::-1] - def parse_template(self) -> Optional[str]: - if self.match("WHITESPACE"): - self.consume("WHITESPACE") + def _parse_template(self) -> Optional[str]: + """ + Parses template type parameters if present. + + Returns: + Optional[str]: The raw template string, or None. + """ + if self._match("WHITESPACE"): + self._consume("WHITESPACE") template: Optional[str] = None - if self.match("TEMPLATE_END"): - template = self.parse_nested_templates() + if self._match("TEMPLATE_END"): + template = self._parse_nested_templates() return template - def parse_name(self) -> str: - if self.match("WHITESPACE"): - self.consume("WHITESPACE") + def _parse_name(self) -> str: + """ + Parses the function or symbol name. + + Returns: + str: The unqualified name. + + Raises: + SyntaxError: If a valid member token is not found. 
+ """ + if self._match("WHITESPACE"): + self._consume("WHITESPACE") - if not self.match("MEMBER"): - _temp: Optional[Token] = self.peek() + if not self._match("MEMBER"): + _temp: Optional[Token] = self._peek() raise SyntaxError(f"Expected 'MEMBER', but found '{_temp.type_ if _temp else 'None'}'") - name: str = self.consume("MEMBER").value + name: str = self._consume("MEMBER").value return name - def parse_nested_templates(self) -> str: - if not self.match("TEMPLATE_END"): - _temp: Optional[Token] = self.peek() + def _parse_nested_templates(self) -> str: + """ + Parses a possibly nested set of template tokens. + + Returns: + str: The raw template string (reversed back to original order). + + Raises: + SyntaxError: If improper template structure is found. + """ + if not self._match("TEMPLATE_END"): + _temp: Optional[Token] = self._peek() raise SyntaxError(f"Expected '>', but found '{_temp.value if _temp else 'None'}'") - tokens: List[str] = [self.consume("TEMPLATE_END").value] + tokens: List[str] = [self._consume("TEMPLATE_END").value] depth: int = 1 while depth > 0: - token: Token = self.consume() + token: Token = self._consume() tokens.append(token.value) if token.type_ == "TEMPLATE_END": depth += 1 @@ -139,31 +230,43 @@ def parse_nested_templates(self) -> str: return ''.join(tokens[::-1]) - def parse_scopes(self) -> Optional[List[Scope]]: - if not self.match("SCOPE"): + def _parse_scopes(self) -> Optional[List[Scope]]: + """ + Parses namespace or class scopes, if present. + + Returns: + Optional[List[Scope]]: A list of Scope objects, or None if no scopes found. + """ + if not self._match("SCOPE"): return None scopes: List[Scope] = [] - while not self.match("WHITESPACE") and self.peek(): - self.consume("SCOPE") - template: Optional[str] = self.parse_nested_templates() if self.match("TEMPLATE_END") else None - token: Token = self.consume("MEMBER") + while not self._match("WHITESPACE") and self._peek(): + self._consume("SCOPE") + template: Optional[str] = self._parse_nested_templates() if self._match("TEMPLATE_END") else None + token: Token = self._consume("MEMBER") scopes.append(Scope(token.value, template)) return scopes[::-1] - def parse_return_type(self) -> Optional[str]: - if self.match("WHITESPACE"): - self.consume("WHITESPACE") + def _parse_return_type(self) -> Optional[str]: + """ + Parses any tokens remaining at the start of the string as a return type. + + Returns: + Optional[str]: The return type as a string, or None if not found. + """ + if self._match("WHITESPACE"): + self._consume("WHITESPACE") - if not self.peek(): + if not self._peek(): return None return_type: List[str] = [] - while self.peek(): - token = self.consume() + while self._peek(): + token = self._consume() return_type.append(token.value) return ''.join(return_type[::-1]) diff --git a/src/scope.py b/src/scope.py index 9beab62..49feadf 100644 --- a/src/scope.py +++ b/src/scope.py @@ -4,6 +4,13 @@ @dataclass class Scope: + """ + Represents a lexical or semantic scope, such as a namespace, class, or function context. + + Attributes: + name (str): The name of the scope (e.g., function name, class name, or module). + template (Optional[str]): Template or generic parameter associated with the scope, if any. + """ name: str template: Optional[str] = None diff --git a/src/tokenizer.py b/src/tokenizer.py index e133782..f7585ba 100644 --- a/src/tokenizer.py +++ b/src/tokenizer.py @@ -20,21 +20,46 @@ class Tokenizer: + """ + A simple tokenizer that converts a string into a stream of tokens based on predefined patterns. 
diff --git a/src/tokenizer.py b/src/tokenizer.py
index e133782..f7585ba 100644
--- a/src/tokenizer.py
+++ b/src/tokenizer.py
@@ -20,21 +20,46 @@
 class Tokenizer:
+    """
+    A simple tokenizer that converts a string into a stream of tokens based on predefined patterns.
+
+    Attributes:
+        string (str): The input string to tokenize.
+        __cursor (int): Internal cursor tracking the current position in the input string.
+    """
     def __init__(self, string: str) -> None:
+        """
+        Initializes the tokenizer with the input string.
+
+        Args:
+            string (str): The string to be tokenized.
+        """
         self.string: str = string
-        self.cursor: int = 0
+        self.__cursor: int = 0
 
-    def is_eof(self) -> bool:
-        return self.cursor == len(self.string)
+    def _has_more_tokens(self) -> bool:
+        """
+        Checks whether there are more tokens to extract.
 
-    def has_more_tokens(self) -> bool:
-        return self.cursor < len(self.string)
+        Returns:
+            bool: True if there are unprocessed characters in the input string.
+        """
+        return self.__cursor < len(self.string)
 
     def get_next_token(self) -> Optional[Token]:
-        if not self.has_more_tokens():
+        """
+        Extracts the next token from the input string.
+
+        Returns:
+            Optional[Token]: The next token, or None if the end of input is reached.
+
+        Raises:
+            SyntaxError: If an unrecognized token is encountered.
+        """
+        if not self._has_more_tokens():
             return None
 
-        string: str = self.string[self.cursor:]
+        string: str = self.string[self.__cursor:]
 
         for pattern, token_type in _SPEC:
             token_value: Optional[str] = self._match(pattern, string)
@@ -47,14 +72,30 @@ def get_next_token(self) -> Optional[Token]:
         raise SyntaxError(f"Unexpected token '{string[0]}'")
 
     def _match(self, pattern: str | Pattern[str], string: str) -> Optional[str]:
+        """
+        Tries to match a regex pattern at the start of the input string.
+
+        Args:
+            pattern (str | Pattern[str]): The regex pattern to match.
+            string (str): The remaining string to match against.
+
+        Returns:
+            Optional[str]: The matched string, or None if no match was found.
+        """
         matched: Optional[Match] = re.match(pattern, string)
         if matched is not None:
             token: str = matched[0]
-            self.cursor += len(token)
+            self.__cursor += len(token)
             return token
         return None
 
     def get_all_tokens(self) -> Iterator[Token]:
-        self.cursor = 0
+        """
+        Tokenizes the entire input string.
+
+        Yields:
+            Token: Each token, in the order it appears in the input.
+        """
+        self.__cursor = 0
         while token := self.get_next_token():
             yield token
diff --git a/tests/conftest.py b/tests/conftest.py
index ad7153c..15ab1bb 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -84,4 +84,4 @@ def parser_expected():
                scopes=scopes,
                template=None,
                constant=False,
-               volatile=True)
\ No newline at end of file
+               volatile=True)
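Finally, a hedged sketch of driving the renamed tokenizer directly; the concrete token types come from _SPEC, which this diff does not show, so the types in the comment below are assumptions:

    from src import Tokenizer

    for token in Tokenizer("foo(int)").get_all_tokens():
        print(token.type_, repr(token.value))
    # Plausibly: MEMBER 'foo', PARENTHESIS_START '(', MEMBER 'int', PARENTHESIS_END ')'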