diff --git a/docs/undate/converters.rst b/docs/undate/converters.rst index b93b81e..719d065 100644 --- a/docs/undate/converters.rst +++ b/docs/undate/converters.rst @@ -4,10 +4,20 @@ Converters Overview -------- + +.. automodule:: undate.converters + +----- + .. automodule:: undate.converters.base :members: :undoc-members: + +.. autoclass:: undate.converters.combined.OmnibusDateConverter + :members: + + Formats -------- @@ -33,6 +43,8 @@ Extended Date-Time Format (EDTF) Calendars --------- +.. automodule:: undate.converters.calendars + Gregorian ^^^^^^^^^ diff --git a/src/undate/converters/__init__.py b/src/undate/converters/__init__.py index e13532d..c13f2f1 100644 --- a/src/undate/converters/__init__.py +++ b/src/undate/converters/__init__.py @@ -1 +1,29 @@ -from undate.converters.base import BaseDateConverter as BaseDateConverter +""" +Converter classes add support for parsing and serializing dates +in a variety of formats. A subset of these are calendar converters +(:mod:`undate.converters.calendars`), which means they support both parsing +and conversion from an alternate calendar to a common Gregorian calendar +for comparison across dates. + +To parse a date with a supported converter, use the ``Undate`` class method +:meth:`~undate.undate.Undate.parse` and specify the date as a string +with the desired format or calendar, e.g. + +.. code-block:: + + Undate.parse("2001-05", "EDTF") + Undate.parse("7 Heshvan 5425", "Hebrew") + +For converters that support it, you can also serialize a date in a specified +format with ``Undate`` class method :meth:`~undate.undate.Undate.format`: + +.. 
code-block:: + + Undate.parse("Rabīʿ ath-Thānī 343", "Islamic").format("EDTF") + + +""" + +from undate.converters.base import BaseDateConverter, GRAMMAR_FILE_PATH + +__all__ = ["BaseDateConverter", "GRAMMAR_FILE_PATH"] diff --git a/src/undate/converters/base.py b/src/undate/converters/base.py index 1cf1b6d..93a63a7 100644 --- a/src/undate/converters/base.py +++ b/src/undate/converters/base.py @@ -44,6 +44,7 @@ import importlib import logging +import pathlib import pkgutil from functools import cache from typing import Dict, Type @@ -53,6 +54,10 @@ logger = logging.getLogger(__name__) +#: Path to parser grammar files +GRAMMAR_FILE_PATH = pathlib.Path(__file__).parent / "grammars" + + class BaseDateConverter: """Base class for parsing, formatting, and converting dates to handle specific formats and different calendars.""" diff --git a/src/undate/converters/calendars/hebrew/parser.py b/src/undate/converters/calendars/hebrew/parser.py index 5654f60..3056f85 100644 --- a/src/undate/converters/calendars/hebrew/parser.py +++ b/src/undate/converters/calendars/hebrew/parser.py @@ -1,8 +1,8 @@ -import pathlib - from lark import Lark -grammar_path = pathlib.Path(__file__).parent / "hebrew.lark" +from undate.converters import GRAMMAR_FILE_PATH + +grammar_path = GRAMMAR_FILE_PATH / "hebrew.lark" with open(grammar_path) as grammar: # NOTE: LALR parser is faster but can't be used to ambiguity between years and dates diff --git a/src/undate/converters/calendars/hebrew/transformer.py b/src/undate/converters/calendars/hebrew/transformer.py index 8880434..1ca8c39 100644 --- a/src/undate/converters/calendars/hebrew/transformer.py +++ b/src/undate/converters/calendars/hebrew/transformer.py @@ -26,12 +26,13 @@ def hebrew_date(self, items): # initialize and return an undate with year, month, day and # configured calendar (hebrew by default) + # NOTE: use self.calendar so Seleucid can extend more easily return Undate(**parts, calendar=self.calendar) - # year translation is not needed 
since we want a tree with name year - # this is equivalent to a no-op - # def year(self, items): - # return Tree(data="year", children=[items[0]]) + def year(self, items): + # combine multiple parts into a single string + value = "".join([str(i) for i in items]) + return Tree(data="year", children=[value]) def month(self, items): # month has a nested tree for the rule and the value diff --git a/src/undate/converters/calendars/islamic/parser.py b/src/undate/converters/calendars/islamic/parser.py index b103711..61a0cf0 100644 --- a/src/undate/converters/calendars/islamic/parser.py +++ b/src/undate/converters/calendars/islamic/parser.py @@ -1,8 +1,8 @@ -import pathlib - from lark import Lark -grammar_path = pathlib.Path(__file__).parent / "islamic.lark" +from undate.converters import GRAMMAR_FILE_PATH + +grammar_path = GRAMMAR_FILE_PATH / "islamic.lark" with open(grammar_path) as grammar: # NOTE: LALR parser is faster but can't be used due to ambiguity between years and days diff --git a/src/undate/converters/calendars/islamic/transformer.py b/src/undate/converters/calendars/islamic/transformer.py index 9ffce36..7310d86 100644 --- a/src/undate/converters/calendars/islamic/transformer.py +++ b/src/undate/converters/calendars/islamic/transformer.py @@ -28,8 +28,17 @@ def islamic_date(self, items): # year translation is not needed since we want a tree with name year # this is equivalent to a no-op - # def year(self, items): - # return Tree(data="year", children=[items[0]]) + def year(self, items): + # combine multiple parts into a single string + # (for some reason we're getting an anonymous token in combined parser) + value = "".join([str(i) for i in items]) + return Tree(data="year", children=[value]) + + def day(self, items): + # combine multiple parts into a single string + # (for some reason we're getting an anonymous token in combined parser) + value = "".join([str(i) for i in items]) + return Tree(data="day", children=[value]) def month(self, items): # month has a 
nested tree for the rule and the value diff --git a/src/undate/converters/combined.py b/src/undate/converters/combined.py new file mode 100644 index 0000000..54d66a5 --- /dev/null +++ b/src/undate/converters/combined.py @@ -0,0 +1,85 @@ +""" +**Experimental** combined parser. Supports EDTF, Hebrew, and Hijri +where dates are unambiguous. (Year-only dates are parsed as EDTF in +Gregorian calendar.) +""" + +from typing import Union + +from lark import Lark +from lark.exceptions import UnexpectedCharacters +from lark.visitors import Transformer, merge_transformers + +from undate import Undate, UndateInterval +from undate.converters import BaseDateConverter, GRAMMAR_FILE_PATH +from undate.converters.edtf.transformer import EDTFTransformer +from undate.converters.calendars.hebrew.transformer import HebrewDateTransformer +from undate.converters.calendars.islamic.transformer import IslamicDateTransformer + + +class CombinedDateTransformer(Transformer): + def start(self, children): + # trigger the transformer for the appropriate part of the grammar + return children + + +# NOTE: currently year-only dates in combined parser are interpreted as +# EDTF and use Gregorian calendar. +# In future, we could refine by adding calendar names & abbreviations +# to the parser in order to recognize years from other calendars. + +combined_transformer = merge_transformers( + CombinedDateTransformer(), + edtf=EDTFTransformer(), + hebrew=HebrewDateTransformer(), + islamic=IslamicDateTransformer(), +) + + +# open based on filename so we can specify relative import path based on grammar file +parser = Lark.open( + str(GRAMMAR_FILE_PATH / "combined.lark"), rel_to=__file__, strict=True +) + + +class OmnibusDateConverter(BaseDateConverter): + """ + Combination parser that aggregates existing parser grammars. + Currently supports EDTF, Hebrew, and Hijri where dates are unambiguous. + (Year-only dates are parsed as EDTF in Gregorian calendar.) + + Does not support serialization. 
+ + Example usage:: + + Undate.parse("Tammuz 4816", "omnibus") + + """ + + #: converter name: omnibus + name: str = "omnibus" + + def __init__(self): + self.transformer = combined_transformer + + def parse(self, value: str) -> Union[Undate, UndateInterval]: + """ + Parse a string in a supported format and return an :class:`~undate.undate.Undate` + or :class:`~undate.undate.UndateInterval`. + """ + if not value: + raise ValueError("Parsing empty/unset string is not supported") + + # parse the input string, then transform to undate object + try: + parsetree = parser.parse(value) + # transform returns a list; we want the first item in the list + return self.transformer.transform(parsetree)[0] + except UnexpectedCharacters: + raise ValueError( + "Parsing failed: '%s' is not in a recognized date format" % value + ) + + def to_string(self, undate: Union[Undate, UndateInterval]) -> str: + "Not supported by this converter. Will raise :class:`ValueError`" + raise ValueError("Omnibus converter does not support serialization") diff --git a/src/undate/converters/edtf/parser.py b/src/undate/converters/edtf/parser.py index 27c2bd6..bc8f0ef 100644 --- a/src/undate/converters/edtf/parser.py +++ b/src/undate/converters/edtf/parser.py @@ -1,8 +1,8 @@ -import pathlib - from lark import Lark -grammar_path = pathlib.Path(__file__).parent / "edtf.lark" +from undate.converters import GRAMMAR_FILE_PATH + +grammar_path = GRAMMAR_FILE_PATH / "edtf.lark" with open(grammar_path) as grammar: edtf_parser = Lark(grammar.read(), start="edtf") diff --git a/src/undate/converters/edtf/transformer.py b/src/undate/converters/edtf/transformer.py index 0b1de76..3167248 100644 --- a/src/undate/converters/edtf/transformer.py +++ b/src/undate/converters/edtf/transformer.py @@ -66,7 +66,10 @@ def day_unspecified(self, items): def date_level1(self, items): return self.date(items) - # year (including negative years) use default transformation + def year(self, items): + # combine parts (numeric & unknown) into 
a single string + value = "".join(self.get_values(items)) + return Tree(data="year", children=[value]) def year_fivedigitsplus(self, items): # strip off the leading Y and convert to integer diff --git a/src/undate/converters/grammars/combined.lark b/src/undate/converters/grammars/combined.lark new file mode 100644 index 0000000..0e77b5c --- /dev/null +++ b/src/undate/converters/grammars/combined.lark @@ -0,0 +1,32 @@ +%import common.WS +%ignore WS + +start: (edtf__start | hebrew__hebrew_date | islamic__islamic_date ) + +// Renaming of the import variables is required, as they receive the namespace of this file. +// See: https://github.com/lark-parser/lark/pull/973#issuecomment-907287565 + +// All grammars are in the same directory, so we can use relative imports + +// relative import from edtf.lark +%import .edtf.edtf -> edtf__start + +// relative import from hebrew.lark +%import .hebrew.hebrew_date -> hebrew__hebrew_date +%import .hebrew.day -> hebrew__day +%import .hebrew.month -> hebrew__month +%import .hebrew.year -> hebrew__year + +// relative import from islamic.lark +%import .islamic.islamic_date -> islamic__islamic_date +%import .islamic.day -> islamic__day +%import .islamic.month -> islamic__month +%import .islamic.year -> islamic__year + + +// override hebrew date to omit year-only, since year without calendar is ambiguous +// NOTE: potentially support year with calendar label +%override hebrew__hebrew_date: hebrew__day hebrew__month hebrew__year | hebrew__month hebrew__year + +// same for islamic date, year alone is ambiguous +%override islamic__islamic_date: islamic__day islamic__month islamic__year | islamic__month islamic__year diff --git a/src/undate/converters/edtf/edtf.lark b/src/undate/converters/grammars/edtf.lark similarity index 100% rename from src/undate/converters/edtf/edtf.lark rename to src/undate/converters/grammars/edtf.lark diff --git a/src/undate/converters/calendars/hebrew/hebrew.lark b/src/undate/converters/grammars/hebrew.lark similarity 
index 85% rename from src/undate/converters/calendars/hebrew/hebrew.lark rename to src/undate/converters/grammars/hebrew.lark index 6f4244c..118ed98 100644 --- a/src/undate/converters/calendars/hebrew/hebrew.lark +++ b/src/undate/converters/grammars/hebrew.lark @@ -11,23 +11,23 @@ hebrew_date: weekday? day month comma? year | month year | year // PGP dates use qualifiers like "first decade of" (for beginning of month) // "first third of", seasons (can look for more examples) -// Hebrew calendar starts with year 1 in 3761 BCE +// Hebrew calendar starts with year 1 in 3761 BCE year: /\d+/ // months month: month_1 | month_2 - | month_3 - | month_4 - | month_5 - | month_6 - | month_7 - | month_8 - | month_9 - | month_10 - | month_11 - | month_12 - | month_13 + | month_3 + | month_4 + | month_5 + | month_6 + | month_7 + | month_8 + | month_9 + | month_10 + | month_11 + | month_12 + | month_13 // months have 29 or 30 days; we do not expect leading zeroes day: /[1-9]/ | /[12][0-9]/ | /30/ diff --git a/src/undate/converters/calendars/islamic/islamic.lark b/src/undate/converters/grammars/islamic.lark similarity index 100% rename from src/undate/converters/calendars/islamic/islamic.lark rename to src/undate/converters/grammars/islamic.lark diff --git a/tests/test_converters/test_combined_parser.py b/tests/test_converters/test_combined_parser.py new file mode 100644 index 0000000..717a16e --- /dev/null +++ b/tests/test_converters/test_combined_parser.py @@ -0,0 +1,54 @@ +import pytest + +from undate.converters.combined import parser, combined_transformer + +from undate import Undate, UndateInterval + +# test that valid dates can be parsed + +testcases = [ + # EDTF + ("1984", Undate(1984)), + ("201X", Undate("201X")), + ("20XX", Undate("20XX")), + ("2004-XX", Undate(2004, "XX")), + ("1000/2000", UndateInterval(Undate(1000), Undate(2000))), + # Hebrew / Anno Mundi calendar + ("Tammuz 4816", Undate(4816, 4, calendar="Hebrew")), + # Islamic / Hijri calendar + ("Jumādā I 1243", 
Undate(1243, 5, calendar="Islamic")), + ("7 Jumādā I 1243", Undate(1243, 5, 7, calendar="Islamic")), + ("14 Rabīʿ I 901", Undate(901, 3, 14, calendar="Islamic")), +] + + +@pytest.mark.parametrize("date_string,expected", testcases) +def test_transform(date_string, expected): + # test the transformer directly + transformer = combined_transformer + # parse the input string, then transform to undate object + parsetree = parser.parse(date_string) + # since the same unknown date is not considered strictly equal, + # compare object representations + transformed_date = transformer.transform(parsetree) + assert repr(transformed_date[0]) == repr(expected) + + +@pytest.mark.parametrize("date_string,expected", testcases) +def test_converter(date_string, expected): + # should work the same way when called through the converter class + assert repr(Undate.parse(date_string, "omnibus")) == repr(expected) + + +def test_parse_errors(): + # empty string not supported + with pytest.raises(ValueError, match="not supported"): + Undate.parse("", "omnibus") + + with pytest.raises(ValueError, match="not in a recognized date format"): + Undate.parse("Monday 2023", "omnibus") + + +def test_no_serialize(): + with pytest.raises(ValueError, match="does not support"): + Undate("2022").format("omnibus")