diff --git a/src/openedx_tagging/api.py b/src/openedx_tagging/api.py index 767915e7..13f6a9a9 100644 --- a/src/openedx_tagging/api.py +++ b/src/openedx_tagging/api.py @@ -200,11 +200,12 @@ def get_object_tags( .select_related("taxonomy", "tag", "tag__parent", "tag__parent__parent") # Sort the tags within each taxonomy in "tree order". See Taxonomy._get_filtered_tags_deep for details on this: .annotate(sort_key=Lower(Concat( + # TODO: rewrite this to use a "WITH RECURSIVE" common table expression ? ConcatNull(F("tag__parent__parent__parent__value"), Value("\t")), ConcatNull(F("tag__parent__parent__value"), Value("\t")), ConcatNull(F("tag__parent__value"), Value("\t")), Coalesce(F("tag__value"), F("_value")), - Value("\t"), + Value("\t"), # Required for MySQL to sort correctly output_field=models.CharField(), ))) .annotate(taxonomy_name=Coalesce(F("taxonomy__name"), F("_export_id"))) diff --git a/src/openedx_tagging/models/base.py b/src/openedx_tagging/models/base.py index 668ba73e..11ac5fd5 100644 --- a/src/openedx_tagging/models/base.py +++ b/src/openedx_tagging/models/base.py @@ -5,7 +5,7 @@ import logging import re -from typing import List +from typing import List, Self from django.core.exceptions import ValidationError from django.db import models @@ -14,21 +14,23 @@ from django.utils.functional import cached_property from django.utils.module_loading import import_string from django.utils.translation import gettext_lazy as _ -from typing_extensions import Self # Until we upgrade to python 3.11 from openedx_django_lib.fields import MultiCollationTextField, case_insensitive_char_field, case_sensitive_char_field from ..data import TagDataQuerySet -from .utils import RESERVED_TAG_CHARS, ConcatNull +from .utils import RESERVED_TAG_CHARS log = logging.getLogger(__name__) -# Maximum depth allowed for a hierarchical taxonomy's tree of tags. +# Maximum depth of tags that can be retrieved from any single API call. +# Currently, taxonomies support unlimited depth but you can only retrieve up to three levels deep at one time. +# TODO: add an absolute limit as well. +# TODO: not all APIs work with deeper than 3 tags - e.g. get_object_tags will always return the correct +# results, but the order of tags below depth 4 won't be correct. TAXONOMY_MAX_DEPTH = 3 # Ancestry of a given tag; the Tag.value fields of a given tag and its parents, starting from the root. -# Will contain 0...TAXONOMY_MAX_DEPTH elements. Lineage = List[str] @@ -147,20 +149,34 @@ def depth(self) -> int: return depth @staticmethod - def annotate_depth(qs: models.QuerySet) -> models.QuerySet: + def annotate_depth( + qs: models.QuerySet, + from_tag_id=None, # depth from None means absolute depth (default) + depth_offset=0, # If using a "from tag" (subtree depth), specify an offset here to add to each tag's depth + ) -> models.QuerySet: """ Given a query that loads Tag objects, annotate it with the depth of each tag. - """ - return qs.annotate(depth=models.Case( - models.When(parent_id=None, then=0), - models.When(parent__parent_id=None, then=1), - models.When(parent__parent__parent_id=None, then=2), - models.When(parent__parent__parent__parent_id=None, then=3), - # If the depth is 4 or more, currently we just "collapse" the depth - # to 4 in order not to add too many joins to this query in general. - default=4, - )) + + Normally this refers to the "absolute depth" (where root tags with no + parent have depth 0, but you can also get the depth relative to some + other tag, i.e. subtree depth, by passing in the ID of a tag known to be + an ancestor of all the tags in the query set and its depth.) + """ + # fmt: off + return qs.annotate( + depth=models.Case( + models.When(parent_id=from_tag_id, then=depth_offset), # e.g. when no parent, depth = 0 + models.When(parent__parent_id=from_tag_id, then=depth_offset + 1), # e.g. when 1 parent, depth = 1 + models.When(parent__parent__parent_id=from_tag_id, then=depth_offset + 2), + models.When(parent__parent__parent__parent_id=from_tag_id, then=depth_offset + 3), + # If the depth is 4 or more, currently we just "collapse" the depth + # to 4 in order not to add too many joins to this query in general. + # But you can call this again using a new 'from_tag' to get accurate calculations at greater depths. + default=depth_offset + 4, + ) + ) + # fmt: on @cached_property def child_count(self) -> int: @@ -526,9 +542,14 @@ def _get_filtered_tags_deep( """ # All tags (possibly below a certain tag?) in the closed taxonomy, up to depth TAXONOMY_MAX_DEPTH if parent_tag_value: - main_parent_id = self.tag_for_value(parent_tag_value).pk + # Get a subtree, up to three levels deep below this tag: + main_parent = self.tag_for_value(parent_tag_value, select_related=["parent", "parent__parent"]) + main_parent_id = main_parent.pk + depth_offset = main_parent.depth + 1 else: + # Load the first three levels of the taxonomy. main_parent_id = None + depth_offset = 0 assert TAXONOMY_MAX_DEPTH == 3 # If we change TAXONOMY_MAX_DEPTH we need to change this query code: qs: models.QuerySet = self.tag_set.filter( @@ -574,19 +595,21 @@ def _get_filtered_tags_deep( qs = qs.annotate(descendant_count=F("child_count") + F("grandchild_count") + F("great_grandchild_count")) # Add the "depth" to each tag: - qs = Tag.annotate_depth(qs) + qs = Tag.annotate_depth(qs, from_tag_id=main_parent_id, depth_offset=depth_offset) # Add the "lineage" as a field called "sort_key" to sort them in order correctly: + # For a root tag, we want sort_key="RootValue" and for a depth=1 tag, we want sort_key="RootValue\tValue", and + # so on. To make this work with very deep trees, we generate the lineage of a subtree only (below main_parent). + # fmt: off qs = qs.annotate(sort_key=Lower(Concat( - # For a root tag, we want sort_key="RootValue" and for a depth=1 tag - # we want sort_key="RootValue\tValue". The following does that, since - # ConcatNull(...) returns NULL if any argument is NULL. - ConcatNull(F("parent__parent__parent__value"), Value("\t")), - ConcatNull(F("parent__parent__value"), Value("\t")), - ConcatNull(F("parent__value"), Value("\t")), + # TODO: update this to use a "WITH RECURSIVE" common table expression? + models.Case(models.When(depth__gte=3 + depth_offset, then=Concat(F("parent__"*3+"value"), Value("\t")))), + models.Case(models.When(depth__gte=2 + depth_offset, then=Concat(F("parent__"*2+"value"), Value("\t")))), + models.Case(models.When(depth__gte=1 + depth_offset, then=Concat(F("parent__"*1+"value"), Value("\t")))), F("value"), Value("\t"), # We also need the '\t' separator character at the end, or MySQL will sort things wrong output_field=models.CharField(), ))) + # fmt: on # Add the parent value qs = qs.annotate(parent_value=F("parent__value")) qs = qs.annotate(_id=F("id")) # ID has an underscore to encourage use of 'value' rather than this internal ID @@ -715,7 +738,7 @@ def validate_value(self, value: str) -> bool: return value != "" and isinstance(value, str) return self.tag_set.filter(value__iexact=value).exists() - def tag_for_value(self, value: str) -> Tag: + def tag_for_value(self, value: str, select_related: list[str] | None = None) -> Tag: """ Get the Tag object for the given value. Some Taxonomies may auto-create the Tag at this point, e.g. a User @@ -726,7 +749,11 @@ def tag_for_value(self, value: str) -> Tag: self.check_casted() if self.allow_free_text: raise ValueError("tag_for_value() doesn't work for free text taxonomies. They don't use Tag instances.") - return self.tag_set.get(value__iexact=value) + if select_related: + qs = self.tag_set.select_related(*select_related) + else: + qs = self.tag_set + return qs.get(value__iexact=value) def validate_external_id(self, external_id: str) -> bool: """ diff --git a/tests/openedx_tagging/test_api.py b/tests/openedx_tagging/test_api.py index aaa372b4..ca404178 100644 --- a/tests/openedx_tagging/test_api.py +++ b/tests/openedx_tagging/test_api.py @@ -1070,3 +1070,105 @@ def test_unmark_copied_tags(self) -> None: tags = tagging_api.get_object_tags(obj2) assert tags.filter(is_copied=False).count() == 3 assert tags.filter(is_copied=True).count() == 0 + + def test_deep_taxonomy(self) -> None: + """ + Test that a taxonomy's tags can be nested deeply + """ + taxonomy = tagging_api.create_taxonomy(name="Deep Taxonomy") + + tag_0 = tagging_api.add_tag_to_taxonomy(taxonomy, "Root - depth 0") + assert tag_0.depth == 0 + + tag_1 = tagging_api.add_tag_to_taxonomy(taxonomy, "Child - depth 1", parent_tag_value=tag_0.value) + assert tag_1.depth == 1 + + tag_2 = tagging_api.add_tag_to_taxonomy(taxonomy, "Grandchild - depth 2", parent_tag_value=tag_1.value) + assert tag_2.depth == 2 + + # Up to this point is the level at which all the APIs are guaranteed to work correctly. + + tag_3 = tagging_api.add_tag_to_taxonomy(taxonomy, "Great-Grandchild - depth 3", parent_tag_value=tag_2.value) + assert tag_3.depth == 3 + + tag_4 = tagging_api.add_tag_to_taxonomy(taxonomy, "Great-Great-Grandchild - depth 4", parent_tag_value=tag_3.value) + assert tag_4.depth == 4 + + tag_5 = tagging_api.add_tag_to_taxonomy(taxonomy, "Great-Great-Great-Grandchild - depth 5", parent_tag_value=tag_4.value) + assert tag_5.depth == 5 + + # APIs like `get_tags()` will return only one level at a time: + assert pretty_format_tags( + tagging_api.get_tags(taxonomy) + ) == [ + 'Root - depth 0 (None) (children: 1 + 2)', + ' Child - depth 1 (Root - depth 0) (children: 1 + 2)', + ' Grandchild - depth 2 (Child - depth 1) (children: 1 + 2)', + ] + # But we can always load deeper levels one level at a time: + assert pretty_format_tags( + tagging_api.get_children_tags(taxonomy, parent_tag_value=tag_3.value) + ) == [ + ' Great-Great-Grandchild - depth 4 (Great-Grandchild - depth 3) (children: 1)', + ] + assert pretty_format_tags( + tagging_api.get_children_tags(taxonomy, parent_tag_value=tag_4.value) + ) == [ + ' Great-Great-Great-Grandchild - depth 5 (Great-Great-Grandchild - depth 4) (children: 0)', + ] + # Or even up to three levels at a time: + deep_result = taxonomy.get_filtered_tags(depth=None, parent_tag_value=tag_2.value) + assert pretty_format_tags(deep_result, parent=False) == [ + ' Great-Grandchild - depth 3 (children: 1 + 1)', + ' Great-Great-Grandchild - depth 4 (children: 1)', + ' Great-Great-Great-Grandchild - depth 5 (children: 0)', + ] + # Let's check that all the fields on the last entry, especially 'depth': + assert deep_result[2]["value"] == "Great-Great-Great-Grandchild - depth 5" + assert deep_result[2]["parent_value"] == "Great-Great-Grandchild - depth 4" + assert deep_result[2]["depth"] == 5 + assert deep_result[2]["child_count"] == 0 + + # Also check that Tag.get_lineage() works correctly for super deep tags: + assert tag_5.get_lineage() == [ + 'Root - depth 0', + 'Child - depth 1', + 'Grandchild - depth 2', + 'Great-Grandchild - depth 3', + 'Great-Great-Grandchild - depth 4', + 'Great-Great-Great-Grandchild - depth 5', + ] + + + def test_object_tags_deep(self) -> None: + """ + Test that a deep taxonomy's tags can be used to tag objects + """ + taxonomy = tagging_api.create_taxonomy(name="Deep Taxonomy") + tag_0 = tagging_api.add_tag_to_taxonomy(taxonomy, "Bob - depth 0") + tag_1 = tagging_api.add_tag_to_taxonomy(taxonomy, "Janet - depth 1", parent_tag_value=tag_0.value) + tag_2 = tagging_api.add_tag_to_taxonomy(taxonomy, "Alice - depth 2", parent_tag_value=tag_1.value) + tag_3 = tagging_api.add_tag_to_taxonomy(taxonomy, "Fred - depth 3", parent_tag_value=tag_2.value) + tag_4 = tagging_api.add_tag_to_taxonomy(taxonomy, "Patty - depth 4", parent_tag_value=tag_3.value) + tag_5 = tagging_api.add_tag_to_taxonomy(taxonomy, "Clara - depth 5", parent_tag_value=tag_4.value) + assert tag_5.depth == 5 + + object_id = "some object" + tagging_api.tag_object(object_id, taxonomy, tags=[tag_5.value]) + + assert [ot.tag for ot in tagging_api.get_object_tags(object_id)] == [tag_5] + + # Now test the sort order of many tags: + # tagging_api.tag_object(object_id, taxonomy, tags=[ + # tag_0.value, tag_1.value, tag_2.value, tag_3.value, tag_4.value, tag_5.value, + # ]) + # assert [ot.tag for ot in tagging_api.get_object_tags(object_id)] == [ + # tag_0, + # tag_1, + # tag_2, + # tag_3, + # tag_4, + # tag_5, + # ] + # Note: the above isn't quite working yet - tag_5 is out of order. + # See TODO note in get_object_tags()