diff --git a/src/openedx_tagging/models/base.py b/src/openedx_tagging/models/base.py index 668ba73e..1bc5b766 100644 --- a/src/openedx_tagging/models/base.py +++ b/src/openedx_tagging/models/base.py @@ -9,8 +9,8 @@ from django.core.exceptions import ValidationError from django.db import models -from django.db.models import F, Q, Value -from django.db.models.functions import Concat, Lower +from django.db.models import Count, F, IntegerField, OuterRef, Q, Subquery, Value +from django.db.models.functions import Coalesce, Concat, Lower from django.utils.functional import cached_property from django.utils.module_loading import import_string from django.utils.translation import gettext_lazy as _ @@ -25,6 +25,8 @@ # Maximum depth allowed for a hierarchical taxonomy's tree of tags. +# Note: if this changes please check logic in this file for notes +# about necessary changes TAXONOMY_MAX_DEPTH = 3 # Ancestry of a given tag; the Tag.value fields of a given tag and its parents, starting from the root. @@ -501,16 +503,7 @@ def _get_filtered_tags_one_level( qs = qs.values("value", "child_count", "descendant_count", "depth", "parent_value", "external_id", "_id") qs = qs.order_by("value") if include_counts: - # We need to include the count of how many times this tag is used to tag objects. - # You'd think we could just use: - # qs = qs.annotate(usage_count=models.Count("objecttag__pk")) - # but that adds another join which starts creating a cross product and the children and usage_count become - # intertwined and multiplied with each other. So we use a subquery. - obj_tags = ObjectTag.objects.filter(tag_id=models.OuterRef("pk")).order_by().annotate( - # We need to use Func() to get Count() without GROUP BY - see https://stackoverflow.com/a/69031027 - count=models.Func(F('id'), function='Count') - ) - qs = qs.annotate(usage_count=models.Subquery(obj_tags.values('count'))) + qs = self.add_counts_query(qs) return qs # type: ignore[return-value] def _get_filtered_tags_deep( @@ -530,12 +523,28 @@ def _get_filtered_tags_deep( else: main_parent_id = None + qs: models.QuerySet = self.tag_set.none() # empty queryset assert TAXONOMY_MAX_DEPTH == 3 # If we change TAXONOMY_MAX_DEPTH we need to change this query code: - qs: models.QuerySet = self.tag_set.filter( - Q(parent_id=main_parent_id) | - Q(parent__parent_id=main_parent_id) | - Q(parent__parent__parent_id=main_parent_id) - ) + + if (main_parent_id is not None): + # If we're starting from a specific parent tag, we only want to include tags that are descendants of that parent tag. + qs: models.QuerySet = self.tag_set.filter( + # Main tag at depth 0is not included in the results + Q(parent_id=main_parent_id) | # Direct children of the parent tag at depth 1 + Q(parent__parent_id=main_parent_id) | # Grandchildren of the parent tag at depth 2 + Q(parent__parent__parent_id=main_parent_id) # Great-grandchildren of the parent tag at depth 3 + ) + else: + # If we're starting from the top, we want to include all tags up to the max depth. + qs: models.QuerySet = self.tag_set.filter( + Q(parent=None) | # Root tags at depth 0 + Q(parent__parent=None) | # Children of root tags at depth 1 + Q(parent__parent__parent=None) | # Grandchildren of root tags at depth 2 + Q(parent__parent__parent__parent=None) # Great-grandchildren of root tags at depth 3 + ) + + # Regardless of where we start, we only support tags up to TAXONOMY_MAX_DEPTH from the root. + qs = qs.filter(parent__parent__parent__parent=None) if search_term: # We need to do an additional query to find all the tags that match the search term, then limit the @@ -593,14 +602,50 @@ def _get_filtered_tags_deep( qs = qs.values("value", "child_count", "descendant_count", "depth", "parent_value", "external_id", "_id") qs = qs.order_by("sort_key") if include_counts: - # Including the counts is a bit tricky; see the comment above in _get_filtered_tags_one_level() - obj_tags = ObjectTag.objects.filter(tag_id=models.OuterRef("pk")).order_by().annotate( - # We need to use Func() to get Count() without GROUP BY - see https://stackoverflow.com/a/69031027 - count=models.Func(F('id'), function='Count') - ) - qs = qs.annotate(usage_count=models.Subquery(obj_tags.values('count'))) + qs = self.add_counts_query(qs) + return qs # type: ignore[return-value] + def add_counts_query(self, qs: models.QuerySet ): + # Adds a subquery to the passed-in queryset that returns the number + # of times a tag has been used. + # + # Note: The count is not a simple count, we need to do a 'roll up' + # where we count the number of times a tag is directly used and applied, + # but then that also needs to add a "1" count to the lineage tags + # (parent, grandparent, etc.), but de-duplicate counts for any children + # so that if we have "2" child tags, it only counts towards "1" for the + # parent. + # This query gets the raw counts for each tag usage, gets the distinct + # usages (so de-duplicates counts) by actual application to an "Object" + # (library, course, course module, course section, etc.), which creates + # a count per tag, annotated to that particular tag from the passed-in + # queryset. + # + # Note: This only works with a tag lineage depth of "3" (the now + # current value of TAXONOMY_MAX_DEPTH), inclusive of 0, so 0...3 + # if we change TAXONOMY_MAX_DEPTH this code will need to be updated. + + assert TAXONOMY_MAX_DEPTH == 3 # If we change TAXONOMY_MAX_DEPTH we need to change this query code + usage_count_qs = ObjectTag.objects.filter( + Q(tag_id=OuterRef('pk')) | + Q(tag__parent_id=OuterRef('pk')) | + Q(tag__parent__parent_id=OuterRef('pk')) | + Q(tag__parent__parent__parent_id=OuterRef('pk')) + ).values('object_id').distinct().annotate( + intermediate_grouping=Value(1, output_field=IntegerField()) + ).values('intermediate_grouping').annotate( + total_usage=Count('object_id', distinct=True) + ).values('total_usage') + + qs = qs.annotate( + usage_count=Coalesce( + Subquery(usage_count_qs, output_field=IntegerField()), + 0 # Coalesce ensures we return 0 instead of None if there are no usages + ) + ) + return qs + def add_tag( self, tag_value: str, diff --git a/tests/openedx_tagging/test_models.py b/tests/openedx_tagging/test_models.py index 67ae68e2..5a7784e5 100644 --- a/tests/openedx_tagging/test_models.py +++ b/tests/openedx_tagging/test_models.py @@ -50,6 +50,7 @@ def setUp(self): self.chordata = get_tag("Chordata") self.mammalia = get_tag("Mammalia") self.animalia = get_tag("Animalia") + self.eukaryota = get_tag("Eukaryota") self.system_taxonomy_tag = get_tag("System Tag 1") self.english_tag = self.language_taxonomy.tag_for_external_id("en") self.user_1 = get_user_model()( @@ -443,7 +444,8 @@ def test_get_all(self) -> None: "Eukaryota (None) (children: 5 + 8)", " Animalia (Eukaryota) (children: 7 + 1)", " Arthropoda (Animalia) (children: 0)", - " Chordata (Animalia) (children: 1)", # note this has a child but the child is not included + " Chordata (Animalia) (children: 1)", + " Mammalia (Chordata) (children: 0)", " Cnidaria (Animalia) (children: 0)", " Ctenophora (Animalia) (children: 0)", " Gastrotrich (Animalia) (children: 0)", @@ -543,7 +545,9 @@ def test_get_external_id(self) -> None: def test_usage_count(self) -> None: """ - Test that the usage count in the results is right + Test that the usage count in the results is right for a basic case; + many objects tagged seperately should return a simple usage count that + reflects lineage de-duplication (or lack thereof, in this case) """ api.tag_object(object_id="obj01", taxonomy=self.taxonomy, tags=["Bacteria"]) api.tag_object(object_id="obj02", taxonomy=self.taxonomy, tags=["Bacteria"]) @@ -552,7 +556,7 @@ def test_usage_count(self) -> None: # Now the API should reflect these usage counts: result = pretty_format_tags(self.taxonomy.get_filtered_tags(search_term="bacteria", include_counts=True)) assert result == [ - "Bacteria (None) (used: 3, children: 2)", + "Bacteria (None) (used: 4, children: 2)", " Archaebacteria (Bacteria) (used: 0, children: 0)", " Eubacteria (Bacteria) (used: 1, children: 0)", ] @@ -561,9 +565,245 @@ def test_usage_count(self) -> None: self.taxonomy.get_filtered_tags(search_term="bacteria", include_counts=True, depth=1) ) assert result1 == [ - "Bacteria (None) (used: 3, children: 2)", + "Bacteria (None) (used: 4, children: 2)", ] + def test_usage_count_lineage_count_across_same_course(self) -> None: + """ + Test that the usage count is correct and parent counts are included based on + child tags being added to an object. However, we de-duplicate and only count + 1 parent tag towards a course even if 2 children are applied to that course + """ + api.tag_object(object_id="obj01", taxonomy=self.taxonomy, tags=["Bacteria"]) + api.tag_object(object_id="obj01", taxonomy=self.taxonomy, tags=["Archaebacteria"]) + api.tag_object(object_id="obj02", taxonomy=self.taxonomy, tags=["Archaebacteria"]) + api.tag_object(object_id="obj01", taxonomy=self.taxonomy, tags=["Eubacteria"]) + # Now the API should reflect these usage counts: + result = pretty_format_tags(self.taxonomy.get_filtered_tags(search_term="bacteria", include_counts=True)) + assert result == [ + "Bacteria (None) (used: 2, children: 2)", + " Archaebacteria (Bacteria) (used: 1, children: 0)", + " Eubacteria (Bacteria) (used: 1, children: 0)", + ] + # Same with depth=1, which uses a different query internally: + result1 = pretty_format_tags( + self.taxonomy.get_filtered_tags(search_term="bacteria", include_counts=True, depth=1) + ) + assert result1 == [ + "Bacteria (None) (used: 2, children: 2)", + ] + + def test_usage_count_rolls_up_to_ancestors_deep(self) -> None: + """ + AI/Claude4.6 generated via IntelliJ IDEA AI Assistant + When a child tag (depth 3) is applied to an object, it should + roll up the count to all its ancestors when using _get_filtered_tags_deep. + The child tag and each of its ancestors should have usage_count=1. + """ + api.tag_object("obj:1", self.taxonomy, [self.mammalia.value]) + result = pretty_format_tags(self.taxonomy.get_filtered_tags(include_counts=True)) + assert result == [ + "Archaea (None) (used: 0, children: 3)", + " DPANN (Archaea) (used: 0, children: 0)", + " Euryarchaeida (Archaea) (used: 0, children: 0)", + " Proteoarchaeota (Archaea) (used: 0, children: 0)", + "Bacteria (None) (used: 0, children: 2)", + " Archaebacteria (Bacteria) (used: 0, children: 0)", + " Eubacteria (Bacteria) (used: 0, children: 0)", + "Eukaryota (None) (used: 1, children: 5 + 8)", + " Animalia (Eukaryota) (used: 1, children: 7 + 1)", + " Arthropoda (Animalia) (used: 0, children: 0)", + " Chordata (Animalia) (used: 1, children: 1)", + " Mammalia (Chordata) (used: 1, children: 0)", + " Cnidaria (Animalia) (used: 0, children: 0)", + " Ctenophora (Animalia) (used: 0, children: 0)", + " Gastrotrich (Animalia) (used: 0, children: 0)", + " Placozoa (Animalia) (used: 0, children: 0)", + " Porifera (Animalia) (used: 0, children: 0)", + " Fungi (Eukaryota) (used: 0, children: 0)", + " Monera (Eukaryota) (used: 0, children: 0)", + " Plantae (Eukaryota) (used: 0, children: 0)", + " Protista (Eukaryota) (used: 0, children: 0)", + ] + + def test_usage_count_multiple_objects_same_tag_deep(self) -> None: + """ + AI/Claude4.6 generated via IntelliJ IDEA AI Assistant + When two distinct objects (e.g. seperate courses, modules, etc.) are tagged + with the same child tag, it should count 2 for that tag (and roll up 2 + to ancestors). Each distinct object should contribute exactly 1 to the count. + """ + api.tag_object("obj:1", self.taxonomy, [self.chordata.value]) + api.tag_object("obj:2", self.taxonomy, [self.chordata.value]) + result = pretty_format_tags( + self.taxonomy.get_filtered_tags(search_term="chordata", include_counts=True) + ) + assert result == [ + "Eukaryota (None) (used: 2, children: 1 + 1)", + " Animalia (Eukaryota) (used: 2, children: 1)", + " Chordata (Animalia) (used: 2, children: 0)", + ] + + def test_usage_count_sibling_tags_same_object_deduplication_deep(self) -> None: + """ + AI/Claude4.6 generated via IntelliJ IDEA AI Assistant + When one object is tagged with two sibling tags (both children of the same + parent), the parent's usage_count should be 1, not 2. It should de-duplicate. + """ + self.taxonomy.allow_multiple = True + self.taxonomy.save() + # Eubacteria and Archaebacteria are both children of Bacteria + api.tag_object("obj:1", self.taxonomy, [self.eubacteria.value, self.archaebacteria.value]) + result = pretty_format_tags( + self.taxonomy.get_filtered_tags(search_term="bacteria", include_counts=True) + ) + assert result == [ + "Bacteria (None) (used: 1, children: 2)", + " Archaebacteria (Bacteria) (used: 1, children: 0)", + " Eubacteria (Bacteria) (used: 1, children: 0)", + ] + + def test_usage_count_sibling_tags_different_objects_deep(self) -> None: + """ + AI/Claude4.6 generated via IntelliJ IDEA AI Assistant + When two different objects are each tagged with a different sibling tag, + the parent's usage_count should be 2, not 1. + """ + api.tag_object("obj:1", self.taxonomy, [self.eubacteria.value]) + api.tag_object("obj:2", self.taxonomy, [self.archaebacteria.value]) + result = pretty_format_tags( + self.taxonomy.get_filtered_tags(search_term="bacteria", include_counts=True) + ) + assert result == [ + "Bacteria (None) (used: 2, children: 2)", + " Archaebacteria (Bacteria) (used: 1, children: 0)", + " Eubacteria (Bacteria) (used: 1, children: 0)", + ] + + def test_usage_count_one_level_root_tags(self) -> None: + """ + AI/Claude4.6 generated via IntelliJ IDEA AI Assistant + _get_filtered_tags_one_level (depth=1) with include_counts=True should + reflect the rolled-up usage count, not just direct usage. + Tagging an object with a child tag should increment the root tag's count. + """ + api.tag_object("obj:1", self.taxonomy, [self.eubacteria.value]) # child of Bacteria + result = pretty_format_tags( + self.taxonomy.get_filtered_tags(depth=1, include_counts=True) + ) + assert result == [ + "Archaea (None) (used: 0, children: 3)", + "Bacteria (None) (used: 1, children: 2)", + "Eukaryota (None) (used: 0, children: 5 + 8)", + ] + + def test_usage_count_one_level_child_tags(self) -> None: + """ + AI/Claude4.6 generated via IntelliJ IDEA AI Assistant + When listing children of a tag (depth=1, parent_tag_value=...), the + usage_count of each child should only reflect the objects tagged with + that child or any of its descendants. + """ + api.tag_object("obj:1", self.taxonomy, [self.mammalia.value]) # grandchild of Animalia via Chordata + api.tag_object("obj:2", self.taxonomy, [self.chordata.value]) # direct child of Animalia + result = pretty_format_tags( + self.taxonomy.get_filtered_tags(depth=1, parent_tag_value="Animalia", include_counts=True) + ) + assert result == [ + " Arthropoda (Animalia) (used: 0, children: 0)", + " Chordata (Animalia) (used: 2, children: 1)", + " Cnidaria (Animalia) (used: 0, children: 0)", + " Ctenophora (Animalia) (used: 0, children: 0)", + " Gastrotrich (Animalia) (used: 0, children: 0)", + " Placozoa (Animalia) (used: 0, children: 0)", + " Porifera (Animalia) (used: 0, children: 0)", + ] + + def test_usage_count_three_levels_deep_rollup(self) -> None: + """ + AI/Claude4.6 generated via IntelliJ IDEA AI Assistant + Tagging an object with a depth-3 tag (Chordata) should roll up + to grandparent (Animalia) and great-grandparent (Eukaryota), + verifying the full 3-level lineage query in add_counts_query. + """ + api.tag_object("obj:1", self.taxonomy, [self.animalia.value]) + api.tag_object("obj:1", self.taxonomy, [self.chordata.value]) + result = pretty_format_tags( + self.taxonomy.get_filtered_tags(search_term="chordata", include_counts=True) + ) + assert result == [ + "Eukaryota (None) (used: 1, children: 1 + 1)", + " Animalia (Eukaryota) (used: 1, children: 1)", + " Chordata (Animalia) (used: 1, children: 0)", + ] + + def test_usage_count_returns_zero_not_none_deep(self) -> None: + """ + AI/Claude4.6 generated via IntelliJ IDEA AI Assistant + When no object has been tagged with a tag or any of its + descendants, usage_count must be 0 (integer), not None. + """ + result = pretty_format_tags(self.taxonomy.get_filtered_tags(include_counts=True)) + assert result == [ + "Archaea (None) (used: 0, children: 3)", + " DPANN (Archaea) (used: 0, children: 0)", + " Euryarchaeida (Archaea) (used: 0, children: 0)", + " Proteoarchaeota (Archaea) (used: 0, children: 0)", + "Bacteria (None) (used: 0, children: 2)", + " Archaebacteria (Bacteria) (used: 0, children: 0)", + " Eubacteria (Bacteria) (used: 0, children: 0)", + "Eukaryota (None) (used: 0, children: 5 + 8)", + " Animalia (Eukaryota) (used: 0, children: 7 + 1)", + " Arthropoda (Animalia) (used: 0, children: 0)", + " Chordata (Animalia) (used: 0, children: 1)", + " Mammalia (Chordata) (used: 0, children: 0)", + " Cnidaria (Animalia) (used: 0, children: 0)", + " Ctenophora (Animalia) (used: 0, children: 0)", + " Gastrotrich (Animalia) (used: 0, children: 0)", + " Placozoa (Animalia) (used: 0, children: 0)", + " Porifera (Animalia) (used: 0, children: 0)", + " Fungi (Eukaryota) (used: 0, children: 0)", + " Monera (Eukaryota) (used: 0, children: 0)", + " Plantae (Eukaryota) (used: 0, children: 0)", + " Protista (Eukaryota) (used: 0, children: 0)", + ] + + def test_usage_count_with_search_term_deep(self) -> None: + """ + AI/Claude4.6 generated via IntelliJ IDEA AI Assistant + When using get_filtered_tags() with both a search_term and + include_counts=True, the usage_count returned should still + reflect the true count for each matching tag, not be affected + by the search filter. + """ + api.tag_object("obj:1", self.taxonomy, [self.eubacteria.value]) + api.tag_object("obj:2", self.taxonomy, [self.archaebacteria.value]) + result = pretty_format_tags( + self.taxonomy.get_filtered_tags(search_term="bacteria", include_counts=True) + ) + assert result == [ + "Bacteria (None) (used: 2, children: 2)", + " Archaebacteria (Bacteria) (used: 1, children: 0)", + " Eubacteria (Bacteria) (used: 1, children: 0)", + ] + + def test_usage_count_at_depth_3_rolls_up_to_root_level(self) -> None: + """ + Tagging a tag at depth 3 should roll usage up through every ancestor level, + meaning that the root tag (depth 0) should include the usage count of the depth 3 tag. + """ + api.tag_object("obj:1", self.taxonomy, [self.mammalia.value]) + result = pretty_format_tags( + self.taxonomy.get_filtered_tags(search_term="mammalia", include_counts=True) + ) + assert result == [ + "Eukaryota (None) (used: 1, children: 1 + 2)", + " Animalia (Eukaryota) (used: 1, children: 1 + 1)", + " Chordata (Animalia) (used: 1, children: 1)", + " Mammalia (Chordata) (used: 1, children: 0)", + ] + + def test_tree_sort(self) -> None: """ Verify that taxonomies can be sorted correctly in tree orer (case insensitive). @@ -622,8 +862,31 @@ def test_descendant_counts(self) -> None: "Interests (None) (used: 0, children: 1 + 7)", " Holland Codes (Interests) (used: 0, children: 1 + 6)", " Interests - Holland Codes (Holland Codes) (used: 0, children: 6)", + " Artistic (Interests - Holland Codes) (used: 0, children: 0)", + " Conventional (Interests - Holland Codes) (used: 0, children: 0)", + " Enterprising (Interests - Holland Codes) (used: 0, children: 0)", + " Investigative (Interests - Holland Codes) (used: 0, children: 0)", + " Realistic (Interests - Holland Codes) (used: 0, children: 0)", + " Social (Interests - Holland Codes) (used: 0, children: 0)", ] + def test_parent_query_depth_limit(self) -> None: + """ + Verify that filtering below a specific parent still respects the + taxonomy max supported depth from the root (depth <= 3). + """ + taxonomy = api.create_taxonomy("Deep Parent Filter") + root = api.add_tag_to_taxonomy(taxonomy, "Root") + depth1 = api.add_tag_to_taxonomy(taxonomy, "Depth1", parent_tag_value=root.value) + depth2 = api.add_tag_to_taxonomy(taxonomy, "Depth2", parent_tag_value=depth1.value) + depth3 = api.add_tag_to_taxonomy(taxonomy, "Depth3", parent_tag_value=depth2.value) + api.add_tag_to_taxonomy(taxonomy, "Depth4", parent_tag_value=depth3.value) + + result = list(taxonomy.get_filtered_tags(depth=None, parent_tag_value=depth1.value)) + result_values = [row["value"] for row in result] + + assert result_values == ["Depth2", "Depth3"] + class TestFilteredTagsFreeTextTaxonomy(TestCase): """