From 65e898c7d5ea6e75b253a043e281db0122b8d3be Mon Sep 17 00:00:00 2001 From: rtibblesbot Date: Fri, 3 Jul 2026 01:07:31 -0700 Subject: [PATCH 1/2] chore(deps): bump le-utils from 0.2.17 to 0.2.18 Ships the QTI question type constant so AssessmentItem code can reference exercises.QTI instead of a hardcoded string. --- requirements.in | 2 +- requirements.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.in b/requirements.in index 17b03aa0a4..398c01aae7 100644 --- a/requirements.in +++ b/requirements.in @@ -5,7 +5,7 @@ djangorestframework==3.15.1 psycopg2-binary==2.9.11 django-js-reverse==0.10.2 django-registration==3.4 -le-utils==0.2.17 +le-utils==0.2.18 gunicorn==26.0.0 django-postmark==0.1.6 jsonfield==3.1.0 diff --git a/requirements.txt b/requirements.txt index 1f6e4367f9..b1df711e1a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -177,7 +177,7 @@ langcodes==3.5.1 # via -r requirements.in latex2mathml==3.78.1 # via -r requirements.in -le-utils==0.2.17 +le-utils==0.2.18 # via -r requirements.in lxml==6.1.1 # via -r requirements.in From 397f1f74e2c236d18c2ff315dc5ea1c3a225e5b5 Mon Sep 17 00:00:00 2001 From: rtibblesbot Date: Fri, 3 Jul 2026 01:07:59 -0700 Subject: [PATCH 2/2] feat: round-trip QTI AssessmentItem raw_data through save and sync Store the editor's authored QTI XML on AssessmentItem (type='QTI', full item XML in raw_data) and round-trip it byte-for-byte through the save and sync API paths. - Persist raw_data verbatim, bypassing legacy answers/hints JSON coercion for QTI items - Validate raw_data against the QTI 3.0 schema on save, rejecting invalid items through the sync error channel - Extract src/srcset/href/data file references from QTI XML into the existing File claim/create/delete flow Fixes #5999 --- .../0168_alter_assessmentitem_type.py | 30 +++ .../tests/viewsets/test_assessmentitem.py | 223 ++++++++++++++++++ .../utils/assessment/qti/validation.py | 8 +- .../viewsets/assessmentitem.py | 98 +++++++- 4 files changed, 349 insertions(+), 10 deletions(-) create mode 100644 contentcuration/contentcuration/migrations/0168_alter_assessmentitem_type.py diff --git a/contentcuration/contentcuration/migrations/0168_alter_assessmentitem_type.py b/contentcuration/contentcuration/migrations/0168_alter_assessmentitem_type.py new file mode 100644 index 0000000000..6a3c844d70 --- /dev/null +++ b/contentcuration/contentcuration/migrations/0168_alter_assessmentitem_type.py @@ -0,0 +1,30 @@ +# Generated by Django 3.2.24 on 2026-07-03 04:48 +from django.db import migrations +from django.db import models + + +class Migration(migrations.Migration): + + dependencies = [ + ("contentcuration", "0167_add_organization"), + ] + + operations = [ + migrations.AlterField( + model_name="assessmentitem", + name="type", + field=models.CharField( + choices=[ + ("input_question", "Input Question"), + ("multiple_selection", "Multiple Selection"), + ("single_selection", "Single Selection"), + ("free_response", "Free Response"), + ("perseus_question", "Perseus Question"), + ("QTI", "QTI"), + ("true_false", "True/False"), + ], + default="multiple_selection", + max_length=50, + ), + ), + ] diff --git a/contentcuration/contentcuration/tests/viewsets/test_assessmentitem.py b/contentcuration/contentcuration/tests/viewsets/test_assessmentitem.py index 56f85c5799..925239fc76 100644 --- a/contentcuration/contentcuration/tests/viewsets/test_assessmentitem.py +++ b/contentcuration/contentcuration/tests/viewsets/test_assessmentitem.py @@ -8,6 +8,8 @@ from contentcuration import models from contentcuration.tests import testdata from contentcuration.tests.base import StudioAPITestCase +from contentcuration.tests.utils.qti.test_validation import _item_xml +from contentcuration.tests.utils.qti.test_validation import VALID_CHOICE_ITEM from contentcuration.tests.viewsets.base import generate_create_event from contentcuration.tests.viewsets.base import generate_delete_event from contentcuration.tests.viewsets.base import generate_update_event @@ -15,6 +17,25 @@ from contentcuration.viewsets.sync.constants import ASSESSMENTITEM +QTI_ITEM_WITH_FILES = _item_xml( + "item_1", + "Sample Item", + '' + "choice_0" + "", + 'Select the correct answer. ' + '' + 'Option A' + '' + 'Option B' + "", +) + + class SyncTestCase(SyncTestMixin, StudioAPITestCase): @property def assessmentitem_metadata(self): @@ -440,6 +461,160 @@ def test_update_assessmentitem_to_true_false(self): "true_false", ) + def test_update_assessmentitem_to_qti(self): + assessmentitem = models.AssessmentItem.objects.create( + **self.assessmentitem_db_metadata + ) + self.client.force_authenticate(user=self.user) + response = self.sync_changes( + [ + generate_update_event( + [assessmentitem.contentnode_id, assessmentitem.assessment_id], + ASSESSMENTITEM, + {"type": "QTI", "raw_data": VALID_CHOICE_ITEM}, + channel_id=self.channel.id, + ) + ], + ) + self.assertEqual(response.status_code, 200, response.content) + updated = models.AssessmentItem.objects.get(id=assessmentitem.id) + self.assertEqual(updated.type, "QTI") + self.assertEqual(updated.raw_data, VALID_CHOICE_ITEM) + + def test_create_assessmentitem_preserves_raw_data_whitespace(self): + self.client.force_authenticate(user=self.user) + assessmentitem = self.assessmentitem_metadata + # Trailing whitespace only: a leading pad before the XML declaration + # would be invalid XML syntax and get rejected by QTI schema validation. + padded_raw_data = VALID_CHOICE_ITEM + "\n\n " + assessmentitem["type"] = "QTI" + assessmentitem["raw_data"] = padded_raw_data + response = self.sync_changes( + [ + generate_create_event( + [assessmentitem["contentnode"], assessmentitem["assessment_id"]], + ASSESSMENTITEM, + assessmentitem, + channel_id=self.channel.id, + ) + ], + ) + self.assertEqual(response.status_code, 200, response.content) + created = models.AssessmentItem.objects.get( + assessment_id=assessmentitem["assessment_id"] + ) + self.assertEqual(created.raw_data, padded_raw_data) + + def test_create_qti_assessmentitem_bypasses_hints_answers_coercion(self): + self.client.force_authenticate(user=self.user) + assessmentitem = self.assessmentitem_metadata + assessmentitem["type"] = "QTI" + assessmentitem["raw_data"] = VALID_CHOICE_ITEM + assessmentitem["hints"] = "not a json array" + assessmentitem["answers"] = "also not a json array" + response = self.sync_changes( + [ + generate_create_event( + [assessmentitem["contentnode"], assessmentitem["assessment_id"]], + ASSESSMENTITEM, + assessmentitem, + channel_id=self.channel.id, + ) + ], + ) + self.assertEqual(response.status_code, 200, response.content) + created = models.AssessmentItem.objects.get( + assessment_id=assessmentitem["assessment_id"] + ) + self.assertEqual(created.hints, "not a json array") + self.assertEqual(created.answers, "also not a json array") + + def _create_qti_referenced_files(self, checksums): + for checksum, ext in zip(checksums, ["png", "png", "pdf", "pdf"]): + image_file = testdata.fileobj_exercise_image() + image_file.checksum = checksum + image_file.file_format_id = ext + image_file.uploaded_by = self.user + image_file.save() + + def test_create_qti_assessmentitem_extracts_files(self): + self.client.force_authenticate(user=self.user) + checksums = ["a" * 32, "c" * 32, "d" * 32, "b" * 32] + self._create_qti_referenced_files(checksums) + + assessmentitem = self.assessmentitem_metadata + assessmentitem["type"] = "QTI" + assessmentitem["raw_data"] = QTI_ITEM_WITH_FILES + response = self.sync_changes( + [ + generate_create_event( + [assessmentitem["contentnode"], assessmentitem["assessment_id"]], + ASSESSMENTITEM, + assessmentitem, + channel_id=self.channel.id, + ) + ], + ) + self.assertEqual(response.status_code, 200, response.content) + ai = models.AssessmentItem.objects.get( + assessment_id=assessmentitem["assessment_id"] + ) + linked_checksums = set(ai.files.values_list("checksum", flat=True)) + self.assertEqual(linked_checksums, set(checksums)) + + def test_qti_assessmentitem_full_round_trip(self): + self.client.force_authenticate(user=self.user) + checksums = ["a" * 32, "c" * 32, "d" * 32, "b" * 32] + self._create_qti_referenced_files(checksums) + + assessmentitem = self.assessmentitem_metadata + assessmentitem["type"] = "QTI" + assessmentitem["raw_data"] = QTI_ITEM_WITH_FILES + + create_response = self.sync_changes( + [ + generate_create_event( + [assessmentitem["contentnode"], assessmentitem["assessment_id"]], + ASSESSMENTITEM, + assessmentitem, + channel_id=self.channel.id, + ) + ], + ) + self.assertEqual(create_response.status_code, 200, create_response.content) + ai = models.AssessmentItem.objects.get( + assessment_id=assessmentitem["assessment_id"] + ) + self.assertEqual(ai.raw_data, QTI_ITEM_WITH_FILES) + self.assertEqual(ai.type, "QTI") + self.assertEqual( + set(ai.files.values_list("checksum", flat=True)), set(checksums) + ) + + # Drop the object/data reference (checksum "d"*32) and change the title. + updated_raw_data = QTI_ITEM_WITH_FILES.replace( + '', + "", + ).replace('title="Sample Item"', 'title="Updated Item"') + update_response = self.sync_changes( + [ + generate_update_event( + [assessmentitem["contentnode"], assessmentitem["assessment_id"]], + ASSESSMENTITEM, + {"raw_data": updated_raw_data}, + channel_id=self.channel.id, + ) + ], + ) + self.assertEqual(update_response.status_code, 200, update_response.content) + ai.refresh_from_db() + self.assertEqual(ai.raw_data, updated_raw_data) + self.assertEqual( + set(ai.files.values_list("checksum", flat=True)), + set(checksums) - {"d" * 32}, + ) + def test_attempt_update_missing_assessmentitem(self): self.client.force_authenticate(user=self.user) @@ -744,6 +919,54 @@ def test_invalid_hints_assessmentitem(self): assessment_id=assessmentitem["assessment_id"] ) + def test_invalid_qti_assessmentitem(self): + self.client.force_authenticate(user=self.user) + assessmentitem = self.assessmentitem_metadata + assessmentitem["type"] = "QTI" + assessmentitem["raw_data"] = VALID_CHOICE_ITEM.replace( + 'orientation="vertical"', 'orientation="sideways"' + ) + response = self.sync_changes( + [ + generate_create_event( + [assessmentitem["contentnode"], assessmentitem["assessment_id"]], + ASSESSMENTITEM, + assessmentitem, + channel_id=self.channel.id, + ), + ], + ) + self.assertEqual(response.json()["errors"][0]["table"], "assessmentitem") + self.assertTrue(response.json()["errors"][0]["errors"]["raw_data"]) + self.assertEqual(len(response.json()["errors"]), 1) + with self.assertRaises( + models.AssessmentItem.DoesNotExist, msg="AssessmentItem was created" + ): + models.AssessmentItem.objects.get( + assessment_id=assessmentitem["assessment_id"] + ) + + def test_invalid_qti_xml_syntax_assessmentitem(self): + self.client.force_authenticate(user=self.user) + assessmentitem = self.assessmentitem_metadata + assessmentitem["type"] = "QTI" + assessmentitem["raw_data"] = "" + response = self.sync_changes( + [ + generate_create_event( + [assessmentitem["contentnode"], assessmentitem["assessment_id"]], + ASSESSMENTITEM, + assessmentitem, + channel_id=self.channel.id, + ), + ], + ) + self.assertTrue(response.json()["errors"][0]["errors"]["raw_data"]) + with self.assertRaises(models.AssessmentItem.DoesNotExist): + models.AssessmentItem.objects.get( + assessment_id=assessmentitem["assessment_id"] + ) + def test_valid_answers_assessmentitem(self): self.client.force_authenticate(user=self.user) assessmentitem = self.assessmentitem_metadata diff --git a/contentcuration/contentcuration/utils/assessment/qti/validation.py b/contentcuration/contentcuration/utils/assessment/qti/validation.py index 7cea89f276..b2f2243dbe 100644 --- a/contentcuration/contentcuration/utils/assessment/qti/validation.py +++ b/contentcuration/contentcuration/utils/assessment/qti/validation.py @@ -39,7 +39,7 @@ class QTIValidationResult: errors: List[QTIValidationError] = field(default_factory=list) -def _secure_parser() -> etree.XMLParser: +def secure_parser() -> etree.XMLParser: # resolve_entities=False + load_dtd=False blocks XXE/entity-expansion attacks; # no_network=True blocks remote schemaLocation/entity fetches. QTI reaches this # validator from ricecooker uploads and direct sync-API writes, not just the @@ -54,12 +54,16 @@ def _secure_parser() -> etree.XMLParser: ) +def parse_qti_xml(xml: bytes) -> etree._Element: + return etree.parse(BytesIO(xml), parser=secure_parser()) + + def validate_qti_item(xml: Union[str, bytes]) -> QTIValidationResult: if isinstance(xml, str): xml = xml.encode("utf-8") try: - doc = etree.parse(BytesIO(xml), parser=_secure_parser()) + doc = parse_qti_xml(xml) except etree.XMLSyntaxError as exc: return QTIValidationResult( is_valid=False, diff --git a/contentcuration/contentcuration/viewsets/assessmentitem.py b/contentcuration/contentcuration/viewsets/assessmentitem.py index 1877bfe6d7..4fa8614227 100644 --- a/contentcuration/contentcuration/viewsets/assessmentitem.py +++ b/contentcuration/contentcuration/viewsets/assessmentitem.py @@ -4,6 +4,7 @@ from django.db import transaction from le_utils.constants import exercises from le_utils.constants import format_presets +from lxml import etree from rest_framework import serializers from rest_framework.permissions import IsAuthenticated from rest_framework.serializers import ValidationError @@ -12,6 +13,11 @@ from contentcuration.models import ContentNode from contentcuration.models import File from contentcuration.models import generate_object_storage_name +from contentcuration.utils.assessment.qti.fields import ( + entry_pattern as srcset_entry_pattern, +) +from contentcuration.utils.assessment.qti.validation import parse_qti_xml +from contentcuration.utils.assessment.qti.validation import validate_qti_item from contentcuration.viewsets.base import BulkCreateMixin from contentcuration.viewsets.base import BulkListSerializer from contentcuration.viewsets.base import BulkModelSerializer @@ -58,6 +64,34 @@ def get_filenames_from_assessment(assessment_item): ) +qti_reference_attributes = ("src", "href", "data") +qti_checksum_filename_regex = re.compile(r"^[a-f0-9]{32}\.[0-9a-z]+$") + + +def get_filenames_from_qti_item(assessment_item): + # QTI attribute values are bare . references, matching + # the TipTap editor's existing permanentSrc="." convention + # for round-tripped tags -- not the markdown-only + # ${CONTENT_STORAGE}/. placeholder legacy types use. + checksums = set() + try: + doc = parse_qti_xml(assessment_item.raw_data.encode("utf-8")) + except etree.XMLSyntaxError: + return checksums + for element in doc.xpath("//*[@src or @href or @data or @srcset]"): + for attribute in qti_reference_attributes: + value = element.attrib.get(attribute) + if value and qti_checksum_filename_regex.match(value): + checksums.add(value) + srcset = element.attrib.get("srcset") + if srcset: + for entry in re.findall(srcset_entry_pattern, srcset): + token = entry[0].strip() + if token and qti_checksum_filename_regex.match(token): + checksums.add(token) + return checksums + + class AssessmentListSerializer(BulkListSerializer): def create(self, all_validated_data): with transaction.atomic(): @@ -81,6 +115,7 @@ class AssessmentItemSerializer(BulkModelSerializer): # to set it. hints = serializers.CharField(required=False) answers = serializers.CharField(required=False) + raw_data = serializers.CharField(required=False, trim_whitespace=False) assessment_id = UUIDRegexField() contentnode = UserFilteredPrimaryKeyRelatedField( queryset=ContentNode.objects.all(), required=False @@ -105,7 +140,33 @@ class Meta: # Use the contentnode and assessment_id as the lookup field for updates update_lookup_field = ("contentnode", "assessment_id") + @property + def _item_type(self): + return self.initial_data.get( + "type", self.instance.type if self.instance else None + ) + + def validate(self, data): + # Raising here (during is_valid(), before .save()) keeps this error + # attributed to just this item in the sync error channel. Raising from + # create()/update()/set_files() instead would be caught by the broad + # except Exception in create_from_changes/update_from_changes and + # reported as "Internal server error" for the whole batch. + data = super(AssessmentItemSerializer, self).validate(data) + if self._item_type == exercises.QTI: + raw_data = data.get( + "raw_data", self.instance.raw_data if self.instance else "" + ) + result = validate_qti_item(raw_data) + if not result.is_valid: + raise ValidationError( + {"raw_data": [error.message for error in result.errors]} + ) + return data + def validate_answers(self, value): + if self._item_type == exercises.QTI: + return value answers = json.loads(value) for answer in answers: if not type(answer) is dict: @@ -115,6 +176,8 @@ def validate_answers(self, value): return value def validate_hints(self, value): + if self._item_type == exercises.QTI: + return value hints = json.loads(value) for hint in hints: if not type(hint) is dict: @@ -128,25 +191,40 @@ def set_files(self, all_objects, all_validated_data=None): # noqa C901 files_to_update = {} current_files_by_aitem = {} - # Create a set of assessment item ids that have had markdown fields modified. if all_validated_data: # If this is an update operation, check the validated data for which items - # have had these fields modified. - md_fields_modified = { + # have had these fields modified. raw_data only carries file references for + # QTI items, so a raw_data-only change only counts as "modified" for those. + item_types_by_lookup = { + self.id_value_lookup(ai): ai.type for ai in all_objects + } + editable_text_fields_modified = { self.id_value_lookup(ai) for ai in all_validated_data - if "question" in ai or "hints" in ai or "answers" in ai + if "question" in ai + or "hints" in ai + or "answers" in ai + or ( + "raw_data" in ai + and item_types_by_lookup.get(self.id_value_lookup(ai)) + == exercises.QTI + ) } else: # If this is a create operation, just check if these fields are not null. - md_fields_modified = { + editable_text_fields_modified = { self.id_value_lookup(ai) for ai in all_objects - if ai.question or ai.hints or ai.answers + if ai.question + or ai.hints + or ai.answers + or (ai.type == exercises.QTI and ai.raw_data) } all_objects = [ - ai for ai in all_objects if self.id_value_lookup(ai) in md_fields_modified + ai + for ai in all_objects + if self.id_value_lookup(ai) in editable_text_fields_modified ] for file in File.objects.filter(assessment_item__in=all_objects): @@ -156,7 +234,11 @@ def set_files(self, all_objects, all_validated_data=None): # noqa C901 for aitem in all_objects: current_files = current_files_by_aitem.get(aitem.id, []) - filenames = get_filenames_from_assessment(aitem) + filenames = ( + get_filenames_from_qti_item(aitem) + if aitem.type == exercises.QTI + else get_filenames_from_assessment(aitem) + ) set_checksums = {filename.split(".")[0] for filename in filenames} current_checksums = {f.checksum for f in current_files}