From a941ff32e47b28c1e7ca895a220747943a03ffd2 Mon Sep 17 00:00:00 2001 From: Muhammad Arslan Date: Thu, 8 Jan 2026 20:33:04 +0500 Subject: [PATCH] fix: race condition in shared runtime services (#37825) There is a singleton SplitMongoModuleStore instance that is returned whenever we call the ubiquitous modulestore() function (wrapped in a MixedModuleStore). During initialization, SplitMongoModuleStore sets up a small handful of XBlock runtime services that are intended to be shared globally: i18n, fs, cache. When we get an individual block back from the store using get_item(), SplitMongoModuleStore creates a SplitModuleStoreRuntime using SplitMongoModuleStore.create_runtime(). These runtimes are intended to be modified on a per-item, and later per-user basis (using prepare_runtime_for_user()). Prior to this commit, the create_runtime() method was assigning the globally shared SplitMongoModuleStore.services dict directly to the newly instantiated SplitModuleStoreRuntime. This meant that even though each block had its own _services dict, they were all in fact pointing to the same underlying object. This exposed us to a risk of multiple threads contaminating each other's SplitModuleStoreRuntime services when deployed under load in multithreaded mode. We believe this led to a race condition that caused student submissions to be mis-scored in some cases. This commit makes a copy of the SplitMongoModuleStore.services dict for each SplitModuleStoreRuntime. The baseline global services are still shared, but other per-item and per-user services are now better isolated from each other. This commit also includes a small modification to the PartitionService, which up until this point had relied on the (incorrect) shared instance behavior. The details are provided in the comments in the PartitionService __init__(). It's worth noting that the historical rationale for having a singleton ModuleStore instance is that the ModuleStore used to be extremely expensive to initialize. This was because at one point, the init process required reading entire XML-based courses into memory, or pre-computing complex field inheritance caches. This is no longer the case, and SplitMongoModuleStore initialization is in the 1-2 ms range, with most of that being for PyMongo's connection setup. We should try to fully remove the global singleton in the Verawood release cycle in order to make this kind of bug less likely. --- xmodule/modulestore/split_mongo/split.py | 6 +++- xmodule/partitions/partitions_service.py | 43 ++++++++++++++++++++++-- 2 files changed, 46 insertions(+), 3 deletions(-) diff --git a/xmodule/modulestore/split_mongo/split.py b/xmodule/modulestore/split_mongo/split.py index 07c985c21cc5..74539fabed90 100644 --- a/xmodule/modulestore/split_mongo/split.py +++ b/xmodule/modulestore/split_mongo/split.py @@ -3283,7 +3283,11 @@ def create_runtime(self, course_entry, lazy): """ Create the proper runtime for this course """ - services = self.services + # A single SplitMongoModuleStore may create many SplitModuleStoreRuntimes, + # each of which will later modify its internal dict of services on a per-item and often per-user basis. + # Therefore, it's critical that we make a new copy of our baseline services dict here, + # so that each runtime is free to add and replace its services without impacting other runtimes. + services = self.services.copy() # Only the CourseBlock can have user partitions. Therefore, creating the PartitionService with the library key # instead of the course key does not work. The XBlock validation in Studio fails with the following message: # "This component's access settings refer to deleted or invalid group configurations.". diff --git a/xmodule/partitions/partitions_service.py b/xmodule/partitions/partitions_service.py index 6cffd2c20c7b..ddd37d5212f5 100644 --- a/xmodule/partitions/partitions_service.py +++ b/xmodule/partitions/partitions_service.py @@ -99,8 +99,47 @@ class PartitionService: with a given course. """ - def __init__(self, course_id, cache=None, course=None): - self._course_id = course_id + def __init__(self, course_id: CourseKey, cache=None, course=None): + """Create a new ParititonService. This is user-specific.""" + + # There is a surprising amount of complexity in how to save the + # course_id we were passed in this constructor. + if course_id.org and course_id.course and course_id.run: + # This is the normal case, where we're instantiated with a CourseKey + # that has org, course, and run information. It will also often have + # a version_guid attached in this case, and we will want to strip + # that off in most cases. + # + # The reason for this is that the PartitionService is going to get + # recreated for every runtime (i.e. every block that's created for a + # user). Say you do the following: + # + # 1. You query the modulestore's get_item() for block A. + # 2. You update_item() for a different block B + # 3. You publish block B. + # + # When get_item() was called, a SplitModuleStoreRuntime was created + # for block A and it was given a CourseKey that had the version_guid + # encoded in it. If we persist that CourseKey with the version guid + # intact, then it will be incorrect after B is published, and any + # future access checks on A will break because it will try to query + # for a version of the course that is no longer published. + # + # Note that we still need to keep the branch information, or else + # this wouldn't work right in preview mode. + self._course_id = course_id.replace(version_guid=None) + else: + # If we're here, it means that the CourseKey we were sent doesn't + # have an org, course, and run. A much less common (but still legal) + # way to query by CourseKey involves a version_guid-only query, i.e. + # everything is None but the version_guid. In this scenario, it + # doesn't make sense to remove the one identifying piece of + # information we have, so we just assign the CourseKey without + # modification. We *could* potentially query the modulestore + # here and get the more normal form of the CourseKey, but that would + # be much more expensive and require database access. + self._course_id = course_id + self._cache = cache self.course = course