diff --git a/documentation/plugins.rst b/documentation/plugins.rst index 1af39426..f3131224 100644 --- a/documentation/plugins.rst +++ b/documentation/plugins.rst @@ -3,7 +3,6 @@ Developing plugins for MMIF Python SDK ====================================== - Overview -------- @@ -80,10 +79,41 @@ And the plugin code. def help(): return "location format: `.video`" - - -Bulit-in Document Location Scheme Plugins +Built-in Document Location Scheme Plugins ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -At the moment, ``mmif-python`` PyPI distribution ships a built-in *docloc* plugin that support both ``http`` and ``https`` schemes. +At the moment, the ``mmif-python`` PyPI distribution ships a built-in *docloc* plugin that supports both ``http`` and ``https`` schemes. This plugin implements caching as described below, so repeated access to the same URL will not trigger multiple downloads. Take a look at :mod:`mmif_docloc_http` module for details. + +Caching for Remote File Access +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +When developing plugins that resolve remote document locations (e.g., ``http``, ``s3``, or custom schemes), it is highly recommended to implement caching to avoid repeated network requests or file downloads. Since ``mmif-python`` may call the ``resolve`` function multiple times for the same document location during processing, caching can significantly improve performance. + +A simple and effective approach is to use a module-level dictionary as a cache. Because Python modules are singletons (loaded once and cached in ``sys.modules``), this cache persists for the entire lifetime of the Python process, across multiple MMIF files and Document objects. + +Here's an example of how to implement caching in a plugin: + +.. code-block:: python + + # mmif_docloc_myscheme/__init__.py + + _cache = {} + + def resolve(docloc): + if docloc in _cache: + return _cache[docloc] + + # ... your resolution logic here ... 
+ resolved_path = do_actual_resolution(docloc) + + _cache[docloc] = resolved_path + return resolved_path + +This pattern ensures that: + +* The first call to ``resolve`` performs the actual resolution (download, API call, etc.) +* Subsequent calls for the same location return the cached result immediately +* The cache is shared across all MMIF objects processed within the same Python process + +See :mod:`mmif_docloc_http` for a concrete example of this caching strategy in action. diff --git a/mmif_docloc_http/__init__.py b/mmif_docloc_http/__init__.py index 9bdf9f22..f92c87ba 100644 --- a/mmif_docloc_http/__init__.py +++ b/mmif_docloc_http/__init__.py @@ -1,16 +1,22 @@ import urllib.request import urllib.error +_cache = {} + def resolve(docloc): + if docloc in _cache: + return _cache[docloc] try: if docloc.startswith('http://') or docloc.startswith('https://'): - return urllib.request.urlretrieve(docloc)[0] + path = urllib.request.urlretrieve(docloc)[0] + _cache[docloc] = path + return path else: raise ValueError(f'cannot handle document location scheme: {docloc}') except urllib.error.URLError as e: raise e - - + + def help(): return "location must be a URL string." 
diff --git a/tests/test_serialize.py b/tests/test_serialize.py index b0836c5a..9e857a00 100644 --- a/tests/test_serialize.py +++ b/tests/test_serialize.py @@ -269,6 +269,21 @@ def test_document_location_helpers_http(self): # round_trip = Document(new_doc.serialize()) self.assertEqual(Document(new_doc.serialize()).serialize(), new_doc.serialize()) + def test_document_location_http_caching(self): + import mmif_docloc_http + mmif_docloc_http._cache.clear() + test_url = "https://example.com/" + self.assertNotIn(test_url, mmif_docloc_http._cache) + new_doc = Document() + new_doc.id = "d1" + new_doc.location = test_url + new_doc.location_path() + self.assertIn(test_url, mmif_docloc_http._cache) + # second call should use cache (same path returned) + cached_path = mmif_docloc_http._cache[test_url] + second_path = new_doc.location_path() + self.assertEqual(cached_path, second_path) + def test_get_documents_locations(self): mmif_obj = Mmif(MMIF_EXAMPLES['everything']) self.assertEqual(1, len(mmif_obj.get_documents_locations(DocumentTypes.VideoDocument)))