From b62b990ac711d31fe9b7e2e5a8b701516dd7479b Mon Sep 17 00:00:00 2001 From: mcarans Date: Fri, 2 Jan 2026 11:47:22 +1300 Subject: [PATCH 1/5] Use get_size_and_hash from HDX Python Utilities --- documentation/index.md | 12 ++++--- pyproject.toml | 2 +- requirements.txt | 20 +++++------ src/hdx/api/utilities/filestore_helper.py | 2 +- src/hdx/api/utilities/size_hash.py | 43 ----------------------- src/hdx/data/resource.py | 2 +- 6 files changed, 20 insertions(+), 61 deletions(-) delete mode 100644 src/hdx/api/utilities/size_hash.py diff --git a/documentation/index.md b/documentation/index.md index 09e3d66..074612b 100755 --- a/documentation/index.md +++ b/documentation/index.md @@ -54,6 +54,8 @@ The library has detailed API documentation which can be found in the menu at the ## Breaking Changes +From 6.5.7, get_size_and_hash moved to HDX Python Utilities + From 6.5.2, remove unused `generate_qc_resource_from_rows` method. `generate_resource_from_rows`, `generate_resource_from_iterable` and `download_and_generate_resource` are deprecated. They are replaced by @@ -906,11 +908,11 @@ corresponding id and that resource on HDX will be overwritten. resource.create_in_hdx(dataset=DATASET) -Alternatively, if a resource doesn't have an id, but contains a package_id, the create -and update methods will use it to load the corresponding dataset, the resource will be -assigned to that dataset and it will be compared to resources in that dataset. If a -match is found, then the resource will be given the corresponding id and that resource -on HDX will be overwritten. +Alternatively, if a resource doesn't have an id, but contains a package_id, the create +and update methods will use it to load the corresponding dataset, the resource will be +assigned to that dataset and it will be compared to resources in that dataset. If a +match is found, then the resource will be given the corresponding id and that resource +on HDX will be overwritten. You can download a resource using the **download** function eg. diff --git a/pyproject.toml b/pyproject.toml index aad7e57..667fefb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -38,7 +38,7 @@ dependencies = [ "defopt>=7.0.0", "email_validator", "hdx-python-country>=3.9.8", - "hdx-python-utilities>=3.9.6", + "hdx-python-utilities>=3.9.7", "libhxl>=5.2.2", "makefun", "quantulum3", diff --git a/requirements.txt b/requirements.txt index 2b3b81b..a6da385 100644 --- a/requirements.txt +++ b/requirements.txt @@ -32,7 +32,7 @@ click==8.3.1 # typer colorama==0.4.6 # via mkdocs-material -coverage==7.13.0 +coverage==7.13.1 # via pytest-cov defopt==7.0.0 # via hdx-python-api (pyproject.toml) @@ -44,7 +44,7 @@ docopt==0.6.2 # via # ckanapi # num2words -docutils==0.22.3 +docutils==0.22.4 # via defopt email-validator==2.3.0 # via hdx-python-api (pyproject.toml) @@ -66,11 +66,11 @@ gspread==6.2.1 # via hdx-python-api (pyproject.toml) hdx-python-country==3.9.8 # via hdx-python-api (pyproject.toml) -hdx-python-utilities==3.9.6 +hdx-python-utilities==3.9.7 # via # hdx-python-api (pyproject.toml) # hdx-python-country -humanize==4.14.0 +humanize==4.15.0 # via frictionless identify==2.6.15 # via pre-commit @@ -137,13 +137,13 @@ mkdocs==1.6.1 # mkdocs-material mkdocs-get-deps==0.2.0 # via mkdocs -mkdocs-material==9.7.0 +mkdocs-material==9.7.1 # via mkapi mkdocs-material-extensions==1.3.1 # via mkdocs-material more-itertools==10.8.0 # via inflect -nodeenv==1.9.1 +nodeenv==1.10.0 # via pre-commit num2words==0.5.14 # via quantulum3 @@ -192,7 +192,7 @@ pygments==2.19.2 # mkdocs-material # pytest # rich -pymdown-extensions==10.19.1 +pymdown-extensions==10.20 # via mkdocs-material pyphonetics==0.5.3 # via hdx-python-utilities @@ -259,9 +259,9 @@ rpds-py==0.30.0 # referencing rsa==4.9.1 # via google-auth -ruamel-yaml==0.18.17 +ruamel-yaml==0.19.0 # via hdx-python-utilities -ruamel-yaml-clib==0.2.15 +ruamel-yaml-clibz==0.3.4 # via ruamel-yaml setuptools==80.9.0 # via ckanapi @@ -291,7 +291,7 @@ text-unidecode==1.3 # via python-slugify typeguard==4.4.4 # via inflect -typer==0.20.0 +typer==0.21.0 # via frictionless typing-extensions==4.15.0 # via diff --git a/src/hdx/api/utilities/filestore_helper.py b/src/hdx/api/utilities/filestore_helper.py index b3fd298..23050c9 100755 --- a/src/hdx/api/utilities/filestore_helper.py +++ b/src/hdx/api/utilities/filestore_helper.py @@ -3,8 +3,8 @@ import logging from typing import TYPE_CHECKING, Any, Dict -from hdx.api.utilities.size_hash import get_size_and_hash from hdx.utilities.dateparse import now_utc_notz +from hdx.utilities.file_hashing import get_size_and_hash if TYPE_CHECKING: from hdx.data.resource import Resource diff --git a/src/hdx/api/utilities/size_hash.py b/src/hdx/api/utilities/size_hash.py deleted file mode 100644 index fc5137d..0000000 --- a/src/hdx/api/utilities/size_hash.py +++ /dev/null @@ -1,43 +0,0 @@ -import hashlib -from io import BytesIO -from typing import Tuple - -from openpyxl import load_workbook - - -def get_size_and_hash(file_to_upload: str, file_format: str) -> Tuple[int, str]: - """Return the size and hash of file to upload - - Args: - file_to_upload: File to upload - file_format (str): File format - - Returns: - Tuple[int, str]: Tuple (size, hash) - """ - f = open(file_to_upload, "rb") - md5hash = hashlib.md5() - if file_format == "xlsx": - first_chunk = f.read(4096) - size = len(first_chunk) - signature = first_chunk[:4] - if signature == b"PK\x03\x04": # xlsx - xlsxbuffer = bytearray(first_chunk) - while chunk := f.read(4096): - size += len(chunk) - xlsxbuffer.extend(chunk) - workbook = load_workbook(filename=BytesIO(xlsxbuffer), read_only=True) - for sheet_name in workbook.sheetnames: - sheet = workbook[sheet_name] - for cols in sheet.iter_rows(values_only=True): - md5hash.update(bytes(str(cols), "utf-8")) - workbook.close() - return size, md5hash.hexdigest() - else: - md5hash.update(first_chunk) - else: - size = 0 - while chunk := f.read(4096): - size += len(chunk) - md5hash.update(chunk) - return size, md5hash.hexdigest() diff --git a/src/hdx/data/resource.py b/src/hdx/data/resource.py index 5fea664..c413a29 100755 --- a/src/hdx/data/resource.py +++ b/src/hdx/data/resource.py @@ -11,11 +11,11 @@ import hdx.data.resource_matcher from hdx.api.configuration import Configuration from hdx.api.utilities.date_helper import DateHelper -from hdx.api.utilities.size_hash import get_size_and_hash from hdx.data.hdxobject import HDXError, HDXObject from hdx.data.resource_view import ResourceView from hdx.utilities.dateparse import now_utc, now_utc_notz, parse_date from hdx.utilities.downloader import Download +from hdx.utilities.file_hashing import get_size_and_hash from hdx.utilities.retriever import Retrieve from hdx.utilities.typehint import ListTuple from hdx.utilities.uuid import is_valid_uuid From 34a33ba4f9389b9a8a6f93f2c55b2184d75055c2 Mon Sep 17 00:00:00 2001 From: mcarans Date: Mon, 5 Jan 2026 17:16:45 +1300 Subject: [PATCH 2/5] Use get_size_and_hash from HDX Python Utilities --- requirements.txt | 8 +++----- tests/hdx/api/utilities/test_size_hash.py | 12 ------------ 2 files changed, 3 insertions(+), 17 deletions(-) delete mode 100644 tests/hdx/api/utilities/test_size_hash.py diff --git a/requirements.txt b/requirements.txt index a6da385..297bcf1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -16,7 +16,7 @@ backrefs==6.1 # via mkdocs-material cachetools==6.2.4 # via google-auth -certifi==2025.11.12 +certifi==2026.1.4 # via requests cfgv==3.5.0 # via pre-commit @@ -50,7 +50,7 @@ email-validator==2.3.0 # via hdx-python-api (pyproject.toml) et-xmlfile==2.0.0 # via openpyxl -filelock==3.20.1 +filelock==3.20.2 # via virtualenv frictionless==5.18.1 # via hdx-python-utilities @@ -259,10 +259,8 @@ rpds-py==0.30.0 # referencing rsa==4.9.1 # via google-auth -ruamel-yaml==0.19.0 +ruamel-yaml==0.19.1 # via hdx-python-utilities -ruamel-yaml-clibz==0.3.4 - # via ruamel-yaml setuptools==80.9.0 # via ckanapi shellingham==1.5.4 diff --git a/tests/hdx/api/utilities/test_size_hash.py b/tests/hdx/api/utilities/test_size_hash.py deleted file mode 100644 index ef938ad..0000000 --- a/tests/hdx/api/utilities/test_size_hash.py +++ /dev/null @@ -1,12 +0,0 @@ -from hdx.api.utilities.size_hash import get_size_and_hash - - -class TestSizeHash: - def test_get_size_and_hash(self, test_data, test_xlsx): - size, hash = get_size_and_hash(test_data, "csv") - assert size == 1548 - assert hash == "3790da698479326339fa99a074cbc1f7" - - size, hash = get_size_and_hash(test_xlsx, "xlsx") - assert size == 23724 - assert hash == "6b8acf7e28d62685a1e829e7fa220d17" From 54a49d99fc1c0f6e8a4c6a3d34af1752a0b8a8dd Mon Sep 17 00:00:00 2001 From: mcarans Date: Tue, 6 Jan 2026 10:47:48 +1300 Subject: [PATCH 3/5] Update tests --- pyproject.toml | 2 +- requirements.txt | 6 +++--- tests/hdx/__init__.py | 2 +- tests/hdx/data/test_dataset_core.py | 4 ++-- tests/hdx/data/test_update_dataset_resources.py | 2 +- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 667fefb..7729933 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -38,7 +38,7 @@ dependencies = [ "defopt>=7.0.0", "email_validator", "hdx-python-country>=3.9.8", - "hdx-python-utilities>=3.9.7", + "hdx-python-utilities>=3.9.9", "libhxl>=5.2.2", "makefun", "quantulum3", diff --git a/requirements.txt b/requirements.txt index 297bcf1..946019c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -56,7 +56,7 @@ frictionless==5.18.1 # via hdx-python-utilities ghp-import==2.1.0 # via mkdocs -google-auth==2.45.0 +google-auth==2.46.0 # via # google-auth-oauthlib # gspread @@ -66,7 +66,7 @@ gspread==6.2.1 # via hdx-python-api (pyproject.toml) hdx-python-country==3.9.8 # via hdx-python-api (pyproject.toml) -hdx-python-utilities==3.9.7 +hdx-python-utilities==3.9.9 # via # hdx-python-api (pyproject.toml) # hdx-python-country @@ -117,7 +117,7 @@ markdown==3.10 # pymdown-extensions markdown-it-py==4.0.0 # via rich -marko==2.2.1 +marko==2.2.2 # via frictionless markupsafe==3.0.3 # via diff --git a/tests/hdx/__init__.py b/tests/hdx/__init__.py index f4d475a..2c3dde6 100755 --- a/tests/hdx/__init__.py +++ b/tests/hdx/__init__.py @@ -299,7 +299,7 @@ def dataset_mockshow(url, datadict): # test existing size and hash same resource = resultdictcopy["resources"][0] resource["size"] = 23724 - resource["hash"] = "6b8acf7e28d62685a1e829e7fa220d17" + resource["hash"] = "b2f92ef4b1c895568421cb887859a13d" result = json.dumps(resultdictcopy) return MockResponse( 200, diff --git a/tests/hdx/data/test_dataset_core.py b/tests/hdx/data/test_dataset_core.py index 66ceb3d..9e8081c 100755 --- a/tests/hdx/data/test_dataset_core.py +++ b/tests/hdx/data/test_dataset_core.py @@ -774,7 +774,7 @@ def test_update_in_hdx(self, configuration, post_update, date_pattern, test_xlsx assert len(dataset._resources) == 3 result = dataset.get_resource() assert result["size"] == 23724 - assert result["hash"] == "6b8acf7e28d62685a1e829e7fa220d17" + assert result["hash"] == "b2f92ef4b1c895568421cb887859a13d" assert statuses == {"Resource1": 2, "Resource2": 1, "Resource3": 1} resource["name"] = "123" resource.set_file_to_upload(None) @@ -848,7 +848,7 @@ def test_update_in_hdx(self, configuration, post_update, date_pattern, test_xlsx } result = dataset.get_resource(2) assert result["size"] == 23724 - assert result["hash"] == "6b8acf7e28d62685a1e829e7fa220d17" + assert result["hash"] == "b2f92ef4b1c895568421cb887859a13d" assert dataset["state"] == "active" assert len(dataset._resources) == 3 dataset = Dataset(datasetdata) diff --git a/tests/hdx/data/test_update_dataset_resources.py b/tests/hdx/data/test_update_dataset_resources.py index 20a36ed..c7a4a30 100644 --- a/tests/hdx/data/test_update_dataset_resources.py +++ b/tests/hdx/data/test_update_dataset_resources.py @@ -326,7 +326,7 @@ def test_dataset_update_resources_position( { "description": "test2", "format": "xlsx", - "hash": "6b8acf7e28d62685a1e829e7fa220d17", + "hash": "b2f92ef4b1c895568421cb887859a13d", "name": "test2", "resource_type": "file.upload", "size": 23724, From e1eca823dced0c2d0290032ec04c85f1a8c0e425 Mon Sep 17 00:00:00 2001 From: mcarans Date: Tue, 6 Jan 2026 13:13:32 +1300 Subject: [PATCH 4/5] Don't check size when updating resources - just check hash --- src/hdx/api/utilities/filestore_helper.py | 6 +----- src/hdx/data/resource.py | 6 +----- 2 files changed, 2 insertions(+), 10 deletions(-) diff --git a/src/hdx/api/utilities/filestore_helper.py b/src/hdx/api/utilities/filestore_helper.py index 23050c9..a079b47 100755 --- a/src/hdx/api/utilities/filestore_helper.py +++ b/src/hdx/api/utilities/filestore_helper.py @@ -112,11 +112,7 @@ def dataset_update_filestore_resource( force_update = kwargs.pop("force_update", False) file_format = resource_data_to_update.get("format", "").lower() size, hash = get_size_and_hash(file_to_upload, file_format) - if ( - not force_update - and size == original_resource_data.get("size") - and hash == original_resource_data.get("hash") - ): + if not force_update and hash == original_resource_data.get("hash"): logger.warning( f"Not updating filestore for resource {original_resource_data['name']} as size and hash unchanged!" ) diff --git a/src/hdx/data/resource.py b/src/hdx/data/resource.py index c413a29..b1da57b 100755 --- a/src/hdx/data/resource.py +++ b/src/hdx/data/resource.py @@ -415,11 +415,7 @@ def _resource_merge_hdx_update( force_update = kwargs.pop("force_update", False) file_format = self._old_data.get("format", "").lower() size, hash = get_size_and_hash(self._file_to_upload, file_format) - if ( - not force_update - and size == self.data.get("size") - and hash == self.data.get("hash") - ): + if not force_update and hash == self.data.get("hash"): logger.warning( f"Not updating filestore for resource {self.data['name']} as size and hash unchanged!" ) From 129b37f536e65135d8c7248d817591c9b6461a28 Mon Sep 17 00:00:00 2001 From: mcarans Date: Tue, 6 Jan 2026 14:13:35 +1300 Subject: [PATCH 5/5] Don't check size when updating resources - just check hash --- src/hdx/api/utilities/filestore_helper.py | 2 +- src/hdx/data/resource.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/hdx/api/utilities/filestore_helper.py b/src/hdx/api/utilities/filestore_helper.py index a079b47..dea5258 100755 --- a/src/hdx/api/utilities/filestore_helper.py +++ b/src/hdx/api/utilities/filestore_helper.py @@ -114,7 +114,7 @@ def dataset_update_filestore_resource( size, hash = get_size_and_hash(file_to_upload, file_format) if not force_update and hash == original_resource_data.get("hash"): logger.warning( - f"Not updating filestore for resource {original_resource_data['name']} as size and hash unchanged!" + f"Not updating filestore for resource {original_resource_data['name']} as hash unchanged!" ) if resource_data_to_update._url_backup: resource_data_to_update["url"] = resource_data_to_update._url_backup diff --git a/src/hdx/data/resource.py b/src/hdx/data/resource.py index b1da57b..9212b6c 100755 --- a/src/hdx/data/resource.py +++ b/src/hdx/data/resource.py @@ -417,7 +417,7 @@ def _resource_merge_hdx_update( size, hash = get_size_and_hash(self._file_to_upload, file_format) if not force_update and hash == self.data.get("hash"): logger.warning( - f"Not updating filestore for resource {self.data['name']} as size and hash unchanged!" + f"Not updating filestore for resource {self.data['name']} as hash unchanged!" ) if self._url_backup: self._old_data["url"] = self._url_backup