Skip to content

Commit 51730f4

Browse files
authored
HDXDSYS-2546 Fix hash checking for zipped files in HDX Python API (#99)
* Use get_size_and_hash from HDX Python Utilities * Update tests * Don't check size when updating resources - just check hash
1 parent 0999a7a commit 51730f4

10 files changed

Lines changed: 31 additions & 94 deletions

File tree

documentation/index.md

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,8 @@ The library has detailed API documentation which can be found in the menu at the
5454

5555

5656
## Breaking Changes
57+
From 6.5.7, `get_size_and_hash` has moved to HDX Python Utilities (import it from `hdx.utilities.file_hashing`).
58+
5759
From 6.5.2, removed the unused `generate_qc_resource_from_rows` method.
5860
`generate_resource_from_rows`, `generate_resource_from_iterable` and
5961
`download_and_generate_resource` are deprecated. They are replaced by
@@ -906,11 +908,11 @@ corresponding id and that resource on HDX will be overwritten.
906908

907909
resource.create_in_hdx(dataset=DATASET)
908910

909-
Alternatively, if a resource doesn't have an id, but contains a package_id, the create
910-
and update methods will use it to load the corresponding dataset, the resource will be
911-
assigned to that dataset and it will be compared to resources in that dataset. If a
912-
match is found, then the resource will be given the corresponding id and that resource
913-
on HDX will be overwritten.
911+
Alternatively, if a resource doesn't have an id, but contains a package_id, the create
912+
and update methods will use it to load the corresponding dataset, the resource will be
913+
assigned to that dataset and it will be compared to resources in that dataset. If a
914+
match is found, then the resource will be given the corresponding id and that resource
915+
on HDX will be overwritten.
914916

915917
You can download a resource using the **download** function eg.
916918

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ dependencies = [
3838
"defopt>=7.0.0",
3939
"email_validator",
4040
"hdx-python-country>=3.9.8",
41-
"hdx-python-utilities>=3.9.6",
41+
"hdx-python-utilities>=3.9.9",
4242
"libhxl>=5.2.2",
4343
"makefun",
4444
"quantulum3",

requirements.txt

Lines changed: 13 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ backrefs==6.1
1616
# via mkdocs-material
1717
cachetools==6.2.4
1818
# via google-auth
19-
certifi==2025.11.12
19+
certifi==2026.1.4
2020
# via requests
2121
cfgv==3.5.0
2222
# via pre-commit
@@ -32,7 +32,7 @@ click==8.3.1
3232
# typer
3333
colorama==0.4.6
3434
# via mkdocs-material
35-
coverage==7.13.0
35+
coverage==7.13.1
3636
# via pytest-cov
3737
defopt==7.0.0
3838
# via hdx-python-api (pyproject.toml)
@@ -44,19 +44,19 @@ docopt==0.6.2
4444
# via
4545
# ckanapi
4646
# num2words
47-
docutils==0.22.3
47+
docutils==0.22.4
4848
# via defopt
4949
email-validator==2.3.0
5050
# via hdx-python-api (pyproject.toml)
5151
et-xmlfile==2.0.0
5252
# via openpyxl
53-
filelock==3.20.1
53+
filelock==3.20.2
5454
# via virtualenv
5555
frictionless==5.18.1
5656
# via hdx-python-utilities
5757
ghp-import==2.1.0
5858
# via mkdocs
59-
google-auth==2.45.0
59+
google-auth==2.46.0
6060
# via
6161
# google-auth-oauthlib
6262
# gspread
@@ -66,11 +66,11 @@ gspread==6.2.1
6666
# via hdx-python-api (pyproject.toml)
6767
hdx-python-country==3.9.8
6868
# via hdx-python-api (pyproject.toml)
69-
hdx-python-utilities==3.9.6
69+
hdx-python-utilities==3.9.9
7070
# via
7171
# hdx-python-api (pyproject.toml)
7272
# hdx-python-country
73-
humanize==4.14.0
73+
humanize==4.15.0
7474
# via frictionless
7575
identify==2.6.15
7676
# via pre-commit
@@ -117,7 +117,7 @@ markdown==3.10
117117
# pymdown-extensions
118118
markdown-it-py==4.0.0
119119
# via rich
120-
marko==2.2.1
120+
marko==2.2.2
121121
# via frictionless
122122
markupsafe==3.0.3
123123
# via
@@ -137,13 +137,13 @@ mkdocs==1.6.1
137137
# mkdocs-material
138138
mkdocs-get-deps==0.2.0
139139
# via mkdocs
140-
mkdocs-material==9.7.0
140+
mkdocs-material==9.7.1
141141
# via mkapi
142142
mkdocs-material-extensions==1.3.1
143143
# via mkdocs-material
144144
more-itertools==10.8.0
145145
# via inflect
146-
nodeenv==1.9.1
146+
nodeenv==1.10.0
147147
# via pre-commit
148148
num2words==0.5.14
149149
# via quantulum3
@@ -192,7 +192,7 @@ pygments==2.19.2
192192
# mkdocs-material
193193
# pytest
194194
# rich
195-
pymdown-extensions==10.19.1
195+
pymdown-extensions==10.20
196196
# via mkdocs-material
197197
pyphonetics==0.5.3
198198
# via hdx-python-utilities
@@ -259,10 +259,8 @@ rpds-py==0.30.0
259259
# referencing
260260
rsa==4.9.1
261261
# via google-auth
262-
ruamel-yaml==0.18.17
262+
ruamel-yaml==0.19.1
263263
# via hdx-python-utilities
264-
ruamel-yaml-clib==0.2.15
265-
# via ruamel-yaml
266264
setuptools==80.9.0
267265
# via ckanapi
268266
shellingham==1.5.4
@@ -291,7 +289,7 @@ text-unidecode==1.3
291289
# via python-slugify
292290
typeguard==4.4.4
293291
# via inflect
294-
typer==0.20.0
292+
typer==0.21.0
295293
# via frictionless
296294
typing-extensions==4.15.0
297295
# via

src/hdx/api/utilities/filestore_helper.py

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,8 @@
33
import logging
44
from typing import TYPE_CHECKING, Any, Dict
55

6-
from hdx.api.utilities.size_hash import get_size_and_hash
76
from hdx.utilities.dateparse import now_utc_notz
7+
from hdx.utilities.file_hashing import get_size_and_hash
88

99
if TYPE_CHECKING:
1010
from hdx.data.resource import Resource
@@ -112,13 +112,9 @@ def dataset_update_filestore_resource(
112112
force_update = kwargs.pop("force_update", False)
113113
file_format = resource_data_to_update.get("format", "").lower()
114114
size, hash = get_size_and_hash(file_to_upload, file_format)
115-
if (
116-
not force_update
117-
and size == original_resource_data.get("size")
118-
and hash == original_resource_data.get("hash")
119-
):
115+
if not force_update and hash == original_resource_data.get("hash"):
120116
logger.warning(
121-
f"Not updating filestore for resource {original_resource_data['name']} as size and hash unchanged!"
117+
f"Not updating filestore for resource {original_resource_data['name']} as hash unchanged!"
122118
)
123119
if resource_data_to_update._url_backup:
124120
resource_data_to_update["url"] = resource_data_to_update._url_backup

src/hdx/api/utilities/size_hash.py

Lines changed: 0 additions & 43 deletions
This file was deleted.

src/hdx/data/resource.py

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -11,11 +11,11 @@
1111
import hdx.data.resource_matcher
1212
from hdx.api.configuration import Configuration
1313
from hdx.api.utilities.date_helper import DateHelper
14-
from hdx.api.utilities.size_hash import get_size_and_hash
1514
from hdx.data.hdxobject import HDXError, HDXObject
1615
from hdx.data.resource_view import ResourceView
1716
from hdx.utilities.dateparse import now_utc, now_utc_notz, parse_date
1817
from hdx.utilities.downloader import Download
18+
from hdx.utilities.file_hashing import get_size_and_hash
1919
from hdx.utilities.retriever import Retrieve
2020
from hdx.utilities.typehint import ListTuple
2121
from hdx.utilities.uuid import is_valid_uuid
@@ -415,13 +415,9 @@ def _resource_merge_hdx_update(
415415
force_update = kwargs.pop("force_update", False)
416416
file_format = self._old_data.get("format", "").lower()
417417
size, hash = get_size_and_hash(self._file_to_upload, file_format)
418-
if (
419-
not force_update
420-
and size == self.data.get("size")
421-
and hash == self.data.get("hash")
422-
):
418+
if not force_update and hash == self.data.get("hash"):
423419
logger.warning(
424-
f"Not updating filestore for resource {self.data['name']} as size and hash unchanged!"
420+
f"Not updating filestore for resource {self.data['name']} as hash unchanged!"
425421
)
426422
if self._url_backup:
427423
self._old_data["url"] = self._url_backup

tests/hdx/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -299,7 +299,7 @@ def dataset_mockshow(url, datadict):
299299
# test existing size and hash same
300300
resource = resultdictcopy["resources"][0]
301301
resource["size"] = 23724
302-
resource["hash"] = "6b8acf7e28d62685a1e829e7fa220d17"
302+
resource["hash"] = "b2f92ef4b1c895568421cb887859a13d"
303303
result = json.dumps(resultdictcopy)
304304
return MockResponse(
305305
200,

tests/hdx/api/utilities/test_size_hash.py

Lines changed: 0 additions & 12 deletions
This file was deleted.

tests/hdx/data/test_dataset_core.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -774,7 +774,7 @@ def test_update_in_hdx(self, configuration, post_update, date_pattern, test_xlsx
774774
assert len(dataset._resources) == 3
775775
result = dataset.get_resource()
776776
assert result["size"] == 23724
777-
assert result["hash"] == "6b8acf7e28d62685a1e829e7fa220d17"
777+
assert result["hash"] == "b2f92ef4b1c895568421cb887859a13d"
778778
assert statuses == {"Resource1": 2, "Resource2": 1, "Resource3": 1}
779779
resource["name"] = "123"
780780
resource.set_file_to_upload(None)
@@ -848,7 +848,7 @@ def test_update_in_hdx(self, configuration, post_update, date_pattern, test_xlsx
848848
}
849849
result = dataset.get_resource(2)
850850
assert result["size"] == 23724
851-
assert result["hash"] == "6b8acf7e28d62685a1e829e7fa220d17"
851+
assert result["hash"] == "b2f92ef4b1c895568421cb887859a13d"
852852
assert dataset["state"] == "active"
853853
assert len(dataset._resources) == 3
854854
dataset = Dataset(datasetdata)

tests/hdx/data/test_update_dataset_resources.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -326,7 +326,7 @@ def test_dataset_update_resources_position(
326326
{
327327
"description": "test2",
328328
"format": "xlsx",
329-
"hash": "6b8acf7e28d62685a1e829e7fa220d17",
329+
"hash": "b2f92ef4b1c895568421cb887859a13d",
330330
"name": "test2",
331331
"resource_type": "file.upload",
332332
"size": 23724,

0 commit comments

Comments
 (0)