Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions documentation/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,10 @@ The library has detailed API documentation which can be found in the menu at the


## Breaking Changes
From 6.5.0, files will not be uploaded to the HDX filestore if the hash and size have
not changed. Any resource metadata changes (except for last_modified) will still take
place.

From 6.4.5, fix for changes in dependency defopt 7.0.0

From 6.2.8, fix mark_data_updated which was broken due to an error in
Expand Down
29 changes: 15 additions & 14 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -22,17 +22,18 @@ cfgv==3.4.0
# via pre-commit
chardet==5.2.0
# via frictionless
charset-normalizer==3.4.2
charset-normalizer==3.4.3
# via requests
ckanapi==4.8
# via hdx-python-api (pyproject.toml)
click==8.2.1
# via
# mkdocs
# mkdocs-material
# typer
colorama==0.4.6
# via mkdocs-material
coverage==7.10.2
coverage==7.10.4
# via pytest-cov
defopt==7.0.0
# via hdx-python-api (pyproject.toml)
Expand All @@ -50,7 +51,7 @@ email-validator==2.2.0
# via hdx-python-api (pyproject.toml)
et-xmlfile==2.0.0
# via openpyxl
filelock==3.18.0
filelock==3.19.1
# via virtualenv
frictionless==5.18.1
# via hdx-python-utilities
Expand All @@ -72,7 +73,7 @@ hdx-python-utilities==3.9.0
# hdx-python-country
humanize==4.12.3
# via frictionless
identify==2.6.12
identify==2.6.13
# via pre-commit
idna==3.10
# via
Expand All @@ -96,7 +97,7 @@ jsonlines==4.0.0
# via hdx-python-utilities
jsonpath-ng==1.7.0
# via libhxl
jsonschema==4.25.0
jsonschema==4.25.1
# via
# frictionless
# tableschema-to-template
Expand All @@ -115,9 +116,9 @@ markdown==3.8.2
# mkdocs
# mkdocs-material
# pymdown-extensions
markdown-it-py==3.0.0
markdown-it-py==4.0.0
# via rich
marko==2.1.4
marko==2.2.0
# via frictionless
markupsafe==3.0.2
# via
Expand All @@ -137,7 +138,7 @@ mkdocs==1.6.1
# mkdocs-material
mkdocs-get-deps==0.2.0
# via mkdocs
mkdocs-material==9.6.16
mkdocs-material==9.6.17
# via mkapi
mkdocs-material-extensions==1.3.1
# via mkdocs-material
Expand Down Expand Up @@ -175,7 +176,7 @@ ply==3.11
# libhxl
pockets==0.9.1
# via sphinxcontrib-napoleon
pre-commit==4.2.0
pre-commit==4.3.0
# via hdx-python-api (pyproject.toml)
pyasn1==0.6.1
# via
Expand Down Expand Up @@ -236,7 +237,7 @@ referencing==0.36.2
# via
# jsonschema
# jsonschema-specifications
requests==2.32.4
requests==2.32.5
# via
# hdx-python-api (pyproject.toml)
# ckanapi
Expand All @@ -253,13 +254,13 @@ rfc3986==2.0.0
# via frictionless
rich==14.1.0
# via typer
rpds-py==0.26.0
rpds-py==0.27.0
# via
# jsonschema
# referencing
rsa==4.9.1
# via google-auth
ruamel-yaml==0.18.14
ruamel-yaml==0.18.15
# via hdx-python-utilities
ruamel-yaml-clib==0.2.12
# via ruamel-yaml
Expand Down Expand Up @@ -291,7 +292,7 @@ text-unidecode==1.3
# via python-slugify
typeguard==4.4.4
# via inflect
typer==0.16.0
typer==0.16.1
# via frictionless
typing-extensions==4.14.1
# via
Expand All @@ -313,7 +314,7 @@ urllib3==2.5.0
# requests
validators==0.35.0
# via frictionless
virtualenv==20.33.1
virtualenv==20.34.0
# via pre-commit
watchdog==6.0.0
# via mkdocs
Expand Down
2 changes: 1 addition & 1 deletion src/hdx/api/configuration.py
Original file line number Diff line number Diff line change
Expand Up @@ -350,7 +350,7 @@ def call_remoteckan(self, *args: Any, **kwargs: Any) -> Dict:
Dict: The response from the remote CKAN call_action method

"""
requests_kwargs = kwargs.get("requests_kwargs", dict())
requests_kwargs = kwargs.get("requests_kwargs", {})
credentials = self._get_credentials()
if credentials:
requests_kwargs["auth"] = credentials
Expand Down
8 changes: 6 additions & 2 deletions src/hdx/api/hdx_base_configuration.yaml
Original file line number Diff line number Diff line change
@@ -1,14 +1,18 @@
# HDX configuration
hdx_prod_site:
url: "https://data.humdata.org"
hdx_demo_site:
url: "https://demo.data-humdata-org.ahconu.org"
hdx_stage_site:
url: "https://stage.data-humdata-org.ahconu.org"
hdx_feature_site:
url: "https://feature.data-humdata-org.ahconu.org"
hdx_dev_site:
url: "https://dev.data-humdata-org.ahconu.org"
hdx_demo_site:
url: "https://demo.data-humdata-org.ahconu.org"
hdx_bluedemo_site:
url: "https://blue.demo.data-humdata-org.ahconu.org"
hdx_greendemo_site:
url: "https://green.demo.data-humdata-org.ahconu.org"
dataset:
required_fields:
- name
Expand Down
104 changes: 83 additions & 21 deletions src/hdx/api/utilities/filestore_helper.py
Original file line number Diff line number Diff line change
@@ -1,35 +1,34 @@
"""Helper to the Dataset class for handling resources with filestores."""

import logging
from typing import TYPE_CHECKING, Any, Dict

from hdx.api.utilities.size_hash import get_size_and_hash
from hdx.utilities.dateparse import now_utc_notz

if TYPE_CHECKING:
from hdx.data.resource import Resource

logger = logging.getLogger(__name__)


class FilestoreHelper:
temporary_url = "updated_by_file_upload_step"

@staticmethod
def resource_check_required_fields(
resource: "Resource", check_upload: bool = False, **kwargs: Any
) -> None:
def resource_check_required_fields(resource: "Resource", **kwargs: Any) -> None:
"""Helper method to get ignore_fields from kwargs if it exists and add package_id

Args:
resource (Resource): Resource to check
check_upload (bool): Whether to check for file upload. Defaults to False.
**kwargs: Keyword arguments

Returns:
None
"""
if "ignore_check" in kwargs: # allow ignoring of field checks
return
if check_upload and resource.get_file_to_upload() and "url" in resource.data:
del resource.data["url"]
ignore_fields = kwargs.get("ignore_fields", list())
ignore_fields = kwargs.get("ignore_fields", [])
resource_ignore_fields = []
for ignore_field in ignore_fields:
if ignore_field.startswith("resource:"):
Expand All @@ -46,49 +45,112 @@ def check_filestore_resource(
filestore_resources: Dict[int, str],
resource_index: int,
**kwargs: Any,
) -> None:
) -> int:
"""Helper method to add new resource from dataset including filestore.
Returns status code where:
0 = no file to upload and last_modified set to now
(resource creation or data_updated flag is True),
1 = no file to upload and data_updated flag is False,
2 = file uploaded to filestore (resource creation or either hash or size of file
has changed),
3 = file not uploaded to filestore (hash and size of file are the same),
4 = file not uploaded (hash, size unchanged), given last_modified ignored

Args:
resource_data_to_update (Resource): Updated resource from dataset
filestore_resources (Dict[int, str]): List of (index of resource, file to upload)
resource_index (int): Index of resource

Returns:
None
int: Status code
"""
resource_data_to_update.set_types()
resource_data_to_update.correct_format(resource_data_to_update.data)
cls.resource_check_required_fields(resource_data_to_update, **kwargs)
file_to_upload = resource_data_to_update.get_file_to_upload()
if file_to_upload:
file_format = resource_data_to_update.get("format", "").lower()
size, hash = get_size_and_hash(file_to_upload, file_format)
filestore_resources[resource_index] = file_to_upload
resource_data_to_update["url"] = cls.temporary_url
resource_data_to_update["size"] = size
resource_data_to_update["hash"] = hash
return 2
return 0

@classmethod
def dataset_update_filestore_resource(
cls,
original_resource_data: "Resource",
resource_data_to_update: "Resource",
filestore_resources: Dict[int, str],
resource_index: int,
) -> None:
) -> int:
"""Helper method to merge updated resource from dataset into HDX resource read from HDX including filestore.
Returns status code where:
0 = no file to upload and last_modified set to now
(resource creation or data_updated flag is True),
1 = no file to upload and data_updated flag is False,
2 = file uploaded to filestore (resource creation or either hash or size of file
has changed),
3 = file not uploaded to filestore (hash and size of file are the same),
4 = file not uploaded (hash, size unchanged), given last_modified ignored

Args:
original_resource_data (Resource): Original resource from dataset
resource_data_to_update (Resource): Updated resource from dataset
filestore_resources (Dict[int, str]): List of (index of resources, file to upload)
resource_index (int): Index of resource

Returns:
None
int: Status code
"""
file_to_upload = resource_data_to_update.get_file_to_upload()
if file_to_upload:
filestore_resources[resource_index] = file_to_upload
resource_data_to_update["url"] = cls.temporary_url

data_updated = resource_data_to_update.is_marked_data_updated()
if data_updated:
# Should not output timezone info here
resource_data_to_update["last_modified"] = now_utc_notz().isoformat(
timespec="microseconds"
)
resource_data_to_update.data_updated = False
file_format = resource_data_to_update.get("format", "").lower()
size, hash = get_size_and_hash(file_to_upload, file_format)
if size == original_resource_data.get(
"size"
) and hash == original_resource_data.get("hash"):
logger.warning(
f"Not updating filestore for resource {original_resource_data['name']} as size and hash unchanged!"
)
if resource_data_to_update._url_backup:
resource_data_to_update["url"] = resource_data_to_update._url_backup
resource_data_to_update._url_backup = None
# ensure last_modified is not updated if file hasn't changed
if "last_modified" in resource_data_to_update:
del resource_data_to_update["last_modified"]
status = 4
else:
status = 3
else:
# update file if size or hash has changed
filestore_resources[resource_index] = file_to_upload
resource_data_to_update["resource_type"] = "file.upload"
resource_data_to_update["url_type"] = "upload"
if "tracking_summary" in resource_data_to_update:
del resource_data_to_update["tracking_summary"]
resource_data_to_update["url"] = cls.temporary_url
resource_data_to_update["size"] = size
resource_data_to_update["hash"] = hash
resource_data_to_update._url_backup = None
status = 2
else:
if (
"url" in resource_data_to_update
and resource_data_to_update.get("url_type") != "upload"
):
resource_data_to_update["resource_type"] = "api"
resource_data_to_update["url_type"] = "api"
if resource_data_to_update.is_marked_data_updated():
# Should not output timezone info here
resource_data_to_update["last_modified"] = now_utc_notz().isoformat(
timespec="microseconds"
)
resource_data_to_update._data_updated = False
status = 0
else:
status = 1
resource_data_to_update.correct_format(resource_data_to_update.data)
return status
43 changes: 43 additions & 0 deletions src/hdx/api/utilities/size_hash.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
import hashlib
from io import BytesIO
from typing import Tuple

from openpyxl import load_workbook


def get_size_and_hash(file_to_upload: str, file_format: str) -> Tuple[int, str]:
    """Return the size in bytes and MD5 hash of the file to upload.

    For xlsx files (detected by the PK zip signature in the first four
    bytes), the hash is computed over the cell values of every sheet rather
    than the raw bytes, so that re-saves which change only zip metadata do
    not alter the hash. All other files — including files declared as xlsx
    that lack the zip signature — are hashed over their raw bytes.

    Args:
        file_to_upload (str): Path of file to upload
        file_format (str): File format in lower case, e.g. "xlsx", "csv"

    Returns:
        Tuple[int, str]: Tuple (size in bytes, MD5 hex digest)
    """
    md5hash = hashlib.md5()
    # context manager ensures the file handle is closed on every path
    # (the original leaked it, notably via the early return below)
    with open(file_to_upload, "rb") as f:
        first_chunk = f.read(4096)
        size = len(first_chunk)
        if file_format == "xlsx" and first_chunk[:4] == b"PK\x03\x04":
            # genuine xlsx (zip) file: hash sheet contents, not raw bytes
            xlsxbuffer = bytearray(first_chunk)
            while chunk := f.read(4096):
                size += len(chunk)
                xlsxbuffer.extend(chunk)
            workbook = load_workbook(filename=BytesIO(xlsxbuffer), read_only=True)
            for sheet_name in workbook.sheetnames:
                sheet = workbook[sheet_name]
                for cols in sheet.iter_rows(values_only=True):
                    md5hash.update(bytes(str(cols), "utf-8"))
            workbook.close()
        else:
            # raw-byte path; also fixes the original truncation bug where a
            # non-zip file declared as xlsx had only its first 4096 bytes
            # counted toward size and hash
            md5hash.update(first_chunk)
            while chunk := f.read(4096):
                size += len(chunk)
                md5hash.update(chunk)
    return size, md5hash.hexdigest()
Loading
Loading