From 4c647acf0cd8ef5801258639ddfcf228f05fabc6 Mon Sep 17 00:00:00 2001 From: "Isabelle Tot (UN)" Date: Mon, 29 Apr 2024 21:15:23 +0100 Subject: [PATCH 1/7] Initial draft for adding the azure utils --- pyproject.toml | 1 + src/hdx/utilities/azure_utils.py | 153 +++++++++++++++++++++++++++++++ 2 files changed, 154 insertions(+) create mode 100644 src/hdx/utilities/azure_utils.py diff --git a/pyproject.toml b/pyproject.toml index c5e617f..6dbcdbc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -66,6 +66,7 @@ html = ["beautifulsoup4", "html5lib"] email = ["email_validator"] test = ["pytest", "pytest-cov", "pytest-loguru"] dev = ["pre-commit"] +azure = ["azure.storage.blob","BlobServiceClient","ContentSettings","pandas"] ######### diff --git a/src/hdx/utilities/azure_utils.py b/src/hdx/utilities/azure_utils.py new file mode 100644 index 0000000..f2c83e3 --- /dev/null +++ b/src/hdx/utilities/azure_utils.py @@ -0,0 +1,153 @@ +import base64 +import hashlib +import hmac +import io +import logging +from datetime import datetime +from os.path import exists +from typing import Any +from .downloader import Download +try: + from azure.storage.blob import BlobServiceClient, ContentSettings + import pandas as pd +except ImportError: + BlobServiceClient = None + ContentSettings = None + pd = None + + +logger = logging.getLogger(__name__) + + +class AzureBlobDownload(Download): + + def download_file( + self, + url: str, + account: str, + container: str, + key: str, + blob: None, + **kwargs: Any, + ) -> str: + """Download a blob file from an Azure Storage + + Args: + url (str): URL for the exact blob location + account (str): Storage account to access the blob + container (str): Container to download from + key (str): Key to access the blob + blob (str): Name of the blob to be downloaded. If empty, then it is assumed to download the whole container. + **kwargs: See below + path (str): Full path to use for downloaded file instead of folder and filename. + keep (bool): Whether to keep already downloaded file. Defaults to False. + post (bool): Whether to use POST instead of GET. Defaults to False. + parameters (Dict): Parameters to pass. Defaults to None. + timeout (float): Timeout for connecting to URL. Defaults to None (no timeout). + headers (Dict): Headers to pass. Defaults to None. + encoding (str): Encoding to use for text response. Defaults to None (best guess). + """ + path = kwargs.get("path") + keep = kwargs.get("keep", False) + + request_time = datetime.utcnow().strftime('%a, %d %b %Y %H:%M:%S GMT') + api_version = '2018-03-28' + parameters = { + 'verb': 'GET', + 'Content-Encoding': '', + 'Content-Language': '', + 'Content-Length': '', + 'Content-MD5': '', + 'Content-Type': '', + 'Date': '', + 'If-Modified-Since': '', + 'If-Match': '', + 'If-None-Match': '', + 'If-Unmodified-Since': '', + 'Range': '', + 'CanonicalizedHeaders': 'x-ms-date:' + request_time + '\nx-ms-version:' + api_version + '\n', + 'CanonicalizedResource': '/' + account + '/' + container + '/' + blob + } + + signature = (parameters['verb'] + '\n' + + parameters['Content-Encoding'] + '\n' + + parameters['Content-Language'] + '\n' + + parameters['Content-Length'] + '\n' + + parameters['Content-MD5'] + '\n' + + parameters['Content-Type'] + '\n' + + parameters['Date'] + '\n' + + parameters['If-Modified-Since'] + '\n' + + parameters['If-Match'] + '\n' + + parameters['If-None-Match'] + '\n' + + parameters['If-Unmodified-Since'] + '\n' + + parameters['Range'] + '\n' + + parameters['CanonicalizedHeaders'] + + parameters['CanonicalizedResource']) + + signed_string = base64.b64encode(hmac.new(base64.b64decode(key), msg=signature.encode('utf-8'), + digestmod=hashlib.sha256).digest()).decode() + + headers = { + 'x-ms-date': request_time, + 'x-ms-version': api_version, + 'Authorization': ('SharedKey ' + account + ':' + signed_string) + } + + url = ('https://' + account + '.blob.core.windows.net/' + container + '/' + blob) + + if keep and exists(url): + print(f"The blob URL exists: {url}") + return path + self.setup( + url=url, + stream=True, + post=kwargs.get("post", False), + parameters=kwargs.get("parameters"), + timeout=kwargs.get("timeout"), + headers=headers, + encoding=kwargs.get("encoding"), + ) + return self.stream_path( + path, f"Download of {url} failed in retrieval of stream!" + ) + + +class AzureBlobUpload: + + def upload_file( + self, + dataset_name: str, + filename: str, + account: str, + container: str, + key: str, + data: None + ) -> str: + """Upload a file to a blob storage within a container for an azure storage account + Args: + dataset_name (str): name of the dataset within the dictionary list to upload + filename (str): new name for the file once it is uploaded to the container + account (str): Storage account + container (str): Name of the container where the file will be uploaded to. + key (str): Access key to container + data : json type of dicts with multiple datasets or just one + """ + + blob_service = BlobServiceClient.from_connection_string( + f"DefaultEndpointsProtocol=https;AccountName={account};AccountKey= " + f"{key};EndpointSuffix=core.windows.net") + + blob_client = blob_service.get_blob_client(container=container, + blob=filename) + + try: + stream = io.StringIO() + df = pd.DataFrame(data[dataset_name]) + df.to_csv(stream, sep=";") + file_to_blob = stream.getvalue() + blob_client.upload_blob(file_to_blob, + overwrite=True, + content_settings=ContentSettings(content_type="text/csv")) + logger.info(f"Successfully uploaded: {dataset_name}") + except Exception: + logger.error(f"Failed to upload dataset: {dataset_name}") From fe497ad6f77eb073e6b2791f6bdbe4ab6d09a83f Mon Sep 17 00:00:00 2001 From: "Isabelle Tot (UN)" Date: Mon, 29 Apr 2024 21:34:17 +0100 Subject: [PATCH 2/7] Adding header to the utils and reordering import --- src/hdx/utilities/azure_utils.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/hdx/utilities/azure_utils.py b/src/hdx/utilities/azure_utils.py index f2c83e3..41294a6 100644 --- a/src/hdx/utilities/azure_utils.py +++ b/src/hdx/utilities/azure_utils.py @@ -1,3 +1,4 @@ +""" All the logic around Azure blob uploads and downloads of files""" import base64 import hashlib import hmac @@ -6,7 +7,7 @@ from datetime import datetime from os.path import exists from typing import Any -from .downloader import Download + try: from azure.storage.blob import BlobServiceClient, ContentSettings import pandas as pd @@ -15,6 +16,7 @@ ContentSettings = None pd = None +from .downloader import Download logger = logging.getLogger(__name__) From 24f90df819d49e73f5a4f659eb0bc8bd63bc3ece Mon Sep 17 00:00:00 2001 From: "Isabelle Tot (UN)" Date: Mon, 29 Apr 2024 21:38:52 +0100 Subject: [PATCH 3/7] Reordering import [2] --- src/hdx/utilities/azure_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hdx/utilities/azure_utils.py b/src/hdx/utilities/azure_utils.py index 41294a6..8a396c9 100644 --- a/src/hdx/utilities/azure_utils.py +++ b/src/hdx/utilities/azure_utils.py @@ -9,8 +9,8 @@ from typing import Any try: - from azure.storage.blob import BlobServiceClient, ContentSettings import pandas as pd + from azure.storage.blob import BlobServiceClient, ContentSettings except ImportError: BlobServiceClient = None ContentSettings = None From 4fdb343102be503e12ac02e7a5f6f6da50e158ad Mon Sep 17 00:00:00 2001 From: "Isabelle Tot (UN)" Date: Mon, 29 Apr 2024 22:21:53 +0100 Subject: [PATCH 4/7] Reformatting file --- src/hdx/utilities/azure_utils.py | 87 +++++++++++++++++--------------- 1 file changed, 46 insertions(+), 41 deletions(-) diff --git a/src/hdx/utilities/azure_utils.py b/src/hdx/utilities/azure_utils.py index 8a396c9..0be2f6e 100644 --- a/src/hdx/utilities/azure_utils.py +++ b/src/hdx/utilities/azure_utils.py @@ -1,4 +1,4 @@ -""" All the logic around Azure blob uploads and downloads of files""" +"""All the logic around Azure blob uploads and downloads of files""" import base64 import hashlib import hmac @@ -22,6 +22,7 @@ class AzureBlobDownload(Download): + """Wrapper for Azure Blob download logic""" def download_file( self, @@ -39,7 +40,8 @@ def download_file( account (str): Storage account to access the blob container (str): Container to download from key (str): Key to access the blob - blob (str): Name of the blob to be downloaded. If empty, then it is assumed to download the whole container. + blob (str): Name of the blob to be downloaded. If empty, then it is assumed to download + the whole container. **kwargs: See below path (str): Full path to use for downloaded file instead of folder and filename. keep (bool): Whether to keep already downloaded file. Defaults to False. @@ -52,50 +54,52 @@ def download_file( path = kwargs.get("path") keep = kwargs.get("keep", False) - request_time = datetime.utcnow().strftime('%a, %d %b %Y %H:%M:%S GMT') - api_version = '2018-03-28' + request_time = datetime.utcnow().strftime("%a, %d %b %Y %H:%M:%S GMT") + api_version = "2018-03-28" parameters = { - 'verb': 'GET', - 'Content-Encoding': '', - 'Content-Language': '', - 'Content-Length': '', - 'Content-MD5': '', - 'Content-Type': '', - 'Date': '', - 'If-Modified-Since': '', - 'If-Match': '', - 'If-None-Match': '', - 'If-Unmodified-Since': '', - 'Range': '', - 'CanonicalizedHeaders': 'x-ms-date:' + request_time + '\nx-ms-version:' + api_version + '\n', - 'CanonicalizedResource': '/' + account + '/' + container + '/' + blob + "verb": "GET", + "Content-Encoding": "", + "Content-Language": "", + "Content-Length": "", + "Content-MD5": "", + "Content-Type": "", + "Date": "", + "If-Modified-Since": "", + "If-Match": "", + "If-None-Match": "", + "If-Unmodified-Since": "", + "Range": "", + "CanonicalizedHeaders": "x-ms-date:" + request_time + "\nx-ms-version:" + + api_version + "\n", + "CanonicalizedResource": "/" + account + "/" + container + "/" + blob } - signature = (parameters['verb'] + '\n' - + parameters['Content-Encoding'] + '\n' - + parameters['Content-Language'] + '\n' - + parameters['Content-Length'] + '\n' - + parameters['Content-MD5'] + '\n' - + parameters['Content-Type'] + '\n' - + parameters['Date'] + '\n' - + parameters['If-Modified-Since'] + '\n' - + parameters['If-Match'] + '\n' - + parameters['If-None-Match'] + '\n' - + parameters['If-Unmodified-Since'] + '\n' - + parameters['Range'] + '\n' - + parameters['CanonicalizedHeaders'] - + parameters['CanonicalizedResource']) - - signed_string = base64.b64encode(hmac.new(base64.b64decode(key), msg=signature.encode('utf-8'), + signature = (parameters["verb"] + "\n" + + parameters["Content-Encoding"] + "\n" + + parameters["Content-Language"] + "\n" + + parameters["Content-Length"] + "\n" + + parameters["Content-MD5"] + "\n" + + parameters["Content-Type"] + "\n" + + parameters["Date"] + "\n" + + parameters["If-Modified-Since"] + "\n" + + parameters["If-Match"] + "\n" + + parameters["If-None-Match"] + "\n" + + parameters["If-Unmodified-Since"] + "\n" + + parameters["Range"] + "\n" + + parameters["CanonicalizedHeaders"] + + parameters["CanonicalizedResource"]) + + signed_string = base64.b64encode(hmac.new(base64.b64decode(key), + msg=signature.encode("utf-8"), digestmod=hashlib.sha256).digest()).decode() headers = { - 'x-ms-date': request_time, - 'x-ms-version': api_version, - 'Authorization': ('SharedKey ' + account + ':' + signed_string) + "x-ms-date": request_time, + "x-ms-version": api_version, + "Authorization": ("SharedKey " + account + ":" + signed_string) } - url = ('https://' + account + '.blob.core.windows.net/' + container + '/' + blob) + url = "https://" + account + ".blob.core.windows.net/" + container + "/" + blob if keep and exists(url): print(f"The blob URL exists: {url}") @@ -110,11 +114,12 @@ def download_file( encoding=kwargs.get("encoding"), ) return self.stream_path( - path, f"Download of {url} failed in retrieval of stream!" + path, "Download of %s failed in retrieval of stream!" % url ) class AzureBlobUpload: + """Wrapper for Azure Blob upload logic""" def upload_file( self, @@ -150,6 +155,6 @@ def upload_file( blob_client.upload_blob(file_to_blob, overwrite=True, content_settings=ContentSettings(content_type="text/csv")) - logger.info(f"Successfully uploaded: {dataset_name}") + logger.info("Successfully uploaded: %s" % dataset_name) except Exception: - logger.error(f"Failed to upload dataset: {dataset_name}") + logger.error("Failed to upload dataset: %s" % dataset_name) From 94a735a1fa996e8fdf2d081d177e44316131dc9b Mon Sep 17 00:00:00 2001 From: "Isabelle Tot (UN)" Date: Mon, 29 Apr 2024 22:32:34 +0100 Subject: [PATCH 5/7] Reformatting file [2] --- src/hdx/utilities/azure_utils.py | 123 ++++++++++++++++++++----------- 1 file changed, 80 insertions(+), 43 deletions(-) diff --git a/src/hdx/utilities/azure_utils.py b/src/hdx/utilities/azure_utils.py index 0be2f6e..a3a4eed 100644 --- a/src/hdx/utilities/azure_utils.py +++ b/src/hdx/utilities/azure_utils.py @@ -1,4 +1,5 @@ """All the logic around Azure blob uploads and downloads of files""" + import base64 import hashlib import hmac @@ -25,13 +26,13 @@ class AzureBlobDownload(Download): """Wrapper for Azure Blob download logic""" def download_file( - self, - url: str, - account: str, - container: str, - key: str, - blob: None, - **kwargs: Any, + self, + url: str, + account: str, + container: str, + key: str, + blob: None, + **kwargs: Any, ) -> str: """Download a blob file from an Azure Storage @@ -69,37 +70,69 @@ def download_file( "If-None-Match": "", "If-Unmodified-Since": "", "Range": "", - "CanonicalizedHeaders": "x-ms-date:" + request_time + "\nx-ms-version:" - + api_version + "\n", - "CanonicalizedResource": "/" + account + "/" + container + "/" + blob + "CanonicalizedHeaders": "x-ms-date:" + + request_time + + "\nx-ms-version:" + + api_version + + "\n", + "CanonicalizedResource": "/" + + account + + "/" + + container + + "/" + + blob, } - signature = (parameters["verb"] + "\n" - + parameters["Content-Encoding"] + "\n" - + parameters["Content-Language"] + "\n" - + parameters["Content-Length"] + "\n" - + parameters["Content-MD5"] + "\n" - + parameters["Content-Type"] + "\n" - + parameters["Date"] + "\n" - + parameters["If-Modified-Since"] + "\n" - + parameters["If-Match"] + "\n" - + parameters["If-None-Match"] + "\n" - + parameters["If-Unmodified-Since"] + "\n" - + parameters["Range"] + "\n" - + parameters["CanonicalizedHeaders"] - + parameters["CanonicalizedResource"]) - - signed_string = base64.b64encode(hmac.new(base64.b64decode(key), - msg=signature.encode("utf-8"), - digestmod=hashlib.sha256).digest()).decode() + signature = ( + parameters["verb"] + + "\n" + + parameters["Content-Encoding"] + + "\n" + + parameters["Content-Language"] + + "\n" + + parameters["Content-Length"] + + "\n" + + parameters["Content-MD5"] + + "\n" + + parameters["Content-Type"] + + "\n" + + parameters["Date"] + + "\n" + + parameters["If-Modified-Since"] + + "\n" + + parameters["If-Match"] + + "\n" + + parameters["If-None-Match"] + + "\n" + + parameters["If-Unmodified-Since"] + + "\n" + + parameters["Range"] + + "\n" + + parameters["CanonicalizedHeaders"] + + parameters["CanonicalizedResource"] + ) + signed_string = base64.b64encode( + hmac.new( + base64.b64decode(key), + msg=signature.encode("utf-8"), + digestmod=hashlib.sha256, + ).digest() + ).decode() headers = { "x-ms-date": request_time, "x-ms-version": api_version, - "Authorization": ("SharedKey " + account + ":" + signed_string) + "Authorization": ("SharedKey " + account + ":" + signed_string,) } - url = "https://" + account + ".blob.core.windows.net/" + container + "/" + blob + url = ( + "https://" + + account + + ".blob.core.windows.net/" + + container + + "/" + + blob + ) if keep and exists(url): print(f"The blob URL exists: {url}") @@ -122,13 +155,13 @@ class AzureBlobUpload: """Wrapper for Azure Blob upload logic""" def upload_file( - self, - dataset_name: str, - filename: str, - account: str, - container: str, - key: str, - data: None + self, + dataset_name: str, + filename: str, + account: str, + container: str, + key: str, + data: None ) -> str: """Upload a file to a blob storage within a container for an azure storage account Args: @@ -142,19 +175,23 @@ def upload_file( blob_service = BlobServiceClient.from_connection_string( f"DefaultEndpointsProtocol=https;AccountName={account};AccountKey= " - f"{key};EndpointSuffix=core.windows.net") + f"{key};EndpointSuffix=core.windows.net" + ) - blob_client = blob_service.get_blob_client(container=container, - blob=filename) + blob_client = blob_service.get_blob_client( + container=container, + blob=filename) try: stream = io.StringIO() df = pd.DataFrame(data[dataset_name]) df.to_csv(stream, sep=";") file_to_blob = stream.getvalue() - blob_client.upload_blob(file_to_blob, - overwrite=True, - content_settings=ContentSettings(content_type="text/csv")) + blob_client.upload_blob( + file_to_blob, + overwrite=True, + content_settings=ContentSettings(content_type="text/csv") + ) logger.info("Successfully uploaded: %s" % dataset_name) except Exception: logger.error("Failed to upload dataset: %s" % dataset_name) From 12cc7daa7989f5a3e91a8a8d24195bb4f1c87f0d Mon Sep 17 00:00:00 2001 From: "Isabelle Tot (UN)" Date: Mon, 29 Apr 2024 22:35:58 +0100 Subject: [PATCH 6/7] Reformatting file [3] --- src/hdx/utilities/azure_utils.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/hdx/utilities/azure_utils.py b/src/hdx/utilities/azure_utils.py index a3a4eed..5033325 100644 --- a/src/hdx/utilities/azure_utils.py +++ b/src/hdx/utilities/azure_utils.py @@ -122,16 +122,16 @@ def download_file( headers = { "x-ms-date": request_time, "x-ms-version": api_version, - "Authorization": ("SharedKey " + account + ":" + signed_string,) + "Authorization": ("SharedKey " + account + ":" + signed_string,), } url = ( - "https://" - + account - + ".blob.core.windows.net/" - + container - + "/" - + blob + "https://" + + account + + ".blob.core.windows.net/" + + container + + "/" + + blob ) if keep and exists(url): @@ -161,7 +161,7 @@ def upload_file( account: str, container: str, key: str, - data: None + data: None, ) -> str: """Upload a file to a blob storage within a container for an azure storage account Args: @@ -190,7 +190,7 @@ def upload_file( blob_client.upload_blob( file_to_blob, overwrite=True, - content_settings=ContentSettings(content_type="text/csv") + content_settings=ContentSettings(content_type="text/csv"), ) logger.info("Successfully uploaded: %s" % dataset_name) except Exception: From 9d4cb9b33ec5df287b73e096359b86c0146f1f69 Mon Sep 17 00:00:00 2001 From: "Isabelle Tot (UN)" Date: Mon, 29 Apr 2024 22:37:58 +0100 Subject: [PATCH 7/7] Reformatting file [4] --- src/hdx/utilities/azure_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/hdx/utilities/azure_utils.py b/src/hdx/utilities/azure_utils.py index 5033325..737bccd 100644 --- a/src/hdx/utilities/azure_utils.py +++ b/src/hdx/utilities/azure_utils.py @@ -179,8 +179,8 @@ def upload_file( ) blob_client = blob_service.get_blob_client( - container=container, - blob=filename) + container=container, blob=filename + ) try: stream = io.StringIO()