From 148be62005d7d64f8e630836c6d7940cec5024e2 Mon Sep 17 00:00:00 2001
From: Ramesh R
Date: Wed, 19 Nov 2025 20:59:22 +0530
Subject: [PATCH] EDPOPS-321 include block count in md5

---
 solvebio/cli/data.py        |  4 ++--
 solvebio/resource/object.py | 10 ++++++++--
 solvebio/utils/md5sum.py    |  2 +-
 3 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/solvebio/cli/data.py b/solvebio/cli/data.py
index 6d718217..687c365f 100644
--- a/solvebio/cli/data.py
+++ b/solvebio/cli/data.py
@@ -295,7 +295,7 @@ def _object_exists(remote_parent, local_path, _client):
         return False
     else:
         # Check if the md5sum matches
-        local_md5 = md5sum(local_path)[0]
+        local_md5 = md5sum(local_path)
         remote_md5 = obj.get("md5")
         if remote_md5 and remote_md5 == local_md5:
             return True
@@ -831,7 +831,7 @@ def _download_recursive(
             # Skip over files that match remote md5 checksum
             if os.path.exists(local_path):
                 remote_md5 = remote_file.get("md5")
-                if remote_md5 and remote_md5 == md5sum(local_path)[0]:
+                if remote_md5 and remote_md5 == md5sum(local_path):
                     print("Skipping {} already in sync".format(local_path))
                     continue

diff --git a/solvebio/resource/object.py b/solvebio/resource/object.py
index bc94c751..5e77f1d7 100644
--- a/solvebio/resource/object.py
+++ b/solvebio/resource/object.py
@@ -600,7 +600,7 @@ def _upload_single_file(cls, obj, local_path, **kwargs):
         size = os.path.getsize(local_path)

         # Get MD5 for single part upload
-        local_md5, _ = md5sum(local_path, multipart_threshold=None)
+        local_md5 = md5sum(local_path, multipart_threshold=None)

         upload_url = obj.upload_url

@@ -934,10 +934,16 @@ def _upload_single_part(

            if not chunk_data:
                break

+            def md5_base64(data):
+                import hashlib
+                md5 = hashlib.md5(data).digest()
+                return base64.b64encode(md5).decode("utf-8")
+
            # Upload without requests-level retry (let our custom retry handle it)
            session = requests.Session()
-            headers = {"Content-Length": str(len(chunk_data))}
+            headers = {"Content-Length": str(len(chunk_data)),
+                       "Content-MD5": md5_base64(chunk_data)}

            # Calculate timeout based on part size
            part_size_mb = len(chunk_data) / (1024 * 1024)
diff --git a/solvebio/utils/md5sum.py b/solvebio/utils/md5sum.py
index 82ade22f..ca710f4f 100644
--- a/solvebio/utils/md5sum.py
+++ b/solvebio/utils/md5sum.py
@@ -36,4 +36,4 @@ def _read_chunks(f, chunk_size):
        for block in _read_chunks(f, multipart_chunksize):
            md5.update(block)

-    return md5.hexdigest(), block_count
+    return f"{md5.hexdigest()}-{block_count}" if block_count else md5.hexdigest()