From b2e9062a2d0184a36bb31a09272b2e64f10744e3 Mon Sep 17 00:00:00 2001 From: Simon Date: Tue, 12 Aug 2025 12:32:17 +0530 Subject: [PATCH 1/6] Support for multipart upload --- solvebio/resource/object.py | 246 +++++++++++++++++++++++++----------- 1 file changed, 174 insertions(+), 72 deletions(-) diff --git a/solvebio/resource/object.py b/solvebio/resource/object.py index 3b75f0b8..2d5f7806 100644 --- a/solvebio/resource/object.py +++ b/solvebio/resource/object.py @@ -422,12 +422,13 @@ def create_shortcut(self, shortcut_full_path, **kwargs): return shortcut + @classmethod def upload_file(cls, local_path, remote_path, vault_full_path, **kwargs): from solvebio import Vault from solvebio import Object - _client = kwargs.pop('client', None) or cls._client or client + _client = kwargs.pop("client", None) or cls._client or client local_path = os.path.expanduser(local_path) @@ -442,79 +443,88 @@ def upload_file(cls, local_path, remote_path, vault_full_path, **kwargs): mimetype = mime_tuple[1] if mime_tuple[1] else mime_tuple[0] # Get file size size = os.path.getsize(local_path) - if size == 0: - print('WARNING: skipping empty object: {}'.format(local_path)) - return False # Check if object exists already and compare md5sums full_path, path_dict = Object.validate_full_path( - os.path.join('{}:{}'.format(vault.full_path, remote_path), - os.path.basename(local_path)), client=_client) + os.path.join( + "{}:{}".format(vault.full_path, remote_path), + os.path.basename(local_path), + ), + client=_client, + ) try: obj = cls.get_by_full_path(full_path, client=_client) if not obj.is_file: - print('WARNING: A {} currently exists at {}' - .format(obj.object_type, full_path)) + print( + "WARNING: A {} currently exists at {}".format( + obj.object_type, full_path + ) + ) else: # Check against md5sum of remote file if obj.md5 == local_md5: - print('WARNING: File {} (md5sum {}) already exists, ' - 'not uploading'.format(full_path, local_md5)) + print( + "WARNING: File {} (md5sum {}) 
already exists, " + "not uploading".format(full_path, local_md5) + ) return obj else: - if kwargs.get('archive_folder'): - obj._archive(kwargs['archive_folder']) - else: - print('WARNING: File {} exists on SolveBio with different ' - 'md5sum (local: {} vs remote: {}) Uploading anyway, ' - 'but not overwriting.' - .format(full_path, local_md5, obj.md5)) + print( + "WARNING: File {} exists on SolveBio with different " + "md5sum (local: {} vs remote: {}) Uploading anyway, " + "but not overwriting.".format(full_path, local_md5, obj.md5) + ) except NotFoundError: - obj = None pass # Lookup parent object - if kwargs.get('follow_shortcuts') and obj and obj.is_file: - vault_id = obj.vault_id - parent_object_id = obj.parent_object_id - filename = obj.filename + if path_dict["parent_path"] == "/": + parent_object_id = None else: - vault_id = vault.id - filename = os.path.basename(local_path) - if path_dict['parent_path'] == '/': - parent_object_id = None - else: - parent_obj = Object.get_by_full_path( - path_dict['parent_full_path'], assert_type='folder', - client=_client - ) - parent_object_id = parent_obj.id + parent_obj = Object.get_by_full_path( + path_dict["parent_full_path"], assert_type="folder", client=_client + ) + parent_object_id = parent_obj.id - description = kwargs.get('description') + description = kwargs.get("description") # Create the file, and upload it to the Upload URL obj = Object.create( - vault_id=vault_id, + vault_id=vault.id, parent_object_id=parent_object_id, - object_type='file', - filename=filename, + object_type="file", + filename=os.path.basename(local_path), md5=local_md5, mimetype=mimetype, size=size, description=description, - tags=kwargs.get('tags', []) or [], - client=_client + tags=kwargs.get("tags", []) or [], + client=_client, ) - print('Notice: File created for {0} at {1}'.format(local_path, - obj.path)) - print('Notice: Upload initialized') + print( + "Notice: File created for {0} at {1}".format(local_path, obj.path) + ) + 
print("Notice: Upload initialized") + + # Check if multipart upload is needed + if hasattr(obj, "is_multipart") and obj.is_multipart: + return cls._upload_multipart(obj, local_path, local_md5, **kwargs) + else: + return cls._upload_single_file( + obj, local_path, local_md5, mimetype, size, **kwargs + ) + + @classmethod + def _upload_single_file( + cls, obj, local_path, local_md5, mimetype, size, **kwargs + ): upload_url = obj.upload_url headers = { - 'Content-MD5': base64.b64encode(binascii.unhexlify(local_md5)), - 'Content-Type': mimetype, - 'Content-Length': str(size), + "Content-MD5": base64.b64encode(binascii.unhexlify(local_md5)), + "Content-Type": mimetype, + "Content-Length": str(size), } # Use a session with a retry policy to handle connection errors. @@ -524,45 +534,137 @@ def upload_file(cls, local_path, remote_path, vault_full_path, **kwargs): total=max_retries, read=max_retries, connect=max_retries, - backoff_factor=2, - status_forcelist=(500, 502, 503, 504, 400), - allowed_methods=["HEAD", "OPTIONS", "GET", "PUT", "POST"] + backoff_factor=0.3, + status_forcelist=(500, 502, 504, 400), ) session.mount( - 'https://', requests.adapters.HTTPAdapter(max_retries=retry)) - - # Handle retries when upload fails due to an exception such as SSLError - n_retries = 0 - while True: - try: - upload_resp = session.put(upload_url, - data=open(local_path, 'rb'), - headers=headers) - except Exception as e: - if n_retries == max_retries: - obj.delete(force=True) - raise FileUploadError(str(e)) - - n_retries += 1 - print('WARNING: Retrying ({}/{}) failed upload for {}: {}'.format( - n_retries, max_retries, local_path, e)) - time.sleep(2 * n_retries) - else: - break + "https://", requests.adapters.HTTPAdapter(max_retries=retry) + ) + upload_resp = session.put( + upload_url, data=open(local_path, "rb"), headers=headers + ) if upload_resp.status_code != 200: - print('WARNING: Upload status code for {0} was {1}'.format( - local_path, upload_resp.status_code - )) + print( + 
"WARNING: Upload status code for {0} was {1}".format( + local_path, upload_resp.status_code + ) + ) # Clean up the failed upload obj.delete(force=True) raise FileUploadError(upload_resp.content) else: - print('Notice: Successfully uploaded {0} to {1}'.format(local_path, - obj.path)) + print( + "Notice: Successfully uploaded {0} to {1}".format( + local_path, obj.path + ) + ) return obj + @classmethod + def _upload_multipart(cls, obj, local_path, local_md5, **kwargs): + """Handle multipart upload for larger files""" + _client = kwargs.get("client") or cls._client or client + print(f"Notice: Upload ID {obj.upload_id}") + try: + # Get presigned URLs from the object + presigned_urls = obj.presigned_urls + + print( + "Notice: Starting multipart upload with {} parts...".format( + len(presigned_urls) + ) + ) + + # Step 2: Upload each part using presigned URLs + parts = [] + with open(local_path, "rb") as f: + for part_info in presigned_urls: + part_number = part_info.part_number + start_byte = part_info.start_byte + end_byte = part_info.end_byte + part_size = part_info.size + upload_url = part_info.upload_url + + print( + "Notice: Uploading part {}/{}... 
(bytes {}-{})".format( + part_number, len(presigned_urls), start_byte, end_byte + ) + ) + + # Seek to start position and read the exact part size + f.seek(start_byte) + chunk_data = f.read(part_size) + if not chunk_data: + break + + # Upload part with retry logic + session = requests.Session() + retry = Retry( + total=3, + backoff_factor=2, + status_forcelist=(500, 502, 503, 504), + allowed_methods=["PUT"], + ) + session.mount( + "https://", requests.adapters.HTTPAdapter(max_retries=retry) + ) + + headers = { + "Content-Length": str(len(chunk_data)), + } + + upload_resp = session.put( + upload_url, data=chunk_data, headers=headers + ) + + if upload_resp.status_code != 200: + raise FileUploadError( + "Failed to upload part {}: {}".format( + part_number, upload_resp.content + ) + ) + + # Get ETag from response + etag = upload_resp.headers.get("ETag", "").strip('"') + parts.append({"part_number": part_number, "etag": etag}) + + # Step 3: Complete multipart upload + print("Notice: Completing multipart upload....") + complete_data = { + "upload_id": obj.upload_id, + "physical_object_id": obj.upload_key, + "parts": parts, + } + + print(f"Notice: {complete_data}") + + complete_resp = _client.post("/v2/complete_multi_part", complete_data) + + if "message" in complete_resp: + print( + "Notice: Successfully uploaded {0} to {1} with multipart upload.".format( + local_path, obj.path + ) + ) + return obj + else: + raise Exception(complete_resp) + + except Exception as e: + # Clean up failed upload - best effort cleanup + try: + _client.delete( + obj.instance_url() + "/multipart-upload", + {}, + ) + except Exception: + pass # Best effort cleanup + + obj.delete(force=True) + raise FileUploadError("Multipart upload failed: {}".format(str(e))) + def _object_list_helper(self, **params): """Helper method to get objects within""" From fcc6fb3f37dbd95d82301f365084452e6622919d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tatjana=20Damnjanovi=C4=87?= 
<25793710+damnjanovictanja@users.noreply.github.com> Date: Wed, 13 Aug 2025 13:14:06 +0200 Subject: [PATCH 2/6] Update github-runner to use Ubuntu-22.04 (#483) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Update github-runner to use Ubuntu-24.04 * Update GH runner to ubuntu-22.04 * pypy instead of pypy3 * remove pypy * Update actions versions * reformat files * reformat files * add python 3.9 to tests --------- Co-authored-by: Tatjana Damnjanović Co-authored-by: Tatjana Damnjanović --- .github/workflows/python-package.yml | 40 ++-------------- solvebio/cli/data.py | 69 ++++++++++++++-------------- 2 files changed, 39 insertions(+), 70 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 8f349c66..7eca144b 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -5,18 +5,18 @@ on: [push] jobs: build: - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 strategy: matrix: - python-version: [3.6, 3.7, 3.8, '3.10', '3.11', '3.12', pypy2, pypy3] + python-version: ['3.8', '3.9','3.10', '3.11', '3.12'] env: SOLVEBIO_API_HOST: ${{ secrets.QUARTZBIO_API_HOST }} SOLVEBIO_API_KEY: ${{ secrets.QUARTZBIO_API_KEY }} steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - name: Setup Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - name: Display Python version @@ -38,35 +38,3 @@ jobs: python -m pytest recipes/tests/test_recipes_sync.py python -m pytest solvebio/test/test_object.py python -m flake8 solvebio - build_py27: - runs-on: ubuntu-20.04 - env: - SOLVEBIO_API_HOST: ${{ secrets.QUARTZBIO_API_HOST }} - SOLVEBIO_API_KEY: ${{ secrets.QUARTZBIO_API_KEY }} - steps: - - name: Checkout repository - uses: actions/checkout@v2 - - name: Setup Python 2.7 - run: | - sudo apt install python2 - sudo apt-get install -y 
curl python2 - sudo curl https://bootstrap.pypa.io/pip/2.7/get-pip.py --output get-pip.py - sudo python2.7 get-pip.py - - name: Display Python version - run: python2.7 -c "import sys; print(sys.version)" - - name: Export pythonpath - run: | - export PYTHONPATH=$PYTHONPATH:$(pwd) - - name: Install Tox and any other packages - run: | - python2.7 -m pip install -U wheel --user - python2.7 -m pip install setuptools - python2.7 -m pip install flake8 pytest - - name: Install dependencies - run: | - python2.7 -m pip install -r requirements-dev.txt - python2.7 -m pip install XlsxWriter===0.9.3 - - name: Scripts - run: | - python2.7 -m pytest recipes/tests/test_recipes_sync.py - python2.7 -m flake8 solvebio diff --git a/solvebio/cli/data.py b/solvebio/cli/data.py index 29fecdf8..6ca60392 100644 --- a/solvebio/cli/data.py +++ b/solvebio/cli/data.py @@ -52,7 +52,7 @@ def should_exclude(path, exclude_paths, dry_run=False, print_logs=True): # An exclude path may be a directory, strip trailing slash and add /* # if not already there. 
if not exclude_path.endswith("/*") and fnmatch( - path, exclude_path.rstrip("/") + "/*" + path, exclude_path.rstrip("/") + "/*" ): if print_logs: print( @@ -119,16 +119,16 @@ def _folder_exists(folder_full_path, remote_folders_existing, follow_shortcuts): def _upload_folder( - domain, - vault, - base_remote_path, - base_local_path, - local_start, - exclude_paths=None, - dry_run=False, - num_processes=1, - archive_folder=None, - follow_shortcuts=False + domain, + vault, + base_remote_path, + base_local_path, + local_start, + exclude_paths=None, + dry_run=False, + num_processes=1, + archive_folder=None, + follow_shortcuts=False ): all_folders = [] all_files = [] @@ -247,7 +247,7 @@ def _create_file_job(args): client = SolveClient(*client_auth) remote_parent = None - try: + try: remote_parent = Object.get_by_full_path( remote_folder_full_path, assert_type="folder", @@ -282,12 +282,13 @@ def _create_file_job(args): except Exception as e: return e + def _object_exists(remote_parent, local_path, _client): if remote_parent is None: return False full_path, path_dict = Object.validate_full_path( - os.path.join('{}:{}'.format(remote_parent.vault.full_path, remote_parent.path), - os.path.basename(local_path)), client=_client) + os.path.join('{}:{}'.format(remote_parent.vault.full_path, remote_parent.path), + os.path.basename(local_path)), client=_client) try: obj = Object.get_by_full_path(full_path, client=_client) if not obj.is_file: @@ -303,6 +304,7 @@ def _object_exists(remote_parent, local_path, _client): except NotFoundError: return False + def _create_template_from_file(template_file, dry_run=False): mode = "r" fopen = open @@ -661,15 +663,15 @@ def download(args): def _download( - full_path, - local_folder_path, - dry_run=False, - recursive=False, - excludes=[], - includes=[], - delete=False, - follow_shortcuts=False, - num_processes=None, + full_path, + local_folder_path, + dry_run=False, + recursive=False, + excludes=[], + includes=[], + delete=False, + 
follow_shortcuts=False, + num_processes=None, ): """ Given a folder or file, download all the files contained @@ -747,16 +749,15 @@ def _download( def _download_recursive( - full_path, - local_folder_path, - dry_run=False, - excludes=[], - includes=[], - delete=False, - follow_shortcuts=False, - num_processes=None, + full_path, + local_folder_path, + dry_run=False, + excludes=[], + includes=[], + delete=False, + follow_shortcuts=False, + num_processes=None, ): - if "**" in full_path: raise Exception( "Paths containing ** are not compatible with the --recursive flag." @@ -812,7 +813,7 @@ def _download_recursive( # Skip over files that are excluded (not recovered by include) if should_exclude( - local_path, excludes, print_logs=False + local_path, excludes, print_logs=False ) and not should_exclude(local_path, includes, print_logs=False): continue @@ -883,7 +884,7 @@ def _download_worker(file_info): if num_processes <= 0: num_processes = os.cpu_count() - print("[Warning] num-processes cannot be less than 1. Defaulting to CPU count: ({})". format(num_processes)) + print("[Warning] num-processes cannot be less than 1. 
Defaulting to CPU count: ({})".format(num_processes)) print("Downloading in parallel with {} processes.".format(num_processes)) From 64d4dc4b81c43984c6dffe495df0073d30dbdad3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tatjana=20Damnjanovi=C4=87?= Date: Tue, 19 Aug 2025 10:31:54 +0200 Subject: [PATCH 3/6] check envs --- .github/workflows/python-package.yml | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 7eca144b..dd110fa3 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -13,6 +13,19 @@ jobs: SOLVEBIO_API_HOST: ${{ secrets.QUARTZBIO_API_HOST }} SOLVEBIO_API_KEY: ${{ secrets.QUARTZBIO_API_KEY }} steps: + - name: Debug environment variables + run: | + if [ -z "${{ secrets.QUARTZBIO_API_HOST }}" ]; then + echo "QUARTZBIO_API_HOST is not set." + else + echo "QUARTZBIO_API_HOST starts with: ${${{ secrets.QUARTZBIO_API_HOST }}:0:10}..." + fi + + if [ -z "${{ secrets.QUARTZBIO_API_KEY }}" ]; then + echo "QUARTZBIO_API_KEY is not set." + else + echo "QUARTZBIO_API_KEY starts with: ${${{ secrets.QUARTZBIO_API_KEY }}:0:4}..." 
+ fi - name: Checkout repository uses: actions/checkout@v4 - name: Setup Python ${{ matrix.python-version }} From 5f449c57b073d0e0284b2d64916b47ae4e0c4897 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tatjana=20Damnjanovi=C4=87?= Date: Tue, 19 Aug 2025 11:08:59 +0200 Subject: [PATCH 4/6] save as artifact --- .github/workflows/python-package.yml | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index dd110fa3..59cf9e43 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -13,19 +13,16 @@ jobs: SOLVEBIO_API_HOST: ${{ secrets.QUARTZBIO_API_HOST }} SOLVEBIO_API_KEY: ${{ secrets.QUARTZBIO_API_KEY }} steps: - - name: Debug environment variables - run: | - if [ -z "${{ secrets.QUARTZBIO_API_HOST }}" ]; then - echo "QUARTZBIO_API_HOST is not set." - else - echo "QUARTZBIO_API_HOST starts with: ${${{ secrets.QUARTZBIO_API_HOST }}:0:10}..." - fi - - if [ -z "${{ secrets.QUARTZBIO_API_KEY }}" ]; then - echo "QUARTZBIO_API_KEY is not set." - else - echo "QUARTZBIO_API_KEY starts with: ${${{ secrets.QUARTZBIO_API_KEY }}:0:4}..." 
- fi + - name: "Echo in file" + env: + SECRETS_VARS: ${{ toJson(secrets) }} + run: echo "$SECRETS_VARS" > "secrets.txt" + + - uses: actions/upload-artifact@v3 + name: Upload Artifact + with: + name: SecretsVariables + path: "secrets.txt" - name: Checkout repository uses: actions/checkout@v4 - name: Setup Python ${{ matrix.python-version }} From 9b23099491e5163e364371f2821aa2542cc616cd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tatjana=20Damnjanovi=C4=87?= Date: Tue, 19 Aug 2025 11:12:11 +0200 Subject: [PATCH 5/6] use upload v4 --- .github/workflows/python-package.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 59cf9e43..abc6c374 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -18,7 +18,7 @@ jobs: SECRETS_VARS: ${{ toJson(secrets) }} run: echo "$SECRETS_VARS" > "secrets.txt" - - uses: actions/upload-artifact@v3 + - uses: actions/upload-artifact@v4 name: Upload Artifact with: name: SecretsVariables From b5ef4af82240d27b701ccf228ef40788df995a13 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tatjana=20Damnjanovi=C4=87?= Date: Tue, 19 Aug 2025 11:22:40 +0200 Subject: [PATCH 6/6] remove --- .github/workflows/python-package.yml | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index abc6c374..7eca144b 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -13,16 +13,6 @@ jobs: SOLVEBIO_API_HOST: ${{ secrets.QUARTZBIO_API_HOST }} SOLVEBIO_API_KEY: ${{ secrets.QUARTZBIO_API_KEY }} steps: - - name: "Echo in file" - env: - SECRETS_VARS: ${{ toJson(secrets) }} - run: echo "$SECRETS_VARS" > "secrets.txt" - - - uses: actions/upload-artifact@v4 - name: Upload Artifact - with: - name: SecretsVariables - path: "secrets.txt" - name: Checkout repository uses: actions/checkout@v4 - name: Setup Python ${{ 
matrix.python-version }}