diff --git a/setup.py b/setup.py
index e304e9f7..1664905b 100644
--- a/setup.py
+++ b/setup.py
@@ -58,6 +58,7 @@
     description='The SolveBio Python client',
     long_description=long_description,
     long_description_content_type='text/markdown',
+    python_requires='>=3.8',
    author='Solve, Inc.',
     author_email='contact@solvebio.com',
     url='https://github.com/solvebio/solvebio-python',
diff --git a/solvebio/__init__.py b/solvebio/__init__.py
index 8f5cd14b..460b3848 100644
--- a/solvebio/__init__.py
+++ b/solvebio/__init__.py
@@ -103,7 +103,6 @@ def _set_cached_api_host(host):
 from .global_search import GlobalSearch
 from .annotate import Annotator, Expression
 from .client import client, SolveClient
-from .auth import authenticate
 from .resource import (
     Application,
     Beacon,
diff --git a/solvebio/__main__.py b/solvebio/__main__.py
index 9070a092..4e063336 100644
--- a/solvebio/__main__.py
+++ b/solvebio/__main__.py
@@ -1,6 +1,3 @@
-
-import sys
-
 from .cli.main import main
 
 if __name__ == "__main__":
diff --git a/solvebio/auth.py b/solvebio/auth.py
index ac39e284..47e78e56 100644
--- a/solvebio/auth.py
+++ b/solvebio/auth.py
@@ -58,22 +58,22 @@ def authenticate(
     # Find credentials from environment variables
     if not host:
         host = (
-            os.environ.get("QUARTZBIO_API_HOST", None)
-            or os.environ.get("EDP_API_HOST", None)
-            or os.environ.get("SOLVEBIO_API_HOST", None)
+            os.environ.get("QUARTZBIO_API_HOST", None) or
+            os.environ.get("EDP_API_HOST", None) or
+            os.environ.get("SOLVEBIO_API_HOST", None)
         )
 
     if not token:
         api_key = (
-            os.environ.get("QUARTZBIO_API_KEY", None)
-            or os.environ.get("EDP_API_KEY", None)
-            or os.environ.get("SOLVEBIO_API_KEY", None)
+            os.environ.get("QUARTZBIO_API_KEY", None) or
+            os.environ.get("EDP_API_KEY", None) or
+            os.environ.get("SOLVEBIO_API_KEY", None)
         )
 
         access_token = (
-            os.environ.get("QUARTZ_ACCESS_TOKEN", None)
-            or os.environ.get("EDP_ACCESS_TOKEN", None)
-            or os.environ.get("SOLVEBIO_ACCESS_TOKEN", None)
+            os.environ.get("QUARTZ_ACCESS_TOKEN", None) or
+            os.environ.get("EDP_ACCESS_TOKEN", None) or
+            os.environ.get("SOLVEBIO_ACCESS_TOKEN", None)
         )
 
     if access_token:
@@ -184,4 +184,4 @@ def validate_api_host_url(url):
     elif not parsed.netloc:
         raise SolveError("Invalid API host: %s." % url)
 
-    return parsed.geturl()
\ No newline at end of file
+    return parsed.geturl()
diff --git a/solvebio/cli/auth.py b/solvebio/cli/auth.py
index c215bf84..ba147243 100644
--- a/solvebio/cli/auth.py
+++ b/solvebio/cli/auth.py
@@ -81,4 +81,4 @@ def print_user(user):
     email = user['email']
     domain = user['account']['domain']
     print(f'You are logged-in to the "{domain}" domain as {email}'
-        f' (server: {solvebio.get_api_host()}).')
+          f' (server: {solvebio.get_api_host()}).')
diff --git a/solvebio/cli/credentials.py b/solvebio/cli/credentials.py
index a28c9051..0e2d155d 100644
--- a/solvebio/cli/credentials.py
+++ b/solvebio/cli/credentials.py
@@ -109,7 +109,7 @@ def get_credentials(api_host: str = None) -> ApiCredentials:
     # available option that ends with '.api.quartzbio.com',
     netrc_host = next(
         filter(lambda h: h.endswith(".api.quartzbio.com"), netrc_obj.hosts), None
-        )
+    )
 
     # Otherwise use the first available.
     if netrc_host is None:
diff --git a/solvebio/cli/data.py b/solvebio/cli/data.py
index 6ca60392..9917ab1b 100644
--- a/solvebio/cli/data.py
+++ b/solvebio/cli/data.py
@@ -1044,10 +1044,6 @@ def _ls(full_path, recursive=False, follow_shortcuts=False):
     return files
 
 
-def _is_single_file(objects):
-    return len(objects) == 1 and objects[0].get("object_type") == "file"
-
-
 def should_tag_by_object_type(args, object_):
     """Returns True if object matches object type requirements"""
     valid = True
diff --git a/solvebio/resource/object.py b/solvebio/resource/object.py
index 3b75f0b8..0c27f8e8 100644
--- a/solvebio/resource/object.py
+++ b/solvebio/resource/object.py
@@ -434,8 +434,12 @@ def upload_file(cls, local_path, remote_path, vault_full_path, **kwargs):
         # Get vault
         vault = Vault.get_by_full_path(vault_full_path, client=_client)
 
-        # Get MD5
-        local_md5, _ = md5sum(local_path, multipart_threshold=None)
+        # Get MD5 and check if multipart upload is needed
+        multipart_threshold = kwargs.get(
+            "multipart_threshold", 64 * 1024 * 1024
+        )  # 64MB default
+        local_md5, _ = md5sum(local_path, multipart_threshold=multipart_threshold)
+
         # Get a mimetype of file
         mime_tuple = mimetypes.guess_type(local_path)
         # If a file is compressed get a compression type, otherwise a file type
@@ -509,6 +513,27 @@ def upload_file(cls, local_path, remote_path, vault_full_path, **kwargs):
                 obj.path))
         print('Notice: Upload initialized')
 
+        # Check if multipart upload is needed
+        if hasattr(obj, "is_multipart") and obj.is_multipart:
+            return cls._upload_multipart(obj, local_path, local_md5, **kwargs)
+        else:
+            return cls._upload_single_file(obj, local_path, **kwargs)
+
+    @classmethod
+    def _upload_single_file(cls, obj, local_path, **kwargs):
+        """Handle single-part upload for smaller files"""
+        import mimetypes
+
+        # Get a mimetype of file
+        mime_tuple = mimetypes.guess_type(local_path)
+        # If a file is compressed get a compression type, otherwise a file type
+        mimetype = mime_tuple[1] if mime_tuple[1] else mime_tuple[0]
+        # Get file size
+        size = os.path.getsize(local_path)
+
+        # Get MD5 for single part upload
+        local_md5, _ = md5sum(local_path, multipart_threshold=None)
+
         upload_url = obj.upload_url
 
         headers = {
@@ -563,6 +588,109 @@ def upload_file(cls, local_path, remote_path, vault_full_path, **kwargs):
         return obj
 
+    @classmethod
+    def _upload_multipart(cls, obj, local_path, local_md5, **kwargs):
+        """Handle multipart upload for larger files"""
+        _client = kwargs.get("client") or cls._client or client
+        print(f"Notice: Upload ID {obj.upload_id}")
+        try:
+            # Get presigned URLs from the object
+            presigned_urls = obj.presigned_urls
+
+            print(
+                "Notice: Starting multipart upload with {} parts...".format(
+                    len(presigned_urls)
+                )
+            )
+
+            # Step 2: Upload each part using presigned URLs
+            parts = []
+            with open(local_path, "rb") as f:
+                for part_info in presigned_urls:
+                    part_number = part_info.part_number
+                    start_byte = part_info.start_byte
+                    end_byte = part_info.end_byte
+                    part_size = part_info.size
+                    upload_url = part_info.upload_url
+
+                    print(
+                        "Notice: Uploading part {}/{}... (bytes {}-{})".format(
+                            part_number, len(presigned_urls), start_byte, end_byte
+                        )
+                    )
+
+                    # Seek to start position and read the exact part size
+                    f.seek(start_byte)
+                    chunk_data = f.read(part_size)
+                    if not chunk_data:
+                        break
+
+                    # Upload part with retry logic
+                    session = requests.Session()
+                    retry = Retry(
+                        total=3,
+                        backoff_factor=2,
+                        status_forcelist=(500, 502, 503, 504),
+                        allowed_methods=["PUT"],
+                    )
+                    session.mount(
+                        "https://", requests.adapters.HTTPAdapter(max_retries=retry)
+                    )
+
+                    headers = {
+                        "Content-Length": str(len(chunk_data)),
+                    }
+
+                    upload_resp = session.put(
+                        upload_url, data=chunk_data, headers=headers
+                    )
+
+                    if upload_resp.status_code != 200:
+                        raise FileUploadError(
+                            "Failed to upload part {}: {}".format(
+                                part_number, upload_resp.content
+                            )
+                        )
+
+                    # Get ETag from response
+                    etag = upload_resp.headers.get("ETag", "").strip('"')
+                    parts.append({"part_number": part_number, "etag": etag})
+
+            # Step 3: Complete multipart upload
+            print("Notice: Completing multipart upload....")
+            complete_data = {
+                "upload_id": obj.upload_id,
+                "physical_object_id": obj.upload_key,
+                "parts": parts,
+            }
+
+            print(f"Notice: {complete_data}")
+
+            complete_resp = _client.post("/v2/complete_multi_part", complete_data)
+
+            if "message" in complete_resp:
+                print(
+                    "Notice: Successfully uploaded {0} to {1} with multipart upload.".format(
+                        local_path, obj.path
+                    )
+                )
+                return obj
+            else:
+                raise Exception(complete_resp)
+
+        except Exception as e:
+            # Clean up failed upload - best effort cleanup
+            try:
+                _client.delete(
+                    obj.instance_url() + "/multipart-upload",
+                    {},
+                )
+            except Exception:
+                pass  # Best effort cleanup
+
+            obj.delete(force=True)
+            raise FileUploadError("Multipart upload failed: {}".format(str(e)))
+
     def _object_list_helper(self, **params):
         """Helper method to get objects within"""
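For context, a minimal usage sketch of the new upload path (not part of the diff): it assumes `Object` is importable from `solvebio.resource`, that credentials are already configured (for example via the environment variables handled in `auth.py` above), and that the file and vault paths shown are hypothetical. `multipart_threshold` is the keyword argument read by `upload_file` in this diff; a file above it is expected to come back from the API flagged `is_multipart` and be routed through `_upload_multipart`, while smaller files take the `_upload_single_file` branch.

    from solvebio.resource import Object

    # Hypothetical paths; adjust to a real vault and local file.
    obj = Object.upload_file(
        "/data/large_variants.vcf.gz",           # local_path (hypothetical)
        "/uploads/large_variants.vcf.gz",        # remote_path (hypothetical)
        "my-domain:my-vault",                    # vault_full_path (hypothetical)
        multipart_threshold=64 * 1024 * 1024,    # 64MB, the default used in the diff
    )
    print(obj.path)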