8 changes: 1 addition & 7 deletions .github/workflows/test.yml
@@ -6,16 +6,10 @@ jobs:
   build:
     runs-on: ubuntu-latest
 
-    services:
-      squid:
-        image: ubuntu/squid:latest
-        ports:
-          - 3128:3128
-
     strategy:
       max-parallel: 4
       matrix:
-        python-version: ["3.8", "3.9", "3.10", "3.11"]
+        python-version: ['3.8', '3.9', '3.10', '3.11']
 
     env:
       PORT: 8080
14 changes: 14 additions & 0 deletions README.md
@@ -210,10 +210,24 @@ export DVUPLOADER_TESTING=true
 
 **3. Run the test(s) with pytest**
 
+Run all tests:
+
 ```bash
 poetry run pytest
 ```
+
+Run a specific test:
+
+```bash
+poetry run pytest -k test_native_upload_with_large_file
+```
+
+Run all non-expensive tests:
+
+```bash
+poetry run pytest -m "not expensive"
+```
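
The `-m "not expensive"` filter works because the slow tests carry a matching custom marker. As a sketch of how a test opts in (assuming the `expensive` marker is registered in the project's pytest configuration, which this diff does not show):

```python
import pytest


@pytest.mark.expensive  # deselected by `poetry run pytest -m "not expensive"`
def test_native_upload_with_large_file():
    ...
```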

 ### Linting
 
 This repository uses `ruff` to lint the code and `codespell` to check for spelling mistakes. You can run the linters with the following command:
11 changes: 8 additions & 3 deletions dvuploader/dvuploader.py
@@ -105,6 +105,7 @@ def upload(
         persistent_id=persistent_id,
         api_token=api_token,
         replace_existing=replace_existing,
+        proxy=proxy,
     )
 
     # Sort files by size
@@ -146,6 +147,7 @@ def upload(
                 n_parallel_uploads=n_parallel_uploads,
                 progress=progress,
                 pbars=pbars,
+                proxy=proxy,
             )
         )
     else:
@@ -159,6 +161,7 @@ def upload(
                 pbars=pbars,
                 progress=progress,
                 n_parallel_uploads=n_parallel_uploads,
+                proxy=proxy,
             )
         )
 
@@ -196,7 +199,8 @@ def _check_duplicates(
     persistent_id: str,
     api_token: str,
     replace_existing: bool,
-):
+    proxy: Optional[str] = None,
+) -> None:
     """
     Checks for duplicate files in the dataset by comparing paths and filenames.
 
@@ -205,7 +209,7 @@ def _check_duplicates(
         persistent_id (str): The persistent ID of the dataset.
         api_token (str): The API token for accessing the Dataverse repository.
         replace_existing (bool): Whether to replace files that already exist.
-
+        proxy (Optional[str]): The proxy to use for the request.
     Returns:
         None
     """
@@ -214,6 +218,7 @@ def _check_duplicates(
         dataverse_url=dataverse_url,
         persistent_id=persistent_id,
         api_token=api_token,
+        proxy=proxy,
     )
 
     table = Table(
@@ -252,7 +257,7 @@ def _check_duplicates(
             # calculate checksum
             file.update_checksum_chunked()
             file.apply_checksum()
-            file._unchanged_data = self._check_hashes(file, ds_file)
+            file._unchanged_data = self._check_hashes(file, ds_file)  # type: ignore
             if file._unchanged_data:
                 table.add_row(
                     file.file_name,
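
These hunks thread a new `proxy` keyword from `upload` down into `_check_duplicates`, so the duplicate check honors the same proxy as the uploads themselves. A minimal call-site sketch (assuming the public `upload` exposes the same `proxy` keyword shown here; the URL, token, and DOI are placeholders):

```python
from dvuploader import DVUploader, File

uploader = DVUploader(files=[File(filepath="./data/measurements.csv")])
uploader.upload(
    api_token="XXXXXX-XXXX-XXXX-XXXX",
    dataverse_url="https://demo.dataverse.org/",
    persistent_id="doi:10.70122/XXX/XXXXXX",
    proxy="http://localhost:3128",  # route all requests through a local proxy
)
```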
49 changes: 44 additions & 5 deletions dvuploader/nativeupload.py
@@ -4,7 +4,7 @@
 import tempfile
 from io import BytesIO
 from pathlib import Path
-from typing import Dict, List, Optional, Tuple
+from typing import IO, Dict, List, Optional, Tuple
 
 import httpx
 import rich
@@ -65,6 +65,36 @@
 ZIP_LIMIT_MESSAGE = "The number of files in the zip archive is over the limit"
 
 
+class _ProgressFileWrapper:
+    """
+    Wrap a binary file-like object and update a rich progress bar on reads.
+    httpx's multipart expects a synchronous file-like object exposing .read().
+    """
+
+    def __init__(
+        self,
+        file: IO[bytes],
+        progress: Progress,
+        pbar: TaskID,
+        chunk_size: int = 1024 * 1024,
+    ):
+        self._file = file
+        self._progress = progress
+        self._pbar = pbar
+        self._chunk_size = chunk_size
+
+    def read(self, size: int = -1) -> bytes:
+        # Unsized reads are capped at chunk_size so the bar advances
+        # incrementally instead of jumping to 100% after one huge read.
+        if size is None or size < 0:
+            size = self._chunk_size
+        data = self._file.read(size)
+        if data:
+            self._progress.update(self._pbar, advance=len(data))
+        return data
+
+    def __getattr__(self, name):
+        # Delegate any other attribute (name, seek, close, ...) to the wrapped file.
+        return getattr(self._file, name)
+
+
 init_logging()
 
 
@@ -161,6 +191,7 @@ async def native_upload(
         persistent_id=persistent_id,
         dataverse_url=dataverse_url,
         api_token=api_token,
+        proxy=proxy,
     )
 
 
@@ -255,7 +286,9 @@ def _reset_progress(
 @tenacity.retry(
     wait=RETRY_STRAT,
     stop=tenacity.stop_after_attempt(MAX_RETRIES),
-    retry=tenacity.retry_if_exception_type((httpx.HTTPStatusError,)),
+    retry=tenacity.retry_if_exception_type(
+        (httpx.HTTPStatusError, httpx.ReadError, httpx.RequestError)
+    ),
 )
 async def _single_native_upload(
     session: httpx.AsyncClient,
@@ -301,10 +334,12 @@ async def _single_native_upload(
     json_data = _get_json_data(file)
     handler = file.get_handler()
 
+    assert handler is not None, "File handler is required for native upload"
+
     files = {
         "file": (
             file.file_name,
-            handler,
+            _ProgressFileWrapper(handler, progress, pbar),  # type: ignore[arg-type]
             file.mimeType,
         ),
         "jsonData": (
@@ -316,7 +351,7 @@
 
     response = await session.post(
         endpoint,
-        files=files,  # type: ignore
+        files=files,
    )
 
     if response.status_code == 400 and response.json()["message"].startswith(
@@ -371,6 +406,7 @@ async def _update_metadata(
     dataverse_url: str,
     api_token: str,
     persistent_id: str,
+    proxy: Optional[str],
 ):
     """
     Updates the metadata of the given files in a Dataverse repository.
@@ -390,6 +426,7 @@
         persistent_id=persistent_id,
         dataverse_url=dataverse_url,
         api_token=api_token,
+        proxy=proxy,
     )
 
     tasks = []
@@ -505,6 +542,7 @@ def _retrieve_file_ids(
     persistent_id: str,
     dataverse_url: str,
     api_token: str,
+    proxy: Optional[str] = None,
 ) -> Dict[str, str]:
     """
     Retrieves the file IDs of files in a dataset.
@@ -513,7 +551,7 @@
         persistent_id (str): The persistent identifier of the dataset.
         dataverse_url (str): The URL of the Dataverse repository.
         api_token (str): The API token of the Dataverse repository.
-
+        proxy (Optional[str]): The proxy to use for the request.
     Returns:
         Dict[str, str]: Dictionary mapping file paths to their IDs.
     """
@@ -523,6 +561,7 @@
         persistent_id=persistent_id,
         dataverse_url=dataverse_url,
         api_token=api_token,
+        proxy=proxy,
     )
 
     return _create_file_id_path_mapping(ds_files)
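
The key change in this file is that the raw handler is no longer handed to httpx directly; `_ProgressFileWrapper` sits in between, so every chunk httpx reads for the multipart body also advances the progress bar. A standalone sketch of the pattern (the 2 MiB payload and task name are illustrative):

```python
import io

from rich.progress import Progress

from dvuploader.nativeupload import _ProgressFileWrapper

payload = io.BytesIO(b"x" * (2 * 1024 * 1024))  # 2 MiB of dummy bytes

with Progress() as progress:
    task = progress.add_task("Uploading", total=2 * 1024 * 1024)
    wrapped = _ProgressFileWrapper(payload, progress, task)
    while wrapped.read():  # each unsized read yields one 1 MiB chunk
        pass  # the bar advances as the "upload" consumes the file
```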
5 changes: 4 additions & 1 deletion dvuploader/utils.py
@@ -4,7 +4,7 @@
 import pathlib
 import re
 import time
-from typing import List
+from typing import List, Optional
 from urllib.parse import urljoin
 
 import httpx
@@ -59,6 +59,7 @@ def retrieve_dataset_files(
     dataverse_url: str,
     persistent_id: str,
     api_token: str,
+    proxy: Optional[str] = None,
 ):
     """
     Retrieve the files of a specific dataset from a Dataverse repository.
@@ -67,6 +68,7 @@
         dataverse_url (str): The base URL of the Dataverse repository.
         persistent_id (str): The persistent identifier (PID) of the dataset.
         api_token (str): API token for authentication.
+        proxy (Optional[str]): The proxy to use for the request.
 
     Returns:
         list: A list of files in the dataset.
@@ -80,6 +82,7 @@
     response = httpx.get(
         urljoin(dataverse_url, DATASET_ENDPOINT),
         headers={"X-Dataverse-key": api_token},
+        proxy=proxy,
     )
 
     response.raise_for_status()
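
For context, `httpx` takes the proxy as a plain URL string on each call, and `proxy=None` means a direct connection, which is why the `None` default threads through harmlessly. A quick illustration (the Dataverse URL and token are placeholders):

```python
import httpx

response = httpx.get(
    "https://demo.dataverse.org/api/info/version",
    headers={"X-Dataverse-key": "XXXXXX"},
    proxy="http://localhost:3128",  # or None to connect directly
)
response.raise_for_status()
print(response.json())
```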
1 change: 1 addition & 0 deletions pyproject.toml
@@ -34,6 +34,7 @@ ipywidgets = "^8.1.1"
 pytest-cov = "^4.1.0"
 pytest-asyncio = "^0.23.3"
 pytest-httpx = "^0.35.0"
+"proxy.py" = "^2.4.4"
 
 [tool.poetry.group.linting.dependencies]
 codespell = "^2.2.6"
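
The new `proxy.py` dev dependency pairs with the workflow change at the top of this diff: instead of a squid service container in CI, the test suite can start a throwaway proxy in-process. A hedged sketch of proxy.py's embedded mode (this fixture is illustrative, not code from this PR):

```python
import proxy  # provided by the proxy.py package
import pytest


@pytest.fixture
def local_proxy():
    # Serve an HTTP proxy on 127.0.0.1:3128 for the duration of a test.
    with proxy.Proxy(["--hostname", "127.0.0.1", "--port", "3128"]):
        yield "http://127.0.0.1:3128"
```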
2 changes: 1 addition & 1 deletion tests/conftest.py
@@ -60,7 +60,7 @@ def create_dataset(
     response = httpx.post(
         url=url,
         headers={"X-Dataverse-key": api_token},
-        data=open("./tests/fixtures/create_dataset.json", "rb"),  # type: ignore
+        data=open("./tests/fixtures/create_dataset.json", "rb"),  # type: ignore[reportUnboundVariable]
     )
 
     response.raise_for_status()