8 changes: 1 addition & 7 deletions .github/workflows/test.yml
@@ -6,16 +6,10 @@ jobs:
   build:
     runs-on: ubuntu-latest
 
-    services:
-      squid:
-        image: ubuntu/squid:latest
-        ports:
-          - 3128:3128
-
     strategy:
       max-parallel: 4
       matrix:
-        python-version: ["3.8", "3.9", "3.10", "3.11"]
+        python-version: ['3.8', '3.9', '3.10', '3.11']
 
     env:
       PORT: 8080
14 changes: 14 additions & 0 deletions README.md
@@ -210,10 +210,24 @@ export DVUPLOADER_TESTING=true
 
 **3. Run the test(s) with pytest**
 
+Run all tests:
+
 ```bash
 poetry run pytest
 ```
+
+Run a specific test:
+
+```bash
+poetry run pytest -k test_native_upload_with_large_file
+```
+
+Run all non-expensive tests:
+
+```bash
+poetry run pytest -m "not expensive"
+```
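
The `-m "not expensive"` filter works because the slow tests carry a matching custom marker. As a sketch of how a test opts in (assuming the `expensive` marker is registered in the project's pytest configuration, which this diff does not show):

```python
import pytest


@pytest.mark.expensive  # deselected by `poetry run pytest -m "not expensive"`
def test_native_upload_with_large_file():
    ...
```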

 ### Linting
 
 This repository uses `ruff` to lint the code and `codespell` to check for spelling mistakes. You can run the linters with the following command:
11 changes: 8 additions & 3 deletions dvuploader/dvuploader.py
@@ -105,6 +105,7 @@ def upload(
         persistent_id=persistent_id,
         api_token=api_token,
         replace_existing=replace_existing,
+        proxy=proxy,
     )
 
     # Sort files by size
@@ -146,6 +147,7 @@ def upload(
                 n_parallel_uploads=n_parallel_uploads,
                 progress=progress,
                 pbars=pbars,
+                proxy=proxy,
             )
         )
     else:
@@ -159,6 +161,7 @@ def upload(
                 pbars=pbars,
                 progress=progress,
                 n_parallel_uploads=n_parallel_uploads,
+                proxy=proxy,
             )
         )
 
@@ -196,7 +199,8 @@ def _check_duplicates(
     persistent_id: str,
     api_token: str,
     replace_existing: bool,
-):
+    proxy: Optional[str] = None,
+) -> None:
     """
     Checks for duplicate files in the dataset by comparing paths and filenames.
 
@@ -205,7 +209,7 @@ def _check_duplicates(
         persistent_id (str): The persistent ID of the dataset.
         api_token (str): The API token for accessing the Dataverse repository.
         replace_existing (bool): Whether to replace files that already exist.
-
+        proxy (Optional[str]): The proxy to use for the request.
     Returns:
         None
     """
@@ -214,6 +218,7 @@ def _check_duplicates(
         dataverse_url=dataverse_url,
         persistent_id=persistent_id,
         api_token=api_token,
+        proxy=proxy,
     )
 
     table = Table(
@@ -252,7 +257,7 @@ def _check_duplicates(
             # calculate checksum
             file.update_checksum_chunked()
             file.apply_checksum()
-            file._unchanged_data = self._check_hashes(file, ds_file)
+            file._unchanged_data = self._check_hashes(file, ds_file)  # type: ignore
             if file._unchanged_data:
                 table.add_row(
                     file.file_name,
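
These hunks thread a new `proxy` keyword from `upload` down into `_check_duplicates`, so the duplicate check honors the same proxy as the uploads themselves. A minimal call-site sketch (assuming the public `upload` exposes the same `proxy` keyword shown here; the URL, token, and DOI are placeholders):

```python
from dvuploader import DVUploader, File

uploader = DVUploader(files=[File(filepath="./data/measurements.csv")])
uploader.upload(
    api_token="XXXXXX-XXXX-XXXX-XXXX",
    dataverse_url="https://demo.dataverse.org/",
    persistent_id="doi:10.70122/XXX/XXXXXX",
    proxy="http://localhost:3128",  # route all requests through a local proxy
)
```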
49 changes: 44 additions & 5 deletions dvuploader/nativeupload.py
@@ -4,7 +4,7 @@
 import tempfile
 from io import BytesIO
 from pathlib import Path
-from typing import Dict, List, Optional, Tuple
+from typing import IO, Dict, List, Optional, Tuple
 
 import httpx
 import rich
@@ -65,6 +65,36 @@
 ZIP_LIMIT_MESSAGE = "The number of files in the zip archive is over the limit"
 
 
+class _ProgressFileWrapper:
+    """
+    Wrap a binary file-like object and update a rich progress bar on reads.
+    httpx's multipart expects a synchronous file-like object exposing .read().
+    """
+
+    def __init__(
+        self,
+        file: IO[bytes],
+        progress: Progress,
+        pbar: TaskID,
+        chunk_size: int = 1024 * 1024,
+    ):
+        self._file = file
+        self._progress = progress
+        self._pbar = pbar
+        self._chunk_size = chunk_size
+
+    def read(self, size: int = -1) -> bytes:
+        # Unsized reads are capped at chunk_size so the bar advances
+        # incrementally instead of jumping to 100% after one huge read.
+        if size is None or size < 0:
+            size = self._chunk_size
+        data = self._file.read(size)
+        if data:
+            self._progress.update(self._pbar, advance=len(data))
+        return data
+
+    def __getattr__(self, name):
+        # Delegate any other attribute (name, seek, close, ...) to the wrapped file.
+        return getattr(self._file, name)
+
+
 init_logging()
 
 
@@ -161,6 +191,7 @@ async def native_upload(
         persistent_id=persistent_id,
         dataverse_url=dataverse_url,
         api_token=api_token,
+        proxy=proxy,
     )
 
 
@@ -255,7 +286,9 @@ def _reset_progress(
 @tenacity.retry(
     wait=RETRY_STRAT,
     stop=tenacity.stop_after_attempt(MAX_RETRIES),
-    retry=tenacity.retry_if_exception_type((httpx.HTTPStatusError,)),
+    retry=tenacity.retry_if_exception_type(
+        (httpx.HTTPStatusError, httpx.ReadError, httpx.RequestError)
+    ),
 )
 async def _single_native_upload(
     session: httpx.AsyncClient,
@@ -301,10 +334,12 @@ async def _single_native_upload(
     json_data = _get_json_data(file)
     handler = file.get_handler()
 
+    assert handler is not None, "File handler is required for native upload"
+
     files = {
         "file": (
             file.file_name,
-            handler,
+            _ProgressFileWrapper(handler, progress, pbar),  # type: ignore[arg-type]
             file.mimeType,
         ),
         "jsonData": (
@@ -316,7 +351,7 @@
 
     response = await session.post(
         endpoint,
-        files=files,  # type: ignore
+        files=files,
    )
 
     if response.status_code == 400 and response.json()["message"].startswith(
@@ -371,6 +406,7 @@ async def _update_metadata(
     dataverse_url: str,
     api_token: str,
     persistent_id: str,
+    proxy: Optional[str],
 ):
     """
     Updates the metadata of the given files in a Dataverse repository.
@@ -390,6 +426,7 @@
         persistent_id=persistent_id,
         dataverse_url=dataverse_url,
         api_token=api_token,
+        proxy=proxy,
     )
 
     tasks = []
@@ -505,6 +542,7 @@ def _retrieve_file_ids(
     persistent_id: str,
     dataverse_url: str,
     api_token: str,
+    proxy: Optional[str] = None,
 ) -> Dict[str, str]:
     """
     Retrieves the file IDs of files in a dataset.
@@ -513,7 +551,7 @@
         persistent_id (str): The persistent identifier of the dataset.
         dataverse_url (str): The URL of the Dataverse repository.
         api_token (str): The API token of the Dataverse repository.
-
+        proxy (Optional[str]): The proxy to use for the request.
     Returns:
         Dict[str, str]: Dictionary mapping file paths to their IDs.
     """
@@ -523,6 +561,7 @@
         persistent_id=persistent_id,
         dataverse_url=dataverse_url,
         api_token=api_token,
+        proxy=proxy,
     )
 
     return _create_file_id_path_mapping(ds_files)
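
The key change in this file is that the raw handler is no longer handed to httpx directly; `_ProgressFileWrapper` sits in between, so every chunk httpx reads for the multipart body also advances the progress bar. A standalone sketch of the pattern (the 2 MiB payload and task name are illustrative):

```python
import io

from rich.progress import Progress

from dvuploader.nativeupload import _ProgressFileWrapper

payload = io.BytesIO(b"x" * (2 * 1024 * 1024))  # 2 MiB of dummy bytes

with Progress() as progress:
    task = progress.add_task("Uploading", total=2 * 1024 * 1024)
    wrapped = _ProgressFileWrapper(payload, progress, task)
    while wrapped.read():  # each unsized read yields one 1 MiB chunk
        pass  # the bar advances as the "upload" consumes the file
```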
5 changes: 4 additions & 1 deletion dvuploader/utils.py
@@ -4,7 +4,7 @@
 import pathlib
 import re
 import time
-from typing import List
+from typing import List, Optional
 from urllib.parse import urljoin
 
 import httpx
@@ -59,6 +59,7 @@ def retrieve_dataset_files(
     dataverse_url: str,
     persistent_id: str,
     api_token: str,
+    proxy: Optional[str] = None,
 ):
     """
     Retrieve the files of a specific dataset from a Dataverse repository.
@@ -67,6 +68,7 @@
         dataverse_url (str): The base URL of the Dataverse repository.
         persistent_id (str): The persistent identifier (PID) of the dataset.
         api_token (str): API token for authentication.
+        proxy (Optional[str]): The proxy to use for the request.
 
     Returns:
         list: A list of files in the dataset.
@@ -80,6 +82,7 @@
     response = httpx.get(
         urljoin(dataverse_url, DATASET_ENDPOINT),
         headers={"X-Dataverse-key": api_token},
+        proxy=proxy,
     )
 
     response.raise_for_status()
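
For context, `httpx` takes the proxy as a plain URL string on each call, and `proxy=None` means a direct connection, which is why the `None` default threads through harmlessly. A quick illustration (the Dataverse URL and token are placeholders):

```python
import httpx

response = httpx.get(
    "https://demo.dataverse.org/api/info/version",
    headers={"X-Dataverse-key": "XXXXXX"},
    proxy="http://localhost:3128",  # or None to connect directly
)
response.raise_for_status()
print(response.json())
```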
1 change: 1 addition & 0 deletions pyproject.toml
@@ -34,6 +34,7 @@ ipywidgets = "^8.1.1"
 pytest-cov = "^4.1.0"
 pytest-asyncio = "^0.23.3"
 pytest-httpx = "^0.35.0"
+"proxy.py" = "^2.4.4"
 
 [tool.poetry.group.linting.dependencies]
 codespell = "^2.2.6"
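
The new `proxy.py` dev dependency pairs with the workflow change at the top of this diff: instead of a squid service container in CI, the test suite can start a throwaway proxy in-process. A hedged sketch of proxy.py's embedded mode (this fixture is illustrative, not code from this PR):

```python
import proxy  # provided by the proxy.py package
import pytest


@pytest.fixture
def local_proxy():
    # Serve an HTTP proxy on 127.0.0.1:3128 for the duration of a test.
    with proxy.Proxy(["--hostname", "127.0.0.1", "--port", "3128"]):
        yield "http://127.0.0.1:3128"
```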
2 changes: 1 addition & 1 deletion tests/conftest.py
@@ -60,7 +60,7 @@ def create_dataset(
     response = httpx.post(
         url=url,
         headers={"X-Dataverse-key": api_token},
-        data=open("./tests/fixtures/create_dataset.json", "rb"),  # type: ignore
+        data=open("./tests/fixtures/create_dataset.json", "rb"),  # type: ignore[reportUnboundVariable]
     )
 
     response.raise_for_status()