6 changes: 6 additions & 0 deletions .gitignore
@@ -6,3 +6,9 @@ build/

# Ignore .cache directory generated by clangd
.cache/

# Python
__pycache__/
*.pyc
.pytest_cache/
*.egg-info/
213 changes: 213 additions & 0 deletions python/README.md
@@ -0,0 +1,213 @@
# hipfile – Python bindings for AMD hipFile

Python `ctypes`-based bindings for [AMD hipFile](https://github.com/ROCm/hipFile),
the ROCm equivalent of NVIDIA's cuFile, enabling **GPU-direct storage** – data
movement directly between NVMe/filesystem storage and GPU memory, bypassing CPU
staging buffers.

> **Status:** Early-stage community bindings, tracking
> [ROCm/hipFile#201](https://github.com/ROCm/hipFile/issues/201).

---

## Requirements

- Linux (x86_64 or aarch64)
- ROCm installed (tested with ROCm 6.x)
- hipFile library built and installed from [ROCm/hipFile](https://github.com/ROCm/hipFile)
- Python 3.8+

---

## Installation

```bash
# From source
git clone https://github.com/ROCm/hipFile.git
cd hipFile/python
pip install -e .
```

Make sure `libhipfile.so` is on your `LD_LIBRARY_PATH`:

```bash
export LD_LIBRARY_PATH=/opt/rocm/lib:$LD_LIBRARY_PATH
```
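A quick, self-contained sanity check (standard library only) can confirm whether the dynamic loader can actually resolve the library before you run anything; the helper name below is just for illustration:

```python
import ctypes

def hipfile_loadable(soname: str = "libhipfile.so") -> bool:
    """Return True if the dynamic loader can load the given shared library."""
    try:
        ctypes.CDLL(soname)  # honours LD_LIBRARY_PATH at load time
        return True
    except OSError:
        return False

print("libhipfile loadable:", hipfile_loadable())
```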

---

## Quick start

```python
import ctypes

import hipfile

# --- Open the driver ---
with hipfile.CuFileDriver():

    byte_size = 4 * 1024 * 1024  # 4 MiB

    # Allocate GPU memory (example using PyTorch)
    import torch
    tensor = torch.empty(1024 * 1024, dtype=torch.float32, device="cuda")
    gpu_ptr = tensor.data_ptr()

    # Register the GPU buffer, then do the I/O
    from hipfile.bindings import hipFileBufRegister, hipFileBufDeregister

    hipFileBufRegister(ctypes.c_void_p(gpu_ptr), byte_size, 0)
    try:
        # CuFile opens the file by path itself, so no separate
        # os.open(..., os.O_DIRECT) call is needed here
        with hipfile.CuFile("data.bin", "r") as f:
            n = f.read(ctypes.c_void_p(gpu_ptr), byte_size, file_offset=0)
            print(f"Read {n} bytes directly into GPU memory")
    finally:
        hipFileBufDeregister(ctypes.c_void_p(gpu_ptr))
```

---

## API overview

### Driver lifecycle

```python
hipfile.hipFileDriverOpen() # initialise the hipFile driver
hipfile.hipFileDriverClose() # tear down

props = hipfile.hipFileDriverGetProperties() # returns hipFileDriverProps_t
print(props.major_version, props.minor_version)

hipfile.hipFileDriverSetMaxDirectIOSize(128) # KB
hipfile.hipFileDriverSetMaxCacheSize(512) # KB
hipfile.hipFileDriverSetMaxPinnedMemSize(256) # KB
```

### Context managers

```python
from hipfile.bindings import hipFileBufRegister, hipFileBufDeregister

with hipfile.CuFileDriver(): # open / close driver
# Register GPU buffer
hipFileBufRegister(ctypes.c_void_p(ptr), size, 0)
try:
with hipfile.CuFile("data.bin", "r+") as f: # open / close file
f.read(ptr, count=size, file_offset=0)
f.write(ptr, count=size, file_offset=0)
finally:
hipFileBufDeregister(ctypes.c_void_p(ptr))
```

### Buffer registration

```python
from hipfile.bindings import hipFileBufRegister, hipFileBufDeregister

hipFileBufRegister(ctypes.c_void_p(gpu_ptr), size, 0)
hipFileBufDeregister(ctypes.c_void_p(gpu_ptr))
```

### Error handling

```python
try:
hipfile.hipFileDriverOpen()
except hipfile.HipFileError as e:
    print(f"HipFile error: {e}")
```

---

## Running the tests

The test suite uses mocks and runs without real hardware:

```bash
pip install pytest
pytest tests/ -v
```
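A minimal sketch of the mock-based approach the suite takes — the class below is a stand-in defined here so the sketch runs anywhere; the real tests would patch `hipfile.bindings.hipFileDriverOpen` by name instead:

```python
from unittest import mock

class FakeBindings:
    """Stand-in for hipfile.bindings in this sketch."""
    @staticmethod
    def hipFileDriverOpen():
        raise OSError("libhipfile.so not available")

def test_driver_open_is_mocked():
    # Patching means the test needs neither a GPU nor libhipfile.so
    with mock.patch.object(FakeBindings, "hipFileDriverOpen",
                           return_value=0) as fake:
        assert FakeBindings.hipFileDriverOpen() == 0
        fake.assert_called_once()
```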

For LMCache integration testing:

```bash
python test_lmcache_integration.py
```

---

## PyTorch example

```bash
python examples/pytorch_example.py --create --count 1048576
```

---

## LMCache Integration

hipFile Python bindings are designed to be a drop-in replacement for NVIDIA's cuFile in applications like LMCache:

```python
# Works with both cuFile and hipFile
try:
import cufile as gds_lib
except ImportError:
import hipfile as gds_lib

# Same high-level API for both; low-level calls keep the library prefix
# (hipFileBufRegister here, cuFileBufRegister under cuFile), so use
# attribute access on the alias rather than a literal `from gds_lib...`
# import, which would not resolve
with gds_lib.CuFileDriver():
    gds_lib.hipFileBufRegister(ctypes.c_void_p(tensor.data_ptr()), tensor.nbytes, 0)
    # ... perform GDS operations ...
```

---

## Building & Publishing

```bash
# Build sdist and wheel
uv build

# Validate the built artifacts
uvx twine check dist/*

# Publish to PyPI
uv publish
```

---

## How it works

hipFile provides a C API for GPU-direct I/O on AMD ROCm hardware. These Python
bindings use `ctypes` to call `libhipfile.so` directly, with no C compilation
needed. The binding layer:

1. Loads `libhipfile.so` lazily on first use (path configurable via `HIPFILE_LIB_PATH`).
2. Declares `argtypes` / `restype` for each API function.
3. Wraps the C types in Pythonic classes with context-manager support.
4. Translates error status codes to `HipFileError` exceptions.
5. Provides cuFile-compatible API for easy migration.
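The steps above can be sketched roughly as follows; the function names and the `hipFileBufRegister` signature here are illustrative assumptions, not the actual module layout:

```python
import ctypes
import os

class HipFileError(RuntimeError):
    """Raised when a hipFile call returns a non-zero status code."""

def load_hipfile():
    # Step 1: honour HIPFILE_LIB_PATH, falling back to the soname so the
    # normal LD_LIBRARY_PATH search applies
    path = os.environ.get("HIPFILE_LIB_PATH", "libhipfile.so")
    lib = ctypes.CDLL(path)
    # Step 2: declare argtypes/restype so ctypes marshals arguments safely
    # (signature assumed from the (ptr, size, flags) usage in this README)
    lib.hipFileBufRegister.argtypes = [ctypes.c_void_p, ctypes.c_size_t,
                                       ctypes.c_int]
    lib.hipFileBufRegister.restype = ctypes.c_int
    return lib

def check_status(status: int) -> int:
    # Step 4: translate non-zero C status codes into Python exceptions
    if status != 0:
        raise HipFileError(f"hipFile call failed with status {status}")
    return status
```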

---

## Known Limitations

- `RegisteredBuffer` context manager is not yet implemented
- Some advanced cuFile features may not yet have hipFile equivalents
- Error reporting is less detailed than NVIDIA's cuFile
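One hypothetical shape for the missing `RegisteredBuffer` helper, sketched so it runs without `libhipfile.so` (the register/deregister callables are injected here; in real use they would be `hipFileBufRegister` / `hipFileBufDeregister`):

```python
import ctypes
from contextlib import contextmanager

@contextmanager
def registered_buffer(ptr, size, flags=0, *, register, deregister):
    """Register `ptr` for GPU-direct I/O, yield it, deregister on exit."""
    register(ctypes.c_void_p(ptr), size, flags)
    try:
        yield ptr
    finally:
        # Deregistration runs even if the body raises
        deregister(ctypes.c_void_p(ptr))
```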

---

## Contributing

PRs welcome! The main tracking issue for official bindings is
[ROCm/hipFile#201](https://github.com/ROCm/hipFile/issues/201).
91 changes: 91 additions & 0 deletions python/examples/pytorch_example.py
@@ -0,0 +1,91 @@
"""
examples/pytorch_example.py
----------------------------
Demonstrates loading a tensor from disk directly into GPU memory using
hipFile + PyTorch on an AMD ROCm system.

Requirements:
- AMD GPU with ROCm installed
- hipFile library (see https://github.com/ROCm/hipFile)
- PyTorch with ROCm support (pip install torch --index-url https://download.pytorch.org/whl/rocm6.1)
- This package: pip install hipfile (or python -m pip install -e .)

Usage::

python pytorch_example.py --file /path/to/float32_tensor.bin --count 1048576
"""

import argparse
import ctypes
import os
import struct
import time

import hipfile


def create_test_file(path: str, n_floats: int = 1024 * 1024) -> None:
"""Write n_floats random float32 values to a file for testing."""
import random
data = struct.pack(f"{n_floats}f", *[random.random() for _ in range(n_floats)])
with open(path, "wb") as f:
f.write(data)
print(f"Created test file: {path} ({len(data)} bytes)")


def load_tensor_gpu_direct(filepath: str, n_floats: int):
"""Load float32 data from file directly into a GPU tensor via hipFile."""
try:
import torch
except ImportError:
print("PyTorch not available – skipping GPU demo.")
return

if not torch.cuda.is_available():
print("No GPU available (torch.cuda.is_available() = False).")
return

dtype = torch.float32
byte_size = n_floats * dtype.itemsize

# 1. Allocate device tensor
tensor = torch.empty(n_floats, dtype=dtype, device="cuda")
dev_ptr = tensor.data_ptr() # raw integer GPU pointer

# 2. Initialise hipFile driver and register the GPU buffer
from hipfile.bindings import hipFileBufRegister, hipFileBufDeregister

    # Enter the driver context so it is opened and closed deterministically
    # (a bare CuFileDriver() call would discard the object without entering it)
    with hipfile.CuFileDriver():
        hipFileBufRegister(ctypes.c_void_p(dev_ptr), byte_size, 0)
        try:
            with hipfile.CuFile(filepath, "r", use_direct_io=True) as hf:
                t0 = time.perf_counter()
                n_read = hf.read(dev_ptr, byte_size, file_offset=0)
                elapsed = time.perf_counter() - t0
        finally:
            hipFileBufDeregister(ctypes.c_void_p(dev_ptr))

bw_gb = (n_read / elapsed) / 1e9
print(f"GPU-direct read: {n_read} bytes in {elapsed*1000:.2f} ms "
f"({bw_gb:.2f} GB/s)")
print(f"Tensor first 5 values: {tensor[:5].tolist()}")


def main():
parser = argparse.ArgumentParser(description="hipFile + PyTorch example")
parser.add_argument("--file", default="/tmp/test_hipfile.bin",
help="Path to float32 binary file")
parser.add_argument("--count", type=int, default=1024 * 1024,
help="Number of float32 values")
parser.add_argument("--create", action="store_true",
help="Create a test file before reading")
args = parser.parse_args()

if args.create or not os.path.exists(args.file):
create_test_file(args.file, args.count)

load_tensor_gpu_direct(args.file, args.count)


if __name__ == "__main__":
main()
58 changes: 58 additions & 0 deletions python/hipfile/__init__.py
@@ -0,0 +1,58 @@
"""
hipfile – Python bindings for AMD hipFile (GPU-direct storage).

Drop-in replacement for the cufile Python package.
Mirrors cufile/__init__.py structure with hip* naming.

Quick start::

    import ctypes
    import hipfile

with hipfile.CuFile("data.bin", "r+") as f:
f.write(ctypes.c_void_p(gpu_ptr), size, file_offset=0)
"""

# High-level (mirrors cufile.cufile exports)
from .hipfile import (
CuFile,
CuFileDriver,
)

# Low-level convenience functions (mirrors cufile.bindings exports)
from .bindings import (
HipFileError,
hipFileDriverOpen,
hipFileDriverClose,
hipFileHandleRegister,
hipFileHandleDeregister,
hipFileBufRegister,
hipFileBufDeregister,
hipFileRead,
hipFileWrite,
hipFileHandle_t,
hipFileStatus,
hipFileDescr,
DescrUnion,
)

__version__ = "0.1.0"

__all__ = [
# High-level
"CuFile",
"CuFileDriver",
"HipFileError",
# Low-level
"hipFileDriverOpen",
"hipFileDriverClose",
"hipFileHandleRegister",
"hipFileHandleDeregister",
"hipFileBufRegister",
"hipFileBufDeregister",
"hipFileRead",
"hipFileWrite",
"hipFileHandle_t",
"hipFileStatus",
"hipFileDescr",
"DescrUnion",
]