feat: initial commit

art049 · art049 · commit d4317b7b3b6b · 2024-10-30T15:38:01.000+01:00
feat: initial commit

feat: add workflow to measure walltime in CI

fix: gil detection on python 3.12

fix: install deps in a separate step

fix: bump codspeed runner
diff --git a/.github/workflows/codspeed.yml b/.github/workflows/codspeed.yml
@@ -0,0 +1,31 @@
+on:
+  push:  # no branch or path filter is configured here
+
+jobs:
+  codspeed:
+    runs-on: codspeed-macro
+    strategy:
+      matrix:
+        python-version: ["3.12", "3.13"]
+        include:  # extra 3.13t jobs, one per PYTHON_GIL value
+          - { python-version: "3.13t", gil: "1" }
+          - { python-version: "3.13t", gil: "0" }
+    env:
+      UV_PYTHON: ${{ matrix.python-version }}  # NOTE(review): presumably selects the interpreter for the uv commands below - confirm
+    steps:
+      - uses: actions/checkout@v4
+      - name: Install uv
+        uses: astral-sh/setup-uv@v3
+      - name: Set up Python ${{ matrix.python-version }}
+        run: uv python install
+
+      - name: Install dependencies
+        run: uv sync --all-extras
+
+      - uses: CodSpeedHQ/action@v3
+        env:
+          PYTHON_GIL: ${{ matrix.gil }}  # only the 3.13t matrix entries define gil
+        with:
+          runner-version: 3.1.0-beta.3
+          run: uv run pytest --codspeed --codspeed-max-time 10 -vs tests.py
+          token: ${{ secrets.CODSPEED_TOKEN }}
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,13 @@
+# Python-generated files
+__pycache__/
+*.py[oc]
+build/
+dist/
+wheels/
+*.egg-info
+
+.codspeed
+# Virtual environments
+.venv
+
+uv.lock
diff --git a/.python-version b/.python-version
@@ -0,0 +1 @@
+3.13
diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2024 CodSpeed - Unmatched Performance Testing
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/README.md b/README.md
@@ -0,0 +1,15 @@
+# PageRank performance measurement with Python 3.13
+
+Running:
+
+- Without GIL:
+
+  ```
+  uv run python -X gil=0 -m pytest --codspeed -vs -x --codspeed-max-time 10 tests.py
+  ```
+
+- With GIL:
+
+  ```
+  uv run python -X gil=1 -m pytest --codspeed -vs -x --codspeed-max-time 10 tests.py
+  ```
diff --git a/pagerank.py b/pagerank.py
@@ -0,0 +1,121 @@
+import numpy as np
+import multiprocessing
+from concurrent.futures import ThreadPoolExecutor
+import threading
+
+
+DAMPING = 0.85  # weight of link-propagated score; 1 - DAMPING is spread evenly over all nodes
+
+
+def pagerank_single(matrix: np.ndarray, num_iterations: int) -> np.ndarray:
+    """Compute PageRank scores with plain nested loops on the calling thread.
+
+    A non-zero ``matrix[j, i]`` is read as an edge j -> i. Starting from a
+    uniform vector, each of the ``num_iterations`` rounds gives node ``i`` the
+    sum of ``scores[j] / row_sum(j)`` over its incoming nodes, then blends
+    that with a uniform term using DAMPING.
+    """
+    size = matrix.shape[0]
+    ranks = np.ones(size) / size
+
+    for _ in range(num_iterations):
+        gathered = np.zeros(size)
+        for target in range(size):
+            # Rows with a non-zero entry in this column link to `target`.
+            for source in np.where(matrix[:, target])[0]:
+                gathered[target] += ranks[source] / np.sum(matrix[source])
+        ranks = (1 - DAMPING) / size + DAMPING * gathered
+
+    return ranks
+
+
+def _process_chunk(
+    matrix: np.ndarray, scores: np.ndarray, start_idx: int, end_idx: int
+) -> np.ndarray:
+    """Accumulate incoming contributions for columns [start_idx, end_idx).
+
+    Returns a full-length vector that is zero outside that column range, so
+    the per-chunk results can simply be added together by the caller.
+    """
+    contrib = np.zeros(matrix.shape[0])
+    for col in range(start_idx, end_idx):
+        for row in np.where(matrix[:, col])[0]:
+            contrib[col] += scores[row] / np.sum(matrix[row])
+    return contrib
+
+
+def pagerank_multiprocess(
+    matrix: np.ndarray, num_iterations: int, num_processes: int
+) -> np.ndarray:
+    """Multi-process PageRank: column ranges are scored by a worker pool.
+
+    Task tuples are rebuilt every iteration so workers receive the current
+    ``scores``; building them once would keep feeding the initial vector.
+    """
+    size = matrix.shape[0]
+    scores = np.ones(size) / size
+
+    # At least one column per chunk: a zero step would make range() raise.
+    chunk_size = max(1, size // num_processes)
+    bounds = [(i, min(i + chunk_size, size)) for i in range(0, size, chunk_size)]
+
+    for _ in range(num_iterations):
+        tasks = [(matrix, scores, lo, hi) for lo, hi in bounds]
+        with multiprocessing.Pool(processes=num_processes) as pool:
+            chunk_results = pool.starmap(_process_chunk, tasks)
+        # Chunks are zero outside their own columns, so summing merges them.
+        new_scores = sum(chunk_results)
+        scores = (1 - DAMPING) / size + DAMPING * new_scores
+
+    return scores
+
+
+def _thread_worker(
+    matrix: np.ndarray,
+    scores: np.ndarray,
+    new_scores: np.ndarray,
+    start_idx: int,
+    end_idx: int,
+    lock: threading.Lock,
+):
+    """Score columns [start_idx, end_idx) and add them into ``new_scores``.
+
+    Work accumulates in a private buffer; ``lock`` is held only for the final
+    in-place merge into the shared ``new_scores`` array. Returns None.
+    """
+    buffer = np.zeros(matrix.shape[0])
+    for col in range(start_idx, end_idx):
+        for row in np.where(matrix[:, col])[0]:
+            buffer[col] += scores[row] / np.sum(matrix[row])
+    with lock:
+        new_scores += buffer
+
+
+def pagerank_multithread(
+    matrix: np.ndarray, num_iterations: int, num_threads: int
+) -> np.ndarray:
+    """Multi-threaded PageRank: column ranges are scored by a thread pool.
+
+    Every worker adds its partial result into one shared array under a lock.
+    The map() results are drained so that an exception raised in a worker is
+    re-raised here instead of silently producing incomplete scores.
+    """
+    size = matrix.shape[0]
+    scores = np.ones(size) / size
+
+    # At least one column per chunk: a zero step would make range() raise.
+    chunk_size = max(1, size // num_threads)
+    chunks = [(i, min(i + chunk_size, size)) for i in range(0, size, chunk_size)]
+
+    for _ in range(num_iterations):
+        new_scores = np.zeros(size)
+        lock = threading.Lock()
+        with ThreadPoolExecutor(max_workers=num_threads) as executor:
+            results = executor.map(
+                lambda args: _thread_worker(*args),  # starmap isn't available
+                [(matrix, scores, new_scores, lo, hi, lock) for lo, hi in chunks],
+            )
+            list(results)  # drain the iterator to surface worker exceptions
+        scores = (1 - DAMPING) / size + DAMPING * new_scores
+
+    return scores
diff --git a/pyproject.toml b/pyproject.toml
@@ -0,0 +1,9 @@
+[project]
+name = "python-parallel-pagerank"
+version = "0.1.0"
+readme = "README.md"
+requires-python = ">=3.12"
+dependencies = ["numpy>=2.1.2"]
+
+[tool.uv]
+dev-dependencies = ["pytest-codspeed>=3.0.0"]
diff --git a/tests.py b/tests.py
@@ -0,0 +1,66 @@
+from typing import Callable
+import numpy as np
+import sys
+import pytest
+from pytest_codspeed import BenchmarkFixture
+from functools import partial
+
+from pagerank import pagerank_multiprocess, pagerank_multithread, pagerank_single
+
+PagerankFunc = Callable[[np.ndarray, int], np.ndarray]  # (matrix, num_iterations) -> scores
+
+
+def create_test_graph(size: int) -> np.ndarray:
+    """Build a reproducible random 0/1 adjacency matrix of shape (size, size).
+
+    The global NumPy RNG is reseeded with 0, so equal sizes give equal graphs.
+    Each entry is 1 with probability 5 / size (about five out-edges per row).
+    """
+    np.random.seed(0)
+    edge_prob = 5 / size
+    graph = np.random.choice([0, 1], size=(size, size), p=[1 - edge_prob, edge_prob])
+
+    # Rows without any edge get one edge to a randomly drawn column.
+    sinks = np.where(~graph.any(axis=1))[0]
+    if len(sinks) > 0:
+        targets = np.random.randint(0, size, size=len(sinks))
+        graph[sinks, targets] = 1
+
+    return graph
+
+
+@pytest.fixture(scope="session", autouse=True)
+def print_gil_status():
+    """Print the interpreter version and GIL state once per test session."""
+    gil_probe = getattr(sys, "_is_gil_enabled", None)
+    if gil_probe is None:
+        status = "sys._is_gil_enabled() is not available in this Python version."
+    else:
+        status = f"GIL is {"enabled" if gil_probe() else "disabled"}"
+    print("", f"Running {sys.version}", status, "", sep="\n")
+
+
+@pytest.mark.parametrize(
+    "pagerank",
+    [
+        pagerank_single,
+        partial(pagerank_multiprocess, num_processes=8),
+        partial(pagerank_multithread, num_threads=8),
+    ],
+    ids=["single", "8-processes", "8-threads"],
+)
+@pytest.mark.parametrize(
+    "graph",
+    [create_test_graph(size) for size in (100, 1000, 2000)],
+    ids=["XS", "L", "XL"],
+)
+def test_pagerank(
+    benchmark: BenchmarkFixture,
+    pagerank: PagerankFunc,
+    graph: np.ndarray,
+):
+    """Benchmark ten PageRank iterations for one implementation/graph pair.
+
+    Graphs are built at collection time, outside the measured call.
+    """
+    benchmark(pagerank, graph, num_iterations=10)