From 99383e683387f309d0c042bc5cf4a2424bfce012 Mon Sep 17 00:00:00 2001 From: Caroline Chen Date: Sun, 7 Dec 2025 16:18:52 -0500 Subject: [PATCH 1/2] Profiling notebook --- examples/profile_hdbscan.ipynb | 625 +++++++++++++++++++++++++++++++++ 1 file changed, 625 insertions(+) create mode 100644 examples/profile_hdbscan.ipynb diff --git a/examples/profile_hdbscan.ipynb b/examples/profile_hdbscan.ipynb new file mode 100644 index 00000000..fcfe8c9b --- /dev/null +++ b/examples/profile_hdbscan.ipynb @@ -0,0 +1,625 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# HDBSCAN Profiling Analysis\n", + "\n", + "This notebook identifies and analyzes the bottleneck in the HDBSCAN implementation." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setup" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "import pandas as pd\n", + "import numpy as np\n", + "import time\n", + "import cProfile\n", + "import pstats\n", + "from io import StringIO\n", + "import matplotlib.pyplot as plt\n", + "from pathlib import Path\n", + "\n", + "# Add parent directory to path\n", + "sys.path.insert(0, str(Path().absolute().parent))\n", + "\n", + "from nomad.stop_detection import hdbscan" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Test Data Generator" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Generated 100 points\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
timestampxyuser_id
01704067411-25.091976-93.7141631
1170406758690.14286127.2820821
2170406772046.398788-37.1288041
3170406789219.7316971.7141381
41704068151-68.79627281.5132951
\n", + "
" + ], + "text/plain": [ + " timestamp x y user_id\n", + "0 1704067411 -25.091976 -93.714163 1\n", + "1 1704067586 90.142861 27.282082 1\n", + "2 1704067720 46.398788 -37.128804 1\n", + "3 1704067892 19.731697 1.714138 1\n", + "4 1704068151 -68.796272 81.513295 1" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def generate_test_data(n_points=1000, seed=42):\n", + " \"\"\"\n", + " Generate test data with GUARANTEED unique timestamps.\n", + " \"\"\"\n", + " np.random.seed(seed)\n", + " \n", + " # Random spatial coordinates\n", + " x = np.random.uniform(-100, 100, n_points)\n", + " y = np.random.uniform(-100, 100, n_points)\n", + " \n", + " # Generate UNIQUE timestamps using cumulative sum\n", + " base_time = int(pd.Timestamp('2024-01-01').timestamp())\n", + " intervals = np.random.randint(60, 300, n_points) # 1-5 min apart\n", + " timestamps = base_time + np.cumsum(intervals)\n", + " \n", + " data = pd.DataFrame({\n", + " 'timestamp': timestamps,\n", + " 'x': x,\n", + " 'y': y,\n", + " 'user_id': 1\n", + " })\n", + " \n", + " print(f\"Generated {len(data)} points\")\n", + " \n", + " return data\n", + "\n", + "# Test it\n", + "test_data = generate_test_data(100)\n", + "test_data.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. cProfile Analysis\n", + "\n", + "Identify which functions consume the most time." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Generated 500 points\n", + "================================================================================\n", + "cProfile Results - Top 40 Functions by Cumulative Time\n", + "================================================================================\n", + " 2959533 function calls (2906807 primitive calls) in 0.772 seconds\n", + "\n", + " Ordered by: cumulative time\n", + " List reduced from 979 to 40 due to restriction <40>\n", + "\n", + " ncalls tottime percall cumtime percall filename:lineno(function)\n", + " 2 0.000 0.000 0.772 0.386 interactiveshell.py:3514(run_code)\n", + " 2 0.000 0.000 0.772 0.386 {built-in method builtins.exec}\n", + " 1 0.000 0.000 0.772 0.772 1367433339.py:8()\n", + " 1 0.002 0.002 0.771 0.771 hdbscan.py:624(hdbscan_labels)\n", + " 1 0.009 0.009 0.543 0.543 hdbscan.py:219(cluster_hierarchy)\n", + " 294 0.010 0.000 0.231 0.001 hdbscan.py:160(_build_border_map)\n", + " 456 0.007 0.000 0.144 0.000 hdbscan.py:339(_build_graph_pd)\n", + " 1004 0.003 0.000 0.116 0.000 indexing.py:883(__setitem__)\n", + " 293 0.000 0.000 0.099 0.000 multi.py:216(new_meth)\n", + " 293 0.003 0.000 0.098 0.000 multi.py:546(from_tuples)\n", + " 295 0.001 0.000 0.097 0.000 multi.py:475(from_arrays)\n", + " 295 0.000 0.000 0.092 0.000 categorical.py:3089(factorize_from_iterables)\n", + " 885 0.000 0.000 0.092 0.000 categorical.py:3110()\n", + " 590 0.001 0.000 0.092 0.000 categorical.py:3046(factorize_from_iterable)\n", + "3762/3176 0.007 0.000 0.091 0.000 base.py:3827(get_indexer)\n", + " 592 0.003 0.000 0.090 0.000 categorical.py:371(__init__)\n", + " 1205 0.001 0.000 0.086 0.000 indexing.py:1533(_get_listlike_indexer)\n", + " 1208 0.003 0.000 0.085 0.000 base.py:6198(_get_indexer_strict)\n", + " 1004 0.001 0.000 0.084 0.000 indexing.py:745(_get_setitem_indexer)\n", + " 1004 0.001 0.000 0.082 0.000 indexing.py:1453(_convert_to_indexer)\n", + " 1781 0.001 0.000 0.066 0.000 common.py:62(new_method)\n", + "536933/531424 0.036 0.000 0.065 0.000 {built-in method builtins.isinstance}\n", + " 1501 0.002 0.000 0.064 0.000 base.py:4330(reindex)\n", + " 1 0.022 0.022 0.063 0.063 hdbscan.py:53(_compute_core_distance)\n", + " 2258 0.001 0.000 0.062 0.000 base.py:6174(get_indexer_for)\n", + " 968/875 0.001 0.000 0.054 0.000 indexing.py:1177(__getitem__)\n", + " 1228 0.002 0.000 0.054 0.000 series.py:1107(__getitem__)\n", + " 1231 0.001 0.000 0.053 0.000 series.py:6129(_cmp_method)\n", + " 3710 0.010 0.000 0.053 0.000 base.py:475(__new__)\n", + " 843 0.001 0.000 0.050 0.000 indexing.py:1398(_getitem_axis)\n", + "3515/2929 0.011 0.000 0.041 0.000 base.py:3962(_get_indexer)\n", + " 1011 0.013 0.000 0.040 0.000 base.py:675(_with_infer)\n", + " 2368 0.001 0.000 0.040 0.000 dtypes.py:220(__init__)\n", + " 2368 0.001 0.000 0.040 0.000 dtypes.py:373(_finalize)\n", + " 1184 0.002 0.000 0.038 0.000 dtypes.py:550(validate_categories)\n", + " 1325 0.002 0.000 0.037 0.000 series.py:6222(_construct_result)\n", + " 6734 0.011 0.000 0.037 0.000 construction.py:517(sanitize_array)\n", + " 293 0.000 0.000 0.036 0.000 series.py:5155(reindex)\n", + " 2454 0.004 0.000 0.035 0.000 multi.py:326(__new__)\n", + " 293 0.001 0.000 0.035 0.000 generic.py:5365(reindex)\n", + "\n", + "\n", + "\n" + ] + } + ], + "source": [ + "# Generate test data\n", + "data_500 = generate_test_data(500, seed=99)\n", + "\n", + "# Profile with cProfile\n", + "profiler = cProfile.Profile()\n", + "profiler.enable()\n", + "\n", + "labels = hdbscan.hdbscan_labels(\n", + " data=data_500,\n", + " time_thresh=30,\n", + " min_pts=2,\n", + " min_cluster_size=2,\n", + " dur_min=5,\n", + " traj_cols={'timestamp': 'timestamp', 'x': 'x', 'y': 'y', 'user_id': 'user_id'}\n", + ")\n", + "\n", + "profiler.disable()\n", + "\n", + "# Print stats\n", + "s = StringIO()\n", + "ps = pstats.Stats(profiler, stream=s)\n", + "ps.strip_dirs()\n", + "ps.sort_stats('cumulative')\n", + "ps.print_stats(40)\n", + "\n", + "print(\"=\"*80)\n", + "print(\"cProfile Results - Top 40 Functions by Cumulative Time\")\n", + "print(\"=\"*80)\n", + "print(s.getvalue())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. line_profiler Analysis\n", + "\n", + "Line-by-line profiling of suspected O(n²) functions." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "# Install line_profiler if needed\n", + "!pip install line_profiler -q" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext line_profiler" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Generated 300 points\n", + "================================================================================\n", + "Line-by-line profile of _find_temp_neighbors()\n", + "================================================================================\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Timer unit: 1e-09 s\n", + "\n", + "Total time: 0.00294 s\n", + "File: /Users/carolinechen/Desktop/cs/nomad/nomad/stop_detection/hdbscan.py\n", + "Function: _find_temp_neighbors at line 11\n", + "\n", + "Line # Hits Time Per Hit % Time Line Contents\n", + "==============================================================\n", + " 11 def _find_temp_neighbors(times, time_thresh, use_datetime):\n", + " 12 \"\"\"\n", + " 13 Find timestamp pairs that are within time threshold.\n", + " 14 \n", + " 15 Parameters\n", + " 16 ----------\n", + " 17 times : array of timestamps.\n", + " 18 time_thresh : time threshold for finding what timestamps are close in time.\n", + " 19 use_datetime : Whether to process timestamps as datetime objects.\n", + " 20 \n", + " 21 Returns\n", + " 22 -------\n", + " 23 time_pairs : list of tuples of timestamps [(t1, t2), ...] that are close in time given time_thresh.\n", + " 24 \n", + " 25 TC: O(n^2)\n", + " 26 \"\"\"\n", + " 27 # getting times based on whether they are datetime values or timestamps, changed to seconds for calculations\n", + " 28 1 10000.0 10000.0 0.3 times = to_timestamp(times).values if use_datetime else times.values\n", + " 29 \n", + " 30 # Pairwise time differences\n", + " 31 # times[:, np.newaxis]: from shape (n,) -> to shape (n, 1) – a column vector\n", + " 32 1 667000.0 667000.0 22.7 time_diffs = np.abs(times[:, np.newaxis] - times)\n", + " 33 1 132000.0 132000.0 4.5 time_diffs = time_diffs.astype(int)\n", + " 34 \n", + " 35 # Filter by time threshold\n", + " 36 1 263000.0 263000.0 8.9 within_time_thresh = np.triu(time_diffs <= (time_thresh * 60), k=1) # keep upper triangle\n", + " 37 1 398000.0 398000.0 13.5 i_idx, j_idx = np.where(within_time_thresh)\n", + " 38 \n", + " 39 # Return a list of (timestamp1, timestamp2) tuples\n", + " 40 1 1470000.0 1.47e+06 50.0 time_pairs = [(times[i], times[j]) for i, j in zip(i_idx, j_idx)]\n", + " 41 \n", + " 42 1 0.0 0.0 0.0 return time_pairs, times" + ] + } + ], + "source": [ + "data_300 = generate_test_data(300, seed=88)\n", + "\n", + "print(\"=\"*80)\n", + "print(\"Line-by-line profile of _find_temp_neighbors()\")\n", + "print(\"=\"*80)\n", + "\n", + "%lprun -f hdbscan._find_temp_neighbors hdbscan.hdbscan_labels(\\\n", + " data=data_300,\\\n", + " time_thresh=30,\\\n", + " min_pts=2,\\\n", + " min_cluster_size=2,\\\n", + " dur_min=5,\\\n", + " traj_cols={'timestamp': 'timestamp', 'x': 'x', 'y': 'y', 'user_id': 'user_id'}\\\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "================================================================================\n", + "Line-by-line profile of _compute_core_distance()\n", + "================================================================================\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Timer unit: 1e-09 s\n", + "\n", + "Total time: 0.055209 s\n", + "File: /Users/carolinechen/Desktop/cs/nomad/nomad/stop_detection/hdbscan.py\n", + "Function: _compute_core_distance at line 53\n", + "\n", + "Line # Hits Time Per Hit % Time Line Contents\n", + "==============================================================\n", + " 53 def _compute_core_distance(data, time_pairs, times, use_lon_lat, traj_cols, min_pts = 2):\n", + " 54 \"\"\"\n", + " 55 Calculate the core distance for each ping in data.\n", + " 56 It gives local density estimate: small core distance → high local density.\n", + " 57 \n", + " 58 Parameters\n", + " 59 ----------\n", + " 60 data : dataframe\n", + " 61 \n", + " 62 time_pairs : tuples of timestamps that are close in time given time_thresh\n", + " 63 \n", + " 64 min_pts : int\n", + " 65 used to calculate the core distance of a point p, where core distance of a point p \n", + " 66 is defined as the distance from p to its min_pts-th smallest nearest neighbor\n", + " 67 (including itself).\n", + " 68 \n", + " 69 Returns\n", + " 70 -------\n", + " 71 core_distances : dictionary of timestamps\n", + " 72 {timestamp_1: core_distance_1, ..., timestamp_n: core_distance_n} distances are quantized\n", + " 73 \"\"\"\n", + " 74 # getting coordinates based on whether they are geographic coordinates (lon, lat) or catesian (x,y)\n", + " 75 1 2000.0 2000.0 0.0 if use_lon_lat:\n", + " 76 coords = np.radians(data[[traj_cols['latitude'], traj_cols['longitude']]].values) # TC: O(n)\n", + " 77 else:\n", + " 78 1 1637000.0 1.64e+06 3.0 coords = data[[traj_cols['x'], traj_cols['y']]].values # TC: O(n)\n", + " 79 \n", + " 80 1 1000.0 1000.0 0.0 n = len(coords)\n", + " 81 # get the index of timestamp in the arrays (for accessing their value later)\n", + " 82 1 131000.0 131000.0 0.2 ts_indices = {ts: idx for idx, ts in enumerate(times)} # TC: O(n)\n", + " 83 \n", + " 84 # Build neighbor map from time_pairs\n", + " 85 1 3153000.0 3.15e+06 5.7 neighbors = _build_neighbor_graph(time_pairs, times)\n", + " 86 \n", + " 87 1 2000.0 2000.0 0.0 D_INF = np.pi * 6_371_000 # max distance on earth\n", + " 88 1 0.0 0.0 0.0 core_distances = {}\n", + " 89 \n", + " 90 301 68000.0 225.9 0.1 for i in range(n): # TC: O(n+m (mlogm)) \n", + " 91 300 78000.0 260.0 0.1 u = times[i]\n", + " 92 300 80000.0 266.7 0.1 allowed_neighbors = neighbors[u]\n", + " 93 300 95000.0 316.7 0.2 dists = [0.0] # distance to itself\n", + " 94 \n", + " 95 6008 1340000.0 223.0 2.4 for v in allowed_neighbors:\n", + " 96 5708 1442000.0 252.6 2.6 j = ts_indices.get(v)\n", + " 97 5708 1001000.0 175.4 1.8 if j is not None:\n", + " 98 5708 879000.0 154.0 1.6 if use_lon_lat:\n", + " 99 dist = utils._haversine_distance(coords[i], coords[j])\n", + " 100 else:\n", + " 101 5708 28618000.0 5013.7 51.8 dist = np.sqrt(np.sum((coords[i] - coords[j]) ** 2))\n", + " 102 \n", + " 103 5708 14390000.0 2521.0 26.1 dists.append(np.round(dist * 4) / 4)\n", + " 104 \n", + " 105 # pad with large numbers if not enough neighbors\n", + " 106 300 84000.0 280.0 0.2 while len(dists) < min_pts:\n", + " 107 dists.append(D_INF) # use a very large number e.g. infinity for edges between points not temporally close\n", + " 108 \n", + " 109 300 1323000.0 4410.0 2.4 sorted_dists = np.sort(dists) # TC: O(nlogn)\n", + " 110 300 885000.0 2950.0 1.6 core_distances[u] = np.round(sorted_dists[min_pts - 1] * 4)/4\n", + " 111 1 0.0 0.0 0.0 return core_distances, coords" + ] + } + ], + "source": [ + "# Profile _compute_core_distance\n", + "print(\"=\"*80)\n", + "print(\"Line-by-line profile of _compute_core_distance()\")\n", + "print(\"=\"*80)\n", + "\n", + "%lprun -f hdbscan._compute_core_distance hdbscan.hdbscan_labels(\\\n", + " data=data_300,\\\n", + " time_thresh=30,\\\n", + " min_pts=2,\\\n", + " min_cluster_size=2,\\\n", + " dur_min=5,\\\n", + " traj_cols={'timestamp': 'timestamp', 'x': 'x', 'y': 'y', 'user_id': 'user_id'}\\\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "================================================================================\n", + "Line-by-line profile of _build_hdbscan_graphs()\n", + "================================================================================\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Timer unit: 1e-09 s\n", + "\n", + "Total time: 0.031336 s\n", + "File: /Users/carolinechen/Desktop/cs/nomad/nomad/stop_detection/hdbscan.py\n", + "Function: _build_hdbscan_graphs at line 555\n", + "\n", + "Line # Hits Time Per Hit % Time Line Contents\n", + "==============================================================\n", + " 555 def _build_hdbscan_graphs(coords, ts_idx, neighbors, core_dist, use_lon_lat):\n", + " 556 \"\"\"\n", + " 557 Computes all graphs required for the HDBSCAN algorithm in one pass.\n", + " 558 \n", + " 559 Returns\n", + " 560 -------\n", + " 561 edges_sorted : np.recarray\n", + " 562 [from, to, weight] sorted descending by weight.\n", + " 563 d_graph : pd.Series\n", + " 564 Symmetric graph of raw distances, MultiIndex (from, to).\n", + " 565 \"\"\"\n", + " 566 1 1000.0 1000.0 0.0 mrd_graph = {}\n", + " 567 1 1000.0 1000.0 0.0 u_list, v_list, d_list = [], [], []\n", + " 568 \n", + " 569 301 74000.0 245.8 0.2 for u, u_neighbors in neighbors.items():\n", + " 570 300 60000.0 200.0 0.2 i = ts_idx[u]\n", + " 571 6008 1103000.0 183.6 3.5 for v in u_neighbors:\n", + " 572 5708 865000.0 151.5 2.8 if u >= v:\n", + " 573 2854 331000.0 116.0 1.1 continue\n", + " 574 \n", + " 575 2854 505000.0 176.9 1.6 j = ts_idx[v]\n", + " 576 5708 783000.0 137.2 2.5 dist = (utils._haversine_distance(coords[i], coords[j])\n", + " 577 5708 11266000.0 1973.7 36.0 if use_lon_lat else np.linalg.norm(coords[i] - coords[j]))\n", + " 578 2854 6039000.0 2116.0 19.3 dist = np.round(dist * 4) / 4\n", + " 579 \n", + " 580 2854 1236000.0 433.1 3.9 mrd_graph[(u, v)] = max(core_dist[u], core_dist[v], dist)\n", + " 581 2854 544000.0 190.6 1.7 u_list.append(u)\n", + " 582 2854 500000.0 175.2 1.6 v_list.append(v)\n", + " 583 2854 467000.0 163.6 1.5 d_list.append(dist)\n", + " 584 \n", + " 585 1 2035000.0 2.04e+06 6.5 idx = pd.MultiIndex.from_arrays([u_list, v_list], names=[\"from\", \"to\"])\n", + " 586 1 591000.0 591000.0 1.9 d_graph_part = pd.Series(d_list, index=idx)\n", + " 587 \n", + " 588 1 66000.0 66000.0 0.2 rev = d_graph_part.copy()\n", + " 589 1 44000.0 44000.0 0.1 rev.index = rev.index.swaplevel(0, 1)\n", + " 590 1 375000.0 375000.0 1.2 d_graph = pd.concat([d_graph_part, rev])\n", + " 591 \n", + " 592 # Build MST from MRD graph\n", + " 593 1 4020000.0 4.02e+06 12.8 mst_arr = _mst(mrd_graph)\n", + " 594 \n", + " 595 # Extend and sort MST with self-loops\n", + " 596 1 3000.0 3000.0 0.0 self_loops_items = list(core_dist.items())\n", + " 597 1 1000.0 1000.0 0.0 if not self_loops_items:\n", + " 598 self_loops_full = np.empty(0, dtype=mst_arr.dtype)\n", + " 599 else:\n", + " 600 2 21000.0 10500.0 0.1 self_loops = np.array(\n", + " 601 1 0.0 0.0 0.0 self_loops_items,\n", + " 602 1 1000.0 1000.0 0.0 dtype=[('from', 'int64'), ('weight', 'float64')]\n", + " 603 )\n", + " 604 1 1000.0 1000.0 0.0 self_loops_full = np.empty(len(self_loops), dtype=mst_arr.dtype)\n", + " 605 1 2000.0 2000.0 0.0 self_loops_full['from'] = self_loops['from']\n", + " 606 1 1000.0 1000.0 0.0 self_loops_full['to'] = self_loops['from']\n", + " 607 1 0.0 0.0 0.0 self_loops_full['weight'] = self_loops['weight']\n", + " 608 \n", + " 609 1 27000.0 27000.0 0.1 all_edges = np.concatenate([mst_arr, self_loops_full])\n", + " 610 \n", + " 611 1 23000.0 23000.0 0.1 order = np.argsort(all_edges[\"weight\"])[::-1]\n", + " 612 1 8000.0 8000.0 0.0 sorted_edges = all_edges[order]\n", + " 613 \n", + " 614 2 40000.0 20000.0 0.1 edges_sorted_df = pd.Series(\n", + " 615 1 1000.0 1000.0 0.0 sorted_edges['weight'],\n", + " 616 2 300000.0 150000.0 1.0 index=pd.MultiIndex.from_arrays(\n", + " 617 1 1000.0 1000.0 0.0 [sorted_edges['from'], sorted_edges['to']],\n", + " 618 1 0.0 0.0 0.0 names=['from', 'to']\n", + " 619 ),\n", + " 620 1 0.0 0.0 0.0 name='weight'\n", + " 621 )\n", + " 622 1 0.0 0.0 0.0 return edges_sorted_df, d_graph" + ] + } + ], + "source": [ + "# Profile _build_hdbscan_graphs\n", + "print(\"=\"*80)\n", + "print(\"Line-by-line profile of _build_hdbscan_graphs()\")\n", + "print(\"=\"*80)\n", + "\n", + "%lprun -f hdbscan._build_hdbscan_graphs hdbscan.hdbscan_labels(\\\n", + " data=data_300,\\\n", + " time_thresh=30,\\\n", + " min_pts=2,\\\n", + " min_cluster_size=2,\\\n", + " dur_min=5,\\\n", + " traj_cols={'timestamp': 'timestamp', 'x': 'x', 'y': 'y', 'user_id': 'user_id'}\\\n", + ")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.6" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} From 1df01a1327182b37c2a5d1862122217c597c7e36 Mon Sep 17 00:00:00 2001 From: Caroline Chen Date: Mon, 11 May 2026 14:39:41 -0400 Subject: [PATCH 2/2] fix errors --- examples/profile_hdbscan.ipynb | 350 ++++++++++----------------------- 1 file changed, 104 insertions(+), 246 deletions(-) diff --git a/examples/profile_hdbscan.ipynb b/examples/profile_hdbscan.ipynb index fcfe8c9b..6ae6ea8c 100644 --- a/examples/profile_hdbscan.ipynb +++ b/examples/profile_hdbscan.ipynb @@ -47,7 +47,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 2, "metadata": {}, "outputs": [ { @@ -133,7 +133,7 @@ "4 1704068151 -68.796272 81.513295 1" ] }, - "execution_count": 12, + "execution_count": 2, "metadata": {}, "output_type": "execute_result" } @@ -181,7 +181,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -192,52 +192,52 @@ "================================================================================\n", "cProfile Results - Top 40 Functions by Cumulative Time\n", "================================================================================\n", - " 2959533 function calls (2906807 primitive calls) in 0.772 seconds\n", + " 6499762 function calls (6451156 primitive calls) in 4.923 seconds\n", "\n", " Ordered by: cumulative time\n", - " List reduced from 979 to 40 due to restriction <40>\n", + " List reduced from 1247 to 40 due to restriction <40>\n", "\n", " ncalls tottime percall cumtime percall filename:lineno(function)\n", - " 2 0.000 0.000 0.772 0.386 interactiveshell.py:3514(run_code)\n", - " 2 0.000 0.000 0.772 0.386 {built-in method builtins.exec}\n", - " 1 0.000 0.000 0.772 0.772 1367433339.py:8()\n", - " 1 0.002 0.002 0.771 0.771 hdbscan.py:624(hdbscan_labels)\n", - " 1 0.009 0.009 0.543 0.543 hdbscan.py:219(cluster_hierarchy)\n", - " 294 0.010 0.000 0.231 0.001 hdbscan.py:160(_build_border_map)\n", - " 456 0.007 0.000 0.144 0.000 hdbscan.py:339(_build_graph_pd)\n", - " 1004 0.003 0.000 0.116 0.000 indexing.py:883(__setitem__)\n", - " 293 0.000 0.000 0.099 0.000 multi.py:216(new_meth)\n", - " 293 0.003 0.000 0.098 0.000 multi.py:546(from_tuples)\n", - " 295 0.001 0.000 0.097 0.000 multi.py:475(from_arrays)\n", - " 295 0.000 0.000 0.092 0.000 categorical.py:3089(factorize_from_iterables)\n", - " 885 0.000 0.000 0.092 0.000 categorical.py:3110()\n", - " 590 0.001 0.000 0.092 0.000 categorical.py:3046(factorize_from_iterable)\n", - "3762/3176 0.007 0.000 0.091 0.000 base.py:3827(get_indexer)\n", - " 592 0.003 0.000 0.090 0.000 categorical.py:371(__init__)\n", - " 1205 0.001 0.000 0.086 0.000 indexing.py:1533(_get_listlike_indexer)\n", - " 1208 0.003 0.000 0.085 0.000 base.py:6198(_get_indexer_strict)\n", - " 1004 0.001 0.000 0.084 0.000 indexing.py:745(_get_setitem_indexer)\n", - " 1004 0.001 0.000 0.082 0.000 indexing.py:1453(_convert_to_indexer)\n", - " 1781 0.001 0.000 0.066 0.000 common.py:62(new_method)\n", - "536933/531424 0.036 0.000 0.065 0.000 {built-in method builtins.isinstance}\n", - " 1501 0.002 0.000 0.064 0.000 base.py:4330(reindex)\n", - " 1 0.022 0.022 0.063 0.063 hdbscan.py:53(_compute_core_distance)\n", - " 2258 0.001 0.000 0.062 0.000 base.py:6174(get_indexer_for)\n", - " 968/875 0.001 0.000 0.054 0.000 indexing.py:1177(__getitem__)\n", - " 1228 0.002 0.000 0.054 0.000 series.py:1107(__getitem__)\n", - " 1231 0.001 0.000 0.053 0.000 series.py:6129(_cmp_method)\n", - " 3710 0.010 0.000 0.053 0.000 base.py:475(__new__)\n", - " 843 0.001 0.000 0.050 0.000 indexing.py:1398(_getitem_axis)\n", - "3515/2929 0.011 0.000 0.041 0.000 base.py:3962(_get_indexer)\n", - " 1011 0.013 0.000 0.040 0.000 base.py:675(_with_infer)\n", - " 2368 0.001 0.000 0.040 0.000 dtypes.py:220(__init__)\n", - " 2368 0.001 0.000 0.040 0.000 dtypes.py:373(_finalize)\n", - " 1184 0.002 0.000 0.038 0.000 dtypes.py:550(validate_categories)\n", - " 1325 0.002 0.000 0.037 0.000 series.py:6222(_construct_result)\n", - " 6734 0.011 0.000 0.037 0.000 construction.py:517(sanitize_array)\n", - " 293 0.000 0.000 0.036 0.000 series.py:5155(reindex)\n", - " 2454 0.004 0.000 0.035 0.000 multi.py:326(__new__)\n", - " 293 0.001 0.000 0.035 0.000 generic.py:5365(reindex)\n", + " 2 0.000 0.000 4.923 2.461 interactiveshell.py:3514(run_code)\n", + " 4/2 0.000 0.000 4.923 2.461 {built-in method builtins.exec}\n", + " 1 0.001 0.001 4.923 4.923 1367433339.py:8()\n", + " 1 0.005 0.005 4.922 4.922 hdbscan.py:488(hdbscan_labels)\n", + " 1 0.082 0.082 3.544 3.544 hdbscan.py:88(cluster_hierarchy)\n", + " 500 0.079 0.000 2.154 0.004 hdbscan.py:22(_build_border_map)\n", + " 1 0.000 0.000 1.039 1.039 preprocessing.py:100(_find_neighbors)\n", + " 7 0.489 0.070 0.853 0.122 graph.py:961(add_edges_from)\n", + " 499 0.311 0.001 0.831 0.002 hdbscan.py:65()\n", + " 1 0.061 0.061 0.747 0.747 preprocessing.py:29(_find_spatial_neighbors)\n", + " 2 0.000 0.000 0.719 0.360 graph.py:1041(add_weighted_edges_from)\n", + " 498 0.003 0.000 0.514 0.001 groupby.py:1898(_agg_general)\n", + " 495 0.001 0.000 0.510 0.001 generic.py:1176(idxmin)\n", + " 495 0.002 0.000 0.509 0.001 groupby.py:5818(_idxmax_idxmin)\n", + " 498 0.004 0.000 0.507 0.001 groupby.py:1964(_cython_agg_general)\n", + " 4967 0.045 0.000 0.423 0.000 base.py:475(__new__)\n", + "3313/2814 0.041 0.000 0.420 0.000 series.py:392(__init__)\n", + " 495 0.002 0.000 0.413 0.001 multi.py:216(new_meth)\n", + " 495 0.012 0.000 0.412 0.001 multi.py:546(from_tuples)\n", + " 496 0.004 0.000 0.401 0.001 multi.py:475(from_arrays)\n", + " 496 0.003 0.000 0.373 0.001 categorical.py:3089(factorize_from_iterables)\n", + " 1488 0.013 0.000 0.370 0.000 categorical.py:3110()\n", + " 644/498 0.005 0.000 0.367 0.001 base.py:3245(union)\n", + " 992 0.008 0.000 0.357 0.000 categorical.py:3046(factorize_from_iterable)\n", + " 186309 0.066 0.000 0.348 0.000 fromnumeric.py:3360(round)\n", + " 994 0.017 0.000 0.344 0.000 categorical.py:371(__init__)\n", + " 10774 0.053 0.000 0.333 0.000 construction.py:517(sanitize_array)\n", + " 550/404 0.005 0.000 0.332 0.001 base.py:3367(_union)\n", + " 187339 0.066 0.000 0.294 0.000 fromnumeric.py:51(_wrapfunc)\n", + " 152 0.006 0.000 0.285 0.002 algorithms.py:1612(union_with_duplicates)\n", + " 495 0.007 0.000 0.283 0.001 groupby.py:5925(_wrap_idxmax_idxmin)\n", + " 129474 0.201 0.000 0.223 0.000 graph.py:1089()\n", + "1034478/1028257 0.133 0.000 0.215 0.000 {built-in method builtins.isinstance}\n", + " 498 0.002 0.000 0.203 0.000 base.py:365(grouped_reduce)\n", + " 186309 0.200 0.000 0.200 0.000 {method 'round' of 'numpy.generic' objects}\n", + " 498 0.002 0.000 0.190 0.000 groupby.py:1978(array_func)\n", + " 1 0.020 0.020 0.190 0.190 hdbscan.py:460(_build_hdbscan_graphs)\n", + " 498 0.004 0.000 0.188 0.000 ops.py:821(_cython_operation)\n", + " 495 0.002 0.000 0.185 0.000 multi.py:1955(to_flat_index)\n", + "1990/1495 0.009 0.000 0.184 0.000 algorithms.py:610(factorize)\n", "\n", "\n", "\n" @@ -287,9 +287,19 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m25.3\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m26.0.1\u001b[0m\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n" + ] + } + ], "source": [ "# Install line_profiler if needed\n", "!pip install line_profiler -q" @@ -297,7 +307,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -306,86 +316,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Generated 300 points\n", - "================================================================================\n", - "Line-by-line profile of _find_temp_neighbors()\n", - "================================================================================\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Timer unit: 1e-09 s\n", - "\n", - "Total time: 0.00294 s\n", - "File: /Users/carolinechen/Desktop/cs/nomad/nomad/stop_detection/hdbscan.py\n", - "Function: _find_temp_neighbors at line 11\n", - "\n", - "Line # Hits Time Per Hit % Time Line Contents\n", - "==============================================================\n", - " 11 def _find_temp_neighbors(times, time_thresh, use_datetime):\n", - " 12 \"\"\"\n", - " 13 Find timestamp pairs that are within time threshold.\n", - " 14 \n", - " 15 Parameters\n", - " 16 ----------\n", - " 17 times : array of timestamps.\n", - " 18 time_thresh : time threshold for finding what timestamps are close in time.\n", - " 19 use_datetime : Whether to process timestamps as datetime objects.\n", - " 20 \n", - " 21 Returns\n", - " 22 -------\n", - " 23 time_pairs : list of tuples of timestamps [(t1, t2), ...] that are close in time given time_thresh.\n", - " 24 \n", - " 25 TC: O(n^2)\n", - " 26 \"\"\"\n", - " 27 # getting times based on whether they are datetime values or timestamps, changed to seconds for calculations\n", - " 28 1 10000.0 10000.0 0.3 times = to_timestamp(times).values if use_datetime else times.values\n", - " 29 \n", - " 30 # Pairwise time differences\n", - " 31 # times[:, np.newaxis]: from shape (n,) -> to shape (n, 1) – a column vector\n", - " 32 1 667000.0 667000.0 22.7 time_diffs = np.abs(times[:, np.newaxis] - times)\n", - " 33 1 132000.0 132000.0 4.5 time_diffs = time_diffs.astype(int)\n", - " 34 \n", - " 35 # Filter by time threshold\n", - " 36 1 263000.0 263000.0 8.9 within_time_thresh = np.triu(time_diffs <= (time_thresh * 60), k=1) # keep upper triangle\n", - " 37 1 398000.0 398000.0 13.5 i_idx, j_idx = np.where(within_time_thresh)\n", - " 38 \n", - " 39 # Return a list of (timestamp1, timestamp2) tuples\n", - " 40 1 1470000.0 1.47e+06 50.0 time_pairs = [(times[i], times[j]) for i, j in zip(i_idx, j_idx)]\n", - " 41 \n", - " 42 1 0.0 0.0 0.0 return time_pairs, times" + "Generated 300 points\n" ] } ], "source": [ - "data_300 = generate_test_data(300, seed=88)\n", - "\n", - "print(\"=\"*80)\n", - "print(\"Line-by-line profile of _find_temp_neighbors()\")\n", - "print(\"=\"*80)\n", - "\n", - "%lprun -f hdbscan._find_temp_neighbors hdbscan.hdbscan_labels(\\\n", - " data=data_300,\\\n", - " time_thresh=30,\\\n", - " min_pts=2,\\\n", - " min_cluster_size=2,\\\n", - " dur_min=5,\\\n", - " traj_cols={'timestamp': 'timestamp', 'x': 'x', 'y': 'y', 'user_id': 'user_id'}\\\n", - ")" + "data_300 = generate_test_data(300, seed=88)" ] }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -403,71 +351,22 @@ "text": [ "Timer unit: 1e-09 s\n", "\n", - "Total time: 0.055209 s\n", + "Total time: 0.022125 s\n", "File: /Users/carolinechen/Desktop/cs/nomad/nomad/stop_detection/hdbscan.py\n", - "Function: _compute_core_distance at line 53\n", + "Function: _compute_core_distance at line 11\n", "\n", "Line # Hits Time Per Hit % Time Line Contents\n", "==============================================================\n", - " 53 def _compute_core_distance(data, time_pairs, times, use_lon_lat, traj_cols, min_pts = 2):\n", - " 54 \"\"\"\n", - " 55 Calculate the core distance for each ping in data.\n", - " 56 It gives local density estimate: small core distance → high local density.\n", - " 57 \n", - " 58 Parameters\n", - " 59 ----------\n", - " 60 data : dataframe\n", - " 61 \n", - " 62 time_pairs : tuples of timestamps that are close in time given time_thresh\n", - " 63 \n", - " 64 min_pts : int\n", - " 65 used to calculate the core distance of a point p, where core distance of a point p \n", - " 66 is defined as the distance from p to its min_pts-th smallest nearest neighbor\n", - " 67 (including itself).\n", - " 68 \n", - " 69 Returns\n", - " 70 -------\n", - " 71 core_distances : dictionary of timestamps\n", - " 72 {timestamp_1: core_distance_1, ..., timestamp_n: core_distance_n} distances are quantized\n", - " 73 \"\"\"\n", - " 74 # getting coordinates based on whether they are geographic coordinates (lon, lat) or catesian (x,y)\n", - " 75 1 2000.0 2000.0 0.0 if use_lon_lat:\n", - " 76 coords = np.radians(data[[traj_cols['latitude'], traj_cols['longitude']]].values) # TC: O(n)\n", - " 77 else:\n", - " 78 1 1637000.0 1.64e+06 3.0 coords = data[[traj_cols['x'], traj_cols['y']]].values # TC: O(n)\n", - " 79 \n", - " 80 1 1000.0 1000.0 0.0 n = len(coords)\n", - " 81 # get the index of timestamp in the arrays (for accessing their value later)\n", - " 82 1 131000.0 131000.0 0.2 ts_indices = {ts: idx for idx, ts in enumerate(times)} # TC: O(n)\n", - " 83 \n", - " 84 # Build neighbor map from time_pairs\n", - " 85 1 3153000.0 3.15e+06 5.7 neighbors = _build_neighbor_graph(time_pairs, times)\n", - " 86 \n", - " 87 1 2000.0 2000.0 0.0 D_INF = np.pi * 6_371_000 # max distance on earth\n", - " 88 1 0.0 0.0 0.0 core_distances = {}\n", - " 89 \n", - " 90 301 68000.0 225.9 0.1 for i in range(n): # TC: O(n+m (mlogm)) \n", - " 91 300 78000.0 260.0 0.1 u = times[i]\n", - " 92 300 80000.0 266.7 0.1 allowed_neighbors = neighbors[u]\n", - " 93 300 95000.0 316.7 0.2 dists = [0.0] # distance to itself\n", - " 94 \n", - " 95 6008 1340000.0 223.0 2.4 for v in allowed_neighbors:\n", - " 96 5708 1442000.0 252.6 2.6 j = ts_indices.get(v)\n", - " 97 5708 1001000.0 175.4 1.8 if j is not None:\n", - " 98 5708 879000.0 154.0 1.6 if use_lon_lat:\n", - " 99 dist = utils._haversine_distance(coords[i], coords[j])\n", - " 100 else:\n", - " 101 5708 28618000.0 5013.7 51.8 dist = np.sqrt(np.sum((coords[i] - coords[j]) ** 2))\n", - " 102 \n", - " 103 5708 14390000.0 2521.0 26.1 dists.append(np.round(dist * 4) / 4)\n", - " 104 \n", - " 105 # pad with large numbers if not enough neighbors\n", - " 106 300 84000.0 280.0 0.2 while len(dists) < min_pts:\n", - " 107 dists.append(D_INF) # use a very large number e.g. infinity for edges between points not temporally close\n", - " 108 \n", - " 109 300 1323000.0 4410.0 2.4 sorted_dists = np.sort(dists) # TC: O(nlogn)\n", - " 110 300 885000.0 2950.0 1.6 core_distances[u] = np.round(sorted_dists[min_pts - 1] * 4)/4\n", - " 111 1 0.0 0.0 0.0 return core_distances, coords" + " 11 def _compute_core_distance(G, min_pts):\n", + " 12 1 1000.0 1000.0 0.0 result = {}\n", + " 13 \n", + " 14 301 90000.0 299.0 0.4 for node in G.nodes():\n", + " 15 300 20228000.0 67426.7 91.4 edges = sorted(G.edges(node, data='weight'), key=lambda e: e[2])\n", + " 16 300 184000.0 613.3 0.8 result[node] = edges[min_pts - 1][2] if len(edges) >= min_pts else np.inf\n", + " 17 \n", + " 18 1 1607000.0 1.61e+06 7.3 core_distances = pd.Series(result)\n", + " 19 1 15000.0 15000.0 0.1 core_distances.index.name = 'time'\n", + " 20 1 0.0 0.0 0.0 return core_distances" ] } ], @@ -489,7 +388,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -507,80 +406,39 @@ "text": [ "Timer unit: 1e-09 s\n", "\n", - "Total time: 0.031336 s\n", + "Total time: 0.113286 s\n", "File: /Users/carolinechen/Desktop/cs/nomad/nomad/stop_detection/hdbscan.py\n", - "Function: _build_hdbscan_graphs at line 555\n", + "Function: _build_hdbscan_graphs at line 460\n", "\n", "Line # Hits Time Per Hit % Time Line Contents\n", "==============================================================\n", - " 555 def _build_hdbscan_graphs(coords, ts_idx, neighbors, core_dist, use_lon_lat):\n", - " 556 \"\"\"\n", - " 557 Computes all graphs required for the HDBSCAN algorithm in one pass.\n", - " 558 \n", - " 559 Returns\n", - " 560 -------\n", - " 561 edges_sorted : np.recarray\n", - " 562 [from, to, weight] sorted descending by weight.\n", - " 563 d_graph : pd.Series\n", - " 564 Symmetric graph of raw distances, MultiIndex (from, to).\n", - " 565 \"\"\"\n", - " 566 1 1000.0 1000.0 0.0 mrd_graph = {}\n", - " 567 1 1000.0 1000.0 0.0 u_list, v_list, d_list = [], [], []\n", - " 568 \n", - " 569 301 74000.0 245.8 0.2 for u, u_neighbors in neighbors.items():\n", - " 570 300 60000.0 200.0 0.2 i = ts_idx[u]\n", - " 571 6008 1103000.0 183.6 3.5 for v in u_neighbors:\n", - " 572 5708 865000.0 151.5 2.8 if u >= v:\n", - " 573 2854 331000.0 116.0 1.1 continue\n", - " 574 \n", - " 575 2854 505000.0 176.9 1.6 j = ts_idx[v]\n", - " 576 5708 783000.0 137.2 2.5 dist = (utils._haversine_distance(coords[i], coords[j])\n", - " 577 5708 11266000.0 1973.7 36.0 if use_lon_lat else np.linalg.norm(coords[i] - coords[j]))\n", - " 578 2854 6039000.0 2116.0 19.3 dist = np.round(dist * 4) / 4\n", - " 579 \n", - " 580 2854 1236000.0 433.1 3.9 mrd_graph[(u, v)] = max(core_dist[u], core_dist[v], dist)\n", - " 581 2854 544000.0 190.6 1.7 u_list.append(u)\n", - " 582 2854 500000.0 175.2 1.6 v_list.append(v)\n", - " 583 2854 467000.0 163.6 1.5 d_list.append(dist)\n", - " 584 \n", - " 585 1 2035000.0 2.04e+06 6.5 idx = pd.MultiIndex.from_arrays([u_list, v_list], names=[\"from\", \"to\"])\n", - " 586 1 591000.0 591000.0 1.9 d_graph_part = pd.Series(d_list, index=idx)\n", - " 587 \n", - " 588 1 66000.0 66000.0 0.2 rev = d_graph_part.copy()\n", - " 589 1 44000.0 44000.0 0.1 rev.index = rev.index.swaplevel(0, 1)\n", - " 590 1 375000.0 375000.0 1.2 d_graph = pd.concat([d_graph_part, rev])\n", - " 591 \n", - " 592 # Build MST from MRD graph\n", - " 593 1 4020000.0 4.02e+06 12.8 mst_arr = _mst(mrd_graph)\n", - " 594 \n", - " 595 # Extend and sort MST with self-loops\n", - " 596 1 3000.0 3000.0 0.0 self_loops_items = list(core_dist.items())\n", - " 597 1 1000.0 1000.0 0.0 if not self_loops_items:\n", - " 598 self_loops_full = np.empty(0, dtype=mst_arr.dtype)\n", - " 599 else:\n", - " 600 2 21000.0 10500.0 0.1 self_loops = np.array(\n", - " 601 1 0.0 0.0 0.0 self_loops_items,\n", - " 602 1 1000.0 1000.0 0.0 dtype=[('from', 'int64'), ('weight', 'float64')]\n", - " 603 )\n", - " 604 1 1000.0 1000.0 0.0 self_loops_full = np.empty(len(self_loops), dtype=mst_arr.dtype)\n", - " 605 1 2000.0 2000.0 0.0 self_loops_full['from'] = self_loops['from']\n", - " 606 1 1000.0 1000.0 0.0 self_loops_full['to'] = self_loops['from']\n", - " 607 1 0.0 0.0 0.0 self_loops_full['weight'] = self_loops['weight']\n", - " 608 \n", - " 609 1 27000.0 27000.0 0.1 all_edges = np.concatenate([mst_arr, self_loops_full])\n", - " 610 \n", - " 611 1 23000.0 23000.0 0.1 order = np.argsort(all_edges[\"weight\"])[::-1]\n", - " 612 1 8000.0 8000.0 0.0 sorted_edges = all_edges[order]\n", - " 613 \n", - " 614 2 40000.0 20000.0 0.1 edges_sorted_df = pd.Series(\n", - " 615 1 1000.0 1000.0 0.0 sorted_edges['weight'],\n", - " 616 2 300000.0 150000.0 1.0 index=pd.MultiIndex.from_arrays(\n", - " 617 1 1000.0 1000.0 0.0 [sorted_edges['from'], sorted_edges['to']],\n", - " 618 1 0.0 0.0 0.0 names=['from', 'to']\n", - " 619 ),\n", - " 620 1 0.0 0.0 0.0 name='weight'\n", - " 621 )\n", - " 622 1 0.0 0.0 0.0 return edges_sorted_df, d_graph" + " 460 def _build_hdbscan_graphs(G, core_dist):\n", + " 461 \"\"\"\n", + " 462 Computes all graphs required for the HDBSCAN algorithm in one pass.\n", + " 463 Uses precomputed edge weights from G instead of recomputing distances.\n", + " 464 \n", + " 465 Returns\n", + " 466 -------\n", + " 467 H : nx.Graph\n", + " 468 Hierarchy graph with mutual-reachability MST edges and core-distance\n", + " 469 self-loops.\n", + " 470 edges_sorted_df : pd.Series\n", + " 471 H sorted descending by weight, MultiIndex (from, to).\n", + " 472 \"\"\"\n", + " 473 1 19645000.0 1.96e+07 17.3 G_copy = G.copy()\n", + " 474 2855 5431000.0 1902.3 4.8 for u, v, data in G_copy.edges(data=True):\n", + " 475 2854 11464000.0 4016.8 10.1 d = np.round(data[\"weight\"] * 4) / 4\n", + " 476 2854 39450000.0 13822.7 34.8 data[\"weight\"] = max(core_dist.at[u], core_dist.at[v], d)\n", + " 477 \n", + " 478 1 28629000.0 2.86e+07 25.3 H = nx.minimum_spanning_tree(G_copy)\n", + " 479 \n", + " 480 1 1098000.0 1.1e+06 1.0 H.add_edges_from((node, node, {'weight': weight}) for node, weight in core_dist.items())\n", + " 481 \n", + " 482 1 5519000.0 5.52e+06 4.9 all_edges = nx.to_pandas_edgelist(H, source='from', target='to')\n", + " 483 1 601000.0 601000.0 0.5 all_edges.sort_values('weight', ascending=False, inplace=True)\n", + " 484 \n", + " 485 1 1365000.0 1.36e+06 1.2 all_edges.set_index(['from', 'to'], inplace=True)\n", + " 486 1 84000.0 84000.0 0.1 return H, all_edges['weight']" ] } ],