From 99383e683387f309d0c042bc5cf4a2424bfce012 Mon Sep 17 00:00:00 2001
From: Caroline Chen <cyc.caroline@gmail.com>
Date: Sun, 7 Dec 2025 16:18:52 -0500
Subject: [PATCH 1/2] Profiling notebook

---
 examples/profile_hdbscan.ipynb | 625 +++++++++++++++++++++++++++++++++
 1 file changed, 625 insertions(+)
 create mode 100644 examples/profile_hdbscan.ipynb
diff --git a/examples/profile_hdbscan.ipynb b/examples/profile_hdbscan.ipynb
new file mode 100644
index 00000000..fcfe8c9b
--- /dev/null
+++ b/examples/profile_hdbscan.ipynb
@@ -0,0 +1,625 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# HDBSCAN Profiling Analysis\n",
+    "\n",
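+    "This notebook profiles `nomad.stop_detection.hdbscan` on synthetic trajectories, first with cProfile and then line by line with line_profiler, to identify performance bottlenecks in `hdbscan_labels`."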
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Setup"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import sys\n",
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "import time\n",
+    "import cProfile\n",
+    "import pstats\n",
+    "from io import StringIO\n",
+    "import matplotlib.pyplot as plt\n",
+    "from pathlib import Path\n",
+    "\n",
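+    "# Make the local nomad package importable: put the parent of the current working directory on sys.path (assumes the kernel is started from examples/)\n",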
+    "sys.path.insert(0, str(Path().absolute().parent))\n",
+    "\n",
+    "from nomad.stop_detection import hdbscan"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Test Data Generator"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Generated 100 points\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>timestamp</th>\n",
+       "      <th>x</th>\n",
+       "      <th>y</th>\n",
+       "      <th>user_id</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>1704067411</td>\n",
+       "      <td>-25.091976</td>\n",
+       "      <td>-93.714163</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>1704067586</td>\n",
+       "      <td>90.142861</td>\n",
+       "      <td>27.282082</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>1704067720</td>\n",
+       "      <td>46.398788</td>\n",
+       "      <td>-37.128804</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>1704067892</td>\n",
+       "      <td>19.731697</td>\n",
+       "      <td>1.714138</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>1704068151</td>\n",
+       "      <td>-68.796272</td>\n",
+       "      <td>81.513295</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "    timestamp          x          y  user_id\n",
+       "0  1704067411 -25.091976 -93.714163        1\n",
+       "1  1704067586  90.142861  27.282082        1\n",
+       "2  1704067720  46.398788 -37.128804        1\n",
+       "3  1704067892  19.731697   1.714138        1\n",
+       "4  1704068151 -68.796272  81.513295        1"
+      ]
+     },
+     "execution_count": 12,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "def generate_test_data(n_points=1000, seed=42):\n",
+    "    \"\"\"\n",
+    "    Generate a synthetic single-user trajectory with GUARANTEED unique timestamps.\n",
+    "\n",
+    "    Returns a DataFrame with columns timestamp (Unix seconds, strictly increasing),\n",
+    "    x and y (uniform in [-100, 100)) and user_id (always 1); prints the row count.\n",
+    "    \"\"\"\n",
+    "    # Local generator: same stream as np.random.seed(seed), but NumPy's global\n",
+    "    # RNG state is left untouched for the rest of the notebook.\n",
+    "    rng = np.random.RandomState(seed)\n",
+    "\n",
+    "    # Random spatial coordinates\n",
+    "    x = rng.uniform(-100, 100, n_points)\n",
+    "    y = rng.uniform(-100, 100, n_points)\n",
+    "\n",
+    "    # Positive gaps + cumulative sum => timestamps are unique and sorted\n",
+    "    base_time = int(pd.Timestamp('2024-01-01').timestamp())\n",
+    "    intervals = rng.randint(60, 300, n_points)  # 60-299 s apart (upper bound is exclusive)\n",
+    "    timestamps = base_time + np.cumsum(intervals)\n",
+    "\n",
+    "    data = pd.DataFrame({'timestamp': timestamps, 'x': x, 'y': y, 'user_id': 1})\n",
+    "\n",
+    "    print(f\"Generated {len(data)} points\")\n",
+    "\n",
+    "    return data\n",
+    "\n",
+    "# Test it\n",
+    "test_data = generate_test_data(100)\n",
+    "test_data.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 1. cProfile Analysis\n",
+    "\n",
+    "Profile one `hdbscan_labels` run on 500 synthetic points and list the 40 functions with the largest cumulative time."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Generated 500 points\n",
+      "================================================================================\n",
+      "cProfile Results - Top 40 Functions by Cumulative Time\n",
+      "================================================================================\n",
+      "         2959533 function calls (2906807 primitive calls) in 0.772 seconds\n",
+      "\n",
+      "   Ordered by: cumulative time\n",
+      "   List reduced from 979 to 40 due to restriction <40>\n",
+      "\n",
+      "   ncalls  tottime  percall  cumtime  percall filename:lineno(function)\n",
+      "        2    0.000    0.000    0.772    0.386 interactiveshell.py:3514(run_code)\n",
+      "        2    0.000    0.000    0.772    0.386 {built-in method builtins.exec}\n",
+      "        1    0.000    0.000    0.772    0.772 1367433339.py:8(<module>)\n",
+      "        1    0.002    0.002    0.771    0.771 hdbscan.py:624(hdbscan_labels)\n",
+      "        1    0.009    0.009    0.543    0.543 hdbscan.py:219(cluster_hierarchy)\n",
+      "      294    0.010    0.000    0.231    0.001 hdbscan.py:160(_build_border_map)\n",
+      "      456    0.007    0.000    0.144    0.000 hdbscan.py:339(_build_graph_pd)\n",
+      "     1004    0.003    0.000    0.116    0.000 indexing.py:883(__setitem__)\n",
+      "      293    0.000    0.000    0.099    0.000 multi.py:216(new_meth)\n",
+      "      293    0.003    0.000    0.098    0.000 multi.py:546(from_tuples)\n",
+      "      295    0.001    0.000    0.097    0.000 multi.py:475(from_arrays)\n",
+      "      295    0.000    0.000    0.092    0.000 categorical.py:3089(factorize_from_iterables)\n",
+      "      885    0.000    0.000    0.092    0.000 categorical.py:3110(<genexpr>)\n",
+      "      590    0.001    0.000    0.092    0.000 categorical.py:3046(factorize_from_iterable)\n",
+      "3762/3176    0.007    0.000    0.091    0.000 base.py:3827(get_indexer)\n",
+      "      592    0.003    0.000    0.090    0.000 categorical.py:371(__init__)\n",
+      "     1205    0.001    0.000    0.086    0.000 indexing.py:1533(_get_listlike_indexer)\n",
+      "     1208    0.003    0.000    0.085    0.000 base.py:6198(_get_indexer_strict)\n",
+      "     1004    0.001    0.000    0.084    0.000 indexing.py:745(_get_setitem_indexer)\n",
+      "     1004    0.001    0.000    0.082    0.000 indexing.py:1453(_convert_to_indexer)\n",
+      "     1781    0.001    0.000    0.066    0.000 common.py:62(new_method)\n",
+      "536933/531424    0.036    0.000    0.065    0.000 {built-in method builtins.isinstance}\n",
+      "     1501    0.002    0.000    0.064    0.000 base.py:4330(reindex)\n",
+      "        1    0.022    0.022    0.063    0.063 hdbscan.py:53(_compute_core_distance)\n",
+      "     2258    0.001    0.000    0.062    0.000 base.py:6174(get_indexer_for)\n",
+      "  968/875    0.001    0.000    0.054    0.000 indexing.py:1177(__getitem__)\n",
+      "     1228    0.002    0.000    0.054    0.000 series.py:1107(__getitem__)\n",
+      "     1231    0.001    0.000    0.053    0.000 series.py:6129(_cmp_method)\n",
+      "     3710    0.010    0.000    0.053    0.000 base.py:475(__new__)\n",
+      "      843    0.001    0.000    0.050    0.000 indexing.py:1398(_getitem_axis)\n",
+      "3515/2929    0.011    0.000    0.041    0.000 base.py:3962(_get_indexer)\n",
+      "     1011    0.013    0.000    0.040    0.000 base.py:675(_with_infer)\n",
+      "     2368    0.001    0.000    0.040    0.000 dtypes.py:220(__init__)\n",
+      "     2368    0.001    0.000    0.040    0.000 dtypes.py:373(_finalize)\n",
+      "     1184    0.002    0.000    0.038    0.000 dtypes.py:550(validate_categories)\n",
+      "     1325    0.002    0.000    0.037    0.000 series.py:6222(_construct_result)\n",
+      "     6734    0.011    0.000    0.037    0.000 construction.py:517(sanitize_array)\n",
+      "      293    0.000    0.000    0.036    0.000 series.py:5155(reindex)\n",
+      "     2454    0.004    0.000    0.035    0.000 multi.py:326(__new__)\n",
+      "      293    0.001    0.000    0.035    0.000 generic.py:5365(reindex)\n",
+      "\n",
+      "\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Generate test data\n",
+    "data_500 = generate_test_data(500, seed=99)\n",
+    "\n",
+    "# Profile with cProfile\n",
+    "profiler = cProfile.Profile()\n",
+    "profiler.enable()\n",
+    "\n",
+    "labels = hdbscan.hdbscan_labels(\n",
+    "    data=data_500,\n",
+    "    time_thresh=30,\n",
+    "    min_pts=2,\n",
+    "    min_cluster_size=2,\n",
+    "    dur_min=5,\n",
+    "    traj_cols={'timestamp': 'timestamp', 'x': 'x', 'y': 'y', 'user_id': 'user_id'}\n",
+    ")\n",
+    "\n",
+    "profiler.disable()\n",
+    "\n",
+    "# Print stats\n",
+    "s = StringIO()\n",
+    "ps = pstats.Stats(profiler, stream=s)\n",
+    "ps.strip_dirs()\n",
+    "ps.sort_stats('cumulative')\n",
+    "ps.print_stats(40)\n",
+    "\n",
+    "print(\"=\"*80)\n",
+    "print(\"cProfile Results - Top 40 Functions by Cumulative Time\")\n",
+    "print(\"=\"*80)\n",
+    "print(s.getvalue())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 2. line_profiler Analysis\n",
+    "\n",
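+    "Line-by-line profiling of functions suspected of O(n²) behavior: `_find_temp_neighbors`, `_compute_core_distance`, and `_build_hdbscan_graphs`, each on a 300-point run."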
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Install line_profiler into the running kernel's environment if it is missing\n",
+    "%pip install -q line_profiler"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%load_ext line_profiler"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Generated 300 points\n",
+      "================================================================================\n",
+      "Line-by-line profile of _find_temp_neighbors()\n",
+      "================================================================================\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Timer unit: 1e-09 s\n",
+      "\n",
+      "Total time: 0.00294 s\n",
+      "File: /Users/carolinechen/Desktop/cs/nomad/nomad/stop_detection/hdbscan.py\n",
+      "Function: _find_temp_neighbors at line 11\n",
+      "\n",
+      "Line #      Hits         Time  Per Hit   % Time  Line Contents\n",
+      "==============================================================\n",
+      "    11                                           def _find_temp_neighbors(times, time_thresh, use_datetime):\n",
+      "    12                                               \"\"\"\n",
+      "    13                                               Find timestamp pairs that are within time threshold.\n",
+      "    14                                           \n",
+      "    15                                               Parameters\n",
+      "    16                                               ----------\n",
+      "    17                                               times : array of timestamps.\n",
+      "    18                                               time_thresh : time threshold for finding what timestamps are close in time.\n",
+      "    19                                               use_datetime : Whether to process timestamps as datetime objects.\n",
+      "    20                                           \n",
+      "    21                                               Returns\n",
+      "    22                                               -------\n",
+      "    23                                               time_pairs : list of tuples of timestamps [(t1, t2), ...] that are close in time given time_thresh.\n",
+      "    24                                           \n",
+      "    25                                               TC: O(n^2)\n",
+      "    26                                               \"\"\"\n",
+      "    27                                               # getting times based on whether they are datetime values or timestamps, changed to seconds for calculations\n",
+      "    28         1      10000.0  10000.0      0.3      times = to_timestamp(times).values if use_datetime else times.values\n",
+      "    29                                                   \n",
+      "    30                                               # Pairwise time differences\n",
+      "    31                                               # times[:, np.newaxis]: from shape (n,) -> to shape (n, 1) – a column vector\n",
+      "    32         1     667000.0 667000.0     22.7      time_diffs = np.abs(times[:, np.newaxis] - times)\n",
+      "    33         1     132000.0 132000.0      4.5      time_diffs = time_diffs.astype(int)\n",
+      "    34                                               \n",
+      "    35                                               # Filter by time threshold\n",
+      "    36         1     263000.0 263000.0      8.9      within_time_thresh = np.triu(time_diffs <= (time_thresh * 60), k=1) # keep upper triangle\n",
+      "    37         1     398000.0 398000.0     13.5      i_idx, j_idx = np.where(within_time_thresh)\n",
+      "    38                                               \n",
+      "    39                                               # Return a list of (timestamp1, timestamp2) tuples\n",
+      "    40         1    1470000.0 1.47e+06     50.0      time_pairs = [(times[i], times[j]) for i, j in zip(i_idx, j_idx)]\n",
+      "    41                                               \n",
+      "    42         1          0.0      0.0      0.0      return time_pairs, times"
+     ]
+    }
+   ],
+   "source": [
+    "data_300 = generate_test_data(300, seed=88)\n",
+    "\n",
+    "print(\"=\"*80)\n",
+    "print(\"Line-by-line profile of _find_temp_neighbors()\")\n",
+    "print(\"=\"*80)\n",
+    "\n",
+    "%lprun -f hdbscan._find_temp_neighbors hdbscan.hdbscan_labels(\\\n",
+    "    data=data_300,\\\n",
+    "    time_thresh=30,\\\n",
+    "    min_pts=2,\\\n",
+    "    min_cluster_size=2,\\\n",
+    "    dur_min=5,\\\n",
+    "    traj_cols={'timestamp': 'timestamp', 'x': 'x', 'y': 'y', 'user_id': 'user_id'}\\\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "================================================================================\n",
+      "Line-by-line profile of _compute_core_distance()\n",
+      "================================================================================\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Timer unit: 1e-09 s\n",
+      "\n",
+      "Total time: 0.055209 s\n",
+      "File: /Users/carolinechen/Desktop/cs/nomad/nomad/stop_detection/hdbscan.py\n",
+      "Function: _compute_core_distance at line 53\n",
+      "\n",
+      "Line #      Hits         Time  Per Hit   % Time  Line Contents\n",
+      "==============================================================\n",
+      "    53                                           def _compute_core_distance(data, time_pairs, times, use_lon_lat, traj_cols, min_pts = 2):\n",
+      "    54                                               \"\"\"\n",
+      "    55                                               Calculate the core distance for each ping in data.\n",
+      "    56                                               It gives local density estimate: small core distance → high local density.\n",
+      "    57                                           \n",
+      "    58                                               Parameters\n",
+      "    59                                               ----------\n",
+      "    60                                               data : dataframe\n",
+      "    61                                           \n",
+      "    62                                               time_pairs : tuples of timestamps that are close in time given time_thresh\n",
+      "    63                                               \n",
+      "    64                                               min_pts : int\n",
+      "    65                                                   used to calculate the core distance of a point p, where core distance of a point p \n",
+      "    66                                                   is defined as the distance from p to its min_pts-th smallest nearest neighbor\n",
+      "    67                                                   (including itself).\n",
+      "    68                                           \n",
+      "    69                                               Returns\n",
+      "    70                                               -------\n",
+      "    71                                               core_distances : dictionary of timestamps\n",
+      "    72                                                   {timestamp_1: core_distance_1, ..., timestamp_n: core_distance_n} distances are quantized\n",
+      "    73                                               \"\"\"\n",
+      "    74                                               # getting coordinates based on whether they are geographic coordinates (lon, lat) or catesian (x,y)\n",
+      "    75         1       2000.0   2000.0      0.0      if use_lon_lat:\n",
+      "    76                                                   coords = np.radians(data[[traj_cols['latitude'], traj_cols['longitude']]].values) # TC: O(n)\n",
+      "    77                                               else:\n",
+      "    78         1    1637000.0 1.64e+06      3.0          coords = data[[traj_cols['x'], traj_cols['y']]].values # TC: O(n)\n",
+      "    79                                               \n",
+      "    80         1       1000.0   1000.0      0.0      n = len(coords)\n",
+      "    81                                               # get the index of timestamp in the arrays (for accessing their value later)\n",
+      "    82         1     131000.0 131000.0      0.2      ts_indices = {ts: idx for idx, ts in enumerate(times)} # TC: O(n)\n",
+      "    83                                           \n",
+      "    84                                               # Build neighbor map from time_pairs\n",
+      "    85         1    3153000.0 3.15e+06      5.7      neighbors = _build_neighbor_graph(time_pairs, times)\n",
+      "    86                                           \n",
+      "    87         1       2000.0   2000.0      0.0      D_INF = np.pi * 6_371_000  # max distance on earth\n",
+      "    88         1          0.0      0.0      0.0      core_distances = {}\n",
+      "    89                                           \n",
+      "    90       301      68000.0    225.9      0.1      for i in range(n): # TC: O(n+m (mlogm)) \n",
+      "    91       300      78000.0    260.0      0.1          u = times[i]\n",
+      "    92       300      80000.0    266.7      0.1          allowed_neighbors = neighbors[u]\n",
+      "    93       300      95000.0    316.7      0.2          dists = [0.0]  # distance to itself\n",
+      "    94                                           \n",
+      "    95      6008    1340000.0    223.0      2.4          for v in allowed_neighbors:\n",
+      "    96      5708    1442000.0    252.6      2.6              j = ts_indices.get(v)\n",
+      "    97      5708    1001000.0    175.4      1.8              if j is not None:\n",
+      "    98      5708     879000.0    154.0      1.6                  if use_lon_lat:\n",
+      "    99                                                               dist = utils._haversine_distance(coords[i], coords[j])\n",
+      "   100                                                           else:\n",
+      "   101      5708   28618000.0   5013.7     51.8                      dist = np.sqrt(np.sum((coords[i] - coords[j]) ** 2))\n",
+      "   102                                                           \n",
+      "   103      5708   14390000.0   2521.0     26.1                  dists.append(np.round(dist * 4) / 4)\n",
+      "   104                                           \n",
+      "   105                                                   # pad with large numbers if not enough neighbors\n",
+      "   106       300      84000.0    280.0      0.2          while len(dists) < min_pts:\n",
+      "   107                                                       dists.append(D_INF) # use a very large number e.g. infinity for edges between points not temporally close\n",
+      "   108                                           \n",
+      "   109       300    1323000.0   4410.0      2.4          sorted_dists = np.sort(dists) # TC: O(nlogn)\n",
+      "   110       300     885000.0   2950.0      1.6          core_distances[u] = np.round(sorted_dists[min_pts - 1] * 4)/4\n",
+      "   111         1          0.0      0.0      0.0      return core_distances, coords"
+     ]
+    }
+   ],
+   "source": [
+    "# Profile _compute_core_distance\n",
+    "print(\"=\"*80)\n",
+    "print(\"Line-by-line profile of _compute_core_distance()\")\n",
+    "print(\"=\"*80)\n",
+    "\n",
+    "%lprun -f hdbscan._compute_core_distance hdbscan.hdbscan_labels(\\\n",
+    "    data=data_300,\\\n",
+    "    time_thresh=30,\\\n",
+    "    min_pts=2,\\\n",
+    "    min_cluster_size=2,\\\n",
+    "    dur_min=5,\\\n",
+    "    traj_cols={'timestamp': 'timestamp', 'x': 'x', 'y': 'y', 'user_id': 'user_id'}\\\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "================================================================================\n",
+      "Line-by-line profile of _build_hdbscan_graphs()\n",
+      "================================================================================\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Timer unit: 1e-09 s\n",
+      "\n",
+      "Total time: 0.031336 s\n",
+      "File: /Users/carolinechen/Desktop/cs/nomad/nomad/stop_detection/hdbscan.py\n",
+      "Function: _build_hdbscan_graphs at line 555\n",
+      "\n",
+      "Line #      Hits         Time  Per Hit   % Time  Line Contents\n",
+      "==============================================================\n",
+      "   555                                           def _build_hdbscan_graphs(coords, ts_idx, neighbors, core_dist, use_lon_lat):\n",
+      "   556                                               \"\"\"\n",
+      "   557                                               Computes all graphs required for the HDBSCAN algorithm in one pass.\n",
+      "   558                                           \n",
+      "   559                                               Returns\n",
+      "   560                                               -------\n",
+      "   561                                               edges_sorted : np.recarray\n",
+      "   562                                                   [from, to, weight] sorted descending by weight.\n",
+      "   563                                               d_graph : pd.Series\n",
+      "   564                                                   Symmetric graph of raw distances, MultiIndex (from, to).\n",
+      "   565                                               \"\"\"\n",
+      "   566         1       1000.0   1000.0      0.0      mrd_graph = {}\n",
+      "   567         1       1000.0   1000.0      0.0      u_list, v_list, d_list = [], [], []\n",
+      "   568                                           \n",
+      "   569       301      74000.0    245.8      0.2      for u, u_neighbors in neighbors.items():\n",
+      "   570       300      60000.0    200.0      0.2          i = ts_idx[u]\n",
+      "   571      6008    1103000.0    183.6      3.5          for v in u_neighbors:\n",
+      "   572      5708     865000.0    151.5      2.8              if u >= v:\n",
+      "   573      2854     331000.0    116.0      1.1                  continue\n",
+      "   574                                                       \n",
+      "   575      2854     505000.0    176.9      1.6              j = ts_idx[v]\n",
+      "   576      5708     783000.0    137.2      2.5              dist = (utils._haversine_distance(coords[i], coords[j])\n",
+      "   577      5708   11266000.0   1973.7     36.0                      if use_lon_lat else np.linalg.norm(coords[i] - coords[j]))\n",
+      "   578      2854    6039000.0   2116.0     19.3              dist = np.round(dist * 4) / 4\n",
+      "   579                                           \n",
+      "   580      2854    1236000.0    433.1      3.9              mrd_graph[(u, v)] = max(core_dist[u], core_dist[v], dist)\n",
+      "   581      2854     544000.0    190.6      1.7              u_list.append(u)\n",
+      "   582      2854     500000.0    175.2      1.6              v_list.append(v)\n",
+      "   583      2854     467000.0    163.6      1.5              d_list.append(dist)\n",
+      "   584                                           \n",
+      "   585         1    2035000.0 2.04e+06      6.5      idx = pd.MultiIndex.from_arrays([u_list, v_list], names=[\"from\", \"to\"])\n",
+      "   586         1     591000.0 591000.0      1.9      d_graph_part = pd.Series(d_list, index=idx)\n",
+      "   587                                               \n",
+      "   588         1      66000.0  66000.0      0.2      rev = d_graph_part.copy()\n",
+      "   589         1      44000.0  44000.0      0.1      rev.index = rev.index.swaplevel(0, 1)\n",
+      "   590         1     375000.0 375000.0      1.2      d_graph = pd.concat([d_graph_part, rev])\n",
+      "   591                                           \n",
+      "   592                                               # Build MST from MRD graph\n",
+      "   593         1    4020000.0 4.02e+06     12.8      mst_arr = _mst(mrd_graph)\n",
+      "   594                                           \n",
+      "   595                                               # Extend and sort MST with self-loops\n",
+      "   596         1       3000.0   3000.0      0.0      self_loops_items = list(core_dist.items())\n",
+      "   597         1       1000.0   1000.0      0.0      if not self_loops_items:\n",
+      "   598                                                   self_loops_full = np.empty(0, dtype=mst_arr.dtype)\n",
+      "   599                                               else:\n",
+      "   600         2      21000.0  10500.0      0.1          self_loops = np.array(\n",
+      "   601         1          0.0      0.0      0.0              self_loops_items,\n",
+      "   602         1       1000.0   1000.0      0.0              dtype=[('from', 'int64'), ('weight', 'float64')]\n",
+      "   603                                                   )\n",
+      "   604         1       1000.0   1000.0      0.0          self_loops_full = np.empty(len(self_loops), dtype=mst_arr.dtype)\n",
+      "   605         1       2000.0   2000.0      0.0          self_loops_full['from'] = self_loops['from']\n",
+      "   606         1       1000.0   1000.0      0.0          self_loops_full['to'] = self_loops['from']\n",
+      "   607         1          0.0      0.0      0.0          self_loops_full['weight'] = self_loops['weight']\n",
+      "   608                                               \n",
+      "   609         1      27000.0  27000.0      0.1      all_edges = np.concatenate([mst_arr, self_loops_full])\n",
+      "   610                                               \n",
+      "   611         1      23000.0  23000.0      0.1      order = np.argsort(all_edges[\"weight\"])[::-1]\n",
+      "   612         1       8000.0   8000.0      0.0      sorted_edges = all_edges[order]\n",
+      "   613                                               \n",
+      "   614         2      40000.0  20000.0      0.1      edges_sorted_df = pd.Series(\n",
+      "   615         1       1000.0   1000.0      0.0          sorted_edges['weight'],\n",
+      "   616         2     300000.0 150000.0      1.0          index=pd.MultiIndex.from_arrays(\n",
+      "   617         1       1000.0   1000.0      0.0              [sorted_edges['from'], sorted_edges['to']],\n",
+      "   618         1          0.0      0.0      0.0              names=['from', 'to']\n",
+      "   619                                                   ),\n",
+      "   620         1          0.0      0.0      0.0          name='weight'\n",
+      "   621                                               )\n",
+      "   622         1          0.0      0.0      0.0      return edges_sorted_df, d_graph"
+     ]
+    }
+   ],
+   "source": [
+    "# Profile _build_hdbscan_graphs\n",
+    "print(\"=\"*80)\n",
+    "print(\"Line-by-line profile of _build_hdbscan_graphs()\")\n",
+    "print(\"=\"*80)\n",
+    "\n",
+    "%lprun -f hdbscan._build_hdbscan_graphs hdbscan.hdbscan_labels(\\\n",
+    "    data=data_300,\\\n",
+    "    time_thresh=30,\\\n",
+    "    min_pts=2,\\\n",
+    "    min_cluster_size=2,\\\n",
+    "    dur_min=5,\\\n",
+    "    traj_cols={'timestamp': 'timestamp', 'x': 'x', 'y': 'y', 'user_id': 'user_id'}\\\n",
+    ")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}

From 1df01a1327182b37c2a5d1862122217c597c7e36 Mon Sep 17 00:00:00 2001
From: Caroline Chen <cyc.caroline@gmail.com>
Date: Mon, 11 May 2026 14:39:41 -0400
Subject: [PATCH 2/2] fix errors

---
 examples/profile_hdbscan.ipynb | 350 ++++++++++-----------------------
 1 file changed, 104 insertions(+), 246 deletions(-)

diff --git a/examples/profile_hdbscan.ipynb b/examples/profile_hdbscan.ipynb
index fcfe8c9b..6ae6ea8c 100644
--- a/examples/profile_hdbscan.ipynb
+++ b/examples/profile_hdbscan.ipynb
@@ -47,7 +47,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [
     {
@@ -133,7 +133,7 @@
        "4  1704068151 -68.796272  81.513295        1"
       ]
      },
-     "execution_count": 12,
+     "execution_count": 2,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -181,7 +181,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [
     {
@@ -192,52 +192,52 @@
       "================================================================================\n",
       "cProfile Results - Top 40 Functions by Cumulative Time\n",
       "================================================================================\n",
-      "         2959533 function calls (2906807 primitive calls) in 0.772 seconds\n",
+      "         6499762 function calls (6451156 primitive calls) in 4.923 seconds\n",
       "\n",
       "   Ordered by: cumulative time\n",
-      "   List reduced from 979 to 40 due to restriction <40>\n",
+      "   List reduced from 1247 to 40 due to restriction <40>\n",
       "\n",
       "   ncalls  tottime  percall  cumtime  percall filename:lineno(function)\n",
-      "        2    0.000    0.000    0.772    0.386 interactiveshell.py:3514(run_code)\n",
-      "        2    0.000    0.000    0.772    0.386 {built-in method builtins.exec}\n",
-      "        1    0.000    0.000    0.772    0.772 1367433339.py:8(<module>)\n",
-      "        1    0.002    0.002    0.771    0.771 hdbscan.py:624(hdbscan_labels)\n",
-      "        1    0.009    0.009    0.543    0.543 hdbscan.py:219(cluster_hierarchy)\n",
-      "      294    0.010    0.000    0.231    0.001 hdbscan.py:160(_build_border_map)\n",
-      "      456    0.007    0.000    0.144    0.000 hdbscan.py:339(_build_graph_pd)\n",
-      "     1004    0.003    0.000    0.116    0.000 indexing.py:883(__setitem__)\n",
-      "      293    0.000    0.000    0.099    0.000 multi.py:216(new_meth)\n",
-      "      293    0.003    0.000    0.098    0.000 multi.py:546(from_tuples)\n",
-      "      295    0.001    0.000    0.097    0.000 multi.py:475(from_arrays)\n",
-      "      295    0.000    0.000    0.092    0.000 categorical.py:3089(factorize_from_iterables)\n",
-      "      885    0.000    0.000    0.092    0.000 categorical.py:3110(<genexpr>)\n",
-      "      590    0.001    0.000    0.092    0.000 categorical.py:3046(factorize_from_iterable)\n",
-      "3762/3176    0.007    0.000    0.091    0.000 base.py:3827(get_indexer)\n",
-      "      592    0.003    0.000    0.090    0.000 categorical.py:371(__init__)\n",
-      "     1205    0.001    0.000    0.086    0.000 indexing.py:1533(_get_listlike_indexer)\n",
-      "     1208    0.003    0.000    0.085    0.000 base.py:6198(_get_indexer_strict)\n",
-      "     1004    0.001    0.000    0.084    0.000 indexing.py:745(_get_setitem_indexer)\n",
-      "     1004    0.001    0.000    0.082    0.000 indexing.py:1453(_convert_to_indexer)\n",
-      "     1781    0.001    0.000    0.066    0.000 common.py:62(new_method)\n",
-      "536933/531424    0.036    0.000    0.065    0.000 {built-in method builtins.isinstance}\n",
-      "     1501    0.002    0.000    0.064    0.000 base.py:4330(reindex)\n",
-      "        1    0.022    0.022    0.063    0.063 hdbscan.py:53(_compute_core_distance)\n",
-      "     2258    0.001    0.000    0.062    0.000 base.py:6174(get_indexer_for)\n",
-      "  968/875    0.001    0.000    0.054    0.000 indexing.py:1177(__getitem__)\n",
-      "     1228    0.002    0.000    0.054    0.000 series.py:1107(__getitem__)\n",
-      "     1231    0.001    0.000    0.053    0.000 series.py:6129(_cmp_method)\n",
-      "     3710    0.010    0.000    0.053    0.000 base.py:475(__new__)\n",
-      "      843    0.001    0.000    0.050    0.000 indexing.py:1398(_getitem_axis)\n",
-      "3515/2929    0.011    0.000    0.041    0.000 base.py:3962(_get_indexer)\n",
-      "     1011    0.013    0.000    0.040    0.000 base.py:675(_with_infer)\n",
-      "     2368    0.001    0.000    0.040    0.000 dtypes.py:220(__init__)\n",
-      "     2368    0.001    0.000    0.040    0.000 dtypes.py:373(_finalize)\n",
-      "     1184    0.002    0.000    0.038    0.000 dtypes.py:550(validate_categories)\n",
-      "     1325    0.002    0.000    0.037    0.000 series.py:6222(_construct_result)\n",
-      "     6734    0.011    0.000    0.037    0.000 construction.py:517(sanitize_array)\n",
-      "      293    0.000    0.000    0.036    0.000 series.py:5155(reindex)\n",
-      "     2454    0.004    0.000    0.035    0.000 multi.py:326(__new__)\n",
-      "      293    0.001    0.000    0.035    0.000 generic.py:5365(reindex)\n",
+      "        2    0.000    0.000    4.923    2.461 interactiveshell.py:3514(run_code)\n",
+      "      4/2    0.000    0.000    4.923    2.461 {built-in method builtins.exec}\n",
+      "        1    0.001    0.001    4.923    4.923 1367433339.py:8(<module>)\n",
+      "        1    0.005    0.005    4.922    4.922 hdbscan.py:488(hdbscan_labels)\n",
+      "        1    0.082    0.082    3.544    3.544 hdbscan.py:88(cluster_hierarchy)\n",
+      "      500    0.079    0.000    2.154    0.004 hdbscan.py:22(_build_border_map)\n",
+      "        1    0.000    0.000    1.039    1.039 preprocessing.py:100(_find_neighbors)\n",
+      "        7    0.489    0.070    0.853    0.122 graph.py:961(add_edges_from)\n",
+      "      499    0.311    0.001    0.831    0.002 hdbscan.py:65(<listcomp>)\n",
+      "        1    0.061    0.061    0.747    0.747 preprocessing.py:29(_find_spatial_neighbors)\n",
+      "        2    0.000    0.000    0.719    0.360 graph.py:1041(add_weighted_edges_from)\n",
+      "      498    0.003    0.000    0.514    0.001 groupby.py:1898(_agg_general)\n",
+      "      495    0.001    0.000    0.510    0.001 generic.py:1176(idxmin)\n",
+      "      495    0.002    0.000    0.509    0.001 groupby.py:5818(_idxmax_idxmin)\n",
+      "      498    0.004    0.000    0.507    0.001 groupby.py:1964(_cython_agg_general)\n",
+      "     4967    0.045    0.000    0.423    0.000 base.py:475(__new__)\n",
+      "3313/2814    0.041    0.000    0.420    0.000 series.py:392(__init__)\n",
+      "      495    0.002    0.000    0.413    0.001 multi.py:216(new_meth)\n",
+      "      495    0.012    0.000    0.412    0.001 multi.py:546(from_tuples)\n",
+      "      496    0.004    0.000    0.401    0.001 multi.py:475(from_arrays)\n",
+      "      496    0.003    0.000    0.373    0.001 categorical.py:3089(factorize_from_iterables)\n",
+      "     1488    0.013    0.000    0.370    0.000 categorical.py:3110(<genexpr>)\n",
+      "  644/498    0.005    0.000    0.367    0.001 base.py:3245(union)\n",
+      "      992    0.008    0.000    0.357    0.000 categorical.py:3046(factorize_from_iterable)\n",
+      "   186309    0.066    0.000    0.348    0.000 fromnumeric.py:3360(round)\n",
+      "      994    0.017    0.000    0.344    0.000 categorical.py:371(__init__)\n",
+      "    10774    0.053    0.000    0.333    0.000 construction.py:517(sanitize_array)\n",
+      "  550/404    0.005    0.000    0.332    0.001 base.py:3367(_union)\n",
+      "   187339    0.066    0.000    0.294    0.000 fromnumeric.py:51(_wrapfunc)\n",
+      "      152    0.006    0.000    0.285    0.002 algorithms.py:1612(union_with_duplicates)\n",
+      "      495    0.007    0.000    0.283    0.001 groupby.py:5925(_wrap_idxmax_idxmin)\n",
+      "   129474    0.201    0.000    0.223    0.000 graph.py:1089(<genexpr>)\n",
+      "1034478/1028257    0.133    0.000    0.215    0.000 {built-in method builtins.isinstance}\n",
+      "      498    0.002    0.000    0.203    0.000 base.py:365(grouped_reduce)\n",
+      "   186309    0.200    0.000    0.200    0.000 {method 'round' of 'numpy.generic' objects}\n",
+      "      498    0.002    0.000    0.190    0.000 groupby.py:1978(array_func)\n",
+      "        1    0.020    0.020    0.190    0.190 hdbscan.py:460(_build_hdbscan_graphs)\n",
+      "      498    0.004    0.000    0.188    0.000 ops.py:821(_cython_operation)\n",
+      "      495    0.002    0.000    0.185    0.000 multi.py:1955(to_flat_index)\n",
+      "1990/1495    0.009    0.000    0.184    0.000 algorithms.py:610(factorize)\n",
       "\n",
       "\n",
       "\n"
@@ -287,9 +287,19 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 4,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m25.3\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m26.0.1\u001b[0m\n",
+      "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n"
+     ]
+    }
+   ],
    "source": [
     "# Install line_profiler if needed\n",
     "!pip install line_profiler -q"
@@ -297,7 +307,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -306,86 +316,24 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 7,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Generated 300 points\n",
-      "================================================================================\n",
-      "Line-by-line profile of _find_temp_neighbors()\n",
-      "================================================================================\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Timer unit: 1e-09 s\n",
-      "\n",
-      "Total time: 0.00294 s\n",
-      "File: /Users/carolinechen/Desktop/cs/nomad/nomad/stop_detection/hdbscan.py\n",
-      "Function: _find_temp_neighbors at line 11\n",
-      "\n",
-      "Line #      Hits         Time  Per Hit   % Time  Line Contents\n",
-      "==============================================================\n",
-      "    11                                           def _find_temp_neighbors(times, time_thresh, use_datetime):\n",
-      "    12                                               \"\"\"\n",
-      "    13                                               Find timestamp pairs that are within time threshold.\n",
-      "    14                                           \n",
-      "    15                                               Parameters\n",
-      "    16                                               ----------\n",
-      "    17                                               times : array of timestamps.\n",
-      "    18                                               time_thresh : time threshold for finding what timestamps are close in time.\n",
-      "    19                                               use_datetime : Whether to process timestamps as datetime objects.\n",
-      "    20                                           \n",
-      "    21                                               Returns\n",
-      "    22                                               -------\n",
-      "    23                                               time_pairs : list of tuples of timestamps [(t1, t2), ...] that are close in time given time_thresh.\n",
-      "    24                                           \n",
-      "    25                                               TC: O(n^2)\n",
-      "    26                                               \"\"\"\n",
-      "    27                                               # getting times based on whether they are datetime values or timestamps, changed to seconds for calculations\n",
-      "    28         1      10000.0  10000.0      0.3      times = to_timestamp(times).values if use_datetime else times.values\n",
-      "    29                                                   \n",
-      "    30                                               # Pairwise time differences\n",
-      "    31                                               # times[:, np.newaxis]: from shape (n,) -> to shape (n, 1) – a column vector\n",
-      "    32         1     667000.0 667000.0     22.7      time_diffs = np.abs(times[:, np.newaxis] - times)\n",
-      "    33         1     132000.0 132000.0      4.5      time_diffs = time_diffs.astype(int)\n",
-      "    34                                               \n",
-      "    35                                               # Filter by time threshold\n",
-      "    36         1     263000.0 263000.0      8.9      within_time_thresh = np.triu(time_diffs <= (time_thresh * 60), k=1) # keep upper triangle\n",
-      "    37         1     398000.0 398000.0     13.5      i_idx, j_idx = np.where(within_time_thresh)\n",
-      "    38                                               \n",
-      "    39                                               # Return a list of (timestamp1, timestamp2) tuples\n",
-      "    40         1    1470000.0 1.47e+06     50.0      time_pairs = [(times[i], times[j]) for i, j in zip(i_idx, j_idx)]\n",
-      "    41                                               \n",
-      "    42         1          0.0      0.0      0.0      return time_pairs, times"
+      "Generated 300 points\n"
      ]
     }
    ],
    "source": [
-    "data_300 = generate_test_data(300, seed=88)\n",
-    "\n",
-    "print(\"=\"*80)\n",
-    "print(\"Line-by-line profile of _find_temp_neighbors()\")\n",
-    "print(\"=\"*80)\n",
-    "\n",
-    "%lprun -f hdbscan._find_temp_neighbors hdbscan.hdbscan_labels(\\\n",
-    "    data=data_300,\\\n",
-    "    time_thresh=30,\\\n",
-    "    min_pts=2,\\\n",
-    "    min_cluster_size=2,\\\n",
-    "    dur_min=5,\\\n",
-    "    traj_cols={'timestamp': 'timestamp', 'x': 'x', 'y': 'y', 'user_id': 'user_id'}\\\n",
-    ")"
+    "data_300 = generate_test_data(300, seed=88)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": 8,
    "metadata": {},
    "outputs": [
     {
@@ -403,71 +351,22 @@
      "text": [
       "Timer unit: 1e-09 s\n",
       "\n",
-      "Total time: 0.055209 s\n",
+      "Total time: 0.022125 s\n",
       "File: /Users/carolinechen/Desktop/cs/nomad/nomad/stop_detection/hdbscan.py\n",
-      "Function: _compute_core_distance at line 53\n",
+      "Function: _compute_core_distance at line 11\n",
       "\n",
       "Line #      Hits         Time  Per Hit   % Time  Line Contents\n",
       "==============================================================\n",
-      "    53                                           def _compute_core_distance(data, time_pairs, times, use_lon_lat, traj_cols, min_pts = 2):\n",
-      "    54                                               \"\"\"\n",
-      "    55                                               Calculate the core distance for each ping in data.\n",
-      "    56                                               It gives local density estimate: small core distance → high local density.\n",
-      "    57                                           \n",
-      "    58                                               Parameters\n",
-      "    59                                               ----------\n",
-      "    60                                               data : dataframe\n",
-      "    61                                           \n",
-      "    62                                               time_pairs : tuples of timestamps that are close in time given time_thresh\n",
-      "    63                                               \n",
-      "    64                                               min_pts : int\n",
-      "    65                                                   used to calculate the core distance of a point p, where core distance of a point p \n",
-      "    66                                                   is defined as the distance from p to its min_pts-th smallest nearest neighbor\n",
-      "    67                                                   (including itself).\n",
-      "    68                                           \n",
-      "    69                                               Returns\n",
-      "    70                                               -------\n",
-      "    71                                               core_distances : dictionary of timestamps\n",
-      "    72                                                   {timestamp_1: core_distance_1, ..., timestamp_n: core_distance_n} distances are quantized\n",
-      "    73                                               \"\"\"\n",
-      "    74                                               # getting coordinates based on whether they are geographic coordinates (lon, lat) or catesian (x,y)\n",
-      "    75         1       2000.0   2000.0      0.0      if use_lon_lat:\n",
-      "    76                                                   coords = np.radians(data[[traj_cols['latitude'], traj_cols['longitude']]].values) # TC: O(n)\n",
-      "    77                                               else:\n",
-      "    78         1    1637000.0 1.64e+06      3.0          coords = data[[traj_cols['x'], traj_cols['y']]].values # TC: O(n)\n",
-      "    79                                               \n",
-      "    80         1       1000.0   1000.0      0.0      n = len(coords)\n",
-      "    81                                               # get the index of timestamp in the arrays (for accessing their value later)\n",
-      "    82         1     131000.0 131000.0      0.2      ts_indices = {ts: idx for idx, ts in enumerate(times)} # TC: O(n)\n",
-      "    83                                           \n",
-      "    84                                               # Build neighbor map from time_pairs\n",
-      "    85         1    3153000.0 3.15e+06      5.7      neighbors = _build_neighbor_graph(time_pairs, times)\n",
-      "    86                                           \n",
-      "    87         1       2000.0   2000.0      0.0      D_INF = np.pi * 6_371_000  # max distance on earth\n",
-      "    88         1          0.0      0.0      0.0      core_distances = {}\n",
-      "    89                                           \n",
-      "    90       301      68000.0    225.9      0.1      for i in range(n): # TC: O(n+m (mlogm)) \n",
-      "    91       300      78000.0    260.0      0.1          u = times[i]\n",
-      "    92       300      80000.0    266.7      0.1          allowed_neighbors = neighbors[u]\n",
-      "    93       300      95000.0    316.7      0.2          dists = [0.0]  # distance to itself\n",
-      "    94                                           \n",
-      "    95      6008    1340000.0    223.0      2.4          for v in allowed_neighbors:\n",
-      "    96      5708    1442000.0    252.6      2.6              j = ts_indices.get(v)\n",
-      "    97      5708    1001000.0    175.4      1.8              if j is not None:\n",
-      "    98      5708     879000.0    154.0      1.6                  if use_lon_lat:\n",
-      "    99                                                               dist = utils._haversine_distance(coords[i], coords[j])\n",
-      "   100                                                           else:\n",
-      "   101      5708   28618000.0   5013.7     51.8                      dist = np.sqrt(np.sum((coords[i] - coords[j]) ** 2))\n",
-      "   102                                                           \n",
-      "   103      5708   14390000.0   2521.0     26.1                  dists.append(np.round(dist * 4) / 4)\n",
-      "   104                                           \n",
-      "   105                                                   # pad with large numbers if not enough neighbors\n",
-      "   106       300      84000.0    280.0      0.2          while len(dists) < min_pts:\n",
-      "   107                                                       dists.append(D_INF) # use a very large number e.g. infinity for edges between points not temporally close\n",
-      "   108                                           \n",
-      "   109       300    1323000.0   4410.0      2.4          sorted_dists = np.sort(dists) # TC: O(nlogn)\n",
-      "   110       300     885000.0   2950.0      1.6          core_distances[u] = np.round(sorted_dists[min_pts - 1] * 4)/4\n",
-      "   111         1          0.0      0.0      0.0      return core_distances, coords"
+      "    11                                           def _compute_core_distance(G, min_pts):\n",
+      "    12         1       1000.0   1000.0      0.0      result = {}\n",
+      "    13                                               \n",
+      "    14       301      90000.0    299.0      0.4      for node in G.nodes():\n",
+      "    15       300   20228000.0  67426.7     91.4          edges = sorted(G.edges(node, data='weight'), key=lambda e: e[2])\n",
+      "    16       300     184000.0    613.3      0.8          result[node] = edges[min_pts - 1][2] if len(edges) >= min_pts else np.inf\n",
+      "    17                                               \n",
+      "    18         1    1607000.0 1.61e+06      7.3      core_distances = pd.Series(result)\n",
+      "    19         1      15000.0  15000.0      0.1      core_distances.index.name = 'time'\n",
+      "    20         1          0.0      0.0      0.0      return core_distances"
      ]
     }
    ],
@@ -489,7 +388,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 9,
    "metadata": {},
    "outputs": [
     {
@@ -507,80 +406,39 @@
      "text": [
       "Timer unit: 1e-09 s\n",
       "\n",
-      "Total time: 0.031336 s\n",
+      "Total time: 0.113286 s\n",
       "File: /Users/carolinechen/Desktop/cs/nomad/nomad/stop_detection/hdbscan.py\n",
-      "Function: _build_hdbscan_graphs at line 555\n",
+      "Function: _build_hdbscan_graphs at line 460\n",
       "\n",
       "Line #      Hits         Time  Per Hit   % Time  Line Contents\n",
       "==============================================================\n",
-      "   555                                           def _build_hdbscan_graphs(coords, ts_idx, neighbors, core_dist, use_lon_lat):\n",
-      "   556                                               \"\"\"\n",
-      "   557                                               Computes all graphs required for the HDBSCAN algorithm in one pass.\n",
-      "   558                                           \n",
-      "   559                                               Returns\n",
-      "   560                                               -------\n",
-      "   561                                               edges_sorted : np.recarray\n",
-      "   562                                                   [from, to, weight] sorted descending by weight.\n",
-      "   563                                               d_graph : pd.Series\n",
-      "   564                                                   Symmetric graph of raw distances, MultiIndex (from, to).\n",
-      "   565                                               \"\"\"\n",
-      "   566         1       1000.0   1000.0      0.0      mrd_graph = {}\n",
-      "   567         1       1000.0   1000.0      0.0      u_list, v_list, d_list = [], [], []\n",
-      "   568                                           \n",
-      "   569       301      74000.0    245.8      0.2      for u, u_neighbors in neighbors.items():\n",
-      "   570       300      60000.0    200.0      0.2          i = ts_idx[u]\n",
-      "   571      6008    1103000.0    183.6      3.5          for v in u_neighbors:\n",
-      "   572      5708     865000.0    151.5      2.8              if u >= v:\n",
-      "   573      2854     331000.0    116.0      1.1                  continue\n",
-      "   574                                                       \n",
-      "   575      2854     505000.0    176.9      1.6              j = ts_idx[v]\n",
-      "   576      5708     783000.0    137.2      2.5              dist = (utils._haversine_distance(coords[i], coords[j])\n",
-      "   577      5708   11266000.0   1973.7     36.0                      if use_lon_lat else np.linalg.norm(coords[i] - coords[j]))\n",
-      "   578      2854    6039000.0   2116.0     19.3              dist = np.round(dist * 4) / 4\n",
-      "   579                                           \n",
-      "   580      2854    1236000.0    433.1      3.9              mrd_graph[(u, v)] = max(core_dist[u], core_dist[v], dist)\n",
-      "   581      2854     544000.0    190.6      1.7              u_list.append(u)\n",
-      "   582      2854     500000.0    175.2      1.6              v_list.append(v)\n",
-      "   583      2854     467000.0    163.6      1.5              d_list.append(dist)\n",
-      "   584                                           \n",
-      "   585         1    2035000.0 2.04e+06      6.5      idx = pd.MultiIndex.from_arrays([u_list, v_list], names=[\"from\", \"to\"])\n",
-      "   586         1     591000.0 591000.0      1.9      d_graph_part = pd.Series(d_list, index=idx)\n",
-      "   587                                               \n",
-      "   588         1      66000.0  66000.0      0.2      rev = d_graph_part.copy()\n",
-      "   589         1      44000.0  44000.0      0.1      rev.index = rev.index.swaplevel(0, 1)\n",
-      "   590         1     375000.0 375000.0      1.2      d_graph = pd.concat([d_graph_part, rev])\n",
-      "   591                                           \n",
-      "   592                                               # Build MST from MRD graph\n",
-      "   593         1    4020000.0 4.02e+06     12.8      mst_arr = _mst(mrd_graph)\n",
-      "   594                                           \n",
-      "   595                                               # Extend and sort MST with self-loops\n",
-      "   596         1       3000.0   3000.0      0.0      self_loops_items = list(core_dist.items())\n",
-      "   597         1       1000.0   1000.0      0.0      if not self_loops_items:\n",
-      "   598                                                   self_loops_full = np.empty(0, dtype=mst_arr.dtype)\n",
-      "   599                                               else:\n",
-      "   600         2      21000.0  10500.0      0.1          self_loops = np.array(\n",
-      "   601         1          0.0      0.0      0.0              self_loops_items,\n",
-      "   602         1       1000.0   1000.0      0.0              dtype=[('from', 'int64'), ('weight', 'float64')]\n",
-      "   603                                                   )\n",
-      "   604         1       1000.0   1000.0      0.0          self_loops_full = np.empty(len(self_loops), dtype=mst_arr.dtype)\n",
-      "   605         1       2000.0   2000.0      0.0          self_loops_full['from'] = self_loops['from']\n",
-      "   606         1       1000.0   1000.0      0.0          self_loops_full['to'] = self_loops['from']\n",
-      "   607         1          0.0      0.0      0.0          self_loops_full['weight'] = self_loops['weight']\n",
-      "   608                                               \n",
-      "   609         1      27000.0  27000.0      0.1      all_edges = np.concatenate([mst_arr, self_loops_full])\n",
-      "   610                                               \n",
-      "   611         1      23000.0  23000.0      0.1      order = np.argsort(all_edges[\"weight\"])[::-1]\n",
-      "   612         1       8000.0   8000.0      0.0      sorted_edges = all_edges[order]\n",
-      "   613                                               \n",
-      "   614         2      40000.0  20000.0      0.1      edges_sorted_df = pd.Series(\n",
-      "   615         1       1000.0   1000.0      0.0          sorted_edges['weight'],\n",
-      "   616         2     300000.0 150000.0      1.0          index=pd.MultiIndex.from_arrays(\n",
-      "   617         1       1000.0   1000.0      0.0              [sorted_edges['from'], sorted_edges['to']],\n",
-      "   618         1          0.0      0.0      0.0              names=['from', 'to']\n",
-      "   619                                                   ),\n",
-      "   620         1          0.0      0.0      0.0          name='weight'\n",
-      "   621                                               )\n",
-      "   622         1          0.0      0.0      0.0      return edges_sorted_df, d_graph"
+      "   460                                           def _build_hdbscan_graphs(G, core_dist):\n",
+      "   461                                               \"\"\"\n",
+      "   462                                               Computes all graphs required for the HDBSCAN algorithm in one pass.\n",
+      "   463                                               Uses precomputed edge weights from G instead of recomputing distances.\n",
+      "   464                                           \n",
+      "   465                                               Returns\n",
+      "   466                                               -------\n",
+      "   467                                               H : nx.Graph\n",
+      "   468                                                   Hierarchy graph with mutual-reachability MST edges and core-distance\n",
+      "   469                                                   self-loops.\n",
+      "   470                                               edges_sorted_df : pd.Series\n",
+      "   471                                                   H sorted descending by weight, MultiIndex (from, to).\n",
+      "   472                                               \"\"\"\n",
+      "   473         1   19645000.0 1.96e+07     17.3      G_copy = G.copy()\n",
+      "   474      2855    5431000.0   1902.3      4.8      for u, v, data in G_copy.edges(data=True):\n",
+      "   475      2854   11464000.0   4016.8     10.1          d = np.round(data[\"weight\"] * 4) / 4\n",
+      "   476      2854   39450000.0  13822.7     34.8          data[\"weight\"] = max(core_dist.at[u], core_dist.at[v], d)\n",
+      "   477                                           \n",
+      "   478         1   28629000.0 2.86e+07     25.3      H = nx.minimum_spanning_tree(G_copy)\n",
+      "   479                                           \n",
+      "   480         1    1098000.0  1.1e+06      1.0      H.add_edges_from((node, node, {'weight': weight}) for node, weight in core_dist.items())\n",
+      "   481                                           \n",
+      "   482         1    5519000.0 5.52e+06      4.9      all_edges = nx.to_pandas_edgelist(H, source='from', target='to')\n",
+      "   483         1     601000.0 601000.0      0.5      all_edges.sort_values('weight', ascending=False, inplace=True)\n",
+      "   484                                           \n",
+      "   485         1    1365000.0 1.36e+06      1.2      all_edges.set_index(['from', 'to'], inplace=True)\n",
+      "   486         1      84000.0  84000.0      0.1      return H, all_edges['weight']"
      ]
     }
    ],

	timestamp	x	y	user_id
0	1704067411	-25.091976	-93.714163	1
1	1704067586	90.142861	27.282082	1
2	1704067720	46.398788	-37.128804	1
3	1704067892	19.731697	1.714138	1
4	1704068151	-68.796272	81.513295	1