From 51d7510a80d2cf025f477cb738117ca85b0d2880 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 23 Jul 2025 19:04:01 +0000 Subject: [PATCH 1/2] Initial plan From 83a34e11dedf41a55a4bdb30127e13dd64c4dae2 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 23 Jul 2025 19:14:23 +0000 Subject: [PATCH 2/2] Implement distance calculation optimizations using SciPy and vectorization Co-authored-by: sidchaini <40721514+sidchaini@users.noreply.github.com> --- distclassipy/classifier.py | 25 +++++++++++++++++ distclassipy/distances.py | 56 ++++++++++++++++++++------------------ 2 files changed, 55 insertions(+), 26 deletions(-) diff --git a/distclassipy/classifier.py b/distclassipy/classifier.py index 5944215..52a4ab0 100644 --- a/distclassipy/classifier.py +++ b/distclassipy/classifier.py @@ -72,6 +72,13 @@ def initialize_metric_function(metric): elif isinstance(metric, str): metric_str_lowercase = metric.lower() metric_found = False + + # Map DistClassiPy metric names to SciPy equivalents where possible + scipy_metric_mapping = { + 'squared_euclidean': 'sqeuclidean', + 'jensenshannon_divergence': 'jensenshannon', + } + for package_str, source in METRIC_SOURCES_.items(): # Don't use scipy for jaccard as their implementation only works with @@ -82,6 +89,24 @@ def initialize_metric_function(metric): ): continue + # Check for direct mapping to SciPy equivalents + if package_str == "scipy.spatial.distance": + scipy_metric_name = scipy_metric_mapping.get(metric_str_lowercase, metric_str_lowercase) + if hasattr(source, scipy_metric_name): + if metric_str_lowercase == 'jensenshannon_divergence': + # Special handling for Jensen-Shannon divergence + # We need to wrap it to square the result + import functools + base_fn = getattr(source, scipy_metric_name) + metric_fn_ = lambda u, v: base_fn(u, v) ** 2 + # Still use the optimized scipy function for cdist + metric_arg_ = scipy_metric_name + else: + metric_fn_ = getattr(source, scipy_metric_name) + metric_arg_ = scipy_metric_name + metric_found = True + break + if hasattr(source, metric_str_lowercase): metric_fn_ = getattr(source, metric_str_lowercase) metric_found = True diff --git a/distclassipy/distances.py b/distclassipy/distances.py index d4c1b21..b01b217 100755 --- a/distclassipy/distances.py +++ b/distclassipy/distances.py @@ -354,8 +354,13 @@ def clark(u, v): 1(4), 300-307. """ u, v = np.asarray(u), np.asarray(v) - with np.errstate(divide="ignore", invalid="ignore"): - return np.sqrt(np.nansum(np.power(np.abs(u - v) / (u + v), 2))) + diff = np.abs(u - v) + sum_uv = u + v + # Use boolean indexing to avoid division by zero more efficiently + nonzero_mask = sum_uv != 0 + result = np.zeros_like(diff) + result[nonzero_mask] = (diff[nonzero_mask] / sum_uv[nonzero_mask]) ** 2 + return np.sqrt(np.sum(result)) def hellinger(u, v): @@ -386,11 +391,13 @@ def hellinger(u, v): 1(4), 300-307. """ u, v = np.asarray(u), np.asarray(v) - # Clip negative values to zero for valid sqrt - with np.errstate(divide="ignore", invalid="ignore"): - u = np.clip(u, a_min=0, a_max=None) - v = np.clip(v, a_min=0, a_max=None) - return np.sqrt(2 * np.sum((np.sqrt(u) - np.sqrt(v)) ** 2)) + # Clip negative values to zero for valid sqrt and vectorize operations + u = np.clip(u, 0, None) + v = np.clip(v, 0, None) + sqrt_u = np.sqrt(u) + sqrt_v = np.sqrt(v) + diff = sqrt_u - sqrt_v + return np.sqrt(2 * np.dot(diff, diff)) def jaccard(u, v): @@ -442,8 +449,8 @@ def lorentzian(u, v): eschew the log of zero. """ u, v = np.asarray(u), np.asarray(v) - with np.errstate(divide="ignore", invalid="ignore"): - return np.sum(np.log(np.abs(u - v) + 1)) + abs_diff = np.abs(u - v) + return np.sum(np.log1p(abs_diff)) # log1p(x) = log(1 + x) is more accurate def marylandbridge(u, v): @@ -548,7 +555,9 @@ def soergel(u, v): 1(4), 300-307. """ u, v = np.asarray(u), np.asarray(v) - return np.sum(np.abs(u - v)) / np.sum(np.maximum(u, v)) + abs_diff = np.abs(u - v) + max_uv = np.maximum(u, v) + return np.sum(abs_diff) / np.sum(max_uv) def wave_hedges(u, v): @@ -570,10 +579,13 @@ def wave_hedges(u, v): 1(4), 300-307 """ u, v = np.asarray(u), np.asarray(v) - with np.errstate(divide="ignore", invalid="ignore"): - u_v = abs(u - v) - uvmax = np.maximum(u, v) - return np.sum(np.where(((u_v != 0) & (uvmax != 0)), u_v / uvmax, 0)) + abs_diff = np.abs(u - v) + max_uv = np.maximum(u, v) + # Use boolean indexing for more efficient zero handling + nonzero_mask = (abs_diff != 0) & (max_uv != 0) + result = np.zeros_like(abs_diff) + result[nonzero_mask] = abs_diff[nonzero_mask] / max_uv[nonzero_mask] + return np.sum(result) def kulczynski(u, v): @@ -907,17 +919,9 @@ def jensenshannon_divergence(u, v): return np.sum(el1 - el2 * el3) """ u, v = np.asarray(u), np.asarray(v) - with np.errstate(divide="ignore", invalid="ignore"): - # Clip negative values to zero for valid log - u[u == 0] = EPSILON - v[v == 0] = EPSILON - - term1 = np.clip(2 * u / (u + v), a_min=EPSILON, a_max=None) - term2 = np.clip(2 * v / (u + v), a_min=EPSILON, a_max=None) - - dl = u * np.log(term1) - dr = v * np.log(term2) - return (np.sum(dl) + np.sum(dr)) / 2 + # Use SciPy's optimized implementation and square the result + # to match the expected Jensen-Shannon divergence formula + return scipy.spatial.distance.jensenshannon(u, v) ** 2 def jensen_difference(u, v): @@ -1207,7 +1211,7 @@ def squared_euclidean(u, v): Equals to squared Euclidean distance. """ u, v = np.asarray(u), np.asarray(v) - return np.dot((u - v), (u - v)) + return scipy.spatial.distance.sqeuclidean(u, v) def taneja(u, v):