From dc0615fc9c8305fe6069e1c06c4403892e6909d7 Mon Sep 17 00:00:00 2001
From: b010001y <528387485@qq.com>
Date: Sat, 25 Apr 2026 23:27:59 +0800
Subject: [PATCH] Migrate 15 gpu_baseline.cu files to unified compute_only
 interface

The repo migrated all task_io.cu adapters and cpu_reference.c files to a
single solution_compute(N, all_inputs..., output) function in commit
a453c5b ("Migrate all 44 tasks to unified compute_only interface").
However, the 15 gpu_baseline.cu files were not migrated and still expose
the old solution_init + solution_compute(N, output_only) split. As a
result they link successfully (the LLM-facing names match) but produce
garbage at runtime: solution_init is never called, so device pointers
stay null when solution_compute launches its kernel.

This commit migrates each gpu_baseline.cu to the new interface following
the same pattern:
- Delete solution_init entirely.
- Define solution_free before solution_compute (so the lazy-init block
  can call it).
- solution_compute now takes the full set of inputs (signature copied
  verbatim from the task's task_io.cu).
- Move all cudaMalloc into solution_compute, guarded by a shape check
  so allocation only happens on the first call (or when shape changes).
- H2D copies move into solution_compute and run every call, per the
  unified contract: "the harness passes full host-side inputs every call".
- cudaDeviceSynchronize() added before return.

Verified on small data (sm_89 / L20X). 13 of 15 pass exact/tol-bounded
correctness checks vs expected_output.txt:

| Task | GPU(ms) | CPU(ms) |
| --- | ---: | ---: |
| black_scholes              |    0.375 |    7.259 |
| dbscan                     |  203.010 |  239.577 |
| dtw_distance               |    0.316 | 1806.643 |
| monte_carlo                |    0.363 | 1332.731 |
| hausdorff_distance         |    0.102 |    1.359 |
| euclidean_distance_matrix  |    0.054 |    0.377 |
| held_karp_tsp              |    0.601 |   22.403 |
| spmv_csr                   |    0.107 |    0.033 |
| max_flow_push_relabel      | 1948.765 |   88.825 |
| sph_cell_index             |    0.409 |    6.978 |
| sph_position               |    0.351 |    0.430 |
| sph_forces                 |    0.304 |    4.562 |
| pdlp                       |    7.128 |    2.352 |

Two files (bonds_pricing, repo_pricing) have a separate, pre-existing
forward-declaration bug in their __device__ helper functions
(bondAccruedAmountGpu, cashFlowsNpvGpu, etc. are called before they are
declared). Their old-interface versions on main also fail to compile
with the same errors, so this is not a regression. The migration in
this commit is correct on its own; the forward-decl bug should be
addressed separately.
---
 tasks/black_scholes/gpu_baseline.cu           |  64 ++++----
 tasks/bonds_pricing/gpu_baseline.cu           | 101 +++++-------
 tasks/dbscan/gpu_baseline.cu                  |  57 +++----
 tasks/dtw_distance/gpu_baseline.cu            |  39 ++---
 .../euclidean_distance_matrix/gpu_baseline.cu |  50 +++---
 tasks/hausdorff_distance/gpu_baseline.cu      |  57 +++----
 tasks/held_karp_tsp/gpu_baseline.cu           |  25 +--
 tasks/max_flow_push_relabel/gpu_baseline.cu   |  77 +++++----
 tasks/monte_carlo/gpu_baseline.cu             |  31 ++--
 tasks/pdlp/gpu_baseline.cu                    | 124 ++++++++-------
 tasks/repo_pricing/gpu_baseline.cu            | 146 ++++++++----------
 tasks/sph_cell_index/gpu_baseline.cu          |  75 ++++-----
 tasks/sph_forces/gpu_baseline.cu              | 129 ++++++++--------
 tasks/sph_position/gpu_baseline.cu            |  83 +++++-----
 tasks/spmv_csr/gpu_baseline.cu                |  62 ++++----
 15 files changed, 556 insertions(+), 564 deletions(-)

diff --git a/tasks/black_scholes/gpu_baseline.cu b/tasks/black_scholes/gpu_baseline.cu
index a677287..44c5755 100644
--- a/tasks/black_scholes/gpu_baseline.cu
+++ b/tasks/black_scholes/gpu_baseline.cu
@@ -257,7 +257,7 @@ __global__ void getOutValOptionKernel(
     outputVals[optionNum] = resultVal;
 }
 
-// ===== Host interface =====
+// ===== Host interface (compute_only) =====
 
 static int    g_N = 0;
 static int*   d_types   = nullptr;
@@ -269,23 +269,40 @@ static float* d_ts      = nullptr;
 static float* d_vols    = nullptr;
 static float* d_prices  = nullptr;
 
-extern "C" void solution_init(int N,
-                               const int* types, const float* strikes, const float* spots,
-                               const float* qs, const float* rs, const float* ts,
-                               const float* vols)
+extern "C" void solution_free(void)
+{
+    if (d_types)   { cudaFree(d_types);   d_types   = nullptr; }
+    if (d_strikes) { cudaFree(d_strikes); d_strikes = nullptr; }
+    if (d_spots)   { cudaFree(d_spots);   d_spots   = nullptr; }
+    if (d_qs)      { cudaFree(d_qs);      d_qs      = nullptr; }
+    if (d_rs)      { cudaFree(d_rs);      d_rs      = nullptr; }
+    if (d_ts)      { cudaFree(d_ts);      d_ts      = nullptr; }
+    if (d_vols)    { cudaFree(d_vols);    d_vols    = nullptr; }
+    if (d_prices)  { cudaFree(d_prices);  d_prices  = nullptr; }
+    g_N = 0;
+}
+
+extern "C" void solution_compute(int N,
+                                  const int* types, const float* strikes, const float* spots,
+                                  const float* qs, const float* rs, const float* ts,
+                                  const float* vols,
+                                  float* prices)
 {
-    g_N = N;
     size_t szi = (size_t)N * sizeof(int);
     size_t szf = (size_t)N * sizeof(float);
 
-    cudaMalloc(&d_types,   szi);
-    cudaMalloc(&d_strikes, szf);
-    cudaMalloc(&d_spots,   szf);
-    cudaMalloc(&d_qs,      szf);
-    cudaMalloc(&d_rs,      szf);
-    cudaMalloc(&d_ts,      szf);
-    cudaMalloc(&d_vols,    szf);
-    cudaMalloc(&d_prices,  szf);
+    if (g_N != N) {
+        solution_free();
+        cudaMalloc(&d_types,   szi);
+        cudaMalloc(&d_strikes, szf);
+        cudaMalloc(&d_spots,   szf);
+        cudaMalloc(&d_qs,      szf);
+        cudaMalloc(&d_rs,      szf);
+        cudaMalloc(&d_ts,      szf);
+        cudaMalloc(&d_vols,    szf);
+        cudaMalloc(&d_prices,  szf);
+        g_N = N;
+    }
 
     cudaMemcpy(d_types,   types,   szi, cudaMemcpyHostToDevice);
     cudaMemcpy(d_strikes, strikes, szf, cudaMemcpyHostToDevice);
@@ -294,27 +311,12 @@ extern "C" void solution_init(int N,
     cudaMemcpy(d_rs,      rs,      szf, cudaMemcpyHostToDevice);
     cudaMemcpy(d_ts,      ts,      szf, cudaMemcpyHostToDevice);
     cudaMemcpy(d_vols,    vols,    szf, cudaMemcpyHostToDevice);
-}
 
-extern "C" void solution_compute(int N, float* prices)
-{
     int threadsPerBlock = 256;
     int blocks = (N + threadsPerBlock - 1) / threadsPerBlock;
-
     getOutValOptionKernel<<<blocks, threadsPerBlock>>>(
         N, d_types, d_strikes, d_spots, d_qs, d_rs, d_ts, d_vols, d_prices);
 
-    cudaMemcpy(prices, d_prices, (size_t)N * sizeof(float), cudaMemcpyDeviceToHost);
-}
-
-extern "C" void solution_free(void)
-{
-    if (d_types)   { cudaFree(d_types);   d_types   = nullptr; }
-    if (d_strikes) { cudaFree(d_strikes); d_strikes = nullptr; }
-    if (d_spots)   { cudaFree(d_spots);   d_spots   = nullptr; }
-    if (d_qs)      { cudaFree(d_qs);      d_qs      = nullptr; }
-    if (d_rs)      { cudaFree(d_rs);      d_rs      = nullptr; }
-    if (d_ts)      { cudaFree(d_ts);      d_ts      = nullptr; }
-    if (d_vols)    { cudaFree(d_vols);    d_vols    = nullptr; }
-    if (d_prices)  { cudaFree(d_prices);  d_prices  = nullptr; }
+    cudaMemcpy(prices, d_prices, szf, cudaMemcpyDeviceToHost);
+    cudaDeviceSynchronize();
 }
diff --git a/tasks/bonds_pricing/gpu_baseline.cu b/tasks/bonds_pricing/gpu_baseline.cu
index fdca893..0218a19 100644
--- a/tasks/bonds_pricing/gpu_baseline.cu
+++ b/tasks/bonds_pricing/gpu_baseline.cu
@@ -1072,39 +1072,47 @@ static bondsDateStruct intializeDateKernelCpu(int d, int m, int y)
 
 extern "C" {
 
-void solution_init(int N,
-                   const int* issue_year, const int* issue_month, const int* issue_day,
-                   const int* maturity_year, const int* maturity_month, const int* maturity_day,
-                   const float* rates, float coupon_freq)
+void solution_free(void)
 {
-	g_N = N;
-	g_issue_year = issue_year;
-	g_issue_month = issue_month;
-	g_issue_day = issue_day;
-	g_maturity_year = maturity_year;
-	g_maturity_month = maturity_month;
-	g_maturity_day = maturity_day;
-	g_rates = rates;
-	g_coupon_freq = coupon_freq;
-
-	// Allocate persistent GPU memory for inArgs
-	cudaMalloc(&d_discountCurve, N * sizeof(bondsYieldTermStruct));
-	cudaMalloc(&d_repoCurve, N * sizeof(bondsYieldTermStruct));
-	cudaMalloc(&d_currDate, N * sizeof(bondsDateStruct));
-	cudaMalloc(&d_maturityDate, N * sizeof(bondsDateStruct));
-	cudaMalloc(&d_bondCleanPrice, N * sizeof(dataType));
-	cudaMalloc(&d_bond, N * sizeof(bondStruct));
-	cudaMalloc(&d_dummyStrike, N * sizeof(dataType));
-
-	// Allocate persistent GPU memory for results
-	cudaMalloc(&d_dirtyPrice, N * sizeof(dataType));
-	cudaMalloc(&d_accruedAmountCurrDate, N * sizeof(dataType));
-	cudaMalloc(&d_cleanPrice, N * sizeof(dataType));
-	cudaMalloc(&d_bondForwardVal, N * sizeof(dataType));
+	if (d_discountCurve)         { cudaFree(d_discountCurve);         d_discountCurve         = NULL; }
+	if (d_repoCurve)             { cudaFree(d_repoCurve);             d_repoCurve             = NULL; }
+	if (d_currDate)              { cudaFree(d_currDate);              d_currDate              = NULL; }
+	if (d_maturityDate)          { cudaFree(d_maturityDate);          d_maturityDate          = NULL; }
+	if (d_bondCleanPrice)        { cudaFree(d_bondCleanPrice);        d_bondCleanPrice        = NULL; }
+	if (d_bond)                  { cudaFree(d_bond);                  d_bond                  = NULL; }
+	if (d_dummyStrike)           { cudaFree(d_dummyStrike);           d_dummyStrike           = NULL; }
+	if (d_dirtyPrice)            { cudaFree(d_dirtyPrice);            d_dirtyPrice            = NULL; }
+	if (d_accruedAmountCurrDate) { cudaFree(d_accruedAmountCurrDate); d_accruedAmountCurrDate = NULL; }
+	if (d_cleanPrice)            { cudaFree(d_cleanPrice);            d_cleanPrice            = NULL; }
+	if (d_bondForwardVal)        { cudaFree(d_bondForwardVal);        d_bondForwardVal        = NULL; }
+	g_N = 0;
 }
 
-void solution_compute(int N, float* prices)
+void solution_compute(int N,
+                      const int* issue_year, const int* issue_month, const int* issue_day,
+                      const int* maturity_year, const int* maturity_month, const int* maturity_day,
+                      const float* rates, float coupon_freq,
+                      float* prices)
 {
+	if (g_N != N) {
+		solution_free();
+		// Allocate persistent GPU memory for inArgs
+		cudaMalloc(&d_discountCurve, N * sizeof(bondsYieldTermStruct));
+		cudaMalloc(&d_repoCurve, N * sizeof(bondsYieldTermStruct));
+		cudaMalloc(&d_currDate, N * sizeof(bondsDateStruct));
+		cudaMalloc(&d_maturityDate, N * sizeof(bondsDateStruct));
+		cudaMalloc(&d_bondCleanPrice, N * sizeof(dataType));
+		cudaMalloc(&d_bond, N * sizeof(bondStruct));
+		cudaMalloc(&d_dummyStrike, N * sizeof(dataType));
+
+		// Allocate persistent GPU memory for results
+		cudaMalloc(&d_dirtyPrice, N * sizeof(dataType));
+		cudaMalloc(&d_accruedAmountCurrDate, N * sizeof(dataType));
+		cudaMalloc(&d_cleanPrice, N * sizeof(dataType));
+		cudaMalloc(&d_bondForwardVal, N * sizeof(dataType));
+		g_N = N;
+	}
+
 	// Build inArgs on host (matching bondsEngine.c / cpu_reference.c setup)
 	bondsYieldTermStruct* h_discountCurve = (bondsYieldTermStruct*)malloc(N * sizeof(bondsYieldTermStruct));
 	bondsYieldTermStruct* h_repoCurve = (bondsYieldTermStruct*)malloc(N * sizeof(bondsYieldTermStruct));
@@ -1120,16 +1128,16 @@ void solution_compute(int N, float* prices)
 		int repoCompounding = SIMPLE_INTEREST;
 		dataType repoCompoundFreq = 1;
 
-		bondsDateStruct bondIssueDate = intializeDateKernelCpu(g_issue_day[numBond], g_issue_month[numBond], g_issue_year[numBond]);
-		bondsDateStruct bondMaturityDate = intializeDateKernelCpu(g_maturity_day[numBond], g_maturity_month[numBond], g_maturity_year[numBond]);
+		bondsDateStruct bondIssueDate = intializeDateKernelCpu(issue_day[numBond], issue_month[numBond], issue_year[numBond]);
+		bondsDateStruct bondMaturityDate = intializeDateKernelCpu(maturity_day[numBond], maturity_month[numBond], maturity_year[numBond]);
 		bondsDateStruct todaysDate = intializeDateKernelCpu(bondMaturityDate.day-1, bondMaturityDate.month, bondMaturityDate.year);
 
 		bondStruct bond;
 		bond.startDate = bondIssueDate;
 		bond.maturityDate = bondMaturityDate;
-		bond.rate = g_rates[numBond];
+		bond.rate = rates[numBond];
 
-		dataType bondCouponFrequency = (dataType)g_coupon_freq;
+		dataType bondCouponFrequency = (dataType)coupon_freq;
 		dataType bondCleanPrice = 89.97693786;
 
 		bondsYieldTermStruct bondCurve;
@@ -1221,33 +1229,8 @@ void solution_compute(int N, float* prices)
 	free(h_accruedAmountCurrDate);
 	free(h_cleanPrice);
 	free(h_bondForwardVal);
-}
 
-void solution_free(void)
-{
-	cudaFree(d_discountCurve);
-	cudaFree(d_repoCurve);
-	cudaFree(d_currDate);
-	cudaFree(d_maturityDate);
-	cudaFree(d_bondCleanPrice);
-	cudaFree(d_bond);
-	cudaFree(d_dummyStrike);
-	cudaFree(d_dirtyPrice);
-	cudaFree(d_accruedAmountCurrDate);
-	cudaFree(d_cleanPrice);
-	cudaFree(d_bondForwardVal);
-
-	d_discountCurve = NULL;
-	d_repoCurve = NULL;
-	d_currDate = NULL;
-	d_maturityDate = NULL;
-	d_bondCleanPrice = NULL;
-	d_bond = NULL;
-	d_dummyStrike = NULL;
-	d_dirtyPrice = NULL;
-	d_accruedAmountCurrDate = NULL;
-	d_cleanPrice = NULL;
-	d_bondForwardVal = NULL;
+	cudaDeviceSynchronize();
 }
 
 } // extern "C"
diff --git a/tasks/dbscan/gpu_baseline.cu b/tasks/dbscan/gpu_baseline.cu
index bfc5ecf..67b05ee 100644
--- a/tasks/dbscan/gpu_baseline.cu
+++ b/tasks/dbscan/gpu_baseline.cu
@@ -27,8 +27,6 @@
 
 // Module-level state
 static int    g_N = 0;
-static float  g_eps = 0;
-static int    g_minPts = 0;
 static float* d_xs = NULL;
 static float* d_ys = NULL;
 static int*   d_neighbor_counts = NULL;
@@ -59,24 +57,29 @@ __global__ void countNeighborsKernel(
 
 // ===== Interface =====
 
-extern "C" void solution_init(int N, const float* xs, const float* ys,
-                               float eps, int minPts)
+extern "C" void solution_free(void)
 {
-    g_N = N;
-    g_eps = eps;
-    g_minPts = minPts;
+    if (d_xs)              { cudaFree(d_xs);              d_xs = NULL; }
+    if (d_ys)              { cudaFree(d_ys);              d_ys = NULL; }
+    if (d_neighbor_counts) { cudaFree(d_neighbor_counts); d_neighbor_counts = NULL; }
+    g_N = 0;
+}
 
-    cudaMalloc(&d_xs, N * sizeof(float));
-    cudaMalloc(&d_ys, N * sizeof(float));
-    cudaMalloc(&d_neighbor_counts, N * sizeof(int));
+extern "C" void solution_compute(int N, const float* xs, const float* ys,
+                                  float eps, int minPts, int* labels)
+{
+    if (g_N != N) {
+        solution_free();
+        cudaMalloc(&d_xs, N * sizeof(float));
+        cudaMalloc(&d_ys, N * sizeof(float));
+        cudaMalloc(&d_neighbor_counts, N * sizeof(int));
+        g_N = N;
+    }
 
     cudaMemcpy(d_xs, xs, N * sizeof(float), cudaMemcpyHostToDevice);
     cudaMemcpy(d_ys, ys, N * sizeof(float), cudaMemcpyHostToDevice);
-}
 
-extern "C" void solution_compute(int N, int* labels)
-{
-    float eps2 = g_eps * g_eps;
+    float eps2 = eps * eps;
     int threads = 256;
     int blocks = (N + threads - 1) / threads;
 
@@ -88,11 +91,6 @@ extern "C" void solution_compute(int N, int* labels)
     int* h_counts = (int*)malloc(N * sizeof(int));
     cudaMemcpy(h_counts, d_neighbor_counts, N * sizeof(int), cudaMemcpyDeviceToHost);
 
-    float* h_xs = (float*)malloc(N * sizeof(float));
-    float* h_ys = (float*)malloc(N * sizeof(float));
-    cudaMemcpy(h_xs, d_xs, N * sizeof(float), cudaMemcpyDeviceToHost);
-    cudaMemcpy(h_ys, d_ys, N * sizeof(float), cudaMemcpyDeviceToHost);
-
     // DBSCAN BFS (matches clusterThread + expandCluster from original)
     for (int i = 0; i < N; i++) labels[i] = UNPROCESSED;
 
@@ -102,7 +100,7 @@ extern "C" void solution_compute(int N, int* labels)
     for (int i = 0; i < N; i++) {
         if (labels[i] != UNPROCESSED) continue;
 
-        if (h_counts[i] < g_minPts) {
+        if (h_counts[i] < minPts) {
             labels[i] = NOISE;
             continue;
         }
@@ -116,8 +114,8 @@ extern "C" void solution_compute(int N, int* labels)
         // Add neighbors of i as seeds
         for (int j = 0; j < N; j++) {
             if (j == i) continue;
-            float dx = h_xs[i] - h_xs[j];
-            float dy = h_ys[i] - h_ys[j];
+            float dx = xs[i] - xs[j];
+            float dy = ys[i] - ys[j];
             if (dx*dx + dy*dy <= eps2) {
                 if (labels[j] == UNPROCESSED) seeds[tail++] = j;
                 if (labels[j] == UNPROCESSED || labels[j] == NOISE)
@@ -128,12 +126,12 @@ extern "C" void solution_compute(int N, int* labels)
         // BFS expand
         while (head < tail) {
             int q = seeds[head++];
-            if (h_counts[q] < g_minPts) continue;
+            if (h_counts[q] < minPts) continue;
 
             for (int j = 0; j < N; j++) {
                 if (labels[j] != UNPROCESSED && labels[j] != NOISE) continue;
-                float dx = h_xs[q] - h_xs[j];
-                float dy = h_ys[q] - h_ys[j];
+                float dx = xs[q] - xs[j];
+                float dy = ys[q] - ys[j];
                 if (dx*dx + dy*dy <= eps2) {
                     if (labels[j] == UNPROCESSED) seeds[tail++] = j;
                     labels[j] = clusterId;
@@ -144,13 +142,6 @@ extern "C" void solution_compute(int N, int* labels)
 
     free(seeds);
     free(h_counts);
-    free(h_xs);
-    free(h_ys);
-}
 
-extern "C" void solution_free(void)
-{
-    if (d_xs)              { cudaFree(d_xs);              d_xs = NULL; }
-    if (d_ys)              { cudaFree(d_ys);              d_ys = NULL; }
-    if (d_neighbor_counts) { cudaFree(d_neighbor_counts); d_neighbor_counts = NULL; }
+    cudaDeviceSynchronize();
 }
diff --git a/tasks/dtw_distance/gpu_baseline.cu b/tasks/dtw_distance/gpu_baseline.cu
index 21fa920..6d46ad5 100644
--- a/tasks/dtw_distance/gpu_baseline.cu
+++ b/tasks/dtw_distance/gpu_baseline.cu
@@ -380,14 +380,20 @@ static int    g_num_features = 0;
 static float* d_subjects  = nullptr;
 static float* d_distances = nullptr;
 
-extern "C" void solution_init(int          num_entries,
-                              int          num_features,
-                              const float* subjects,
-                              const float* query)
+extern "C" void solution_free(void)
 {
-    g_num_entries  = num_entries;
-    g_num_features = num_features;
+    if (d_subjects)  { cudaFree(d_subjects);  d_subjects  = nullptr; }
+    if (d_distances) { cudaFree(d_distances); d_distances = nullptr; }
+    g_num_entries  = 0;
+    g_num_features = 0;
+}
 
+extern "C" void solution_compute(int          num_entries,
+                                 int          num_features,
+                                 const float* subjects,
+                                 const float* query,
+                                 float*       distances)
+{
     if (num_features != 1023) {
         fprintf(stderr,
                 "[gpu_baseline] cuDTW SHFL_FULLDTW_1023 is length-specialized "
@@ -398,19 +404,19 @@ extern "C" void solution_init(int          num_entries,
     size_t subj_bytes  = (size_t)num_entries * num_features * sizeof(float);
     size_t dist_bytes  = (size_t)num_entries * sizeof(float);
 
-    cudaMalloc(&d_subjects,  subj_bytes);
-    cudaMalloc(&d_distances, dist_bytes);
+    if (g_num_entries != num_entries || g_num_features != num_features) {
+        solution_free();
+        cudaMalloc(&d_subjects,  subj_bytes);
+        cudaMalloc(&d_distances, dist_bytes);
+        g_num_entries  = num_entries;
+        g_num_features = num_features;
+    }
 
     cudaMemcpy(d_subjects, subjects, subj_bytes, cudaMemcpyHostToDevice);
     // Match cuDTW main.cu line 164: query lives in __constant__ memory.
     cudaMemcpyToSymbol(cQuery, query,
                        (size_t)num_features * sizeof(float));
-}
 
-extern "C" void solution_compute(int    num_entries,
-                                 int    num_features,
-                                 float* distances)
-{
     // Launch matches cuDTW DTW.hpp shfl_FullDTW_1023 dispatch:
     //   grid  = (num_entries, 1, 1)
     //   block = (32, 1, 1)
@@ -423,10 +429,5 @@ extern "C" void solution_compute(int    num_entries,
     cudaMemcpy(distances, d_distances,
                (size_t)num_entries * sizeof(float),
                cudaMemcpyDeviceToHost);
-}
-
-extern "C" void solution_free(void)
-{
-    if (d_subjects)  { cudaFree(d_subjects);  d_subjects  = nullptr; }
-    if (d_distances) { cudaFree(d_distances); d_distances = nullptr; }
+    cudaDeviceSynchronize();
 }
diff --git a/tasks/euclidean_distance_matrix/gpu_baseline.cu b/tasks/euclidean_distance_matrix/gpu_baseline.cu
index 130fa80..71a81ae 100644
--- a/tasks/euclidean_distance_matrix/gpu_baseline.cu
+++ b/tasks/euclidean_distance_matrix/gpu_baseline.cu
@@ -91,33 +91,47 @@ __global__ void compute_distances(float * ref,
 }
 
 // ===== Persistent device state =====
+static int    g_ref_nb   = 0;
+static int    g_query_nb = 0;
+static int    g_dim      = 0;
 static float* d_ref      = nullptr;
 static float* d_query    = nullptr;
 static float* d_dist     = nullptr;
 
-extern "C" void solution_init(int          ref_nb,
-                              int          query_nb,
-                              int          dim,
-                              const float* ref,
-                              const float* query)
+extern "C" void solution_free(void)
+{
+    if (d_ref)   { cudaFree(d_ref);   d_ref   = nullptr; }
+    if (d_query) { cudaFree(d_query); d_query = nullptr; }
+    if (d_dist)  { cudaFree(d_dist);  d_dist  = nullptr; }
+    g_ref_nb = 0;
+    g_query_nb = 0;
+    g_dim = 0;
+}
+
+extern "C" void solution_compute(int          ref_nb,
+                                 int          query_nb,
+                                 int          dim,
+                                 const float* ref,
+                                 const float* query,
+                                 float*       dist)
 {
     size_t ref_bytes   = (size_t)dim * ref_nb   * sizeof(float);
     size_t query_bytes = (size_t)dim * query_nb * sizeof(float);
     size_t dist_bytes  = (size_t)ref_nb * query_nb * sizeof(float);
 
-    cudaMalloc(&d_ref,   ref_bytes);
-    cudaMalloc(&d_query, query_bytes);
-    cudaMalloc(&d_dist,  dist_bytes);
+    if (g_ref_nb != ref_nb || g_query_nb != query_nb || g_dim != dim) {
+        solution_free();
+        cudaMalloc(&d_ref,   ref_bytes);
+        cudaMalloc(&d_query, query_bytes);
+        cudaMalloc(&d_dist,  dist_bytes);
+        g_ref_nb = ref_nb;
+        g_query_nb = query_nb;
+        g_dim = dim;
+    }
 
     cudaMemcpy(d_ref,   ref,   ref_bytes,   cudaMemcpyHostToDevice);
     cudaMemcpy(d_query, query, query_bytes, cudaMemcpyHostToDevice);
-}
 
-extern "C" void solution_compute(int    ref_nb,
-                                 int    query_nb,
-                                 int    dim,
-                                 float* dist)
-{
     // Launch matches kNN-CUDA's knn_cuda_global() (lines around 540) —
     //   block = (BLOCK_DIM, BLOCK_DIM)
     //   grid  = (ceil(query_nb/BLOCK_DIM), ceil(ref_nb/BLOCK_DIM))
@@ -134,11 +148,5 @@ extern "C" void solution_compute(int    ref_nb,
     cudaMemcpy(dist, d_dist,
                (size_t)ref_nb * query_nb * sizeof(float),
                cudaMemcpyDeviceToHost);
-}
-
-extern "C" void solution_free(void)
-{
-    if (d_ref)   { cudaFree(d_ref);   d_ref   = nullptr; }
-    if (d_query) { cudaFree(d_query); d_query = nullptr; }
-    if (d_dist)  { cudaFree(d_dist);  d_dist  = nullptr; }
+    cudaDeviceSynchronize();
 }
diff --git a/tasks/hausdorff_distance/gpu_baseline.cu b/tasks/hausdorff_distance/gpu_baseline.cu
index 068d7d0..f446bea 100644
--- a/tasks/hausdorff_distance/gpu_baseline.cu
+++ b/tasks/hausdorff_distance/gpu_baseline.cu
@@ -91,37 +91,44 @@ static vec_2d* d_points = nullptr;
 static int*    d_space_offsets = nullptr;
 static float*  d_results = nullptr;
 
-extern "C" void solution_init(int          num_points,
-                              int          num_spaces,
-                              const float* points_xy,
-                              const int*   space_offsets)
+// Tiny init kernel (cuSpatial uses thrust::fill_n with -1 sentinel for atomicMax).
+__global__ void hausdorff_init_results(int n, float* r)
 {
-    g_num_points = num_points;
-    g_num_spaces = num_spaces;
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i < n) r[i] = -1.0f;
+}
 
+extern "C" void solution_free(void)
+{
+    if (d_points)        { cudaFree(d_points);        d_points        = nullptr; }
+    if (d_space_offsets) { cudaFree(d_space_offsets); d_space_offsets = nullptr; }
+    if (d_results)       { cudaFree(d_results);       d_results       = nullptr; }
+    g_num_points = 0;
+    g_num_spaces = 0;
+}
+
+extern "C" void solution_compute(int          num_points,
+                                 int          num_spaces,
+                                 const float* points_xy,
+                                 const int*   space_offsets,
+                                 float*       results)
+{
     size_t pts_bytes = (size_t)num_points * sizeof(vec_2d);
     size_t off_bytes = (size_t)num_spaces * sizeof(int);
     size_t res_bytes = (size_t)num_spaces * num_spaces * sizeof(float);
 
-    cudaMalloc(&d_points,        pts_bytes);
-    cudaMalloc(&d_space_offsets, off_bytes);
-    cudaMalloc(&d_results,       res_bytes);
+    if (g_num_points != num_points || g_num_spaces != num_spaces) {
+        solution_free();
+        cudaMalloc(&d_points,        pts_bytes);
+        cudaMalloc(&d_space_offsets, off_bytes);
+        cudaMalloc(&d_results,       res_bytes);
+        g_num_points = num_points;
+        g_num_spaces = num_spaces;
+    }
 
     cudaMemcpy(d_points,        points_xy,     pts_bytes, cudaMemcpyHostToDevice);
     cudaMemcpy(d_space_offsets, space_offsets, off_bytes, cudaMemcpyHostToDevice);
-}
 
-// Tiny init kernel (cuSpatial uses thrust::fill_n with -1 sentinel for atomicMax).
-__global__ void hausdorff_init_results(int n, float* r)
-{
-    int i = blockIdx.x * blockDim.x + threadIdx.x;
-    if (i < n) r[i] = -1.0f;
-}
-
-extern "C" void solution_compute(int    num_points,
-                                 int    num_spaces,
-                                 float* results)
-{
     int n_results = num_spaces * num_spaces;
 
     int init_block = 256;
@@ -136,11 +143,5 @@ extern "C" void solution_compute(int    num_points,
     cudaMemcpy(results, d_results,
                (size_t)n_results * sizeof(float),
                cudaMemcpyDeviceToHost);
-}
-
-extern "C" void solution_free(void)
-{
-    if (d_points)        { cudaFree(d_points);        d_points        = nullptr; }
-    if (d_space_offsets) { cudaFree(d_space_offsets); d_space_offsets = nullptr; }
-    if (d_results)       { cudaFree(d_results);       d_results       = nullptr; }
+    cudaDeviceSynchronize();
 }
diff --git a/tasks/held_karp_tsp/gpu_baseline.cu b/tasks/held_karp_tsp/gpu_baseline.cu
index ba9677c..69d3887 100644
--- a/tasks/held_karp_tsp/gpu_baseline.cu
+++ b/tasks/held_karp_tsp/gpu_baseline.cu
@@ -82,15 +82,16 @@ __global__ void held_karp_final(int n, int full_mask,
 
 // ===== Host interface =====
 
-static int    g_B = 0;
 static int    g_n = 0;
 static int*   d_cost = nullptr;
 static int*   d_dp = nullptr;
 static int*   d_result = nullptr;
 
-extern "C" void solution_init(int B, int n, const int* costs) {
-    // Not used — we upload per-instance in solution_compute
-    (void)B; (void)n; (void)costs;
+extern "C" void solution_free(void) {
+    if (d_cost)   { cudaFree(d_cost);   d_cost   = nullptr; }
+    if (d_dp)     { cudaFree(d_dp);     d_dp     = nullptr; }
+    if (d_result) { cudaFree(d_result); d_result = nullptr; }
+    g_n = 0;
 }
 
 extern "C" void solution_compute(int B, int n, const int* costs,
@@ -101,9 +102,13 @@ extern "C" void solution_compute(int B, int n, const int* costs,
     size_t dp_bytes = (size_t)subset_count * n * sizeof(int);
     size_t cost_bytes = (size_t)n * n * sizeof(int);
 
-    cudaMalloc(&d_cost, cost_bytes);
-    cudaMalloc(&d_dp, dp_bytes);
-    cudaMalloc(&d_result, sizeof(int));
+    if (g_n != n) {
+        solution_free();
+        cudaMalloc(&d_cost, cost_bytes);
+        cudaMalloc(&d_dp, dp_bytes);
+        cudaMalloc(&d_result, sizeof(int));
+        g_n = n;
+    }
 
     int block = 256;
 
@@ -136,9 +141,5 @@ extern "C" void solution_compute(int B, int n, const int* costs,
                    cudaMemcpyDeviceToHost);
     }
 
-    cudaFree(d_cost);   d_cost = nullptr;
-    cudaFree(d_dp);     d_dp = nullptr;
-    cudaFree(d_result); d_result = nullptr;
+    cudaDeviceSynchronize();
 }
-
-extern "C" void solution_free(void) {}
diff --git a/tasks/max_flow_push_relabel/gpu_baseline.cu b/tasks/max_flow_push_relabel/gpu_baseline.cu
index 080dafa..c9fa3f9 100644
--- a/tasks/max_flow_push_relabel/gpu_baseline.cu
+++ b/tasks/max_flow_push_relabel/gpu_baseline.cu
@@ -141,15 +141,29 @@ static int* d_bfs_queue = nullptr;
 static int* d_in_queue = nullptr;
 static int* d_result = nullptr;
 
-extern "C" void solution_init(int num_nodes, int num_arcs,
-                              const int* tails, const int* heads, const int* caps,
-                              int source, int sink)
+extern "C" void solution_free(void)
 {
-    g_num_nodes = num_nodes;
-    g_num_total_arcs = 2 * num_arcs;
-    g_source = source;
-    g_sink = sink;
+    if (d_adj_start)    { cudaFree(d_adj_start);    d_adj_start    = nullptr; }
+    if (d_adj_list)     { cudaFree(d_adj_list);     d_adj_list     = nullptr; }
+    if (d_arc_head)     { cudaFree(d_arc_head);     d_arc_head     = nullptr; }
+    if (d_arc_opposite) { cudaFree(d_arc_opposite); d_arc_opposite = nullptr; }
+    if (d_residual)     { cudaFree(d_residual);     d_residual     = nullptr; }
+    if (d_initial_cap)  { cudaFree(d_initial_cap);  d_initial_cap  = nullptr; }
+    if (d_excess)       { cudaFree(d_excess);       d_excess       = nullptr; }
+    if (d_height)       { cudaFree(d_height);       d_height       = nullptr; }
+    if (d_first_arc)    { cudaFree(d_first_arc);    d_first_arc    = nullptr; }
+    if (d_bfs_queue)    { cudaFree(d_bfs_queue);    d_bfs_queue    = nullptr; }
+    if (d_in_queue)     { cudaFree(d_in_queue);     d_in_queue     = nullptr; }
+    if (d_result)       { cudaFree(d_result);       d_result       = nullptr; }
+    g_num_nodes = 0;
+    g_num_total_arcs = 0;
+}
 
+extern "C" void solution_compute(int num_nodes, int num_arcs,
+                                 const int* tails, const int* heads, const int* caps,
+                                 int source, int sink,
+                                 int* max_flow_out)
+{
     int total = 2 * num_arcs;
 
     // Build graph on host (same as CPU reference)
@@ -183,19 +197,23 @@ extern "C" void solution_init(int num_nodes, int num_arcs,
     }
     free(degree);
 
-    // Upload to GPU
-    cudaMalloc(&d_adj_start, (num_nodes + 1) * sizeof(int));
-    cudaMalloc(&d_adj_list, total * sizeof(int));
-    cudaMalloc(&d_arc_head, total * sizeof(int));
-    cudaMalloc(&d_arc_opposite, total * sizeof(int));
-    cudaMalloc(&d_residual, total * sizeof(int));
-    cudaMalloc(&d_initial_cap, total * sizeof(int));
-    cudaMalloc(&d_excess, num_nodes * sizeof(long long));
-    cudaMalloc(&d_height, num_nodes * sizeof(int));
-    cudaMalloc(&d_first_arc, num_nodes * sizeof(int));
-    cudaMalloc(&d_bfs_queue, num_nodes * sizeof(int));
-    cudaMalloc(&d_in_queue, num_nodes * sizeof(int));
-    cudaMalloc(&d_result, sizeof(int));
+    if (g_num_nodes != num_nodes || g_num_total_arcs != total) {
+        solution_free();
+        cudaMalloc(&d_adj_start, (num_nodes + 1) * sizeof(int));
+        cudaMalloc(&d_adj_list, total * sizeof(int));
+        cudaMalloc(&d_arc_head, total * sizeof(int));
+        cudaMalloc(&d_arc_opposite, total * sizeof(int));
+        cudaMalloc(&d_residual, total * sizeof(int));
+        cudaMalloc(&d_initial_cap, total * sizeof(int));
+        cudaMalloc(&d_excess, num_nodes * sizeof(long long));
+        cudaMalloc(&d_height, num_nodes * sizeof(int));
+        cudaMalloc(&d_first_arc, num_nodes * sizeof(int));
+        cudaMalloc(&d_bfs_queue, num_nodes * sizeof(int));
+        cudaMalloc(&d_in_queue, num_nodes * sizeof(int));
+        cudaMalloc(&d_result, sizeof(int));
+        g_num_nodes = num_nodes;
+        g_num_total_arcs = total;
+    }
 
     cudaMemcpy(d_adj_start, h_adj_start, (num_nodes + 1) * sizeof(int), cudaMemcpyHostToDevice);
     cudaMemcpy(d_adj_list, h_adj_list, total * sizeof(int), cudaMemcpyHostToDevice);
@@ -205,25 +223,16 @@ extern "C" void solution_init(int num_nodes, int num_arcs,
 
     free(h_arc_head); free(h_arc_opp); free(h_init_cap);
     free(h_adj_start); free(h_adj_list);
-}
 
-extern "C" void solution_compute(int num_nodes, int* max_flow_out)
-{
+    g_source = source;
+    g_sink = sink;
+
     push_relabel_kernel<<<1, 1>>>(
-        g_num_nodes, g_num_total_arcs, g_source, g_sink,
+        num_nodes, total, source, sink,
         d_adj_start, d_adj_list, d_arc_head, d_arc_opposite,
         d_residual, d_initial_cap, d_excess, d_height, d_first_arc,
         d_bfs_queue, d_in_queue, d_result);
 
     cudaMemcpy(max_flow_out, d_result, sizeof(int), cudaMemcpyDeviceToHost);
-}
-
-extern "C" void solution_free(void)
-{
-    cudaFree(d_adj_start);    cudaFree(d_adj_list);
-    cudaFree(d_arc_head);     cudaFree(d_arc_opposite);
-    cudaFree(d_residual);     cudaFree(d_initial_cap);
-    cudaFree(d_excess);       cudaFree(d_height);
-    cudaFree(d_first_arc);    cudaFree(d_bfs_queue);
-    cudaFree(d_in_queue);     cudaFree(d_result);
+    cudaDeviceSynchronize();
 }
diff --git a/tasks/monte_carlo/gpu_baseline.cu b/tasks/monte_carlo/gpu_baseline.cu
index 232d0a5..959a8fd 100644
--- a/tasks/monte_carlo/gpu_baseline.cu
+++ b/tasks/monte_carlo/gpu_baseline.cu
@@ -203,11 +203,17 @@ static float   g_dt = 0.0f;
 static unsigned int g_baseSeed = 0;
 static monteCarloOptionStruct g_optionStruct;
 
-extern "C" void solution_init(int N, int num_steps, float risk_free, float volatility,
-                               float strike, float spot, float time_to_maturity,
-                               unsigned int base_seed)
+extern "C" void solution_free(void)
+{
+    if (d_samplePrices) { cudaFree(d_samplePrices); d_samplePrices = nullptr; }
+    g_N = 0;
+}
+
+extern "C" void solution_compute(int N, int num_steps, float risk_free, float volatility,
+                                  float strike, float spot, float time_to_maturity,
+                                  unsigned int base_seed,
+                                  float* samplePrices)
 {
-    g_N = N;
     g_seqLen = num_steps;
     g_dt = 1.0f / (float)num_steps;
     g_baseSeed = base_seed;
@@ -219,13 +225,12 @@ extern "C" void solution_init(int N, int num_steps, float risk_free, float volat
     g_optionStruct.strikeVal = strike;
     g_optionStruct.discountVal = expf(-risk_free * time_to_maturity);
 
-    // Allocate GPU output buffer (persistent across calls)
-    if (d_samplePrices) cudaFree(d_samplePrices);
-    cudaMalloc(&d_samplePrices, N * sizeof(float));
-}
+    if (g_N != N) {
+        solution_free();
+        cudaMalloc(&d_samplePrices, N * sizeof(float));
+        g_N = N;
+    }
 
-extern "C" void solution_compute(int N, float* samplePrices)
-{
     // Launch: one thread per sample
     int threadsPerBlock = 256;
     int numBlocks = (N + threadsPerBlock - 1) / threadsPerBlock;
@@ -236,9 +241,5 @@ extern "C" void solution_compute(int N, float* samplePrices)
 
     // Download results
     cudaMemcpy(samplePrices, d_samplePrices, N * sizeof(float), cudaMemcpyDeviceToHost);
-}
-
-extern "C" void solution_free(void)
-{
-    if (d_samplePrices) { cudaFree(d_samplePrices); d_samplePrices = nullptr; }
+    cudaDeviceSynchronize();
 }
diff --git a/tasks/pdlp/gpu_baseline.cu b/tasks/pdlp/gpu_baseline.cu
index cfe9130..ee0656f 100644
--- a/tasks/pdlp/gpu_baseline.cu
+++ b/tasks/pdlp/gpu_baseline.cu
@@ -135,36 +135,69 @@ static float* d_primal_avg = nullptr;
 static float* d_x_bar = nullptr;
 static float* d_Ax_bar = nullptr;
 
-extern "C" void solution_init(int          num_vars,
-                              int          num_constraints,
-                              int          nnz,
-                              int          num_iters,
-                              const float* obj,
-                              const float* var_lb,
-                              const float* var_ub,
-                              const float* con_lb,
-                              const float* con_ub,
-                              const int*   col_ptrs,
-                              const int*   row_indices,
-                              const float* values,
-                              float        step_size,
-                              float        primal_weight)
+static int g_nnz = 0;
+
+extern "C" void solution_free(void)
+{
+    if (d_obj)          { cudaFree(d_obj);          d_obj = nullptr; }
+    if (d_var_lb)       { cudaFree(d_var_lb);       d_var_lb = nullptr; }
+    if (d_var_ub)       { cudaFree(d_var_ub);       d_var_ub = nullptr; }
+    if (d_con_lb)       { cudaFree(d_con_lb);       d_con_lb = nullptr; }
+    if (d_con_ub)       { cudaFree(d_con_ub);       d_con_ub = nullptr; }
+    if (d_col_ptrs)     { cudaFree(d_col_ptrs);     d_col_ptrs = nullptr; }
+    if (d_row_indices)  { cudaFree(d_row_indices);  d_row_indices = nullptr; }
+    if (d_values)       { cudaFree(d_values);       d_values = nullptr; }
+    if (d_primal)       { cudaFree(d_primal);       d_primal = nullptr; }
+    if (d_dual)         { cudaFree(d_dual);         d_dual = nullptr; }
+    if (d_dual_product) { cudaFree(d_dual_product); d_dual_product = nullptr; }
+    if (d_primal_avg)   { cudaFree(d_primal_avg);   d_primal_avg = nullptr; }
+    if (d_x_bar)        { cudaFree(d_x_bar);        d_x_bar = nullptr; }
+    if (d_Ax_bar)       { cudaFree(d_Ax_bar);       d_Ax_bar = nullptr; }
+    g_num_vars = 0;
+    g_num_constraints = 0;
+    g_nnz = 0;
+}
+
+extern "C" void solution_compute(int          num_vars,
+                                 int          num_constraints,
+                                 int          nnz,
+                                 int          num_iters,
+                                 const float* obj,
+                                 const float* var_lb,
+                                 const float* var_ub,
+                                 const float* con_lb,
+                                 const float* con_ub,
+                                 const int*   col_ptrs,
+                                 const int*   row_indices,
+                                 const float* values,
+                                 float        step_size,
+                                 float        primal_weight,
+                                 float*       primal_out)
 {
-    g_num_vars        = num_vars;
-    g_num_constraints = num_constraints;
     g_num_iters       = num_iters;
     g_step_size       = step_size;
     g_primal_weight   = primal_weight;
 
-    // Upload LP data
-    cudaMalloc(&d_obj,         num_vars * sizeof(float));
-    cudaMalloc(&d_var_lb,      num_vars * sizeof(float));
-    cudaMalloc(&d_var_ub,      num_vars * sizeof(float));
-    cudaMalloc(&d_con_lb,      num_constraints * sizeof(float));
-    cudaMalloc(&d_con_ub,      num_constraints * sizeof(float));
-    cudaMalloc(&d_col_ptrs,    (num_vars + 1) * sizeof(int));
-    cudaMalloc(&d_row_indices, nnz * sizeof(int));
-    cudaMalloc(&d_values,      nnz * sizeof(float));
+    if (g_num_vars != num_vars || g_num_constraints != num_constraints || g_nnz != nnz) {
+        solution_free();
+        cudaMalloc(&d_obj,         num_vars * sizeof(float));
+        cudaMalloc(&d_var_lb,      num_vars * sizeof(float));
+        cudaMalloc(&d_var_ub,      num_vars * sizeof(float));
+        cudaMalloc(&d_con_lb,      num_constraints * sizeof(float));
+        cudaMalloc(&d_con_ub,      num_constraints * sizeof(float));
+        cudaMalloc(&d_col_ptrs,    (num_vars + 1) * sizeof(int));
+        cudaMalloc(&d_row_indices, nnz * sizeof(int));
+        cudaMalloc(&d_values,      nnz * sizeof(float));
+        cudaMalloc(&d_primal,       num_vars * sizeof(float));
+        cudaMalloc(&d_dual,         num_constraints * sizeof(float));
+        cudaMalloc(&d_dual_product, num_vars * sizeof(float));
+        cudaMalloc(&d_primal_avg,   num_vars * sizeof(float));
+        cudaMalloc(&d_x_bar,        num_vars * sizeof(float));
+        cudaMalloc(&d_Ax_bar,       num_constraints * sizeof(float));
+        g_num_vars = num_vars;
+        g_num_constraints = num_constraints;
+        g_nnz = nnz;
+    }
 
     cudaMemcpy(d_obj,         obj,         num_vars * sizeof(float), cudaMemcpyHostToDevice);
     cudaMemcpy(d_var_lb,      var_lb,      num_vars * sizeof(float), cudaMemcpyHostToDevice);
@@ -175,18 +208,6 @@ extern "C" void solution_init(int          num_vars,
     cudaMemcpy(d_row_indices, row_indices, nnz * sizeof(int), cudaMemcpyHostToDevice);
     cudaMemcpy(d_values,      values,      nnz * sizeof(float), cudaMemcpyHostToDevice);
 
-    // Allocate working buffers
-    cudaMalloc(&d_primal,       num_vars * sizeof(float));
-    cudaMalloc(&d_dual,         num_constraints * sizeof(float));
-    cudaMalloc(&d_dual_product, num_vars * sizeof(float));
-    cudaMalloc(&d_primal_avg,   num_vars * sizeof(float));
-    cudaMalloc(&d_x_bar,        num_vars * sizeof(float));
-    cudaMalloc(&d_Ax_bar,       num_constraints * sizeof(float));
-}
-
-extern "C" void solution_compute(int num_vars, int num_constraints,
-                                 float* primal_out)
-{
     int bv = 256;
     int gv = (num_vars + bv - 1) / bv;
     int gc = (num_constraints + bv - 1) / bv;
@@ -198,10 +219,10 @@ extern "C" void solution_compute(int num_vars, int num_constraints,
     cudaMemset(d_primal_avg,   0, num_vars * sizeof(float));
 
     float avg_weight_sum = 0.0f;
-    float primal_step_size = g_step_size / g_primal_weight;
-    float dual_step_size   = g_step_size * g_primal_weight;
+    float primal_step_size = step_size / primal_weight;
+    float dual_step_size   = step_size * primal_weight;
 
-    for (int iter = 0; iter < g_num_iters; iter++) {
+    for (int iter = 0; iter < num_iters; iter++) {
         // 1. Primal update + extrapolation
         kernel_primal_update<<<gv, bv>>>(num_vars, primal_step_size,
             d_obj, d_dual_product, d_var_lb, d_var_ub, d_primal, d_x_bar);
@@ -220,29 +241,12 @@ extern "C" void solution_compute(int num_vars, int num_constraints,
                                     d_values, d_dual, d_dual_product);
 
         // 5. Weighted average update
-        float ratio = g_step_size / (avg_weight_sum + g_step_size);
+        float ratio = step_size / (avg_weight_sum + step_size);
         kernel_avg_update<<<gv, bv>>>(num_vars, ratio, d_primal, d_primal_avg);
-        avg_weight_sum += g_step_size;
+        avg_weight_sum += step_size;
     }
 
     cudaMemcpy(primal_out, d_primal_avg, num_vars * sizeof(float),
                cudaMemcpyDeviceToHost);
-}
-
-extern "C" void solution_free(void)
-{
-    cudaFree(d_obj);         d_obj = nullptr;
-    cudaFree(d_var_lb);      d_var_lb = nullptr;
-    cudaFree(d_var_ub);      d_var_ub = nullptr;
-    cudaFree(d_con_lb);      d_con_lb = nullptr;
-    cudaFree(d_con_ub);      d_con_ub = nullptr;
-    cudaFree(d_col_ptrs);    d_col_ptrs = nullptr;
-    cudaFree(d_row_indices); d_row_indices = nullptr;
-    cudaFree(d_values);      d_values = nullptr;
-    cudaFree(d_primal);      d_primal = nullptr;
-    cudaFree(d_dual);        d_dual = nullptr;
-    cudaFree(d_dual_product); d_dual_product = nullptr;
-    cudaFree(d_primal_avg);  d_primal_avg = nullptr;
-    cudaFree(d_x_bar);       d_x_bar = nullptr;
-    cudaFree(d_Ax_bar);      d_Ax_bar = nullptr;
+    cudaDeviceSynchronize();
 }
diff --git a/tasks/repo_pricing/gpu_baseline.cu b/tasks/repo_pricing/gpu_baseline.cu
index ed84bb9..540de3d 100644
--- a/tasks/repo_pricing/gpu_baseline.cu
+++ b/tasks/repo_pricing/gpu_baseline.cu
@@ -1153,40 +1153,69 @@ static dataType* d_bondForwardVal = NULL;
 
 extern "C" {
 
-void solution_init(int N,
-                   const int* settle_year, const int* settle_month, const int* settle_day,
-                   const int* delivery_year, const int* delivery_month, const int* delivery_day,
-                   const int* issue_year, const int* issue_month, const int* issue_day,
-                   const int* maturity_year, const int* maturity_month, const int* maturity_day,
-                   const float* bond_rates, const float* repo_rates,
-                   const float* bond_clean_prices, const float* dummy_strikes)
+void solution_free(void)
+{
+	if (d_discountCurve)             { cudaFree(d_discountCurve);             d_discountCurve             = NULL; }
+	if (d_repoCurve)                 { cudaFree(d_repoCurve);                 d_repoCurve                 = NULL; }
+	if (d_settlementDate)            { cudaFree(d_settlementDate);            d_settlementDate            = NULL; }
+	if (d_deliveryDate)              { cudaFree(d_deliveryDate);              d_deliveryDate              = NULL; }
+	if (d_maturityDate)              { cudaFree(d_maturityDate);              d_maturityDate              = NULL; }
+	if (d_repoDeliveryDate)          { cudaFree(d_repoDeliveryDate);          d_repoDeliveryDate          = NULL; }
+	if (d_bondCleanPrice)            { cudaFree(d_bondCleanPrice);            d_bondCleanPrice            = NULL; }
+	if (d_bond)                      { cudaFree(d_bond);                      d_bond                      = NULL; }
+	if (d_dummyStrike)               { cudaFree(d_dummyStrike);               d_dummyStrike               = NULL; }
+	if (d_dirtyPrice)                { cudaFree(d_dirtyPrice);                d_dirtyPrice                = NULL; }
+	if (d_accruedAmountSettlement)   { cudaFree(d_accruedAmountSettlement);   d_accruedAmountSettlement   = NULL; }
+	if (d_accruedAmountDeliveryDate) { cudaFree(d_accruedAmountDeliveryDate); d_accruedAmountDeliveryDate = NULL; }
+	if (d_cleanPrice)                { cudaFree(d_cleanPrice);                d_cleanPrice                = NULL; }
+	if (d_forwardSpotIncome)         { cudaFree(d_forwardSpotIncome);         d_forwardSpotIncome         = NULL; }
+	if (d_underlyingBondFwd)         { cudaFree(d_underlyingBondFwd);         d_underlyingBondFwd         = NULL; }
+	if (d_repoNpv)                   { cudaFree(d_repoNpv);                   d_repoNpv                   = NULL; }
+	if (d_repoCleanForwardPrice)     { cudaFree(d_repoCleanForwardPrice);     d_repoCleanForwardPrice     = NULL; }
+	if (d_repoDirtyForwardPrice)     { cudaFree(d_repoDirtyForwardPrice);     d_repoDirtyForwardPrice     = NULL; }
+	if (d_repoImpliedYield)          { cudaFree(d_repoImpliedYield);          d_repoImpliedYield          = NULL; }
+	if (d_marketRepoRate)            { cudaFree(d_marketRepoRate);            d_marketRepoRate            = NULL; }
+	if (d_bondForwardVal)            { cudaFree(d_bondForwardVal);            d_bondForwardVal            = NULL; }
+	g_N = 0;
+}
+
+void solution_compute(int N,
+                      const int* settle_year, const int* settle_month, const int* settle_day,
+                      const int* delivery_year, const int* delivery_month, const int* delivery_day,
+                      const int* issue_year, const int* issue_month, const int* issue_day,
+                      const int* maturity_year, const int* maturity_month, const int* maturity_day,
+                      const float* bond_rates, const float* repo_rates,
+                      const float* bond_clean_prices, const float* dummy_strikes,
+                      float* prices)
 {
-	g_N = N;
-
-	// Allocate device memory for inArgs
-	cudaMalloc(&d_discountCurve, N * sizeof(repoYieldTermStruct));
-	cudaMalloc(&d_repoCurve, N * sizeof(repoYieldTermStruct));
-	cudaMalloc(&d_settlementDate, N * sizeof(repoDateStruct));
-	cudaMalloc(&d_deliveryDate, N * sizeof(repoDateStruct));
-	cudaMalloc(&d_maturityDate, N * sizeof(repoDateStruct));
-	cudaMalloc(&d_repoDeliveryDate, N * sizeof(repoDateStruct));
-	cudaMalloc(&d_bondCleanPrice, N * sizeof(dataType));
-	cudaMalloc(&d_bond, N * sizeof(bondStruct));
-	cudaMalloc(&d_dummyStrike, N * sizeof(dataType));
-
-	// Allocate device memory for results
-	cudaMalloc(&d_dirtyPrice, N * sizeof(dataType));
-	cudaMalloc(&d_accruedAmountSettlement, N * sizeof(dataType));
-	cudaMalloc(&d_accruedAmountDeliveryDate, N * sizeof(dataType));
-	cudaMalloc(&d_cleanPrice, N * sizeof(dataType));
-	cudaMalloc(&d_forwardSpotIncome, N * sizeof(dataType));
-	cudaMalloc(&d_underlyingBondFwd, N * sizeof(dataType));
-	cudaMalloc(&d_repoNpv, N * sizeof(dataType));
-	cudaMalloc(&d_repoCleanForwardPrice, N * sizeof(dataType));
-	cudaMalloc(&d_repoDirtyForwardPrice, N * sizeof(dataType));
-	cudaMalloc(&d_repoImpliedYield, N * sizeof(dataType));
-	cudaMalloc(&d_marketRepoRate, N * sizeof(dataType));
-	cudaMalloc(&d_bondForwardVal, N * sizeof(dataType));
+	if (g_N != N) {
+		solution_free();
+		// Allocate device memory for inArgs
+		cudaMalloc(&d_discountCurve, N * sizeof(repoYieldTermStruct));
+		cudaMalloc(&d_repoCurve, N * sizeof(repoYieldTermStruct));
+		cudaMalloc(&d_settlementDate, N * sizeof(repoDateStruct));
+		cudaMalloc(&d_deliveryDate, N * sizeof(repoDateStruct));
+		cudaMalloc(&d_maturityDate, N * sizeof(repoDateStruct));
+		cudaMalloc(&d_repoDeliveryDate, N * sizeof(repoDateStruct));
+		cudaMalloc(&d_bondCleanPrice, N * sizeof(dataType));
+		cudaMalloc(&d_bond, N * sizeof(bondStruct));
+		cudaMalloc(&d_dummyStrike, N * sizeof(dataType));
+
+		// Allocate device memory for results
+		cudaMalloc(&d_dirtyPrice, N * sizeof(dataType));
+		cudaMalloc(&d_accruedAmountSettlement, N * sizeof(dataType));
+		cudaMalloc(&d_accruedAmountDeliveryDate, N * sizeof(dataType));
+		cudaMalloc(&d_cleanPrice, N * sizeof(dataType));
+		cudaMalloc(&d_forwardSpotIncome, N * sizeof(dataType));
+		cudaMalloc(&d_underlyingBondFwd, N * sizeof(dataType));
+		cudaMalloc(&d_repoNpv, N * sizeof(dataType));
+		cudaMalloc(&d_repoCleanForwardPrice, N * sizeof(dataType));
+		cudaMalloc(&d_repoDirtyForwardPrice, N * sizeof(dataType));
+		cudaMalloc(&d_repoImpliedYield, N * sizeof(dataType));
+		cudaMalloc(&d_marketRepoRate, N * sizeof(dataType));
+		cudaMalloc(&d_bondForwardVal, N * sizeof(dataType));
+		g_N = N;
+	}
 
 	// Build host inArgs (matching repoEngine.c / cpu_reference.c setup)
 	repoYieldTermStruct* h_discountCurve = (repoYieldTermStruct*)malloc(N * sizeof(repoYieldTermStruct));
@@ -1260,10 +1289,7 @@ void solution_init(int N,
 	free(h_bondCleanPrice);
 	free(h_bond);
 	free(h_dummyStrike);
-}
 
-void solution_compute(int N, float* prices)
-{
 	// Build device structs
 	inArgsStruct inArgs;
 	inArgs.discountCurve = d_discountCurve;
@@ -1322,54 +1348,8 @@ void solution_compute(int N, float* prices)
 
 	for (int r = 0; r < 12; r++)
 		free(h_res[r]);
-}
 
-void solution_free(void)
-{
-	cudaFree(d_discountCurve);
-	cudaFree(d_repoCurve);
-	cudaFree(d_settlementDate);
-	cudaFree(d_deliveryDate);
-	cudaFree(d_maturityDate);
-	cudaFree(d_repoDeliveryDate);
-	cudaFree(d_bondCleanPrice);
-	cudaFree(d_bond);
-	cudaFree(d_dummyStrike);
-
-	cudaFree(d_dirtyPrice);
-	cudaFree(d_accruedAmountSettlement);
-	cudaFree(d_accruedAmountDeliveryDate);
-	cudaFree(d_cleanPrice);
-	cudaFree(d_forwardSpotIncome);
-	cudaFree(d_underlyingBondFwd);
-	cudaFree(d_repoNpv);
-	cudaFree(d_repoCleanForwardPrice);
-	cudaFree(d_repoDirtyForwardPrice);
-	cudaFree(d_repoImpliedYield);
-	cudaFree(d_marketRepoRate);
-	cudaFree(d_bondForwardVal);
-
-	d_discountCurve = NULL;
-	d_repoCurve = NULL;
-	d_settlementDate = NULL;
-	d_deliveryDate = NULL;
-	d_maturityDate = NULL;
-	d_repoDeliveryDate = NULL;
-	d_bondCleanPrice = NULL;
-	d_bond = NULL;
-	d_dummyStrike = NULL;
-	d_dirtyPrice = NULL;
-	d_accruedAmountSettlement = NULL;
-	d_accruedAmountDeliveryDate = NULL;
-	d_cleanPrice = NULL;
-	d_forwardSpotIncome = NULL;
-	d_underlyingBondFwd = NULL;
-	d_repoNpv = NULL;
-	d_repoCleanForwardPrice = NULL;
-	d_repoDirtyForwardPrice = NULL;
-	d_repoImpliedYield = NULL;
-	d_marketRepoRate = NULL;
-	d_bondForwardVal = NULL;
+	cudaDeviceSynchronize();
 }
 
 } // extern "C"
diff --git a/tasks/sph_cell_index/gpu_baseline.cu b/tasks/sph_cell_index/gpu_baseline.cu
index 4d1b71d..3853cb7 100644
--- a/tasks/sph_cell_index/gpu_baseline.cu
+++ b/tasks/sph_cell_index/gpu_baseline.cu
@@ -97,46 +97,61 @@ __global__ void KerCalcBeginEndCell(unsigned int n,
 
 extern "C" {
 
-void solution_init(int N,
-                   const float* xs, const float* ys, const float* zs,
-                   float cell_size, int grid_nx, int grid_ny, int grid_nz)
+static int g_num_cells = 0;
+
+void solution_free(void)
 {
-    g_N = N;
-    g_cell_size = cell_size;
-    g_grid_nx = grid_nx;
-    g_grid_ny = grid_ny;
-    g_grid_nz = grid_nz;
+    if (d_xs)         { cudaFree(d_xs);         d_xs         = nullptr; }
+    if (d_ys)         { cudaFree(d_ys);         d_ys         = nullptr; }
+    if (d_zs)         { cudaFree(d_zs);         d_zs         = nullptr; }
+    if (d_cell_ids)   { cudaFree(d_cell_ids);   d_cell_ids   = nullptr; }
+    if (d_indices)    { cudaFree(d_indices);    d_indices    = nullptr; }
+    if (d_cell_begin) { cudaFree(d_cell_begin); d_cell_begin = nullptr; }
+    if (d_cell_end)   { cudaFree(d_cell_end);   d_cell_end   = nullptr; }
+    g_N = 0;
+    g_num_cells = 0;
+}
 
+void solution_compute(int N,
+                      const float* xs, const float* ys, const float* zs,
+                      float cell_size,
+                      int grid_nx, int grid_ny, int grid_nz,
+                      int num_cells,
+                      int* sorted_indices, int* cell_begin, int* cell_end)
+{
     size_t sz_f = (size_t)N * sizeof(float);
     size_t sz_i = (size_t)N * sizeof(int);
-    int num_cells = grid_nx * grid_ny * grid_nz;
     size_t sz_cells = (size_t)num_cells * sizeof(int);
 
-    // Allocate and copy input arrays to device
-    cudaMalloc(&d_xs, sz_f);
-    cudaMalloc(&d_ys, sz_f);
-    cudaMalloc(&d_zs, sz_f);
+    if (g_N != N || g_num_cells != num_cells) {
+        solution_free();
+        cudaMalloc(&d_xs, sz_f);
+        cudaMalloc(&d_ys, sz_f);
+        cudaMalloc(&d_zs, sz_f);
+        cudaMalloc(&d_cell_ids, sz_i);
+        cudaMalloc(&d_indices, sz_i);
+        cudaMalloc(&d_cell_begin, sz_cells);
+        cudaMalloc(&d_cell_end, sz_cells);
+        g_N = N;
+        g_num_cells = num_cells;
+    }
+
+    g_cell_size = cell_size;
+    g_grid_nx = grid_nx;
+    g_grid_ny = grid_ny;
+    g_grid_nz = grid_nz;
+
     cudaMemcpy(d_xs, xs, sz_f, cudaMemcpyHostToDevice);
     cudaMemcpy(d_ys, ys, sz_f, cudaMemcpyHostToDevice);
     cudaMemcpy(d_zs, zs, sz_f, cudaMemcpyHostToDevice);
 
-    // Allocate intermediate and output arrays on device
-    cudaMalloc(&d_cell_ids, sz_i);
-    cudaMalloc(&d_indices, sz_i);
-    cudaMalloc(&d_cell_begin, sz_cells);
-    cudaMalloc(&d_cell_end, sz_cells);
-}
-
-void solution_compute(int N, int num_cells,
-                      int* sorted_indices, int* cell_begin, int* cell_end)
-{
     dim3 block(BLOCK_SIZE);
     dim3 grid((N + BLOCK_SIZE - 1) / BLOCK_SIZE);
 
     // Step 1: Compute cell_id for each particle
     KerComputeCellIndex<<<grid, block>>>(
         N, d_xs, d_ys, d_zs,
-        g_cell_size, g_grid_nx, g_grid_ny,
+        cell_size, grid_nx, grid_ny,
         d_cell_ids, d_indices);
 
     // Step 2: Sort by cell_id using thrust::sort_by_key
@@ -158,17 +173,7 @@ void solution_compute(int N, int num_cells,
     cudaMemcpy(sorted_indices, d_indices,    (size_t)N * sizeof(int), cudaMemcpyDeviceToHost);
     cudaMemcpy(cell_begin,     d_cell_begin, (size_t)num_cells * sizeof(int), cudaMemcpyDeviceToHost);
     cudaMemcpy(cell_end,       d_cell_end,   (size_t)num_cells * sizeof(int), cudaMemcpyDeviceToHost);
-}
-
-void solution_free(void)
-{
-    cudaFree(d_xs);
-    cudaFree(d_ys);
-    cudaFree(d_zs);
-    cudaFree(d_cell_ids);
-    cudaFree(d_indices);
-    cudaFree(d_cell_begin);
-    cudaFree(d_cell_end);
+    cudaDeviceSynchronize();
 }
 
 } // extern "C"
diff --git a/tasks/sph_forces/gpu_baseline.cu b/tasks/sph_forces/gpu_baseline.cu
index aa86aef..68a4afd 100644
--- a/tasks/sph_forces/gpu_baseline.cu
+++ b/tasks/sph_forces/gpu_baseline.cu
@@ -231,25 +231,40 @@ __global__ void InteractionForcesFluidKernel(
 
 extern "C" {
 
-void solution_init(int N,
-                   const float* xs, const float* ys, const float* zs,
-                   const float* vxs, const float* vys, const float* vzs,
-                   const float* rhos, const float* masses,
-                   float h, float cs0, float rhop0, float alpha_visc,
-                   const int* cell_begin, const int* cell_end,
-                   const int* sorted_idx,
-                   int grid_nx, int grid_ny, int grid_nz,
-                   float cell_size)
+static int g_num_cells = 0;
+
+void solution_free(void)
 {
-    g_N = N;
-    g_grid_nx = grid_nx;
-    g_grid_ny = grid_ny;
-    g_grid_nz = grid_nz;
-    g_cell_size = cell_size;
-    g_h = h;
-    g_cs0 = cs0;
-    g_alpha_visc = alpha_visc;
+    if (d_xs)         { cudaFree(d_xs);         d_xs         = nullptr; }
+    if (d_ys)         { cudaFree(d_ys);         d_ys         = nullptr; }
+    if (d_zs)         { cudaFree(d_zs);         d_zs         = nullptr; }
+    if (d_vxs)        { cudaFree(d_vxs);        d_vxs        = nullptr; }
+    if (d_vys)        { cudaFree(d_vys);        d_vys        = nullptr; }
+    if (d_vzs)        { cudaFree(d_vzs);        d_vzs        = nullptr; }
+    if (d_rhos)       { cudaFree(d_rhos);       d_rhos       = nullptr; }
+    if (d_masses)     { cudaFree(d_masses);     d_masses     = nullptr; }
+    if (d_cell_begin) { cudaFree(d_cell_begin); d_cell_begin = nullptr; }
+    if (d_cell_end)   { cudaFree(d_cell_end);   d_cell_end   = nullptr; }
+    if (d_sorted_idx) { cudaFree(d_sorted_idx); d_sorted_idx = nullptr; }
+    if (d_ax)         { cudaFree(d_ax);         d_ax         = nullptr; }
+    if (d_ay)         { cudaFree(d_ay);         d_ay         = nullptr; }
+    if (d_az)         { cudaFree(d_az);         d_az         = nullptr; }
+    if (d_drhodt)     { cudaFree(d_drhodt);     d_drhodt     = nullptr; }
+    g_N = 0;
+    g_num_cells = 0;
+}
 
+void solution_compute(int N,
+                      const float* xs, const float* ys, const float* zs,
+                      const float* vxs, const float* vys, const float* vzs,
+                      const float* rhos, const float* masses,
+                      float h, float cs0, float rhop0, float alpha_visc,
+                      const int* cell_begin, const int* cell_end,
+                      const int* sorted_idx,
+                      int grid_nx, int grid_ny, int grid_nz,
+                      float cell_size,
+                      float* ax, float* ay, float* az, float* drhodt)
+{
     int num_cells = grid_nx * grid_ny * grid_nz;
 
     // Precompute Wendland kernel constants (3D)
@@ -259,22 +274,38 @@ void solution_init(int N,
     g_cteb = cs0 * cs0 * rhop0 / SPH_GAMMA;
     g_ovrhopzero = 1.0f / rhop0;
 
+    g_grid_nx = grid_nx;
+    g_grid_ny = grid_ny;
+    g_grid_nz = grid_nz;
+    g_cell_size = cell_size;
+    g_h = h;
+    g_cs0 = cs0;
+    g_alpha_visc = alpha_visc;
+
     size_t sz_f = (size_t)N * sizeof(float);
     size_t sz_i_n = (size_t)N * sizeof(int);
     size_t sz_i_c = (size_t)num_cells * sizeof(int);
 
-    // Allocate and copy input arrays to device
-    cudaMalloc(&d_xs, sz_f);
-    cudaMalloc(&d_ys, sz_f);
-    cudaMalloc(&d_zs, sz_f);
-    cudaMalloc(&d_vxs, sz_f);
-    cudaMalloc(&d_vys, sz_f);
-    cudaMalloc(&d_vzs, sz_f);
-    cudaMalloc(&d_rhos, sz_f);
-    cudaMalloc(&d_masses, sz_f);
-    cudaMalloc(&d_cell_begin, sz_i_c);
-    cudaMalloc(&d_cell_end, sz_i_c);
-    cudaMalloc(&d_sorted_idx, sz_i_n);
+    if (g_N != N || g_num_cells != num_cells) {
+        solution_free();
+        cudaMalloc(&d_xs, sz_f);
+        cudaMalloc(&d_ys, sz_f);
+        cudaMalloc(&d_zs, sz_f);
+        cudaMalloc(&d_vxs, sz_f);
+        cudaMalloc(&d_vys, sz_f);
+        cudaMalloc(&d_vzs, sz_f);
+        cudaMalloc(&d_rhos, sz_f);
+        cudaMalloc(&d_masses, sz_f);
+        cudaMalloc(&d_cell_begin, sz_i_c);
+        cudaMalloc(&d_cell_end, sz_i_c);
+        cudaMalloc(&d_sorted_idx, sz_i_n);
+        cudaMalloc(&d_ax, sz_f);
+        cudaMalloc(&d_ay, sz_f);
+        cudaMalloc(&d_az, sz_f);
+        cudaMalloc(&d_drhodt, sz_f);
+        g_N = N;
+        g_num_cells = num_cells;
+    }
 
     cudaMemcpy(d_xs, xs, sz_f, cudaMemcpyHostToDevice);
     cudaMemcpy(d_ys, ys, sz_f, cudaMemcpyHostToDevice);
@@ -288,16 +319,6 @@ void solution_init(int N,
     cudaMemcpy(d_cell_end, cell_end, sz_i_c, cudaMemcpyHostToDevice);
     cudaMemcpy(d_sorted_idx, sorted_idx, sz_i_n, cudaMemcpyHostToDevice);
 
-    // Allocate output arrays on device
-    cudaMalloc(&d_ax, sz_f);
-    cudaMalloc(&d_ay, sz_f);
-    cudaMalloc(&d_az, sz_f);
-    cudaMalloc(&d_drhodt, sz_f);
-}
-
-void solution_compute(int N,
-                      float* ax, float* ay, float* az, float* drhodt)
-{
     dim3 block(BLOCK_SIZE);
     dim3 grid((N + BLOCK_SIZE - 1) / BLOCK_SIZE);
 
@@ -307,37 +328,17 @@ void solution_compute(int N,
         d_vxs, d_vys, d_vzs,
         d_rhos, d_masses,
         d_cell_begin, d_cell_end, d_sorted_idx,
-        g_grid_nx, g_grid_ny, g_grid_nz,
-        g_cell_size,
-        g_h, g_bwenh, g_kernelsize2,
-        g_cs0, g_alpha_visc, g_ovrhopzero, g_cteb,
+        grid_nx, grid_ny, grid_nz,
+        cell_size,
+        h, g_bwenh, g_kernelsize2,
+        cs0, alpha_visc, g_ovrhopzero, g_cteb,
         d_ax, d_ay, d_az, d_drhodt);
 
-    // Copy results back to host
-    size_t sz_f = (size_t)N * sizeof(float);
     cudaMemcpy(ax,     d_ax,     sz_f, cudaMemcpyDeviceToHost);
     cudaMemcpy(ay,     d_ay,     sz_f, cudaMemcpyDeviceToHost);
     cudaMemcpy(az,     d_az,     sz_f, cudaMemcpyDeviceToHost);
     cudaMemcpy(drhodt, d_drhodt, sz_f, cudaMemcpyDeviceToHost);
-}
-
-void solution_free(void)
-{
-    cudaFree(d_xs);
-    cudaFree(d_ys);
-    cudaFree(d_zs);
-    cudaFree(d_vxs);
-    cudaFree(d_vys);
-    cudaFree(d_vzs);
-    cudaFree(d_rhos);
-    cudaFree(d_masses);
-    cudaFree(d_cell_begin);
-    cudaFree(d_cell_end);
-    cudaFree(d_sorted_idx);
-    cudaFree(d_ax);
-    cudaFree(d_ay);
-    cudaFree(d_az);
-    cudaFree(d_drhodt);
+    cudaDeviceSynchronize();
 }
 
 } // extern "C"
diff --git a/tasks/sph_position/gpu_baseline.cu b/tasks/sph_position/gpu_baseline.cu
index 8c0fddb..e26d805 100644
--- a/tasks/sph_position/gpu_baseline.cu
+++ b/tasks/sph_position/gpu_baseline.cu
@@ -70,27 +70,50 @@ __global__ void KerComputeStepPos(int N,
 
 extern "C" {
 
-void solution_init(int N,
-                   const float* posxy_x, const float* posxy_y,
-                   const float* posz,
-                   const float* movxy_x, const float* movxy_y,
-                   const float* movz,
-                   float cell_size)
+void solution_free(void)
 {
-    g_N = N;
-    g_cell_size = cell_size;
+    if (d_pos_x)    { cudaFree(d_pos_x);    d_pos_x    = nullptr; }
+    if (d_pos_y)    { cudaFree(d_pos_y);    d_pos_y    = nullptr; }
+    if (d_pos_z)    { cudaFree(d_pos_z);    d_pos_z    = nullptr; }
+    if (d_mov_x)    { cudaFree(d_mov_x);    d_mov_x    = nullptr; }
+    if (d_mov_y)    { cudaFree(d_mov_y);    d_mov_y    = nullptr; }
+    if (d_mov_z)    { cudaFree(d_mov_z);    d_mov_z    = nullptr; }
+    if (d_out_x)    { cudaFree(d_out_x);    d_out_x    = nullptr; }
+    if (d_out_y)    { cudaFree(d_out_y);    d_out_y    = nullptr; }
+    if (d_out_z)    { cudaFree(d_out_z);    d_out_z    = nullptr; }
+    if (d_out_cell) { cudaFree(d_out_cell); d_out_cell = nullptr; }
+    g_N = 0;
+}
 
+void solution_compute(int N,
+                      const float* posxy_x, const float* posxy_y,
+                      const float* posz,
+                      const float* movxy_x, const float* movxy_y,
+                      const float* movz,
+                      float cell_size,
+                      double* out_x, double* out_y, double* out_z,
+                      int* out_cell)
+{
     size_t sz_f = (size_t)N * sizeof(float);
     size_t sz_d = (size_t)N * sizeof(double);
     size_t sz_i = (size_t)N * sizeof(int);
 
-    // Allocate and copy input arrays to device
-    cudaMalloc(&d_pos_x, sz_f);
-    cudaMalloc(&d_pos_y, sz_f);
-    cudaMalloc(&d_pos_z, sz_f);
-    cudaMalloc(&d_mov_x, sz_f);
-    cudaMalloc(&d_mov_y, sz_f);
-    cudaMalloc(&d_mov_z, sz_f);
+    if (g_N != N) {
+        solution_free();
+        cudaMalloc(&d_pos_x, sz_f);
+        cudaMalloc(&d_pos_y, sz_f);
+        cudaMalloc(&d_pos_z, sz_f);
+        cudaMalloc(&d_mov_x, sz_f);
+        cudaMalloc(&d_mov_y, sz_f);
+        cudaMalloc(&d_mov_z, sz_f);
+        cudaMalloc(&d_out_x, sz_d);
+        cudaMalloc(&d_out_y, sz_d);
+        cudaMalloc(&d_out_z, sz_d);
+        cudaMalloc(&d_out_cell, sz_i);
+        g_N = N;
+    }
+
+    g_cell_size = cell_size;
 
     cudaMemcpy(d_pos_x, posxy_x, sz_f, cudaMemcpyHostToDevice);
     cudaMemcpy(d_pos_y, posxy_y, sz_f, cudaMemcpyHostToDevice);
@@ -99,17 +122,6 @@ void solution_init(int N,
     cudaMemcpy(d_mov_y, movxy_y, sz_f, cudaMemcpyHostToDevice);
     cudaMemcpy(d_mov_z, movz,    sz_f, cudaMemcpyHostToDevice);
 
-    // Allocate output arrays on device
-    cudaMalloc(&d_out_x, sz_d);
-    cudaMalloc(&d_out_y, sz_d);
-    cudaMalloc(&d_out_z, sz_d);
-    cudaMalloc(&d_out_cell, sz_i);
-}
-
-void solution_compute(int N,
-                      double* out_x, double* out_y, double* out_z,
-                      int* out_cell)
-{
     dim3 block(BLOCK_SIZE);
     dim3 grid((N + BLOCK_SIZE - 1) / BLOCK_SIZE);
 
@@ -117,30 +129,15 @@ void solution_compute(int N,
         N,
         d_pos_x, d_pos_y, d_pos_z,
         d_mov_x, d_mov_y, d_mov_z,
-        g_cell_size,
+        cell_size,
         d_out_x, d_out_y, d_out_z, d_out_cell);
 
     // Copy results back to host
-    size_t sz_d = (size_t)N * sizeof(double);
-    size_t sz_i = (size_t)N * sizeof(int);
     cudaMemcpy(out_x,    d_out_x,    sz_d, cudaMemcpyDeviceToHost);
     cudaMemcpy(out_y,    d_out_y,    sz_d, cudaMemcpyDeviceToHost);
     cudaMemcpy(out_z,    d_out_z,    sz_d, cudaMemcpyDeviceToHost);
     cudaMemcpy(out_cell, d_out_cell, sz_i, cudaMemcpyDeviceToHost);
-}
-
-void solution_free(void)
-{
-    cudaFree(d_pos_x);
-    cudaFree(d_pos_y);
-    cudaFree(d_pos_z);
-    cudaFree(d_mov_x);
-    cudaFree(d_mov_y);
-    cudaFree(d_mov_z);
-    cudaFree(d_out_x);
-    cudaFree(d_out_y);
-    cudaFree(d_out_z);
-    cudaFree(d_out_cell);
+    cudaDeviceSynchronize();
 }
 
 } // extern "C"
diff --git a/tasks/spmv_csr/gpu_baseline.cu b/tasks/spmv_csr/gpu_baseline.cu
index 9498216..19867fa 100644
--- a/tasks/spmv_csr/gpu_baseline.cu
+++ b/tasks/spmv_csr/gpu_baseline.cu
@@ -46,6 +46,7 @@ __global__ void TransposedMatrixVectorProductKernel(
 }
 
 // ===== Persistent device state =====
+static int    g_num_rows = 0;
 static int    g_num_cols = 0;
 static int    g_nnz = 0;
 static int*   d_col_ptrs    = nullptr;
@@ -54,30 +55,45 @@ static float* d_values      = nullptr;
 static float* d_vector      = nullptr;
 static float* d_answer      = nullptr;
 
-extern "C" void solution_init(int          num_rows,
-                              int          num_cols,
-                              const int*   col_ptrs,
-                              const int*   row_indices,
-                              const float* values,
-                              const float* vector)
+extern "C" void solution_free(void)
 {
-    g_num_cols = num_cols;
-    g_nnz      = col_ptrs[num_cols];
+    if (d_col_ptrs)    { cudaFree(d_col_ptrs);    d_col_ptrs    = nullptr; }
+    if (d_row_indices) { cudaFree(d_row_indices); d_row_indices = nullptr; }
+    if (d_values)      { cudaFree(d_values);      d_values      = nullptr; }
+    if (d_vector)      { cudaFree(d_vector);      d_vector      = nullptr; }
+    if (d_answer)      { cudaFree(d_answer);      d_answer      = nullptr; }
+    g_num_rows = 0;
+    g_num_cols = 0;
+    g_nnz = 0;
+}
 
-    cudaMalloc(&d_col_ptrs,    (num_cols + 1) * sizeof(int));
-    cudaMalloc(&d_row_indices, g_nnz * sizeof(int));
-    cudaMalloc(&d_values,      g_nnz * sizeof(float));
-    cudaMalloc(&d_vector,      num_rows * sizeof(float));
-    cudaMalloc(&d_answer,      num_cols * sizeof(float));
+extern "C" void solution_compute(int          num_rows,
+                                 int          num_cols,
+                                 const int*   col_ptrs,
+                                 const int*   row_indices,
+                                 const float* values,
+                                 const float* vector,
+                                 float*       answer)
+{
+    int nnz = col_ptrs[num_cols];
+
+    if (g_num_rows != num_rows || g_num_cols != num_cols || g_nnz != nnz) {
+        solution_free();
+        cudaMalloc(&d_col_ptrs,    (num_cols + 1) * sizeof(int));
+        cudaMalloc(&d_row_indices, nnz * sizeof(int));
+        cudaMalloc(&d_values,      nnz * sizeof(float));
+        cudaMalloc(&d_vector,      num_rows * sizeof(float));
+        cudaMalloc(&d_answer,      num_cols * sizeof(float));
+        g_num_rows = num_rows;
+        g_num_cols = num_cols;
+        g_nnz = nnz;
+    }
 
     cudaMemcpy(d_col_ptrs,    col_ptrs,    (num_cols + 1) * sizeof(int), cudaMemcpyHostToDevice);
-    cudaMemcpy(d_row_indices, row_indices, g_nnz * sizeof(int),          cudaMemcpyHostToDevice);
-    cudaMemcpy(d_values,      values,      g_nnz * sizeof(float),        cudaMemcpyHostToDevice);
+    cudaMemcpy(d_row_indices, row_indices, nnz * sizeof(int),            cudaMemcpyHostToDevice);
+    cudaMemcpy(d_values,      values,      nnz * sizeof(float),          cudaMemcpyHostToDevice);
     cudaMemcpy(d_vector,      vector,      num_rows * sizeof(float),     cudaMemcpyHostToDevice);
-}
 
-extern "C" void solution_compute(int num_cols, float* answer)
-{
     int threadsPerBlock = 256;
     int blocks = (num_cols + threadsPerBlock - 1) / threadsPerBlock;
 
@@ -85,13 +101,5 @@ extern "C" void solution_compute(int num_cols, float* answer)
         num_cols, d_col_ptrs, d_row_indices, d_values, d_vector, d_answer);
 
     cudaMemcpy(answer, d_answer, num_cols * sizeof(float), cudaMemcpyDeviceToHost);
-}
-
-extern "C" void solution_free(void)
-{
-    if (d_col_ptrs)    { cudaFree(d_col_ptrs);    d_col_ptrs    = nullptr; }
-    if (d_row_indices) { cudaFree(d_row_indices); d_row_indices = nullptr; }
-    if (d_values)      { cudaFree(d_values);      d_values      = nullptr; }
-    if (d_vector)      { cudaFree(d_vector);      d_vector      = nullptr; }
-    if (d_answer)      { cudaFree(d_answer);      d_answer      = nullptr; }
+    cudaDeviceSynchronize();
 }