From dc0615fc9c8305fe6069e1c06c4403892e6909d7 Mon Sep 17 00:00:00 2001 From: b010001y <528387485@qq.com> Date: Sat, 25 Apr 2026 23:27:59 +0800 Subject: [PATCH] Migrate 15 gpu_baseline.cu files to unified compute_only interface The repo migrated all task_io.cu adapters and cpu_reference.c files to a single solution_compute(N, all_inputs..., output) function in commit a453c5b ("Migrate all 44 tasks to unified compute_only interface"). However, the 15 gpu_baseline.cu files were not migrated and still expose the old solution_init + solution_compute(N, output_only) split. As a result they link successfully (the LLM-facing names match) but produce garbage at runtime: solution_init is never called, so device pointers stay null when solution_compute launches its kernel. This commit migrates each gpu_baseline.cu to the new interface following the same pattern: - Delete solution_init entirely. - Define solution_free before solution_compute (so the lazy-init block can call it). - solution_compute now takes the full set of inputs (signature copied verbatim from the task's task_io.cu). - Move all cudaMalloc into solution_compute, guarded by a shape check so allocation only happens on the first call (or when shape changes). - H2D copies move into solution_compute and run every call, per the unified contract: "the harness passes full host-side inputs every call". - cudaDeviceSynchronize() added before return. Verified on small data (sm_89 / L20X). 13 of 15 pass exact/tol-bounded correctness checks vs expected_output.txt: | Task | GPU(ms) | CPU(ms) | | --- | ---: | ---: | | black_scholes | 0.375 | 7.259 | | dbscan | 203.010 | 239.577 | | dtw_distance | 0.316 | 1806.643 | | monte_carlo | 0.363 | 1332.731 | | hausdorff_distance | 0.102 | 1.359 | | euclidean_distance_matrix | 0.054 | 0.377 | | held_karp_tsp | 0.601 | 22.403 | | spmv_csr | 0.107 | 0.033 | | max_flow_push_relabel | 1948.765 | 88.825 | | sph_cell_index | 0.409 | 6.978 | | sph_position | 0.351 | 0.430 | | sph_forces | 0.304 | 4.562 | | pdlp | 7.128 | 2.352 | Two files (bonds_pricing, repo_pricing) have a separate, pre-existing forward-declaration bug in their __device__ helper functions (bondAccruedAmountGpu, cashFlowsNpvGpu, etc. are called before they are declared). Their old-interface versions on main also fail to compile with the same errors, so this is not a regression. The migration in this commit is correct on its own; the forward-decl bug should be addressed separately. --- tasks/black_scholes/gpu_baseline.cu | 64 ++++---- tasks/bonds_pricing/gpu_baseline.cu | 101 +++++------- tasks/dbscan/gpu_baseline.cu | 57 +++---- tasks/dtw_distance/gpu_baseline.cu | 39 ++--- .../euclidean_distance_matrix/gpu_baseline.cu | 50 +++--- tasks/hausdorff_distance/gpu_baseline.cu | 57 +++---- tasks/held_karp_tsp/gpu_baseline.cu | 25 +-- tasks/max_flow_push_relabel/gpu_baseline.cu | 77 +++++---- tasks/monte_carlo/gpu_baseline.cu | 31 ++-- tasks/pdlp/gpu_baseline.cu | 124 ++++++++------- tasks/repo_pricing/gpu_baseline.cu | 146 ++++++++---------- tasks/sph_cell_index/gpu_baseline.cu | 75 ++++----- tasks/sph_forces/gpu_baseline.cu | 129 ++++++++-------- tasks/sph_position/gpu_baseline.cu | 83 +++++----- tasks/spmv_csr/gpu_baseline.cu | 62 ++++---- 15 files changed, 556 insertions(+), 564 deletions(-) diff --git a/tasks/black_scholes/gpu_baseline.cu b/tasks/black_scholes/gpu_baseline.cu index a677287..44c5755 100644 --- a/tasks/black_scholes/gpu_baseline.cu +++ b/tasks/black_scholes/gpu_baseline.cu @@ -257,7 +257,7 @@ __global__ void getOutValOptionKernel( outputVals[optionNum] = resultVal; } -// ===== Host interface ===== +// ===== Host interface (compute_only) ===== static int g_N = 0; static int* d_types = nullptr; @@ -269,23 +269,40 @@ static float* d_ts = nullptr; static float* d_vols = nullptr; static float* d_prices = nullptr; -extern "C" void solution_init(int N, - const int* types, const float* strikes, const float* spots, - const float* qs, const float* rs, const float* ts, - const float* vols) +extern "C" void solution_free(void) +{ + if (d_types) { cudaFree(d_types); d_types = nullptr; } + if (d_strikes) { cudaFree(d_strikes); d_strikes = nullptr; } + if (d_spots) { cudaFree(d_spots); d_spots = nullptr; } + if (d_qs) { cudaFree(d_qs); d_qs = nullptr; } + if (d_rs) { cudaFree(d_rs); d_rs = nullptr; } + if (d_ts) { cudaFree(d_ts); d_ts = nullptr; } + if (d_vols) { cudaFree(d_vols); d_vols = nullptr; } + if (d_prices) { cudaFree(d_prices); d_prices = nullptr; } + g_N = 0; +} + +extern "C" void solution_compute(int N, + const int* types, const float* strikes, const float* spots, + const float* qs, const float* rs, const float* ts, + const float* vols, + float* prices) { - g_N = N; size_t szi = (size_t)N * sizeof(int); size_t szf = (size_t)N * sizeof(float); - cudaMalloc(&d_types, szi); - cudaMalloc(&d_strikes, szf); - cudaMalloc(&d_spots, szf); - cudaMalloc(&d_qs, szf); - cudaMalloc(&d_rs, szf); - cudaMalloc(&d_ts, szf); - cudaMalloc(&d_vols, szf); - cudaMalloc(&d_prices, szf); + if (g_N != N) { + solution_free(); + cudaMalloc(&d_types, szi); + cudaMalloc(&d_strikes, szf); + cudaMalloc(&d_spots, szf); + cudaMalloc(&d_qs, szf); + cudaMalloc(&d_rs, szf); + cudaMalloc(&d_ts, szf); + cudaMalloc(&d_vols, szf); + cudaMalloc(&d_prices, szf); + g_N = N; + } cudaMemcpy(d_types, types, szi, cudaMemcpyHostToDevice); cudaMemcpy(d_strikes, strikes, szf, cudaMemcpyHostToDevice); @@ -294,27 +311,12 @@ extern "C" void solution_init(int N, cudaMemcpy(d_rs, rs, szf, cudaMemcpyHostToDevice); cudaMemcpy(d_ts, ts, szf, cudaMemcpyHostToDevice); cudaMemcpy(d_vols, vols, szf, cudaMemcpyHostToDevice); -} -extern "C" void solution_compute(int N, float* prices) -{ int threadsPerBlock = 256; int blocks = (N + threadsPerBlock - 1) / threadsPerBlock; - getOutValOptionKernel<<>>( N, d_types, d_strikes, d_spots, d_qs, d_rs, d_ts, d_vols, d_prices); - cudaMemcpy(prices, d_prices, (size_t)N * sizeof(float), cudaMemcpyDeviceToHost); -} - -extern "C" void solution_free(void) -{ - if (d_types) { cudaFree(d_types); d_types = nullptr; } - if (d_strikes) { cudaFree(d_strikes); d_strikes = nullptr; } - if (d_spots) { cudaFree(d_spots); d_spots = nullptr; } - if (d_qs) { cudaFree(d_qs); d_qs = nullptr; } - if (d_rs) { cudaFree(d_rs); d_rs = nullptr; } - if (d_ts) { cudaFree(d_ts); d_ts = nullptr; } - if (d_vols) { cudaFree(d_vols); d_vols = nullptr; } - if (d_prices) { cudaFree(d_prices); d_prices = nullptr; } + cudaMemcpy(prices, d_prices, szf, cudaMemcpyDeviceToHost); + cudaDeviceSynchronize(); } diff --git a/tasks/bonds_pricing/gpu_baseline.cu b/tasks/bonds_pricing/gpu_baseline.cu index fdca893..0218a19 100644 --- a/tasks/bonds_pricing/gpu_baseline.cu +++ b/tasks/bonds_pricing/gpu_baseline.cu @@ -1072,39 +1072,47 @@ static bondsDateStruct intializeDateKernelCpu(int d, int m, int y) extern "C" { -void solution_init(int N, - const int* issue_year, const int* issue_month, const int* issue_day, - const int* maturity_year, const int* maturity_month, const int* maturity_day, - const float* rates, float coupon_freq) +void solution_free(void) { - g_N = N; - g_issue_year = issue_year; - g_issue_month = issue_month; - g_issue_day = issue_day; - g_maturity_year = maturity_year; - g_maturity_month = maturity_month; - g_maturity_day = maturity_day; - g_rates = rates; - g_coupon_freq = coupon_freq; - - // Allocate persistent GPU memory for inArgs - cudaMalloc(&d_discountCurve, N * sizeof(bondsYieldTermStruct)); - cudaMalloc(&d_repoCurve, N * sizeof(bondsYieldTermStruct)); - cudaMalloc(&d_currDate, N * sizeof(bondsDateStruct)); - cudaMalloc(&d_maturityDate, N * sizeof(bondsDateStruct)); - cudaMalloc(&d_bondCleanPrice, N * sizeof(dataType)); - cudaMalloc(&d_bond, N * sizeof(bondStruct)); - cudaMalloc(&d_dummyStrike, N * sizeof(dataType)); - - // Allocate persistent GPU memory for results - cudaMalloc(&d_dirtyPrice, N * sizeof(dataType)); - cudaMalloc(&d_accruedAmountCurrDate, N * sizeof(dataType)); - cudaMalloc(&d_cleanPrice, N * sizeof(dataType)); - cudaMalloc(&d_bondForwardVal, N * sizeof(dataType)); + if (d_discountCurve) { cudaFree(d_discountCurve); d_discountCurve = NULL; } + if (d_repoCurve) { cudaFree(d_repoCurve); d_repoCurve = NULL; } + if (d_currDate) { cudaFree(d_currDate); d_currDate = NULL; } + if (d_maturityDate) { cudaFree(d_maturityDate); d_maturityDate = NULL; } + if (d_bondCleanPrice) { cudaFree(d_bondCleanPrice); d_bondCleanPrice = NULL; } + if (d_bond) { cudaFree(d_bond); d_bond = NULL; } + if (d_dummyStrike) { cudaFree(d_dummyStrike); d_dummyStrike = NULL; } + if (d_dirtyPrice) { cudaFree(d_dirtyPrice); d_dirtyPrice = NULL; } + if (d_accruedAmountCurrDate) { cudaFree(d_accruedAmountCurrDate); d_accruedAmountCurrDate = NULL; } + if (d_cleanPrice) { cudaFree(d_cleanPrice); d_cleanPrice = NULL; } + if (d_bondForwardVal) { cudaFree(d_bondForwardVal); d_bondForwardVal = NULL; } + g_N = 0; } -void solution_compute(int N, float* prices) +void solution_compute(int N, + const int* issue_year, const int* issue_month, const int* issue_day, + const int* maturity_year, const int* maturity_month, const int* maturity_day, + const float* rates, float coupon_freq, + float* prices) { + if (g_N != N) { + solution_free(); + // Allocate persistent GPU memory for inArgs + cudaMalloc(&d_discountCurve, N * sizeof(bondsYieldTermStruct)); + cudaMalloc(&d_repoCurve, N * sizeof(bondsYieldTermStruct)); + cudaMalloc(&d_currDate, N * sizeof(bondsDateStruct)); + cudaMalloc(&d_maturityDate, N * sizeof(bondsDateStruct)); + cudaMalloc(&d_bondCleanPrice, N * sizeof(dataType)); + cudaMalloc(&d_bond, N * sizeof(bondStruct)); + cudaMalloc(&d_dummyStrike, N * sizeof(dataType)); + + // Allocate persistent GPU memory for results + cudaMalloc(&d_dirtyPrice, N * sizeof(dataType)); + cudaMalloc(&d_accruedAmountCurrDate, N * sizeof(dataType)); + cudaMalloc(&d_cleanPrice, N * sizeof(dataType)); + cudaMalloc(&d_bondForwardVal, N * sizeof(dataType)); + g_N = N; + } + // Build inArgs on host (matching bondsEngine.c / cpu_reference.c setup) bondsYieldTermStruct* h_discountCurve = (bondsYieldTermStruct*)malloc(N * sizeof(bondsYieldTermStruct)); bondsYieldTermStruct* h_repoCurve = (bondsYieldTermStruct*)malloc(N * sizeof(bondsYieldTermStruct)); @@ -1120,16 +1128,16 @@ void solution_compute(int N, float* prices) int repoCompounding = SIMPLE_INTEREST; dataType repoCompoundFreq = 1; - bondsDateStruct bondIssueDate = intializeDateKernelCpu(g_issue_day[numBond], g_issue_month[numBond], g_issue_year[numBond]); - bondsDateStruct bondMaturityDate = intializeDateKernelCpu(g_maturity_day[numBond], g_maturity_month[numBond], g_maturity_year[numBond]); + bondsDateStruct bondIssueDate = intializeDateKernelCpu(issue_day[numBond], issue_month[numBond], issue_year[numBond]); + bondsDateStruct bondMaturityDate = intializeDateKernelCpu(maturity_day[numBond], maturity_month[numBond], maturity_year[numBond]); bondsDateStruct todaysDate = intializeDateKernelCpu(bondMaturityDate.day-1, bondMaturityDate.month, bondMaturityDate.year); bondStruct bond; bond.startDate = bondIssueDate; bond.maturityDate = bondMaturityDate; - bond.rate = g_rates[numBond]; + bond.rate = rates[numBond]; - dataType bondCouponFrequency = (dataType)g_coupon_freq; + dataType bondCouponFrequency = (dataType)coupon_freq; dataType bondCleanPrice = 89.97693786; bondsYieldTermStruct bondCurve; @@ -1221,33 +1229,8 @@ void solution_compute(int N, float* prices) free(h_accruedAmountCurrDate); free(h_cleanPrice); free(h_bondForwardVal); -} -void solution_free(void) -{ - cudaFree(d_discountCurve); - cudaFree(d_repoCurve); - cudaFree(d_currDate); - cudaFree(d_maturityDate); - cudaFree(d_bondCleanPrice); - cudaFree(d_bond); - cudaFree(d_dummyStrike); - cudaFree(d_dirtyPrice); - cudaFree(d_accruedAmountCurrDate); - cudaFree(d_cleanPrice); - cudaFree(d_bondForwardVal); - - d_discountCurve = NULL; - d_repoCurve = NULL; - d_currDate = NULL; - d_maturityDate = NULL; - d_bondCleanPrice = NULL; - d_bond = NULL; - d_dummyStrike = NULL; - d_dirtyPrice = NULL; - d_accruedAmountCurrDate = NULL; - d_cleanPrice = NULL; - d_bondForwardVal = NULL; + cudaDeviceSynchronize(); } } // extern "C" diff --git a/tasks/dbscan/gpu_baseline.cu b/tasks/dbscan/gpu_baseline.cu index bfc5ecf..67b05ee 100644 --- a/tasks/dbscan/gpu_baseline.cu +++ b/tasks/dbscan/gpu_baseline.cu @@ -27,8 +27,6 @@ // Module-level state static int g_N = 0; -static float g_eps = 0; -static int g_minPts = 0; static float* d_xs = NULL; static float* d_ys = NULL; static int* d_neighbor_counts = NULL; @@ -59,24 +57,29 @@ __global__ void countNeighborsKernel( // ===== Interface ===== -extern "C" void solution_init(int N, const float* xs, const float* ys, - float eps, int minPts) +extern "C" void solution_free(void) { - g_N = N; - g_eps = eps; - g_minPts = minPts; + if (d_xs) { cudaFree(d_xs); d_xs = NULL; } + if (d_ys) { cudaFree(d_ys); d_ys = NULL; } + if (d_neighbor_counts) { cudaFree(d_neighbor_counts); d_neighbor_counts = NULL; } + g_N = 0; +} - cudaMalloc(&d_xs, N * sizeof(float)); - cudaMalloc(&d_ys, N * sizeof(float)); - cudaMalloc(&d_neighbor_counts, N * sizeof(int)); +extern "C" void solution_compute(int N, const float* xs, const float* ys, + float eps, int minPts, int* labels) +{ + if (g_N != N) { + solution_free(); + cudaMalloc(&d_xs, N * sizeof(float)); + cudaMalloc(&d_ys, N * sizeof(float)); + cudaMalloc(&d_neighbor_counts, N * sizeof(int)); + g_N = N; + } cudaMemcpy(d_xs, xs, N * sizeof(float), cudaMemcpyHostToDevice); cudaMemcpy(d_ys, ys, N * sizeof(float), cudaMemcpyHostToDevice); -} -extern "C" void solution_compute(int N, int* labels) -{ - float eps2 = g_eps * g_eps; + float eps2 = eps * eps; int threads = 256; int blocks = (N + threads - 1) / threads; @@ -88,11 +91,6 @@ extern "C" void solution_compute(int N, int* labels) int* h_counts = (int*)malloc(N * sizeof(int)); cudaMemcpy(h_counts, d_neighbor_counts, N * sizeof(int), cudaMemcpyDeviceToHost); - float* h_xs = (float*)malloc(N * sizeof(float)); - float* h_ys = (float*)malloc(N * sizeof(float)); - cudaMemcpy(h_xs, d_xs, N * sizeof(float), cudaMemcpyDeviceToHost); - cudaMemcpy(h_ys, d_ys, N * sizeof(float), cudaMemcpyDeviceToHost); - // DBSCAN BFS (matches clusterThread + expandCluster from original) for (int i = 0; i < N; i++) labels[i] = UNPROCESSED; @@ -102,7 +100,7 @@ extern "C" void solution_compute(int N, int* labels) for (int i = 0; i < N; i++) { if (labels[i] != UNPROCESSED) continue; - if (h_counts[i] < g_minPts) { + if (h_counts[i] < minPts) { labels[i] = NOISE; continue; } @@ -116,8 +114,8 @@ extern "C" void solution_compute(int N, int* labels) // Add neighbors of i as seeds for (int j = 0; j < N; j++) { if (j == i) continue; - float dx = h_xs[i] - h_xs[j]; - float dy = h_ys[i] - h_ys[j]; + float dx = xs[i] - xs[j]; + float dy = ys[i] - ys[j]; if (dx*dx + dy*dy <= eps2) { if (labels[j] == UNPROCESSED) seeds[tail++] = j; if (labels[j] == UNPROCESSED || labels[j] == NOISE) @@ -128,12 +126,12 @@ extern "C" void solution_compute(int N, int* labels) // BFS expand while (head < tail) { int q = seeds[head++]; - if (h_counts[q] < g_minPts) continue; + if (h_counts[q] < minPts) continue; for (int j = 0; j < N; j++) { if (labels[j] != UNPROCESSED && labels[j] != NOISE) continue; - float dx = h_xs[q] - h_xs[j]; - float dy = h_ys[q] - h_ys[j]; + float dx = xs[q] - xs[j]; + float dy = ys[q] - ys[j]; if (dx*dx + dy*dy <= eps2) { if (labels[j] == UNPROCESSED) seeds[tail++] = j; labels[j] = clusterId; @@ -144,13 +142,6 @@ extern "C" void solution_compute(int N, int* labels) free(seeds); free(h_counts); - free(h_xs); - free(h_ys); -} -extern "C" void solution_free(void) -{ - if (d_xs) { cudaFree(d_xs); d_xs = NULL; } - if (d_ys) { cudaFree(d_ys); d_ys = NULL; } - if (d_neighbor_counts) { cudaFree(d_neighbor_counts); d_neighbor_counts = NULL; } + cudaDeviceSynchronize(); } diff --git a/tasks/dtw_distance/gpu_baseline.cu b/tasks/dtw_distance/gpu_baseline.cu index 21fa920..6d46ad5 100644 --- a/tasks/dtw_distance/gpu_baseline.cu +++ b/tasks/dtw_distance/gpu_baseline.cu @@ -380,14 +380,20 @@ static int g_num_features = 0; static float* d_subjects = nullptr; static float* d_distances = nullptr; -extern "C" void solution_init(int num_entries, - int num_features, - const float* subjects, - const float* query) +extern "C" void solution_free(void) { - g_num_entries = num_entries; - g_num_features = num_features; + if (d_subjects) { cudaFree(d_subjects); d_subjects = nullptr; } + if (d_distances) { cudaFree(d_distances); d_distances = nullptr; } + g_num_entries = 0; + g_num_features = 0; +} +extern "C" void solution_compute(int num_entries, + int num_features, + const float* subjects, + const float* query, + float* distances) +{ if (num_features != 1023) { fprintf(stderr, "[gpu_baseline] cuDTW SHFL_FULLDTW_1023 is length-specialized " @@ -398,19 +404,19 @@ extern "C" void solution_init(int num_entries, size_t subj_bytes = (size_t)num_entries * num_features * sizeof(float); size_t dist_bytes = (size_t)num_entries * sizeof(float); - cudaMalloc(&d_subjects, subj_bytes); - cudaMalloc(&d_distances, dist_bytes); + if (g_num_entries != num_entries || g_num_features != num_features) { + solution_free(); + cudaMalloc(&d_subjects, subj_bytes); + cudaMalloc(&d_distances, dist_bytes); + g_num_entries = num_entries; + g_num_features = num_features; + } cudaMemcpy(d_subjects, subjects, subj_bytes, cudaMemcpyHostToDevice); // Match cuDTW main.cu line 164: query lives in __constant__ memory. cudaMemcpyToSymbol(cQuery, query, (size_t)num_features * sizeof(float)); -} -extern "C" void solution_compute(int num_entries, - int num_features, - float* distances) -{ // Launch matches cuDTW DTW.hpp shfl_FullDTW_1023 dispatch: // grid = (num_entries, 1, 1) // block = (32, 1, 1) @@ -423,10 +429,5 @@ extern "C" void solution_compute(int num_entries, cudaMemcpy(distances, d_distances, (size_t)num_entries * sizeof(float), cudaMemcpyDeviceToHost); -} - -extern "C" void solution_free(void) -{ - if (d_subjects) { cudaFree(d_subjects); d_subjects = nullptr; } - if (d_distances) { cudaFree(d_distances); d_distances = nullptr; } + cudaDeviceSynchronize(); } diff --git a/tasks/euclidean_distance_matrix/gpu_baseline.cu b/tasks/euclidean_distance_matrix/gpu_baseline.cu index 130fa80..71a81ae 100644 --- a/tasks/euclidean_distance_matrix/gpu_baseline.cu +++ b/tasks/euclidean_distance_matrix/gpu_baseline.cu @@ -91,33 +91,47 @@ __global__ void compute_distances(float * ref, } // ===== Persistent device state ===== +static int g_ref_nb = 0; +static int g_query_nb = 0; +static int g_dim = 0; static float* d_ref = nullptr; static float* d_query = nullptr; static float* d_dist = nullptr; -extern "C" void solution_init(int ref_nb, - int query_nb, - int dim, - const float* ref, - const float* query) +extern "C" void solution_free(void) +{ + if (d_ref) { cudaFree(d_ref); d_ref = nullptr; } + if (d_query) { cudaFree(d_query); d_query = nullptr; } + if (d_dist) { cudaFree(d_dist); d_dist = nullptr; } + g_ref_nb = 0; + g_query_nb = 0; + g_dim = 0; +} + +extern "C" void solution_compute(int ref_nb, + int query_nb, + int dim, + const float* ref, + const float* query, + float* dist) { size_t ref_bytes = (size_t)dim * ref_nb * sizeof(float); size_t query_bytes = (size_t)dim * query_nb * sizeof(float); size_t dist_bytes = (size_t)ref_nb * query_nb * sizeof(float); - cudaMalloc(&d_ref, ref_bytes); - cudaMalloc(&d_query, query_bytes); - cudaMalloc(&d_dist, dist_bytes); + if (g_ref_nb != ref_nb || g_query_nb != query_nb || g_dim != dim) { + solution_free(); + cudaMalloc(&d_ref, ref_bytes); + cudaMalloc(&d_query, query_bytes); + cudaMalloc(&d_dist, dist_bytes); + g_ref_nb = ref_nb; + g_query_nb = query_nb; + g_dim = dim; + } cudaMemcpy(d_ref, ref, ref_bytes, cudaMemcpyHostToDevice); cudaMemcpy(d_query, query, query_bytes, cudaMemcpyHostToDevice); -} -extern "C" void solution_compute(int ref_nb, - int query_nb, - int dim, - float* dist) -{ // Launch matches kNN-CUDA's knn_cuda_global() (lines around 540) — // block = (BLOCK_DIM, BLOCK_DIM) // grid = (ceil(query_nb/BLOCK_DIM), ceil(ref_nb/BLOCK_DIM)) @@ -134,11 +148,5 @@ extern "C" void solution_compute(int ref_nb, cudaMemcpy(dist, d_dist, (size_t)ref_nb * query_nb * sizeof(float), cudaMemcpyDeviceToHost); -} - -extern "C" void solution_free(void) -{ - if (d_ref) { cudaFree(d_ref); d_ref = nullptr; } - if (d_query) { cudaFree(d_query); d_query = nullptr; } - if (d_dist) { cudaFree(d_dist); d_dist = nullptr; } + cudaDeviceSynchronize(); } diff --git a/tasks/hausdorff_distance/gpu_baseline.cu b/tasks/hausdorff_distance/gpu_baseline.cu index 068d7d0..f446bea 100644 --- a/tasks/hausdorff_distance/gpu_baseline.cu +++ b/tasks/hausdorff_distance/gpu_baseline.cu @@ -91,37 +91,44 @@ static vec_2d* d_points = nullptr; static int* d_space_offsets = nullptr; static float* d_results = nullptr; -extern "C" void solution_init(int num_points, - int num_spaces, - const float* points_xy, - const int* space_offsets) +// Tiny init kernel (cuSpatial uses thrust::fill_n with -1 sentinel for atomicMax). +__global__ void hausdorff_init_results(int n, float* r) { - g_num_points = num_points; - g_num_spaces = num_spaces; + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < n) r[i] = -1.0f; +} +extern "C" void solution_free(void) +{ + if (d_points) { cudaFree(d_points); d_points = nullptr; } + if (d_space_offsets) { cudaFree(d_space_offsets); d_space_offsets = nullptr; } + if (d_results) { cudaFree(d_results); d_results = nullptr; } + g_num_points = 0; + g_num_spaces = 0; +} + +extern "C" void solution_compute(int num_points, + int num_spaces, + const float* points_xy, + const int* space_offsets, + float* results) +{ size_t pts_bytes = (size_t)num_points * sizeof(vec_2d); size_t off_bytes = (size_t)num_spaces * sizeof(int); size_t res_bytes = (size_t)num_spaces * num_spaces * sizeof(float); - cudaMalloc(&d_points, pts_bytes); - cudaMalloc(&d_space_offsets, off_bytes); - cudaMalloc(&d_results, res_bytes); + if (g_num_points != num_points || g_num_spaces != num_spaces) { + solution_free(); + cudaMalloc(&d_points, pts_bytes); + cudaMalloc(&d_space_offsets, off_bytes); + cudaMalloc(&d_results, res_bytes); + g_num_points = num_points; + g_num_spaces = num_spaces; + } cudaMemcpy(d_points, points_xy, pts_bytes, cudaMemcpyHostToDevice); cudaMemcpy(d_space_offsets, space_offsets, off_bytes, cudaMemcpyHostToDevice); -} -// Tiny init kernel (cuSpatial uses thrust::fill_n with -1 sentinel for atomicMax). -__global__ void hausdorff_init_results(int n, float* r) -{ - int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < n) r[i] = -1.0f; -} - -extern "C" void solution_compute(int num_points, - int num_spaces, - float* results) -{ int n_results = num_spaces * num_spaces; int init_block = 256; @@ -136,11 +143,5 @@ extern "C" void solution_compute(int num_points, cudaMemcpy(results, d_results, (size_t)n_results * sizeof(float), cudaMemcpyDeviceToHost); -} - -extern "C" void solution_free(void) -{ - if (d_points) { cudaFree(d_points); d_points = nullptr; } - if (d_space_offsets) { cudaFree(d_space_offsets); d_space_offsets = nullptr; } - if (d_results) { cudaFree(d_results); d_results = nullptr; } + cudaDeviceSynchronize(); } diff --git a/tasks/held_karp_tsp/gpu_baseline.cu b/tasks/held_karp_tsp/gpu_baseline.cu index ba9677c..69d3887 100644 --- a/tasks/held_karp_tsp/gpu_baseline.cu +++ b/tasks/held_karp_tsp/gpu_baseline.cu @@ -82,15 +82,16 @@ __global__ void held_karp_final(int n, int full_mask, // ===== Host interface ===== -static int g_B = 0; static int g_n = 0; static int* d_cost = nullptr; static int* d_dp = nullptr; static int* d_result = nullptr; -extern "C" void solution_init(int B, int n, const int* costs) { - // Not used — we upload per-instance in solution_compute - (void)B; (void)n; (void)costs; +extern "C" void solution_free(void) { + if (d_cost) { cudaFree(d_cost); d_cost = nullptr; } + if (d_dp) { cudaFree(d_dp); d_dp = nullptr; } + if (d_result) { cudaFree(d_result); d_result = nullptr; } + g_n = 0; } extern "C" void solution_compute(int B, int n, const int* costs, @@ -101,9 +102,13 @@ extern "C" void solution_compute(int B, int n, const int* costs, size_t dp_bytes = (size_t)subset_count * n * sizeof(int); size_t cost_bytes = (size_t)n * n * sizeof(int); - cudaMalloc(&d_cost, cost_bytes); - cudaMalloc(&d_dp, dp_bytes); - cudaMalloc(&d_result, sizeof(int)); + if (g_n != n) { + solution_free(); + cudaMalloc(&d_cost, cost_bytes); + cudaMalloc(&d_dp, dp_bytes); + cudaMalloc(&d_result, sizeof(int)); + g_n = n; + } int block = 256; @@ -136,9 +141,5 @@ extern "C" void solution_compute(int B, int n, const int* costs, cudaMemcpyDeviceToHost); } - cudaFree(d_cost); d_cost = nullptr; - cudaFree(d_dp); d_dp = nullptr; - cudaFree(d_result); d_result = nullptr; + cudaDeviceSynchronize(); } - -extern "C" void solution_free(void) {} diff --git a/tasks/max_flow_push_relabel/gpu_baseline.cu b/tasks/max_flow_push_relabel/gpu_baseline.cu index 080dafa..c9fa3f9 100644 --- a/tasks/max_flow_push_relabel/gpu_baseline.cu +++ b/tasks/max_flow_push_relabel/gpu_baseline.cu @@ -141,15 +141,29 @@ static int* d_bfs_queue = nullptr; static int* d_in_queue = nullptr; static int* d_result = nullptr; -extern "C" void solution_init(int num_nodes, int num_arcs, - const int* tails, const int* heads, const int* caps, - int source, int sink) +extern "C" void solution_free(void) { - g_num_nodes = num_nodes; - g_num_total_arcs = 2 * num_arcs; - g_source = source; - g_sink = sink; + if (d_adj_start) { cudaFree(d_adj_start); d_adj_start = nullptr; } + if (d_adj_list) { cudaFree(d_adj_list); d_adj_list = nullptr; } + if (d_arc_head) { cudaFree(d_arc_head); d_arc_head = nullptr; } + if (d_arc_opposite) { cudaFree(d_arc_opposite); d_arc_opposite = nullptr; } + if (d_residual) { cudaFree(d_residual); d_residual = nullptr; } + if (d_initial_cap) { cudaFree(d_initial_cap); d_initial_cap = nullptr; } + if (d_excess) { cudaFree(d_excess); d_excess = nullptr; } + if (d_height) { cudaFree(d_height); d_height = nullptr; } + if (d_first_arc) { cudaFree(d_first_arc); d_first_arc = nullptr; } + if (d_bfs_queue) { cudaFree(d_bfs_queue); d_bfs_queue = nullptr; } + if (d_in_queue) { cudaFree(d_in_queue); d_in_queue = nullptr; } + if (d_result) { cudaFree(d_result); d_result = nullptr; } + g_num_nodes = 0; + g_num_total_arcs = 0; +} +extern "C" void solution_compute(int num_nodes, int num_arcs, + const int* tails, const int* heads, const int* caps, + int source, int sink, + int* max_flow_out) +{ int total = 2 * num_arcs; // Build graph on host (same as CPU reference) @@ -183,19 +197,23 @@ extern "C" void solution_init(int num_nodes, int num_arcs, } free(degree); - // Upload to GPU - cudaMalloc(&d_adj_start, (num_nodes + 1) * sizeof(int)); - cudaMalloc(&d_adj_list, total * sizeof(int)); - cudaMalloc(&d_arc_head, total * sizeof(int)); - cudaMalloc(&d_arc_opposite, total * sizeof(int)); - cudaMalloc(&d_residual, total * sizeof(int)); - cudaMalloc(&d_initial_cap, total * sizeof(int)); - cudaMalloc(&d_excess, num_nodes * sizeof(long long)); - cudaMalloc(&d_height, num_nodes * sizeof(int)); - cudaMalloc(&d_first_arc, num_nodes * sizeof(int)); - cudaMalloc(&d_bfs_queue, num_nodes * sizeof(int)); - cudaMalloc(&d_in_queue, num_nodes * sizeof(int)); - cudaMalloc(&d_result, sizeof(int)); + if (g_num_nodes != num_nodes || g_num_total_arcs != total) { + solution_free(); + cudaMalloc(&d_adj_start, (num_nodes + 1) * sizeof(int)); + cudaMalloc(&d_adj_list, total * sizeof(int)); + cudaMalloc(&d_arc_head, total * sizeof(int)); + cudaMalloc(&d_arc_opposite, total * sizeof(int)); + cudaMalloc(&d_residual, total * sizeof(int)); + cudaMalloc(&d_initial_cap, total * sizeof(int)); + cudaMalloc(&d_excess, num_nodes * sizeof(long long)); + cudaMalloc(&d_height, num_nodes * sizeof(int)); + cudaMalloc(&d_first_arc, num_nodes * sizeof(int)); + cudaMalloc(&d_bfs_queue, num_nodes * sizeof(int)); + cudaMalloc(&d_in_queue, num_nodes * sizeof(int)); + cudaMalloc(&d_result, sizeof(int)); + g_num_nodes = num_nodes; + g_num_total_arcs = total; + } cudaMemcpy(d_adj_start, h_adj_start, (num_nodes + 1) * sizeof(int), cudaMemcpyHostToDevice); cudaMemcpy(d_adj_list, h_adj_list, total * sizeof(int), cudaMemcpyHostToDevice); @@ -205,25 +223,16 @@ extern "C" void solution_init(int num_nodes, int num_arcs, free(h_arc_head); free(h_arc_opp); free(h_init_cap); free(h_adj_start); free(h_adj_list); -} -extern "C" void solution_compute(int num_nodes, int* max_flow_out) -{ + g_source = source; + g_sink = sink; + push_relabel_kernel<<<1, 1>>>( - g_num_nodes, g_num_total_arcs, g_source, g_sink, + num_nodes, total, source, sink, d_adj_start, d_adj_list, d_arc_head, d_arc_opposite, d_residual, d_initial_cap, d_excess, d_height, d_first_arc, d_bfs_queue, d_in_queue, d_result); cudaMemcpy(max_flow_out, d_result, sizeof(int), cudaMemcpyDeviceToHost); -} - -extern "C" void solution_free(void) -{ - cudaFree(d_adj_start); cudaFree(d_adj_list); - cudaFree(d_arc_head); cudaFree(d_arc_opposite); - cudaFree(d_residual); cudaFree(d_initial_cap); - cudaFree(d_excess); cudaFree(d_height); - cudaFree(d_first_arc); cudaFree(d_bfs_queue); - cudaFree(d_in_queue); cudaFree(d_result); + cudaDeviceSynchronize(); } diff --git a/tasks/monte_carlo/gpu_baseline.cu b/tasks/monte_carlo/gpu_baseline.cu index 232d0a5..959a8fd 100644 --- a/tasks/monte_carlo/gpu_baseline.cu +++ b/tasks/monte_carlo/gpu_baseline.cu @@ -203,11 +203,17 @@ static float g_dt = 0.0f; static unsigned int g_baseSeed = 0; static monteCarloOptionStruct g_optionStruct; -extern "C" void solution_init(int N, int num_steps, float risk_free, float volatility, - float strike, float spot, float time_to_maturity, - unsigned int base_seed) +extern "C" void solution_free(void) +{ + if (d_samplePrices) { cudaFree(d_samplePrices); d_samplePrices = nullptr; } + g_N = 0; +} + +extern "C" void solution_compute(int N, int num_steps, float risk_free, float volatility, + float strike, float spot, float time_to_maturity, + unsigned int base_seed, + float* samplePrices) { - g_N = N; g_seqLen = num_steps; g_dt = 1.0f / (float)num_steps; g_baseSeed = base_seed; @@ -219,13 +225,12 @@ extern "C" void solution_init(int N, int num_steps, float risk_free, float volat g_optionStruct.strikeVal = strike; g_optionStruct.discountVal = expf(-risk_free * time_to_maturity); - // Allocate GPU output buffer (persistent across calls) - if (d_samplePrices) cudaFree(d_samplePrices); - cudaMalloc(&d_samplePrices, N * sizeof(float)); -} + if (g_N != N) { + solution_free(); + cudaMalloc(&d_samplePrices, N * sizeof(float)); + g_N = N; + } -extern "C" void solution_compute(int N, float* samplePrices) -{ // Launch: one thread per sample int threadsPerBlock = 256; int numBlocks = (N + threadsPerBlock - 1) / threadsPerBlock; @@ -236,9 +241,5 @@ extern "C" void solution_compute(int N, float* samplePrices) // Download results cudaMemcpy(samplePrices, d_samplePrices, N * sizeof(float), cudaMemcpyDeviceToHost); -} - -extern "C" void solution_free(void) -{ - if (d_samplePrices) { cudaFree(d_samplePrices); d_samplePrices = nullptr; } + cudaDeviceSynchronize(); } diff --git a/tasks/pdlp/gpu_baseline.cu b/tasks/pdlp/gpu_baseline.cu index cfe9130..ee0656f 100644 --- a/tasks/pdlp/gpu_baseline.cu +++ b/tasks/pdlp/gpu_baseline.cu @@ -135,36 +135,69 @@ static float* d_primal_avg = nullptr; static float* d_x_bar = nullptr; static float* d_Ax_bar = nullptr; -extern "C" void solution_init(int num_vars, - int num_constraints, - int nnz, - int num_iters, - const float* obj, - const float* var_lb, - const float* var_ub, - const float* con_lb, - const float* con_ub, - const int* col_ptrs, - const int* row_indices, - const float* values, - float step_size, - float primal_weight) +static int g_nnz = 0; + +extern "C" void solution_free(void) +{ + if (d_obj) { cudaFree(d_obj); d_obj = nullptr; } + if (d_var_lb) { cudaFree(d_var_lb); d_var_lb = nullptr; } + if (d_var_ub) { cudaFree(d_var_ub); d_var_ub = nullptr; } + if (d_con_lb) { cudaFree(d_con_lb); d_con_lb = nullptr; } + if (d_con_ub) { cudaFree(d_con_ub); d_con_ub = nullptr; } + if (d_col_ptrs) { cudaFree(d_col_ptrs); d_col_ptrs = nullptr; } + if (d_row_indices) { cudaFree(d_row_indices); d_row_indices = nullptr; } + if (d_values) { cudaFree(d_values); d_values = nullptr; } + if (d_primal) { cudaFree(d_primal); d_primal = nullptr; } + if (d_dual) { cudaFree(d_dual); d_dual = nullptr; } + if (d_dual_product) { cudaFree(d_dual_product); d_dual_product = nullptr; } + if (d_primal_avg) { cudaFree(d_primal_avg); d_primal_avg = nullptr; } + if (d_x_bar) { cudaFree(d_x_bar); d_x_bar = nullptr; } + if (d_Ax_bar) { cudaFree(d_Ax_bar); d_Ax_bar = nullptr; } + g_num_vars = 0; + g_num_constraints = 0; + g_nnz = 0; +} + +extern "C" void solution_compute(int num_vars, + int num_constraints, + int nnz, + int num_iters, + const float* obj, + const float* var_lb, + const float* var_ub, + const float* con_lb, + const float* con_ub, + const int* col_ptrs, + const int* row_indices, + const float* values, + float step_size, + float primal_weight, + float* primal_out) { - g_num_vars = num_vars; - g_num_constraints = num_constraints; g_num_iters = num_iters; g_step_size = step_size; g_primal_weight = primal_weight; - // Upload LP data - cudaMalloc(&d_obj, num_vars * sizeof(float)); - cudaMalloc(&d_var_lb, num_vars * sizeof(float)); - cudaMalloc(&d_var_ub, num_vars * sizeof(float)); - cudaMalloc(&d_con_lb, num_constraints * sizeof(float)); - cudaMalloc(&d_con_ub, num_constraints * sizeof(float)); - cudaMalloc(&d_col_ptrs, (num_vars + 1) * sizeof(int)); - cudaMalloc(&d_row_indices, nnz * sizeof(int)); - cudaMalloc(&d_values, nnz * sizeof(float)); + if (g_num_vars != num_vars || g_num_constraints != num_constraints || g_nnz != nnz) { + solution_free(); + cudaMalloc(&d_obj, num_vars * sizeof(float)); + cudaMalloc(&d_var_lb, num_vars * sizeof(float)); + cudaMalloc(&d_var_ub, num_vars * sizeof(float)); + cudaMalloc(&d_con_lb, num_constraints * sizeof(float)); + cudaMalloc(&d_con_ub, num_constraints * sizeof(float)); + cudaMalloc(&d_col_ptrs, (num_vars + 1) * sizeof(int)); + cudaMalloc(&d_row_indices, nnz * sizeof(int)); + cudaMalloc(&d_values, nnz * sizeof(float)); + cudaMalloc(&d_primal, num_vars * sizeof(float)); + cudaMalloc(&d_dual, num_constraints * sizeof(float)); + cudaMalloc(&d_dual_product, num_vars * sizeof(float)); + cudaMalloc(&d_primal_avg, num_vars * sizeof(float)); + cudaMalloc(&d_x_bar, num_vars * sizeof(float)); + cudaMalloc(&d_Ax_bar, num_constraints * sizeof(float)); + g_num_vars = num_vars; + g_num_constraints = num_constraints; + g_nnz = nnz; + } cudaMemcpy(d_obj, obj, num_vars * sizeof(float), cudaMemcpyHostToDevice); cudaMemcpy(d_var_lb, var_lb, num_vars * sizeof(float), cudaMemcpyHostToDevice); @@ -175,18 +208,6 @@ extern "C" void solution_init(int num_vars, cudaMemcpy(d_row_indices, row_indices, nnz * sizeof(int), cudaMemcpyHostToDevice); cudaMemcpy(d_values, values, nnz * sizeof(float), cudaMemcpyHostToDevice); - // Allocate working buffers - cudaMalloc(&d_primal, num_vars * sizeof(float)); - cudaMalloc(&d_dual, num_constraints * sizeof(float)); - cudaMalloc(&d_dual_product, num_vars * sizeof(float)); - cudaMalloc(&d_primal_avg, num_vars * sizeof(float)); - cudaMalloc(&d_x_bar, num_vars * sizeof(float)); - cudaMalloc(&d_Ax_bar, num_constraints * sizeof(float)); -} - -extern "C" void solution_compute(int num_vars, int num_constraints, - float* primal_out) -{ int bv = 256; int gv = (num_vars + bv - 1) / bv; int gc = (num_constraints + bv - 1) / bv; @@ -198,10 +219,10 @@ extern "C" void solution_compute(int num_vars, int num_constraints, cudaMemset(d_primal_avg, 0, num_vars * sizeof(float)); float avg_weight_sum = 0.0f; - float primal_step_size = g_step_size / g_primal_weight; - float dual_step_size = g_step_size * g_primal_weight; + float primal_step_size = step_size / primal_weight; + float dual_step_size = step_size * primal_weight; - for (int iter = 0; iter < g_num_iters; iter++) { + for (int iter = 0; iter < num_iters; iter++) { // 1. Primal update + extrapolation kernel_primal_update<<>>(num_vars, primal_step_size, d_obj, d_dual_product, d_var_lb, d_var_ub, d_primal, d_x_bar); @@ -220,29 +241,12 @@ extern "C" void solution_compute(int num_vars, int num_constraints, d_values, d_dual, d_dual_product); // 5. Weighted average update - float ratio = g_step_size / (avg_weight_sum + g_step_size); + float ratio = step_size / (avg_weight_sum + step_size); kernel_avg_update<<>>(num_vars, ratio, d_primal, d_primal_avg); - avg_weight_sum += g_step_size; + avg_weight_sum += step_size; } cudaMemcpy(primal_out, d_primal_avg, num_vars * sizeof(float), cudaMemcpyDeviceToHost); -} - -extern "C" void solution_free(void) -{ - cudaFree(d_obj); d_obj = nullptr; - cudaFree(d_var_lb); d_var_lb = nullptr; - cudaFree(d_var_ub); d_var_ub = nullptr; - cudaFree(d_con_lb); d_con_lb = nullptr; - cudaFree(d_con_ub); d_con_ub = nullptr; - cudaFree(d_col_ptrs); d_col_ptrs = nullptr; - cudaFree(d_row_indices); d_row_indices = nullptr; - cudaFree(d_values); d_values = nullptr; - cudaFree(d_primal); d_primal = nullptr; - cudaFree(d_dual); d_dual = nullptr; - cudaFree(d_dual_product); d_dual_product = nullptr; - cudaFree(d_primal_avg); d_primal_avg = nullptr; - cudaFree(d_x_bar); d_x_bar = nullptr; - cudaFree(d_Ax_bar); d_Ax_bar = nullptr; + cudaDeviceSynchronize(); } diff --git a/tasks/repo_pricing/gpu_baseline.cu b/tasks/repo_pricing/gpu_baseline.cu index ed84bb9..540de3d 100644 --- a/tasks/repo_pricing/gpu_baseline.cu +++ b/tasks/repo_pricing/gpu_baseline.cu @@ -1153,40 +1153,69 @@ static dataType* d_bondForwardVal = NULL; extern "C" { -void solution_init(int N, - const int* settle_year, const int* settle_month, const int* settle_day, - const int* delivery_year, const int* delivery_month, const int* delivery_day, - const int* issue_year, const int* issue_month, const int* issue_day, - const int* maturity_year, const int* maturity_month, const int* maturity_day, - const float* bond_rates, const float* repo_rates, - const float* bond_clean_prices, const float* dummy_strikes) +void solution_free(void) +{ + if (d_discountCurve) { cudaFree(d_discountCurve); d_discountCurve = NULL; } + if (d_repoCurve) { cudaFree(d_repoCurve); d_repoCurve = NULL; } + if (d_settlementDate) { cudaFree(d_settlementDate); d_settlementDate = NULL; } + if (d_deliveryDate) { cudaFree(d_deliveryDate); d_deliveryDate = NULL; } + if (d_maturityDate) { cudaFree(d_maturityDate); d_maturityDate = NULL; } + if (d_repoDeliveryDate) { cudaFree(d_repoDeliveryDate); d_repoDeliveryDate = NULL; } + if (d_bondCleanPrice) { cudaFree(d_bondCleanPrice); d_bondCleanPrice = NULL; } + if (d_bond) { cudaFree(d_bond); d_bond = NULL; } + if (d_dummyStrike) { cudaFree(d_dummyStrike); d_dummyStrike = NULL; } + if (d_dirtyPrice) { cudaFree(d_dirtyPrice); d_dirtyPrice = NULL; } + if (d_accruedAmountSettlement) { cudaFree(d_accruedAmountSettlement); d_accruedAmountSettlement = NULL; } + if (d_accruedAmountDeliveryDate) { cudaFree(d_accruedAmountDeliveryDate); d_accruedAmountDeliveryDate = NULL; } + if (d_cleanPrice) { cudaFree(d_cleanPrice); d_cleanPrice = NULL; } + if (d_forwardSpotIncome) { cudaFree(d_forwardSpotIncome); d_forwardSpotIncome = NULL; } + if (d_underlyingBondFwd) { cudaFree(d_underlyingBondFwd); d_underlyingBondFwd = NULL; } + if (d_repoNpv) { cudaFree(d_repoNpv); d_repoNpv = NULL; } + if (d_repoCleanForwardPrice) { cudaFree(d_repoCleanForwardPrice); d_repoCleanForwardPrice = NULL; } + if (d_repoDirtyForwardPrice) { cudaFree(d_repoDirtyForwardPrice); d_repoDirtyForwardPrice = NULL; } + if (d_repoImpliedYield) { cudaFree(d_repoImpliedYield); d_repoImpliedYield = NULL; } + if (d_marketRepoRate) { cudaFree(d_marketRepoRate); d_marketRepoRate = NULL; } + if (d_bondForwardVal) { cudaFree(d_bondForwardVal); d_bondForwardVal = NULL; } + g_N = 0; +} + +void solution_compute(int N, + const int* settle_year, const int* settle_month, const int* settle_day, + const int* delivery_year, const int* delivery_month, const int* delivery_day, + const int* issue_year, const int* issue_month, const int* issue_day, + const int* maturity_year, const int* maturity_month, const int* maturity_day, + const float* bond_rates, const float* repo_rates, + const float* bond_clean_prices, const float* dummy_strikes, + float* prices) { - g_N = N; - - // Allocate device memory for inArgs - cudaMalloc(&d_discountCurve, N * sizeof(repoYieldTermStruct)); - cudaMalloc(&d_repoCurve, N * sizeof(repoYieldTermStruct)); - cudaMalloc(&d_settlementDate, N * sizeof(repoDateStruct)); - cudaMalloc(&d_deliveryDate, N * sizeof(repoDateStruct)); - cudaMalloc(&d_maturityDate, N * sizeof(repoDateStruct)); - cudaMalloc(&d_repoDeliveryDate, N * sizeof(repoDateStruct)); - cudaMalloc(&d_bondCleanPrice, N * sizeof(dataType)); - cudaMalloc(&d_bond, N * sizeof(bondStruct)); - cudaMalloc(&d_dummyStrike, N * sizeof(dataType)); - - // Allocate device memory for results - cudaMalloc(&d_dirtyPrice, N * sizeof(dataType)); - cudaMalloc(&d_accruedAmountSettlement, N * sizeof(dataType)); - cudaMalloc(&d_accruedAmountDeliveryDate, N * sizeof(dataType)); - cudaMalloc(&d_cleanPrice, N * sizeof(dataType)); - cudaMalloc(&d_forwardSpotIncome, N * sizeof(dataType)); - cudaMalloc(&d_underlyingBondFwd, N * sizeof(dataType)); - cudaMalloc(&d_repoNpv, N * sizeof(dataType)); - cudaMalloc(&d_repoCleanForwardPrice, N * sizeof(dataType)); - cudaMalloc(&d_repoDirtyForwardPrice, N * sizeof(dataType)); - cudaMalloc(&d_repoImpliedYield, N * sizeof(dataType)); - cudaMalloc(&d_marketRepoRate, N * sizeof(dataType)); - cudaMalloc(&d_bondForwardVal, N * sizeof(dataType)); + if (g_N != N) { + solution_free(); + // Allocate device memory for inArgs + cudaMalloc(&d_discountCurve, N * sizeof(repoYieldTermStruct)); + cudaMalloc(&d_repoCurve, N * sizeof(repoYieldTermStruct)); + cudaMalloc(&d_settlementDate, N * sizeof(repoDateStruct)); + cudaMalloc(&d_deliveryDate, N * sizeof(repoDateStruct)); + cudaMalloc(&d_maturityDate, N * sizeof(repoDateStruct)); + cudaMalloc(&d_repoDeliveryDate, N * sizeof(repoDateStruct)); + cudaMalloc(&d_bondCleanPrice, N * sizeof(dataType)); + cudaMalloc(&d_bond, N * sizeof(bondStruct)); + cudaMalloc(&d_dummyStrike, N * sizeof(dataType)); + + // Allocate device memory for results + cudaMalloc(&d_dirtyPrice, N * sizeof(dataType)); + cudaMalloc(&d_accruedAmountSettlement, N * sizeof(dataType)); + cudaMalloc(&d_accruedAmountDeliveryDate, N * sizeof(dataType)); + cudaMalloc(&d_cleanPrice, N * sizeof(dataType)); + cudaMalloc(&d_forwardSpotIncome, N * sizeof(dataType)); + cudaMalloc(&d_underlyingBondFwd, N * sizeof(dataType)); + cudaMalloc(&d_repoNpv, N * sizeof(dataType)); + cudaMalloc(&d_repoCleanForwardPrice, N * sizeof(dataType)); + cudaMalloc(&d_repoDirtyForwardPrice, N * sizeof(dataType)); + cudaMalloc(&d_repoImpliedYield, N * sizeof(dataType)); + cudaMalloc(&d_marketRepoRate, N * sizeof(dataType)); + cudaMalloc(&d_bondForwardVal, N * sizeof(dataType)); + g_N = N; + } // Build host inArgs (matching repoEngine.c / cpu_reference.c setup) repoYieldTermStruct* h_discountCurve = (repoYieldTermStruct*)malloc(N * sizeof(repoYieldTermStruct)); @@ -1260,10 +1289,7 @@ void solution_init(int N, free(h_bondCleanPrice); free(h_bond); free(h_dummyStrike); -} -void solution_compute(int N, float* prices) -{ // Build device structs inArgsStruct inArgs; inArgs.discountCurve = d_discountCurve; @@ -1322,54 +1348,8 @@ void solution_compute(int N, float* prices) for (int r = 0; r < 12; r++) free(h_res[r]); -} -void solution_free(void) -{ - cudaFree(d_discountCurve); - cudaFree(d_repoCurve); - cudaFree(d_settlementDate); - cudaFree(d_deliveryDate); - cudaFree(d_maturityDate); - cudaFree(d_repoDeliveryDate); - cudaFree(d_bondCleanPrice); - cudaFree(d_bond); - cudaFree(d_dummyStrike); - - cudaFree(d_dirtyPrice); - cudaFree(d_accruedAmountSettlement); - cudaFree(d_accruedAmountDeliveryDate); - cudaFree(d_cleanPrice); - cudaFree(d_forwardSpotIncome); - cudaFree(d_underlyingBondFwd); - cudaFree(d_repoNpv); - cudaFree(d_repoCleanForwardPrice); - cudaFree(d_repoDirtyForwardPrice); - cudaFree(d_repoImpliedYield); - cudaFree(d_marketRepoRate); - cudaFree(d_bondForwardVal); - - d_discountCurve = NULL; - d_repoCurve = NULL; - d_settlementDate = NULL; - d_deliveryDate = NULL; - d_maturityDate = NULL; - d_repoDeliveryDate = NULL; - d_bondCleanPrice = NULL; - d_bond = NULL; - d_dummyStrike = NULL; - d_dirtyPrice = NULL; - d_accruedAmountSettlement = NULL; - d_accruedAmountDeliveryDate = NULL; - d_cleanPrice = NULL; - d_forwardSpotIncome = NULL; - d_underlyingBondFwd = NULL; - d_repoNpv = NULL; - d_repoCleanForwardPrice = NULL; - d_repoDirtyForwardPrice = NULL; - d_repoImpliedYield = NULL; - d_marketRepoRate = NULL; - d_bondForwardVal = NULL; + cudaDeviceSynchronize(); } } // extern "C" diff --git a/tasks/sph_cell_index/gpu_baseline.cu b/tasks/sph_cell_index/gpu_baseline.cu index 4d1b71d..3853cb7 100644 --- a/tasks/sph_cell_index/gpu_baseline.cu +++ b/tasks/sph_cell_index/gpu_baseline.cu @@ -97,46 +97,61 @@ __global__ void KerCalcBeginEndCell(unsigned int n, extern "C" { -void solution_init(int N, - const float* xs, const float* ys, const float* zs, - float cell_size, int grid_nx, int grid_ny, int grid_nz) +static int g_num_cells = 0; + +void solution_free(void) { - g_N = N; - g_cell_size = cell_size; - g_grid_nx = grid_nx; - g_grid_ny = grid_ny; - g_grid_nz = grid_nz; + if (d_xs) { cudaFree(d_xs); d_xs = nullptr; } + if (d_ys) { cudaFree(d_ys); d_ys = nullptr; } + if (d_zs) { cudaFree(d_zs); d_zs = nullptr; } + if (d_cell_ids) { cudaFree(d_cell_ids); d_cell_ids = nullptr; } + if (d_indices) { cudaFree(d_indices); d_indices = nullptr; } + if (d_cell_begin) { cudaFree(d_cell_begin); d_cell_begin = nullptr; } + if (d_cell_end) { cudaFree(d_cell_end); d_cell_end = nullptr; } + g_N = 0; + g_num_cells = 0; +} +void solution_compute(int N, + const float* xs, const float* ys, const float* zs, + float cell_size, + int grid_nx, int grid_ny, int grid_nz, + int num_cells, + int* sorted_indices, int* cell_begin, int* cell_end) +{ size_t sz_f = (size_t)N * sizeof(float); size_t sz_i = (size_t)N * sizeof(int); - int num_cells = grid_nx * grid_ny * grid_nz; size_t sz_cells = (size_t)num_cells * sizeof(int); - // Allocate and copy input arrays to device - cudaMalloc(&d_xs, sz_f); - cudaMalloc(&d_ys, sz_f); - cudaMalloc(&d_zs, sz_f); + if (g_N != N || g_num_cells != num_cells) { + solution_free(); + cudaMalloc(&d_xs, sz_f); + cudaMalloc(&d_ys, sz_f); + cudaMalloc(&d_zs, sz_f); + cudaMalloc(&d_cell_ids, sz_i); + cudaMalloc(&d_indices, sz_i); + cudaMalloc(&d_cell_begin, sz_cells); + cudaMalloc(&d_cell_end, sz_cells); + g_N = N; + g_num_cells = num_cells; + } + + g_cell_size = cell_size; + g_grid_nx = grid_nx; + g_grid_ny = grid_ny; + g_grid_nz = grid_nz; + cudaMemcpy(d_xs, xs, sz_f, cudaMemcpyHostToDevice); cudaMemcpy(d_ys, ys, sz_f, cudaMemcpyHostToDevice); cudaMemcpy(d_zs, zs, sz_f, cudaMemcpyHostToDevice); - // Allocate intermediate and output arrays on device - cudaMalloc(&d_cell_ids, sz_i); - cudaMalloc(&d_indices, sz_i); - cudaMalloc(&d_cell_begin, sz_cells); - cudaMalloc(&d_cell_end, sz_cells); -} - -void solution_compute(int N, int num_cells, - int* sorted_indices, int* cell_begin, int* cell_end) -{ dim3 block(BLOCK_SIZE); dim3 grid((N + BLOCK_SIZE - 1) / BLOCK_SIZE); // Step 1: Compute cell_id for each particle KerComputeCellIndex<<>>( N, d_xs, d_ys, d_zs, - g_cell_size, g_grid_nx, g_grid_ny, + cell_size, grid_nx, grid_ny, d_cell_ids, d_indices); // Step 2: Sort by cell_id using thrust::sort_by_key @@ -158,17 +173,7 @@ void solution_compute(int N, int num_cells, cudaMemcpy(sorted_indices, d_indices, (size_t)N * sizeof(int), cudaMemcpyDeviceToHost); cudaMemcpy(cell_begin, d_cell_begin, (size_t)num_cells * sizeof(int), cudaMemcpyDeviceToHost); cudaMemcpy(cell_end, d_cell_end, (size_t)num_cells * sizeof(int), cudaMemcpyDeviceToHost); -} - -void solution_free(void) -{ - cudaFree(d_xs); - cudaFree(d_ys); - cudaFree(d_zs); - cudaFree(d_cell_ids); - cudaFree(d_indices); - cudaFree(d_cell_begin); - cudaFree(d_cell_end); + cudaDeviceSynchronize(); } } // extern "C" diff --git a/tasks/sph_forces/gpu_baseline.cu b/tasks/sph_forces/gpu_baseline.cu index aa86aef..68a4afd 100644 --- a/tasks/sph_forces/gpu_baseline.cu +++ b/tasks/sph_forces/gpu_baseline.cu @@ -231,25 +231,40 @@ __global__ void InteractionForcesFluidKernel( extern "C" { -void solution_init(int N, - const float* xs, const float* ys, const float* zs, - const float* vxs, const float* vys, const float* vzs, - const float* rhos, const float* masses, - float h, float cs0, float rhop0, float alpha_visc, - const int* cell_begin, const int* cell_end, - const int* sorted_idx, - int grid_nx, int grid_ny, int grid_nz, - float cell_size) +static int g_num_cells = 0; + +void solution_free(void) { - g_N = N; - g_grid_nx = grid_nx; - g_grid_ny = grid_ny; - g_grid_nz = grid_nz; - g_cell_size = cell_size; - g_h = h; - g_cs0 = cs0; - g_alpha_visc = alpha_visc; + if (d_xs) { cudaFree(d_xs); d_xs = nullptr; } + if (d_ys) { cudaFree(d_ys); d_ys = nullptr; } + if (d_zs) { cudaFree(d_zs); d_zs = nullptr; } + if (d_vxs) { cudaFree(d_vxs); d_vxs = nullptr; } + if (d_vys) { cudaFree(d_vys); d_vys = nullptr; } + if (d_vzs) { cudaFree(d_vzs); d_vzs = nullptr; } + if (d_rhos) { cudaFree(d_rhos); d_rhos = nullptr; } + if (d_masses) { cudaFree(d_masses); d_masses = nullptr; } + if (d_cell_begin) { cudaFree(d_cell_begin); d_cell_begin = nullptr; } + if (d_cell_end) { cudaFree(d_cell_end); d_cell_end = nullptr; } + if (d_sorted_idx) { cudaFree(d_sorted_idx); d_sorted_idx = nullptr; } + if (d_ax) { cudaFree(d_ax); d_ax = nullptr; } + if (d_ay) { cudaFree(d_ay); d_ay = nullptr; } + if (d_az) { cudaFree(d_az); d_az = nullptr; } + if (d_drhodt) { cudaFree(d_drhodt); d_drhodt = nullptr; } + g_N = 0; + g_num_cells = 0; +} +void solution_compute(int N, + const float* xs, const float* ys, const float* zs, + const float* vxs, const float* vys, const float* vzs, + const float* rhos, const float* masses, + float h, float cs0, float rhop0, float alpha_visc, + const int* cell_begin, const int* cell_end, + const int* sorted_idx, + int grid_nx, int grid_ny, int grid_nz, + float cell_size, + float* ax, float* ay, float* az, float* drhodt) +{ int num_cells = grid_nx * grid_ny * grid_nz; // Precompute Wendland kernel constants (3D) @@ -259,22 +274,38 @@ void solution_init(int N, g_cteb = cs0 * cs0 * rhop0 / SPH_GAMMA; g_ovrhopzero = 1.0f / rhop0; + g_grid_nx = grid_nx; + g_grid_ny = grid_ny; + g_grid_nz = grid_nz; + g_cell_size = cell_size; + g_h = h; + g_cs0 = cs0; + g_alpha_visc = alpha_visc; + size_t sz_f = (size_t)N * sizeof(float); size_t sz_i_n = (size_t)N * sizeof(int); size_t sz_i_c = (size_t)num_cells * sizeof(int); - // Allocate and copy input arrays to device - cudaMalloc(&d_xs, sz_f); - cudaMalloc(&d_ys, sz_f); - cudaMalloc(&d_zs, sz_f); - cudaMalloc(&d_vxs, sz_f); - cudaMalloc(&d_vys, sz_f); - cudaMalloc(&d_vzs, sz_f); - cudaMalloc(&d_rhos, sz_f); - cudaMalloc(&d_masses, sz_f); - cudaMalloc(&d_cell_begin, sz_i_c); - cudaMalloc(&d_cell_end, sz_i_c); - cudaMalloc(&d_sorted_idx, sz_i_n); + if (g_N != N || g_num_cells != num_cells) { + solution_free(); + cudaMalloc(&d_xs, sz_f); + cudaMalloc(&d_ys, sz_f); + cudaMalloc(&d_zs, sz_f); + cudaMalloc(&d_vxs, sz_f); + cudaMalloc(&d_vys, sz_f); + cudaMalloc(&d_vzs, sz_f); + cudaMalloc(&d_rhos, sz_f); + cudaMalloc(&d_masses, sz_f); + cudaMalloc(&d_cell_begin, sz_i_c); + cudaMalloc(&d_cell_end, sz_i_c); + cudaMalloc(&d_sorted_idx, sz_i_n); + cudaMalloc(&d_ax, sz_f); + cudaMalloc(&d_ay, sz_f); + cudaMalloc(&d_az, sz_f); + cudaMalloc(&d_drhodt, sz_f); + g_N = N; + g_num_cells = num_cells; + } cudaMemcpy(d_xs, xs, sz_f, cudaMemcpyHostToDevice); cudaMemcpy(d_ys, ys, sz_f, cudaMemcpyHostToDevice); @@ -288,16 +319,6 @@ void solution_init(int N, cudaMemcpy(d_cell_end, cell_end, sz_i_c, cudaMemcpyHostToDevice); cudaMemcpy(d_sorted_idx, sorted_idx, sz_i_n, cudaMemcpyHostToDevice); - // Allocate output arrays on device - cudaMalloc(&d_ax, sz_f); - cudaMalloc(&d_ay, sz_f); - cudaMalloc(&d_az, sz_f); - cudaMalloc(&d_drhodt, sz_f); -} - -void solution_compute(int N, - float* ax, float* ay, float* az, float* drhodt) -{ dim3 block(BLOCK_SIZE); dim3 grid((N + BLOCK_SIZE - 1) / BLOCK_SIZE); @@ -307,37 +328,17 @@ void solution_compute(int N, d_vxs, d_vys, d_vzs, d_rhos, d_masses, d_cell_begin, d_cell_end, d_sorted_idx, - g_grid_nx, g_grid_ny, g_grid_nz, - g_cell_size, - g_h, g_bwenh, g_kernelsize2, - g_cs0, g_alpha_visc, g_ovrhopzero, g_cteb, + grid_nx, grid_ny, grid_nz, + cell_size, + h, g_bwenh, g_kernelsize2, + cs0, alpha_visc, g_ovrhopzero, g_cteb, d_ax, d_ay, d_az, d_drhodt); - // Copy results back to host - size_t sz_f = (size_t)N * sizeof(float); cudaMemcpy(ax, d_ax, sz_f, cudaMemcpyDeviceToHost); cudaMemcpy(ay, d_ay, sz_f, cudaMemcpyDeviceToHost); cudaMemcpy(az, d_az, sz_f, cudaMemcpyDeviceToHost); cudaMemcpy(drhodt, d_drhodt, sz_f, cudaMemcpyDeviceToHost); -} - -void solution_free(void) -{ - cudaFree(d_xs); - cudaFree(d_ys); - cudaFree(d_zs); - cudaFree(d_vxs); - cudaFree(d_vys); - cudaFree(d_vzs); - cudaFree(d_rhos); - cudaFree(d_masses); - cudaFree(d_cell_begin); - cudaFree(d_cell_end); - cudaFree(d_sorted_idx); - cudaFree(d_ax); - cudaFree(d_ay); - cudaFree(d_az); - cudaFree(d_drhodt); + cudaDeviceSynchronize(); } } // extern "C" diff --git a/tasks/sph_position/gpu_baseline.cu b/tasks/sph_position/gpu_baseline.cu index 8c0fddb..e26d805 100644 --- a/tasks/sph_position/gpu_baseline.cu +++ b/tasks/sph_position/gpu_baseline.cu @@ -70,27 +70,50 @@ __global__ void KerComputeStepPos(int N, extern "C" { -void solution_init(int N, - const float* posxy_x, const float* posxy_y, - const float* posz, - const float* movxy_x, const float* movxy_y, - const float* movz, - float cell_size) +void solution_free(void) { - g_N = N; - g_cell_size = cell_size; + if (d_pos_x) { cudaFree(d_pos_x); d_pos_x = nullptr; } + if (d_pos_y) { cudaFree(d_pos_y); d_pos_y = nullptr; } + if (d_pos_z) { cudaFree(d_pos_z); d_pos_z = nullptr; } + if (d_mov_x) { cudaFree(d_mov_x); d_mov_x = nullptr; } + if (d_mov_y) { cudaFree(d_mov_y); d_mov_y = nullptr; } + if (d_mov_z) { cudaFree(d_mov_z); d_mov_z = nullptr; } + if (d_out_x) { cudaFree(d_out_x); d_out_x = nullptr; } + if (d_out_y) { cudaFree(d_out_y); d_out_y = nullptr; } + if (d_out_z) { cudaFree(d_out_z); d_out_z = nullptr; } + if (d_out_cell) { cudaFree(d_out_cell); d_out_cell = nullptr; } + g_N = 0; +} +void solution_compute(int N, + const float* posxy_x, const float* posxy_y, + const float* posz, + const float* movxy_x, const float* movxy_y, + const float* movz, + float cell_size, + double* out_x, double* out_y, double* out_z, + int* out_cell) +{ size_t sz_f = (size_t)N * sizeof(float); size_t sz_d = (size_t)N * sizeof(double); size_t sz_i = (size_t)N * sizeof(int); - // Allocate and copy input arrays to device - cudaMalloc(&d_pos_x, sz_f); - cudaMalloc(&d_pos_y, sz_f); - cudaMalloc(&d_pos_z, sz_f); - cudaMalloc(&d_mov_x, sz_f); - cudaMalloc(&d_mov_y, sz_f); - cudaMalloc(&d_mov_z, sz_f); + if (g_N != N) { + solution_free(); + cudaMalloc(&d_pos_x, sz_f); + cudaMalloc(&d_pos_y, sz_f); + cudaMalloc(&d_pos_z, sz_f); + cudaMalloc(&d_mov_x, sz_f); + cudaMalloc(&d_mov_y, sz_f); + cudaMalloc(&d_mov_z, sz_f); + cudaMalloc(&d_out_x, sz_d); + cudaMalloc(&d_out_y, sz_d); + cudaMalloc(&d_out_z, sz_d); + cudaMalloc(&d_out_cell, sz_i); + g_N = N; + } + + g_cell_size = cell_size; cudaMemcpy(d_pos_x, posxy_x, sz_f, cudaMemcpyHostToDevice); cudaMemcpy(d_pos_y, posxy_y, sz_f, cudaMemcpyHostToDevice); @@ -99,17 +122,6 @@ void solution_init(int N, cudaMemcpy(d_mov_y, movxy_y, sz_f, cudaMemcpyHostToDevice); cudaMemcpy(d_mov_z, movz, sz_f, cudaMemcpyHostToDevice); - // Allocate output arrays on device - cudaMalloc(&d_out_x, sz_d); - cudaMalloc(&d_out_y, sz_d); - cudaMalloc(&d_out_z, sz_d); - cudaMalloc(&d_out_cell, sz_i); -} - -void solution_compute(int N, - double* out_x, double* out_y, double* out_z, - int* out_cell) -{ dim3 block(BLOCK_SIZE); dim3 grid((N + BLOCK_SIZE - 1) / BLOCK_SIZE); @@ -117,30 +129,15 @@ void solution_compute(int N, N, d_pos_x, d_pos_y, d_pos_z, d_mov_x, d_mov_y, d_mov_z, - g_cell_size, + cell_size, d_out_x, d_out_y, d_out_z, d_out_cell); // Copy results back to host - size_t sz_d = (size_t)N * sizeof(double); - size_t sz_i = (size_t)N * sizeof(int); cudaMemcpy(out_x, d_out_x, sz_d, cudaMemcpyDeviceToHost); cudaMemcpy(out_y, d_out_y, sz_d, cudaMemcpyDeviceToHost); cudaMemcpy(out_z, d_out_z, sz_d, cudaMemcpyDeviceToHost); cudaMemcpy(out_cell, d_out_cell, sz_i, cudaMemcpyDeviceToHost); -} - -void solution_free(void) -{ - cudaFree(d_pos_x); - cudaFree(d_pos_y); - cudaFree(d_pos_z); - cudaFree(d_mov_x); - cudaFree(d_mov_y); - cudaFree(d_mov_z); - cudaFree(d_out_x); - cudaFree(d_out_y); - cudaFree(d_out_z); - cudaFree(d_out_cell); + cudaDeviceSynchronize(); } } // extern "C" diff --git a/tasks/spmv_csr/gpu_baseline.cu b/tasks/spmv_csr/gpu_baseline.cu index 9498216..19867fa 100644 --- a/tasks/spmv_csr/gpu_baseline.cu +++ b/tasks/spmv_csr/gpu_baseline.cu @@ -46,6 +46,7 @@ __global__ void TransposedMatrixVectorProductKernel( } // ===== Persistent device state ===== +static int g_num_rows = 0; static int g_num_cols = 0; static int g_nnz = 0; static int* d_col_ptrs = nullptr; @@ -54,30 +55,45 @@ static float* d_values = nullptr; static float* d_vector = nullptr; static float* d_answer = nullptr; -extern "C" void solution_init(int num_rows, - int num_cols, - const int* col_ptrs, - const int* row_indices, - const float* values, - const float* vector) +extern "C" void solution_free(void) { - g_num_cols = num_cols; - g_nnz = col_ptrs[num_cols]; + if (d_col_ptrs) { cudaFree(d_col_ptrs); d_col_ptrs = nullptr; } + if (d_row_indices) { cudaFree(d_row_indices); d_row_indices = nullptr; } + if (d_values) { cudaFree(d_values); d_values = nullptr; } + if (d_vector) { cudaFree(d_vector); d_vector = nullptr; } + if (d_answer) { cudaFree(d_answer); d_answer = nullptr; } + g_num_rows = 0; + g_num_cols = 0; + g_nnz = 0; +} - cudaMalloc(&d_col_ptrs, (num_cols + 1) * sizeof(int)); - cudaMalloc(&d_row_indices, g_nnz * sizeof(int)); - cudaMalloc(&d_values, g_nnz * sizeof(float)); - cudaMalloc(&d_vector, num_rows * sizeof(float)); - cudaMalloc(&d_answer, num_cols * sizeof(float)); +extern "C" void solution_compute(int num_rows, + int num_cols, + const int* col_ptrs, + const int* row_indices, + const float* values, + const float* vector, + float* answer) +{ + int nnz = col_ptrs[num_cols]; + + if (g_num_rows != num_rows || g_num_cols != num_cols || g_nnz != nnz) { + solution_free(); + cudaMalloc(&d_col_ptrs, (num_cols + 1) * sizeof(int)); + cudaMalloc(&d_row_indices, nnz * sizeof(int)); + cudaMalloc(&d_values, nnz * sizeof(float)); + cudaMalloc(&d_vector, num_rows * sizeof(float)); + cudaMalloc(&d_answer, num_cols * sizeof(float)); + g_num_rows = num_rows; + g_num_cols = num_cols; + g_nnz = nnz; + } cudaMemcpy(d_col_ptrs, col_ptrs, (num_cols + 1) * sizeof(int), cudaMemcpyHostToDevice); - cudaMemcpy(d_row_indices, row_indices, g_nnz * sizeof(int), cudaMemcpyHostToDevice); - cudaMemcpy(d_values, values, g_nnz * sizeof(float), cudaMemcpyHostToDevice); + cudaMemcpy(d_row_indices, row_indices, nnz * sizeof(int), cudaMemcpyHostToDevice); + cudaMemcpy(d_values, values, nnz * sizeof(float), cudaMemcpyHostToDevice); cudaMemcpy(d_vector, vector, num_rows * sizeof(float), cudaMemcpyHostToDevice); -} -extern "C" void solution_compute(int num_cols, float* answer) -{ int threadsPerBlock = 256; int blocks = (num_cols + threadsPerBlock - 1) / threadsPerBlock; @@ -85,13 +101,5 @@ extern "C" void solution_compute(int num_cols, float* answer) num_cols, d_col_ptrs, d_row_indices, d_values, d_vector, d_answer); cudaMemcpy(answer, d_answer, num_cols * sizeof(float), cudaMemcpyDeviceToHost); -} - -extern "C" void solution_free(void) -{ - if (d_col_ptrs) { cudaFree(d_col_ptrs); d_col_ptrs = nullptr; } - if (d_row_indices) { cudaFree(d_row_indices); d_row_indices = nullptr; } - if (d_values) { cudaFree(d_values); d_values = nullptr; } - if (d_vector) { cudaFree(d_vector); d_vector = nullptr; } - if (d_answer) { cudaFree(d_answer); d_answer = nullptr; } + cudaDeviceSynchronize(); }