Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions cuda/include/poly/eval_at_point.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,15 @@
#define POLY_EVAL_AT_POINT_H

#include "../fields.cuh"
#include "../utils.cuh"

extern "C"
qm31 eval_at_point(m31 *coeffs, int coeffs_size, qm31 point_x, qm31 point_y);

extern "C"
void evaluate_polynomials_out_of_domain(
qm31 **result, m31 **polynomials, int *log_polynomial_sizes, int number_of_polynomials,
qm31 **out_of_domain_points_x, qm31 **out_of_domain_points_y, int *sample_sizes
);

#endif // POLY_EVAL_AT_POINT_H
209 changes: 208 additions & 1 deletion cuda/src/poly/eval_at_point.cu
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,6 @@ __global__ void eval_at_point_first_pass(m31 *g_coeffs, qm31 *temp, qm31 *factor
}
factor_idx -= 1;
level_size >>= 1;

}

if (idx == 0) {
Expand Down Expand Up @@ -154,3 +153,211 @@ qm31 eval_at_point(m31 *coeffs, int coeffs_size, qm31 point_x, qm31 point_y) {
return result;
}

/* Many polynomials */


__global__ void eval_many_at_point_first_pass(m31 **g_coeffs, qm31 *temp, qm31 *factors, int coeffs_size, int factors_size,
int output_offset) {
int idx = threadIdx.x;

qm31 *output = &temp[output_offset];

int coeffs_per_block = 2 * blockDim.x;
int blocks_in_poly = max(1, coeffs_size / coeffs_per_block);
// Thread syncing happens within a block.
// Split the problem to feed them to multiple blocks.
if (coeffs_size >= coeffs_per_block) {
coeffs_size = coeffs_per_block;
}

extern __shared__ m31 s_coeffs[];
extern __shared__ qm31 s_level[];

int poly_index = blockIdx.x / blocks_in_poly;

// A % X == A & (X-1) when X is a power of two
s_coeffs[idx] = g_coeffs[poly_index][(blockIdx.x & (blocks_in_poly - 1)) * coeffs_size + idx];
s_coeffs[idx + blockDim.x] = g_coeffs[poly_index][(blockIdx.x & (blocks_in_poly - 1)) * coeffs_size + idx + blockDim.x];
__syncthreads();

int level_size = coeffs_size >> 1;
int factor_idx = factors_size - 1;

if (idx < level_size) {
m31 alpha = s_coeffs[2 * idx];
m31 beta = s_coeffs[2 * idx + 1];
qm31 factor = factors[factor_idx];

qm31 result = {
{add(mul(beta, factor.a.a), alpha), mul(factor.a.b, beta)},
{mul(beta, factor.b.a), mul(beta, factor.b.b)}
};
s_level[idx] = result;
}
factor_idx -= 1;
level_size >>= 1;

while (level_size > 0) {
if (idx < level_size) {
__syncthreads();
qm31 a = s_level[2 * idx];
qm31 b = s_level[2 * idx + 1];
__syncthreads();
s_level[idx] = add(a, mul(b, factors[factor_idx]));
}
factor_idx -= 1;
level_size >>= 1;
}

if (idx == 0) {
output[blockIdx.x] = s_level[0];
}
}

__global__
void eval_many_at_point_second_pass(qm31 *temp, qm31 *factors, int level_size, int factor_offset, int level_offset,
int output_offset, int results_per_block) {
int idx = threadIdx.x;

qm31 *level = &temp[level_offset];
qm31 *output = &temp[output_offset];

// Thread syncing happens within a block.
// Split the problem to feed them to multiple blocks.
if (level_size >= 2 * blockDim.x) {
level_size = 2 * blockDim.x;
}

extern __shared__ qm31 s_level[];

s_level[idx] = level[2 * blockIdx.x * blockDim.x + idx];
s_level[idx + blockDim.x] = level[2 * blockIdx.x * blockDim.x + idx + blockDim.x];

level_size >>= 1;

int factor_idx = factor_offset;

while (level_size >= results_per_block) {
if (idx < level_size) {
__syncthreads();
qm31 a = s_level[2 * idx];
qm31 b = s_level[2 * idx + 1];
__syncthreads();
s_level[idx] = add(a, mul(b, factors[factor_idx]));
}
factor_idx -= 1;
level_size >>= 1;
}

if (idx < results_per_block) {
output[blockIdx.x * results_per_block + idx] = s_level[idx];
}
}

__global__
void copy_result_for_polynomial(qm31 **result, qm31 *temp, int number_of_polynomials) {
int global_thread_index = blockIdx.x * blockDim.x + threadIdx.x;

if (global_thread_index < number_of_polynomials) {
result[global_thread_index][0] = temp[global_thread_index];
}
}

void eval_polys_at_point(
qm31 **result, m31 **polynomials, int log_number_of_polynomials, int log_coeffs_size, qm31 point_x, qm31 point_y
) {
int coeffs_size = 1 << log_coeffs_size;
int block_dim = min(256, coeffs_size);
int coeffs_per_block = block_dim * 2;

qm31 *host_mappings = (qm31 *) malloc(sizeof(qm31) * log_coeffs_size);
host_mappings[log_coeffs_size - 1] = point_y;
host_mappings[log_coeffs_size - 2] = point_x;
qm31 x = point_x;
for (int i = 2; i < log_coeffs_size; i += 1) {
x = sub(mul(qm31{cm31{2, 0}, cm31{0, 0}}, mul(x, x)), qm31{cm31{1, 0}, cm31{0, 0}});
host_mappings[log_coeffs_size - 1 - i] = x;
}

int number_of_polynomials = 1 << log_number_of_polynomials;
int total_number_of_coeffs = coeffs_size * number_of_polynomials;
int temp_memory_size = 0;
int size = total_number_of_coeffs;
while (size > number_of_polynomials) {
size = (size + coeffs_per_block - 1) / coeffs_per_block;
temp_memory_size += size;
}

temp_memory_size = max(temp_memory_size, number_of_polynomials);

qm31 *temp = cuda_malloc<qm31>(temp_memory_size);
qm31 *device_mappings = clone_to_device<qm31>(host_mappings, log_coeffs_size);

free(host_mappings);

// First pass
int num_blocks = max(number_of_polynomials, ((total_number_of_coeffs >> 1) + block_dim - 1) / block_dim);
int shared_memory_bytes = coeffs_per_block * 4 + coeffs_per_block * 8;
int output_offset = temp_memory_size - num_blocks;

eval_many_at_point_first_pass<<<num_blocks, block_dim, shared_memory_bytes>>>(polynomials, temp, device_mappings, coeffs_size,
log_coeffs_size, output_offset);

// Second pass
int mappings_offset = log_coeffs_size - 1;
int level_offset = output_offset;
while (num_blocks > number_of_polynomials) {
mappings_offset -= 9;
int new_num_blocks = ((num_blocks >> 1) + block_dim - 1) / block_dim;
int number_of_results = max(number_of_polynomials, new_num_blocks);
int results_per_block = number_of_results / new_num_blocks;
shared_memory_bytes = coeffs_per_block * 4 * 4;
output_offset = level_offset - new_num_blocks;
eval_many_at_point_second_pass<<<new_num_blocks, block_dim, shared_memory_bytes>>>(temp, device_mappings, num_blocks,
mappings_offset, level_offset,
output_offset, results_per_block);
num_blocks = new_num_blocks;
level_offset = output_offset;
}

cudaDeviceSynchronize();

num_blocks = (number_of_polynomials + block_dim - 1) / block_dim;
copy_result_for_polynomial<<<num_blocks, block_dim>>>(
result, temp, number_of_polynomials
);

cuda_free_memory(temp);
cuda_free_memory(device_mappings);
}

void eval_polys_at_points(
qm31 **result, m31 **polynomials, int log_polynomial_size, int log_number_of_polynomials,
qm31 *points_x, qm31 *points_y, int sample_size
) {
for (int point_index = 0; point_index < sample_size; point_index++) {
qm31 point_x = points_x[point_index];
qm31 point_y = points_y[point_index];
eval_polys_at_point(result, polynomials, log_number_of_polynomials, log_polynomial_size, point_x, point_y);
}
}

void evaluate_polynomials_out_of_domain(
qm31 **result, m31 **polynomials, int *log_polynomial_sizes, int number_of_polynomials,
qm31 **out_of_domain_points_x, qm31 **out_of_domain_points_y, int *sample_sizes
) {
// In this iteration, we assume all polynomials are of equal size and will be evaluated in the same list of points

qm31 **device_result = clone_to_device<qm31*>(result, number_of_polynomials);
m31 **device_polynomials = clone_to_device<m31*>(polynomials, number_of_polynomials);
int log_polynomial_size = log_polynomial_sizes[0];
int log_number_of_polynomials = log_2(number_of_polynomials);

eval_polys_at_points(
device_result, device_polynomials, log_polynomial_size, log_number_of_polynomials,
out_of_domain_points_x[0], out_of_domain_points_y[0], sample_sizes[0]
);

cuda_free_memory(device_result);
cuda_free_memory(device_polynomials);
};
8 changes: 6 additions & 2 deletions stwo_gpu_backend/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ edition = "2021"

[dependencies]
cc = "1.0"
stwo-prover = { git = "https://github.com/starkware-libs/stwo", rev = "ce5b975" }
stwo-prover = { git = "https://github.com/jarnesino/stwo", rev = "fa16181" }
itertools = "0.10.5"
rand = "0.8.5"
criterion = "0.4"
Expand Down Expand Up @@ -35,5 +35,9 @@ name = "interpolate_columns"
harness = false

[[bench]]
name = "evaluate_columns"
name = "evaluate_polynomials"
harness = false

[[bench]]
name = "evaluate_polynomials_out_of_domain"
harness = false
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,12 @@ use stwo_prover::core::poly::circle::{CanonicCoset, CircleEvaluation, PolyOps};
use stwo_prover::core::poly::BitReversedOrder;
use stwo_prover::core::ColumnVec;

const LOG_COLUMN_SIZE: u32 = 10;
const LOG_NUMBER_OF_COLUMNS: usize = 16;
const LOG_COLUMN_SIZE: u32 = 16;
const LOG_NUMBER_OF_COLUMNS: usize = 10;
const LOG_BLOWUP_FACTOR: u32 = 2;

pub fn simd_evaluate_columns(c: &mut Criterion) {
let mut group = c.benchmark_group("evaluate_columns");
pub fn simd_evaluate_polynomials(c: &mut Criterion) {
let mut group = c.benchmark_group("evaluate_polynomials");

let coset = CanonicCoset::new(LOG_COLUMN_SIZE);
let values = (0..coset.size()).map(BaseField::from).collect();
Expand All @@ -37,8 +37,8 @@ pub fn simd_evaluate_columns(c: &mut Criterion) {
});
}

pub fn gpu_evaluate_columns(c: &mut Criterion) {
let mut group = c.benchmark_group("evaluate_columns");
pub fn gpu_evaluate_polynomials(c: &mut Criterion) {
let mut group = c.benchmark_group("evaluate_polynomials");

let coset = CanonicCoset::new(LOG_COLUMN_SIZE);
let values = BaseFieldVec::from_vec((0..coset.size()).map(BaseField::from).collect_vec());
Expand All @@ -63,7 +63,7 @@ pub fn gpu_evaluate_columns(c: &mut Criterion) {
}

criterion_group!(
name = interpolate_columns;
name = evaluate_polynomials;
config = Criterion::default().sample_size(10);
targets = simd_evaluate_columns, gpu_evaluate_columns);
criterion_main!(interpolate_columns);
targets = simd_evaluate_polynomials, gpu_evaluate_polynomials);
criterion_main!(evaluate_polynomials);
100 changes: 100 additions & 0 deletions stwo_gpu_backend/benches/evaluate_polynomials_out_of_domain.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
use criterion::{criterion_group, criterion_main, BatchSize, BenchmarkId, Criterion};
use itertools::Itertools;
use rand::rngs::SmallRng;
use rand::{Rng, SeedableRng};
use stwo_gpu_backend::cuda::BaseFieldVec;
use stwo_gpu_backend::CudaBackend;
use stwo_prover::core::air::mask::fixed_mask_points;
use stwo_prover::core::backend::simd::SimdBackend;
use stwo_prover::core::backend::{Backend, ColumnOps};
use stwo_prover::core::circle::CirclePoint;
use stwo_prover::core::fields::m31::BaseField;
use stwo_prover::core::fields::qm31::SecureField;
use stwo_prover::core::pcs::TreeVec;
use stwo_prover::core::poly::circle::{CanonicCoset, CircleEvaluation};
use stwo_prover::core::poly::BitReversedOrder;
use stwo_prover::core::ColumnVec;

const LOG_COLUMN_SIZE: u32 = 16;
const LOG_NUMBER_OF_COLUMNS: usize = 10;
const LOG_BLOWUP_FACTOR: u32 = 2;

fn generate_random_point() -> CirclePoint<SecureField> {
let mut rng = SmallRng::seed_from_u64(0);
let x = rng.gen();
let y = rng.gen();
CirclePoint { x, y }
}

fn mask_points(
point: CirclePoint<SecureField>,
number_of_columns: usize,
) -> TreeVec<ColumnVec<Vec<CirclePoint<SecureField>>>> {
TreeVec(vec![fixed_mask_points(
&vec![vec![0_usize]; number_of_columns],
point,
)])
}

fn benchmark_evaluate_polynomials_out_of_domain<B: Backend>(
criterion: &mut Criterion,
coset: CanonicCoset,
values: <B as ColumnOps<BaseField>>::Column,
benchmark_id: &str,
) {
let mut group = criterion.benchmark_group("evaluate_polynomials_out_of_domain");
let number_of_columns = 1 << LOG_NUMBER_OF_COLUMNS;

let circle_evaluation: CircleEvaluation<B, BaseField, BitReversedOrder> =
B::new_canonical_ordered(coset, values);

let interpolation_coset = CanonicCoset::new(LOG_COLUMN_SIZE + LOG_BLOWUP_FACTOR);
let twiddle_tree = B::precompute_twiddles(interpolation_coset.half_coset());

let polynomial = B::interpolate(circle_evaluation, &twiddle_tree);
let polynomials = ColumnVec::from(vec![&polynomial; number_of_columns]);

let point = generate_random_point();
let sample_points = mask_points(point, number_of_columns);

group.bench_function(BenchmarkId::new(benchmark_id, LOG_COLUMN_SIZE), |b| {
b.iter_batched(
|| (polynomials.clone(), sample_points.clone()),
|(polys, sample)| {
B::evaluate_polynomials_out_of_domain(TreeVec::new(vec![polys]), sample)
},
BatchSize::LargeInput,
)
});
}

pub fn simd_evaluate_polynomials_out_of_domain(c: &mut Criterion) {
let coset = CanonicCoset::new(LOG_COLUMN_SIZE);
let values = (0..coset.size()).map(BaseField::from).collect();

benchmark_evaluate_polynomials_out_of_domain::<SimdBackend>(
c,
coset,
values,
"simd evaluate polynomials out of domain",
);
}

pub fn gpu_evaluate_polynomials_out_of_domain(c: &mut Criterion) {
let coset = CanonicCoset::new(LOG_COLUMN_SIZE);
let values = BaseFieldVec::from_vec((0..coset.size()).map(BaseField::from).collect_vec());

benchmark_evaluate_polynomials_out_of_domain::<CudaBackend>(
c,
coset,
values,
"gpu evaluate polynomials out of domain",
);
}

criterion_group!(
name = evaluate_polynomials_out_of_domain;
config = Criterion::default().sample_size(10);
targets = simd_evaluate_polynomials_out_of_domain, gpu_evaluate_polynomials_out_of_domain
);
criterion_main!(evaluate_polynomials_out_of_domain);
Loading