From 3f942e1de0780d9ad57cd78e6203e077f85f708f Mon Sep 17 00:00:00 2001
From: Pooja Thakur
Date: Sat, 2 Aug 2025 17:57:22 -0400
Subject: [PATCH 1/4] docs

---
 Challenge_guide/Getting_started.md      |  39 ++++++
 Challenge_guide/New_Challenge.md        |  53 ++++++++
 Challenge_guide/STARTER_CODE_PROCESS.md | 166 ++++++++++++++++++++++++
 Challenge_guide/TESTING_GUIDE.md        | 111 ++++++++++++++++
 Challenge_guide/challenge_template.py   | 136 +++++++++++++++++++
 5 files changed, 505 insertions(+)
 create mode 100644 Challenge_guide/Getting_started.md
 create mode 100644 Challenge_guide/New_Challenge.md
 create mode 100644 Challenge_guide/STARTER_CODE_PROCESS.md
 create mode 100644 Challenge_guide/TESTING_GUIDE.md
 create mode 100644 Challenge_guide/challenge_template.py

diff --git a/Challenge_guide/Getting_started.md b/Challenge_guide/Getting_started.md
new file mode 100644
index 0000000..0d5aa8c
--- /dev/null
+++ b/Challenge_guide/Getting_started.md
@@ -0,0 +1,39 @@
+# Creating New Challenges for LeetGPU
+
+LeetGPU challenges are low-level GPU programming tasks focused on writing custom CUDA, Triton, or TinyGrad kernels. They evaluate both functional correctness and performance under real GPU constraints.
+
+This guide provides detailed instructions for creating new GPU programming challenges for LeetGPU. It covers the complete process from concept to submission.
+
+## Challenge Structure
+
+Each challenge follows this directory structure:
+
+```
+challenges/<difficulty>/<number>_<challenge_name>/
+├── challenge.html          # Problem description and examples
+├── challenge.py            # Reference implementation and test cases
+└── starter/                # Starter templates for each framework
+    ├── starter.cu          # CUDA template
+    ├── starter.mojo        # Mojo template
+    ├── starter.pytorch.py  # PyTorch template
+    ├── starter.tinygrad.py # TinyGrad template
+    └── starter.triton.py   # Triton template
+```
+
+## Creating the Challenge Files
+
+### Step 1: Choose Your Challenge Location
+
+1. Determine the appropriate difficulty level (easy, medium, or hard)
+2. Create your challenge directory: `challenges/<difficulty>/<challenge_name>/`
+
+### Step 2: Create the Basic Structure
+
+```bash
+mkdir challenges/level_folder/your_challenge_name/
+cd challenges/level_folder/your_challenge_name/
+mkdir starter/
+touch challenge.html challenge.py
+touch starter/starter.cu starter/starter.mojo starter/starter.pytorch.py starter/starter.tinygrad.py starter/starter.triton.py
+```
+
diff --git a/Challenge_guide/New_Challenge.md b/Challenge_guide/New_Challenge.md
new file mode 100644
index 0000000..b4245ba
--- /dev/null
+++ b/Challenge_guide/New_Challenge.md
@@ -0,0 +1,53 @@
+# [Challenge Name]
+
+## Description
+
+[Provide a clear, concise explanation of what the algorithm or function is supposed to do. Include the mathematical concept, the expected behavior, and what the output should represent.]
+
+**Input Format:** All inputs must be floating-point values.
+
+**Output:** [Specify what the output variable represents and its expected format/range]
+
+## Implementation Requirements
+
+- **No External Libraries:** Solutions must be implemented using only native features. No external libraries or frameworks are permitted.
+- **Function Signature:** The solve function signature is fixed and must not be modified. Implement your solution according to the provided signature.
+- **Output Variable:** Results must be written to the designated output parameter: `[output_parameter_name]`
+
+### Mathematical Formulation
+
+[If applicable, provide the mathematical formula using LaTeX notation]
+
+$$
+\text{[Your formula here]}
+$$
+
+## Examples
+
+### Example 1
+**Input:**
+```
+[Provide specific input values]
+```
+
+**Expected Output:**
+```
+[Show the corresponding output values]
+```
+
+### Example 2
+**Input:**
+```
+[Provide different input values]
+```
+
+**Expected Output:**
+```
+[Show the corresponding output values]
+```
+
+## Constraints
+
+- **Input Size:** [Specify the range of input dimensions, e.g., "1 ≤ N ≤ 1,000,000"]
+- **Value Range:** [Specify the range of input values, e.g., "-1000.0 ≤ input[i] ≤ 1000.0"]
+- **Memory Limits:** [If applicable, specify any memory constraints]
diff --git a/Challenge_guide/STARTER_CODE_PROCESS.md b/Challenge_guide/STARTER_CODE_PROCESS.md
new file mode 100644
index 0000000..3540f49
--- /dev/null
+++ b/Challenge_guide/STARTER_CODE_PROCESS.md
@@ -0,0 +1,166 @@
+# Starter Code Creation Process for LeetGPU Challenges
+
+This guide explains the complete process of creating starter codes for LeetGPU challenges, from understanding the requirements to implementing across all frameworks.
+
+
+## Analyzing Challenge Requirements
+
+### Identify Framework Requirements
+
+Each framework has specific requirements:
+
+**CUDA:**
+- Kernel functions with `__global__` qualifier
+- `extern "C"` solve function for framework integration
+- Proper memory management and synchronization
+- Grid and block size calculations
+
+**Triton:**
+- `@triton.jit` decorator for kernel compilation
+- Pointer type conversions for data types
+- Block size and grid calculations
+- PyTorch restriction compliance
+
+**Mojo:**
+- `@export` decorator for framework integration
+- Proper GPU imports and memory types
+- Device context management
+- Function parameter types
+
+**PyTorch/TinyGrad:**
+- Tensor-based function signatures
+- GPU tensor parameters
+- Simple, direct implementations
+- Type hints for clarity
+
+## Designing Function Signatures
+
+### Step 1: Define Core Parameters
+
+Based on the algorithm requirements, determine:
+
+1. **Input parameters**: What data does the algorithm need?
+2. **Output parameters**: Where should results be written?
+3. **Size parameters**: What dimensions are involved?
+4. **Configuration parameters**: Any algorithm-specific settings?
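+
+As a quick, hedged illustration (vector addition is used here only as a familiar example, not necessarily one of the shipped challenges), this analysis might yield two input arrays, one output array, one size parameter, and no configuration parameters, giving a CUDA signature like:
+
+```cuda
+// Two inputs (A, B), one output (C), one size parameter (N); no configuration parameters.
+extern "C" void solve(const float* A, const float* B, float* C, int N);
+```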
+
+### Step 2: Choose Parameter Names
+
+Use clear, descriptive names that follow conventions:
+
+**Common Patterns:**
+- Single input/output: `input`, `output`
+- Multiple inputs: `A`, `B`, `C` or `Q`, `K`, `V`
+- Dimensions: `N`, `M`, `K`, `rows`, `cols`
+- Algorithm-specific: `kernel_size`, `stride`, `padding`
+
+### Step 3: Determine Data Types
+
+**Standard Types:**
+- **CUDA**: `const float*` for inputs, `float*` for outputs, `int` for sizes
+- **Triton**: `int` for pointers, `int` for sizes
+- **Mojo**: `UnsafePointer[Float32]` for data, `Int32` for sizes
+- **PyTorch/TinyGrad**: `Tensor` for data, `int` for sizes
+
+### Step 4: Create Function Signatures
+
+**Example: ReLU Activation**
+
+```cuda
+// CUDA
+extern "C" void solve(const float* input, float* output, int N)
+
+// Triton
+def solve(input_ptr: int, output_ptr: int, N: int)
+
+// Mojo
+@export
+def solve(input: UnsafePointer[Float32], output: UnsafePointer[Float32], N: Int32)
+
+// PyTorch
+def solve(input: torch.Tensor, output: torch.Tensor, N: int)
+```
+
+## Implementing Across Frameworks
+
+### Step 1: CUDA Implementation
+
+**Basic Structure:**
+```cuda
+#include <cuda_runtime.h>
+
+__global__ void kernel_name(const float* input, float* output, int N) {
+    // TODO: Implement kernel logic
+}
+
+extern "C" void solve(const float* input, float* output, int N) {
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
+
+    kernel_name<<<blocksPerGrid, threadsPerBlock>>>(input, output, N);
+    cudaDeviceSynchronize();
+}
+```
+
+### Step 2: Triton Implementation
+
+**Basic Structure:**
+```python
+# The use of PyTorch in Triton programs is not allowed for the purposes of fair benchmarking.
+import triton
+import triton.language as tl
+
+@triton.jit
+def kernel_name(input_ptr, output_ptr, n_elements, BLOCK_SIZE: tl.constexpr):
+    input_ptr = input_ptr.to(tl.pointer_type(tl.float32))
+    output_ptr = output_ptr.to(tl.pointer_type(tl.float32))
+
+def solve(input_ptr: int, output_ptr: int, N: int):
+    BLOCK_SIZE = 1024
+    grid = (triton.cdiv(N, BLOCK_SIZE),)
+    kernel_name[grid](input_ptr, output_ptr, N, BLOCK_SIZE)
+```
+
+### Step 3: Mojo Implementation
+
+**Basic Structure:**
+```mojo
+from gpu.host import DeviceContext
+from gpu.id import block_dim, block_idx, thread_idx
+from memory import UnsafePointer
+from math import ceildiv
+
+fn kernel_name(input: UnsafePointer[Float32], output: UnsafePointer[Float32], N: Int32):
+    pass
+
+@export
+def solve(input: UnsafePointer[Float32], output: UnsafePointer[Float32], N: Int32):
+    pass
+```
+
+### Step 4: PyTorch Implementation
+
+**Basic Structure:**
+```python
+import torch
+
+def solve(input: torch.Tensor, output: torch.Tensor, N: int):
+    pass
+```
+
+### Step 5: TinyGrad Implementation
+
+**Basic Structure:**
+```python
+import tinygrad
+
+def solve(input: tinygrad.Tensor, output: tinygrad.Tensor, N: int):
+    pass
+```
+
+-----
+
+A proper starter code gives participants a runnable foundation while keeping the solution logic as their task.
+
+
+
diff --git a/Challenge_guide/TESTING_GUIDE.md b/Challenge_guide/TESTING_GUIDE.md
new file mode 100644
index 0000000..31a5acb
--- /dev/null
+++ b/Challenge_guide/TESTING_GUIDE.md
@@ -0,0 +1,111 @@
+# Testing Guide for LeetGPU Challenges
+
+This guide covers how to create test cases and validate your challenges to ensure they work correctly across all frameworks.
+
+## Table of Contents
+
+1. [Test Case Types](#test-case-types)
+2. [Test Case Design Principles](#test-case-design-principles)
+3. [Creating Robust Test Cases](#creating-robust-test-cases)
+4. 
[Edge Cases and Boundary Conditions](#edge-cases-and-boundary-conditions) +5. [Performance Testing](#performance-testing) +6. [Validation Strategies](#validation-strategies) +7. [Common Testing Patterns](#common-testing-patterns) +8. [Debugging Test Issues](#debugging-test-issues) + +## Test Case Types + +### 1. Example Test (`generate_example_test`) +- **Purpose**: Simple test case that matches the example in `challenge.html` +- **Complexity**: Low - should be easy to understand and verify manually +- **Size**: Small (typically 3-10 elements) +- **Values**: Simple, predictable values + +### 2. Functional Tests (`generate_functional_test`) +- **Purpose**: Comprehensive test suite covering various scenarios +- **Complexity**: Medium - includes edge cases and typical usage +- **Size**: Varied (small to medium) +- **Values**: Diverse, including edge cases + +### 3. Performance Test (`generate_performance_test`) +- **Purpose**: Large test case for performance evaluation +- **Complexity**: High - tests scalability and efficiency +- **Size**: Large (typically 1M+ elements) +- **Values**: Random or structured large datasets + +## Test Case Design Principles + +### 1. Coverage +- **Input ranges**: Test minimum, maximum, and typical values +- **Input sizes**: Test small, medium, and large inputs +- **Data patterns**: Test edge cases, special values, and random data +- **Error conditions**: Test boundary conditions and invalid inputs + +### 2. Determinism +- **Reproducible**: Tests should produce the same results every time +- **Seeded randomness**: Use fixed seeds for random test cases +- **Clear expectations**: Expected outputs should be well-defined + +### 3. Efficiency +- **Fast execution**: Tests should run quickly for development +- **Memory efficient**: Avoid unnecessarily large test cases +- **Scalable**: Performance tests should be appropriately sized + +## Debugging Test Issues + +### Common Issues and Solutions + +#### 1. Memory Issues +```python +# Problem: CUDA out of memory +# Solution: Reduce test case sizes +def generate_performance_test(self) -> Dict[str, Any]: + # Reduce size if memory issues occur + size = 100_000 # Instead of 1_000_000 + return { + "input": torch.empty(size, device="cuda", dtype=torch.float32).uniform_(-100.0, 100.0), + "output": torch.empty(size, device="cuda", dtype=torch.float32), + "N": size + } +``` + +#### 2. Precision Issues +```python +# Problem: Floating point precision errors +# Solution: Adjust tolerances +def __init__(self): + super().__init__( + name="Complex Algorithm", + atol=1e-03, # Increase tolerance for complex algorithms + rtol=1e-03, + num_gpus=1, + access_tier="free" + ) +``` + +#### 3. Shape Mismatch Issues +```python +# Problem: Tensor shape mismatches +# Solution: Add shape validation +def reference_impl(self, input: torch.Tensor, output: torch.Tensor, N: int): + # Validate shapes + assert input.shape == (N,), f"Expected input shape ({N},), got {input.shape}" + assert output.shape == (N,), f"Expected output shape ({N},), got {output.shape}" + + # Rest of implementation... 
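+
+    # For instance (assuming a ReLU-style element-wise challenge), the rest
+    # of the reference might be a single tensor op:
+    # output.copy_(torch.relu(input))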
+``` + +### Debugging Checklist + +- [ ] Reference implementation produces correct results +- [ ] All test cases have required parameters +- [ ] Tensor shapes match expectations +- [ ] Data types are consistent (float32) +- [ ] Tolerances are appropriate for the algorithm +- [ ] Performance test size is reasonable +- [ ] Edge cases are covered +- [ ] Random test cases use appropriate ranges + +--- + +*This testing guide ensures your challenges are robust, well-tested, and ready for production use.* \ No newline at end of file diff --git a/Challenge_guide/challenge_template.py b/Challenge_guide/challenge_template.py new file mode 100644 index 0000000..08f5387 --- /dev/null +++ b/Challenge_guide/challenge_template.py @@ -0,0 +1,136 @@ +import ctypes +from typing import Any, List, Dict +import torch +from core.challenge_base import ChallengeBase + +class Challenge(ChallengeBase): + def __init__(self): + super().__init__( + name="[CHALLENGE_NAME]", # e.g., "ReLU", "Softmax", "Multi-Head Attention" + atol=1e-05, # Absolute tolerance for testing. 1e-05 is a good default. + rtol=1e-05, # Relative tolerance for testing. 1e-05 is a good default. + num_gpus=1, # Number of GPUs required. + access_tier="free" # Access tier + ) + + def reference_impl(self, *args, **kwargs): + """ + Reference implementation of the algorithm/function. + + Common patterns: + - Assert input shapes and properties (dtype, device) + - Implement the core algorithm logic + - Use output.copy_(result) to write results + + Example signature patterns: + - Simple: (input: torch.Tensor, output: torch.Tensor, N: int) + - Complex: (Q: torch.Tensor, K: torch.Tensor, V: torch.Tensor, output: torch.Tensor, N: int, d_model: int, h: int) + """ + # TODO: Add input assertions + # assert input.shape == expected_shape + # assert input.dtype == expected_dtype + # assert input.device == expected_device + + # TODO: Implement core algorithm logic + # result = your_algorithm_implementation() + + # TODO: Copy result to output tensor + # output.copy_(result) + pass + + def get_solve_signature(self) -> Dict[str, Any]: + """ + Define the C function signature for the solver. + + Common ctypes patterns: + - Tensor pointers: ctypes.POINTER(ctypes.c_float) + - Integers: ctypes.c_int + - Floats: ctypes.c_float + """ + return { + # TODO: Define your function signature + # "input": ctypes.POINTER(ctypes.c_float), + # "output": ctypes.POINTER(ctypes.c_float), + # "N": ctypes.c_int, + # Add other parameters as needed + } + + def generate_example_test(self) -> Dict[str, Any]: + """ + Generate a simple example test case. + Usually small, hand-crafted data for basic demonstration. + """ + dtype = torch.float32 + + # TODO: Create example input tensors + # input_tensor = torch.tensor([...], device="cuda", dtype=dtype) + # output_tensor = torch.empty(shape, device="cuda", dtype=dtype) + + return { + # TODO: Return test case dictionary + # "input": input_tensor, + # "output": output_tensor, + # "N": size, + # Add other parameters as needed + } + + def generate_functional_test(self) -> List[Dict[str, Any]]: + """ + Generate comprehensive functional test cases. 
+ + Common test patterns: + - Edge cases (zeros, negatives, single elements) + - Boundary conditions + - Various sizes + - Random data + - Special mathematical cases + """ + dtype = torch.float32 + test_cases = [] + + # TODO: Add basic test case + # test_cases.append({ + # "input": torch.tensor([...], device="cuda", dtype=dtype), + # "output": torch.empty(shape, device="cuda", dtype=dtype), + # "N": size + # }) + + # TODO: Add edge cases + # - All zeros + # - All negatives + # - Single element + # - Large values + # - Small values + # - Mixed positive/negative + + # TODO: Add random test cases + # test_cases.append({ + # "input": torch.empty(size, device="cuda", dtype=dtype).uniform_(min_val, max_val), + # "output": torch.empty(size, device="cuda", dtype=dtype), + # "N": size + # }) + + return test_cases + + def generate_performance_test(self) -> Dict[str, Any]: + """ + Generate a large-scale performance test case. + Usually uses large tensors with random data. + """ + dtype = torch.float32 + + # TODO: Set appropriate size for performance testing + # Common sizes: 25000000, 500000, 1024x1024, etc. + N = 1000000 # Adjust based on your challenge + + # TODO: Create large tensors for performance testing + # input_tensor = torch.empty(N, device="cuda", dtype=dtype).uniform_(min_val, max_val) + # output_tensor = torch.empty(N, device="cuda", dtype=dtype) + + return { + # TODO: Return performance test case + # "input": input_tensor, + # "output": output_tensor, + # "N": N, + # Add other parameters as needed + } \ No newline at end of file From e3cd48a056c62638800e55c0f1d653b27323f35a Mon Sep 17 00:00:00 2001 From: Pooja Thakur Date: Thu, 7 Aug 2025 23:22:57 -0400 Subject: [PATCH 2/4] Update docs for new challenges --- Challenge_guide/STARTER_CODE_PROCESS.md | 166 ------------------ {Challenge_guide => docs}/Getting_started.md | 0 {Challenge_guide => docs}/New_Challenge.md | 16 +- docs/Starter_Codes.md | 71 ++++++++ docs/Starter_Files/easy_problems.md | 116 ++++++++++++ docs/Starter_Files/hard_problems.md | 93 ++++++++++ docs/Starter_Files/medium_problems.md | 87 +++++++++ {Challenge_guide => docs}/TESTING_GUIDE.md | 0 .../challenge_template.py | 0 9 files changed, 374 insertions(+), 175 deletions(-) delete mode 100644 Challenge_guide/STARTER_CODE_PROCESS.md rename {Challenge_guide => docs}/Getting_started.md (100%) rename {Challenge_guide => docs}/New_Challenge.md (81%) create mode 100644 docs/Starter_Codes.md create mode 100644 docs/Starter_Files/easy_problems.md create mode 100644 docs/Starter_Files/hard_problems.md create mode 100644 docs/Starter_Files/medium_problems.md rename {Challenge_guide => docs}/TESTING_GUIDE.md (100%) rename {Challenge_guide => docs}/challenge_template.py (100%) diff --git a/Challenge_guide/STARTER_CODE_PROCESS.md b/Challenge_guide/STARTER_CODE_PROCESS.md deleted file mode 100644 index 3540f49..0000000 --- a/Challenge_guide/STARTER_CODE_PROCESS.md +++ /dev/null @@ -1,166 +0,0 @@ -# Starter Code Creation Process for LeetGPU Challenges - -This guide explains the complete process of creating starter codes for LeetGPU challenges, from understanding the requirements to implementing across all frameworks. 
-
-
-## Analyzing Challenge Requirements
-
-### Identify Framework Requirements
-
-Each framework has specific requirements:
-
-**CUDA:**
-- Kernel functions with `__global__` qualifier
-- `extern "C"` solve function for framework integration
-- Proper memory management and synchronization
-- Grid and block size calculations
-
-**Triton:**
-- `@triton.jit` decorator for kernel compilation
-- Pointer type conversions for data types
-- Block size and grid calculations
-- PyTorch restriction compliance
-
-**Mojo:**
-- `@export` decorator for framework integration
-- Proper GPU imports and memory types
-- Device context management
-- Function parameter types
-
-**PyTorch/TinyGrad:**
-- Tensor-based function signatures
-- GPU tensor parameters
-- Simple, direct implementations
-- Type hints for clarity
-
-## Designing Function Signatures
-
-### Step 1: Define Core Parameters
-
-Based on the algorithm requirements, determine:
-
-1. **Input parameters**: What data does the algorithm need?
-2. **Output parameters**: Where should results be written?
-3. **Size parameters**: What dimensions are involved?
-4. **Configuration parameters**: Any algorithm-specific settings?
-
-### Step 2: Choose Parameter Names
-
-Use clear, descriptive names that follow conventions:
-
-**Common Patterns:**
-- Single input/output: `input`, `output`
-- Multiple inputs: `A`, `B`, `C` or `Q`, `K`, `V`
-- Dimensions: `N`, `M`, `K`, `rows`, `cols`
-- Algorithm-specific: `kernel_size`, `stride`, `padding`
-
-### Step 3: Determine Data Types
-
-**Standard Types:**
-- **CUDA**: `const float*` for inputs, `float*` for outputs, `int` for sizes
-- **Triton**: `int` for pointers, `int` for sizes
-- **Mojo**: `UnsafePointer[Float32]` for data, `Int32` for sizes
-- **PyTorch/TinyGrad**: `Tensor` for data, `int` for sizes
-
-### Step 4: Create Function Signatures
-
-**Example: ReLU Activation**
-
-```cuda
-// CUDA
-extern "C" void solve(const float* input, float* output, int N)
-
-// Triton
-def solve(input_ptr: int, output_ptr: int, N: int)
-
-// Mojo
-@export
-def solve(input: UnsafePointer[Float32], output: UnsafePointer[Float32], N: Int32)
-
-// PyTorch
-def solve(input: torch.Tensor, output: torch.Tensor, N: int)
-```
-
-## Implementing Across Frameworks
-
-### Step 1: CUDA Implementation
-
-**Basic Structure:**
-```cuda
-#include <cuda_runtime.h>
-
-__global__ void kernel_name(const float* input, float* output, int N) {
-    // TODO: Implement kernel logic
-}
-
-extern "C" void solve(const float* input, float* output, int N) {
-    int threadsPerBlock = 256;
-    int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
-
-    kernel_name<<<blocksPerGrid, threadsPerBlock>>>(input, output, N);
-    cudaDeviceSynchronize();
-}
-```
-
-### Step 2: Triton Implementation
-
-**Basic Structure:**
-```python
-# The use of PyTorch in Triton programs is not allowed for the purposes of fair benchmarking.
-import triton -import triton.language as tl - -@triton.jit -def kernel_name(input_ptr, output_ptr, n_elements, BLOCK_SIZE: tl.constexpr): - input_ptr = input_ptr.to(tl.pointer_type(tl.float32)) - output_ptr = output_ptr.to(tl.pointer_type(tl.float32)) - -def solve(input_ptr: int, output_ptr: int, N: int): - BLOCK_SIZE = 1024 - grid = (triton.cdiv(N, BLOCK_SIZE),) - kernel_name[grid](input_ptr, output_ptr, N, BLOCK_SIZE) -``` - -### Step 3: Mojo Implementation - -**Basic Structure:** -```mojo -from gpu.host import DeviceContext -from gpu.id import block_dim, block_idx, thread_idx -from memory import UnsafePointer -from math import ceildiv - -fn kernel_name(input: UnsafePointer[Float32], output: UnsafePointer[Float32], N: Int32): - pass - -@export -def solve(input: UnsafePointer[Float32], output: UnsafePointer[Float32], N: Int32): - pass -``` - -### Step 4: PyTorch Implementation - -**Basic Structure:** -```python -import torch - -def solve(input: torch.Tensor, output: torch.Tensor, N: int): - pass -``` - -### Step 5: TinyGrad Implementation - -**Basic Structure:** -```python -import tinygrad - -def solve(input: tinygrad.Tensor, output: tinygrad.Tensor, N: int): - pass -``` - ------ - -A proper starter code gives participants a runnable foundation while keeping the solution logic as their task. - - - diff --git a/Challenge_guide/Getting_started.md b/docs/Getting_started.md similarity index 100% rename from Challenge_guide/Getting_started.md rename to docs/Getting_started.md diff --git a/Challenge_guide/New_Challenge.md b/docs/New_Challenge.md similarity index 81% rename from Challenge_guide/New_Challenge.md rename to docs/New_Challenge.md index b4245ba..50e3287 100644 --- a/Challenge_guide/New_Challenge.md +++ b/docs/New_Challenge.md @@ -2,11 +2,15 @@ ## Description -[Provide a clear, concise explanation of what the algorithm or function is supposed to do. Include the mathematical concept, the expected behavior, and what the output should represent.] +[Provide a clear, concise explanation of what the algorithm or function is supposed to do. Include input and output specifications, if necessary.] -**Input Format:** All inputs must be floating-point values. +### Mathematical Formulation + +[If applicable, provide the mathematical formula using LaTeX notation] -**Output:** [Specify what the output variable represents and its expected format/range] +$$ +\text{[Your formula here]} +$$ ## Implementation Requirements @@ -14,13 +18,7 @@ - **Function Signature:** The solve function signature is fixed and must not be modified. Implement your solution according to the provided signature. - **Output Variable:** Results must be written to the designated output parameter: `[output_parameter_name]` -### Mathematical Formulation - -[If applicable, provide the mathematical formula using LaTeX notation] -$$ -\text{[Your formula here]} -$$ ## Examples diff --git a/docs/Starter_Codes.md b/docs/Starter_Codes.md new file mode 100644 index 0000000..b07f1f0 --- /dev/null +++ b/docs/Starter_Codes.md @@ -0,0 +1,71 @@ +# Starter Code Creation Process for LeetGPU Challenges + +A starter code is a template file that provides the basic structure and function signatures for implementing GPU-accelerated algorithms in LeetGPU challenges. It gives users a runnable foundation while leaving the core algorithmic logic as their task. 
+
+## Major Components
+
+- **Function Signatures:** Standardized `solve` function with consistent parameters across all frameworks
+- **Framework-Specific Templates:** CUDA, Triton, Mojo, PyTorch, and TinyGrad implementations
+- **Memory Management:** Proper device pointer handling and memory allocation patterns
+- **Kernel Structure:** Basic kernel function templates with grid/block sizing
+- **Error Handling:** Bounds checking and synchronization primitives
+
+
+### Identify Framework Requirements
+
+Each framework has specific requirements:
+
+**CUDA:**
+- Kernel functions with `__global__` qualifier (for easy problems)
+- `extern "C"` solve function for framework integration
+- Proper memory management and synchronization
+- Grid and block size calculations
+
+**Triton:**
+- `@triton.jit` decorator for kernel compilation
+- Pointer type conversions for data types
+- Block size and grid calculations
+- PyTorch restriction compliance
+
+**Mojo:**
+- `@export` decorator for framework integration
+- Proper GPU imports and memory types
+- Device context management
+- Function parameter types
+
+**PyTorch/TinyGrad:**
+- Tensor-based function signatures
+- GPU tensor parameters
+- Simple, direct implementations
+
+## Designing Function Signatures
+
+### Step 1: Define Core Parameters
+
+Based on the algorithm requirements, determine:
+
+1. **Input parameters**: What data does the algorithm need?
+2. **Output parameters**: Where should results be written?
+3. **Size parameters**: What dimensions are involved?
+4. **Configuration parameters**: Any algorithm-specific settings?
+
+### Step 2: Choose Parameter Names
+
+Use clear, descriptive names that follow conventions:
+
+**Common Patterns:**
+- Single input/output: `input`, `output`
+- Multiple inputs: `A`, `B`, `C` or `Q`, `K`, `V`
+- Dimensions: `N`, `M`, `K`, `rows`, `cols`
+- Algorithm-specific: `kernel_size`, `stride`, `padding`
+
+### Step 3: Determine Data Types
+
+**Standard Types:**
+- **CUDA**: `const float*` for inputs, `float*` for outputs, `int` for sizes
+- **Triton**: `int` for pointers, `int` for sizes
+- **Mojo**: `UnsafePointer[Float32]` for data, `Int32` for sizes
+- **PyTorch/TinyGrad**: `Tensor` for data, `int` for sizes
+
+
+Refer to the Starter_Files directory for difficulty-wise starter code templates.
\ No newline at end of file
diff --git a/docs/Starter_Files/easy_problems.md b/docs/Starter_Files/easy_problems.md
new file mode 100644
index 0000000..60168db
--- /dev/null
+++ b/docs/Starter_Files/easy_problems.md
@@ -0,0 +1,116 @@
+# Easy Problems Starter File Creation Guide
+
+Easy problems typically involve straightforward GPU operations with minimal complexity.
+
+Easy problems typically feature:
+- **Simple algorithms**: Vector operations, element-wise transformations
+- **Basic memory patterns**: Linear access patterns, minimal synchronization
+- **Straightforward parameters**: Usually 2-4 parameters (inputs, outputs, sizes)
+- **Single kernel**: One main kernel function per solution
+- **No complex data structures**: Arrays, simple matrices
+- **Element-wise operations**: Each thread processes one element independently
+
+
+### CUDA Starter Template
+
+```cuda
+#include <cuda_runtime.h>
+
+__global__ void kernel_name(const float* input, float* output, int N) {
+    // TODO: Implement kernel logic
+    // Each thread processes one element
+    // Use threadIdx.x + blockIdx.x * blockDim.x to get global index
+}
+
+// input, output are device pointers (i.e. pointers to memory on the GPU)
+extern "C" void solve(const float* input, float* output, int N) {
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
+
+    kernel_name<<<blocksPerGrid, threadsPerBlock>>>(input, output, N);
+    cudaDeviceSynchronize();
+}
+```
+
+### Triton Starter Template
+
+```python
+# The use of PyTorch in Triton programs is not allowed for the purposes of fair benchmarking.
+import triton
+import triton.language as tl
+
+@triton.jit
+def kernel_name(input_ptr, output_ptr, n_elements, BLOCK_SIZE: tl.constexpr):
+    input_ptr = input_ptr.to(tl.pointer_type(tl.float32))
+    output_ptr = output_ptr.to(tl.pointer_type(tl.float32))
+
+    # TODO: Implement kernel logic
+    # Use tl.program_id(0) to get the block index
+    # Use tl.arange(0, BLOCK_SIZE) to compute offsets within the block
+
+# input_ptr, output_ptr are raw device pointers
+def solve(input_ptr: int, output_ptr: int, N: int):
+    BLOCK_SIZE = 1024
+    grid = (triton.cdiv(N, BLOCK_SIZE),)  # define grid
+    kernel_name[grid](input_ptr, output_ptr, N, BLOCK_SIZE)
+```
+
+### Mojo Starter Template
+
+```mojo
+from gpu.host import DeviceContext
+from gpu.id import block_dim, block_idx, thread_idx
+from memory import UnsafePointer
+from math import ceildiv
+
+fn kernel_name(input: UnsafePointer[Float32], output: UnsafePointer[Float32], N: Int32):
+    # TODO: Implement kernel logic
+    # Use thread_idx.x to get the thread index within a block
+    # Use block_idx.x to get the block index
+    pass
+
+@export
+def solve(input: UnsafePointer[Float32], output: UnsafePointer[Float32], N: Int32):
+    # define block and thread sizes
+    var BLOCK_SIZE: Int32 = 256
+    var num_blocks = ceildiv(N, BLOCK_SIZE)
+    var ctx = DeviceContext()
+
+    # Launch the kernel using enqueue_function
+    ctx.enqueue_function[kernel_name](
+        input, output, N,
+        grid_dim = num_blocks,   # Number of blocks in 1D grid
+        block_dim = BLOCK_SIZE   # Number of threads per block
+    )
+
+    ctx.synchronize()
+```
+
+### PyTorch Starter Template
+
+```python
+import torch
+
+def solve(input: torch.Tensor, output: torch.Tensor, N: int):
+    # TODO: Implement solution using PyTorch operations
+    pass
+```
+
+### TinyGrad Starter Template
+
+```python
+import tinygrad
+
+def solve(input: tinygrad.Tensor, output: tinygrad.Tensor, N: int):
+    # TODO: Implement solution using TinyGrad operations
+    pass
+```
+
+### Common Mistakes
+
+1. **Missing bounds checking**: Always check `idx < N`
+2. **Incorrect grid/block sizing**: Ensure all elements are covered
+3. **Wrong memory access patterns**: Ensure coalesced access
+4. **Missing synchronization**: Call `cudaDeviceSynchronize()`
+5. **Incorrect pointer types**: Use proper type conversions in Triton
+
diff --git a/docs/Starter_Files/hard_problems.md b/docs/Starter_Files/hard_problems.md
new file mode 100644
index 0000000..2e3b7f5
--- /dev/null
+++ b/docs/Starter_Files/hard_problems.md
@@ -0,0 +1,93 @@
+# Hard Problems Starter File Creation Guide
+
+Hard problems involve complex algorithms, advanced GPU programming techniques, and sophisticated memory management.
+
+Hard problems typically feature:
+- **Complex algorithms**: Multi-stage computations, advanced mathematical operations
+- **Sophisticated memory patterns**: Complex data structures, custom memory layouts
+- **Multiple parameters**: 5+ parameters including algorithm-specific configurations
+- **Advanced indexing**: Multi-dimensional indexing, complex access patterns
+- **Inter-block communication**: Cooperation across multiple thread blocks
+- **Specialized optimizations**: Tensor cores, custom kernels, advanced techniques
+
+## Starter File Structure
+
+### CUDA Starter Template
+
+```cuda
+#include <cuda_runtime.h>
+
+// input, output are device pointers (i.e. pointers to memory on the GPU)
+extern "C" void solve(const float* input, float* output, int N, int param1, int param2) {
+    // TODO: Implement solve function
+    // May require multiple kernel launches with complex coordination
+    // Consider memory allocation for intermediate results
+    // May need to handle complex data structures
+}
+```
+
+### Triton Starter Template
+
+```python
+# The use of PyTorch in Triton programs is not allowed for the purposes of fair benchmarking.
+import triton
+import triton.language as tl
+
+# Complex algorithms may require multiple kernel functions
+def solve(input_ptr: int, output_ptr: int, N: int, param1: int, param2: int):
+    # TODO: Implement solve function
+    # May require multiple kernel launches with complex coordination
+    # Consider intermediate memory allocation
+    # May need to handle complex data structures
+    pass
+```
+
+### Mojo Starter Template
+
+```mojo
+from gpu.host import DeviceContext
+from gpu.id import block_dim, block_idx, thread_idx
+from memory import UnsafePointer
+from math import ceildiv
+
+@export
+def solve(input: UnsafePointer[Float32], output: UnsafePointer[Float32],
+          N: Int32, param1: Int32, param2: Int32):
+    # TODO: Implement solve function
+    # May require multiple kernel launches or complex logic
+    # Consider advanced memory management
+    pass
+```
+
+### PyTorch Starter Template
+
+```python
+import torch
+
+def solve(input: torch.Tensor, output: torch.Tensor, N: int, param1: int, param2: int):
+    # TODO: Implement solution using PyTorch operations
+    # May require complex tensor operations and custom functions
+    pass
+```
+
+### TinyGrad Starter Template
+
+```python
+import tinygrad
+
+def solve(input: tinygrad.Tensor, output: tinygrad.Tensor, N: int, param1: int, param2: int):
+    # TODO: Implement solution using TinyGrad operations
+    # May require complex tensor operations and custom functions
+    pass
+```
+
+
+
+## Common Mistakes to Avoid
+
+1. **Complex synchronization**: Ensure proper thread coordination
+2. **Memory leaks**: Free all allocated memory
+3. **Race conditions**: Use proper synchronization primitives
+4. **Numerical instability**: Handle floating-point precision carefully
+5. **Incorrect indexing**: Verify complex multi-dimensional indexing
+
diff --git a/docs/Starter_Files/medium_problems.md b/docs/Starter_Files/medium_problems.md
new file mode 100644
index 0000000..4104543
--- /dev/null
+++ b/docs/Starter_Files/medium_problems.md
@@ -0,0 +1,87 @@
+# Medium Problems Starter File Creation Guide
+
+Medium problems introduce more complex algorithms and memory access patterns while maintaining reasonable implementation complexity.
+
+Medium problems typically feature:
+- **Multi-step algorithms**: Operations requiring multiple kernel launches or complex logic
+- **Advanced memory patterns**: Shared memory usage, reduction operations
+- **Multiple parameters**: 3-6 parameters including algorithm-specific settings
+- **Complex indexing**: 2D/3D indexing, strided access patterns
+- **Inter-thread communication**: Some cooperation between threads within blocks
+- **Algorithm-specific optimizations**: Custom memory layouts, specialized kernels
+
+## Starter File Structure
+
+### CUDA Starter Template
+
+```cuda
+#include <cuda_runtime.h>
+
+// input, output are device pointers (i.e. pointers to memory on the GPU)
+extern "C" void solve(const float* input, float* output, int N) {
+    // TODO: Implement solve function
+    // May require multiple kernel launches
+    // Consider memory allocation for intermediate results
+}
+```
+
+### Triton Starter Template
+
+```python
+# The use of PyTorch in Triton programs is not allowed for the purposes of fair benchmarking.
+import triton
+import triton.language as tl
+
+def solve(input_ptr: int, output_ptr: int, N: int):
+    # TODO: Implement solve function
+    # May require multiple kernel launches
+    # Consider intermediate memory allocation
+    pass
+```
+
+### Mojo Starter Template
+
+```mojo
+from gpu.host import DeviceContext
+from gpu.id import block_dim, block_idx, thread_idx
+from memory import UnsafePointer
+from math import ceildiv
+
+@export
+def solve(input: UnsafePointer[Float32], output: UnsafePointer[Float32], N: Int32):
+    # TODO: Implement solve function
+    # May require multiple kernel launches or complex logic
+    pass
+```
+
+### PyTorch Starter Template
+
+```python
+import torch
+
+def solve(input: torch.Tensor, output: torch.Tensor, N: int):
+    # TODO: Implement solution using PyTorch operations
+    # May require multiple tensor operations
+    pass
+```
+
+### TinyGrad Starter Template
+
+```python
+import tinygrad
+
+def solve(input: tinygrad.Tensor, output: tinygrad.Tensor, N: int):
+    # TODO: Implement solution using TinyGrad operations
+    # May require multiple tensor operations
+    pass
+```
+
+
+## Common Mistakes to Avoid
+
+1. **Missing synchronization**: Use `__syncthreads()` when needed
+2. **Incorrect shared memory usage**: Avoid bank conflicts
+3. **Memory leaks**: Free allocated memory
+4. **Race conditions**: Ensure proper thread coordination
+5. **Numerical instability**: Use proper techniques (e.g., softmax with max subtraction)
+
diff --git a/Challenge_guide/TESTING_GUIDE.md b/docs/TESTING_GUIDE.md
similarity index 100%
rename from Challenge_guide/TESTING_GUIDE.md
rename to docs/TESTING_GUIDE.md
diff --git a/Challenge_guide/challenge_template.py b/docs/challenge_template.py
similarity index 100%
rename from Challenge_guide/challenge_template.py
rename to docs/challenge_template.py

From 8fb03c142789ea7900a639d7b4b49025ec7ebbad Mon Sep 17 00:00:00 2001
From: Pooja Thakur
Date: Fri, 8 Aug 2025 18:57:29 -0400
Subject: [PATCH 3/4] remove extra docs

---
 docs/Getting_started.md               |  63 +++++++++---
 docs/New_Challenge.md                 |  51 -----------
 docs/Starter_Codes.md                 | 111 ++++++++++++++++++----
 docs/Starter_Files/easy_problems.md   | 116 --------------------------
 docs/Starter_Files/hard_problems.md   |  93 ---------------------
 docs/Starter_Files/medium_problems.md |  87 -------------------
 6 files changed, 145 insertions(+), 376 deletions(-)
 delete mode 100644 docs/New_Challenge.md
 delete mode 100644 docs/Starter_Files/easy_problems.md
 delete mode 100644 docs/Starter_Files/hard_problems.md
 delete mode 100644 docs/Starter_Files/medium_problems.md

diff --git a/docs/Getting_started.md b/docs/Getting_started.md
index 0d5aa8c..17c21e6 100644
--- a/docs/Getting_started.md
+++ b/docs/Getting_started.md
@@ -2,7 +2,7 @@

LeetGPU challenges are low-level GPU programming tasks focused on writing custom CUDA, Triton, or TinyGrad kernels. They evaluate both functional correctness and performance under real GPU constraints.

-This guide provides detailed instructions for creating new GPU programming challenges for LeetGPU. It covers the complete process from concept to submission.
+This guide provides instructions for creating new GPU programming challenges for LeetGPU. It covers the complete process from concept to submission.

## Challenge Structure

@@ -20,20 +20,59 @@ challenges/<difficulty>/<number>_<challenge_name>/
    └── starter.triton.py   # Triton template
```

-## Creating the Challenge Files
-
-### Step 1: Choose Your Challenge Location
-
-1. Determine the appropriate difficulty level (easy, medium, or hard)
-2. Create your challenge directory: `challenges/<difficulty>/<challenge_name>/`
-
-### Step 2: Create the Basic Structure
-
-```bash
-mkdir challenges/level_folder/your_challenge_name/
-cd challenges/level_folder/your_challenge_name/
-mkdir starter/
-touch challenge.html challenge.py
-touch starter/starter.cu starter/starter.mojo starter/starter.pytorch.py starter/starter.tinygrad.py starter/starter.triton.py
-```
+### challenge.html template
+
+
+# [Challenge Name]
+
+## Description
+
+[Provide a clear, concise explanation of what the algorithm or function is supposed to do. Include input and output specifications, if necessary.]
+
+### Mathematical Formulation
+
+[If applicable, provide the mathematical formula using LaTeX notation]
+
+$$
+\text{[Your formula here]}
+$$
+
+## Implementation Requirements
+
+- **No External Libraries:** Solutions must be implemented using only native features. No external libraries or frameworks are permitted.
+- **Function Signature:** The solve function signature is fixed and must not be modified. Implement your solution according to the provided signature.
+- **Output Variable:** Results must be written to the designated output parameter: `[output_parameter_name]`
+
+
+
+## Examples
+
+### Example 1
+**Input:**
+```
+[Provide specific input values]
+```
+
+**Expected Output:**
+```
+[Show the corresponding output values]
+```
+
+### Example 2
+**Input:**
+```
+[Provide different input values]
+```
+
+**Expected Output:**
+```
+[Show the corresponding output values]
+```
+
+## Constraints
+
+- **Input Size:** [Specify the range of input dimensions, e.g., "1 ≤ N ≤ 1,000,000"]
+- **Value Range:** [Specify the range of input values, e.g., "-1000.0 ≤ input[i] ≤ 1000.0"]
+- **Memory Limits:** [If applicable, specify any memory constraints]
+
+
diff --git a/docs/New_Challenge.md b/docs/New_Challenge.md
deleted file mode 100644
index 50e3287..0000000
--- a/docs/New_Challenge.md
+++ /dev/null
@@ -1,51 +0,0 @@
-# [Challenge Name]
-
-## Description
-
-[Provide a clear, concise explanation of what the algorithm or function is supposed to do. Include input and output specifications, if necessary.]
-
-### Mathematical Formulation
-
-[If applicable, provide the mathematical formula using LaTeX notation]
-
-$$
-\text{[Your formula here]}
-$$
-
-## Implementation Requirements
-
-- **No External Libraries:** Solutions must be implemented using only native features. No external libraries or frameworks are permitted.
-- **Function Signature:** The solve function signature is fixed and must not be modified. Implement your solution according to the provided signature.
-- **Output Variable:** Results must be written to the designated output parameter: `[output_parameter_name]`
-
-
-
-## Examples
-
-### Example 1
-**Input:**
-```
-[Provide specific input values]
-```
-
-**Expected Output:**
-```
-[Show the corresponding output values]
-```
-
-### Example 2
-**Input:**
-```
-[Provide different input values]
-```
-
-**Expected Output:**
-```
-[Show the corresponding output values]
-```
-
-## Constraints
-
-- **Input Size:** [Specify the range of input dimensions, e.g., "1 ≤ N ≤ 1,000,000"]
-- **Value Range:** [Specify the range of input values, e.g., "-1000.0 ≤ input[i] ≤ 1000.0"]
-- **Memory Limits:** [If applicable, specify any memory constraints]
diff --git a/docs/Starter_Codes.md b/docs/Starter_Codes.md
index b07f1f0..8befc09 100644
--- a/docs/Starter_Codes.md
+++ b/docs/Starter_Codes.md
@@ -38,9 +38,6 @@
- GPU tensor parameters
- Simple, direct implementations

-## Designing Function Signatures
-
-### Step 1: Define Core Parameters

Based on the algorithm requirements, determine:

@@ -49,23 +46,103 @@
4. **Configuration parameters**: Any algorithm-specific settings?

-### Step 2: Choose Parameter Names
-
-Use clear, descriptive names that follow conventions:
-
-**Common Patterns:**
-- Single input/output: `input`, `output`
-- Multiple inputs: `A`, `B`, `C` or `Q`, `K`, `V`
-- Dimensions: `N`, `M`, `K`, `rows`, `cols`
-- Algorithm-specific: `kernel_size`, `stride`, `padding`
-
-### Step 3: Determine Data Types
-
-**Standard Types:**
-- **CUDA**: `const float*` for inputs, `float*` for outputs, `int` for sizes
-- **Triton**: `int` for pointers, `int` for sizes
-- **Mojo**: `UnsafePointer[Float32]` for data, `Int32` for sizes
-- **PyTorch/TinyGrad**: `Tensor` for data, `int` for sizes
+## Easy Problems templates
+
+
+### CUDA Starter Template
+
+```cuda
+#include <cuda_runtime.h>
+
+__global__ void kernel_name(const float* input, float* output, int N) {
+    // TODO: Implement kernel logic
+    // Each thread processes one element
+    // Use threadIdx.x + blockIdx.x * blockDim.x to get global index
+}
+
+// input, output are device pointers (i.e. pointers to memory on the GPU)
+extern "C" void solve(const float* input, float* output, int N) {
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
+
+    kernel_name<<<blocksPerGrid, threadsPerBlock>>>(input, output, N);
+    cudaDeviceSynchronize();
+}
+```
+
+### Triton Starter Template
+
+```python
+# The use of PyTorch in Triton programs is not allowed for the purposes of fair benchmarking.
+import triton
+import triton.language as tl
+
+@triton.jit
+def kernel_name(input_ptr, output_ptr, n_elements, BLOCK_SIZE: tl.constexpr):
+    input_ptr = input_ptr.to(tl.pointer_type(tl.float32))
+    output_ptr = output_ptr.to(tl.pointer_type(tl.float32))
+
+    # TODO: Implement kernel logic
+    # Use tl.program_id(0) to get the block index
+    # Use tl.arange(0, BLOCK_SIZE) to compute offsets within the block
+
+# input_ptr, output_ptr are raw device pointers
+def solve(input_ptr: int, output_ptr: int, N: int):
+    BLOCK_SIZE = 1024
+    grid = (triton.cdiv(N, BLOCK_SIZE),)  # define grid
+    kernel_name[grid](input_ptr, output_ptr, N, BLOCK_SIZE)
+```
+
+### Mojo Starter Template
+
+```mojo
+from gpu.host import DeviceContext
+from gpu.id import block_dim, block_idx, thread_idx
+from memory import UnsafePointer
+from math import ceildiv
+
+fn kernel_name(input: UnsafePointer[Float32], output: UnsafePointer[Float32], N: Int32):
+    # TODO: Implement kernel logic
+    # Use thread_idx.x to get the thread index within a block
+    # Use block_idx.x to get the block index
+    pass
+
+@export
+def solve(input: UnsafePointer[Float32], output: UnsafePointer[Float32], N: Int32):
+    # define block and thread sizes
+    var BLOCK_SIZE: Int32 = 256
+    var num_blocks = ceildiv(N, BLOCK_SIZE)
+    var ctx = DeviceContext()
+
+    # Launch the kernel using enqueue_function
+    ctx.enqueue_function[kernel_name](
+        input, output, N,
+        grid_dim = num_blocks,   # Number of blocks in 1D grid
+        block_dim = BLOCK_SIZE   # Number of threads per block
+    )
+
+    ctx.synchronize()
+```
+
+### PyTorch Starter Template
+
+```python
+import torch
+
+def solve(input: torch.Tensor, output: torch.Tensor, N: int):
+    # TODO: Implement solution using PyTorch operations
+    pass
+```
+
+### TinyGrad Starter Template
+
+```python
+import tinygrad
+
+def solve(input: tinygrad.Tensor, output: tinygrad.Tensor, N: int):
+    # TODO: Implement solution using TinyGrad operations
+    pass
+```
+
-
-Refer to the Starter_Files directory for difficulty-wise starter code templates.
\ No newline at end of file
+For medium and hard problems, only define the solve function for CUDA, Mojo, and Triton.
\ No newline at end of file
diff --git a/docs/Starter_Files/easy_problems.md b/docs/Starter_Files/easy_problems.md
deleted file mode 100644
index 60168db..0000000
--- a/docs/Starter_Files/easy_problems.md
+++ /dev/null
@@ -1,116 +0,0 @@
-# Easy Problems Starter File Creation Guide
-
-Easy problems typically involve straightforward GPU operations with minimal complexity.
-
-Easy problems typically feature:
-- **Simple algorithms**: Vector operations, element-wise transformations
-- **Basic memory patterns**: Linear access patterns, minimal synchronization
-- **Straightforward parameters**: Usually 2-4 parameters (inputs, outputs, sizes)
-- **Single kernel**: One main kernel function per solution
-- **No complex data structures**: Arrays, simple matrices
-- **Element-wise operations**: Each thread processes one element independently
-
-
-### CUDA Starter Template
-
-```cuda
-#include <cuda_runtime.h>
-
-__global__ void kernel_name(const float* input, float* output, int N) {
-    // TODO: Implement kernel logic
-    // Each thread processes one element
-    // Use threadIdx.x + blockIdx.x * blockDim.x to get global index
-}
-
-// input, output are device pointers (i.e. pointers to memory on the GPU)
-extern "C" void solve(const float* input, float* output, int N) {
-    int threadsPerBlock = 256;
-    int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
-
-    kernel_name<<<blocksPerGrid, threadsPerBlock>>>(input, output, N);
-    cudaDeviceSynchronize();
-}
-```
-
-### Triton Starter Template
-
-```python
-# The use of PyTorch in Triton programs is not allowed for the purposes of fair benchmarking.
-import triton
-import triton.language as tl
-
-@triton.jit
-def kernel_name(input_ptr, output_ptr, n_elements, BLOCK_SIZE: tl.constexpr):
-    input_ptr = input_ptr.to(tl.pointer_type(tl.float32))
-    output_ptr = output_ptr.to(tl.pointer_type(tl.float32))
-
-    # TODO: Implement kernel logic
-    # Use tl.program_id(0) to get the block index
-    # Use tl.arange(0, BLOCK_SIZE) to compute offsets within the block
-
-# input_ptr, output_ptr are raw device pointers
-def solve(input_ptr: int, output_ptr: int, N: int):
-    BLOCK_SIZE = 1024
-    grid = (triton.cdiv(N, BLOCK_SIZE),)  # define grid
-    kernel_name[grid](input_ptr, output_ptr, N, BLOCK_SIZE)
-```
-
-### Mojo Starter Template
-
-```mojo
-from gpu.host import DeviceContext
-from gpu.id import block_dim, block_idx, thread_idx
-from memory import UnsafePointer
-from math import ceildiv
-
-fn kernel_name(input: UnsafePointer[Float32], output: UnsafePointer[Float32], N: Int32):
-    # TODO: Implement kernel logic
-    # Use thread_idx.x to get the thread index within a block
-    # Use block_idx.x to get the block index
-    pass
-
-@export
-def solve(input: UnsafePointer[Float32], output: UnsafePointer[Float32], N: Int32):
-    # define block and thread sizes
-    var BLOCK_SIZE: Int32 = 256
-    var num_blocks = ceildiv(N, BLOCK_SIZE)
-    var ctx = DeviceContext()
-
-    # Launch the kernel using enqueue_function
-    ctx.enqueue_function[kernel_name](
-        input, output, N,
-        grid_dim = num_blocks,   # Number of blocks in 1D grid
-        block_dim = BLOCK_SIZE   # Number of threads per block
-    )
-
-    ctx.synchronize()
-```
-
-### PyTorch Starter Template
-
-```python
-import torch
-
-def solve(input: torch.Tensor, output: torch.Tensor, N: int):
-    # TODO: Implement solution using PyTorch operations
-    pass
-```
-
-### TinyGrad Starter Template
-
-```python
-import tinygrad
-
-def solve(input: tinygrad.Tensor, output: tinygrad.Tensor, N: int):
-    # TODO: Implement solution using TinyGrad operations
-    pass
-```
-
-### Common Mistakes
-
-1. **Missing bounds checking**: Always check `idx < N`
-2. **Incorrect grid/block sizing**: Ensure all elements are covered
-3. **Wrong memory access patterns**: Ensure coalesced access
-4. **Missing synchronization**: Call `cudaDeviceSynchronize()`
-5. **Incorrect pointer types**: Use proper type conversions in Triton
-
diff --git a/docs/Starter_Files/hard_problems.md b/docs/Starter_Files/hard_problems.md
deleted file mode 100644
index 2e3b7f5..0000000
--- a/docs/Starter_Files/hard_problems.md
+++ /dev/null
@@ -1,93 +0,0 @@
-# Hard Problems Starter File Creation Guide
-
-Hard problems involve complex algorithms, advanced GPU programming techniques, and sophisticated memory management.
-
-Hard problems typically feature:
-- **Complex algorithms**: Multi-stage computations, advanced mathematical operations
-- **Sophisticated memory patterns**: Complex data structures, custom memory layouts
-- **Multiple parameters**: 5+ parameters including algorithm-specific configurations
-- **Advanced indexing**: Multi-dimensional indexing, complex access patterns
-- **Inter-block communication**: Cooperation across multiple thread blocks
-- **Specialized optimizations**: Tensor cores, custom kernels, advanced techniques
-
-## Starter File Structure
-
-### CUDA Starter Template
-
-```cuda
-#include <cuda_runtime.h>
-
-// input, output are device pointers (i.e. pointers to memory on the GPU)
-extern "C" void solve(const float* input, float* output, int N, int param1, int param2) {
-    // TODO: Implement solve function
-    // May require multiple kernel launches with complex coordination
-    // Consider memory allocation for intermediate results
-    // May need to handle complex data structures
-}
-```
-
-### Triton Starter Template
-
-```python
-# The use of PyTorch in Triton programs is not allowed for the purposes of fair benchmarking.
-import triton
-import triton.language as tl
-
-# Complex algorithms may require multiple kernel functions
-def solve(input_ptr: int, output_ptr: int, N: int, param1: int, param2: int):
-    # TODO: Implement solve function
-    # May require multiple kernel launches with complex coordination
-    # Consider intermediate memory allocation
-    # May need to handle complex data structures
-    pass
-```
-
-### Mojo Starter Template
-
-```mojo
-from gpu.host import DeviceContext
-from gpu.id import block_dim, block_idx, thread_idx
-from memory import UnsafePointer
-from math import ceildiv
-
-@export
-def solve(input: UnsafePointer[Float32], output: UnsafePointer[Float32],
-          N: Int32, param1: Int32, param2: Int32):
-    # TODO: Implement solve function
-    # May require multiple kernel launches or complex logic
-    # Consider advanced memory management
-    pass
-```
-
-### PyTorch Starter Template
-
-```python
-import torch
-
-def solve(input: torch.Tensor, output: torch.Tensor, N: int, param1: int, param2: int):
-    # TODO: Implement solution using PyTorch operations
-    # May require complex tensor operations and custom functions
-    pass
-```
-
-### TinyGrad Starter Template
-
-```python
-import tinygrad
-
-def solve(input: tinygrad.Tensor, output: tinygrad.Tensor, N: int, param1: int, param2: int):
-    # TODO: Implement solution using TinyGrad operations
-    # May require complex tensor operations and custom functions
-    pass
-```
-
-
-
-## Common Mistakes to Avoid
-
-1. **Complex synchronization**: Ensure proper thread coordination
-2. **Memory leaks**: Free all allocated memory
-3. **Race conditions**: Use proper synchronization primitives
-4. **Numerical instability**: Handle floating-point precision carefully
-5. **Incorrect indexing**: Verify complex multi-dimensional indexing
-
diff --git a/docs/Starter_Files/medium_problems.md b/docs/Starter_Files/medium_problems.md
deleted file mode 100644
index 4104543..0000000
--- a/docs/Starter_Files/medium_problems.md
+++ /dev/null
@@ -1,87 +0,0 @@
-# Medium Problems Starter File Creation Guide
-
-Medium problems introduce more complex algorithms and memory access patterns while maintaining reasonable implementation complexity.
-
-Medium problems typically feature:
-- **Multi-step algorithms**: Operations requiring multiple kernel launches or complex logic
-- **Advanced memory patterns**: Shared memory usage, reduction operations
-- **Multiple parameters**: 3-6 parameters including algorithm-specific settings
-- **Complex indexing**: 2D/3D indexing, strided access patterns
-- **Inter-thread communication**: Some cooperation between threads within blocks
-- **Algorithm-specific optimizations**: Custom memory layouts, specialized kernels
-
-## Starter File Structure
-
-### CUDA Starter Template
-
-```cuda
-#include <cuda_runtime.h>
-
-// input, output are device pointers (i.e. pointers to memory on the GPU)
-extern "C" void solve(const float* input, float* output, int N) {
-    // TODO: Implement solve function
-    // May require multiple kernel launches
-    // Consider memory allocation for intermediate results
-}
-```
-
-### Triton Starter Template
-
-```python
-# The use of PyTorch in Triton programs is not allowed for the purposes of fair benchmarking.
-import triton
-import triton.language as tl
-
-def solve(input_ptr: int, output_ptr: int, N: int):
-    # TODO: Implement solve function
-    # May require multiple kernel launches
-    # Consider intermediate memory allocation
-    pass
-```
-
-### Mojo Starter Template
-
-```mojo
-from gpu.host import DeviceContext
-from gpu.id import block_dim, block_idx, thread_idx
-from memory import UnsafePointer
-from math import ceildiv
-
-@export
-def solve(input: UnsafePointer[Float32], output: UnsafePointer[Float32], N: Int32):
-    # TODO: Implement solve function
-    # May require multiple kernel launches or complex logic
-    pass
-```
-
-### PyTorch Starter Template
-
-```python
-import torch
-
-def solve(input: torch.Tensor, output: torch.Tensor, N: int):
-    # TODO: Implement solution using PyTorch operations
-    # May require multiple tensor operations
-    pass
-```
-
-### TinyGrad Starter Template
-
-```python
-import tinygrad
-
-def solve(input: tinygrad.Tensor, output: tinygrad.Tensor, N: int):
-    # TODO: Implement solution using TinyGrad operations
-    # May require multiple tensor operations
-    pass
-```
-
-
-## Common Mistakes to Avoid
-
-1. **Missing synchronization**: Use `__syncthreads()` when needed
-2. **Incorrect shared memory usage**: Avoid bank conflicts
-3. **Memory leaks**: Free allocated memory
-4. **Race conditions**: Ensure proper thread coordination
-5. **Numerical instability**: Use proper techniques (e.g., softmax with max subtraction)
-

From fcd465f7d4c5af3f603b9be62d1abbe1c4a2f2d8 Mon Sep 17 00:00:00 2001
From: Pooja Thakur
Date: Sat, 9 Aug 2025 16:38:14 -0400
Subject: [PATCH 4/4] update starter docs

---
 docs/Starter_Codes.md | 129 +++++++++++++++++++++++++++++-------------
 1 file changed, 91 insertions(+), 38 deletions(-)

diff --git a/docs/Starter_Codes.md b/docs/Starter_Codes.md
index 8befc09..67b40ea 100644
--- a/docs/Starter_Codes.md
+++ b/docs/Starter_Codes.md
@@ -19,7 +19,9 @@ Each framework has specific requirements:
- Kernel functions with `__global__` qualifier (for easy problems)
- `extern "C"` solve function for framework integration
- Proper memory management and synchronization
- Grid and block size calculations

**Triton:**
- `@triton.jit` decorator for kernel compilation
- Pointer type conversions for data types
- Block size and grid calculations
- PyTorch restriction compliance

**Mojo:**
- `@export` decorator for framework integration
- Proper GPU imports and memory types
- Device context management
- Function parameter types

**PyTorch/TinyGrad:**
- Tensor-based function signatures
- GPU tensor parameters
- Simple, direct implementations


Based on the algorithm requirements, determine:

1. **Input parameters**: What data does the algorithm need?
2. **Output parameters**: Where should results be written?
3. **Size parameters**: What dimensions are involved?
4. **Configuration parameters**: Any algorithm-specific settings?

-## Easy Problems templates
+## Easy Problems

### CUDA Starter Template

```cuda
#include <cuda_runtime.h>

__global__ void kernel_name(const float* input, float* output, int N) {
    // TODO: Implement kernel logic
    // Each thread processes one element
    // Use threadIdx.x + blockIdx.x * blockDim.x to get global index
}

// input, output are device pointers (i.e. pointers to memory on the GPU)
extern "C" void solve(const float* input, float* output, int N) {
    int threadsPerBlock = 256;
    int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;

    kernel_name<<<blocksPerGrid, threadsPerBlock>>>(input, output, N);
    cudaDeviceSynchronize();
}
```

### Triton Starter Template

```python
# The use of PyTorch in Triton programs is not allowed for the purposes of fair benchmarking.
import triton
import triton.language as tl

@triton.jit
def kernel_name(input_ptr, output_ptr, n_elements, BLOCK_SIZE: tl.constexpr):
    input_ptr = input_ptr.to(tl.pointer_type(tl.float32))
    output_ptr = output_ptr.to(tl.pointer_type(tl.float32))

    # TODO: Implement kernel logic
    # Use tl.program_id(0) to get the block index
    # Use tl.arange(0, BLOCK_SIZE) to compute offsets within the block

# input_ptr, output_ptr are raw device pointers
def solve(input_ptr: int, output_ptr: int, N: int):
    BLOCK_SIZE = 1024
    grid = (triton.cdiv(N, BLOCK_SIZE),)  # define grid
    kernel_name[grid](input_ptr, output_ptr, N, BLOCK_SIZE)
```

### Mojo Starter Template

```mojo
from gpu.host import DeviceContext
from gpu.id import block_dim, block_idx, thread_idx
from memory import UnsafePointer
from math import ceildiv

fn kernel_name(input: UnsafePointer[Float32], output: UnsafePointer[Float32], N: Int32):
    # TODO: Implement kernel logic
    # Use thread_idx.x to get the thread index within a block
    # Use block_idx.x to get the block index
    pass

+# input, output are device pointers (i.e. pointers to memory on the GPU)
@export
def solve(input: UnsafePointer[Float32], output: UnsafePointer[Float32], N: Int32):
    # define block and thread sizes
    var BLOCK_SIZE: Int32 = 256
    var num_blocks = ceildiv(N, BLOCK_SIZE)
    var ctx = DeviceContext()

-    # Launch the kernel using enqueue_function
    ctx.enqueue_function[kernel_name](
-        input, output, N,
-        grid_dim = num_blocks,   # Number of blocks in 1D grid
-        block_dim = BLOCK_SIZE   # Number of threads per block
+        input, output, N,
+        grid_dim = num_blocks,
+        block_dim = BLOCK_SIZE
    )

    ctx.synchronize()
```

### PyTorch Starter Template

```python
import torch

-def solve(input: torch.Tensor, output: torch.Tensor, N: int):
+def solve(input, output, size):
    # TODO: Implement solution using PyTorch operations
    pass
```

### TinyGrad Starter Template

```python
import tinygrad

-def solve(input: tinygrad.Tensor, output: tinygrad.Tensor, N: int):
+def solve(input, output, size):
    # TODO: Implement solution using TinyGrad operations
    pass
```

-
-For medium and hard problems, only define the solve function for CUDA, Mojo, and Triton.
\ No newline at end of file
+
+## Medium and Hard Problems
+
+### CUDA Starter Template
+
+```cuda
+#include <cuda_runtime.h>
+
+// input, output are device pointers (i.e. pointers to memory on the GPU)
+extern "C" void solve(const float* input, float* output, int N) {
+
+}
+```
+
+### Triton Starter Template
+
+```python
+# The use of PyTorch in Triton programs is not allowed for the purposes of fair benchmarking.
+import triton
+import triton.language as tl
+
+# input_ptr, output_ptr are raw device pointers
+def solve(input_ptr: int, output_ptr: int, N: int):
+    pass
+```
+
+### Mojo Starter Template
+
+```mojo
+from gpu.host import DeviceContext
+from gpu.id import block_dim, block_idx, thread_idx
+from memory import UnsafePointer
+from math import ceildiv
+
+@export
+def solve(input: UnsafePointer[Float32], output: UnsafePointer[Float32], N: Int32):
+    pass
+```
+
+### PyTorch Starter Template
+
+```python
+import torch
+
+def solve(input, output, size):
+    # TODO: Implement solution using PyTorch operations
+    pass
+```
+
+### TinyGrad Starter Template
+
+```python
+import tinygrad
+
+def solve(input, output, size):
+    # TODO: Implement solution using TinyGrad operations
+    pass
+```
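+
+## Worked Example
+
+As a reference point, here is a minimal sketch of what a *completed* easy starter can look like. ReLU is assumed as the example challenge (it is the signature example used throughout this guide); any element-wise operation follows the same pattern:
+
+```cuda
+#include <cuda_runtime.h>
+
+__global__ void relu_kernel(const float* input, float* output, int N) {
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx < N) {                           // bounds check: grid may overshoot N
+        output[idx] = fmaxf(input[idx], 0.0f);
+    }
+}
+
+// input, output are device pointers (i.e. pointers to memory on the GPU)
+extern "C" void solve(const float* input, float* output, int N) {
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
+
+    relu_kernel<<<blocksPerGrid, threadsPerBlock>>>(input, output, N);
+    cudaDeviceSynchronize();
+}
+```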