From 7e448af9dd6f9bcab9852e319b22ad1c576e42be Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Thu, 20 Nov 2025 01:36:38 +0000
Subject: [PATCH 1/4] Add readme.txt with compilation and execution
 instructions

---
 readme.txt | 181 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 181 insertions(+)
 create mode 100644 readme.txt
diff --git a/readme.txt b/readme.txt
new file mode 100644
index 0000000..3b0085a
--- /dev/null
+++ b/readme.txt
@@ -0,0 +1,181 @@
+================================================================================
+GPU KERNEL PERFORMANCE PREDICTION - QUICK USER MANUAL
+================================================================================
+
+This project has two main components:
+1. gpu-perf: GPU performance data collection using CUDA benchmarks
+2. predict-kernel-perf: Performance prediction models (analytical & ML)
+
+================================================================================
+PART 1: DATA COLLECTION (gpu-perf)
+================================================================================
+
+REQUIREMENTS:
+- NVIDIA GPU with CUDA support
+- CUDA Toolkit (nvcc compiler)
+- cuBLAS library
+- Python 3.6+
+- Bash shell
+
+STEP 1: GPU CALIBRATION (one-time setup per GPU)
+-------------------------------------------------
+cd gpu-perf/calibration
+
+# Compile and run calibration programs
+# Replace "2080ti" with your GPU: 2080ti, 4070, titanv, or titanx
+
+nvcc -O3 -o ../bin/props props.cu
+../bin/props > ../data/props_2080ti.out
+
+nvcc -O3 -o ../bin/stream_like stream_like.cu
+../bin/stream_like > ../data/stream_like_2080ti.out
+
+nvcc -O3 -lcublas -o ../bin/gemm_cublas gemm_cublas.cu
+../bin/gemm_cublas > ../data/gemm_cublas_2080ti.out
+
+cd ..
+
+STEP 2: GENERATE DATASET
+-------------------------
+# For RTX 2080 Ti:
+./scripts/gen_trials_2080ti.sh
+python3 scripts/build_final_dataset.py \
+  "data/trials_*__2080ti.csv" \
+  data/props_2080ti.out \
+  data/stream_like_2080ti.out \
+  data/gemm_cublas_2080ti.out \
+  data/runs_2080ti_final.csv
+
+# For RTX 4070:
+./scripts/gen_trials_4070.sh
+python3 scripts/build_final_dataset.py \
+  "data/trials_*__4070.csv" \
+  data/props_4070.out \
+  data/stream_like_4070.out \
+  data/gemm_cublas_4070.out \
+  data/runs_4070_final.csv
+
+# For Titan V:
+./scripts/gen_trials_titanv.sh
+python3 scripts/build_final_dataset.py \
+  "data/trials_*__titanv.csv" \
+  data/props_titanv.out \
+  data/stream_like_titanv.out \
+  data/gemm_cublas_titanv.out \
+  data/runs_titanv_final.csv
+
+# For Titan X:
+./scripts/gen_trials_titanx.sh
+python3 scripts/build_final_dataset.py \
+  "data/trials_*__titanx.csv" \
+  data/props_titanx.out \
+  data/stream_like_titanx.out \
+  data/gemm_cublas_titanx.out \
+  data/runs_titanx_final.csv
+
+OUTPUT: data/runs_<gpu>_final.csv (65+ kernel configurations × 40 metrics)
+
+NOTE: You can only generate data for ONE GPU at a time (the one in your system)
+
+================================================================================
+PART 2: PERFORMANCE PREDICTION (predict-kernel-perf)
+================================================================================
+
+REQUIREMENTS:
+- Python 3.6+
+- pandas, numpy
+- scikit-learn (for ML models)
+
+PREREQUISITES:
+Place dataset files in predict-kernel-perf/data/:
+- runs_2080ti_final.csv
+- runs_4070_final.csv
+- runs_titanv_final.csv
+- runs_titanx_final.csv
+- gpu_metrics.json
+
+(These files are generated by the gpu-perf pipeline above)
+
+RUN ANALYTICAL MODEL:
+---------------------
+cd predict-kernel-perf/scripts
+python3 analytical_model_occupancy.py
+
+OUTPUT: analytic_model_outputs/*.csv
+- cross_gpu_predictions.csv
+- exp1_same_config_new_gpu.csv
+- exp2_new_configs_same_gpus.csv
+- exp3a_new_kernels_*.csv
+
+RUN MACHINE LEARNING MODELS:
+-----------------------------
+cd predict-kernel-perf/scripts
+python3 ml_baseline.py
+
+OUTPUT: ml_outputs/*.csv
+- exp1_*_[model]_predictions.csv
+- exp2_*_[model]_predictions.csv
+- exp3_*_[model]_predictions.csv
+- Per-kernel error analysis files
+
+================================================================================
+VISUALIZATION AND ANALYSIS
+================================================================================
+
+GENERATE PLOTS:
+---------------
+cd predict-kernel-perf/scripts
+python3 plots.py
+
+OUTPUT: Visualization plots comparing model predictions
+
+CREATE SUMMARY TABLES:
+----------------------
+cd predict-kernel-perf/scripts
+python3 create_tables.py
+
+OUTPUT: Summary tables of model performance metrics
+
+================================================================================
+QUICK START EXAMPLE (RTX 2080 Ti)
+================================================================================
+
+# 1. Calibrate GPU
+cd gpu-perf/calibration
+nvcc -O3 -o ../bin/props props.cu && ../bin/props > ../data/props_2080ti.out
+nvcc -O3 -o ../bin/stream_like stream_like.cu && ../bin/stream_like > ../data/stream_like_2080ti.out
+nvcc -O3 -lcublas -o ../bin/gemm_cublas gemm_cublas.cu && ../bin/gemm_cublas > ../data/gemm_cublas_2080ti.out
+cd ..
+
+# 2. Generate dataset
+./scripts/gen_trials_2080ti.sh
+python3 scripts/build_final_dataset.py \
+  "data/trials_*__2080ti.csv" \
+  data/props_2080ti.out \
+  data/stream_like_2080ti.out \
+  data/gemm_cublas_2080ti.out \
+  data/runs_2080ti_final.csv
+
+# 3. Copy data files to prediction directory
+cp data/runs_*.csv ../predict-kernel-perf/data/
+
+# 4. Run prediction models
+cd ../predict-kernel-perf/scripts
+python3 analytical_model_occupancy.py
+python3 ml_baseline.py
+
+================================================================================
+TROUBLESHOOTING
+================================================================================
+
+- If nvcc not found: Ensure CUDA Toolkit is installed and in PATH
+- If permission denied on scripts: Run "chmod +x scripts/*.sh"
+- If Python modules missing: Run "pip3 install pandas numpy scikit-learn"
+- If cuBLAS errors occur: Ensure the cuBLAS library (shipped with the CUDA Toolkit) is installed
+
+For detailed documentation, see:
+- README.md (main documentation)
+- gpu-perf/README.md (data collection details)
+- predict-kernel-perf/README.md (modeling details)
+
+================================================================================

From 645b834e6a2deeb967ba8e611beaba454693c15a Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Thu, 20 Nov 2025 01:43:11 +0000
Subject: [PATCH 2/4] Fix paths for predict-kernel-perf scripts - run from
 data/ directory

---
 readme.txt | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/readme.txt b/readme.txt
index 3b0085a..a88432a 100644
--- a/readme.txt
+++ b/readme.txt
@@ -98,10 +98,10 @@ Place dataset files in predict-kernel-perf/data/:
 
 RUN ANALYTICAL MODEL:
 ---------------------
-cd predict-kernel-perf/scripts
-python3 analytical_model_occupancy.py
+cd predict-kernel-perf/data
+python3 ../scripts/analytical_model_occupancy.py
 
-OUTPUT: analytic_model_outputs/*.csv
+OUTPUT: predict-kernel-perf/data/analytic_model_outputs/*.csv
 - cross_gpu_predictions.csv
 - exp1_same_config_new_gpu.csv
 - exp2_new_configs_same_gpus.csv
@@ -109,10 +109,10 @@ OUTPUT: analytic_model_outputs/*.csv
 
 RUN MACHINE LEARNING MODELS:
 -----------------------------
-cd predict-kernel-perf/scripts
-python3 ml_baseline.py
+cd predict-kernel-perf/data
+python3 ../scripts/ml_baseline.py
 
-OUTPUT: ml_outputs/*.csv
+OUTPUT: predict-kernel-perf/data/ml_outputs/*.csv
 - exp1_*_[model]_predictions.csv
 - exp2_*_[model]_predictions.csv
 - exp3_*_[model]_predictions.csv
@@ -124,15 +124,15 @@ VISUALIZATION AND ANALYSIS
 
 GENERATE PLOTS:
 ---------------
-cd predict-kernel-perf/scripts
-python3 plots.py
+cd predict-kernel-perf/data
+python3 ../scripts/plots.py
 
 OUTPUT: Visualization plots comparing model predictions
 
 CREATE SUMMARY TABLES:
 ----------------------
-cd predict-kernel-perf/scripts
-python3 create_tables.py
+cd predict-kernel-perf/data
+python3 ../scripts/create_tables.py
 
 OUTPUT: Summary tables of model performance metrics
 
@@ -160,9 +160,9 @@ python3 scripts/build_final_dataset.py \
 cp data/runs_*.csv ../predict-kernel-perf/data/
 
 # 4. Run prediction models
-cd ../predict-kernel-perf/scripts
-python3 analytical_model_occupancy.py
-python3 ml_baseline.py
+cd ../predict-kernel-perf/data
+python3 ../scripts/analytical_model_occupancy.py
+python3 ../scripts/ml_baseline.py
 
 ================================================================================
 TROUBLESHOOTING

From 8439f99d1bbd537245809f30bae5e51eff97531f Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Thu, 20 Nov 2025 02:45:18 +0000
Subject: [PATCH 3/4] Add CUDA version compatibility section for different GPU
 architectures

---
 readme.txt | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/readme.txt b/readme.txt
index a88432a..55df807 100644
--- a/readme.txt
+++ b/readme.txt
@@ -17,6 +17,25 @@ REQUIREMENTS:
 - Python 3.6+
 - Bash shell
 
+CUDA VERSION COMPATIBILITY:
+----------------------------
+IMPORTANT: Different GPU architectures require different CUDA versions for
+optimal performance. Using incompatible CUDA versions can result in absurd
+performance numbers.
+
+If using module system (HPC clusters):
+
+# For Titan V (Volta, sm_70):
+module load cuda-12.6
+# AVOID CUDA 13.0+ for sm_70 - causes performance degradation
+
+# For RTX 2080 Ti, RTX 4070, Titan X (Turing/Ada, sm_75/sm_89):
+module load cuda-12.6
+# or module load cuda-13.0 for newer architectures
+
+Verify CUDA version:
+nvcc --version
+
 STEP 1: GPU CALIBRATION (one-time setup per GPU)
 -------------------------------------------------
 cd gpu-perf/calibration

From 1b044397b129da209dea9dd783c8200e7598fa28 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Thu, 20 Nov 2025 02:53:05 +0000
Subject: [PATCH 4/4] Update CUDA compatibility: only RTX 4070 uses CUDA 13.0,
 others use 12.6

---
 readme.txt | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/readme.txt b/readme.txt
index 55df807..e29a4af 100644
--- a/readme.txt
+++ b/readme.txt
@@ -25,13 +25,15 @@ performance numbers.
 
 If using module system (HPC clusters):
 
-# For Titan V (Volta, sm_70):
+# For RTX 4070 (Ada Lovelace, sm_89):
+module load cuda-13.0
+
+# For RTX 2080 Ti (Turing, sm_75) and Titan X (Maxwell sm_52 / Pascal sm_61):
 module load cuda-12.6
-# AVOID CUDA 13.0+ for sm_70 - causes performance degradation
 
-# For RTX 2080 Ti, RTX 4070, Titan X (Turing/Ada, sm_75/sm_89):
+# For Titan V (Volta, sm_70):
 module load cuda-12.6
-# or module load cuda-13.0 for newer architectures
+# AVOID CUDA 13.0+ for sm_70 - Volta support was removed in 13.0 (observed as degraded performance)
 
 Verify CUDA version:
 nvcc --version