forked from DreamLab-AI/origin-logseq-AR
-
Notifications
You must be signed in to change notification settings - Fork 19
Expand file tree
/
Copy pathbuild.rs
More file actions
285 lines (253 loc) · 11 KB
/
build.rs
File metadata and controls
285 lines (253 loc) · 11 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
use std::env;
use std::path::{Path, PathBuf};
use std::process::Command;
/// All CUDA source files that must be compiled to PTX.
const CUDA_FILES: [&str; 11] = [
    "src/utils/visionflow_unified.cu",
    "src/utils/gpu_clustering_kernels.cu",
    "src/utils/dynamic_grid.cu",
    "src/utils/gpu_aabb_reduction.cu",
    "src/utils/gpu_landmark_apsp.cu",
    "src/utils/sssp_compact.cu",
    "src/utils/visionflow_unified_stability.cu",
    "src/utils/ontology_constraints.cu",
    "src/utils/semantic_forces.cu",
    "src/utils/pagerank.cu",
    "src/utils/gpu_connected_components.cu",
];

/// Subset of CUDA sources that export host-callable FFI symbols and must
/// additionally be compiled to object code and linked into a static library.
/// Tuples are (source path, object-file stem).
const LINK_SOURCES: [(&str, &str); 4] = [
    ("src/utils/visionflow_unified.cu", "thrust_wrapper"),
    ("src/utils/semantic_forces.cu", "semantic_forces"),
    ("src/utils/pagerank.cu", "pagerank"),
    ("src/utils/gpu_connected_components.cu", "gpu_connected_components"),
];

/// Build-script entry point: compiles all CUDA kernels to PTX, builds a
/// static library from the FFI-exporting sources, and emits the cargo
/// link directives. Does nothing when the `gpu` cargo feature is off.
fn main() {
    // Skip all CUDA work when the feature gate is off.
    if env::var("CARGO_FEATURE_GPU").is_err() {
        println!("cargo:warning=GPU feature disabled, skipping CUDA compilation");
        return;
    }

    // Rebuild when any CUDA source (or this script) changes.
    for cuda_file in &CUDA_FILES {
        println!("cargo:rerun-if-changed={}", cuda_file);
    }
    println!("cargo:rerun-if-changed=build.rs");

    emit_source_hashes();

    // OUT_DIR is always set by cargo for build scripts.
    let out_dir = env::var("OUT_DIR").unwrap();
    let cuda_path = env::var("CUDA_PATH")
        .or_else(|_| env::var("CUDA_HOME"))
        .unwrap_or_else(|_| "/opt/cuda".to_string());

    let cuda_arch = detect_cuda_arch();
    println!("Using CUDA architecture: sm_{}", cuda_arch);

    compile_all_ptx(&out_dir, &cuda_arch);
    build_static_lib(&out_dir, &cuda_arch);
    emit_link_directives(&out_dir, &cuda_path);
    println!("CUDA build complete!");
}

/// Content-hash each CUDA source and export it as a `{STEM}_CUDA_HASH`
/// rustc-env var so ptx.rs can verify PTX was built from current source.
/// Needed because cargo's mtime-based rerun-if-changed misses bind-mount
/// overlay changes (Docker image build vs host mount). Best-effort: a
/// missing or unreadable file simply gets no hash var.
fn emit_source_hashes() {
    use std::collections::hash_map::DefaultHasher;
    use std::hash::Hasher;

    for cuda_file in &CUDA_FILES {
        if let Ok(contents) = std::fs::read(cuda_file) {
            // NOTE: DefaultHasher output is only stable within a single
            // toolchain; build.rs and ptx.rs are compiled together, so
            // comparing hashes across the two is sound.
            let mut hasher = DefaultHasher::new();
            hasher.write(&contents);
            let hash = hasher.finish();
            // File stems are known-valid UTF-8 literals from CUDA_FILES.
            let stem = Path::new(cuda_file).file_stem().unwrap().to_str().unwrap();
            let env_var = format!("{}_CUDA_HASH", stem.to_uppercase());
            println!("cargo:rustc-env={}={:016x}", env_var, hash);
        }
    }
}

/// Determine the target CUDA compute capability as a digit string
/// ("75", "86", ...).
///
/// Priority: CUDA_ARCH env var > nvidia-smi auto-detection (native builds
/// only) > portable sm_75 baseline. In Docker builds (DOCKER_ENV set)
/// nvidia-smi is never consulted, because the build machine's GPU
/// (e.g. sm_89) may differ from the runtime GPU (e.g. sm_86); sm_75 PTX
/// JIT-compiles on any sm_75+ device.
fn detect_cuda_arch() -> String {
    env::var("CUDA_ARCH").unwrap_or_else(|_| {
        if env::var("DOCKER_ENV").is_ok() {
            println!("Docker build detected — skipping nvidia-smi GPU detection, using portable sm_75");
            return "75".to_string();
        }
        // Native build: try to auto-detect the GPU's compute capability.
        if let Ok(output) = Command::new("nvidia-smi")
            .args(["--query-gpu=compute_cap", "--format=csv,noheader", "--id=0"])
            .output()
        {
            if output.status.success() {
                let raw = String::from_utf8_lossy(&output.stdout);
                if let Some(cap) = raw.lines().next() {
                    let cap = cap.trim();
                    // nvidia-smi reports "8.6"; nvcc wants "86".
                    let arch = cap.replace('.', "");
                    if !arch.is_empty() {
                        println!("Auto-detected GPU compute capability: {} (sm_{})", cap, arch);
                        return arch;
                    }
                }
            }
        }
        "75".to_string()
    })
}

/// Compile every CUDA source to PTX in `out_dir`, downgrade the PTX ISA
/// version for driver compatibility, verify the output, and export a
/// `{STEM}_PTX_PATH` rustc-env var for each kernel. Panics on any
/// failure so the build aborts loudly.
fn compile_all_ptx(out_dir: &str, cuda_arch: &str) {
    println!("Compiling {} CUDA kernels to PTX...", CUDA_FILES.len());
    for cuda_file in &CUDA_FILES {
        let cuda_src = Path::new(cuda_file);
        let file_name = cuda_src.file_stem().unwrap().to_str().unwrap();
        let ptx_output = PathBuf::from(out_dir).join(format!("{}.ptx", file_name));

        println!("Compiling {} to PTX...", file_name);
        // Echo the exact command for build-log debugging.
        println!(
            "NVCC Command: nvcc -ptx -arch sm_{} -o {} {} --use_fast_math -O3",
            cuda_arch,
            ptx_output.display(),
            cuda_src.display()
        );
        let nvcc_output = Command::new("nvcc")
            .args([
                "-ptx",
                "-arch",
                &format!("sm_{}", cuda_arch),
                "-o",
                ptx_output.to_str().unwrap(),
                cuda_src.to_str().unwrap(),
                "--use_fast_math",
                "-O3",
            ])
            .output()
            .expect("Failed to execute nvcc - is CUDA toolkit installed and in PATH?");
        if !nvcc_output.status.success() {
            eprintln!("NVCC STDOUT: {}", String::from_utf8_lossy(&nvcc_output.stdout));
            eprintln!("NVCC STDERR: {}", String::from_utf8_lossy(&nvcc_output.stderr));
            panic!(
                "CUDA PTX compilation failed for {} with exit code: {:?}. Check CUDA installation and source file.",
                file_name,
                nvcc_output.status.code()
            );
        }

        downgrade_ptx_isa(&ptx_output, file_name);

        // Verify the PTX file was actually created and is non-empty.
        match std::fs::metadata(&ptx_output) {
            Ok(metadata) => {
                println!(
                    "PTX Build: {} created, size: {} bytes",
                    file_name,
                    metadata.len()
                );
                if metadata.len() == 0 {
                    panic!("PTX file {} was created but is empty - CUDA compilation may have failed silently", file_name);
                }
                // Export the PTX path so ptx.rs can load it at runtime.
                let env_var = format!("{}_PTX_PATH", file_name.to_uppercase());
                println!("cargo:rustc-env={}={}", env_var, ptx_output.display());
                println!("PTX Build: Exported {}={}", env_var, ptx_output.display());
            }
            Err(e) => panic!(
                "PTX file {} was not created despite successful nvcc status: {}",
                file_name, e
            ),
        }
    }
    println!("All PTX compilation successful!");
}

/// Downgrade the PTX `.version` directive from 9.x to 9.0 for driver
/// compatibility: newer CUDA toolkits emit `.version 9.x` but the host
/// driver may only JIT up to 9.0. Stated safe by the original authors
/// because the emitted kernels don't use ISA 9.1+ features.
///
/// Fixes over the previous fixed-width splice: multi-digit minor
/// versions (e.g. "9.12") are replaced fully instead of being corrupted
/// to "9.02", and a file already at 9.0 is left untouched (the old
/// 13-char-vs-12-char comparison could never match, so it always
/// rewrote the file). Read failures are silently ignored, as before.
fn downgrade_ptx_isa(ptx_output: &Path, file_name: &str) {
    const MARKER: &str = ".version 9.";
    let ptx_text = match std::fs::read_to_string(ptx_output) {
        Ok(t) => t,
        Err(_) => return, // best-effort, matching original behavior
    };
    if let Some(pos) = ptx_text.find(MARKER) {
        let minor_start = pos + MARKER.len();
        // Consume the full (possibly multi-digit) minor version.
        let minor_len = ptx_text[minor_start..]
            .find(|c: char| !c.is_ascii_digit())
            .unwrap_or(ptx_text.len() - minor_start);
        let minor = &ptx_text[minor_start..minor_start + minor_len];
        if !minor.is_empty() && minor != "0" {
            let fixed = format!(
                "{}{}0{}",
                &ptx_text[..pos],
                MARKER,
                &ptx_text[minor_start + minor_len..]
            );
            std::fs::write(ptx_output, fixed).expect("Failed to write downgraded PTX");
            println!("PTX Build: Downgraded 9.{} -> 9.0 for {}", minor, file_name);
        }
    }
}

/// Compile the FFI-exporting CUDA sources to relocatable object files,
/// device-link them, and archive everything into
/// `OUT_DIR/libthrust_wrapper.a`. Panics on any tool failure.
fn build_static_lib(out_dir: &str, cuda_arch: &str) {
    // -gencode produces CUBIN + embedded PTX for portability: the PTX
    // allows JIT compilation on GPUs with a higher compute capability
    // than the build target (e.g. built for sm_75, runs on sm_86).
    let gencode_flag = format!(
        "-gencode=arch=compute_{0},code=[sm_{0},compute_{0}]",
        cuda_arch
    );

    let mut obj_files: Vec<PathBuf> = Vec::new();
    for &(src_path, obj_name) in &LINK_SOURCES {
        let obj_output = PathBuf::from(out_dir).join(format!("{}.o", obj_name));
        println!("Compiling {} to object file (gencode: {})...", obj_name, gencode_flag);
        let obj_status = Command::new("nvcc")
            .args([
                "-c",
                &gencode_flag,
                "-o",
                obj_output.to_str().unwrap(),
                src_path,
                "--use_fast_math",
                "-O3",
                "-Xcompiler",
                "-fPIC",
                "-dc", // relocatable device code, required for device linking
            ])
            .status()
            .unwrap_or_else(|e| panic!("Failed to compile {}: {}", obj_name, e));
        if !obj_status.success() {
            panic!("{} compilation failed", obj_name);
        }
        obj_files.push(obj_output);
    }

    // Device-link all objects together (required for cross-module device calls).
    let dlink_output = PathBuf::from(out_dir).join("cuda_dlink.o");
    println!(
        "Device linking {} CUDA object files ({})...",
        obj_files.len(),
        gencode_flag
    );
    let mut dlink_args: Vec<String> = vec!["-dlink".to_string(), gencode_flag];
    dlink_args.extend(obj_files.iter().map(|o| o.to_str().unwrap().to_string()));
    dlink_args.push("-o".to_string());
    dlink_args.push(dlink_output.to_str().unwrap().to_string());
    let dlink_status = Command::new("nvcc")
        .args(&dlink_args)
        .status()
        .expect("Failed to device link");
    if !dlink_status.success() {
        panic!("Device linking failed");
    }

    // Archive the object files + device-link stub into one static library.
    let lib_output = PathBuf::from(out_dir).join("libthrust_wrapper.a");
    println!("Creating static library...");
    let mut ar_args: Vec<String> = vec![
        "rcs".to_string(),
        lib_output.to_str().unwrap().to_string(),
    ];
    ar_args.extend(obj_files.iter().map(|o| o.to_str().unwrap().to_string()));
    ar_args.push(dlink_output.to_str().unwrap().to_string());
    let ar_status = Command::new("ar")
        .args(&ar_args)
        .status()
        .expect("Failed to create static library");
    if !ar_status.success() {
        panic!("Failed to create static library");
    }
}

/// Emit cargo link directives for the generated static library and the
/// CUDA runtime/driver libraries.
fn emit_link_directives(out_dir: &str, cuda_path: &str) {
    println!("cargo:rustc-link-search=native={}", out_dir);
    println!("cargo:rustc-link-lib=static=thrust_wrapper");
    println!("cargo:rustc-link-search=native={}/lib64", cuda_path);
    println!("cargo:rustc-link-search=native={}/lib64/stubs", cuda_path);
    println!("cargo:rustc-link-lib=cudart");
    println!("cargo:rustc-link-lib=cuda");
    println!("cargo:rustc-link-lib=cudadevrt"); // device runtime, needed for Thrust
    println!("cargo:rustc-link-lib=stdc++"); // C++ standard library for Thrust host code
}