From 3962addbeb324c061df7106175f65d442d57899d Mon Sep 17 00:00:00 2001 From: Ryan Breen Date: Wed, 26 Nov 2025 09:54:34 -0500 Subject: [PATCH] Fix CR3 handling in interrupt/syscall paths using per-CPU data MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The kernel was using a hardcoded CR3 value (0x101000) in assembly code, which became stale after build_master_kernel_pml4() created a new master PML4 at a different address. This caused triple faults during CR3 switches. Changes: - Add kernel_cr3 and saved_process_cr3 fields to PerCpuData struct - Store kernel CR3 at gs:[72] for interrupt/syscall entry to read - Save process CR3 at gs:[80] on entry, restore on exit if no context switch - Update kernel_cr3 in memory::init() after master PML4 is created - Fix SyscallFrame struct field order to match assembly push order - Fix syscall_return_to_userspace to clear next_cr3 BEFORE CR3 switch The fix ensures interrupt handlers and syscall entry can always switch to the correct kernel page table, and restore the process page table on return when no context switch occurred. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- kernel/src/interrupts/timer_entry.asm | 47 +++++++++++-- kernel/src/memory/mod.rs | 11 +++ kernel/src/per_cpu.rs | 75 +++++++++++++++++++- kernel/src/syscall/entry.asm | 99 +++++++++++++++++++++++---- kernel/src/syscall/handler.rs | 31 +++++---- 5 files changed, 227 insertions(+), 36 deletions(-) diff --git a/kernel/src/interrupts/timer_entry.asm b/kernel/src/interrupts/timer_entry.asm index 7d7f360..25e2c72 100644 --- a/kernel/src/interrupts/timer_entry.asm +++ b/kernel/src/interrupts/timer_entry.asm @@ -50,15 +50,25 @@ timer_interrupt_entry: cmp rax, 3 ; Ring 3? jne .skip_swapgs_entry ; If not from userspace, skip swapgs + ; We came from userspace, swap to kernel GS FIRST + ; We need kernel GS to read kernel_cr3 from per-CPU data + swapgs + + ; CRITICAL: Save the process CR3 BEFORE switching to kernel CR3 + ; This allows us to restore it on exit if no context switch happens + ; Save process CR3 to per-CPU data at gs:[80] (SAVED_PROCESS_CR3_OFFSET) + mov rax, cr3 ; Read current (process) CR3 + mov qword [gs:80], rax ; Save to per-CPU saved_process_cr3 + ; CRITICAL: Switch CR3 back to kernel page table ; When interrupt fires from userspace, CR3 is still the process PT ; We MUST switch to kernel PT before running any kernel code - ; TODO: Make this dynamic by storing kernel CR3 in per-CPU data - mov rax, 0x101000 ; Kernel CR3 (hardcoded for now) + ; Read kernel_cr3 from per-CPU data at gs:[72] (KERNEL_CR3_OFFSET) + mov rax, qword [gs:72] ; Read kernel CR3 from per-CPU data + test rax, rax ; Check if kernel_cr3 is set + jz .skip_cr3_switch_entry ; If not set, skip (early boot fallback) mov cr3, rax ; Switch to kernel page table - - ; We came from userspace, swap to kernel GS - swapgs +.skip_cr3_switch_entry: ; Log full frame details for first few userspace interrupts ; Pass frame pointer to logging function @@ -304,7 +314,32 @@ timer_interrupt_entry: jmp .after_cr3_check .no_cr3_switch_back_to_user: - ; No CR3 switch needed, swap back to user GS + ; No context switch, but we still need to restore the ORIGINAL process CR3! + ; We saved it on entry at gs:[80] (SAVED_PROCESS_CR3_OFFSET) + mov rax, qword [gs:80] ; Read saved process CR3 + test rax, rax ; Check if it was saved (non-zero) + jz .no_saved_cr3 ; If 0, skip (shouldn't happen from userspace) + + ; Debug: Output marker for saved CR3 restore + push rdx + mov dx, 0x3F8 + push rax + mov al, '!' ; '!' for saved CR3 restore + out dx, al + mov al, 'C' + out dx, al + mov al, 'R' + out dx, al + mov al, '3' + out dx, al + pop rax + pop rdx + + ; Switch back to original process CR3 + mov cr3, rax + +.no_saved_cr3: + ; Swap back to user GS for IRETQ swapgs .after_cr3_check: diff --git a/kernel/src/memory/mod.rs b/kernel/src/memory/mod.rs index 2f26136..37e69b2 100644 --- a/kernel/src/memory/mod.rs +++ b/kernel/src/memory/mod.rs @@ -46,6 +46,17 @@ pub fn init(physical_memory_offset: VirtAddr, memory_regions: &'static MemoryReg // PHASE 2: Build master kernel PML4 with upper-half mappings kernel_page_table::build_master_kernel_pml4(); + // CRITICAL: Update kernel_cr3 in per-CPU data to the new master PML4 + // per_cpu::init() already ran and set kernel_cr3 to the bootloader's CR3 + // Now that we've switched to the master PML4, we must update it + { + use x86_64::registers::control::Cr3; + let (current_frame, _) = Cr3::read(); + let master_cr3 = current_frame.start_address().as_u64(); + log::info!("CRITICAL: Updating kernel_cr3 to master PML4: {:#x}", master_cr3); + crate::per_cpu::set_kernel_cr3(master_cr3); + } + // Migrate any existing processes (though there shouldn't be any yet) kernel_page_table::migrate_existing_processes(); diff --git a/kernel/src/per_cpu.rs b/kernel/src/per_cpu.rs index c4bdd54..92675ed 100644 --- a/kernel/src/per_cpu.rs +++ b/kernel/src/per_cpu.rs @@ -56,6 +56,14 @@ pub struct PerCpuData { /// Target CR3 for next IRETQ (offset 64) - set before context switch /// 0 means no CR3 switch needed pub next_cr3: u64, + + /// Kernel CR3 (offset 72) - the master kernel page table + /// Used by interrupt/syscall entry to switch to kernel page tables + pub kernel_cr3: u64, + + /// Saved process CR3 (offset 80) - saved on interrupt entry from userspace + /// Used to restore process page tables on interrupt exit if no context switch + pub saved_process_cr3: u64, } // Linux-style preempt_count bit layout constants @@ -123,6 +131,10 @@ const TSS_OFFSET: usize = 48; // offset 48: *mut TSS (8 bytes) const SOFTIRQ_PENDING_OFFSET: usize = 56; // offset 56: u32 (4 bytes) #[allow(dead_code)] const NEXT_CR3_OFFSET: usize = 64; // offset 64: u64 (8 bytes) - ALIGNED +#[allow(dead_code)] +const KERNEL_CR3_OFFSET: usize = 72; // offset 72: u64 (8 bytes) - ALIGNED +#[allow(dead_code)] +const SAVED_PROCESS_CR3_OFFSET: usize = 80; // offset 80: u64 (8 bytes) - ALIGNED // Compile-time assertions to ensure offsets are correct // These will fail to compile if the offsets don't match expected values @@ -132,7 +144,7 @@ const _: () = assert!(USER_RSP_SCRATCH_OFFSET % 8 == 0, "user_rsp_scratch must b const _: () = assert!(core::mem::size_of::() == 8, "This code assumes 64-bit pointers"); // Verify struct size is 128 bytes due to align(64) attribute -// The actual data is 72 bytes, but align(64) rounds up to 128 +// The actual data is 88 bytes (saved_process_cr3 at offset 80), but align(64) rounds up to 128 const _: () = assert!(core::mem::size_of::() == 128, "PerCpuData must be 128 bytes (aligned to 64)"); // Verify bit layout matches Linux kernel @@ -158,6 +170,8 @@ impl PerCpuData { softirq_pending: 0, _pad2: 0, next_cr3: 0, + kernel_cr3: 0, + saved_process_cr3: 0, } } } @@ -195,6 +209,25 @@ pub fn init() { // Mark per-CPU data as initialized and safe to use PER_CPU_INITIALIZED.store(true, Ordering::Release); log::info!("Per-CPU data marked as initialized - preempt_count functions now use per-CPU storage"); + + // Store the current CR3 as the initial kernel CR3 + // NOTE: At this point, we're still using the bootloader's page tables. + // After memory::init() calls build_master_kernel_pml4(), the kernel switches + // to the master PML4 and calls set_kernel_cr3() to update this value. + // This initial value provides a fallback during early boot. + let (current_frame, _) = x86_64::registers::control::Cr3::read(); + let kernel_cr3_val = current_frame.start_address().as_u64(); + log::info!("Storing initial kernel_cr3 = {:#x} in per-CPU data (bootloader PT)", kernel_cr3_val); + + unsafe { + core::arch::asm!( + "mov gs:[{offset}], {}", + in(reg) kernel_cr3_val, + offset = const KERNEL_CR3_OFFSET, + options(nostack, preserves_flags) + ); + } + log::info!("kernel_cr3 stored successfully - interrupt handlers can now switch to kernel page tables"); } /// Get the current thread from per-CPU data @@ -941,6 +974,46 @@ pub fn set_next_cr3(cr3: u64) { } } +/// Get the kernel CR3 (master kernel page table) +/// Returns 0 if not initialized +#[allow(dead_code)] +pub fn get_kernel_cr3() -> u64 { + if !PER_CPU_INITIALIZED.load(Ordering::Acquire) { + return 0; + } + + unsafe { + let cr3: u64; + core::arch::asm!( + "mov {}, gs:[{offset}]", + out(reg) cr3, + offset = const KERNEL_CR3_OFFSET, + options(nostack, readonly, preserves_flags) + ); + cr3 + } +} + +/// Set the kernel CR3 (master kernel page table) +/// This should be called once after build_master_kernel_pml4() +pub fn set_kernel_cr3(cr3: u64) { + if !PER_CPU_INITIALIZED.load(Ordering::Acquire) { + log::warn!("set_kernel_cr3 called before per-CPU init, storing for later"); + // We can't store it yet, but we'll set it during init + return; + } + + log::info!("Setting kernel_cr3 in per-CPU data to {:#x}", cr3); + unsafe { + core::arch::asm!( + "mov gs:[{offset}], {}", + in(reg) cr3, + offset = const KERNEL_CR3_OFFSET, + options(nostack, preserves_flags) + ); + } +} + /// Check if we can schedule (preempt_count == 0 and returning to userspace) pub fn can_schedule(saved_cs: u64) -> bool { let current_preempt = preempt_count(); diff --git a/kernel/src/syscall/entry.asm b/kernel/src/syscall/entry.asm index 3c95a3b..eea28b9 100644 --- a/kernel/src/syscall/entry.asm +++ b/kernel/src/syscall/entry.asm @@ -42,16 +42,26 @@ syscall_entry: ; Clear direction flag for string operations cld + ; Always switch to kernel GS FIRST for INT 0x80 entry + ; We need kernel GS to read kernel_cr3 from per-CPU data + ; INT 0x80 is only used from userspace, so we always need swapgs + swapgs + + ; CRITICAL: Save the process CR3 BEFORE switching to kernel CR3 + ; This allows us to restore it on exit if no context switch happens + ; Save process CR3 to per-CPU data at gs:[80] (SAVED_PROCESS_CR3_OFFSET) + mov rax, cr3 ; Read current (process) CR3 + mov qword [gs:80], rax ; Save to per-CPU saved_process_cr3 + ; CRITICAL: Switch CR3 back to kernel page table ; Syscalls only come from userspace, so CR3 is always a process PT ; We MUST switch to kernel PT before running kernel code - ; TODO: Make this dynamic by storing kernel CR3 in per-CPU data - mov rax, 0x101000 ; Kernel CR3 (hardcoded for now) + ; Read kernel_cr3 from per-CPU data at gs:[72] (KERNEL_CR3_OFFSET) + mov rax, qword [gs:72] ; Read kernel CR3 from per-CPU data + test rax, rax ; Check if kernel_cr3 is set + jz .skip_cr3_switch ; If not set, skip (early boot fallback) mov cr3, rax ; Switch to kernel page table - - ; Always switch to kernel GS for INT 0x80 entry - ; INT 0x80 is only used from userspace, so we always need swapgs - swapgs +.skip_cr3_switch: ; Call the Rust syscall handler ; Pass pointer to saved registers as argument @@ -196,7 +206,32 @@ syscall_entry: jmp .after_cr3_check_syscall .no_cr3_switch_syscall_back_to_user: - ; No CR3 switch needed, swap back to user GS + ; No context switch, but we still need to restore the ORIGINAL process CR3! + ; We saved it on entry at gs:[80] (SAVED_PROCESS_CR3_OFFSET) + mov rax, qword [gs:80] ; Read saved process CR3 + test rax, rax ; Check if it was saved (non-zero) + jz .no_saved_cr3_syscall ; If 0, skip (shouldn't happen from userspace) + + ; Debug: Output marker for saved CR3 restore + push rdx + mov dx, 0x3F8 + push rax + mov al, '!' ; '!' for saved CR3 restore + out dx, al + mov al, 'S' + out dx, al + mov al, 'Y' + out dx, al + mov al, 'S' + out dx, al + pop rax + pop rdx + + ; Switch back to original process CR3 + mov cr3, rax + +.no_saved_cr3_syscall: + ; Swap back to user GS for IRETQ swapgs .after_cr3_check_syscall: @@ -317,9 +352,19 @@ syscall_return_to_userspace: test rax, rax jz .no_cr3_switch_first_entry_back_to_user - ; Interrupts already disabled (CLI at function start line 211) + ; Interrupts already disabled (CLI at function start line 260) ; Safe to switch CR3 now + ; CRITICAL FIX: Clear next_cr3 BEFORE switching CR3! + ; We must do this while kernel page tables are still active, + ; because after CR3 switch the process page tables may not + ; have the kernel per-CPU region mapped. Accessing [gs:64] + ; after CR3 switch would cause a page fault -> triple fault. + push rdx + xor rdx, rdx + mov qword [gs:64], rdx + pop rdx + ; Debug: Output marker for CR3 switch mov dx, 0x3F8 push rax @@ -337,20 +382,46 @@ syscall_return_to_userspace: out dx, al pop rax - ; Switch CR3 to process page table + ; NOW safe to switch CR3 to process page table + ; Kernel per-CPU data already cleared while kernel PT was active mov cr3, rax - ; Clear next_cr3 flag (set to 0) - xor rdx, rdx - mov qword [gs:64], rdx - ; Swap back to user GS for IRETQ swapgs jmp .after_cr3_check_first_entry .no_cr3_switch_first_entry_back_to_user: - ; No CR3 switch needed, swap back to user GS + ; No context switch, but we still need to restore the ORIGINAL process CR3! + ; We saved it on entry at gs:[80] (SAVED_PROCESS_CR3_OFFSET) + mov rax, qword [gs:80] ; Read saved process CR3 + test rax, rax ; Check if it was saved (non-zero) + jz .no_saved_cr3_first_entry ; If 0, skip (shouldn't happen from userspace) + + ; Debug: Output marker for saved CR3 restore + push rdx + mov dx, 0x3F8 + push rax + mov al, '!' ; '!' for saved CR3 restore + out dx, al + mov al, 'F' + out dx, al + mov al, 'I' + out dx, al + mov al, 'R' + out dx, al + mov al, 'S' + out dx, al + mov al, 'T' + out dx, al + pop rax + pop rdx + + ; Switch back to original process CR3 + mov cr3, rax + +.no_saved_cr3_first_entry: + ; Swap back to user GS for IRETQ swapgs .after_cr3_check_first_entry: diff --git a/kernel/src/syscall/handler.rs b/kernel/src/syscall/handler.rs index 5b04bcb..0a7fbf0 100644 --- a/kernel/src/syscall/handler.rs +++ b/kernel/src/syscall/handler.rs @@ -5,22 +5,23 @@ use super::{SyscallNumber, SyscallResult}; pub struct SyscallFrame { // General purpose registers (in memory order after all pushes) // Stack grows down, so last pushed is at lowest address (where RSP points) - // Assembly pushes in reverse order: r15 first, rax last - pub rax: u64, // Syscall number - pushed last, so at RSP+0 - pub rcx: u64, // at RSP+8 - pub rdx: u64, // at RSP+16 - pub rbx: u64, // at RSP+24 - pub rbp: u64, // at RSP+32 - pub rsi: u64, // at RSP+40 - pub rdi: u64, // at RSP+48 + // Assembly pushes: rax first, then rcx, rdx, rbx, rbp, rsi, rdi, r8-r15 + // So r15 (pushed last) is at RSP+0, and rax (pushed first) is at RSP+112 + pub r15: u64, // pushed last, at RSP+0 + pub r14: u64, // at RSP+8 + pub r13: u64, // at RSP+16 + pub r12: u64, // at RSP+24 + pub r11: u64, // at RSP+32 + pub r10: u64, // at RSP+40 + pub r9: u64, // at RSP+48 pub r8: u64, // at RSP+56 - pub r9: u64, // at RSP+64 - pub r10: u64, // at RSP+72 - pub r11: u64, // at RSP+80 - pub r12: u64, // at RSP+88 - pub r13: u64, // at RSP+96 - pub r14: u64, // at RSP+104 - pub r15: u64, // pushed first, so at RSP+112 + pub rdi: u64, // at RSP+64 + pub rsi: u64, // at RSP+72 + pub rbp: u64, // at RSP+80 + pub rbx: u64, // at RSP+88 + pub rdx: u64, // at RSP+96 + pub rcx: u64, // at RSP+104 + pub rax: u64, // Syscall number - pushed first, at RSP+112 // Interrupt frame (pushed by CPU before our code) pub rip: u64,