diff --git a/kernel/src/interrupts/timer_entry.asm b/kernel/src/interrupts/timer_entry.asm
index 7d7f360..25e2c72 100644
--- a/kernel/src/interrupts/timer_entry.asm
+++ b/kernel/src/interrupts/timer_entry.asm
@@ -50,15 +50,25 @@ timer_interrupt_entry:
     cmp rax, 3                      ; Ring 3?
     jne .skip_swapgs_entry          ; If not from userspace, skip swapgs
 
+    ; We came from userspace, swap to kernel GS FIRST
+    ; We need kernel GS to read kernel_cr3 from per-CPU data
+    swapgs
+
+    ; CRITICAL: Save the process CR3 BEFORE switching to kernel CR3
+    ; This allows us to restore it on exit if no context switch happens
+    ; Save process CR3 to per-CPU data at gs:[80] (SAVED_PROCESS_CR3_OFFSET)
+    mov rax, cr3                    ; Read current (process) CR3
+    mov qword [gs:80], rax          ; Save to per-CPU saved_process_cr3
+
     ; CRITICAL: Switch CR3 back to kernel page table
     ; When interrupt fires from userspace, CR3 is still the process PT
     ; We MUST switch to kernel PT before running any kernel code
-    ; TODO: Make this dynamic by storing kernel CR3 in per-CPU data
-    mov rax, 0x101000               ; Kernel CR3 (hardcoded for now)
+    ; Read kernel_cr3 from per-CPU data at gs:[72] (KERNEL_CR3_OFFSET)
+    mov rax, qword [gs:72]          ; Read kernel CR3 from per-CPU data
+    test rax, rax                   ; Check if kernel_cr3 is set
+    jz .skip_cr3_switch_entry       ; If not set, skip (early boot fallback)
     mov cr3, rax                    ; Switch to kernel page table
-
-    ; We came from userspace, swap to kernel GS
-    swapgs
+.skip_cr3_switch_entry:
 
     ; Log full frame details for first few userspace interrupts
     ; Pass frame pointer to logging function
@@ -304,7 +314,32 @@ timer_interrupt_entry:
     jmp .after_cr3_check
 
 .no_cr3_switch_back_to_user:
-    ; No CR3 switch needed, swap back to user GS
+    ; No context switch, but we still need to restore the ORIGINAL process CR3!
+    ; We saved it on entry at gs:[80] (SAVED_PROCESS_CR3_OFFSET)
+    mov rax, qword [gs:80]          ; Read saved process CR3
+    test rax, rax                   ; Check if it was saved (non-zero)
+    jz .no_saved_cr3                ; If 0, skip (shouldn't happen from userspace)
+
+    ; Debug: Output marker for saved CR3 restore
+    push rdx
+    mov dx, 0x3F8
+    push rax
+    mov al, '!'                     ; '!' for saved CR3 restore
+    out dx, al
+    mov al, 'C'
+    out dx, al
+    mov al, 'R'
+    out dx, al
+    mov al, '3'
+    out dx, al
+    pop rax
+    pop rdx
+
+    ; Switch back to original process CR3
+    mov cr3, rax
+
+.no_saved_cr3:
+    ; Swap back to user GS for IRETQ
     swapgs
 
 .after_cr3_check:
diff --git a/kernel/src/memory/mod.rs b/kernel/src/memory/mod.rs
index 2f26136..37e69b2 100644
--- a/kernel/src/memory/mod.rs
+++ b/kernel/src/memory/mod.rs
@@ -46,6 +46,17 @@ pub fn init(physical_memory_offset: VirtAddr, memory_regions: &'static MemoryReg
     // PHASE 2: Build master kernel PML4 with upper-half mappings
     kernel_page_table::build_master_kernel_pml4();
 
+    // CRITICAL: Update kernel_cr3 in per-CPU data to the new master PML4
+    // per_cpu::init() already ran and set kernel_cr3 to the bootloader's CR3
+    // Now that we've switched to the master PML4, we must update it
+    {
+        use x86_64::registers::control::Cr3;
+        let (current_frame, _) = Cr3::read();
+        let master_cr3 = current_frame.start_address().as_u64();
+        log::info!("CRITICAL: Updating kernel_cr3 to master PML4: {:#x}", master_cr3);
+        crate::per_cpu::set_kernel_cr3(master_cr3);
+    }
+
     // Migrate any existing processes (though there shouldn't be any yet)
     kernel_page_table::migrate_existing_processes();
 
diff --git a/kernel/src/per_cpu.rs b/kernel/src/per_cpu.rs
index c4bdd54..92675ed 100644
--- a/kernel/src/per_cpu.rs
+++ b/kernel/src/per_cpu.rs
@@ -56,6 +56,14 @@ pub struct PerCpuData {
     /// Target CR3 for next IRETQ (offset 64) - set before context switch
     /// 0 means no CR3 switch needed
    pub next_cr3: u64,
+
+    /// Kernel CR3 (offset 72) - the master kernel page table
+    /// Used by interrupt/syscall entry to switch to kernel page tables
+    pub kernel_cr3: u64,
+
+    /// Saved process CR3 (offset 80) - saved on interrupt entry from userspace
+    /// Used to restore process page tables on interrupt exit if no context switch
+    pub saved_process_cr3: u64,
 }
 
 // Linux-style preempt_count bit layout constants
@@ -123,6 +131,10 @@
 const TSS_OFFSET: usize = 48;              // offset 48: *mut TSS (8 bytes)
 const SOFTIRQ_PENDING_OFFSET: usize = 56;  // offset 56: u32 (4 bytes)
 #[allow(dead_code)]
 const NEXT_CR3_OFFSET: usize = 64;         // offset 64: u64 (8 bytes) - ALIGNED
+#[allow(dead_code)]
+const KERNEL_CR3_OFFSET: usize = 72;       // offset 72: u64 (8 bytes) - ALIGNED
+#[allow(dead_code)]
+const SAVED_PROCESS_CR3_OFFSET: usize = 80; // offset 80: u64 (8 bytes) - ALIGNED
 
 // Compile-time assertions to ensure offsets are correct
 // These will fail to compile if the offsets don't match expected values
@@ -132,7 +144,7 @@
 const _: () = assert!(USER_RSP_SCRATCH_OFFSET % 8 == 0, "user_rsp_scratch must be 8-byte aligned");
 const _: () = assert!(core::mem::size_of::<*const ()>() == 8, "This code assumes 64-bit pointers");
 
 // Verify struct size is 128 bytes due to align(64) attribute
-// The actual data is 72 bytes, but align(64) rounds up to 128
+// The actual data is 88 bytes (saved_process_cr3 at offset 80), but align(64) rounds up to 128
 const _: () = assert!(core::mem::size_of::<PerCpuData>() == 128, "PerCpuData must be 128 bytes (aligned to 64)");
 
 // Verify bit layout matches Linux kernel
@@ -158,6 +170,8 @@ impl PerCpuData {
             softirq_pending: 0,
             _pad2: 0,
             next_cr3: 0,
+            kernel_cr3: 0,
+            saved_process_cr3: 0,
         }
     }
 }
@@ -195,6 +209,25 @@ pub fn init() {
     // Mark per-CPU data as initialized and safe to use
     PER_CPU_INITIALIZED.store(true, Ordering::Release);
     log::info!("Per-CPU data marked as initialized - preempt_count functions now use per-CPU storage");
+
+    // Store the current CR3 as the initial kernel CR3
+    // NOTE: At this point, we're still using the bootloader's page tables.
+    // After memory::init() calls build_master_kernel_pml4(), the kernel switches
+    // to the master PML4 and calls set_kernel_cr3() to update this value.
+    // This initial value provides a fallback during early boot.
+    let (current_frame, _) = x86_64::registers::control::Cr3::read();
+    let kernel_cr3_val = current_frame.start_address().as_u64();
+    log::info!("Storing initial kernel_cr3 = {:#x} in per-CPU data (bootloader PT)", kernel_cr3_val);
+
+    unsafe {
+        core::arch::asm!(
+            "mov gs:[{offset}], {}",
+            in(reg) kernel_cr3_val,
+            offset = const KERNEL_CR3_OFFSET,
+            options(nostack, preserves_flags)
+        );
+    }
+    log::info!("kernel_cr3 stored successfully - interrupt handlers can now switch to kernel page tables");
 }
 
 /// Get the current thread from per-CPU data
@@ -941,6 +974,46 @@ pub fn set_next_cr3(cr3: u64) {
     }
 }
 
+/// Get the kernel CR3 (master kernel page table)
+/// Returns 0 if not initialized
+#[allow(dead_code)]
+pub fn get_kernel_cr3() -> u64 {
+    if !PER_CPU_INITIALIZED.load(Ordering::Acquire) {
+        return 0;
+    }
+
+    unsafe {
+        let cr3: u64;
+        core::arch::asm!(
+            "mov {}, gs:[{offset}]",
+            out(reg) cr3,
+            offset = const KERNEL_CR3_OFFSET,
+            options(nostack, readonly, preserves_flags)
+        );
+        cr3
+    }
+}
+
+/// Set the kernel CR3 (master kernel page table)
+/// This should be called once after build_master_kernel_pml4()
+pub fn set_kernel_cr3(cr3: u64) {
+    if !PER_CPU_INITIALIZED.load(Ordering::Acquire) {
+        log::warn!("set_kernel_cr3 called before per-CPU init, storing for later");
+        // We can't store it yet, but we'll set it during init
+        return;
+    }
+
+    log::info!("Setting kernel_cr3 in per-CPU data to {:#x}", cr3);
+    unsafe {
+        core::arch::asm!(
+            "mov gs:[{offset}], {}",
+            in(reg) cr3,
+            offset = const KERNEL_CR3_OFFSET,
+            options(nostack, preserves_flags)
+        );
+    }
+}
+
 /// Check if we can schedule (preempt_count == 0 and returning to userspace)
 pub fn can_schedule(saved_cs: u64) -> bool {
     let current_preempt = preempt_count();
diff --git a/kernel/src/syscall/entry.asm b/kernel/src/syscall/entry.asm
index 3c95a3b..eea28b9 100644
--- a/kernel/src/syscall/entry.asm
+++ b/kernel/src/syscall/entry.asm
@@ -42,16 +42,26 @@ syscall_entry:
     ; Clear direction flag for string operations
     cld
 
+    ; Always switch to kernel GS FIRST for INT 0x80 entry
+    ; We need kernel GS to read kernel_cr3 from per-CPU data
+    ; INT 0x80 is only used from userspace, so we always need swapgs
+    swapgs
+
+    ; CRITICAL: Save the process CR3 BEFORE switching to kernel CR3
+    ; This allows us to restore it on exit if no context switch happens
+    ; Save process CR3 to per-CPU data at gs:[80] (SAVED_PROCESS_CR3_OFFSET)
+    mov rax, cr3                    ; Read current (process) CR3
+    mov qword [gs:80], rax          ; Save to per-CPU saved_process_cr3
+
     ; CRITICAL: Switch CR3 back to kernel page table
     ; Syscalls only come from userspace, so CR3 is always a process PT
     ; We MUST switch to kernel PT before running kernel code
-    ; TODO: Make this dynamic by storing kernel CR3 in per-CPU data
-    mov rax, 0x101000               ; Kernel CR3 (hardcoded for now)
+    ; Read kernel_cr3 from per-CPU data at gs:[72] (KERNEL_CR3_OFFSET)
+    mov rax, qword [gs:72]          ; Read kernel CR3 from per-CPU data
+    test rax, rax                   ; Check if kernel_cr3 is set
+    jz .skip_cr3_switch             ; If not set, skip (early boot fallback)
     mov cr3, rax                    ; Switch to kernel page table
-
-    ; Always switch to kernel GS for INT 0x80 entry
-    ; INT 0x80 is only used from userspace, so we always need swapgs
-    swapgs
+.skip_cr3_switch:
 
     ; Call the Rust syscall handler
     ; Pass pointer to saved registers as argument
@@ -196,7 +206,32 @@ syscall_entry:
     jmp .after_cr3_check_syscall
 
 .no_cr3_switch_syscall_back_to_user:
-    ; No CR3 switch needed, swap back to user GS
+    ; No context switch, but we still need to restore the ORIGINAL process CR3!
+    ; We saved it on entry at gs:[80] (SAVED_PROCESS_CR3_OFFSET)
+    mov rax, qword [gs:80]          ; Read saved process CR3
+    test rax, rax                   ; Check if it was saved (non-zero)
+    jz .no_saved_cr3_syscall        ; If 0, skip (shouldn't happen from userspace)
+
+    ; Debug: Output marker for saved CR3 restore
+    push rdx
+    mov dx, 0x3F8
+    push rax
+    mov al, '!'                     ; '!' for saved CR3 restore
+    out dx, al
+    mov al, 'S'
+    out dx, al
+    mov al, 'Y'
+    out dx, al
+    mov al, 'S'
+    out dx, al
+    pop rax
+    pop rdx
+
+    ; Switch back to original process CR3
+    mov cr3, rax
+
+.no_saved_cr3_syscall:
+    ; Swap back to user GS for IRETQ
     swapgs
 
 .after_cr3_check_syscall:
@@ -317,9 +352,19 @@ syscall_return_to_userspace:
     test rax, rax
     jz .no_cr3_switch_first_entry_back_to_user
 
-    ; Interrupts already disabled (CLI at function start line 211)
+    ; Interrupts already disabled (CLI at function start line 260)
     ; Safe to switch CR3 now
 
+    ; CRITICAL FIX: Clear next_cr3 BEFORE switching CR3!
+    ; We must do this while kernel page tables are still active,
+    ; because after CR3 switch the process page tables may not
+    ; have the kernel per-CPU region mapped. Accessing [gs:64]
+    ; after CR3 switch would cause a page fault -> triple fault.
+    push rdx
+    xor rdx, rdx
+    mov qword [gs:64], rdx
+    pop rdx
+
     ; Debug: Output marker for CR3 switch
     mov dx, 0x3F8
     push rax
@@ -337,20 +382,46 @@
     out dx, al
     pop rax
 
-    ; Switch CR3 to process page table
+    ; NOW safe to switch CR3 to process page table
+    ; Kernel per-CPU data already cleared while kernel PT was active
     mov cr3, rax
 
-    ; Clear next_cr3 flag (set to 0)
-    xor rdx, rdx
-    mov qword [gs:64], rdx
-
     ; Swap back to user GS for IRETQ
     swapgs
     jmp .after_cr3_check_first_entry
 
 .no_cr3_switch_first_entry_back_to_user:
-    ; No CR3 switch needed, swap back to user GS
+    ; No context switch, but we still need to restore the ORIGINAL process CR3!
+    ; We saved it on entry at gs:[80] (SAVED_PROCESS_CR3_OFFSET)
+    mov rax, qword [gs:80]          ; Read saved process CR3
+    test rax, rax                   ; Check if it was saved (non-zero)
+    jz .no_saved_cr3_first_entry    ; If 0, skip (shouldn't happen from userspace)
+
+    ; Debug: Output marker for saved CR3 restore
+    push rdx
+    mov dx, 0x3F8
+    push rax
+    mov al, '!'                     ; '!' for saved CR3 restore
+    out dx, al
+    mov al, 'F'
+    out dx, al
+    mov al, 'I'
+    out dx, al
+    mov al, 'R'
+    out dx, al
+    mov al, 'S'
+    out dx, al
+    mov al, 'T'
+    out dx, al
+    pop rax
+    pop rdx
+
+    ; Switch back to original process CR3
+    mov cr3, rax
+
+.no_saved_cr3_first_entry:
+    ; Swap back to user GS for IRETQ
     swapgs
 
 .after_cr3_check_first_entry:
 
diff --git a/kernel/src/syscall/handler.rs b/kernel/src/syscall/handler.rs
index 5b04bcb..0a7fbf0 100644
--- a/kernel/src/syscall/handler.rs
+++ b/kernel/src/syscall/handler.rs
@@ -5,22 +5,23 @@ use super::{SyscallNumber, SyscallResult};
 pub struct SyscallFrame {
     // General purpose registers (in memory order after all pushes)
     // Stack grows down, so last pushed is at lowest address (where RSP points)
-    // Assembly pushes in reverse order: r15 first, rax last
-    pub rax: u64,  // Syscall number - pushed last, so at RSP+0
-    pub rcx: u64,  // at RSP+8
-    pub rdx: u64,  // at RSP+16
-    pub rbx: u64,  // at RSP+24
-    pub rbp: u64,  // at RSP+32
-    pub rsi: u64,  // at RSP+40
-    pub rdi: u64,  // at RSP+48
+    // Assembly pushes: rax first, then rcx, rdx, rbx, rbp, rsi, rdi, r8-r15
+    // So r15 (pushed last) is at RSP+0, and rax (pushed first) is at RSP+112
+    pub r15: u64,  // pushed last, at RSP+0
+    pub r14: u64,  // at RSP+8
+    pub r13: u64,  // at RSP+16
+    pub r12: u64,  // at RSP+24
+    pub r11: u64,  // at RSP+32
+    pub r10: u64,  // at RSP+40
+    pub r9: u64,   // at RSP+48
     pub r8: u64,   // at RSP+56
-    pub r9: u64,   // at RSP+64
-    pub r10: u64,  // at RSP+72
-    pub r11: u64,  // at RSP+80
-    pub r12: u64,  // at RSP+88
-    pub r13: u64,  // at RSP+96
-    pub r14: u64,  // at RSP+104
-    pub r15: u64,  // pushed first, so at RSP+112
+    pub rdi: u64,  // at RSP+64
+    pub rsi: u64,  // at RSP+72
+    pub rbp: u64,  // at RSP+80
+    pub rbx: u64,  // at RSP+88
+    pub rdx: u64,  // at RSP+96
+    pub rcx: u64,  // at RSP+104
+    pub rax: u64,  // Syscall number - pushed first, at RSP+112
 
     // Interrupt frame (pushed by CPU before our code)
     pub rip: u64,