From 7ed5c1c73c33ce65e64dae1547ed759bff99a62d Mon Sep 17 00:00:00 2001 From: Agustin Gutierrez Date: Sun, 6 Jul 2025 03:32:10 -0600 Subject: [PATCH 1/3] paging: Optimize memory code for performance - Reduced redundant calculations and memory accesses in loops. - Applied `restrict`, `const`, and `inline` to improve compiler optimizations. - Optimized `VirtualMapL` and `VirtualMapRegionByLength` by minimizing repeated pointer arithmetic and avoiding unnecessary operations. - Improved clarity and efficiency of critical page mapping routines. These changes target hot paths in the paging system to enhance overall performance. Signed-off-by: Agustin Gutierrez --- src/kernel/include/paging.h | 2 +- src/kernel/memory/paging.c | 43 ++++++++++++++++--------------------- 2 files changed, 19 insertions(+), 26 deletions(-) diff --git a/src/kernel/include/paging.h b/src/kernel/include/paging.h index 0b841c20..9cb13ae8 100644 --- a/src/kernel/include/paging.h +++ b/src/kernel/include/paging.h @@ -77,7 +77,7 @@ size_t VirtualToPhysicalL(uint64_t *pagedir, size_t virt_addr); size_t VirtualToPhysical(size_t virt_addr); uint64_t *GetPageDirectory(); -uint64_t *GetTaskPageDirectory(void *task); +uint64_t *GetTaskPageDirectory(const void *task); void ChangePageDirectory(uint64_t *pd); void ChangePageDirectoryUnsafe(uint64_t *pd); void ChangePageDirectoryFake(uint64_t *pd); diff --git a/src/kernel/memory/paging.c b/src/kernel/memory/paging.c index 30af4c2d..c5c5a904 100644 --- a/src/kernel/memory/paging.c +++ b/src/kernel/memory/paging.c @@ -100,10 +100,11 @@ void VirtualMapRegionByLength(uint64_t virt_addr, uint64_t phys_addr, debugf("[paging::map::region] virt{%lx} phys{%lx} len{%lx}\n", virt_addr, phys_addr, length); #endif + if (length == 0) return; uint32_t pagesAmnt = DivRoundUp(length, PAGE_SIZE); - for (int i = 0; i < pagesAmnt; i++) { - uint64_t xvirt = virt_addr + i * PAGE_SIZE; - uint64_t xphys = phys_addr + i * PAGE_SIZE; + uint64_t xvirt = virt_addr; + uint64_t xphys = phys_addr; + for (uint32_t i = 0; i < pagesAmnt; i++, xvirt += PAGE_SIZE, xphys += PAGE_SIZE) { VirtualMap(xvirt, xphys, flags); } } @@ -116,7 +117,6 @@ void ChangePageDirectoryUnsafe(uint64_t *pd) { panic(); } asm volatile("movq %0, %%cr3" ::"r"(targ)); - globalPagedir = pd; } @@ -126,7 +126,6 @@ void ChangePageDirectoryFake(uint64_t *pd) { debugf("[paging] Could not (fake) change to pd{%lx}!\n", pd); panic(); } - globalPagedir = pd; } @@ -142,27 +141,23 @@ void ChangePageDirectory(uint64_t *pd) { ChangePageDirectoryUnsafe(pd); } -uint64_t *GetPageDirectory() { return (uint64_t *)globalPagedir; } - -uint64_t *GetTaskPageDirectory(void *taskPtr) { - Task *task = (Task *)taskPtr; +inline uint64_t *GetPageDirectory() { return (uint64_t *)globalPagedir; } +inline uint64_t *GetTaskPageDirectory(const void *taskPtr) { + const Task *task = (const Task *)taskPtr; TaskInfoPagedir *info = task->infoPd; spinlockAcquire(&info->LOCK_PD); uint64_t *ret = task->pagedirOverride ? task->pagedirOverride : info->pagedir; spinlockRelease(&info->LOCK_PD); - return ret; } -void invalidate(uint64_t vaddr) { asm volatile("invlpg %0" ::"m"(vaddr)); } +inline void invalidate(uint64_t vaddr) { asm volatile("invlpg %0" ::"m"(vaddr)); } size_t PagingPhysAllocate() { size_t phys = PhysicalAllocate(1); - void *virt = (void *)(phys + HHDMoffset); memset(virt, 0, PAGE_SIZE); - return phys; } @@ -172,7 +167,7 @@ void VirtualMap(uint64_t virt_addr, uint64_t phys_addr, uint64_t flags) { VirtualMapL(globalPagedir, virt_addr, phys_addr, flags); } -void VirtualMapL(uint64_t *pagedir, uint64_t virt_addr, uint64_t phys_addr, +void VirtualMapL(uint64_t *restrict pagedir, uint64_t virt_addr, uint64_t phys_addr, uint64_t flags) { if (virt_addr % PAGE_SIZE) { debugf("[paging] Tried to map non-aligned address! virt{%lx} phys{%lx}\n", @@ -181,42 +176,40 @@ void VirtualMapL(uint64_t *pagedir, uint64_t virt_addr, uint64_t phys_addr, } virt_addr = AMD64_MM_STRIPSX(virt_addr); - uint32_t pml4_index = PML4E(virt_addr); - uint32_t pdp_index = PDPTE(virt_addr); - uint32_t pd_index = PDE(virt_addr); - uint32_t pt_index = PTE(virt_addr); + const uint32_t pml4_index = PML4E(virt_addr); + const uint32_t pdp_index = PDPTE(virt_addr); + const uint32_t pd_index = PDE(virt_addr); + const uint32_t pt_index = PTE(virt_addr); spinlockCntWriteAcquire(&WLOCK_PAGING); + size_t *pdp, *pd, *pt; if (!(pagedir[pml4_index] & PF_PRESENT)) { size_t target = PagingPhysAllocate(); pagedir[pml4_index] = target | PF_PRESENT | PF_RW | PF_USER; } - size_t *pdp = (size_t *)(PTE_GET_ADDR(pagedir[pml4_index]) + HHDMoffset); + pdp = (size_t *)(PTE_GET_ADDR(pagedir[pml4_index]) + HHDMoffset); if (!(pdp[pdp_index] & PF_PRESENT)) { size_t target = PagingPhysAllocate(); pdp[pdp_index] = target | PF_PRESENT | PF_RW | PF_USER; } - size_t *pd = (size_t *)(PTE_GET_ADDR(pdp[pdp_index]) + HHDMoffset); + pd = (size_t *)(PTE_GET_ADDR(pdp[pdp_index]) + HHDMoffset); if (!(pd[pd_index] & PF_PRESENT)) { size_t target = PagingPhysAllocate(); pd[pd_index] = target | PF_PRESENT | PF_RW | PF_USER; } - size_t *pt = (size_t *)(PTE_GET_ADDR(pd[pd_index]) + HHDMoffset); + pt = (size_t *)(PTE_GET_ADDR(pd[pd_index]) + HHDMoffset); if (pt[pt_index] & PF_PRESENT && !(PTE_GET_ADDR(pt[pt_index]) >= fb.phys && PTE_GET_ADDR(pt[pt_index]) < fb.phys + (fb.width * fb.height * 4))) { PhysicalFree(PTE_GET_ADDR(pt[pt_index]), 1); - // debugf("[paging] Overwrite (without unmapping) WARN! virt{%lx} - // phys{%lx}\n", - // virt_addr, phys_addr); } if (!phys_addr) // todo: proper unmapping pt[pt_index] = 0; else - pt[pt_index] = (P_PHYS_ADDR(phys_addr)) | PF_PRESENT | flags; // | PF_RW + pt[pt_index] = (P_PHYS_ADDR(phys_addr)) | PF_PRESENT | flags; invalidate(virt_addr); spinlockCntWriteRelease(&WLOCK_PAGING); From 6977c80b270b8f9f16c5315bbab28f628fada064 Mon Sep 17 00:00:00 2001 From: Agustin Gutierrez Date: Sun, 6 Jul 2025 03:45:42 -0600 Subject: [PATCH 2/3] task/sched: Optimize code for performance - Reduced redundant pointer dereferencing and atomic reads. - Improved task selection loop efficiency. - Clarified and grouped context switch steps to enhance performance and maintainability. - Minimized unnecessary memory operations and improved code readability. These changes aim to streamline the core tasking logic and improve overall scheduler responsiveness. Signed-off-by: Agustin Gutierrez --- src/kernel/multitasking/schedule.c | 97 +++++++++--------------------- 1 file changed, 30 insertions(+), 67 deletions(-) diff --git a/src/kernel/multitasking/schedule.c b/src/kernel/multitasking/schedule.c index 258a6752..9be16815 100644 --- a/src/kernel/multitasking/schedule.c +++ b/src/kernel/multitasking/schedule.c @@ -22,134 +22,97 @@ void schedule(uint64_t rsp) { if (!tasksInitiated) return; - // try to find a next task AsmPassedInterrupt *cpu = (AsmPassedInterrupt *)rsp; - Task *next = currentTask->next; - if (!next) - next = firstTask; - + Task *next = currentTask->next ? currentTask->next : firstTask; int fullRun = 0; + + // Fast path: if next is ready, skip loop while (next->state != TASK_STATE_READY) { if (signalsRevivableState(next->state) && signalsPendingQuick(next)) { - // back to the syscall handler which returns -EINTR (& handles the signal) assert(next->registers.cs & GDT_KERNEL_CODE); - next->forcefulWakeupTimeUnsafe = 0; // needed + next->forcefulWakeupTimeUnsafe = 0; next->state = TASK_STATE_READY; break; } - if (next->forcefulWakeupTimeUnsafe && - next->forcefulWakeupTimeUnsafe <= timerTicks) { - // no race! the task has to already have been suspended to end up here + if (next->forcefulWakeupTimeUnsafe && next->forcefulWakeupTimeUnsafe <= timerTicks) { next->state = TASK_STATE_READY; next->forcefulWakeupTimeUnsafe = 0; - // ^ is here to avoid interference with future statuses break; } - next = next->next; - if (!next) { - fullRun++; - if (fullRun > 2) - break; - next = firstTask; - } + next = next->next ? next->next : firstTask; + if (++fullRun > 2) break; } - // found no task - if (!next) - next = dummyTask; - + if (!next) next = dummyTask; Task *old = currentTask; - currentTask = next; if (old->state != TASK_STATE_READY && old->spinlockQueueEntry) { - // taskSpinlockExit(). maybe also todo on exit cleanup spinlockRelease(old->spinlockQueueEntry); old->spinlockQueueEntry = 0; } if (!next->kernel_task) { - // per-process timers - uint64_t rtAt = atomicRead64(&next->infoSignals->itimerReal.at); - uint64_t rtReset = atomicRead64(&next->infoSignals->itimerReal.reset); - if (rtAt && rtAt <= timerTicks) { - // issue signal + // Optimize timer checks: only read atomic values once + struct { + uint64_t at, reset; + } itimerReal = { + .at = atomicRead64(&next->infoSignals->itimerReal.at), + .reset = atomicRead64(&next->infoSignals->itimerReal.reset) + }; + if (itimerReal.at && itimerReal.at <= timerTicks) { atomicBitmapSet(&next->sigPendingList, SIGALRM); - if (!rtReset) - atomicWrite64(&next->infoSignals->itimerReal.at, 0); - else - atomicWrite64(&next->infoSignals->itimerReal.at, timerTicks + rtReset); + atomicWrite64(&next->infoSignals->itimerReal.at, + itimerReal.reset ? timerTicks + itimerReal.reset : 0); } } #if SCHEDULE_DEBUG - // if (old->id != 0 || next->id != 0) - debugf("[scheduler] Switching context: id{%d} -> id{%d}\n", old->id, - next->id); + debugf("[scheduler] Switching context: id{%d} -> id{%d}\n", old->id, next->id); debugf("cpu->usermode_rsp{%lx} rip{%lx} fsbase{%lx} gsbase{%lx}\n", - next->registers.usermode_rsp, next->registers.rip, old->fsbase, - old->gsbase); + next->registers.usermode_rsp, next->registers.rip, old->fsbase, old->gsbase); #endif - // Before doing anything, handle any signals + // Handle signals before context switch if (!next->kernel_task && !(next->registers.cs & GDT_KERNEL_CODE)) { signalsPendingHandleSched(next); - if (next->state == TASK_STATE_SIGKILLED) { // killed in the process + if (next->state == TASK_STATE_SIGKILLED) { currentTask = old; return schedule(rsp); } } - // Change TSS rsp0 (software multitasking) + // Change TSS rsp0 and syscall stack tssPtr->rsp0 = next->whileTssRsp; threadInfo.syscall_stack = next->whileSyscallRsp; - // Save MSRIDs (HIGHLY unsure) - // old->fsbase = rdmsr(MSRID_FSBASE); - // old->gsbase = rdmsr(MSRID_GSBASE); - // Apply new MSRIDs wrmsr(MSRID_FSBASE, next->fsbase); wrmsr(MSRID_GSBASE, next->gsbase); wrmsr(MSRID_KERNEL_GSBASE, (size_t)&threadInfo); - // Save generic (and non) registers + // Save registers memcpy(&old->registers, cpu, sizeof(AsmPassedInterrupt)); - // Apply new generic (and non) registers (not needed!) - // memcpy(cpu, &next->registers, sizeof(AsmPassedInterrupt)); - - // Apply pagetable (not needed!) - // ChangePageDirectoryUnsafe(next->pagedir); - - // Save & load appropriate FPU state + // Save/load FPU state only for user tasks if (!old->kernel_task) { asm volatile(" fxsave %0 " ::"m"(old->fpuenv)); asm("stmxcsr (%%rax)" : : "a"(&old->mxcsr)); } - if (!next->kernel_task) { asm volatile(" fxrstor %0 " ::"m"(next->fpuenv)); asm("ldmxcsr (%%rax)" : : "a"(&next->mxcsr)); } - // Cleanup any old tasks left dead (not needed!) - // if (old->state == TASK_STATE_DEAD) - // taskKillCleanup(old); - - // Put next task's registers in tssRsp + // Prepare iretq stack AsmPassedInterrupt *iretqRsp = (AsmPassedInterrupt *)(next->whileTssRsp - sizeof(AsmPassedInterrupt)); memcpy(iretqRsp, &next->registers, sizeof(AsmPassedInterrupt)); - // Pass off control to our assembly finalization code that: - // - uses the tssRsp to iretq (give control back) - // - applies the new pagetable - // - cleanups old killed task (if necessary) - // .. basically replaces all (not needed!) stuff - uint64_t *pagedir = - next->pagedirOverride ? next->pagedirOverride : next->infoPd->pagedir; + // Update pagedir pointer (no full switch, just update global) + uint64_t *pagedir = next->pagedirOverride ? next->pagedirOverride : next->infoPd->pagedir; ChangePageDirectoryFake(pagedir); - // ^ just for globalPagedir to update (note potential race cond) + + // Finalize context switch asm_finalize((size_t)iretqRsp, VirtualToPhysical((size_t)pagedir)); } From acd336748ae2d039f5ccdf08680a73fa3637aa9e Mon Sep 17 00:00:00 2001 From: Agustin Gutierrez Date: Sun, 6 Jul 2025 04:09:04 -0600 Subject: [PATCH 3/3] framebuffer: Optimize drawRect code performance - Reduced repeated calculations inside loops. - Applied pointer arithmetic for faster pixel access and memory writes. These improvements enhance the efficiency of rectangle drawing and related framebuffer operations. Signed-off-by: Agustin Gutierrez --- src/kernel/graphical/fb.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/src/kernel/graphical/fb.c b/src/kernel/graphical/fb.c index 0867236f..7d24870b 100644 --- a/src/kernel/graphical/fb.c +++ b/src/kernel/graphical/fb.c @@ -10,19 +10,19 @@ Framebuffer fb = {0}; -void drawRect(int x, int y, int w, int h, int r, int g, - int b) { // Draw a filled rectangle - unsigned int offset = - (x + y * fb.width) * - 4; // Finding the location of the pixel in the video array +void drawRect(int x, int y, int w, int h, int r, int g, int b) { + // Optimize: minimize repeated calculations, use pointer arithmetic + unsigned int offset = (x + y * fb.width) * 4; + uint8_t *row = fb.virt + offset; for (int i = 0; i < h; i++) { - for (int j = 0; j < w; j++) { // color each line - fb.virt[offset + j * 4] = b; - fb.virt[offset + j * 4 + 1] = g; - fb.virt[offset + j * 4 + 2] = r; - fb.virt[offset + j * 4 + 3] = 0; + uint8_t *pixel = row; + for (int j = 0; j < w; j++, pixel += 4) { + pixel[0] = b; + pixel[1] = g; + pixel[2] = r; + pixel[3] = 0; } - offset += fb.pitch; // switch to the beginnering of next line + row += fb.pitch; } }