diff --git a/apps/scraper/src/index.ts b/apps/scraper/src/index.ts index 185dd6e7..5a11e0b6 100644 --- a/apps/scraper/src/index.ts +++ b/apps/scraper/src/index.ts @@ -13,6 +13,9 @@ import { discoverCourses, scrapeCourse } from "./modules/courses"; import { discoverPrograms, scrapeProgram } from "./modules/programs"; const app = new Hono<{ Bindings: CloudflareBindings }>(); +const COURSE_SCRAPE_DELAY_MS = 250; + +const sleep = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms)); const validateApiKey = async ( c: Context<{ Bindings: CloudflareBindings }>, @@ -210,19 +213,31 @@ export default { } case "discover-courses": { const courseUrls = await discoverCourses(job.url); - const newJobs = await db - .insert(jobs) - .values( - courseUrls.map((url) => ({ - url, - jobType: "course" as const, - })), - ) - .returning(); + // NOTE: Cloudflare Queues has a limit of 100 messages per sendBatch() + console.log(`Discovered ${courseUrls.length} course URLs`); + + const BATCH_SIZE = 10; + for (let i = 0; i < courseUrls.length; i += BATCH_SIZE) { + const batch = courseUrls.slice(i, i + BATCH_SIZE); + + const newJobs = await db + .insert(jobs) + .values( + batch.map((url) => ({ + url, + jobType: "course" as const, + })), + ) + .returning(); + + await env.SCRAPING_QUEUE.sendBatch( + newJobs.map((j) => ({ body: { jobId: j.id } })), + ); - await env.SCRAPING_QUEUE.sendBatch( - newJobs.map((j) => ({ body: { jobId: j.id } })), - ); + console.log( + `Queued batch ${Math.floor(i / BATCH_SIZE) + 1}/${Math.ceil(courseUrls.length / BATCH_SIZE)} (${newJobs.length} jobs)`, + ); + } break; } case "program": { @@ -242,18 +257,28 @@ export default { break; } case "course": { - const res = await scrapeCourse(job.url, db, env); + // A single URL may contain multiple courses + if (COURSE_SCRAPE_DELAY_MS > 0) { + await sleep(COURSE_SCRAPE_DELAY_MS); + } + const courses = await scrapeCourse(job.url, db, env); - const courseId = await convex.upsertCourseWithPrerequisites({ - ...res.course, - prerequisites: res.prerequisites, - }); + console.log( + `Scraped ${courses.length} courses from ${job.url}`, + ); - if (!courseId) { - throw new JobError( - "Failed to upsert course: no ID returned", - "validation", - ); + for (const courseData of courses) { + const courseId = await convex.upsertCourseWithPrerequisites({ + ...courseData.course, + prerequisites: courseData.prerequisites, + }); + + if (!courseId) { + throw new JobError( + `Failed to upsert course ${courseData.course.code}: no ID returned`, + "validation", + ); + } } break; } diff --git a/apps/scraper/src/modules/courses/index.test.ts b/apps/scraper/src/modules/courses/index.test.ts index 6b1894cb..546c6934 100644 --- a/apps/scraper/src/modules/courses/index.test.ts +++ b/apps/scraper/src/modules/courses/index.test.ts @@ -33,16 +33,21 @@ describe("Courses Scraper", () => { const mockDb = createMockDb(); const mockEnv = createMockEnv(); - const result = await scrapeCourse( + const courses = await scrapeCourse( "https://bulletins.nyu.edu/courses/acct_gb/", mockDb, mockEnv, ); - expect(result).toHaveProperty("course"); - expect(result).toHaveProperty("prerequisites"); - expect(typeof result.course).toBe("object"); - expect(Array.isArray(result.prerequisites)).toBe(true); + expect(Array.isArray(courses)).toBe(true); + expect(courses.length).toBeGreaterThan(0); + + for (const result of courses) { + expect(result).toHaveProperty("course"); + expect(result).toHaveProperty("prerequisites"); + expect(typeof result.course).toBe("object"); + expect(Array.isArray(result.prerequisites)).toBe(true); + } }); test("should handle invalid course URLs", async () => { diff --git a/apps/scraper/src/modules/courses/index.ts b/apps/scraper/src/modules/courses/index.ts index e33f8032..5b74af19 100644 --- a/apps/scraper/src/modules/courses/index.ts +++ b/apps/scraper/src/modules/courses/index.ts @@ -3,6 +3,8 @@ import type { ZUpsertCourseWithPrerequisites, ZUpsertPrerequisites, } from "@albert-plus/server/convex/http"; +import type { schoolName } from "@albert-plus/server/convex/schemas/schools"; +import type { Infer } from "convex/values"; import type { DrizzleD1Database } from "drizzle-orm/d1"; import type * as z from "zod/mini"; @@ -13,19 +15,289 @@ export type CoursePrerequisite = | Omit, "courseId"> | Omit, "courseId">; +type CourseLevel = "undergraduate" | "graduate"; + +export type SchoolName = Infer; + +const SCHOOL_CODE_TO_NAME: Record = { + UA: "College of Arts and Science", + NA: "College of Arts and Science", + + CD: "College of Dentistry", + UD: "College of Dentistry", + ND: "College of Dentistry", + DN: "College of Dentistry", + + UG: "Gallatin School of Individualized Study", + GG: "Gallatin School of Individualized Study", + + GA: "Graduate School of Arts and Science", + + UF: "Liberal Studies", + + ML: "NYU Grossman Long Island School of Medicine", + + UZ: "Non-School Based Programs - UG", + + UH: "NYU Abu Dhabi", + GH: "NYU Abu Dhabi", + NH: "NYU Abu Dhabi", + + UI: "NYU Shanghai", + GI: "NYU Shanghai", + SHU: "NYU Shanghai", + + GP: "Robert F. Wagner Graduate School of Public Service", + UW: "Robert F. Wagner Graduate School of Public Service", + NP: "Robert F. Wagner Graduate School of Public Service", + + GN: "Rory Meyers College of Nursing", + UN: "Rory Meyers College of Nursing", + + GU: "School of Global Public Health", + UU: "School of Global Public Health", + NU: "School of Global Public Health", + + LW: "School of Law", + NL: "School of Law", + + GC: "School of Professional Studies", + UC: "School of Professional Studies", + CE: "School of Professional Studies", + + NS: "Silver School of Social Work", + GS: "Silver School of Social Work", + US: "Silver School of Social Work", + + NE: "Steinhardt School of Culture, Education, and Human Development", + UE: "Steinhardt School of Culture, Education, and Human Development", + GE: "Steinhardt School of Culture, Education, and Human Development", + + GB: "Leonard N. Stern School of Business", + UB: "Leonard N. Stern School of Business", + + GY: "Tandon School of Engineering", + UY: "Tandon School of Engineering", + GX: "Tandon School of Engineering", + + NT: "Tisch School of the Arts", + GT: "Tisch School of the Arts", + UT: "Tisch School of the Arts", + + MD: "NYU Grossman School of Medicine", +}; + +export function getSchoolFromProgram(program: string): SchoolName { + const code = program.split("-").pop()?.toUpperCase() ?? ""; + const school = SCHOOL_CODE_TO_NAME[code]; + return school ?? "College of Arts and Science"; +} + +const GRADUATE_PROGRAM_CODES = new Set([ + "GA", + "GG", + "GH", + "GI", + "GP", + "GN", + "GU", + "GC", + "GS", + "GE", + "GB", + "GY", + "GX", + "GT", + "MD", + "LW", + "NL", + "NP", +]); + +function getCourseLevel(program: string, courseNumber: string): CourseLevel { + const programCode = program.split("-").pop()?.toUpperCase() ?? ""; + + if (GRADUATE_PROGRAM_CODES.has(programCode)) { + return "graduate"; + } + + if (programCode.startsWith("U")) { + return "undergraduate"; + } + + const normalized = courseNumber.replace(/\D/g, "").padStart(4, "0"); + const firstDigit = Number.parseInt(normalized[0], 10); + + if (!Number.isNaN(firstDigit) && firstDigit >= 5) { + return "graduate"; + } + + return "undergraduate"; +} + export async function discoverCourses(url: string): Promise { - // TODO: implement this function - return []; + const courses: string[] = []; + + const response = await fetch(url); + + class CourseLinkHandler { + element(element: Element) { + const href = element.getAttribute("href"); + if (href?.startsWith("/courses/") && href !== "/courses/") { + const baseUrl = new URL(url); + const absoluteUrl = new URL(href, baseUrl).toString(); + courses.push(absoluteUrl); + } + } + } + + const rewriter = new HTMLRewriter().on( + 'a[href^="/courses/"]', + new CourseLinkHandler(), + ); + + await rewriter.transform(response).arrayBuffer(); + + return courses; +} + +interface ParsedCourse { + course: Omit, "prerequisites">; + prerequisites: CoursePrerequisite[]; } export async function scrapeCourse( url: string, db: DrizzleD1Database, env: CloudflareBindings, -): Promise<{ - course: Omit, "prerequisites">; - prerequisites: CoursePrerequisite[]; -}> { - // TODO: implement this function - throw new Error("Not implemented"); +): Promise { + const response = await fetch(url); + + if (!response.ok) { + throw new Error(`Failed to fetch course page: ${response.status}`); + } + + const courses: ParsedCourse[] = []; + let currentCode = ""; + let currentTitle = ""; + let currentCredits = 0; + let currentDescription = ""; + let currentPrereqs = ""; + + class CourseBlockHandler { + element(element: Element) { + currentCode = ""; + currentTitle = ""; + currentCredits = 0; + currentDescription = ""; + currentPrereqs = ""; + + element.onEndTag(() => { + if (currentCode && currentTitle) { + const codeMatch = currentCode.match(/([A-Z]+(?:-[A-Z]+)?)\s+(\d+)/); + if (codeMatch) { + const [, program, courseNumber] = codeMatch; + const level = getCourseLevel(program, courseNumber); + + courses.push({ + course: { + program, + code: currentCode, + level, + title: currentTitle, + credits: Math.floor(currentCredits), + description: currentDescription || "No description available.", + courseUrl: url, + school: getSchoolFromProgram(program), + }, + prerequisites: currentPrereqs + ? parsePrerequisites(currentPrereqs) + : [], + }); + } + } + }); + } + } + + class CodeHandler { + text(text: { text: string }) { + currentCode += text.text.trim(); + } + } + + class TitleHandler { + text(text: { text: string }) { + currentTitle += text.text.trim(); + } + } + + class CreditsHandler { + text(text: { text: string; lastInTextNode: boolean }) { + const trimmed = text.text.trim(); + if (trimmed) { + const match = trimmed.match(/\((\d+(?:\.\d+)?)\s*Credits?\)/); + if (match) { + currentCredits = Number.parseFloat(match[1]); + } + } + } + } + + class DescriptionHandler { + text(text: { text: string }) { + const trimmed = text.text.trim(); + if (trimmed) { + currentDescription += (currentDescription ? " " : "") + trimmed; + } + } + } + + class PrereqHandler { + text(text: { text: string }) { + currentPrereqs += `${text.text.trim()} `; + } + } + + const rewriter = new HTMLRewriter() + .on(".courseblock", new CourseBlockHandler()) + .on(".detail-code strong", new CodeHandler()) + .on(".detail-title strong", new TitleHandler()) + .on(".detail-hours_html strong", new CreditsHandler()) + .on(".courseblockextra", new DescriptionHandler()) + .on(".detail-prerequisites", new PrereqHandler()); + + await rewriter.transform(response).arrayBuffer(); + + return courses; +} + +function parsePrerequisites(text: string): CoursePrerequisite[] { + const prerequisites: CoursePrerequisite[] = []; + + const cleanText = text.replace(/^Prerequisites?:\s*/i, "").trim(); + + // Match course codes + const coursePattern = /([A-Z]+(?:-[A-Z]+)?)\s+(\d+)/g; + const matches = [...cleanText.matchAll(coursePattern)]; + + if (matches.length > 0) { + const courses = matches.map((match) => `${match[1]} ${match[2]}`); + + // TODO: Now it cannot handle "NOT open to students who take ..." + // Check if it's an "or" (alternative) or "and" (required) relationship + if (cleanText.toLowerCase().includes(" or ")) { + prerequisites.push({ + type: "alternative", + courses, + }); + } else { + prerequisites.push({ + type: "required", + courses, + }); + } + } + + return prerequisites; } diff --git a/apps/web/src/app/dashboard/admin/page.tsx b/apps/web/src/app/dashboard/admin/page.tsx index 1ed1524b..49ffa0be 100644 --- a/apps/web/src/app/dashboard/admin/page.tsx +++ b/apps/web/src/app/dashboard/admin/page.tsx @@ -33,7 +33,7 @@ export default function AdminPage() { ); const setConfig = useMutation(api.appConfigs.setAppConfig); const removeConfig = useMutation(api.appConfigs.removeAppConfig); - const triggerMajorsScraping = useAction(api.scraper.triggerMajorsScraping); + const triggerProgramsScraping = useAction(api.scraper.triggerMajorsScraping); const triggerCoursesScraping = useAction(api.scraper.triggerCoursesScraping); const [isDialogOpen, setIsDialogOpen] = useState(false); @@ -46,7 +46,7 @@ export default function AdminPage() { Doc<"appConfigs"> | undefined >(undefined); - const [isTriggeringMajors, setIsTriggeringMajors] = useState(false); + const [isTriggeringPrograms, setIsTriggeringPrograms] = useState(false); const [isTriggeringCourses, setIsTriggeringCourses] = useState(false); if (isAuthenticated && !isAdmin) { @@ -142,19 +142,19 @@ export default function AdminPage() { } }; - const handleTriggerMajors = async () => { - setIsTriggeringMajors(true); + const handleTriggerPrograms = async () => { + setIsTriggeringPrograms(true); try { - const result = await triggerMajorsScraping({}); - toast.success("Majors scraping triggered successfully", { + const result = await triggerProgramsScraping({}); + toast.success("Programs scraping triggered successfully", { description: `Job ID: ${result.jobId}`, }); } catch (error) { - toast.error("Failed to trigger majors scraping", { + toast.error("Failed to trigger programs scraping", { description: error instanceof Error ? error.message : "Unknown error", }); } finally { - setIsTriggeringMajors(false); + setIsTriggeringPrograms(false); } }; @@ -180,17 +180,17 @@ export default function AdminPage() {

Scraper Controls