Skip to content
69 changes: 47 additions & 22 deletions apps/scraper/src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,9 @@ import { discoverCourses, scrapeCourse } from "./modules/courses";
import { discoverPrograms, scrapeProgram } from "./modules/programs";

const app = new Hono<{ Bindings: CloudflareBindings }>();
// Pause, in milliseconds, awaited by the "course" job handler before it calls
// scrapeCourse. The handler only sleeps when this value is greater than 0, so
// setting it to 0 disables the pause.
// NOTE(review): presumably this throttles requests to the scraped site — confirm.
const COURSE_SCRAPE_DELAY_MS = 250;

/**
 * Returns a promise that resolves after roughly `ms` milliseconds.
 *
 * The promise never rejects and resolves with no value, so it is typed as
 * `Promise<void>` rather than the `Promise<unknown>` that would otherwise be
 * inferred from the untyped executor.
 *
 * @param ms - Delay passed to `setTimeout`, in milliseconds.
 * @returns A promise that settles once the timer fires.
 */
const sleep = (ms: number): Promise<void> =>
  new Promise<void>((resolve) => {
    // Wrap the call so the timer callback's arguments never leak into resolve.
    setTimeout(() => resolve(), ms);
  });

const validateApiKey = async (
c: Context<{ Bindings: CloudflareBindings }>,
Expand Down Expand Up @@ -210,19 +213,31 @@ export default {
}
case "discover-courses": {
const courseUrls = await discoverCourses(job.url);
const newJobs = await db
.insert(jobs)
.values(
courseUrls.map((url) => ({
url,
jobType: "course" as const,
})),
)
.returning();
// NOTE: Cloudflare Queues has a limit of 100 messages per sendBatch()
console.log(`Discovered ${courseUrls.length} course URLs`);

const BATCH_SIZE = 10;
for (let i = 0; i < courseUrls.length; i += BATCH_SIZE) {
const batch = courseUrls.slice(i, i + BATCH_SIZE);

const newJobs = await db
.insert(jobs)
.values(
batch.map((url) => ({
url,
jobType: "course" as const,
})),
)
.returning();

await env.SCRAPING_QUEUE.sendBatch(
newJobs.map((j) => ({ body: { jobId: j.id } })),
);

await env.SCRAPING_QUEUE.sendBatch(
newJobs.map((j) => ({ body: { jobId: j.id } })),
);
console.log(
`Queued batch ${Math.floor(i / BATCH_SIZE) + 1}/${Math.ceil(courseUrls.length / BATCH_SIZE)} (${newJobs.length} jobs)`,
);
}
break;
}
case "program": {
Expand All @@ -242,18 +257,28 @@ export default {
break;
}
case "course": {
const res = await scrapeCourse(job.url, db, env);
// A single URL may contain multiple courses
if (COURSE_SCRAPE_DELAY_MS > 0) {
await sleep(COURSE_SCRAPE_DELAY_MS);
}
const courses = await scrapeCourse(job.url, db, env);

const courseId = await convex.upsertCourseWithPrerequisites({
...res.course,
prerequisites: res.prerequisites,
});
console.log(
`Scraped ${courses.length} courses from ${job.url}`,
);

if (!courseId) {
throw new JobError(
"Failed to upsert course: no ID returned",
"validation",
);
for (const courseData of courses) {
const courseId = await convex.upsertCourseWithPrerequisites({
...courseData.course,
prerequisites: courseData.prerequisites,
});

if (!courseId) {
throw new JobError(
`Failed to upsert course ${courseData.course.code}: no ID returned`,
"validation",
);
}
}
break;
}
Expand Down
15 changes: 10 additions & 5 deletions apps/scraper/src/modules/courses/index.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -33,16 +33,21 @@ describe("Courses Scraper", () => {
const mockDb = createMockDb();
const mockEnv = createMockEnv();

const result = await scrapeCourse(
const courses = await scrapeCourse(
"https://bulletins.nyu.edu/courses/acct_gb/",
mockDb,
mockEnv,
);

expect(result).toHaveProperty("course");
expect(result).toHaveProperty("prerequisites");
expect(typeof result.course).toBe("object");
expect(Array.isArray(result.prerequisites)).toBe(true);
expect(Array.isArray(courses)).toBe(true);
expect(courses.length).toBeGreaterThan(0);

for (const result of courses) {
expect(result).toHaveProperty("course");
expect(result).toHaveProperty("prerequisites");
expect(typeof result.course).toBe("object");
expect(Array.isArray(result.prerequisites)).toBe(true);
}
});

test("should handle invalid course URLs", async () => {
Expand Down
Loading