patrickluzdev · JoaoPixelCode · Mar 6, 2026 · Mar 6, 2026 · Copilot · Mar 6, 2026
diff --git a/packages/scraper/src/plugins/course/index.ts b/packages/scraper/src/plugins/course/index.ts
@@ -0,0 +1,52 @@
+import crypto from "node:crypto";
+import * as cheerio from "cheerio";
+import type { Fetcher } from "../../core/fetcher.js";
+import type { ScrapeRequest, ScrapeResult, Scraper } from "../../core/types.js";
+import { CursoDetailPage } from "./pages/detail.js";
+import { CursoListPage } from "./pages/list.js";
+
+/**
+ * Scraper for the course pages: reads the listing at `request.startUrl`, then
+ * visits each listed course page and yields one result per page with content.
+ */
+export class CursosScraper implements Scraper {
+	readonly id = "cursos-ifrs-canoas";
+
+	constructor(private fetcher: Fetcher) {}
+
+	/**
+	 * Streams one `ScrapeResult` per course detail page.
+	 *
+	 * A failure while fetching the listing propagates to the caller; a failure
+	 * on a single detail page is logged and that page is skipped.
+	 *
+	 * @param request - Scrape request; only `startUrl` is read here.
+	 */
+	async *run(request: ScrapeRequest): AsyncGenerator<ScrapeResult> {
+		const startUrl = request.startUrl;
+		console.log(`[CursosScraper] Buscando lista em: ${startUrl}`);
+
+		const listHtml = await this.fetcher.get(startUrl);
+		const items = new CursoListPage(cheerio.load(listHtml)).extractItems();
+		console.log(`[CursosScraper] ${items.length} cursos encontrados`);
+
+		for (const item of items) {
+			// Entries without a URL cannot be visited.
+			if (item.url) {
+				try {
+					console.log(`[CursosScraper] Scraping: ${item.title}`);
+
+					const pageHtml = await this.fetcher.get(item.url);
+					const detailPage = new CursoDetailPage(cheerio.load(pageHtml));
+
+					const rawText = detailPage.extractContent();
+					// Pages with no extractable content produce no result.
+					if (rawText) {
+						// Fall back to the listing title when the page has none.
+						const title = detailPage.extractTitle() || item.title;
+						const contentHash = crypto
+							.createHash("md5")
+							.update(rawText)
+							.digest("hex");
+						const publishedAt = detailPage.extractDate();
+
+						const result: ScrapeResult = {
+							url: item.url,
+							title,
+							rawText,
+							contentHash,
+							category: "curso",
+							publishedAt,
+							sourceType: "webpage",
+						};
+						yield result;
+					}
+				} catch (err) {
+					console.error(`[CursosScraper] Erro em ${item.url}:`, err);
+				}
+			}
+		}
+	}
+}
diff --git a/packages/scraper/src/plugins/course/pages/detail.ts b/packages/scraper/src/plugins/course/pages/detail.ts
@@ -0,0 +1,32 @@
+import type { CheerioAPI } from "cheerio";
+
+const SELECTORS = {
+	title: "h2.page__title", // heading read by extractTitle()
+	content: "div.page__content", // container whose text extractContent() returns
+	dateMeta: "p.page__meta", // paragraph searched for a dd/mm/yyyy date by extractDate()
+} as const;
+
+const REMOVE_SELECTORS = "script, iframe, style, nav, footer"; // elements removed from the content clone before its text is read
+
+/**
+ * Read-only view over a course detail page, backed by a loaded cheerio document.
+ */
+export class CursoDetailPage {
+	constructor(private $: CheerioAPI) {}
+
+	/** Returns the trimmed text of the first element matching `SELECTORS.title`. */
+	extractTitle(): string {
+		return this.$(SELECTORS.title).first().text().trim();
+	}
+
+	/**
+	 * Parses the first `dd/mm/yyyy` date found in the page meta paragraph.
+	 *
+	 * @returns A `Date` built from the local-time year/month/day, or `undefined`
+	 * when no date is present or the digits are not a real calendar date.
+	 */
+	extractDate(): Date | undefined {
+		const text = this.$(SELECTORS.dateMeta).first().text();
+		// Expected format: "Última atualização em 20/02/2026"
+		const match = /(\d{2})\/(\d{2})\/(\d{4})/.exec(text);
+		if (!match) return undefined;
+
+		const day = Number(match[1]);
+		const month = Number(match[2]);
+		const year = Number(match[3]);
+		const date = new Date(year, month - 1, day);
+
+		// `Date` silently rolls out-of-range parts over (31/02 becomes 03/03), so
+		// reject anything that does not round-trip to the same day/month/year.
+		const isRealDate =
+			date.getFullYear() === year &&
+			date.getMonth() === month - 1 &&
+			date.getDate() === day;
+		return isRealDate ? date : undefined;
+	}
+
+	/**
+	 * Returns the text of the first content container with script, iframe,
+	 * style, nav and footer descendants removed and every whitespace run
+	 * collapsed to a single space.
+	 */
+	extractContent(): string {
+		// Work on a clone so the removals do not alter the loaded document.
+		const $content = this.$(SELECTORS.content).first().clone();
+		$content.find(REMOVE_SELECTORS).remove();
+		return $content.text().replace(/\s+/g, " ").trim();
+	}
+}
diff --git a/packages/scraper/src/plugins/course/pages/list.ts b/packages/scraper/src/plugins/course/pages/list.ts
@@ -0,0 +1,39 @@
+import type { CheerioAPI } from "cheerio";
+
+export interface CursoListItem { // one link collected from the course listing page
+	url: string; // link target taken from the anchor's href
+	title: string; // trimmed text of the anchor
+}
+
+const SELECTORS = {
+	link: "div.page__content ul li a", // anchors inside list items of the page content container
+} as const;
+
+/**
+ * Read-only view over the course listing page, backed by a loaded cheerio document.
+ */
+export class CursoListPage {
+	constructor(private $: CheerioAPI) {}
+
+	/**
+	 * Collects the course links matched by `SELECTORS.link`.
+	 *
+	 * Relative hrefs are resolved against `https://ifrs.edu.br`. A link is kept
+	 * only when it uses http(s) and its host is `ifrs.edu.br` or one of its
+	 * subdomains. Each normalized URL is returned at most once.
+	 *
+	 * @returns The deduplicated links, each with its trimmed anchor text.
+	 */
+	extractItems(): CursoListItem[] {
+		const seen = new Set<string>();
+		const items: CursoListItem[] = [];
+
+		this.$(SELECTORS.link).each((_, el) => {
+			const $el = this.$(el);
+			const href = $el.attr("href");
+
+			if (!href) return;
+
+			let parsedUrl: URL;
+			try {
+				// Allow relative URLs by using the main domain as the base
+				parsedUrl = new URL(href, "https://ifrs.edu.br");
+			} catch {
+				// Skip invalid href values
+				return;
+			}
+
+			// Only http(s) links are kept (drops mailto:, javascript:, ftp:, ...)
+			if (parsedUrl.protocol !== "https:" && parsedUrl.protocol !== "http:") return;
+
+			// Require the exact host or a dot-separated subdomain: a bare
+			// `endsWith("ifrs.edu.br")` would also accept look-alike hosts such
+			// as "evil-ifrs.edu.br".
+			const { hostname } = parsedUrl;
+			if (hostname !== "ifrs.edu.br" && !hostname.endsWith(".ifrs.edu.br")) return;
+
+			const normalizedUrl = parsedUrl.toString();
+
+			// Skip links that were already collected
+			if (seen.has(normalizedUrl)) return;
+
+			seen.add(normalizedUrl);
+			items.push({
+				url: normalizedUrl,
+				title: $el.text().trim(),
+			});
+		});
+
+		return items;
+	}
+
+	/** Always returns `null`: this page object does not look for a next page. */
+	nextPageUrl(): string | null {
+		return null;
+	}
+}