// File: packages/scraper/src/plugins/course/index.ts
import crypto from "node:crypto";
import * as cheerio from "cheerio";
import type { Fetcher } from "../../core/fetcher.js";
import type { ScrapeRequest, ScrapeResult, Scraper } from "../../core/types.js";
import { CursoDetailPage } from "./pages/detail.js";
import { CursoListPage } from "./pages/list.js";

/**
 * Scraper plugin for course pages.
 *
 * Fetches the listing page at `request.startUrl`, collects the course links
 * reported by {@link CursoListPage}, then fetches each detail page and yields
 * one {@link ScrapeResult} per course that has non-empty text content.
 */
export class CursosScraper implements Scraper {
  readonly id = "cursos-ifrs-canoas";

  /**
   * @param fetcher Used for every HTTP request; the value resolved by
   *   `fetcher.get(url)` is handed straight to `cheerio.load`.
   */
  constructor(private readonly fetcher: Fetcher) {}

  /**
   * Streams scraped courses one at a time.
   *
   * A failure while fetching or parsing a single detail page is logged and
   * that course is skipped. A failure while fetching the listing page itself
   * is NOT caught here and propagates to the consumer of the generator.
   *
   * @param request Only `startUrl` (the listing page) is read.
   * @returns An async generator of results, in the order the links were
   *   extracted from the listing page.
   */
  async *run(request: ScrapeRequest): AsyncGenerator<ScrapeResult> {
    const { startUrl } = request;

    console.log(`[CursosScraper] Buscando lista em: ${startUrl}`);

    const listHtml = await this.fetcher.get(startUrl);
    const listPage = new CursoListPage(cheerio.load(listHtml));
    const items = listPage.extractItems();

    console.log(`[CursosScraper] ${items.length} cursos encontrados`);

    for (const item of items) {
      // Defensive: never issue a request for an empty URL.
      if (!item.url) continue;

      try {
        console.log(`[CursosScraper] Scraping: ${item.title}`);

        const detailHtml = await this.fetcher.get(item.url);
        const detailPage = new CursoDetailPage(cheerio.load(detailHtml));

        // Pages without any extractable text are skipped, not yielded empty.
        const rawText = detailPage.extractContent();
        if (!rawText) continue;

        yield {
          url: item.url,
          // `||` (not `??`) on purpose: an empty extracted title must fall
          // back to the link text taken from the listing page.
          title: detailPage.extractTitle() || item.title,
          rawText,
          // MD5 hex digest of the extracted text, emitted as `contentHash`.
          contentHash: crypto.createHash("md5").update(rawText).digest("hex"),
          category: "curso",
          publishedAt: detailPage.extractDate(),
          sourceType: "webpage",
        } satisfies ScrapeResult;
      } catch (err) {
        // Best-effort per item: log with context and continue with the rest.
        console.error(`[CursosScraper] Erro em ${item.url}:`, err);
      }
    }
  }
}
// File: packages/scraper/src/plugins/course/pages/detail.ts
import type { CheerioAPI } from "cheerio";

// CSS selectors used on a course detail page.
const DETAIL_SELECTORS = {
  title: "h2.page__title",
  content: "div.page__content",
  dateMeta: "p.page__meta",
} as const;

// Elements stripped from the content container before its text is read.
const REMOVE_SELECTORS = "script, iframe, style, nav, footer";

// First dd/mm/yyyy occurrence in the meta text. No `g` flag, so the shared
// regex carries no state between calls.
const DATE_PATTERN = /(\d{2})\/(\d{2})\/(\d{4})/;

/** Page object that reads title, date and text from a loaded detail page. */
export class CursoDetailPage {
  /** @param $ Cheerio instance already loaded with the detail page HTML. */
  constructor(private $: CheerioAPI) {}

  /** @returns Trimmed text of the first title element; `""` when absent. */
  extractTitle(): string {
    return this.$(DETAIL_SELECTORS.title).first().text().trim();
  }

  /**
   * Parses the first dd/mm/yyyy date found in the meta paragraph.
   * Expected format: "Última atualização em 20/02/2026".
   *
   * @returns A `Date` at local-time midnight of that day, or `undefined` when
   *   no date is present or the digits are not a real calendar date
   *   (e.g. 31/02/2026), which `new Date` would otherwise roll over silently.
   */
  extractDate(): Date | undefined {
    const text = this.$(DETAIL_SELECTORS.dateMeta).first().text();
    const match = DATE_PATTERN.exec(text);
    if (!match) return undefined;

    const day = Number(match[1]);
    const month = Number(match[2]);
    const year = Number(match[3]);

    const date = new Date(year, month - 1, day);
    // Round-trip check: any overflow (day 31 in a 30-day month, month 13, …)
    // changes at least one component.
    const isRealDate =
      date.getFullYear() === year &&
      date.getMonth() === month - 1 &&
      date.getDate() === day;

    return isRealDate ? date : undefined;
  }

  /**
   * @returns Text of the first content container with script/iframe/style/
   *   nav/footer descendants removed and whitespace runs collapsed to single
   *   spaces; `""` when the container is missing. Works on a clone, so the
   *   loaded document is not modified.
   */
  extractContent(): string {
    const $content = this.$(DETAIL_SELECTORS.content).first().clone();
    $content.find(REMOVE_SELECTORS).remove();
    return $content.text().replace(/\s+/g, " ").trim();
  }
}

// File: packages/scraper/src/plugins/course/pages/list.ts
// (list.ts has its own `import type { CheerioAPI } from "cheerio"` in the diff;
// it appears once above because both page objects share this block.)

/** One course link found on the listing page. */
export interface CursoListItem {
  url: string;
  title: string;
}

// CSS selectors used on the course listing page.
const LIST_SELECTORS = {
  link: "div.page__content ul li a",
} as const;

// Only absolute HTTPS links on this exact host are followed.
const ALLOWED_HOST = "ifrs.edu.br";
const ALLOWED_PREFIX = `https://${ALLOWED_HOST}`;

/**
 * True when `href` is an absolute HTTPS URL whose host is exactly
 * `ifrs.edu.br`. The prefix test alone would also accept look-alikes such as
 * `https://ifrs.edu.br.evil.com/` or `https://ifrs.edu.br@evil.com/`, so the
 * parsed hostname is checked as well.
 */
function isAllowedCourseUrl(href: string): boolean {
  if (!href.startsWith(ALLOWED_PREFIX)) return false;
  try {
    return new URL(href).hostname === ALLOWED_HOST;
  } catch {
    // Not parseable as an absolute URL.
    return false;
  }
}

/** Page object that reads the course links from a loaded listing page. */
export class CursoListPage {
  /** @param $ Cheerio instance already loaded with the listing page HTML. */
  constructor(private $: CheerioAPI) {}

  /**
   * Collects the course links in document order.
   *
   * @returns One item per distinct `href` that passes the host check; links
   *   without an `href`, relative links and links to other hosts are skipped.
   *   `title` is the trimmed link text.
   */
  extractItems(): CursoListItem[] {
    const seen = new Set<string>();
    const items: CursoListItem[] = [];

    this.$(LIST_SELECTORS.link).each((_, el) => {
      const $el = this.$(el);
      const url = $el.attr("href") ?? "";

      // Skip links outside the allowed host and links already collected.
      if (!isAllowedCourseUrl(url) || seen.has(url)) return;

      seen.add(url);
      items.push({
        url,
        title: $el.text().trim(),
      });
    });

    return items;
  }

  /** @returns Always `null`; no pagination is implemented here. */
  nextPageUrl(): string | null {
    return null;
  }
}