From e63d9b9e6547c6efb408673c7a99803fccbd359b Mon Sep 17 00:00:00 2001 From: JoaoPixelCode Date: Fri, 6 Mar 2026 11:32:47 -0300 Subject: [PATCH 1/2] feat(scraper): add cursos plugin for IFRS Canoas --- apps/server/src/scraper.ts | 22 ++++++++ packages/scraper/src/plugins/course/index.ts | 52 +++++++++++++++++++ .../src/plugins/course/pages/detail.ts | 32 ++++++++++++ .../scraper/src/plugins/course/pages/list.ts | 39 ++++++++++++++ 4 files changed, 145 insertions(+) create mode 100644 apps/server/src/scraper.ts create mode 100644 packages/scraper/src/plugins/course/index.ts create mode 100644 packages/scraper/src/plugins/course/pages/detail.ts create mode 100644 packages/scraper/src/plugins/course/pages/list.ts diff --git a/apps/server/src/scraper.ts b/apps/server/src/scraper.ts new file mode 100644 index 0000000..77750e2 --- /dev/null +++ b/apps/server/src/scraper.ts @@ -0,0 +1,22 @@ +import axios from "axios"; +import * as cheerio from "cheerio"; + +const url = "https://ifrs.edu.br/canoas"; + +async function scraper() { + const response = await axios.get(url); + const html = response.data; + const posts: { title: string }[] = []; + const $ = cheerio.load(html); + + $(".ultimos-editais a").each(function () { + const title = $(this).find(".ultimos-editais__edital-title").text(); + posts.push({ + title, + }); + }); + + console.log({ posts }); +} + +scraper(); diff --git a/packages/scraper/src/plugins/course/index.ts b/packages/scraper/src/plugins/course/index.ts new file mode 100644 index 0000000..fd5d29b --- /dev/null +++ b/packages/scraper/src/plugins/course/index.ts @@ -0,0 +1,52 @@ +import crypto from "node:crypto"; +import * as cheerio from "cheerio"; +import type { Fetcher } from "../../core/fetcher.js"; +import type { ScrapeRequest, ScrapeResult, Scraper } from "../../core/types.js"; +import { CursoDetailPage } from "./pages/detail.js"; +import { CursoListPage } from "./pages/list.js"; + +export class CursosScraper implements Scraper { + readonly id = "cursos-ifrs-canoas"; + + constructor(private fetcher: Fetcher) {} + + async *run(request: ScrapeRequest): AsyncGenerator { + const { startUrl } = request; + + console.log(`[CursosScraper] Buscando lista em: ${startUrl}`); + + const html = await this.fetcher.get(startUrl); + const $ = cheerio.load(html); + const listPage = new CursoListPage($); + const items = listPage.extractItems(); + + console.log(`[CursosScraper] ${items.length} cursos encontrados`); + + for (const item of items) { + if (!item.url) continue; + + try { + console.log(`[CursosScraper] Scraping: ${item.title}`); + + const detailHtml = await this.fetcher.get(item.url); + const $detail = cheerio.load(detailHtml); + const detailPage = new CursoDetailPage($detail); + + const rawText = detailPage.extractContent(); + if (!rawText) continue; + + yield { + url: item.url, + title: detailPage.extractTitle() || item.title, + rawText, + contentHash: crypto.createHash("md5").update(rawText).digest("hex"), + category: "curso", + publishedAt: detailPage.extractDate(), + sourceType: "webpage", + } satisfies ScrapeResult; + } catch (err) { + console.error(`[CursosScraper] Erro em ${item.url}:`, err); + } + } + } +} diff --git a/packages/scraper/src/plugins/course/pages/detail.ts b/packages/scraper/src/plugins/course/pages/detail.ts new file mode 100644 index 0000000..6c2149c --- /dev/null +++ b/packages/scraper/src/plugins/course/pages/detail.ts @@ -0,0 +1,32 @@ +import type { CheerioAPI } from "cheerio"; + +const SELECTORS = { + title: "h2.page__title", + content: "div.page__content", + dateMeta: "p.page__meta", +} as const; + +const REMOVE_SELECTORS = "script, iframe, style, nav, footer"; + +export class CursoDetailPage { + constructor(private $: CheerioAPI) {} + + extractTitle(): string { + return this.$(SELECTORS.title).first().text().trim(); + } + + extractDate(): Date | undefined { + const text = this.$(SELECTORS.dateMeta).first().text(); + // Formato: "Última atualização em 20/02/2026" + const match = text.match(/(\d{2})\/(\d{2})\/(\d{4})/); + if (!match) return undefined; + const [, day, month, year] = match; + return new Date(Number(year), Number(month) - 1, Number(day)); + } + + extractContent(): string { + const $content = this.$(SELECTORS.content).first().clone(); + $content.find(REMOVE_SELECTORS).remove(); + return $content.text().replace(/\s+/g, " ").trim(); + } +} diff --git a/packages/scraper/src/plugins/course/pages/list.ts b/packages/scraper/src/plugins/course/pages/list.ts new file mode 100644 index 0000000..a1bcff1 --- /dev/null +++ b/packages/scraper/src/plugins/course/pages/list.ts @@ -0,0 +1,39 @@ +import type { CheerioAPI } from "cheerio"; + +export interface CursoListItem { + url: string; + title: string; +} + +const SELECTORS = { + link: "div.page__content ul li a", +} as const; + +export class CursoListPage { + constructor(private $: CheerioAPI) {} + + extractItems(): CursoListItem[] { + const seen = new Set(); + const items: CursoListItem[] = []; + + this.$(SELECTORS.link).each((_, el) => { + const $el = this.$(el); + const url = $el.attr("href") ?? ""; + + // Ignora se não for URL HTTP ou se já foi visto + if (!url.startsWith("https://ifrs.edu.br") || seen.has(url)) return; + + seen.add(url); + items.push({ + url, + title: $el.text().trim(), + }); + }); + + return items; + } + + nextPageUrl(): string | null { + return null; + } +} From be3376953a0d8ee04f0d374b098420e3062cc4a8 Mon Sep 17 00:00:00 2001 From: JoaoPixelCode Date: Fri, 6 Mar 2026 11:34:17 -0300 Subject: [PATCH 2/2] chore: remove test scraper file --- apps/server/src/scraper.ts | 22 ---------------------- 1 file changed, 22 deletions(-) delete mode 100644 apps/server/src/scraper.ts diff --git a/apps/server/src/scraper.ts b/apps/server/src/scraper.ts deleted file mode 100644 index 77750e2..0000000 --- a/apps/server/src/scraper.ts +++ /dev/null @@ -1,22 +0,0 @@ -import axios from "axios"; -import * as cheerio from "cheerio"; - -const url = "https://ifrs.edu.br/canoas"; - -async function scraper() { - const response = await axios.get(url); - const html = response.data; - const posts: { title: string }[] = []; - const $ = cheerio.load(html); - - $(".ultimos-editais a").each(function () { - const title = $(this).find(".ultimos-editais__edital-title").text(); - posts.push({ - title, - }); - }); - - console.log({ posts }); -} - -scraper();