Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 52 additions & 0 deletions packages/scraper/src/plugins/course/index.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
import crypto from "node:crypto";
import * as cheerio from "cheerio";
import type { Fetcher } from "../../core/fetcher.js";
import type { ScrapeRequest, ScrapeResult, Scraper } from "../../core/types.js";
import { CursoDetailPage } from "./pages/detail.js";
import { CursoListPage } from "./pages/list.js";

/**
 * Scraper that reads a course listing page, follows every course link found
 * there and yields one {@link ScrapeResult} per detail page that has content.
 */
export class CursosScraper implements Scraper {
  readonly id = "cursos-ifrs-canoas";

  constructor(private readonly fetcher: Fetcher) {}

  /**
   * Fetches the listing at `request.startUrl` and scrapes each linked detail page in turn.
   *
   * A failure while fetching or parsing a single detail page is logged and that
   * page is skipped; a failure while fetching the listing itself propagates to
   * the caller.
   *
   * @param request - Scrape request; only `startUrl` is read here.
   * @returns An async generator yielding one result per detail page with non-empty content.
   */
  async *run(request: ScrapeRequest): AsyncGenerator<ScrapeResult> {
    const { startUrl } = request;

    console.log(`[CursosScraper] Buscando lista em: ${startUrl}`);

    const listHtml = await this.fetcher.get(startUrl);
    const items = new CursoListPage(cheerio.load(listHtml)).extractItems();

    console.log(`[CursosScraper] ${items.length} cursos encontrados`);

    for (const item of items) {
      if (!item.url) continue;

      let result: ScrapeResult | undefined;
      try {
        result = await this.scrapeDetail(item.url, item.title);
      } catch (err) {
        console.error(`[CursosScraper] Erro em ${item.url}:`, err);
        continue;
      }

      // Yield outside the try block so that an error injected by the consumer
      // (generator.throw) is not mistaken for a scraping failure and swallowed.
      if (result) yield result;
    }
  }

  /**
   * Fetches and parses one course detail page.
   *
   * @param url - Absolute URL of the detail page.
   * @param fallbackTitle - Title taken from the listing, used when the detail page has none.
   * @returns The scrape result, or `undefined` when the page has no extractable content.
   */
  private async scrapeDetail(url: string, fallbackTitle: string): Promise<ScrapeResult | undefined> {
    console.log(`[CursosScraper] Scraping: ${fallbackTitle}`);

    const detailHtml = await this.fetcher.get(url);
    const detailPage = new CursoDetailPage(cheerio.load(detailHtml));

    const rawText = detailPage.extractContent();
    if (!rawText) return undefined;

    return {
      url,
      title: detailPage.extractTitle() || fallbackTitle,
      rawText,
      // MD5 of the extracted text, stored as the result's content hash.
      contentHash: crypto.createHash("md5").update(rawText).digest("hex"),
      category: "curso",
      publishedAt: detailPage.extractDate(),
      sourceType: "webpage",
    } satisfies ScrapeResult;
  }
}
32 changes: 32 additions & 0 deletions packages/scraper/src/plugins/course/pages/detail.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
import type { CheerioAPI } from "cheerio";

// CSS selectors used to locate the relevant parts of a course detail page.
const SELECTORS = {
// Heading holding the page title.
title: "h2.page__title",
// Container whose text is extracted as the page content.
content: "div.page__content",
// Paragraph searched for a dd/mm/yyyy date.
dateMeta: "p.page__meta",
} as const;

// Elements stripped from the content container before its text is extracted.
const REMOVE_SELECTORS = "script, iframe, style, nav, footer";

/**
 * Read-only view over a loaded course detail page that exposes its title,
 * date and plain-text content.
 */
export class CursoDetailPage {
  constructor(private $: CheerioAPI) {}

  /**
   * @returns The trimmed text of the first title element, or an empty string when there is none.
   */
  extractTitle(): string {
    return this.$(SELECTORS.title).first().text().trim();
  }

  /**
   * Parses the first `dd/mm/yyyy` date found in the page's meta paragraph.
   *
   * @returns The date at local midnight, or `undefined` when no date is present
   *   or the digits do not form a real calendar date (e.g. `31/02/2026`).
   */
  extractDate(): Date | undefined {
    const text = this.$(SELECTORS.dateMeta).first().text();
    // Expected format: "Última atualização em 20/02/2026"
    const match = text.match(/(\d{2})\/(\d{2})\/(\d{4})/);
    if (!match) return undefined;

    const day = Number(match[1]);
    const month = Number(match[2]);
    const year = Number(match[3]);
    const date = new Date(year, month - 1, day);

    // The Date constructor silently rolls out-of-range values over (31/02 -> 03/03),
    // so only accept the result when it round-trips to the digits that were parsed.
    const isRealDate =
      date.getFullYear() === year && date.getMonth() === month - 1 && date.getDate() === day;
    return isRealDate ? date : undefined;
  }

  /**
   * Extracts the text of the content container with runs of whitespace collapsed to single spaces.
   *
   * @returns The cleaned text, or an empty string when the container is missing or empty.
   */
  extractContent(): string {
    // Work on a clone so that removing elements does not mutate the loaded document.
    const $content = this.$(SELECTORS.content).first().clone();
    $content.find(REMOVE_SELECTORS).remove();
    return $content.text().replace(/\s+/g, " ").trim();
  }
}
39 changes: 39 additions & 0 deletions packages/scraper/src/plugins/course/pages/list.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
import type { CheerioAPI } from "cheerio";

/** One course link collected from the listing page. */
export interface CursoListItem {
/** Value of the link's `href` attribute. */
url: string;
/** Trimmed text of the link. */
title: string;
}

// CSS selectors used on the course listing page.
const SELECTORS = {
// Anchors inside list items of the page content container.
link: "div.page__content ul li a",
} as const;

export class CursoListPage {
constructor(private $: CheerioAPI) {}

extractItems(): CursoListItem[] {
const seen = new Set<string>();
const items: CursoListItem[] = [];

this.$(SELECTORS.link).each((_, el) => {
const $el = this.$(el);
const url = $el.attr("href") ?? "";

// Ignora se não for URL HTTP ou se já foi visto
Copy link

Copilot AI Mar 6, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

O comentário diz que ignora links que “não for URL HTTP”, mas a condição só permite URLs com o prefixo exato "https://ifrs.edu.br". Ajuste o comentário para refletir a regra real (somente https + domínio) ou amplie a validação para aceitar http/https de forma coerente com o comentário.

Suggested change
// Ignora se não for URL HTTP ou se já foi visto
// Ignora URLs que não sejam do domínio https://ifrs.edu.br ou que já tenham sido vistas

Copilot uses AI. Check for mistakes.
if (!url.startsWith("https://ifrs.edu.br") || seen.has(url)) return;

seen.add(url);
items.push({
url,
Comment on lines +21 to +28
Copy link

Copilot AI Mar 6, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A filtragem url.startsWith("https://ifrs.edu.br") é muito restritiva e pode descartar links válidos do próprio IFRS (ex.: https://www.ifrs.edu.br/...) ou variações de scheme. Em vez de comparar prefixo, prefira parsear com new URL(...) e validar o hostname (por exemplo, endsWith("ifrs.edu.br")), mantendo também a deduplicação via URL normalizada.

Suggested change
const url = $el.attr("href") ?? "";
// Ignora se não for URL HTTP ou se já foi visto
if (!url.startsWith("https://ifrs.edu.br") || seen.has(url)) return;
seen.add(url);
items.push({
url,
const href = $el.attr("href");
if (!href) return;
let parsedUrl: URL;
try {
// Permite URLs relativas usando o domínio principal como base
parsedUrl = new URL(href, "https://ifrs.edu.br");
} catch {
// Ignora valores de href inválidos
return;
}
// Garante que o link aponte para um domínio do IFRS
if (!parsedUrl.hostname.endsWith("ifrs.edu.br")) return;
const normalizedUrl = parsedUrl.toString();
// Ignora se já foi visto
if (seen.has(normalizedUrl)) return;
seen.add(normalizedUrl);
items.push({
url: normalizedUrl,

Copilot uses AI. Check for mistakes.
title: $el.text().trim(),
});
});

return items;
}

/**
 * Returns the URL of the next listing page; this implementation always returns `null`.
 * NOTE(review): presumably the course listing is a single page with no pagination — confirm.
 */
nextPageUrl(): string | null {
return null;
}
}