-
Notifications
You must be signed in to change notification settings - Fork 12
Expand file tree
/
Copy pathmarkdown-store.ts
More file actions
117 lines (106 loc) · 4.26 KB
/
markdown-store.ts
File metadata and controls
117 lines (106 loc) · 4.26 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import { Pool, PoolConfig } from 'pg';
import { MarkdownStoreConfig } from './types';
import { Logger } from './logger';
/**
 * Stores generated markdown for website pages in a Postgres table.
 *
 * Table schema (single shared table, default name `markdown_pages`):
 *   url          TEXT PRIMARY KEY
 *   product_name TEXT NOT NULL
 *   markdown     TEXT NOT NULL
 *   updated_at   TIMESTAMPTZ DEFAULT NOW()
 *
 * When a website source has `markdown_store: true`, the crawl loop uses this
 * store to decide whether to force-process a page that would otherwise be
 * skipped by lastmod / ETag caching. If the URL is not yet in Postgres the
 * page is processed regardless of cache signals, ensuring the table is fully
 * populated after the first sync. On subsequent syncs only pages with
 * detected changes are updated.
 */
export class MarkdownStore {
  private readonly pool: Pool;
  private readonly tableName: string;
  private readonly logger: Logger;

  /**
   * Build the connection pool from the given configuration.
   *
   * @param config Connection settings. When `connection_string` is set it is
   *   used on its own and the discrete host/port/database/user/password
   *   fields are ignored. `table_name` defaults to `markdown_pages`.
   * @param logger Parent logger; a `markdown-store` child logger is derived
   *   from it.
   */
  constructor(config: MarkdownStoreConfig, logger: Logger) {
    this.logger = logger.child('markdown-store');
    this.tableName = config.table_name ?? 'markdown_pages';

    const poolConfig: PoolConfig = {};
    if (config.connection_string) {
      poolConfig.connectionString = config.connection_string;
    } else {
      // Only copy fields that were actually provided so the pool's own
      // defaults apply to everything else.
      if (config.host) poolConfig.host = config.host;
      if (config.port) poolConfig.port = config.port;
      if (config.database) poolConfig.database = config.database;
      if (config.user) poolConfig.user = config.user;
      if (config.password) poolConfig.password = config.password;
    }
    this.pool = new Pool(poolConfig);
  }

  /**
   * Create the markdown table if it doesn't already exist.
   *
   * @returns A promise that settles once the `CREATE TABLE IF NOT EXISTS`
   *   statement has run; it rejects if the query fails.
   */
  async init(): Promise<void> {
    const query = `
      CREATE TABLE IF NOT EXISTS ${this.escapeIdentifier(this.tableName)} (
        url TEXT PRIMARY KEY,
        product_name TEXT NOT NULL,
        markdown TEXT NOT NULL,
        updated_at TIMESTAMPTZ DEFAULT NOW()
      );
    `;
    await this.pool.query(query);
    this.logger.info(`Initialized Postgres markdown store (table: ${this.tableName})`);
  }

  /**
   * Return the set of URLs that already have markdown stored for a given URL
   * prefix (e.g., "https://istio.io/latest/docs/"). This is called once
   * before the crawl starts so the crawler can decide which pages need
   * force-processing.
   *
   * The prefix is matched literally: LIKE metacharacters (`%`, `_`, `\`)
   * inside it are escaped, so a prefix such as `https://a.io/my_docs/` does
   * not also match `https://a.io/myXdocs/`.
   *
   * @param urlPrefix Literal prefix every returned URL starts with.
   * @returns The stored URLs that begin with `urlPrefix`.
   */
  async getUrlsWithMarkdown(urlPrefix: string): Promise<Set<string>> {
    // Escape first, then append the only intended wildcard.
    const pattern = `${this.escapeLikePattern(urlPrefix)}%`;
    const result = await this.pool.query(
      `SELECT url FROM ${this.escapeIdentifier(this.tableName)} WHERE url LIKE $1`,
      [pattern]
    );
    return new Set(result.rows.map((row: { url: string }) => row.url));
  }

  /**
   * Insert or update the markdown for a URL. Called after a page is
   * successfully processed (fetched + converted to markdown).
   *
   * On a primary-key conflict the existing row's `product_name`, `markdown`
   * and `updated_at` are overwritten.
   *
   * @param url Page URL (primary key).
   * @param productName Value stored in `product_name`.
   * @param markdown Value stored in `markdown`.
   */
  async upsertMarkdown(url: string, productName: string, markdown: string): Promise<void> {
    const query = `
      INSERT INTO ${this.escapeIdentifier(this.tableName)} (url, product_name, markdown, updated_at)
      VALUES ($1, $2, $3, NOW())
      ON CONFLICT (url) DO UPDATE SET
        product_name = EXCLUDED.product_name,
        markdown = EXCLUDED.markdown,
        updated_at = NOW();
    `;
    await this.pool.query(query, [url, productName, markdown]);
  }

  /**
   * Remove a URL from the store (e.g., when a HEAD request returns 404).
   *
   * @param url Exact URL of the row to delete.
   */
  async deleteMarkdown(url: string): Promise<void> {
    await this.pool.query(
      `DELETE FROM ${this.escapeIdentifier(this.tableName)} WHERE url = $1`,
      [url]
    );
  }

  /**
   * Close the connection pool. Should be called once after all sources have
   * been processed.
   */
  async close(): Promise<void> {
    await this.pool.end();
    this.logger.info('Postgres markdown store connection pool closed');
  }

  /**
   * Escape a SQL identifier (table name) to prevent injection.
   * Uses double-quoting per the SQL standard.
   *
   * @param identifier Raw identifier.
   * @returns The identifier wrapped in double quotes, with embedded double
   *   quotes doubled.
   */
  private escapeIdentifier(identifier: string): string {
    return '"' + identifier.replace(/"/g, '""') + '"';
  }

  /**
   * Escape LIKE metacharacters so a value is matched literally.
   * Prefixes `\`, `%` and `_` with a backslash, which is the default LIKE
   * escape character in PostgreSQL.
   *
   * @param value Raw text that will be embedded in a LIKE pattern.
   * @returns The text with every LIKE metacharacter escaped.
   */
  private escapeLikePattern(value: string): string {
    return value.replace(/[\\%_]/g, '\\$&');
  }
}