Skip to content

Commit 1b7376e

Browse files
authored
Add Azure OpenAI support for embeddings (#52)
* (feat) support azure openai for embedding Signed-off-by: dvaz-external <dvaz.external@epo.org> * fix doc2vec test Signed-off-by: dvaz-external <dvaz.external@epo.org> --------- Signed-off-by: dvaz-external <dvaz.external@epo.org>
1 parent ab38a0b commit 1b7376e

File tree

6 files changed

+94
-10
lines changed

6 files changed

+94
-10
lines changed

README.md

Lines changed: 30 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ The primary goal is to prepare documentation content for Retrieval-Augmented Gen
3131
* **HTML to Markdown:** Converts extracted HTML to clean Markdown using `turndown`, preserving code blocks and basic formatting.
3232
* **Clean Heading Text:** Automatically removes anchor links (like `[](#section-id)`) from heading text for cleaner hierarchy display.
3333
* **Intelligent Chunking:** Splits Markdown content into manageable chunks based on headings and token limits, preserving context.
34-
* **Vector Embeddings:** Generates embeddings for each chunk using OpenAI's `text-embedding-3-large` model.
34+
* **Vector Embeddings:** Generates embeddings for each chunk using OpenAI or Azure OpenAI (configurable; defaults to OpenAI's `text-embedding-3-large` model).
3535
* **Vector Storage:** Supports storing chunks, metadata, and embeddings in:
3636
* **SQLite:** Using `better-sqlite3` and the `sqlite-vec` extension for efficient vector search.
3737
* **Qdrant:** A dedicated vector database, using the `@qdrant/js-client-rest`.
@@ -98,7 +98,7 @@ This ensures that searches for parent topics (like "Installation") will also mat
9898
* **Node.js:** Version 18 or higher recommended (check `.nvmrc` if available).
9999
* **npm:** Node Package Manager (usually comes with Node.js).
100100
* **TypeScript:** As the project is written in TypeScript (`ts-node` is used for execution via `npm start`).
101-
* **OpenAI API Key:** You need an API key from OpenAI to generate embeddings.
101+
* **OpenAI API Key or Azure OpenAI Credentials:** You need either an OpenAI API key or Azure OpenAI credentials to generate embeddings.
102102
* **GitHub Personal Access Token:** Required for accessing GitHub issues (set as `GITHUB_PERSONAL_ACCESS_TOKEN` in your environment).
103103
* **Zendesk API Token:** Required for accessing Zendesk tickets and articles (set as `ZENDESK_API_TOKEN` in your environment).
104104
* **(Optional) Qdrant Instance:** If using the `qdrant` database type, you need a running Qdrant instance accessible from where you run the script.
@@ -129,8 +129,20 @@ Configuration is managed through two files:
129129
```dotenv
130130
# .env
131131
132-
# Required: Your OpenAI API Key
132+
# Embedding Provider Configuration
133+
# Optional: Specify which provider to use (defaults to 'openai' if not set)
134+
# Can also be configured in config.yaml (the config.yaml value takes precedence when both are set)
135+
EMBEDDING_PROVIDER="azure" # or "openai"
136+
137+
# Required: Your OpenAI API Key (if using OpenAI provider)
133138
OPENAI_API_KEY="sk-..."
139+
OPENAI_MODEL="text-embedding-3-large" # Optional, defaults to text-embedding-3-large
140+
141+
# Required: Your Azure OpenAI credentials (if using Azure provider)
142+
AZURE_OPENAI_KEY="your-azure-key"
143+
AZURE_OPENAI_ENDPOINT="https://your-resource.openai.azure.com"
144+
AZURE_OPENAI_DEPLOYMENT_NAME="text-embedding-3-large" # Optional, defaults to text-embedding-3-large
145+
AZURE_OPENAI_API_VERSION="2024-10-21" # Optional, defaults to 2024-10-21
134146
135147
# Required for GitHub sources
136148
GITHUB_PERSONAL_ACCESS_TOKEN="ghp_..."
@@ -206,6 +218,21 @@ Configuration is managed through two files:
206218

207219
**Example (`config.yaml`):**
208220
```yaml
221+
# Optional: Configure embedding provider
222+
# Can also be set via EMBEDDING_PROVIDER environment variable
223+
# Defaults to OpenAI if not specified
224+
embedding:
225+
provider: 'openai' # or 'azure'
226+
openai:
227+
api_key: '${OPENAI_API_KEY}' # Optional, uses env var by default
228+
model: 'text-embedding-3-large' # Optional, defaults to text-embedding-3-large
229+
# For Azure OpenAI, use this instead:
230+
# azure:
231+
# api_key: '${AZURE_OPENAI_KEY}'
232+
# endpoint: '${AZURE_OPENAI_ENDPOINT}'
233+
# deployment_name: 'text-embedding-3-large' # Optional, defaults to text-embedding-3-large
234+
# api_version: '2024-10-21' # Optional
235+
209236
sources:
210237
# Website source example
211238
- type: 'website'

doc2vec.ts

Lines changed: 43 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ import * as os from 'os';
99
import { exec } from 'child_process';
1010
import { promisify } from 'util';
1111
import { Buffer } from 'buffer';
12-
import { OpenAI } from "openai";
12+
import { OpenAI, AzureOpenAI } from "openai";
1313
import * as dotenv from "dotenv";
1414
import { Logger, LogLevel } from './logger';
1515
import { Utils } from './utils';
@@ -35,7 +35,8 @@ dotenv.config();
3535

3636
export class Doc2Vec {
3737
private config: Config;
38-
private openai: OpenAI;
38+
private openai: OpenAI | AzureOpenAI;
39+
private embeddingModel: string;
3940
private contentProcessor: ContentProcessor;
4041
private logger: Logger;
4142
private configDir: string;
@@ -52,7 +53,45 @@ export class Doc2Vec {
5253
this.logger.info('Initializing Doc2Vec');
5354
this.config = this.loadConfig(configPath);
5455
this.configDir = path.dirname(path.resolve(configPath));
55-
this.openai = new OpenAI({ apiKey: process.env.OPENAI_API_KEY });
56+
57+
// Initialize OpenAI or Azure OpenAI based on configuration
58+
// Check environment variable if not specified in config
59+
const embeddingProvider = this.config.embedding?.provider || (process.env.EMBEDDING_PROVIDER as 'openai' | 'azure') || 'openai';
60+
const embeddingConfig = this.config.embedding || { provider: embeddingProvider };
61+
62+
if (embeddingProvider === 'azure') {
63+
const azureApiKey = embeddingConfig.azure?.api_key || process.env.AZURE_OPENAI_KEY;
64+
const azureEndpoint = embeddingConfig.azure?.endpoint || process.env.AZURE_OPENAI_ENDPOINT;
65+
const azureDeploymentName = embeddingConfig.azure?.deployment_name || process.env.AZURE_OPENAI_DEPLOYMENT_NAME || 'text-embedding-3-large';
66+
const azureApiVersion = embeddingConfig.azure?.api_version || process.env.AZURE_OPENAI_API_VERSION || '2024-10-21';
67+
68+
if (!azureApiKey || !azureEndpoint) {
69+
this.logger.error('Azure OpenAI requires api_key and endpoint to be configured');
70+
process.exit(1);
71+
}
72+
73+
this.openai = new AzureOpenAI({
74+
apiKey: azureApiKey,
75+
endpoint: azureEndpoint,
76+
deployment: azureDeploymentName,
77+
apiVersion: azureApiVersion,
78+
});
79+
this.embeddingModel = azureDeploymentName;
80+
this.logger.info(`Using Azure OpenAI with deployment: ${azureDeploymentName}`);
81+
} else {
82+
const openaiApiKey = embeddingConfig.openai?.api_key || process.env.OPENAI_API_KEY;
83+
const openaiModel = embeddingConfig.openai?.model || process.env.OPENAI_MODEL || 'text-embedding-3-large';
84+
85+
if (!openaiApiKey) {
86+
this.logger.error('OpenAI requires api_key to be configured');
87+
process.exit(1);
88+
}
89+
90+
this.openai = new OpenAI({ apiKey: openaiApiKey });
91+
this.embeddingModel = openaiModel;
92+
this.logger.info(`Using OpenAI with model: ${openaiModel}`);
93+
}
94+
5695
this.contentProcessor = new ContentProcessor(this.logger);
5796
}
5897

@@ -1483,7 +1522,7 @@ export class Doc2Vec {
14831522
try {
14841523
logger.debug(`Creating embeddings for ${texts.length} texts`);
14851524
const response = await this.openai.embeddings.create({
1486-
model: "text-embedding-3-large",
1525+
model: this.embeddingModel,
14871526
input: texts,
14881527
});
14891528
logger.debug(`Successfully created ${response.data.length} embeddings`);

package-lock.json

Lines changed: 2 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"name": "doc2vec",
3-
"version": "2.1.0",
3+
"version": "2.2.0",
44
"type": "commonjs",
55
"description": "",
66
"main": "dist/doc2vec.js",

tests/doc2vec.test.ts

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -194,6 +194,9 @@ describe('Doc2Vec class', () => {
194194
}) as any;
195195
process.exit = mockProcessExit as any;
196196

197+
// Provide a dummy API key so the constructor validation doesn't call process.exit
198+
process.env.OPENAI_API_KEY = 'test-key-for-tests';
199+
197200
// Ensure test config directory exists
198201
if (!fs.existsSync(testConfigDir)) {
199202
fs.mkdirSync(testConfigDir, { recursive: true });

types.ts

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,8 +79,23 @@ export interface QdrantDatabaseParams {
7979
collection_name?: string;
8080
}
8181

82+
export interface EmbeddingConfig {
83+
provider: 'openai' | 'azure'; // Selects which embeddings client is constructed
84+
openai?: {
85+
api_key?: string; // Falls back to the OPENAI_API_KEY env var; one of the two must be set for the OpenAI provider
86+
model?: string; // Falls back to the OPENAI_MODEL env var, then to text-embedding-3-large
87+
};
88+
azure?: {
89+
api_key?: string; // Falls back to the AZURE_OPENAI_KEY env var; one of the two must be set for the Azure provider
90+
endpoint?: string; // Falls back to the AZURE_OPENAI_ENDPOINT env var; one of the two must be set for the Azure provider
91+
deployment_name?: string; // Falls back to the AZURE_OPENAI_DEPLOYMENT_NAME env var, then to text-embedding-3-large
92+
api_version?: string; // Falls back to the AZURE_OPENAI_API_VERSION env var, then to 2024-10-21
93+
};
94+
}
95+
8296
export interface Config {
8397
sources: SourceConfig[];
98+
embedding?: EmbeddingConfig; // Optional; when omitted, the provider comes from the EMBEDDING_PROVIDER env var, else 'openai'
8499
}
85100

86101
export interface DocumentChunk {

0 commit comments

Comments
 (0)