Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
153 changes: 117 additions & 36 deletions src/__tests__/commands/crawl.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -324,51 +324,93 @@ describe('executeCrawl', () => {
});

describe('Wait mode (synchronous crawl)', () => {
it('should use crawl method with wait when wait flag is set', async () => {
const mockCrawlJob = {
id: '550e8400-e29b-41d4-a716-446655440000',
// Take control of the clock and replace stderr.write with a stub that
// reports success, so each wait-mode test can advance the polling delay
// manually and inspect what was written to stderr.
beforeEach(() => {
  vi.useFakeTimers();
  const stderrWrite = vi.spyOn(process.stderr, 'write');
  stderrWrite.mockImplementation(() => true);
});

// Undo the per-test setup: hand the clock back to the real timers and
// restore every spied/mocked function to its original implementation.
afterEach(() => {
  vi.useRealTimers();
  vi.restoreAllMocks();
});

it('should use startCrawl + getCrawlStatus polling when wait flag is set', async () => {
const jobId = '550e8400-e29b-41d4-a716-446655440000';
const mockStartResponse = { id: jobId, url: 'https://example.com' };
const mockCompletedStatus = {
id: jobId,
status: 'completed',
total: 100,
completed: 100,
data: [{ markdown: '# Page 1' }],
};
mockClient.crawl.mockResolvedValue(mockCrawlJob);
mockClient.startCrawl.mockResolvedValue(mockStartResponse);
mockClient.getCrawlStatus.mockResolvedValue(mockCompletedStatus);

const result = await executeCrawl({
const crawlPromise = executeCrawl({
urlOrJobId: 'https://example.com',
wait: true,
pollInterval: 0.001,
});
await vi.advanceTimersByTimeAsync(1);
const result = await crawlPromise;

expect(mockClient.crawl).toHaveBeenCalledTimes(1);
expect(mockClient.crawl).toHaveBeenCalledWith(
expect(mockClient.crawl).not.toHaveBeenCalled();
expect(mockClient.startCrawl).toHaveBeenCalledTimes(1);
expect(mockClient.startCrawl).toHaveBeenCalledWith(
'https://example.com',
expect.objectContaining({
pollInterval: 5000, // Default poll interval
})
expect.objectContaining({ pollInterval: 1 })
);
expect(result).toEqual({
success: true,
data: mockCrawlJob,
expect(mockClient.getCrawlStatus).toHaveBeenCalledWith(jobId);
expect(result).toEqual({ success: true, data: mockCompletedStatus });
});

// Wait mode without the progress flag must not write anything to stderr.
it('should not write progress to stderr when progress flag is not set', async () => {
  const crawlId = '550e8400-e29b-41d4-a716-446655440000';
  const startResponse = { id: crawlId, url: 'https://example.com' };
  const finishedStatus = {
    id: crawlId,
    status: 'completed',
    total: 10,
    completed: 10,
  };
  mockClient.startCrawl.mockResolvedValue(startResponse);
  mockClient.getCrawlStatus.mockResolvedValue(finishedStatus);

  // Kick off the crawl without awaiting it, so the fake clock can be
  // advanced before the returned promise is awaited.
  const pending = executeCrawl({
    urlOrJobId: 'https://example.com',
    wait: true,
    pollInterval: 0.001,
  });
  await vi.advanceTimersByTimeAsync(1);
  await pending;

  expect(process.stderr.write).not.toHaveBeenCalled();
});

it('should include custom pollInterval when provided', async () => {
const mockCrawlJob = {
id: '550e8400-e29b-41d4-a716-446655440000',
const jobId = '550e8400-e29b-41d4-a716-446655440000';
mockClient.startCrawl.mockResolvedValue({
id: jobId,
url: 'https://example.com',
});
mockClient.getCrawlStatus.mockResolvedValue({
id: jobId,
status: 'completed',
total: 100,
completed: 100,
data: [],
};
mockClient.crawl.mockResolvedValue(mockCrawlJob);
});

await executeCrawl({
const crawlPromise = executeCrawl({
urlOrJobId: 'https://example.com',
wait: true,
pollInterval: 10,
});
await vi.advanceTimersByTimeAsync(10000);
await crawlPromise;

expect(mockClient.crawl).toHaveBeenCalledWith(
expect(mockClient.startCrawl).toHaveBeenCalledWith(
'https://example.com',
expect.objectContaining({
pollInterval: 10000, // Converted to milliseconds
Expand All @@ -377,22 +419,28 @@ describe('executeCrawl', () => {
});

it('should include timeout when provided', async () => {
const mockCrawlJob = {
id: '550e8400-e29b-41d4-a716-446655440000',
const jobId = '550e8400-e29b-41d4-a716-446655440000';
mockClient.startCrawl.mockResolvedValue({
id: jobId,
url: 'https://example.com',
});
mockClient.getCrawlStatus.mockResolvedValue({
id: jobId,
status: 'completed',
total: 100,
completed: 100,
data: [],
};
mockClient.crawl.mockResolvedValue(mockCrawlJob);
});

await executeCrawl({
const crawlPromise = executeCrawl({
urlOrJobId: 'https://example.com',
wait: true,
timeout: 300,
pollInterval: 0.001,
});
await vi.advanceTimersByTimeAsync(1);
await crawlPromise;

expect(mockClient.crawl).toHaveBeenCalledWith(
expect(mockClient.startCrawl).toHaveBeenCalledWith(
'https://example.com',
expect.objectContaining({
timeout: 300000, // Converted to milliseconds
Expand All @@ -401,25 +449,30 @@ describe('executeCrawl', () => {
});

it('should combine wait options with crawl options', async () => {
const mockCrawlJob = {
id: '550e8400-e29b-41d4-a716-446655440000',
const jobId = '550e8400-e29b-41d4-a716-446655440000';
mockClient.startCrawl.mockResolvedValue({
id: jobId,
url: 'https://example.com',
});
mockClient.getCrawlStatus.mockResolvedValue({
id: jobId,
status: 'completed',
total: 50,
completed: 50,
data: [],
};
mockClient.crawl.mockResolvedValue(mockCrawlJob);
});

await executeCrawl({
const crawlPromise = executeCrawl({
urlOrJobId: 'https://example.com',
wait: true,
pollInterval: 5,
timeout: 600,
limit: 50,
maxDepth: 2,
});
await vi.advanceTimersByTimeAsync(5000);
await crawlPromise;

expect(mockClient.crawl).toHaveBeenCalledWith(
expect(mockClient.startCrawl).toHaveBeenCalledWith(
'https://example.com',
expect.objectContaining({
pollInterval: 5000,
Expand All @@ -429,6 +482,34 @@ describe('executeCrawl', () => {
})
);
});

// A crawl that never reaches a terminal status must resolve with a
// timeout error rather than a successful result.
it('should return timeout error when crawl exceeds timeout duration', async () => {
  const crawlId = '550e8400-e29b-41d4-a716-446655440000';
  // Status that is returned on every poll, so the job never completes.
  const stillScraping = {
    id: crawlId,
    status: 'scraping',
    total: 100,
    completed: 10,
  };
  mockClient.startCrawl.mockResolvedValue({
    id: crawlId,
    url: 'https://example.com',
  });
  mockClient.getCrawlStatus.mockResolvedValue(stillScraping);

  const pending = executeCrawl({
    urlOrJobId: 'https://example.com',
    wait: true,
    timeout: 1, // seconds
    pollInterval: 0.001,
  });
  // Move the fake clock past the 1000 ms timeout plus one poll interval.
  await vi.advanceTimersByTimeAsync(1002);
  const { success, error } = await pending;

  expect(success).toBe(false);
  expect(error).toContain('Timeout after 1 seconds');
});
});

describe('Progress mode', () => {
Expand Down Expand Up @@ -526,9 +607,9 @@ describe('executeCrawl', () => {
});
});

it('should return error result when crawl fails', async () => {
it('should return error result when startCrawl fails in wait mode', async () => {
const errorMessage = 'Crawl timeout';
mockClient.crawl.mockRejectedValue(new Error(errorMessage));
mockClient.startCrawl.mockRejectedValue(new Error(errorMessage));

const result = await executeCrawl({
urlOrJobId: 'https://example.com',
Expand Down
69 changes: 35 additions & 34 deletions src/commands/crawl.ts
Original file line number Diff line number Diff line change
Expand Up @@ -108,58 +108,59 @@ export async function executeCrawl(
crawlOptions.timeout = timeout * 1000; // Convert to milliseconds
}

// Show progress if requested - use custom polling for better UX
if (options.progress) {
// Start crawl first
const response = await app.startCrawl(urlOrJobId, crawlOptions);
const jobId = response.id;
// Use manual polling for both --wait and --wait --progress.
// app.crawl() (the SDK convenience method) uses an internal mechanism
// incompatible with self-hosted instances, causing --wait to hang
// indefinitely without --progress. Polling via getCrawlStatus() works
// correctly against both cloud and self-hosted deployments.
const response = await app.startCrawl(urlOrJobId, crawlOptions);
const jobId = response.id;

if (options.progress) {
process.stderr.write(`Crawling ${urlOrJobId}...\n`);
process.stderr.write(`Job ID: ${jobId}\n`);
}

// Poll for status with progress updates
const pollMs = crawlOptions.pollInterval || 5000;
const startTime = Date.now();
const timeoutMs = timeout ? timeout * 1000 : undefined;
const pollMs = crawlOptions.pollInterval || 5000;
const startTime = Date.now();
const timeoutMs = timeout ? timeout * 1000 : undefined;

while (true) {
await new Promise((resolve) => setTimeout(resolve, pollMs));
while (true) {
await new Promise((resolve) => setTimeout(resolve, pollMs));

const status = await app.getCrawlStatus(jobId);
const status = await app.getCrawlStatus(jobId);

if (options.progress) {
// Show progress
process.stderr.write(
`\rProgress: ${status.completed}/${status.total} pages (${status.status})`
);
}

if (
status.status === 'completed' ||
status.status === 'failed' ||
status.status === 'cancelled'
) {
if (
status.status === 'completed' ||
status.status === 'failed' ||
status.status === 'cancelled'
) {
if (options.progress) {
process.stderr.write('\n');
return {
success: true,
data: status,
};
}
return {
success: true,
data: status,
};
}

// Check timeout
if (timeoutMs && Date.now() - startTime > timeoutMs) {
// Check timeout
if (timeoutMs && Date.now() - startTime > timeoutMs) {
if (options.progress) {
process.stderr.write('\n');
return {
success: false,
error: `Timeout after ${timeout} seconds. Crawl still in progress.`,
};
}
return {
success: false,
error: `Timeout after ${timeout} seconds. Crawl still in progress.`,
};
}
} else {
// Use SDK's built-in polling (no progress display)
const crawlJob = await app.crawl(urlOrJobId, crawlOptions);
return {
success: true,
data: crawlJob,
};
}
}

Expand Down