// tests/unit/web/crwl.test.ts

import { describe, it, expect, beforeAll, afterAll } from 'vitest';
import { crawlAndExtract } from '../../../src/web/crwl';
import path from 'path';
import os from 'os';
import crypto from 'crypto';
import { promises as fs } from 'fs';

// Note: This is an integration test, not a unit test, as it runs the actual Python script.
// Ensure Python and crawl4ai are installed in the environment where this test runs.
// Also ensure the python executable used by the test (`python` by default) can find crawl4ai.
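
// A minimal sketch (not part of the original suite) of how the requirement above might be
// enforced at runtime: gate the suite behind an opt-in environment variable so machines without
// Python/crawl4ai skip these tests instead of failing. The variable name RUN_CRAWL4AI_TESTS is
// hypothetical; Vitest's describe.skipIf could be used roughly like this:
//
//   const runIntegration = process.env.RUN_CRAWL4AI_TESTS === '1';
//   describe.skipIf(!runIntegration)('crawlAndExtract Integration Tests', () => { /* ... */ });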

// Integration tests for the crawlAndExtract wrapper

describe('crawlAndExtract Integration Tests', () => {

  // Setup temp files for schemas and configs
  let tempSchemaPath: string;
  let tempBrowserConfigPath: string;
  let tempCrawlerConfigPath: string;

  const testSchema = {
    name: "Test Products",
    baseSelector: "div.item",
    fields: [
      { name: "product_title", selector: "h2.title", type: "text" },
      { name: "product_price", selector: "span.price", type: "text" },
      { name: "product_link", selector: "a.link", type: "attribute", attribute: "href" }
    ]
  };
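
  // Illustrative sketch (not used by the assertions below): the shape each extracted item is
  // expected to take, derived from the field names declared in testSchema above:
  //
  //   interface ExtractedProduct {
  //     product_title: string;
  //     product_price: string;
  //     product_link: string;
  //   }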

  // Dummy configs - use empty objects as we don't know exact valid keys
  const testBrowserConfig = {};
  const testCrawlerConfig = {};

  beforeAll(async () => {
    // Create temp files before tests run
    const randomSuffix = crypto.randomBytes(6).toString('hex');
    tempSchemaPath = path.join(os.tmpdir(), `test-schema-${randomSuffix}.json`);
    tempBrowserConfigPath = path.join(os.tmpdir(), `test-browser-cfg-${randomSuffix}.json`);
    tempCrawlerConfigPath = path.join(os.tmpdir(), `test-crawler-cfg-${randomSuffix}.json`);

    try {
      await fs.writeFile(tempSchemaPath, JSON.stringify(testSchema, null, 2));
      await fs.writeFile(tempBrowserConfigPath, JSON.stringify(testBrowserConfig, null, 2));
      await fs.writeFile(tempCrawlerConfigPath, JSON.stringify(testCrawlerConfig, null, 2));
      console.log(`Created temp schema file: ${tempSchemaPath}`);
      console.log(`Created temp browser config: ${tempBrowserConfigPath}`);
      console.log(`Created temp crawler config: ${tempCrawlerConfigPath}`);
    } catch (err) {
      console.error("Failed to create temp files for tests:", err);
      // Cannot call afterAll directly, rely on the hook itself for cleanup
      throw err;
    }
  });

  afterAll(async () => {
    // Cleanup temp files after all tests run
    const filesToClean = [tempSchemaPath, tempBrowserConfigPath, tempCrawlerConfigPath];
    for (const filePath of filesToClean) {
      if (filePath) {
        try {
          await fs.unlink(filePath);
          console.log(`Cleaned up temp file: ${filePath}`);
        } catch (err: any) {
          // Ignore ENOENT (file already gone), warn on anything else
          if (err.code !== 'ENOENT') {
            console.warn(`Failed to clean up temp file ${filePath}:`, err);
          }
        }
      }
    }
  });

  // Test Markdown generation (basic). Hits a live site, so allow a generous timeout.
  const markdownTimeoutMs = 60000;
  it('should generate markdown for a given URL', async () => {
    const url = 'https://www.nbcnews.com/business';
    try {
      const result = await crawlAndExtract(url, {
        outputMode: 'markdown',
      });
      expect(result).toBeTypeOf('string');
      expect(result.length).toBeGreaterThan(100);
      console.log(`Markdown generated successfully for ${url}. Length: ${result.length}`);
    } catch (error: any) {
      console.error("Markdown generation test failed unexpectedly:", error);
      throw error;
    }
  }, markdownTimeoutMs);

  // Test JSON extraction using schema file (basic)
  const jsonTimeoutMs = 20000;
  it('should extract JSON using a schema file', async () => {
    // A raw:// URL embeds the HTML to crawl directly, so this test does not depend on the network.
    const dummyHtml = `
      <html><body>
        <div class='item'><h2 class='title'>P1</h2><span class='price'>$10</span><a class='link' href='/p1'>L1</a></div>
        <div class='item'><h2 class='title'>P2</h2><span class='price'>$20</span><a class='link' href='/p2'>L2</a></div>
      </body></html>`;
    const rawUrl = `raw://${dummyHtml}`;

    try {
      const result = await crawlAndExtract(rawUrl, {
        outputMode: 'json',
        schemaPath: tempSchemaPath,
        strategyType: 'css',
      });
      expect(result).toBeInstanceOf(Array);
      expect(result).toHaveLength(2);
      expect(result[0]).toEqual({ product_title: "P1", product_price: "$10", product_link: "/p1" });
      expect(result[1]).toEqual({ product_title: "P2", product_price: "$20", product_link: "/p2" });
      console.log(`JSON extracted successfully using schema file: ${tempSchemaPath}`);
    } catch (error: any) {
      console.error("JSON extraction test (schema file) failed unexpectedly:", error);
      throw error;
    }
  }, jsonTimeoutMs);

  // Test JSON extraction using schema file AND config files
  it('should extract JSON using schema and config files', async () => {
    const dummyHtml = `
      <html><body>
        <div class='item'><h2 class='title'>CfgP1</h2><span class='price'>$30</span><a class='link' href='/cfg/p1'>CfgL1</a></div>
      </body></html>`;
    const rawUrl = `raw://${dummyHtml}`;

    try {
      // This test primarily ensures the script runs without crashing due to config loading/passing
      const result = await crawlAndExtract(rawUrl, {
        outputMode: 'json',
        schemaPath: tempSchemaPath,
        browserConfigPath: tempBrowserConfigPath, // Pass browser config path
        crawlerConfigPath: tempCrawlerConfigPath, // Pass crawler config path
        strategyType: 'css',
      });
      // Basic validation of output - config effects are not asserted here
      expect(result).toBeInstanceOf(Array);
      expect(result).toHaveLength(1);
      expect(result[0]).toEqual({ product_title: "CfgP1", product_price: "$30", product_link: "/cfg/p1" });
      console.log(`JSON extracted successfully using schema and config files.`);
    } catch (error: any) {
      console.error("JSON extraction test (schema + config files) failed unexpectedly:", error);
      throw error;
    }
  }, jsonTimeoutMs);

  // TODO: Assert that browserConfigPath and crawlerConfigPath actually take effect.
  // The test above only confirms the script accepts the config files without crashing;
  // verifying they are applied would likely require checking stderr for specific log
  // messages indicating the configs were loaded/applied.

});