crawler : crwl 2/3
parent 5994766841
commit 0ec7f89cbc
@@ -36,8 +36,11 @@ async def main(url, schema_path, strategy_type, output_mode, output_file,
    # --- Prepare CrawlerRunConfig arguments ---
    run_config_kwargs = crawler_config.copy()  # Start with crawler config file content
    # Set cache mode
    run_config_kwargs['cache_mode'] = CacheMode.BYPASS if bypass_cache else CacheMode.CACHE_FIRST

    # Set cache mode ONLY if bypassCache is true
    if bypass_cache:
        run_config_kwargs['cache_mode'] = CacheMode.BYPASS
    # Otherwise, let crawl4ai use its default cache mode

    # Add extraction strategy if JSON mode
    if output_mode == 'json':
@@ -61,9 +64,12 @@ async def main(url, schema_path, strategy_type, output_mode, output_file,
            sys.exit(1)

    # Create CrawlerRunConfig
    # Note: Assumes crawler_config keys match CrawlerRunConfig parameters
    try:
        # Only pass kwargs if the loaded config was not empty
        if run_config_kwargs:
            config = CrawlerRunConfig(**run_config_kwargs)
        else:
            config = CrawlerRunConfig()  # Use default config if file was empty/not provided
    except TypeError as e:
        print(f"Error creating CrawlerRunConfig. Check crawler config file content or arguments: {e}", file=sys.stderr)
        sys.exit(1)
@@ -74,8 +80,12 @@ async def main(url, schema_path, strategy_type, output_mode, output_file,
    try:
        # Create crawler instance
        # Note: Assumes browser_config keys match AsyncWebCrawler parameters
        # Only pass kwargs if the loaded config was not empty
        if crawler_kwargs:
            crawler_instance = AsyncWebCrawler(**crawler_kwargs)
        else:
            # Should not happen as verbose is always added, but for safety:
            crawler_instance = AsyncWebCrawler()

        # Attempt to redirect logger to stderr if not verbose (to keep final output clean)
        if not verbose:
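Taken together, the Python-side hunks above amount to the following flow: start from the crawler config file content, override cache_mode only when bypass_cache is requested, and fall back to a default CrawlerRunConfig when the config file was empty. This is a minimal consolidated sketch rather than the script itself; it assumes crawl4ai's CrawlerRunConfig / CacheMode API and mirrors the names used in the diff (crawler_config, bypass_cache, run_config_kwargs).

import sys
from crawl4ai import CrawlerRunConfig, CacheMode

def build_run_config(crawler_config: dict, bypass_cache: bool) -> CrawlerRunConfig:
    run_config_kwargs = crawler_config.copy()  # start from the crawler config file content
    if bypass_cache:
        run_config_kwargs['cache_mode'] = CacheMode.BYPASS  # otherwise keep crawl4ai's default cache mode
    try:
        # only pass kwargs when the loaded config was not empty
        return CrawlerRunConfig(**run_config_kwargs) if run_config_kwargs else CrawlerRunConfig()
    except TypeError as e:
        print(f"Error creating CrawlerRunConfig: {e}", file=sys.stderr)
        sys.exit(1)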
@@ -14,8 +14,11 @@ import { promises as fs } from 'fs';

describe('crawlAndExtract Integration Tests', () => {

  // Setup for JSON test: create a temporary schema file
  // Setup temp files for schemas and configs
  let tempSchemaPath: string;
  let tempBrowserConfigPath: string;
  let tempCrawlerConfigPath: string;

  const testSchema = {
    name: "Test Products",
    baseSelector: "div.item",
@@ -25,116 +28,117 @@ describe('crawlAndExtract Integration Tests', () => {
      { name: "product_link", selector: "a.link", type: "attribute", attribute: "href" }
    ]
  };
  // Dummy configs - use empty objects as we don't know exact valid keys
  const testBrowserConfig = {};
  const testCrawlerConfig = {};

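testSchema above follows crawl4ai's CSS-extraction schema shape (name, baseSelector, fields). As a hedged illustration only, this is roughly what the Python script presumably does with such a schema file when strategyType is 'css'; the exact wiring is not shown in these hunks, and the file path here is illustrative (the tests write testSchema to a temp file).

import json
from crawl4ai import CrawlerRunConfig
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy

with open("schema.json") as f:   # illustrative path
    schema = json.load(f)        # {"name": ..., "baseSelector": ..., "fields": [...]}

config = CrawlerRunConfig(extraction_strategy=JsonCssExtractionStrategy(schema))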
  beforeAll(async () => {
    // Create a temp file for the schema before tests run
    const tempFileName = `test-schema-${crypto.randomBytes(6).toString('hex')}.json`;
    tempSchemaPath = path.join(os.tmpdir(), tempFileName);
    // Create temp files before tests run
    const randomSuffix = crypto.randomBytes(6).toString('hex');
    tempSchemaPath = path.join(os.tmpdir(), `test-schema-${randomSuffix}.json`);
    tempBrowserConfigPath = path.join(os.tmpdir(), `test-browser-cfg-${randomSuffix}.json`);
    tempCrawlerConfigPath = path.join(os.tmpdir(), `test-crawler-cfg-${randomSuffix}.json`);

    try {
      await fs.writeFile(tempSchemaPath, JSON.stringify(testSchema, null, 2));
      await fs.writeFile(tempBrowserConfigPath, JSON.stringify(testBrowserConfig, null, 2));
      await fs.writeFile(tempCrawlerConfigPath, JSON.stringify(testCrawlerConfig, null, 2));
      console.log(`Created temp schema file: ${tempSchemaPath}`);
      console.log(`Created temp browser config: ${tempBrowserConfigPath}`);
      console.log(`Created temp crawler config: ${tempCrawlerConfigPath}`);
    } catch (err) {
      console.error("Failed to create temp schema file for tests:", err);
      throw err; // Fail setup if file creation fails
      console.error("Failed to create temp files for tests:", err);
      // Cannot call afterAll directly, rely on the hook itself for cleanup
      throw err;
    }
  });

  afterAll(async () => {
    // Cleanup the temp schema file after all tests run
    if (tempSchemaPath) {
    // Cleanup temp files after all tests run
    const filesToClean = [tempSchemaPath, tempBrowserConfigPath, tempCrawlerConfigPath];
    for (const filePath of filesToClean) {
      if (filePath) {
        try {
          await fs.unlink(tempSchemaPath);
          console.log(`Cleaned up temp schema file: ${tempSchemaPath}`);
        } catch (err) {
          console.warn(`Failed to clean up temp schema file ${tempSchemaPath}:`, err);
          await fs.unlink(filePath);
          console.log(`Cleaned up temp file: ${filePath}`);
        } catch (err: any) {
          // Ignore ENOENT (file already gone), warn others
          if (err.code !== 'ENOENT') {
            console.warn(`Failed to clean up temp file ${filePath}:`, err);
          }
        }
      }
    }
  });

  // Test Markdown generation
  // Increase timeout for this test as it involves network I/O and python script execution
  // vitest default is 5000ms
  const markdownTimeoutMs = 60000; // 60 seconds
  it('should generate markdown with verbose and bypassCache flags', async () => {
  // Test Markdown generation (basic)
  const markdownTimeoutMs = 60000;
  it('should generate markdown for a given URL', async () => {
    const url = 'https://www.nbcnews.com/business';
    let stderrOutput = '';
    const originalWarn = console.warn; // Store original console.warn
    console.warn = (message) => { // Capture stderr from the wrapper
      if (typeof message === 'string' && message.startsWith('Python script stderr:')) {
        stderrOutput += message;
      }
      originalWarn.apply(console, [message]); // Call original warn
    };

    try {
      const result = await crawlAndExtract(url, {
        outputMode: 'markdown',
        verbose: true, // Test verbose flag
        bypassCache: true, // Test bypassCache flag
      });
      console.warn = originalWarn; // Restore original console.warn

      expect(result).toBeTypeOf('string');
      expect(result.length).toBeGreaterThan(100);
      // Check stderr for expected verbose log patterns (adjust patterns if needed)
      expect(stderrOutput).toContain('[INIT]');
      expect(stderrOutput).toContain('[FETCH]');
      expect(stderrOutput).toContain('[SCRAPE]');
      expect(stderrOutput).toContain('[MARKDOWN]'); // Or similar markdown step log
      expect(stderrOutput).toContain('[COMPLETE]');
      console.log(`Markdown generated (verbose) successfully for ${url}. Length: ${result.length}`);
    } catch (error) {
      console.warn = originalWarn; // Restore on error too
      console.error("Markdown generation test (verbose) failed:", error);
      console.log(`Markdown generated successfully for ${url}. Length: ${result.length}`);
    } catch (error: any) {
      console.error("Markdown generation test failed unexpectedly:", error);
      throw error;
    }
  }, markdownTimeoutMs);

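For context, the markdown mode exercised by this test corresponds roughly to the following direct crawl4ai call on the Python side. This is a sketch under the assumption of the current crawl4ai API (result.markdown holds the generated markdown); it is not the wrapper's actual code.

import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig

async def demo_markdown() -> None:
    # The wrapper's bypassCache option maps to cache_mode=CacheMode.BYPASS (see the Python change above).
    config = CrawlerRunConfig()
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url="https://www.nbcnews.com/business", config=config)
        print(str(result.markdown)[:200])  # the test only asserts the markdown is a reasonably long string

asyncio.run(demo_markdown())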
  // Test JSON extraction using schema file
  // Increase timeout slightly as it still involves process spawning
  const jsonTimeoutMs = 20000; // 20 seconds
  it('should extract JSON using schema file with verbose flag', async () => {
    // Using a simple, static HTML page for reliable JSON extraction testing
    // This avoids relying on external site structure which can change.
    // We'll use the raw:// scheme with dummy HTML passed directly to the python script.
  // Test JSON extraction using schema file (basic)
  const jsonTimeoutMs = 20000;
  it('should extract JSON using a schema file', async () => {
    const dummyHtml = `
      <html><body>
        <div class='item'><h2 class='title'>P1</h2><span class='price'>$10</span><a class='link' href='/p1'>L1</a></div>
        <div class='item'><h2 class='title'>P2</h2><span class='price'>$20</span><a class='link' href='/p2'>L2</a></div>
      </body></html>`;
    const rawUrl = `raw://${dummyHtml}`;
    let stderrOutput = '';
    const originalWarn = console.warn; // Store original console.warn
    console.warn = (message) => { // Capture stderr from the wrapper
      if (typeof message === 'string' && message.startsWith('Python script stderr:')) {
        stderrOutput += message;
      }
      originalWarn.apply(console, [message]); // Call original warn
    };

    try {
      const result = await crawlAndExtract(rawUrl, {
        outputMode: 'json',
        schemaPath: tempSchemaPath,
        strategyType: 'css',
        verbose: true, // Test verbose flag
      });
      console.warn = originalWarn; // Restore original console.warn

      expect(result).toBeInstanceOf(Array);
      expect(result).toHaveLength(2);
      expect(result[0]).toEqual({ product_title: "P1", product_price: "$10", product_link: "/p1" });
      expect(result[1]).toEqual({ product_title: "P2", product_price: "$20", product_link: "/p2" });
      // Check stderr for expected verbose log patterns
      expect(stderrOutput).toContain('[INIT]');
      expect(stderrOutput).toContain('[FETCH]');
      expect(stderrOutput).toContain('[SCRAPE]');
      expect(stderrOutput).toContain('[EXTRACT]'); // JSON extraction step log
      expect(stderrOutput).toContain('[COMPLETE]');
      console.log(`JSON extracted (verbose) successfully using schema file: ${tempSchemaPath}`);
    } catch (error) {
      console.warn = originalWarn; // Restore on error too
      console.error("JSON extraction test (verbose, schema file) failed:", error);
      console.log(`JSON extracted successfully using schema file: ${tempSchemaPath}`);
    } catch (error: any) {
      console.error("JSON extraction test (schema file) failed unexpectedly:", error);
      throw error;
    }
  }, jsonTimeoutMs);

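The raw:// URL used in this test is crawl4ai's scheme for crawling an in-memory HTML string instead of fetching a page over the network. A minimal Python sketch of the equivalent direct crawl4ai call follows; the shortened schema and HTML here are illustrative, and extracted_content is assumed to come back as a JSON string (as in crawl4ai's CrawlResult).

import asyncio
import json
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy

async def demo_raw_extract() -> None:
    schema = {
        "name": "Test Products",
        "baseSelector": "div.item",
        "fields": [{"name": "product_title", "selector": "h2.title", "type": "text"}],
    }
    html = "<html><body><div class='item'><h2 class='title'>P1</h2></div></body></html>"
    config = CrawlerRunConfig(extraction_strategy=JsonCssExtractionStrategy(schema))
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url=f"raw://{html}", config=config)
        print(json.loads(result.extracted_content))  # e.g. [{"product_title": "P1"}]

asyncio.run(demo_raw_extract())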
  // Test JSON extraction using schema file AND config files
  it('should extract JSON using schema and config files', async () => {
    const dummyHtml = `
      <html><body>
        <div class='item'><h2 class='title'>CfgP1</h2><span class='price'>$30</span><a class='link' href='/cfg/p1'>CfgL1</a></div>
      </body></html>`;
    const rawUrl = `raw://${dummyHtml}`;

    try {
      // This test primarily ensures the script runs without crashing due to config loading/passing
      const result = await crawlAndExtract(rawUrl, {
        outputMode: 'json',
        schemaPath: tempSchemaPath,
        browserConfigPath: tempBrowserConfigPath, // Pass browser config path
        crawlerConfigPath: tempCrawlerConfigPath, // Pass crawler config path
        strategyType: 'css',
      });
      // Basic validation of output - config effects are not asserted here
      expect(result).toBeInstanceOf(Array);
      expect(result).toHaveLength(1);
      expect(result[0]).toEqual({ product_title: "CfgP1", product_price: "$30", product_link: "/cfg/p1" });
      console.log(`JSON extracted successfully using schema and config files.`);
    } catch (error: any) {
      console.error("JSON extraction test (schema + config files) failed unexpectedly:", error);
      throw error;
    }
  }, jsonTimeoutMs);
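The config-file test only checks that the wrapper survives being handed browser and crawler config paths; their effect on the crawl is not asserted. On the Python side the diff passes the loaded browser config as keyword arguments to AsyncWebCrawler, so a non-empty config file would be interpreted roughly as sketched below. The file path and any keys you might put in it are assumptions for illustration, not values used by the tests (which write empty objects).

import json
from crawl4ai import AsyncWebCrawler

# Hypothetical browser config file; the tests only write an empty object {}.
with open("browser-config.json") as f:  # illustrative path; the tests use a temp file
    browser_kwargs = json.load(f)

# Mirrors the diff: keys must match AsyncWebCrawler parameters, otherwise a TypeError is raised.
crawler_instance = AsyncWebCrawler(**browser_kwargs) if browser_kwargs else AsyncWebCrawler()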