From 0ec7f89cbc544c14abde4d0a2c82bb463f2e7d51 Mon Sep 17 00:00:00 2001 From: babayaga Date: Thu, 10 Apr 2025 20:29:28 +0200 Subject: [PATCH] crawler : crwl 2/3 --- packages/kbot/src/web/crwl_script.py | 22 +++- packages/kbot/tests/unit/web/crwl.test.ts | 146 +++++++++++----------- 2 files changed, 91 insertions(+), 77 deletions(-) diff --git a/packages/kbot/src/web/crwl_script.py b/packages/kbot/src/web/crwl_script.py index 159024f1..53d257f1 100644 --- a/packages/kbot/src/web/crwl_script.py +++ b/packages/kbot/src/web/crwl_script.py @@ -36,8 +36,11 @@ async def main(url, schema_path, strategy_type, output_mode, output_file, # --- Prepare CrawlerRunConfig arguments --- run_config_kwargs = crawler_config.copy() # Start with crawler config file content - # Set cache mode - run_config_kwargs['cache_mode'] = CacheMode.BYPASS if bypass_cache else CacheMode.CACHE_FIRST + + # Set cache mode ONLY if bypassCache is true + if bypass_cache: + run_config_kwargs['cache_mode'] = CacheMode.BYPASS + # Otherwise, let crawl4ai use its default cache mode # Add extraction strategy if JSON mode if output_mode == 'json': @@ -61,9 +64,12 @@ async def main(url, schema_path, strategy_type, output_mode, output_file, sys.exit(1) # Create CrawlerRunConfig - # Note: Assumes crawler_config keys match CrawlerRunConfig parameters try: - config = CrawlerRunConfig(**run_config_kwargs) + # Only pass kwargs if the loaded config was not empty + if run_config_kwargs: + config = CrawlerRunConfig(**run_config_kwargs) + else: + config = CrawlerRunConfig() # Use default config if file was empty/not provided except TypeError as e: print(f"Error creating CrawlerRunConfig. Check crawler config file content or arguments: {e}", file=sys.stderr) sys.exit(1) @@ -74,8 +80,12 @@ async def main(url, schema_path, strategy_type, output_mode, output_file, try: # Create crawler instance - # Note: Assumes browser_config keys match AsyncWebCrawler parameters - crawler_instance = AsyncWebCrawler(**crawler_kwargs) + # Only pass kwargs if the loaded config was not empty + if crawler_kwargs: + crawler_instance = AsyncWebCrawler(**crawler_kwargs) + else: + # Should not happen as verbose is always added, but for safety: + crawler_instance = AsyncWebCrawler() # Attempt to redirect logger to stderr if not verbose (to keep final output clean) if not verbose: diff --git a/packages/kbot/tests/unit/web/crwl.test.ts b/packages/kbot/tests/unit/web/crwl.test.ts index 342c856e..89bb39bd 100644 --- a/packages/kbot/tests/unit/web/crwl.test.ts +++ b/packages/kbot/tests/unit/web/crwl.test.ts @@ -14,8 +14,11 @@ import { promises as fs } from 'fs'; describe('crawlAndExtract Integration Tests', () => { - // Setup for JSON test: create a temporary schema file + // Setup temp files for schemas and configs let tempSchemaPath: string; + let tempBrowserConfigPath: string; + let tempCrawlerConfigPath: string; + const testSchema = { name: "Test Products", baseSelector: "div.item", @@ -25,117 +28,118 @@ describe('crawlAndExtract Integration Tests', () => { { name: "product_link", selector: "a.link", type: "attribute", attribute: "href" } ] }; + // Dummy configs - use empty objects as we don't know exact valid keys + const testBrowserConfig = {}; + const testCrawlerConfig = {}; beforeAll(async () => { - // Create a temp file for the schema before tests run - const tempFileName = `test-schema-${crypto.randomBytes(6).toString('hex')}.json`; - tempSchemaPath = path.join(os.tmpdir(), tempFileName); + // Create temp files before tests run + const randomSuffix = crypto.randomBytes(6).toString('hex'); + tempSchemaPath = path.join(os.tmpdir(), `test-schema-${randomSuffix}.json`); + tempBrowserConfigPath = path.join(os.tmpdir(), `test-browser-cfg-${randomSuffix}.json`); + tempCrawlerConfigPath = path.join(os.tmpdir(), `test-crawler-cfg-${randomSuffix}.json`); + try { await fs.writeFile(tempSchemaPath, JSON.stringify(testSchema, null, 2)); + await fs.writeFile(tempBrowserConfigPath, JSON.stringify(testBrowserConfig, null, 2)); + await fs.writeFile(tempCrawlerConfigPath, JSON.stringify(testCrawlerConfig, null, 2)); console.log(`Created temp schema file: ${tempSchemaPath}`); + console.log(`Created temp browser config: ${tempBrowserConfigPath}`); + console.log(`Created temp crawler config: ${tempCrawlerConfigPath}`); } catch (err) { - console.error("Failed to create temp schema file for tests:", err); - throw err; // Fail setup if file creation fails + console.error("Failed to create temp files for tests:", err); + // Cannot call afterAll directly, rely on the hook itself for cleanup + throw err; } }); afterAll(async () => { - // Cleanup the temp schema file after all tests run - if (tempSchemaPath) { - try { - await fs.unlink(tempSchemaPath); - console.log(`Cleaned up temp schema file: ${tempSchemaPath}`); - } catch (err) { - console.warn(`Failed to clean up temp schema file ${tempSchemaPath}:`, err); + // Cleanup temp files after all tests run + const filesToClean = [tempSchemaPath, tempBrowserConfigPath, tempCrawlerConfigPath]; + for (const filePath of filesToClean) { + if (filePath) { + try { + await fs.unlink(filePath); + console.log(`Cleaned up temp file: ${filePath}`); + } catch (err: any) { + // Ignore ENOENT (file already gone), warn others + if (err.code !== 'ENOENT') { + console.warn(`Failed to clean up temp file ${filePath}:`, err); + } + } } } }); - // Test Markdown generation - // Increase timeout for this test as it involves network I/O and python script execution - // vitest default is 5000ms - const markdownTimeoutMs = 60000; // 60 seconds - it('should generate markdown with verbose and bypassCache flags', async () => { + // Test Markdown generation (basic) + const markdownTimeoutMs = 60000; + it('should generate markdown for a given URL', async () => { const url = 'https://www.nbcnews.com/business'; - let stderrOutput = ''; - const originalWarn = console.warn; // Store original console.warn - console.warn = (message) => { // Capture stderr from the wrapper - if (typeof message === 'string' && message.startsWith('Python script stderr:')) { - stderrOutput += message; - } - originalWarn.apply(console, [message]); // Call original warn - }; - try { const result = await crawlAndExtract(url, { outputMode: 'markdown', - verbose: true, // Test verbose flag - bypassCache: true, // Test bypassCache flag }); - console.warn = originalWarn; // Restore original console.warn - expect(result).toBeTypeOf('string'); expect(result.length).toBeGreaterThan(100); - // Check stderr for expected verbose log patterns (adjust patterns if needed) - expect(stderrOutput).toContain('[INIT]'); - expect(stderrOutput).toContain('[FETCH]'); - expect(stderrOutput).toContain('[SCRAPE]'); - expect(stderrOutput).toContain('[MARKDOWN]'); // Or similar markdown step log - expect(stderrOutput).toContain('[COMPLETE]'); - console.log(`Markdown generated (verbose) successfully for ${url}. Length: ${result.length}`); - } catch (error) { - console.warn = originalWarn; // Restore on error too - console.error("Markdown generation test (verbose) failed:", error); - throw error; + console.log(`Markdown generated successfully for ${url}. Length: ${result.length}`); + } catch (error: any) { + console.error("Markdown generation test failed unexpectedly:", error); + throw error; } }, markdownTimeoutMs); - // Test JSON extraction using schema file - // Increase timeout slightly as it still involves process spawning - const jsonTimeoutMs = 20000; // 20 seconds - it('should extract JSON using schema file with verbose flag', async () => { - // Using a simple, static HTML page for reliable JSON extraction testing - // This avoids relying on external site structure which can change. - // We'll use the raw:// scheme with dummy HTML passed directly to the python script. + // Test JSON extraction using schema file (basic) + const jsonTimeoutMs = 20000; + it('should extract JSON using a schema file', async () => { const dummyHtml = `

P1

$10L1

P2

$20L2
`; const rawUrl = `raw://${dummyHtml}`; - let stderrOutput = ''; - const originalWarn = console.warn; // Store original console.warn - console.warn = (message) => { // Capture stderr from the wrapper - if (typeof message === 'string' && message.startsWith('Python script stderr:')) { - stderrOutput += message; - } - originalWarn.apply(console, [message]); // Call original warn - }; try { const result = await crawlAndExtract(rawUrl, { outputMode: 'json', schemaPath: tempSchemaPath, strategyType: 'css', - verbose: true, // Test verbose flag }); - console.warn = originalWarn; // Restore original console.warn - expect(result).toBeInstanceOf(Array); expect(result).toHaveLength(2); expect(result[0]).toEqual({ product_title: "P1", product_price: "$10", product_link: "/p1" }); expect(result[1]).toEqual({ product_title: "P2", product_price: "$20", product_link: "/p2" }); - // Check stderr for expected verbose log patterns - expect(stderrOutput).toContain('[INIT]'); - expect(stderrOutput).toContain('[FETCH]'); - expect(stderrOutput).toContain('[SCRAPE]'); - expect(stderrOutput).toContain('[EXTRACT]'); // JSON extraction step log - expect(stderrOutput).toContain('[COMPLETE]'); - console.log(`JSON extracted (verbose) successfully using schema file: ${tempSchemaPath}`); - } catch (error) { - console.warn = originalWarn; // Restore on error too - console.error("JSON extraction test (verbose, schema file) failed:", error); - throw error; + console.log(`JSON extracted successfully using schema file: ${tempSchemaPath}`); + } catch (error: any) { + console.error("JSON extraction test (schema file) failed unexpectedly:", error); + throw error; + } + }, jsonTimeoutMs); + + // Test JSON extraction using schema file AND config files + it('should extract JSON using schema and config files', async () => { + const dummyHtml = ` + +

CfgP1

$30CfgL1
+ `; + const rawUrl = `raw://${dummyHtml}`; + + try { + // This test primarily ensures the script runs without crashing due to config loading/passing + const result = await crawlAndExtract(rawUrl, { + outputMode: 'json', + schemaPath: tempSchemaPath, + browserConfigPath: tempBrowserConfigPath, // Pass browser config path + crawlerConfigPath: tempCrawlerConfigPath, // Pass crawler config path + strategyType: 'css', + }); + // Basic validation of output - config effects are not asserted here + expect(result).toBeInstanceOf(Array); + expect(result).toHaveLength(1); + expect(result[0]).toEqual({ product_title: "CfgP1", product_price: "$30", product_link: "/cfg/p1" }); + console.log(`JSON extracted successfully using schema and config files.`); + } catch (error: any) { + console.error("JSON extraction test (schema + config files) failed unexpectedly:", error); + throw error; } }, jsonTimeoutMs);