From 599476684134601d6bf1d1649a8df656802ec18a Mon Sep 17 00:00:00 2001 From: babayaga Date: Thu, 10 Apr 2025 20:22:31 +0200 Subject: [PATCH] crawler : crwl 2/3 --- packages/kbot/docs/report.md | 34 ++++ packages/kbot/package.json | 1 + packages/kbot/src/web/crwl.ts | 206 ++++++++++++++++++++++ packages/kbot/src/web/crwl_script.py | 195 ++++++++++++++++++++ packages/kbot/tests/unit/web/crwl.test.ts | 146 +++++++++++++++ 5 files changed, 582 insertions(+) create mode 100644 packages/kbot/docs/report.md create mode 100644 packages/kbot/src/web/crwl.ts create mode 100644 packages/kbot/src/web/crwl_script.py create mode 100644 packages/kbot/tests/unit/web/crwl.test.ts diff --git a/packages/kbot/docs/report.md b/packages/kbot/docs/report.md new file mode 100644 index 00000000..094a6a11 --- /dev/null +++ b/packages/kbot/docs/report.md @@ -0,0 +1,34 @@ +# KBot Codebase Analysis Report (src/**/*.ts) + +## Summary of Findings + +1. **Code Structure:** The codebase is organized into directories like `commands`, `models`, `utils`, `examples`, etc., which seems logical. Core logic appears distributed across these, with `iterator.ts` and `async-iterator.ts` handling complex data transformations and `source.ts` managing file/URL processing. +2. **`TODO` Markers:** A single `TODO` was found in `src/source.ts` suggesting future support for OpenAI vector stores. This is a feature note, not a bug. +3. **Logging:** `console.log` statements are prevalent, but heavily concentrated within the `src/examples/` directory, which is expected. Core files seem to use a `tslog` logger (`logger` instance), which is good practice. Ensure no temporary `console.log` calls remain in production code paths. +4. **Error Handling:** Numerous generic `try...catch` blocks exist (e.g., `catch (error)`, `catch (e)`). Many do not explicitly type the caught error, defaulting to `any` or leaving it untyped. This can obscure the nature of errors during runtime. 
`src/config.ts` explicitly uses `catch (error: any)`. +5. **Type Safety (`any` usage):** The type `any` is used frequently throughout the codebase (`zod_schema.ts`, `types.ts`, `iterator.ts`, command files, etc.). This bypasses TypeScript's static type checking, potentially hiding type-related bugs and making refactoring harder. +6. **Dependencies:** The project utilizes local `@polymech` packages and standard libraries like `openai`, `zod`, `axios`, `marked`, `unified`, `yargs`, etc., suitable for its purpose. +7. **Complexity:** Files like `iterator.ts` handle complex logic involving data iteration, transformation, caching, and asynchronous operations (LLM calls). + +## Potential Improvements & Suggestions + +1. **Reduce `any` Usage (High Priority):** + * **Action:** Systematically replace `any` with specific types (interfaces, types derived from Zod schemas) or `unknown`. + * **Benefit:** Improves type safety, catches errors at compile time, enhances code maintainability and refactoring confidence. + * **Focus Areas:** `types.ts` (callback definitions), `iterator.ts`/`iterator-cache.ts` (data handling, cache keys/values), command handlers (`run*.ts`), `zod_schema.ts`, utility functions. +2. **Improve Error Handling:** + * **Action:** Type caught errors using `catch (error: unknown)` and perform type checks (e.g., `if (error instanceof Error) { ... }`). Replace `catch (error: any)` in `src/config.ts`. + * **Benefit:** Safer error handling, prevents accessing non-existent properties on error objects. + * **Consideration:** Introduce custom error classes for specific failure scenarios if needed. +3. **Leverage Zod:** + * **Action:** Ensure Zod schemas (`src/zod_schema.ts`, `src/zod_types.ts`) comprehensively define expected data structures, especially for external inputs (config, API responses). Use `schema.parse` or `schema.safeParse` consistently at boundaries. + * **Benefit:** Enhances runtime safety by validating data against defined schemas. +4. 
**Refactor Complex Code:** + * **Action:** Review `iterator.ts`, `async-iterator.ts`, and potentially large command files (`src/commands/run.ts`) for opportunities to break down large functions or simplify logic. + * **Benefit:** Improves readability and testability. +5. **Standardize Logging:** + * **Action:** Ensure all core logic uses the configured `tslog` logger instead of `console.log`. Remove any remaining debug `console.log`s outside the `examples` directory. + * **Benefit:** Consistent logging output, easier log management. +6. **Configuration Loading (`config.ts`):** + * **Action:** Avoid the `as any` type assertion when loading the default configuration. Ensure the `CONFIG_DEFAULT` function returns a type-compatible object or validate its output. + * **Benefit:** Improves type safety during configuration loading. \ No newline at end of file diff --git a/packages/kbot/package.json b/packages/kbot/package.json index 4d5c866b..3b82efda 100644 --- a/packages/kbot/package.json +++ b/packages/kbot/package.json @@ -32,6 +32,7 @@ "test:tools": "vitest run tests/unit/tools.test.ts", "test:coding": "vitest run tests/unit/coding.test.ts", "test:web": "vitest run tests/unit/web.test.ts", + "test:web:crwl": "vitest run tests/unit/web/crwl.test.ts", "test:files": "vitest run tests/unit/files.test.ts", "test:research": "vitest run tests/unit/research.test.ts", "test:core": "vitest run tests/unit/core", diff --git a/packages/kbot/src/web/crwl.ts b/packages/kbot/src/web/crwl.ts new file mode 100644 index 00000000..6bb21397 --- /dev/null +++ b/packages/kbot/src/web/crwl.ts @@ -0,0 +1,206 @@ +import { spawn } from 'child_process'; +import path from 'path'; +import os from 'os'; +import crypto from 'crypto'; +import { promises as fs } from 'fs'; + +// Define the structure for the crawl4ai schema based on the documentation +// This provides type safety for the schema object +interface Crawl4aiField { + name: string; + selector: string; + type: 'text' | 'attribute' | 'html' | 
'regex' | string; // Allow string for other types + attribute?: string; // Optional for 'attribute' type + default?: any; // Optional default value + // TODO: Add support for nested schemas if required by your use case +} + +interface Crawl4aiSchema { + name: string; + baseSelector: string; + baseFields?: Crawl4aiField[]; // Optional base fields extracted from the container + fields: Crawl4aiField[]; + // TODO: Add other schema properties like 'nested' if needed +} + +// Base options interface with new config paths and flags +interface BaseCrawlOptions { + outputMode: 'json' | 'markdown'; + pythonExecutable?: string; + scriptPath?: string; + browserConfigPath?: string; // Path to browser config (JSON) + crawlerConfigPath?: string; // Path to crawler config (JSON) + bypassCache?: boolean; // Corresponds to --bypass-cache + verbose?: boolean; // Corresponds to -v, --verbose +} + +// Options specific to JSON output mode (schema is now path) +interface JsonCrawlOptions extends BaseCrawlOptions { + outputMode: 'json'; + schemaPath: string; // Path to schema file (required for JSON mode) + strategyType?: 'css' | 'xpath'; // Default to 'css' if not provided +} + +// Options specific to Markdown output mode +interface MarkdownCrawlOptions extends BaseCrawlOptions { + outputMode: 'markdown'; + schemaPath?: never; + strategyType?: never; +} + +// Combined type for the crawlAndExtract function options +export type CrawlOptions = JsonCrawlOptions | MarkdownCrawlOptions; + +/** + * Executes the crawl4ai Python script with extended options, using a temp file for output. + * Assumes 'crawl4ai' is installed in the Python environment accessible by `pythonExecutable`. + * + * @param url The URL to crawl and extract data from. + * @param options Configuration options including output mode, config paths, flags, etc. + * @returns A promise that resolves with the extracted JSON data (parsed) or the generated Markdown string. 
+ * @throws An error if the Python script fails or returns an error message on stderr.
+ */
+export async function crawlAndExtract(url: string, options: CrawlOptions): Promise<any> {
+  const {
+    outputMode,
+    pythonExecutable = 'python',
+    scriptPath = path.resolve(__dirname, 'crwl_script.py'),
+    browserConfigPath,
+    crawlerConfigPath,
+    bypassCache,
+    verbose
+  } = options;
+
+  if (!url) {
+    return Promise.reject(new Error('URL parameter is required.'));
+  }
+
+  const tempFileName = `crwl-${crypto.randomBytes(8).toString('hex')}.tmp`;
+  const tempFilePath = path.join(os.tmpdir(), tempFileName);
+
+  // Base arguments
+  const args = [
+    scriptPath,
+    url,
+    '--output-mode', outputMode,
+    '--output-file', tempFilePath
+  ];
+
+  // Add JSON-specific options
+  if (outputMode === 'json') {
+    if (!options.schemaPath) {
+      // Ensure schemaPath is provided for JSON mode
+      return Promise.reject(new Error('schemaPath is required for JSON output mode.'));
+    }
+    args.push('--schema', options.schemaPath);
+    const strategyType = options.strategyType || 'css';
+    args.push('--strategy-type', strategyType);
+  }
+
+  // Add optional config paths
+  if (browserConfigPath) {
+    args.push('--browser-config', browserConfigPath);
+  }
+  if (crawlerConfigPath) {
+    args.push('--crawler-config', crawlerConfigPath);
+  }
+
+  // Add optional flags
+  if (bypassCache) {
+    args.push('--bypass-cache');
+  }
+  if (verbose) {
+    args.push('--verbose');
+  }
+
+  return new Promise(async (resolve, reject) => {
+    console.log(`Spawning: ${pythonExecutable} ${args.join(' ')} (output to: ${tempFilePath})`);
+    const env = { ...process.env, PYTHONIOENCODING: 'UTF-8' };
+
+    const pythonProcess = spawn(pythonExecutable, args, {
+      stdio: ['pipe', 'pipe', 'pipe'],
+      env: env
+    });
+
+    let stderrData = '';
+    pythonProcess.stderr.setEncoding('utf8');
+    pythonProcess.stderr.on('data', (data) => {
+      stderrData += data;
+    });
+
+    pythonProcess.on('close', async (code) => {
+      console.log(`Python script exited with code ${code}`);
+ if (stderrData) { + console.warn(`Python script stderr:\n${stderrData}`); + } + + if (code !== 0) { + try { await fs.unlink(tempFilePath); } catch (cleanupErr) { console.warn(`Failed to cleanup temp file ${tempFilePath}: ${cleanupErr}`); } + return reject(new Error(`Python script exited with code ${code}. Stderr: ${stderrData || 'None'}`)); + } + + try { + const fileContent = await fs.readFile(tempFilePath, 'utf8'); + try { await fs.unlink(tempFilePath); } catch (cleanupErr) { console.warn(`Failed to cleanup temp file ${tempFilePath}: ${cleanupErr}`); } + + if (outputMode === 'json') { + try { + const jsonData = JSON.parse(fileContent); + resolve(jsonData); + } catch (parseError: any) { + reject(new Error(`Failed to parse JSON from temp file: ${parseError.message}. File content length: ${fileContent?.length || 0}. Stderr: ${stderrData || 'None'}`)); + } + } else { + resolve(fileContent); + } + } catch (readFileError: any) { + try { await fs.unlink(tempFilePath); } catch (cleanupErr) { console.warn(`Failed to cleanup temp file ${tempFilePath}: ${cleanupErr}`); } + reject(new Error(`Failed to read output file ${tempFilePath}: ${readFileError.message}. Stderr: ${stderrData || 'None'}`)); + } + }); + + pythonProcess.on('error', (err) => { + reject(new Error(`Failed to start Python script '${pythonExecutable}': ${err.message}`)); + }); + }); +} + +// Example Usage (remove or comment out in production) +async function example() { + // Example 1: JSON Extraction using schema file + const schemaFilePath = path.join(__dirname, 'example-schema.json'); // Create this file! 
+ const schemaContent = { + name: "Example Extraction", + baseSelector: "div", + fields: [{ name: "title", selector: "h1", type: "text" }] + }; + try { + await fs.writeFile(schemaFilePath, JSON.stringify(schemaContent, null, 2)); + console.log(`Attempting JSON crawl on https://example.com using schema file...`); + const jsonData = await crawlAndExtract('https://example.com', { + outputMode: 'json', + schemaPath: schemaFilePath, // Pass path to schema + // bypassCache: true, // Example flag + // verbose: true // Example flag + }); + console.log('Extracted JSON data:', JSON.stringify(jsonData, null, 2)); + } catch (error) { + console.error('JSON Crawling failed:', error); + } finally { + try { await fs.unlink(schemaFilePath); } catch {} // Cleanup schema file + } + + // Example 2: Markdown Generation + const markdownTestUrl = 'https://www.nbcnews.com/business'; + try { + console.log(`\nAttempting Markdown crawl on ${markdownTestUrl}...`); + const markdownData = await crawlAndExtract(markdownTestUrl, { + outputMode: 'markdown' + }); + console.log('Generated Markdown (first 500 chars):', markdownData.substring(0, 500) + '...'); + } catch (error) { + console.error('Markdown Crawling failed:', error); + } +} + +// example(); // Uncomment to run the example function \ No newline at end of file diff --git a/packages/kbot/src/web/crwl_script.py b/packages/kbot/src/web/crwl_script.py new file mode 100644 index 00000000..159024f1 --- /dev/null +++ b/packages/kbot/src/web/crwl_script.py @@ -0,0 +1,195 @@ +import sys +import json +import argparse +import asyncio +import os +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode +from crawl4ai.extraction_strategy import JsonCssExtractionStrategy, JsonXPathExtractionStrategy + +def load_json_config(config_path): + """Loads a JSON configuration file.""" + if not config_path or not os.path.exists(config_path): + # print(f"Warning: Config file not found: {config_path}", file=sys.stderr) + return {} + try: + with 
open(config_path, 'r', encoding='utf-8') as f: + return json.load(f) + except json.JSONDecodeError as e: + print(f"Error: Invalid JSON in config file '{config_path}': {e}", file=sys.stderr) + sys.exit(1) + except IOError as e: + print(f"Error: Could not read config file '{config_path}': {e}", file=sys.stderr) + sys.exit(1) + +async def main(url, schema_path, strategy_type, output_mode, output_file, + browser_config_path, crawler_config_path, bypass_cache, verbose): + + output_dir = os.path.dirname(output_file) + if output_dir and not os.path.exists(output_dir): + print(f"Error: Output directory does not exist: {output_dir}", file=sys.stderr) + sys.exit(1) + + # Load configurations from files + schema_config = load_json_config(schema_path) # Used only if output_mode is json + browser_config = load_json_config(browser_config_path) + crawler_config = load_json_config(crawler_config_path) + + # --- Prepare CrawlerRunConfig arguments --- + run_config_kwargs = crawler_config.copy() # Start with crawler config file content + # Set cache mode + run_config_kwargs['cache_mode'] = CacheMode.BYPASS if bypass_cache else CacheMode.CACHE_FIRST + + # Add extraction strategy if JSON mode + if output_mode == 'json': + if not schema_config: + # If --schema path wasn't provided or file was empty/invalid, but mode is json + print(f"Error: Schema file ('{schema_path}') is required and must contain valid JSON for json output mode.", file=sys.stderr) + sys.exit(1) + if strategy_type == 'css': + extraction_strategy = JsonCssExtractionStrategy(schema_config, verbose=verbose) + elif strategy_type == 'xpath': + extraction_strategy = JsonXPathExtractionStrategy(schema_config, verbose=verbose) + else: + # This case should ideally be caught by argparse choices, but added for safety + print(f"Error: Invalid strategy type '{strategy_type}'. 
Use 'css' or 'xpath'.", file=sys.stderr) + sys.exit(1) + run_config_kwargs['extraction_strategy'] = extraction_strategy + elif output_mode == 'markdown': + pass # Markdown handled by default or potentially via crawler_config + else: + print(f"Error: Invalid output mode '{output_mode}'. Use 'json' or 'markdown'.", file=sys.stderr) + sys.exit(1) + + # Create CrawlerRunConfig + # Note: Assumes crawler_config keys match CrawlerRunConfig parameters + try: + config = CrawlerRunConfig(**run_config_kwargs) + except TypeError as e: + print(f"Error creating CrawlerRunConfig. Check crawler config file content or arguments: {e}", file=sys.stderr) + sys.exit(1) + + # --- Prepare AsyncWebCrawler arguments --- + crawler_kwargs = browser_config.copy() # Start with browser config file content + crawler_kwargs['verbose'] = verbose + + try: + # Create crawler instance + # Note: Assumes browser_config keys match AsyncWebCrawler parameters + crawler_instance = AsyncWebCrawler(**crawler_kwargs) + + # Attempt to redirect logger to stderr if not verbose (to keep final output clean) + if not verbose: + try: + if hasattr(crawler_instance, 'logger') and hasattr(crawler_instance.logger, 'set_output_stream'): + crawler_instance.logger.set_output_stream(sys.stderr) + elif hasattr(crawler_instance, 'logger') and hasattr(crawler_instance.logger, 'handlers'): + for handler in crawler_instance.logger.handlers: + if hasattr(handler, 'setStream'): handler.setStream(sys.stderr) + elif hasattr(handler, 'stream'): handler.stream = sys.stderr + except Exception as log_config_err: + # Non-fatal warning if redirection fails + print(f"Warning: Could not redirect non-verbose crawler logger to stderr: {log_config_err}", file=sys.stderr) + + # Run the crawler + async with crawler_instance as crawler: + result = await crawler.arun( + url=url, + config=config + ) + + if not result.success: + print(f"Error: Crawl failed for URL '{url}': {result.error_message}", file=sys.stderr) + sys.exit(1) + + # Write output to 
the specified file + try: + with open(output_file, 'w', encoding='utf-8') as f: + if output_mode == 'json': + json_output = result.extracted_content if result.extracted_content is not None else 'null' + f.write(json_output) + elif output_mode == 'markdown': + markdown_output = result.markdown if result.markdown is not None else '' + f.write(markdown_output) + sys.exit(0) # Success + except IOError as e: + print(f"Error: Could not write to output file '{output_file}': {e}", file=sys.stderr) + sys.exit(1) + + except TypeError as e: + print(f"Error creating AsyncWebCrawler. Check browser config file content or arguments: {e}", file=sys.stderr) + sys.exit(1) + except Exception as e: + # Force stderr to UTF-8 as well for error messages + try: sys.stderr.reconfigure(encoding='utf-8') + except Exception: pass + print(f"Error: An unexpected error occurred during processing: {e}", file=sys.stderr) + import traceback + traceback.print_exc(file=sys.stderr) + sys.exit(1) + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description='Crawl a URL and write content to a file using crawl4ai library.') + parser.add_argument('url', help='The URL to crawl.') + parser.add_argument( + '-s', '--schema', + dest='schema_path', + help='Path to JSON schema file for extraction (required for json output mode).', + default=None + ) + parser.add_argument( + '--strategy-type', + help="Extraction strategy type ('css' or 'xpath'). 
Used only with --schema.", + choices=['css', 'xpath'], + default='css' + ) + parser.add_argument( + '-o', '--output-mode', + help="Output mode ('json' or 'markdown').", + choices=['json', 'markdown'], + required=True + ) + parser.add_argument( + '-O', '--output-file', + dest='output_file', + help="Path to write the output file.", + required=True + ) + parser.add_argument( + '-B', '--browser-config', + dest='browser_config_path', + help='Path to browser config file (JSON format).', + default=None + ) + parser.add_argument( + '-C', '--crawler-config', + dest='crawler_config_path', + help='Path to crawler config file (JSON format).', + default=None + ) + parser.add_argument( + '--bypass-cache', + action='store_true', + help='Bypass cache when crawling.' + ) + parser.add_argument( + '-v', '--verbose', + action='store_true', + help='Enable verbose logging.' + ) + + args = parser.parse_args() + + if args.output_mode == 'json' and not args.schema_path: + parser.error('--schema is required when --output-mode is json') + + asyncio.run(main( + url=args.url, + schema_path=args.schema_path, + strategy_type=args.strategy_type, + output_mode=args.output_mode, + output_file=args.output_file, + browser_config_path=args.browser_config_path, + crawler_config_path=args.crawler_config_path, + bypass_cache=args.bypass_cache, + verbose=args.verbose + )) \ No newline at end of file diff --git a/packages/kbot/tests/unit/web/crwl.test.ts b/packages/kbot/tests/unit/web/crwl.test.ts new file mode 100644 index 00000000..342c856e --- /dev/null +++ b/packages/kbot/tests/unit/web/crwl.test.ts @@ -0,0 +1,146 @@ +// tests/unit/web/crwl.test.ts +import { describe, it, expect, beforeAll, afterAll } from 'vitest'; +import { crawlAndExtract } from '../../../src/web/crwl'; +import path from 'path'; +import os from 'os'; +import crypto from 'crypto'; +import { promises as fs } from 'fs'; + +// Note: This is an integration test, not a unit test, as it runs the actual Python script. 
+// Ensure Python and crawl4ai are installed in the environment where this test runs. +// Also ensure the python executable used by the test (`python` by default) can find crawl4ai. + +// Integration tests for the crawlAndExtract wrapper + +describe('crawlAndExtract Integration Tests', () => { + + // Setup for JSON test: create a temporary schema file + let tempSchemaPath: string; + const testSchema = { + name: "Test Products", + baseSelector: "div.item", + fields: [ + { name: "product_title", selector: "h2.title", type: "text" }, + { name: "product_price", selector: "span.price", type: "text" }, + { name: "product_link", selector: "a.link", type: "attribute", attribute: "href" } + ] + }; + + beforeAll(async () => { + // Create a temp file for the schema before tests run + const tempFileName = `test-schema-${crypto.randomBytes(6).toString('hex')}.json`; + tempSchemaPath = path.join(os.tmpdir(), tempFileName); + try { + await fs.writeFile(tempSchemaPath, JSON.stringify(testSchema, null, 2)); + console.log(`Created temp schema file: ${tempSchemaPath}`); + } catch (err) { + console.error("Failed to create temp schema file for tests:", err); + throw err; // Fail setup if file creation fails + } + }); + + afterAll(async () => { + // Cleanup the temp schema file after all tests run + if (tempSchemaPath) { + try { + await fs.unlink(tempSchemaPath); + console.log(`Cleaned up temp schema file: ${tempSchemaPath}`); + } catch (err) { + console.warn(`Failed to clean up temp schema file ${tempSchemaPath}:`, err); + } + } + }); + + // Test Markdown generation + // Increase timeout for this test as it involves network I/O and python script execution + // vitest default is 5000ms + const markdownTimeoutMs = 60000; // 60 seconds + it('should generate markdown with verbose and bypassCache flags', async () => { + const url = 'https://www.nbcnews.com/business'; + let stderrOutput = ''; + const originalWarn = console.warn; // Store original console.warn + console.warn = (message) => { 
// Capture stderr from the wrapper + if (typeof message === 'string' && message.startsWith('Python script stderr:')) { + stderrOutput += message; + } + originalWarn.apply(console, [message]); // Call original warn + }; + + try { + const result = await crawlAndExtract(url, { + outputMode: 'markdown', + verbose: true, // Test verbose flag + bypassCache: true, // Test bypassCache flag + }); + console.warn = originalWarn; // Restore original console.warn + + expect(result).toBeTypeOf('string'); + expect(result.length).toBeGreaterThan(100); + // Check stderr for expected verbose log patterns (adjust patterns if needed) + expect(stderrOutput).toContain('[INIT]'); + expect(stderrOutput).toContain('[FETCH]'); + expect(stderrOutput).toContain('[SCRAPE]'); + expect(stderrOutput).toContain('[MARKDOWN]'); // Or similar markdown step log + expect(stderrOutput).toContain('[COMPLETE]'); + console.log(`Markdown generated (verbose) successfully for ${url}. Length: ${result.length}`); + } catch (error) { + console.warn = originalWarn; // Restore on error too + console.error("Markdown generation test (verbose) failed:", error); + throw error; + } + }, markdownTimeoutMs); + + // Test JSON extraction using schema file + // Increase timeout slightly as it still involves process spawning + const jsonTimeoutMs = 20000; // 20 seconds + it('should extract JSON using schema file with verbose flag', async () => { + // Using a simple, static HTML page for reliable JSON extraction testing + // This avoids relying on external site structure which can change. + // We'll use the raw:// scheme with dummy HTML passed directly to the python script. + const dummyHtml = ` + +

+          <div class="item">
+            <h2 class="title">P1</h2>
+            <span class="price">$10</span><a class="link" href="/p1">L1</a>
+          </div>
+          <div class="item">
+            <h2 class="title">P2</h2>
+            <span class="price">$20</span><a class="link" href="/p2">L2</a>
+          </div>
+ `; + const rawUrl = `raw://${dummyHtml}`; + let stderrOutput = ''; + const originalWarn = console.warn; // Store original console.warn + console.warn = (message) => { // Capture stderr from the wrapper + if (typeof message === 'string' && message.startsWith('Python script stderr:')) { + stderrOutput += message; + } + originalWarn.apply(console, [message]); // Call original warn + }; + + try { + const result = await crawlAndExtract(rawUrl, { + outputMode: 'json', + schemaPath: tempSchemaPath, + strategyType: 'css', + verbose: true, // Test verbose flag + }); + console.warn = originalWarn; // Restore original console.warn + + expect(result).toBeInstanceOf(Array); + expect(result).toHaveLength(2); + expect(result[0]).toEqual({ product_title: "P1", product_price: "$10", product_link: "/p1" }); + expect(result[1]).toEqual({ product_title: "P2", product_price: "$20", product_link: "/p2" }); + // Check stderr for expected verbose log patterns + expect(stderrOutput).toContain('[INIT]'); + expect(stderrOutput).toContain('[FETCH]'); + expect(stderrOutput).toContain('[SCRAPE]'); + expect(stderrOutput).toContain('[EXTRACT]'); // JSON extraction step log + expect(stderrOutput).toContain('[COMPLETE]'); + console.log(`JSON extracted (verbose) successfully using schema file: ${tempSchemaPath}`); + } catch (error) { + console.warn = originalWarn; // Restore on error too + console.error("JSON extraction test (verbose, schema file) failed:", error); + throw error; + } + }, jsonTimeoutMs); + + // TODO: Add tests for browserConfigPath and crawlerConfigPath if needed + // These would likely require creating dummy JSON config files and checking stderr + // for specific log messages indicating the configs were loaded/applied. + +}); \ No newline at end of file