crawler : crwl 2/3
parent e2ba560720
commit 5994766841
packages/kbot/docs/report.md (new file, 34 lines)
@@ -0,0 +1,34 @@

# KBot Codebase Analysis Report (src/**/*.ts)

## Summary of Findings

1. **Code Structure:** The codebase is organized into directories such as `commands`, `models`, `utils`, and `examples`, which is a logical layout. Core logic is distributed across these, with `iterator.ts` and `async-iterator.ts` handling complex data transformations and `source.ts` managing file/URL processing.

2. **`TODO` Markers:** A single `TODO` was found in `src/source.ts`, suggesting future support for OpenAI vector stores. This is a feature note, not a bug.

3. **Logging:** `console.log` statements are prevalent but heavily concentrated within the `src/examples/` directory, which is expected. Core files appear to use a `tslog` logger (the `logger` instance), which is good practice. Ensure no temporary `console.log` calls remain in production code paths.

4. **Error Handling:** Numerous generic `try...catch` blocks exist (e.g., `catch (error)`, `catch (e)`). Many do not explicitly type the caught error, defaulting to `any` or leaving it untyped, which can obscure the nature of errors at runtime. `src/config.ts` explicitly uses `catch (error: any)`.

5. **Type Safety (`any` usage):** The type `any` is used frequently throughout the codebase (`zod_schema.ts`, `types.ts`, `iterator.ts`, command files, etc.). This bypasses TypeScript's static type checking, potentially hiding type-related bugs and making refactoring harder.

6. **Dependencies:** The project uses local `@polymech` packages and standard libraries such as `openai`, `zod`, `axios`, `marked`, `unified`, and `yargs`, all suitable for its purpose.

7. **Complexity:** Files like `iterator.ts` handle complex logic involving data iteration, transformation, caching, and asynchronous operations (LLM calls).

## Potential Improvements & Suggestions

1. **Reduce `any` Usage (High Priority):**

* **Action:** Systematically replace `any` with specific types (interfaces, types derived from Zod schemas) or `unknown`; a minimal sketch follows this item.
* **Benefit:** Improves type safety, catches errors at compile time, enhances code maintainability and refactoring confidence.

* **Focus Areas:** `types.ts` (callback definitions), `iterator.ts`/`iterator-cache.ts` (data handling, cache keys/values), command handlers (`run*.ts`), `zod_schema.ts`, utility functions.
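
A minimal sketch of the `unknown`-plus-narrowing pattern (the `getCacheKey` helper and its expected `id` field are hypothetical, for illustration only):

```typescript
// Hypothetical cache-key helper: `unknown` forces an explicit check
// before any property access, unlike `any`.
function getCacheKey(entry: unknown): string {
  if (typeof entry === 'object' && entry !== null && 'id' in entry) {
    return String((entry as { id: unknown }).id);
  }
  throw new Error('Cache entry is missing an "id" field.');
}
```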

2. **Improve Error Handling:**

* **Action:** Type caught errors using `catch (error: unknown)` and perform type checks (e.g., `if (error instanceof Error) { ... }`); replace the `catch (error: any)` in `src/config.ts`. A sketch of the pattern follows this item.

* **Benefit:** Safer error handling; prevents accessing non-existent properties on error objects.

* **Consideration:** Introduce custom error classes for specific failure scenarios if needed.
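
A minimal sketch of the narrowing pattern (`riskyOperation` is a hypothetical stand-in; logging goes through `console.error` here only to keep the example self-contained):

```typescript
async function safeRun(riskyOperation: () => Promise<void>): Promise<void> {
  try {
    await riskyOperation();
  } catch (error: unknown) {
    // Narrow before touching properties such as `.message`.
    if (error instanceof Error) {
      console.error(`Operation failed: ${error.message}`);
    } else {
      console.error('Operation failed with a non-Error value:', error);
    }
  }
}
```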

3. **Leverage Zod:**

* **Action:** Ensure Zod schemas (`src/zod_schema.ts`, `src/zod_types.ts`) comprehensively define expected data structures, especially for external inputs (config, API responses), and use `schema.parse` or `schema.safeParse` consistently at boundaries, as sketched below.

* **Benefit:** Enhances runtime safety by validating data against defined schemas.
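
A minimal sketch of boundary validation with `safeParse` (the `RawConfigSchema` shape is hypothetical; the real schemas live in `src/zod_schema.ts`):

```typescript
import { z } from 'zod';

// Hypothetical schema for an external input.
const RawConfigSchema = z.object({
  apiKey: z.string().min(1),
  maxRetries: z.number().int().nonnegative().default(3),
});
type RawConfig = z.infer<typeof RawConfigSchema>;

function parseConfig(raw: unknown): RawConfig {
  const result = RawConfigSchema.safeParse(raw);
  if (!result.success) {
    // result.error lists every field that failed validation.
    throw new Error(`Invalid configuration: ${result.error.message}`);
  }
  return result.data;
}
```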

4. **Refactor Complex Code:**

* **Action:** Review `iterator.ts`, `async-iterator.ts`, and potentially large command files (`src/commands/run.ts`) for opportunities to break down large functions or simplify logic.

* **Benefit:** Improves readability and testability.

5. **Standardize Logging:**

* **Action:** Ensure all core logic uses the configured `tslog` logger instead of `console.log`, and remove any remaining debug `console.log` calls outside the `examples` directory. A sketch of a shared logger follows this item.

* **Benefit:** Consistent logging output and easier log management.
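
A minimal sketch of a shared `tslog` logger (the instance name and call sites are illustrative, not the project's actual setup):

```typescript
import { Logger } from 'tslog';

// One shared instance so every module logs with the same configuration.
export const logger = new Logger({ name: 'kbot' });

// Leveled, structured output instead of bare console.log:
logger.debug('spawning python process', { script: 'crwl_script.py' });
logger.error('crawl failed', { url: 'https://example.com' });
```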

6. **Configuration Loading (`config.ts`):**

* **Action:** Avoid the `as any` type assertion when loading the default configuration; ensure the `CONFIG_DEFAULT` function returns a type-compatible object, or validate its output, as sketched below.

* **Benefit:** Improves type safety during configuration loading.
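
A minimal sketch of validating the defaults instead of asserting (the schema and the `CONFIG_DEFAULT` shape here are hypothetical stand-ins for the ones in `src/config.ts`):

```typescript
import { z } from 'zod';

// Hypothetical stand-ins for the real schema and default factory.
const ConfigSchema = z.object({ model: z.string(), temperature: z.number() });
type Config = z.infer<typeof ConfigSchema>;

const CONFIG_DEFAULT = () => ({ model: 'gpt-4o-mini', temperature: 0 });

// `parse` replaces the `as any` assertion: bad defaults now fail loudly
// at load time with a descriptive ZodError.
const config: Config = ConfigSchema.parse(CONFIG_DEFAULT());
```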

packages/kbot/package.json
@@ -32,6 +32,7 @@
     "test:tools": "vitest run tests/unit/tools.test.ts",
     "test:coding": "vitest run tests/unit/coding.test.ts",
     "test:web": "vitest run tests/unit/web.test.ts",
+    "test:web:crwl": "vitest run tests/unit/web/crwl.test.ts",
     "test:files": "vitest run tests/unit/files.test.ts",
     "test:research": "vitest run tests/unit/research.test.ts",
     "test:core": "vitest run tests/unit/core",
packages/kbot/src/web/crwl.ts (new file, 206 lines)
@@ -0,0 +1,206 @@
import { spawn } from 'child_process';
import path from 'path';
import os from 'os';
import crypto from 'crypto';
import { promises as fs } from 'fs';

// Define the structure for the crawl4ai schema based on the documentation.
// This provides type safety for the schema object.
interface Crawl4aiField {
  name: string;
  selector: string;
  type: 'text' | 'attribute' | 'html' | 'regex' | string; // Allow string for other types
  attribute?: string; // Used with the 'attribute' type
  default?: any; // Optional default value
  // TODO: Add support for nested schemas if required by your use case
}

interface Crawl4aiSchema {
  name: string;
  baseSelector: string;
  baseFields?: Crawl4aiField[]; // Optional base fields extracted from the container
  fields: Crawl4aiField[];
  // TODO: Add other schema properties like 'nested' if needed
}

// Base options interface with the config paths and flags
interface BaseCrawlOptions {
  outputMode: 'json' | 'markdown';
  pythonExecutable?: string;
  scriptPath?: string;
  browserConfigPath?: string; // Path to browser config (JSON)
  crawlerConfigPath?: string; // Path to crawler config (JSON)
  bypassCache?: boolean; // Corresponds to --bypass-cache
  verbose?: boolean; // Corresponds to -v, --verbose
}

// Options specific to JSON output mode (the schema is passed as a file path)
interface JsonCrawlOptions extends BaseCrawlOptions {
  outputMode: 'json';
  schemaPath: string; // Path to schema file (required for JSON mode)
  strategyType?: 'css' | 'xpath'; // Defaults to 'css' if not provided
}

// Options specific to Markdown output mode
interface MarkdownCrawlOptions extends BaseCrawlOptions {
  outputMode: 'markdown';
  schemaPath?: never;
  strategyType?: never;
}

// Combined type for the crawlAndExtract function options
export type CrawlOptions = JsonCrawlOptions | MarkdownCrawlOptions;

/**
 * Executes the crawl4ai Python script with extended options, using a temp file for output.
 * Assumes 'crawl4ai' is installed in the Python environment accessible by `pythonExecutable`.
 *
 * @param url The URL to crawl and extract data from.
 * @param options Configuration options including output mode, config paths, flags, etc.
 * @returns A promise that resolves with the extracted JSON data (parsed) or the generated Markdown string.
 * @throws An error if the Python script fails or returns an error message on stderr.
 */
export async function crawlAndExtract(url: string, options: CrawlOptions): Promise<any | string> {
  const {
    outputMode,
    pythonExecutable = 'python',
    scriptPath = path.resolve(__dirname, 'crwl_script.py'),
    browserConfigPath,
    crawlerConfigPath,
    bypassCache,
    verbose
  } = options;

  if (!url) {
    return Promise.reject(new Error('URL parameter is required.'));
  }

  const tempFileName = `crwl-${crypto.randomBytes(8).toString('hex')}.tmp`;
  const tempFilePath = path.join(os.tmpdir(), tempFileName);

  // Base arguments
  const args = [
    scriptPath,
    url,
    '--output-mode', outputMode,
    '--output-file', tempFilePath
  ];

  // Add JSON-specific options
  if (outputMode === 'json') {
    if (!options.schemaPath) {
      // Ensure schemaPath is provided for JSON mode
      return Promise.reject(new Error('schemaPath is required for JSON output mode.'));
    }
    args.push('--schema', options.schemaPath);
    const strategyType = options.strategyType || 'css';
    args.push('--strategy-type', strategyType);
  }

  // Add optional config paths
  if (browserConfigPath) {
    args.push('--browser-config', browserConfigPath);
  }
  if (crawlerConfigPath) {
    args.push('--crawler-config', crawlerConfigPath);
  }

  // Add optional flags
  if (bypassCache) {
    args.push('--bypass-cache');
  }
  if (verbose) {
    args.push('--verbose');
  }

  // Note: a plain (non-async) executor avoids the async-executor anti-pattern,
  // where a synchronous throw would produce an unhandled rejection.
  return new Promise((resolve, reject) => {
    console.log(`Spawning: ${pythonExecutable} ${args.join(' ')} (output to: ${tempFilePath})`);
    const env = { ...process.env, PYTHONIOENCODING: 'UTF-8' };

    const pythonProcess = spawn(pythonExecutable, args, {
      stdio: ['pipe', 'pipe', 'pipe'],
      env: env
    });

    let stderrData = '';
    pythonProcess.stderr.setEncoding('utf8');
    pythonProcess.stderr.on('data', (data) => {
      stderrData += data;
    });

    pythonProcess.on('close', async (code) => {
      console.log(`Python script exited with code ${code}`);
      if (stderrData) {
        console.warn(`Python script stderr:\n${stderrData}`);
      }

      if (code !== 0) {
        try { await fs.unlink(tempFilePath); } catch (cleanupErr) { console.warn(`Failed to cleanup temp file ${tempFilePath}: ${cleanupErr}`); }
        return reject(new Error(`Python script exited with code ${code}. Stderr: ${stderrData || 'None'}`));
      }

      try {
        const fileContent = await fs.readFile(tempFilePath, 'utf8');
        try { await fs.unlink(tempFilePath); } catch (cleanupErr) { console.warn(`Failed to cleanup temp file ${tempFilePath}: ${cleanupErr}`); }

        if (outputMode === 'json') {
          try {
            const jsonData = JSON.parse(fileContent);
            resolve(jsonData);
          } catch (parseError: any) {
            reject(new Error(`Failed to parse JSON from temp file: ${parseError.message}. File content length: ${fileContent?.length || 0}. Stderr: ${stderrData || 'None'}`));
          }
        } else {
          resolve(fileContent);
        }
      } catch (readFileError: any) {
        try { await fs.unlink(tempFilePath); } catch (cleanupErr) { console.warn(`Failed to cleanup temp file ${tempFilePath}: ${cleanupErr}`); }
        reject(new Error(`Failed to read output file ${tempFilePath}: ${readFileError.message}. Stderr: ${stderrData || 'None'}`));
      }
    });

    pythonProcess.on('error', (err) => {
      reject(new Error(`Failed to start Python script '${pythonExecutable}': ${err.message}`));
    });
  });
}

// Example Usage (remove or comment out in production)
async function example() {
  // Example 1: JSON extraction using a schema file (written below, cleaned up afterwards)
  const schemaFilePath = path.join(__dirname, 'example-schema.json');
  const schemaContent: Crawl4aiSchema = {
    name: "Example Extraction",
    baseSelector: "div",
    fields: [{ name: "title", selector: "h1", type: "text" }]
  };
  try {
    await fs.writeFile(schemaFilePath, JSON.stringify(schemaContent, null, 2));
    console.log(`Attempting JSON crawl on https://example.com using schema file...`);
    const jsonData = await crawlAndExtract('https://example.com', {
      outputMode: 'json',
      schemaPath: schemaFilePath, // Pass the path to the schema
      // bypassCache: true, // Example flag
      // verbose: true // Example flag
    });
    console.log('Extracted JSON data:', JSON.stringify(jsonData, null, 2));
  } catch (error) {
    console.error('JSON Crawling failed:', error);
  } finally {
    try { await fs.unlink(schemaFilePath); } catch {} // Cleanup schema file
  }

  // Example 2: Markdown Generation
  const markdownTestUrl = 'https://www.nbcnews.com/business';
  try {
    console.log(`\nAttempting Markdown crawl on ${markdownTestUrl}...`);
    const markdownData = await crawlAndExtract(markdownTestUrl, {
      outputMode: 'markdown'
    });
    console.log('Generated Markdown (first 500 chars):', markdownData.substring(0, 500) + '...');
  } catch (error) {
    console.error('Markdown Crawling failed:', error);
  }
}

// example(); // Uncomment to run the example function
packages/kbot/src/web/crwl_script.py (new file, 195 lines)
@@ -0,0 +1,195 @@
import sys
import json
import argparse
import asyncio
import os
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy, JsonXPathExtractionStrategy


def load_json_config(config_path):
    """Loads a JSON configuration file."""
    if not config_path or not os.path.exists(config_path):
        # print(f"Warning: Config file not found: {config_path}", file=sys.stderr)
        return {}
    try:
        with open(config_path, 'r', encoding='utf-8') as f:
            return json.load(f)
    except json.JSONDecodeError as e:
        print(f"Error: Invalid JSON in config file '{config_path}': {e}", file=sys.stderr)
        sys.exit(1)
    except IOError as e:
        print(f"Error: Could not read config file '{config_path}': {e}", file=sys.stderr)
        sys.exit(1)


async def main(url, schema_path, strategy_type, output_mode, output_file,
               browser_config_path, crawler_config_path, bypass_cache, verbose):

    output_dir = os.path.dirname(output_file)
    if output_dir and not os.path.exists(output_dir):
        print(f"Error: Output directory does not exist: {output_dir}", file=sys.stderr)
        sys.exit(1)

    # Load configurations from files
    schema_config = load_json_config(schema_path)  # Used only if output_mode is json
    browser_config = load_json_config(browser_config_path)
    crawler_config = load_json_config(crawler_config_path)

    # --- Prepare CrawlerRunConfig arguments ---
    run_config_kwargs = crawler_config.copy()  # Start with crawler config file content
    # Set cache mode
    run_config_kwargs['cache_mode'] = CacheMode.BYPASS if bypass_cache else CacheMode.CACHE_FIRST

    # Add extraction strategy if JSON mode
    if output_mode == 'json':
        if not schema_config:
            # The --schema path wasn't provided, or the file was empty/invalid, but mode is json
            print(f"Error: Schema file ('{schema_path}') is required and must contain valid JSON for json output mode.", file=sys.stderr)
            sys.exit(1)
        if strategy_type == 'css':
            extraction_strategy = JsonCssExtractionStrategy(schema_config, verbose=verbose)
        elif strategy_type == 'xpath':
            extraction_strategy = JsonXPathExtractionStrategy(schema_config, verbose=verbose)
        else:
            # This case should be caught by argparse choices, but is kept for safety
            print(f"Error: Invalid strategy type '{strategy_type}'. Use 'css' or 'xpath'.", file=sys.stderr)
            sys.exit(1)
        run_config_kwargs['extraction_strategy'] = extraction_strategy
    elif output_mode == 'markdown':
        pass  # Markdown is handled by default or via crawler_config
    else:
        print(f"Error: Invalid output mode '{output_mode}'. Use 'json' or 'markdown'.", file=sys.stderr)
        sys.exit(1)

    # Create CrawlerRunConfig
    # Note: Assumes crawler_config keys match CrawlerRunConfig parameters
    try:
        config = CrawlerRunConfig(**run_config_kwargs)
    except TypeError as e:
        print(f"Error creating CrawlerRunConfig. Check crawler config file content or arguments: {e}", file=sys.stderr)
        sys.exit(1)

    # --- Prepare AsyncWebCrawler arguments ---
    crawler_kwargs = browser_config.copy()  # Start with browser config file content
    crawler_kwargs['verbose'] = verbose

    try:
        # Create crawler instance
        # Note: Assumes browser_config keys match AsyncWebCrawler parameters
        crawler_instance = AsyncWebCrawler(**crawler_kwargs)

        # Attempt to redirect the logger to stderr if not verbose (to keep final output clean)
        if not verbose:
            try:
                if hasattr(crawler_instance, 'logger') and hasattr(crawler_instance.logger, 'set_output_stream'):
                    crawler_instance.logger.set_output_stream(sys.stderr)
                elif hasattr(crawler_instance, 'logger') and hasattr(crawler_instance.logger, 'handlers'):
                    for handler in crawler_instance.logger.handlers:
                        if hasattr(handler, 'setStream'):
                            handler.setStream(sys.stderr)
                        elif hasattr(handler, 'stream'):
                            handler.stream = sys.stderr
            except Exception as log_config_err:
                # Non-fatal warning if redirection fails
                print(f"Warning: Could not redirect non-verbose crawler logger to stderr: {log_config_err}", file=sys.stderr)

        # Run the crawler
        async with crawler_instance as crawler:
            result = await crawler.arun(
                url=url,
                config=config
            )

            if not result.success:
                print(f"Error: Crawl failed for URL '{url}': {result.error_message}", file=sys.stderr)
                sys.exit(1)

            # Write output to the specified file
            try:
                with open(output_file, 'w', encoding='utf-8') as f:
                    if output_mode == 'json':
                        json_output = result.extracted_content if result.extracted_content is not None else 'null'
                        f.write(json_output)
                    elif output_mode == 'markdown':
                        markdown_output = result.markdown if result.markdown is not None else ''
                        f.write(markdown_output)
                sys.exit(0)  # Success
            except IOError as e:
                print(f"Error: Could not write to output file '{output_file}': {e}", file=sys.stderr)
                sys.exit(1)

    except TypeError as e:
        print(f"Error creating AsyncWebCrawler. Check browser config file content or arguments: {e}", file=sys.stderr)
        sys.exit(1)
    except Exception as e:
        # Force stderr to UTF-8 as well for error messages
        try:
            sys.stderr.reconfigure(encoding='utf-8')
        except Exception:
            pass
        print(f"Error: An unexpected error occurred during processing: {e}", file=sys.stderr)
        import traceback
        traceback.print_exc(file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Crawl a URL and write content to a file using the crawl4ai library.')
    parser.add_argument('url', help='The URL to crawl.')
    parser.add_argument(
        '-s', '--schema',
        dest='schema_path',
        help='Path to JSON schema file for extraction (required for json output mode).',
        default=None
    )
    parser.add_argument(
        '--strategy-type',
        help="Extraction strategy type ('css' or 'xpath'). Used only with --schema.",
        choices=['css', 'xpath'],
        default='css'
    )
    parser.add_argument(
        '-o', '--output-mode',
        help="Output mode ('json' or 'markdown').",
        choices=['json', 'markdown'],
        required=True
    )
    parser.add_argument(
        '-O', '--output-file',
        dest='output_file',
        help='Path to write the output file.',
        required=True
    )
    parser.add_argument(
        '-B', '--browser-config',
        dest='browser_config_path',
        help='Path to browser config file (JSON format).',
        default=None
    )
    parser.add_argument(
        '-C', '--crawler-config',
        dest='crawler_config_path',
        help='Path to crawler config file (JSON format).',
        default=None
    )
    parser.add_argument(
        '--bypass-cache',
        action='store_true',
        help='Bypass cache when crawling.'
    )
    parser.add_argument(
        '-v', '--verbose',
        action='store_true',
        help='Enable verbose logging.'
    )

    args = parser.parse_args()

    if args.output_mode == 'json' and not args.schema_path:
        parser.error('--schema <PATH> is required when --output-mode is json')

    asyncio.run(main(
        url=args.url,
        schema_path=args.schema_path,
        strategy_type=args.strategy_type,
        output_mode=args.output_mode,
        output_file=args.output_file,
        browser_config_path=args.browser_config_path,
        crawler_config_path=args.crawler_config_path,
        bypass_cache=args.bypass_cache,
        verbose=args.verbose
    ))
packages/kbot/tests/unit/web/crwl.test.ts (new file, 146 lines)
@@ -0,0 +1,146 @@
// tests/unit/web/crwl.test.ts
import { describe, it, expect, beforeAll, afterAll } from 'vitest';
import { crawlAndExtract } from '../../../src/web/crwl';
import path from 'path';
import os from 'os';
import crypto from 'crypto';
import { promises as fs } from 'fs';

// Note: These are integration tests for the crawlAndExtract wrapper, not unit
// tests, as they run the actual Python script. Ensure Python and crawl4ai are
// installed in the environment where they run, and that the python executable
// used by the tests (`python` by default) can find crawl4ai.

describe('crawlAndExtract Integration Tests', () => {

  // Setup for the JSON test: create a temporary schema file
  let tempSchemaPath: string;
  const testSchema = {
    name: "Test Products",
    baseSelector: "div.item",
    fields: [
      { name: "product_title", selector: "h2.title", type: "text" },
      { name: "product_price", selector: "span.price", type: "text" },
      { name: "product_link", selector: "a.link", type: "attribute", attribute: "href" }
    ]
  };

  beforeAll(async () => {
    // Create a temp file for the schema before tests run
    const tempFileName = `test-schema-${crypto.randomBytes(6).toString('hex')}.json`;
    tempSchemaPath = path.join(os.tmpdir(), tempFileName);
    try {
      await fs.writeFile(tempSchemaPath, JSON.stringify(testSchema, null, 2));
      console.log(`Created temp schema file: ${tempSchemaPath}`);
    } catch (err) {
      console.error("Failed to create temp schema file for tests:", err);
      throw err; // Fail setup if file creation fails
    }
  });

  afterAll(async () => {
    // Clean up the temp schema file after all tests run
    if (tempSchemaPath) {
      try {
        await fs.unlink(tempSchemaPath);
        console.log(`Cleaned up temp schema file: ${tempSchemaPath}`);
      } catch (err) {
        console.warn(`Failed to clean up temp schema file ${tempSchemaPath}:`, err);
      }
    }
  });

  // Test Markdown generation.
  // Increase the timeout for this test as it involves network I/O and Python
  // script execution (the vitest default is 5000 ms).
  const markdownTimeoutMs = 60000; // 60 seconds
  it('should generate markdown with verbose and bypassCache flags', async () => {
    const url = 'https://www.nbcnews.com/business';
    let stderrOutput = '';
    const originalWarn = console.warn; // Store the original console.warn
    console.warn = (message) => { // Capture stderr relayed by the wrapper
      if (typeof message === 'string' && message.startsWith('Python script stderr:')) {
        stderrOutput += message;
      }
      originalWarn.apply(console, [message]); // Call the original warn
    };

    try {
      const result = await crawlAndExtract(url, {
        outputMode: 'markdown',
        verbose: true, // Test verbose flag
        bypassCache: true, // Test bypassCache flag
      });
      console.warn = originalWarn; // Restore the original console.warn

      expect(result).toBeTypeOf('string');
      expect(result.length).toBeGreaterThan(100);
      // Check stderr for expected verbose log patterns (adjust patterns if needed)
      expect(stderrOutput).toContain('[INIT]');
      expect(stderrOutput).toContain('[FETCH]');
      expect(stderrOutput).toContain('[SCRAPE]');
      expect(stderrOutput).toContain('[MARKDOWN]'); // Or a similar markdown step log
      expect(stderrOutput).toContain('[COMPLETE]');
      console.log(`Markdown generated (verbose) successfully for ${url}. Length: ${result.length}`);
    } catch (error) {
      console.warn = originalWarn; // Restore on error too
      console.error("Markdown generation test (verbose) failed:", error);
      throw error;
    }
  }, markdownTimeoutMs);

  // Test JSON extraction using the schema file.
  // Increase the timeout slightly as it still involves process spawning.
  const jsonTimeoutMs = 20000; // 20 seconds
  it('should extract JSON using schema file with verbose flag', async () => {
    // Use simple, static HTML for reliable JSON extraction testing; this avoids
    // relying on external site structure, which can change. The raw:// scheme
    // passes the dummy HTML directly to the Python script.
    const dummyHtml = `
      <html><body>
        <div class='item'><h2 class='title'>P1</h2><span class='price'>$10</span><a class='link' href='/p1'>L1</a></div>
        <div class='item'><h2 class='title'>P2</h2><span class='price'>$20</span><a class='link' href='/p2'>L2</a></div>
      </body></html>`;
    const rawUrl = `raw://${dummyHtml}`;
    let stderrOutput = '';
    const originalWarn = console.warn; // Store the original console.warn
    console.warn = (message) => { // Capture stderr relayed by the wrapper
      if (typeof message === 'string' && message.startsWith('Python script stderr:')) {
        stderrOutput += message;
      }
      originalWarn.apply(console, [message]); // Call the original warn
    };

    try {
      const result = await crawlAndExtract(rawUrl, {
        outputMode: 'json',
        schemaPath: tempSchemaPath,
        strategyType: 'css',
        verbose: true, // Test verbose flag
      });
      console.warn = originalWarn; // Restore the original console.warn

      expect(result).toBeInstanceOf(Array);
      expect(result).toHaveLength(2);
      expect(result[0]).toEqual({ product_title: "P1", product_price: "$10", product_link: "/p1" });
      expect(result[1]).toEqual({ product_title: "P2", product_price: "$20", product_link: "/p2" });
      // Check stderr for expected verbose log patterns
      expect(stderrOutput).toContain('[INIT]');
      expect(stderrOutput).toContain('[FETCH]');
      expect(stderrOutput).toContain('[SCRAPE]');
      expect(stderrOutput).toContain('[EXTRACT]'); // JSON extraction step log
      expect(stderrOutput).toContain('[COMPLETE]');
      console.log(`JSON extracted (verbose) successfully using schema file: ${tempSchemaPath}`);
    } catch (error) {
      console.warn = originalWarn; // Restore on error too
      console.error("JSON extraction test (verbose, schema file) failed:", error);
      throw error;
    }
  }, jsonTimeoutMs);

  // TODO: Add tests for browserConfigPath and crawlerConfigPath if needed.
  // These would likely require creating dummy JSON config files and checking
  // stderr for specific log messages indicating the configs were loaded/applied.

});