kbot iterator example: md-ast transformer

This commit is contained in:
lovebird 2025-04-07 17:41:05 +02:00
parent 181a655d94
commit e3f6f74b1b
4 changed files with 361 additions and 281 deletions

View File

@@ -1 +1 @@
export declare function markdownTransformExample(useCache?: boolean): Promise<any>;
export declare function markdownTransformExample(useCache?: boolean): Promise<import("mdast").Root>;

File diff suppressed because one or more lines are too long

View File

@@ -3,7 +3,7 @@
"messages": [
{
"role": "user",
"content": "Summarize this paragraph in one short sentence (max 15 words).\n\nText to transform: \"A concluding paragraph summarizing the key findings.\""
"content": "Analyze the following text and extract keywords and sentiment. Respond ONLY with a valid JSON object matching the schema provided in the 'format' field. Do not include any other text, explanations, or markdown formatting.\n\nText to analyze:\n\"The text mentions sunny weather, making it ideal for a park walk.\""
},
{
"role": "user",

View File

@@ -10,6 +10,8 @@ import { E_OPENROUTER_MODEL } from '../../models/cache/openrouter-models.js';
import { E_Mode } from '../../zod_schema.js';
import { FieldMapping, transform, IOptions, createLLMTransformer, CacheConfig, ILogger } from '../../iterator.js';
import { OnTransformCallback } from '../../async-iterator.js';
import { deepClone } from '@polymech/core/objects';
import { run } from '../../commands/run.js';
/**
* Notes for LLM modifications
@@ -25,6 +27,7 @@ const MODEL = E_OPENROUTER_MODEL.MODEL_OPENAI_GPT_4O_MINI; // Corrected model na
const ROUTER = 'openrouter';
const INPUT_MD_PATH = path.resolve('./tests/test-data/core/md-test.md');
const OUTPUT_MD_PATH = path.resolve('./tests/test-data/core/md-test-out.md');
const OUTPUT_JSON_PATH = path.resolve('./tests/test-data/core/md-test-out.json');
// Basic logger
const logger: ILogger = {
@@ -52,56 +55,22 @@ function getLLMTransformerForMdast(
}
// Define field mappings for the mdast structure
const fieldMappings: FieldMapping[] = [
// Mappings for standard transforms (targetPath: null)
const standardMappings: FieldMapping[] = [
{
// Target text value within H1-H5 headings' children
jsonPath: '$.children[?(@.type=="heading" && @.depth <= 5)].children[?(@.type=="text")].value',
targetPath: null, // Modify in place (NOTE: This likely won't work due to iterator limitations)
options: {
prompt: 'Rewrite this heading to be more concise and impactful (max 5 words).'
}
targetPath: null,
options: { prompt: 'Rewrite this heading to be more concise and impactful (max 5 words).' }
},
{
// Target the specific paragraph NODE in Chapter 4 for structured analysis.
// The transformer will extract text from the node passed to it.
// Using targetPath='analysisResult' stores the LLM output on the node itself.
jsonPath: '$.children[?(@.type=="paragraph" && @.children[0].value.includes("results"))]',
targetPath: 'analysisResult', // Store result in a new property on the paragraph node
options: {
prompt: 'Extract keywords and sentiment from this text.',
format: {
type: "object",
properties: {
sentiment: {
type: "string",
enum: ["positive", "neutral", "negative"],
description: "Overall sentiment"
},
keywords: {
type: "array",
items: { type: "string" },
description: "Main keywords (max 5)"
}
},
required: ["sentiment", "keywords"]
}
}
},
{
// Target text value within paragraphs' children
jsonPath: '$.children[?(@.type=="paragraph")].children[?(@.type=="text")].value',
targetPath: null, // Modify in place (NOTE: This likely won't work due to iterator limitations)
options: {
prompt: 'Summarize this paragraph in one short sentence (max 15 words).'
}
targetPath: null,
options: { prompt: 'Summarize this paragraph in one short sentence (max 15 words).' }
},
{
// Target text value within table cells' children
jsonPath: '$.children[?(@.type=="table")].children[*].children[*].children[?(@.type=="text")].value',
targetPath: null, // Modify in place (NOTE: This likely won't work due to iterator limitations)
options: {
prompt: 'Rephrase this table cell content slightly.'
}
targetPath: null,
options: { prompt: 'Rephrase this table cell content slightly.' }
}
];
@@ -129,9 +98,25 @@ export async function markdownTransformExample(useCache = true) {
// Make a deep copy to avoid modifying the original AST if transform fails
// Note: Standard deepClone might not work perfectly with complex AST nodes (position, data fields).
// For this example, JSON stringify/parse is a common workaround, but beware of data loss.
let astToTransform = JSON.parse(JSON.stringify(ast));
// Using deepClone instead of JSON.parse/stringify
let astToTransform = deepClone(ast);
// --- Pre-processing: Add temporary ID to the target node ---
const analysisTargetText = "results"; // Text to identify the node
const tempIdValue = 'analyze-me';
let identifiedNodeForAnalysis: any = null; // Store reference to the node
try {
visit(astToTransform, 'paragraph', (node: any) => {
if (node.children?.[0]?.value?.includes(analysisTargetText)) {
node.tempId = tempIdValue;
identifiedNodeForAnalysis = node; // Store the reference
logger.info(`[Pre-process] Added tempId='${tempIdValue}' and stored reference to node containing '${analysisTargetText}'`);
}
});
} catch(e) {
logger.error("[Pre-process] Error adding temporary ID:", e);
identifiedNodeForAnalysis = null; // Ensure we don't proceed if tagging failed
}
// 3. Define global options and iterator options
const globalOptionsMixin: Partial<IKBotTask> = {
@@ -141,142 +126,185 @@ export async function markdownTransformExample(useCache = true) {
};
const iteratorOptions: IOptions = {
// We provide our custom transformer factory
transformerFactory: (opts) => getLLMTransformerForMdast(opts, logger, { enabled: useCache }),
logger: logger,
cacheConfig: { enabled: useCache, namespace: 'markdown-transforms' },
// onTransform receives the value matched by jsonPath (can be string or node).
// It must return the STRING to be transformed by the LLM.
onTransform: async (jsonPath, value, kbotOptions) => {
let textContent = '';
// Check if the value is a node object or just a string
if (typeof value === 'string') {
textContent = value;
console.log(` -> onTransform String Value: Path='${jsonPath}', Value='${textContent.substring(0, 50)}...'`);
} else if (typeof value === 'object' && value !== null) {
// Attempt to extract text if it's a node (e.g., for the analysisResult mapping)
const node = value as any; // Basic type assertion
if ((node.type === 'heading' || node.type === 'paragraph' || node.type === 'tableCell') && node.children?.length === 1 && node.children[0].type === 'text') {
textContent = node.children[0].value || '';
} else if (node.type === 'text') {
textContent = node.value || '';
} else if (Array.isArray(node.children)) { // Handle complex children
textContent = node.children
.filter((child: any) => child.type === 'text' || child.type === 'inlineCode')
.map((child: any) => child.value)
.join('');
}
console.log(` -> onTransform AST Node: Path='${jsonPath}', Node Type='${node?.type}', Extracted Text='${textContent.substring(0, 50)}...'`);
} else {
console.log(` -> onTransform Unexpected Value Type: Path='${jsonPath}', Type='${typeof value}'`);
}
// Return the extracted string for the LLM transformer
return textContent;
if (typeof value === 'string') { textContent = value; console.log(` -> onTransform String Value: Path='${jsonPath}', Value='${textContent.substring(0, 50)}...'`); }
else if (typeof value === 'object' && value !== null) { const node = value as any; textContent = node.children?.[0]?.value || node.value || ''; console.log(` -> onTransform AST Node: Path='${jsonPath}', Node Type='${node?.type}', Extracted Text='${textContent.substring(0, 50)}...'`); }
else { console.log(` -> onTransform Unexpected Value Type: Path='${jsonPath}', Type='${typeof value}'`); }
return textContent;
},
errorCallback: (path, value, error) => {
if (error instanceof Error) {
logger.error(`Error processing path ${path}: ${error.message}`, error);
} else {
logger.error(`Error processing path ${path}: ${error}`, error);
}
},
filterCallback: async (value, jsonPath) => {
// Allow transformation if the value is an object (likely an AST node targeted directly)
if (typeof value === 'object' && value !== null) {
return true;
}
// If it's a string, apply the default string filter logic
if (typeof value === 'string') {
// Reuse the default isValidString filter logic for strings
const allow = value.trim() !== '';
if (!allow) {
logger.info(`Filter: Skipping empty string at ${jsonPath}`);
}
return allow;
}
// Skip other types (numbers, booleans, null, undefined)
logger.warn(`Filter: Skipping non-string/non-object value type (${typeof value}) at ${jsonPath}`);
return false;
}
errorCallback: (path, value, error) => {
if (error instanceof Error) { logger.error(`Error processing path ${path}: ${error.message}`, error); } else { logger.error(`Error processing path ${path}: ${error}`, error); }
},
filterCallback: async (value, jsonPath) => {
if (typeof value === 'object' && value !== null) { return true; } if (typeof value === 'string') { const allow = value.trim() !== ''; if (!allow) { logger.info(`Filter: Skipping empty string at ${jsonPath}`); } return allow; } logger.warn(`Filter: Skipping non-string/non-object value type (${typeof value}) at ${jsonPath}`); return false;
}
};
// 4. Use the transform function
console.log("Applying transformations to AST...");
// The 'transform' function iterates based on JSONPath and applies the transformerFactory's result
// It modifies the 'astToTransform' object in place.
// 4. Run standard transformations
console.log("Applying standard transformations to AST...");
await transform(
astToTransform, // Pass the AST object
fieldMappings,
astToTransform,
standardMappings,
globalOptionsMixin,
iteratorOptions
);
// *** Add logging before visit ***
console.log("\n[DEBUG] Inspecting AST before visit call:");
try {
// Attempt to find the specific paragraph node expected to have analysisResult
// NOTE: This relies on index/structure and might be brittle
const potentialNode = astToTransform.children?.find((node: any) =>
node.type === 'paragraph' &&
node.children?.[0]?.value?.includes("results")
);
if (potentialNode) {
console.log("[DEBUG] Found potential node:", JSON.stringify(potentialNode, null, 2));
console.log(`[DEBUG] Does potential node have analysisResult? ${potentialNode.hasOwnProperty('analysisResult')}`);
} else {
console.log("[DEBUG] Could not find the target paragraph node directly before visit.");
// 5. Perform Analysis Manually (if node was identified)
console.log("\nPerforming manual analysis...");
let analysisData: any = null; // Store successfully parsed result here
let rawAnalysisResult: string | null = null; // Store raw LLM result
if (identifiedNodeForAnalysis) {
try {
let nodeText: string = '';
// Extract text from the (now potentially modified) node
if (identifiedNodeForAnalysis.children?.[0]?.type === 'text') {
nodeText = identifiedNodeForAnalysis.children[0].value || '';
}
logger.info(`[Manual Analysis] Found node with tempId. Extracted text: "${nodeText.substring(0, 50)}..."`);
if (nodeText) {
// Define analysis task options
const analysisTaskOptions: IKBotTask = {
...globalOptionsMixin,
// Explicitly request ONLY JSON matching the schema
prompt: `Analyze the following text and extract keywords and sentiment. Respond ONLY with a valid JSON object matching the schema provided in the 'format' field. Do not include any other text, explanations, or markdown formatting.\n\nText to analyze:\n"${nodeText}"`,
format: {
type: "object",
properties: {
sentiment: { type: "string", enum: ["positive", "neutral", "negative"] },
keywords: { type: "array", items: { type: "string" }, maxItems: 5 }
},
required: ["sentiment", "keywords"]
},
mode: E_Mode.COMPLETION
};
// Call the run command
logger.info(`[Manual Analysis] Calling run command with format option...`);
const results = await run(analysisTaskOptions);
// Process the result (run should return the parsed object)
if (results && results.length > 0) {
const rawResult = results[0];
logger.info(`[Manual Analysis] Raw result from run: ${JSON.stringify(rawResult)}`);
if (typeof rawResult === 'object') {
// If it's already an object, use it directly
analysisData = rawResult;
logger.info(`[Manual Analysis] 'run' returned an object.`);
} else if (typeof rawResult === 'string') {
// If it's a string, try to parse it as JSON
logger.info(`[Manual Analysis] 'run' returned a string, attempting JSON parse...`);
try {
// Basic cleanup: Remove potential markdown like ```json ... ``` wrapper
const cleanedString = rawResult.replace(/^```json\s*|```$/g, '').trim();
analysisData = JSON.parse(cleanedString);
logger.info(`[Manual Analysis] Successfully parsed string result.`);
} catch (parseError) {
// Corrected logger calls
logger.error(`[Manual Analysis] Failed to parse string result from 'run': ${parseError instanceof Error ? parseError.message : parseError}`);
logger.error(`[Manual Analysis] Raw string was: ${rawResult}`);
identifiedNodeForAnalysis.manualAnalysisResult = { error: 'parse failed', raw: rawResult }; // Store error/raw
}
} else {
// Unexpected type
logger.warn(`[Manual Analysis] 'run' returned unexpected type: ${typeof rawResult}`);
identifiedNodeForAnalysis.manualAnalysisResult = { error: 'unexpected result type', raw: rawResult };
}
// If parsing succeeded and we have data, attach it
if (analysisData) {
identifiedNodeForAnalysis.manualAnalysisResult = analysisData; // Attach parsed data
logger.info(`[Manual Analysis] Successfully attached analysis result object.`);
logger.info(`[Manual Analysis] Result Content: ${JSON.stringify(analysisData, null, 2)}`);
} else {
logger.warn(`[Manual Analysis] 'run' command returned empty or unexpected result: ${JSON.stringify(results)}`);
identifiedNodeForAnalysis.manualAnalysisResult = { error: 'empty or unexpected result', raw: results };
}
}
// Clean up tempId after processing
if (identifiedNodeForAnalysis && identifiedNodeForAnalysis.tempId) {
delete identifiedNodeForAnalysis.tempId;
logger.info(`[Manual Analysis] Removed tempId.`);
}
} else {
logger.warn("[Manual Analysis] Node text was empty, skipping analysis LLM call.");
identifiedNodeForAnalysis.manualAnalysisResult = { error: 'empty source text' };
}
} catch (manualAnalysisError) {
logger.error("Error during manual analysis step:", manualAnalysisError);
// Attempt to clean up tempId even if analysis failed
if (identifiedNodeForAnalysis && identifiedNodeForAnalysis.tempId) {
delete identifiedNodeForAnalysis.tempId;
}
}
console.log("---------------------------------------");
} else {
logger.warn("[Manual Analysis] Target node for analysis was not identified in pre-processing. Skipping analysis.");
}
// --- Debug log checks for manualAnalysisResult ---
console.log("\n[DEBUG] Inspecting AST after all transforms, before visit call:");
try {
const potentialNode = astToTransform.children?.[16];
if (potentialNode && potentialNode.type === 'paragraph') {
const hasManualProp = potentialNode.hasOwnProperty('manualAnalysisResult');
console.log(`[DEBUG] Does potential node have manualAnalysisResult? ${hasManualProp}`);
if (hasManualProp) {
console.log("[DEBUG] manualAnalysisResult Content:", (potentialNode as any).manualAnalysisResult);
}
} else {
console.log("[DEBUG] Could not find the target paragraph node at index 16.");
}
console.log("---------------------------------------");
} catch (e) {
console.error("[DEBUG] Error during pre-visit inspection:", e);
}
// Log the manually obtained result (if parsed successfully)
if (analysisData) {
console.log("\nStructured Analysis Result (Manually Obtained):");
console.log(JSON.stringify(analysisData, null, 2));
} else {
console.log("\nStructured Analysis Result was not successfully obtained.");
// Raw result might be in the error object attached to the node now
}
// Retrieve the structured analysis result (if any)
// Note: The 'analysisResult' was added as a new property to the AST node by the mapping.
// We need to find that node again to see the result. This is clumsy.
let analysisData: any = null;
// 6. Save the transformed AST to JSON (will include manualAnalysisResult)
console.log(`\nWriting transformed AST to JSON: ${OUTPUT_JSON_PATH}`);
try {
visit(astToTransform, 'paragraph', (node: any) => {
if (node.analysisResult) {
analysisData = node.analysisResult;
// Optional: Remove the temporary property from the AST before stringifying
// delete node.analysisResult;
}
});
if (analysisData) {
console.log("\nStructured Analysis Result:");
// The LLM might return a stringified JSON, try parsing it
try {
const parsedResult = typeof analysisData === 'string' ? JSON.parse(analysisData) : analysisData;
console.log(JSON.stringify(parsedResult, null, 2));
} catch (e) {
console.log("Analysis result (raw):", analysisData);
}
} else {
console.log("\nStructured Analysis Result not found on AST (check targetPath logic and LLM response).");
}
} catch (e) {
logger.error("Error visiting AST for analysis result retrieval:", e)
const outputJsonDir = path.dirname(OUTPUT_JSON_PATH);
if (!fs.existsSync(outputJsonDir)) { fs.mkdirSync(outputJsonDir, { recursive: true }); }
fs.writeFileSync(OUTPUT_JSON_PATH, JSON.stringify(astToTransform, null, 2));
console.log("AST JSON saved successfully.");
} catch (error) {
logger.error("Failed to write AST JSON:", error);
}
// 7. Stringify the modified AST back to Markdown
console.log("\nStringifying transformed AST to Markdown...");
try {
const processorStringify = unified().use(remarkStringify);
const outputMarkdown = processorStringify.stringify(astToTransform as any);
// 5. Stringify the modified AST back to Markdown
console.log("\nStringifying transformed AST...");
const processorStringify = unified().use(remarkStringify);
// Stringify might fail if the AST structure became invalid during transformation
const outputMarkdown = processorStringify.stringify(astToTransform as any);
// 6. Write the output file
console.log(`Writing output to: ${OUTPUT_MD_PATH}`);
const outputDir = path.dirname(OUTPUT_MD_PATH);
if (!fs.existsSync(outputDir)) {
fs.mkdirSync(outputDir, { recursive: true });
// 8. Write the Markdown output file
console.log(`Writing output Markdown to: ${OUTPUT_MD_PATH}`);
const outputMdDir = path.dirname(OUTPUT_MD_PATH);
if (!fs.existsSync(outputMdDir)) { fs.mkdirSync(outputMdDir, { recursive: true }); }
write(OUTPUT_MD_PATH, outputMarkdown);
console.log("Markdown file saved successfully.");
} catch (stringifyError) {
logger.error("Failed to stringify or write Markdown output:", stringifyError);
}
write(OUTPUT_MD_PATH, outputMarkdown);
console.log("Markdown transformation complete.");
return astToTransform; // Return the transformed AST
return astToTransform;
} catch (error) {
logger.error("ERROR during Markdown transformation:", error);