kbot iterator example: md-ast transformer

This commit is contained in:
lovebird 2025-04-07 17:41:05 +02:00
parent 181a655d94
commit e3f6f74b1b
4 changed files with 361 additions and 281 deletions

View File

@@ -1 +1 @@
export declare function markdownTransformExample(useCache?: boolean): Promise<any>;
export declare function markdownTransformExample(useCache?: boolean): Promise<import("mdast").Root>;

File diff suppressed because one or more lines are too long

View File

@@ -3,7 +3,7 @@
"messages": [
{
"role": "user",
"content": "Summarize this paragraph in one short sentence (max 15 words).\n\nText to transform: \"A concluding paragraph summarizing the key findings.\""
"content": "Analyze the following text and extract keywords and sentiment. Respond ONLY with a valid JSON object matching the schema provided in the 'format' field. Do not include any other text, explanations, or markdown formatting.\n\nText to analyze:\n\"The text mentions sunny weather, making it ideal for a park walk.\""
},
{
"role": "user",

View File

@@ -10,6 +10,8 @@ import { E_OPENROUTER_MODEL } from '../../models/cache/openrouter-models.js';
import { E_Mode } from '../../zod_schema.js';
import { FieldMapping, transform, IOptions, createLLMTransformer, CacheConfig, ILogger } from '../../iterator.js';
import { OnTransformCallback } from '../../async-iterator.js';
import { deepClone } from '@polymech/core/objects';
import { run } from '../../commands/run.js';
/**
* Notes for LLM modifications
@@ -25,6 +27,7 @@ const MODEL = E_OPENROUTER_MODEL.MODEL_OPENAI_GPT_4O_MINI; // Corrected model na
const ROUTER = 'openrouter';
const INPUT_MD_PATH = path.resolve('./tests/test-data/core/md-test.md');
const OUTPUT_MD_PATH = path.resolve('./tests/test-data/core/md-test-out.md');
const OUTPUT_JSON_PATH = path.resolve('./tests/test-data/core/md-test-out.json');
// Basic logger
const logger: ILogger = {
@@ -52,56 +55,22 @@ function getLLMTransformerForMdast(
}
// Define field mappings for the mdast structure
const fieldMappings: FieldMapping[] = [
// Mappings for standard transforms (targetPath: null)
const standardMappings: FieldMapping[] = [
{
// Target text value within H1-H5 headings' children
jsonPath: '$.children[?(@.type=="heading" && @.depth <= 5)].children[?(@.type=="text")].value',
targetPath: null, // Modify in place (NOTE: This likely won't work due to iterator limitations)
options: {
prompt: 'Rewrite this heading to be more concise and impactful (max 5 words).'
}
targetPath: null,
options: { prompt: 'Rewrite this heading to be more concise and impactful (max 5 words).' }
},
{
// Target the specific paragraph NODE in Chapter 4 for structured analysis.
// The transformer will extract text from the node passed to it.
// Using targetPath='analysisResult' stores the LLM output on the node itself.
jsonPath: '$.children[?(@.type=="paragraph" && @.children[0].value.includes("results"))]',
targetPath: 'analysisResult', // Store result in a new property on the paragraph node
options: {
prompt: 'Extract keywords and sentiment from this text.',
format: {
type: "object",
properties: {
sentiment: {
type: "string",
enum: ["positive", "neutral", "negative"],
description: "Overall sentiment"
},
keywords: {
type: "array",
items: { type: "string" },
description: "Main keywords (max 5)"
}
},
required: ["sentiment", "keywords"]
}
}
},
{
// Target text value within paragraphs' children
jsonPath: '$.children[?(@.type=="paragraph")].children[?(@.type=="text")].value',
targetPath: null, // Modify in place (NOTE: This likely won't work due to iterator limitations)
options: {
prompt: 'Summarize this paragraph in one short sentence (max 15 words).'
}
targetPath: null,
options: { prompt: 'Summarize this paragraph in one short sentence (max 15 words).' }
},
{
// Target text value within table cells' children
jsonPath: '$.children[?(@.type=="table")].children[*].children[*].children[?(@.type=="text")].value',
targetPath: null, // Modify in place (NOTE: This likely won't work due to iterator limitations)
options: {
prompt: 'Rephrase this table cell content slightly.'
}
targetPath: null,
options: { prompt: 'Rephrase this table cell content slightly.' }
}
];
@@ -129,9 +98,25 @@ export async function markdownTransformExample(useCache = true) {
// Make a deep copy to avoid modifying the original AST if transform fails
// Note: Standard deepClone might not work perfectly with complex AST nodes (position, data fields).
// For this example, JSON stringify/parse is a common workaround, but beware of data loss.
let astToTransform = JSON.parse(JSON.stringify(ast));
// Using deepClone instead of JSON.parse/stringify
let astToTransform = deepClone(ast);
// --- Pre-processing: Add temporary ID to the target node ---
const analysisTargetText = "results"; // Text to identify the node
const tempIdValue = 'analyze-me';
let identifiedNodeForAnalysis: any = null; // Store reference to the node
try {
visit(astToTransform, 'paragraph', (node: any) => {
if (node.children?.[0]?.value?.includes(analysisTargetText)) {
node.tempId = tempIdValue;
identifiedNodeForAnalysis = node; // Store the reference
logger.info(`[Pre-process] Added tempId='${tempIdValue}' and stored reference to node containing '${analysisTargetText}'`);
}
});
} catch(e) {
logger.error("[Pre-process] Error adding temporary ID:", e);
identifiedNodeForAnalysis = null; // Ensure we don't proceed if tagging failed
}
// 3. Define global options and iterator options
const globalOptionsMixin: Partial<IKBotTask> = {
@@ -141,142 +126,185 @@ export async function markdownTransformExample(useCache = true) {
};
const iteratorOptions: IOptions = {
// We provide our custom transformer factory
transformerFactory: (opts) => getLLMTransformerForMdast(opts, logger, { enabled: useCache }),
logger: logger,
cacheConfig: { enabled: useCache, namespace: 'markdown-transforms' },
// onTransform receives the value matched by jsonPath (can be string or node).
// It must return the STRING to be transformed by the LLM.
onTransform: async (jsonPath, value, kbotOptions) => {
let textContent = '';
// Check if the value is a node object or just a string
if (typeof value === 'string') {
textContent = value;
console.log(` -> onTransform String Value: Path='${jsonPath}', Value='${textContent.substring(0, 50)}...'`);
} else if (typeof value === 'object' && value !== null) {
// Attempt to extract text if it's a node (e.g., for the analysisResult mapping)
const node = value as any; // Basic type assertion
if ((node.type === 'heading' || node.type === 'paragraph' || node.type === 'tableCell') && node.children?.length === 1 && node.children[0].type === 'text') {
textContent = node.children[0].value || '';
} else if (node.type === 'text') {
textContent = node.value || '';
} else if (Array.isArray(node.children)) { // Handle complex children
textContent = node.children
.filter((child: any) => child.type === 'text' || child.type === 'inlineCode')
.map((child: any) => child.value)
.join('');
}
console.log(` -> onTransform AST Node: Path='${jsonPath}', Node Type='${node?.type}', Extracted Text='${textContent.substring(0, 50)}...'`);
} else {
console.log(` -> onTransform Unexpected Value Type: Path='${jsonPath}', Type='${typeof value}'`);
}
// Return the extracted string for the LLM transformer
return textContent;
if (typeof value === 'string') { textContent = value; console.log(` -> onTransform String Value: Path='${jsonPath}', Value='${textContent.substring(0, 50)}...'`); }
else if (typeof value === 'object' && value !== null) { const node = value as any; textContent = node.children?.[0]?.value || node.value || ''; console.log(` -> onTransform AST Node: Path='${jsonPath}', Node Type='${node?.type}', Extracted Text='${textContent.substring(0, 50)}...'`); }
else { console.log(` -> onTransform Unexpected Value Type: Path='${jsonPath}', Type='${typeof value}'`); }
return textContent;
},
errorCallback: (path, value, error) => {
if (error instanceof Error) {
logger.error(`Error processing path ${path}: ${error.message}`, error);
} else {
logger.error(`Error processing path ${path}: ${error}`, error);
}
},
filterCallback: async (value, jsonPath) => {
// Allow transformation if the value is an object (likely an AST node targeted directly)
if (typeof value === 'object' && value !== null) {
return true;
}
// If it's a string, apply the default string filter logic
if (typeof value === 'string') {
// Reuse the default isValidString filter logic for strings
const allow = value.trim() !== '';
if (!allow) {
logger.info(`Filter: Skipping empty string at ${jsonPath}`);
}
return allow;
}
// Skip other types (numbers, booleans, null, undefined)
logger.warn(`Filter: Skipping non-string/non-object value type (${typeof value}) at ${jsonPath}`);
return false;
}
errorCallback: (path, value, error) => {
if (error instanceof Error) { logger.error(`Error processing path ${path}: ${error.message}`, error); } else { logger.error(`Error processing path ${path}: ${error}`, error); }
},
filterCallback: async (value, jsonPath) => {
if (typeof value === 'object' && value !== null) { return true; } if (typeof value === 'string') { const allow = value.trim() !== ''; if (!allow) { logger.info(`Filter: Skipping empty string at ${jsonPath}`); } return allow; } logger.warn(`Filter: Skipping non-string/non-object value type (${typeof value}) at ${jsonPath}`); return false;
}
};
// 4. Use the transform function
console.log("Applying transformations to AST...");
// The 'transform' function iterates based on JSONPath and applies the transformerFactory's result
// It modifies the 'astToTransform' object in place.
// 4. Run standard transformations
console.log("Applying standard transformations to AST...");
await transform(
astToTransform, // Pass the AST object
fieldMappings,
astToTransform,
standardMappings,
globalOptionsMixin,
iteratorOptions
);
// *** Add logging before visit ***
console.log("\n[DEBUG] Inspecting AST before visit call:");
try {
// Attempt to find the specific paragraph node expected to have analysisResult
// NOTE: This relies on index/structure and might be brittle
const potentialNode = astToTransform.children?.find((node: any) =>
node.type === 'paragraph' &&
node.children?.[0]?.value?.includes("results")
);
if (potentialNode) {
console.log("[DEBUG] Found potential node:", JSON.stringify(potentialNode, null, 2));
console.log(`[DEBUG] Does potential node have analysisResult? ${potentialNode.hasOwnProperty('analysisResult')}`);
} else {
console.log("[DEBUG] Could not find the target paragraph node directly before visit.");
// 5. Perform Analysis Manually (if node was identified)
console.log("\nPerforming manual analysis...");
let analysisData: any = null; // Store successfully parsed result here
let rawAnalysisResult: string | null = null; // Store raw LLM result
if (identifiedNodeForAnalysis) {
try {
let nodeText: string = '';
// Extract text from the (now potentially modified) node
if (identifiedNodeForAnalysis.children?.[0]?.type === 'text') {
nodeText = identifiedNodeForAnalysis.children[0].value || '';
}
logger.info(`[Manual Analysis] Found node with tempId. Extracted text: "${nodeText.substring(0, 50)}..."`);
if (nodeText) {
// Define analysis task options
const analysisTaskOptions: IKBotTask = {
...globalOptionsMixin,
// Explicitly request ONLY JSON matching the schema
prompt: `Analyze the following text and extract keywords and sentiment. Respond ONLY with a valid JSON object matching the schema provided in the 'format' field. Do not include any other text, explanations, or markdown formatting.\n\nText to analyze:\n"${nodeText}"`,
format: {
type: "object",
properties: {
sentiment: { type: "string", enum: ["positive", "neutral", "negative"] },
keywords: { type: "array", items: { type: "string" }, maxItems: 5 }
},
required: ["sentiment", "keywords"]
},
mode: E_Mode.COMPLETION
};
// Call the run command
logger.info(`[Manual Analysis] Calling run command with format option...`);
const results = await run(analysisTaskOptions);
// Process the result (run should return the parsed object)
if (results && results.length > 0) {
const rawResult = results[0];
logger.info(`[Manual Analysis] Raw result from run: ${JSON.stringify(rawResult)}`);
if (typeof rawResult === 'object') {
// If it's already an object, use it directly
analysisData = rawResult;
logger.info(`[Manual Analysis] 'run' returned an object.`);
} else if (typeof rawResult === 'string') {
// If it's a string, try to parse it as JSON
logger.info(`[Manual Analysis] 'run' returned a string, attempting JSON parse...`);
try {
// Basic cleanup: Remove potential markdown like ```json ... ``` wrapper
const cleanedString = rawResult.replace(/^```json\s*|```$/g, '').trim();
analysisData = JSON.parse(cleanedString);
logger.info(`[Manual Analysis] Successfully parsed string result.`);
} catch (parseError) {
// Corrected logger calls
logger.error(`[Manual Analysis] Failed to parse string result from 'run': ${parseError instanceof Error ? parseError.message : parseError}`);
logger.error(`[Manual Analysis] Raw string was: ${rawResult}`);
identifiedNodeForAnalysis.manualAnalysisResult = { error: 'parse failed', raw: rawResult }; // Store error/raw
}
} else {
// Unexpected type
logger.warn(`[Manual Analysis] 'run' returned unexpected type: ${typeof rawResult}`);
identifiedNodeForAnalysis.manualAnalysisResult = { error: 'unexpected result type', raw: rawResult };
}
// If parsing succeeded and we have data, attach it
if (analysisData) {
identifiedNodeForAnalysis.manualAnalysisResult = analysisData; // Attach parsed data
logger.info(`[Manual Analysis] Successfully attached analysis result object.`);
logger.info(`[Manual Analysis] Result Content: ${JSON.stringify(analysisData, null, 2)}`);
} else {
logger.warn(`[Manual Analysis] 'run' command returned empty or unexpected result: ${JSON.stringify(results)}`);
identifiedNodeForAnalysis.manualAnalysisResult = { error: 'empty or unexpected result', raw: results };
}
}
// Clean up tempId after processing
if (identifiedNodeForAnalysis && identifiedNodeForAnalysis.tempId) {
delete identifiedNodeForAnalysis.tempId;
logger.info(`[Manual Analysis] Removed tempId.`);
}
} else {
logger.warn("[Manual Analysis] Node text was empty, skipping analysis LLM call.");
identifiedNodeForAnalysis.manualAnalysisResult = { error: 'empty source text' };
}
} catch (manualAnalysisError) {
logger.error("Error during manual analysis step:", manualAnalysisError);
// Attempt to clean up tempId even if analysis failed
if (identifiedNodeForAnalysis && identifiedNodeForAnalysis.tempId) {
delete identifiedNodeForAnalysis.tempId;
}
}
console.log("---------------------------------------");
} else {
logger.warn("[Manual Analysis] Target node for analysis was not identified in pre-processing. Skipping analysis.");
}
// --- Debug log checks for manualAnalysisResult ---
console.log("\n[DEBUG] Inspecting AST after all transforms, before visit call:");
try {
const potentialNode = astToTransform.children?.[16];
if (potentialNode && potentialNode.type === 'paragraph') {
const hasManualProp = potentialNode.hasOwnProperty('manualAnalysisResult');
console.log(`[DEBUG] Does potential node have manualAnalysisResult? ${hasManualProp}`);
if (hasManualProp) {
console.log("[DEBUG] manualAnalysisResult Content:", (potentialNode as any).manualAnalysisResult);
}
} else {
console.log("[DEBUG] Could not find the target paragraph node at index 16.");
}
console.log("---------------------------------------");
} catch (e) {
console.error("[DEBUG] Error during pre-visit inspection:", e);
}
// Log the manually obtained result (if parsed successfully)
if (analysisData) {
console.log("\nStructured Analysis Result (Manually Obtained):");
console.log(JSON.stringify(analysisData, null, 2));
} else {
console.log("\nStructured Analysis Result was not successfully obtained.");
// Raw result might be in the error object attached to the node now
}
// Retrieve the structured analysis result (if any)
// Note: The 'analysisResult' was added as a new property to the AST node by the mapping.
// We need to find that node again to see the result. This is clumsy.
let analysisData: any = null;
// 6. Save the transformed AST to JSON (will include manualAnalysisResult)
console.log(`\nWriting transformed AST to JSON: ${OUTPUT_JSON_PATH}`);
try {
visit(astToTransform, 'paragraph', (node: any) => {
if (node.analysisResult) {
analysisData = node.analysisResult;
// Optional: Remove the temporary property from the AST before stringifying
// delete node.analysisResult;
}
});
if (analysisData) {
console.log("\nStructured Analysis Result:");
// The LLM might return a stringified JSON, try parsing it
try {
const parsedResult = typeof analysisData === 'string' ? JSON.parse(analysisData) : analysisData;
console.log(JSON.stringify(parsedResult, null, 2));
} catch (e) {
console.log("Analysis result (raw):", analysisData);
}
} else {
console.log("\nStructured Analysis Result not found on AST (check targetPath logic and LLM response).");
}
} catch (e) {
logger.error("Error visiting AST for analysis result retrieval:", e)
const outputJsonDir = path.dirname(OUTPUT_JSON_PATH);
if (!fs.existsSync(outputJsonDir)) { fs.mkdirSync(outputJsonDir, { recursive: true }); }
fs.writeFileSync(OUTPUT_JSON_PATH, JSON.stringify(astToTransform, null, 2));
console.log("AST JSON saved successfully.");
} catch (error) {
logger.error("Failed to write AST JSON:", error);
}
// 7. Stringify the modified AST back to Markdown
console.log("\nStringifying transformed AST to Markdown...");
try {
const processorStringify = unified().use(remarkStringify);
const outputMarkdown = processorStringify.stringify(astToTransform as any);
// 5. Stringify the modified AST back to Markdown
console.log("\nStringifying transformed AST...");
const processorStringify = unified().use(remarkStringify);
// Stringify might fail if the AST structure became invalid during transformation
const outputMarkdown = processorStringify.stringify(astToTransform as any);
// 6. Write the output file
console.log(`Writing output to: ${OUTPUT_MD_PATH}`);
const outputDir = path.dirname(OUTPUT_MD_PATH);
if (!fs.existsSync(outputDir)) {
fs.mkdirSync(outputDir, { recursive: true });
// 8. Write the Markdown output file
console.log(`Writing output Markdown to: ${OUTPUT_MD_PATH}`);
const outputMdDir = path.dirname(OUTPUT_MD_PATH);
if (!fs.existsSync(outputMdDir)) { fs.mkdirSync(outputMdDir, { recursive: true }); }
write(OUTPUT_MD_PATH, outputMarkdown);
console.log("Markdown file saved successfully.");
} catch (stringifyError) {
logger.error("Failed to stringify or write Markdown output:", stringifyError);
}
write(OUTPUT_MD_PATH, outputMarkdown);
console.log("Markdown transformation complete.");
return astToTransform; // Return the transformed AST
return astToTransform;
} catch (error) {
logger.error("ERROR during Markdown transformation:", error);