kbot iterator example: md-ast transformer
This commit is contained in:
parent
181a655d94
commit
e3f6f74b1b
@ -1 +1 @@
|
||||
export declare function markdownTransformExample(useCache?: boolean): Promise<any>;
|
||||
export declare function markdownTransformExample(useCache?: boolean): Promise<import("mdast").Root>;
|
||||
|
||||
File diff suppressed because one or more lines are too long
@ -3,7 +3,7 @@
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "Summarize this paragraph in one short sentence (max 15 words).\n\nText to transform: \"A concluding paragraph summarizing the key findings.\""
|
||||
"content": "Analyze the following text and extract keywords and sentiment. Respond ONLY with a valid JSON object matching the schema provided in the 'format' field. Do not include any other text, explanations, or markdown formatting.\n\nText to analyze:\n\"The text mentions sunny weather, making it ideal for a park walk.\""
|
||||
},
|
||||
{
|
||||
"role": "user",
|
||||
|
||||
@ -10,6 +10,8 @@ import { E_OPENROUTER_MODEL } from '../../models/cache/openrouter-models.js';
|
||||
import { E_Mode } from '../../zod_schema.js';
|
||||
import { FieldMapping, transform, IOptions, createLLMTransformer, CacheConfig, ILogger } from '../../iterator.js';
|
||||
import { OnTransformCallback } from '../../async-iterator.js';
|
||||
import { deepClone } from '@polymech/core/objects';
|
||||
import { run } from '../../commands/run.js';
|
||||
|
||||
/**
|
||||
* Notes for LLM modifications
|
||||
@ -25,6 +27,7 @@ const MODEL = E_OPENROUTER_MODEL.MODEL_OPENAI_GPT_4O_MINI; // Corrected model na
|
||||
const ROUTER = 'openrouter';
|
||||
const INPUT_MD_PATH = path.resolve('./tests/test-data/core/md-test.md');
|
||||
const OUTPUT_MD_PATH = path.resolve('./tests/test-data/core/md-test-out.md');
|
||||
const OUTPUT_JSON_PATH = path.resolve('./tests/test-data/core/md-test-out.json');
|
||||
|
||||
// Basic logger
|
||||
const logger: ILogger = {
|
||||
@ -52,56 +55,22 @@ function getLLMTransformerForMdast(
|
||||
}
|
||||
|
||||
// Define field mappings for the mdast structure
|
||||
const fieldMappings: FieldMapping[] = [
|
||||
// Mappings for standard transforms (targetPath: null)
|
||||
const standardMappings: FieldMapping[] = [
|
||||
{
|
||||
// Target text value within H1-H5 headings' children
|
||||
jsonPath: '$.children[?(@.type=="heading" && @.depth <= 5)].children[?(@.type=="text")].value',
|
||||
targetPath: null, // Modify in place (NOTE: This likely won't work due to iterator limitations)
|
||||
options: {
|
||||
prompt: 'Rewrite this heading to be more concise and impactful (max 5 words).'
|
||||
}
|
||||
targetPath: null,
|
||||
options: { prompt: 'Rewrite this heading to be more concise and impactful (max 5 words).' }
|
||||
},
|
||||
{
|
||||
// Target the specific paragraph NODE in Chapter 4 for structured analysis.
|
||||
// The transformer will extract text from the node passed to it.
|
||||
// Using targetPath='analysisResult' stores the LLM output on the node itself.
|
||||
jsonPath: '$.children[?(@.type=="paragraph" && @.children[0].value.includes("results"))]',
|
||||
targetPath: 'analysisResult', // Store result in a new property on the paragraph node
|
||||
options: {
|
||||
prompt: 'Extract keywords and sentiment from this text.',
|
||||
format: {
|
||||
type: "object",
|
||||
properties: {
|
||||
sentiment: {
|
||||
type: "string",
|
||||
enum: ["positive", "neutral", "negative"],
|
||||
description: "Overall sentiment"
|
||||
},
|
||||
keywords: {
|
||||
type: "array",
|
||||
items: { type: "string" },
|
||||
description: "Main keywords (max 5)"
|
||||
}
|
||||
},
|
||||
required: ["sentiment", "keywords"]
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
// Target text value within paragraphs' children
|
||||
jsonPath: '$.children[?(@.type=="paragraph")].children[?(@.type=="text")].value',
|
||||
targetPath: null, // Modify in place (NOTE: This likely won't work due to iterator limitations)
|
||||
options: {
|
||||
prompt: 'Summarize this paragraph in one short sentence (max 15 words).'
|
||||
}
|
||||
targetPath: null,
|
||||
options: { prompt: 'Summarize this paragraph in one short sentence (max 15 words).' }
|
||||
},
|
||||
{
|
||||
// Target text value within table cells' children
|
||||
jsonPath: '$.children[?(@.type=="table")].children[*].children[*].children[?(@.type=="text")].value',
|
||||
targetPath: null, // Modify in place (NOTE: This likely won't work due to iterator limitations)
|
||||
options: {
|
||||
prompt: 'Rephrase this table cell content slightly.'
|
||||
}
|
||||
targetPath: null,
|
||||
options: { prompt: 'Rephrase this table cell content slightly.' }
|
||||
}
|
||||
];
|
||||
|
||||
@ -129,9 +98,25 @@ export async function markdownTransformExample(useCache = true) {
|
||||
|
||||
// Make a deep copy to avoid modifying the original AST if transform fails
|
||||
// Note: Standard deepClone might not work perfectly with complex AST nodes (position, data fields).
|
||||
// For this example, JSON stringify/parse is a common workaround, but beware of data loss.
|
||||
let astToTransform = JSON.parse(JSON.stringify(ast));
|
||||
// Using deepClone instead of JSON.parse/stringify
|
||||
let astToTransform = deepClone(ast);
|
||||
|
||||
// --- Pre-processing: Add temporary ID to the target node ---
|
||||
const analysisTargetText = "results"; // Text to identify the node
|
||||
const tempIdValue = 'analyze-me';
|
||||
let identifiedNodeForAnalysis: any = null; // Store reference to the node
|
||||
try {
|
||||
visit(astToTransform, 'paragraph', (node: any) => {
|
||||
if (node.children?.[0]?.value?.includes(analysisTargetText)) {
|
||||
node.tempId = tempIdValue;
|
||||
identifiedNodeForAnalysis = node; // Store the reference
|
||||
logger.info(`[Pre-process] Added tempId='${tempIdValue}' and stored reference to node containing '${analysisTargetText}'`);
|
||||
}
|
||||
});
|
||||
} catch(e) {
|
||||
logger.error("[Pre-process] Error adding temporary ID:", e);
|
||||
identifiedNodeForAnalysis = null; // Ensure we don't proceed if tagging failed
|
||||
}
|
||||
|
||||
// 3. Define global options and iterator options
|
||||
const globalOptionsMixin: Partial<IKBotTask> = {
|
||||
@ -141,142 +126,185 @@ export async function markdownTransformExample(useCache = true) {
|
||||
};
|
||||
|
||||
const iteratorOptions: IOptions = {
|
||||
// We provide our custom transformer factory
|
||||
transformerFactory: (opts) => getLLMTransformerForMdast(opts, logger, { enabled: useCache }),
|
||||
logger: logger,
|
||||
cacheConfig: { enabled: useCache, namespace: 'markdown-transforms' },
|
||||
// onTransform receives the value matched by jsonPath (can be string or node).
|
||||
// It must return the STRING to be transformed by the LLM.
|
||||
onTransform: async (jsonPath, value, kbotOptions) => {
|
||||
let textContent = '';
|
||||
// Check if the value is a node object or just a string
|
||||
if (typeof value === 'string') {
|
||||
textContent = value;
|
||||
console.log(` -> onTransform String Value: Path='${jsonPath}', Value='${textContent.substring(0, 50)}...'`);
|
||||
} else if (typeof value === 'object' && value !== null) {
|
||||
// Attempt to extract text if it's a node (e.g., for the analysisResult mapping)
|
||||
const node = value as any; // Basic type assertion
|
||||
if ((node.type === 'heading' || node.type === 'paragraph' || node.type === 'tableCell') && node.children?.length === 1 && node.children[0].type === 'text') {
|
||||
textContent = node.children[0].value || '';
|
||||
} else if (node.type === 'text') {
|
||||
textContent = node.value || '';
|
||||
} else if (Array.isArray(node.children)) { // Handle complex children
|
||||
textContent = node.children
|
||||
.filter((child: any) => child.type === 'text' || child.type === 'inlineCode')
|
||||
.map((child: any) => child.value)
|
||||
.join('');
|
||||
}
|
||||
console.log(` -> onTransform AST Node: Path='${jsonPath}', Node Type='${node?.type}', Extracted Text='${textContent.substring(0, 50)}...'`);
|
||||
} else {
|
||||
console.log(` -> onTransform Unexpected Value Type: Path='${jsonPath}', Type='${typeof value}'`);
|
||||
}
|
||||
|
||||
// Return the extracted string for the LLM transformer
|
||||
return textContent;
|
||||
if (typeof value === 'string') { textContent = value; console.log(` -> onTransform String Value: Path='${jsonPath}', Value='${textContent.substring(0, 50)}...'`); }
|
||||
else if (typeof value === 'object' && value !== null) { const node = value as any; textContent = node.children?.[0]?.value || node.value || ''; console.log(` -> onTransform AST Node: Path='${jsonPath}', Node Type='${node?.type}', Extracted Text='${textContent.substring(0, 50)}...'`); }
|
||||
else { console.log(` -> onTransform Unexpected Value Type: Path='${jsonPath}', Type='${typeof value}'`); }
|
||||
return textContent;
|
||||
},
|
||||
errorCallback: (path, value, error) => {
|
||||
if (error instanceof Error) {
|
||||
logger.error(`Error processing path ${path}: ${error.message}`, error);
|
||||
} else {
|
||||
logger.error(`Error processing path ${path}: ${error}`, error);
|
||||
}
|
||||
},
|
||||
filterCallback: async (value, jsonPath) => {
|
||||
// Allow transformation if the value is an object (likely an AST node targeted directly)
|
||||
if (typeof value === 'object' && value !== null) {
|
||||
return true;
|
||||
}
|
||||
// If it's a string, apply the default string filter logic
|
||||
if (typeof value === 'string') {
|
||||
// Reuse the default isValidString filter logic for strings
|
||||
const allow = value.trim() !== '';
|
||||
if (!allow) {
|
||||
logger.info(`Filter: Skipping empty string at ${jsonPath}`);
|
||||
}
|
||||
return allow;
|
||||
}
|
||||
// Skip other types (numbers, booleans, null, undefined)
|
||||
logger.warn(`Filter: Skipping non-string/non-object value type (${typeof value}) at ${jsonPath}`);
|
||||
return false;
|
||||
}
|
||||
errorCallback: (path, value, error) => {
|
||||
if (error instanceof Error) { logger.error(`Error processing path ${path}: ${error.message}`, error); } else { logger.error(`Error processing path ${path}: ${error}`, error); }
|
||||
},
|
||||
filterCallback: async (value, jsonPath) => {
|
||||
if (typeof value === 'object' && value !== null) { return true; } if (typeof value === 'string') { const allow = value.trim() !== ''; if (!allow) { logger.info(`Filter: Skipping empty string at ${jsonPath}`); } return allow; } logger.warn(`Filter: Skipping non-string/non-object value type (${typeof value}) at ${jsonPath}`); return false;
|
||||
}
|
||||
};
|
||||
|
||||
// 4. Use the transform function
|
||||
console.log("Applying transformations to AST...");
|
||||
// The 'transform' function iterates based on JSONPath and applies the transformerFactory's result
|
||||
// It modifies the 'astToTransform' object in place.
|
||||
// 4. Run standard transformations
|
||||
console.log("Applying standard transformations to AST...");
|
||||
await transform(
|
||||
astToTransform, // Pass the AST object
|
||||
fieldMappings,
|
||||
astToTransform,
|
||||
standardMappings,
|
||||
globalOptionsMixin,
|
||||
iteratorOptions
|
||||
);
|
||||
|
||||
// *** Add logging before visit ***
|
||||
console.log("\n[DEBUG] Inspecting AST before visit call:");
|
||||
try {
|
||||
// Attempt to find the specific paragraph node expected to have analysisResult
|
||||
// NOTE: This relies on index/structure and might be brittle
|
||||
const potentialNode = astToTransform.children?.find((node: any) =>
|
||||
node.type === 'paragraph' &&
|
||||
node.children?.[0]?.value?.includes("results")
|
||||
);
|
||||
if (potentialNode) {
|
||||
console.log("[DEBUG] Found potential node:", JSON.stringify(potentialNode, null, 2));
|
||||
console.log(`[DEBUG] Does potential node have analysisResult? ${potentialNode.hasOwnProperty('analysisResult')}`);
|
||||
} else {
|
||||
console.log("[DEBUG] Could not find the target paragraph node directly before visit.");
|
||||
// 5. Perform Analysis Manually (if node was identified)
|
||||
console.log("\nPerforming manual analysis...");
|
||||
let analysisData: any = null; // Store successfully parsed result here
|
||||
let rawAnalysisResult: string | null = null; // Store raw LLM result
|
||||
|
||||
if (identifiedNodeForAnalysis) {
|
||||
try {
|
||||
let nodeText: string = '';
|
||||
// Extract text from the (now potentially modified) node
|
||||
if (identifiedNodeForAnalysis.children?.[0]?.type === 'text') {
|
||||
nodeText = identifiedNodeForAnalysis.children[0].value || '';
|
||||
}
|
||||
logger.info(`[Manual Analysis] Found node with tempId. Extracted text: "${nodeText.substring(0, 50)}..."`);
|
||||
|
||||
if (nodeText) {
|
||||
// Define analysis task options
|
||||
const analysisTaskOptions: IKBotTask = {
|
||||
...globalOptionsMixin,
|
||||
// Explicitly request ONLY JSON matching the schema
|
||||
prompt: `Analyze the following text and extract keywords and sentiment. Respond ONLY with a valid JSON object matching the schema provided in the 'format' field. Do not include any other text, explanations, or markdown formatting.\n\nText to analyze:\n"${nodeText}"`,
|
||||
format: {
|
||||
type: "object",
|
||||
properties: {
|
||||
sentiment: { type: "string", enum: ["positive", "neutral", "negative"] },
|
||||
keywords: { type: "array", items: { type: "string" }, maxItems: 5 }
|
||||
},
|
||||
required: ["sentiment", "keywords"]
|
||||
},
|
||||
mode: E_Mode.COMPLETION
|
||||
};
|
||||
|
||||
// Call the run command
|
||||
logger.info(`[Manual Analysis] Calling run command with format option...`);
|
||||
const results = await run(analysisTaskOptions);
|
||||
|
||||
// Process the result (run should return the parsed object)
|
||||
if (results && results.length > 0) {
|
||||
const rawResult = results[0];
|
||||
logger.info(`[Manual Analysis] Raw result from run: ${JSON.stringify(rawResult)}`);
|
||||
|
||||
if (typeof rawResult === 'object') {
|
||||
// If it's already an object, use it directly
|
||||
analysisData = rawResult;
|
||||
logger.info(`[Manual Analysis] 'run' returned an object.`);
|
||||
} else if (typeof rawResult === 'string') {
|
||||
// If it's a string, try to parse it as JSON
|
||||
logger.info(`[Manual Analysis] 'run' returned a string, attempting JSON parse...`);
|
||||
try {
|
||||
// Basic cleanup: Remove potential markdown like ```json ... ``` wrapper
|
||||
const cleanedString = rawResult.replace(/^```json\s*|```$/g, '').trim();
|
||||
analysisData = JSON.parse(cleanedString);
|
||||
logger.info(`[Manual Analysis] Successfully parsed string result.`);
|
||||
} catch (parseError) {
|
||||
// Corrected logger calls
|
||||
logger.error(`[Manual Analysis] Failed to parse string result from 'run': ${parseError instanceof Error ? parseError.message : parseError}`);
|
||||
logger.error(`[Manual Analysis] Raw string was: ${rawResult}`);
|
||||
identifiedNodeForAnalysis.manualAnalysisResult = { error: 'parse failed', raw: rawResult }; // Store error/raw
|
||||
}
|
||||
} else {
|
||||
// Unexpected type
|
||||
logger.warn(`[Manual Analysis] 'run' returned unexpected type: ${typeof rawResult}`);
|
||||
identifiedNodeForAnalysis.manualAnalysisResult = { error: 'unexpected result type', raw: rawResult };
|
||||
}
|
||||
|
||||
// If parsing succeeded and we have data, attach it
|
||||
if (analysisData) {
|
||||
identifiedNodeForAnalysis.manualAnalysisResult = analysisData; // Attach parsed data
|
||||
logger.info(`[Manual Analysis] Successfully attached analysis result object.`);
|
||||
logger.info(`[Manual Analysis] Result Content: ${JSON.stringify(analysisData, null, 2)}`);
|
||||
} else {
|
||||
logger.warn(`[Manual Analysis] 'run' command returned empty or unexpected result: ${JSON.stringify(results)}`);
|
||||
identifiedNodeForAnalysis.manualAnalysisResult = { error: 'empty or unexpected result', raw: results };
|
||||
}
|
||||
}
|
||||
|
||||
// Clean up tempId after processing
|
||||
if (identifiedNodeForAnalysis && identifiedNodeForAnalysis.tempId) {
|
||||
delete identifiedNodeForAnalysis.tempId;
|
||||
logger.info(`[Manual Analysis] Removed tempId.`);
|
||||
}
|
||||
|
||||
} else {
|
||||
logger.warn("[Manual Analysis] Node text was empty, skipping analysis LLM call.");
|
||||
identifiedNodeForAnalysis.manualAnalysisResult = { error: 'empty source text' };
|
||||
}
|
||||
|
||||
} catch (manualAnalysisError) {
|
||||
logger.error("Error during manual analysis step:", manualAnalysisError);
|
||||
// Attempt to clean up tempId even if analysis failed
|
||||
if (identifiedNodeForAnalysis && identifiedNodeForAnalysis.tempId) {
|
||||
delete identifiedNodeForAnalysis.tempId;
|
||||
}
|
||||
}
|
||||
console.log("---------------------------------------");
|
||||
} else {
|
||||
logger.warn("[Manual Analysis] Target node for analysis was not identified in pre-processing. Skipping analysis.");
|
||||
}
|
||||
|
||||
// --- Debug log checks for manualAnalysisResult ---
|
||||
console.log("\n[DEBUG] Inspecting AST after all transforms, before visit call:");
|
||||
try {
|
||||
const potentialNode = astToTransform.children?.[16];
|
||||
if (potentialNode && potentialNode.type === 'paragraph') {
|
||||
const hasManualProp = potentialNode.hasOwnProperty('manualAnalysisResult');
|
||||
console.log(`[DEBUG] Does potential node have manualAnalysisResult? ${hasManualProp}`);
|
||||
if (hasManualProp) {
|
||||
console.log("[DEBUG] manualAnalysisResult Content:", (potentialNode as any).manualAnalysisResult);
|
||||
}
|
||||
} else {
|
||||
console.log("[DEBUG] Could not find the target paragraph node at index 16.");
|
||||
}
|
||||
console.log("---------------------------------------");
|
||||
} catch (e) {
|
||||
console.error("[DEBUG] Error during pre-visit inspection:", e);
|
||||
}
|
||||
|
||||
// Log the manually obtained result (if parsed successfully)
|
||||
if (analysisData) {
|
||||
console.log("\nStructured Analysis Result (Manually Obtained):");
|
||||
console.log(JSON.stringify(analysisData, null, 2));
|
||||
} else {
|
||||
console.log("\nStructured Analysis Result was not successfully obtained.");
|
||||
// Raw result might be in the error object attached to the node now
|
||||
}
|
||||
|
||||
// Retrieve the structured analysis result (if any)
|
||||
// Note: The 'analysisResult' was added as a new property to the AST node by the mapping.
|
||||
// We need to find that node again to see the result. This is clumsy.
|
||||
let analysisData: any = null;
|
||||
// 6. Save the transformed AST to JSON (will include manualAnalysisResult)
|
||||
console.log(`\nWriting transformed AST to JSON: ${OUTPUT_JSON_PATH}`);
|
||||
try {
|
||||
visit(astToTransform, 'paragraph', (node: any) => {
|
||||
if (node.analysisResult) {
|
||||
analysisData = node.analysisResult;
|
||||
// Optional: Remove the temporary property from the AST before stringifying
|
||||
// delete node.analysisResult;
|
||||
}
|
||||
});
|
||||
if (analysisData) {
|
||||
console.log("\nStructured Analysis Result:");
|
||||
// The LLM might return a stringified JSON, try parsing it
|
||||
try {
|
||||
const parsedResult = typeof analysisData === 'string' ? JSON.parse(analysisData) : analysisData;
|
||||
console.log(JSON.stringify(parsedResult, null, 2));
|
||||
} catch (e) {
|
||||
console.log("Analysis result (raw):", analysisData);
|
||||
}
|
||||
} else {
|
||||
console.log("\nStructured Analysis Result not found on AST (check targetPath logic and LLM response).");
|
||||
}
|
||||
} catch (e) {
|
||||
logger.error("Error visiting AST for analysis result retrieval:", e)
|
||||
const outputJsonDir = path.dirname(OUTPUT_JSON_PATH);
|
||||
if (!fs.existsSync(outputJsonDir)) { fs.mkdirSync(outputJsonDir, { recursive: true }); }
|
||||
fs.writeFileSync(OUTPUT_JSON_PATH, JSON.stringify(astToTransform, null, 2));
|
||||
console.log("AST JSON saved successfully.");
|
||||
} catch (error) {
|
||||
logger.error("Failed to write AST JSON:", error);
|
||||
}
|
||||
|
||||
// 7. Stringify the modified AST back to Markdown
|
||||
console.log("\nStringifying transformed AST to Markdown...");
|
||||
try {
|
||||
const processorStringify = unified().use(remarkStringify);
|
||||
const outputMarkdown = processorStringify.stringify(astToTransform as any);
|
||||
|
||||
// 5. Stringify the modified AST back to Markdown
|
||||
console.log("\nStringifying transformed AST...");
|
||||
const processorStringify = unified().use(remarkStringify);
|
||||
// Stringify might fail if the AST structure became invalid during transformation
|
||||
const outputMarkdown = processorStringify.stringify(astToTransform as any);
|
||||
|
||||
// 6. Write the output file
|
||||
console.log(`Writing output to: ${OUTPUT_MD_PATH}`);
|
||||
const outputDir = path.dirname(OUTPUT_MD_PATH);
|
||||
if (!fs.existsSync(outputDir)) {
|
||||
fs.mkdirSync(outputDir, { recursive: true });
|
||||
// 8. Write the Markdown output file
|
||||
console.log(`Writing output Markdown to: ${OUTPUT_MD_PATH}`);
|
||||
const outputMdDir = path.dirname(OUTPUT_MD_PATH);
|
||||
if (!fs.existsSync(outputMdDir)) { fs.mkdirSync(outputMdDir, { recursive: true }); }
|
||||
write(OUTPUT_MD_PATH, outputMarkdown);
|
||||
console.log("Markdown file saved successfully.");
|
||||
} catch (stringifyError) {
|
||||
logger.error("Failed to stringify or write Markdown output:", stringifyError);
|
||||
}
|
||||
write(OUTPUT_MD_PATH, outputMarkdown);
|
||||
|
||||
console.log("Markdown transformation complete.");
|
||||
return astToTransform; // Return the transformed AST
|
||||
return astToTransform;
|
||||
|
||||
} catch (error) {
|
||||
logger.error("ERROR during Markdown transformation:", error);
|
||||
|
||||
Loading…
Reference in New Issue
Block a user