190 lines
6.9 KiB
TypeScript
190 lines
6.9 KiB
TypeScript
import { writeFile } from 'fs/promises'
|
|
import { join } from 'path'
|
|
import {
|
|
type AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
|
|
logEvent,
|
|
} from '../services/analytics/index.js'
|
|
import type { MCPResultType } from '../services/mcp/client.js'
|
|
import { toError } from './errors.js'
|
|
import { formatFileSize } from './format.js'
|
|
import { logError } from './log.js'
|
|
import { ensureToolResultsDir, getToolResultsDir } from './toolResultStorage.js'
|
|
|
|
/**
 * Renders a schema value for embedding in a format description.
 *
 * Strings pass through unchanged. Any other value is JSON-encoded so that
 * object/array schemas do not collapse to "[object Object]". When the value
 * cannot be JSON-encoded (e.g. circular references, BigInt) or encodes to
 * `undefined` (functions, symbols), plain String() coercion is used instead.
 */
function stringifySchema(schema: unknown): string {
  if (typeof schema === 'string') return schema
  try {
    // JSON.stringify yields undefined at runtime for functions/symbols even
    // though its declared return type is string.
    const encoded: string | undefined = JSON.stringify(schema)
    return encoded ?? String(schema)
  } catch {
    return String(schema)
  }
}

/**
 * Generates a format description string based on the MCP result type and schema.
 *
 * @param type - Kind of MCP result being described
 * @param schema - Optional schema to mention in the description; ignored for
 *   'toolResult' and whenever it is falsy (undefined, null, empty string, ...)
 * @returns 'Plain text', 'JSON' or 'JSON array', with the schema appended
 *   after "with schema: " when one is provided
 */
export function getFormatDescription(
  type: MCPResultType,
  schema?: unknown,
): string {
  switch (type) {
    case 'toolResult':
      return 'Plain text'
    case 'structuredContent':
      return schema ? `JSON with schema: ${stringifySchema(schema)}` : 'JSON'
    case 'contentArray':
      return schema
        ? `JSON array with schema: ${stringifySchema(schema)}`
        : 'JSON array'
  }
}
|
|
|
|
/**
 * Builds the instruction text shown in place of a tool result that was too
 * large and has been written to a file instead.
 *
 * The text is a sequence of newline-terminated lines: an error header naming
 * the size and the file, the format, a usage hint, and a list of reading
 * requirements.
 *
 * @param rawOutputPath - Path to the saved output file
 * @param contentLength - Length of the content in characters
 * @param formatDescription - Description of the content format
 * @param maxReadLength - Optional character limit; when truthy, the
 *   truncation requirement uses the stricter wording and quotes this limit
 *   as the Bash output cap
 * @returns Instruction text to include in the tool result
 */
export function getLargeOutputInstructions(
  rawOutputPath: string,
  contentLength: number,
  formatDescription: string,
  maxReadLength?: number,
): string {
  const lines: string[] = [
    `Error: result (${contentLength.toLocaleString()} characters) exceeds maximum allowed tokens. Output has been saved to ${rawOutputPath}.`,
    `Format: ${formatDescription}`,
    'Use offset and limit parameters to read specific portions of the file, search within it for specific content, and jq to make structured queries.',
    'REQUIREMENTS FOR SUMMARIZATION/ANALYSIS/REVIEW:',
    `- You MUST read the content from the file at ${rawOutputPath} in sequential chunks until 100% of the content has been read.`,
  ]

  // Truncation requirement: stricter wording when a read limit is known.
  if (maxReadLength) {
    lines.push(
      `- If you receive truncation warnings when reading the file ("[N lines truncated]"), reduce the chunk size until you have read 100% of the content without truncation ***DO NOT PROCEED UNTIL YOU HAVE DONE THIS***. Bash output is limited to ${maxReadLength.toLocaleString()} chars.`,
    )
  } else {
    lines.push(
      '- If you receive truncation warnings when reading the file, reduce the chunk size until you have read 100% of the content without truncation.',
    )
  }

  lines.push(
    '- Before producing ANY summary or analysis, you MUST explicitly describe what portion of the content you have read. ***If you did not read the entire content, you MUST explicitly state this.***',
  )

  // Every line, including the last, is terminated by a newline.
  return lines.map(line => `${line}\n`).join('')
}
|
|
|
|
/** Known mime types (lowercase, without parameters) and their file extensions. */
const MIME_TYPE_EXTENSIONS: ReadonlyMap<string, string> = new Map([
  // Documents and text
  ['application/pdf', 'pdf'],
  ['application/json', 'json'],
  ['text/csv', 'csv'],
  ['text/plain', 'txt'],
  ['text/html', 'html'],
  ['text/markdown', 'md'],
  ['application/zip', 'zip'],
  // Office formats
  [
    'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
    'docx',
  ],
  ['application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', 'xlsx'],
  [
    'application/vnd.openxmlformats-officedocument.presentationml.presentation',
    'pptx',
  ],
  ['application/msword', 'doc'],
  ['application/vnd.ms-excel', 'xls'],
  // Audio and video
  ['audio/mpeg', 'mp3'],
  ['audio/wav', 'wav'],
  ['audio/ogg', 'ogg'],
  ['video/mp4', 'mp4'],
  ['video/webm', 'webm'],
  // Images
  ['image/png', 'png'],
  ['image/jpeg', 'jpg'],
  ['image/gif', 'gif'],
  ['image/webp', 'webp'],
  ['image/svg+xml', 'svg'],
])

/**
 * Picks a file extension for a mime type. Deliberately conservative: only
 * the types in the table above are recognized, and anything else (including
 * a missing mime type) maps to 'bin'. The extension matters because the Read
 * tool dispatches on it (PDFs, images, etc. need the right ext).
 *
 * @param mimeType - Mime type, optionally followed by `;`-separated
 *   parameters such as charset or boundary; matched case-insensitively
 * @returns The extension without a leading dot
 */
export function extensionForMimeType(mimeType: string | undefined): string {
  if (!mimeType) return 'bin'
  // Only the part before the first ';' identifies the type.
  const [essence = ''] = mimeType.split(';')
  const normalized = essence.trim().toLowerCase()
  return MIME_TYPE_EXTENSIONS.get(normalized) ?? 'bin'
}
|
|
|
|
/**
 * Heuristic for whether a content-type header indicates binary content that
 * should be saved to disk rather than put into the model context.
 * Text-ish types (text/*, json, xml, javascript, urlencoded form data) are
 * treated as non-binary, as is a header that names no media type at all.
 *
 * @param contentType - Raw content-type header value; parameters after the
 *   first `;` are ignored and matching is case-insensitive
 * @returns true when the media type is not one of the recognized text-ish
 *   types; false for text-ish types and for empty/parameter-only headers
 */
export function isBinaryContentType(contentType: string): boolean {
  if (!contentType) return false
  const mt = (contentType.split(';')[0] ?? '').trim().toLowerCase()
  // A whitespace-only or parameter-only header (e.g. '; charset=utf-8')
  // carries no media type, so classify it the same way as an empty header.
  if (!mt) return false
  if (mt.startsWith('text/')) return false
  // Structured text formats delivered with an application/ type. Use suffix
  // or exact match rather than substring so 'openxmlformats' (docx/xlsx) stays binary.
  if (mt.endsWith('+json') || mt === 'application/json') return false
  if (mt.endsWith('+xml') || mt === 'application/xml') return false
  if (mt.startsWith('application/javascript')) return false
  if (mt === 'application/x-www-form-urlencoded') return false
  return true
}
|
|
|
|
/**
 * Outcome of persisting binary content: either an error message, or the
 * path, size and extension of the file that was written.
 */
export type PersistBinaryResult =
  | { error: string }
  | {
      filepath: string
      size: number
      ext: string
    }
|
|
|
|
/**
 * Write raw binary bytes to the tool-results directory with a mime-derived
 * extension. Unlike persistToolResult (which stringifies), this writes the
 * bytes as-is so the resulting file can be opened with native tools (Read
 * for PDFs, pandas for xlsx, etc.).
 *
 * On success an analytics event is logged with the mime type, byte size and
 * extension.
 *
 * @param bytes - Raw content, written unmodified
 * @param mimeType - Mime type used to pick the file extension; reported to
 *   analytics as 'unknown' when absent
 * @param persistId - Base name of the file; the extension is appended
 * @returns The written file's path, size in bytes and extension, or
 *   `{ error }` carrying the failure message when preparing the directory
 *   or writing the file throws
 */
export async function persistBinaryContent(
  bytes: Buffer,
  mimeType: string | undefined,
  persistId: string,
): Promise<PersistBinaryResult> {
  const ext = extensionForMimeType(mimeType)
  let filepath: string

  try {
    // Directory preparation and path resolution live inside the try so that
    // a failure there is reported through the { error } result, like a
    // failed write, instead of escaping as a rejected promise.
    await ensureToolResultsDir()
    filepath = join(getToolResultsDir(), `${persistId}.${ext}`)
    await writeFile(filepath, bytes)
  } catch (error) {
    const err = toError(error)
    logError(err)
    return { error: err.message }
  }

  // mime type and extension are safe fixed-vocabulary strings (not paths/code)
  logEvent('tengu_binary_content_persisted', {
    mimeType: (mimeType ??
      'unknown') as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
    sizeBytes: bytes.length,
    ext: ext as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
  })

  return { filepath, size: bytes.length, ext }
}
|
|
|
|
/**
 * Compose the one-line notice telling Claude where binary content was saved.
 * It only reports the type, size and path — no prescriptive hint, since what
 * the model can actually do with the file depends on provider/tooling.
 *
 * @param filepath - Where the content was written
 * @param mimeType - Mime type of the content; shown as 'unknown type' when
 *   missing or empty
 * @param size - Size passed to formatFileSize for display
 * @param sourceDescription - Text placed directly in front of the message,
 *   with no separator added
 * @returns The formatted message
 */
export function getBinaryBlobSavedMessage(
  filepath: string,
  mimeType: string | undefined,
  size: number,
  sourceDescription: string,
): string {
  // `||` rather than `??`: an empty-string mime type also gets the placeholder.
  const typeLabel = mimeType || 'unknown type'
  const details = `${typeLabel}, ${formatFileSize(size)}`
  return sourceDescription + 'Binary content (' + details + ') saved to ' + filepath
}
|