mono/packages/kbot/src/source.ts

import * as path from 'node:path'
import * as fs from 'node:fs'
import { sync as read } from '@polymech/fs/read'
import { sync as dir } from '@polymech/fs/dir'
import { createItem as toNode } from '@polymech/fs/inspect'
import { sync as exists } from '@polymech/fs/exists'
import { isFile, forward_slash } from '@polymech/commons'
import { logger } from './index.js'
import { lookup } from 'mime-types'
import { globSync } from 'glob'
import { EXCLUDE_GLOB, MAX_FILE_SIZE } from './constants.js'
import { defaultMimeRegistry, IHandlerResult } from './mime-handlers.js'
import { ChatCompletionContentPartImage } from 'openai/resources/index.mjs'
import { IKBotTask, ICollector } from '@polymech/ai-tools'
import { supported } from './commands/run-assistant.js'
import { handleWebUrl } from './http.js'
/**
 * @todos
 * - add support for vector stores: https://platform.openai.com/docs/assistants/tools/file-search?lang=node.js
 */
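/**
 * Default predicates applied to every candidate path: it must be a regular
 * file, it must exist, and it must be smaller than MAX_FILE_SIZE.
 */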
export const default_filters = {
  isFile,
  exists,
  size: (filePath: string) => toNode(filePath).size < MAX_FILE_SIZE,
};
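/**
 * True when `childPath` resolves to a location strictly inside `parentPath`
 * (identical paths are not considered "inside").
 */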
const isPathInside = (childPath: string, parentPath: string): boolean => {
  const relation = path.relative(parentPath, childPath);
  // An empty relation means the two paths are identical; a relation starting
  // with '..' or an absolute relation (e.g. another drive on Windows) points
  // outside the parent.
  return Boolean(
    relation &&
    !relation.startsWith('..') &&
    !path.isAbsolute(relation)
  );
};
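/**
 * Symlink-aware containment check: resolves both paths with fs.realpathSync
 * before testing, so a symlink under `pathB` that points elsewhere is still
 * reported as outside. Throws if either path does not exist.
 */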
export const isPathOutsideSafe = (pathA: string, pathB: string): boolean => {
  const realA = fs.realpathSync(pathA);
  const realB = fs.realpathSync(pathB);
  return !isPathInside(realA, realB);
};
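/**
 * Read a file and return it as a `data:<mime>;base64,...` URL, or null when
 * the file cannot be read or its MIME type cannot be determined.
 */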
export const base64 = (filePath: string): string | null => {
  try {
    const fileBuffer = fs.readFileSync(filePath);
    const mimeType = lookup(filePath);
    if (!mimeType) {
      throw new Error('Unable to determine MIME type.');
    }
    const base64Data = fileBuffer.toString('base64');
    return `data:${mimeType};base64,${base64Data}`;
  } catch (error) {
    logger.error('base64: Error reading file:', error);
    return null;
  }
};
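/**
 * Map local image files to OpenAI `image_url` content parts, embedding each
 * file as a base64 data URL. Files that cannot be converted are skipped.
 */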
export const images = (files: string[]): ChatCompletionContentPartImage[] => {
  return files
    .map((f) => base64(f))
    // base64() returns null on failure; drop those entries instead of
    // sending an invalid image_url to the API.
    .filter((url): url is string => url !== null)
    .map((url) => ({
      type: 'image_url' as const,
      image_url: { url }
    }))
}
/**
 * Check if a string is a web URL
 */
export const isWebUrl = (str: string): boolean => {
  return /^https?:\/\//.test(str);
}
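/**
 * Resolve include/exclude patterns relative to `projectPath` into a set of
 * absolute file paths plus any web URLs found among the include patterns.
 * Absolute include paths outside the project bypass the glob and are passed
 * through as-is; all results are filtered through `default_filters`.
 *
 * @example
 * // hypothetical usage: project-relative globs mixed with a URL
 * const { files, webUrls } = glob('/repo', ['src/*.ts', 'https://example.com/spec.html'])
 */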
export const glob = (
  projectPath: string,
  include: string[] = [],
  exclude: string[] = []
): { files: string[], webUrls: Set<string> } => {
  if (!exists(projectPath)) {
    dir(projectPath)
    return { files: [], webUrls: new Set<string>() }
  }
  const filters = new Set<string>()
  const absolutePaths = new Set<string>()
  const webUrls = new Set<string>()
  const ignorePatterns = new Set<string>(EXCLUDE_GLOB)
  include.forEach(pattern => {
    // Web URLs are collected separately and fetched later.
    if (isWebUrl(pattern)) {
      webUrls.add(pattern)
      return
    }
    if (path.isAbsolute(pattern)) {
      // Absolute paths inside the project are globbed like relative patterns;
      // absolute paths outside the project skip the glob and are used directly.
      if (isPathInside(pattern, projectPath)) {
        filters.add(pattern)
      } else {
        absolutePaths.add(pattern)
      }
    } else {
      filters.add(pattern)
    }
  })
  // Process exclude patterns
  exclude.forEach(pattern => {
    if (isWebUrl(pattern)) {
      // Web URLs are typically not "excluded" in a file glob sense,
      // so URLs appearing in exclude are ignored for globbing.
      return;
    }
    // Add all exclude patterns (absolute or relative) to ignorePatterns;
    // globSync handles absolute paths correctly in its `ignore` option when `cwd` is set.
    ignorePatterns.add(pattern);
  });
  const globFiles = globSync([...filters], {
    cwd: projectPath,
    absolute: false,
    ignore: [...ignorePatterns]
  })
  const allFiles = Array.from(new Set([
    ...globFiles.map(file => path.join(projectPath, file)),
    ...Array.from(absolutePaths)
  ]))
  // Keep only paths that pass every default filter (existing regular files under the size limit).
  const files = allFiles.filter((f) =>
    Object.values(default_filters).every((filter) => filter(f))
  )
  return { files, webUrls }
}
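/**
 * Collect handler results for a task: glob the project for matching files,
 * dispatch each file to the handler registered for its MIME type (falling
 * back to the `text/*` handler), and fetch any included web URLs via
 * handleWebUrl. Failed files and URLs are logged and skipped.
 */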
export async function get(
  projectPath: string,
  include: string[] = [],
  options: IKBotTask
): Promise<Array<IHandlerResult>> {
  const { files, webUrls } = glob(projectPath, include, options.exclude)
  // Process file contents
  const fileResults = files.map((fullPath) => {
    try {
      const relativePath = forward_slash(path.relative(projectPath, fullPath))
      if (isFile(fullPath) && exists(fullPath)) {
        const mimeType = lookup(fullPath) || 'text/plain'
        const handler = defaultMimeRegistry.getHandler(mimeType)
        if (handler) {
          return handler.handle(fullPath, relativePath)
        }
        // No specific handler registered: fall back to the generic text handler.
        return defaultMimeRegistry.getHandler('text/*')?.handle(fullPath, relativePath) || null
      }
      return null
    } catch (error) {
      logger.error(`Error reading file ${fullPath}:`, error)
      return null
    }
  })
  // Process web URLs
  const webUrlPromises = Array.from(webUrls).map(async (url: string) => {
    try {
      return await handleWebUrl(url)
    } catch (error) {
      logger.error(`Error processing web URL ${url}:`, error)
      return null
    }
  })
  const webResults = await Promise.all(webUrlPromises)
  // Combine and drop entries that failed or had no handler.
  const results = [...fileResults, ...webResults].filter(
    (r): r is IHandlerResult => r !== null
  )
  return results
}
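/**
 * Upload a single file to a newly created OpenAI vector store and write a
 * `<file>.meta.json` sidecar recording the store id, timestamp, original path
 * and MIME type. Requires `options.client` and a file extension listed in
 * `supported`. Returns the vector store id.
 */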
export async function vectorize(file: string, options: IKBotTask): Promise<string> {
  if (!options.client) {
    throw new Error('OpenAI client is required for vectorization')
  }
  const ext = path.extname(file).toLowerCase()
  if (!(ext in supported)) {
    throw new Error(`Unsupported file format: ${ext}. Supported formats: ${Object.keys(supported).join(', ')}`)
  }
  try {
    // Create a vector store named after the file
    const vectorStore = await options.client.vectorStores.create({
      name: path.basename(file)
    })
    // Upload the file to the vector store and wait until processing completes
    const fileStream = fs.createReadStream(file)
    await options.client.vectorStores.fileBatches.uploadAndPoll(vectorStore.id, {
      files: [fileStream]
    })
    // Create the meta file path by appending .meta.json to the original file path
    const metaPath = `${file}.meta.json`
    const metaData = {
      vectorStoreId: vectorStore.id,
      vectorizedAt: new Date().toISOString(),
      originalPath: file,
      mimeType: supported[ext]
    }
    // Write meta data to file
    fs.writeFileSync(metaPath, JSON.stringify(metaData, null, 2))
    return vectorStore.id
  } catch (error) {
    logger.error(`Failed to vectorize file ${file}:`, error)
    throw error
  }
}