import * as path from 'node:path'
import * as fs from 'node:fs'
import { sync as read } from '@polymech/fs/read'
import { sync as dir } from '@polymech/fs/dir'

import { createItem as toNode } from '@polymech/fs/inspect'
import { sync as exists } from '@polymech/fs/exists'
import { isFile, forward_slash } from '@polymech/commons'
import { logger } from './index.js'
import { lookup } from 'mime-types'
import { globSync } from 'glob'
import { EXCLUDE_GLOB, MAX_FILE_SIZE } from './constants.js'
import { defaultMimeRegistry, IHandlerResult } from './mime-handlers.js'
import { ChatCompletionContentPartImage } from 'openai/resources/index.mjs'
import { IKBotTask, ICollector } from '@polymech/ai-tools'
import { supported } from './commands/run-assistant.js'
import { handleWebUrl } from './http.js'

/**
 * @todos
 * - add support for vector stores : https://platform.openai.com/docs/assistants/tools/file-search?lang=node.js
 */

/**
 * Default predicates applied to every matched file: it must be a regular
 * file, exist on disk, and be smaller than MAX_FILE_SIZE.
 */
export const default_filters = {
    isFile,
    exists,
    size: (filePath: string) => toNode(filePath).size < MAX_FILE_SIZE,
};
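// Usage sketch (illustrative only; the path is hypothetical): this mirrors
// how `glob` below combines the default filters.
//
//   const keep = Object.values(default_filters).every((filter) => filter('/tmp/project/readme.md'))
//   // keep === true only for an existing regular file below MAX_FILE_SIZE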
/**
 * Lexical containment check: true when childPath points to a location inside
 * parentPath. Symlinks are not resolved here.
 */
const isPathInside = (childPath: string, parentPath: string): boolean => {
    const relation = path.relative(parentPath, childPath);
    return Boolean(
        relation &&
        !relation.startsWith('..') &&
        !relation.startsWith('..' + path.sep)
    );
};

/**
 * Symlink-aware variant: both paths are resolved with realpathSync before the
 * containment check, so a true result means pathA really lives outside pathB.
 */
export const isPathOutsideSafe = (pathA: string, pathB: string): boolean => {
    const realA = fs.realpathSync(pathA);
    const realB = fs.realpathSync(pathB);
    return !isPathInside(realA, realB);
};

/**
 * Read a file and return it as a data URL (`data:<mime>;base64,<data>`),
 * or null when the file cannot be read or its MIME type is unknown.
 */
export const base64 = (filePath: string): string | null => {
    try {
        const fileBuffer = fs.readFileSync(filePath);
        const mimeType = lookup(filePath);
        if (!mimeType) {
            throw new Error('Unable to determine MIME type.');
        }
        const base64Data = fileBuffer.toString('base64');
        return `data:${mimeType};base64,${base64Data}`;
    } catch (error) {
        logger.error('fileToBase64 : Error reading file:', error);
        return null;
    }
};

/**
 * Convert a list of image file paths into chat-completion image content
 * parts. Files that could not be encoded (base64() returned null) are
 * skipped so every emitted part carries a valid data URL.
 */
export const images = (files: string[]): ChatCompletionContentPartImage[] => {
    return files
        .map((f) => base64(f))
        .filter((url): url is string => url !== null)
        .map((url) => ({
            type: "image_url" as const,
            image_url: { url }
        }))
}
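// Usage sketch (illustrative only; the paths are hypothetical): build the
// image parts of a chat-completion user message from local screenshots.
//
//   const parts = images(['./shots/home.png', './shots/checkout.png'])
//   const message = {
//       role: 'user' as const,
//       content: [{ type: 'text' as const, text: 'Compare these screens' }, ...parts]
//   }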
/**
 * Check if a string is a web URL
 */
export const isWebUrl = (str: string): boolean => {
    return /^https?:\/\//.test(str);
}

export const glob = (
    projectPath: string,
    include: string[] = [],
    exclude: string[] = []
): { files: string[], webUrls: Set<string> } => {
    if (!exists(projectPath)) {
        dir(projectPath)
        return { files: [], webUrls: new Set<string>() }
    }

    const filters = new Set<string>()
    const absolutePaths = new Set<string>()
    const webUrls = new Set<string>()
    const ignorePatterns = new Set<string>(EXCLUDE_GLOB)

    include.forEach(pattern => {
        // Web URLs are collected separately and fetched later, not globbed
        if (isWebUrl(pattern)) {
            webUrls.add(pattern)
            return
        }

        if (path.isAbsolute(pattern)) {
            if (isPathInside(pattern, projectPath)) {
                filters.add(pattern)
            } else {
                absolutePaths.add(pattern)
            }
        } else {
            filters.add(pattern)
        }
    })

    // Process exclude patterns
    exclude.forEach(pattern => {
        // Web URLs have no meaning as glob excludes, so they are skipped here
        if (isWebUrl(pattern)) {
            return;
        }
        // Absolute and relative exclude patterns both go into ignorePatterns;
        // globSync handles absolute paths in its `ignore` option when `cwd` is set.
        ignorePatterns.add(pattern);
    });

    const globFiles = globSync([...filters], {
        cwd: projectPath,
        absolute: false,
        ignore: [...ignorePatterns]
    })

    const allFiles = Array.from(new Set([
        ...globFiles.map(file => path.join(projectPath, file)),
        ...Array.from(absolutePaths)
    ]))

    const files = allFiles.filter((f) =>
        Object.values(default_filters).every((filter) => filter(f))
    )
    return { files, webUrls }
}
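// Usage sketch (illustrative only; patterns and paths are hypothetical):
// mixing relative globs, an absolute path and a web URL in `include`.
//
//   const { files, webUrls } = glob(
//       '/tmp/project',
//       ['src/**/*.ts', '/etc/hosts', 'https://example.com/spec'],
//       ['**/*.test.ts']
//   )
//   // files   -> absolute paths under /tmp/project plus /etc/hosts (if it passes default_filters)
//   // webUrls -> Set { 'https://example.com/spec' }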
/**
 * Resolve include patterns into handler results: local files go through the
 * MIME handler registry, web URLs are fetched via handleWebUrl.
 */
export async function get(
    projectPath: string,
    include: string[] = [],
    options: IKBotTask
): Promise<Array<IHandlerResult>> {
    const { files, webUrls } = glob(projectPath, include, options.exclude)

    // Process file contents
    const fileResults = files.map((fullPath) => {
        try {
            const relativePath = forward_slash(path.relative(projectPath, fullPath))
            if (isFile(fullPath) && exists(fullPath)) {
                const mimeType = lookup(fullPath) || 'text/plain'
                const handler = defaultMimeRegistry.getHandler(mimeType)
                if (handler) {
                    return handler.handle(fullPath, relativePath)
                }
                // Fall back to the generic text handler when no specific handler matches
                return defaultMimeRegistry.getHandler('text/*')?.handle(fullPath, relativePath) || null
            }
            return null
        } catch (error) {
            logger.error(`Error reading file ${fullPath}:`, error)
            return null
        }
    })

    // Process web URLs
    const webUrlPromises = Array.from(webUrls).map(async (url: string) => {
        try {
            return await handleWebUrl(url)
        } catch (error) {
            logger.error(`Error processing web URL ${url}:`, error)
            return null
        }
    })

    const webResults = await Promise.all(webUrlPromises)

    // Combine and drop anything that failed or was skipped
    const results = [...fileResults, ...webResults].filter((r): r is IHandlerResult => r !== null)
    return results
}
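// Usage sketch (illustrative only): `task` stands for an IKBotTask configured
// elsewhere; only its `exclude` field is read by this call.
//
//   const results = await get('/tmp/project', ['docs/**/*.md', 'https://example.com/changelog'], task)
//   // results: IHandlerResult[], with nulls (unreadable files, failed fetches) already filtered out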
/**
 * Upload a single file to a newly created OpenAI vector store and write a
 * `<file>.meta.json` sidecar recording the store id and upload metadata.
 */
export async function vectorize(file: string, options: IKBotTask): Promise<string> {
    if (!options.client) {
        throw new Error('OpenAI client is required for vectorization')
    }

    const ext = path.extname(file).toLowerCase()
    if (!(ext in supported)) {
        throw new Error(`Unsupported file format: ${ext}. Supported formats: ${Object.keys(supported).join(', ')}`)
    }

    try {
        // Create a vector store named after the file
        const vectorStore = await options.client.vectorStores.create({
            name: path.basename(file)
        })

        // Upload the file to the vector store and wait until processing completes
        const fileStream = fs.createReadStream(file)
        await options.client.vectorStores.fileBatches.uploadAndPoll(vectorStore.id, {
            files: [fileStream]
        })

        // Write metadata next to the original file as `<file>.meta.json`
        const metaPath = `${file}.meta.json`
        const metaData = {
            vectorStoreId: vectorStore.id,
            vectorizedAt: new Date().toISOString(),
            originalPath: file,
            mimeType: supported[ext]
        }
        fs.writeFileSync(metaPath, JSON.stringify(metaData, null, 2))

        return vectorStore.id
    } catch (error) {
        logger.error(`Failed to vectorize file ${file}:`, error)
        throw error
    }
} |
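// Usage sketch (illustrative only): `task.client` must be an initialized
// OpenAI client, the path is hypothetical, and the extension is assumed to
// be listed in `supported`.
//
//   const vectorStoreId = await vectorize('/tmp/project/docs/handbook.pdf', task)
//   // A sidecar /tmp/project/docs/handbook.pdf.meta.json now records the store id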