import * as path from 'node:path'
import * as fs from 'node:fs'
import { sync as read } from '@polymech/fs/read'
import { sync as dir } from '@polymech/fs/dir'
import { createItem as toNode } from '@polymech/fs/inspect'
import { sync as exists } from '@polymech/fs/exists'
import { isFile, forward_slash } from '@polymech/commons'
import { logger } from './index.js'
import { lookup } from 'mime-types'
import { globSync } from 'glob'
import { EXCLUDE_GLOB, MAX_FILE_SIZE } from './constants.js'
import { defaultMimeRegistry, IHandlerResult } from './mime-handlers.js'
import { ChatCompletionContentPartImage } from 'openai/resources/index.mjs'
import { IKBotTask, ICollector } from '@polymech/ai-tools'
import { supported } from './commands/run-assistant.js'
import { handleWebUrl } from './http.js'

/**
 * @todos
 * - add support for vector stores : https://platform.openai.com/docs/assistants/tools/file-search?lang=node.js
 */

/**
 * Predicates a candidate path must satisfy to be returned by {@link glob}.
 * They are evaluated in declaration order and evaluation stops at the first
 * one that fails, so `size` only runs for paths that passed `isFile` and
 * `exists`.
 */
export const default_filters = {
  isFile,
  exists,
  size: (filePath: string) => toNode(filePath).size < MAX_FILE_SIZE,
};

/**
 * Lexically tests whether `childPath` lies strictly below `parentPath`.
 *
 * No file system access is performed; both paths are resolved the way
 * `path.relative` resolves them (against the current working directory).
 *
 * @param childPath - Path that is expected to be the descendant.
 * @param parentPath - Path that is expected to be the ancestor.
 * @returns `true` only when `childPath` is a proper descendant; `false` when
 *   the two paths are equal or `childPath` escapes `parentPath`.
 */
const isPathInside = (childPath: string, parentPath: string): boolean => {
  const relation = path.relative(parentPath, childPath)
  return (
    // An empty relation means both paths are the same location.
    relation !== '' &&
    // Only a leading `..` *segment* escapes the parent; a name that merely
    // begins with two dots (e.g. `..cache`) is an ordinary child.
    relation !== '..' &&
    !relation.startsWith('..' + path.sep) &&
    // path.relative yields an absolute path when there is no relative route
    // (e.g. different drives on Windows) - that is never "inside".
    !path.isAbsolute(relation)
  )
};

/**
 * Symlink-aware check that `pathA` is NOT located below `pathB`.
 *
 * Both paths are canonicalised with `fs.realpathSync` before comparing.
 *
 * @param pathA - Path to test.
 * @param pathB - Directory that `pathA` is compared against.
 * @returns `true` when the real location of `pathA` is not a proper
 *   descendant of the real location of `pathB` (this includes the case where
 *   both resolve to the same location).
 * @throws Whatever `fs.realpathSync` throws, e.g. when a path does not exist.
 */
export const isPathOutsideSafe = (pathA: string, pathB: string): boolean => {
  const realA = fs.realpathSync(pathA);
  const realB = fs.realpathSync(pathB);
  return !isPathInside(realA, realB);
};

/**
 * Reads a file and encodes it as a `data:` URL.
 *
 * @param filePath - File to encode; its MIME type is looked up from the path.
 * @returns `data:<mime>;base64,<payload>`, or `null` when the file cannot be
 *   read or no MIME type can be determined (the failure is logged).
 */
export const base64 = (filePath: string): string | null => {
  try {
    const fileBuffer = fs.readFileSync(filePath);
    const mimeType = lookup(filePath);
    if (!mimeType) {
      throw new Error('Unable to determine MIME type.');
    }
    const base64Data = fileBuffer.toString('base64');
    return `data:${mimeType};base64,${base64Data}`;
  } catch (error) {
    logger.error('fileToBase64 : Error reading file:', error);
    return null;
  }
};

/**
 * Converts image files into `image_url` chat content parts.
 *
 * Files that {@link base64} cannot encode are skipped (it has already logged
 * the reason) instead of producing a part whose `url` is `null`.
 *
 * @param files - Paths of the image files to embed.
 * @returns One content part per successfully encoded file, in input order.
 */
export const images = (files: string[]): ChatCompletionContentPartImage[] => {
  const parts: ChatCompletionContentPartImage[] = []
  for (const file of files) {
    const url = base64(file)
    if (url === null) {
      continue
    }
    parts.push({ type: 'image_url', image_url: { url } })
  }
  return parts
}

/**
 * Check if a string is a web URL (starts with `http://` or `https://`).
 */
export const isWebUrl = (str: string): boolean => {
  return /^https?:\/\//.test(str);
}

/**
 * Resolves the `include` entries of a project into local files and web URLs.
 *
 * - Entries accepted by {@link isWebUrl} are collected into `webUrls`.
 * - Absolute entries outside `projectPath` are taken as literal file paths.
 * - All other entries are used as glob patterns relative to `projectPath`.
 *
 * Every resulting local path must pass all {@link default_filters}.
 *
 * @param projectPath - Directory used as `cwd` for glob matching.
 * @param include - Glob patterns, absolute paths and/or web URLs.
 * @returns The de-duplicated matching files and the collected web URLs. When
 *   `projectPath` does not exist, `dir(projectPath)` is called and an empty
 *   result is returned; web URLs in `include` are not collected in that case.
 */
export const glob = (
  projectPath: string,
  include: string[] = []
): { files: string[], webUrls: Set<string> } => {
  if (!exists(projectPath)) {
    dir(projectPath)
    return { files: [], webUrls: new Set<string>() }
  }

  const filters = new Set<string>()
  const absolutePaths = new Set<string>()
  const webUrls = new Set<string>()

  // NOTE(review): the exclude patterns are added to the match patterns and
  // are also passed as `ignore` below - presumably redundant; confirm before
  // removing.
  EXCLUDE_GLOB.forEach(pattern => filters.add(pattern))

  include.forEach(pattern => {
    // Check if the pattern is a web URL
    if (isWebUrl(pattern)) {
      webUrls.add(pattern)
      return
    }
    if (path.isAbsolute(pattern)) {
      if (isPathInside(pattern, projectPath)) {
        filters.add(pattern)
      } else {
        // Outside the project: bypass globbing and use the path verbatim.
        absolutePaths.add(pattern)
      }
    } else {
      filters.add(pattern)
    }
  })

  const globFiles = globSync([...filters], {
    cwd: projectPath,
    absolute: false,
    ignore: EXCLUDE_GLOB
  })

  const allFiles = Array.from(new Set([
    ...globFiles.map(file => path.join(projectPath, file)),
    ...Array.from(absolutePaths)
  ]))

  const filterNames = Object.keys(default_filters) as Array<keyof typeof default_filters>
  const files = allFiles.filter((f) =>
    filterNames.every((key) => default_filters[key](f))
  )

  return { files, webUrls }
}

/**
 * Collects the content of all files and web URLs selected by `include`.
 *
 * Each file is passed to the handler registered for its MIME type (looked up
 * from the path, defaulting to `text/plain`), falling back to the `text/*`
 * handler when no handler is registered. Web URLs are fetched concurrently
 * through `handleWebUrl`. A failure of a single source is logged and that
 * source is dropped; it does not fail the whole call.
 *
 * @param projectPath - Project directory; file paths handed to handlers are
 *   made relative to it and converted with `forward_slash`.
 * @param include - Glob patterns, absolute paths and/or web URLs.
 * @param options - Task options (not used by this function).
 * @returns The non-null handler results, file results first, then web results.
 */
export async function get(
  projectPath: string,
  include: string[] = [],
  options: IKBotTask
): Promise<Array<IHandlerResult>> {
  const { files, webUrls } = glob(projectPath, include)

  // Process file contents
  const fileResults = files.map((fullPath) => {
    try {
      const relativePath = forward_slash(path.relative(projectPath, fullPath))
      if (isFile(fullPath) && exists(fullPath)) {
        const mimeType = lookup(fullPath) || 'text/plain'
        const handler = defaultMimeRegistry.getHandler(mimeType)
        if (handler) {
          return handler.handle(fullPath, relativePath)
        }
        return defaultMimeRegistry.getHandler('text/*')?.handle(fullPath, relativePath) || null
      }
      return null
    } catch (error) {
      logger.error(`Error reading file ${fullPath}:`, error)
      return null
    }
  })

  // Process web URLs
  const webUrlPromises = Array.from(webUrls).map(async (url: string) => {
    try {
      return await handleWebUrl(url)
    } catch (error) {
      logger.error(`Error processing web URL ${url}:`, error)
      return null
    }
  })

  const webResults = await Promise.all(webUrlPromises)

  // Combine and filter results
  const results = [...fileResults, ...webResults].filter((r) => r !== null)
  return results
}

/**
 * Uploads a file into a newly created vector store and records the result in
 * a `<file>.meta.json` file next to the original.
 *
 * @param file - Path of the file to vectorize; its lower-cased extension must
 *   be a key of `supported`.
 * @param options - Task options; `options.client` must be set.
 * @returns The id of the created vector store.
 * @throws Error when `options.client` is missing or the extension is not
 *   supported; any error from the client calls or from writing the meta file
 *   is logged and re-thrown.
 */
export async function vectorize(file: string, options: IKBotTask): Promise<string> {
  if (!options.client) {
    throw new Error('OpenAI client is required for vectorization')
  }

  const ext = path.extname(file).toLowerCase()
  if (!(ext in supported)) {
    throw new Error(`Unsupported file format: ${ext}. Supported formats: ${Object.keys(supported).join(', ')}`)
  }

  try {
    // Create a vector store
    const vectorStore = await options.client.vectorStores.create({
      name: path.basename(file)
    })

    // Upload file to vector store
    const fileStream = fs.createReadStream(file)
    try {
      await options.client.vectorStores.fileBatches.uploadAndPoll(vectorStore.id, {
        files: [fileStream]
      })
    } finally {
      // Release the file handle even when the upload fails; destroying an
      // already finished stream is a no-op.
      fileStream.destroy()
    }

    // Create meta file path by appending .meta.json to the original file path
    const metaPath = `${file}.meta.json`
    const metaData = {
      vectorStoreId: vectorStore.id,
      vectorizedAt: new Date().toISOString(),
      originalPath: file,
      mimeType: supported[ext]
    }

    // Write meta data to file
    fs.writeFileSync(metaPath, JSON.stringify(metaData, null, 2))

    return vectorStore.id
  } catch (error) {
    logger.error(`Failed to vectorize file ${file}:`, error)
    throw error
  }
}