mono/packages/kbot/src/source.ts

import * as path from 'node:path'
import * as fs from 'node:fs'
import { sync as read } from '@polymech/fs/read'
import { sync as dir } from '@polymech/fs/dir'
import { createItem as toNode } from '@polymech/fs/inspect'
import { sync as exists } from '@polymech/fs/exists'
import { isFile, forward_slash } from '@polymech/commons'
import { logger } from './index.js'
import { lookup } from 'mime-types'
import { globSync } from 'glob'
import { EXCLUDE_GLOB, MAX_FILE_SIZE } from './constants.js'
import { defaultMimeRegistry, IHandlerResult } from './mime-handlers.js'
import { ChatCompletionContentPartImage } from 'openai/resources/index.mjs'
import { IKBotTask, ICollector } from '@polymech/ai-tools'
import { supported } from './commands/run-assistant.js'
import { handleWebUrl } from './http.js'
/**
 * @todos
 * - add support for vector stores: https://platform.openai.com/docs/assistants/tools/file-search?lang=node.js
 */
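/**
 * Default predicates applied to every candidate path: it must be a regular
 * file, it must exist, and it must be smaller than MAX_FILE_SIZE.
 */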
export const default_filters = {
  isFile,
  exists,
  size: (filePath: string) => toNode(filePath).size < MAX_FILE_SIZE,
};
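/**
 * True when `childPath` resolves to a location strictly inside `parentPath`
 * (identical paths are not considered "inside").
 */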
const isPathInside = (childPath: string, parentPath: string): boolean => {
  const relation = path.relative(parentPath, childPath);
  // An empty relation means the two paths are identical; a relation starting
  // with '..' or an absolute relation (e.g. another drive on Windows) points
  // outside the parent.
  return Boolean(
    relation &&
    !relation.startsWith('..') &&
    !path.isAbsolute(relation)
  );
};
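/**
 * Symlink-aware containment check: resolves both paths with fs.realpathSync
 * before testing, so a symlink under `pathB` that points elsewhere is still
 * reported as outside. Throws if either path does not exist.
 */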
export const isPathOutsideSafe = (pathA: string, pathB: string): boolean => {
  const realA = fs.realpathSync(pathA);
  const realB = fs.realpathSync(pathB);
  return !isPathInside(realA, realB);
};
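/**
 * Read a file and return it as a `data:<mime>;base64,...` URL, or null when
 * the file cannot be read or its MIME type cannot be determined.
 */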
export const base64 = (filePath: string): string | null => {
  try {
    const fileBuffer = fs.readFileSync(filePath);
    const mimeType = lookup(filePath);
    if (!mimeType) {
      throw new Error('Unable to determine MIME type.');
    }
    const base64Data = fileBuffer.toString('base64');
    return `data:${mimeType};base64,${base64Data}`;
  } catch (error) {
    logger.error('base64: Error reading file:', error);
    return null;
  }
};
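/**
 * Map local image files to OpenAI `image_url` content parts, embedding each
 * file as a base64 data URL. Files that cannot be converted are skipped.
 */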
export const images = (files: string[]): ChatCompletionContentPartImage[] => {
  return files
    .map((f) => base64(f))
    // base64() returns null on failure; drop those entries instead of
    // sending an invalid image_url to the API.
    .filter((url): url is string => url !== null)
    .map((url) => ({
      type: 'image_url' as const,
      image_url: { url }
    }))
}
/**
 * Check if a string is a web URL
 */
export const isWebUrl = (str: string): boolean => {
  return /^https?:\/\//.test(str);
}
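/**
 * Resolve include/exclude patterns relative to `projectPath` into a set of
 * absolute file paths plus any web URLs found among the include patterns.
 * Absolute include paths outside the project bypass the glob and are passed
 * through as-is; all results are filtered through `default_filters`.
 *
 * @example
 * // hypothetical usage: project-relative globs mixed with a URL
 * const { files, webUrls } = glob('/repo', ['src/*.ts', 'https://example.com/spec.html'])
 */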
export const glob = (
  projectPath: string,
  include: string[] = [],
  exclude: string[] = []
): { files: string[], webUrls: Set<string> } => {
  if (!exists(projectPath)) {
    dir(projectPath)
    return { files: [], webUrls: new Set<string>() }
  }
  const filters = new Set<string>()
  const absolutePaths = new Set<string>()
  const webUrls = new Set<string>()
  const ignorePatterns = new Set<string>(EXCLUDE_GLOB)
  include.forEach(pattern => {
    // Web URLs are collected separately and fetched later.
    if (isWebUrl(pattern)) {
      webUrls.add(pattern)
      return
    }
    if (path.isAbsolute(pattern)) {
      // Absolute paths inside the project are globbed like relative patterns;
      // absolute paths outside the project skip the glob and are used directly.
      if (isPathInside(pattern, projectPath)) {
        filters.add(pattern)
      } else {
        absolutePaths.add(pattern)
      }
    } else {
      filters.add(pattern)
    }
  })
  // Process exclude patterns
  exclude.forEach(pattern => {
    if (isWebUrl(pattern)) {
      // Web URLs are typically not "excluded" in a file glob sense,
      // so URLs appearing in exclude are ignored for globbing.
      return;
    }
    // Add all exclude patterns (absolute or relative) to ignorePatterns;
    // globSync handles absolute paths correctly in its `ignore` option when `cwd` is set.
    ignorePatterns.add(pattern);
  });
  const globFiles = globSync([...filters], {
    cwd: projectPath,
    absolute: false,
    ignore: [...ignorePatterns]
  })
  const allFiles = Array.from(new Set([
    ...globFiles.map(file => path.join(projectPath, file)),
    ...Array.from(absolutePaths)
  ]))
  // Keep only paths that pass every default filter (existing regular files under the size limit).
  const files = allFiles.filter((f) =>
    Object.values(default_filters).every((filter) => filter(f))
  )
  return { files, webUrls }
}
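/**
 * Collect handler results for a task: glob the project for matching files,
 * dispatch each file to the handler registered for its MIME type (falling
 * back to the `text/*` handler), and fetch any included web URLs via
 * handleWebUrl. Failed files and URLs are logged and skipped.
 */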
export async function get(
  projectPath: string,
  include: string[] = [],
  options: IKBotTask
): Promise<Array<IHandlerResult>> {
  const { files, webUrls } = glob(projectPath, include, options.exclude)
  // Process file contents
  const fileResults = files.map((fullPath) => {
    try {
      const relativePath = forward_slash(path.relative(projectPath, fullPath))
      if (isFile(fullPath) && exists(fullPath)) {
        const mimeType = lookup(fullPath) || 'text/plain'
        const handler = defaultMimeRegistry.getHandler(mimeType)
        if (handler) {
          return handler.handle(fullPath, relativePath)
        }
        // No specific handler registered: fall back to the generic text handler.
        return defaultMimeRegistry.getHandler('text/*')?.handle(fullPath, relativePath) || null
      }
      return null
    } catch (error) {
      logger.error(`Error reading file ${fullPath}:`, error)
      return null
    }
  })
  // Process web URLs
  const webUrlPromises = Array.from(webUrls).map(async (url: string) => {
    try {
      return await handleWebUrl(url)
    } catch (error) {
      logger.error(`Error processing web URL ${url}:`, error)
      return null
    }
  })
  const webResults = await Promise.all(webUrlPromises)
  // Combine and drop entries that failed or had no handler.
  const results = [...fileResults, ...webResults].filter(
    (r): r is IHandlerResult => r !== null
  )
  return results
}
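/**
 * Upload a single file to a newly created OpenAI vector store and write a
 * `<file>.meta.json` sidecar recording the store id, timestamp, original path
 * and MIME type. Requires `options.client` and a file extension listed in
 * `supported`. Returns the vector store id.
 */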
export async function vectorize(file: string, options: IKBotTask): Promise<string> {
  if (!options.client) {
    throw new Error('OpenAI client is required for vectorization')
  }
  const ext = path.extname(file).toLowerCase()
  if (!(ext in supported)) {
    throw new Error(`Unsupported file format: ${ext}. Supported formats: ${Object.keys(supported).join(', ')}`)
  }
  try {
    // Create a vector store named after the file
    const vectorStore = await options.client.vectorStores.create({
      name: path.basename(file)
    })
    // Upload the file to the vector store and wait until processing completes
    const fileStream = fs.createReadStream(file)
    await options.client.vectorStores.fileBatches.uploadAndPoll(vectorStore.id, {
      files: [fileStream]
    })
    // Create the meta file path by appending .meta.json to the original file path
    const metaPath = `${file}.meta.json`
    const metaData = {
      vectorStoreId: vectorStore.id,
      vectorizedAt: new Date().toISOString(),
      originalPath: file,
      mimeType: supported[ext]
    }
    // Write meta data to file
    fs.writeFileSync(metaPath, JSON.stringify(metaData, null, 2))
    return vectorStore.id
  } catch (error) {
    logger.error(`Failed to vectorize file ${file}:`, error)
    throw error
  }
}