import { randomUUID } from 'crypto'
import { mkdir, readdir, readFile } from 'fs/promises'
import { join } from 'path'
import {
  PDF_MAX_EXTRACT_SIZE,
  PDF_TARGET_RAW_SIZE,
} from '../constants/apiLimits.js'
import { errorMessage } from './errors.js'
import { execFileNoThrow } from './execFileNoThrow.js'
import { formatFileSize } from './format.js'
import { getFsImplementation } from './fsOperations.js'
import { getToolResultsDir } from './toolResultStorage.js'

/**
 * Structured failure returned by the PDF helpers in this module.
 * `reason` is a machine-readable category; `message` is human-readable text.
 */
export type PDFError = {
  reason:
    | 'empty'
    | 'too_large'
    | 'password_protected'
    | 'corrupted'
    | 'unknown'
    | 'unavailable'
  message: string
}

/**
 * Result of a PDF operation: either `success: true` with a payload of type
 * `T`, or `success: false` with a {@link PDFError}.
 */
export type PDFResult<T> =
  | { success: true; data: T }
  | { success: false; error: PDFError }

/**
 * Read a PDF file and return it as base64-encoded data.
 *
 * Failures are reported through the result rather than thrown: an empty file
 * yields `empty`, a file larger than `PDF_TARGET_RAW_SIZE` yields `too_large`,
 * a file that does not start with `%PDF-` yields `corrupted`, and any thrown
 * exception is reported as `unknown`.
 *
 * @param filePath Path to the PDF file
 * @returns Result containing PDF data or a structured error
 */
export async function readPDF(filePath: string): Promise<
  PDFResult<{
    type: 'pdf'
    file: {
      filePath: string
      base64: string
      originalSize: number
    }
  }>
> {
  try {
    const fs = getFsImplementation()
    const stats = await fs.stat(filePath)
    const originalSize = stats.size

    // Check if file is empty
    if (originalSize === 0) {
      return {
        success: false,
        error: { reason: 'empty', message: `PDF file is empty: ${filePath}` },
      }
    }

    // Check if PDF exceeds maximum size
    // The API has a 32MB total request limit. After base64 encoding (~33% larger),
    // a PDF must be under ~20MB raw to leave room for conversation context.
    if (originalSize > PDF_TARGET_RAW_SIZE) {
      return {
        success: false,
        error: {
          reason: 'too_large',
          message: `PDF file exceeds maximum allowed size of ${formatFileSize(PDF_TARGET_RAW_SIZE)}.`,
        },
      }
    }

    const fileBuffer = await readFile(filePath)

    // Validate PDF magic bytes — reject files that aren't actually PDFs
    // (e.g., HTML files renamed to .pdf) before they enter conversation context.
    // Once an invalid PDF document block is in the message history, every subsequent
    // API call fails with 400 "The PDF specified was not valid" and the session
    // becomes unrecoverable without /clear.
    const header = fileBuffer.subarray(0, 5).toString('ascii')
    if (!header.startsWith('%PDF-')) {
      return {
        success: false,
        error: {
          reason: 'corrupted',
          message: `File is not a valid PDF (missing %PDF- header): ${filePath}`,
        },
      }
    }

    const base64 = fileBuffer.toString('base64')

    // Note: We cannot check page count here without parsing the PDF
    // The API will enforce the 100-page limit and return an error if exceeded

    return {
      success: true,
      data: {
        type: 'pdf',
        file: {
          filePath,
          base64,
          originalSize,
        },
      },
    }
  } catch (e: unknown) {
    return {
      success: false,
      error: {
        reason: 'unknown',
        message: errorMessage(e),
      },
    }
  }
}

/**
 * Get the number of pages in a PDF file using `pdfinfo` (from poppler-utils).
 * Returns `null` if pdfinfo is not available or if the page count cannot be determined.
 *
 * @param filePath Path to the PDF file
 * @returns The page count parsed from the `Pages:` line of pdfinfo's stdout,
 *   or `null` when the command exits non-zero or no such line is found
 */
export async function getPDFPageCount(
  filePath: string,
): Promise<number | null> {
  const { code, stdout } = await execFileNoThrow('pdfinfo', [filePath], {
    timeout: 10_000,
    useCwd: false,
  })
  if (code !== 0) {
    return null
  }

  // The `m` flag lets `^` match at the start of any line of the output.
  const digits = /^Pages:\s+(\d+)/m.exec(stdout)?.[1]
  if (digits === undefined) {
    return null
  }

  const count = Number.parseInt(digits, 10)
  return Number.isNaN(count) ? null : count
}

/**
 * Successful payload of {@link extractPDFPages}: where the rendered page
 * images were written and how many were produced.
 */
export type PDFExtractPagesResult = {
  type: 'parts'
  file: {
    filePath: string
    originalSize: number
    count: number
    outputDir: string
  }
}

// Cached result of the pdftoppm probe; `undefined` means "not probed yet".
let pdftoppmAvailable: boolean | undefined

/**
 * Reset the pdftoppm availability cache. Used by tests only.
 */
export function resetPdftoppmCache(): void {
  pdftoppmAvailable = undefined
}

/**
 * Check whether the `pdftoppm` binary (from poppler-utils) is available.
 * The result is cached for the lifetime of the process.
 *
 * @returns `true` when `pdftoppm -v` exits 0 or writes anything to stderr
 */
export async function isPdftoppmAvailable(): Promise<boolean> {
  if (pdftoppmAvailable !== undefined) return pdftoppmAvailable

  const { code, stderr } = await execFileNoThrow('pdftoppm', ['-v'], {
    timeout: 5000,
    useCwd: false,
  })

  // pdftoppm prints version info to stderr and exits 0 (or sometimes 99 on older versions)
  pdftoppmAvailable = code === 0 || stderr.length > 0
  return pdftoppmAvailable
}

/**
 * Map the stderr of a failed pdftoppm run onto a structured error.
 * The password check runs first, then the damaged/corrupt/invalid check;
 * anything else is reported as `unknown` with the raw stderr attached.
 */
function classifyPdftoppmFailure(stderr: string): PDFError {
  if (/password/i.test(stderr)) {
    return {
      reason: 'password_protected',
      message:
        'PDF is password-protected. Please provide an unprotected version.',
    }
  }
  if (/damaged|corrupt|invalid/i.test(stderr)) {
    return {
      reason: 'corrupted',
      message: 'PDF file is corrupted or invalid.',
    }
  }
  return { reason: 'unknown', message: `pdftoppm failed: ${stderr}` }
}

/**
 * Extract PDF pages as JPEG images using pdftoppm.
 * Produces page-01.jpg, page-02.jpg, etc. in an output directory.
 * This enables reading large PDFs and works with all API providers.
 *
 * Failures are reported through the result rather than thrown; any thrown
 * exception is reported with reason `unknown`.
 *
 * @param filePath Path to the PDF file
 * @param options Optional page range (1-indexed, inclusive)
 * @returns On success, the freshly created output directory and the number of
 *   `.jpg` files found in it; otherwise a structured error
 */
export async function extractPDFPages(
  filePath: string,
  options?: { firstPage?: number; lastPage?: number },
): Promise<PDFResult<PDFExtractPagesResult>> {
  try {
    const fs = getFsImplementation()
    const stats = await fs.stat(filePath)
    const originalSize = stats.size

    if (originalSize === 0) {
      return {
        success: false,
        error: { reason: 'empty', message: `PDF file is empty: ${filePath}` },
      }
    }

    if (originalSize > PDF_MAX_EXTRACT_SIZE) {
      return {
        success: false,
        error: {
          reason: 'too_large',
          message: `PDF file exceeds maximum allowed size for text extraction (${formatFileSize(PDF_MAX_EXTRACT_SIZE)}).`,
        },
      }
    }

    const available = await isPdftoppmAvailable()
    if (!available) {
      return {
        success: false,
        error: {
          reason: 'unavailable',
          message:
            'pdftoppm is not installed. Install poppler-utils (e.g. `brew install poppler` or `apt-get install poppler-utils`) to enable PDF page rendering.',
        },
      }
    }

    // Each call renders into its own uniquely named directory.
    const uuid = randomUUID()
    const outputDir = join(getToolResultsDir(), `pdf-${uuid}`)
    await mkdir(outputDir, { recursive: true })

    // pdftoppm produces files like <prefix>-01.jpg, <prefix>-02.jpg, etc.
    const prefix = join(outputDir, 'page')
    const args = ['-jpeg', '-r', '100']
    // Truthiness checks: an omitted (or 0) bound is simply not passed on.
    if (options?.firstPage) {
      args.push('-f', String(options.firstPage))
    }
    // An Infinity upper bound is treated as "no upper bound" and not passed on.
    if (options?.lastPage && options.lastPage !== Infinity) {
      args.push('-l', String(options.lastPage))
    }
    args.push(filePath, prefix)

    const { code, stderr } = await execFileNoThrow('pdftoppm', args, {
      timeout: 120_000,
      useCwd: false,
    })

    if (code !== 0) {
      return { success: false, error: classifyPdftoppmFailure(stderr) }
    }

    // Count the generated image files; only the total is reported to callers.
    const entries = await readdir(outputDir)
    const count = entries.filter(f => f.endsWith('.jpg')).length
    if (count === 0) {
      return {
        success: false,
        error: {
          reason: 'corrupted',
          message: 'pdftoppm produced no output pages. The PDF may be invalid.',
        },
      }
    }

    return {
      success: true,
      data: {
        type: 'parts',
        file: {
          filePath,
          originalSize,
          outputDir,
          count,
        },
      },
    }
  } catch (e: unknown) {
    return {
      success: false,
      error: {
        reason: 'unknown',
        message: errorMessage(e),
      },
    }
  }
}