import type { StructuredPatchHunk } from 'diff'
import { access, readFile } from 'fs/promises'
import { dirname, join, relative, sep } from 'path'
import { getCwd } from './cwd.js'
import { getCachedRepository } from './detectRepository.js'
import { execFileNoThrow, execFileNoThrowWithCwd } from './execFileNoThrow.js'
import { isFileWithinReadSizeLimit } from './file.js'
import {
  findGitRoot,
  getDefaultBranch,
  getGitDir,
  getIsGit,
  gitExe,
} from './git.js'

/** Aggregate totals for a diff. */
export type GitDiffStats = {
  filesCount: number
  linesAdded: number
  linesRemoved: number
}

/** Line counts for a single file; binary and untracked files report 0/0. */
export type PerFileStats = {
  added: number
  removed: number
  isBinary: boolean
  isUntracked?: boolean
}

/** Result of fetchGitDiff(): totals, per-file counts and (optionally) hunks. */
export type GitDiffResult = {
  stats: GitDiffStats
  perFileStats: Map<string, PerFileStats>
  hunks: Map<string, StructuredPatchHunk[]>
}

const GIT_TIMEOUT_MS = 5000
const MAX_FILES = 50
const MAX_DIFF_SIZE_BYTES = 1_000_000 // 1 MB - skip files larger than this
const MAX_LINES_PER_FILE = 400 // GitHub's auto-load limit
const MAX_FILES_FOR_DETAILS = 500 // Skip per-file details if more files than this

// Unified diff hunk header: @@ -oldStart,oldLines +newStart,newLines @@
// The ",lines" parts are optional and default to 1 when omitted.
const HUNK_HEADER_PATTERN = /^@@ -(\d+)(?:,(\d+))? \+(\d+)(?:,(\d+))? @@/

/**
 * Fetch git diff stats and hunks comparing working tree to HEAD.
 * Returns null if not in a git repo or if git commands fail.
 *
 * Returns null during merge/rebase/cherry-pick/revert operations since the
 * working tree contains incoming changes that weren't intentionally
 * made by the user.
 *
 * @returns Totals and per-file stats. `hunks` is always an empty map here;
 *   use fetchGitDiffHunks() to load hunks on demand.
 */
export async function fetchGitDiff(): Promise<GitDiffResult | null> {
  const isGit = await getIsGit()
  if (!isGit) return null

  // Skip diff calculation during transient git states since the
  // working tree contains incoming changes, not user-intentional edits
  if (await isInTransientGitState()) {
    return null
  }

  // Quick probe: use --shortstat to get totals without loading all content.
  // This is O(1) memory and lets us detect massive diffs (e.g., jj workspaces)
  // before committing to expensive operations.
  const { stdout: shortstatOut, code: shortstatCode } = await execFileNoThrow(
    gitExe(),
    ['--no-optional-locks', 'diff', 'HEAD', '--shortstat'],
    { timeout: GIT_TIMEOUT_MS, preserveOutputOnError: false },
  )

  if (shortstatCode === 0) {
    const quickStats = parseShortstat(shortstatOut)
    if (quickStats && quickStats.filesCount > MAX_FILES_FOR_DETAILS) {
      // Too many files - return accurate totals but skip per-file details
      // to avoid loading hundreds of MB into memory
      return {
        stats: quickStats,
        perFileStats: new Map(),
        hunks: new Map(),
      }
    }
  }

  // Get stats via --numstat (all uncommitted changes vs HEAD)
  const { stdout: numstatOut, code: numstatCode } = await execFileNoThrow(
    gitExe(),
    ['--no-optional-locks', 'diff', 'HEAD', '--numstat'],
    { timeout: GIT_TIMEOUT_MS, preserveOutputOnError: false },
  )

  if (numstatCode !== 0) return null

  const { stats, perFileStats } = parseGitNumstat(numstatOut)

  // Include untracked files (new files not yet staged)
  // Just filenames - no content reading for performance
  const remainingSlots = MAX_FILES - perFileStats.size
  if (remainingSlots > 0) {
    const untrackedStats = await fetchUntrackedFiles(remainingSlots)
    if (untrackedStats) {
      stats.filesCount += untrackedStats.size
      for (const [path, fileStats] of untrackedStats) {
        perFileStats.set(path, fileStats)
      }
    }
  }

  // Return stats only - hunks are fetched on-demand via fetchGitDiffHunks()
  // to avoid expensive git diff HEAD call on every poll
  return { stats, perFileStats, hunks: new Map() }
}

/**
 * Fetch git diff hunks on-demand (for DiffDialog).
 * Separated from fetchGitDiff() to avoid expensive calls during polling.
 *
 * @returns Hunks keyed by file path; an empty map when not in a git repo,
 *   during a transient git state, or when `git diff HEAD` fails.
 */
export async function fetchGitDiffHunks(): Promise<
  Map<string, StructuredPatchHunk[]>
> {
  const isGit = await getIsGit()
  if (!isGit) return new Map()

  if (await isInTransientGitState()) {
    return new Map()
  }

  const { stdout: diffOut, code: diffCode } = await execFileNoThrow(
    gitExe(),
    ['--no-optional-locks', 'diff', 'HEAD'],
    { timeout: GIT_TIMEOUT_MS, preserveOutputOnError: false },
  )

  if (diffCode !== 0) {
    return new Map()
  }

  return parseGitDiff(diffOut)
}

/** Return shape of parseGitNumstat(). */
export type NumstatResult = {
  stats: GitDiffStats
  perFileStats: Map<string, PerFileStats>
}

/**
 * Parse git diff --numstat output into stats.
 * Format: <added>\t<removed>\t<filename>
 * Binary files show '-' for counts.
 * Only stores first MAX_FILES entries in perFileStats; the totals in `stats`
 * still cover every valid line.
 *
 * @param stdout Raw output of `git diff --numstat`
 */
export function parseGitNumstat(stdout: string): NumstatResult {
  const lines = stdout.trim().split('\n').filter(Boolean)
  let added = 0
  let removed = 0
  let validFileCount = 0
  const perFileStats = new Map<string, PerFileStats>()

  for (const line of lines) {
    const parts = line.split('\t')
    // Valid numstat lines have at least 3 tab-separated parts:
    // added, removed, filename (the filename itself may contain tabs)
    if (parts.length < 3) continue

    validFileCount++
    const addStr = parts[0]
    const remStr = parts[1]
    const filePath = parts.slice(2).join('\t') // filename may contain tabs
    const isBinary = addStr === '-' || remStr === '-'
    const fileAdded = isBinary ? 0 : parseInt(addStr ?? '0', 10) || 0
    const fileRemoved = isBinary ? 0 : parseInt(remStr ?? '0', 10) || 0

    added += fileAdded
    removed += fileRemoved

    // Only store first MAX_FILES entries
    if (perFileStats.size < MAX_FILES) {
      perFileStats.set(filePath, {
        added: fileAdded,
        removed: fileRemoved,
        isBinary,
      })
    }
  }

  return {
    stats: {
      filesCount: validFileCount,
      linesAdded: added,
      linesRemoved: removed,
    },
    perFileStats,
  }
}

/**
 * Parse unified diff output into per-file hunks.
 * Splits by "diff --git" and parses each file's hunks.
 *
 * Applies limits:
 * - MAX_FILES: stop after this many files
 * - Files >1MB: skipped entirely (not in result map)
 * - Files ≤1MB: parsed but limited to MAX_LINES_PER_FILE lines
 *
 * Files that produce no hunks (e.g. binary or mode-only changes) are omitted.
 *
 * @param stdout Raw output of `git diff`
 * @returns Hunks keyed by the "b/" path from each file header
 */
export function parseGitDiff(
  stdout: string,
): Map<string, StructuredPatchHunk[]> {
  const result = new Map<string, StructuredPatchHunk[]>()
  if (!stdout.trim()) return result

  // Split by file diffs
  const fileDiffs = stdout.split(/^diff --git /m).filter(Boolean)

  for (const fileDiff of fileDiffs) {
    // Stop after MAX_FILES
    if (result.size >= MAX_FILES) break

    // Skip files larger than 1MB
    if (fileDiff.length > MAX_DIFF_SIZE_BYTES) {
      continue
    }

    const lines = fileDiff.split('\n')

    // Extract filename from first line: "a/path/to/file b/path/to/file"
    const headerMatch = lines[0]?.match(/^a\/(.+?) b\/(.+)$/)
    if (!headerMatch) continue
    const filePath = headerMatch[2] ?? headerMatch[1] ?? ''

    // Find and parse hunks
    const fileHunks: StructuredPatchHunk[] = []
    let currentHunk: StructuredPatchHunk | null = null
    let lineCount = 0

    for (let i = 1; i < lines.length; i++) {
      const line = lines[i] ?? ''

      const hunkMatch = line.match(HUNK_HEADER_PATTERN)
      if (hunkMatch) {
        if (currentHunk) {
          fileHunks.push(currentHunk)
        }
        currentHunk = {
          oldStart: parseInt(hunkMatch[1] ?? '0', 10),
          oldLines: parseInt(hunkMatch[2] ?? '1', 10),
          newStart: parseInt(hunkMatch[3] ?? '0', 10),
          newLines: parseInt(hunkMatch[4] ?? '1', 10),
          lines: [],
        }
        continue
      }

      // Everything before the first hunk header is file-level metadata
      // ("index ...", "--- a/...", "+++ b/...", "new file", "deleted file",
      // "old mode", "new mode", "Binary files ...") and is skipped.
      // Metadata never appears after a hunk has started, so once inside a
      // hunk a line beginning with '---' or '+++' is real content (e.g. a
      // removed "-- comment" or an added "++i") and must be kept.
      if (!currentHunk) continue

      // Add diff lines to current hunk (with line limit)
      if (
        line.startsWith('+') ||
        line.startsWith('-') ||
        line.startsWith(' ') ||
        line === ''
      ) {
        // Stop adding lines once we hit the limit
        if (lineCount >= MAX_LINES_PER_FILE) {
          continue
        }
        // Force a flat string copy to break V8 sliced string references.
        // When split() creates lines, V8 creates "sliced strings" that reference
        // the parent. This keeps the entire parent string (~MBs) alive as long as
        // any line is retained. Using '' + line forces a new flat string allocation,
        // unlike slice(0) which V8 may optimize to return the same reference.
        currentHunk.lines.push('' + line)
        lineCount++
      }
    }

    // Don't forget the last hunk
    if (currentHunk) {
      fileHunks.push(currentHunk)
    }

    if (fileHunks.length > 0) {
      result.set(filePath, fileHunks)
    }
  }

  return result
}

/**
 * Check if we're in a transient git state (merge, rebase, cherry-pick, or revert).
 * During these operations, we skip diff calculation since the working
 * tree contains incoming changes that weren't intentionally made.
 *
 * Uses fs.access to check for transient ref files, avoiding process spawns.
 *
 * @returns true when any of the marker files exists in the git dir; false
 *   when none exist or the git dir cannot be determined.
 */
async function isInTransientGitState(): Promise<boolean> {
  const gitDir = await getGitDir(getCwd())
  if (!gitDir) return false

  const transientFiles = [
    'MERGE_HEAD',
    'REBASE_HEAD',
    'CHERRY_PICK_HEAD',
    'REVERT_HEAD',
  ]

  // access() rejects when the file is missing; map that to false
  const results = await Promise.all(
    transientFiles.map(file =>
      access(join(gitDir, file))
        .then(() => true)
        .catch(() => false),
    ),
  )
  return results.some(Boolean)
}

/**
 * Fetch untracked file names (no content reading).
 * Returns file paths only - they'll be displayed with a note to stage them.
 *
 * @param maxFiles Maximum number of untracked files to include
 * @returns Placeholder stats (0 added / 0 removed, isUntracked) keyed by path,
 *   or null when git fails or there are no untracked files.
 */
async function fetchUntrackedFiles(
  maxFiles: number,
): Promise<Map<string, PerFileStats> | null> {
  // Get list of untracked files (excludes gitignored)
  const { stdout, code } = await execFileNoThrow(
    gitExe(),
    ['--no-optional-locks', 'ls-files', '--others', '--exclude-standard'],
    { timeout: GIT_TIMEOUT_MS, preserveOutputOnError: false },
  )

  if (code !== 0 || !stdout.trim()) return null

  const untrackedPaths = stdout.trim().split('\n').filter(Boolean)
  if (untrackedPaths.length === 0) return null

  const perFileStats = new Map<string, PerFileStats>()

  // Just record filenames, no content reading
  for (const filePath of untrackedPaths.slice(0, maxFiles)) {
    perFileStats.set(filePath, {
      added: 0,
      removed: 0,
      isBinary: false,
      isUntracked: true,
    })
  }

  return perFileStats
}

/**
 * Parse git diff --shortstat output into stats.
 * Format: " 1648 files changed, 52341 insertions(+), 8123 deletions(-)"
 *
 * This is O(1) memory regardless of diff size - git computes totals without
 * loading all content. Used as a quick probe before expensive operations.
 *
 * @param stdout Raw output of `git diff --shortstat`
 * @returns Parsed totals (missing insertion/deletion parts count as 0), or
 *   null when the output does not contain a "N file(s) changed" summary.
 */
export function parseShortstat(stdout: string): GitDiffStats | null {
  // Match: "N files changed" with optional ", N insertions(+)" and ", N deletions(-)"
  const match = stdout.match(
    /(\d+)\s+files?\s+changed(?:,\s+(\d+)\s+insertions?\(\+\))?(?:,\s+(\d+)\s+deletions?\(-\))?/,
  )
  if (!match) return null
  return {
    filesCount: parseInt(match[1] ?? '0', 10),
    linesAdded: parseInt(match[2] ?? '0', 10),
    linesRemoved: parseInt(match[3] ?? '0', 10),
  }
}

const SINGLE_FILE_DIFF_TIMEOUT_MS = 3000

/** Structured single-file diff, shaped like a PR file entry. */
export type ToolUseDiff = {
  filename: string
  status: 'modified' | 'added'
  additions: number
  deletions: number
  changes: number
  patch: string
  /** GitHub "owner/repo" when available (null for non-github.com or unknown repos) */
  repository: string | null
}

/**
 * Fetch a structured diff for a single file against the merge base with the
 * default branch. This produces a PR-like diff showing all changes since
 * the branch diverged. Falls back to diffing against HEAD if the merge base
 * cannot be determined (e.g., on the default branch itself).
 * For untracked files, generates a synthetic diff showing all additions.
 * Returns null if not in a git repo or if git commands fail.
 *
 * @param absoluteFilePath Absolute path of the file to diff
 * @returns The diff, or null when the file is tracked but unchanged (empty
 *   git output) or when no diff could be produced.
 */
export async function fetchSingleFileGitDiff(
  absoluteFilePath: string,
): Promise<ToolUseDiff | null> {
  const gitRoot = findGitRoot(dirname(absoluteFilePath))
  if (!gitRoot) return null

  // Git expects forward slashes regardless of the platform separator
  const gitPath = relative(gitRoot, absoluteFilePath).split(sep).join('/')
  const repository = getCachedRepository()

  // Check if the file is tracked by git
  const { code: lsFilesCode } = await execFileNoThrowWithCwd(
    gitExe(),
    ['--no-optional-locks', 'ls-files', '--error-unmatch', gitPath],
    { cwd: gitRoot, timeout: SINGLE_FILE_DIFF_TIMEOUT_MS },
  )

  if (lsFilesCode === 0) {
    // File is tracked - diff against merge base for PR-like view
    const diffRef = await getDiffRef(gitRoot)
    const { stdout, code } = await execFileNoThrowWithCwd(
      gitExe(),
      ['--no-optional-locks', 'diff', diffRef, '--', gitPath],
      { cwd: gitRoot, timeout: SINGLE_FILE_DIFF_TIMEOUT_MS },
    )
    if (code !== 0) return null
    if (!stdout) return null
    return {
      ...parseRawDiffToToolUseDiff(gitPath, stdout, 'modified'),
      repository,
    }
  }

  // File is untracked - generate synthetic diff
  const syntheticDiff = await generateSyntheticDiff(gitPath, absoluteFilePath)
  if (!syntheticDiff) return null
  return { ...syntheticDiff, repository }
}

/**
 * Parse raw unified diff output into the structured ToolUseDiff format.
 * Extracts only the hunk content (starting from @@) as the patch,
 * and counts additions/deletions.
 *
 * @param filename Path to report in the result
 * @param rawDiff Raw `git diff` output for the file
 * @param status Status to report in the result
 */
function parseRawDiffToToolUseDiff(
  filename: string,
  rawDiff: string,
  status: 'modified' | 'added',
): Omit<ToolUseDiff, 'repository'> {
  const lines = rawDiff.split('\n')
  const patchLines: string[] = []
  // True once the first hunk header has been seen; everything from there on
  // is part of the returned patch text.
  let patchStarted = false
  // True while inside a hunk body. Cleared by a subsequent "diff --git"
  // header so that the "---"/"+++" file header lines that follow it are not
  // mistaken for removed/added content.
  let inHunkBody = false
  let additions = 0
  let deletions = 0

  for (const line of lines) {
    if (line.startsWith('@@')) {
      patchStarted = true
      inHunkBody = true
    } else if (line.startsWith('diff --git ')) {
      inHunkBody = false
    }

    if (!patchStarted) continue
    patchLines.push(line)

    if (!inHunkBody) continue
    // Inside a hunk every '+'/'-' line is content, including lines whose
    // content itself starts with '++' or '--' (shown as '+++...'/'---...').
    if (line.startsWith('+')) {
      additions++
    } else if (line.startsWith('-')) {
      deletions++
    }
  }

  return {
    filename,
    status,
    additions,
    deletions,
    changes: additions + deletions,
    patch: patchLines.join('\n'),
  }
}

/**
 * Determine the best ref to diff against for a PR-like diff.
 * Priority:
 * 1. CLAUDE_CODE_BASE_REF env var (set externally, e.g. by CCR managed containers)
 * 2. Merge base with the default branch (best guess)
 * 3. HEAD (fallback if merge-base fails)
 *
 * @param gitRoot Repository root used as the working directory for git
 * @returns The merge-base commit of HEAD and the base branch, or 'HEAD'
 */
async function getDiffRef(gitRoot: string): Promise<string> {
  const baseBranch =
    process.env.CLAUDE_CODE_BASE_REF || (await getDefaultBranch())
  const { stdout, code } = await execFileNoThrowWithCwd(
    gitExe(),
    ['--no-optional-locks', 'merge-base', 'HEAD', baseBranch],
    { cwd: gitRoot, timeout: SINGLE_FILE_DIFF_TIMEOUT_MS },
  )
  if (code === 0 && stdout.trim()) {
    return stdout.trim()
  }
  return 'HEAD'
}

/**
 * Build an all-additions diff for an untracked file by reading its content.
 *
 * @param gitPath Repo-relative path to report as the filename
 * @param absoluteFilePath Absolute path of the file to read
 * @returns A diff with status 'added' where every line is an addition, or
 *   null when the size check fails or the file cannot be read.
 */
async function generateSyntheticDiff(
  gitPath: string,
  absoluteFilePath: string,
): Promise<Omit<ToolUseDiff, 'repository'> | null> {
  try {
    // NOTE(review): used synchronously here; confirm isFileWithinReadSizeLimit
    // does not return a Promise.
    if (!isFileWithinReadSizeLimit(absoluteFilePath, MAX_DIFF_SIZE_BYTES)) {
      return null
    }
    const content = await readFile(absoluteFilePath, 'utf-8')
    const lines = content.split('\n')
    // Remove trailing empty line from split if file ends with newline
    if (lines.length > 0 && lines.at(-1) === '') {
      lines.pop()
    }
    const lineCount = lines.length
    const addedLines = lines.map(line => `+${line}`).join('\n')
    const patch = `@@ -0,0 +1,${lineCount} @@\n${addedLines}`
    return {
      filename: gitPath,
      status: 'added',
      additions: lineCount,
      deletions: 0,
      changes: lineCount,
      patch,
    }
  } catch {
    // Best-effort: any read failure means no diff is available
    return null
  }
}