1121 lines
29 KiB
TypeScript
1121 lines
29 KiB
TypeScript
/**
|
||
* Perfetto Tracing for Claude Code (Ant-only)
|
||
*
|
||
* This module generates traces in the Chrome Trace Event format that can be
|
||
* viewed in ui.perfetto.dev or Chrome's chrome://tracing.
|
||
*
|
||
* NOTE: This feature is ant-only and eliminated from external builds.
|
||
*
|
||
* The trace file includes:
|
||
* - Agent hierarchy (parent-child relationships in a swarm)
|
||
* - API requests with TTFT, TTLT, prompt length, cache stats, msg ID, speculative flag
|
||
* - Tool executions with name, duration, and token usage
|
||
* - User input waiting time
|
||
*
|
||
* Usage:
|
||
* 1. Enable via CLAUDE_CODE_PERFETTO_TRACE=1 or CLAUDE_CODE_PERFETTO_TRACE=<path>
|
||
* 2. Optionally set CLAUDE_CODE_PERFETTO_WRITE_INTERVAL_S=<positive integer> to write the
|
||
* trace file periodically (default: write only on exit).
|
||
* 3. Run Claude Code normally
|
||
* 4. Trace file is written to ~/.claude/traces/trace-<session-id>.json
|
||
* or to the specified path
|
||
* 5. Open in ui.perfetto.dev to visualize
|
||
*/
|
||
|
||
import { feature } from 'bun:bundle'
|
||
import { mkdirSync, writeFileSync } from 'fs'
|
||
import { mkdir, writeFile } from 'fs/promises'
|
||
import { dirname, join } from 'path'
|
||
import { getSessionId } from '../../bootstrap/state.js'
|
||
import { registerCleanup } from '../cleanupRegistry.js'
|
||
import { logForDebugging } from '../debug.js'
|
||
import {
|
||
getClaudeConfigHomeDir,
|
||
isEnvDefinedFalsy,
|
||
isEnvTruthy,
|
||
} from '../envUtils.js'
|
||
import { errorMessage } from '../errors.js'
|
||
import { djb2Hash } from '../hash.js'
|
||
import { jsonStringify } from '../slowOperations.js'
|
||
import { getAgentId, getAgentName, getParentSessionId } from '../teammate.js'
|
||
|
||
/**
|
||
* Chrome Trace Event format types
|
||
* See: https://docs.google.com/document/d/1CvAClvFfyA5R-PhYUmn5OOQtYMH4h6I0nSsKchNAySU
|
||
*/
|
||
|
||
export type TraceEventPhase =
|
||
| 'B' // Begin duration event
|
||
| 'E' // End duration event
|
||
| 'X' // Complete event (with duration)
|
||
| 'i' // Instant event
|
||
| 'C' // Counter event
|
||
| 'b' // Async begin
|
||
| 'n' // Async instant
|
||
| 'e' // Async end
|
||
| 'M' // Metadata event
|
||
|
||
export type TraceEvent = {
|
||
name: string
|
||
cat: string
|
||
ph: TraceEventPhase
|
||
ts: number // Timestamp in microseconds
|
||
pid: number // Process ID (we use 1 for main, agent IDs for subagents)
|
||
tid: number // Thread ID (we use numeric hash of agent name or 1 for main)
|
||
dur?: number // Duration in microseconds (for 'X' events)
|
||
args?: Record<string, unknown>
|
||
id?: string // For async events
|
||
scope?: string
|
||
}
|
||
|
||
/**
|
||
* Agent info for tracking hierarchy
|
||
*/
|
||
type AgentInfo = {
|
||
agentId: string
|
||
agentName: string
|
||
parentAgentId?: string
|
||
processId: number
|
||
threadId: number
|
||
}
|
||
|
||
/**
|
||
* Pending span for tracking begin/end pairs
|
||
*/
|
||
type PendingSpan = {
|
||
name: string
|
||
category: string
|
||
startTime: number
|
||
agentInfo: AgentInfo
|
||
args: Record<string, unknown>
|
||
}
|
||
|
||
// Global state for the Perfetto tracer
|
||
let isEnabled = false
|
||
let tracePath: string | null = null
|
||
// Metadata events (ph: 'M' — process/thread names, parent links) are kept
|
||
// separate so they survive eviction — Perfetto UI needs them to label
|
||
// tracks. Bounded by agent count (~3 events per agent).
|
||
const metadataEvents: TraceEvent[] = []
|
||
const events: TraceEvent[] = []
|
||
// events[] cap. Cron-driven sessions run for days; 22 push sites × many
|
||
// turns would otherwise grow unboundedly (periodicWrite flushes to disk but
|
||
// does not truncate — it writes the full snapshot). At ~300B/event this is
|
||
// ~30MB, enough trace history for any debugging session. Eviction drops the
|
||
// oldest half when hit, amortized O(1).
|
||
const MAX_EVENTS = 100_000
|
||
const pendingSpans = new Map<string, PendingSpan>()
|
||
const agentRegistry = new Map<string, AgentInfo>()
|
||
let totalAgentCount = 0
|
||
let startTimeMs = 0
|
||
let spanIdCounter = 0
|
||
let traceWritten = false // Flag to avoid double writes
|
||
|
||
// Map agent IDs to numeric process IDs (Perfetto requires numeric IDs)
|
||
let processIdCounter = 1
|
||
const agentIdToProcessId = new Map<string, number>()
|
||
|
||
// Periodic write interval handle
|
||
let writeIntervalId: ReturnType<typeof setInterval> | null = null
|
||
|
||
const STALE_SPAN_TTL_MS = 30 * 60 * 1000 // 30 minutes
|
||
const STALE_SPAN_CLEANUP_INTERVAL_MS = 60 * 1000 // 1 minute
|
||
let staleSpanCleanupId: ReturnType<typeof setInterval> | null = null
|
||
|
||
/**
|
||
* Convert a string to a numeric hash for use as thread ID
|
||
*/
|
||
function stringToNumericHash(str: string): number {
|
||
return Math.abs(djb2Hash(str)) || 1 // Ensure non-zero
|
||
}
|
||
|
||
/**
|
||
* Get or create a numeric process ID for an agent
|
||
*/
|
||
function getProcessIdForAgent(agentId: string): number {
|
||
const existing = agentIdToProcessId.get(agentId)
|
||
if (existing !== undefined) return existing
|
||
|
||
processIdCounter++
|
||
agentIdToProcessId.set(agentId, processIdCounter)
|
||
return processIdCounter
|
||
}
|
||
|
||
/**
|
||
* Get current agent info
|
||
*/
|
||
function getCurrentAgentInfo(): AgentInfo {
|
||
const agentId = getAgentId() ?? getSessionId()
|
||
const agentName = getAgentName() ?? 'main'
|
||
const parentSessionId = getParentSessionId()
|
||
|
||
// Check if we've already registered this agent
|
||
const existing = agentRegistry.get(agentId)
|
||
if (existing) return existing
|
||
|
||
const info: AgentInfo = {
|
||
agentId,
|
||
agentName,
|
||
parentAgentId: parentSessionId,
|
||
processId: agentId === getSessionId() ? 1 : getProcessIdForAgent(agentId),
|
||
threadId: stringToNumericHash(agentName),
|
||
}
|
||
|
||
agentRegistry.set(agentId, info)
|
||
totalAgentCount++
|
||
return info
|
||
}
|
||
|
||
/**
|
||
* Get timestamp in microseconds relative to trace start
|
||
*/
|
||
function getTimestamp(): number {
|
||
return (Date.now() - startTimeMs) * 1000
|
||
}
|
||
|
||
/**
|
||
* Generate a unique span ID
|
||
*/
|
||
function generateSpanId(): string {
|
||
return `span_${++spanIdCounter}`
|
||
}
|
||
|
||
/**
|
||
* Evict pending spans older than STALE_SPAN_TTL_MS.
|
||
* Mirrors the TTL cleanup pattern in sessionTracing.ts.
|
||
*/
|
||
function evictStaleSpans(): void {
|
||
const now = getTimestamp()
|
||
const ttlUs = STALE_SPAN_TTL_MS * 1000 // Convert ms to microseconds
|
||
for (const [spanId, span] of pendingSpans) {
|
||
if (now - span.startTime > ttlUs) {
|
||
// Emit an end event so the span shows up in the trace as incomplete
|
||
events.push({
|
||
name: span.name,
|
||
cat: span.category,
|
||
ph: 'E',
|
||
ts: now,
|
||
pid: span.agentInfo.processId,
|
||
tid: span.agentInfo.threadId,
|
||
args: {
|
||
...span.args,
|
||
evicted: true,
|
||
duration_ms: (now - span.startTime) / 1000,
|
||
},
|
||
})
|
||
pendingSpans.delete(spanId)
|
||
}
|
||
}
|
||
}
|
||
|
||
/**
|
||
* Build the full trace document (Chrome Trace JSON format).
|
||
*/
|
||
function buildTraceDocument(): string {
|
||
return jsonStringify({
|
||
traceEvents: [...metadataEvents, ...events],
|
||
metadata: {
|
||
session_id: getSessionId(),
|
||
trace_start_time: new Date(startTimeMs).toISOString(),
|
||
agent_count: totalAgentCount,
|
||
total_event_count: metadataEvents.length + events.length,
|
||
},
|
||
})
|
||
}
|
||
|
||
/**
|
||
* Drop the oldest half of events[] when over MAX_EVENTS. Called from the
|
||
* stale-span cleanup interval (60s). The half-batch splice keeps this
|
||
* amortized O(1) — we don't pay splice cost per-push. A synthetic marker
|
||
* is inserted so the gap is visible in ui.perfetto.dev.
|
||
*/
|
||
function evictOldestEvents(): void {
|
||
if (events.length < MAX_EVENTS) return
|
||
const dropped = events.splice(0, MAX_EVENTS / 2)
|
||
events.unshift({
|
||
name: 'trace_truncated',
|
||
cat: '__metadata',
|
||
ph: 'i',
|
||
ts: dropped[dropped.length - 1]?.ts ?? 0,
|
||
pid: 1,
|
||
tid: 0,
|
||
args: { dropped_events: dropped.length },
|
||
})
|
||
logForDebugging(
|
||
`[Perfetto] Evicted ${dropped.length} oldest events (cap ${MAX_EVENTS})`,
|
||
)
|
||
}
|
||
|
||
/**
|
||
* Initialize Perfetto tracing
|
||
* Call this early in the application lifecycle
|
||
*/
|
||
export function initializePerfettoTracing(): void {
|
||
const envValue = process.env.CLAUDE_CODE_PERFETTO_TRACE
|
||
logForDebugging(
|
||
`[Perfetto] initializePerfettoTracing called, env value: ${envValue}`,
|
||
)
|
||
|
||
// Wrap in feature() for dead code elimination - entire block removed from external builds
|
||
if (feature('PERFETTO_TRACING')) {
|
||
if (!envValue || isEnvDefinedFalsy(envValue)) {
|
||
logForDebugging(
|
||
'[Perfetto] Tracing disabled (env var not set or disabled)',
|
||
)
|
||
return
|
||
}
|
||
|
||
isEnabled = true
|
||
startTimeMs = Date.now()
|
||
|
||
// Determine trace file path
|
||
if (isEnvTruthy(envValue)) {
|
||
const tracesDir = join(getClaudeConfigHomeDir(), 'traces')
|
||
tracePath = join(tracesDir, `trace-${getSessionId()}.json`)
|
||
} else {
|
||
// Use the provided path
|
||
tracePath = envValue
|
||
}
|
||
|
||
logForDebugging(
|
||
`[Perfetto] Tracing enabled, will write to: ${tracePath}, isEnabled=${isEnabled}`,
|
||
)
|
||
|
||
// Start periodic full-trace write if CLAUDE_CODE_PERFETTO_WRITE_INTERVAL_S is a positive integer
|
||
const intervalSec = parseInt(
|
||
process.env.CLAUDE_CODE_PERFETTO_WRITE_INTERVAL_S ?? '',
|
||
10,
|
||
)
|
||
if (intervalSec > 0) {
|
||
writeIntervalId = setInterval(() => {
|
||
void periodicWrite()
|
||
}, intervalSec * 1000)
|
||
// Don't let the interval keep the process alive on its own
|
||
if (writeIntervalId.unref) writeIntervalId.unref()
|
||
logForDebugging(
|
||
`[Perfetto] Periodic write enabled, interval: ${intervalSec}s`,
|
||
)
|
||
}
|
||
|
||
// Start stale span cleanup interval
|
||
staleSpanCleanupId = setInterval(() => {
|
||
evictStaleSpans()
|
||
evictOldestEvents()
|
||
}, STALE_SPAN_CLEANUP_INTERVAL_MS)
|
||
if (staleSpanCleanupId.unref) staleSpanCleanupId.unref()
|
||
|
||
// Register cleanup to write final trace on exit
|
||
registerCleanup(async () => {
|
||
logForDebugging('[Perfetto] Cleanup callback invoked')
|
||
await writePerfettoTrace()
|
||
})
|
||
|
||
// Also register a beforeExit handler as a fallback
|
||
// This ensures the trace is written even if cleanup registry is not called
|
||
process.on('beforeExit', () => {
|
||
logForDebugging('[Perfetto] beforeExit handler invoked')
|
||
void writePerfettoTrace()
|
||
})
|
||
|
||
// Register a synchronous exit handler as a last resort
|
||
// This is the final fallback to ensure trace is written before process exits
|
||
process.on('exit', () => {
|
||
if (!traceWritten) {
|
||
logForDebugging(
|
||
'[Perfetto] exit handler invoked, writing trace synchronously',
|
||
)
|
||
writePerfettoTraceSync()
|
||
}
|
||
})
|
||
|
||
// Emit process metadata events for main process
|
||
const mainAgent = getCurrentAgentInfo()
|
||
emitProcessMetadata(mainAgent)
|
||
}
|
||
}
|
||
|
||
/**
|
||
* Emit metadata events for a process/agent
|
||
*/
|
||
function emitProcessMetadata(agentInfo: AgentInfo): void {
|
||
if (!isEnabled) return
|
||
|
||
// Process name
|
||
metadataEvents.push({
|
||
name: 'process_name',
|
||
cat: '__metadata',
|
||
ph: 'M',
|
||
ts: 0,
|
||
pid: agentInfo.processId,
|
||
tid: 0,
|
||
args: { name: agentInfo.agentName },
|
||
})
|
||
|
||
// Thread name (same as process for now)
|
||
metadataEvents.push({
|
||
name: 'thread_name',
|
||
cat: '__metadata',
|
||
ph: 'M',
|
||
ts: 0,
|
||
pid: agentInfo.processId,
|
||
tid: agentInfo.threadId,
|
||
args: { name: agentInfo.agentName },
|
||
})
|
||
|
||
// Add parent info if available
|
||
if (agentInfo.parentAgentId) {
|
||
metadataEvents.push({
|
||
name: 'parent_agent',
|
||
cat: '__metadata',
|
||
ph: 'M',
|
||
ts: 0,
|
||
pid: agentInfo.processId,
|
||
tid: 0,
|
||
args: {
|
||
parent_agent_id: agentInfo.parentAgentId,
|
||
},
|
||
})
|
||
}
|
||
}
|
||
|
||
/**
|
||
* Check if Perfetto tracing is enabled
|
||
*/
|
||
export function isPerfettoTracingEnabled(): boolean {
|
||
return isEnabled
|
||
}
|
||
|
||
/**
|
||
* Register a new agent in the trace
|
||
* Call this when a subagent/teammate is spawned
|
||
*/
|
||
export function registerAgent(
|
||
agentId: string,
|
||
agentName: string,
|
||
parentAgentId?: string,
|
||
): void {
|
||
if (!isEnabled) return
|
||
|
||
const info: AgentInfo = {
|
||
agentId,
|
||
agentName,
|
||
parentAgentId,
|
||
processId: getProcessIdForAgent(agentId),
|
||
threadId: stringToNumericHash(agentName),
|
||
}
|
||
|
||
agentRegistry.set(agentId, info)
|
||
totalAgentCount++
|
||
emitProcessMetadata(info)
|
||
}
|
||
|
||
/**
|
||
* Unregister an agent from the trace.
|
||
* Call this when an agent completes, fails, or is aborted to free memory.
|
||
*/
|
||
export function unregisterAgent(agentId: string): void {
|
||
if (!isEnabled) return
|
||
agentRegistry.delete(agentId)
|
||
agentIdToProcessId.delete(agentId)
|
||
}
|
||
|
||
/**
|
||
* Start an API call span
|
||
*/
|
||
export function startLLMRequestPerfettoSpan(args: {
|
||
model: string
|
||
promptTokens?: number
|
||
messageId?: string
|
||
isSpeculative?: boolean
|
||
querySource?: string
|
||
}): string {
|
||
if (!isEnabled) return ''
|
||
|
||
const spanId = generateSpanId()
|
||
const agentInfo = getCurrentAgentInfo()
|
||
|
||
pendingSpans.set(spanId, {
|
||
name: 'API Call',
|
||
category: 'api',
|
||
startTime: getTimestamp(),
|
||
agentInfo,
|
||
args: {
|
||
model: args.model,
|
||
prompt_tokens: args.promptTokens,
|
||
message_id: args.messageId,
|
||
is_speculative: args.isSpeculative ?? false,
|
||
query_source: args.querySource,
|
||
},
|
||
})
|
||
|
||
// Emit begin event
|
||
events.push({
|
||
name: 'API Call',
|
||
cat: 'api',
|
||
ph: 'B',
|
||
ts: pendingSpans.get(spanId)!.startTime,
|
||
pid: agentInfo.processId,
|
||
tid: agentInfo.threadId,
|
||
args: pendingSpans.get(spanId)!.args,
|
||
})
|
||
|
||
return spanId
|
||
}
|
||
|
||
/**
|
||
* End an API call span with response metadata
|
||
*/
|
||
export function endLLMRequestPerfettoSpan(
|
||
spanId: string,
|
||
metadata: {
|
||
ttftMs?: number
|
||
ttltMs?: number
|
||
promptTokens?: number
|
||
outputTokens?: number
|
||
cacheReadTokens?: number
|
||
cacheCreationTokens?: number
|
||
messageId?: string
|
||
success?: boolean
|
||
error?: string
|
||
/** Time spent in pre-request setup (client creation, retries) before the successful attempt */
|
||
requestSetupMs?: number
|
||
/** Timestamps (Date.now()) of each attempt start — used to emit retry sub-spans */
|
||
attemptStartTimes?: number[]
|
||
},
|
||
): void {
|
||
if (!isEnabled || !spanId) return
|
||
|
||
const pending = pendingSpans.get(spanId)
|
||
if (!pending) return
|
||
|
||
const endTime = getTimestamp()
|
||
const duration = endTime - pending.startTime
|
||
|
||
const promptTokens =
|
||
metadata.promptTokens ?? (pending.args.prompt_tokens as number | undefined)
|
||
const ttftMs = metadata.ttftMs
|
||
const ttltMs = metadata.ttltMs
|
||
const outputTokens = metadata.outputTokens
|
||
const cacheReadTokens = metadata.cacheReadTokens
|
||
|
||
// Compute derived metrics
|
||
// ITPS: input tokens per second (prompt processing speed)
|
||
const itps =
|
||
ttftMs !== undefined && promptTokens !== undefined && ttftMs > 0
|
||
? Math.round((promptTokens / (ttftMs / 1000)) * 100) / 100
|
||
: undefined
|
||
|
||
// OTPS: output tokens per second (sampling speed)
|
||
const samplingMs =
|
||
ttltMs !== undefined && ttftMs !== undefined ? ttltMs - ttftMs : undefined
|
||
const otps =
|
||
samplingMs !== undefined && outputTokens !== undefined && samplingMs > 0
|
||
? Math.round((outputTokens / (samplingMs / 1000)) * 100) / 100
|
||
: undefined
|
||
|
||
// Cache hit rate: percentage of prompt tokens from cache
|
||
const cacheHitRate =
|
||
cacheReadTokens !== undefined &&
|
||
promptTokens !== undefined &&
|
||
promptTokens > 0
|
||
? Math.round((cacheReadTokens / promptTokens) * 10000) / 100
|
||
: undefined
|
||
|
||
const requestSetupMs = metadata.requestSetupMs
|
||
const attemptStartTimes = metadata.attemptStartTimes
|
||
|
||
// Merge metadata with original args
|
||
const args = {
|
||
...pending.args,
|
||
ttft_ms: ttftMs,
|
||
ttlt_ms: ttltMs,
|
||
prompt_tokens: promptTokens,
|
||
output_tokens: outputTokens,
|
||
cache_read_tokens: cacheReadTokens,
|
||
cache_creation_tokens: metadata.cacheCreationTokens,
|
||
message_id: metadata.messageId ?? pending.args.message_id,
|
||
success: metadata.success ?? true,
|
||
error: metadata.error,
|
||
duration_ms: duration / 1000,
|
||
request_setup_ms: requestSetupMs,
|
||
// Derived metrics
|
||
itps,
|
||
otps,
|
||
cache_hit_rate_pct: cacheHitRate,
|
||
}
|
||
|
||
// Emit Request Setup sub-span when there was measurable setup time
|
||
// (client creation, param building, retries before the successful attempt)
|
||
const setupUs =
|
||
requestSetupMs !== undefined && requestSetupMs > 0
|
||
? requestSetupMs * 1000
|
||
: 0
|
||
if (setupUs > 0) {
|
||
const setupEndTs = pending.startTime + setupUs
|
||
|
||
events.push({
|
||
name: 'Request Setup',
|
||
cat: 'api,setup',
|
||
ph: 'B',
|
||
ts: pending.startTime,
|
||
pid: pending.agentInfo.processId,
|
||
tid: pending.agentInfo.threadId,
|
||
args: {
|
||
request_setup_ms: requestSetupMs,
|
||
attempt_count: attemptStartTimes?.length ?? 1,
|
||
},
|
||
})
|
||
|
||
// Emit retry attempt sub-spans within Request Setup.
|
||
// Each failed attempt runs from its start to the next attempt's start.
|
||
if (attemptStartTimes && attemptStartTimes.length > 1) {
|
||
// attemptStartTimes[0] is the reference point (first attempt).
|
||
// Convert wall-clock deltas into Perfetto-relative microseconds.
|
||
const baseWallMs = attemptStartTimes[0]!
|
||
for (let i = 0; i < attemptStartTimes.length - 1; i++) {
|
||
const attemptStartUs =
|
||
pending.startTime + (attemptStartTimes[i]! - baseWallMs) * 1000
|
||
const attemptEndUs =
|
||
pending.startTime + (attemptStartTimes[i + 1]! - baseWallMs) * 1000
|
||
|
||
events.push({
|
||
name: `Attempt ${i + 1} (retry)`,
|
||
cat: 'api,retry',
|
||
ph: 'B',
|
||
ts: attemptStartUs,
|
||
pid: pending.agentInfo.processId,
|
||
tid: pending.agentInfo.threadId,
|
||
args: { attempt: i + 1 },
|
||
})
|
||
events.push({
|
||
name: `Attempt ${i + 1} (retry)`,
|
||
cat: 'api,retry',
|
||
ph: 'E',
|
||
ts: attemptEndUs,
|
||
pid: pending.agentInfo.processId,
|
||
tid: pending.agentInfo.threadId,
|
||
})
|
||
}
|
||
}
|
||
|
||
events.push({
|
||
name: 'Request Setup',
|
||
cat: 'api,setup',
|
||
ph: 'E',
|
||
ts: setupEndTs,
|
||
pid: pending.agentInfo.processId,
|
||
tid: pending.agentInfo.threadId,
|
||
})
|
||
}
|
||
|
||
// Emit sub-spans for First Token and Sampling phases (before API Call end)
|
||
// Using B/E pairs in proper nesting order for correct Perfetto visualization
|
||
if (ttftMs !== undefined) {
|
||
// First Token starts after request setup (if any)
|
||
const firstTokenStartTs = pending.startTime + setupUs
|
||
const firstTokenEndTs = firstTokenStartTs + ttftMs * 1000
|
||
|
||
// First Token phase: from successful attempt start to first token
|
||
events.push({
|
||
name: 'First Token',
|
||
cat: 'api,ttft',
|
||
ph: 'B',
|
||
ts: firstTokenStartTs,
|
||
pid: pending.agentInfo.processId,
|
||
tid: pending.agentInfo.threadId,
|
||
args: {
|
||
ttft_ms: ttftMs,
|
||
prompt_tokens: promptTokens,
|
||
itps,
|
||
cache_hit_rate_pct: cacheHitRate,
|
||
},
|
||
})
|
||
events.push({
|
||
name: 'First Token',
|
||
cat: 'api,ttft',
|
||
ph: 'E',
|
||
ts: firstTokenEndTs,
|
||
pid: pending.agentInfo.processId,
|
||
tid: pending.agentInfo.threadId,
|
||
})
|
||
|
||
// Sampling phase: from first token to last token
|
||
// Note: samplingMs = ttltMs - ttftMs still includes setup time in ttltMs,
|
||
// so we compute the actual sampling duration for the span as the time from
|
||
// first token to API call end (endTime), not samplingMs directly.
|
||
const actualSamplingMs =
|
||
ttltMs !== undefined ? ttltMs - ttftMs - setupUs / 1000 : undefined
|
||
if (actualSamplingMs !== undefined && actualSamplingMs > 0) {
|
||
events.push({
|
||
name: 'Sampling',
|
||
cat: 'api,sampling',
|
||
ph: 'B',
|
||
ts: firstTokenEndTs,
|
||
pid: pending.agentInfo.processId,
|
||
tid: pending.agentInfo.threadId,
|
||
args: {
|
||
sampling_ms: actualSamplingMs,
|
||
output_tokens: outputTokens,
|
||
otps,
|
||
},
|
||
})
|
||
events.push({
|
||
name: 'Sampling',
|
||
cat: 'api,sampling',
|
||
ph: 'E',
|
||
ts: firstTokenEndTs + actualSamplingMs * 1000,
|
||
pid: pending.agentInfo.processId,
|
||
tid: pending.agentInfo.threadId,
|
||
})
|
||
}
|
||
}
|
||
|
||
// Emit API Call end event (after sub-spans)
|
||
events.push({
|
||
name: pending.name,
|
||
cat: pending.category,
|
||
ph: 'E',
|
||
ts: endTime,
|
||
pid: pending.agentInfo.processId,
|
||
tid: pending.agentInfo.threadId,
|
||
args,
|
||
})
|
||
|
||
pendingSpans.delete(spanId)
|
||
}
|
||
|
||
/**
|
||
* Start a tool execution span
|
||
*/
|
||
export function startToolPerfettoSpan(
|
||
toolName: string,
|
||
args?: Record<string, unknown>,
|
||
): string {
|
||
if (!isEnabled) return ''
|
||
|
||
const spanId = generateSpanId()
|
||
const agentInfo = getCurrentAgentInfo()
|
||
|
||
pendingSpans.set(spanId, {
|
||
name: `Tool: ${toolName}`,
|
||
category: 'tool',
|
||
startTime: getTimestamp(),
|
||
agentInfo,
|
||
args: {
|
||
tool_name: toolName,
|
||
...args,
|
||
},
|
||
})
|
||
|
||
// Emit begin event
|
||
events.push({
|
||
name: `Tool: ${toolName}`,
|
||
cat: 'tool',
|
||
ph: 'B',
|
||
ts: pendingSpans.get(spanId)!.startTime,
|
||
pid: agentInfo.processId,
|
||
tid: agentInfo.threadId,
|
||
args: pendingSpans.get(spanId)!.args,
|
||
})
|
||
|
||
return spanId
|
||
}
|
||
|
||
/**
|
||
* End a tool execution span
|
||
*/
|
||
export function endToolPerfettoSpan(
|
||
spanId: string,
|
||
metadata?: {
|
||
success?: boolean
|
||
error?: string
|
||
resultTokens?: number
|
||
},
|
||
): void {
|
||
if (!isEnabled || !spanId) return
|
||
|
||
const pending = pendingSpans.get(spanId)
|
||
if (!pending) return
|
||
|
||
const endTime = getTimestamp()
|
||
const duration = endTime - pending.startTime
|
||
|
||
const args = {
|
||
...pending.args,
|
||
success: metadata?.success ?? true,
|
||
error: metadata?.error,
|
||
result_tokens: metadata?.resultTokens,
|
||
duration_ms: duration / 1000,
|
||
}
|
||
|
||
// Emit end event
|
||
events.push({
|
||
name: pending.name,
|
||
cat: pending.category,
|
||
ph: 'E',
|
||
ts: endTime,
|
||
pid: pending.agentInfo.processId,
|
||
tid: pending.agentInfo.threadId,
|
||
args,
|
||
})
|
||
|
||
pendingSpans.delete(spanId)
|
||
}
|
||
|
||
/**
|
||
* Start a user input waiting span
|
||
*/
|
||
export function startUserInputPerfettoSpan(context?: string): string {
|
||
if (!isEnabled) return ''
|
||
|
||
const spanId = generateSpanId()
|
||
const agentInfo = getCurrentAgentInfo()
|
||
|
||
pendingSpans.set(spanId, {
|
||
name: 'Waiting for User Input',
|
||
category: 'user_input',
|
||
startTime: getTimestamp(),
|
||
agentInfo,
|
||
args: {
|
||
context,
|
||
},
|
||
})
|
||
|
||
// Emit begin event
|
||
events.push({
|
||
name: 'Waiting for User Input',
|
||
cat: 'user_input',
|
||
ph: 'B',
|
||
ts: pendingSpans.get(spanId)!.startTime,
|
||
pid: agentInfo.processId,
|
||
tid: agentInfo.threadId,
|
||
args: pendingSpans.get(spanId)!.args,
|
||
})
|
||
|
||
return spanId
|
||
}
|
||
|
||
/**
|
||
* End a user input waiting span
|
||
*/
|
||
export function endUserInputPerfettoSpan(
|
||
spanId: string,
|
||
metadata?: {
|
||
decision?: string
|
||
source?: string
|
||
},
|
||
): void {
|
||
if (!isEnabled || !spanId) return
|
||
|
||
const pending = pendingSpans.get(spanId)
|
||
if (!pending) return
|
||
|
||
const endTime = getTimestamp()
|
||
const duration = endTime - pending.startTime
|
||
|
||
const args = {
|
||
...pending.args,
|
||
decision: metadata?.decision,
|
||
source: metadata?.source,
|
||
duration_ms: duration / 1000,
|
||
}
|
||
|
||
// Emit end event
|
||
events.push({
|
||
name: pending.name,
|
||
cat: pending.category,
|
||
ph: 'E',
|
||
ts: endTime,
|
||
pid: pending.agentInfo.processId,
|
||
tid: pending.agentInfo.threadId,
|
||
args,
|
||
})
|
||
|
||
pendingSpans.delete(spanId)
|
||
}
|
||
|
||
/**
|
||
* Emit an instant event (marker)
|
||
*/
|
||
export function emitPerfettoInstant(
|
||
name: string,
|
||
category: string,
|
||
args?: Record<string, unknown>,
|
||
): void {
|
||
if (!isEnabled) return
|
||
|
||
const agentInfo = getCurrentAgentInfo()
|
||
|
||
events.push({
|
||
name,
|
||
cat: category,
|
||
ph: 'i',
|
||
ts: getTimestamp(),
|
||
pid: agentInfo.processId,
|
||
tid: agentInfo.threadId,
|
||
args,
|
||
})
|
||
}
|
||
|
||
/**
|
||
* Emit a counter event for tracking metrics over time
|
||
*/
|
||
export function emitPerfettoCounter(
|
||
name: string,
|
||
values: Record<string, number>,
|
||
): void {
|
||
if (!isEnabled) return
|
||
|
||
const agentInfo = getCurrentAgentInfo()
|
||
|
||
events.push({
|
||
name,
|
||
cat: 'counter',
|
||
ph: 'C',
|
||
ts: getTimestamp(),
|
||
pid: agentInfo.processId,
|
||
tid: agentInfo.threadId,
|
||
args: values,
|
||
})
|
||
}
|
||
|
||
/**
|
||
* Start an interaction span (wraps a full user request cycle)
|
||
*/
|
||
export function startInteractionPerfettoSpan(userPrompt?: string): string {
|
||
if (!isEnabled) return ''
|
||
|
||
const spanId = generateSpanId()
|
||
const agentInfo = getCurrentAgentInfo()
|
||
|
||
pendingSpans.set(spanId, {
|
||
name: 'Interaction',
|
||
category: 'interaction',
|
||
startTime: getTimestamp(),
|
||
agentInfo,
|
||
args: {
|
||
user_prompt_length: userPrompt?.length,
|
||
},
|
||
})
|
||
|
||
// Emit begin event
|
||
events.push({
|
||
name: 'Interaction',
|
||
cat: 'interaction',
|
||
ph: 'B',
|
||
ts: pendingSpans.get(spanId)!.startTime,
|
||
pid: agentInfo.processId,
|
||
tid: agentInfo.threadId,
|
||
args: pendingSpans.get(spanId)!.args,
|
||
})
|
||
|
||
return spanId
|
||
}
|
||
|
||
/**
|
||
* End an interaction span
|
||
*/
|
||
export function endInteractionPerfettoSpan(spanId: string): void {
|
||
if (!isEnabled || !spanId) return
|
||
|
||
const pending = pendingSpans.get(spanId)
|
||
if (!pending) return
|
||
|
||
const endTime = getTimestamp()
|
||
const duration = endTime - pending.startTime
|
||
|
||
// Emit end event
|
||
events.push({
|
||
name: pending.name,
|
||
cat: pending.category,
|
||
ph: 'E',
|
||
ts: endTime,
|
||
pid: pending.agentInfo.processId,
|
||
tid: pending.agentInfo.threadId,
|
||
args: {
|
||
...pending.args,
|
||
duration_ms: duration / 1000,
|
||
},
|
||
})
|
||
|
||
pendingSpans.delete(spanId)
|
||
}
|
||
|
||
// ---------------------------------------------------------------------------
|
||
// Periodic write helpers
|
||
// ---------------------------------------------------------------------------
|
||
|
||
/**
|
||
* Stop the periodic write timer.
|
||
*/
|
||
function stopWriteInterval(): void {
|
||
if (staleSpanCleanupId) {
|
||
clearInterval(staleSpanCleanupId)
|
||
staleSpanCleanupId = null
|
||
}
|
||
if (writeIntervalId) {
|
||
clearInterval(writeIntervalId)
|
||
writeIntervalId = null
|
||
}
|
||
}
|
||
|
||
/**
|
||
* Force-close any remaining open spans at session end.
|
||
*/
|
||
function closeOpenSpans(): void {
|
||
for (const [spanId, pending] of pendingSpans) {
|
||
const endTime = getTimestamp()
|
||
events.push({
|
||
name: pending.name,
|
||
cat: pending.category,
|
||
ph: 'E',
|
||
ts: endTime,
|
||
pid: pending.agentInfo.processId,
|
||
tid: pending.agentInfo.threadId,
|
||
args: {
|
||
...pending.args,
|
||
incomplete: true,
|
||
duration_ms: (endTime - pending.startTime) / 1000,
|
||
},
|
||
})
|
||
pendingSpans.delete(spanId)
|
||
}
|
||
}
|
||
|
||
/**
|
||
* Write the full trace to disk. Errors are logged but swallowed so that a
|
||
* transient I/O problem does not crash the session — the next periodic tick
|
||
* (or the final exit write) will retry with a complete snapshot.
|
||
*/
|
||
async function periodicWrite(): Promise<void> {
|
||
if (!isEnabled || !tracePath || traceWritten) return
|
||
|
||
try {
|
||
await mkdir(dirname(tracePath), { recursive: true })
|
||
await writeFile(tracePath, buildTraceDocument())
|
||
logForDebugging(
|
||
`[Perfetto] Periodic write: ${events.length} events to ${tracePath}`,
|
||
)
|
||
} catch (error) {
|
||
logForDebugging(
|
||
`[Perfetto] Periodic write failed: ${errorMessage(error)}`,
|
||
{ level: 'error' },
|
||
)
|
||
}
|
||
}
|
||
|
||
/**
|
||
* Final async write: close open spans and write the complete trace.
|
||
* Idempotent — sets `traceWritten` on success so subsequent calls are no-ops.
|
||
*/
|
||
async function writePerfettoTrace(): Promise<void> {
|
||
if (!isEnabled || !tracePath || traceWritten) {
|
||
logForDebugging(
|
||
`[Perfetto] Skipping final write: isEnabled=${isEnabled}, tracePath=${tracePath}, traceWritten=${traceWritten}`,
|
||
)
|
||
return
|
||
}
|
||
|
||
stopWriteInterval()
|
||
closeOpenSpans()
|
||
|
||
logForDebugging(
|
||
`[Perfetto] writePerfettoTrace called: events=${events.length}`,
|
||
)
|
||
|
||
try {
|
||
await mkdir(dirname(tracePath), { recursive: true })
|
||
await writeFile(tracePath, buildTraceDocument())
|
||
traceWritten = true
|
||
logForDebugging(`[Perfetto] Trace finalized at: ${tracePath}`)
|
||
} catch (error) {
|
||
logForDebugging(
|
||
`[Perfetto] Failed to write final trace: ${errorMessage(error)}`,
|
||
{ level: 'error' },
|
||
)
|
||
}
|
||
}
|
||
|
||
/**
|
||
* Final synchronous write (fallback for process 'exit' handler where async is forbidden).
|
||
*/
|
||
function writePerfettoTraceSync(): void {
|
||
if (!isEnabled || !tracePath || traceWritten) {
|
||
logForDebugging(
|
||
`[Perfetto] Skipping final sync write: isEnabled=${isEnabled}, tracePath=${tracePath}, traceWritten=${traceWritten}`,
|
||
)
|
||
return
|
||
}
|
||
|
||
stopWriteInterval()
|
||
closeOpenSpans()
|
||
|
||
logForDebugging(
|
||
`[Perfetto] writePerfettoTraceSync called: events=${events.length}`,
|
||
)
|
||
|
||
try {
|
||
const dir = dirname(tracePath)
|
||
// eslint-disable-next-line custom-rules/no-sync-fs -- Only called from process.on('exit') handler
|
||
mkdirSync(dir, { recursive: true })
|
||
// eslint-disable-next-line custom-rules/no-sync-fs, eslint-plugin-n/no-sync -- Required for process 'exit' handler which doesn't support async
|
||
writeFileSync(tracePath, buildTraceDocument())
|
||
traceWritten = true
|
||
logForDebugging(`[Perfetto] Trace finalized synchronously at: ${tracePath}`)
|
||
} catch (error) {
|
||
logForDebugging(
|
||
`[Perfetto] Failed to write final trace synchronously: ${errorMessage(error)}`,
|
||
{ level: 'error' },
|
||
)
|
||
}
|
||
}
|
||
|
||
/**
|
||
* Get all recorded events (for testing)
|
||
*/
|
||
export function getPerfettoEvents(): TraceEvent[] {
|
||
return [...metadataEvents, ...events]
|
||
}
|
||
|
||
/**
|
||
* Reset the tracer state (for testing)
|
||
*/
|
||
export function resetPerfettoTracer(): void {
|
||
if (staleSpanCleanupId) {
|
||
clearInterval(staleSpanCleanupId)
|
||
staleSpanCleanupId = null
|
||
}
|
||
stopWriteInterval()
|
||
metadataEvents.length = 0
|
||
events.length = 0
|
||
pendingSpans.clear()
|
||
agentRegistry.clear()
|
||
agentIdToProcessId.clear()
|
||
totalAgentCount = 0
|
||
processIdCounter = 1
|
||
spanIdCounter = 0
|
||
isEnabled = false
|
||
tracePath = null
|
||
startTimeMs = 0
|
||
traceWritten = false
|
||
}
|
||
|
||
/**
|
||
* Trigger a periodic write immediately (for testing)
|
||
*/
|
||
export async function triggerPeriodicWriteForTesting(): Promise<void> {
|
||
await periodicWrite()
|
||
}
|
||
|
||
/**
|
||
* Evict stale spans immediately (for testing)
|
||
*/
|
||
export function evictStaleSpansForTesting(): void {
|
||
evictStaleSpans()
|
||
}
|
||
|
||
export const MAX_EVENTS_FOR_TESTING = MAX_EVENTS
|
||
export function evictOldestEventsForTesting(): void {
|
||
evictOldestEvents()
|
||
}
|