/** * Tree-sitter AST analysis utilities for bash command security validation. * * These functions extract security-relevant information from tree-sitter * parse trees, providing more accurate analysis than regex/shell-quote * parsing. Each function takes a root node and command string, and returns * structured data that can be used by security validators. * * The native NAPI parser returns plain JS objects — no cleanup needed. */ type TreeSitterNode = { type: string text: string startIndex: number endIndex: number children: TreeSitterNode[] childCount: number } export type QuoteContext = { /** Command text with single-quoted content removed (double-quoted content preserved) */ withDoubleQuotes: string /** Command text with all quoted content removed */ fullyUnquoted: string /** Like fullyUnquoted but preserves quote characters (', ") */ unquotedKeepQuoteChars: string } export type CompoundStructure = { /** Whether the command has compound operators (&&, ||, ;) at the top level */ hasCompoundOperators: boolean /** Whether the command has pipelines */ hasPipeline: boolean /** Whether the command has subshells */ hasSubshell: boolean /** Whether the command has command groups ({...}) */ hasCommandGroup: boolean /** Top-level compound operator types found */ operators: string[] /** Individual command segments split by compound operators */ segments: string[] } export type DangerousPatterns = { /** Has $() or backtick command substitution (outside quotes that would make it safe) */ hasCommandSubstitution: boolean /** Has <() or >() process substitution */ hasProcessSubstitution: boolean /** Has ${...} parameter expansion */ hasParameterExpansion: boolean /** Has heredoc */ hasHeredoc: boolean /** Has comment */ hasComment: boolean } export type TreeSitterAnalysis = { quoteContext: QuoteContext compoundStructure: CompoundStructure /** Whether actual operator nodes (;, &&, ||) exist — if false, \; is just a word argument */ hasActualOperatorNodes: boolean dangerousPatterns: DangerousPatterns } type QuoteSpans = { raw: Array<[number, number]> // raw_string (single-quoted) ansiC: Array<[number, number]> // ansi_c_string ($'...') double: Array<[number, number]> // string (double-quoted) heredoc: Array<[number, number]> // quoted heredoc_redirect } /** * Single-pass collection of all quote-related spans. * Previously this was 5 separate tree walks (one per type-set plus * allQuoteTypes plus heredoc); fusing cuts tree-traversal ~5x. * * Replicates the per-type walk semantics: each original walk stopped at * its own type. So the raw_string walk would recurse THROUGH a string * node (not its type) to reach nested raw_string inside $(...), but the * string walk would stop at the outer string. We track `inDouble` to * collect the *outermost* string span per path, while still descending * into $()/${} bodies to pick up inner raw_string/ansi_c_string. * * raw_string / ansi_c_string / quoted-heredoc bodies are literal text * in bash (no expansion), so no nested quote nodes exist — return early. */ function collectQuoteSpans( node: TreeSitterNode, out: QuoteSpans, inDouble: boolean, ): void { switch (node.type) { case 'raw_string': out.raw.push([node.startIndex, node.endIndex]) return // literal body, no nested quotes possible case 'ansi_c_string': out.ansiC.push([node.startIndex, node.endIndex]) return // literal body case 'string': // Only collect the outermost string (matches old per-type walk // which stops at first match). Recurse regardless — a nested // $(cmd 'x') inside "..." has a real inner raw_string. if (!inDouble) out.double.push([node.startIndex, node.endIndex]) for (const child of node.children) { if (child) collectQuoteSpans(child, out, true) } return case 'heredoc_redirect': { // Quoted heredocs (<<'EOF', <<"EOF", <<\EOF): literal body. // Unquoted (<): Set { const set = new Set() for (const [start, end] of spans) { for (let i = start; i < end; i++) { set.add(i) } } return set } /** * Drops spans that are fully contained within another span, keeping only the * outermost. Nested quotes (e.g., `"$(echo 'hi')"`) yield overlapping spans * — the inner raw_string is found by recursing into the outer string node. * Processing overlapping spans corrupts indices since removing/replacing the * outer span shifts the inner span's start/end into stale positions. */ function dropContainedSpans( spans: T[], ): T[] { return spans.filter( (s, i) => !spans.some( (other, j) => j !== i && other[0] <= s[0] && other[1] >= s[1] && (other[0] < s[0] || other[1] > s[1]), ), ) } /** * Removes spans from a string, returning the string with those character * ranges removed. */ function removeSpans(command: string, spans: Array<[number, number]>): string { if (spans.length === 0) return command // Drop inner spans that are fully contained in an outer one, then sort by // start index descending so we can splice without offset shifts. const sorted = dropContainedSpans(spans).sort((a, b) => b[0] - a[0]) let result = command for (const [start, end] of sorted) { result = result.slice(0, start) + result.slice(end) } return result } /** * Replaces spans with just the quote delimiters (preserving ' and " characters). */ function replaceSpansKeepQuotes( command: string, spans: Array<[number, number, string, string]>, ): string { if (spans.length === 0) return command const sorted = dropContainedSpans(spans).sort((a, b) => b[0] - a[0]) let result = command for (const [start, end, open, close] of sorted) { // Replace content but keep the quote delimiters result = result.slice(0, start) + open + close + result.slice(end) } return result } /** * Extract quote context from the tree-sitter AST. * Replaces the manual character-by-character extractQuotedContent() function. * * Tree-sitter node types: * - raw_string: single-quoted ('...') * - string: double-quoted ("...") * - ansi_c_string: ANSI-C quoting ($'...') — span includes the leading $ * - heredoc_redirect: QUOTED heredocs only (<<'EOF', <<"EOF", <<\EOF) — * the full redirect span (<<, delimiters, body, newlines) is stripped * since the body is literal text in bash (no expansion). UNQUOTED * heredocs (<() for (const [start, end] of doubleQuoteSpans) { doubleQuoteDelimSet.add(start) // opening " doubleQuoteDelimSet.add(end - 1) // closing " } let withDoubleQuotes = '' for (let i = 0; i < command.length; i++) { if (singleQuoteSet.has(i)) continue if (doubleQuoteDelimSet.has(i)) continue withDoubleQuotes += command[i] } // fullyUnquoted: remove all quoted content const fullyUnquoted = removeSpans(command, allQuoteSpans) // unquotedKeepQuoteChars: remove content but keep delimiter chars const spansWithQuoteChars: Array<[number, number, string, string]> = [] for (const [start, end] of singleQuoteSpans) { spansWithQuoteChars.push([start, end, "'", "'"]) } for (const [start, end] of ansiCSpans) { // ansi_c_string spans include the leading $; preserve it so this // matches the regex path, which treats $ as unquoted preceding '. spansWithQuoteChars.push([start, end, "$'", "'"]) } for (const [start, end] of doubleQuoteSpans) { spansWithQuoteChars.push([start, end, '"', '"']) } for (const [start, end] of quotedHeredocSpans) { // Heredoc redirect spans have no inline quote delimiters — strip entirely. spansWithQuoteChars.push([start, end, '', '']) } const unquotedKeepQuoteChars = replaceSpansKeepQuotes( command, spansWithQuoteChars, ) return { withDoubleQuotes, fullyUnquoted, unquotedKeepQuoteChars } } /** * Extract compound command structure from the AST. * Replaces isUnsafeCompoundCommand() and splitCommand() for tree-sitter path. */ export function extractCompoundStructure( rootNode: unknown, command: string, ): CompoundStructure { const n = rootNode as TreeSitterNode const operators: string[] = [] const segments: string[] = [] let hasSubshell = false let hasCommandGroup = false let hasPipeline = false // Walk top-level children of the program node function walkTopLevel(node: TreeSitterNode): void { for (const child of node.children) { if (!child) continue if (child.type === 'list') { // list nodes contain && and || operators for (const listChild of child.children) { if (!listChild) continue if (listChild.type === '&&' || listChild.type === '||') { operators.push(listChild.type) } else if ( listChild.type === 'list' || listChild.type === 'redirected_statement' ) { // Nested list, or redirected_statement wrapping a list/pipeline — // recurse so inner operators/pipelines are detected. For // `cmd1 && cmd2 2>/dev/null && cmd3`, the redirected_statement // wraps `list(cmd1 && cmd2)` — the inner `&&` would be missed // without recursion. walkTopLevel({ ...node, children: [listChild] } as TreeSitterNode) } else if (listChild.type === 'pipeline') { hasPipeline = true segments.push(listChild.text) } else if (listChild.type === 'subshell') { hasSubshell = true segments.push(listChild.text) } else if (listChild.type === 'compound_statement') { hasCommandGroup = true segments.push(listChild.text) } else { segments.push(listChild.text) } } } else if (child.type === ';') { operators.push(';') } else if (child.type === 'pipeline') { hasPipeline = true segments.push(child.text) } else if (child.type === 'subshell') { hasSubshell = true segments.push(child.text) } else if (child.type === 'compound_statement') { hasCommandGroup = true segments.push(child.text) } else if ( child.type === 'command' || child.type === 'declaration_command' || child.type === 'variable_assignment' ) { segments.push(child.text) } else if (child.type === 'redirected_statement') { // `cd ~/src && find path 2>/dev/null` — tree-sitter wraps the ENTIRE // compound in a redirected_statement: program → redirected_statement → // (list → cmd1, &&, cmd2) + file_redirect. Same for `cmd1 | cmd2 > out` // (wraps pipeline) and `(cmd) > out` (wraps subshell). Recurse to // detect the inner structure; skip file_redirect children (redirects // don't affect compound/pipeline classification). let foundInner = false for (const inner of child.children) { if (!inner || inner.type === 'file_redirect') continue foundInner = true walkTopLevel({ ...child, children: [inner] } as TreeSitterNode) } if (!foundInner) { // Standalone redirect with no body (shouldn't happen, but fail-safe) segments.push(child.text) } } else if (child.type === 'negated_command') { // `! cmd` — recurse into the inner command so its structure is // classified (pipeline/subshell/etc.), but also record the full // negated text as a segment so segments.length stays meaningful. segments.push(child.text) walkTopLevel(child) } else if ( child.type === 'if_statement' || child.type === 'while_statement' || child.type === 'for_statement' || child.type === 'case_statement' || child.type === 'function_definition' ) { // Control-flow constructs: the construct itself is one segment, // but recurse so inner pipelines/subshells/operators are detected. segments.push(child.text) walkTopLevel(child) } } } walkTopLevel(n) // If no segments found, the whole command is one segment if (segments.length === 0) { segments.push(command) } return { hasCompoundOperators: operators.length > 0, hasPipeline, hasSubshell, hasCommandGroup, operators, segments, } } /** * Check whether the AST contains actual operator nodes (;, &&, ||). * * This is the key function for eliminating the `find -exec \;` false positive. * Tree-sitter parses `\;` as part of a `word` node (an argument to find), * NOT as a `;` operator. So if no actual `;` operator nodes exist in the AST, * there are no compound operators and hasBackslashEscapedOperator() can be skipped. */ export function hasActualOperatorNodes(rootNode: unknown): boolean { const n = rootNode as TreeSitterNode function walk(node: TreeSitterNode): boolean { // Check for operator types that indicate compound commands if (node.type === ';' || node.type === '&&' || node.type === '||') { // Verify this is a child of a list or program, not inside a command return true } if (node.type === 'list') { // A list node means there are compound operators return true } for (const child of node.children) { if (child && walk(child)) return true } return false } return walk(n) } /** * Extract dangerous pattern information from the AST. */ export function extractDangerousPatterns(rootNode: unknown): DangerousPatterns { const n = rootNode as TreeSitterNode let hasCommandSubstitution = false let hasProcessSubstitution = false let hasParameterExpansion = false let hasHeredoc = false let hasComment = false function walk(node: TreeSitterNode): void { switch (node.type) { case 'command_substitution': hasCommandSubstitution = true break case 'process_substitution': hasProcessSubstitution = true break case 'expansion': hasParameterExpansion = true break case 'heredoc_redirect': hasHeredoc = true break case 'comment': hasComment = true break } for (const child of node.children) { if (child) walk(child) } } walk(n) return { hasCommandSubstitution, hasProcessSubstitution, hasParameterExpansion, hasHeredoc, hasComment, } } /** * Perform complete tree-sitter analysis of a command. * Extracts all security-relevant data from the AST in one pass. * This data must be extracted before tree.delete() is called. */ export function analyzeCommand( rootNode: unknown, command: string, ): TreeSitterAnalysis { return { quoteContext: extractQuoteContext(rootNode, command), compoundStructure: extractCompoundStructure(rootNode, command), hasActualOperatorNodes: hasActualOperatorNodes(rootNode), dangerousPatterns: extractDangerousPatterns(rootNode), } }