/**
 * Tree-sitter AST analysis utilities for bash command security validation.
 *
 * These functions extract security-relevant information from tree-sitter
 * parse trees, providing more accurate analysis than regex/shell-quote
 * parsing. Each function takes a root node (and, where it needs the original
 * text, the command string) and returns structured data that can be used by
 * security validators.
 *
 * The native NAPI parser returns plain JS objects — no cleanup needed.
 */

/**
 * Minimal structural view of a tree-sitter node as consumed by this module.
 * Callers pass the root as `unknown`; it is cast to this shape before walking.
 */
type TreeSitterNode = {
  /** Grammar node type, e.g. 'raw_string', 'list', or an operator token such as '&&'. */
  type: string
  /** Source text covered by the node. */
  text: string
  /** Start offset (inclusive); used as an index into the command string. */
  startIndex: number
  /** End offset (exclusive); used as an index into the command string. */
  endIndex: number
  /** Child nodes. Walkers in this file skip falsy entries defensively. */
  children: TreeSitterNode[]
  /** Number of children (not read by the code in this file). */
  childCount: number
}

/**
 * Three views of a command with quoted material stripped to different
 * degrees, for validators that need to ignore literal (quoted) text.
 */
export type QuoteContext = {
  /** Command text with single-quoted content removed (double-quoted content preserved) */
  withDoubleQuotes: string
  /** Command text with all quoted content removed */
  fullyUnquoted: string
  /** Like fullyUnquoted but preserves quote characters (', ") */
  unquotedKeepQuoteChars: string
}

/**
 * Summary of how a command is composed out of simpler commands
 * (operators, pipelines, subshells, brace groups).
 */
export type CompoundStructure = {
  /** Whether the command has compound operators (&&, ||, ;) at the top level */
  hasCompoundOperators: boolean
  /** Whether the command has pipelines */
  hasPipeline: boolean
  /** Whether the command has subshells */
  hasSubshell: boolean
  /** Whether the command has command groups ({...}) */
  hasCommandGroup: boolean
  /** Compound operator types found, in the order they were encountered */
  operators: string[]
  /** Individual command segments split by compound operators */
  segments: string[]
}

/**
 * Presence flags for shell constructs that security validators treat as
 * risky. Each flag is true when at least one matching AST node exists.
 */
export type DangerousPatterns = {
  /** Has $() or backtick command substitution (outside quotes that would make it safe) */
  hasCommandSubstitution: boolean
  /** Has <() or >() process substitution */
  hasProcessSubstitution: boolean
  /** Has ${...} parameter expansion */
  hasParameterExpansion: boolean
  /** Has heredoc */
  hasHeredoc: boolean
  /** Has comment */
  hasComment: boolean
}

/** Combined result of all AST extractors in this file for one command. */
export type TreeSitterAnalysis = {
  /** Views of the command with quoted content stripped. */
  quoteContext: QuoteContext
  /** Operators, pipelines, subshells and segments found in the command. */
  compoundStructure: CompoundStructure
  /** Whether actual operator nodes (;, &&, ||) exist — if false, \; is just a word argument */
  hasActualOperatorNodes: boolean
  /** Presence flags for substitutions, expansions, heredocs and comments. */
  dangerousPatterns: DangerousPatterns
}

/**
 * Accumulator for [startIndex, endIndex) spans of quoted regions, grouped
 * by the kind of quoting that produced them.
 */
type QuoteSpans = {
  /** raw_string nodes (single-quoted) */
  raw: Array<[number, number]>
  /** ansi_c_string nodes ($'...') */
  ansiC: Array<[number, number]>
  /** string nodes (double-quoted) */
  double: Array<[number, number]>
  /** heredoc_redirect nodes whose delimiter is quoted */
  heredoc: Array<[number, number]>
}

/**
 * Single-pass collection of all quote-related spans.
 * Previously this was 5 separate tree walks (one per type-set plus
 * allQuoteTypes plus heredoc); fusing cuts tree-traversal ~5x.
 *
 * Replicates the per-type walk semantics: each original walk stopped at
 * its own type. So the raw_string walk would recurse THROUGH a string
 * node (not its type) to reach nested raw_string inside $(...), but the
 * string walk would stop at the outer string. We track `inDouble` to
 * collect the *outermost* string span per path, while still descending
 * into $()/${} bodies to pick up inner raw_string/ansi_c_string.
 *
 * raw_string / ansi_c_string / quoted-heredoc bodies are literal text
 * in bash (no expansion), so no nested quote nodes exist — return early.
 *
 * @param node     Node to inspect; its subtree is walked recursively.
 * @param out      Accumulator that receives [startIndex, endIndex] pairs.
 * @param inDouble True when an enclosing `string` node was already recorded
 *                 on this path, so nested `string` nodes are not re-recorded.
 */
function collectQuoteSpans(
  node: TreeSitterNode,
  out: QuoteSpans,
  inDouble: boolean,
): void {
  switch (node.type) {
    case 'raw_string':
      out.raw.push([node.startIndex, node.endIndex])
      return // literal body, no nested quotes possible

    case 'ansi_c_string':
      out.ansiC.push([node.startIndex, node.endIndex])
      return // literal body

    case 'string':
      // Only the outermost string on a path is recorded, but we always
      // descend: a nested $(cmd 'x') inside "..." has a real inner raw_string.
      if (!inDouble) out.double.push([node.startIndex, node.endIndex])
      for (const child of node.children) {
        if (child) collectQuoteSpans(child, out, true)
      }
      return

    case 'heredoc_redirect': {
      // Quoted heredocs (<<'EOF', <<"EOF", <<\EOF): literal body.
      // Unquoted (<<EOF) expands $()/${} — the body can contain
      // $(cmd 'x') whose inner '...' IS a real raw_string node.
      // Detection: the first heredoc_start child's text starts with ' " or \.
      // Matches sync path's extractHeredocs({ quotedOnly: true }).
      const startNode = node.children.find(
        child => child && child.type === 'heredoc_start',
      )
      const first = startNode ? startNode.text[0] : undefined
      const isQuoted = first === "'" || first === '"' || first === '\\'
      if (isQuoted) {
        out.heredoc.push([node.startIndex, node.endIndex])
        return // literal body, no nested quote nodes
      }
      // Unquoted: fall through to the generic recursion below so that
      // heredoc_body → command_substitution → inner quote nodes are found.
      break
    }
  }

  for (const child of node.children) {
    if (child) collectQuoteSpans(child, out, inDouble)
  }
}

/**
 * Builds a Set of all character positions covered by the given spans.
 *
 * @param spans [start, end) pairs; `end` is exclusive. Overlapping spans are
 *              fine — the Set de-duplicates positions.
 * @returns Every integer index `i` with `start <= i < end` for some span.
 */
function buildPositionSet(spans: Array<[number, number]>): Set<number> {
  const positions = new Set<number>()
  for (const [start, end] of spans) {
    for (let i = start; i < end; i++) {
      positions.add(i)
    }
  }
  return positions
}

/**
 * Drops spans that are fully contained within another span, keeping only the
 * outermost. Nested quotes (e.g., `"$(echo 'hi')"`) yield overlapping spans
 * — the inner raw_string is found by recursing into the outer string node.
 * Processing overlapping spans corrupts indices since removing/replacing the
 * outer span shifts the inner span's start/end into stale positions.
 *
 * Identical spans (same start and end) count as containing each other; only
 * the first occurrence is kept so that a duplicated range is never processed
 * twice. Partially overlapping spans are left untouched.
 *
 * @param spans Tuples whose first two elements are [start, end).
 * @returns A new array in the original relative order; the input is not mutated.
 */
function dropContainedSpans<T extends readonly [number, number, ...unknown[]]>(
  spans: T[],
): T[] {
  return spans.filter(
    (span, i) =>
      !spans.some((other, j) => {
        if (j === i) return false
        const covers = other[0] <= span[0] && other[1] >= span[1]
        if (!covers) return false
        const identical = other[0] === span[0] && other[1] === span[1]
        // A strictly larger span always wins; among identical spans the
        // earliest one wins.
        return !identical || j < i
      }),
  )
}

/**
 * Removes spans from a string, returning the string with those character
 * ranges removed.
 *
 * Spans are treated as a union: nested, duplicated or partially overlapping
 * ranges are each removed exactly once. The text is rebuilt in one ascending
 * pass from the gaps between spans, so no index ever refers to an
 * already-modified string.
 *
 * @param command Original text the span indices refer to.
 * @param spans   [start, end) ranges to delete; not mutated.
 * @returns `command` without any character covered by at least one span.
 */
function removeSpans(command: string, spans: Array<[number, number]>): string {
  if (spans.length === 0) return command

  // Copy before sorting so the caller's array keeps its order.
  const ascending = [...spans].sort((a, b) => a[0] - b[0])
  let result = ''
  let cursor = 0 // first index not yet emitted or removed
  for (const [start, end] of ascending) {
    if (start > cursor) result += command.slice(cursor, start)
    if (end > cursor) cursor = end
  }
  return result + command.slice(cursor)
}

/**
 * Replaces spans with just the quote delimiters (preserving ' and " characters).
 *
 * @param command Original text the span indices refer to.
 * @param spans   Tuples of [start, end, open, close]: the range [start, end)
 *                is replaced by `open + close`. Empty delimiters delete the
 *                range outright.
 * @returns The rewritten command. Spans contained in another span are
 *          dropped first, so only outermost spans are rewritten.
 */
function replaceSpansKeepQuotes(
  command: string,
  spans: Array<[number, number, string, string]>,
): string {
  if (spans.length === 0) return command

  // Work from the highest start index down: a splice at `start` never moves
  // the text in front of it, so the remaining (lower) indices stay valid.
  const descending = dropContainedSpans(spans).sort((a, b) => b[0] - a[0])
  let result = command
  for (const [start, end, open, close] of descending) {
    // Replace content but keep the quote delimiters
    result = result.slice(0, start) + open + close + result.slice(end)
  }
  return result
}

/**
 * Extract quote context from the tree-sitter AST.
 * Replaces the manual character-by-character extractQuotedContent() function.
 *
 * Tree-sitter node types:
 * - raw_string: single-quoted ('...')
 * - string: double-quoted ("...")
 * - ansi_c_string: ANSI-C quoting ($'...') — span includes the leading $
 * - heredoc_redirect: QUOTED heredocs only (<<'EOF', <<"EOF", <<\EOF) —
 *   the full redirect span (<<, delimiters, body, newlines) is stripped
 *   since the body is literal text in bash (no expansion). UNQUOTED
 *   heredocs (<<EOF) are left in place since bash expands $(...)/${...}
 *   inside them, and validators need to see those patterns. Matches the
 *   sync path's extractHeredocs({ quotedOnly: true }).
 *
 * @param rootNode Root of the parse tree; cast to the node shape used here.
 * @param command  The exact text the tree's start/end indices refer to.
 * @returns The three stripped views described on `QuoteContext`.
 */
export function extractQuoteContext(
  rootNode: unknown,
  command: string,
): QuoteContext {
  type DelimitedSpan = [number, number, string, string]

  // Single walk collects all quote span types at once.
  const spans: QuoteSpans = { raw: [], ansiC: [], double: [], heredoc: [] }
  collectQuoteSpans(rootNode as TreeSitterNode, spans, false)

  // withDoubleQuotes: drop literal (single-quoted / ANSI-C / quoted-heredoc)
  // spans entirely, plus only the opening/closing `"` of double-quoted spans
  // (their content stays). This matches the regex extractQuotedContent()
  // semantics where `"` toggles quote state but content is still emitted.
  const literalPositions = buildPositionSet([
    ...spans.raw,
    ...spans.ansiC,
    ...spans.heredoc,
  ])
  const doubleQuoteDelims = new Set<number>()
  for (const [start, end] of spans.double) {
    doubleQuoteDelims.add(start) // opening "
    doubleQuoteDelims.add(end - 1) // closing "
  }
  let withDoubleQuotes = ''
  for (let i = 0; i < command.length; i++) {
    if (literalPositions.has(i) || doubleQuoteDelims.has(i)) continue
    withDoubleQuotes += command[i]
  }

  // fullyUnquoted: remove all quoted content
  const fullyUnquoted = removeSpans(command, [
    ...spans.raw,
    ...spans.ansiC,
    ...spans.double,
    ...spans.heredoc,
  ])

  // unquotedKeepQuoteChars: remove content but keep delimiter chars
  const delimit = (
    list: Array<[number, number]>,
    open: string,
    close: string,
  ): DelimitedSpan[] =>
    list.map(([start, end]): DelimitedSpan => [start, end, open, close])

  const spansWithQuoteChars: DelimitedSpan[] = [
    ...delimit(spans.raw, "'", "'"),
    // ansi_c_string spans include the leading $; preserve it so this
    // matches the regex path, which treats $ as unquoted preceding '.
    ...delimit(spans.ansiC, "$'", "'"),
    ...delimit(spans.double, '"', '"'),
    // Heredoc redirect spans have no inline quote delimiters — strip entirely.
    ...delimit(spans.heredoc, '', ''),
  ]
  const unquotedKeepQuoteChars = replaceSpansKeepQuotes(
    command,
    spansWithQuoteChars,
  )

  return { withDoubleQuotes, fullyUnquoted, unquotedKeepQuoteChars }
}

/**
 * Extract compound command structure from the AST.
 * Replaces isUnsafeCompoundCommand() and splitCommand() for tree-sitter path.
 *
 * Starting from the root's children, each node is classified:
 * - `list`: its `&&` / `||` children become operators, every other child a
 *   segment (nested `list` / `redirected_statement` are unwrapped first).
 * - `;`: recorded as an operator.
 * - `pipeline`, `subshell`, `compound_statement`: set the matching flag and
 *   are recorded as one segment each.
 * - `redirected_statement`: unwrapped (file_redirect children skipped).
 * - `negated_command` and control-flow constructs: recorded as one segment
 *   AND descended into so inner structure is detected.
 * - `do_group`, `elif_clause`, `else_clause`, `case_item`: transparent
 *   wrappers — descended into without recording a segment. Without this,
 *   loop bodies, elif/else branches and case arms were never inspected.
 *
 * @param rootNode Root of the parse tree; cast to the node shape used here.
 * @param command  Original command text; used as the sole segment when the
 *                 walk finds none.
 * @returns Flags, operators and segments in encounter order.
 */
export function extractCompoundStructure(
  rootNode: unknown,
  command: string,
): CompoundStructure {
  const operators: string[] = []
  const segments: string[] = []
  let hasSubshell = false
  let hasCommandGroup = false
  let hasPipeline = false

  /** Classifies every (non-null) child of `node`. */
  function walkTopLevel(node: TreeSitterNode): void {
    for (const child of node.children) {
      if (child) visit(child)
    }
  }

  /** Classifies one direct child of a `list` node. */
  function visitListChild(listChild: TreeSitterNode): void {
    switch (listChild.type) {
      case '&&':
      case '||':
        operators.push(listChild.type)
        return
      case 'list':
      case 'redirected_statement':
        // Nested list, or redirected_statement wrapping a list/pipeline —
        // unwrap so inner operators/pipelines are detected. For
        // `cmd1 && cmd2 2>/dev/null && cmd3`, the redirected_statement
        // wraps `list(cmd1 && cmd2)` — the inner `&&` would otherwise be missed.
        visit(listChild)
        return
      case 'pipeline':
        hasPipeline = true
        break
      case 'subshell':
        hasSubshell = true
        break
      case 'compound_statement':
        hasCommandGroup = true
        break
    }
    // Everything that is not an operator or a wrapper is one segment.
    segments.push(listChild.text)
  }

  /** Classifies a single node found at statement level. */
  function visit(child: TreeSitterNode): void {
    switch (child.type) {
      case 'list':
        // list nodes contain && and || operators
        for (const listChild of child.children) {
          if (listChild) visitListChild(listChild)
        }
        return

      case ';':
        operators.push(';')
        return

      case 'pipeline':
        hasPipeline = true
        segments.push(child.text)
        return

      case 'subshell':
        hasSubshell = true
        segments.push(child.text)
        return

      case 'compound_statement':
        hasCommandGroup = true
        segments.push(child.text)
        return

      case 'command':
      case 'declaration_command':
      case 'variable_assignment':
        segments.push(child.text)
        return

      case 'redirected_statement': {
        // `cd ~/src && find path 2>/dev/null` — tree-sitter wraps the ENTIRE
        // compound in a redirected_statement: program → redirected_statement →
        // (list → cmd1, &&, cmd2) + file_redirect. Same for `cmd1 | cmd2 > out`
        // (wraps pipeline) and `(cmd) > out` (wraps subshell). Unwrap to
        // detect the inner structure; skip file_redirect children (redirects
        // don't affect compound/pipeline classification).
        let foundInner = false
        for (const inner of child.children) {
          if (!inner || inner.type === 'file_redirect') continue
          foundInner = true
          visit(inner)
        }
        if (!foundInner) {
          // Standalone redirect with no body (shouldn't happen, but fail-safe)
          segments.push(child.text)
        }
        return
      }

      case 'negated_command':
        // `! cmd` — descend into the inner command so its structure is
        // classified (pipeline/subshell/etc.), but also record the full
        // negated text as a segment so segments.length stays meaningful.
        segments.push(child.text)
        walkTopLevel(child)
        return

      case 'if_statement':
      case 'while_statement':
      case 'for_statement':
      case 'c_style_for_statement':
      case 'case_statement':
      case 'function_definition':
        // Control-flow constructs: the construct itself is one segment,
        // but descend so inner pipelines/subshells/operators are detected.
        segments.push(child.text)
        walkTopLevel(child)
        return

      case 'do_group':
      case 'elif_clause':
      case 'else_clause':
      case 'case_item':
        // Body wrappers of the constructs above. They carry no meaning of
        // their own, so no segment is recorded — just look inside.
        walkTopLevel(child)
        return
    }
  }

  walkTopLevel(rootNode as TreeSitterNode)

  // If no segments found, the whole command is one segment
  if (segments.length === 0) {
    segments.push(command)
  }

  return {
    hasCompoundOperators: operators.length > 0,
    hasPipeline,
    hasSubshell,
    hasCommandGroup,
    operators,
    segments,
  }
}

/**
 * Check whether the AST contains actual operator nodes (;, &&, ||).
 *
 * This is the key function for eliminating the `find -exec \;` false positive.
 * Tree-sitter parses `\;` as part of a `word` node (an argument to find),
 * NOT as a `;` operator. So if no actual `;` operator nodes exist in the AST,
 * there are no compound operators and hasBackslashEscapedOperator() can be skipped.
 *
 * The search is deliberately coarse: a node of type `;`, `&&`, `||` or `list`
 * at ANY depth yields true. The parent of the match is not inspected, so the
 * result errs toward reporting operators.
 *
 * @param rootNode Root of the parse tree; cast to the node shape used here.
 * @returns true as soon as one such node is found (depth-first), else false.
 */
export function hasActualOperatorNodes(rootNode: unknown): boolean {
  function walk(node: TreeSitterNode): boolean {
    switch (node.type) {
      case ';':
      case '&&':
      case '||':
      // A list node only exists when commands are joined by an operator.
      case 'list':
        return true
    }
    for (const child of node.children) {
      if (child && walk(child)) return true
    }
    return false
  }

  return walk(rootNode as TreeSitterNode)
}

/**
 * Extract dangerous pattern information from the AST.
 *
 * Walks the whole tree once and raises a flag for each node type of
 * interest: `command_substitution`, `process_substitution`, `expansion`,
 * `heredoc_redirect` and `comment`. Matching is by node type only, at any
 * depth; the walk always visits every node (no early exit).
 *
 * @param rootNode Root of the parse tree; cast to the node shape used here.
 * @returns One boolean per pattern, true when at least one node matched.
 */
export function extractDangerousPatterns(rootNode: unknown): DangerousPatterns {
  const found: DangerousPatterns = {
    hasCommandSubstitution: false,
    hasProcessSubstitution: false,
    hasParameterExpansion: false,
    hasHeredoc: false,
    hasComment: false,
  }

  function walk(node: TreeSitterNode): void {
    switch (node.type) {
      case 'command_substitution':
        found.hasCommandSubstitution = true
        break
      case 'process_substitution':
        found.hasProcessSubstitution = true
        break
      case 'expansion':
        found.hasParameterExpansion = true
        break
      case 'heredoc_redirect':
        found.hasHeredoc = true
        break
      case 'comment':
        found.hasComment = true
        break
    }
    // Always descend: e.g. a command substitution can itself contain others.
    for (const child of node.children) {
      if (child) walk(child)
    }
  }

  walk(rootNode as TreeSitterNode)
  return found
}

/**
 * Perform complete tree-sitter analysis of a command.
 *
 * Convenience wrapper that runs every extractor in this file against the
 * same tree and bundles the results. Each extractor performs its own
 * traversal, so this is one call for the caller, not one walk of the tree.
 *
 * NOTE(review): this comment previously said the data "must be extracted
 * before tree.delete() is called". The file header states that the native
 * NAPI parser returns plain JS objects needing no cleanup — confirm whether
 * any caller still has a tree-lifetime constraint.
 *
 * @param rootNode Root of the parse tree for `command`.
 * @param command  The exact text that was parsed.
 * @returns Quote context, compound structure, operator-node presence and
 *          dangerous-pattern flags.
 */
export function analyzeCommand(
  rootNode: unknown,
  command: string,
): TreeSitterAnalysis {
  return {
    quoteContext: extractQuoteContext(rootNode, command),
    compoundStructure: extractCompoundStructure(rootNode, command),
    hasActualOperatorNodes: hasActualOperatorNodes(rootNode),
    dangerousPatterns: extractDangerousPatterns(rootNode),
  }
}