Skip to content

Cross-Language Analysis

Compare and analyze code across multiple programming languages.

Why Cross-Language Analysis?

Sitting Duck's semantic type system enables queries that work identically across all 27 supported languages. Instead of learning language-specific AST types, use universal semantic types.

Basic Cross-Language Queries

Count Functions Across Languages

SELECT
    language,
    COUNT(*) as function_count
FROM read_ast([
    '**/*.py',
    '**/*.js',
    '**/*.java',
    '**/*.go',
    '**/*.rs'
], ignore_errors := true)
WHERE semantic_type = 'DEFINITION_FUNCTION'
GROUP BY language
ORDER BY function_count DESC;

Find All Classes

SELECT
    name,
    language,
    file_path,
    start_line
FROM read_ast([
    '**/*.py',
    '**/*.java',
    '**/*.cpp',
    '**/*.cs'
], ignore_errors := true)
WHERE semantic_type = 'DEFINITION_CLASS'
  AND name IS NOT NULL
ORDER BY language, file_path;

Comparing Patterns

Complexity by Language

SELECT
    language,
    ROUND(AVG(descendant_count), 2) as avg_complexity,
    MAX(descendant_count) as max_complexity,
    COUNT(*) as function_count
FROM read_ast([
    '**/*.py',
    '**/*.js',
    '**/*.java'
], ignore_errors := true)
WHERE semantic_type = 'DEFINITION_FUNCTION'
GROUP BY language
ORDER BY avg_complexity DESC;

Import Patterns

SELECT
    language,
    COUNT(*) as import_count,
    COUNT(DISTINCT file_path) as files_with_imports
FROM read_ast([
    '**/*.py',
    '**/*.js',
    '**/*.java',
    '**/*.go'
], ignore_errors := true)
WHERE semantic_type = 'EXTERNAL_IMPORT'
GROUP BY language;

Control Flow Density

SELECT
    language,
    COUNT(CASE WHEN semantic_type = 'FLOW_CONDITIONAL' THEN 1 END) as conditionals,
    COUNT(CASE WHEN semantic_type = 'FLOW_LOOP' THEN 1 END) as loops,
    COUNT(CASE WHEN semantic_type = 'FLOW_JUMP' THEN 1 END) as jumps,
    COUNT(*) as total_nodes
FROM read_ast([
    '**/*.py',
    '**/*.js',
    '**/*.java'
], ignore_errors := true)
GROUP BY language;

Language-Specific vs Universal

Using Language-Specific Types

When you need precision for a single language:

-- Python-specific: find decorators
SELECT name, peek, start_line
FROM read_ast('**/*.py')
WHERE type = 'decorator';

-- Java-specific: find annotations
SELECT name, peek, start_line
FROM read_ast('**/*.java')
WHERE type = 'annotation';

Using Universal Semantic Types

When you want cross-language results:

-- Both decorators and annotations
SELECT
    name,
    language,
    peek
FROM read_ast([
    '**/*.py',
    '**/*.java'
], ignore_errors := true)
WHERE semantic_type = 'METADATA_ANNOTATION';

Codebase Overview

Language Distribution

SELECT
    language,
    COUNT(DISTINCT file_path) as files,
    COUNT(*) as nodes,
    ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER(), 1) as percentage
FROM read_ast('**/*.*', ignore_errors := true)
GROUP BY language
ORDER BY nodes DESC;

Definition Inventory

SELECT
    language,
    COUNT(CASE WHEN semantic_type = 'DEFINITION_FUNCTION' THEN 1 END) as functions,
    COUNT(CASE WHEN semantic_type = 'DEFINITION_CLASS' THEN 1 END) as classes,
    COUNT(CASE WHEN semantic_type = 'DEFINITION_VARIABLE' THEN 1 END) as variables
FROM read_ast('**/*.*', ignore_errors := true)
GROUP BY language
HAVING COUNT(*) > 100
ORDER BY functions DESC;

Finding Similar Code

Functions with Similar Names

SELECT
    name,
    language,
    file_path,
    start_line
FROM read_ast([
    '**/*.py',
    '**/*.js',
    '**/*.java'
], ignore_errors := true)
WHERE semantic_type = 'DEFINITION_FUNCTION'
  AND name LIKE '%validate%'
ORDER BY name, language;

Classes with Common Patterns

SELECT
    name,
    language,
    descendant_count as complexity
FROM read_ast([
    '**/*.py',
    '**/*.java',
    '**/*.cpp'
], ignore_errors := true)
WHERE semantic_type = 'DEFINITION_CLASS'
  AND name LIKE '%Service%'
ORDER BY complexity DESC;

Complexity Hotspots

Most Complex Functions

SELECT
    name,
    language,
    file_path,
    descendant_count as complexity,
    end_line - start_line + 1 as lines
FROM read_ast([
    '**/*.py',
    '**/*.js',
    '**/*.java'
], ignore_errors := true)
WHERE semantic_type = 'DEFINITION_FUNCTION'
  AND descendant_count > 100
ORDER BY complexity DESC
LIMIT 20;

Deeply Nested Code

SELECT
    file_path,
    language,
    MAX(depth) as max_depth,
    COUNT(CASE WHEN depth > 10 THEN 1 END) as deep_nodes
FROM read_ast('**/*.*', ignore_errors := true)
GROUP BY file_path, language
HAVING MAX(depth) > 10
ORDER BY max_depth DESC;

Exporting Results

To Parquet

COPY (
    SELECT
        file_path,
        language,
        type,
        name,
        semantic_type_to_string(semantic_type) as semantic_type,
        start_line,
        end_line,
        descendant_count
    FROM read_ast('**/*.*', ignore_errors := true)
    WHERE is_definition(semantic_type)
) TO 'codebase_definitions.parquet';

To CSV

COPY (
    SELECT language, COUNT(*) as nodes
    FROM read_ast('**/*.*', ignore_errors := true)
    GROUP BY language
) TO 'language_stats.csv' (HEADER);

Best Practices

  1. Use ignore_errors := true for large cross-language scans
  2. Be specific with patterns when possible for better performance
  3. Use semantic types for cross-language queries
  4. Use language-specific types when you need precision
  5. Cache results in tables for repeated analysis
-- Create a reusable table
CREATE TABLE codebase_ast AS
SELECT * FROM read_ast('**/*.*', ignore_errors := true);

-- Query the cached data
SELECT language, COUNT(*) FROM codebase_ast GROUP BY language;

Next Steps