Script: List of Functions and Classes with Pygments

July 15, 2025 Wietse Venema

This is a self-contained script with a uv shebang, so you'll need uv installed to run it. Refer to the uv documentation to learn more.

To run the script, copy the contents, save them to a file (indexer.py), make the file executable (chmod +x indexer.py), and then run it (./indexer.py), optionally passing the path of the directory you want to process.
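For example, assuming you saved the script in the current directory (the project path here is illustrative):

    $ chmod +x indexer.py
    $ ./indexer.py ~/projects/my-app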

If you're interested in learning more about this self-contained script, I wrote a post about writing Python scripts with inline dependencies.

#!/usr/bin/env -S uv run --script
# /// script
# requires-python = ">=3.8"
# dependencies = [
#   "pygments",
#   "pathspec",
# ]
# ///
"""
Codebase Indexer

This script provides a high-level overview of a software project by scanning its
directory and identifying the classes and functions defined in each source file.
It is designed to be a general-purpose tool that can be run on any codebase
without configuration.

Core Features:
- Language Agnostic: Uses the Pygments library to parse dozens of programming
  languages automatically.
- .gitignore Aware: Automatically respects the project's .gitignore file,
  ensuring that ignored files are not included in the analysis.
- Smart Filtering: Only considers source code files.
- Clean Output: Presents a clear, per-file inventory, making it easy to see the
  structure of the codebase at a glance.

Usage:
  Run the script from the command line, optionally providing a path to the
  directory you wish to analyze. If no path is provided, it will analyze the
  current working directory.

  Example:
    $ ./indexer.py /path/to/my/project

This outputs a structured list of files, each with the classes and functions
it contains.
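
  Sample output (file and identifier names are illustrative):

    --- src/server.py ---
      Classes:
        - RequestHandler
      Functions:
        - handle_request
        - parse_headers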
"""

import argparse
import os
from collections import defaultdict

import pathspec
from pygments import lexers
from pygments.token import Token
from pygments.util import ClassNotFound


def find_source_files(directory, verbose=False):
    """
    Recursively finds all relevant source files in a given directory.

    This function walks the directory tree and yields the paths of files that are
    likely to be source code. It employs several layers of filtering to produce
    a clean and relevant list of files.

    Args:
        directory (str): The path to the directory to scan.
        verbose (bool): Whether to enable verbose logging.

    Yields:
        str: The full path of each source file found.
    """
    # --- .gitignore Integration ---
    # The script first looks for a .gitignore file in the root of the target
    # directory. If found, it loads the patterns into a `pathspec` object.
    # This object can then efficiently check if a file or directory path should
    # be ignored.
    gitignore_file = os.path.join(directory, ".gitignore")
    spec = None
    if os.path.exists(gitignore_file):
        if verbose:
            print(f"Found .gitignore at: {gitignore_file}")
        with open(gitignore_file, "r") as f:
            spec = pathspec.PathSpec.from_lines("gitwildmatch", f)
    elif verbose:
        print("No .gitignore file found.")

    # --- Directory Exclusion List ---
    # This is a hardcoded set of common directory names that are almost always
    # irrelevant for source code analysis, such as version control metadata,
    # virtual environments, and build output folders. It is used as a fallback
    # if .gitignore isn't found. 
    exclude_dirs = {
        ".git",
        ".svn",
        "__pycache__",
        "node_modules",
        ".venv",
        "build",
        "dist",
    }

    # --- File Extension Allow List ---
    # To ensure the analysis focuses on source code, this script uses an allow
    # list of file extensions.
    include_extensions = {
        ".py",
        ".js",
        ".ts",
        ".java",
        ".kt",
        ".cpp",
        ".h",
        ".c",
        ".cs",
        ".sh",
        ".go",
        ".rs",
        ".rb",
        ".php",
        ".swift",
        ".sql",
    }

    # --- Directory Traversal and Filtering ---
    # Traverse the directory to find source code files. 
    for root, dirs, files in os.walk(directory, topdown=True):
        if verbose:
            print(f"\nEntering directory: {root}")
            print(f"Original dirs: {dirs}")
            print(f"Original files: {files}")

        # If a .gitignore file is found, rely on it for directory exclusions.
        # Otherwise, use the hardcoded fallback list.
        if not spec:
            original_dirs = set(dirs)
            dirs[:] = [d for d in dirs if d not in exclude_dirs]
            if verbose:
                excluded = original_dirs - set(dirs)
                if excluded:
                    print(f"Excluding dirs by fallback list: {excluded}")

        # Always exclude hidden directories.
        original_dirs = set(dirs)
        dirs[:] = [d for d in dirs if not d.startswith(".")]
        if verbose:
            excluded = original_dirs - set(dirs)
            if excluded:
                print(f"Excluding hidden dirs: {excluded}")

        # Apply .gitignore rules to both directories and files.
        if spec:
            # Directory-only patterns such as "build/" match only paths that
            # end with a separator, so append one before testing directories.
            ignored_dirs = {
                os.path.join(root, d)
                for d in dirs
                if spec.match_file(os.path.join(root, d) + os.sep)
            }
            dirs[:] = [d for d in dirs if os.path.join(root, d) not in ignored_dirs]
            if verbose:
                ignored_dir_names = {os.path.basename(p) for p in ignored_dirs}
                if ignored_dir_names:
                    print(f"Excluding dirs by .gitignore: {ignored_dir_names}")

            full_paths_files = [os.path.join(root, f) for f in files]
            ignored_files = set(spec.match_files(full_paths_files))
            files[:] = [f for f in files if os.path.join(root, f) not in ignored_files]
            if verbose:
                ignored_file_names = {os.path.basename(p) for p in ignored_files}
                if ignored_file_names:
                    print(f"Excluding files by .gitignore: {ignored_file_names}")

        if verbose:
            print(f"Filtered dirs: {dirs}")
            print(f"Filtered files: {files}")

        # Process the remaining, filtered files.
        for file in files:
            # Ignore hidden files (starting with a dot)
            if file.startswith("."):
                if verbose:
                    print(f"Skipping hidden file: {file}")
                continue

            _, extension = os.path.splitext(file)
            # Compare case-insensitively so e.g. "Main.C" is still picked up.
            if extension.lower() not in include_extensions:
                if verbose:
                    print(f"Skipping file with excluded extension: {file}")
                continue

            if verbose:
                print(f">>> Yielding source file: {file}")
            # If a file passes all checks, its full path is yielded.
            yield os.path.join(root, file)
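

# Usage sketch (the directory path is illustrative):
#
#   for path in find_source_files("/path/to/project"):
#       print(path)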


def get_identifiers_from_file(file_path, verbose=False):
    """
    Extracts Class and Function identifiers from a single source file.

    This function uses the Pygments library to perform lexical analysis on the
    content of a file and find class and function definition tokens.

    Args:
        file_path (str): The path to the source code file.
        verbose (bool): Whether to enable verbose logging.

    Returns:
        list: A list of (identifier_name, token_type) tuples.
    """
    if verbose:
        print(f"Analyzing file: {file_path}")

    try:
        with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
            code = f.read()
    except (IOError, OSError) as e:
        if verbose:
            print(f"Error reading file {file_path}: {e}")
        # The file might be unreadable or have been deleted during the scan.
        return []

    if not code:
        if verbose:
            print(f"Skipping empty file: {file_path}")
        return []

    # --- Lexer Guessing and Tokenization ---
    # Pygments can raise exceptions for unknown file types or
    # during tokenization of unusual code. The script silently ignores them.
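    # For instance, lexers.guess_lexer_for_filename("app.py", code) returns
    # a PythonLexer; the filename pattern narrows the candidates, and the
    # file contents are used to rank them when several lexers match.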
    try:
        lexer = lexers.guess_lexer_for_filename(file_path, code)
        if verbose:
            print(f"Guessed lexer for {file_path}: {lexer.name}")
    except (ClassNotFound, TypeError):
        if verbose:
            print(f"No lexer found for file: {file_path}")
        return []  # Skip files with no known lexer

    try:
        # Convert generator to list to allow multiple iterations
        tokens = list(lexer.get_tokens(code))

        # Find the identifiers we are interested in
        identifiers = []
        for i, (ttype, tvalue) in enumerate(tokens):
            if ttype in Token.Name.Class or ttype in Token.Name.Function:
                identifiers.append((tvalue, ttype))
                continue

            # Fallback for TypeScript/JS where function/class names are 
            # tokenized as `Name.Other` following a keyword.
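            # As a sketch, Pygments may tokenize `function greet() {}` as
            #   (Token.Keyword.Declaration, 'function'),
            #   (Token.Text.Whitespace, ' '),
            #   (Token.Name.Other, 'greet'), ...
            # in which case `greet` is recorded as a function below.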
            if ttype is Token.Name.Other:
                # Look back past whitespace; depending on the lexer and
                # Pygments version this may be Token.Text or its Whitespace
                # subtype, so test subtype membership with `in`.
                prev_idx = i - 1
                while prev_idx >= 0 and tokens[prev_idx][0] in Token.Text:
                    prev_idx -= 1

                if prev_idx >= 0:
                    prev_ttype, prev_tvalue = tokens[prev_idx]
                    if prev_ttype is Token.Keyword.Declaration:
                        if prev_tvalue == "function":
                            identifiers.append((tvalue, Token.Name.Function))
                        elif prev_tvalue in ("class", "interface"):
                            # Treat interfaces as class definitions
                            identifiers.append((tvalue, Token.Name.Class))

        if identifiers:
            # Return unique identifiers, as some might be caught by both methods
            return list(set(identifiers))

        # If in verbose mode and no identifiers were found, print the unique tokens
        if verbose:
            unique_token_types = set(token[0] for token in tokens)
            print(f"No target tokens found in {file_path}. Found tokens: {unique_token_types}")

        return []
    except Exception as e:
        if verbose:
            print(f"Error tokenizing file {file_path}: {e}")
        # Catch any other unexpected errors during the tokenization process.
        return []


def analyze_directory(directory, verbose=False):
    """
    Orchestrates the analysis of all source files in a directory.

    This function calls `find_source_files` to get the list of files to analyze,
    then iterates through them, calling `get_identifiers_from_file` for each one.
    It aggregates the results into a single data structure.

    Args:
        directory (str): The path to the directory to analyze.
        verbose (bool): Whether to enable verbose logging.

    Returns:
        defaultdict: A dictionary mapping relative file paths to a list of
                     (identifier, type) tuples found in that file.
    """
    print(f"Scanning directory: {directory}...")
    file_map = defaultdict(list)
    source_files = list(find_source_files(directory, verbose=verbose))

    for file_path in source_files:
        # Using relative paths makes the output cleaner.
        relative_path = os.path.relpath(file_path, directory)
        identifiers = get_identifiers_from_file(file_path, verbose=verbose)
        if identifiers:
            file_map[relative_path].extend(identifiers)

    print(f"Scan complete. Found identifiers in {len(file_map)} files.")
    return file_map


def print_analysis(file_map):
    """
    Prints the final, formatted analysis to the console.

    This function takes the aggregated data from `analyze_directory` and presents
    it in a human-readable format. It sorts the files alphabetically and then
    prints the classes and functions for each file, also sorted.

    Args:
        file_map (defaultdict): The map of files to their identifiers.
    """
    # Sort files by path for a consistent and readable output.
    sorted_files = sorted(file_map.items())

    for file, idents in sorted_files:
        print(f"\n--- {file} ---")

        # --- Data Cleaning and Filtering ---
        # 1. Deduplicate identifiers with a set comprehension.
        # 2. Filter out short function names (<= 3 chars) to reduce noise.
        # 3. Sort the final lists alphabetically.
        # Membership tests use `in` rather than `==` so that subtypes such
        # as Token.Name.Function.Magic are counted as well.
        classes = sorted({ident for ident, t in idents if t in Token.Name.Class})
        functions = sorted(
            {
                ident
                for ident, t in idents
                if t in Token.Name.Function and len(ident) > 3
            }
        )

        # --- Formatted Output ---
        # Only print headings if there is content to show.
        if classes:
            print("  Classes:")
            for c in classes:
                print(f"    - {c}")

        if functions:
            print("  Functions:")
            for f in functions:
                print(f"    - {f}")


def main():
    """
    Main entry point for the script.

    This function handles command-line argument parsing and orchestrates the
    main sequence of operations: analyze, then print.
    """
    # --- Argument Parsing ---    
    parser = argparse.ArgumentParser(
        description="Analyze source code to find classes and functions in each file."
    )
    parser.add_argument(
        "directory",
        nargs="?",  # The argument is optional.
        default=os.getcwd(),  # Default to the current directory if not provided.
        help="The directory to analyze (defaults to the current directory).",
    )
    parser.add_argument(
        "-v",
        "--verbose",
        action="store_true",
        help="Enable verbose logging to see detailed control flow decisions.",
    )
    args = parser.parse_args()

    # Basic input validation.
    if not os.path.isdir(args.directory):
        # SystemExit prints the message to stderr and exits with status 1.
        raise SystemExit(f"Error: Directory not found at '{args.directory}'")

    # --- Execution Flow ---
    # 1. Analyze the directory to build the data map.
    # 2. Print the formatted results.
    file_map = analyze_directory(args.directory, verbose=args.verbose)
    print_analysis(file_map)

if __name__ == "__main__":
    main()

* * *