git bisect is one of the most powerful debugging tools in a developer’s toolkit, but it requires writing a test script and manually interpreting results. AI-powered bisect automation handles the entire workflow: it generates the test script from a bug description, runs the binary search, and explains exactly which commit introduced the regression.

How Git Bisect Works (Brief)

Git bisect performs a binary search through commit history to find the first “bad” commit. You mark a known good commit and a known bad commit, then provide a test script that exits 0 (good) or non-zero (bad) for each commit bisect checks out. Git bisect needs roughly log2(N) test runs to find the culprit among N candidate commits.
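The logarithmic cost is worth making concrete; a quick sketch of the worst-case step counts, ceil(log2(N)), for a few history sizes (the helper function is illustrative, not part of the tool below):

```python
import math

def bisect_steps(n_commits: int) -> int:
    """Upper bound on the number of test runs git bisect needs
    to isolate one bad commit among n_commits candidates."""
    return math.ceil(math.log2(n_commits))

for n in (16, 100, 1000, 10000):
    print(n, "commits ->", bisect_steps(n), "test runs")
```

Even ten thousand commits resolve in about fourteen runs, which is why bisect remains practical on large repositories.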

The bottleneck: writing the test script.

Architecture of the Automated Tool

Bug description
      ↓
  Claude API
      ↓
 Test script (bash)
      ↓
 git bisect run <test-script>
      ↓
 Bisect output
      ↓
  Claude API
      ↓
 Root cause explanation + diff analysis

Step 1: Generate the Test Script

# ai_bisect.py
import anthropic
import subprocess
import tempfile
import os
import sys

client = anthropic.Anthropic()


def generate_bisect_script(
    bug_description: str,
    repo_path: str,
    language: str = "auto"
) -> str:
    """Generate a git bisect test script from a bug description."""

    # Get repo context
    result = subprocess.run(
        ["git", "log", "--oneline", "-20"],
        cwd=repo_path, capture_output=True, text=True
    )
    recent_commits = result.stdout

    # `ls` prints the manifest files that exist to stdout; complaints
    # about missing ones go to stderr, which we ignore
    result = subprocess.run(
        ["ls", "package.json", "requirements.txt", "go.mod", "Makefile"],
        cwd=repo_path, capture_output=True, text=True
    )
    project_files = result.stdout

    system_prompt = """You are a git bisect test script generator. Generate a bash script that:
1. Tests for the described bug
2. Exits 0 if the bug is NOT present (commit is good)
3. Exits 1 if the bug IS present (commit is bad)
4. Exits 125 if the test can't run (skip this commit)

The script should be minimal and fast. Use grep, curl, or run the test suite.
Start with #!/bin/bash and set -e. Return ONLY the script, no explanations."""

    message = client.messages.create(
        model="claude-opus-4-6",
        max_tokens=2048,
        system=system_prompt,
        messages=[{
            "role": "user",
            "content": f"""Bug description: {bug_description}

Recent commits:
{recent_commits}

Project files present: {project_files}

Generate a bisect test script."""
        }]
    )

    script = message.content[0].text.strip()
    # Strip markdown code fences if the model adds them despite instructions
    if script.startswith("```"):
        script = script.split("\n", 1)[1].rsplit("```", 1)[0]
    return script.strip()


def run_bisect(
    repo_path: str,
    good_commit: str,
    bad_commit: str,
    test_script_content: str
) -> dict:
    """Run git bisect with the generated script."""

    # Write test script to temp file
    with tempfile.NamedTemporaryFile(
        mode='w', suffix='.sh', delete=False, dir='/tmp'
    ) as f:
        f.write(test_script_content)
        script_path = f.name

    os.chmod(script_path, 0o755)

    try:
        # Start bisect
        subprocess.run(["git", "bisect", "start"], cwd=repo_path, check=True)
        subprocess.run(["git", "bisect", "bad", bad_commit], cwd=repo_path, check=True)
        subprocess.run(["git", "bisect", "good", good_commit], cwd=repo_path, check=True)

        # Run bisect
        result = subprocess.run(
            ["git", "bisect", "run", script_path],
            cwd=repo_path,
            capture_output=True,
            text=True,
            timeout=300  # 5 minute timeout
        )

        bisect_output = result.stdout + result.stderr

        # Get the bad commit hash from output
        bad_commit_line = next(
            (line for line in bisect_output.split('\n')
             if 'is the first bad commit' in line),
            None
        )

        return {
            "success": result.returncode == 0,
            "output": bisect_output,
            "bad_commit": bad_commit_line,
        }

    finally:
        # Always reset bisect state
        subprocess.run(["git", "bisect", "reset"], cwd=repo_path, check=False)
        os.unlink(script_path)


def analyze_bad_commit(repo_path: str, commit_hash: str, bug_description: str) -> str:
    """Use Claude to analyze the bad commit and explain the root cause."""

    # Get the diff of the bad commit; truncate it up front so large
    # diffs fit in the prompt (an inline comment inside the f-string
    # below would be sent to the model as literal text)
    diff_result = subprocess.run(
        ["git", "show", "--stat", "--patch", commit_hash],
        cwd=repo_path, capture_output=True, text=True
    )
    diff_text = diff_result.stdout[:6000]

    message_result = subprocess.run(
        ["git", "log", "-1", "--format=%B", commit_hash],
        cwd=repo_path, capture_output=True, text=True
    )

    message = client.messages.create(
        model="claude-opus-4-6",
        max_tokens=2048,
        messages=[{
            "role": "user",
            "content": f"""Git bisect identified this commit as the first bad commit.

Bug description: {bug_description}

Commit message:
{message_result.stdout}

Diff:
{diff_text}

Explain:
1. What change in this commit causes the bug
2. Which specific lines are responsible
3. How to fix it"""
        }]
    )

    return message.content[0].text
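The "first bad commit" line that run_bisect returns still contains prose around the hash. A small standalone helper (our own sketch, not part of the module above) shows how the full 40-character hash can be pulled out for downstream use:

```python
import re

def parse_first_bad_commit(bisect_output: str):
    """Extract the commit hash from a git bisect output line of the form
    '<40-hex-chars> is the first bad commit'. Returns None if absent."""
    m = re.search(r"\b([0-9a-f]{40}) is the first bad commit", bisect_output)
    return m.group(1) if m else None
```

Keeping the parsing in one place makes it easy to unit-test against captured bisect output.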

Step 2: The CLI Interface

# ai_bisect_cli.py
import click
from ai_bisect import generate_bisect_script, run_bisect, analyze_bad_commit
import re

@click.command()
@click.option('--repo', default='.', help='Repository path')
@click.option('--good', required=True, help='Known good commit/tag')
@click.option('--bad', default='HEAD', help='Known bad commit/tag')
@click.option('--bug', required=True, help='Bug description')
@click.option('--dry-run', is_flag=True, help='Show test script only, do not run bisect')
@click.option('--no-confirm', is_flag=True, help='Skip the confirmation prompt (for CI)')
def bisect(repo, good, bad, bug, dry_run, no_confirm):
    """AI-powered git bisect automation."""

    click.echo(f"Generating test script for: {bug}")
    script = generate_bisect_script(bug, repo)

    click.echo("\nGenerated test script:")
    click.echo("=" * 40)
    click.echo(script)
    click.echo("=" * 40)

    if dry_run:
        click.echo("\nDry run mode — not running bisect.")
        return

    if not no_confirm and not click.confirm("\nRun bisect with this script?"):
        return

    click.echo(f"\nRunning bisect between {good} and {bad}...")
    result = run_bisect(repo, good, bad, script)

    click.echo("\nBisect output:")
    click.echo(result['output'])

    if result['bad_commit']:
        # Extract commit hash from the output line
        hash_match = re.search(r'([0-9a-f]{7,40})', result['bad_commit'])
        if hash_match:
            commit_hash = hash_match.group(1)
            click.echo(f"\nAnalyzing bad commit: {commit_hash}")
            analysis = analyze_bad_commit(repo, commit_hash, bug)
            click.echo("\nRoot cause analysis:")
            click.echo(analysis)


if __name__ == '__main__':
    bisect()

Example Session

$ python ai_bisect_cli.py \
  --good v2.3.0 \
  --bad HEAD \
  --bug "User login returns 500 error when email contains uppercase letters"

Generating test script for: User login returns 500 error when email contains uppercase letters

Generated test script:
========================================
#!/bin/bash
set -e

# Build the project
npm run build 2>/dev/null || exit 125

# Test login with uppercase email
HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" \
  -X POST http://localhost:3000/api/login \
  -H "Content-Type: application/json" \
  -d '{"email": "User@Example.com", "password": "testpassword"}')

# 200 or 401 = good (request processed), 500 = bad
if [ "$HTTP_STATUS" = "500" ]; then
  exit 1  # bad commit
elif [ "$HTTP_STATUS" = "200" ] || [ "$HTTP_STATUS" = "401" ]; then
  exit 0  # good commit
else
  exit 125  # skip (server not running, build failed, etc)
fi
========================================

Run bisect with this script? [y/N]: y

Running bisect between v2.3.0 and HEAD...
Bisecting: 15 revisions left to test after this (roughly 4 steps)
[abc1234] Fix: normalize email before lookup
...
abc1234def5678 is the first bad commit

Analyzing bad commit: abc1234def5678...

Root cause analysis:
The commit "Fix: normalize email before lookup" introduced the bug on line 47 of
auth/login.js. The developer called `.toLower()` instead of `.toLowerCase()` — a
typo that causes a TypeError at runtime when an email with uppercase letters is
provided. The fix is to change `.toLower()` to `.toLowerCase()` on line 47.

Handling Flaky Tests

The script can incorporate retry logic for flaky tests:

#!/bin/bash
# Generated with retry logic for flaky integration tests
MAX_RETRIES=3
RETRY=0

while [ $RETRY -lt $MAX_RETRIES ]; do
    npm test -- --grep "user authentication" 2>/dev/null
    EXIT_CODE=$?

    if [ $EXIT_CODE -eq 0 ]; then
        exit 0  # good
    elif [ $EXIT_CODE -eq 1 ]; then
        RETRY=$((RETRY + 1))
        sleep 2
    else
        exit 125  # skip
    fi
done

exit 1  # consistently failing = bad commit
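Why three retries? If a flaky test spuriously fails with probability p on a good commit, requiring all k attempts to fail before marking the commit bad drops the misclassification probability to p^k. A quick sketch (the 10% flake rate is an assumed example, not a measured figure):

```python
def misclassify_prob(p_spurious_fail: float, retries: int) -> float:
    """Probability a good commit is marked bad: every retry must
    fail spuriously, assuming independent runs."""
    return p_spurious_fail ** retries

# e.g. a test that flakes 10% of the time
for k in (1, 2, 3):
    print(k, "retries ->", misclassify_prob(0.1, k))
```

With three retries a 10% flake rate shrinks to one bad classification in a thousand, which is usually below the noise floor of the bisect itself.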

GitHub Actions Integration

Run AI-powered bisect automatically when a regression test fails on main:

# .github/workflows/ai-bisect.yml
name: AI Bisect on Regression

on:
  issue_comment:
    types: [created]

jobs:
  ai-bisect:
    # Trigger with: /bisect good=v2.3.0 bug="login fails with uppercase email"
    if: contains(github.event.comment.body, '/bisect')
    runs-on: ubuntu-latest
    permissions:
      issues: write
      contents: read

    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0  # full history needed for bisect

      - name: Parse bisect command
        id: parse
        # Pass the comment through an env var instead of interpolating it
        # into the script, so untrusted comment text cannot inject commands
        env:
          COMMENT: ${{ github.event.comment.body }}
        run: |
          GOOD=$(echo "$COMMENT" | grep -oP 'good=\K[^\s]+')
          BUG=$(echo "$COMMENT" | grep -oP 'bug="\K[^"]+')
          echo "good=$GOOD" >> "$GITHUB_OUTPUT"
          echo "bug=$BUG" >> "$GITHUB_OUTPUT"

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.12'

      - name: Install dependencies
        run: pip install anthropic click

      - name: Run AI bisect
        id: bisect
        run: |
          python ai_bisect_cli.py \
            --repo . \
            --good "${{ steps.parse.outputs.good }}" \
            --bad HEAD \
            --bug "${{ steps.parse.outputs.bug }}" \
            --no-confirm 2>&1 | tee /tmp/bisect-output.txt
        env:
          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}

      - name: Post result as comment
        uses: actions/github-script@v7
        with:
          script: |
            const fs = require('fs');
            const output = fs.readFileSync('/tmp/bisect-output.txt', 'utf8');
            github.rest.issues.createComment({
              issue_number: context.issue.number,
              owner: context.repo.owner,
              repo: context.repo.repo,
              body: `## AI Bisect Result\n\`\`\`\n${output.slice(0, 8000)}\n\`\`\``
            });

Trigger from any GitHub issue or PR comment: /bisect good=v2.3.0 bug="search returns no results after sorting".
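The grep pipeline in the parse step can be mirrored in Python, which makes the /bisect grammar easy to unit-test before wiring it into the workflow; the function name here is our own sketch, not part of the workflow above:

```python
import re

def parse_bisect_comment(body: str):
    """Parse '/bisect good=<ref> bug="<description>"' from a comment body.
    Returns a dict of the two fields, or None if the comment doesn't match."""
    m = re.search(r'/bisect\s+good=(\S+)\s+bug="([^"]+)"', body)
    if not m:
        return None
    return {"good": m.group(1), "bug": m.group(2)}
```

Rejecting non-matching comments early (returning None) keeps the workflow from running a bisect on malformed commands.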


Strategies for Better AI Test Script Generation

The quality of the generated test script depends on how precisely you describe the bug. A vague description generates an imprecise test; a precise description generates a tight binary predicate.

Imprecise:

Bug: Login is broken

Precise:

Bug: POST /api/login returns HTTP 500 when the email field contains uppercase
letters. Example: {"email": "User@EXAMPLE.com", "password": "abc123"} → 500.
Lowercase email works fine. Started happening after deploy on 2026-03-15.
Test command to reproduce: curl -X POST http://localhost:3000/api/login
-H "Content-Type: application/json" -d '{"email": "User@EXAMPLE.COM", "password": "test"}'

Additional prompting strategies that improve script quality:

  1. Provide the known-good reproduction step: If you know the curl command, SQL query, or test that demonstrates the bug, include it in the description. Claude will incorporate it directly.

  2. Name the affected file or function if known: “The bug is in auth/normalizer.js, specifically the email normalization path” dramatically narrows the generated test.

  3. Specify the build command: If your project has a non-standard build step, include it. Claude defaults to npm run build, make, or go build based on the project files it detects — but an explicit build command prevents skip-on-build-failure errors.

  4. Add performance constraints: “Build takes 3 minutes; prefer a test that avoids a full rebuild” causes Claude to generate a test that restarts only the relevant service rather than rebuilding from scratch.

For repositories with strong test coverage, instruct Claude to use the existing test suite as the oracle: “Use npm test -- --grep 'email normalization' as the bisect predicate.” This produces more reliable results than generating ad-hoc curl commands.



Built by theluckystrike — More at zovo.one