git bisect is one of the most powerful debugging tools in a developer’s toolkit, but it requires writing a test script and manually interpreting results. AI-powered bisect automation handles the entire workflow: it generates the test script from a bug description, runs the binary search, and explains exactly which commit introduced the regression.
How Git Bisect Works (Brief)
Git bisect performs a binary search through commit history to find the first “bad” commit. You mark a known good commit and a known bad commit, then run a test script for each tested commit: exit code 0 means good, 125 means “skip this commit”, and any other code from 1 to 127 means bad. Git bisect needs roughly log₂(N) test runs to find the culprit in N commits.
The bottleneck: writing the test script.
Architecture of the Automated Tool
Bug description
↓
Claude API
↓
Test script (bash)
↓
git bisect run <test-script>
↓
Bisect output
↓
Claude API
↓
Root cause explanation + diff analysis
Step 1: Generate the Test Script
# ai_bisect.py
import anthropic
import subprocess
import tempfile
import os
import sys
client = anthropic.Anthropic()
def generate_bisect_script(
    bug_description: str,
    repo_path: str,
    language: str = "auto"
) -> str:
    """Generate a git bisect test script from a bug description.

    Collects a little repository context (the last 20 commit subjects and
    which well-known build files exist), sends it to the model together with
    the bug description, and returns the model's reply as script text.

    Args:
        bug_description: Free-form description of the regression to test for.
        repo_path: Path to the repository being bisected.
        language: Project language passed to the model as a hint; the default
            "auto" sends no hint.

    Returns:
        The script text, with any surrounding Markdown code fence removed
        and exactly one trailing newline.
    """
    # Get repo context
    result = subprocess.run(
        ["git", "log", "--oneline", "-20"],
        cwd=repo_path, capture_output=True, text=True
    )
    recent_commits = result.stdout

    # Probe for build-system marker files directly instead of shelling out to
    # `ls`, which is not available on every platform and exits non-zero when
    # any of the names is missing.
    marker_files = ["package.json", "requirements.txt", "go.mod", "Makefile"]
    project_files = "\n".join(
        name for name in marker_files
        if os.path.exists(os.path.join(repo_path, name))
    )

    # Only mention the language when the caller actually specified one.
    language_hint = "" if language == "auto" else f"Project language: {language}\n"

    system_prompt = """You are a git bisect test script generator. Generate a bash script that:
1. Tests for the described bug
2. Exits 0 if the bug is NOT present (commit is good)
3. Exits 1 if the bug IS present (commit is bad)
4. Exits 125 if the test can't run (skip this commit)
The script should be minimal and fast. Use grep, curl, or run the test suite.
Start with #!/bin/bash and set -e. Return ONLY the script, no explanations."""

    message = client.messages.create(
        model="claude-opus-4-6",
        max_tokens=2048,
        system=system_prompt,
        messages=[{
            "role": "user",
            "content": f"""Bug description: {bug_description}
Recent commits:
{recent_commits}
Project files present: {project_files}
{language_hint}Generate a bisect test script."""
        }]
    )
    return _strip_code_fences(message.content[0].text)


def _strip_code_fences(text: str) -> str:
    """Return *text* without a surrounding Markdown code fence, if present."""
    stripped = text.strip()
    if stripped.startswith("```"):
        # Drop the opening fence line (which may carry a language tag) and,
        # if present, the closing fence, so the shebang becomes line one.
        lines = stripped.splitlines()[1:]
        if lines and lines[-1].strip().startswith("```"):
            lines = lines[:-1]
        stripped = "\n".join(lines).strip()
    return stripped + "\n"
def run_bisect(
    repo_path: str,
    good_commit: str,
    bad_commit: str,
    test_script_content: str,
    timeout: float = 300,
) -> dict:
    """Run git bisect with the generated script.

    Args:
        repo_path: Path to the repository to bisect.
        good_commit: Revision known not to show the bug.
        bad_commit: Revision known to show the bug.
        test_script_content: Text of the predicate script passed to
            ``git bisect run``.
        timeout: Seconds allowed for the whole ``git bisect run`` call
            (default 300, i.e. five minutes).

    Returns:
        A dict with keys ``"success"`` (True when ``git bisect run`` exited
        0), ``"output"`` (its combined stdout and stderr) and ``"bad_commit"``
        (the output line containing "is the first bad commit", or None).

    Raises:
        subprocess.CalledProcessError: If starting the bisect or marking the
            good/bad revisions fails.
        subprocess.TimeoutExpired: If ``git bisect run`` exceeds *timeout*.
        OSError: If git cannot be launched in *repo_path*.
    """
    # Write test script to temp file. No explicit dir: tempfile picks the
    # platform's temporary directory, so this also works where /tmp does not
    # exist.
    with tempfile.NamedTemporaryFile(
        mode='w', suffix='.sh', delete=False
    ) as f:
        f.write(test_script_content)
        script_path = f.name

    try:
        os.chmod(script_path, 0o755)

        # Start bisect
        subprocess.run(["git", "bisect", "start"], cwd=repo_path, check=True)
        subprocess.run(["git", "bisect", "bad", bad_commit], cwd=repo_path, check=True)
        subprocess.run(["git", "bisect", "good", good_commit], cwd=repo_path, check=True)

        # Run bisect
        result = subprocess.run(
            ["git", "bisect", "run", script_path],
            cwd=repo_path,
            capture_output=True,
            text=True,
            timeout=timeout
        )
        bisect_output = result.stdout + result.stderr

        # Get the bad commit hash from output
        bad_commit_line = next(
            (line for line in bisect_output.split('\n')
             if 'is the first bad commit' in line),
            None
        )
        return {
            "success": result.returncode == 0,
            "output": bisect_output,
            "bad_commit": bad_commit_line,
        }
    finally:
        # Always reset bisect state. The nested finally guarantees the temp
        # script is removed even when the reset itself cannot be launched.
        try:
            subprocess.run(["git", "bisect", "reset"], cwd=repo_path, check=False)
        finally:
            os.unlink(script_path)
def analyze_bad_commit(repo_path: str, commit_hash: str, bug_description: str) -> str:
    """Use Claude to analyze the bad commit and explain the root cause.

    Args:
        repo_path: Path to the repository containing the commit.
        commit_hash: Hash of the commit that bisect reported as first bad.
        bug_description: The bug description originally given to the tool.

    Returns:
        The text of the model's explanation.
    """
    # Get the diff of the bad commit
    diff_result = subprocess.run(
        ["git", "show", "--stat", "--patch", commit_hash],
        cwd=repo_path, capture_output=True, text=True
    )
    message_result = subprocess.run(
        ["git", "log", "-1", "--format=%B", commit_hash],
        cwd=repo_path, capture_output=True, text=True
    )

    # Truncate large diffs. This is done outside the f-string so the note
    # stays a code comment instead of leaking into the prompt text.
    diff_excerpt = diff_result.stdout[:6000]

    response = client.messages.create(
        model="claude-opus-4-6",
        max_tokens=2048,
        messages=[{
            "role": "user",
            "content": f"""Git bisect identified this commit as the first bad commit.
Bug description: {bug_description}
Commit message:
{message_result.stdout}
Diff:
{diff_excerpt}
Explain:
1. What change in this commit causes the bug
2. Which specific lines are responsible
3. How to fix it"""
        }]
    )
    return response.content[0].text
Step 2: The CLI Interface
# ai_bisect_cli.py
import click
from ai_bisect import generate_bisect_script, run_bisect, analyze_bad_commit
import re
@click.command()
@click.option('--repo', default='.', help='Repository path')
@click.option('--good', required=True, help='Known good commit/tag')
@click.option('--bad', default='HEAD', help='Known bad commit/tag')
@click.option('--bug', required=True, help='Bug description')
@click.option('--dry-run', is_flag=True, help='Show test script only, do not run bisect')
@click.option('--no-confirm', is_flag=True,
              help='Run bisect without asking for confirmation (for CI use)')
def bisect(repo, good, bad, bug, dry_run, no_confirm):
    """AI-powered git bisect automation."""
    # Flow: generate the predicate script, show it, optionally confirm,
    # run the bisect, then ask the model to explain the culprit commit.
    click.echo(f"Generating test script for: {bug}")
    script = generate_bisect_script(bug, repo)
    click.echo("\nGenerated test script:")
    click.echo("=" * 40)
    click.echo(script)
    click.echo("=" * 40)

    if dry_run:
        click.echo("\nDry run mode — not running bisect.")
        return

    # --no-confirm skips the interactive prompt so the command can run
    # unattended, e.g. from a CI job that has no terminal.
    if not no_confirm and not click.confirm("\nRun bisect with this script?"):
        return

    click.echo(f"\nRunning bisect between {good} and {bad}...")
    result = run_bisect(repo, good, bad, script)
    click.echo("\nBisect output:")
    click.echo(result['output'])

    # Extract commit hash from the output line
    hash_match = None
    if result['bad_commit']:
        hash_match = re.search(r'([0-9a-f]{40})', result['bad_commit'])

    if hash_match is None:
        # Say so explicitly instead of ending silently after the raw output.
        click.echo("\nCould not identify the first bad commit from the bisect output.", err=True)
        return

    commit_hash = hash_match.group(1)
    click.echo(f"\nAnalyzing bad commit: {commit_hash}")
    analysis = analyze_bad_commit(repo, commit_hash, bug)
    click.echo("\nRoot cause analysis:")
    click.echo(analysis)


if __name__ == '__main__':
    bisect()
Example Session
$ python ai_bisect_cli.py \
--good v2.3.0 \
--bad HEAD \
--bug "User login returns 500 error when email contains uppercase letters"
Generating test script for: User login returns 500 error when email contains uppercase letters
Generated test script:
========================================
#!/bin/bash
# Bisect predicate: exit 0 = good, 1 = bad, 125 = skip this commit.
set -e
# Build the project
npm run build 2>/dev/null || exit 125
# Test login with uppercase email.
# The "|| exit 125" matters under set -e: without it a failed curl (for
# example, connection refused) would abort the script with curl's own exit
# code, which git bisect treats as "bad" instead of "skip".
HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" \
  -X POST http://localhost:3000/api/login \
  -H "Content-Type: application/json" \
  -d '{"email": "User@Example.com", "password": "testpassword"}') || exit 125
# 200 or 401 = good (request processed), 500 = bad
if [ "$HTTP_STATUS" = "500" ]; then
  exit 1 # bad commit
elif [ "$HTTP_STATUS" = "200" ] || [ "$HTTP_STATUS" = "401" ]; then
  exit 0 # good commit
else
  exit 125 # skip (unexpected status, so the result is inconclusive)
fi
========================================
Run bisect with this script? [y/N]: y
Running bisect between v2.3.0 and HEAD...
Bisecting: 15 revisions left to test after this (roughly 4 steps)
[abc1234] Fix: normalize email before lookup
...
abc1234def5678 is the first bad commit
Analyzing bad commit: abc1234def5678...
Root cause analysis:
The commit "Fix: normalize email before lookup" introduced the bug on line 47 of
auth/login.js. The developer called `.toLower()` instead of `.toLowerCase()` — a
typo that causes a TypeError at runtime when an email with uppercase letters is
provided. The fix is to change `.toLower()` to `.toLowerCase()` on line 47.
Handling Flaky Tests
The script can incorporate retry logic for flaky tests:
#!/bin/bash
# Bisect predicate with retries for flaky integration tests.
# A single passing run marks the commit good; only a failure on every
# attempt marks it bad.
ATTEMPTS=3
for ((attempt = 0; attempt < ATTEMPTS; attempt++)); do
  npm test -- --grep "user authentication" 2>/dev/null
  case $? in
    0) exit 0 ;;    # good
    1) sleep 2 ;;   # test failure: pause, then try again
    *) exit 125 ;;  # any other exit code: skip
  esac
done
exit 1 # consistently failing = bad commit
GitHub Actions Integration
Run AI-powered bisect on demand by posting a /bisect comment on an issue or pull request — for example, when a regression test starts failing on main:
# .github/workflows/ai-bisect.yml
# Runs the AI bisect CLI when an issue/PR comment contains "/bisect" and
# posts the captured output back as a comment.
name: AI Bisect on Regression
on:
  issue_comment:
    types: [created]
jobs:
  ai-bisect:
    # Trigger with: /bisect good=v2.3.0 bug="login fails with uppercase email"
    # NOTE(review): any commenter can trigger this job, which runs a generated
    # script with the API key in its environment. Consider also gating on
    # github.event.comment.author_association.
    if: contains(github.event.comment.body, '/bisect')
    runs-on: ubuntu-latest
    permissions:
      issues: write
      contents: read
    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0 # full history needed for bisect
      - name: Parse bisect command
        id: parse
        # The comment body is untrusted. Pass it through an environment
        # variable rather than interpolating ${{ }} into the script, where
        # its contents would be executed as shell code.
        env:
          COMMENT: ${{ github.event.comment.body }}
        run: |
          GOOD=$(printf '%s\n' "$COMMENT" | grep -oP 'good=\K[^\s]+')
          BUG=$(printf '%s\n' "$COMMENT" | grep -oP 'bug="\K[^"]+')
          echo "good=$GOOD" >> "$GITHUB_OUTPUT"
          echo "bug=$BUG" >> "$GITHUB_OUTPUT"
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.12'
      - name: Install dependencies
        run: pip install anthropic click
      - name: Run AI bisect
        id: bisect
        # The parsed values still come from the comment, so they are passed
        # via env as well instead of being expanded into the script.
        env:
          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
          GOOD_REF: ${{ steps.parse.outputs.good }}
          BUG_DESCRIPTION: ${{ steps.parse.outputs.bug }}
        run: |
          # The CLI asks for confirmation before running the generated
          # script; there is no terminal in CI, so answer "y" on stdin.
          printf 'y\n' | python ai_bisect_cli.py \
            --repo . \
            --good "$GOOD_REF" \
            --bad HEAD \
            --bug "$BUG_DESCRIPTION" 2>&1 | tee /tmp/bisect-output.txt
      - name: Post result as comment
        uses: actions/github-script@v7
        with:
          script: |
            const fs = require('fs');
            const output = fs.readFileSync('/tmp/bisect-output.txt', 'utf8');
            // Await the request so a failed API call fails this step.
            await github.rest.issues.createComment({
              issue_number: context.issue.number,
              owner: context.repo.owner,
              repo: context.repo.repo,
              body: `## AI Bisect Result\n\`\`\`\n${output.slice(0, 8000)}\n\`\`\``
            });
Trigger from any GitHub issue or PR comment: /bisect good=v2.3.0 bug="search returns no results after sorting".
Strategies for Better AI Test Script Generation
The quality of the generated test script depends on how precisely you describe the bug. A vague description generates an imprecise test; a precise description generates a tight binary predicate.
Imprecise:
Bug: Login is broken
Precise:
Bug: POST /api/login returns HTTP 500 when the email field contains uppercase
letters. Example: {"email": "User@EXAMPLE.com", "password": "abc123"} → 500.
Lowercase email works fine. Started happening after deploy on 2026-03-15.
Test command to reproduce: curl -X POST http://localhost:3000/api/login
-H "Content-Type: application/json" -d '{"email": "User@EXAMPLE.COM", "password": "test"}'
Additional prompting strategies that improve script quality:
-
Provide the known-good reproduction step: If you know the curl command, SQL query, or test that demonstrates the bug, include it in the description. Claude will incorporate it directly.
-
Name the affected file or function if known: “The bug is in auth/normalizer.js, specifically the email normalization path” dramatically narrows the generated test.
-
Specify the build command: If your project has a non-standard build step, include it. Claude defaults to npm run build, make, or go build based on the project files it detects — but an explicit build command prevents skip-on-build-failure errors.
-
Add performance constraints: “Build takes 3 minutes; prefer a test that avoids a full rebuild” causes Claude to generate a test that restarts only the relevant service rather than rebuilding from scratch.
For repositories with strong test coverage, instruct Claude to use the existing test suite as the oracle: “Use npm test -- --grep 'email normalization' as the bisect predicate.” This produces more reliable results than generating ad-hoc curl commands.
Related Reading
- AI Code Review Automation Tools Comparison
- How to Use AI for WebAssembly Development
- Claude vs GPT-4 for Writing Unit Test Mocks
- AI Coding Tools for Automating Changelog Generation from
Built by theluckystrike — More at zovo.one