zed/script/github-check-new-issue-for-duplicates.py
2026-02-18 17:15:15 +00:00

534 lines
18 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
Comment on newly opened issues that might be duplicates of an existing issue.
This script is run by a GitHub Actions workflow when a new bug or crash report
is opened. It:
1. Checks eligibility (must be bug/crash type, non-staff author)
2. Detects relevant areas using Claude + the area label taxonomy
3. Parses known "duplicate magnets" from tracking issue #46355
4. Searches for similar recent issues by title keywords, area labels, and error patterns
5. Asks Claude to analyze potential duplicates (magnets + search results)
6. Posts a comment on the issue if high-confidence duplicates are found
Requires:
requests (pip install requests)
Usage:
python github-check-new-issue-for-duplicates.py <issue_number>
Environment variables:
GITHUB_TOKEN - GitHub token (org members: read, issues: read & write)
ANTHROPIC_API_KEY - Anthropic API key for Claude
"""
import argparse
import json
import os
import re
import sys
from datetime import datetime, timedelta
import requests
# GitHub REST API root and the repository this script operates on.
GITHUB_API = "https://api.github.com"
REPO_OWNER = "zed-industries"
REPO_NAME = "zed"

# Issue whose body lists the known "duplicate magnets".
TRACKING_ISSUE_NUMBER = 46355

# Issues opened by members of this team are skipped.
STAFF_TEAM_SLUG = "staff"

# Area prefixes shown to Claude as a single "<prefix>/*" summary line
# instead of listing every sub-label.
PREFIXES_TO_COLLAPSE = [
    "languages",
    "parity",
    "tooling",
]

# Words dropped from title keyword searches. Very short words are not listed
# here because the keyword filter already discards words of length <= 2.
STOPWORDS = {
    "after",
    "all",
    "also",
    "and",
    "any",
    "but",
    "can't",
    "does",
    "doesn't",
    "don't",
    "for",
    "from",
    "have",
    "just",
    "not",
    "only",
    "some",
    "that",
    "the",
    "this",
    "when",
    "while",
    "with",
    "won't",
    "work",
    "working",
    "zed",
}
def log(message):
    """Emit a diagnostic line on stderr, leaving stdout free for the JSON result."""
    diagnostics = sys.stderr
    print(message, file=diagnostics)
def github_api_get(path, params=None, timeout=30):
    """Fetch JSON from the GitHub API.

    Relies on the module-level GITHUB_HEADERS, which is only defined when the
    file runs as a script.

    Args:
        path: API path relative to GITHUB_API; a leading "/" is optional.
        params: Optional query-string parameters.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would hang the workflow indefinitely.

    Returns:
        The decoded JSON response body.

    Raises:
        requests.HTTPError: If the response status is 4xx/5xx.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    url = f"{GITHUB_API}/{path.lstrip('/')}"
    response = requests.get(url, headers=GITHUB_HEADERS, params=params, timeout=timeout)
    response.raise_for_status()
    return response.json()
def github_search_issues(query, per_page=15):
    """Run an issue search and return the result items, newest first.

    Returns an empty list when the response carries no "items" key.
    """
    search_params = {
        "q": query,
        "sort": "created",
        "order": "desc",
        "per_page": per_page,
    }
    payload = github_api_get("/search/issues", search_params)
    return payload.get("items", [])
def check_team_membership(org, team_slug, username):
    """Return True when ``username`` has an active membership in the given team.

    A 404 from the membership endpoint means "not a member" and yields False;
    any other HTTP error is re-raised.
    """
    endpoint = f"/orgs/{org}/teams/{team_slug}/memberships/{username}"
    try:
        membership = github_api_get(endpoint)
    except requests.HTTPError as err:
        if err.response.status_code != 404:
            raise
        return False
    return membership.get("state") == "active"
def post_comment(issue_number: int, body, timeout=30):
    """Post ``body`` as a new comment on the given issue.

    Relies on the module-level GITHUB_HEADERS, which is only defined when the
    file runs as a script.

    Args:
        issue_number: Number of the issue to comment on.
        body: Markdown text of the comment.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot hang the workflow indefinitely.

    Raises:
        requests.HTTPError: If the response status is 4xx/5xx.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    url = f"{GITHUB_API.rstrip('/')}/repos/{REPO_OWNER}/{REPO_NAME}/issues/{issue_number}/comments"
    response = requests.post(url, headers=GITHUB_HEADERS, json={"body": body}, timeout=timeout)
    response.raise_for_status()
    log(f" Posted comment on #{issue_number}")
def build_duplicate_comment(matches):
    """Build the Markdown comment body listing potential duplicates.

    Args:
        matches: Iterable of dicts with a "number" key and, optionally, an
            "explanation" key.

    Returns:
        The comment text. Blank lines separate the list, the paragraphs, the
        <details> content and the horizontal rule: without them Markdown folds
        the following paragraph into the last list item, does not render the
        explanations inside <details>, and does not treat "---" as a rule.
    """
    match_list = "\n".join(f"- #{m['number']}" for m in matches)
    # A match without an explanation must not abort the whole comment.
    explanations = "\n\n".join(
        f"**#{m['number']}:** {m.get('explanation') or 'No explanation was provided.'}"
        for m in matches
    )
    # Rightwards arrow between the two UI steps, written as an escape so the
    # source stays ASCII.
    arrow = "\u2192"
    return f"""This issue appears to be a duplicate of:

{match_list}

**If this is indeed a duplicate:**
Please close this issue and subscribe to the linked issue for updates (select "Close as not planned" {arrow} "Duplicate")

**If this is a different issue:**
No action needed. A maintainer will review this shortly.

<details>
<summary>Why were these issues selected?</summary>

{explanations}

</details>

---
<sub>This is an automated analysis and might be incorrect.</sub>"""
def call_claude(api_key, system, user_content, max_tokens=1024, timeout=120):
    """Send a single-turn message to Claude and return the text of its reply.

    Args:
        api_key: Anthropic API key, sent in the ``x-api-key`` header.
        system: System prompt.
        user_content: Content of the single user message.
        max_tokens: Upper bound on the number of generated tokens.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot hang the workflow indefinitely.

    Returns:
        The text of the first content block when that block is a text block,
        otherwise an empty string.

    Raises:
        requests.HTTPError: If the response status is 4xx/5xx.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    response = requests.post(
        "https://api.anthropic.com/v1/messages",
        headers={
            "x-api-key": api_key,
            "anthropic-version": "2023-06-01",
            "content-type": "application/json",
        },
        json={
            "model": "claude-sonnet-4-20250514",
            "max_tokens": max_tokens,
            "temperature": 0.0,
            "system": system,
            "messages": [{"role": "user", "content": user_content}],
        },
        timeout=timeout,
    )
    response.raise_for_status()
    data = response.json()

    usage = data.get("usage", {})
    log(f" Token usage - Input: {usage.get('input_tokens', 'N/A')}, Output: {usage.get('output_tokens', 'N/A')}")

    content = data.get("content", [])
    if content and content[0].get("type") == "text":
        return content[0].get("text") or ""
    return ""
def fetch_issue(issue_number: int):
    """Load one issue from GitHub and reduce it to the fields this script uses.

    Returns a dict with keys "number", "title", "body", "author" and "type".
    A missing body or author login becomes ""; a missing issue type becomes None.
    """
    log(f"Fetching issue #{issue_number}")
    raw = github_api_get(f"/repos/{REPO_OWNER}/{REPO_NAME}/issues/{issue_number}")

    # "user" and "type" may be absent or null in the payload.
    author_info = raw.get("user") or {}
    type_info = raw.get("type") or {}

    issue = dict(
        number=issue_number,
        title=raw["title"],
        body=raw.get("body") or "",
        author=author_info.get("login") or "",
        type=type_info.get("name"),
    )
    log(f" Title: {issue['title']}\n Type: {issue['type']}\n Author: {issue['author']}")
    return issue
def should_skip(issue):
    """Decide whether duplicate detection should be skipped for ``issue``.

    Returns True when the issue is not a Bug/Crash report, or when its author
    is an active member of the staff team; otherwise False.
    """
    issue_type = issue["type"]
    if issue_type not in ("Bug", "Crash"):
        log(f" Skipping: issue type '{issue_type}' is not a bug/crash report")
        return True

    author = issue["author"]
    # The membership lookup only runs when an author login is known.
    author_is_staff = bool(author) and check_team_membership(REPO_OWNER, STAFF_TEAM_SLUG, author)
    if author_is_staff:
        log(f" Skipping: author '{author}' is a {STAFF_TEAM_SLUG} member")
        return True

    return False
def fetch_area_labels():
    """Collect the repository's ``area:*`` labels.

    Pages through the labels endpoint 100 at a time until an empty page is
    returned. Returns a list of {"name", "description"} dicts where "name" has
    the "area:" prefix removed and a missing description becomes "".
    """
    log("Fetching area labels")
    endpoint = f"/repos/{REPO_OWNER}/{REPO_NAME}/labels"

    all_labels = []
    page_number = 1
    while True:
        batch = github_api_get(endpoint, params={"per_page": 100, "page": page_number})
        if not batch:
            break
        all_labels.extend(batch)
        page_number += 1

    prefix = "area:"
    area_labels = []
    for label in all_labels:
        label_name = label["name"]
        if not label_name.startswith(prefix):
            continue
        area_labels.append(
            {
                "name": label_name[len(prefix):],
                "description": label.get("description") or "",
            }
        )

    log(f" Found {len(area_labels)} area labels")
    return area_labels
def format_taxonomy_for_claude(area_labels):
    """Render area labels as a sorted, de-duplicated bullet list for the prompt.

    Labels under a prefix in PREFIXES_TO_COLLAPSE are replaced by one
    "<prefix>/*" summary line; all other labels get "- name: description"
    (or just "- name" when the description is empty).
    """
    entries = set()
    for label in area_labels:
        label_name = label["name"]

        collapsed = None
        for prefix in PREFIXES_TO_COLLAPSE:
            if label_name.startswith(f"{prefix}/"):
                collapsed = prefix
                break

        if collapsed:
            # The set keeps a single summary line per collapsed prefix.
            entries.add(f"- {collapsed}/* (multiple specific sub-labels exist)")
            continue

        description = label["description"]
        if description:
            entries.add(f"- {label_name}: {description}")
        else:
            entries.add(f"- {label_name}")

    return "\n".join(sorted(entries))
def detect_areas(anthropic_key, issue, taxonomy):
    """Use Claude to detect relevant areas for the issue.

    Args:
        anthropic_key: Anthropic API key passed through to call_claude().
        issue: Dict with at least "title" and "body".
        taxonomy: Area taxonomy text shown to the model.

    Returns:
        A list of area names in the order the model gave them, without blanks
        or repeats. Returns [] when the model answers "none" or returns nothing.
    """
    log("Detecting areas with Claude")
    system_prompt = """You analyze GitHub issues to identify which area labels apply.
Given an issue and a taxonomy of areas, output ONLY a comma-separated list of matching area names.
- Output at most 3 areas, ranked by relevance
- Use exact area names from the taxonomy
- If no areas clearly match, output: none
- For languages/*, tooling/*, or parity/*, use the specific sub-label (e.g., "languages/rust",
tooling/eslint, parity/vscode)
Example outputs:
- "editor, parity/vim"
- "ai, ai/agent panel"
- "none"
"""
    user_content = f"""## Area Taxonomy
{taxonomy}
# Issue Title
{issue['title']}
# Issue Body
{issue['body'][:4000]}"""
    response = call_claude(anthropic_key, system_prompt, user_content, max_tokens=100).strip()
    log(f" Detected areas: {response}")

    areas = []
    for part in response.split(","):
        # The prompt's example outputs are bulleted and quoted, so the model
        # may echo that decoration; strip it before using the name.
        area = part.strip().lstrip("-").strip().strip("\"'`").strip()
        # An empty reply would otherwise yield [""], which produces a bogus
        # label search and wrongly narrows the magnet filter.
        if not area or area.lower() == "none" or area in areas:
            continue
        areas.append(area)
    return areas
def parse_duplicate_magnets():
    """Read the known duplicate magnets out of the tracking issue body.

    The body is expected to look like:

        ## area_name
        - [N dupes] https://github.com/zed-industries/zed/issues/NUMBER

    Returns a list of {"number", "areas", "dupe_count"} dicts sorted by
    dupe_count, highest first. Title and body preview are not included; use
    enrich_magnets() to fetch them for the magnets you need.
    """
    log(f"Parsing duplicate magnets from #{TRACKING_ISSUE_NUMBER}")
    tracking_issue = github_api_get(f"/repos/{REPO_OWNER}/{REPO_NAME}/issues/{TRACKING_ISSUE_NUMBER}")
    text = tracking_issue.get("body") or ""

    by_number = {}  # issue number -> magnet dict
    area = None
    for raw_line in text.split("\n"):
        # A "## " heading starts a new area section.
        if raw_line.startswith("## "):
            area = raw_line[3:].strip()
            continue

        is_entry = bool(area) and raw_line.startswith("-") and "/issues/" in raw_line
        if not is_entry:
            continue

        # entry format: - [N dupes] https://github.com/.../issues/NUMBER
        try:
            count_token = raw_line.split("[")[1].split()[0]
            dupe_count = int(count_token)
            number_token = raw_line.split("/issues/")[1].split()[0].rstrip(")")
            number = int(number_token)
        except (ValueError, IndexError):
            continue

        # "(unlabeled)" contributes no area, so such magnets match everything.
        labeled_areas = [] if area == "(unlabeled)" else [area]

        existing = by_number.get(number)
        if existing is None:
            by_number[number] = {
                "number": number,
                "areas": labeled_areas,
                "dupe_count": dupe_count,
            }
        else:
            # Seen under another section already: only the area list grows,
            # the first dupe_count is kept.
            existing["areas"].extend(labeled_areas)

    ranked = sorted(by_number.values(), key=lambda m: m["dupe_count"], reverse=True)
    log(f" Parsed {len(ranked)} duplicate magnets")
    return ranked
def enrich_magnets(magnets):
    """Add "title" and "body_preview" to each magnet dict, in place.

    Issues one API request per magnet; body_preview is the first 500
    characters of the issue body ("" when the body is missing).
    """
    log(f" Fetching details for {len(magnets)} magnets")
    for entry in magnets:
        details = github_api_get(f"/repos/{REPO_OWNER}/{REPO_NAME}/issues/{entry['number']}")
        full_body = details.get("body") or ""
        entry.update(title=details["title"], body_preview=full_body[:500])
def areas_match(detected, magnet_area):
    """Tell whether two area names are equal or one is an ancestor of the other.

    Hierarchy levels are separated by "/", so "ai" matches "ai/agent panel"
    in either argument order, while "editor" does not match "editors".
    """
    if detected == magnet_area:
        return True
    if magnet_area.startswith(detected + "/"):
        return True
    return detected.startswith(magnet_area + "/")
def filter_magnets_by_areas(magnets, detected_areas):
    """Keep only the magnets relevant to the detected areas.

    With no detected areas the input list is returned unchanged. Magnets with
    an empty "areas" list (unlabeled) are always kept; the rest are kept when
    any of their areas matches any detected area per areas_match().
    """
    if not detected_areas:
        return magnets

    wanted = set(detected_areas)
    kept = []
    for magnet in magnets:
        magnet_areas = magnet["areas"]
        if not magnet_areas or any(
            areas_match(candidate, magnet_area)
            for candidate in wanted
            for magnet_area in magnet_areas
        ):
            kept.append(magnet)
    return kept
def search_for_similar_issues(issue, detected_areas, max_searches=6):
    """Search for similar open issues that might be duplicates.

    Builds up to ``max_searches`` queries: one from title keywords, one per
    detected area label (issues created in the last 60 days), and one from the
    first error-like line in the body.

    Args:
        issue: Dict with at least "number", "title" and "body".
        detected_areas: Area names, without the "area:" prefix.
        max_searches: Cap on the number of queries, to keep token usage and
            context size under control.

    Returns:
        A list of dicts with keys "number", "title", "state", "created_at",
        "body_preview" and "source", one per distinct issue found, excluding
        the issue itself. A failed search is logged and skipped.
    """
    # Local import: the file-level import only brings in datetime and timedelta.
    from datetime import timezone

    log("Searching for similar issues")
    # Timezone-aware UTC so the cutoff date does not depend on the runner's local time.
    sixty_days_ago = (datetime.now(timezone.utc) - timedelta(days=60)).strftime("%Y-%m-%d")
    base_query = f"repo:{REPO_OWNER}/{REPO_NAME} is:issue is:open"
    seen_issues = {}
    queries = []

    # Title and body are user-supplied text: stray double quotes would open or
    # close a phrase in the search query, so they are removed before use.
    title_keywords = []
    for word in issue["title"].split():
        word = word.replace('"', "")
        if len(word) > 2 and word.lower() not in STOPWORDS:
            title_keywords.append(word)
    if title_keywords:
        keywords_query = " ".join(title_keywords)
        queries.append(("title_keywords", f"{base_query} {keywords_query}"))

    for area in detected_areas:
        area = area.replace('"', "").strip()
        if not area:
            # A blank area would search for the meaningless label "area:".
            continue
        queries.append(("area_label", f'{base_query} label:"area:{area}" created:>{sixty_days_ago}'))

    # error pattern search: capture 5-90 chars after keyword, colon optional
    error_pattern = r"(?i:\b(?:error|panicked|panic|failed)\b)\s*([^\n]{5,90})"
    match = re.search(error_pattern, issue["body"])
    if match:
        # Drop the optional leading colon, replace embedded double quotes (they
        # would terminate the quoted phrase early) and collapse whitespace.
        raw_snippet = match.group(1).strip().lstrip(":").replace('"', " ")
        error_snippet = " ".join(raw_snippet.split())
        if error_snippet:
            queries.append(("error_pattern", f'{base_query} in:body "{error_snippet}"'))

    for search_type, query in queries[:max_searches]:
        log(f" Search ({search_type}): {query}")
        try:
            results = github_search_issues(query, per_page=15)
        except requests.RequestException as e:
            log(f" Search failed: {e}")
            continue
        for item in results:
            number = item["number"]
            if number == issue["number"] or number in seen_issues:
                continue
            body = item.get("body") or ""
            seen_issues[number] = {
                "number": number,
                "title": item["title"],
                "state": item.get("state", ""),
                "created_at": item.get("created_at", ""),
                "body_preview": body[:500],
                "source": search_type,
            }

    similar_issues = list(seen_issues.values())
    log(f" Found {len(similar_issues)} similar issues")
    return similar_issues
def analyze_duplicates(anthropic_key, issue, magnets, search_results):
    """Use Claude to analyze potential duplicates.

    The candidates sent to the model are the first 10 magnets (enriched with
    title and body preview, which mutates those dicts) plus the first 10 search
    results that are not already among those magnets.

    Args:
        anthropic_key: Anthropic API key passed through to call_claude().
        issue: Dict with at least "number", "title" and "body".
        magnets: Magnet dicts, most relevant first.
        search_results: Search result dicts with "number", "title" and
            "body_preview".

    Returns:
        A ``(matches, summary)`` tuple. ``matches`` contains only dict entries
        whose "number" is one of the candidates and whose "confidence" is
        "high" or "medium". On an unparsable reply it is
        ``([], "Failed to parse analysis")``.
    """
    log("Analyzing duplicates with Claude")
    top_magnets = magnets[:10]
    enrich_magnets(top_magnets)
    magnet_numbers = {m["number"] for m in top_magnets}
    candidates = [
        {"number": m["number"], "title": m["title"], "body_preview": m["body_preview"], "source": "known_duplicate_magnet"}
        for m in top_magnets
    ] + [
        {"number": r["number"], "title": r["title"], "body_preview": r["body_preview"], "source": "search_result"}
        for r in search_results[:10]
        if r["number"] not in magnet_numbers
    ]
    if not candidates:
        return [], "No candidates to analyze"

    system_prompt = """You analyze GitHub issues to identify potential duplicates.
Given a new issue and a list of existing issues, identify which existing issues might be duplicates.
For each potential duplicate, assess confidence:
- "high": Very likely the same issue (same root cause, same symptoms)
- "medium": Possibly related (likely to be the same root cause)
- Do NOT include tangentially related issues (same general area but probably different issues)
Output only valid JSON (no markdown code blocks) with this structure:
{
"matches": [
{
"number": 12345,
"confidence": "high|medium",
"explanation": "Brief explanation of why this might be a duplicate"
}
],
"summary": "One sentence summary of findings"
}
Only include matches with "high" or "medium" confidence. Return empty matches array if none found."""
    user_content = f"""## New Issue #{issue['number']}
**Title:** {issue['title']}
**Body:**
{issue['body'][:3000]}
## Existing Issues to Compare
{json.dumps(candidates, indent=2)}"""
    response = call_claude(anthropic_key, system_prompt, user_content, max_tokens=2048)

    # The model may wrap its JSON in a Markdown code fence despite the
    # instruction not to; unwrap it before parsing.
    payload = response.strip()
    if payload.startswith("```"):
        payload = re.sub(r"^```[A-Za-z]*\s*|\s*```$", "", payload)

    try:
        data = json.loads(payload)
    except json.JSONDecodeError as e:
        log(f" Failed to parse response: {e}")
        log(f" Raw response: {response}")
        return [], "Failed to parse analysis"
    if not isinstance(data, dict):
        log(f" Unexpected response shape: {response}")
        return [], "Failed to parse analysis"

    raw_matches = data.get("matches")
    if not isinstance(raw_matches, list):
        raw_matches = []

    # Only trust matches that point at an issue we actually offered and that
    # carry a recognised confidence, so callers never act on a hallucinated
    # issue number or a malformed entry.
    candidate_numbers = {c["number"] for c in candidates}
    matches = [
        m
        for m in raw_matches
        if isinstance(m, dict)
        and isinstance(m.get("number"), int)
        and m["number"] in candidate_numbers
        and m.get("confidence") in ("high", "medium")
    ]
    dropped = len(raw_matches) - len(matches)
    if dropped:
        log(f" Ignored {dropped} malformed or unknown match(es)")

    summary = data.get("summary", "Analysis complete")
    log(f" Found {len(matches)} potential matches")
    return matches, summary
if __name__ == "__main__":
    # Entry point: check one issue for duplicates, optionally comment on it,
    # and print a JSON summary of the run on stdout (diagnostics go to stderr).
    parser = argparse.ArgumentParser(description="Identify potential duplicate issues")
    parser.add_argument("issue_number", type=int, help="Issue number to analyze")
    parser.add_argument("--dry-run", action="store_true", help="Skip posting comment, just log what would be posted")
    args = parser.parse_args()

    github_token = os.environ.get("GITHUB_TOKEN")
    anthropic_key = os.environ.get("ANTHROPIC_API_KEY")
    if not github_token:
        log("Error: GITHUB_TOKEN not set")
        sys.exit(1)
    if not anthropic_key:
        log("Error: ANTHROPIC_API_KEY not set")
        sys.exit(1)

    # Module-level on purpose: the GitHub request helpers read this global.
    GITHUB_HEADERS = {
        "Authorization": f"Bearer {github_token}",
        "Accept": "application/vnd.github+json",
        "X-GitHub-Api-Version": "2022-11-28",
    }

    issue = fetch_issue(args.issue_number)
    if should_skip(issue):
        print(json.dumps({"skipped": True}))
        sys.exit(0)

    # detect areas
    taxonomy = format_taxonomy_for_claude(fetch_area_labels())
    detected_areas = detect_areas(anthropic_key, issue, taxonomy)

    # search for potential duplicates
    all_magnets = parse_duplicate_magnets()
    relevant_magnets = filter_magnets_by_areas(all_magnets, detected_areas)
    search_results = search_for_similar_issues(issue, detected_areas)

    # analyze potential duplicates
    if relevant_magnets or search_results:
        matches, summary = analyze_duplicates(anthropic_key, issue, relevant_magnets, search_results)
    else:
        matches, summary = [], "No potential duplicates to analyze"

    # post comment if high-confidence matches found
    # The matches come from model output: an entry that is not a dict, or that
    # lacks "confidence" or "number", is ignored rather than crashing the run.
    high_confidence_matches = [
        m
        for m in matches
        if isinstance(m, dict) and m.get("confidence") == "high" and "number" in m
    ]
    commented = False
    if high_confidence_matches:
        comment_body = build_duplicate_comment(high_confidence_matches)
        if args.dry_run:
            log("Dry run - would post comment:\n" + "-" * 40 + "\n" + comment_body + "\n" + "-" * 40)
        else:
            log("Posting comment for high-confidence match(es)")
            try:
                post_comment(issue["number"], comment_body)
                commented = True
            except requests.RequestException as e:
                # Best effort: a failed post is reported via "commented": false.
                log(f" Failed to post comment: {e}")

    print(json.dumps({
        "skipped": False,
        "issue": {
            "number": issue["number"],
            "title": issue["title"],
            "author": issue["author"],
            "type": issue["type"],
        },
        "detected_areas": detected_areas,
        "magnets_count": len(relevant_magnets),
        "search_results_count": len(search_results),
        "matches": matches,
        "summary": summary,
        "commented": commented,
    }))