mirror of
https://github.com/zed-industries/zed.git
synced 2026-05-31 19:05:00 +07:00
550 lines
20 KiB
Python
550 lines
20 KiB
Python
#!/usr/bin/env python3
"""
Track the effectiveness of the duplicate-detection bot by classifying issues
into outcome categories on a GitHub Projects v2 board.

Subcommands:
    classify-closed <issue_number> <closer_login> <state_reason>
        Classify a closed issue and add it to the project board.

    classify-open
        Classify open, triaged, bot-commented issues and add them to
        the project board as Noise.

Requires:
    requests (pip install requests)

Environment variables:
    GITHUB_TOKEN     - GitHub App token
    PROJECT_NUMBER   - GitHub Projects v2 board number (default: 76, override for local testing)
"""
|
|
|
|
import argparse
|
|
import functools
|
|
import os
|
|
import re
|
|
import sys
|
|
import time
|
|
from datetime import datetime, timezone
|
|
|
|
import requests
|
|
|
|
# Base URLs for the GitHub REST and GraphQL endpoints.
GITHUB_API = "https://api.github.com"
GRAPHQL_URL = "https://api.github.com/graphql"
# Repository whose issues are classified. REPO_OWNER doubles as the organization
# login for the staff-team membership check and the project-board lookup.
REPO_OWNER = "zed-industries"
REPO_NAME = "zed"
STAFF_TEAM_SLUG = "staff"
# BOT_LOGIN is compared against comment authors; BOT_APP_SLUG is used in the
# `commenter:app/...` search qualifier.
BOT_LOGIN = "zed-community-bot[bot]"
BOT_APP_SLUG = "zed-community-bot"
# Strings that identify a comment posted by the duplicate-detection bot. Any
# match counts as a bot comment for classification purposes. A single comment
# can contain both markers (v3+ produces this when there are both confident
# duplicates and lower-confidence triage context).
BOT_COMMENT_MARKERS = (
    "This issue appears to be a duplicate of",  # user-facing duplicate alert
    "Additional recent context for triagers",  # v3+ collapsed triage section
)
# Lower bound for the `created:>=` qualifier of the open-issue search.
BOT_START_DATE = "2026-02-18"
# Issues still carrying this label are excluded from the open-issue search.
NEEDS_TRIAGE_LABEL = "state:needs triage"
# Board number used when the PROJECT_NUMBER environment variable is unset.
DEFAULT_PROJECT_NUMBER = 76
# Only these close reasons are written to the board's "Closed as" field.
VALID_CLOSED_AS_VALUES = {"duplicate", "not_planned", "completed"}
# HTTP statuses we'll retry on for GET requests
TRANSIENT_HTTP_STATUSES = {429, 500, 502, 503, 504}
# Add a new tuple when you deploy a new version of the bot that you want to
# keep track of (e.g. the prompt gets a rewrite or the model gets swapped).
# Newest first, please. The datetime is for the deployment time (merge to main).
BOT_VERSION_TIMELINE = [
    ("v3", datetime(2026, 5, 25, 14, 30, tzinfo=timezone.utc)),
    ("v2", datetime(2026, 2, 26, 14, 9, tzinfo=timezone.utc)),
    ("v1", datetime(2026, 2, 18, tzinfo=timezone.utc)),
]
|
|
|
|
|
|
def bot_version_for_time(date_string):
    """Map an ISO 8601 timestamp to the bot version deployed at that moment.

    A trailing "Z" is rewritten to "+00:00" before parsing. The timeline is
    scanned in its stored (newest-first) order and the first entry whose
    deployment time is not after the timestamp wins. Timestamps older than
    every entry fall back to the last (oldest) entry.
    """
    moment = datetime.fromisoformat(date_string.replace("Z", "+00:00"))
    oldest_version = BOT_VERSION_TIMELINE[-1][0]
    return next(
        (name for name, deployed_at in BOT_VERSION_TIMELINE if moment >= deployed_at),
        oldest_version,
    )
|
|
|
|
|
|
def github_api_get(path, params=None, timeout=30):
    """Fetch JSON from the GitHub REST API, retrying transient failures.

    Args:
        path: API path relative to GITHUB_API; a leading slash is optional.
        params: Optional query-string parameters passed to requests.
        timeout: Seconds requests may wait on the server per attempt. Without
            a timeout requests blocks indefinitely on a stalled connection,
            which would also make the Timeout retry branch below unreachable.

    Returns:
        The decoded JSON response body.

    Raises:
        requests.RequestException: for non-transient failures (including any
            non-2xx status outside TRANSIENT_HTTP_STATUSES), or once the
            third attempt has failed.
    """
    url = f"{GITHUB_API}/{path.lstrip('/')}"
    max_attempts = 3
    for attempt in range(max_attempts):
        try:
            response = requests.get(
                url, headers=GITHUB_HEADERS, params=params, timeout=timeout
            )
            response.raise_for_status()
            return response.json()
        except requests.RequestException as e:
            # e.response is None for connection-level failures, so look the
            # status up defensively instead of dereferencing it directly.
            status = getattr(e.response, "status_code", None)
            transient = isinstance(e, (requests.ConnectionError, requests.Timeout)) or (
                isinstance(e, requests.HTTPError) and status in TRANSIENT_HTTP_STATUSES
            )
            if not transient or attempt == max_attempts - 1:
                raise
            # exponential backoff: 1s, then 2s
            wait = 2 ** attempt
            print(f" Transient GitHub API error ({e}); retrying in {wait}s")
            time.sleep(wait)
|
|
|
|
|
|
def github_search_issues(query):
    """Run an issue search and return the matching items, newest-created first.

    Only the first page (up to 100 results) is fetched.
    """
    # not handling pagination on purpose: the oldest issues are on the board already
    search_params = {
        "q": query,
        "sort": "created",
        "order": "desc",
        "per_page": 100,
    }
    payload = github_api_get("/search/issues", search_params)
    return payload.get("items", [])
|
|
|
|
|
|
def is_staff_member(username):
    """Return True when `username` has an active membership in the staff team.

    A 404 from the membership endpoint is treated as "not a member"; any
    other HTTP error is re-raised.
    """
    membership_path = f"/orgs/{REPO_OWNER}/teams/{STAFF_TEAM_SLUG}/memberships/{username}"
    try:
        membership = github_api_get(membership_path)
    except requests.HTTPError as error:
        if error.response.status_code != 404:
            raise
        return False
    return membership.get("state") == "active"
|
|
|
|
|
|
def fetch_issue(issue_number):
    """Fetch one issue over REST and return the subset of fields used here.

    The result has the keys "number", "node_id", "author", "type_name" and
    "created_at". A missing or null user yields an empty author string; a
    missing or null type yields None for "type_name".
    """
    payload = github_api_get(f"/repos/{REPO_OWNER}/{REPO_NAME}/issues/{issue_number}")
    node_id = payload["node_id"]
    user = payload.get("user") or {}
    issue_type = payload.get("type") or {}
    return {
        "number": issue_number,
        "node_id": node_id,
        "author": user.get("login", ""),
        "type_name": issue_type.get("name"),
        "created_at": payload.get("created_at", ""),
    }
|
|
|
|
|
|
def is_bot_dupe_comment(body):
    """Return True if `body` contains any of the BOT_COMMENT_MARKERS strings."""
    for marker in BOT_COMMENT_MARKERS:
        if marker in body:
            return True
    return False
|
|
|
|
|
|
def get_bot_comment_with_time(issue_number):
    """Find the first duplicate-detection comment the bot left on an issue.

    Comments are read 100 per page until an empty page is returned. A comment
    qualifies when it was authored by BOT_LOGIN and its body matches
    is_bot_dupe_comment (the user-facing duplicate alert or the v3+
    triage-only format).

    Returns {"body": str, "created_at": str} for the first match, else None.
    """
    comments_path = f"/repos/{REPO_OWNER}/{REPO_NAME}/issues/{issue_number}/comments"
    page_number = 1
    while True:
        batch = github_api_get(comments_path, {"per_page": 100, "page": page_number})
        if not batch:
            # an empty page means every comment has been inspected
            return None
        for entry in batch:
            login = (entry.get("user") or {}).get("login", "")
            text = entry.get("body", "")
            if login != BOT_LOGIN:
                continue
            if is_bot_dupe_comment(text):
                return {"body": text, "created_at": entry.get("created_at", "")}
        page_number += 1
|
|
|
|
|
|
def parse_suggested_issues(comment_body):
    """Return the issue numbers listed in the bot's comment, in order.

    Only lines that begin with '- #<digits>' (e.g. '- #12345') are counted.
    """
    bullet_pattern = re.compile(r"^- #(\d+)", re.MULTILINE)
    return [int(hit.group(1)) for hit in bullet_pattern.finditer(comment_body)]
|
|
|
|
|
|
def github_api_graphql(query, variables=None, partial_errors_ok=False, timeout=30):
    """Execute a GitHub GraphQL query and return its "data" payload.

    Args:
        query: GraphQL document text.
        variables: Optional mapping of GraphQL variables (defaults to {}).
        partial_errors_ok: When True, a response carrying both errors and
            usable data is accepted; the errors are printed and ignored.
        timeout: Seconds requests may wait on the server. Without a timeout
            requests blocks indefinitely on a stalled connection.

    Returns:
        The value of the response's "data" entry.

    Raises:
        requests.HTTPError: on a non-2xx HTTP status.
        RuntimeError: when the response reports errors and either
            partial_errors_ok is False or no data came back at all.
    """
    response = requests.post(
        GRAPHQL_URL,
        headers=GITHUB_HEADERS,
        json={"query": query, "variables": variables or {}},
        timeout=timeout,
    )
    response.raise_for_status()
    data = response.json()
    if "errors" in data:
        # A response may carry `"data": null` alongside errors; that is a total
        # failure, not a partial one, so it must not be handed to callers that
        # will index into it.
        if not partial_errors_ok or data.get("data") is None:
            raise RuntimeError(f"GraphQL errors: {data['errors']}")
        print(f" GraphQL partial errors (ignored): {data['errors']}")
    return data["data"]
|
|
|
|
|
|
def find_canonical_among(duplicate_number, candidates):
    """Return the candidate issue that has duplicate_number marked as its duplicate.

    The MarkedAsDuplicateEvent lives on the canonical issue's timeline, not the
    duplicate's. So each candidate's timeline is queried for
    MarkedAsDuplicateEvent entries and searched for one whose `duplicate`
    field is our issue.

    Returns the first matching candidate number (in `candidates` order), or
    None when nothing matches or `candidates` is empty.
    """
    if not candidates:
        return None

    # candidate issue numbers are baked into the query body via field aliases
    # (GraphQL doesn't let you parametrize alias names), so $numbers isn't needed.
    issue_selections = "\n ".join(
        f'issue_{number}: issue(number: {number}) {{'
        f' timelineItems(last: 50, itemTypes: [MARKED_AS_DUPLICATE_EVENT]) {{'
        f' nodes {{ ... on MarkedAsDuplicateEvent {{ duplicate {{ ... on Issue {{ number }} }} }} }} }} }}'
        for number in candidates
    )
    query_template = """
        query($owner: String!, $repo: String!) {
            repository(owner: $owner, name: $repo) {
                PLACEHOLDER
            }
        }
    """
    data = github_api_graphql(
        query_template.replace("PLACEHOLDER", issue_selections),
        {"owner": REPO_OWNER, "repo": REPO_NAME},
        partial_errors_ok=True,
    )

    timelines_by_alias = data["repository"]
    for candidate in candidates:
        timeline = timelines_by_alias.get(f"issue_{candidate}")
        if not timeline:
            # alias missing or null in the response; nothing to inspect
            continue
        marked_here = any(
            (node.get("duplicate") or {}).get("number") == duplicate_number
            for node in timeline["timelineItems"]["nodes"]
        )
        if marked_here:
            return candidate
    return None
|
|
|
|
|
|
@functools.lru_cache
def get_project_config():
    """Load the project board's ID plus its field and option IDs (cached).

    Returns a dict shaped like
    {"project_id": str, "fields": {field_name: {"id": str, "options": {option_name: option_id}}}}
    where "options" is present only for fields that returned an options list.
    Field nodes without a name are skipped.
    """
    board_query = """
        query($org: String!, $number: Int!) {
            organization(login: $org) {
                projectV2(number: $number) {
                    id
                    fields(first: 30) {
                        nodes {
                            ... on ProjectV2SingleSelectField { id name options { id name } }
                            ... on ProjectV2Field { id name }
                        }
                    }
                }
            }
        }
    """
    data = github_api_graphql(board_query, {"org": REPO_OWNER, "number": PROJECT_NUMBER})
    project = data["organization"]["projectV2"]

    project_id = project["id"]
    fields_by_name = {}
    for node in project["fields"]["nodes"]:
        name = node.get("name")
        if name:
            entry = {"id": node["id"]}
            if "options" in node:
                entry["options"] = {choice["name"]: choice["id"] for choice in node["options"]}
            fields_by_name[name] = entry
    config = {"project_id": project_id, "fields": fields_by_name}

    print(f" Project config loaded: {len(config['fields'])} fields")
    return config
|
|
|
|
|
|
def find_project_item(issue_node_id):
    """Look up the issue's item on the board numbered PROJECT_NUMBER.

    Inspects up to 20 project items attached to the issue.

    Returns the project item ID if the issue is on that board, else None.
    """
    lookup_query = "query($id: ID!) { node(id: $id) { ... on Issue { projectItems(first: 20) { nodes { id project { number } } } } } }"
    data = github_api_graphql(lookup_query, {"id": issue_node_id})
    board_items = data["node"]["projectItems"]["nodes"]
    return next(
        (entry["id"] for entry in board_items if entry["project"]["number"] == PROJECT_NUMBER),
        None,
    )
|
|
|
|
|
|
def add_project_item(issue_node_id):
    """Attach the issue to the project board and return the created item's ID."""
    add_mutation = """
        mutation($projectId: ID!, $contentId: ID!) {
            addProjectV2ItemById(input: {projectId: $projectId, contentId: $contentId}) {
                item { id }
            }
        }
    """
    project_id = get_project_config()["project_id"]
    result = github_api_graphql(
        add_mutation,
        {"projectId": project_id, "contentId": issue_node_id},
    )
    return result["addProjectV2ItemById"]["item"]["id"]
|
|
|
|
|
|
def set_field_value(item_id, field_name, value):
    """Write one field of a project board item.

    Fields that carry an options map are treated as single-select and `value`
    must be an option name; every other field is written as text using
    str(value). An unknown field or option prints a warning and leaves the
    item untouched.
    """
    config = get_project_config()
    field = config["fields"].get(field_name)
    if not field:
        print(f" Warning: field '{field_name}' not found on project board")
        return

    if "options" not in field:
        # text field
        payload = {"text": str(value)}
    else:
        # single-select field
        option_id = field["options"].get(value)
        if not option_id:
            print(f" Warning: option '{value}' not found for field '{field_name}'")
            return
        payload = {"singleSelectOptionId": option_id}

    update_mutation = """
        mutation($projectId: ID!, $itemId: ID!, $fieldId: ID!, $value: ProjectV2FieldValue!) {
            updateProjectV2ItemFieldValue(input: {
                projectId: $projectId
                itemId: $itemId
                fieldId: $fieldId
                value: $value
            }) {
                projectV2Item { id }
            }
        }
    """
    mutation_variables = {
        "projectId": config["project_id"],
        "itemId": item_id,
        "fieldId": field["id"],
        "value": payload,
    }
    github_api_graphql(update_mutation, mutation_variables)
|
|
|
|
|
|
def add_or_update_project_item(issue_node_id, outcome, closed_as=None, status="Auto-classified", notes=None, bot_comment_time=None):
    """Ensure the issue is on the project board and write its field values.

    "Outcome" and "Status" are always written. "Closed as" is written only for
    values in VALID_CLOSED_AS_VALUES, "Notes" only when notes is non-empty, and
    "Bot version" only when bot_comment_time is given (it is derived from that
    timestamp).

    Returns the project item ID.
    """
    item_id = find_project_item(issue_node_id)
    if not item_id:
        item_id = add_project_item(issue_node_id)
        print(f" Added to project board (item {item_id})")
    else:
        print(f" Issue already on board, updating (item {item_id})")

    # mandatory fields
    set_field_value(item_id, "Outcome", outcome)
    set_field_value(item_id, "Status", status)

    # optional fields, each written only when there is something valid to record
    if closed_as and closed_as in VALID_CLOSED_AS_VALUES:
        set_field_value(item_id, "Closed as", closed_as)
    if notes:
        set_field_value(item_id, "Notes", notes)
    if bot_comment_time:
        active_version = bot_version_for_time(bot_comment_time)
        set_field_value(item_id, "Bot version", active_version)

    return item_id
|
|
|
|
|
|
def classify_closed(issue_number, closer_login, state_reason):
    """Decide the outcome category of a closed issue and record it on the board.

    Issues authored by staff are skipped. Otherwise the outcome depends on
    whether the bot commented, whether the author closed the issue, and the
    close reason (an empty state_reason is treated as "unknown").
    """
    state_reason = state_reason or "unknown"
    print(f"Classifying closed issue #{issue_number}")
    print(f" Closer: {closer_login}, state_reason: {state_reason}")

    issue = fetch_issue(issue_number)
    author = issue["author"]
    print(f" Author: {author}, type: {issue['type_name']}")

    if is_staff_member(author):
        print(f" Skipping: author '{author}' is a staff member")
        return

    bot_comment = get_bot_comment_with_time(issue_number)
    bot_commented = bot_comment is not None
    print(f" Bot commented: {bot_commented}")

    if not bot_commented:
        if state_reason == "duplicate":
            classify_as_missed_opportunity(issue)
        else:
            print(" Skipping: no bot comment and not closed as duplicate")
        return

    if closer_login == author:
        classify_as_success(issue, bot_comment, state_reason)
    else:
        # Only authors, staff, and triagers can close issues, so
        # a non-author closer is always someone with elevated permissions.
        classify_non_author_closed(issue, bot_comment, state_reason)
|
|
|
|
|
|
def classify_as_success(issue, bot_comment, state_reason):
    """Record a Success: the author closed their own issue after the bot commented.

    Closing as "duplicate" is auto-classified; any other reason is recorded
    as Success but flagged "Needs review" with an explanatory note.
    """
    if state_reason == "duplicate":
        status, notes = "Auto-classified", None
        print(f" -> Success (closed as {state_reason})")
    else:
        # could be closed for an unrelated reason; flag for review
        status = "Needs review"
        notes = f"Author closed as {state_reason}"
        print(f" -> Possible Success, needs review ({notes})")

    add_or_update_project_item(
        issue["node_id"],
        outcome="Success",
        closed_as=state_reason,
        status=status,
        notes=notes,
        bot_comment_time=bot_comment["created_at"],
    )
|
|
|
|
|
|
def classify_non_author_closed(issue, bot_comment, state_reason):
    """Handle an issue the bot commented on that someone other than the author closed.

    A "duplicate" close is delegated to classify_as_assist; anything else is
    recorded as Noise and flagged "Needs review".
    """
    if state_reason == "duplicate":
        classify_as_assist(issue, bot_comment)
        return

    notes = f"Closed by staff/triager as {state_reason}, not duplicate"
    print(f" -> Possible Noise, needs review ({notes})")
    add_or_update_project_item(
        issue["node_id"],
        outcome="Noise",
        closed_as=state_reason,
        status="Needs review",
        notes=notes,
        bot_comment_time=bot_comment["created_at"],
    )
|
|
|
|
|
|
def classify_as_assist(issue, bot_comment):
    """Record an Assist: a non-author closed the issue as duplicate after the bot commented.

    The entry is auto-classified only when one of the issues the bot suggested
    turns out to be the canonical issue; otherwise it is flagged "Needs review".
    """
    suggested = parse_suggested_issues(bot_comment["body"])

    if not suggested:
        print(" -> Assist, needs review (could not parse bot suggestions)")
        status = "Needs review"
        notes = "Could not parse bot suggestions"
    else:
        # Let exceptions from find_canonical_among propagate — a query failure here is
        # not the same as "no canonical match" and shouldn't be silently downgraded to
        # a Needs review entry. Failing the workflow surfaces the problem immediately.
        original = find_canonical_among(issue["number"], suggested)
        if original:
            status, notes = "Auto-classified", None
            print(f" -> Assist (original #{original} matches bot suggestion)")
        else:
            status = "Needs review"
            suggested_str = ", ".join(f"#{number}" for number in suggested)
            notes = f"Bot suggested {suggested_str}; none matched as canonical"
            print(f" -> Possible Assist, needs review ({notes})")

    add_or_update_project_item(
        issue["node_id"],
        outcome="Assist",
        closed_as="duplicate",
        status=status,
        notes=notes,
        bot_comment_time=bot_comment["created_at"],
    )
|
|
|
|
|
|
def classify_as_missed_opportunity(issue):
    """Record a Missed opportunity: closed as duplicate without any bot comment.

    There is no bot comment to date the entry, so the issue's creation time is
    passed as the timestamp used to pick the bot version.
    """
    print(" -> Missed opportunity")
    add_or_update_project_item(
        issue["node_id"],
        outcome="Missed opportunity",
        closed_as="duplicate",
        status="Auto-classified",
        bot_comment_time=issue["created_at"],
    )
|
|
|
|
|
|
def classify_open():
    """Sweep open, triaged, bot-commented issues and add them to the board as Noise.

    Candidates are skipped when their type is set and is neither Bug nor
    Crash, when the author is staff, when they are already on the board, or
    when no bot duplicate comment is found. A failure on one issue is counted
    and the sweep continues; the process exits with status 1 if any issue
    failed.
    """
    print("Classifying open issues")

    query = " ".join(
        [
            f"repo:{REPO_OWNER}/{REPO_NAME} is:issue is:open",
            f"commenter:app/{BOT_APP_SLUG}",
            f'-label:"{NEEDS_TRIAGE_LABEL}"',
            f"created:>={BOT_START_DATE}",
        ]
    )
    print(f" Search query: {query}")

    results = github_search_issues(query)
    print(f" Found {len(results)} candidate issues")

    added = skipped = errors = 0
    for item in results:
        number = item["number"]
        try:
            type_name = (item.get("type") or {}).get("name")
            author = (item.get("user") or {}).get("login", "")
            node_id = item["node_id"]

            # Checks run cheapest-first; each later check costs an API call and
            # is only reached when the earlier ones did not already skip.
            bot_comment = None
            if type_name and type_name not in ("Bug", "Crash"):
                skip_reason = f"type is {type_name}"
            elif is_staff_member(author):
                skip_reason = f"author {author} is staff"
            elif find_project_item(node_id):
                skip_reason = "already on the board"
            else:
                bot_comment = get_bot_comment_with_time(number)
                skip_reason = None if bot_comment else "no bot duplicate comment found"

            if skip_reason:
                print(f" #{number}: skipping, {skip_reason}")
                skipped += 1
                continue

            print(f" #{number}: adding as Noise")
            add_or_update_project_item(
                node_id,
                outcome="Noise",
                status="Auto-classified",
                bot_comment_time=bot_comment["created_at"],
            )
            added += 1
        except Exception as error:  # broad catch: one issue failing shouldn't stop the sweep
            print(f" #{number}: error processing issue, skipping: {error}")
            errors += 1

    print(f" Done: added {added}, skipped {skipped}, errors {errors}")
    if errors > 0:
        sys.exit(1)
|
|
|
|
|
|
if __name__ == "__main__":
    # --- command line -------------------------------------------------------
    parser = argparse.ArgumentParser(
        description="Track duplicate bot effectiveness on a GitHub project board.",
    )
    subparsers = parser.add_subparsers(dest="command", required=True)

    closed_parser = subparsers.add_parser(
        "classify-closed",
        help="Classify a closed issue and add it to the project board.",
    )
    closed_parser.add_argument("issue_number", type=int)
    closed_parser.add_argument("closer_login")
    closed_parser.add_argument("state_reason")

    subparsers.add_parser(
        "classify-open",
        help="Classify open, triaged, bot-commented issues as Noise.",
    )

    args = parser.parse_args()

    # --- environment --------------------------------------------------------
    # The names assigned below are module globals read by the API helpers above.
    GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN", "")
    if not GITHUB_TOKEN:
        print("Error: GITHUB_TOKEN environment variable is required")
        sys.exit(1)

    PROJECT_NUMBER = DEFAULT_PROJECT_NUMBER
    raw_project_number = os.environ.get("PROJECT_NUMBER", "")
    if raw_project_number:
        # an explicit override must be a valid integer
        try:
            PROJECT_NUMBER = int(raw_project_number)
        except ValueError:
            print(f"Error: PROJECT_NUMBER must be an integer, got '{raw_project_number}'")
            sys.exit(1)

    GITHUB_HEADERS = {
        "Authorization": f"token {GITHUB_TOKEN}",
        "Accept": "application/vnd.github+json",
    }

    # --- dispatch -----------------------------------------------------------
    if args.command == "classify-closed":
        classify_closed(args.issue_number, args.closer_login, args.state_reason)
    elif args.command == "classify-open":
        classify_open()
|