zed/script/github-track-duplicate-bot-effectiveness.py

#!/usr/bin/env python3
"""
Track the effectiveness of the duplicate-detection bot by classifying issues
into outcome categories on a GitHub Projects v2 board.

Subcommands:
    classify-closed <issue_number> <closer_login> <state_reason>
        Classify a closed issue and add it to the project board.

    classify-open
        Classify open, triaged, bot-commented issues and add them to
        the project board as Noise.

Requires:
    requests (pip install requests)

Environment variables:
    GITHUB_TOKEN     - GitHub App token
    PROJECT_NUMBER   - GitHub Projects v2 board number (default: 76, override for local testing)
"""

import argparse
import functools
import os
import re
import sys
import time
from datetime import datetime, timezone

import requests

# Base URLs of the GitHub REST and GraphQL endpoints used by this script.
GITHUB_API = "https://api.github.com"
GRAPHQL_URL = "https://api.github.com/graphql"
# Repository whose issues are classified. REPO_OWNER is also used as the
# organization login for the staff-team lookup and the project board query.
REPO_OWNER = "zed-industries"
REPO_NAME = "zed"
# Team whose active members count as staff; issues authored by them are skipped.
STAFF_TEAM_SLUG = "staff"
# BOT_LOGIN is compared against comment author logins; BOT_APP_SLUG is used in
# the `commenter:app/...` search qualifier.
BOT_LOGIN = "zed-community-bot[bot]"
BOT_APP_SLUG = "zed-community-bot"
# Strings that identify a comment posted by the duplicate-detection bot. Any
# match counts as a bot comment for classification purposes. A single comment
# can contain both markers (v3+ produces this when there are both confident
# duplicates and lower-confidence triage context).
BOT_COMMENT_MARKERS = (
    "This issue appears to be a duplicate of",  # user-facing duplicate alert
    "Additional recent context for triagers",  # v3+ collapsed triage section
)
# Lower bound (`created:>=`) for the classify-open issue search.
BOT_START_DATE = "2026-02-18"
# Issues still carrying this label are excluded from the classify-open search.
NEEDS_TRIAGE_LABEL = "state:needs triage"
# Board number used when the PROJECT_NUMBER environment variable is not set.
DEFAULT_PROJECT_NUMBER = 76
# Only these close reasons are written to the board's "Closed as" field.
VALID_CLOSED_AS_VALUES = {"duplicate", "not_planned", "completed"}
# HTTP statuses we'll retry on for GET requests
TRANSIENT_HTTP_STATUSES = {429, 500, 502, 503, 504}
# Add a new tuple when you deploy a new version of the bot that you want to
# keep track of (e.g. the prompt gets a rewrite or the model gets swapped).
# Newest first, please. The datetime is for the deployment time (merge to main).
# bot_version_for_time() returns the first entry whose time is not after the
# queried timestamp, so the newest-first order matters.
BOT_VERSION_TIMELINE = [
    ("v3", datetime(2026, 5, 25, 14, 30, tzinfo=timezone.utc)),
    ("v2", datetime(2026, 2, 26, 14, 9, tzinfo=timezone.utc)),
    ("v1", datetime(2026, 2, 18, tzinfo=timezone.utc)),
]


def bot_version_for_time(date_string):
    """Map an ISO 8601 timestamp to the bot version deployed at that moment.

    BOT_VERSION_TIMELINE is scanned in its listed (newest-first) order and the
    first entry deployed at or before the timestamp wins. A timestamp older
    than every entry falls back to the last entry's version.
    """
    # A trailing "Z" is rewritten to an explicit UTC offset before parsing.
    moment = datetime.fromisoformat(date_string.replace("Z", "+00:00"))
    already_deployed = (
        label for label, deployed_at in BOT_VERSION_TIMELINE if deployed_at <= moment
    )
    return next(already_deployed, BOT_VERSION_TIMELINE[-1][0])


def github_api_get(path, params=None, timeout=30):
    """Fetch JSON from the GitHub REST API, retrying transient failures.

    Args:
        path: API path relative to GITHUB_API; a leading slash is optional.
        params: Optional query-string parameters.
        timeout: Seconds to wait on the server for each attempt. Without a
            timeout, requests waits indefinitely on a stalled connection and
            the Timeout retry branch below would rarely get a chance to run.

    Returns:
        The decoded JSON response body.

    Raises:
        requests.RequestException: on a non-transient failure, or when the
            last attempt still fails. Non-2xx statuses surface as
            requests.HTTPError.
    """
    url = f"{GITHUB_API}/{path.lstrip('/')}"
    max_attempts = 3
    for attempt in range(max_attempts):
        try:
            response = requests.get(
                url, headers=GITHUB_HEADERS, params=params, timeout=timeout
            )
            response.raise_for_status()
            return response.json()
        except requests.RequestException as e:
            transient = isinstance(e, (requests.ConnectionError, requests.Timeout)) or (
                isinstance(e, requests.HTTPError)
                and e.response is not None
                and e.response.status_code in TRANSIENT_HTTP_STATUSES
            )
            if not transient or attempt == max_attempts - 1:
                raise
            # exponential backoff: 1s, then 2s
            wait = 2 ** attempt
            print(f"  Transient GitHub API error ({e}); retrying in {wait}s")
            time.sleep(wait)


def github_search_issues(query):
    """Run an issue search and return the first page of hits, newest-created first."""
    # Deliberately a single page of up to 100 results (no pagination): the
    # oldest issues are on the board already.
    search_params = dict(q=query, sort="created", order="desc", per_page=100)
    payload = github_api_get("/search/issues", search_params)
    return payload.get("items", [])


def is_staff_member(username):
    """Return True if `username` has an active membership in the staff team.

    A 404 from the membership endpoint is treated as "not a member"; any other
    HTTP error is re-raised.
    """
    membership_path = f"/orgs/{REPO_OWNER}/teams/{STAFF_TEAM_SLUG}/memberships/{username}"
    try:
        membership = github_api_get(membership_path)
    except requests.HTTPError as error:
        if error.response.status_code != 404:
            raise
        return False
    return membership.get("state") == "active"


def fetch_issue(issue_number):
    """Fetch one issue and reduce it to the fields the classifier needs.

    Returns a dict with keys number, node_id, author, type_name and created_at.
    A missing or null user yields an empty author; a missing or null type
    yields a type_name of None.
    """
    payload = github_api_get(f"/repos/{REPO_OWNER}/{REPO_NAME}/issues/{issue_number}")
    node_id = payload["node_id"]
    author_info = payload.get("user") or {}
    type_info = payload.get("type") or {}
    return dict(
        number=issue_number,
        node_id=node_id,
        author=author_info.get("login", ""),
        type_name=type_info.get("name"),
        created_at=payload.get("created_at", ""),
    )


def is_bot_dupe_comment(body):
    """Return True when `body` contains at least one of BOT_COMMENT_MARKERS."""
    for marker in BOT_COMMENT_MARKERS:
        if marker in body:
            return True
    return False


def get_bot_comment_with_time(issue_number):
    """Get the bot's duplicate-detection comment and its timestamp from an issue.

    Recognizes both the user-facing duplicate alert and the v3+ triage-only
    comment formats. Comments are scanned page by page in the order the API
    returns them, and the first match wins.

    Returns {"body": str, "created_at": str} if found, else None.
    """
    comments_path = f"/repos/{REPO_OWNER}/{REPO_NAME}/issues/{issue_number}/comments"
    per_page = 100
    page = 1
    while True:
        comments = github_api_get(comments_path, {"per_page": per_page, "page": page})
        for comment in comments or []:
            author = (comment.get("user") or {}).get("login", "")
            # `or ""` also covers a body that is present but null
            body = comment.get("body") or ""
            if author == BOT_LOGIN and is_bot_dupe_comment(body):
                return {"body": body, "created_at": comment.get("created_at", "")}
        # A page shorter than per_page is the last one, so stop here rather
        # than spending another request on fetching an empty page.
        if not comments or len(comments) < per_page:
            return None
        page += 1


def parse_suggested_issues(comment_body):
    """Return the issue numbers listed in the bot's comment, in order of appearance.

    Only lines that begin with '- #<digits>' (e.g. '- #12345') are counted.
    """
    bullet_pattern = re.compile(r"^- #(\d+)", re.MULTILINE)
    return [int(hit.group(1)) for hit in bullet_pattern.finditer(comment_body)]


def github_api_graphql(query, variables=None, partial_errors_ok=False, timeout=30):
    """Execute a GitHub GraphQL query or mutation and return its `data` payload.

    Args:
        query: GraphQL document text.
        variables: Optional dict of query variables.
        partial_errors_ok: When set, errors that come with usable data are
            printed and ignored instead of raised.
        timeout: Seconds to wait on the server, so that a stalled connection
            fails instead of hanging the run.

    Raises:
        requests.HTTPError: on a non-2xx HTTP status.
        RuntimeError: when the response reports errors and either
            partial_errors_ok is not set or no data came back at all.
    """
    response = requests.post(
        GRAPHQL_URL,
        headers=GITHUB_HEADERS,
        json={"query": query, "variables": variables or {}},
        timeout=timeout,
    )
    response.raise_for_status()
    payload = response.json()
    if "errors" in payload:
        # A response can carry `"data": null` next to its errors, so checking
        # only for the key's presence would hand None back to callers that
        # immediately index into the result.
        if not partial_errors_ok or payload.get("data") is None:
            raise RuntimeError(f"GraphQL errors: {payload['errors']}")
        print(f"  GraphQL partial errors (ignored): {payload['errors']}")
    return payload["data"]


def find_canonical_among(duplicate_number, candidates):
    """Find the candidate issue that duplicate_number was marked as a duplicate of.

    A MarkedAsDuplicateEvent is recorded on the canonical issue's timeline, not
    on the duplicate's. So each candidate's timeline is inspected for such an
    event whose `duplicate` field points back at duplicate_number.

    Returns the first matching candidate's issue number, or None.
    """
    if not candidates:
        return None

    # One aliased `issue(...)` field per candidate. The numbers are written
    # straight into the query text because GraphQL alias names can't be
    # parametrized, which is why there is no $numbers variable.
    per_candidate_fields = [
        f"issue_{number}: issue(number: {number}) {{ timelineItems(last: 50, itemTypes: [MARKED_AS_DUPLICATE_EVENT]) {{ nodes {{ ... on MarkedAsDuplicateEvent {{ duplicate {{ ... on Issue {{ number }} }} }} }} }} }}"
        for number in candidates
    ]
    query_template = """
        query($owner: String!, $repo: String!) {
          repository(owner: $owner, name: $repo) {
            PLACEHOLDER
          }
        }
        """
    query = query_template.replace(
        "PLACEHOLDER", "\n            ".join(per_candidate_fields)
    )
    # Partial errors are tolerated; a candidate with no data is skipped below.
    response = github_api_graphql(
        query,
        {"owner": REPO_OWNER, "repo": REPO_NAME},
        partial_errors_ok=True,
    )

    repo = response["repository"]
    for candidate in candidates:
        issue_data = repo.get(f"issue_{candidate}")
        if issue_data and any(
            (node.get("duplicate") or {}).get("number") == duplicate_number
            for node in issue_data["timelineItems"]["nodes"]
        ):
            return candidate
    return None


@functools.lru_cache
def get_project_config():
    """Load the project board's ID plus the IDs of its fields and their options.

    Returns {"project_id": str, "fields": {name: {"id": str, "options": {...}}}};
    "options" (option name -> option ID) is present only for fields that
    returned an options list. The result is cached, so the board is queried
    once per process.
    """
    config_query = """
        query($org: String!, $number: Int!) {
          organization(login: $org) {
            projectV2(number: $number) {
              id
              fields(first: 30) {
                nodes {
                  ... on ProjectV2SingleSelectField { id name options { id name } }
                  ... on ProjectV2Field { id name }
                }
              }
            }
          }
        }
        """
    response = github_api_graphql(
        config_query, {"org": REPO_OWNER, "number": PROJECT_NUMBER}
    )
    board = response["organization"]["projectV2"]
    project_id = board["id"]

    fields_by_name = {}
    for node in board["fields"]["nodes"]:
        field_name = node.get("name")
        if not field_name:
            # nodes that came back without a name are ignored
            continue
        entry = {"id": node["id"]}
        if "options" in node:
            entry["options"] = {choice["name"]: choice["id"] for choice in node["options"]}
        fields_by_name[field_name] = entry

    config = {"project_id": project_id, "fields": fields_by_name}
    print(f"  Project config loaded: {len(config['fields'])} fields")
    return config


def find_project_item(issue_node_id):
    """Look up the issue's item on our project board.

    Inspects the first 20 project items attached to the issue and returns the
    ID of the one whose project number equals PROJECT_NUMBER, or None.
    """
    lookup_query = "query($id: ID!) { node(id: $id) { ... on Issue { projectItems(first: 20) { nodes { id project { number } } } } } }"
    response = github_api_graphql(lookup_query, {"id": issue_node_id})
    board_items = response["node"]["projectItems"]["nodes"]
    return next(
        (entry["id"] for entry in board_items if entry["project"]["number"] == PROJECT_NUMBER),
        None,
    )


def add_project_item(issue_node_id):
    """Put the issue on the project board and return the resulting item's ID."""
    project_id = get_project_config()["project_id"]
    add_mutation = """
        mutation($projectId: ID!, $contentId: ID!) {
          addProjectV2ItemById(input: {projectId: $projectId, contentId: $contentId}) {
            item { id }
          }
        }
        """
    result = github_api_graphql(
        add_mutation, {"projectId": project_id, "contentId": issue_node_id}
    )
    return result["addProjectV2ItemById"]["item"]["id"]


def set_field_value(item_id, field_name, value):
    """Write one field of a project board item.

    Fields that have options are set by option name; every other field is
    written as text. An unknown field or option name prints a warning and
    leaves the item untouched.
    """
    config = get_project_config()
    field = config["fields"].get(field_name)
    if not field:
        print(f"  Warning: field '{field_name}' not found on project board")
        return

    if "options" not in field:
        # text field
        payload = {"text": str(value)}
    else:
        # single-select field: translate the option name into its ID
        option_id = field["options"].get(value)
        if not option_id:
            print(f"  Warning: option '{value}' not found for field '{field_name}'")
            return
        payload = {"singleSelectOptionId": option_id}

    update_mutation = """
        mutation($projectId: ID!, $itemId: ID!, $fieldId: ID!, $value: ProjectV2FieldValue!) {
          updateProjectV2ItemFieldValue(input: {
            projectId: $projectId
            itemId: $itemId
            fieldId: $fieldId
            value: $value
          }) {
            projectV2Item { id }
          }
        }
        """
    mutation_variables = {
        "projectId": config["project_id"],
        "itemId": item_id,
        "fieldId": field["id"],
        "value": payload,
    }
    github_api_graphql(update_mutation, mutation_variables)


def add_or_update_project_item(issue_node_id, outcome, closed_as=None, status="Auto-classified", notes=None, bot_comment_time=None):
    """Ensure the issue is on the project board and write its classification fields.

    "Outcome" and "Status" are always written. "Closed as" is written only for
    values in VALID_CLOSED_AS_VALUES, "Notes" only when non-empty, and
    "Bot version" only when bot_comment_time is given.

    Returns the project item ID.
    """
    item_id = find_project_item(issue_node_id)
    if not item_id:
        item_id = add_project_item(issue_node_id)
        print(f"  Added to project board (item {item_id})")
    else:
        print(f"  Issue already on board, updating (item {item_id})")

    pending = [("Outcome", outcome), ("Status", status)]
    if closed_as and closed_as in VALID_CLOSED_AS_VALUES:
        pending.append(("Closed as", closed_as))
    if notes:
        pending.append(("Notes", notes))
    for field_name, field_value in pending:
        set_field_value(item_id, field_name, field_value)

    # The version is resolved only after the fields above have been written.
    if bot_comment_time:
        set_field_value(item_id, "Bot version", bot_version_for_time(bot_comment_time))

    return item_id


def classify_closed(issue_number, closer_login, state_reason):
    """Work out the outcome for a closed issue and record it on the project board.

    Issues authored by staff are skipped. Otherwise the outcome depends on
    whether the bot commented, whether the closer is the author, and whether
    the issue was closed as a duplicate.
    """
    state_reason = state_reason or "unknown"
    print(f"Classifying closed issue #{issue_number}")
    print(f"  Closer: {closer_login}, state_reason: {state_reason}")

    issue = fetch_issue(issue_number)
    author = issue["author"]
    print(f"  Author: {author}, type: {issue['type_name']}")

    if is_staff_member(author):
        print(f"  Skipping: author '{author}' is a staff member")
        return

    bot_comment = get_bot_comment_with_time(issue_number)
    bot_commented = bot_comment is not None
    print(f"  Bot commented: {bot_commented}")

    if bot_commented:
        if closer_login == author:
            classify_as_success(issue, bot_comment, state_reason)
        else:
            # Only authors, staff, and triagers can close issues, so
            # a non-author closer is always someone with elevated permissions.
            classify_non_author_closed(issue, bot_comment, state_reason)
        return

    if state_reason == "duplicate":
        classify_as_missed_opportunity(issue)
    else:
        print("  Skipping: no bot comment and not closed as duplicate")


def classify_as_success(issue, bot_comment, state_reason):
    """Record a Success: the author closed their own issue after the bot commented.

    Closing as duplicate is auto-classified; any other close reason may be
    unrelated to the bot, so the item is flagged for review with a note.
    """
    if state_reason == "duplicate":
        status, notes = "Auto-classified", None
        print(f"  -> Success (closed as {state_reason})")
    else:
        status, notes = "Needs review", f"Author closed as {state_reason}"
        print(f"  -> Possible Success, needs review ({notes})")

    add_or_update_project_item(
        issue["node_id"],
        outcome="Success",
        closed_as=state_reason,
        status=status,
        notes=notes,
        bot_comment_time=bot_comment["created_at"],
    )


def classify_non_author_closed(issue, bot_comment, state_reason):
    """Handle a bot-commented issue that someone other than its author closed.

    Closed as duplicate goes to the Assist path; any other reason is recorded
    as Noise and flagged for review.
    """
    if state_reason == "duplicate":
        classify_as_assist(issue, bot_comment)
        return

    notes = f"Closed by staff/triager as {state_reason}, not duplicate"
    print(f"  -> Possible Noise, needs review ({notes})")
    add_or_update_project_item(
        issue["node_id"],
        outcome="Noise",
        closed_as=state_reason,
        status="Needs review",
        notes=notes,
        bot_comment_time=bot_comment["created_at"],
    )


def classify_as_assist(issue, bot_comment):
    """Record an Assist: a non-author closed the issue as duplicate after the bot commented.

    The item is auto-classified only when one of the issues the bot suggested
    turns out to be the canonical issue; otherwise it is flagged for review.
    """
    suggested = parse_suggested_issues(bot_comment["body"])
    if suggested:
        # Exceptions from find_canonical_among are allowed to propagate: a
        # failed query is not the same as "no canonical match" and must not be
        # quietly turned into a Needs review entry. Failing the workflow
        # surfaces the problem immediately.
        original = find_canonical_among(issue["number"], suggested)
        if original:
            status, notes = "Auto-classified", None
            print(f"  -> Assist (original #{original} matches bot suggestion)")
        else:
            suggested_str = ", ".join(f"#{number}" for number in suggested)
            status = "Needs review"
            notes = f"Bot suggested {suggested_str}; none matched as canonical"
            print(f"  -> Possible Assist, needs review ({notes})")
    else:
        print("  -> Assist, needs review (could not parse bot suggestions)")
        status, notes = "Needs review", "Could not parse bot suggestions"

    add_or_update_project_item(
        issue["node_id"],
        outcome="Assist",
        closed_as="duplicate",
        status=status,
        notes=notes,
        bot_comment_time=bot_comment["created_at"],
    )


def classify_as_missed_opportunity(issue):
    """Record a Missed opportunity: closed as duplicate without any bot comment."""
    print("  -> Missed opportunity")
    node_id = issue["node_id"]
    # With no bot comment to date the entry, the issue's creation time is
    # passed in the bot_comment_time slot.
    reference_time = issue["created_at"]
    add_or_update_project_item(
        node_id,
        outcome="Missed opportunity",
        closed_as="duplicate",
        status="Auto-classified",
        bot_comment_time=reference_time,
    )


def classify_open():
    """Sweep open, triaged, bot-commented issues and record them as Noise.

    Each search hit is skipped if its type is set to something other than Bug
    or Crash, its author is staff, it is already on the board, or no bot
    duplicate comment is found on it. A failure on one issue is counted and
    the sweep continues; the process exits with status 1 if any issue failed.
    """
    print("Classifying open issues")

    query = " ".join(
        [
            f"repo:{REPO_OWNER}/{REPO_NAME} is:issue is:open",
            f"commenter:app/{BOT_APP_SLUG}",
            f'-label:"{NEEDS_TRIAGE_LABEL}"',
            f"created:>={BOT_START_DATE}",
        ]
    )
    print(f"  Search query: {query}")

    results = github_search_issues(query)
    print(f"  Found {len(results)} candidate issues")

    added = skipped = errors = 0
    for item in results:
        number = item["number"]
        try:
            type_name = (item.get("type") or {}).get("name")
            author = (item.get("user") or {}).get("login", "")
            node_id = item["node_id"]

            # Checks run cheapest-first; later ones cost extra API calls and
            # are only reached when the earlier ones did not already skip.
            bot_comment = None
            if type_name and type_name not in ("Bug", "Crash"):
                skip_reason = f"type is {type_name}"
            elif is_staff_member(author):
                skip_reason = f"author {author} is staff"
            elif find_project_item(node_id):
                skip_reason = "already on the board"
            else:
                bot_comment = get_bot_comment_with_time(number)
                skip_reason = None if bot_comment else "no bot duplicate comment found"

            if skip_reason:
                print(f"  #{number}: skipping, {skip_reason}")
                skipped += 1
                continue

            print(f"  #{number}: adding as Noise")
            add_or_update_project_item(
                node_id,
                outcome="Noise",
                status="Auto-classified",
                bot_comment_time=bot_comment["created_at"],
            )
            added += 1
        except Exception as error:  # broad catch: one issue failing shouldn't stop the sweep
            print(f"  #{number}: error processing issue, skipping: {error}")
            errors += 1

    print(f"  Done: added {added}, skipped {skipped}, errors {errors}")
    if errors > 0:
        sys.exit(1)


if __name__ == "__main__":
    # Command line: two subcommands, one of which is required.
    cli = argparse.ArgumentParser(
        description="Track duplicate bot effectiveness on a GitHub project board.",
    )
    commands = cli.add_subparsers(dest="command", required=True)

    closed_command = commands.add_parser(
        "classify-closed",
        help="Classify a closed issue and add it to the project board.",
    )
    closed_command.add_argument("issue_number", type=int)
    closed_command.add_argument("closer_login")
    closed_command.add_argument("state_reason")

    commands.add_parser(
        "classify-open",
        help="Classify open, triaged, bot-commented issues as Noise.",
    )

    args = cli.parse_args()

    # The names assigned below are module globals read by the functions above,
    # so they must all be set before a subcommand is dispatched.
    GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN", "")
    if not GITHUB_TOKEN:
        print("Error: GITHUB_TOKEN environment variable is required")
        sys.exit(1)

    # Start from the default board and override it only when the environment
    # variable is set to a non-empty value.
    PROJECT_NUMBER = DEFAULT_PROJECT_NUMBER
    raw_project_number = os.environ.get("PROJECT_NUMBER", "")
    if raw_project_number:
        try:
            PROJECT_NUMBER = int(raw_project_number)
        except ValueError:
            print(f"Error: PROJECT_NUMBER must be an integer, got '{raw_project_number}'")
            sys.exit(1)

    GITHUB_HEADERS = {
        "Authorization": f"token {GITHUB_TOKEN}",
        "Accept": "application/vnd.github+json",
    }

    if args.command == "classify-open":
        classify_open()
    elif args.command == "classify-closed":
        classify_closed(args.issue_number, args.closer_login, args.state_reason)