// apix/lib/crawler.ts

import { Prompt } from './types';

// Base URL of the raw `cases` directory in the JimmyLv repository. The crawler
// appends `/<id>/case.yml` (and `/<id>/<image filename>`) to it.
const JIMMYLV_SOURCE_URL = "https://raw.githubusercontent.com/JimmyLv/awesome-nano-banana/main/cases";
// Raw README of the YouMind prompt collection; fetched once and parsed section by section.
const YOUMIND_README_URL = "https://raw.githubusercontent.com/YouMind-OpenLab/awesome-nano-banana-pro-prompts/main/README.md";
// Raw README of the ZeroLu prompt collection; fetched once and parsed section by section.
const ZEROLU_README_URL = "https://raw.githubusercontent.com/ZeroLu/awesome-nanobanana-pro/main/README.md";

// NOTE(review): not referenced anywhere in the visible code — JimmyLvCrawler.crawl
// takes its own `limit` argument (default 300). Presumably an intended upper
// bound for case ids; confirm before relying on it.
const MAX_CASE_ID = 200; // Increased limit slightly
// Number of case ids requested concurrently per Promise.all batch in JimmyLvCrawler.crawl.
const BATCH_SIZE = 10;

/**
 * Crawls the JimmyLv "awesome-nano-banana" repository, where every case lives
 * in a numbered directory (`cases/<id>/case.yml`).
 *
 * The YAML is read with a small purpose-built line scanner rather than a YAML
 * library, so only the constructs the crawler needs are understood: top-level
 * `key: value` scalars (optionally quoted) and `|` / `>` block scalars.
 */
export class JimmyLvCrawler {
    /**
     * Keyword groups used by {@link inferCategory}. Groups are checked in
     * order and the first group with a substring hit decides the category.
     */
    private static readonly CATEGORY_RULES: ReadonlyArray<readonly [readonly string[], string]> = [
        [["ghibli", "anime", "cartoon", "chibi", "comic", "illustration", "drawing"], "Illustration"],
        [["icon", "logo", "symbol"], "Logo / Icon"],
        [["product", "packaging", "mockup"], "Product"],
        [["avatar", "profile", "headshot"], "Profile / Avatar"],
        [["infographic", "chart", "diagram"], "Infographic / Edu Visual"],
        [["cinematic", "film", "movie"], "Cinematic / Film Still"],
        [["3d", "render", "blender"], "3D Render"],
        [["pixel", "8-bit", "retro game"], "Pixel Art"],
    ];

    /**
     * Fetches case ids `1..limit` in batches of `BATCH_SIZE` concurrent
     * requests and returns every case that could be fetched and parsed.
     *
     * @param limit Highest case id to request (ids start at 1).
     * @returns The successfully parsed prompts, in ascending id order.
     */
    async crawl(limit: number = 300): Promise<Prompt[]> {
        console.log(`Starting crawl for ${limit} cases...`);
        const prompts: Prompt[] = [];

        const ids = Array.from({ length: limit }, (_, i) => i + 1);

        // Batches run one after another so at most BATCH_SIZE requests are in flight.
        for (let offset = 0; offset < ids.length; offset += BATCH_SIZE) {
            const batch = ids.slice(offset, offset + BATCH_SIZE);
            const results = await Promise.all(batch.map(id => this.fetchCase(id)));

            for (const result of results) {
                if (result) prompts.push(result);
            }
        }

        console.log(`[JimmyLv] Crawled ${prompts.length} valid prompts.`);
        return prompts;
    }

    /**
     * Downloads and parses a single `case.yml`.
     *
     * @returns The parsed prompt, or `null` when the request fails, the
     * response is not OK (e.g. the id does not exist), or parsing yields nothing.
     */
    private async fetchCase(id: number): Promise<Prompt | null> {
        try {
            const url = `${JIMMYLV_SOURCE_URL}/${id}/case.yml`;
            const res = await fetch(url);

            // Missing ids are expected (the id range is probed blindly), so stay quiet.
            if (!res.ok) return null;

            const text = await res.text();
            return this.parseCase(text, id);
        } catch (error) {
            console.error(`Error fetching case ${id}:`, error);
            return null;
        }
    }

    /**
     * Builds a {@link Prompt} from the text of a `case.yml`.
     *
     * English fields (`title_en`, `prompt_en`) are preferred over their
     * untranslated counterparts. Block-scalar prompts are folded into a single
     * line with one space between source lines.
     *
     * @param content Raw YAML text.
     * @param caseId Numeric directory name of the case; used for URLs.
     * @returns The prompt, or `null` when no prompt text can be found.
     */
    private parseCase(content: string, caseId: number): Prompt | null {
        try {
            const title = this.extractScalar(content, "title_en")
                ?? this.extractScalar(content, "title")
                ?? "Unknown";

            const promptText = this.extractBlockScalar(content, "prompt_en")
                ?? this.extractScalar(content, "prompt_en")
                ?? this.extractBlockScalar(content, "prompt")
                ?? this.extractScalar(content, "prompt");

            if (!promptText) return null;

            const imageFilename = this.extractScalar(content, "image");
            const imageUrl = imageFilename
                ? `${JIMMYLV_SOURCE_URL}/${caseId}/${imageFilename}`
                : "";

            const author = this.extractScalar(content, "author") ?? "JimmyLv Repo";

            const category = this.inferCategory(title, promptText);

            return {
                id: 0, // Will be assigned by manager
                title: title.slice(0, 150),
                prompt: promptText,
                category,
                category_type: "style", // Simplified
                description: promptText.slice(0, 200) + (promptText.length > 200 ? "..." : ""),
                images: imageUrl ? [imageUrl] : [],
                author,
                source: "jimmylv",
                source_url: `https://github.com/JimmyLv/awesome-nano-banana/tree/main/cases/${caseId}`
            };
        } catch (error) {
            // Best effort: one malformed case must not abort the crawl, but leave a trace.
            console.warn(`[JimmyLv] Failed to parse case ${caseId}:`, error);
            return null;
        }
    }

    /**
     * Returns the trimmed first capture group of `regex` in `content`, or
     * `null` when there is no match or the capture is empty.
     */
    private extract(content: string, regex: RegExp): string | null {
        const match = content.match(regex);
        return match?.[1]?.trim() || null;
    }

    /**
     * Reads a single-line `key: value` scalar.
     *
     * The key must start its line (after optional indentation), so `title`
     * does not match inside `subtitle`, and the value must sit on the same
     * line. A bare block-scalar indicator (`|`, `>-`, ...) is not a value and
     * yields `null`. One pair of matching surrounding quotes is removed.
     *
     * @param key Literal key name; must not contain regex metacharacters.
     */
    private extractScalar(content: string, key: string): string | null {
        const raw = this.extract(content, new RegExp(`^[ \\t]*${key}:[ \\t]*(.+)$`, "m"));
        if (!raw || /^[|>][+-]?\d*$/.test(raw)) return null;

        const quoted = raw.match(/^(["'])(.*)\1$/);
        const value = (quoted?.[2] ?? raw).trim();
        return value || null;
    }

    /**
     * Reads a `key: |` / `key: >` block scalar and folds it into one line.
     *
     * Every following line indented deeper than the key belongs to the block;
     * blank lines inside the block are skipped instead of ending it, and both
     * LF and CRLF line endings are accepted.
     *
     * @param key Literal key name; must not contain regex metacharacters.
     * @returns The folded text, or `null` when the key is absent or its block is empty.
     */
    private extractBlockScalar(content: string, key: string): string | null {
        const header = new RegExp(`^([ \\t]*)${key}:[ \\t]*[|>][+-]?\\d*[ \\t]*$`);
        const body: string[] = [];
        let keyIndent = -1; // -1 until the header line has been seen

        for (const line of content.split(/\r?\n/)) {
            if (keyIndent < 0) {
                const match = header.exec(line);
                if (match) keyIndent = (match[1] ?? "").length;
                continue;
            }
            if (line.trim() === "") continue;
            // First non-blank line at or left of the key's column ends the block.
            if (line.search(/\S/) <= keyIndent) break;
            body.push(line.trim());
        }

        return body.length > 0 ? body.join(" ") : null;
    }

    /**
     * Picks a category by case-insensitive substring search over the title and
     * prompt; falls back to "Photography" when no keyword group matches.
     */
    private inferCategory(title: string, prompt: string): string {
        const text = (title + " " + prompt).toLowerCase();

        for (const [keywords, category] of JimmyLvCrawler.CATEGORY_RULES) {
            if (keywords.some(keyword => text.includes(keyword))) return category;
        }

        return "Photography";
    }
}

/**
 * Crawls the YouMind prompt collection, which is published as one README
 * whose entries are introduced by `### No. <n>: <title>` headings.
 */
export class YouMindCrawler {
    /**
     * Downloads the README and parses every `### No.` section.
     *
     * @returns The parsed prompts; an empty array when the download fails
     * (the failure is logged, not thrown).
     */
    async crawl(): Promise<Prompt[]> {
        console.log(`[YouMind] Starting crawl of README...`);
        const prompts: Prompt[] = [];

        try {
            const res = await fetch(YOUMIND_README_URL);
            if (!res.ok) throw new Error("Failed to fetch YouMind README");
            const text = await res.text();

            // Everything before the first "### No." heading is preamble, hence slice(1).
            const sections = text.split(/### No\./g).slice(1);

            sections.forEach((section, position) => {
                const prompt = this.parseSection(section, position + 1);
                if (prompt) prompts.push(prompt);
            });
        } catch (e) {
            console.error("[YouMind] Crawl failed", e);
        }

        console.log(`[YouMind] Crawled ${prompts.length} valid prompts.`);
        return prompts;
    }

    /**
     * Parses the text that follows one `### No.` heading marker.
     *
     * @param content Section text, starting right after `### No.`
     * (so normally ` <n>: <title>` followed by the body).
     * @param index 1-based position of the section in the README; used for
     * the fallback title and fallback source URL.
     * @returns The prompt, or `null` when the section has no prompt code block.
     */
    private parseSection(content: string, index: number): Prompt | null {
        try {
            // The title is only taken from the heading line itself; anchoring keeps a
            // stray "12: ..." somewhere in the body from being mistaken for it.
            const titleMatch = content.match(/^[ \t]*\d+:[ \t]*(.+)/);
            const title = titleMatch?.[1]?.trim() || `YouMind Case ${index}`;

            // Preferred: the fenced block under the "#### 📝 Prompt" heading.
            const strictPromptMatch = content.match(/#### 📝 Prompt\s+```[\s\S]*?\n([\s\S]*?)```/);
            // Fallback: the first fenced block in the section, minus an optional
            // language-tag line so the tag does not end up in the prompt text.
            const loosePromptMatch = content.match(/```(?:[\w+-]*[ \t]*\r?\n)?([\s\S]*?)```/);

            const promptText = (strictPromptMatch?.[1] ?? loosePromptMatch?.[1] ?? "").trim();
            if (!promptText) return null;

            // `src` may be preceded by other attributes (width, alt, ...).
            const images = [...content.matchAll(/<img\s+(?:[^>]*?\s)?src="(.*?)"/g)]
                .map(match => match[1] ?? "")
                .filter(url => url !== "" && !url.includes("img.shields.io")); // Exclude badges

            const authorMatch = content.match(/- \*\*Author:\*\* \[(.*?)\]/);
            const author = authorMatch?.[1] ?? "YouMind Community";

            const sourceMatch = content.match(/- \*\*Source:\*\* \[(.*?)\]\((.*?)\)/);
            const sourceUrl = sourceMatch?.[2]
                ?? `https://github.com/YouMind-OpenLab/awesome-nano-banana-pro-prompts#no-${index}`;

            return {
                id: 0,
                title,
                prompt: promptText,
                category: this.inferCategory(title, promptText),
                category_type: "style",
                description: title,
                images,
                author,
                source: "youmind",
                source_url: sourceUrl
            };
        } catch (e) {
            // Best effort: a malformed section is skipped, but leave a trace.
            console.warn(`[YouMind] Failed to parse section ${index}:`, e);
            return null;
        }
    }

    /**
     * Picks a category by case-insensitive substring search over the title and
     * prompt; checks run in a fixed order and "Illustration" is the default.
     */
    private inferCategory(title: string, prompt: string): string {
        const text = (title + " " + prompt).toLowerCase();
        if (text.includes("logo") || text.includes("icon")) return "Logo / Icon";
        if (text.includes("3d")) return "3D Render";
        if (text.includes("photo") || text.includes("realistic")) return "Photography";
        return "Illustration";
    }
}

/**
 * Crawls the ZeroLu prompt collection, which is published as one README
 * whose entries are introduced by `### X.Y. Title` headings.
 */
export class ZeroLuCrawler {
    /**
     * Downloads the README and parses every numbered H3 section.
     *
     * @returns The parsed prompts; an empty array when the download fails
     * (the failure is logged, not thrown).
     */
    async crawl(): Promise<Prompt[]> {
        console.log(`[ZeroLu] Starting crawl of README...`);
        const prompts: Prompt[] = [];

        try {
            const res = await fetch(ZEROLU_README_URL);
            if (!res.ok) throw new Error("Failed to fetch ZeroLu README");
            const text = await res.text();

            // Group 1: heading text ("1.1. Title"); group 2: everything up to the next
            // numbered H3 or the end of the README. The heading must start a line so
            // "#### 1.1 ..." or an inline mention is not treated as a section.
            const sectionPattern = /(?:^|\n)### (\d+\.\d+\.?[ \t]+[^\n]*)([\s\S]*?)(?=\n### \d+\.\d+|$)/g;

            for (const match of text.matchAll(sectionPattern)) {
                const title = (match[1] ?? "").trim();
                const body = match[2] ?? "";
                const prompt = this.parseSection(title, body);
                if (prompt) prompts.push(prompt);
            }
        } catch (e) {
            console.error("[ZeroLu] Crawl failed", e);
        }

        console.log(`[ZeroLu] Crawled ${prompts.length} valid prompts.`);
        return prompts;
    }

    /**
     * Parses the body of one numbered section.
     *
     * @param title Heading text including its number, e.g. `1.1. Some Title`.
     * @param content Section body (the text between this heading and the next).
     * @returns The prompt, or `null` when the section has no non-empty
     * `**Prompt:**` code block.
     */
    private parseSection(title: string, content: string): Prompt | null {
        // Format: **Prompt:**\n\n```lang\n...\n```
        const promptMatch = content.match(/\*\*Prompt:\*\*\s*[\n\r]*```[\w]*([\s\S]*?)```/);
        const promptText = promptMatch?.[1]?.trim();
        if (!promptText) return null;

        // First Markdown image wins; otherwise the first HTML <img>.
        const mdImageMatch = content.match(/!\[.*?\]\((.*?)\)/);
        const htmlImageMatch = content.match(/<img.*?src="(.*?)".*?>/);
        const imageUrl = mdImageMatch?.[1] ?? htmlImageMatch?.[1] ?? "";

        // "Source: [@handle](url)" supplies both the author and the source link.
        const sourceMatch = content.match(/Source: \[@(.*?)\]\((.*?)\)/);
        const sourceUrl = sourceMatch?.[2]
            ?? `https://github.com/ZeroLu/awesome-nanobanana-pro#${ZeroLuCrawler.toHeadingAnchor(title)}`;
        const author = sourceMatch?.[1] ?? "ZeroLu Community";

        return {
            id: 0,
            title,
            prompt: promptText,
            category: this.inferCategory(title, promptText),
            category_type: "style",
            description: title,
            images: imageUrl ? [imageUrl] : [],
            author,
            source: "zerolu",
            source_url: sourceUrl
        };
    }

    /**
     * Converts heading text to the fragment GitHub generates for it:
     * lower-case, punctuation removed (hyphens and underscores are kept),
     * each whitespace character replaced by a hyphen.
     * Example: `1.1. Robot Selfie` becomes `11-robot-selfie`.
     */
    private static toHeadingAnchor(heading: string): string {
        return heading
            .trim()
            .toLowerCase()
            .replace(/[^\p{L}\p{N}\p{M}\s_-]/gu, "")
            .replace(/\s/g, "-");
    }

    /**
     * Picks a category by case-insensitive substring search over the title and
     * prompt; checks run in a fixed order and "Illustration" is the default.
     */
    private inferCategory(title: string, prompt: string): string {
        const text = (title + " " + prompt).toLowerCase();
        if (text.includes("logo") || text.includes("icon")) return "Logo / Icon";
        if (text.includes("3d")) return "3D Render";
        if (text.includes("photo") || text.includes("realistic") || text.includes("selfie")) return "Photography";
        return "Illustration";
    }
}