// Code-viewer metadata: 300 lines, 11 KiB, TypeScript
import { Prompt } from './types';
|
|
|
|
/** Base URL of the raw `cases` directory in the JimmyLv repository; each case is `<id>/case.yml`. */
const JIMMYLV_SOURCE_URL = "https://raw.githubusercontent.com/JimmyLv/awesome-nano-banana/main/cases";

/** Raw README of the YouMind prompt collection. */
const YOUMIND_README_URL = "https://raw.githubusercontent.com/YouMind-OpenLab/awesome-nano-banana-pro-prompts/main/README.md";

/** Raw README of the ZeroLu prompt collection. */
const ZEROLU_README_URL = "https://raw.githubusercontent.com/ZeroLu/awesome-nanobanana-pro/main/README.md";

// NOTE(review): MAX_CASE_ID is not referenced by any crawler visible in this file
// (JimmyLvCrawler.crawl uses its own `limit` default) — confirm whether it is still needed.
const MAX_CASE_ID = 200; // Increased limit slightly

/** Number of case requests issued concurrently per batch. */
const BATCH_SIZE = 10;
|
|
|
|
/**
 * Crawls the JimmyLv repository, where each case is stored as
 * `<JIMMYLV_SOURCE_URL>/<id>/case.yml`, and converts every parsable case
 * into a `Prompt`.
 */
export class JimmyLvCrawler {
  /**
   * Fetches case ids `1..limit` in batches of `BATCH_SIZE` concurrent requests.
   *
   * @param limit Highest case id to try (ids that fail to fetch or parse are skipped).
   * @returns The prompts that were fetched and parsed successfully, in id order.
   */
  async crawl(limit: number = 300): Promise<Prompt[]> {
    console.log(`Starting crawl for ${limit} cases...`);
    const prompts: Prompt[] = [];

    // Create batches of IDs to fetch
    const ids = Array.from({ length: limit }, (_, i) => i + 1);

    for (let i = 0; i < ids.length; i += BATCH_SIZE) {
      const batch = ids.slice(i, i + BATCH_SIZE);
      const results = await Promise.all(batch.map(id => this.fetchCase(id)));

      for (const p of results) {
        if (p) prompts.push(p);
      }
    }

    console.log(`[JimmyLv] Crawled ${prompts.length} valid prompts.`);
    return prompts;
  }

  /**
   * Downloads and parses a single `case.yml`.
   *
   * @returns The parsed prompt, or `null` when the response is not OK, the
   *          request throws, or the file contains no usable prompt.
   */
  private async fetchCase(id: number): Promise<Prompt | null> {
    try {
      const url = `${JIMMYLV_SOURCE_URL}/${id}/case.yml`;
      const res = await fetch(url);

      // Non-OK responses are skipped silently; crawl() probes ids blindly.
      if (!res.ok) {
        return null;
      }

      const text = await res.text();
      return this.parseCase(text, id);
    } catch (error) {
      console.error(`Error fetching case ${id}:`, error);
      return null;
    }
  }

  /**
   * Extracts title, prompt, image and author from the text of a `case.yml`
   * using regular expressions (no YAML parser is involved).
   *
   * The prompt is looked up as a `prompt_en` block scalar, then a `prompt`
   * block scalar, then a single-line `prompt:` value. Block scalars are
   * flattened to one line by joining their lines with a space.
   *
   * @returns The prompt, or `null` when no prompt text could be found.
   */
  private parseCase(content: string, caseId: number): Prompt | null {
    try {
      // Extract title
      let title = this.extract(content, /title_en:\s*(.+)/);
      if (!title) title = this.extract(content, /title:\s*(.+)/) || "Unknown";

      // Extract prompt (multi-line block scalar). The pattern accepts the
      // `|`/`>` indicators with optional chomping (`-`/`+`), LF or CRLF line
      // endings, blank lines between paragraphs, and a last line that is not
      // terminated by a newline.
      let promptText = "";
      const promptMatch =
        content.match(/prompt_en:\s*[|>][+-]?\s*\n((?: .+(?:\r?\n|$)|[ \t]*\r?\n)+)/) ||
        content.match(/prompt:\s*[|>][+-]?\s*\n((?: .+(?:\r?\n|$)|[ \t]*\r?\n)+)/);

      if (promptMatch) {
        promptText = promptMatch[1]
          .split(/\r?\n/)
          .map(line => line.trim())
          .filter(line => line.length > 0)
          .join(' ')
          .trim();
      }

      if (!promptText) {
        // Try simpler single line prompt
        const inline = this.extract(content, /prompt:\s*(.+)/) || "";
        // A bare block indicator means the block itself was empty or
        // unparsable; it must not be mistaken for the prompt text.
        promptText = /^[|>][+-]?\d*$/.test(inline) ? "" : inline;
      }

      if (!promptText) return null;

      // Extract image filename
      const imageFilename = this.extract(content, /image:\s*(.+)/);
      let imageUrl = "";
      if (imageFilename) {
        imageUrl = `${JIMMYLV_SOURCE_URL}/${caseId}/${imageFilename}`;
      }

      // Extract author
      const author = this.extract(content, /author:\s*"?([^"\n]+)"?/) || "JimmyLv Repo";

      const category = this.inferCategory(title, promptText);

      return {
        id: 0, // Will be assigned by manager
        title: title.slice(0, 150),
        prompt: promptText,
        category,
        category_type: "style", // Simplified
        description: promptText.slice(0, 200) + (promptText.length > 200 ? "..." : ""),
        images: imageUrl ? [imageUrl] : [],
        author,
        source: "jimmylv",
        source_url: `https://github.com/JimmyLv/awesome-nano-banana/tree/main/cases/${caseId}`
      };
    } catch (error) {
      console.warn(`[JimmyLv] Failed to parse case ${caseId}:`, error);
      return null;
    }
  }

  /**
   * Returns the trimmed first capture group of `regex` in `content`, with one
   * pair of matching surrounding quotes removed, or `null` when there is no match.
   */
  private extract(content: string, regex: RegExp): string | null {
    const match = content.match(regex);
    if (!match) return null;

    const value = match[1].trim();
    // Quoted YAML scalars (`key: "value"`) would otherwise keep their quotes,
    // which corrupts e.g. the image URL built from the filename.
    const quoted = /^(["'])(.*)\1$/.exec(value);
    return quoted ? quoted[2].trim() : value;
  }

  /**
   * Picks a category by substring-matching keywords against the lower-cased
   * title and prompt. Rules are checked in order and the first hit wins;
   * "Photography" is the default when nothing matches.
   */
  private inferCategory(title: string, prompt: string): string {
    const text = (title + " " + prompt).toLowerCase();

    const rules: [string[], string][] = [
      [["ghibli", "anime", "cartoon", "chibi", "comic", "illustration", "drawing"], "Illustration"],
      [["icon", "logo", "symbol"], "Logo / Icon"],
      [["product", "packaging", "mockup"], "Product"],
      [["avatar", "profile", "headshot"], "Profile / Avatar"],
      [["infographic", "chart", "diagram"], "Infographic / Edu Visual"],
      [["cinematic", "film", "movie"], "Cinematic / Film Still"],
      [["3d", "render", "blender"], "3D Render"],
      [["pixel", "8-bit", "retro game"], "Pixel Art"],
    ];

    for (const [keywords, cat] of rules) {
      if (keywords.some(k => text.includes(k))) return cat;
    }

    return "Photography";
  }
}
|
|
|
|
/**
 * Crawls the YouMind README, which lists prompts as sections that start with
 * a `### No.` heading, and converts each section into a `Prompt`.
 */
export class YouMindCrawler {
  /**
   * Downloads the README and parses every `### No.` section.
   *
   * @returns The prompts parsed so far; a fetch failure is logged and yields
   *          an empty (or partial) list instead of throwing.
   */
  async crawl(): Promise<Prompt[]> {
    console.log(`[YouMind] Starting crawl of README...`);
    const prompts: Prompt[] = [];

    try {
      const res = await fetch(YOUMIND_README_URL);
      if (!res.ok) throw new Error("Failed to fetch YouMind README");
      const text = await res.text();

      // Split by "### No." sections; the text before the first heading is dropped.
      const sections = text.split(/### No\./g).slice(1);

      let idCounter = 1;
      for (const section of sections) {
        const prompt = this.parseSection(section, idCounter++);
        if (prompt) prompts.push(prompt);
      }
    } catch (e) {
      console.error("[YouMind] Crawl failed", e);
    }

    console.log(`[YouMind] Crawled ${prompts.length} valid prompts.`);
    return prompts;
  }

  /**
   * Parses one README section (the text following a `### No.` marker).
   *
   * @param content Section text, expected to start with `<number>: <title>`.
   * @param index   1-based position of the section, used for fallback title and URL.
   * @returns The prompt, or `null` when the section has no fenced prompt text.
   */
  private parseSection(content: string, index: number): Prompt | null {
    try {
      // Title: the remainder of the heading line. Anchored to the start of the
      // section so that "16:9"-style text further down is never taken as a title.
      const titleMatch = content.match(/^\s*\d+:\s*(.+)/);
      const title = titleMatch ? titleMatch[1].trim() : `YouMind Case ${index}`;

      // Prompt block. Preference order:
      //   1. the fenced block directly under "#### 📝 Prompt";
      //   2. the first fenced block, skipping the fence's info string
      //      (language tag) so it does not leak into the prompt;
      //   3. the first fenced block in any shape (e.g. a single-line fence).
      const promptMatch =
        content.match(/#### 📝 Prompt\s+```[\s\S]*?\n([\s\S]*?)```/) ??
        content.match(/```[^\n`]*\n([\s\S]*?)```/) ??
        content.match(/```\s*([\s\S]*?)\s*```/);

      const promptText = promptMatch ? promptMatch[1].trim() : "";

      if (!promptText) return null;

      // Images: every <img> with a src attribute, regardless of attribute order.
      const images = [...content.matchAll(/<img\b[^>]*?\ssrc="(.*?)"/g)]
        .map(m => m[1])
        .filter(url => url !== "" && !url.includes("img.shields.io")); // Exclude badges

      // Author / Source
      const authorMatch = content.match(/- \*\*Author:\*\* \[(.*?)\]/);
      const author = authorMatch ? authorMatch[1] : "YouMind Community";

      const sourceMatch = content.match(/- \*\*Source:\*\* \[(.*?)\]\((.*?)\)/);
      const sourceUrl = sourceMatch ? sourceMatch[2] : `https://github.com/YouMind-OpenLab/awesome-nano-banana-pro-prompts#no-${index}`;

      return {
        id: 0,
        title,
        prompt: promptText,
        category: this.inferCategory(title, promptText),
        category_type: "style",
        description: title,
        images,
        author,
        source: "youmind",
        source_url: sourceUrl
      };
    } catch (e) {
      console.warn(`[YouMind] Failed to parse section ${index}:`, e);
      return null;
    }
  }

  /**
   * Picks a category by substring checks on the lower-cased title and prompt;
   * "Illustration" is the default when nothing matches.
   */
  private inferCategory(title: string, prompt: string): string {
    // Reuse similar logic, maybe static util later
    const text = (title + " " + prompt).toLowerCase();
    if (text.includes("logo") || text.includes("icon")) return "Logo / Icon";
    if (text.includes("3d")) return "3D Render";
    if (text.includes("photo") || text.includes("realistic")) return "Photography";
    return "Illustration";
  }
}
|
|
|
|
/**
 * Crawls the ZeroLu README, whose prompts live under `### X.Y. Title`
 * headings, and converts each section into a `Prompt`.
 */
export class ZeroLuCrawler {
  /**
   * Downloads the README and parses every `### X.Y` section.
   *
   * @returns The prompts parsed so far; a fetch failure is logged and yields
   *          an empty (or partial) list instead of throwing.
   */
  async crawl(): Promise<Prompt[]> {
    console.log(`[ZeroLu] Starting crawl of README...`);
    const prompts: Prompt[] = [];

    try {
      const res = await fetch(ZEROLU_README_URL);
      if (!res.ok) throw new Error("Failed to fetch ZeroLu README");
      const text = await res.text();

      // Each match captures the heading text (numbering included) and the body
      // up to the next `### X.Y` heading or the end of the document. A plain
      // split() would lose the heading, which is needed as the title.
      const sectionPattern = /### (\d+\.\d+\.?\s+.*?)\n([\s\S]*?)(?=### \d+\.\d+|$)/g;
      for (const match of text.matchAll(sectionPattern)) {
        const prompt = this.parseSection(match[1].trim(), match[2]);
        if (prompt) prompts.push(prompt);
      }
    } catch (e) {
      console.error("[ZeroLu] Crawl failed", e);
    }

    console.log(`[ZeroLu] Crawled ${prompts.length} valid prompts.`);
    return prompts;
  }

  /**
   * Parses the body of one README section.
   *
   * @param title   Heading text of the section (e.g. `1.1. Some Title`).
   * @param content Section body following the heading line.
   * @returns The prompt, or `null` when the body has no `**Prompt:**` fenced block.
   */
  private parseSection(title: string, content: string): Prompt | null {
    // Extract Prompt
    // Format: **Prompt:**\n\n```\n...\n```
    const promptMatch = content.match(/\*\*Prompt:\*\*\s*[\n\r]*```[\w]*([\s\S]*?)```/);
    if (!promptMatch) return null;

    const promptText = promptMatch[1].trim();

    // Extract the first image: Markdown `![alt](url)` is preferred, then HTML <img src="...">.
    const mdImageMatch = content.match(/!\[.*?\]\((.*?)\)/);
    const htmlImageMatch = content.match(/<img.*?src="(.*?)".*?>/);

    // A Markdown image may carry a title — ![alt](url "title") — so only the
    // first whitespace-delimited token is the URL.
    const mdImageUrl = mdImageMatch ? mdImageMatch[1].trim().split(/\s+/)[0] : "";
    const imageUrl = mdImageUrl || (htmlImageMatch ? htmlImageMatch[1] : "");

    // Source: `Source: [@handle](url)`; the "@" prefix is optional and never
    // part of the author name.
    const sourceMatch = content.match(/Source: \[@?(.*?)\]\((.*?)\)/);
    const sourceUrl = sourceMatch
      ? sourceMatch[2]
      : `https://github.com/ZeroLu/awesome-nanobanana-pro#${this.toAnchor(title)}`;
    const author = sourceMatch ? sourceMatch[1] : "ZeroLu Community";

    return {
      id: 0,
      title,
      prompt: promptText,
      category: this.inferCategory(title, promptText),
      category_type: "style",
      description: title,
      images: imageUrl ? [imageUrl] : [],
      author,
      source: "zerolu",
      source_url: sourceUrl
    };
  }

  /**
   * Builds a GitHub-style heading anchor: lower-case, punctuation removed
   * (letters, digits, marks, `_`, `-` and spaces are kept), spaces turned
   * into hyphens. E.g. `1.1. Some Title` becomes `11-some-title`.
   */
  private toAnchor(heading: string): string {
    return heading
      .toLowerCase()
      .trim()
      .replace(/[^\p{L}\p{N}\p{M}_ -]/gu, '')
      .replace(/ /g, '-');
  }

  /**
   * Picks a category by substring checks on the lower-cased title and prompt;
   * "Illustration" is the default when nothing matches.
   */
  private inferCategory(title: string, prompt: string): string {
    const text = (title + " " + prompt).toLowerCase();
    if (text.includes("logo") || text.includes("icon")) return "Logo / Icon";
    if (text.includes("3d")) return "3D Render";
    if (text.includes("photo") || text.includes("realistic") || text.includes("selfie")) return "Photography";
    return "Illustration";
  }
}
|