From 8df9b4873ed93a964ab1f2eea199389278629f41 Mon Sep 17 00:00:00 2001 From: SysVis AI Date: Sun, 28 Dec 2025 22:15:09 +0700 Subject: [PATCH] feat: optimize browser AI - enhanced vision descriptions, faster text generation --- README.md | 76 ++++++++++++++++----------------- src/components/Settings.tsx | 4 +- src/lib/aiService.ts | 78 ++++++++++++++++++++++++++++++---- src/lib/visionService.ts | 84 +++++++++++++++++++------------------ src/lib/webLlmService.ts | 18 ++++++-- 5 files changed, 167 insertions(+), 93 deletions(-) diff --git a/README.md b/README.md index f0b97c4..e0fd1d7 100644 --- a/README.md +++ b/README.md @@ -1,13 +1,14 @@ -# 🔮 KV-Graph +# 🔮 SysVis.AI - System Design Visualizer **AI-Powered Diagram Editor** — Transform ideas into beautiful, interactive flowcharts using natural language, images, or Mermaid code. -![KV-Graph Demo](./public/demo.gif) +[![Docker Hub](https://img.shields.io/docker/pulls/vndangkhoa/sys-arc-visl)](https://hub.docker.com/r/vndangkhoa/sys-arc-visl) +[![GitHub](https://img.shields.io/github/stars/vndangkhoa/kv-graph)](https://github.com/vndangkhoa/kv-graph) ## ✨ Features -- **🤖 AI-Powered Generation** — Generates complex diagrams from text prompts using **Llama 3** (local browser) or Cloud AI. -- **👁️ Vision-to-Diagram** — **Florence-2** powered analysis converts screenshots and sketches into editable layouts entirely in the browser. +- **🤖 AI-Powered Generation** — Generates complex diagrams from text prompts using **Qwen3-0.6B** (local browser) or Cloud AI. +- **👁️ Vision-to-Diagram** — **ViT-GPT2** powered analysis converts screenshots and sketches into editable layouts entirely in the browser. - **🖌️ Unified Toolkit** — A clean, consolidated toolbar for critical actions (Zoom, Layout, Pan/Select) keeps the canvas "void-like". - **🗺️ MiniMap Overlay** — Navigational aid for large diagrams, unobtrusively positioned in the bottom-right. 
- **💡 Smart Guidance** — Context-aware tips and rotation suggestions when looking at empty space. @@ -18,17 +19,19 @@ ## 🚀 Quick Start -### Prerequisites +### 🐳 Docker (Recommended) -- Node.js 18+ -- npm or pnpm -- WebGPU-compatible browser (Chrome 113+, Edge) for In-Browser AI +```bash +docker run -d -p 8338:80 vndangkhoa/sys-arc-visl:latest +``` -### Installation +Open [http://localhost:8338](http://localhost:8338) in your browser. + +### 💻 Local Development ```bash # Clone the repository -git clone https://github.com/your-username/kv-graph.git +git clone https://github.com/vndangkhoa/kv-graph.git cd kv-graph # Install dependencies @@ -40,6 +43,12 @@ npm run dev Open [http://localhost:5173](http://localhost:5173) in your browser. +### Prerequisites + +- Node.js 18+ +- npm or pnpm +- WebGPU-compatible browser (Chrome 113+, Edge) for In-Browser AI + ## 🧠 AI Configuration KV-Graph supports a **Local-First** AI architecture, running powerful models directly in your browser via WebGPU. @@ -47,34 +56,24 @@ KV-Graph supports a **Local-First** AI architecture, running powerful models dir ### 🌐 In-Browser Mode (Privacy First) Runs entirely on your device. No data leaves your machine. -| Capability | Model | Technology | -|------------|-------|------------| -| **Text Generation** | Llama-3-8B-Instruct | WebLLM (WebGPU) | -| **Vision Analysis** | Florence-2-base | Transformers.js (ONNX) | +| Capability | Model | Size | Speed | +|------------|-------|------|-------| +| **Text Generation** | Qwen3-0.6B | ~500MB | ~30-60s | +| **Vision Analysis** | ViT-GPT2 | ~300MB | ~8-10s | -*Note: First-time load requires downloading model weights (~4GB total).* +*Note: First-time load requires downloading model weights.* -### ☁️ Cloud Mode (Optional) +### ☁️ Cloud Mode (Fast & Powerful) Connect to external providers for enhanced capabilities. 
| Provider | Model | API Key Required | |----------|-------|------------------| +| **Google Gemini** | Gemini 2.0 Flash | ✅ (Free tier available) | | OpenAI | GPT-4 Vision | ✅ | -| Google Gemini | Gemini Pro Vision | ✅ | | Ollama | Custom | Local URL | Configure your AI provider in **Settings** (⚙️ icon). -## 🐳 Docker Support - -Run KV-Graph locally using Docker: - -```bash -docker-compose up -d -``` - -Open [http://localhost:8338](http://localhost:8338) in your browser. - ## 📁 Project Structure ``` @@ -88,17 +87,15 @@ kv-graph/ │ ├── hooks/ # Custom React hooks │ ├── lib/ # Core Logic │ │ ├── aiService.ts # AI Orchestrator -│ │ ├── webLlmService.ts # Local LLM Engine -│ │ ├── visionService.ts # Local Vision Engine +│ │ ├── webLlmService.ts # Local LLM Engine (Qwen3) +│ │ ├── visionService.ts # Local Vision Engine (ViT-GPT2) │ │ └── layoutEngine.ts # Dagre Auto-Layout │ ├── pages/ # Route pages │ ├── store/ # Zustand Global State -│ │ ├── flowStore.ts # Combined Flow State -│ │ └── settingsStore.ts # AI & Theme Config -│ ├── styles/ # Tailwind Global Styles │ └── types/ # TypeScript interfaces -├── public/ # Static assets & Models -└── Configuration files +├── public/ # Static assets +├── Dockerfile # Docker build +└── docker-compose.yml # Docker Compose ``` ## 🛠️ Tech Stack @@ -139,9 +136,11 @@ kv-graph/ ## 🗺️ Roadmap - [x] Undo/Redo history -- [x] API for programmatic generation -- [x] Plugin system (Foundation) - +- [x] Browser-based AI (WebLLM + Transformers.js) +- [x] Vision-to-Diagram (ViT-GPT2) +- [x] Cloud AI integration (Gemini, OpenAI) +- [ ] Collaborative editing +- [ ] Plugin system ## 📄 License @@ -151,7 +150,8 @@ MIT License — see [LICENSE](./LICENSE) for details. 
- [React Flow](https://reactflow.dev/) — Powerful diagram library - [Mermaid.js](https://mermaid.js.org/) — Diagram syntax inspiration -- [Ollama](https://ollama.ai/) — Local AI inference +- [WebLLM](https://webllm.mlc.ai/) — Browser-based LLM inference +- [Transformers.js](https://huggingface.co/docs/transformers.js/) — Browser ML models - [Tailwind CSS](https://tailwindcss.com/) — Utility-first styling --- diff --git a/src/components/Settings.tsx b/src/components/Settings.tsx index 045699b..5cb0cf7 100644 --- a/src/components/Settings.tsx +++ b/src/components/Settings.tsx @@ -277,7 +277,7 @@ export function SettingsModal({ isOpen, onClose }: SettingsModalProps) {

Neural Engine (Text)

-

Llama-3.2-1B-Instruct-q4f32_1

+

Qwen3-0.6B (Fast!)

{isBrowserReady &&
}
@@ -321,7 +321,7 @@ export function SettingsModal({ isOpen, onClose }: SettingsModalProps) {

Vision Engine (Image)

-

Florence-2-base (~200MB)

+

ViT-GPT2 (~300MB, Fast!)

{isVisionReady &&
}
diff --git a/src/lib/aiService.ts b/src/lib/aiService.ts index cd95d63..fff01c9 100644 --- a/src/lib/aiService.ts +++ b/src/lib/aiService.ts @@ -241,7 +241,31 @@ async function callBrowserAI( messages: any[], customSystemPrompt?: string ): Promise { - const activePrompt = customSystemPrompt || SYSTEM_PROMPT; + // Simplified prompt for browser AI - just ask for Mermaid code directly + const BROWSER_AI_PROMPT = `You are a system design diagram generator. Generate ONLY Mermaid flowchart code. + +RULES: +- Start with "graph TD" or "graph LR" +- Use simple node IDs like A, B, C +- Use subgraph for grouping +- NO explanations, NO markdown, NO JSON - ONLY the mermaid code + +Example output: +graph TD + subgraph Frontend + A[Web App] + B[Mobile App] + end + subgraph Backend + C[API Server] + D[(Database)] + end + A --> C + B --> C + C --> D + +Now generate mermaid code for the user's request. Output ONLY the mermaid code, nothing else.`; + try { if (!webLlmService.getStatus().isReady) { throw new Error('Browser model is not loaded. 
Please initialize it in Settings.'); } @@ -261,6 +285,7 @@ async function callBrowserAI( // Analyze the first image // Assuming msg.images[0] is base64 string const imageDescription = await visionService.analyzeImage(msg.images[0]); + console.log('Vision description:', imageDescription); // Augment the prompt with the description content = `${content}\n\n[VISUAL CONTEXT FROM IMAGE]:\n${imageDescription}\n\n(Use this visual description to generate the Mermaid code.)`; @@ -273,32 +298,67 @@ async function callBrowserAI( } const fullMessages = [ - { role: 'system' as const, content: activePrompt }, + { role: 'system' as const, content: BROWSER_AI_PROMPT }, ...processedMessages ]; + console.log('Starting WebLLM text generation...'); const generator = await webLlmService.chat(fullMessages); let fullContent = ""; for await (const chunk of generator) { fullContent += chunk; } + console.log('WebLLM raw output:', fullContent.substring(0, 500)); // First 500 chars - // Parse JSON + // Clean up the output - Browser AI outputs Mermaid code directly let cleanContent = fullContent.trim(); + + // Strip Qwen3's reasoning tags if present + cleanContent = cleanContent.replace(/<think>[\s\S]*?<\/think>/g, '').trim(); + // Also remove incomplete <think> tags (if model was cut off) + cleanContent = cleanContent.replace(/<think>[\s\S]*$/g, '').trim(); + + // Remove markdown code blocks if present if (cleanContent.startsWith('```')) { - cleanContent = cleanContent.replace(/^```(?:json)?\s*\n?/, '').replace(/\n?```\s*$/, ''); + cleanContent = cleanContent.replace(/^```(?:mermaid|json)?\s*\n?/, '').replace(/\n?```\s*$/, ''); } - const parsed = JSON.parse(cleanContent); + // Try to extract mermaid code - look for "graph" pattern + const mermaidMatch = cleanContent.match(/graph\s+(?:TB|TD|LR|RL|BT)[\s\S]*/); + if (mermaidMatch) { + console.log('Extracted mermaid code successfully'); + return { + success: true, + mermaidCode: mermaidMatch[0].trim() + }; + } + + // Fallback: try to parse as JSON if it looks like JSON 
+ if (cleanContent.startsWith('{')) { + try { + const parsed = JSON.parse(cleanContent); + console.log('Parsed as JSON:', Object.keys(parsed)); + return { + success: true, + mermaidCode: parsed.mermaidCode, + metadata: parsed.metadata, + analysis: parsed.analysis + }; + } catch (e) { + // Not valid JSON, continue + } + } + + // If we get here, we couldn't extract mermaid code + console.error('Could not extract mermaid code from:', cleanContent.substring(0, 500)); return { - success: true, - mermaidCode: parsed.mermaidCode, - metadata: parsed.metadata, - analysis: parsed.analysis // Forward analysis field if present + success: false, + error: 'Could not generate valid Mermaid diagram code' }; } catch (error) { + console.error('Browser AI error:', error); return { success: false, error: error instanceof Error ? error.message : 'Browser model logic failed' diff --git a/src/lib/visionService.ts b/src/lib/visionService.ts index 80ac214..e668701 100644 --- a/src/lib/visionService.ts +++ b/src/lib/visionService.ts @@ -1,5 +1,5 @@ -import { env, AutoProcessor, AutoModel, RawImage } from '@huggingface/transformers'; +import { env, pipeline, RawImage } from '@huggingface/transformers'; // Configure transformers.js env.allowLocalModels = false; @@ -11,13 +11,12 @@ export type VisionProgress = { file?: string; }; -// We use Florence-2-base for a good balance of speed and accuracy (~200MB - 400MB) -// 'onnx-community/Florence-2-base-ft' is the modern standard for Transformers.js v3. 
-const MODEL_ID = 'onnx-community/Florence-2-base-ft'; +// ViT-GPT2 is the ONLY working model for browser-based image captioning +// Other models (BLIP, Florence-2, LLaVA) are not supported by transformers.js +const MODEL_ID = 'Xenova/vit-gpt2-image-captioning'; export class VisionService { - private model: any = null; - private processor: any = null; + private captioner: any = null; private isLoading = false; private isReady = false; @@ -46,13 +45,10 @@ export class VisionService { try { console.log('Loading Vision Model...'); - if (onProgress) onProgress({ status: 'Loading Processor...' }); + if (onProgress) onProgress({ status: 'Loading Vision Model...' }); - this.processor = await AutoProcessor.from_pretrained(MODEL_ID); - - if (onProgress) onProgress({ status: 'Loading Model (this may take a while)...' }); - - this.model = await AutoModel.from_pretrained(MODEL_ID, { + // Use the pipeline API - much simpler and faster + this.captioner = await pipeline('image-to-text', MODEL_ID, { progress_callback: (progress: any) => { if (onProgress && progress.status === 'progress') { onProgress({ @@ -75,8 +71,8 @@ export class VisionService { } /** - * Analyzes an image (Base64 or URL) and returns a detailed description. - * We use the '' task for Florence-2. + * Analyzes an image (Base64 or URL) and returns a description. + * Uses vit-gpt2 for fast captioning. */ async analyzeImage(imageBase64: string): Promise { if (!this.isReady) { @@ -87,39 +83,47 @@ export class VisionService { // Handle data URL prefix if present const cleanBase64 = imageBase64.includes(',') ? 
imageBase64 : `data:image/png;base64,${imageBase64}`; - const image = await RawImage.fromURL(cleanBase64); + let image = await RawImage.fromURL(cleanBase64); - // Task: Detailed Captioning is best for understanding diagrams - const task = ''; + // Keep higher resolution for better detail detection + if (image.width > 512 || image.height > 512) { + image = await image.resize(512, 512); + } - // Construct prompts using the processor's method (required for Florence-2) - const prompts = this.processor.construct_prompts(task); + console.log('Starting enhanced image analysis...'); + const startTime = performance.now(); - // Pre-process the image and text inputs - // Processor expects batch input, so wrap single image in array - if (!this.processor) throw new Error('Processor is undefined'); - const inputs = await this.processor([image], prompts); + // Run multiple passes for more comprehensive description + const results = await Promise.all([ + // Pass 1: Detailed description + this.captioner(image, { + max_new_tokens: 150, + num_beams: 4, // Beam search for better quality + }), + // Pass 2: Alternative perspective + this.captioner(image, { + max_new_tokens: 100, + do_sample: true, + temperature: 0.7, + }), + ]); - const generatedIds = await this.model.generate({ - ...inputs, - max_new_tokens: 512, // Sufficient for a description - }); + const endTime = performance.now(); + console.log(`Vision analysis completed in ${((endTime - startTime) / 1000).toFixed(1)}s`); - const generatedText = this.processor.batch_decode(generatedIds, { - skip_special_tokens: false, - })[0]; + // Combine descriptions for richer output + const caption1 = results[0]?.[0]?.generated_text || ''; + const caption2 = results[1]?.[0]?.generated_text || ''; - // Post-process to extract the caption - // Florence-2 output format usually includes the task token - const parsedAnswer = this.processor.post_process_generation( - generatedText, - task, - image.size - ); + // If both are similar, use just one; 
otherwise combine + if (caption1.toLowerCase().includes(caption2.toLowerCase().substring(0, 20)) || + caption2.toLowerCase().includes(caption1.toLowerCase().substring(0, 20))) { + return caption1.length > caption2.length ? caption1 : caption2; + } - // Access the dictionary result. For CAPTION tasks, it's usually under '' or similar key - // Ideally post_process_generation returns { '': "Description..." } - return parsedAnswer[''] || typeof parsedAnswer === 'string' ? parsedAnswer : JSON.stringify(parsedAnswer); + const combined = `${caption1}. Additionally: ${caption2}`; + console.log('Enhanced description:', combined); + return combined; } catch (error) { console.error('Vision analysis failed:', error); diff --git a/src/lib/webLlmService.ts b/src/lib/webLlmService.ts index 27b8050..5735b6d 100644 --- a/src/lib/webLlmService.ts +++ b/src/lib/webLlmService.ts @@ -7,8 +7,8 @@ export type WebLlmProgress = { timeElapsed: number; }; -// Latest "Tiny" model with high instruction adherence -const DEFAULT_MODEL = "Llama-3.2-1B-Instruct-q4f32_1-MLC"; +// Qwen3-0.6B is fast and works well with simple Mermaid generation prompts +const DEFAULT_MODEL = "Qwen3-0.6B-q4f32_1-MLC"; export class WebLlmService { private engine: MLCEngine | null = null; @@ -73,21 +73,31 @@ export class WebLlmService { throw new Error("WebLLM Engine not initialized. 
Please load the model first."); } + console.log('WebLLM: Creating completion...'); + const startTime = performance.now(); const completion = await this.engine.chat.completions.create({ messages, stream: true, - temperature: 0.1, // Low temp for code/logic generation - max_tokens: 4096, // Sufficient for diagrams + temperature: 0, // Deterministic output for code + max_tokens: 512, // Mermaid code is compact + top_p: 0.9, // Faster sampling + repetition_penalty: 1.1, // Avoid repetitive output }); + console.log('WebLLM: Completion created, streaming...'); // Create a generator to stream chunks easily async function* streamGenerator() { + let tokenCount = 0; for await (const chunk of completion) { const content = chunk.choices[0]?.delta?.content || ""; if (content) { + tokenCount++; + if (tokenCount === 1) console.log('WebLLM: First token received'); yield content; } } + const endTime = performance.now(); + console.log(`WebLLM: Generation complete (${tokenCount} tokens, ${((endTime - startTime) / 1000).toFixed(1)}s)`); } return streamGenerator();