diff --git a/README.md b/README.md
index f0b97c4..e0fd1d7 100644
--- a/README.md
+++ b/README.md
@@ -1,13 +1,14 @@
-# 🔮 KV-Graph
+# 🔮 SysVis.AI - System Design Visualizer
**AI-Powered Diagram Editor** — Transform ideas into beautiful, interactive flowcharts using natural language, images, or Mermaid code.
-
+[Docker Hub](https://hub.docker.com/r/vndangkhoa/sys-arc-visl)
+[GitHub](https://github.com/vndangkhoa/kv-graph)
## ✨ Features
-- **🤖 AI-Powered Generation** — Generates complex diagrams from text prompts using **Llama 3** (local browser) or Cloud AI.
-- **👁️ Vision-to-Diagram** — **Florence-2** powered analysis converts screenshots and sketches into editable layouts entirely in the browser.
+- **🤖 AI-Powered Generation** — Generates complex diagrams from text prompts using **Qwen3-0.6B** (local browser) or Cloud AI.
+- **👁️ Vision-to-Diagram** — **ViT-GPT2** powered analysis converts screenshots and sketches into editable layouts entirely in the browser.
- **🖌️ Unified Toolkit** — A clean, consolidated toolbar for critical actions (Zoom, Layout, Pan/Select) keeps the canvas "void-like".
- **🗺️ MiniMap Overlay** — Navigational aid for large diagrams, unobtrusively positioned in the bottom-right.
- **💡 Smart Guidance** — Context-aware tips and rotation suggestions when looking at empty space.
@@ -18,17 +19,19 @@
## 🚀 Quick Start
-### Prerequisites
+### 🐳 Docker (Recommended)
-- Node.js 18+
-- npm or pnpm
-- WebGPU-compatible browser (Chrome 113+, Edge) for In-Browser AI
+```bash
+docker run -d -p 8338:80 vndangkhoa/sys-arc-visl:latest
+```
-### Installation
+Open [http://localhost:8338](http://localhost:8338) in your browser.
+
+### 💻 Local Development
```bash
# Clone the repository
-git clone https://github.com/your-username/kv-graph.git
+git clone https://github.com/vndangkhoa/kv-graph.git
cd kv-graph
# Install dependencies
@@ -40,6 +43,12 @@ npm run dev
Open [http://localhost:5173](http://localhost:5173) in your browser.
+### Prerequisites
+
+- Node.js 18+
+- npm or pnpm
+- WebGPU-compatible browser (Chrome 113+, Edge) for In-Browser AI
+
## 🧠 AI Configuration
KV-Graph supports a **Local-First** AI architecture, running powerful models directly in your browser via WebGPU.
@@ -47,34 +56,24 @@ KV-Graph supports a **Local-First** AI architecture, running powerful models dir
### 🌐 In-Browser Mode (Privacy First)
Runs entirely on your device. No data leaves your machine.
-| Capability | Model | Technology |
-|------------|-------|------------|
-| **Text Generation** | Llama-3-8B-Instruct | WebLLM (WebGPU) |
-| **Vision Analysis** | Florence-2-base | Transformers.js (ONNX) |
+| Capability | Model | Size | Speed |
+|------------|-------|------|-------|
+| **Text Generation** | Qwen3-0.6B | ~500MB | ~30-60s |
+| **Vision Analysis** | ViT-GPT2 | ~300MB | ~8-10s |
-*Note: First-time load requires downloading model weights (~4GB total).*
+*Note: First-time load downloads model weights (~800MB total across both models).*
-### ☁️ Cloud Mode (Optional)
+### ☁️ Cloud Mode (Fast & Powerful)
Connect to external providers for enhanced capabilities.
| Provider | Model | API Key Required |
|----------|-------|------------------|
+| **Google Gemini** | Gemini 2.0 Flash | ✅ (Free tier available) |
| OpenAI | GPT-4 Vision | ✅ |
-| Google Gemini | Gemini Pro Vision | ✅ |
| Ollama | Custom | Local URL |
Configure your AI provider in **Settings** (⚙️ icon).
-## 🐳 Docker Support
-
-Run KV-Graph locally using Docker:
-
-```bash
-docker-compose up -d
-```
-
-Open [http://localhost:8338](http://localhost:8338) in your browser.
-
## 📁 Project Structure
```
@@ -88,17 +87,15 @@ kv-graph/
│ ├── hooks/ # Custom React hooks
│ ├── lib/ # Core Logic
│ │ ├── aiService.ts # AI Orchestrator
-│ │ ├── webLlmService.ts # Local LLM Engine
-│ │ ├── visionService.ts # Local Vision Engine
+│ │ ├── webLlmService.ts # Local LLM Engine (Qwen3)
+│ │ ├── visionService.ts # Local Vision Engine (ViT-GPT2)
│ │ └── layoutEngine.ts # Dagre Auto-Layout
│ ├── pages/ # Route pages
│ ├── store/ # Zustand Global State
-│ │ ├── flowStore.ts # Combined Flow State
-│ │ └── settingsStore.ts # AI & Theme Config
-│ ├── styles/ # Tailwind Global Styles
│ └── types/ # TypeScript interfaces
-├── public/ # Static assets & Models
-└── Configuration files
+├── public/ # Static assets
+├── Dockerfile # Docker build
+└── docker-compose.yml # Docker Compose
```
## 🛠️ Tech Stack
@@ -139,9 +136,11 @@ kv-graph/
## 🗺️ Roadmap
- [x] Undo/Redo history
-- [x] API for programmatic generation
-- [x] Plugin system (Foundation)
-
+- [x] Browser-based AI (WebLLM + Transformers.js)
+- [x] Vision-to-Diagram (ViT-GPT2)
+- [x] Cloud AI integration (Gemini, OpenAI)
+- [ ] Collaborative editing
+- [ ] Plugin system
## 📄 License
@@ -151,7 +150,8 @@ MIT License — see [LICENSE](./LICENSE) for details.
- [React Flow](https://reactflow.dev/) — Powerful diagram library
- [Mermaid.js](https://mermaid.js.org/) — Diagram syntax inspiration
-- [Ollama](https://ollama.ai/) — Local AI inference
+- [WebLLM](https://webllm.mlc.ai/) — Browser-based LLM inference
+- [Transformers.js](https://huggingface.co/docs/transformers.js/) — Browser ML models
- [Tailwind CSS](https://tailwindcss.com/) — Utility-first styling
---
diff --git a/src/components/Settings.tsx b/src/components/Settings.tsx
index 045699b..5cb0cf7 100644
--- a/src/components/Settings.tsx
+++ b/src/components/Settings.tsx
@@ -277,7 +277,7 @@ export function SettingsModal({ isOpen, onClose }: SettingsModalProps) {
Neural Engine (Text)
-
Llama-3.2-1B-Instruct-q4f32_1
+
Qwen3-0.6B (Fast!)
{isBrowserReady && }
@@ -321,7 +321,7 @@ export function SettingsModal({ isOpen, onClose }: SettingsModalProps) {
Vision Engine (Image)
-
Florence-2-base (~200MB)
+
ViT-GPT2 (~300MB, Fast!)
{isVisionReady && }
diff --git a/src/lib/aiService.ts b/src/lib/aiService.ts
index cd95d63..fff01c9 100644
--- a/src/lib/aiService.ts
+++ b/src/lib/aiService.ts
@@ -241,7 +241,31 @@ async function callBrowserAI(
messages: any[],
customSystemPrompt?: string
): Promise {
- const activePrompt = customSystemPrompt || SYSTEM_PROMPT;
+ // Simplified prompt for browser AI - just ask for Mermaid code directly
+ const BROWSER_AI_PROMPT = `You are a system design diagram generator. Generate ONLY Mermaid flowchart code.
+
+RULES:
+- Start with "graph TD" or "graph LR"
+- Use simple node IDs like A, B, C
+- Use subgraph for grouping
+- NO explanations, NO markdown, NO JSON - ONLY the mermaid code
+
+Example output:
+graph TD
+ subgraph Frontend
+ A[Web App]
+ B[Mobile App]
+ end
+ subgraph Backend
+ C[API Server]
+ D[(Database)]
+ end
+ A --> C
+ B --> C
+ C --> D
+
+Now generate mermaid code for the user's request. Output ONLY the mermaid code, nothing else.`;
+
try {
if (!webLlmService.getStatus().isReady) {
throw new Error('Browser model is not loaded. Please initialize it in Settings.');
@@ -261,6 +285,7 @@ async function callBrowserAI(
// Analyze the first image
// Assuming msg.images[0] is base64 string
const imageDescription = await visionService.analyzeImage(msg.images[0]);
+ console.log('Vision description:', imageDescription);
// Augment the prompt with the description
content = `${content}\n\n[VISUAL CONTEXT FROM IMAGE]:\n${imageDescription}\n\n(Use this visual description to generate the Mermaid code.)`;
@@ -273,32 +298,67 @@ async function callBrowserAI(
}
const fullMessages = [
- { role: 'system' as const, content: activePrompt },
+ { role: 'system' as const, content: BROWSER_AI_PROMPT },
...processedMessages
];
+ console.log('Starting WebLLM text generation...');
const generator = await webLlmService.chat(fullMessages);
let fullContent = "";
for await (const chunk of generator) {
fullContent += chunk;
}
+ console.log('WebLLM raw output:', fullContent.substring(0, 500)); // First 500 chars
- // Parse JSON
+ // Clean up the output - Browser AI outputs Mermaid code directly
let cleanContent = fullContent.trim();
+
+ // Strip Qwen3's <think>…</think> reasoning blocks if present
+ cleanContent = cleanContent.replace(/<think>[\s\S]*?<\/think>/g, '').trim();
+ // Also remove an unterminated <think> block (in case generation was cut off)
+ cleanContent = cleanContent.replace(/<think>[\s\S]*$/g, '').trim();
+
+ // Remove markdown code blocks if present
if (cleanContent.startsWith('```')) {
- cleanContent = cleanContent.replace(/^```(?:json)?\s*\n?/, '').replace(/\n?```\s*$/, '');
+ cleanContent = cleanContent.replace(/^```(?:mermaid|json)?\s*\n?/, '').replace(/\n?```\s*$/, '');
}
- const parsed = JSON.parse(cleanContent);
+ // Try to extract mermaid code - look for "graph" pattern
+ const mermaidMatch = cleanContent.match(/graph\s+(?:TB|TD|LR|RL|BT)[\s\S]*/);
+ if (mermaidMatch) {
+ console.log('Extracted mermaid code successfully');
+ return {
+ success: true,
+ mermaidCode: mermaidMatch[0].trim()
+ };
+ }
+
+ // Fallback: try to parse as JSON if it looks like JSON
+ if (cleanContent.startsWith('{')) {
+ try {
+ const parsed = JSON.parse(cleanContent);
+ console.log('Parsed as JSON:', Object.keys(parsed));
+ return {
+ success: true,
+ mermaidCode: parsed.mermaidCode,
+ metadata: parsed.metadata,
+ analysis: parsed.analysis
+ };
+ } catch (e) {
+ // Not valid JSON, continue
+ }
+ }
+
+ // If we get here, we couldn't extract mermaid code
+ console.error('Could not extract mermaid code from:', cleanContent.substring(0, 500));
return {
- success: true,
- mermaidCode: parsed.mermaidCode,
- metadata: parsed.metadata,
- analysis: parsed.analysis // Forward analysis field if present
+ success: false,
+ error: 'Could not generate valid Mermaid diagram code'
};
} catch (error) {
+ console.error('Browser AI error:', error);
return {
success: false,
error: error instanceof Error ? error.message : 'Browser model logic failed'
diff --git a/src/lib/visionService.ts b/src/lib/visionService.ts
index 80ac214..e668701 100644
--- a/src/lib/visionService.ts
+++ b/src/lib/visionService.ts
@@ -1,5 +1,5 @@
-import { env, AutoProcessor, AutoModel, RawImage } from '@huggingface/transformers';
+import { env, pipeline, RawImage } from '@huggingface/transformers';
// Configure transformers.js
env.allowLocalModels = false;
@@ -11,13 +11,12 @@ export type VisionProgress = {
file?: string;
};
-// We use Florence-2-base for a good balance of speed and accuracy (~200MB - 400MB)
-// 'onnx-community/Florence-2-base-ft' is the modern standard for Transformers.js v3.
-const MODEL_ID = 'onnx-community/Florence-2-base-ft';
+// ViT-GPT2 was the most reliable browser image-captioning model in our testing.
+// Alternatives (BLIP, Florence-2, LLaVA) did not work reliably with transformers.js here.
+const MODEL_ID = 'Xenova/vit-gpt2-image-captioning';
export class VisionService {
- private model: any = null;
- private processor: any = null;
+ private captioner: any = null;
private isLoading = false;
private isReady = false;
@@ -46,13 +45,10 @@ export class VisionService {
try {
console.log('Loading Vision Model...');
- if (onProgress) onProgress({ status: 'Loading Processor...' });
+ if (onProgress) onProgress({ status: 'Loading Vision Model...' });
- this.processor = await AutoProcessor.from_pretrained(MODEL_ID);
-
- if (onProgress) onProgress({ status: 'Loading Model (this may take a while)...' });
-
- this.model = await AutoModel.from_pretrained(MODEL_ID, {
+ // Use the pipeline API - much simpler and faster
+ this.captioner = await pipeline('image-to-text', MODEL_ID, {
progress_callback: (progress: any) => {
if (onProgress && progress.status === 'progress') {
onProgress({
@@ -75,8 +71,8 @@ export class VisionService {
}
/**
- * Analyzes an image (Base64 or URL) and returns a detailed description.
- * We use the '' task for Florence-2.
+ * Analyzes an image (Base64 or URL) and returns a description.
+ * Uses vit-gpt2 for fast captioning.
*/
async analyzeImage(imageBase64: string): Promise {
if (!this.isReady) {
@@ -87,39 +83,47 @@ export class VisionService {
// Handle data URL prefix if present
const cleanBase64 = imageBase64.includes(',') ? imageBase64 : `data:image/png;base64,${imageBase64}`;
- const image = await RawImage.fromURL(cleanBase64);
+ let image = await RawImage.fromURL(cleanBase64);
- // Task: Detailed Captioning is best for understanding diagrams
- const task = '';
+ // Downscale oversized images to 512x512 to bound captioning latency (note: does not preserve aspect ratio)
+ if (image.width > 512 || image.height > 512) {
+ image = await image.resize(512, 512);
+ }
- // Construct prompts using the processor's method (required for Florence-2)
- const prompts = this.processor.construct_prompts(task);
+ console.log('Starting enhanced image analysis...');
+ const startTime = performance.now();
- // Pre-process the image and text inputs
- // Processor expects batch input, so wrap single image in array
- if (!this.processor) throw new Error('Processor is undefined');
- const inputs = await this.processor([image], prompts);
+ // Run multiple passes for more comprehensive description
+ const results = await Promise.all([
+ // Pass 1: Detailed description
+ this.captioner(image, {
+ max_new_tokens: 150,
+ num_beams: 4, // Beam search for better quality
+ }),
+ // Pass 2: Alternative perspective
+ this.captioner(image, {
+ max_new_tokens: 100,
+ do_sample: true,
+ temperature: 0.7,
+ }),
+ ]);
- const generatedIds = await this.model.generate({
- ...inputs,
- max_new_tokens: 512, // Sufficient for a description
- });
+ const endTime = performance.now();
+ console.log(`Vision analysis completed in ${((endTime - startTime) / 1000).toFixed(1)}s`);
- const generatedText = this.processor.batch_decode(generatedIds, {
- skip_special_tokens: false,
- })[0];
+ // Combine descriptions for richer output
+ const caption1 = results[0]?.[0]?.generated_text || '';
+ const caption2 = results[1]?.[0]?.generated_text || '';
- // Post-process to extract the caption
- // Florence-2 output format usually includes the task token
- const parsedAnswer = this.processor.post_process_generation(
- generatedText,
- task,
- image.size
- );
+ // If both are similar, use just one; otherwise combine
+ if (caption1.toLowerCase().includes(caption2.toLowerCase().substring(0, 20)) ||
+ caption2.toLowerCase().includes(caption1.toLowerCase().substring(0, 20))) {
+ return caption1.length > caption2.length ? caption1 : caption2;
+ }
- // Access the dictionary result. For CAPTION tasks, it's usually under '' or similar key
- // Ideally post_process_generation returns { '': "Description..." }
- return parsedAnswer[''] || typeof parsedAnswer === 'string' ? parsedAnswer : JSON.stringify(parsedAnswer);
+ const combined = `${caption1}. Additionally: ${caption2}`;
+ console.log('Enhanced description:', combined);
+ return combined;
} catch (error) {
console.error('Vision analysis failed:', error);
diff --git a/src/lib/webLlmService.ts b/src/lib/webLlmService.ts
index 27b8050..5735b6d 100644
--- a/src/lib/webLlmService.ts
+++ b/src/lib/webLlmService.ts
@@ -7,8 +7,8 @@ export type WebLlmProgress = {
timeElapsed: number;
};
-// Latest "Tiny" model with high instruction adherence
-const DEFAULT_MODEL = "Llama-3.2-1B-Instruct-q4f32_1-MLC";
+// Qwen3-0.6B is fast and works well with simple Mermaid generation prompts
+const DEFAULT_MODEL = "Qwen3-0.6B-q4f32_1-MLC";
export class WebLlmService {
private engine: MLCEngine | null = null;
@@ -73,21 +73,31 @@ export class WebLlmService {
throw new Error("WebLLM Engine not initialized. Please load the model first.");
}
+ console.log('WebLLM: Creating completion...');
+ const startTime = performance.now();
const completion = await this.engine.chat.completions.create({
messages,
stream: true,
- temperature: 0.1, // Low temp for code/logic generation
- max_tokens: 4096, // Sufficient for diagrams
+ temperature: 0, // Deterministic output for code
+ max_tokens: 512, // Mermaid code is compact
+ top_p: 0.9, // Nucleus sampling cap (has no effect while temperature is 0)
+ repetition_penalty: 1.1, // Avoid repetitive output
});
+ console.log('WebLLM: Completion created, streaming...');
// Create a generator to stream chunks easily
async function* streamGenerator() {
+ let tokenCount = 0;
for await (const chunk of completion) {
const content = chunk.choices[0]?.delta?.content || "";
if (content) {
+ tokenCount++;
+ if (tokenCount === 1) console.log('WebLLM: First token received');
yield content;
}
}
+ const endTime = performance.now();
+ console.log(`WebLLM: Generation complete (${tokenCount} tokens, ${((endTime - startTime) / 1000).toFixed(1)}s)`);
}
return streamGenerator();