openpencil/server/api/ai/validate.ts
Kayshen Xu ca1b5370ae
V0.3.0 (#24)
* feat(boolean-operations): implement boolean operations in the editor

- Added a new BooleanToolbar component for union, subtract, and intersect operations.
- Integrated boolean operations into the layer context menu and keyboard shortcuts.
- Enhanced the editor layout to include the boolean toolbar for improved user interaction.
- Updated internationalization support with new translation keys for boolean operations.
- Bumped version to 0.3.0 to reflect the addition of these features.

* refactor(editor): update editor layout and panels for improved functionality

- Replaced the PropertyPanel with a new RightPanel that includes both Property and Code panels.
- Removed the CodePanel from the main editor layout and integrated it into the RightPanel.
- Updated keyboard shortcuts to switch the right panel to the code tab.
- Enhanced the LayerPanel with a resizable width feature for better user experience.
- Added internationalization support for new right panel labels and code panel features.
- Introduced new code generation capabilities for various frameworks in the CodePanel.
- Improved overall layout structure for better responsiveness and usability.

* feat(electron): implement .op file association and enhance file handling

- Added support for .op file association in electron-builder, allowing OpenPencil documents to be opened directly from the file system.
- Implemented IPC handlers for opening and reading .op files, ensuring proper loading of document content.
- Enhanced the main process to handle file opening events on macOS and single-instance locking on Windows/Linux.
- Updated the renderer to listen for file open events and load documents accordingly.
- Improved README to reflect new file association feature.

* fix(canvas): improve layout accuracy for AI-generated designs

- Unify lineHeight default via canonical defaultLineHeight() function
- Unify text measurement by removing duplicate estimators in generation-utils
- Fix optical centering formula to scale proportionally with fontSize
- Round layout positions to whole pixels to prevent sub-pixel artifacts
- Recursively sanitize nested x/y in streaming layout containers
- Fix input trailing icon alignment using fill_container instead of space_between

* feat(canvas): right-align agent badge and add breathing glow border

- Agent badge now right-aligned to frame's right edge instead of after label
- Added breathing glow border around agent-owned frames during generation
- Glow border uses same color and lifecycle as the agent badge
- Removed unused BADGE_GAP constant and useDocumentStore import

* feat(code-panel): enhance tab scrolling functionality and add scrollbar utility

- Introduced left and right scroll buttons for tab navigation in the CodePanel, improving user experience for navigating long tab lists.
- Added a custom utility to hide scrollbars for a cleaner interface.
- Updated styles for better responsiveness and usability in the CodePanel layout.

* fix(docs): update Discord invite links in multiple README files

- Replaced outdated Discord invite links with the new link across all language-specific README files.
- Ensured consistency in the documentation for community engagement.

* feat(code-panel): enhance system prompt for responsive design

- Updated the ENHANCE_SYSTEM_PROMPT to emphasize the importance of responsive design in code rewriting.
- Added detailed guidelines for converting fixed pixel widths to relative units and using responsive Tailwind breakpoints.
- Ensured that the output remains visually faithful on desktop while adapting gracefully across screen sizes.

* feat(docs): add WeChat group information to README.zh.md and include group image

- Introduced a new section in the Chinese README to provide details about the WeChat group for community engagement.
- Added an image representing the WeChat group for better visibility and user interaction.

* feat(electron): enhance theme management and title bar overlay for Windows/Linux

- Updated the `setTheme` method in the Electron API to accept custom colors for the title bar overlay, improving theme synchronization across platforms.
- Adjusted title bar overlay colors for Windows and Linux to ensure proper visibility and aesthetics.
- Enhanced the top bar component to read computed CSS colors and apply them dynamically, ensuring a consistent user interface.
- Improved handling of theme changes in the application to support background and foreground color customization.

* fix(screenshot): update screenshot image for improved clarity and quality

* fix(docs): update WeChat group image path in README.zh.md for consistency

* fix(ai): fix post-generation validation pipeline and text centering

- Fix Agent SDK validation: save temp screenshots inside project dir
  (.openpencil-tmp/) so Claude Code plan mode can read them, instead
  of /tmp/ which is outside the project sandbox
- Enrich validation tree dump with fill colors, stroke, fontSize,
  fontWeight, textAlign, cornerRadius, opacity for comprehensive
  visual analysis
- Add multi-round validation with quality scoring (threshold 8/10),
  500ms stabilization delay between rounds
- Add detailed debug logging to applyValidationFixes showing which
  nodes were found/skipped and property changes
- Fix canvas sync needsTextbox check to also account for textAlign
  (matching isFixedWidthText in factory), preventing IText↔Textbox
  thrashing on every sync tick
- Auto-center text in vertical+center layouts by expanding to full
  container width and injecting textAlign:'center'
- Force Textbox for non-left-aligned text so textAlign is respected
  (IText ignores width and computes its own)

* fix(canvas): use precise text width estimation for fit-content layout

Remove the 14% safety factor from text width estimation when computing
fit-content/natural-width text dimensions. IText auto-computes its own
width and ignores our setting, so the safety margin only inflated the
layout allocation, making text appear left-shifted within its container.

* fix(canvas): center fit-content text in horizontal layouts

For text nodes with fit-content width in horizontal layouts, set
textAlign:'center' to compensate for width estimation inaccuracy.
The estimated box is typically wider than the actual rendered text,
causing left-aligned text to appear visually shifted. Centering
distributes the estimation error evenly on both sides.

* feat(ai): show validation details in checklist panel

- Accumulate validation log (screenshot, analysis, fixes) instead of
  overwriting status messages, so the full process is visible
- Preserve step thinking content in buildFinalStepTags (was discarded)
- Add details field to pipeline items and render in checklist UI
- Each validation step now shows: screenshot captured, issues found,
  quality score, fixes applied

* feat(ai): add visual reference pipeline types and integration hooks

- Add DesignSystem and VisualReference types to ai-types
- Add 'visual-ref' mode to AIDesignRequest and SubTask.htmlReference
- Detect visual-ref candidates in chat handlers (landing pages, websites)
- Wire visual-ref mode in design-generator and orchestrator
- Inject HTML reference snippets into sub-agent prompts

* feat(ai): add modular design principles for sub-agent context

- Add design-principles module with topic files: color, typography,
  spacing, composition, components
- Selectively load relevant principles based on prompt content
- Inject design principles into sub-agent system prompts

* feat(ai): implement visual reference pipeline

- Add design-system-generator: generates color/typography/spacing tokens
- Add design-code-generator: generates HTML/CSS from design system
- Add html-renderer: renders HTML to screenshot via html2canvas
- Add visual-ref-orchestrator: coordinates the full pipeline
  (design system → HTML code → screenshot → enrich subtasks)
- Add html2canvas dependency for client-side HTML rendering

* feat(mcp): default filePath to live canvas and fix cross-platform issues

- Default all MCP tool filePath to live://canvas when omitted, so tools
  operate on the real-time canvas instead of stale files
- Remove filePath from required params in all tool schemas (21 interfaces)
- Fix mcp-server-manager.ts using process.cwd() which fails in Electron
  production on Linux — now checks ELECTRON_RESOURCES_PATH first
- Fix stopMcpHttpServer using SIGTERM on Windows — use taskkill instead
- Force new children reference in applyExternalDocument to ensure canvas
  sync subscriber always detects MCP-pushed document updates

* feat(mcp): enhance design prompt with semantic roles, CJK typography, and layout rules

Add comprehensive design knowledge to MCP design prompt for better
AI-generated designs: design type detection (mobile vs desktop), full
semantic role reference with context-aware defaults, CJK typography
rules, expanded text/layout/form guidelines, and detailed post-processing
documentation.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* feat(ai): implement intent classification for chat handlers

- Replace hardcoded keyword matching with a lightweight LLM call to classify user intent in chat messages.
- Introduce a new function `classifyIntent` to determine if the request is for design generation or conversation.
- Update design request handling in `useChatHandlers` to utilize the new classification method.
- Enhance design prompt documentation to reflect changes in design type detection based on intent rather than keywords.

* fix(ai): handle string qualityScore in validation response parsing

The LLM sometimes returns qualityScore as a string (e.g. "8" instead of 8),
causing it to fall through to 0. Also hide misleading "quality: 0/10" display
when the score couldn't be determined, and log raw response for debugging.

* fix(ai): increase validation timeout to 90s and fix quality score parsing

Agent SDK validation requires spawning a process, reading the image, and
analyzing it — 30s was consistently timing out. Also handle string
qualityScore values from LLM responses and hide misleading 0/10 display.

* fix(ai): fix validation timeout and response parsing

- Increase validation timeout from 30s to 180s (Agent SDK needs time
  for subprocess spawn + OAuth auth + multi-turn image reading)
- Strip <tool_use> XML blocks from Agent SDK response before extracting
  JSON — the tool call XML was confusing the regex, causing qualityScore
  to parse as 0 despite valid JSON being present
- Handle string qualityScore values and hide misleading "quality: 0/10"
- Revert unnecessary direct API key approach for validation

* fix(ai): prevent node ID collisions between generations

When generating new content on a canvas with existing nodes, AI-generated
IDs (e.g. brand-spacer) would collide with previous generations. Now
captures pre-existing node IDs at generation start and checks against
them during upsert sanitization. Remapped IDs are tracked in
generationRemappedIds so progressive streaming updates can still find
their nodes.

* fix(ai): require styleGuide in orchestrator plan and fix validation detail icons

- Add fallback default styleGuide when orchestrator LLM omits it
- Strengthen prompt to mark styleGuide as REQUIRED
- Replace emoji icons in validation details with [done]/[pending]/[error]
  markers for consistent styling with the checklist design system

* feat(server): add port file plugin for server instance discovery

- Introduce a new Nitro plugin that writes a port file on server startup to allow the MCP server to discover the running instance, whether it's a development server or Electron.
- Implement error handling in the Electron main process for writing the port file, logging any failures.
- Update Vite configuration to include additional external dependencies in the rollup configuration.

* feat(electron): implement IPC for retrieving pending file paths

- Added a new IPC handler `file:getPending` to retrieve and clear the pending file path when the React app mounts.
- Updated the Electron API to include `getPendingFile` for renderer access.
- Enhanced the `useElectronMenu` hook to load any pending file on application startup.
- Updated UI components to reflect changes in file handling and improved user experience.

* fix(panels): replace emoji icons with styled icons in validation checklist

- Parse [done]/[pending]/[error] prefixes in detail lines and render as
  styled circle icons matching the parent checklist design system
- Replace remaining emoji markers in design-validation.ts with text prefixes
- Fix isApplied detection to recognize new [done] Applied marker

* refactor(electron): update settings path to use platform-standard app data directory

- Changed the settings file path to utilize Electron's user data directory for better cross-platform compatibility.
- Updated the settings writing function to ensure the user data directory is created if it doesn't exist.
- Added comments to clarify the storage location for different operating systems.
- Implemented a fixed partition for localStorage/cookies to maintain data across server port changes.

* feat(ai): enhance validation with pre-checks, structural fixes, and border detection

- Add design-pre-validation.ts: pure code checks before LLM validation
  - Invisible container detection (same fill as parent → auto-add border)
  - Sibling consistency (majority-rule for height/cornerRadius)
- Add structural fixes to validation: addChild/removeNode operations
  - Icon injection via lookupIconByName with server fallback
  - autoFixParentLayout with child count guard to prevent layout breakage
- Add strokeColor/strokeWidth to safe fix properties for border fixes
- Simplify intent classification: all design requests use visual-ref pipeline
- Fix checklist: "Found N issues" now shows [done] instead of [pending]
- Fix qualityScore: only update when > 0 to preserve valid round scores

* fix(ai): cherry-pick safe validation improvements, drop aggressive pre-checks

Keep: stroke tree dump bug fix (object not array), qualityScore=0 false
positive detection, fit_content→fixed safety guard, empty path removal,
type-specific sibling consistency, repeated fix filtering, screenshot
extraction to design-screenshot.ts.

Drop: detectForcedFixedHeight (destroyed input/button heights),
MAX_VALIDATION_ROUNDS 5 (too many rounds), removal of quality threshold
early stop, section regeneration phase.

---------

Co-authored-by: Fini <fini.yang@gmail.com>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-08 11:55:35 +08:00

240 lines
7.8 KiB
TypeScript

import { defineEventHandler, readBody, setResponseHeaders } from 'h3'
import { resolveClaudeCli } from '../../utils/resolve-claude-cli'
import {
buildClaudeAgentEnv,
getClaudeAgentDebugFilePath,
} from '../../utils/resolve-claude-agent-env'
import { writeFile, mkdtemp, rm } from 'node:fs/promises'
import { tmpdir } from 'node:os'
import { join } from 'node:path'
import { runCodexExec } from '../../utils/codex-client'
/**
 * Request payload accepted by the validation endpoint.
 * The handler rejects the request unless `system`, `message` and
 * `imageBase64` are all non-empty.
 */
interface ValidateBody {
/**
 * Analysis instructions. Embedded in the prompt for the Agent SDK, passed as
 * the system prompt to Codex, and sent as a separate no-reply message to OpenCode.
 */
system: string
/** The validation request text sent to the model alongside the screenshot. */
message: string
/** Screenshot as base64 text; a leading `data:image/png;base64,` prefix is stripped before use. */
imageBase64: string
/**
 * Model identifier. Typed as optional, but the handler returns an error when
 * it is missing or blank. The OpenCode path expects the "provider/model" form.
 */
model?: string
/** Backend used for the validation; the handler returns an error when it is absent or not one of these values. */
provider?: 'anthropic' | 'openai' | 'opencode'
}
/**
 * Vision-based validation endpoint.
 *
 * Takes a base64 PNG screenshot plus system/user prompt text and hands them
 * to the backend selected by `provider`:
 * - `anthropic` → Claude Agent SDK (screenshot is saved to a temp file and
 *   Claude Code is asked to open it with its built-in Read tool)
 * - `openai`    → Codex exec with the screenshot attached as an image file
 * - `opencode`  → OpenCode session with the screenshot inlined as a data URL
 *
 * There is no model or provider fallback: a missing/blank model or an
 * unrecognised provider produces an `{ error }` payload. Exceptions thrown by
 * a backend are also converted into `{ error }` instead of being propagated.
 */
export default defineEventHandler(async (event) => {
  // Shared shape for request-validation failures: JSON header + error payload.
  const reject = (reason: string): { error: string } => {
    setResponseHeaders(event, { 'Content-Type': 'application/json' })
    return { error: reason }
  }

  const body = await readBody<ValidateBody>(event)
  if (!body?.system || !body?.message || !body?.imageBase64) {
    return reject('Missing required fields: system, message, imageBase64')
  }
  if (!body.model?.trim()) {
    return reject('Missing model. Model fallback is disabled.')
  }

  try {
    switch (body.provider) {
      case 'anthropic':
        return await validateViaAgentSDK(body, body.model)
      case 'openai':
        return await validateViaCodex(body, body.model)
      case 'opencode':
        return await validateViaOpenCode(body, body.model)
      default:
        return { error: 'Missing or unsupported provider. Provider fallback is disabled.' }
    }
  } catch (caught) {
    return { error: caught instanceof Error ? caught.message : 'Unknown error' }
  }
})
/**
 * Return the bare base64 payload of an image string.
 *
 * Accepts either raw base64 or a base64 data URL and strips the
 * `data:<mime>[;param]*;base64,` header when one is present. The header match
 * is case-insensitive and not tied to `image/png`, so a JPEG/WebP data URL or
 * one carrying extra parameters is unwrapped too instead of being decoded
 * together with its header.
 *
 * Only the header is removed — the payload itself is returned untouched and
 * is not validated or transcoded.
 *
 * @param data Raw base64 text or a `data:` URL with a base64 payload.
 * @returns The base64 payload without any data URL header.
 */
function toImageBase64(data: string): string {
  // Raw base64 never contains ':', ';' or ',', so an anchored header match
  // cannot accidentally eat part of a header-less payload.
  const dataUrlHeader = /^data:[^,]*;base64,/i
  return data.replace(dataUrlHeader, '')
}
/**
 * Write a base64 screenshot to a throwaway `screenshot.png`, run a callback
 * with its path, and always remove the temp directory afterwards.
 *
 * @param imageBase64 Screenshot as base64 (data URL prefix is stripped by `toImageBase64`).
 * @param run Callback that receives the absolute path of the written PNG.
 * @param insideProject When true, the temp directory is created under
 *   `<cwd>/.openpencil-tmp` instead of the OS temp directory.
 * @returns Whatever `run` resolves to.
 */
async function withTempImageFile<T>(
  imageBase64: string,
  run: (tempPath: string) => Promise<T>,
  insideProject = false,
): Promise<T> {
  const createWorkDir = async (): Promise<string> => {
    if (!insideProject) {
      return mkdtemp(join(tmpdir(), 'openpencil-validate-'))
    }
    // Save inside the project directory so Claude Code Agent SDK (plan mode)
    // can read the file — it restricts reads to the project directory.
    const fs = await import('node:fs')
    const projectTmpRoot = join(process.cwd(), '.openpencil-tmp')
    fs.mkdirSync(projectTmpRoot, { recursive: true })
    // Owner-only access on the shared root that holds the screenshots.
    fs.chmodSync(projectTmpRoot, 0o700)
    return mkdtemp(join(projectTmpRoot, 'validate-'))
  }

  const tempDir = await createWorkDir()
  const tempPath = join(tempDir, 'screenshot.png')
  try {
    const pngBytes = Buffer.from(toImageBase64(imageBase64), 'base64')
    await writeFile(tempPath, pngBytes)
    return await run(tempPath)
  } finally {
    // Best-effort cleanup: a failed removal must not mask the callback's outcome.
    await rm(tempDir, { recursive: true, force: true }).catch(() => {})
  }
}
/**
 * Run the validation through the Claude Agent SDK.
 *
 * The screenshot is written to a temp PNG inside the project directory
 * (`insideProject = true`, because plan mode restricts reads to the project
 * directory) and the prompt instructs Claude Code to open that file with its
 * Read tool, which supports images natively.
 *
 * @param body Validated request payload.
 * @param model Optional model identifier forwarded to the SDK when set.
 * @returns `{ text }` for a successful result message, `{ text: '', error }`
 *   for a failed one, or `{ text: '', skipped: true }` when the message
 *   stream ends without any result message.
 */
async function validateViaAgentSDK(
  body: ValidateBody,
  model?: string,
): Promise<{ text: string; skipped?: boolean; error?: string }> {
  const analyzeScreenshot = async (
    tempPath: string,
  ): Promise<{ text: string; skipped?: boolean; error?: string }> => {
    const { query } = await import('@anthropic-ai/claude-agent-sdk')
    const env = buildClaudeAgentEnv()
    const debugFile = getClaudeAgentDebugFilePath()
    const claudePath = resolveClaudeCli()

    const prompt = `IMPORTANT: First, use the Read tool to read the image file at "${tempPath}". This is a PNG screenshot of a UI design.
After viewing the image, analyze it according to these instructions:
${body.system}
${body.message}
CRITICAL: Your ENTIRE response must be a single JSON object. No markdown, no explanation, no tool calls after reading the image. Just the JSON.`

    const stream = query({
      prompt,
      options: {
        ...(model ? { model } : {}),
        maxTurns: 3,
        tools: [],
        plugins: [],
        permissionMode: 'plan',
        persistSession: false,
        env,
        ...(debugFile ? { debugFile } : {}),
        ...(claudePath ? { pathToClaudeCodeExecutable: claudePath } : {}),
      },
    })

    try {
      for await (const msg of stream) {
        // Only the terminal "result" message decides the outcome.
        if (msg.type !== 'result') continue

        const flaggedAsError =
          'is_error' in msg && Boolean((msg as { is_error?: boolean }).is_error)
        if (msg.subtype === 'success' && !flaggedAsError) {
          return { text: msg.result }
        }

        // Failure: prefer the explicit error list, then any result text,
        // then fall back to the result subtype.
        const errorList = 'errors' in msg ? (msg.errors as string[]) : []
        const resultText = 'result' in msg ? String(msg.result ?? '') : ''
        const reason = errorList.join('; ') || resultText || `Query ended with: ${msg.subtype}`
        return { error: reason, text: '' }
      }
    } finally {
      stream.close()
    }

    return { text: '', skipped: true }
  }

  return await withTempImageFile(body.imageBase64, analyzeScreenshot, true)
}
/**
 * Run the validation through Codex exec.
 *
 * The screenshot is written to a temp PNG (OS temp directory) and attached
 * as an image file; `body.system` is passed as the system prompt and
 * `body.message` — with a JSON-only output instruction appended — as the
 * user prompt.
 *
 * @param body Validated request payload.
 * @param model Optional model identifier forwarded to Codex.
 * @returns `{ text }` with the model output (empty string when none), or
 *   `{ text: '', error }` when Codex reports an error.
 */
async function validateViaCodex(
  body: ValidateBody,
  model?: string,
): Promise<{ text: string; skipped?: boolean; error?: string }> {
  return await withTempImageFile(body.imageBase64, async (tempPath) => {
    const userPrompt = `${body.message}\n\nOutput ONLY the JSON object, no markdown fences, no explanation.`
    const outcome = await runCodexExec(userPrompt, {
      model,
      systemPrompt: body.system,
      imageFiles: [tempPath],
    })
    return outcome.error
      ? { text: '', error: outcome.error }
      : { text: outcome.text ?? '' }
  })
}
/**
 * Split an OpenCode model reference of the form "provider/model".
 *
 * The string is split at the FIRST slash, so model IDs that themselves
 * contain slashes (e.g. "openrouter/anthropic/claude") are kept intact.
 *
 * @param model Model reference, possibly undefined.
 * @returns The provider and model IDs, or `undefined` when the input is
 *   missing, has no slash, or has an empty provider or model part.
 */
function parseOpenCodeModel(model?: string): { providerID: string; modelID: string } | undefined {
  if (!model) return undefined
  const separator = model.indexOf('/')
  // separator < 0: no slash; separator === 0: empty provider;
  // separator at the last index: empty model. None is a usable "provider/model".
  if (separator <= 0 || separator === model.length - 1) return undefined
  return { providerID: model.slice(0, separator), modelID: model.slice(separator + 1) }
}
/**
 * Run the validation through an OpenCode session.
 *
 * Flow: validate the model reference, acquire the OpenCode client/server,
 * create a session, send `body.system` as a text part with `noReply: true`,
 * then send the screenshot (inlined as a PNG data URL) together with
 * `body.message` and collect the text parts of the reply.
 *
 * The model reference is checked before anything is acquired, so a malformed
 * model no longer starts a server, creates a session or sends the system
 * prompt only to be discarded.
 *
 * @param body Validated request payload.
 * @param model Model reference in "provider/model" form.
 * @returns `{ text }` with the concatenated text parts of the reply, or
 *   `{ text: '', error }` on any failure (including thrown exceptions).
 */
async function validateViaOpenCode(
  body: ValidateBody,
  model?: string,
): Promise<{ text: string; skipped?: boolean; error?: string }> {
  // Fail fast: nothing has been acquired yet, so there is nothing to release.
  const parsedModel = parseOpenCodeModel(model)
  if (!parsedModel) {
    return { text: '', error: 'Invalid OpenCode model format. Expected "provider/model".' }
  }

  let ocServer: { close(): void } | undefined
  // Set once the client module has loaded. Keeping the release call in a
  // closure means `finally` does not need a second dynamic import, whose
  // failure would throw from `finally` and override the error payload
  // returned by `catch`.
  let releaseServer: (() => void) | undefined
  try {
    const { getOpencodeClient, releaseOpencodeServer } = await import('../../utils/opencode-client')
    releaseServer = () => releaseOpencodeServer(ocServer)

    const oc = await getOpencodeClient()
    const ocClient: any = oc.client
    ocServer = oc.server

    const { data: session, error: sessionError } = await ocClient.session.create({
      title: 'OpenPencil Validate',
    })
    if (sessionError || !session) {
      return { text: '', error: 'Failed to create OpenCode session' }
    }

    // Send the system instructions first, without requesting a reply.
    await ocClient.session.prompt({
      sessionID: session.id,
      noReply: true,
      parts: [{ type: 'text', text: body.system }],
    })

    const base64 = toImageBase64(body.imageBase64)
    const promptPayload = {
      sessionID: session.id,
      model: parsedModel,
      parts: [
        { type: 'image', url: `data:image/png;base64,${base64}` },
        {
          type: 'text',
          text: `${body.message}\n\nOutput ONLY the JSON object, no markdown fences, no explanation.`,
        },
      ],
    }
    const { data: result, error: promptError } = await ocClient.session.prompt(promptPayload)
    if (promptError) {
      return { text: '', error: 'OpenCode validation failed' }
    }

    // Keep only non-empty text parts of the reply, in order.
    const texts: string[] = []
    if (result?.parts) {
      for (const part of result.parts) {
        if (part.type === 'text' && part.text) {
          texts.push(part.text)
        }
      }
    }
    return { text: texts.join('') }
  } catch (error) {
    const message = error instanceof Error ? error.message : 'Unknown error'
    return { text: '', error: message }
  } finally {
    // As before, release is called even when no server was obtained
    // (ocServer is then undefined).
    releaseServer?.()
  }
}