apix/lib/whisk-client.ts
Khoa.vo 8741e3b89f
Some checks are pending
CI / build (18.x) (push) Waiting to run
CI / build (20.x) (push) Waiting to run
feat: Initial commit with multi-provider image generation
2026-01-05 13:50:35 +07:00

556 lines
22 KiB
TypeScript

import { v4 as uuidv4 } from 'uuid';
import type {
WhiskAuthResponse,
WhiskGeneratePayload,
WhiskRecipePayload,
MediaInput,
WhiskGenerateResponse,
WhiskVideoResult
} from './types';
/**
* Whisk Client for Next.js
* Ported from whisk_client.py
*/
/**
 * Remote URLs called by WhiskClient.
 *
 * In this file the labs.google endpoints (AUTH, UPLOAD) are requested with the
 * session `Cookie` header, while the aisandbox-pa.googleapis.com endpoints are
 * requested with an `Authorization: Bearer` token obtained from AUTH.
 */
const ENDPOINTS = {
// Session endpoint; its JSON response is read for `access_token`.
AUTH: "https://labs.google/fx/api/auth/session",
// Reference-image upload; the response carries `uploadMediaGenerationId`.
UPLOAD: "https://labs.google/fx/api/trpc/backbone.uploadImage",
// Text-to-image generation (used when no reference images are supplied).
GENERATE: "https://aisandbox-pa.googleapis.com/v1/whisk:generateImage",
// Image-to-image generation (used when at least one reference image is supplied).
RECIPE: "https://aisandbox-pa.googleapis.com/v1/whisk:runImageRecipe",
// Starts a video generation from an image.
VIDEO_GENERATE: "https://aisandbox-pa.googleapis.com/v1/whisk:generateVideo",
// Polled with an `operations` array until the video is complete or failed.
VIDEO_STATUS: "https://aisandbox-pa.googleapis.com/v1:runVideoFxSingleClipsStatusCheck",
// NOTE(review): not referenced anywhere in the visible part of this file.
VIDEO_CREDITS: "https://aisandbox-pa.googleapis.com/v1/whisk:getVideoCreditStatus",
};
/**
 * Headers spread into every request WhiskClient sends. Each call site adds
 * either a `Cookie` or an `Authorization` header on top of these.
 */
const DEFAULT_HEADERS = {
// Origin and Referer both point at the labs.google Whisk web tool.
"Origin": "https://labs.google",
// All request bodies in this file are sent as JSON.
"Content-Type": "application/json",
"Referer": "https://labs.google/fx/tools/whisk/project",
// Fixed desktop Chrome-on-macOS user agent string.
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
};
/**
 * Maps the user-facing aspect ratio label to the enum string sent as
 * `imageModelSettings.aspectRatio`. Labels missing from this table fall back
 * to IMAGE_ASPECT_RATIO_SQUARE at the call site in `generate()`.
 */
const ASPECT_RATIOS: Record<string, string> = {
"1:1": "IMAGE_ASPECT_RATIO_SQUARE",
"9:16": "IMAGE_ASPECT_RATIO_PORTRAIT",
"16:9": "IMAGE_ASPECT_RATIO_LANDSCAPE",
"4:3": "IMAGE_ASPECT_RATIO_LANDSCAPE_FOUR_THREE",
// NOTE(review): "3:4" shares the 9:16 portrait value, unlike "4:3" which has a
// dedicated enum — confirm whether the API offers a dedicated 3:4 value.
"3:4": "IMAGE_ASPECT_RATIO_PORTRAIT",
// "Auto" is not resolved dynamically; it is sent as square.
"Auto": "IMAGE_ASPECT_RATIO_SQUARE"
};
/**
 * Maps a reference-image role to the `mediaCategory` enum string sent on
 * upload. `uploadReferenceImage()` lower-cases its `category` argument before
 * the lookup and falls back to MEDIA_CATEGORY_SUBJECT for unknown roles.
 */
const MEDIA_CATEGORIES: Record<string, string> = {
"subject": "MEDIA_CATEGORY_SUBJECT",
"scene": "MEDIA_CATEGORY_SCENE",
"style": "MEDIA_CATEGORY_STYLE"
};
/** One image returned by `WhiskClient.generate()`. */
export interface GeneratedImage {
/** The `encodedImage` value taken from the API response. */
data: string; // Base64 string
/** Zero-based position of this image in the flattened list of returned images. */
index: number;
/** The prompt that was passed to `generate()`, echoed back unchanged. */
prompt: string;
/** The aspect ratio label that was passed to `generate()` (e.g. "1:1"), not the API enum. */
aspectRatio: string;
}
/**
 * Outcome of a single request to the video status endpoint.
 *
 * - `done`: the video is ready and `result` can be returned to the caller.
 * - `failed`: a non-retryable failure; polling must stop and `message` is thrown.
 * - `pending`: still running (or a retryable server hiccup); poll again later.
 */
type VideoPollOutcome =
  | { state: 'done'; result: WhiskVideoResult }
  | { state: 'failed'; message: string }
  | { state: 'pending' };

/**
 * Client for the Whisk image and video generation endpoints.
 *
 * Authentication is cookie based: the cookies given to the constructor are
 * sent to the session endpoint to obtain a bearer token, which is cached in
 * memory and reused for the generation endpoints.
 */
export class WhiskClient {
  private cookies: Record<string, string>;
  private accessToken: string | null = null;
  /** Unix time in seconds after which the cached token is re-fetched. */
  private tokenExpires: number = 0;
  private cookieString: string = '';

  /**
   * @param cookieInput Either a `Cookie` header string (`a=1; b=2`) or a JSON
   *   array of `{ name, value }` objects (browser cookie-export format).
   * @throws Error if no usable cookie could be extracted from the input.
   */
  constructor(cookieInput: string) {
    this.cookies = this.parseCookies(cookieInput);
    // Re-serialize so the header is normalized regardless of input format.
    this.cookieString = Object.entries(this.cookies)
      .map(([name, value]) => `${name}=${value}`)
      .join('; ');
    if (!this.cookieString) {
      throw new Error("No valid cookies provided");
    }
  }

  /**
   * Shortens a JSON dump for logging so that base64 media payloads do not
   * flood the logs.
   */
  private static preview(value: unknown, maxChars: number = 2000): string {
    const text = JSON.stringify(value, null, 2) ?? String(value);
    return text.length > maxChars
      ? `${text.slice(0, maxChars)}... [${text.length - maxChars} more chars]`
      : text;
  }

  /**
   * Parses cookies from a JSON export or a `Cookie` header string.
   *
   * Entries with an empty name or empty value are dropped. Cookie values may
   * themselves contain `=` (base64 padding, signed tokens), so header pairs
   * are split on the FIRST `=` only.
   *
   * @returns A name → value map; empty when nothing could be parsed.
   */
  private parseCookies(input: string): Record<string, string> {
    const cookies: Record<string, string> = {};
    if (!input) return cookies;
    const text = input.trim();

    // JSON array format: [{ "name": "...", "value": "..." }, ...]
    if (text.startsWith('[') && text.endsWith(']')) {
      try {
        const parsed: unknown = JSON.parse(text);
        if (Array.isArray(parsed)) {
          for (const entry of parsed) {
            // Skip malformed entries instead of discarding the whole list.
            if (entry && typeof entry === 'object' && entry.name && entry.value) {
              cookies[String(entry.name)] = String(entry.value);
            }
          }
          return cookies;
        }
      } catch {
        // Not valid JSON after all — fall through to header-string parsing.
      }
    }

    // Header format: "name=value; name2=value2"
    for (const part of text.split(';')) {
      const eq = part.indexOf('=');
      if (eq < 0) continue;
      const name = part.slice(0, eq).trim();
      const value = part.slice(eq + 1).trim();
      if (name && value) cookies[name] = value;
    }
    return cookies;
  }

  /**
   * Returns a bearer token, fetching a new one from the session endpoint when
   * none is cached or the cached one is older than 55 minutes.
   *
   * @throws Error prefixed with "Authentication failed:" on any failure
   *   (network error, non-2xx status, or a response without `access_token`).
   */
  private async getAccessToken(): Promise<string> {
    if (this.accessToken && Date.now() / 1000 < this.tokenExpires) {
      return this.accessToken;
    }
    console.log("Fetching access token...");
    try {
      const res = await fetch(ENDPOINTS.AUTH, {
        headers: {
          ...DEFAULT_HEADERS,
          "Cookie": this.cookieString
        }
      });
      if (!res.ok) throw new Error(`Auth failed: ${res.status}`);
      const data = await res.json();
      if (!data.access_token) throw new Error("Missing access_token");
      const token: string = data.access_token;
      this.accessToken = token;
      this.tokenExpires = (Date.now() / 1000) + 3300; // 55 mins
      return token;
    } catch (e) {
      throw new Error(`Authentication failed: ${e}`);
    }
  }

  /**
   * Uploads a reference image and returns the generation ID that can be
   * passed to `generate()` as a subject/scene/style reference.
   *
   * @param fileBase64 Image bytes as base64, with or without a `data:` URI prefix.
   * @param mimeType MIME type used to build the data URI when no prefix is present.
   * @param category "subject", "scene" or "style" (case-insensitive); anything
   *   else is uploaded as a subject.
   * @returns The uploaded media generation ID.
   * @throws Error on a non-2xx response or when the response carries no ID.
   */
  async uploadReferenceImage(fileBase64: string, mimeType: string, category: string): Promise<string | null> {
    const mediaCategory = MEDIA_CATEGORIES[category.toLowerCase()] || "MEDIA_CATEGORY_SUBJECT";
    // Avoid producing "data:...;base64,data:...;base64,..." for pre-prefixed input.
    const dataUri = fileBase64.startsWith('data:')
      ? fileBase64
      : `data:${mimeType};base64,${fileBase64}`;

    const payload = {
      json: {
        clientContext: {
          workflowId: uuidv4(),
          sessionId: Date.now().toString()
        },
        uploadMediaInput: {
          mediaCategory,
          rawBytes: dataUri,
          caption: ""
        }
      }
    };

    // The upload endpoint is authenticated by cookie, not by bearer token.
    const res = await fetch(ENDPOINTS.UPLOAD, {
      method: "POST",
      headers: {
        ...DEFAULT_HEADERS,
        "Cookie": this.cookieString
      },
      body: JSON.stringify(payload)
    });

    if (!res.ok) {
      const errText = await res.text();
      console.error("Upload API Error:", errText);
      throw new Error(`Upload status ${res.status}: ${errText.substring(0, 200)}`);
    }

    const data = await res.json();
    const mediaId = data?.result?.data?.json?.result?.uploadMediaGenerationId;
    if (mediaId) {
      console.log(`Uploaded ${category} image: ${mediaId}`);
      return mediaId;
    }
    console.error("Upload response missing ID:", JSON.stringify(data).substring(0, 200));
    throw new Error("Upload successful but returned no ID");
  }

  /**
   * Generates images from a prompt, optionally guided by reference images.
   *
   * Without references the text-to-image endpoint is used; with at least one
   * reference the image-recipe endpoint is used instead.
   *
   * @param prompt Text prompt (sent as `userInstruction` in recipe mode).
   * @param aspectRatio Label such as "1:1" or "16:9"; unknown labels become square.
   * @param refs Previously uploaded generation IDs, one or many per category.
   * @param preciseMode Currently ignored — the API field name is not yet known.
   * @returns One entry per image found in the response.
   * @throws Error "Safety Filter Blocked Request" when the failure message
   *   mentions UNSAFE or SEXUAL; otherwise the underlying error.
   */
  async generate(
    prompt: string,
    aspectRatio: string = "1:1",
    refs: { subject?: string | string[]; scene?: string | string[]; style?: string | string[] } = {},
    preciseMode: boolean = false
  ): Promise<GeneratedImage[]> {
    const token = await this.getAccessToken();

    // Collect references once; each category accepts a single ID or a list.
    const mediaInputs: MediaInput[] = [];
    const addRefs = (category: string, ids: string | string[] | undefined): void => {
      if (!ids) return;
      for (const id of Array.isArray(ids) ? ids : [ids]) {
        // Empty IDs would only produce an API error, so they are skipped.
        if (id) {
          mediaInputs.push({ mediaInput: { mediaCategory: category, mediaGenerationId: id } });
        }
      }
    };
    addRefs("MEDIA_CATEGORY_SUBJECT", refs.subject);
    addRefs("MEDIA_CATEGORY_SCENE", refs.scene);
    addRefs("MEDIA_CATEGORY_STYLE", refs.style);

    const isImageToImage = mediaInputs.length > 0;
    const endpoint = isImageToImage ? ENDPOINTS.RECIPE : ENDPOINTS.GENERATE;
    const arEnum = ASPECT_RATIOS[aspectRatio] || "IMAGE_ASPECT_RATIO_SQUARE";
    const clientContext = {
      workflowId: uuidv4(),
      tool: isImageToImage ? "BACKBONE" : "IMAGE_FX",
      sessionId: Date.now().toString()
    };

    // The payload shapes are cast because the request types are declared
    // elsewhere and are not checked against these literals here.
    let payload: WhiskGeneratePayload | WhiskRecipePayload;
    if (isImageToImage) {
      payload = {
        clientContext,
        imageModelSettings: {
          imageModel: "R2I", // Recipe-to-Image model
          aspectRatio: arEnum
        },
        seed: Math.floor(Math.random() * 1000000),
        userInstruction: prompt, // Recipe mode uses userInstruction instead of prompt
        recipeMediaInputs: mediaInputs.map(({ mediaInput }) => ({ caption: "", mediaInput }))
        // Note: preciseMode field name TBD - needs API discovery
      } as any;
    } else {
      payload = {
        clientContext,
        imageModelSettings: {
          imageModel: "IMAGEN_3_5",
          aspectRatio: arEnum
        },
        seed: Math.floor(Math.random() * 100000),
        prompt: prompt,
        mediaCategory: "MEDIA_CATEGORY_BOARD"
      } as any;
    }

    console.log(`Generating: "${prompt.substring(0, 30)}..." (Refs: ${mediaInputs.length})`);

    try {
      const res = await fetch(endpoint, {
        method: "POST",
        headers: {
          ...DEFAULT_HEADERS,
          "Authorization": `Bearer ${token}`
        },
        body: JSON.stringify(payload)
      });
      if (!res.ok) {
        const errText = await res.text();
        console.error("Whisk API Error Body:", errText);
        throw new Error(`API Error ${res.status}: ${errText.substring(0, 500)}`);
      }

      const json = await res.json() as WhiskGenerateResponse;
      // Flatten every panel's images into one list, keeping response order.
      const images: string[] = [];
      for (const panel of (json.imagePanels || [])) {
        for (const img of (panel.generatedImages || [])) {
          if (img.encodedImage) images.push(img.encodedImage);
        }
      }
      if (images.length === 0) throw new Error("No images returned");

      return images.map((data, index) => ({ data, index, prompt, aspectRatio }));
    } catch (e: unknown) {
      console.error("Generation failed:", e);
      const errMessage = e instanceof Error ? e.message : String(e);
      // Surface safety-filter rejections with a stable, recognizable message.
      if (errMessage.includes("UNSAFE") || errMessage.includes("SEXUAL")) {
        throw new Error("Safety Filter Blocked Request");
      }
      throw e;
    }
  }

  /**
   * Generates a video from an image using Whisk Animate (Veo).
   *
   * Submits the request, then polls the status endpoint until the video is
   * ready, fails, or the polling budget is exhausted.
   *
   * @param imageGenerationId ID of an existing generated/uploaded image; takes
   *   precedence over `imageBase64` when both are given.
   * @param prompt Text prompt describing the animation.
   * @param imageBase64 Image bytes as base64 (a `data:` URI prefix is stripped).
   * @param aspectRatio "9:16" selects portrait; anything else selects landscape.
   * @throws Error when neither image input is given, on a non-2xx response,
   *   when no operation ID is returned, or when polling fails or times out.
   */
  async generateVideo(
    imageGenerationId: string,
    prompt: string,
    imageBase64?: string,
    aspectRatio: string = "16:9"
  ): Promise<WhiskVideoResult> {
    // Validate inputs before spending a network round-trip on authentication.
    if (!imageGenerationId && !imageBase64) {
      throw new Error("Either imageGenerationId or imageBase64 is required");
    }

    const token = await this.getAccessToken();
    console.log("generateVideo: Starting video generation...", {
      hasImageId: !!imageGenerationId,
      hasBase64: !!imageBase64,
      promptLength: prompt.length
    });

    const videoAspectRatio = aspectRatio === "9:16"
      ? "VIDEO_ASPECT_RATIO_PORTRAIT"
      : "VIDEO_ASPECT_RATIO_LANDSCAPE";

    // promptImageInput carries the prompt plus exactly one image reference.
    const promptImageInput: Record<string, string> = { prompt };
    if (imageGenerationId) {
      promptImageInput.mediaGenerationId = imageGenerationId;
    } else if (imageBase64) {
      // The API expects bare base64, so drop any data-URI prefix.
      promptImageInput.rawBytes = imageBase64.replace(/^data:[^;,]+;base64,/, '');
    }

    const payload = {
      clientContext: {
        sessionId: `;${Date.now()}`,
        tool: "BACKBONE",
        // Same UUID source as the other requests in this class.
        workflowId: uuidv4()
      },
      promptImageInput: promptImageInput,
      modelNameType: "VEO_3_1_I2V_12STEP",
      loopVideo: false,
      aspectRatio: videoAspectRatio
    };

    const endpoint = ENDPOINTS.VIDEO_GENERATE;
    // Log the payload with the image bytes replaced by their length.
    const loggedPayload = promptImageInput.rawBytes
      ? {
        ...payload,
        promptImageInput: {
          ...promptImageInput,
          rawBytes: `<${promptImageInput.rawBytes.length} base64 chars>`
        }
      }
      : payload;
    console.log("generateVideo: Sending payload to", endpoint, JSON.stringify(loggedPayload, null, 2));

    try {
      const res = await fetch(endpoint, {
        method: "POST",
        headers: {
          ...DEFAULT_HEADERS,
          "Authorization": `Bearer ${token}`
        },
        body: JSON.stringify(payload),
      });
      if (!res.ok) {
        const errorText = await res.text();
        console.error("generateVideo: API Error", res.status, errorText);
        throw new Error(`Whisk API Error: ${res.status} - ${errorText}`);
      }

      const data = await res.json();
      console.log("generateVideo: Response Data", WhiskClient.preview(data));

      // The ID may arrive in several shapes; try them from most to least specific.
      let resultId = '';
      if (data.mediaGenerationId) {
        resultId = data.mediaGenerationId;
      } else if (data.operation?.operation?.name) {
        resultId = data.operation.operation.name;
      } else {
        const generations = data.mediaGenerations || [];
        const videoGen = generations.find(
          (g: unknown) => (g as { mediaContentType?: string }).mediaContentType === 'MEDIA_CONTENT_TYPE_VIDEO'
        );
        resultId = data.videoGenerationId ||
          (videoGen as { id?: string })?.id ||
          (generations[0] as { id?: string })?.id ||
          data.id;
      }

      if (!resultId) {
        console.error("generateVideo: No ID found in response", data);
        throw new Error("Failed to start video generation: No ID returned");
      }

      console.log("generateVideo: Got ID, starting polling:", resultId);
      return this.pollVideoStatus(resultId, token);
    } catch (error) {
      console.error("generateVideo: Failed", error);
      throw error;
    }
  }

  /**
   * Polls the video status endpoint until the video completes or fails.
   *
   * Makes up to 60 checks, 5 seconds apart. Network errors, unparsable
   * responses and retryable HTTP statuses are treated as transient and
   * retried; a reported failure or a non-retryable 4xx stops immediately.
   *
   * @throws Error on generation failure, on a fatal status response, when the
   *   last attempt itself errors, or on timeout.
   */
  private async pollVideoStatus(videoGenId: string, token: string): Promise<WhiskVideoResult> {
    const maxAttempts = 60; // 5 minutes max (5s intervals)
    const pollInterval = 5000; // 5 seconds
    console.log(`Starting video status polling for ID: ${videoGenId}`);

    for (let attempt = 0; attempt < maxAttempts; attempt++) {
      console.log(`Polling video status... attempt ${attempt + 1}/${maxAttempts}`);

      // Only exceptions (network / parse problems) are caught here. Logical
      // failures come back as a `failed` outcome, so they can never be
      // mistaken for a transient error and retried.
      let outcome: VideoPollOutcome = { state: 'pending' };
      try {
        outcome = await this.checkVideoStatus(videoGenId, token);
      } catch (e: unknown) {
        console.error("Poll error (network/transient):", e);
        if (attempt === maxAttempts - 1) throw e;
      }

      if (outcome.state === 'done') return outcome.result;
      if (outcome.state === 'failed') throw new Error(outcome.message);

      // No point waiting after the final attempt; report the timeout right away.
      if (attempt < maxAttempts - 1) {
        await new Promise(resolve => setTimeout(resolve, pollInterval));
      }
    }
    throw new Error("Video generation timed out after 5 minutes");
  }

  /**
   * Performs one status request and classifies the response.
   *
   * May throw on network errors or an unparsable body; the caller treats
   * those as transient.
   */
  private async checkVideoStatus(videoGenId: string, token: string): Promise<VideoPollOutcome> {
    // The status endpoint takes an operations array keyed by operation name.
    const statusPayload = {
      operations: [{ operation: { name: videoGenId } }]
    };
    const res = await fetch(ENDPOINTS.VIDEO_STATUS, {
      method: "POST",
      headers: {
        ...DEFAULT_HEADERS,
        "Authorization": `Bearer ${token}`
      },
      body: JSON.stringify(statusPayload)
    });

    if (!res.ok) {
      const errText = await res.text();
      console.error("Video status error:", res.status, errText);
      // 4xx means the request itself is wrong and will not succeed on retry,
      // except 408 (timeout) and 429 (rate limit), which are transient.
      const isClientError = res.status >= 400 && res.status < 500;
      const isFatal = isClientError && res.status !== 408 && res.status !== 429;
      return isFatal
        ? { state: 'failed', message: `Video status error: ${errText}` }
        : { state: 'pending' };
    }

    const json = await res.json();
    // Truncated: a completed response may embed the whole video as base64.
    console.log("Video status response:", WhiskClient.preview(json));

    // The response is expected in operations-array form, with fallbacks.
    const operation = json.operations?.[0] || json;
    const rawStatus = operation.status || operation.state || operation.taskStatus || json.status;
    // Guard against non-string statuses so `.includes` cannot throw.
    const status = typeof rawStatus === 'string' ? rawStatus : '';

    const isComplete =
      ['SUCCEEDED', 'complete', 'FINISHED', 'MEDIA_GENERATION_STATUS_SUCCEEDED'].includes(status) ||
      status.includes('SUCCESSFUL') ||
      status.includes('COMPLETE');
    const isFailed =
      ['failed', 'CANCELLED'].includes(status) ||
      status.includes('FAILED') ||
      status.includes('ERROR');

    if (isComplete) {
      const result = operation.result || operation;
      // A hosted URL is preferred over inline data when both are present.
      const videoUrl = result.videoUrl || result.video?.url || result.mediaUrl ||
        result.generatedMedia?.url || result.generatedMedia?.uri ||
        result.url || json.videoUrl || operation.generatedMedia?.uri;
      const encodedVideo = operation.rawBytes || result.rawBytes ||
        result.encodedVideo || result.video?.encodedVideo ||
        result.generatedMedia?.encodedVideo || json.encodedVideo ||
        operation.generatedMedia?.rawBytes;

      if (videoUrl) {
        console.log("Video generation complete with URL:", videoUrl);
        return { state: 'done', result: { id: videoGenId, url: videoUrl, status: 'COMPLETED' } };
      }
      if (typeof encodedVideo === 'string' && encodedVideo) {
        console.log("Video generation complete with rawBytes/base64 data");
        // Wrap bare base64 in a data URI so callers can use it like a URL.
        const videoDataUri = encodedVideo.startsWith('data:')
          ? encodedVideo
          : `data:video/mp4;base64,${encodedVideo}`;
        return { state: 'done', result: { id: videoGenId, url: videoDataUri, status: 'COMPLETED' } };
      }

      // Reported complete but nothing playable was found: keep polling in
      // case the media is attached on a later response.
      console.warn("Video completed but no URL/data found in response:", WhiskClient.preview(json));
      const mediaKey = operation.mediaKey || result.mediaKey;
      if (mediaKey) {
        console.log("Found mediaKey, but no direct URL:", mediaKey);
      }
    } else if (isFailed) {
      const rawError = operation.operation?.error?.message ||
        operation.error?.message ||
        operation.error ||
        json.error?.message ||
        json.error ||
        'Video generation failed';
      // An error object would otherwise be rendered as "[object Object]".
      const errorMsg = typeof rawError === 'string' ? rawError : JSON.stringify(rawError);
      console.error("Video generation FAILED:", errorMsg);
      return { state: 'failed', message: `Video generation failed: ${errorMsg}` };
    }

    // IN_PROGRESS, PENDING, PROCESSING, RUNNING - continue polling
    console.log(`Video status: ${rawStatus} - continuing to poll...`);
    return { state: 'pending' };
  }
}