mono/packages/kbot/docs/images-tauri-gem.md
2025-09-23 20:32:47 +02:00

17 KiB

Image Generation Architecture — Multi-Platform Strategy

This document outlines the architectural approach for supporting image generation across CLI (desktop), mobile (Android/iOS), and web platforms while maintaining code reuse and consistent user experience.

Current State Analysis

The existing CLI flow works well for desktop scenarios:

  • src/commands/images.ts orchestrates the process
  • Spawns Tauri desktop binary via spawn()
  • Handles image operations through Google Generative AI
  • Uses filesystem operations via @polymech/fs
  • IPC communication over stdin/stdout with JSON payloads

1. CLI Desktop (Current Flow - Maintained)

Architecture: CLI spawns Tauri GUI, handles all image operations in Node.js

// src/commands/images.ts (existing pattern)
// Excerpt: the CLI launches the GUI binary with stdin, stdout and stderr all
// piped, so the two processes can exchange messages over those streams.
const tauriProcess = spawn(getGuiAppPath(), args, { stdio: ['pipe', 'pipe', 'pipe'] });

// Send config to GUI
// The payload is one JSON object terminated by '\n', i.e. one message per line.
// `stdin?.` skips the write when the stream is not available.
tauriProcess.stdin?.write(JSON.stringify({
  cmd: 'forward_config_to_frontend',
  prompt: argv.prompt,
  dst: argv.dst,
  apiKey: apiKey,
  files: absoluteIncludes,
}) + '\n');

// Handle generation requests from GUI
// `message` is declared outside this excerpt — presumably a parsed line from
// the GUI's stdout; verify against the full source file.
if (message.type === 'generate_request') {
  // With input files the request is an edit; without any it is a fresh creation.
  const imageBuffer = genFiles.length > 0 
    ? await editImage(genPrompt, genFiles, parsedOptions)
    : await createImage(genPrompt, parsedOptions);
  
  // The generated image is written to disk by the CLI process, not by the GUI.
  write(finalDstPath, imageBuffer);
}

Libraries:

  • Existing stack: @polymech/fs, tslog, Node core modules
  • Google Generative AI integration
  • Tauri for GUI spawning

No changes required - this flow remains optimal for desktop CLI usage.

2. Android/iOS - Standalone Tauri with TypeScript HTTP Client

Architecture: Tauri app runs standalone, TypeScript handles HTTP calls directly

Since mobile platforms cannot spawn processes, the Tauri app becomes the primary application. We leverage Tauri's HTTP plugin to make API calls from the TypeScript frontend.

Configuration Updates

// gui/tauri-app/src-tauri/tauri.conf.json
{
  "plugins": {
    "http": {
      "scope": [
        "https://generativelanguage.googleapis.com/**"
      ]
    }
  },
  "security": {
    "csp": "connect-src 'self' https://generativelanguage.googleapis.com"
  }
}

Mobile Image Client

// gui/tauri-app/src/lib/mobileImageClient.ts
import { tauriApi } from './tauriApi';

const GOOGLE_GENERATIVE_AI_BASE = 'https://generativelanguage.googleapis.com/v1beta';

/** Model used when `MobileImageOptions.model` is not supplied. */
const DEFAULT_IMAGE_MODEL = 'gemini-2.5-flash-image-preview';

/**
 * Number of bytes handed to a single `String.fromCharCode` call. Spreading a
 * whole image into one call exceeds the engine's argument limit and throws a
 * RangeError, so the conversion is done in slices.
 */
const BASE64_CHUNK_SIZE = 0x8000;

/** MIME type reported when the file extension is not recognised. */
const FALLBACK_IMAGE_MIME_TYPE = 'image/jpeg';

/** Lower-case file extension → MIME type sent to the API. */
const IMAGE_MIME_TYPES = new Map<string, string>([
  ['png', 'image/png'],
  ['jpg', 'image/jpeg'],
  ['jpeg', 'image/jpeg'],
  ['webp', 'image/webp'],
  ['heic', 'image/heic'],
  ['heif', 'image/heif'],
]);

/** Settings for {@link MobileImageClient}. */
export interface MobileImageOptions {
  /** Model name placed in the request URL; defaults to `gemini-2.5-flash-image-preview`. */
  model?: string;
  /** Google Generative AI API key, sent in the `x-goog-api-key` header. */
  apiKey: string;
}

/** Inline (base64) image part of a `generateContent` request or response. */
interface GeminiInlinePart {
  inlineData: { mimeType: string; data: string };
}

/** A single part of the request's `contents[0].parts` array. */
type GeminiRequestPart = { text: string } | GeminiInlinePart;

/** The subset of the `generateContent` response this client reads. */
interface GeminiGenerateResponse {
  candidates?: Array<{
    content?: {
      parts?: Array<{ text?: string; inlineData?: { mimeType?: string; data?: string } }>;
    };
  }>;
}

/**
 * Image client for the standalone Tauri app: calls the Gemini
 * `generateContent` endpoint directly through `tauriApi.fetch`.
 *
 * NOTE(review): both methods return a `Buffer`, which is a Node.js API —
 * confirm the webview bundle provides a polyfill for it.
 */
export class MobileImageClient {
  constructor(private options: MobileImageOptions) {}

  /**
   * Generates an image from a text prompt.
   *
   * @param prompt Text sent as the only request part.
   * @returns The decoded bytes of the first inline image in the response.
   * @throws Error when the API key is empty, the HTTP status is not OK, or
   *   the response contains no inline image data.
   */
  async createImage(prompt: string): Promise<Buffer> {
    return this.generate([{ text: prompt }]);
  }

  /**
   * Generates an image from a text prompt plus one or more source images.
   *
   * @param prompt Text sent as the first request part.
   * @param imageFiles Paths read through `tauriApi.fs.readFile` and attached
   *   as inline base64 parts, in the given order.
   * @returns The decoded bytes of the first inline image in the response.
   * @throws Error under the same conditions as {@link createImage}; file read
   *   failures propagate unchanged.
   */
  async editImage(prompt: string, imageFiles: string[]): Promise<Buffer> {
    const imageParts = await Promise.all(
      imageFiles.map(async (filePath): Promise<GeminiInlinePart> => {
        // Assumes readFile resolves to a Uint8Array of the file's bytes — TODO confirm.
        const imageData: Uint8Array = await tauriApi.fs.readFile(filePath);
        return {
          inlineData: {
            mimeType: guessImageMimeType(filePath),
            data: bytesToBase64(imageData),
          },
        };
      })
    );

    return this.generate([{ text: prompt }, ...imageParts]);
  }

  /** Sends one `generateContent` request and extracts the first inline image. */
  private async generate(parts: GeminiRequestPart[]): Promise<Buffer> {
    const { model = DEFAULT_IMAGE_MODEL, apiKey } = this.options;
    if (!apiKey) {
      throw new Error('Google API key is required');
    }

    const response = await tauriApi.fetch(
      `${GOOGLE_GENERATIVE_AI_BASE}/models/${model}:generateContent`,
      {
        method: 'POST',
        headers: {
          'Content-Type': 'application/json',
          // API keys go in this header; `Authorization: Bearer` is for OAuth tokens.
          'x-goog-api-key': apiKey,
        },
        body: JSON.stringify({ contents: [{ parts }] }),
      }
    );

    if (!response.ok) {
      throw new Error(`Google API error: ${response.status} ${response.statusText}`);
    }

    const data: GeminiGenerateResponse = await response.json();
    // The response may mix text and image parts; take the first part carrying image data.
    const inline = data.candidates?.[0]?.content?.parts?.find(
      (part) => part.inlineData?.data
    )?.inlineData;

    if (!inline?.data) {
      throw new Error('No image data in Gemini response');
    }

    return Buffer.from(inline.data, 'base64');
  }
}

/** Base64-encodes raw bytes in fixed-size slices (see `BASE64_CHUNK_SIZE`). */
function bytesToBase64(bytes: Uint8Array): string {
  let binary = '';
  for (let offset = 0; offset < bytes.length; offset += BASE64_CHUNK_SIZE) {
    binary += String.fromCharCode(...bytes.subarray(offset, offset + BASE64_CHUNK_SIZE));
  }
  return btoa(binary);
}

/** Maps a path's extension to a MIME type, falling back to `image/jpeg`. */
function guessImageMimeType(filePath: string): string {
  const extension = filePath.slice(filePath.lastIndexOf('.') + 1).toLowerCase();
  return IMAGE_MIME_TYPES.get(extension) ?? FALLBACK_IMAGE_MIME_TYPE;
}

Mobile Integration

// gui/tauri-app/src/components/MobileImageWizard.tsx
import { useState } from 'react';
import { MobileImageClient } from '../lib/mobileImageClient';
import { tauriApi } from '../lib/tauriApi';

/**
 * Wizard for the standalone mobile app: generates an image from the entered
 * prompt and stores it in the app data directory.
 */
export function MobileImageWizard() {
  const [apiKey, setApiKey] = useState('');
  const [prompt, setPrompt] = useState('');
  // Path of the most recently generated image, or null before the first run.
  const [generatedImage, setGeneratedImage] = useState<string | null>(null);

  const handleGenerate = async () => {
    // Nothing useful can be requested without both a key and a prompt.
    if (!apiKey || !prompt.trim()) return;

    const client = new MobileImageClient({ apiKey });

    try {
      const imageBuffer = await client.createImage(prompt);

      // Save to mobile app data directory
      const appDataDir = await tauriApi.path.appDataDir();
      // The timestamp keeps successive generations from overwriting each other.
      const imagePath = await tauriApi.path.join(appDataDir, `generated_${Date.now()}.png`);

      await tauriApi.fs.writeFile(imagePath, imageBuffer);

      // Update UI with generated image
      setGeneratedImage(imagePath);
    } catch (error) {
      console.error('Generation failed:', error);
    }
  };

  return (
    <div className="mobile-image-wizard">
      {/* Mobile-optimized UI */}
    </div>
  );
}

Libraries:

  • @tauri-apps/plugin-http - HTTP requests
  • @tauri-apps/plugin-fs - File system operations
  • @tauri-apps/plugin-os - Platform detection
  • Existing React/TypeScript stack

3. Web App - Browser with Backend API

Architecture: Browser frontend + backend API server, configurable endpoints

Web browsers enforce CORS restrictions and cannot keep an API key secret: anything shipped to the client is visible to the user. A backend service therefore makes the Google API calls, and the frontend is configured with that service's endpoint.

Backend API Server (Hono)

// web/api/imageServer.ts
import { Hono } from 'hono';
import { cors } from 'hono/cors';
import { GoogleGenerativeAI } from '@google/generative-ai';
import { z } from 'zod';

const app = new Hono();

// Origins allowed to call the API: local dev servers plus the deployed
// frontend, which falls back to a placeholder when FRONTEND_URL is unset.
const allowedOrigins = [
  'http://localhost:3000',
  'http://localhost:5173', // Vite dev
  process.env.FRONTEND_URL || 'https://your-app.example.com',
];

// CORS applies to every route on the app.
app.use(
  '/*',
  cors({
    origin: allowedOrigins,
    allowHeaders: ['Content-Type', 'Authorization', 'X-API-Key'],
    allowMethods: ['POST', 'GET', 'OPTIONS'],
  })
);

// Request schemas — the fields shared by both endpoints are declared once.
const promptField = z.string().min(1);
const modelField = z.string().default('gemini-2.5-flash-image-preview');
const userApiKeyField = z.string().optional(); // User-provided API key

// One source image: base64 payload plus its MIME type.
const inlineImageField = z.object({
  data: z.string(), // base64
  mimeType: z.string(),
});

/** Body accepted by POST /api/images/create. */
const CreateImageSchema = z.object({
  prompt: promptField,
  model: modelField,
  userApiKey: userApiKeyField,
});

/** Body accepted by POST /api/images/edit. */
const EditImageSchema = z.object({
  prompt: promptField,
  images: z.array(inlineImageField),
  model: modelField,
  userApiKey: userApiKeyField,
});

// Helper that picks the API key for a request.
// Order of precedence: key supplied in the request body, then the
// GOOGLE_GENERATIVE_AI_KEY environment variable, then the tenant lookup keyed
// by the X-Tenant-ID header. The lookup only runs when the first two are empty.
const resolveApiKey = async (c: any, userApiKey?: string) => {
  if (userApiKey) {
    return userApiKey;
  }

  const environmentKey = process.env.GOOGLE_GENERATIVE_AI_KEY;
  if (environmentKey) {
    return environmentKey;
  }

  return await getTenantApiKey(c.req.header('X-Tenant-ID'));
};

/**
 * POST /api/images/create — generates an image from a text prompt.
 *
 * Responses:
 *   200 `{ success: true, image: { data, mimeType } }` (data is base64)
 *   400 body is not JSON or fails `CreateImageSchema`
 *   401 no API key could be resolved
 *   500 the model returned no image, or an unexpected error occurred
 */
app.post('/api/images/create', async (c) => {
  try {
    // A body that is not valid JSON is a client error: map it to `undefined`
    // so that schema validation rejects it with a 400 instead of a 500.
    const body: unknown = await c.req.json().catch(() => undefined);
    const parsed = CreateImageSchema.safeParse(body);
    if (!parsed.success) {
      const reason = parsed.error.issues
        .map((issue) => `${issue.path.join('.') || 'body'}: ${issue.message}`)
        .join('; ');
      return c.json({ success: false, error: `Invalid request: ${reason}` }, 400);
    }
    const { prompt, model, userApiKey } = parsed.data;

    const apiKey = await resolveApiKey(c, userApiKey);
    if (!apiKey) {
      return c.json({ success: false, error: 'No API key available' }, 401);
    }

    const genAI = new GoogleGenerativeAI(apiKey);
    const modelClient = genAI.getGenerativeModel({ model });

    const result = await modelClient.generateContent(prompt);
    const response = await result.response;

    // The first candidate may mix text and image parts; pick the image part.
    const inline = response.candidates?.[0]?.content?.parts?.find(
      (part) => 'inlineData' in part
    )?.inlineData;

    if (!inline?.data) {
      return c.json({ success: false, error: 'No image data in response' }, 500);
    }

    return c.json({
      success: true,
      image: {
        data: inline.data,
        mimeType: inline.mimeType || 'image/png'
      }
    });
  } catch (error) {
    console.error('Create image error:', error);
    return c.json({
      success: false,
      error: error instanceof Error ? error.message : 'Unknown error'
    }, 500);
  }
});

/**
 * POST /api/images/edit — generates an image from a prompt plus source images.
 *
 * Responses:
 *   200 `{ success: true, image: { data, mimeType } }` (data is base64)
 *   400 body is not JSON or fails `EditImageSchema`
 *   401 no API key could be resolved
 *   500 the model returned no image, or an unexpected error occurred
 */
app.post('/api/images/edit', async (c) => {
  try {
    // A body that is not valid JSON is a client error: map it to `undefined`
    // so that schema validation rejects it with a 400 instead of a 500.
    const body: unknown = await c.req.json().catch(() => undefined);
    const parsed = EditImageSchema.safeParse(body);
    if (!parsed.success) {
      const reason = parsed.error.issues
        .map((issue) => `${issue.path.join('.') || 'body'}: ${issue.message}`)
        .join('; ');
      return c.json({ success: false, error: `Invalid request: ${reason}` }, 400);
    }
    const { prompt, images, model, userApiKey } = parsed.data;

    const apiKey = await resolveApiKey(c, userApiKey);
    if (!apiKey) {
      return c.json({ success: false, error: 'No API key available' }, 401);
    }

    const genAI = new GoogleGenerativeAI(apiKey);
    const modelClient = genAI.getGenerativeModel({ model });

    // Prompt first, then every source image in the order the client sent them.
    const parts = [
      { text: prompt },
      ...images.map((img) => ({
        inlineData: {
          mimeType: img.mimeType,
          data: img.data
        }
      }))
    ];

    // The SDK's Content type requires a role on each entry.
    const result = await modelClient.generateContent({
      contents: [{ role: 'user', parts }]
    });
    const response = await result.response;

    // The first candidate may mix text and image parts; pick the image part.
    const inline = response.candidates?.[0]?.content?.parts?.find(
      (part) => 'inlineData' in part
    )?.inlineData;

    if (!inline?.data) {
      return c.json({ success: false, error: 'No image data in response' }, 500);
    }

    return c.json({
      success: true,
      image: {
        data: inline.data,
        mimeType: inline.mimeType || 'image/png'
      }
    });
  } catch (error) {
    console.error('Edit image error:', error);
    return c.json({
      success: false,
      error: error instanceof Error ? error.message : 'Unknown error'
    }, 500);
  }
});

// Health check: reports liveness together with the current server time (ISO 8601).
app.get('/api/health', (c) =>
  c.json({
    status: 'ok',
    timestamp: new Date().toISOString(),
  })
);

export default app;

Web Client

// web/client/webImageClient.ts
/** Connection settings for `WebImageClient`. */
export interface WebImageClientConfig {
  /** Base URL of the backend; the client appends `/api/images/...` to it. */
  endpoint: string;
  /** Sent to the backend as `userApiKey` in the request body. */
  apiKey?: string; // Optional user API key
  /** When set, sent to the backend in the `X-Tenant-ID` header. */
  tenantId?: string;
}

/** Image payload returned by the backend's `image` response field. */
export interface ImageResult {
  /** Image bytes encoded as base64 (no `data:` URL prefix). */
  data: string; // base64
  /** MIME type of the encoded image, e.g. `image/png`. */
  mimeType: string;
}

/**
 * Browser-side client for the image backend. Both operations POST JSON to
 * `<endpoint>/api/images/...` and resolve with the returned image.
 */
export class WebImageClient {
  constructor(private config: WebImageClientConfig) {}

  /**
   * Requests a new image for a text prompt.
   *
   * @param prompt Text prompt forwarded to the backend.
   * @param model Optional model name; omitted from the request when undefined.
   * @throws Error carrying the server's `error` message, or `HTTP <status>`
   *   when the error response has no usable JSON body.
   */
  async createImage(prompt: string, model?: string): Promise<ImageResult> {
    return this.post('/api/images/create', {
      prompt,
      model,
      userApiKey: this.config.apiKey,
    });
  }

  /**
   * Requests an image derived from a prompt and one or more source files.
   *
   * @param prompt Text prompt forwarded to the backend.
   * @param imageFiles Files sent as base64 together with their `File.type`.
   * @param model Optional model name; omitted from the request when undefined.
   * @throws Error under the same conditions as {@link createImage}; file read
   *   failures propagate unchanged.
   */
  async editImage(prompt: string, imageFiles: File[], model?: string): Promise<ImageResult> {
    // Convert files to base64
    const images = await Promise.all(
      imageFiles.map(async (file) => ({
        data: await fileToBase64(file),
        mimeType: file.type,
      }))
    );

    return this.post('/api/images/edit', {
      prompt,
      images,
      model,
      userApiKey: this.config.apiKey,
    });
  }

  /** POSTs a JSON payload to the backend and unwraps the `image` field. */
  private async post(path: string, payload: Record<string, unknown>): Promise<ImageResult> {
    const { endpoint, tenantId } = this.config;

    const headers: Record<string, string> = { 'Content-Type': 'application/json' };
    if (tenantId) {
      headers['X-Tenant-ID'] = tenantId;
    }

    // Strip trailing slashes so "https://host/" does not produce "//api/...".
    const response = await fetch(`${endpoint.replace(/\/+$/, '')}${path}`, {
      method: 'POST',
      headers,
      body: JSON.stringify(payload),
    });

    // A gateway or proxy may answer with a non-JSON body. Treat that as "no
    // details" so the HTTP status below is reported instead of being masked.
    const data = (await response.json().catch(() => null)) as
      | { success?: boolean; error?: string; image?: ImageResult }
      | null;

    if (!response.ok) {
      throw new Error(data?.error || `HTTP ${response.status}`);
    }

    if (!data?.success || !data.image) {
      throw new Error(data?.error || 'Unknown server error');
    }

    return data.image;
  }
}

/**
 * Reads a file and resolves with its contents as a bare base64 string.
 *
 * @param file File to read through `FileReader.readAsDataURL`.
 * @returns The part of the data URL after the first comma, i.e. without the
 *   `data:<mime>;base64,` prefix.
 * @throws Error (as a rejection) when the read fails or the reader does not
 *   produce a data URL.
 */
async function fileToBase64(file: File): Promise<string> {
  return new Promise((resolve, reject) => {
    const reader = new FileReader();

    reader.onload = () => {
      const result = reader.result;
      const commaIndex = typeof result === 'string' ? result.indexOf(',') : -1;
      if (typeof result !== 'string' || commaIndex === -1) {
        reject(new Error(`Unexpected FileReader result for "${file.name}"`));
        return;
      }
      // Base64 never contains a comma, so everything after the first one is the payload.
      resolve(result.slice(commaIndex + 1));
    };

    // Reject with the reader's error rather than the raw event object.
    reader.onerror = () => {
      reject(reader.error ?? new Error(`Failed to read file "${file.name}"`));
    };

    reader.readAsDataURL(file);
  });
}

Web Frontend Integration

// web/components/WebImageWizard.tsx
import { useEffect, useState } from 'react';
import { WebImageClient } from '../client/webImageClient';

/**
 * Browser wizard: lets the user configure the backend endpoint and an optional
 * API key, then generates, previews and downloads an image.
 */
export function WebImageWizard() {
  const [client, setClient] = useState<WebImageClient | null>(null);
  const [endpoint, setEndpoint] = useState(process.env.REACT_APP_API_ENDPOINT || '');
  const [apiKey, setApiKey] = useState('');
  // Blob URL of the most recently generated image, or null before the first run.
  const [generatedImage, setGeneratedImage] = useState<string | null>(null);

  // Rebuild the client whenever the configuration changes. Clearing the
  // endpoint drops the client so requests never go to a stale address.
  useEffect(() => {
    setClient(endpoint ? new WebImageClient({ endpoint, apiKey: apiKey || undefined }) : null);
  }, [endpoint, apiKey]);

  // Blob URLs keep their data alive until revoked: release the previous one
  // when a new image replaces it and the last one on unmount.
  useEffect(() => {
    return () => {
      if (generatedImage) {
        URL.revokeObjectURL(generatedImage);
      }
    };
  }, [generatedImage]);

  const handleGenerate = async (prompt: string) => {
    if (!client) return;

    try {
      const result = await client.createImage(prompt);

      // Create blob URL for display
      const bytes = Uint8Array.from(atob(result.data), (ch) => ch.charCodeAt(0));
      const blob = new Blob([bytes], { type: result.mimeType });

      const imageUrl = URL.createObjectURL(blob);
      setGeneratedImage(imageUrl);

      // Optionally trigger download; the extension follows the returned MIME type.
      const extension = result.mimeType.split('/')[1] || 'png';
      const link = document.createElement('a');
      link.href = imageUrl;
      link.download = `generated_${Date.now()}.${extension}`;
      link.click();
    } catch (error) {
      console.error('Generation failed:', error);
    }
  };

  return (
    <div className="web-image-wizard">
      <div className="config-section">
        <input
          type="url"
          placeholder="API Endpoint"
          value={endpoint}
          onChange={(e) => setEndpoint(e.target.value)}
        />
        <input
          type="password"
          placeholder="API Key (optional)"
          value={apiKey}
          onChange={(e) => setApiKey(e.target.value)}
        />
      </div>
      {/* Rest of UI */}
    </div>
  );
}

Libraries:

  • Backend: hono, hono/cors, @google/generative-ai, zod
  • Frontend: React/Vue/Svelte, standard web APIs
  • Deployment: Bun, Node.js, or serverless (Vercel, Netlify Functions)

Configuration Schema Extension

// shared/config/imageConfig.ts
/**
 * Image-generation settings grouped by platform. Every section is optional,
 * so a consumer only fills in the parts relevant to where it runs.
 */
export interface ImageConfig {
  // Existing CLI config
  cli?: {
    /** Model name for the CLI flow. */
    model?: string;
    /** NOTE(review): numeric log level — presumably tslog's scale; confirm. */
    logLevel?: number;
  };
  
  // Mobile-specific config
  mobile?: {
    /** Model name for the mobile flow. */
    model?: string;
    /** Directory for cached images (how it is used is not shown here). */
    cacheDir?: string;
    /** NOTE(review): unit (bytes vs. pixels) is not defined here — confirm. */
    maxImageSize?: number;
  };
  
  // Web-specific config
  web?: {
    /** Backend base URL; the only required field in this section. */
    apiEndpoint: string;
    /** Tenant identifier for the web backend. */
    tenantId?: string;
    /** Whether users may supply their own API key. */
    allowUserApiKeys?: boolean;
    /** NOTE(review): presumably an upload limit in bytes — confirm. */
    maxFileSize?: number;
  };
  
  // Shared Google AI config
  google?: {
    /** Google Generative AI API key. */
    key?: string; // For CLI and mobile
    /** Model name to use when a platform section does not set one. */
    defaultModel?: string;
  };
}

Platform Detection and Unified Interface

// shared/lib/unifiedImageClient.ts
/**
 * Common surface of the platform-specific image clients.
 *
 * The result type differs by implementation: either raw image bytes
 * (`Buffer`) or an `ImageResult` with base64 data, so callers must handle
 * both shapes.
 */
export interface UnifiedImageGenerator {
  /** Generates an image from a text prompt. */
  createImage(prompt: string, options?: any): Promise<Buffer | ImageResult>;
  /**
   * Generates an image from a prompt plus source images, given either as
   * file paths (`string[]`) or as browser `File` objects.
   */
  editImage(prompt: string, images: string[] | File[], options?: any): Promise<Buffer | ImageResult>;
}

/**
 * Creates the image client that matches the current runtime.
 *
 * Detection order:
 *   1. no `window` global             → Node.js CLI client
 *   2. Tauri globals present on window → Tauri (mobile/desktop) client
 *   3. anything else                   → browser client talking to the backend
 *
 * Each implementation is loaded with a dynamic import, so a platform only
 * loads its own client module.
 *
 * @param config Platform sections; only the one matching the runtime (plus
 *   `google`) is read.
 * @returns A client implementing `UnifiedImageGenerator`.
 */
export async function createImageClient(config: ImageConfig): Promise<UnifiedImageGenerator> {
  // Detect platform
  if (typeof window === 'undefined') {
    // Node.js CLI environment
    const { CLIImageClient } = await import('./cliImageClient');
    return new CLIImageClient(config.cli, config.google);
  }

  // Tauri v2 always injects `__TAURI_INTERNALS__`; `__TAURI__` only exists
  // when `withGlobalTauri` is enabled. Checking both covers either setup.
  const runsInsideTauri = '__TAURI_INTERNALS__' in window || '__TAURI__' in window;

  if (runsInsideTauri) {
    // Tauri mobile/desktop environment
    const { MobileImageClient } = await import('./mobileImageClient');
    return new MobileImageClient({
      apiKey: config.google?.key ?? '',
      // Honour the configured model; the client applies its own default when undefined.
      model: config.mobile?.model ?? config.google?.defaultModel,
    });
  }

  // Web browser environment
  const { WebImageClient } = await import('./webImageClient');
  return new WebImageClient({
    endpoint: config.web?.apiEndpoint ?? '',
    tenantId: config.web?.tenantId,
  });
}

Summary

This architecture provides:

  1. CLI Desktop: Maintains current efficient Node.js-based approach
  2. Mobile: Leverages Tauri HTTP plugin for direct API calls from TypeScript
  3. Web: Secure backend API with configurable endpoints and tenant support

Each platform optimizes for its constraints while sharing common TypeScript interfaces and configuration schemas. The next step is to break this down into actionable implementation tasks.