/**
 * Vision-Proxy Extension — v6 (robust image path detection)
 *
 * Changes from v5 to v6:
 * - Detects image paths ANYWHERE in the prompt (not only /tmp/pi-clipboard-*)
 * - Supports absolute paths, relative paths and file:// URLs
 * - Detection by file extension (.png, .jpg, .jpeg, .gif, .webp, .bmp, .tiff, .tif)
 * - Checks that the file exists before processing
 * - At most 5 file-based images per message (overload protection)
 * - Better error handling and logging
 *
 * v5 is kept as vision-proxy-v5-backup.ts
 */
import { complete, type Message } from "@earendil-works/pi-ai";
import * as fs from "node:fs";
import * as path from "node:path";
import type { ExtensionAPI } from "@earendil-works/pi-coding-agent";

const LOG_FILE = "/tmp/vision-proxy.log";
const VISION_PROVIDER = "openrouter";
const VISION_MODEL_ID = "qwen/qwen3-vl-32b-instruct";
const MAX_IMAGES = 5;
const STATUS_ID = "vision-proxy";

/**
 * Appends a timestamped line (time of day, millisecond precision) to LOG_FILE.
 * Best-effort: a failing write is ignored so logging can never break the extension.
 */
function log(msg: string) {
  const ts = new Date().toISOString().substring(11, 23);
  const line = `[${ts}] ${msg}\n`;
  try {
    fs.appendFileSync(LOG_FILE, line);
  } catch {
    // Deliberately ignored — there is nowhere else to report a logging failure.
  }
}

/** Extracts a printable message from an arbitrary thrown value. */
function errorMessage(err: unknown): string {
  return err instanceof Error ? err.message : String(err);
}

const VISION_SYSTEM_PROMPT = `Du bist ein Bildanalyse-Assistent. Beschreibe das angehängte Bild detailliert und präzise. Gib ALLES lesbar wieder — jeden Text, jeden Code, jede Zahl exakt. Bei Fehlermeldungen: Zeichen-exakte Wiedergabe. Bei Tabellen, Diagrammen oder Strukturen: Gib den Inhalt strukturiert wieder. 
Keine Einleitung — direkt den Inhalt beschreiben.`;

/** Instruction sent as the text part of every vision request (file-based and event-based). */
const VISION_USER_PROMPT =
  "Analysiere dieses Bild. Gib alle sichtbaren Texte, Code, Zahlen und Strukturen exakt wieder.";

const IMAGE_EXTENSIONS = new Set([".png", ".jpg", ".jpeg", ".gif", ".webp", ".bmp", ".tiff", ".tif"]);

/** Lower-cased file extension → MIME type. */
const MIME_TYPES: Record<string, string> = {
  ".png": "image/png",
  ".jpg": "image/jpeg",
  ".jpeg": "image/jpeg",
  ".gif": "image/gif",
  ".webp": "image/webp",
  ".bmp": "image/bmp",
  ".tiff": "image/tiff",
  ".tif": "image/tiff",
};

/** Matches strings that start with a URL scheme such as `https://` or `file://`. */
const URL_SCHEME = /^[a-z][a-z0-9+.-]*:\/\//i;

/** Characters that, directly before a `/`, indicate the slash sits inside a relative path. */
const PATH_CHAR = /[\p{L}\p{N}_.~-]/u;

/** Returns true when the path ends in one of the known image extensions (case-insensitive). */
function isImagePath(filePath: string): boolean {
  const ext = path.extname(filePath).toLowerCase();
  return IMAGE_EXTENSIONS.has(ext);
}

/** Maps a file path to its image MIME type by extension; unknown extensions fall back to "image/png". */
function detectMimeType(filePath: string): string {
  const ext = path.extname(filePath).toLowerCase();
  return MIME_TYPES[ext] ?? "image/png";
}

/**
 * Extracts image path candidates from the prompt text.
 *
 * Recognises:
 * - file:// URLs
 * - absolute paths, e.g. /home/user/image.png or /tmp/pi-clipboard-xxx.png
 * - absolute paths in quotes (may contain spaces)
 * - a prompt that consists solely of an image path (paste-only)
 * - relative paths, resolved against `cwd`
 *
 * Every candidate is resolved to an absolute, normalised path and de-duplicated
 * on that form, so the same file referenced in several ways is returned once.
 * Existence is NOT checked here; the caller filters non-existent candidates.
 *
 * @param prompt Raw prompt text to scan.
 * @param cwd    Directory relative candidates are resolved against.
 * @returns Candidate paths in order of discovery.
 */
function extractImagePaths(prompt: string, cwd: string): string[] {
  const found: string[] = [];
  const seen = new Set<string>();

  const addCandidate = (candidate: string): void => {
    // path.resolve leaves absolute paths absolute and collapses "//", "." and "..".
    const resolved = path.resolve(cwd, candidate);
    if (!seen.has(resolved)) {
      seen.add(resolved);
      found.push(resolved);
    }
  };

  // Pattern 1: file:// URLs
  for (const m of prompt.matchAll(/file:\/\/([^\s"'<>]+)/g)) {
    const raw = m[1];
    if (raw === undefined) continue;
    let decoded = raw;
    try {
      decoded = decodeURIComponent(raw);
    } catch {
      // Malformed percent-escape (e.g. a literal "%" in the file name): use the raw text.
    }
    // "file://localhost/path" denotes the same local file as "file:///path".
    decoded = decoded.replace(/^localhost(?=\/)/, "");
    if (isImagePath(decoded)) addCandidate(decoded);
  }

  // Pattern 2: absolute paths with an image extension (Linux/Mac)
  for (const m of prompt.matchAll(/(\/[^\s"'<>]+\.(?:png|jpe?g|gif|webp|bmp|tiff?))/gi)) {
    const p = m[1];
    if (p === undefined) continue;
    const start = m.index ?? 0;
    const before = start > 0 ? prompt.charAt(start - 1) : "";
    // "scheme://…": the authority part of a URL, not a local path
    // (file:// URLs are handled by pattern 1).
    if (before === ":" && p.startsWith("//")) continue;
    // The slash is in the middle of a relative path ("dir/img.png"); pattern 5 handles it.
    if (PATH_CHAR.test(before)) continue;
    addCandidate(p);
  }

  // Pattern 3: absolute paths in quotes (may contain spaces)
  for (const m of prompt.matchAll(/["']([^"']+\.(?:png|jpe?g|gif|webp|bmp|tiff?))["']/gi)) {
    const p = m[1];
    if (p !== undefined && p.startsWith("/")) addCandidate(p);
  }

  // Pattern 4: the prompt itself is an image path (paste-only, may contain spaces)
  const trimmed = prompt.trim();
  if (isImagePath(trimmed) && !URL_SCHEME.test(trimmed)) {
    addCandidate(trimmed);
  }

  // Pattern 5: relative paths, resolved against cwd
  for (const m of prompt.matchAll(/([^\s"'<>]+\.(?:png|jpe?g|gif|webp|bmp|tiff?))/gi)) {
    const p = m[1];
    if (p === undefined) continue;
    // Absolute paths were handled above; URLs are not local files.
    if (p.startsWith("/") || URL_SCHEME.test(p)) continue;
    addCandidate(p);
  }

  return found;
}

/**
 * Extension factory. Registers two handlers on `pi`:
 * - `session_start`: shows a "loaded" notification.
 * - `before_agent_start`: scans the prompt for image files (and `event.images`),
 *   has each image described by a vision model and returns the descriptions as
 *   a custom "vision-proxy" message. Returns nothing when there is no image,
 *   no vision model, or no API key.
 */
export default function (pi: ExtensionAPI) {
  log("=== Vision-Proxy v6 factory started ===");

  pi.on("session_start", async (_event, ctx) => {
    log("v6: session_start");
    try {
      ctx.ui.notify("🔧 Vision-Proxy v6 (robuste Bildpfad-Erkennung) geladen", "info");
    } catch (e: unknown) {
      log(`v6: session_start notify ERROR: ${errorMessage(e)}`);
    }
  });

  pi.on("before_agent_start", async (event, ctx) => {
    log("--- v6: before_agent_start ---");
    log(`prompt length: ${event.prompt?.length ?? "undefined"}`);
    log(`prompt first 200: ${(event.prompt || "").substring(0, 200)}`);

    try {
      // Step 1: check event.images (in case Pi passes them along)
      const eventImages = event.images ?? [];
      const hasEventImages = eventImages.length > 0;
      log(`event.images: ${hasEventImages ? `array[${eventImages.length}]` : "none"}`);

      // Step 2: scan the prompt for image files
      const prompt = event.prompt || "";
      const cwd = ctx.cwd || process.cwd();
      const rawPaths = extractImagePaths(prompt, cwd);
      const imagePaths = rawPaths
        .filter((p) => {
          try {
            return fs.existsSync(p) && fs.statSync(p).isFile();
          } catch {
            return false;
          }
        })
        .slice(0, MAX_IMAGES);
      log(`Found ${rawPaths.length} path candidates, ${imagePaths.length} valid: ${JSON.stringify(imagePaths)}`);

      // Step 3: short-circuit — no images
      if (imagePaths.length === 0 && !hasEventImages) {
        log("No images found — exiting");
        return;
      }

      // Step 4: look up the vision model, trying fallbacks if the preferred one is missing
      let visionModel = ctx.modelRegistry.find(VISION_PROVIDER, VISION_MODEL_ID);
      if (!visionModel) {
        const alternatives = ["qwen3-vl:235b-instruct", "gemini-3-flash-preview", "kimi-k2.6"];
        for (const alt of alternatives) {
          visionModel = ctx.modelRegistry.find(VISION_PROVIDER, alt);
          if (visionModel) {
            log(`Fallback vision model: ${visionModel.id}`);
            break;
          }
        }
      }
      if (!visionModel) {
        log("ERROR: No vision model found — exiting");
        return;
      }
      // Bind to a const so the non-null narrowing survives inside the closure below.
      const model = visionModel;
      log(`Using vision model: ${model.id}`);

      // Step 5: API key
      const auth = await ctx.modelRegistry.getApiKeyAndHeaders(model);
      if (!auth.ok || !auth.apiKey) {
        log(`ERROR: No API key (error: ${auth.error || "none"}) — exiting`);
        return;
      }
      log("Auth OK");

      // Sends one user message to the vision model and returns the joined text parts of the reply.
      const describeImage = async (userMessage: Message): Promise<string> => {
        const response = await complete(
          model,
          { systemPrompt: VISION_SYSTEM_PROMPT, messages: [userMessage] },
          { apiKey: auth.apiKey, headers: auth.headers, signal: ctx.signal },
        );
        return response.content
          .filter((c): c is { type: "text"; text: string } => c.type === "text")
          .map((c) => c.text)
          .join("\n");
      };

      // Step 6: UI feedback
      const theme = ctx.ui.theme;
      const totalImages = imagePaths.length + eventImages.length;
      ctx.ui.setStatus(STATUS_ID, theme.fg("warning", "🔍 Analysiere Bild..."));
      ctx.ui.notify(theme.fg("warning", `🔍 ${totalImages} Bild(er) werden analysiert...`), "info");

      // Step 7: analyse the images; a failure on one image is recorded and does not stop the rest
      const descriptions: string[] = [];

      // 7a: file-based images
      for (const [i, imgPath] of imagePaths.entries()) {
        const label = totalImages > 1 ? `Bild ${i + 1} (${path.basename(imgPath)})` : "Bild";
        log(`Processing file: ${imgPath}`);
        try {
          const imageBuffer = fs.readFileSync(imgPath);
          const base64Data = imageBuffer.toString("base64");
          const mimeType = detectMimeType(imgPath);
          log(`Read ${imageBuffer.length} bytes, mime=${mimeType}`);

          const userMessage: Message = {
            role: "user",
            content: [
              { type: "text", text: VISION_USER_PROMPT },
              { type: "image", data: base64Data, mimeType },
            ],
            timestamp: Date.now(),
          };
          const descriptionText = await describeImage(userMessage);
          log(`Description received (${descriptionText.length} chars)`);
          descriptions.push(`### ${label}\n${descriptionText.trim()}`);
        } catch (imgErr: unknown) {
          const message = errorMessage(imgErr);
          log(`ERROR processing ${imgPath}: ${message}`);
          descriptions.push(`### ${label}\n⚠️ Fehler: ${message}`);
        }
      }

      // 7b: event-based images (in case Pi passes them along one day)
      for (const [i, img] of eventImages.entries()) {
        const label = `Event-Bild ${descriptions.length + 1}`;
        log(`Processing event image ${i + 1}`);
        try {
          const userMessage: Message = {
            role: "user",
            content: [{ type: "text", text: VISION_USER_PROMPT }, img],
            timestamp: Date.now(),
          };
          const descriptionText = await describeImage(userMessage);
          descriptions.push(`### ${label}\n${descriptionText.trim()}`);
        } catch (imgErr: unknown) {
          const message = errorMessage(imgErr);
          log(`ERROR processing event image: ${message}`);
          descriptions.push(`### ${label}\n⚠️ Fehler: ${message}`);
        }
      }

      // Step 8: return the result
      ctx.ui.setStatus(STATUS_ID, undefined);
      if (descriptions.length === 0) {
        log("No descriptions generated — exiting");
        return;
      }
      ctx.ui.notify(
        theme.fg("success", `✅ Bildanalyse abgeschlossen (${descriptions.length} Bild(er))`),
        "info",
      );

      const visionContext = [
        `== VISION-PROXY v6: Bildanalyse ==`,
        `${descriptions.length} Bild${descriptions.length > 1 ? "er wurden" : " wurde"} durch ${model.id} analysiert.`,
        ``,
        ...descriptions,
        `== ENDE VISION-PROXY ==`,
      ].join("\n");
      log(`Returning vision context (${visionContext.length} chars)`);

      return {
        message: {
          customType: "vision-proxy",
          content: visionContext,
          display: true,
          details: { imageCount: descriptions.length },
        },
      };
    } catch (err: unknown) {
      const message = errorMessage(err);
      // UI calls are individually guarded: reporting the failure must not throw again.
      try {
        ctx.ui.setStatus(STATUS_ID, ctx.ui.theme.fg("error", "❌ Bildanalyse fehlgeschlagen"));
      } catch {}
      try {
        ctx.ui.notify(ctx.ui.theme.fg("error", `❌ Bildanalyse fehlgeschlagen: ${message}`), "error");
      } catch {}
      log(`FATAL ERROR: ${message}\n${err instanceof Error ? err.stack : undefined}`);
    }
  });

  log("=== Vision-Proxy v6 factory complete ===");
}