// pi-system/extensions/vision-proxy.ts

/**
 * Vision-Proxy Extension — v6 (robust image-path detection)
 *
 * Changes v5 → v6:
 * - Detects image paths ANYWHERE in the prompt (not only /tmp/pi-clipboard-*)
 * - Supports absolute paths, relative paths and file:// URLs
 * - Detection by file extension (.png, .jpg, .jpeg, .gif, .webp, .bmp, .tiff, .tif)
 * - Checks that the file exists before processing it
 * - At most 5 image files per message (protection against overload)
 * - Better error handling and logging
 *
 * v5 is kept as vision-proxy-v5-backup.ts
 */

import { complete, type Message } from "@earendil-works/pi-ai";
import * as fs from "node:fs";
import * as path from "node:path";
import type { ExtensionAPI } from "@earendil-works/pi-coding-agent";

// File that log() appends its timestamped lines to.
const LOG_FILE = "/tmp/vision-proxy.log";
// Provider name passed to the model-registry lookups for the vision model.
const VISION_PROVIDER = "openrouter";
// Model id that is looked up first; the handler tries alternatives if it is not registered.
const VISION_MODEL_ID = "qwen/qwen3-vl-32b-instruct";
// Cap on the number of image files taken from a single prompt.
const MAX_IMAGES = 5;

/**
 * Appends one timestamped line to the log file.
 *
 * Logging is best-effort: a failed write is ignored so that diagnostics can
 * never break the extension itself.
 *
 * @param msg Text to record; a time-of-day prefix and a newline are added.
 */
function log(msg: string) {
  // Characters 11..22 of an ISO-8601 timestamp are the time of day "HH:MM:SS.mmm".
  const stamp = new Date().toISOString().slice(11, 23);
  const entry = "[" + stamp + "] " + msg + "\n";
  try {
    fs.appendFileSync(LOG_FILE, entry);
  } catch {
    // Deliberately ignored — see the function comment.
  }
}

// System prompt (German) sent with every vision request. It asks the model for a
// detailed description, character-exact reproduction of any text, code, numbers and
// error messages, structured output for tables/diagrams, and no introduction.
const VISION_SYSTEM_PROMPT = `Du bist ein Bildanalyse-Assistent. Beschreibe das angehängte Bild detailliert und präzise.
Gib ALLES lesbar wieder — jeden Text, jeden Code, jede Zahl exakt. Bei Fehlermeldungen: Zeichen-exakte Wiedergabe.
Bei Tabellen, Diagrammen oder Strukturen: Gib den Inhalt strukturiert wieder.
Keine Einleitung — direkt den Inhalt beschreiben.`;

// Lower-case file extensions (including the leading dot) that count as images.
const IMAGE_EXTENSIONS = new Set([
  ".png",
  ".jpg",
  ".jpeg",
  ".gif",
  ".webp",
  ".bmp",
  ".tiff",
  ".tif",
]);

/**
 * Tells whether a path names an image, judged by its file extension only.
 *
 * The comparison is case-insensitive; the file itself is never opened.
 *
 * @param filePath Path or file name to inspect.
 * @returns `true` when the extension is one of IMAGE_EXTENSIONS.
 */
function isImagePath(filePath: string): boolean {
  const suffix = path.extname(filePath);
  return IMAGE_EXTENSIONS.has(suffix.toLowerCase());
}

/**
 * Maps a file path to an image MIME type based on its extension.
 *
 * The extension is compared case-insensitively. Anything that is not one of
 * the known image extensions — including a missing extension — yields
 * "image/png".
 *
 * @param filePath Path or file name whose extension is inspected.
 * @returns The MIME type string for the image.
 */
function detectMimeType(filePath: string): string {
  switch (path.extname(filePath).toLowerCase()) {
    case ".jpg":
    case ".jpeg":
      return "image/jpeg";
    case ".gif":
      return "image/gif";
    case ".webp":
      return "image/webp";
    case ".bmp":
      return "image/bmp";
    case ".tiff":
    case ".tif":
      return "image/tiff";
    default:
      // Covers ".png" as well as every unknown or empty extension.
      return "image/png";
  }
}

/**
 * Extracts candidate image paths from the prompt text.
 *
 * Recognised forms, in the order in which they are collected:
 * 1. `file://` URLs (percent-decoded)
 * 2. absolute paths, e.g. `/home/user/picture.png` or `/tmp/pi-clipboard-xxx.png`
 * 3. quoted absolute paths, which may contain spaces
 * 4. a prompt that consists of nothing but a path (paste-only)
 * 5. relative paths
 *
 * Every candidate is resolved against `cwd` and de-duplicated on the resolved
 * form, so the same file referenced twice (or in two notations) is returned
 * once. Text claimed by patterns 1–3 is blanked out before the later patterns
 * run, so a URL or a quoted path is never re-read as a second, bogus path.
 *
 * The function never touches the file system; callers must check existence.
 *
 * @param prompt Prompt text to scan.
 * @param cwd Directory that relative candidates are resolved against.
 * @returns Absolute, de-duplicated candidate paths in discovery order.
 */
function extractImagePaths(prompt: string, cwd: string): string[] {
  // Single source for the extension alternation used by every pattern below
  // (same extensions as IMAGE_EXTENSIONS).
  const ext = String.raw`\.(?:png|jpe?g|gif|webp|bmp|tiff?)`;
  const found: string[] = [];
  const seen = new Set<string>();
  // Working copy of the prompt; consumed matches are overwritten with spaces.
  let remaining = prompt;

  // Records a candidate once, keyed by its resolved absolute form.
  const add = (candidate: string): void => {
    const resolved = path.resolve(cwd, candidate);
    if (!seen.has(resolved)) {
      seen.add(resolved);
      found.push(resolved);
    }
  };

  // Runs a pattern over the remaining text. `toPath` turns capture group 1 into
  // a candidate, or returns undefined to leave the match untouched for later patterns.
  const consume = (pattern: RegExp, toPath: (captured: string) => string | undefined): void => {
    remaining = remaining.replace(pattern, (whole: string, captured: string) => {
      const candidate = toPath(captured);
      if (candidate === undefined) return whole;
      add(candidate);
      return " ".repeat(whole.length);
    });
  };

  const looksLikeUrl = (text: string): boolean => text.includes("://");

  // Pattern 1: file:// URLs
  consume(new RegExp(String.raw`file://([^\s"'<>]+${ext})`, "gi"), (raw) => {
    try {
      return decodeURIComponent(raw);
    } catch {
      // Malformed percent-encoding (e.g. a literal "%" in the name): use the raw text.
      return raw;
    }
  });

  // Pattern 2: absolute paths. The leading slash must sit at the start of the
  // text or directly after a delimiter (\x60 is the backtick), and must not be
  // doubled — otherwise the pattern would fire inside "https://host/a.png" or
  // in the middle of a relative path such as "foo/bar.png".
  consume(
    new RegExp(String.raw`(?<![^\s"'<>()\[\]{}=,;:\x60])(\/(?!\/)[^\s"'<>]+${ext})`, "gi"),
    (absolute) => absolute,
  );

  // Pattern 3: quoted absolute paths (may contain spaces)
  consume(new RegExp(String.raw`["']([^"']+${ext})["']`, "gi"), (quoted) =>
    quoted.startsWith("/") ? quoted : undefined,
  );

  // Pattern 4: the prompt itself is an image path (paste-only)
  const trimmed = prompt.trim();
  if (new RegExp(`^${ext}$`, "i").test(path.extname(trimmed)) && !looksLikeUrl(trimmed)) {
    add(trimmed);
  }

  // Pattern 5: relative paths in whatever text the earlier patterns left over
  for (const match of remaining.matchAll(new RegExp(String.raw`[^\s"'<>]+${ext}`, "gi"))) {
    const token = match[0];
    if (!token.startsWith("/") && !looksLikeUrl(token)) {
      add(token);
    }
  }

  return found;
}

/**
 * Extension entry point: registers the Vision-Proxy event handlers.
 *
 * - `session_start`: shows a notification that the extension is loaded.
 * - `before_agent_start`: collects images (files referenced in the prompt plus
 *   any `event.images`), has each one described by a vision model and returns
 *   the descriptions as a custom `vision-proxy` message. The handler returns
 *   nothing when there are no images, no vision model or no API key.
 *
 * At most MAX_IMAGES images are analysed per message; image files from the
 * prompt take precedence over `event.images`.
 *
 * @param pi Extension API handle supplied by the host.
 */
export default function (pi: ExtensionAPI) {
  log("=== Vision-Proxy v6 factory started ===");

  // Turns anything thrown into a printable message; thrown values need not be Error objects.
  const describeError = (err: unknown): string => (err instanceof Error ? err.message : String(err));

  pi.on("session_start", async (_event, ctx) => {
    log("v6: session_start");
    try {
      ctx.ui.notify("🔧 Vision-Proxy v6 (robuste Bildpfad-Erkennung) geladen", "info");
    } catch (e: unknown) {
      log(`v6: session_start notify ERROR: ${describeError(e)}`);
    }
  });

  pi.on("before_agent_start", async (event, ctx) => {
    log("--- v6: before_agent_start ---");
    log(`prompt length: ${event.prompt?.length ?? "undefined"}`);
    log(`prompt first 200: ${(event.prompt || "").substring(0, 200)}`);

    try {
      // Step 1: check event.images (in case the host passes them)
      const allEventImages = event.images ?? [];
      log(`event.images: ${allEventImages.length > 0 ? `array[${allEventImages.length}]` : "none"}`);

      // Step 2: scan the prompt for image files that actually exist
      const prompt = event.prompt || "";
      const cwd = ctx.cwd || process.cwd();
      const rawPaths = extractImagePaths(prompt, cwd);
      const imagePaths = rawPaths.filter((p) => {
        try { return fs.existsSync(p) && fs.statSync(p).isFile(); } catch { return false; }
      }).slice(0, MAX_IMAGES);
      log(`Found ${rawPaths.length} path candidates, ${imagePaths.length} valid: ${JSON.stringify(imagePaths)}`);

      // Event images only get what is left of the per-message budget.
      const eventImages = allEventImages.slice(0, Math.max(0, MAX_IMAGES - imagePaths.length));
      if (eventImages.length < allEventImages.length) {
        log(`Skipping ${allEventImages.length - eventImages.length} event image(s): limit of ${MAX_IMAGES} images reached`);
      }

      // Step 3: short-circuit — no images
      if (imagePaths.length === 0 && eventImages.length === 0) {
        log("No images found — exiting");
        return;
      }

      // Step 4: look up the vision model, trying alternatives if the primary is missing
      let visionModel = ctx.modelRegistry.find(VISION_PROVIDER, VISION_MODEL_ID);
      if (!visionModel) {
        const alternatives = ["qwen3-vl:235b-instruct", "gemini-3-flash-preview", "kimi-k2.6"];
        for (const alt of alternatives) {
          visionModel = ctx.modelRegistry.find(VISION_PROVIDER, alt);
          if (visionModel) { log(`Fallback vision model: ${visionModel.id}`); break; }
        }
      }

      if (!visionModel) {
        log("ERROR: No vision model found — exiting");
        return;
      }
      // Const alias so the narrowed (non-empty) type survives inside the closure below.
      const model = visionModel;
      log(`Using vision model: ${model.id}`);

      // Step 5: API key
      const auth = await ctx.modelRegistry.getApiKeyAndHeaders(model);
      if (!auth.ok || !auth.apiKey) {
        log(`ERROR: No API key (error: ${auth.error || "none"}) — exiting`);
        return;
      }
      const apiKey = auth.apiKey;
      const headers = auth.headers;
      log("Auth OK");

      // Step 6: UI feedback
      const theme = ctx.ui.theme;
      const totalImages = imagePaths.length + eventImages.length;
      const statusId = "vision-proxy";
      ctx.ui.setStatus(statusId, theme.fg("warning", "🔍 Analysiere Bild..."));
      ctx.ui.notify(theme.fg("warning", `🔍 ${totalImages} Bild(er) werden analysiert...`), "info");

      // Step 7: analyse the images
      const analysisRequest = `Analysiere dieses Bild. Gib alle sichtbaren Texte, Code, Zahlen und Strukturen exakt wieder.`;
      const descriptions: string[] = [];
      let failures = 0;

      // Sends one user message to the vision model and returns the joined text parts of the reply.
      const describeImage = async (userMessage: Message): Promise<string> => {
        const response = await complete(
          model,
          { systemPrompt: VISION_SYSTEM_PROMPT, messages: [userMessage] },
          { apiKey, headers, signal: ctx.signal },
        );
        return response.content
          .filter((c): c is { type: "text"; text: string } => c.type === "text")
          .map((c) => c.text)
          .join("\n");
      };

      // 7a: file-based images
      for (let i = 0; i < imagePaths.length; i++) {
        const imgPath = imagePaths[i];
        const label = totalImages > 1 ? `Bild ${i + 1} (${path.basename(imgPath)})` : "Bild";
        log(`Processing file: ${imgPath}`);

        try {
          const imageBuffer = fs.readFileSync(imgPath);
          const mimeType = detectMimeType(imgPath);
          log(`Read ${imageBuffer.length} bytes, mime=${mimeType}`);

          const userMessage: Message = {
            role: "user",
            content: [
              { type: "text", text: analysisRequest },
              { type: "image", data: imageBuffer.toString("base64"), mimeType },
            ],
            timestamp: Date.now(),
          };

          const descriptionText = await describeImage(userMessage);
          log(`Description received (${descriptionText.length} chars)`);

          descriptions.push(`### ${label}\n${descriptionText.trim()}`);
        } catch (imgErr: unknown) {
          // A failure of one image must not abort the others; it is reported in place.
          const reason = describeError(imgErr);
          failures++;
          log(`ERROR processing ${imgPath}: ${reason}`);
          descriptions.push(`### ${label}\n⚠️ Fehler: ${reason}`);
        }
      }

      // 7b: event-based images (in case the host passes them some day)
      for (let i = 0; i < eventImages.length; i++) {
        const label = `Event-Bild ${descriptions.length + 1}`;
        log(`Processing event image ${i + 1}`);

        try {
          const userMessage: Message = {
            role: "user",
            content: [
              { type: "text", text: analysisRequest },
              eventImages[i],
            ],
            timestamp: Date.now(),
          };

          const descriptionText = await describeImage(userMessage);
          descriptions.push(`### ${label}\n${descriptionText.trim()}`);
        } catch (imgErr: unknown) {
          const reason = describeError(imgErr);
          failures++;
          log(`ERROR processing event image: ${reason}`);
          descriptions.push(`### ${label}\n⚠️ Fehler: ${reason}`);
        }
      }

      // Step 8: return the result
      ctx.ui.setStatus(statusId, undefined);
      if (descriptions.length === 0) {
        log("No descriptions generated — exiting");
        return;
      }

      if (failures === 0) {
        ctx.ui.notify(theme.fg("success", `✅ Bildanalyse abgeschlossen (${descriptions.length} Bild(er))`), "info");
      } else {
        // Do not claim success when some (or all) images could not be analysed.
        log(`${failures} of ${descriptions.length} image(s) failed`);
        ctx.ui.notify(theme.fg("warning", `⚠️ Bildanalyse: ${failures} von ${descriptions.length} Bild(ern) fehlgeschlagen`), "info");
      }

      const visionContext = [
        `== VISION-PROXY v6: Bildanalyse ==`,
        `${descriptions.length} Bild${descriptions.length > 1 ? "er wurden" : " wurde"} durch ${model.id} analysiert.`,
        ``,
        ...descriptions,
        `== ENDE VISION-PROXY ==`,
      ].join("\n");

      log(`Returning vision context (${visionContext.length} chars)`);
      return {
        message: {
          customType: "vision-proxy",
          content: visionContext,
          display: true,
          details: { imageCount: descriptions.length },
        },
      };

    } catch (err: unknown) {
      // UI calls are individually guarded: reporting a failure must never throw out of the handler.
      const reason = describeError(err);
      try { ctx.ui.setStatus("vision-proxy", ctx.ui.theme.fg("error", "❌ Bildanalyse fehlgeschlagen")); } catch {}
      try { ctx.ui.notify(ctx.ui.theme.fg("error", `❌ Bildanalyse fehlgeschlagen: ${reason}`), "error"); } catch {}
      log(`FATAL ERROR: ${reason}\n${err instanceof Error ? err.stack : ""}`);
    }
  });

  log("=== Vision-Proxy v6 factory complete ===");
}