// pi-system/extensions/vision-proxy.ts

/**
 * Vision-Proxy Extension v6 (robust image-path detection)
 *
 * Changes in v6 compared to v5:
 * - Detects image paths ANYWHERE in the prompt (not only /tmp/pi-clipboard-*)
 * - Supports absolute paths, relative paths and file:// URLs
 * - Detection by file extension (.png, .jpg, .jpeg, .gif, .webp, .bmp, .tiff, .tif)
 * - Checks that the file exists before processing it
 * - At most 5 image files per message (overload protection)
 * - Better error handling and logging
 *
 * v5 is kept as vision-proxy-v5-backup.ts
 */
import { complete, type Message } from "@earendil-works/pi-ai";
import * as fs from "node:fs";
import * as path from "node:path";
import type { ExtensionAPI } from "@earendil-works/pi-coding-agent";
// File that log() appends its timestamped debug lines to.
const LOG_FILE = "/tmp/vision-proxy.log";
// Provider name passed to ctx.modelRegistry.find() when looking up the vision model.
const VISION_PROVIDER = "openrouter";
// Preferred vision model id; the handler tries a list of alternatives if it is not registered.
const VISION_MODEL_ID = "qwen/qwen3-vl-32b-instruct";
// Upper bound on how many image files found in the prompt are analysed per message.
const MAX_IMAGES = 5;
/**
 * Appends one timestamped line to LOG_FILE.
 *
 * The timestamp is the HH:MM:SS.mmm part of the current ISO time string.
 * Logging is best effort: any write error is swallowed on purpose so that
 * a broken log file can never break the extension.
 */
function log(msg: string) {
  const stamp = new Date().toISOString().slice(11, 23);
  try {
    fs.appendFileSync(LOG_FILE, `[${stamp}] ${msg}\n`);
  } catch {
    // Intentionally ignored: logging must never throw.
  }
}
// System prompt (German) passed as `systemPrompt` to every vision-model request.
// It asks for a detailed description that reproduces visible text, code and
// numbers verbatim. The text is runtime data and is kept exactly as written.
const VISION_SYSTEM_PROMPT = `Du bist ein Bildanalyse-Assistent. Beschreibe das angehängte Bild detailliert und präzise.
Gib ALLES lesbar wieder jeden Text, jeden Code, jede Zahl exakt. Bei Fehlermeldungen: Zeichen-exakte Wiedergabe.
Bei Tabellen, Diagrammen oder Strukturen: Gib den Inhalt strukturiert wieder.
Keine Einleitung direkt den Inhalt beschreiben.`;
/** Lower-case file extensions (including the leading dot) treated as images. */
const IMAGE_EXTENSIONS = new Set([
  ".png",
  ".jpg",
  ".jpeg",
  ".gif",
  ".webp",
  ".bmp",
  ".tiff",
  ".tif",
]);

/**
 * Tells whether a path looks like an image, judged only by its extension.
 *
 * @param filePath Path or file name to inspect; the file is not touched.
 * @returns true when the (case-insensitive) extension is in IMAGE_EXTENSIONS.
 */
function isImagePath(filePath: string): boolean {
  return IMAGE_EXTENSIONS.has(path.extname(filePath).toLowerCase());
}
/**
 * Maps a file path to an image MIME type based on its extension.
 *
 * @param filePath Path or file name; only the extension is looked at.
 * @returns The matching image MIME type, or "image/png" for any extension
 *          that is not recognised (including no extension at all).
 */
function detectMimeType(filePath: string): string {
  switch (path.extname(filePath).toLowerCase()) {
    case ".jpg":
    case ".jpeg":
      return "image/jpeg";
    case ".gif":
      return "image/gif";
    case ".webp":
      return "image/webp";
    case ".bmp":
      return "image/bmp";
    case ".tiff":
    case ".tif":
      return "image/tiff";
    case ".png":
    default:
      // PNG doubles as the fallback for unknown extensions.
      return "image/png";
  }
}
/**
 * Extracts candidate image paths from the prompt text.
 *
 * Recognised forms (all matched by image file extension):
 * - file:// URLs (percent-decoded)
 * - absolute paths, e.g. /home/user/picture.png or /tmp/pi-clipboard-xxx.png
 * - absolute paths in single or double quotes (may contain spaces)
 * - a prompt that consists of nothing but an image path
 * - relative paths, resolved against `cwd`
 *
 * Every candidate is resolved to a normalised absolute path before it is
 * de-duplicated, so different spellings of the same file (for example
 * "/tmp/a.png" and "///tmp/a.png", which both arise from "file:///tmp/a.png")
 * are reported once. This function does NOT check whether a candidate
 * exists on disk; callers are expected to filter the result.
 *
 * @param prompt Raw prompt text to scan.
 * @param cwd    Directory that relative paths are resolved against.
 * @returns Unique absolute candidate paths in order of first appearance.
 */
function extractImagePaths(prompt: string, cwd: string): string[] {
  const found: string[] = [];
  const seen = new Set<string>();

  // Resolve + normalise, then record the candidate if it is new.
  const addCandidate = (candidate: string): void => {
    const resolved = path.resolve(cwd, candidate);
    if (!seen.has(resolved)) {
      seen.add(resolved);
      found.push(resolved);
    }
  };

  // decodeURIComponent throws URIError on malformed escapes such as a bare
  // "%"; fall back to the raw text instead of aborting the whole scan.
  const safeDecode = (encoded: string): string => {
    try {
      return decodeURIComponent(encoded);
    } catch {
      return encoded;
    }
  };

  // Tokens such as "https://…" or "file://…" are URLs, not relative paths.
  const hasUrlScheme = (text: string): boolean => /^[a-z][a-z\d+.-]*:\/\//i.test(text);

  // Pattern 1: file:// URLs
  for (const m of prompt.matchAll(/file:\/\/([^\s"'<>]+)/g)) {
    const raw = m[1];
    if (!raw) continue;
    const decoded = safeDecode(raw);
    if (isImagePath(decoded)) addCandidate(decoded);
  }

  // Pattern 2: absolute paths with an image extension (Linux/Mac)
  for (const m of prompt.matchAll(/(\/[^\s"'<>]+\.(?:png|jpe?g|gif|webp|bmp|tiff?))/gi)) {
    const raw = m[1];
    if (raw) addCandidate(raw);
  }

  // Pattern 3: quoted absolute paths (may contain spaces)
  for (const m of prompt.matchAll(/["']([^"']+\.(?:png|jpe?g|gif|webp|bmp|tiff?))["']/gi)) {
    const raw = m[1];
    if (raw && raw.startsWith("/")) addCandidate(raw);
  }

  // Pattern 4: the prompt itself is an image path (paste-only)
  const trimmed = prompt.trim();
  if (isImagePath(trimmed) && !hasUrlScheme(trimmed)) {
    addCandidate(trimmed);
  }

  // Pattern 5: relative paths, resolved against cwd
  for (const m of prompt.matchAll(/([^\s"'<>]+\.(?:png|jpe?g|gif|webp|bmp|tiff?))/gi)) {
    const raw = m[1];
    if (raw && !raw.startsWith("/") && !hasUrlScheme(raw)) addCandidate(raw);
  }

  return found;
}
/**
 * Vision-Proxy extension entry point.
 *
 * Registers two handlers on the extension API:
 * - `session_start`: shows a "loaded" notification.
 * - `before_agent_start`: collects image files referenced in the prompt
 *   (plus any images attached to the event), asks a vision model to describe
 *   each one, and returns the descriptions as a custom "vision-proxy" message.
 *   The handler returns nothing when there are no images, no vision model,
 *   or no API key.
 *
 * Errors raised inside the `before_agent_start` body are caught, logged and
 * reported through the UI; a failure for a single image is recorded in that
 * image's description instead of aborting the remaining images.
 *
 * @param pi Extension API used to subscribe to events.
 */
export default function (pi: ExtensionAPI) {
  // Thrown values are not guaranteed to be Error instances; never read
  // `.message` off an unknown value directly.
  const errorMessage = (err: unknown): string =>
    err instanceof Error ? err.message : String(err);

  log("=== Vision-Proxy v6 factory started ===");

  pi.on("session_start", async (_event, ctx) => {
    log("v6: session_start");
    try {
      ctx.ui.notify("🔧 Vision-Proxy v6 (robuste Bildpfad-Erkennung) geladen", "info");
    } catch (e: unknown) {
      log(`v6: session_start notify ERROR: ${errorMessage(e)}`);
    }
  });

  pi.on("before_agent_start", async (event, ctx) => {
    log("--- v6: before_agent_start ---");
    log(`prompt length: ${event.prompt?.length ?? "undefined"}`);
    log(`prompt first 200: ${(event.prompt || "").substring(0, 200)}`);
    try {
      // Step 1: images attached to the event itself (if the host passes any)
      const eventImages = event.images ?? [];
      const hasEventImages = eventImages.length > 0;
      log(`event.images: ${hasEventImages ? `array[${eventImages.length}]` : "none"}`);

      // Step 2: scan the prompt for image files and keep only real files
      const prompt = event.prompt || "";
      const cwd = ctx.cwd || process.cwd();
      const rawPaths = extractImagePaths(prompt, cwd);
      const imagePaths = rawPaths
        .filter((p) => {
          try {
            return fs.existsSync(p) && fs.statSync(p).isFile();
          } catch {
            return false;
          }
        })
        .slice(0, MAX_IMAGES);
      log(`Found ${rawPaths.length} path candidates, ${imagePaths.length} valid: ${JSON.stringify(imagePaths)}`);

      // Step 3: short-circuit when there is nothing to analyse
      if (imagePaths.length === 0 && !hasEventImages) {
        log("No images found — exiting");
        return;
      }

      // Step 4: locate a vision model, trying alternatives if the preferred one is missing
      let visionModel = ctx.modelRegistry.find(VISION_PROVIDER, VISION_MODEL_ID);
      if (!visionModel) {
        const alternatives = ["qwen3-vl:235b-instruct", "gemini-3-flash-preview", "kimi-k2.6"];
        for (const alt of alternatives) {
          visionModel = ctx.modelRegistry.find(VISION_PROVIDER, alt);
          if (visionModel) {
            log(`Fallback vision model: ${visionModel.id}`);
            break;
          }
        }
      }
      if (!visionModel) {
        log("ERROR: No vision model found — exiting");
        return;
      }
      // Captured as const so the non-null narrowing survives inside the helper closure.
      const model = visionModel;
      log(`Using vision model: ${model.id}`);

      // Step 5: API key
      const auth = await ctx.modelRegistry.getApiKeyAndHeaders(model);
      if (!auth.ok || !auth.apiKey) {
        log(`ERROR: No API key (error: ${auth.error || "none"}) — exiting`);
        return;
      }
      const apiKey = auth.apiKey;
      const headers = auth.headers;
      log("Auth OK");

      // Step 6: UI feedback
      const theme = ctx.ui.theme;
      const totalImages = imagePaths.length + eventImages.length;
      const statusId = "vision-proxy";
      ctx.ui.setStatus(statusId, theme.fg("warning", "🔍 Analysiere Bild..."));
      ctx.ui.notify(theme.fg("warning", `🔍 ${totalImages} Bild(er) werden analysiert...`), "info");

      // Instruction sent alongside every image.
      const analyzeInstruction = `Analysiere dieses Bild. Gib alle sichtbaren Texte, Code, Zahlen und Strukturen exakt wieder.`;

      // Sends one user message to the vision model and returns the joined text parts of the reply.
      const describeImage = async (userMessage: Message): Promise<string> => {
        const response = await complete(
          model,
          { systemPrompt: VISION_SYSTEM_PROMPT, messages: [userMessage] },
          { apiKey, headers, signal: ctx.signal },
        );
        return response.content
          .filter((c): c is { type: "text"; text: string } => c.type === "text")
          .map((c) => c.text)
          .join("\n");
      };

      // Step 7: analyse the images one after another
      const descriptions: string[] = [];

      // 7a: file-based images
      for (const [i, imgPath] of imagePaths.entries()) {
        const label = totalImages > 1 ? `Bild ${i + 1} (${path.basename(imgPath)})` : "Bild";
        log(`Processing file: ${imgPath}`);
        try {
          const imageBuffer = fs.readFileSync(imgPath);
          const mimeType = detectMimeType(imgPath);
          log(`Read ${imageBuffer.length} bytes, mime=${mimeType}`);
          const descriptionText = await describeImage({
            role: "user",
            content: [
              { type: "text", text: analyzeInstruction },
              { type: "image", data: imageBuffer.toString("base64"), mimeType },
            ],
            timestamp: Date.now(),
          });
          log(`Description received (${descriptionText.length} chars)`);
          descriptions.push(`### ${label}\n${descriptionText.trim()}`);
        } catch (imgErr: unknown) {
          const reason = errorMessage(imgErr);
          log(`ERROR processing ${imgPath}: ${reason}`);
          descriptions.push(`### ${label}\n⚠ Fehler: ${reason}`);
        }
      }

      // 7b: event-based images (in case the host passes them)
      for (const [i, img] of eventImages.entries()) {
        const label = `Event-Bild ${descriptions.length + 1}`;
        log(`Processing event image ${i + 1}`);
        try {
          const descriptionText = await describeImage({
            role: "user",
            content: [{ type: "text", text: analyzeInstruction }, img],
            timestamp: Date.now(),
          });
          descriptions.push(`### ${label}\n${descriptionText.trim()}`);
        } catch (imgErr: unknown) {
          const reason = errorMessage(imgErr);
          log(`ERROR processing event image: ${reason}`);
          descriptions.push(`### ${label}\n⚠ Fehler: ${reason}`);
        }
      }

      // Step 8: hand the result back
      ctx.ui.setStatus(statusId, undefined);
      if (descriptions.length === 0) {
        log("No descriptions generated — exiting");
        return;
      }
      ctx.ui.notify(theme.fg("success", `✅ Bildanalyse abgeschlossen (${descriptions.length} Bild(er))`), "info");
      const visionContext = [
        `== VISION-PROXY v6: Bildanalyse ==`,
        `${descriptions.length} Bild${descriptions.length > 1 ? "er wurden" : " wurde"} durch ${model.id} analysiert.`,
        ``,
        ...descriptions,
        `== ENDE VISION-PROXY ==`,
      ].join("\n");
      log(`Returning vision context (${visionContext.length} chars)`);
      return {
        message: {
          customType: "vision-proxy",
          content: visionContext,
          display: true,
          details: { imageCount: descriptions.length },
        },
      };
    } catch (err: unknown) {
      const reason = errorMessage(err);
      // The UI itself may be what failed, so each UI call is guarded separately.
      try { ctx.ui.setStatus("vision-proxy", ctx.ui.theme.fg("error", "❌ Bildanalyse fehlgeschlagen")); } catch {}
      try { ctx.ui.notify(ctx.ui.theme.fg("error", `❌ Bildanalyse fehlgeschlagen: ${reason}`), "error"); } catch {}
      log(`FATAL ERROR: ${reason}\n${err instanceof Error ? err.stack : ""}`);
    }
  });

  log("=== Vision-Proxy v6 factory complete ===");
}