/**
 * Vision-Proxy Extension — v6 (Robuste Bildpfad-Erkennung)
 *
 * Änderungen v5 → v6:
 * - Erkennt Bildpfade ÜBERALL im Prompt (nicht nur /tmp/pi-clipboard-*)
 * - Unterstützt: absolute Pfade, relative Pfade, file:// URLs
 * - Erkennung per Dateiendung (.png, .jpg, .jpeg, .gif, .webp, .bmp, .tiff, .tif)
 * - Prüft Datei-Existenz vor Verarbeitung
 * - Maximal 5 Bilder pro Nachricht (Schutz vor Overload)
 * - Bessere Fehlerbehandlung und Logging
 *
 * v5 bleibt als vision-proxy-v5-backup.ts erhalten
 */
|
|||
|
|
|
|||
|
|
import { complete, type Message } from "@earendil-works/pi-ai";
|
|||
|
|
import * as fs from "node:fs";
|
|||
|
|
import * as path from "node:path";
|
|||
|
|
import type { ExtensionAPI } from "@earendil-works/pi-coding-agent";
|
|||
|
|
|
|||
|
|
// File that log() appends its timestamped lines to.
const LOG_FILE = "/tmp/vision-proxy.log";
// Provider name passed to ctx.modelRegistry.find() when looking up the vision model.
const VISION_PROVIDER = "openrouter";
// Preferred vision model id; a list of fallback ids is tried when it is not registered.
const VISION_MODEL_ID = "qwen/qwen3-vl-32b-instruct";
// Maximum number of image files taken from a prompt per message.
const MAX_IMAGES = 5;
|
|||
|
|
|
|||
|
|
/**
 * Appends one line to LOG_FILE, prefixed with the current UTC time of day
 * (HH:MM:SS.mmm, taken from the ISO timestamp).
 *
 * Logging is best-effort: a failed write is ignored on purpose so that it can
 * never break the extension.
 */
function log(msg: string) {
  const isoNow = new Date().toISOString();
  // Characters 11..22 of an ISO timestamp are the "HH:MM:SS.mmm" part.
  const timeOfDay = isoNow.slice(11, 23);
  try {
    fs.appendFileSync(LOG_FILE, "[" + timeOfDay + "] " + msg + "\n");
  } catch {
    // Intentionally ignored — see function comment.
  }
}
|
|||
|
|
|
|||
|
|
// System prompt (German) sent with every vision-model request. It asks for a
// detailed description of the image, an exact reproduction of all readable
// text, code and numbers, structured output for tables and diagrams, and no
// introduction.
const VISION_SYSTEM_PROMPT = `Du bist ein Bildanalyse-Assistent. Beschreibe das angehängte Bild detailliert und präzise.
Gib ALLES lesbar wieder — jeden Text, jeden Code, jede Zahl exakt. Bei Fehlermeldungen: Zeichen-exakte Wiedergabe.
Bei Tabellen, Diagrammen oder Strukturen: Gib den Inhalt strukturiert wieder.
Keine Einleitung — direkt den Inhalt beschreiben.`;
|
|||
|
|
|
|||
|
|
// Lower-case file extensions (including the leading dot) that isImagePath() treats as images.
const IMAGE_EXTENSIONS = new Set([".png", ".jpg", ".jpeg", ".gif", ".webp", ".bmp", ".tiff", ".tif"]);
|
|||
|
|
|
|||
|
|
/**
 * Tells whether a path ends in one of the extensions listed in
 * IMAGE_EXTENSIONS. The comparison ignores case; the file itself is not
 * touched.
 *
 * @param filePath Path (or any string) whose extension is inspected.
 * @returns true when the extension is a known image extension.
 */
function isImagePath(filePath: string): boolean {
  return IMAGE_EXTENSIONS.has(path.extname(filePath).toLowerCase());
}
|
|||
|
|
|
|||
|
|
/**
 * Derives an image MIME type from a path's extension (case-insensitive).
 *
 * @param filePath Path whose extension is inspected; the file is not read.
 * @returns The matching MIME type, or "image/png" for an unknown or missing
 *          extension.
 */
function detectMimeType(filePath: string): string {
  switch (path.extname(filePath).toLowerCase()) {
    case ".jpg":
    case ".jpeg":
      return "image/jpeg";
    case ".gif":
      return "image/gif";
    case ".webp":
      return "image/webp";
    case ".bmp":
      return "image/bmp";
    case ".tiff":
    case ".tif":
      return "image/tiff";
    case ".png":
    default:
      // Unknown extensions deliberately share the PNG fallback.
      return "image/png";
  }
}
|
|||
|
|
|
|||
|
|
/**
 * Collects candidate image paths mentioned in a prompt.
 *
 * Candidates are recognised purely by syntax and file extension
 * (png, jpg, jpeg, gif, webp, bmp, tif, tiff — case-insensitive). This function
 * does NOT check whether a candidate exists; callers must do that themselves.
 *
 * Recognised forms, in the order their results are emitted:
 *  1. `file://` URLs (percent-decoded; an optional `localhost` authority is dropped)
 *  2. Absolute paths without whitespace, e.g. `/home/user/bild.png`
 *  3. Absolute paths in single or double quotes (may contain spaces)
 *  4. A single-line prompt that, as a whole, is one path (paste-only case)
 *  5. Relative paths without whitespace, resolved against `cwd`
 *
 * Every candidate is resolved against `cwd` and normalised before
 * de-duplication, so different spellings of the same file
 * (`///tmp/a.png`, `/tmp/../tmp/a.png`) yield a single entry.
 *
 * @param prompt Raw prompt text to scan.
 * @param cwd    Directory used to resolve relative candidates.
 * @returns Unique, normalised absolute candidate paths in discovery order.
 */
function extractImagePaths(prompt: string, cwd: string): string[] {
  // Single source for the extension list; `(?!\w)` stops "a.pngx" from
  // being read as "a.png".
  const ext = String.raw`\.(?:png|jpe?g|gif|webp|bmp|tiff?)(?!\w)`;
  // One whitespace-free, unquoted run of characters.
  const token = String.raw`[^\s"'<>]+`;

  const found: string[] = [];
  const seen = new Set<string>();
  const add = (candidate: string): void => {
    // path.resolve() both anchors relative candidates at cwd and normalises
    // ("//", "..", "."), which makes the de-duplication reliable.
    const resolved = path.resolve(cwd, candidate);
    if (!seen.has(resolved)) {
      seen.add(resolved);
      found.push(resolved);
    }
  };

  // Pattern 1: file:// URLs
  const fileUrlPattern = new RegExp(`file://(${token}${ext})`, "gi");
  for (const m of prompt.matchAll(fileUrlPattern)) {
    const raw = m[1].replace(/^localhost(?=\/)/i, "");
    let decoded = raw;
    try {
      decoded = decodeURIComponent(raw);
    } catch {
      // Malformed percent-escape: fall back to the raw text instead of
      // letting a URIError abort the whole scan.
    }
    add(decoded);
  }

  // Pattern 2: absolute paths. The lookbehind rejects a "/" that is merely
  // the inside of a longer token (relative path, "~/..."), and the "//" that
  // follows a URL scheme, so URLs and relative paths no longer produce bogus
  // absolute candidates.
  const absPathPattern = new RegExp(String.raw`(?<![\w.~/-]|:(?=//))(/${token}${ext})`, "gi");
  for (const m of prompt.matchAll(absPathPattern)) {
    add(m[1]);
  }

  // Pattern 3: quoted absolute paths (may contain spaces). Accepted spans are
  // blanked out in `masked` so the relative scan below does not pick up their
  // trailing fragment (e.g. "pics/a.png" from "/home/my pics/a.png").
  const quotedPathPattern = new RegExp(String.raw`["']([^"']+${ext})["']`, "gi");
  let masked = prompt;
  for (const m of prompt.matchAll(quotedPathPattern)) {
    if (m[1].startsWith("/")) {
      add(m[1]);
      masked = masked.replace(m[0], " ");
    }
  }

  // Pattern 4: the prompt itself is one path (paste-only, may contain spaces).
  const trimmed = prompt.trim();
  const endsWithImageExt = new RegExp(`${ext}$`, "i");
  if (!/[\r\n]/.test(trimmed) && !trimmed.includes("://") && endsWithImageExt.test(trimmed)) {
    add(trimmed);
  }

  // Pattern 5: relative paths. Absolute tokens were handled above and
  // anything carrying a URL scheme is not a local path.
  const relPathPattern = new RegExp(`(${token}${ext})`, "gi");
  for (const m of masked.matchAll(relPathPattern)) {
    const p = m[1];
    if (p.startsWith("/") || p.includes("://")) continue;
    add(p);
  }

  return found;
}
|
|||
|
|
|
|||
|
|
/**
 * Extension entry point: registers the Vision-Proxy handlers on the given
 * extension API.
 *
 * - `session_start`: shows a notification that the proxy is loaded.
 * - `before_agent_start`: looks for images (existing image files mentioned in
 *   the prompt, plus `event.images`), lets a vision model describe each one and
 *   returns the descriptions as a `vision-proxy` custom message. It returns
 *   nothing when there is nothing to analyse, when no vision model or API key
 *   is available, or when an unexpected error occurs (which is logged and
 *   shown in the UI).
 *
 * At most MAX_IMAGES images are analysed per message; file images take
 * precedence over event images. A failure on one image does not stop the
 * others — it is reported in that image's section of the result.
 */
export default function (pi: ExtensionAPI) {
  log("=== Vision-Proxy v6 factory started ===");

  // Caught values are `unknown`; only Error instances are guaranteed to carry a message.
  const errorText = (err: unknown): string => (err instanceof Error ? err.message : String(err));

  pi.on("session_start", async (_event, ctx) => {
    log("v6: session_start");
    try {
      ctx.ui.notify("🔧 Vision-Proxy v6 (robuste Bildpfad-Erkennung) geladen", "info");
    } catch (e: unknown) {
      log(`v6: session_start notify ERROR: ${errorText(e)}`);
    }
  });

  pi.on("before_agent_start", async (event, ctx) => {
    log("--- v6: before_agent_start ---");
    log(`prompt length: ${event.prompt?.length ?? "undefined"}`);
    log(`prompt first 200: ${(event.prompt || "").substring(0, 200)}`);

    try {
      // Step 1: images handed over directly by the host (may be absent).
      const allEventImages = event.images ?? [];
      log(`event.images: ${allEventImages.length > 0 ? `array[${allEventImages.length}]` : "none"}`);

      // Step 2: scan the prompt for image files that actually exist.
      const prompt = event.prompt || "";
      const cwd = ctx.cwd || process.cwd();
      const rawPaths = extractImagePaths(prompt, cwd);
      const imagePaths = rawPaths
        .filter((p) => {
          try {
            return fs.statSync(p).isFile();
          } catch {
            // Missing or inaccessible candidate — simply not an image for us.
            return false;
          }
        })
        .slice(0, MAX_IMAGES);
      log(`Found ${rawPaths.length} path candidates, ${imagePaths.length} valid: ${JSON.stringify(imagePaths)}`);

      // MAX_IMAGES limits the whole message, so event images only get the
      // budget the file images left over.
      const eventImages = allEventImages.slice(0, Math.max(0, MAX_IMAGES - imagePaths.length));
      if (eventImages.length < allEventImages.length) {
        log(`Image limit ${MAX_IMAGES} reached — skipping ${allEventImages.length - eventImages.length} event image(s)`);
      }

      // Step 3: short-circuit when there is nothing to analyse.
      if (imagePaths.length === 0 && eventImages.length === 0) {
        log("No images found — exiting");
        return;
      }

      // Step 4: preferred vision model first, then the fallbacks in order.
      let visionModel = ctx.modelRegistry.find(VISION_PROVIDER, VISION_MODEL_ID);
      if (!visionModel) {
        const alternatives = ["qwen3-vl:235b-instruct", "gemini-3-flash-preview", "kimi-k2.6"];
        for (const alt of alternatives) {
          visionModel = ctx.modelRegistry.find(VISION_PROVIDER, alt);
          if (visionModel) {
            log(`Fallback vision model: ${visionModel.id}`);
            break;
          }
        }
      }
      if (!visionModel) {
        log("ERROR: No vision model found — exiting");
        return;
      }
      // Bound to a const so the "model exists" narrowing survives inside the closure below.
      const model = visionModel;
      log(`Using vision model: ${model.id}`);

      // Step 5: credentials for the chosen model.
      const auth = await ctx.modelRegistry.getApiKeyAndHeaders(model);
      if (!auth.ok || !auth.apiKey) {
        log(`ERROR: No API key (error: ${auth.error || "none"}) — exiting`);
        return;
      }
      log("Auth OK");
      const requestOptions = { apiKey: auth.apiKey, headers: auth.headers, signal: ctx.signal };

      // Step 6: UI feedback.
      const theme = ctx.ui.theme;
      const totalImages = imagePaths.length + eventImages.length;
      const statusId = "vision-proxy";
      ctx.ui.setStatus(statusId, theme.fg("warning", "🔍 Analysiere Bild..."));
      ctx.ui.notify(theme.fg("warning", `🔍 ${totalImages} Bild(er) werden analysiert...`), "info");

      // Step 7: analyse the images one after another.
      const instruction = `Analysiere dieses Bild. Gib alle sichtbaren Texte, Code, Zahlen und Strukturen exakt wieder.`;
      const descriptions: string[] = [];
      let failures = 0;

      // Runs one vision request and appends either the description or an
      // error section. `buildMessage` is called inside the try so that a file
      // read error is reported per image, like a failed request.
      const analyze = async (label: string, source: string, buildMessage: () => Message): Promise<void> => {
        try {
          const response = await complete(
            model,
            { systemPrompt: VISION_SYSTEM_PROMPT, messages: [buildMessage()] },
            requestOptions,
          );
          const descriptionText = response.content
            .filter((c): c is { type: "text"; text: string } => c.type === "text")
            .map((c) => c.text)
            .join("\n")
            .trim();
          log(`Description received (${descriptionText.length} chars)`);
          if (descriptionText === "") {
            // An empty answer is useless to the agent; report it as a failure.
            throw new Error("Leere Antwort vom Vision-Modell");
          }
          descriptions.push(`### ${label}\n${descriptionText}`);
        } catch (imgErr: unknown) {
          failures++;
          log(`ERROR processing ${source}: ${errorText(imgErr)}`);
          descriptions.push(`### ${label}\n⚠️ Fehler: ${errorText(imgErr)}`);
        }
      };

      // 7a: file-based images.
      for (let i = 0; i < imagePaths.length; i++) {
        if (ctx.signal?.aborted) {
          log("Aborted — skipping remaining images");
          break;
        }
        const imgPath = imagePaths[i];
        const label = totalImages > 1 ? `Bild ${i + 1} (${path.basename(imgPath)})` : "Bild";
        log(`Processing file: ${imgPath}`);

        await analyze(label, imgPath, () => {
          const imageBuffer = fs.readFileSync(imgPath);
          const mimeType = detectMimeType(imgPath);
          log(`Read ${imageBuffer.length} bytes, mime=${mimeType}`);
          return {
            role: "user",
            content: [
              { type: "text", text: instruction },
              { type: "image", data: imageBuffer.toString("base64"), mimeType },
            ],
            timestamp: Date.now(),
          };
        });
      }

      // 7b: event-based images (in case the host passes them).
      for (let i = 0; i < eventImages.length; i++) {
        if (ctx.signal?.aborted) {
          log("Aborted — skipping remaining images");
          break;
        }
        const img = eventImages[i];
        const label = `Event-Bild ${descriptions.length + 1}`;
        log(`Processing event image ${i + 1}`);

        await analyze(label, "event image", () => ({
          role: "user",
          content: [{ type: "text", text: instruction }, img],
          timestamp: Date.now(),
        }));
      }

      // Step 8: report and return the result.
      ctx.ui.setStatus(statusId, undefined);
      if (descriptions.length === 0) {
        log("No descriptions generated — exiting");
        return;
      }

      if (failures === 0) {
        ctx.ui.notify(theme.fg("success", `✅ Bildanalyse abgeschlossen (${descriptions.length} Bild(er))`), "info");
      } else {
        // Do not announce success when some or all images could not be analysed.
        const succeeded = descriptions.length - failures;
        ctx.ui.notify(
          theme.fg("warning", `⚠️ Bildanalyse: ${succeeded} von ${descriptions.length} Bild(ern) analysiert, ${failures} fehlgeschlagen`),
          succeeded === 0 ? "error" : "info",
        );
      }

      const visionContext = [
        `== VISION-PROXY v6: Bildanalyse ==`,
        `${descriptions.length} Bild${descriptions.length > 1 ? "er wurden" : " wurde"} durch ${model.id} analysiert.`,
        ``,
        ...descriptions,
        `== ENDE VISION-PROXY ==`,
      ].join("\n");

      log(`Returning vision context (${visionContext.length} chars)`);
      return {
        message: {
          customType: "vision-proxy",
          content: visionContext,
          display: true,
          details: { imageCount: descriptions.length },
        },
      };
    } catch (err: unknown) {
      // UI calls are guarded individually: a broken UI must not hide the log entry.
      try { ctx.ui.setStatus("vision-proxy", ctx.ui.theme.fg("error", "❌ Bildanalyse fehlgeschlagen")); } catch {}
      try { ctx.ui.notify(ctx.ui.theme.fg("error", `❌ Bildanalyse fehlgeschlagen: ${errorText(err)}`), "error"); } catch {}
      log(`FATAL ERROR: ${errorText(err)}\n${err instanceof Error ? err.stack : ""}`);
    }
  });

  log("=== Vision-Proxy v6 factory complete ===");
}
|