pi-system/extensions/vision-proxy.ts
Raimund Bauer fb3daab33f feat/init: PiSystem Infrastruktur-Repo mit SubConfirm
Enthält alle Pi-Orchestrator-Infrastrukturkomponenten:
- bin/Sub* Skripte (SubAgenten, SubStatus, SubWatcher, SubConfirm)
- extensions/ (arbeitsweise-guard, confirm-deletion, etc.)
- memory/ (arbeitsweise, subagent-autocheck)
- agent/AGENTS.md mit SubConfirm-Reaktionslogik
- install.sh: deterministisches, idempotentes Setup für neue Maschinen

SubConfirm (neu): Stasis-Detektor der alle 30s tmux-Sessions prüft.
Bei unverändertem Output sendet er den vollständigen Pane-Inhalt
an die Alert-Datei — der Orchestrator beurteilt selbst ob Handlung nötig.
Kein Keyword-Matching.
2026-06-02 11:53:37 +02:00

308 lines
11 KiB
TypeScript
Raw Permalink Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/**
* Vision-Proxy Extension — v6 (Robuste Bildpfad-Erkennung)
*
* Änderungen v5 → v6:
* - Erkennt Bildpfade ÜBERALL im Prompt (nicht nur /tmp/pi-clipboard-*)
* - Unterstützt: absolute Pfade, relative Pfade, file:// URLs
* - Erkennung per Dateiendung (.png, .jpg, .jpeg, .gif, .webp, .bmp, .tiff, .tif)
* - Prüft Datei-Existenz vor Verarbeitung
* - Maximal 5 Bilder pro Nachricht (Schutz vor Overload)
* - Bessere Fehlerbehandlung und Logging
*
* v5 bleibt als vision-proxy-v5-backup.ts erhalten
*/
import { complete, type Message } from "@earendil-works/pi-ai";
import * as fs from "node:fs";
import * as path from "node:path";
import type { ExtensionAPI } from "@earendil-works/pi-coding-agent";
// Path of the plain-text file that log() appends its lines to.
const LOG_FILE = "/tmp/vision-proxy.log";
// Provider name passed to the model registry when looking up the vision model.
const VISION_PROVIDER = "openrouter";
// Preferred vision model id; it is looked up first, before any fallback ids.
const VISION_MODEL_ID = "qwen/qwen3-vl-32b-instruct";
// Upper bound on the number of file-based images analysed per prompt.
const MAX_IMAGES = 5;
/**
 * Appends one timestamped line to the log file.
 *
 * The timestamp is the `HH:MM:SS.mmm` portion of the current ISO time.
 * Logging is best-effort: a failed write is ignored so that diagnostics
 * can never break the extension itself.
 *
 * @param msg Text to record.
 */
function log(msg: string) {
  const isoNow = new Date().toISOString();
  const clock = isoNow.slice(11, 23);
  try {
    fs.appendFileSync(LOG_FILE, "[" + clock + "] " + msg + "\n");
  } catch {
    // Intentionally ignored: the log file is optional.
  }
}
// System prompt (German) sent with every vision request: asks the model to
// describe the image in detail, reproduce all text/code/numbers exactly,
// render tables and diagrams in structured form, and skip any introduction.
const VISION_SYSTEM_PROMPT = `Du bist ein Bildanalyse-Assistent. Beschreibe das angehängte Bild detailliert und präzise.
Gib ALLES lesbar wieder — jeden Text, jeden Code, jede Zahl exakt. Bei Fehlermeldungen: Zeichen-exakte Wiedergabe.
Bei Tabellen, Diagrammen oder Strukturen: Gib den Inhalt strukturiert wieder.
Keine Einleitung — direkt den Inhalt beschreiben.`;
/** Lower-case file extensions (including the leading dot) treated as images. */
const IMAGE_EXTENSIONS = new Set([
  ".png",
  ".jpg",
  ".jpeg",
  ".gif",
  ".webp",
  ".bmp",
  ".tiff",
  ".tif",
]);

/**
 * Tells whether a path names an image, judged solely by its extension
 * (compared case-insensitively against IMAGE_EXTENSIONS).
 *
 * @param filePath Path or bare file name to inspect.
 * @returns `true` when the extension is a known image extension.
 */
function isImagePath(filePath: string): boolean {
  return IMAGE_EXTENSIONS.has(path.extname(filePath).toLowerCase());
}
/**
 * Maps a file path to an image MIME type based on its extension
 * (case-insensitive).
 *
 * @param filePath Path or file name of the image.
 * @returns The matching `image/*` MIME type; `image/png` when the
 *          extension is missing or not recognised.
 */
function detectMimeType(filePath: string): string {
  switch (path.extname(filePath).toLowerCase()) {
    case ".jpg":
    case ".jpeg":
      return "image/jpeg";
    case ".gif":
      return "image/gif";
    case ".webp":
      return "image/webp";
    case ".bmp":
      return "image/bmp";
    case ".tiff":
    case ".tif":
      return "image/tiff";
    case ".png":
    default:
      // Unknown extensions deliberately fall back to PNG.
      return "image/png";
  }
}
/**
 * Extracts candidate image file paths from the prompt text.
 *
 * Candidates are collected in this order (which is also the result order):
 *  1. `file://` URLs (percent-decoded; `file://localhost/…` is accepted)
 *  2. absolute paths ending in an image extension, e.g. `/home/user/pic.png`
 *  3. quoted absolute paths, which may contain spaces
 *  4. the whole trimmed prompt when it is itself an image path (paste-only)
 *  5. relative paths
 *
 * Every candidate is normalised with `path.resolve(cwd, …)` before it is
 * de-duplicated, so the same file reached through different spellings
 * (`file:///tmp/a.png`, `/tmp/a.png`, `a.png` with cwd `/tmp`) is returned once
 * and relative paths are always interpreted relative to `cwd`.
 *
 * This function only parses text; it does not touch the file system. Callers
 * are expected to check that the returned paths actually exist.
 *
 * @param prompt Raw prompt text to scan.
 * @param cwd    Directory against which relative paths are resolved.
 * @returns Absolute, normalised, de-duplicated candidate paths.
 */
function extractImagePaths(prompt: string, cwd: string): string[] {
  // Single source of truth for the recognised extensions (regex alternation).
  const extAlternation = "(?:png|jpe?g|gif|webp|bmp|tiff?)";
  const imageExtension = new RegExp(`^\\.${extAlternation}$`, "i");
  const hasImageExtension = (candidate: string): boolean =>
    imageExtension.test(path.extname(candidate));

  const found: string[] = [];
  const seen = new Set<string>();
  const addCandidate = (candidate: string): void => {
    const absolute = path.resolve(cwd, candidate);
    if (!seen.has(absolute)) {
      seen.add(absolute);
      found.push(absolute);
    }
  };
  // Replaces a span with spaces so later scans skip it while offsets stay stable.
  const blank = (text: string): string => " ".repeat(text.length);

  // Stage 1: file:// URLs.
  const fileUrlPattern = /file:\/\/([^\s"'<>]+)/g;
  for (const m of prompt.matchAll(fileUrlPattern)) {
    const raw = m[1] ?? "";
    let decoded: string;
    try {
      decoded = decodeURIComponent(raw);
    } catch {
      // Malformed percent-escape (e.g. "100%.png"): use the text as written
      // instead of aborting the whole scan with a URIError.
      decoded = raw;
    }
    if (decoded.startsWith("localhost/")) {
      // file://localhost/path is equivalent to file:///path.
      decoded = decoded.slice("localhost".length);
    }
    if (hasImageExtension(decoded)) {
      addCandidate(decoded);
    }
  }

  // URLs of any scheme are hidden from the plain-path scans below; otherwise
  // "file:///tmp/a.png" would be matched again as "///tmp/a.png" and
  // "https://host/a.png" as "//host/a.png".
  const withoutUrls = prompt.replace(/[a-z][a-z0-9+.-]*:\/\/[^\s"'<>]+/gi, blank);

  // Stage 2: absolute paths. The lookbehind requires that the leading slash
  // does not continue a path segment, so "images/foo.png" or "~/foo.png" no
  // longer yield a bogus "/foo.png".
  const absPathPattern = new RegExp(
    `(?<![\\p{L}\\p{N}_.~-])(\\/[^\\s"'<>]+\\.${extAlternation})`,
    "giu",
  );
  for (const m of withoutUrls.matchAll(absPathPattern)) {
    addCandidate(m[1] ?? "");
  }

  // Stage 3: quoted absolute paths (these may contain spaces).
  const quotedPathPattern = new RegExp(`["']([^"']+\\.${extAlternation})["']`, "gi");
  for (const m of withoutUrls.matchAll(quotedPathPattern)) {
    const inner = m[1] ?? "";
    if (inner.startsWith("/")) {
      addCandidate(inner);
    }
  }

  // Stage 4: the prompt itself is a path (paste-only). It is resolved against
  // cwd like every other candidate; a prompt that is a URL is left to stage 1.
  const trimmed = prompt.trim();
  if (
    trimmed !== "" &&
    !/^[a-z][a-z0-9+.-]*:\/\//i.test(trimmed) &&
    hasImageExtension(trimmed)
  ) {
    addCandidate(trimmed);
  }

  // Stage 5: relative paths. Quoted absolute paths are blanked first so that
  // the file name of "/dir/my pic.png" is not picked up again as "pic.png".
  const relScanText = withoutUrls.replace(
    quotedPathPattern,
    (whole: string, inner: string) => (inner.startsWith("/") ? blank(whole) : whole),
  );
  const relPathPattern = new RegExp(`([^\\s"'<>]+\\.${extAlternation})`, "gi");
  for (const m of relScanText.matchAll(relPathPattern)) {
    const token = m[1] ?? "";
    if (!token.startsWith("/")) {
      addCandidate(token);
    }
  }

  return found;
}
/**
 * Extension factory for the vision proxy.
 *
 * Registers two handlers on the extension API:
 *  - `session_start`: shows a "loaded" notification.
 *  - `before_agent_start`: looks for images referenced by the prompt (file
 *    paths found by extractImagePaths that exist on disk, plus any
 *    `event.images`), sends each one to a vision model and returns the
 *    collected descriptions as a custom `vision-proxy` message. When there is
 *    nothing to analyse, no vision model, or no API key, the handler returns
 *    nothing. Unexpected errors are logged and reported via the UI; they are
 *    never re-thrown.
 *
 * NOTE(review): MAX_IMAGES only caps file-based images; `event.images` are
 * processed without a limit, as before.
 *
 * @param pi Extension API used to subscribe to session events.
 */
export default function (pi: ExtensionAPI) {
  // Anything can be thrown; only Error instances carry a `.message`.
  const describeError = (err: unknown): string =>
    err instanceof Error ? err.message : String(err);

  log("=== Vision-Proxy v6 factory started ===");

  pi.on("session_start", async (_event, ctx) => {
    log("v6: session_start");
    try {
      ctx.ui.notify("🔧 Vision-Proxy v6 (robuste Bildpfad-Erkennung) geladen", "info");
    } catch (e: unknown) {
      log(`v6: session_start notify ERROR: ${describeError(e)}`);
    }
  });

  pi.on("before_agent_start", async (event, ctx) => {
    log("--- v6: before_agent_start ---");
    log(`prompt length: ${event.prompt?.length ?? "undefined"}`);
    log(`prompt first 200: ${(event.prompt || "").substring(0, 200)}`);
    try {
      // Step 1: images handed over directly with the event (if any).
      const eventImages = event.images ?? [];
      const hasEventImages = eventImages.length > 0;
      log(`event.images: ${hasEventImages ? `array[${eventImages.length}]` : "none"}`);

      // Step 2: scan the prompt for image files and keep those that exist.
      const prompt = event.prompt || "";
      const cwd = ctx.cwd || process.cwd();
      const rawPaths = extractImagePaths(prompt, cwd);
      const imagePaths = rawPaths
        .filter((candidate) => {
          // statSync throws for missing paths, so one call covers both
          // "exists" and "is a regular file".
          try {
            return fs.statSync(candidate).isFile();
          } catch {
            return false;
          }
        })
        .slice(0, MAX_IMAGES);
      log(`Found ${rawPaths.length} path candidates, ${imagePaths.length} valid: ${JSON.stringify(imagePaths)}`);

      // Step 3: short-circuit when there is nothing to analyse.
      if (imagePaths.length === 0 && !hasEventImages) {
        log("No images found — exiting");
        return;
      }

      // Step 4: locate the vision model, trying fallback ids if necessary.
      let visionModel = ctx.modelRegistry.find(VISION_PROVIDER, VISION_MODEL_ID);
      if (!visionModel) {
        const alternatives = ["qwen3-vl:235b-instruct", "gemini-3-flash-preview", "kimi-k2.6"];
        for (const alt of alternatives) {
          visionModel = ctx.modelRegistry.find(VISION_PROVIDER, alt);
          if (visionModel) {
            log(`Fallback vision model: ${visionModel.id}`);
            break;
          }
        }
      }
      if (!visionModel) {
        log("ERROR: No vision model found — exiting");
        return;
      }
      // Pinned in a const so the narrowed type survives inside the closure below.
      const model = visionModel;
      log(`Using vision model: ${model.id}`);

      // Step 5: credentials for the vision model.
      const auth = await ctx.modelRegistry.getApiKeyAndHeaders(model);
      if (!auth.ok || !auth.apiKey) {
        log(`ERROR: No API key (error: ${auth.error || "none"}) — exiting`);
        return;
      }
      const apiKey = auth.apiKey;
      const headers = auth.headers;
      log("Auth OK");

      // Step 6: UI feedback while the analysis is running.
      const theme = ctx.ui.theme;
      const totalImages = imagePaths.length + eventImages.length;
      const statusId = "vision-proxy";
      ctx.ui.setStatus(statusId, theme.fg("warning", "🔍 Analysiere Bild..."));
      ctx.ui.notify(theme.fg("warning", `🔍 ${totalImages} Bild(er) werden analysiert...`), "info");

      // Step 7: analyse the images one after another.
      const descriptions: string[] = [];
      const instruction = `Analysiere dieses Bild. Gib alle sichtbaren Texte, Code, Zahlen und Strukturen exakt wieder.`;

      // Sends one user message to the vision model and returns the text parts
      // of the answer joined by newlines. Shared by file and event images.
      const requestDescription = async (userMessage: Message): Promise<string> => {
        const response = await complete(
          model,
          { systemPrompt: VISION_SYSTEM_PROMPT, messages: [userMessage] },
          { apiKey, headers, signal: ctx.signal },
        );
        return response.content
          .filter((c): c is { type: "text"; text: string } => c.type === "text")
          .map((c) => c.text)
          .join("\n");
      };

      // 7a: file-based images.
      for (const [index, imgPath] of imagePaths.entries()) {
        const label = totalImages > 1 ? `Bild ${index + 1} (${path.basename(imgPath)})` : "Bild";
        log(`Processing file: ${imgPath}`);
        try {
          // Async read: do not block the event loop on large images.
          const imageBuffer = await fs.promises.readFile(imgPath);
          const base64Data = imageBuffer.toString("base64");
          const mimeType = detectMimeType(imgPath);
          log(`Read ${imageBuffer.length} bytes, mime=${mimeType}`);
          const userMessage: Message = {
            role: "user",
            content: [
              { type: "text", text: instruction },
              { type: "image", data: base64Data, mimeType },
            ],
            timestamp: Date.now(),
          };
          const descriptionText = await requestDescription(userMessage);
          log(`Description received (${descriptionText.length} chars)`);
          descriptions.push(`### ${label}\n${descriptionText.trim()}`);
        } catch (imgErr: unknown) {
          // A failing image is reported in place; the others are still processed.
          const reason = describeError(imgErr);
          log(`ERROR processing ${imgPath}: ${reason}`);
          descriptions.push(`### ${label}\n⚠ Fehler: ${reason}`);
        }
      }

      // 7b: images delivered with the event.
      for (const [index, img] of eventImages.entries()) {
        const label = `Event-Bild ${descriptions.length + 1}`;
        log(`Processing event image ${index + 1}`);
        try {
          const userMessage: Message = {
            role: "user",
            content: [{ type: "text", text: instruction }, img],
            timestamp: Date.now(),
          };
          const descriptionText = await requestDescription(userMessage);
          descriptions.push(`### ${label}\n${descriptionText.trim()}`);
        } catch (imgErr: unknown) {
          const reason = describeError(imgErr);
          log(`ERROR processing event image: ${reason}`);
          descriptions.push(`### ${label}\n⚠ Fehler: ${reason}`);
        }
      }

      // Step 8: hand the result back. Every processed image contributed one
      // entry (description or error text), so the list is never empty here.
      ctx.ui.setStatus(statusId, undefined);
      ctx.ui.notify(theme.fg("success", `✅ Bildanalyse abgeschlossen (${descriptions.length} Bild(er))`), "info");
      const visionContext = [
        `== VISION-PROXY v6: Bildanalyse ==`,
        `${descriptions.length} Bild${descriptions.length > 1 ? "er wurden" : " wurde"} durch ${model.id} analysiert.`,
        ``,
        ...descriptions,
        `== ENDE VISION-PROXY ==`,
      ].join("\n");
      log(`Returning vision context (${visionContext.length} chars)`);
      return {
        message: {
          customType: "vision-proxy",
          content: visionContext,
          display: true,
          details: { imageCount: descriptions.length },
        },
      };
    } catch (err: unknown) {
      const reason = describeError(err);
      // UI calls are individually guarded: reporting a failure must not throw.
      try {
        ctx.ui.setStatus("vision-proxy", ctx.ui.theme.fg("error", "❌ Bildanalyse fehlgeschlagen"));
      } catch {
        // ignored
      }
      try {
        ctx.ui.notify(ctx.ui.theme.fg("error", `❌ Bildanalyse fehlgeschlagen: ${reason}`), "error");
      } catch {
        // ignored
      }
      log(`FATAL ERROR: ${reason}\n${err instanceof Error ? err.stack : undefined}`);
    }
  });

  log("=== Vision-Proxy v6 factory complete ===");
}