feat(PROJ-44): API-Types + Sanitize-Helper fuer OCR-GUI

- MailDetail um ocr_status/ocr_chars erweitert
- SearchHit um snippet + match_field erweitert
- Neue API-Funktionen getOCRTextDownloadURL und downloadMailOCRText inkl. 202/404-Handling fuer pending/not-available
- src/lib/sanitize.ts: sanitizeSnippet escaped HTML und laesst nur <b>-Tags fuer Manticore-Highlights durch
- Re-exports in src/lib/api/index.ts ergaenzt

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-10 22:18:38 +02:00
parent 88e9d0c08c
commit 23a9a7ea37
3 changed files with 77 additions and 0 deletions
@@ -82,6 +82,9 @@ export type {
  ThreadResponse,
  MailAttachment,
  MailDetail,
+  OCRStatus,
+  SearchMatchField,
+  OCRDownloadResult,
  ImapFolder,
  ImapAccount,
  ImapTestResult,
@@ -95,6 +98,8 @@ export {
  getThread,
  downloadMailAttachment,
  downloadMailRaw,
+  getOCRTextDownloadURL,
+  downloadMailOCRText,
  getImapAccounts,
  createImapAccount,
  deleteImapAccount,
@@ -2,6 +2,16 @@ import { API_BASE, request } from "./core";

 // ── Types ────────────────────────────────────────────────────────────────────

+export type OCRStatus = "pending" | "done" | "failed" | "skipped" | "disabled"; // allowed values of MailDetail.ocr_status
+
+export type SearchMatchField = // allowed values of SearchHit.match_field (source field of the snippet)
+  | "subject"
+  | "body"
+  | "attachment_text"
+  | "attachment_names"
+  | "from_addr"
+  | "to_addr";
+
 export interface SearchHit {
  id: string;
  score: number;
@@ -13,6 +23,9 @@ export interface SearchHit {
  has_attachments?: boolean;
  thread_id?: string;
  thread_size?: number;
+  // PROJ-44: Manticore snippet (with <b>...</b> highlights) and source field
+  snippet?: string;
+  match_field?: SearchMatchField;
 }

 export interface ThreadMail {
@@ -57,6 +70,9 @@ export interface MailDetail {
  verify_ok: boolean | null;
  verified_at: string | null;
  thread_id?: string;
+  // PROJ-44: OCR status and extracted-text length
+  ocr_status?: OCRStatus;
+  ocr_chars?: number;
 }

 export interface ImapFolder {
@@ -187,6 +203,37 @@ export async function downloadMailRaw(
  return { blob: await res.blob(), filename: `${id}.eml` };
 }

+// ── OCR text ──────────────────────────────────────────────────────────────────
+
+/** URL of the OCR-text endpoint for mailId (inserted as-is, not URL-encoded); meant for direct browser use, presumably relying on cookie auth (TODO confirm). */
+export function getOCRTextDownloadURL(mailId: string): string {
+  return `${API_BASE}/api/mails/${mailId}/ocr-text`;
+}
+
+export type OCRDownloadResult = // outcome of downloadMailOCRText, discriminated by kind
+  | { kind: "ok"; blob: Blob; filename: string } // OK response other than 202: body blob plus download name
+  | { kind: "pending" } // produced for HTTP 202
+  | { kind: "not_available" }; // produced for HTTP 404
+
+/** Fetches the OCR text of mail `id`, sending credentials ("include"). HTTP 202 resolves to kind "pending" and
+ * 404 to kind "not_available"; any other non-OK status throws an Error carrying the status code.
+ * On success: kind "ok" with the body blob and the quoted Content-Disposition filename, else `<id>.ocr.txt`.
+ */
+export async function downloadMailOCRText(
+  id: string
+): Promise<OCRDownloadResult> {
+  const res = await fetch(`${API_BASE}/api/mails/${id}/ocr-text`, {
+    credentials: "include",
+  });
+  if (res.status === 202) return { kind: "pending" };
+  if (res.status === 404) return { kind: "not_available" };
+  if (!res.ok) throw new Error(`Download fehlgeschlagen: ${res.status}`);
+  const cd = res.headers.get("Content-Disposition") || "";
+  const match = cd.match(/filename="([^"]+)"/);
+  const filename = match ? match[1] : `${id}.ocr.txt`;
+  return { kind: "ok", blob: await res.blob(), filename };
+}
+
 // ── IMAP ──────────────────────────────────────────────────────────────────────

 export async function getImapAccounts(): Promise<ImapAccount[]> {
@@ -0,0 +1,25 @@
+/**
+ * sanitizeSnippet makes a highlighted search snippet safe to inject as HTML
+ * (e.g. via dangerouslySetInnerHTML): it escapes the ampersand, angle brackets, double and single quotes,
+ * then restores exactly two literal sequences, <b> and </b>, so highlight markers render as bold text.
+ *
+ * @param input Raw snippet text; an empty (falsy) value yields the empty string.
+ * @returns Escaped markup whose only live tags are lowercase <b> and </b>; tags are not balanced or validated.
+ */
+export function sanitizeSnippet(input: string): string {
+  if (!input) return "";
+
+  // 1. Escape all five HTML-special characters; "&" goes first so the entities added afterwards are not escaped again.
+  const escaped = input
+    .replace(/&/g, "&amp;")
+    .replace(/</g, "&lt;")
+    .replace(/>/g, "&gt;")
+    .replace(/"/g, "&quot;")
+    .replace(/'/g, "&#39;");
+
+  // 2. Re-enable the one whitelisted element. After step 1 its tags read
+  //    "&lt;b&gt;" / "&lt;/b&gt;"; a literal "&lt;b&gt;" in the input became "&amp;lt;b&amp;gt;" and stays escaped.
+  return escaped
+    .replace(/&lt;b&gt;/g, "<b>")
+    .replace(/&lt;\/b&gt;/g, "</b>");
+}