From 23a9a7ea37a2be33b2c367c9ba8052ce6ccd5a80 Mon Sep 17 00:00:00 2001 From: sysops Date: Sun, 10 May 2026 22:18:38 +0200 Subject: [PATCH] feat(PROJ-44): API-Types + Sanitize-Helper fuer OCR-GUI - MailDetail um ocr_status/ocr_chars erweitert - SearchHit um snippet + match_field erweitert - Neue API-Funktionen getOCRTextDownloadURL und downloadMailOCRText inkl. 202/404-Handling fuer pending/not-available - src/lib/sanitize.ts: sanitizeSnippet escaped HTML und laesst nur <b>-Tags fuer Manticore-Highlights durch - Re-exports in src/lib/api/index.ts ergaenzt Co-Authored-By: Claude Opus 4.7 --- src/lib/api/index.ts | 5 +++++ src/lib/api/mail.ts | 47 ++++++++++++++++++++++++++++++++++++++++++++ src/lib/sanitize.ts | 25 +++++++++++++++++++++++ 3 files changed, 77 insertions(+) create mode 100644 src/lib/sanitize.ts diff --git a/src/lib/api/index.ts b/src/lib/api/index.ts index eafcdba..fefdb8f 100644 --- a/src/lib/api/index.ts +++ b/src/lib/api/index.ts @@ -82,6 +82,9 @@ export type { ThreadResponse, MailAttachment, MailDetail, + OCRStatus, + SearchMatchField, + OCRDownloadResult, ImapFolder, ImapAccount, ImapTestResult, @@ -95,6 +98,8 @@ export { getThread, downloadMailAttachment, downloadMailRaw, + getOCRTextDownloadURL, + downloadMailOCRText, getImapAccounts, createImapAccount, deleteImapAccount, diff --git a/src/lib/api/mail.ts b/src/lib/api/mail.ts index 0c1045c..523a50d 100644 --- a/src/lib/api/mail.ts +++ b/src/lib/api/mail.ts @@ -2,6 +2,16 @@ import { API_BASE, request } from "./core"; // ── Types ──────────────────────────────────────────────────────────────────── +export type OCRStatus = "pending" | "done" | "failed" | "skipped" | "disabled"; + +export type SearchMatchField = + | "subject" + | "body" + | "attachment_text" + | "attachment_names" + | "from_addr" + | "to_addr"; + export interface SearchHit { id: string; score: number; @@ -13,6 +23,9 @@ export interface SearchHit { has_attachments?: boolean; thread_id?: string; thread_size?: number; + // PROJ-44: 
Manticore snippet (with <b>...</b> highlights) and source field + snippet?: string; + match_field?: SearchMatchField; } export interface ThreadMail { @@ -57,6 +70,9 @@ export interface MailDetail { verify_ok: boolean | null; verified_at: string | null; thread_id?: string; + // PROJ-44: OCR status and extracted-text length + ocr_status?: OCRStatus; + ocr_chars?: number; } export interface ImapFolder { @@ -187,6 +203,37 @@ export async function downloadMailRaw( return { blob: await res.blob(), filename: `${id}.eml` }; } +// ── OCR text ────────────────────────────────────────────────────────────────── + +/** Direct URL of the OCR-text endpoint. Browser uses cookie auth automatically. */ +export function getOCRTextDownloadURL(mailId: string): string { + return `${API_BASE}/api/mails/${mailId}/ocr-text`; +} + +export type OCRDownloadResult = + | { kind: "ok"; blob: Blob; filename: string } + | { kind: "pending" } + | { kind: "not_available" }; + +/** + * Fetch the OCR text. Handles the 202 (pending) and 404 (not available) + * cases gracefully so the UI can show different feedback. + */ +export async function downloadMailOCRText( + id: string +): Promise<OCRDownloadResult> { + const res = await fetch(`${API_BASE}/api/mails/${id}/ocr-text`, { + credentials: "include", + }); + if (res.status === 202) return { kind: "pending" }; + if (res.status === 404) return { kind: "not_available" }; + if (!res.ok) throw new Error(`Download fehlgeschlagen: ${res.status}`); + const cd = res.headers.get("Content-Disposition") || ""; + const match = cd.match(/filename="([^"]+)"/); + const filename = match ? 
match[1] : `${id}.ocr.txt`; + return { kind: "ok", blob: await res.blob(), filename }; +} + // ── IMAP ────────────────────────────────────────────────────────────────────── export async function getImapAccounts(): Promise { diff --git a/src/lib/sanitize.ts b/src/lib/sanitize.ts new file mode 100644 index 0000000..64f48ef --- /dev/null +++ b/src/lib/sanitize.ts @@ -0,0 +1,25 @@ +/** + * sanitizeSnippet escapes all HTML in a Manticore snippet, then re-enables + * the only markup we trust: <b> wrappers around match words. + * + * Manticore SNIPPET()/HIGHLIGHT() output is otherwise plain text plus + * configurable `<b>` markers, so this is sufficient for safe rendering + * via dangerouslySetInnerHTML. + */ +export function sanitizeSnippet(input: string): string { + if (!input) return ""; + + // 1. Escape every HTML-special character. + const escaped = input + .replace(/&/g, "&amp;") + .replace(/</g, "&lt;") + .replace(/>/g, "&gt;") + .replace(/"/g, "&quot;") + .replace(/'/g, "&#39;"); + + // 2. Re-allow our two whitelisted tags. After step 1 they appear as + // "&lt;b&gt;" / "&lt;/b&gt;" — restore them verbatim. + return escaped + .replace(/&lt;b&gt;/g, "<b>") + .replace(/&lt;\/b&gt;/g, "</b>"); +}