diff --git a/internal/api/ocr_handlers.go b/internal/api/ocr_handlers.go
new file mode 100644
index 0000000..43e5b43
--- /dev/null
+++ b/internal/api/ocr_handlers.go
@@ -0,0 +1,188 @@
+package api
+
+import (
+	"fmt"
+	"net/http"
+	"strconv"
+
+	"archivmail/internal/audit"
+	"archivmail/internal/index"
+	"archivmail/internal/userstore"
+	"archivmail/pkg/mailparser"
+)
+
+// handleGetOCRText serves the OCR-extracted text of a mail as a text/plain
+// attachment download.
+//
+// PROJ-44 contract:
+//   - ocr_status='done' + non-empty text → 200 OK, attachment download
+//   - ocr_status='pending' → 202 Accepted with hint JSON
+//   - ocr_status in {skipped,failed,disabled}, unknown status, or empty
+//     text → 404 JSON {"error":"ocr_not_available"}
+//   - no OCR meta row (empty status) → 404 "mail not found"
+//   - ACL is enforced exactly like /raw via requireMailAccess + the
+//     tenant/role checks below; every check fails closed.
+//
+// Every served download is written to the audit log before the body is
+// sent.
+func (s *Server) handleGetOCRText(w http.ResponseWriter, r *http.Request) {
+	id := r.PathValue("id")
+	// SEC-22: Validate mail ID format to prevent path traversal.
+	if !isValidMailID(id) {
+		writeError(w, http.StatusBadRequest, "invalid mail id")
+		return
+	}
+
+	sess := sessionFromCtx(r.Context())
+
+	// Tenant isolation (mirrors handleGetRaw). Fails closed: a lookup error,
+	// a mail without tenant, or a foreign tenant all end in 403.
+	if sess.TenantID != nil {
+		mailTenant, tenantErr := s.store.GetTenantForMail(r.Context(), id)
+		if tenantErr != nil {
+			// The client still gets 403, but a store failure must be visible
+			// to operators instead of being silently dropped.
+			s.logger.Warn("ocr-text: tenant lookup failed", "mail_id", id, "err", tenantErr)
+		}
+		if tenantErr != nil || mailTenant == nil || *mailTenant != *sess.TenantID {
+			writeError(w, http.StatusForbidden, "access denied")
+			return
+		}
+	}
+
+	// Auditor: only mails with no tenant assignment.
+	if sess.Role == userstore.RoleAuditor {
+		unassigned, err := s.store.IsWithoutTenant(r.Context(), id)
+		if err != nil {
+			s.logger.Warn("ocr-text: tenant assignment lookup failed", "mail_id", id, "err", err)
+		}
+		if err != nil || !unassigned {
+			writeError(w, http.StatusForbidden, "access denied")
+			return
+		}
+	}
+
+	// SEC-28/29: User only: own mails. Parse failure must NOT grant access.
+	// We load the raw mail solely to enforce the From/To/CC ownership check.
+	if sess.Role == userstore.RoleUser {
+		raw, loadErr := s.store.Load(id)
+		if loadErr != nil {
+			writeError(w, http.StatusNotFound, "mail not found")
+			return
+		}
+		pm, parseErr := mailparser.Parse(raw)
+		if parseErr != nil {
+			writeError(w, http.StatusInternalServerError, "failed to parse mail")
+			return
+		}
+		u, err := s.users.GetByUsername(sess.Username)
+		if err != nil || !mailBelongsToUser(pm, u.Email) {
+			writeError(w, http.StatusForbidden, "access denied")
+			return
+		}
+	}
+
+	// Load OCR meta from PostgreSQL.
+	status, chars, metaErr := s.store.GetOCRMeta(r.Context(), id)
+	if metaErr != nil {
+		s.logger.Warn("ocr-text: meta lookup failed", "mail_id", id, "err", metaErr)
+		writeError(w, http.StatusInternalServerError, "ocr meta lookup failed")
+		return
+	}
+
+	switch status {
+	case "done":
+		// Text may exist; continue with the size check below.
+	case "":
+		// No row at all → treat as not-found just like /raw would.
+		writeError(w, http.StatusNotFound, "mail not found")
+		return
+	case "pending":
+		writeJSON(w, http.StatusAccepted, map[string]string{
+			"error":   "ocr_pending",
+			"message": "OCR läuft noch, bitte später erneut",
+		})
+		return
+	default:
+		// skipped, failed, disabled — and, conservatively, any unknown status.
+		writeOCRNotAvailable(w)
+		return
+	}
+
+	if chars <= 0 {
+		writeOCRNotAvailable(w)
+		return
+	}
+
+	// Resolve the Manticore index to read the text from.
+	// NOTE(review): the lookup is keyed on the session tenant, not on the
+	// mail's tenant. For sessions without a tenant this selects the global
+	// indexer — confirm that it also holds text of tenant-assigned mails.
+	reader := s.ocrTextReader(sess.TenantID)
+	if reader == nil {
+		s.logger.Warn("ocr-text: indexer does not support AttachmentTextReader",
+			"mail_id", id)
+		writeError(w, http.StatusInternalServerError, "ocr text unavailable")
+		return
+	}
+
+	text, err := reader.GetAttachmentText(id)
+	if err != nil {
+		s.logger.Warn("ocr-text: reader failed", "mail_id", id, "err", err)
+		writeError(w, http.StatusInternalServerError, "ocr text fetch failed")
+		return
+	}
+	if text == "" {
+		// DB said chars > 0 but Manticore returned empty — race / out of sync.
+		writeOCRNotAvailable(w)
+		return
+	}
+
+	// Audit log — every download is recorded (PROJ-44 AC).
+	s.audlog.Log(audit.Entry{
+		EventType: audit.EventOCRDownload,
+		Username:  sess.Username,
+		IPAddress: s.remoteIP(r),
+		MailID:    id,
+		Success:   true,
+	})
+
+	// Filename: the first 16 hex chars of the SHA-256 mail ID. Mirrors /raw
+	// behaviour; safe by construction (no quotes, no newlines). The slice is
+	// bounded so an unexpectedly short ID can never panic the handler.
+	filename := fmt.Sprintf("%s.ocr.txt", id[:min(len(id), 16)])
+	body := []byte(text)
+
+	w.Header().Set("Content-Type", "text/plain; charset=utf-8")
+	// OCR output is untrusted mail content: forbid MIME sniffing so a browser
+	// never interprets the text as HTML.
+	w.Header().Set("X-Content-Type-Options", "nosniff")
+	w.Header().Set("Content-Disposition", fmt.Sprintf(`attachment; filename="%s"`, filename))
+	w.Header().Set("Content-Length", strconv.Itoa(len(body)))
+	w.WriteHeader(http.StatusOK)
+	// Headers are already sent; a failed write cannot be reported to the client.
+	_, _ = w.Write(body)
+}
+
+// writeOCRNotAvailable sends the 404 JSON body used whenever no OCR text can
+// be served for an otherwise accessible mail.
+func writeOCRNotAvailable(w http.ResponseWriter) {
+	writeJSON(w, http.StatusNotFound, map[string]string{
+		"error": "ocr_not_available",
+	})
+}
+
+// ocrTextReader returns the AttachmentTextReader for the given tenant or
+// nil when no reader-capable index is wired. Resolution rules match the
+// search path (see handleSearch): per-tenant index when available, else the
+// global indexer.
+func (s *Server) ocrTextReader(tenantID *int64) index.AttachmentTextReader {
+	// Start from the global indexer; switch to the tenant index when possible.
+	candidate := index.Indexer(s.idx)
+	if tenantID != nil && s.idxMgr != nil {
+		candidate = s.idxMgr.ForTenant(tenantID)
+	}
+	// A nil or non-reader indexer fails the assertion and yields a nil reader.
+	reader, _ := candidate.(index.AttachmentTextReader)
+	return reader
+}
diff --git a/internal/api/server.go b/internal/api/server.go
index f8f3248..8eae884 100644
--- a/internal/api/server.go
+++ b/internal/api/server.go
@@ -215,6 +215,8 @@ func (s *Server) routes() {
 	s.mux.HandleFunc("GET /api/mails/{id}/attachments/{index}", s.auth(s.requireMailAccess(s.handleGetAttachment)))
 	s.mux.HandleFunc("GET /api/threads/{threadID}", s.auth(s.handleGetThread))
 	s.mux.HandleFunc("GET /api/mails/{id}/raw", s.auth(s.requireMailAccess(s.handleGetRaw)))
+	// PROJ-44: OCR text download — same ACL path as /raw.
+	s.mux.HandleFunc("GET /api/mails/{id}/ocr-text", s.auth(s.requireMailAccess(s.handleGetOCRText)))
 
 	s.mux.HandleFunc("GET /api/admin/services", s.authAdmin(s.handleListServices))
 	s.mux.HandleFunc("POST /api/admin/services/{name}/action", s.authAdmin(s.handleServiceAction))
diff --git a/internal/audit/audit.go b/internal/audit/audit.go
index 0522c12..bf1c9b4 100644
--- a/internal/audit/audit.go
+++ b/internal/audit/audit.go
@@ -11,13 +11,14 @@ import (
 )
 
 const (
-	EventLogin    = "login"
-	EventLogout   = "logout"
-	EventSearch   = "search"
-	EventMailView = "mail_view"
-	EventImport   = "import"
-	EventExport   = "export"
-	EventUserMgmt = "user_mgmt"
+	EventLogin       = "login"
+	EventLogout      = "logout"
+	EventSearch      = "search"
+	EventMailView    = "mail_view"
+	EventImport      = "import"
+	EventExport      = "export"
+	EventUserMgmt    = "user_mgmt"
+	EventOCRDownload = "mail:ocr_download" // PROJ-44: extracted OCR text downloaded
 )
 
 // Entry is a single audit log record.