package api

import (
	"fmt"
	"net/http"
	"strconv"

	"archivmail/internal/audit"
	"archivmail/internal/index"
	"archivmail/internal/userstore"
	"archivmail/pkg/mailparser"
)

// handleGetOCRText serves the OCR-extracted text of a mail as text/plain.
//
// PROJ-44 contract:
//   - ocr_status='done' + non-empty text → 200 OK, attachment download
//   - ocr_status='pending' → 202 Accepted with hint JSON
//   - ocr_status in {skipped,failed,disabled} or empty text → 404 JSON
//   - ACL is enforced exactly like /raw via requireMailAccess + the
//     tenant/role checks below.
//
// NOTE(review): requireMailAccess is not called inside this function;
// presumably it wraps the route where the handler is registered — confirm.
//
// Other responses: 400 for a malformed mail id, 403 when an ACL check
// rejects the session, 404 when the mail or its OCR meta row is missing,
// and 500 when the meta lookup, mail parsing, or the index read fails.
// Every successful download is written to the audit log before the body
// is sent.
func (s *Server) handleGetOCRText(w http.ResponseWriter, r *http.Request) {
	id := r.PathValue("id")

	// SEC-22: Validate mail ID format to prevent path traversal.
	if !isValidMailID(id) {
		writeError(w, http.StatusBadRequest, "invalid mail id")
		return
	}

	sess := sessionFromCtx(r.Context())

	// Tenant isolation (mirrors handleGetRaw).
	if sess.TenantID != nil {
		// The lookup error is intentionally not inspected here: a nil
		// tenant is rejected below, so a failed lookup fails closed
		// (assuming the store returns a nil tenant on error — TODO confirm).
		mailTenant, _ := s.store.GetTenantForMail(r.Context(), id)
		if mailTenant == nil || *mailTenant != *sess.TenantID {
			writeError(w, http.StatusForbidden, "access denied")
			return
		}
	}

	// Auditor: only mails with no tenant assignment. A lookup error is
	// treated the same as "has a tenant" so that failures deny access.
	if sess.Role == userstore.RoleAuditor {
		ok, err := s.store.IsWithoutTenant(r.Context(), id)
		if err != nil || !ok {
			writeError(w, http.StatusForbidden, "access denied")
			return
		}
	}

	// SEC-28/29: User only: own mails. Parse failure must NOT grant access.
	// We load the raw mail solely to enforce the From/To/CC ownership check.
	if sess.Role == userstore.RoleUser {
		raw, loadErr := s.store.Load(id)
		if loadErr != nil {
			writeError(w, http.StatusNotFound, "mail not found")
			return
		}
		pm, parseErr := mailparser.Parse(raw)
		if parseErr != nil {
			writeError(w, http.StatusInternalServerError, "failed to parse mail")
			return
		}
		// err is checked first so u is only dereferenced after a
		// successful lookup.
		u, err := s.users.GetByUsername(sess.Username)
		if err != nil || !mailBelongsToUser(pm, u.Email) {
			writeError(w, http.StatusForbidden, "access denied")
			return
		}
	}

	// Load OCR meta from PostgreSQL.
	status, chars, metaErr := s.store.GetOCRMeta(r.Context(), id)
	if metaErr != nil {
		s.logger.Warn("ocr-text: meta lookup failed", "mail_id", id, "err", metaErr)
		writeError(w, http.StatusInternalServerError, "ocr meta lookup failed")
		return
	}
	if status == "" {
		// No row at all → treat as not-found just like /raw would.
		writeError(w, http.StatusNotFound, "mail not found")
		return
	}

	// notAvailable writes the shared 404 body used whenever no OCR text can
	// be served for an existing mail.
	notAvailable := func() {
		writeJSON(w, http.StatusNotFound, map[string]string{
			"error": "ocr_not_available",
		})
	}

	switch status {
	case "done":
		// Text should exist; continue below.
	case "pending":
		writeJSON(w, http.StatusAccepted, map[string]string{
			"error":   "ocr_pending",
			"message": "OCR läuft noch, bitte später erneut",
		})
		return
	default:
		// "skipped", "failed", "disabled", and any unknown status — be
		// conservative and return 404.
		notAvailable()
		return
	}

	if chars <= 0 {
		notAvailable()
		return
	}

	// PROJ-44: Resolve the correct per-tenant Manticore index based on the
	// MAIL's tenant assignment, not the session's. The session tenant is
	// already used for ACL enforcement above; using it for index selection
	// breaks admin/auditor access (who have nil session tenant) and would
	// also misread when the two ever disagree. The OCR worker writes into
	// the index derived from emails.tenant_id, so the reader must follow
	// the same rule.
	mailTenant, tenantErr := s.store.GetTenantForMail(r.Context(), id)
	if tenantErr != nil {
		// Index selection still proceeds with whatever tenant the store
		// returned (unchanged behaviour), but the failure is now logged so
		// that a resulting 404/500 can be traced back to its cause.
		s.logger.Warn("ocr-text: tenant lookup failed", "mail_id", id, "err", tenantErr)
	}

	reader := s.ocrTextReader(mailTenant)
	if reader == nil {
		s.logger.Warn("ocr-text: indexer does not support AttachmentTextReader", "mail_id", id)
		writeError(w, http.StatusInternalServerError, "ocr text unavailable")
		return
	}

	text, err := reader.GetAttachmentText(id)
	if err != nil {
		s.logger.Warn("ocr-text: reader failed", "mail_id", id, "err", err)
		writeError(w, http.StatusInternalServerError, "ocr text fetch failed")
		return
	}
	if text == "" {
		// DB said chars > 0 but Manticore returned empty — race / out of sync.
		notAvailable()
		return
	}

	// Audit log — every download is recorded (PROJ-44 AC).
	s.audlog.Log(audit.Entry{
		EventType: audit.EventOCRDownload,
		Username:  sess.Username,
		IPAddress: s.remoteIP(r),
		MailID:    id,
		Success:   true,
	})

	// Filename: use the first 16 chars of the mail ID (a SHA-256 hex digest
	// per the original contract). Mirrors /raw behaviour. The slice is
	// bounded so that an unexpectedly short ID cannot panic the handler.
	prefix := id
	if len(prefix) > 16 {
		prefix = prefix[:16]
	}
	filename := prefix + ".ocr.txt"

	body := []byte(text)
	w.Header().Set("Content-Type", "text/plain; charset=utf-8")
	w.Header().Set("Content-Disposition", fmt.Sprintf(`attachment; filename="%s"`, filename))
	w.Header().Set("Content-Length", strconv.Itoa(len(body)))
	w.WriteHeader(http.StatusOK)
	// A write error here means the client went away; the status line has
	// already been sent, so there is nothing left to report.
	_, _ = w.Write(body)
}

// ocrTextReader returns the AttachmentTextReader for the given tenant or
// nil when no reader-capable index is wired. Callers must pass the MAIL's
// tenant id (emails.tenant_id), not the session tenant — see PROJ-44 fix.
//
// Resolution rules: the per-tenant index from s.idxMgr is used when both an
// index manager and a tenant id are present; otherwise the global indexer
// s.idx is used. The chosen indexer is returned only if it also implements
// index.AttachmentTextReader.
func (s *Server) ocrTextReader(tenantID *int64) index.AttachmentTextReader {
	var idx index.Indexer = s.idx
	if s.idxMgr != nil && tenantID != nil {
		idx = s.idxMgr.ForTenant(tenantID)
	}
	// The comma-ok assertion also covers a nil indexer: it yields ok == false
	// instead of panicking.
	if reader, ok := idx.(index.AttachmentTextReader); ok {
		return reader
	}
	return nil
}