feat(PROJ-44): GET /api/mails/{id}/ocr-text + Audit-Event

Neuer Endpoint liefert den OCR-extrahierten Reintext als text/plain
mit Content-Disposition. ACL identisch zu /raw (Tenant-Isolation,
Auditor-Regeln, User-Ownership-Check). Status-Mapping:
  done + chars>0 -> 200, attachment "<id16>.ocr.txt"
  pending        -> 202 JSON {"error":"ocr_pending"}
  skipped/failed/disabled/empty -> 404 JSON {"error":"ocr_not_available"}
Jeder erfolgreiche Download landet im Audit-Log als mail:ocr_download.
This commit is contained in:
sysops
2026-05-10 22:20:59 +02:00
parent 7b75433999
commit 62a130d208
3 changed files with 179 additions and 7 deletions
+169
View File
@@ -0,0 +1,169 @@
package api
import (
"fmt"
"net/http"
"strconv"
"archivmail/internal/audit"
"archivmail/internal/index"
"archivmail/internal/userstore"
"archivmail/pkg/mailparser"
)
// handleGetOCRText serves the OCR-extracted text of a mail as a text/plain
// attachment download.
//
// PROJ-44 contract:
//   - ocr_status "done" with chars > 0 and non-empty text → 200 OK, attachment
//   - ocr_status "pending" → 202 Accepted with a hint JSON body
//   - ocr_status "skipped", "failed", "disabled", any unknown status, or
//     empty text → 404 JSON {"error":"ocr_not_available"}
//   - no OCR meta row (empty status) → 404 "mail not found"
//
// All access checks (tenant isolation, auditor rule, user ownership) run
// before any OCR data is read, and every one of them fails closed. Only a
// download that is actually about to be served is written to the audit log.
func (s *Server) handleGetOCRText(w http.ResponseWriter, r *http.Request) {
	id := r.PathValue("id")

	// SEC-22: Validate mail ID format to prevent path traversal.
	if !isValidMailID(id) {
		writeError(w, http.StatusBadRequest, "invalid mail id")
		return
	}

	sess := sessionFromCtx(r.Context())

	// Tenant isolation: a tenant-bound session may only read mails that are
	// assigned to exactly that tenant.
	if sess.TenantID != nil {
		mailTenant, tenantErr := s.store.GetTenantForMail(r.Context(), id)
		if tenantErr != nil {
			// The request is still denied below; the log line makes a backend
			// failure distinguishable from a genuine cross-tenant attempt.
			s.logger.Warn("ocr-text: tenant lookup failed", "mail_id", id, "err", tenantErr)
		}
		if tenantErr != nil || mailTenant == nil || *mailTenant != *sess.TenantID {
			writeError(w, http.StatusForbidden, "access denied")
			return
		}
	}

	// Auditor: only mails with no tenant assignment. A lookup error denies.
	if sess.Role == userstore.RoleAuditor {
		ok, err := s.store.IsWithoutTenant(r.Context(), id)
		if err != nil || !ok {
			writeError(w, http.StatusForbidden, "access denied")
			return
		}
	}

	// SEC-28/29: User only: own mails. Parse failure must NOT grant access.
	// The raw mail is loaded solely to run the ownership check against it.
	if sess.Role == userstore.RoleUser {
		raw, loadErr := s.store.Load(id)
		if loadErr != nil {
			writeError(w, http.StatusNotFound, "mail not found")
			return
		}
		pm, parseErr := mailparser.Parse(raw)
		if parseErr != nil {
			writeError(w, http.StatusInternalServerError, "failed to parse mail")
			return
		}
		u, err := s.users.GetByUsername(sess.Username)
		if err != nil || !mailBelongsToUser(pm, u.Email) {
			writeError(w, http.StatusForbidden, "access denied")
			return
		}
	}

	// Load the OCR status and character count for this mail.
	status, chars, metaErr := s.store.GetOCRMeta(r.Context(), id)
	if metaErr != nil {
		s.logger.Warn("ocr-text: meta lookup failed", "mail_id", id, "err", metaErr)
		writeError(w, http.StatusInternalServerError, "ocr meta lookup failed")
		return
	}
	if status == "" {
		// No status at all → report the mail itself as not found.
		writeError(w, http.StatusNotFound, "mail not found")
		return
	}

	switch status {
	case "pending":
		writeJSON(w, http.StatusAccepted, map[string]string{
			"error":   "ocr_pending",
			"message": "OCR läuft noch, bitte später erneut",
		})
		return
	case "skipped", "failed", "disabled":
		writeJSON(w, http.StatusNotFound, map[string]string{
			"error": "ocr_not_available",
		})
		return
	case "done":
		// Continue below with the actual text lookup.
	default:
		// Unknown status — be conservative and return 404.
		writeJSON(w, http.StatusNotFound, map[string]string{
			"error": "ocr_not_available",
		})
		return
	}

	if chars <= 0 {
		writeJSON(w, http.StatusNotFound, map[string]string{
			"error": "ocr_not_available",
		})
		return
	}

	// NOTE(review): the reader is selected from the session's tenant, not the
	// mail's. For sessions without a tenant this always resolves to s.idx —
	// confirm that index also holds OCR text for tenant-assigned mails.
	reader := s.ocrTextReader(sess.TenantID)
	if reader == nil {
		s.logger.Warn("ocr-text: indexer does not support AttachmentTextReader",
			"mail_id", id)
		writeError(w, http.StatusInternalServerError, "ocr text unavailable")
		return
	}

	text, err := reader.GetAttachmentText(id)
	if err != nil {
		s.logger.Warn("ocr-text: reader failed", "mail_id", id, "err", err)
		writeError(w, http.StatusInternalServerError, "ocr text fetch failed")
		return
	}
	if text == "" {
		// Meta reported chars > 0 but the reader returned nothing — the two
		// stores are out of sync, so treat the text as unavailable.
		writeJSON(w, http.StatusNotFound, map[string]string{
			"error": "ocr_not_available",
		})
		return
	}

	// Audit log — every download is recorded (PROJ-44 AC).
	s.audlog.Log(audit.Entry{
		EventType: audit.EventOCRDownload,
		Username:  sess.Username,
		IPAddress: s.remoteIP(r),
		MailID:    id,
		Success:   true,
	})

	// Filename: the first 16 characters of the mail ID. isValidMailID is
	// expected to guarantee at least that many; the guard only keeps the slice
	// expression from panicking should the validation rules ever be relaxed.
	prefix := id
	if len(prefix) > 16 {
		prefix = prefix[:16]
	}
	filename := fmt.Sprintf("%s.ocr.txt", prefix)

	body := []byte(text)
	w.Header().Set("Content-Type", "text/plain; charset=utf-8")
	w.Header().Set("Content-Disposition", fmt.Sprintf(`attachment; filename="%s"`, filename))
	w.Header().Set("Content-Length", strconv.Itoa(len(body)))
	w.WriteHeader(http.StatusOK)
	// The status line and headers are already sent; a failed write means the
	// client went away and there is nothing left to report to it.
	_, _ = w.Write(body)
}
// ocrTextReader selects the indexer to read OCR text from and returns it as
// an index.AttachmentTextReader. The indexer handed out by s.idxMgr for the
// tenant is used when both a tenant ID and an index manager are present;
// otherwise s.idx is used. The result is nil when the selected indexer does
// not implement index.AttachmentTextReader.
func (s *Server) ocrTextReader(tenantID *int64) index.AttachmentTextReader {
	candidate := index.Indexer(s.idx)
	if tenantID != nil && s.idxMgr != nil {
		candidate = s.idxMgr.ForTenant(tenantID)
	}

	reader, supported := candidate.(index.AttachmentTextReader)
	if !supported {
		return nil
	}
	return reader
}
+2
View File
@@ -215,6 +215,8 @@ func (s *Server) routes() {
s.mux.HandleFunc("GET /api/mails/{id}/attachments/{index}", s.auth(s.requireMailAccess(s.handleGetAttachment)))
s.mux.HandleFunc("GET /api/threads/{threadID}", s.auth(s.handleGetThread))
s.mux.HandleFunc("GET /api/mails/{id}/raw", s.auth(s.requireMailAccess(s.handleGetRaw)))
// PROJ-44: OCR-Text-Download — gleicher ACL-Pfad wie /raw.
s.mux.HandleFunc("GET /api/mails/{id}/ocr-text", s.auth(s.requireMailAccess(s.handleGetOCRText)))
s.mux.HandleFunc("GET /api/admin/services", s.authAdmin(s.handleListServices))
s.mux.HandleFunc("POST /api/admin/services/{name}/action", s.authAdmin(s.handleServiceAction))
+8 -7
View File
@@ -11,13 +11,14 @@ import (
)
const (
EventLogin = "login"
EventLogout = "logout"
EventSearch = "search"
EventMailView = "mail_view"
EventImport = "import"
EventExport = "export"
EventUserMgmt = "user_mgmt"
EventLogin = "login"
EventLogout = "logout"
EventSearch = "search"
EventMailView = "mail_view"
EventImport = "import"
EventExport = "export"
EventUserMgmt = "user_mgmt"
EventOCRDownload = "mail:ocr_download" // PROJ-44: extracted OCR text downloaded
)
// Entry is a single audit log record.