Files
archivmail/internal/api/ocr_handlers.go
T
sysops 16013e8b66 fix(PROJ-44): OCR-Tenant-Routing nutzt kanonische DB-tenant_id
Strukturbug auf 132 gefunden: Tenant-User (Rolle user) sahen ihren
OCR-Text nicht, obwohl ocr_chars>0 in PostgreSQL stand. Ursache:

- OCR-Worker hat in den per Job.TenantID gewaehlten Index geschrieben.
  Beim Reprocess via CLI kam TenantID aus dem Submitter-Kontext und
  konnte vom in emails.tenant_id gespeicherten Wert abweichen.
- /ocr-text-Endpoint hat fuer die Index-Auswahl session.TenantID
  benutzt. Bei Admin/Auditor (nil Session-Tenant) wurde immer global
  gelesen, auch wenn die Mail einem Tenant gehoert.

Fix: Beide Stellen lesen jetzt die TenantID **immer** aus
storage.GetTenantForMail(emails.tenant_id) und routen den
Manticore-Index entsprechend. ACL-Check im Endpoint bleibt
unveraendert auf session.TenantID == mail.tenant_id — die
Tenant-Isolation wird nicht aufgeweicht.

Edge cases:
- Mail mit tenant_id NULL: GetTenantForMail liefert nil -> globaler
  Index (vorher und nachher gleich).
- DB-Fehler beim Lookup: faellt auf nil zurueck -> globaler Index,
  liefert leeren Text fuer Tenant-Mails -> 404. Safe (keine
  Querleckage zwischen Tenants).
2026-05-10 23:13:57 +02:00

177 lines
5.4 KiB
Go

package api
import (
"fmt"
"net/http"
"strconv"
"archivmail/internal/audit"
"archivmail/internal/index"
"archivmail/internal/userstore"
"archivmail/pkg/mailparser"
)
// handleGetOCRText serves the OCR-extracted text of a mail as text/plain.
//
// PROJ-44 contract:
//   - ocr_status='done' + non-empty text → 200 OK, attachment download
//   - ocr_status='pending' → 202 Accepted with hint JSON
//   - ocr_status in {skipped,failed,disabled} or empty text → 404 JSON
//   - ACL is enforced exactly like /raw via requireMailAccess + the
//     tenant/role checks below.
//
// Further responses produced by this handler:
//   - 400 when the path value "id" fails isValidMailID
//   - 403 when a tenant, auditor or user check rejects the request
//   - 404 when the raw mail cannot be loaded (user role) or no OCR meta
//     row exists
//   - 500 when parsing the mail, the OCR meta lookup, or the index read
//     fails, or when no reader-capable index is available
//
// The mail's tenant (emails.tenant_id) is resolved exactly once and that
// single value drives both the tenant ACL and the Manticore index routing,
// so the index that is read always matches the tenant the ACL verified.
func (s *Server) handleGetOCRText(w http.ResponseWriter, r *http.Request) {
	id := r.PathValue("id")

	// SEC-22: Validate mail ID format to prevent path traversal.
	if !isValidMailID(id) {
		writeError(w, http.StatusBadRequest, "invalid mail id")
		return
	}

	sess := sessionFromCtx(r.Context())

	// Resolve the mail's tenant once. On a lookup error the value is forced
	// to nil: tenant-scoped sessions are then denied below (fail closed),
	// and sessions without a tenant fall back to the global index further
	// down, where the error is logged. It is not logged here so that
	// requests which never reach the index read stay quiet.
	mailTenant, tenantErr := s.store.GetTenantForMail(r.Context(), id)
	if tenantErr != nil {
		mailTenant = nil
	}

	// Tenant isolation (mirrors handleGetRaw): a tenant-scoped session may
	// only read mails assigned to exactly its own tenant.
	if sess.TenantID != nil {
		if mailTenant == nil || *mailTenant != *sess.TenantID {
			writeError(w, http.StatusForbidden, "access denied")
			return
		}
	}

	// Auditor: only mails with no tenant assignment.
	if sess.Role == userstore.RoleAuditor {
		ok, err := s.store.IsWithoutTenant(r.Context(), id)
		if err != nil || !ok {
			writeError(w, http.StatusForbidden, "access denied")
			return
		}
	}

	// SEC-28/29: User only: own mails. Parse failure must NOT grant access.
	// We load the raw mail solely to enforce the From/To/CC ownership check.
	if sess.Role == userstore.RoleUser {
		raw, loadErr := s.store.Load(id)
		if loadErr != nil {
			writeError(w, http.StatusNotFound, "mail not found")
			return
		}
		pm, parseErr := mailparser.Parse(raw)
		if parseErr != nil {
			writeError(w, http.StatusInternalServerError, "failed to parse mail")
			return
		}
		// The err check must stay first: u is only dereferenced when the
		// user lookup succeeded.
		u, err := s.users.GetByUsername(sess.Username)
		if err != nil || !mailBelongsToUser(pm, u.Email) {
			writeError(w, http.StatusForbidden, "access denied")
			return
		}
	}

	// Load OCR meta from PostgreSQL.
	status, chars, metaErr := s.store.GetOCRMeta(r.Context(), id)
	if metaErr != nil {
		s.logger.Warn("ocr-text: meta lookup failed", "mail_id", id, "err", metaErr)
		writeError(w, http.StatusInternalServerError, "ocr meta lookup failed")
		return
	}
	if status == "" {
		// No row at all → treat as not-found just like /raw would.
		writeError(w, http.StatusNotFound, "mail not found")
		return
	}

	switch status {
	case "pending":
		writeJSON(w, http.StatusAccepted, map[string]string{
			"error":   "ocr_pending",
			"message": "OCR läuft noch, bitte später erneut",
		})
		return
	case "skipped", "failed", "disabled":
		writeJSON(w, http.StatusNotFound, map[string]string{
			"error": "ocr_not_available",
		})
		return
	case "done":
		// Continue below with the text lookup.
	default:
		// Unknown status — be conservative and return 404.
		writeJSON(w, http.StatusNotFound, map[string]string{
			"error": "ocr_not_available",
		})
		return
	}

	if chars <= 0 {
		writeJSON(w, http.StatusNotFound, map[string]string{
			"error": "ocr_not_available",
		})
		return
	}

	// PROJ-44: Route to the per-tenant Manticore index based on the MAIL's
	// tenant assignment, not the session's. Using the session tenant for
	// index selection breaks admin/auditor access (nil session tenant) and
	// would misread whenever the two disagree.
	if tenantErr != nil {
		// Only sessions without a tenant get here with a failed lookup.
		// Keep the documented fallback to the global index, but make the
		// cause of a possible 404 visible to operators.
		s.logger.Warn("ocr-text: tenant lookup failed, falling back to global index",
			"mail_id", id, "err", tenantErr)
	}
	reader := s.ocrTextReader(mailTenant)
	if reader == nil {
		s.logger.Warn("ocr-text: indexer does not support AttachmentTextReader",
			"mail_id", id)
		writeError(w, http.StatusInternalServerError, "ocr text unavailable")
		return
	}

	text, err := reader.GetAttachmentText(id)
	if err != nil {
		s.logger.Warn("ocr-text: reader failed", "mail_id", id, "err", err)
		writeError(w, http.StatusInternalServerError, "ocr text fetch failed")
		return
	}
	if text == "" {
		// DB said chars > 0 but Manticore returned empty — race / out of sync.
		writeJSON(w, http.StatusNotFound, map[string]string{
			"error": "ocr_not_available",
		})
		return
	}

	// Audit log — every download is recorded (PROJ-44 AC).
	s.audlog.Log(audit.Entry{
		EventType: audit.EventOCRDownload,
		Username:  sess.Username,
		IPAddress: s.remoteIP(r),
		MailID:    id,
		Success:   true,
	})

	// Filename: at most the first 16 characters of the mail ID. The ID has
	// passed isValidMailID above; NOTE(review): presumably that restricts
	// it to 64 hex chars (SHA-256), which keeps the header value free of
	// quotes and newlines — confirm against the validator. The length guard
	// only ensures a shorter ID can never panic the slice expression.
	short := id
	if len(short) > 16 {
		short = short[:16]
	}
	filename := fmt.Sprintf("%s.ocr.txt", short)

	body := []byte(text)
	w.Header().Set("Content-Type", "text/plain; charset=utf-8")
	w.Header().Set("Content-Disposition", fmt.Sprintf(`attachment; filename="%s"`, filename))
	w.Header().Set("Content-Length", strconv.Itoa(len(body)))
	w.WriteHeader(http.StatusOK)
	// A write error here means the client went away; headers are already
	// sent, so there is nothing left to report.
	_, _ = w.Write(body)
}
// ocrTextReader picks the index to read OCR attachment text from and
// returns it as an index.AttachmentTextReader.
//
// tenantID must be the MAIL's tenant (emails.tenant_id), never the session
// tenant — see the PROJ-44 fix. Selection: when an index manager is wired
// and tenantID is non-nil, the manager's index for that tenant is used;
// otherwise the global indexer is used. The result is nil when the selected
// index does not implement index.AttachmentTextReader.
func (s *Server) ocrTextReader(tenantID *int64) index.AttachmentTextReader {
	// Start from the global indexer and switch to the tenant's index only
	// when both a tenant and an index manager are present.
	selected := index.Indexer(s.idx)
	if tenantID != nil && s.idxMgr != nil {
		selected = s.idxMgr.ForTenant(tenantID)
	}

	reader, supported := selected.(index.AttachmentTextReader)
	if !supported {
		return nil
	}
	return reader
}