62a130d208
Neuer Endpoint liefert den OCR-extrahierten Reintext als text/plain
mit Content-Disposition. ACL identisch zu /raw (Tenant-Isolation,
Auditor-Regeln, User-Ownership-Check). Status-Mapping:
done + chars>0 -> 200, attachment "<id16>.ocr.txt"
pending -> 202 JSON {"error":"ocr_pending"}
skipped/failed/disabled/empty -> 404 JSON {"error":"ocr_not_available"}
Jeder erfolgreiche Download landet im Audit-Log als mail:ocr_download.
170 lines
4.9 KiB
Go
170 lines
4.9 KiB
Go
package api
|
|
|
|
import (
|
|
"fmt"
|
|
"net/http"
|
|
"strconv"
|
|
|
|
"archivmail/internal/audit"
|
|
"archivmail/internal/index"
|
|
"archivmail/internal/userstore"
|
|
"archivmail/pkg/mailparser"
|
|
)
|
|
|
|
// handleGetOCRText serves the OCR-extracted text of a mail as text/plain.
|
|
//
|
|
// PROJ-44 contract:
|
|
// - ocr_status='done' + non-empty text → 200 OK, attachment download
|
|
// - ocr_status='pending' → 202 Accepted with hint JSON
|
|
// - ocr_status in {skipped,failed,disabled} or empty text → 404 JSON
|
|
// - ACL is enforced exactly like /raw via requireMailAccess + the
|
|
// tenant/role checks below.
|
|
func (s *Server) handleGetOCRText(w http.ResponseWriter, r *http.Request) {
|
|
id := r.PathValue("id")
|
|
// SEC-22: Validate mail ID format to prevent path traversal.
|
|
if !isValidMailID(id) {
|
|
writeError(w, http.StatusBadRequest, "invalid mail id")
|
|
return
|
|
}
|
|
|
|
sess := sessionFromCtx(r.Context())
|
|
|
|
// Tenant isolation (mirrors handleGetRaw).
|
|
if sess.TenantID != nil {
|
|
mailTenant, _ := s.store.GetTenantForMail(r.Context(), id)
|
|
if mailTenant == nil || *mailTenant != *sess.TenantID {
|
|
writeError(w, http.StatusForbidden, "access denied")
|
|
return
|
|
}
|
|
}
|
|
|
|
// Auditor: only mails with no tenant assignment.
|
|
if sess.Role == userstore.RoleAuditor {
|
|
ok, err := s.store.IsWithoutTenant(r.Context(), id)
|
|
if err != nil || !ok {
|
|
writeError(w, http.StatusForbidden, "access denied")
|
|
return
|
|
}
|
|
}
|
|
|
|
// SEC-28/29: User only: own mails. Parse failure must NOT grant access.
|
|
// We load the raw mail solely to enforce the From/To/CC ownership check.
|
|
if sess.Role == userstore.RoleUser {
|
|
raw, loadErr := s.store.Load(id)
|
|
if loadErr != nil {
|
|
writeError(w, http.StatusNotFound, "mail not found")
|
|
return
|
|
}
|
|
pm, parseErr := mailparser.Parse(raw)
|
|
if parseErr != nil {
|
|
writeError(w, http.StatusInternalServerError, "failed to parse mail")
|
|
return
|
|
}
|
|
u, err := s.users.GetByUsername(sess.Username)
|
|
if err != nil || !mailBelongsToUser(pm, u.Email) {
|
|
writeError(w, http.StatusForbidden, "access denied")
|
|
return
|
|
}
|
|
}
|
|
|
|
// Load OCR meta from PostgreSQL.
|
|
status, chars, metaErr := s.store.GetOCRMeta(r.Context(), id)
|
|
if metaErr != nil {
|
|
s.logger.Warn("ocr-text: meta lookup failed", "mail_id", id, "err", metaErr)
|
|
writeError(w, http.StatusInternalServerError, "ocr meta lookup failed")
|
|
return
|
|
}
|
|
if status == "" {
|
|
// No row at all → treat as not-found just like /raw would.
|
|
writeError(w, http.StatusNotFound, "mail not found")
|
|
return
|
|
}
|
|
|
|
switch status {
|
|
case "pending":
|
|
writeJSON(w, http.StatusAccepted, map[string]string{
|
|
"error": "ocr_pending",
|
|
"message": "OCR läuft noch, bitte später erneut",
|
|
})
|
|
return
|
|
case "skipped", "failed", "disabled":
|
|
writeJSON(w, http.StatusNotFound, map[string]string{
|
|
"error": "ocr_not_available",
|
|
})
|
|
return
|
|
case "done":
|
|
// fall through
|
|
default:
|
|
// Unknown status — be conservative and return 404.
|
|
writeJSON(w, http.StatusNotFound, map[string]string{
|
|
"error": "ocr_not_available",
|
|
})
|
|
return
|
|
}
|
|
|
|
if chars <= 0 {
|
|
writeJSON(w, http.StatusNotFound, map[string]string{
|
|
"error": "ocr_not_available",
|
|
})
|
|
return
|
|
}
|
|
|
|
// Resolve the correct per-tenant Manticore index for this mail.
|
|
reader := s.ocrTextReader(sess.TenantID)
|
|
if reader == nil {
|
|
s.logger.Warn("ocr-text: indexer does not support AttachmentTextReader",
|
|
"mail_id", id)
|
|
writeError(w, http.StatusInternalServerError, "ocr text unavailable")
|
|
return
|
|
}
|
|
|
|
text, err := reader.GetAttachmentText(id)
|
|
if err != nil {
|
|
s.logger.Warn("ocr-text: reader failed", "mail_id", id, "err", err)
|
|
writeError(w, http.StatusInternalServerError, "ocr text fetch failed")
|
|
return
|
|
}
|
|
if text == "" {
|
|
// DB said chars > 0 but Manticore returned empty — race / out of sync.
|
|
writeJSON(w, http.StatusNotFound, map[string]string{
|
|
"error": "ocr_not_available",
|
|
})
|
|
return
|
|
}
|
|
|
|
// Audit log — every download is recorded (PROJ-44 AC).
|
|
s.audlog.Log(audit.Entry{
|
|
EventType: audit.EventOCRDownload,
|
|
Username: sess.Username,
|
|
IPAddress: s.remoteIP(r),
|
|
MailID: id,
|
|
Success: true,
|
|
})
|
|
|
|
// Filename: use the first 16 hex chars of the SHA-256 mail ID. Mirrors
|
|
// /raw behaviour; safe by construction (no quotes, no newlines).
|
|
filename := fmt.Sprintf("%s.ocr.txt", id[:16])
|
|
body := []byte(text)
|
|
|
|
w.Header().Set("Content-Type", "text/plain; charset=utf-8")
|
|
w.Header().Set("Content-Disposition", fmt.Sprintf(`attachment; filename="%s"`, filename))
|
|
w.Header().Set("Content-Length", strconv.Itoa(len(body)))
|
|
w.WriteHeader(http.StatusOK)
|
|
_, _ = w.Write(body)
|
|
}
|
|
|
|
// ocrTextReader returns the AttachmentTextReader for the given tenant or
|
|
// nil when no reader-capable index is wired. Resolution rules match the
|
|
// search path (see handleSearch): per-tenant index when available, else the
|
|
// global indexer.
|
|
func (s *Server) ocrTextReader(tenantID *int64) index.AttachmentTextReader {
|
|
var idx index.Indexer = s.idx
|
|
if s.idxMgr != nil && tenantID != nil {
|
|
idx = s.idxMgr.ForTenant(tenantID)
|
|
}
|
|
if r, ok := idx.(index.AttachmentTextReader); ok {
|
|
return r
|
|
}
|
|
return nil
|
|
}
|