feat(PROJ-44): GET /api/mails/{id}/ocr-text + Audit-Event
Neuer Endpoint liefert den OCR-extrahierten Reintext als text/plain
mit Content-Disposition. ACL identisch zu /raw (Tenant-Isolation,
Auditor-Regeln, User-Ownership-Check). Status-Mapping:
done + chars>0 -> 200, attachment "<id16>.ocr.txt"
pending -> 202 JSON {"error":"ocr_pending"}
skipped/failed/disabled/empty -> 404 JSON {"error":"ocr_not_available"}
Jeder erfolgreiche Download landet im Audit-Log als mail:ocr_download.
This commit is contained in:
@@ -0,0 +1,169 @@
|
||||
package api
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"net/http"
|
||||
"strconv"
|
||||
|
||||
"archivmail/internal/audit"
|
||||
"archivmail/internal/index"
|
||||
"archivmail/internal/userstore"
|
||||
"archivmail/pkg/mailparser"
|
||||
)
|
||||
|
||||
// handleGetOCRText serves the OCR-extracted text of a mail as text/plain.
|
||||
//
|
||||
// PROJ-44 contract:
|
||||
// - ocr_status='done' + non-empty text → 200 OK, attachment download
|
||||
// - ocr_status='pending' → 202 Accepted with hint JSON
|
||||
// - ocr_status in {skipped,failed,disabled} or empty text → 404 JSON
|
||||
// - ACL is enforced exactly like /raw via requireMailAccess + the
|
||||
// tenant/role checks below.
|
||||
func (s *Server) handleGetOCRText(w http.ResponseWriter, r *http.Request) {
|
||||
id := r.PathValue("id")
|
||||
// SEC-22: Validate mail ID format to prevent path traversal.
|
||||
if !isValidMailID(id) {
|
||||
writeError(w, http.StatusBadRequest, "invalid mail id")
|
||||
return
|
||||
}
|
||||
|
||||
sess := sessionFromCtx(r.Context())
|
||||
|
||||
// Tenant isolation (mirrors handleGetRaw).
|
||||
if sess.TenantID != nil {
|
||||
mailTenant, _ := s.store.GetTenantForMail(r.Context(), id)
|
||||
if mailTenant == nil || *mailTenant != *sess.TenantID {
|
||||
writeError(w, http.StatusForbidden, "access denied")
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
// Auditor: only mails with no tenant assignment.
|
||||
if sess.Role == userstore.RoleAuditor {
|
||||
ok, err := s.store.IsWithoutTenant(r.Context(), id)
|
||||
if err != nil || !ok {
|
||||
writeError(w, http.StatusForbidden, "access denied")
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
// SEC-28/29: User only: own mails. Parse failure must NOT grant access.
|
||||
// We load the raw mail solely to enforce the From/To/CC ownership check.
|
||||
if sess.Role == userstore.RoleUser {
|
||||
raw, loadErr := s.store.Load(id)
|
||||
if loadErr != nil {
|
||||
writeError(w, http.StatusNotFound, "mail not found")
|
||||
return
|
||||
}
|
||||
pm, parseErr := mailparser.Parse(raw)
|
||||
if parseErr != nil {
|
||||
writeError(w, http.StatusInternalServerError, "failed to parse mail")
|
||||
return
|
||||
}
|
||||
u, err := s.users.GetByUsername(sess.Username)
|
||||
if err != nil || !mailBelongsToUser(pm, u.Email) {
|
||||
writeError(w, http.StatusForbidden, "access denied")
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
// Load OCR meta from PostgreSQL.
|
||||
status, chars, metaErr := s.store.GetOCRMeta(r.Context(), id)
|
||||
if metaErr != nil {
|
||||
s.logger.Warn("ocr-text: meta lookup failed", "mail_id", id, "err", metaErr)
|
||||
writeError(w, http.StatusInternalServerError, "ocr meta lookup failed")
|
||||
return
|
||||
}
|
||||
if status == "" {
|
||||
// No row at all → treat as not-found just like /raw would.
|
||||
writeError(w, http.StatusNotFound, "mail not found")
|
||||
return
|
||||
}
|
||||
|
||||
switch status {
|
||||
case "pending":
|
||||
writeJSON(w, http.StatusAccepted, map[string]string{
|
||||
"error": "ocr_pending",
|
||||
"message": "OCR läuft noch, bitte später erneut",
|
||||
})
|
||||
return
|
||||
case "skipped", "failed", "disabled":
|
||||
writeJSON(w, http.StatusNotFound, map[string]string{
|
||||
"error": "ocr_not_available",
|
||||
})
|
||||
return
|
||||
case "done":
|
||||
// fall through
|
||||
default:
|
||||
// Unknown status — be conservative and return 404.
|
||||
writeJSON(w, http.StatusNotFound, map[string]string{
|
||||
"error": "ocr_not_available",
|
||||
})
|
||||
return
|
||||
}
|
||||
|
||||
if chars <= 0 {
|
||||
writeJSON(w, http.StatusNotFound, map[string]string{
|
||||
"error": "ocr_not_available",
|
||||
})
|
||||
return
|
||||
}
|
||||
|
||||
// Resolve the correct per-tenant Manticore index for this mail.
|
||||
reader := s.ocrTextReader(sess.TenantID)
|
||||
if reader == nil {
|
||||
s.logger.Warn("ocr-text: indexer does not support AttachmentTextReader",
|
||||
"mail_id", id)
|
||||
writeError(w, http.StatusInternalServerError, "ocr text unavailable")
|
||||
return
|
||||
}
|
||||
|
||||
text, err := reader.GetAttachmentText(id)
|
||||
if err != nil {
|
||||
s.logger.Warn("ocr-text: reader failed", "mail_id", id, "err", err)
|
||||
writeError(w, http.StatusInternalServerError, "ocr text fetch failed")
|
||||
return
|
||||
}
|
||||
if text == "" {
|
||||
// DB said chars > 0 but Manticore returned empty — race / out of sync.
|
||||
writeJSON(w, http.StatusNotFound, map[string]string{
|
||||
"error": "ocr_not_available",
|
||||
})
|
||||
return
|
||||
}
|
||||
|
||||
// Audit log — every download is recorded (PROJ-44 AC).
|
||||
s.audlog.Log(audit.Entry{
|
||||
EventType: audit.EventOCRDownload,
|
||||
Username: sess.Username,
|
||||
IPAddress: s.remoteIP(r),
|
||||
MailID: id,
|
||||
Success: true,
|
||||
})
|
||||
|
||||
// Filename: use the first 16 hex chars of the SHA-256 mail ID. Mirrors
|
||||
// /raw behaviour; safe by construction (no quotes, no newlines).
|
||||
filename := fmt.Sprintf("%s.ocr.txt", id[:16])
|
||||
body := []byte(text)
|
||||
|
||||
w.Header().Set("Content-Type", "text/plain; charset=utf-8")
|
||||
w.Header().Set("Content-Disposition", fmt.Sprintf(`attachment; filename="%s"`, filename))
|
||||
w.Header().Set("Content-Length", strconv.Itoa(len(body)))
|
||||
w.WriteHeader(http.StatusOK)
|
||||
_, _ = w.Write(body)
|
||||
}
|
||||
|
||||
// ocrTextReader returns the AttachmentTextReader for the given tenant or
|
||||
// nil when no reader-capable index is wired. Resolution rules match the
|
||||
// search path (see handleSearch): per-tenant index when available, else the
|
||||
// global indexer.
|
||||
func (s *Server) ocrTextReader(tenantID *int64) index.AttachmentTextReader {
|
||||
var idx index.Indexer = s.idx
|
||||
if s.idxMgr != nil && tenantID != nil {
|
||||
idx = s.idxMgr.ForTenant(tenantID)
|
||||
}
|
||||
if r, ok := idx.(index.AttachmentTextReader); ok {
|
||||
return r
|
||||
}
|
||||
return nil
|
||||
}
|
||||
@@ -215,6 +215,8 @@ func (s *Server) routes() {
|
||||
s.mux.HandleFunc("GET /api/mails/{id}/attachments/{index}", s.auth(s.requireMailAccess(s.handleGetAttachment)))
|
||||
s.mux.HandleFunc("GET /api/threads/{threadID}", s.auth(s.handleGetThread))
|
||||
s.mux.HandleFunc("GET /api/mails/{id}/raw", s.auth(s.requireMailAccess(s.handleGetRaw)))
|
||||
// PROJ-44: OCR-Text-Download — gleicher ACL-Pfad wie /raw.
|
||||
s.mux.HandleFunc("GET /api/mails/{id}/ocr-text", s.auth(s.requireMailAccess(s.handleGetOCRText)))
|
||||
s.mux.HandleFunc("GET /api/admin/services", s.authAdmin(s.handleListServices))
|
||||
s.mux.HandleFunc("POST /api/admin/services/{name}/action", s.authAdmin(s.handleServiceAction))
|
||||
|
||||
|
||||
@@ -11,13 +11,14 @@ import (
|
||||
)
|
||||
|
||||
// Audit event type identifiers, used as the EventType of an audit Entry.
const (
	EventLogin       = "login"
	EventLogout      = "logout"
	EventSearch      = "search"
	EventMailView    = "mail_view"
	EventImport      = "import"
	EventExport      = "export"
	EventUserMgmt    = "user_mgmt"
	EventOCRDownload = "mail:ocr_download" // PROJ-44: extracted OCR text downloaded
)
|
||||
|
||||
// Entry is a single audit log record.
|
||||
|
||||
Reference in New Issue
Block a user