Files
archivmail/internal/storage/ocr.go
T
sysops 0bda21033e feat(PROJ-35): OCR & Anhang-Volltext-Indexierung
Asynchrone OCR fuer PDF- und Bild-Anhaenge via tesseract + poppler-utils.
Extrahierter Text wird in Manticore (attachment_text) gespeichert und ist
ueber die normale Volltextsuche auffindbar.

- internal/ocr: ExtractText + Worker (queue + drain)
- internal/storage/ocr.go: SetOCRStatus, OCREnabled, GetMailsByOCRStatus
- emails.ocr_status (pending|done|failed|skipped|disabled)
- tenants.ocr_enabled (Default TRUE, opt-out)
- Manticore: attachment_text-Feld + UpdateAttachmentText
- Boot-resume: pending Jobs nach Restart automatisch in die Queue
- CLI: archivmail ocr-reprocess --tenant N --status pending|failed|all
- update.sh: tesseract-ocr + poppler-utils optional installieren
2026-05-08 22:11:17 +02:00

155 lines
4.1 KiB
Go

package storage
import (
"context"
"errors"
"fmt"
"github.com/jackc/pgx/v5"
)
// PROJ-35: OCR status management per mail.
// Status values: pending | done | failed | skipped | disabled
// PendingOCRMail describes one entry in the OCR backlog, as returned by
// GetMailsByOCRStatus and GetPendingOCRMails.
type PendingOCRMail struct {
// ID is the mail identifier, read from emails.id.
ID string
// TenantID is read from emails.tenant_id. It is nil when the mail has no
// tenant assignment (system-level / global).
TenantID *int64
}
// SetOCRStatus writes status into emails.ocr_status for the mail identified
// by id.
//
// Accepted status values are pending, done, failed, skipped and disabled; any
// other value is rejected with an error, as is an empty id. When the store
// has no DB configured the call is a silent no-op and returns nil before any
// argument validation takes place.
//
// The number of affected rows is not inspected, so an id that matches no row
// is not reported as an error.
func (s *Store) SetOCRStatus(ctx context.Context, id, status string) error {
	if s.db == nil {
		return nil
	}
	if id == "" {
		return errors.New("storage: SetOCRStatus: empty id")
	}

	known := status == "pending" ||
		status == "done" ||
		status == "failed" ||
		status == "skipped" ||
		status == "disabled"
	if !known {
		return fmt.Errorf("storage: SetOCRStatus: invalid status %q", status)
	}

	const stmt = `UPDATE emails SET ocr_status = $1 WHERE id = $2`
	if _, err := s.db.Exec(ctx, stmt, status, id); err != nil {
		return fmt.Errorf("storage: set ocr status: %w", err)
	}
	return nil
}
// OCREnabled tells the caller whether OCR processing should run for the
// given tenant.
//
// OCR is opt-out, not opt-in: the answer is true unless the tenants row
// explicitly says otherwise. In particular the result is true when
//   - no DB is configured (DB-less mode),
//   - tenantID is nil (global / no tenant),
//   - no tenants row exists for the id, or
//   - the row cannot be read for any other reason (degrade gracefully).
//
// A NULL ocr_enabled column is treated as TRUE by the query itself.
func (s *Store) OCREnabled(ctx context.Context, tenantID *int64) bool {
	if s.db == nil {
		return true
	}
	if tenantID == nil {
		return true
	}

	const query = `SELECT COALESCE(ocr_enabled, TRUE) FROM tenants WHERE id = $1`

	var flag bool
	err := s.db.QueryRow(ctx, query, *tenantID).Scan(&flag)
	switch {
	case err == nil:
		return flag
	case errors.Is(err, pgx.ErrNoRows):
		// Unknown tenant: fall back to the opt-out default.
		return true
	default:
		// Unexpected failure (e.g. column missing on a freshly-initialised
		// DB): stay enabled so OCR remains opt-out.
		return true
	}
}
// GetPendingOCRMails is a convenience wrapper around GetMailsByOCRStatus
// with the status fixed to "pending". It returns at most limit mails from
// the OCR backlog; limit <= 0 means no limit.
//
// If tenantID != nil, results are restricted to mails referenced by that
// tenant (via email_refs) or whose emails.tenant_id matches.
func (s *Store) GetPendingOCRMails(ctx context.Context, tenantID *int64, limit int) ([]PendingOCRMail, error) {
	const pending = "pending"

	mails, err := s.GetMailsByOCRStatus(ctx, pending, tenantID, limit)
	return mails, err
}
// GetMailsByOCRStatus returns the mails matching an ocr_status filter,
// ordered by received_at descending.
//
// status selects the filter: "all" or "" disables it and returns every mail
// regardless of status. Any other value is compared against
// emails.ocr_status, where a NULL column counts as "pending". The value is
// not validated here.
//
// If tenantID != nil, results are restricted to that tenant: a mail qualifies
// when emails.tenant_id matches or when an email_refs row links it to the
// tenant. A qualifying mail yields a single result row no matter how many
// email_refs rows of the tenant point to it.
//
// limit <= 0 means no limit.
//
// When no DB is configured the method returns (nil, nil). Query, scan and
// iteration errors are returned wrapped; no partial result accompanies an
// error.
func (s *Store) GetMailsByOCRStatus(ctx context.Context, status string, tenantID *int64, limit int) ([]PendingOCRMail, error) {
	if s.db == nil {
		return nil, nil
	}

	var (
		args       []any
		whereParts []string
	)

	if status != "" && status != "all" {
		args = append(args, status)
		whereParts = append(whereParts, fmt.Sprintf("COALESCE(e.ocr_status, 'pending') = $%d", len(args)))
	}

	if tenantID != nil {
		args = append(args, *tenantID)
		// EXISTS rather than a LEFT JOIN on email_refs: a join emits one row
		// per matching ref, which would repeat the same mail in the result
		// and make LIMIT count those repeats.
		whereParts = append(whereParts, fmt.Sprintf(
			"(e.tenant_id = $%[1]d OR EXISTS (SELECT 1 FROM email_refs r WHERE r.email_id = e.id AND r.tenant_id = $%[1]d))",
			len(args),
		))
	}

	whereClause := ""
	if len(whereParts) > 0 {
		whereClause = "WHERE " + joinAnd(whereParts)
	}

	limitClause := ""
	if limit > 0 {
		args = append(args, limit)
		limitClause = fmt.Sprintf("LIMIT $%d", len(args))
	}

	q := fmt.Sprintf(`
		SELECT e.id, e.tenant_id
		FROM emails e
		%s
		ORDER BY e.received_at DESC
		%s
	`, whereClause, limitClause)

	rows, err := s.db.Query(ctx, q, args...)
	if err != nil {
		return nil, fmt.Errorf("storage: get mails by ocr status: %w", err)
	}
	defer rows.Close()

	var out []PendingOCRMail
	for rows.Next() {
		var m PendingOCRMail
		// A scan failure is reported instead of silently dropping the row,
		// so callers never mistake a truncated backlog for a complete one.
		if err := rows.Scan(&m.ID, &m.TenantID); err != nil {
			return nil, fmt.Errorf("storage: get mails by ocr status: scan: %w", err)
		}
		out = append(out, m)
	}
	if err := rows.Err(); err != nil {
		return nil, fmt.Errorf("storage: get mails by ocr status: rows: %w", err)
	}
	return out, nil
}
// joinAnd concatenates parts, placing " AND " between neighbouring elements.
// An empty or nil slice yields the empty string. It is hand-rolled on purpose
// so that this file does not have to import strings for a single call site.
func joinAnd(parts []string) string {
	if len(parts) == 0 {
		return ""
	}
	joined := parts[0]
	for _, part := range parts[1:] {
		joined += " AND " + part
	}
	return joined
}