0bda21033e
Asynchrone OCR fuer PDF- und Bild-Anhaenge via tesseract + poppler-utils. Extrahierter Text wird in Manticore (attachment_text) gespeichert und ist ueber die normale Volltextsuche auffindbar. - internal/ocr: ExtractText + Worker (queue + drain) - internal/storage/ocr.go: SetOCRStatus, OCREnabled, GetMailsByOCRStatus - emails.ocr_status (pending|done|failed|skipped|disabled) - tenants.ocr_enabled (Default TRUE, opt-out) - Manticore: attachment_text-Feld + UpdateAttachmentText - Boot-resume: pending Jobs nach Restart automatisch in die Queue - CLI: archivmail ocr-reprocess --tenant N --status pending|failed|all - update.sh: tesseract-ocr + poppler-utils optional installieren
155 lines
4.1 KiB
Go
155 lines
4.1 KiB
Go
package storage
|
|
|
|
import (
|
|
"context"
|
|
"errors"
|
|
"fmt"
|
|
|
|
"github.com/jackc/pgx/v5"
|
|
)
|
|
|
|
// PROJ-35: per-mail OCR status management.
// Status values: pending | done | failed | skipped | disabled
|
|
|
|
// PendingOCRMail identifies a single mail in the OCR backlog.
type PendingOCRMail struct {
	// ID is the identifier of the mail.
	ID string
	// TenantID is the tenant the mail is assigned to. It is nil when the
	// mail has no tenant assignment (system-level / global).
	TenantID *int64
}
|
|
|
|
// SetOCRStatus updates emails.ocr_status for one mail.
|
|
// Valid values: pending | done | failed | skipped | disabled.
|
|
// Silently no-ops when no DB is configured.
|
|
func (s *Store) SetOCRStatus(ctx context.Context, id, status string) error {
|
|
if s.db == nil {
|
|
return nil
|
|
}
|
|
if id == "" {
|
|
return errors.New("storage: SetOCRStatus: empty id")
|
|
}
|
|
switch status {
|
|
case "pending", "done", "failed", "skipped", "disabled":
|
|
default:
|
|
return fmt.Errorf("storage: SetOCRStatus: invalid status %q", status)
|
|
}
|
|
_, err := s.db.Exec(ctx, `UPDATE emails SET ocr_status = $1 WHERE id = $2`, status, id)
|
|
if err != nil {
|
|
return fmt.Errorf("storage: set ocr status: %w", err)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// OCREnabled reports whether OCR processing should run for the given tenant.
|
|
// Defaults to true when:
|
|
// - no DB is configured (DB-less mode)
|
|
// - tenantID is nil (global / no tenant)
|
|
// - the tenants row cannot be read (degrade gracefully)
|
|
//
|
|
// OCR is opt-out, not opt-in.
|
|
func (s *Store) OCREnabled(ctx context.Context, tenantID *int64) bool {
|
|
if s.db == nil || tenantID == nil {
|
|
return true
|
|
}
|
|
var enabled bool
|
|
err := s.db.QueryRow(ctx,
|
|
`SELECT COALESCE(ocr_enabled, TRUE) FROM tenants WHERE id = $1`,
|
|
*tenantID,
|
|
).Scan(&enabled)
|
|
if errors.Is(err, pgx.ErrNoRows) {
|
|
return true
|
|
}
|
|
if err != nil {
|
|
// On unexpected errors (e.g. column missing on a freshly-initialised DB),
|
|
// default to enabled so OCR remains opt-out.
|
|
return true
|
|
}
|
|
return enabled
|
|
}
|
|
|
|
// GetPendingOCRMails returns up to limit mails with ocr_status='pending'.
|
|
// If tenantID != nil, results are restricted to mails referenced by that
|
|
// tenant (via email_refs) or whose emails.tenant_id matches.
|
|
// limit <= 0 means no limit.
|
|
func (s *Store) GetPendingOCRMails(ctx context.Context, tenantID *int64, limit int) ([]PendingOCRMail, error) {
|
|
return s.GetMailsByOCRStatus(ctx, "pending", tenantID, limit)
|
|
}
|
|
|
|
// GetMailsByOCRStatus returns mails matching a specific ocr_status filter.
|
|
// status="all" returns every mail regardless of status.
|
|
// If tenantID != nil, results are restricted to that tenant.
|
|
// limit <= 0 means no limit.
|
|
func (s *Store) GetMailsByOCRStatus(ctx context.Context, status string, tenantID *int64, limit int) ([]PendingOCRMail, error) {
|
|
if s.db == nil {
|
|
return nil, nil
|
|
}
|
|
|
|
args := []interface{}{}
|
|
whereParts := []string{}
|
|
|
|
if status != "" && status != "all" {
|
|
args = append(args, status)
|
|
whereParts = append(whereParts, fmt.Sprintf("COALESCE(e.ocr_status, 'pending') = $%d", len(args)))
|
|
}
|
|
|
|
joinClause := ""
|
|
if tenantID != nil {
|
|
args = append(args, *tenantID)
|
|
joinClause = fmt.Sprintf(
|
|
"LEFT JOIN email_refs r ON r.email_id = e.id AND r.tenant_id = $%d", len(args),
|
|
)
|
|
whereParts = append(whereParts,
|
|
fmt.Sprintf("(r.tenant_id = $%d OR e.tenant_id = $%d)", len(args), len(args)),
|
|
)
|
|
}
|
|
|
|
whereClause := ""
|
|
if len(whereParts) > 0 {
|
|
whereClause = "WHERE " + joinAnd(whereParts)
|
|
}
|
|
|
|
limitClause := ""
|
|
if limit > 0 {
|
|
args = append(args, limit)
|
|
limitClause = fmt.Sprintf("LIMIT $%d", len(args))
|
|
}
|
|
|
|
q := fmt.Sprintf(`
|
|
SELECT e.id, e.tenant_id
|
|
FROM emails e
|
|
%s
|
|
%s
|
|
ORDER BY e.received_at DESC
|
|
%s
|
|
`, joinClause, whereClause, limitClause)
|
|
|
|
rows, err := s.db.Query(ctx, q, args...)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("storage: get mails by ocr status: %w", err)
|
|
}
|
|
defer rows.Close()
|
|
|
|
var out []PendingOCRMail
|
|
for rows.Next() {
|
|
var m PendingOCRMail
|
|
if err := rows.Scan(&m.ID, &m.TenantID); err != nil {
|
|
continue
|
|
}
|
|
out = append(out, m)
|
|
}
|
|
return out, rows.Err()
|
|
}
|
|
|
|
// joinAnd concatenates parts into one string, separating consecutive
// elements with " AND ". An empty or nil slice yields "". It is kept as a
// tiny local helper so the file does not need the strings package for a
// single call site.
func joinAnd(parts []string) string {
	if len(parts) == 0 {
		return ""
	}
	joined := parts[0]
	for _, part := range parts[1:] {
		joined += " AND " + part
	}
	return joined
}
|