feat(PROJ-35): OCR & Anhang-Volltext-Indexierung
Asynchrone OCR fuer PDF- und Bild-Anhaenge via tesseract + poppler-utils. Extrahierter Text wird in Manticore (attachment_text) gespeichert und ist ueber die normale Volltextsuche auffindbar. - internal/ocr: ExtractText + Worker (queue + drain) - internal/storage/ocr.go: SetOCRStatus, OCREnabled, GetMailsByOCRStatus - emails.ocr_status (pending|done|failed|skipped|disabled) - tenants.ocr_enabled (Default TRUE, opt-out) - Manticore: attachment_text-Feld + UpdateAttachmentText - Boot-resume: pending Jobs nach Restart automatisch in die Queue - CLI: archivmail ocr-reprocess --tenant N --status pending|failed|all - update.sh: tesseract-ocr + poppler-utils optional installieren
This commit is contained in:
@@ -0,0 +1,154 @@
|
||||
package storage
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
|
||||
"github.com/jackc/pgx/v5"
|
||||
)
|
||||
|
||||
// PROJ-35: OCR-Status-Verwaltung pro Mail.
|
||||
// Status-Werte: pending | done | failed | skipped | disabled
|
||||
|
||||
// PendingOCRMail describes one entry in the OCR backlog. TenantID is nil
|
||||
// when the mail has no tenant assignment (system-level / global).
|
||||
type PendingOCRMail struct {
|
||||
ID string
|
||||
TenantID *int64
|
||||
}
|
||||
|
||||
// SetOCRStatus updates emails.ocr_status for one mail.
|
||||
// Valid values: pending | done | failed | skipped | disabled.
|
||||
// Silently no-ops when no DB is configured.
|
||||
func (s *Store) SetOCRStatus(ctx context.Context, id, status string) error {
|
||||
if s.db == nil {
|
||||
return nil
|
||||
}
|
||||
if id == "" {
|
||||
return errors.New("storage: SetOCRStatus: empty id")
|
||||
}
|
||||
switch status {
|
||||
case "pending", "done", "failed", "skipped", "disabled":
|
||||
default:
|
||||
return fmt.Errorf("storage: SetOCRStatus: invalid status %q", status)
|
||||
}
|
||||
_, err := s.db.Exec(ctx, `UPDATE emails SET ocr_status = $1 WHERE id = $2`, status, id)
|
||||
if err != nil {
|
||||
return fmt.Errorf("storage: set ocr status: %w", err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// OCREnabled reports whether OCR processing should run for the given tenant.
|
||||
// Defaults to true when:
|
||||
// - no DB is configured (DB-less mode)
|
||||
// - tenantID is nil (global / no tenant)
|
||||
// - the tenants row cannot be read (degrade gracefully)
|
||||
//
|
||||
// OCR is opt-out, not opt-in.
|
||||
func (s *Store) OCREnabled(ctx context.Context, tenantID *int64) bool {
|
||||
if s.db == nil || tenantID == nil {
|
||||
return true
|
||||
}
|
||||
var enabled bool
|
||||
err := s.db.QueryRow(ctx,
|
||||
`SELECT COALESCE(ocr_enabled, TRUE) FROM tenants WHERE id = $1`,
|
||||
*tenantID,
|
||||
).Scan(&enabled)
|
||||
if errors.Is(err, pgx.ErrNoRows) {
|
||||
return true
|
||||
}
|
||||
if err != nil {
|
||||
// On unexpected errors (e.g. column missing on a freshly-initialised DB),
|
||||
// default to enabled so OCR remains opt-out.
|
||||
return true
|
||||
}
|
||||
return enabled
|
||||
}
|
||||
|
||||
// GetPendingOCRMails returns up to limit mails with ocr_status='pending'.
|
||||
// If tenantID != nil, results are restricted to mails referenced by that
|
||||
// tenant (via email_refs) or whose emails.tenant_id matches.
|
||||
// limit <= 0 means no limit.
|
||||
func (s *Store) GetPendingOCRMails(ctx context.Context, tenantID *int64, limit int) ([]PendingOCRMail, error) {
|
||||
return s.GetMailsByOCRStatus(ctx, "pending", tenantID, limit)
|
||||
}
|
||||
|
||||
// GetMailsByOCRStatus returns mails matching a specific ocr_status filter.
|
||||
// status="all" returns every mail regardless of status.
|
||||
// If tenantID != nil, results are restricted to that tenant.
|
||||
// limit <= 0 means no limit.
|
||||
func (s *Store) GetMailsByOCRStatus(ctx context.Context, status string, tenantID *int64, limit int) ([]PendingOCRMail, error) {
|
||||
if s.db == nil {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
args := []interface{}{}
|
||||
whereParts := []string{}
|
||||
|
||||
if status != "" && status != "all" {
|
||||
args = append(args, status)
|
||||
whereParts = append(whereParts, fmt.Sprintf("COALESCE(e.ocr_status, 'pending') = $%d", len(args)))
|
||||
}
|
||||
|
||||
joinClause := ""
|
||||
if tenantID != nil {
|
||||
args = append(args, *tenantID)
|
||||
joinClause = fmt.Sprintf(
|
||||
"LEFT JOIN email_refs r ON r.email_id = e.id AND r.tenant_id = $%d", len(args),
|
||||
)
|
||||
whereParts = append(whereParts,
|
||||
fmt.Sprintf("(r.tenant_id = $%d OR e.tenant_id = $%d)", len(args), len(args)),
|
||||
)
|
||||
}
|
||||
|
||||
whereClause := ""
|
||||
if len(whereParts) > 0 {
|
||||
whereClause = "WHERE " + joinAnd(whereParts)
|
||||
}
|
||||
|
||||
limitClause := ""
|
||||
if limit > 0 {
|
||||
args = append(args, limit)
|
||||
limitClause = fmt.Sprintf("LIMIT $%d", len(args))
|
||||
}
|
||||
|
||||
q := fmt.Sprintf(`
|
||||
SELECT e.id, e.tenant_id
|
||||
FROM emails e
|
||||
%s
|
||||
%s
|
||||
ORDER BY e.received_at DESC
|
||||
%s
|
||||
`, joinClause, whereClause, limitClause)
|
||||
|
||||
rows, err := s.db.Query(ctx, q, args...)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("storage: get mails by ocr status: %w", err)
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
var out []PendingOCRMail
|
||||
for rows.Next() {
|
||||
var m PendingOCRMail
|
||||
if err := rows.Scan(&m.ID, &m.TenantID); err != nil {
|
||||
continue
|
||||
}
|
||||
out = append(out, m)
|
||||
}
|
||||
return out, rows.Err()
|
||||
}
|
||||
|
||||
// joinAnd joins parts with " AND " — small helper to avoid pulling in strings
|
||||
// for a single use.
|
||||
func joinAnd(parts []string) string {
|
||||
out := ""
|
||||
for i, p := range parts {
|
||||
if i > 0 {
|
||||
out += " AND "
|
||||
}
|
||||
out += p
|
||||
}
|
||||
return out
|
||||
}
|
||||
Reference in New Issue
Block a user