feat(PROJ-35): OCR & Anhang-Volltext-Indexierung

Asynchrone OCR fuer PDF- und Bild-Anhaenge via tesseract + poppler-utils. Extrahierter Text wird in Manticore (attachment_text) gespeichert und ist ueber die normale Volltextsuche auffindbar.

- internal/ocr: ExtractText + Worker (queue + drain)
- internal/storage/ocr.go: SetOCRStatus, OCREnabled, GetMailsByOCRStatus
- emails.ocr_status (pending|done|failed|skipped|disabled)
- tenants.ocr_enabled (Default TRUE, opt-out)
- Manticore: attachment_text-Feld + UpdateAttachmentText
- Boot-resume: pending Jobs werden nach einem Restart automatisch wieder in die Queue gestellt
- CLI: archivmail ocr-reprocess --tenant N --status pending|failed|all
- update.sh: tesseract-ocr + poppler-utils optional installieren
2026-05-08 22:11:17 +02:00
parent 2a91f6e249
commit 0bda21033e
11 changed files with 926 additions and 25 deletions
@@ -0,0 +1,265 @@
+// Package ocr extracts plain text from email attachments (PDFs and images)
+// using locally installed tools — pdftotext (poppler-utils) and tesseract.
+//
+// PROJ-35: All extraction happens on the host. No external services.
+package ocr
+
+import (
+	"bytes"
+	"context"
+	"errors"
+	"fmt"
+	"os"
+	"os/exec"
+	"path/filepath"
+	"strings"
+	"time"
+)
+
+const (
+	// MaxAttachmentSize is the largest attachment, in bytes, that ExtractText
+	// accepts (50 MiB per spec); bigger inputs are rejected with ErrTooLarge.
+	MaxAttachmentSize = 50 << 20
+	// DefaultTimeout is the deadline applied to an extraction run when the
+	// caller's context does not carry a deadline of its own.
+	DefaultTimeout = time.Minute
+	// pdfMinTextLen is the minimum length of the trimmed pdftotext output for
+	// a PDF to count as "native"; shorter output makes the PDF a presumed
+	// scan that is sent through OCR instead.
+	pdfMinTextLen = 32
+)
+
+// Sentinel errors returned by ExtractText; compare with errors.Is.
+
+// ErrUnsupported means the attachment is neither a PDF nor a known image type.
+var ErrUnsupported = errors.New("ocr: unsupported attachment type")
+
+// ErrTooLarge means the attachment is bigger than MaxAttachmentSize.
+var ErrTooLarge = errors.New("ocr: attachment exceeds size limit")
+
+// ErrEncrypted means the PDF could not be read because it is protected.
+var ErrEncrypted = errors.New("ocr: attachment is password protected")
+
+// ErrUnavailable means a required external tool is not installed on the host.
+var ErrUnavailable = errors.New("ocr: tesseract or pdftotext not available")
+
+// IsAvailable reports whether pdftotext or tesseract (or both) can be found on
+// PATH. pdftotext on its own is enough for PDFs with an embedded text layer;
+// tesseract is what makes images and scanned PDFs readable.
+func IsAvailable() bool {
+	for _, tool := range []string{"pdftotext", "tesseract"} {
+		if _, err := exec.LookPath(tool); err == nil {
+			return true
+		}
+	}
+	return false
+}
+
+// ToolStatus records, for each supporting binary, whether it was found on PATH.
+type ToolStatus struct {
+	HasPdftotext bool
+	HasTesseract bool
+	HasPdftoppm  bool
+}
+
+// CheckTools looks up pdftotext, tesseract and pdftoppm on PATH and returns
+// which of them are present.
+func CheckTools() ToolStatus {
+	onPath := func(tool string) bool {
+		_, err := exec.LookPath(tool)
+		return err == nil
+	}
+
+	var st ToolStatus
+	st.HasPdftotext = onPath("pdftotext")
+	st.HasTesseract = onPath("tesseract")
+	st.HasPdftoppm = onPath("pdftoppm")
+	return st
+}
+
+// ExtractText pulls plain text out of one attachment.
+//
+// The attachment kind is decided from contentType (a MIME type) and, when
+// that is missing or not recognised, from the extension of filename. langs is
+// the Tesseract language list (e.g. ["deu","eng"]).
+//
+// Results:
+//   - the extracted text, which may be empty (always empty for empty data)
+//   - ErrTooLarge when data is bigger than MaxAttachmentSize
+//   - ErrUnsupported when the attachment is neither a PDF nor a known image
+//   - ErrEncrypted when a PDF is password-protected
+//   - any other error reported by the underlying tool
+func ExtractText(ctx context.Context, data []byte, contentType, filename string, langs []string) (string, error) {
+	switch size := len(data); {
+	case size == 0:
+		return "", nil
+	case size > MaxAttachmentSize:
+		return "", ErrTooLarge
+	}
+
+	kind := classify(contentType, filename)
+	if kind == kindPDF {
+		return extractPDF(ctx, data, langs)
+	}
+	if kind == kindImage {
+		return extractImage(ctx, data, filename, langs)
+	}
+	return "", ErrUnsupported
+}
+
+// ── classification ────────────────────────────────────────────────────────
+
+// fileKind is the coarse attachment category that selects the extraction path.
+type fileKind int
+
+const (
+	kindUnknown fileKind = iota // not a format this package can process
+	kindPDF                     // PDF document
+	kindImage                   // raster image handed to tesseract
+)
+
+// classify maps an attachment to a fileKind. The MIME type decides first:
+// parameters after ';' are dropped and the comparison ignores case and
+// surrounding whitespace. When the MIME type is not recognised, the
+// lowercased extension of filename decides. Anything else is kindUnknown.
+func classify(contentType, filename string) fileKind {
+	mime, _, _ := strings.Cut(contentType, ";")
+	mime = strings.ToLower(strings.TrimSpace(mime))
+
+	switch mime {
+	case "application/pdf", "application/x-pdf":
+		return kindPDF
+	case "image/jpeg", "image/jpg", "image/png",
+		"image/tiff", "image/x-tiff",
+		"image/bmp", "image/x-bmp",
+		"image/webp":
+		return kindImage
+	}
+
+	// Missing or generic MIME type: fall back to the file extension.
+	switch strings.ToLower(filepath.Ext(filename)) {
+	case ".pdf":
+		return kindPDF
+	case ".jpg", ".jpeg", ".png", ".tif", ".tiff", ".bmp", ".webp":
+		return kindImage
+	default:
+		return kindUnknown
+	}
+}
+
+// ── PDF extraction (pdftotext → tesseract fallback) ───────────────────────
+
+func extractPDF(ctx context.Context, data []byte, langs []string) (string, error) {
+	tmp, err := os.MkdirTemp("", "archivmail-ocr-*")
+	if err != nil {
+		return "", fmt.Errorf("ocr: tempdir: %w", err)
+	}
+	defer os.RemoveAll(tmp)
+
+	pdfPath := filepath.Join(tmp, "in.pdf")
+	if err := os.WriteFile(pdfPath, data, 0o600); err != nil {
+		return "", fmt.Errorf("ocr: write pdf: %w", err)
+	}
+
+	if _, err := exec.LookPath("pdftotext"); err == nil {
+		text, ptErr := runPdftotext(ctx, pdfPath)
+		if ptErr != nil {
+			if errors.Is(ptErr, ErrEncrypted) {
+				return "", ErrEncrypted
+			}
+			// Fall through to OCR fallback on other errors.
+		}
+		if len(strings.TrimSpace(text)) >= pdfMinTextLen {
+			return text, nil
+		}
+	}
+
+	if _, err := exec.LookPath("tesseract"); err != nil {
+		return "", ErrUnavailable
+	}
+	if _, err := exec.LookPath("pdftoppm"); err != nil {
+		return "", ErrUnavailable
+	}
+
+	return ocrPDFViaImages(ctx, tmp, pdfPath, langs)
+}
+
+// runPdftotext runs "pdftotext -layout <pdfPath> -" and returns what it wrote
+// to stdout. The run is bounded by the caller's deadline or, when there is
+// none, by DefaultTimeout.
+//
+// It returns ErrEncrypted when the tool's error output mentions an incorrect
+// password or encryption; any other failure is wrapped together with a
+// (truncated) copy of stderr.
+func runPdftotext(ctx context.Context, pdfPath string) (string, error) {
+	// Upper bound on how much stderr is copied into the returned error.
+	const maxStderr = 512
+
+	cctx, cancel := withDefaultTimeout(ctx)
+	defer cancel()
+
+	var out, errBuf bytes.Buffer
+	// Deliberately no "-q": that flag silences pdftotext's error output,
+	// including the "Incorrect password" message the check below relies on.
+	cmd := exec.CommandContext(cctx, "pdftotext", "-layout", pdfPath, "-")
+	cmd.Stdout = &out
+	cmd.Stderr = &errBuf
+	if err := cmd.Run(); err != nil {
+		stderr := strings.TrimSpace(errBuf.String())
+		lower := strings.ToLower(stderr)
+		if strings.Contains(lower, "incorrect password") || strings.Contains(lower, "encrypted") {
+			return "", ErrEncrypted
+		}
+		if len(stderr) > maxStderr {
+			// Without -q a damaged PDF can emit many warnings; keep logs short.
+			stderr = stderr[:maxStderr] + "..."
+		}
+		return "", fmt.Errorf("ocr: pdftotext: %w (%s)", err, stderr)
+	}
+	return out.String(), nil
+}
+
+// ocrPDFViaImages renders pdfPath to PNG pages inside dir (pdftoppm, 200 dpi)
+// and runs tesseract over each page, joining the page texts with newlines.
+// A single deadline (the caller's, or DefaultTimeout) covers rendering and
+// all pages together.
+//
+// A page that fails OCR is skipped so one bad page does not discard the rest,
+// and text recognised before a timeout is still returned. The loop stops as
+// soon as the context is done, and an error wrapping the first failure is
+// returned when not a single page could be recognised.
+func ocrPDFViaImages(ctx context.Context, dir, pdfPath string, langs []string) (string, error) {
+	cctx, cancel := withDefaultTimeout(ctx)
+	defer cancel()
+
+	prefix := filepath.Join(dir, "page")
+	cmd := exec.CommandContext(cctx, "pdftoppm", "-r", "200", "-png", pdfPath, prefix)
+	if err := cmd.Run(); err != nil {
+		return "", fmt.Errorf("ocr: pdftoppm: %w", err)
+	}
+
+	pages, err := filepath.Glob(prefix + "-*.png")
+	if err != nil {
+		return "", fmt.Errorf("ocr: list rendered pages: %w", err)
+	}
+	if len(pages) == 0 {
+		return "", errors.New("ocr: pdftoppm produced no pages")
+	}
+
+	var (
+		combined strings.Builder
+		firstErr error
+		okPages  int
+	)
+	for _, p := range pages {
+		// Once the deadline has passed or the caller cancelled, every
+		// further tesseract run would fail immediately; stop spawning them.
+		if ctxErr := cctx.Err(); ctxErr != nil {
+			if firstErr == nil {
+				firstErr = ctxErr
+			}
+			break
+		}
+		text, err := tesseractFile(cctx, p, langs)
+		if err != nil {
+			if firstErr == nil {
+				firstErr = err
+			}
+			continue
+		}
+		okPages++
+		combined.WriteString(text)
+		combined.WriteString("\n")
+	}
+
+	if okPages == 0 {
+		// pages is non-empty here, so firstErr is set.
+		return "", fmt.Errorf("ocr: no page could be recognised (%d rendered): %w", len(pages), firstErr)
+	}
+	return combined.String(), nil
+}
+
+// ── image extraction ──────────────────────────────────────────────────────
+
+// extractImage writes data to a private temp file and runs tesseract on it.
+// It returns ErrUnavailable when tesseract is not on PATH. The run is bounded
+// by the caller's deadline or, when there is none, by DefaultTimeout.
+//
+// filename comes from the mail and is untrusted; only a sanitised extension
+// of it ends up in the temp file name (see safeImageExt).
+func extractImage(ctx context.Context, data []byte, filename string, langs []string) (string, error) {
+	if _, err := exec.LookPath("tesseract"); err != nil {
+		return "", ErrUnavailable
+	}
+	tmp, err := os.MkdirTemp("", "archivmail-ocr-*")
+	if err != nil {
+		return "", fmt.Errorf("ocr: tempdir: %w", err)
+	}
+	defer os.RemoveAll(tmp)
+
+	in := filepath.Join(tmp, "img"+safeImageExt(filename))
+	if err := os.WriteFile(in, data, 0o600); err != nil {
+		return "", fmt.Errorf("ocr: write image: %w", err)
+	}
+	cctx, cancel := withDefaultTimeout(ctx)
+	defer cancel()
+	return tesseractFile(cctx, in, langs)
+}
+
+// safeImageExt returns the extension of filename when it is a dot followed by
+// one to seven ASCII letters or digits, and ".bin" otherwise. This keeps
+// overlong or odd extensions (NUL bytes, control characters, ...) from
+// attacker-chosen attachment names out of the temp file path, where they
+// would make the write fail for an otherwise valid image.
+func safeImageExt(filename string) string {
+	ext := filepath.Ext(filename)
+	if len(ext) < 2 || len(ext) > 8 {
+		return ".bin"
+	}
+	for _, r := range ext[1:] {
+		alnum := (r >= 'a' && r <= 'z') || (r >= 'A' && r <= 'Z') || (r >= '0' && r <= '9')
+		if !alnum {
+			return ".bin"
+		}
+	}
+	return ext
+}
+
+// tesseractFile runs tesseract on the image at path, writing the recognised
+// text to stdout, and returns that text. langs selects the languages (see
+// joinLangs); page segmentation mode 3 and preserve_interword_spaces=1 are
+// always passed. On failure the error wraps the exec error and carries the
+// trimmed stderr output.
+func tesseractFile(ctx context.Context, path string, langs []string) (string, error) {
+	argv := make([]string, 0, 8)
+	argv = append(argv, path, "stdout")
+	if spec := joinLangs(langs); spec != "" {
+		argv = append(argv, "-l", spec)
+	}
+	argv = append(argv, "--psm", "3", "-c", "preserve_interword_spaces=1")
+
+	var stdout, stderr bytes.Buffer
+	cmd := exec.CommandContext(ctx, "tesseract", argv...)
+	cmd.Stdout, cmd.Stderr = &stdout, &stderr
+
+	runErr := cmd.Run()
+	if runErr != nil {
+		return "", fmt.Errorf("ocr: tesseract: %w (%s)", runErr, strings.TrimSpace(stderr.String()))
+	}
+	return stdout.String(), nil
+}
+
+// joinLangs turns a language list into Tesseract's "-l" syntax by joining the
+// entries with '+'. A nil or empty list yields the default "deu+eng".
+func joinLangs(langs []string) string {
+	if len(langs) > 0 {
+		return strings.Join(langs, "+")
+	}
+	return "deu+eng"
+}
+
+// withDefaultTimeout makes sure the returned context has a deadline. A context
+// that already has one is passed through unchanged together with a no-op
+// cancel function; otherwise a child context limited to DefaultTimeout is
+// created. Callers should always invoke the returned cancel function.
+func withDefaultTimeout(ctx context.Context) (context.Context, context.CancelFunc) {
+	_, hasDeadline := ctx.Deadline()
+	if !hasDeadline {
+		return context.WithTimeout(ctx, DefaultTimeout)
+	}
+	noop := func() {}
+	return ctx, noop
+}
@@ -0,0 +1,205 @@
+package ocr
+
+import (
+	"context"
+	"errors"
+	"log/slog"
+	"strings"
+	"sync"
+
+	"archivmail/internal/index"
+	"archivmail/internal/storage"
+	"archivmail/pkg/mailparser"
+)
+
+// Job describes one OCR work unit: extract text from all attachments of the
+// stored mail identified by MailID and feed the result into the index selected
+// by TenantID. TenantID is passed through unchanged to the store and the
+// indexer; a nil value presumably means "no tenant" (TODO confirm against
+// storage.Store and index.TenantIndexer).
+type Job struct {
+	MailID   string
+	TenantID *int64
+}
+
+// Worker processes OCR jobs from a buffered queue with a fixed number of
+// background goroutines.
+//
+// Lifecycle: NewWorker → Start(ctx) → Submit(...) (n times) → Stop().
+// Submit never blocks; when the queue is full the job is dropped.
+type Worker struct {
+	// store loads the raw mails and records the per-mail OCR status.
+	store *storage.Store
+	// idxMgr hands out the per-tenant index that receives the extracted text.
+	idxMgr index.TenantIndexer
+	logger *slog.Logger
+	// queue buffers submitted jobs until a goroutine picks them up.
+	queue chan Job
+	// done is closed by Stop to tell the goroutines to drain and exit.
+	done chan struct{}
+	// wg tracks the goroutines launched by Start.
+	wg sync.WaitGroup
+	// workers is the number of goroutines Start launches.
+	workers int
+	// langs is the Tesseract language list passed to ExtractText.
+	langs []string
+}
+
+// Options configures a Worker. NewWorker replaces zero (or negative) values
+// with the defaults noted per field; a nil Logger becomes slog.Default().
+type Options struct {
+	QueueSize int      // capacity of the job queue; default 1000
+	Workers   int      // number of worker goroutines; default 2
+	Langs     []string // Tesseract languages; default ["deu", "eng"]
+	Logger    *slog.Logger
+}
+
+// NewWorker builds a Worker that loads mails from store, OCRs their supported
+// attachments and writes the combined text to the per-tenant index obtained
+// from idxMgr. store and idxMgr must be non-nil. Unset fields of opts fall
+// back to their defaults: queue size 1000, 2 workers, languages deu+eng and
+// slog.Default() as logger. No goroutine runs until Start is called.
+func NewWorker(store *storage.Store, idxMgr index.TenantIndexer, opts Options) *Worker {
+	queueSize := opts.QueueSize
+	if queueSize <= 0 {
+		queueSize = 1000
+	}
+
+	workers := opts.Workers
+	if workers <= 0 {
+		workers = 2
+	}
+
+	langs := opts.Langs
+	if len(langs) == 0 {
+		langs = []string{"deu", "eng"}
+	}
+
+	logger := opts.Logger
+	if logger == nil {
+		logger = slog.Default()
+	}
+
+	w := &Worker{
+		store:   store,
+		idxMgr:  idxMgr,
+		logger:  logger,
+		workers: workers,
+		langs:   langs,
+	}
+	w.queue = make(chan Job, queueSize)
+	w.done = make(chan struct{})
+	return w
+}
+
+// Submit queues an OCR job for mailID without ever blocking the caller (mail
+// intake). An empty mailID is ignored. When the queue is full the job is
+// dropped and a warning is logged.
+func (w *Worker) Submit(mailID string, tenantID *int64) {
+	if mailID == "" {
+		return
+	}
+
+	job := Job{MailID: mailID, TenantID: tenantID}
+	select {
+	case w.queue <- job:
+		return
+	default:
+		// Queue is full; fall through to the warning.
+	}
+	w.logger.Warn("ocr worker: queue full, dropping job", "mail_id", mailID)
+}
+
+// QueueLen returns the number of jobs currently buffered in the queue, i.e.
+// submitted but not yet picked up by a worker goroutine.
+func (w *Worker) QueueLen() int { return len(w.queue) }
+
+// Start spawns w.workers goroutines that consume the queue until Stop is
+// called or ctx is cancelled. A missing OCR tool chain only produces a
+// warning; the goroutines are started regardless.
+func (w *Worker) Start(ctx context.Context) {
+	if available := IsAvailable(); !available {
+		w.logger.Warn("ocr worker: tesseract/pdftotext not on PATH — OCR disabled at runtime")
+	}
+
+	for id := 0; id < w.workers; id++ {
+		w.wg.Add(1)
+		go w.run(ctx, id)
+	}
+
+	w.logger.Info("ocr worker: started", "workers", w.workers, "queue", cap(w.queue))
+}
+
+// Stop signals shutdown by closing the done channel and blocks until every
+// worker goroutine has returned. On that signal a goroutine first works off
+// the jobs still buffered in the queue; goroutines that exit because their
+// context was cancelled do not drain.
+//
+// Stop must be called at most once: a second call would close the already
+// closed channel and panic.
+func (w *Worker) Stop() {
+	close(w.done)
+	w.wg.Wait()
+	w.logger.Info("ocr worker: stopped")
+}
+
+// run is the loop of one worker goroutine. It processes jobs from the queue
+// until one of three things happens: ctx is cancelled (exit immediately), the
+// done channel is closed (work off whatever is still buffered, then exit), or
+// the queue channel is closed (exit). id is the goroutine's index and is
+// currently unused.
+func (w *Worker) run(ctx context.Context, id int) {
+	defer w.wg.Done()
+
+	for {
+		select {
+		case <-ctx.Done():
+			return
+
+		case <-w.done:
+			// Shutdown requested: empty the buffer without blocking so that
+			// already-submitted jobs are not lost, then leave.
+			for {
+				select {
+				case pending, open := <-w.queue:
+					if !open {
+						return
+					}
+					w.process(ctx, pending)
+					continue
+				default:
+				}
+				return
+			}
+
+		case next, open := <-w.queue:
+			if !open {
+				return
+			}
+			w.process(ctx, next)
+		}
+	}
+}
+
+// process handles one job end to end: load and parse the mail, run
+// ExtractText on every attachment, push the combined text into the tenant's
+// index and record the outcome as the mail's OCR status.
+//
+// Status written:
+//   - "disabled": OCR is switched off for the tenant
+//   - "failed":   load, parse or index update failed, the indexer cannot take
+//     attachment text, or no text was obtained and at least one attachment
+//     failed with an unexpected extraction error
+//   - "skipped":  the mail has no attachments, or none yielded text and every
+//     miss was an expected one (unsupported, encrypted, too large, tool
+//     missing, or simply empty)
+//   - "done":     text was indexed
+//
+// When ctx is cancelled during extraction the status is left untouched so the
+// job can be picked up again later. A failure to write the status is logged,
+// not returned.
+func (w *Worker) process(ctx context.Context, job Job) {
+	// Log the tenant ID by value; passing the *int64 itself would make text
+	// log handlers print a pointer address.
+	var tenant any
+	if job.TenantID != nil {
+		tenant = *job.TenantID
+	}
+	logger := w.logger.With("mail_id", job.MailID, "tenant_id", tenant)
+
+	// setStatus records the outcome; a write failure must not abort the
+	// worker but should be visible in the logs.
+	setStatus := func(status string) {
+		if err := w.store.SetOCRStatus(ctx, job.MailID, status); err != nil {
+			logger.Warn("ocr worker: set status failed", "status", status, "err", err)
+		}
+	}
+
+	if !w.store.OCREnabled(ctx, job.TenantID) {
+		setStatus("disabled")
+		return
+	}
+
+	raw, err := w.store.Load(job.MailID)
+	if err != nil {
+		logger.Warn("ocr worker: load failed", "err", err)
+		setStatus("failed")
+		return
+	}
+
+	pm, err := mailparser.Parse(raw)
+	if err != nil {
+		logger.Warn("ocr worker: parse failed", "err", err)
+		setStatus("failed")
+		return
+	}
+
+	if len(pm.Attachments) == 0 {
+		setStatus("skipped")
+		return
+	}
+
+	var combined strings.Builder
+	processed, failed := 0, 0
+	for _, a := range pm.Attachments {
+		text, err := ExtractText(ctx, a.Data, a.ContentType, a.Filename, w.langs)
+		if err != nil {
+			if ctx.Err() != nil {
+				// Shutdown or cancellation, not a property of the mail:
+				// do not record a misleading final status.
+				logger.Info("ocr worker: cancelled, status left unchanged", "err", ctx.Err())
+				return
+			}
+			if errors.Is(err, ErrUnsupported) || errors.Is(err, ErrEncrypted) ||
+				errors.Is(err, ErrTooLarge) || errors.Is(err, ErrUnavailable) {
+				logger.Debug("ocr worker: attachment skipped",
+					"filename", a.Filename, "reason", err)
+				continue
+			}
+			failed++
+			logger.Warn("ocr worker: extract failed",
+				"filename", a.Filename, "err", err)
+			continue
+		}
+		if t := strings.TrimSpace(text); t != "" {
+			combined.WriteString(t)
+			combined.WriteString("\n\n")
+			processed++
+		}
+	}
+
+	if processed == 0 {
+		// Distinguish "nothing to extract" from "extraction broke" so failed
+		// mails can be found and reprocessed.
+		if failed > 0 {
+			setStatus("failed")
+		} else {
+			setStatus("skipped")
+		}
+		return
+	}
+
+	idx := w.idxMgr.ForTenant(job.TenantID)
+	updater, ok := idx.(index.AttachmentTextUpdater)
+	if !ok {
+		logger.Warn("ocr worker: indexer does not support AttachmentTextUpdater — text dropped")
+		setStatus("failed")
+		return
+	}
+	if err := updater.UpdateAttachmentText(job.MailID, combined.String()); err != nil {
+		logger.Warn("ocr worker: index update failed", "err", err)
+		setStatus("failed")
+		return
+	}
+
+	setStatus("done")
+	logger.Info("ocr worker: indexed", "attachments", processed, "chars", combined.Len())
+}