feat(PROJ-35): OCR & Anhang-Volltext-Indexierung
Asynchrone OCR fuer PDF- und Bild-Anhaenge via tesseract + poppler-utils. Extrahierter Text wird in Manticore (attachment_text) gespeichert und ist ueber die normale Volltextsuche auffindbar.

- internal/ocr: ExtractText + Worker (queue + drain)
- internal/storage/ocr.go: SetOCRStatus, OCREnabled, GetMailsByOCRStatus
- emails.ocr_status (pending|done|failed|skipped|disabled)
- tenants.ocr_enabled (Default TRUE, opt-out)
- Manticore: attachment_text-Feld + UpdateAttachmentText
- Boot-resume: pending Jobs nach Restart automatisch in die Queue
- CLI: archivmail ocr-reprocess --tenant N --status pending|failed|all
- update.sh: tesseract-ocr + poppler-utils optional installieren
This commit is contained in:
@@ -0,0 +1,265 @@
|
||||
// Package ocr extracts plain text from email attachments (PDFs and images)
|
||||
// using locally installed tools — pdftotext (poppler-utils) and tesseract.
|
||||
//
|
||||
// PROJ-35: All extraction happens on the host. No external services.
|
||||
package ocr
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
const (
	// MaxAttachmentSize is the largest attachment, in bytes, that will be
	// processed (50 MiB per spec); bigger files are skipped.
	MaxAttachmentSize = 50 << 20

	// DefaultTimeout is the time limit applied to a single OCR run.
	DefaultTimeout = time.Minute

	// pdfMinTextLen is the threshold for pdftotext output: a PDF yielding
	// less text than this is treated as a scan and sent through OCR instead.
	pdfMinTextLen = 32
)
|
||||
|
||||
// Sentinel errors returned by ExtractText. Compare with errors.Is.
var (
	// ErrUnsupported means the attachment is neither a PDF nor a recognised image.
	ErrUnsupported = errors.New("ocr: unsupported attachment type")

	// ErrTooLarge means the attachment is bigger than MaxAttachmentSize.
	ErrTooLarge = errors.New("ocr: attachment exceeds size limit")

	// ErrEncrypted means the PDF is password protected and cannot be read.
	ErrEncrypted = errors.New("ocr: attachment is password protected")

	// ErrUnavailable means a required external tool was not found on PATH.
	ErrUnavailable = errors.New("ocr: tesseract or pdftotext not available")
)
|
||||
|
||||
// IsAvailable reports whether the host can do any extraction at all, i.e.
// whether pdftotext or tesseract (or both) can be found on PATH.
// pdftotext by itself covers PDFs that carry a text layer; tesseract is what
// makes images and scanned PDFs readable.
func IsAvailable() bool {
	for _, tool := range []string{"pdftotext", "tesseract"} {
		if _, err := exec.LookPath(tool); err == nil {
			return true
		}
	}
	return false
}
|
||||
|
||||
// ToolStatus records, for each helper binary, whether it was found on the host.
type ToolStatus struct {
	HasPdftotext bool // pdftotext was found on PATH
	HasTesseract bool // tesseract was found on PATH
	HasPdftoppm  bool // pdftoppm was found on PATH
}

// CheckTools looks up each supporting binary on PATH and returns the result
// as a ToolStatus.
func CheckTools() ToolStatus {
	onPath := func(name string) bool {
		_, err := exec.LookPath(name)
		return err == nil
	}

	var st ToolStatus
	st.HasPdftotext = onPath("pdftotext")
	st.HasTesseract = onPath("tesseract")
	st.HasPdftoppm = onPath("pdftoppm")
	return st
}
|
||||
|
||||
// ExtractText extracts plain text from a single attachment.
|
||||
// contentType is the MIME type (lowercased), filename is used to derive an
|
||||
// extension when contentType is missing or generic. langs is the Tesseract
|
||||
// language list (e.g. ["deu","eng"]).
|
||||
//
|
||||
// Returns:
|
||||
// - extracted text (may be empty when nothing was found)
|
||||
// - ErrUnsupported when the format cannot be processed
|
||||
// - ErrTooLarge when data exceeds MaxAttachmentSize
|
||||
// - ErrEncrypted when the PDF is password-protected
|
||||
// - other errors from the underlying tool
|
||||
func ExtractText(ctx context.Context, data []byte, contentType, filename string, langs []string) (string, error) {
|
||||
if len(data) == 0 {
|
||||
return "", nil
|
||||
}
|
||||
if len(data) > MaxAttachmentSize {
|
||||
return "", ErrTooLarge
|
||||
}
|
||||
|
||||
kind := classify(contentType, filename)
|
||||
switch kind {
|
||||
case kindPDF:
|
||||
return extractPDF(ctx, data, langs)
|
||||
case kindImage:
|
||||
return extractImage(ctx, data, filename, langs)
|
||||
default:
|
||||
return "", ErrUnsupported
|
||||
}
|
||||
}
|
||||
|
||||
// ── classification ────────────────────────────────────────────────────────
|
||||
|
||||
// fileKind is the coarse attachment category that selects the extraction path.
type fileKind int

const (
	kindUnknown fileKind = iota // not a type this package handles
	kindPDF                     // PDF document
	kindImage                   // raster image
)

// classify decides how an attachment should be handled. The MIME type wins
// when it is a known PDF or image type; otherwise the decision falls back to
// the filename extension. Both are matched case-insensitively.
func classify(contentType, filename string) fileKind {
	if kind := kindFromMIME(contentType); kind != kindUnknown {
		return kind
	}
	return kindFromExt(filename)
}

// kindFromMIME maps a Content-Type value to a fileKind, ignoring case,
// surrounding whitespace and any parameters after the first ";".
func kindFromMIME(contentType string) fileKind {
	mediaType, _, _ := strings.Cut(strings.ToLower(strings.TrimSpace(contentType)), ";")
	switch strings.TrimSpace(mediaType) {
	case "application/pdf", "application/x-pdf":
		return kindPDF
	case "image/jpeg", "image/jpg", "image/png",
		"image/tiff", "image/x-tiff",
		"image/bmp", "image/x-bmp",
		"image/webp":
		return kindImage
	default:
		return kindUnknown
	}
}

// kindFromExt maps a filename's extension to a fileKind.
func kindFromExt(filename string) fileKind {
	switch strings.ToLower(filepath.Ext(filename)) {
	case ".pdf":
		return kindPDF
	case ".jpg", ".jpeg", ".png", ".tif", ".tiff", ".bmp", ".webp":
		return kindImage
	default:
		return kindUnknown
	}
}
|
||||
|
||||
// ── PDF extraction (pdftotext → tesseract fallback) ───────────────────────
|
||||
|
||||
func extractPDF(ctx context.Context, data []byte, langs []string) (string, error) {
|
||||
tmp, err := os.MkdirTemp("", "archivmail-ocr-*")
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("ocr: tempdir: %w", err)
|
||||
}
|
||||
defer os.RemoveAll(tmp)
|
||||
|
||||
pdfPath := filepath.Join(tmp, "in.pdf")
|
||||
if err := os.WriteFile(pdfPath, data, 0o600); err != nil {
|
||||
return "", fmt.Errorf("ocr: write pdf: %w", err)
|
||||
}
|
||||
|
||||
if _, err := exec.LookPath("pdftotext"); err == nil {
|
||||
text, ptErr := runPdftotext(ctx, pdfPath)
|
||||
if ptErr != nil {
|
||||
if errors.Is(ptErr, ErrEncrypted) {
|
||||
return "", ErrEncrypted
|
||||
}
|
||||
// Fall through to OCR fallback on other errors.
|
||||
}
|
||||
if len(strings.TrimSpace(text)) >= pdfMinTextLen {
|
||||
return text, nil
|
||||
}
|
||||
}
|
||||
|
||||
if _, err := exec.LookPath("tesseract"); err != nil {
|
||||
return "", ErrUnavailable
|
||||
}
|
||||
if _, err := exec.LookPath("pdftoppm"); err != nil {
|
||||
return "", ErrUnavailable
|
||||
}
|
||||
|
||||
return ocrPDFViaImages(ctx, tmp, pdfPath, langs)
|
||||
}
|
||||
|
||||
func runPdftotext(ctx context.Context, pdfPath string) (string, error) {
|
||||
cctx, cancel := withDefaultTimeout(ctx)
|
||||
defer cancel()
|
||||
|
||||
var out, errBuf bytes.Buffer
|
||||
cmd := exec.CommandContext(cctx, "pdftotext", "-layout", "-q", pdfPath, "-")
|
||||
cmd.Stdout = &out
|
||||
cmd.Stderr = &errBuf
|
||||
if err := cmd.Run(); err != nil {
|
||||
stderr := errBuf.String()
|
||||
if strings.Contains(strings.ToLower(stderr), "incorrect password") ||
|
||||
strings.Contains(strings.ToLower(stderr), "encrypted") {
|
||||
return "", ErrEncrypted
|
||||
}
|
||||
return "", fmt.Errorf("ocr: pdftotext: %w (%s)", err, strings.TrimSpace(stderr))
|
||||
}
|
||||
return out.String(), nil
|
||||
}
|
||||
|
||||
func ocrPDFViaImages(ctx context.Context, dir, pdfPath string, langs []string) (string, error) {
|
||||
cctx, cancel := withDefaultTimeout(ctx)
|
||||
defer cancel()
|
||||
|
||||
prefix := filepath.Join(dir, "page")
|
||||
cmd := exec.CommandContext(cctx, "pdftoppm", "-r", "200", "-png", pdfPath, prefix)
|
||||
if err := cmd.Run(); err != nil {
|
||||
return "", fmt.Errorf("ocr: pdftoppm: %w", err)
|
||||
}
|
||||
|
||||
pages, err := filepath.Glob(prefix + "-*.png")
|
||||
if err != nil || len(pages) == 0 {
|
||||
return "", fmt.Errorf("ocr: pdftoppm produced no pages")
|
||||
}
|
||||
|
||||
var combined strings.Builder
|
||||
for _, p := range pages {
|
||||
text, err := tesseractFile(cctx, p, langs)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
combined.WriteString(text)
|
||||
combined.WriteString("\n")
|
||||
}
|
||||
return combined.String(), nil
|
||||
}
|
||||
|
||||
// ── image extraction ──────────────────────────────────────────────────────
|
||||
|
||||
func extractImage(ctx context.Context, data []byte, filename string, langs []string) (string, error) {
|
||||
if _, err := exec.LookPath("tesseract"); err != nil {
|
||||
return "", ErrUnavailable
|
||||
}
|
||||
tmp, err := os.MkdirTemp("", "archivmail-ocr-*")
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("ocr: tempdir: %w", err)
|
||||
}
|
||||
defer os.RemoveAll(tmp)
|
||||
|
||||
ext := filepath.Ext(filename)
|
||||
if ext == "" {
|
||||
ext = ".bin"
|
||||
}
|
||||
in := filepath.Join(tmp, "img"+ext)
|
||||
if err := os.WriteFile(in, data, 0o600); err != nil {
|
||||
return "", fmt.Errorf("ocr: write image: %w", err)
|
||||
}
|
||||
cctx, cancel := withDefaultTimeout(ctx)
|
||||
defer cancel()
|
||||
return tesseractFile(cctx, in, langs)
|
||||
}
|
||||
|
||||
func tesseractFile(ctx context.Context, path string, langs []string) (string, error) {
|
||||
args := []string{path, "stdout"}
|
||||
if l := joinLangs(langs); l != "" {
|
||||
args = append(args, "-l", l)
|
||||
}
|
||||
args = append(args, "--psm", "3", "-c", "preserve_interword_spaces=1")
|
||||
|
||||
var out, errBuf bytes.Buffer
|
||||
cmd := exec.CommandContext(ctx, "tesseract", args...)
|
||||
cmd.Stdout = &out
|
||||
cmd.Stderr = &errBuf
|
||||
if err := cmd.Run(); err != nil {
|
||||
return "", fmt.Errorf("ocr: tesseract: %w (%s)", err, strings.TrimSpace(errBuf.String()))
|
||||
}
|
||||
return out.String(), nil
|
||||
}
|
||||
|
||||
// joinLangs builds the value for tesseract's -l option by joining the
// language codes with "+".
//
// Entries are trimmed, and empty or whitespace-only entries are dropped, so a
// sloppy list such as ["deu", "", "eng"] cannot produce a malformed spec like
// "deu++eng". When no usable entry remains, the default "deu+eng" is returned.
func joinLangs(langs []string) string {
	cleaned := make([]string, 0, len(langs))
	for _, l := range langs {
		if l = strings.TrimSpace(l); l != "" {
			cleaned = append(cleaned, l)
		}
	}
	if len(cleaned) == 0 {
		return "deu+eng"
	}
	return strings.Join(cleaned, "+")
}
|
||||
|
||||
func withDefaultTimeout(ctx context.Context) (context.Context, context.CancelFunc) {
|
||||
if _, ok := ctx.Deadline(); ok {
|
||||
return ctx, func() {}
|
||||
}
|
||||
return context.WithTimeout(ctx, DefaultTimeout)
|
||||
}
|
||||
@@ -0,0 +1,205 @@
|
||||
package ocr
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"log/slog"
|
||||
"strings"
|
||||
"sync"
|
||||
|
||||
"archivmail/internal/index"
|
||||
"archivmail/internal/storage"
|
||||
"archivmail/pkg/mailparser"
|
||||
)
|
||||
|
||||
// Job is a single unit of OCR work: run text extraction over every attachment
// of one stored mail and write the result into that tenant's index.
type Job struct {
	// MailID identifies the stored mail to process.
	MailID string
	// TenantID is the tenant the mail belongs to; it is passed on unchanged
	// to the store and index lookups.
	TenantID *int64
}
|
||||
|
||||
// Worker runs OCR jobs on a buffered channel using N background goroutines.
|
||||
//
|
||||
// Lifecycle: NewWorker → Start(ctx) → Submit(...) (n times) → Stop().
|
||||
// Submit is non-blocking; jobs are dropped when the queue is full.
|
||||
type Worker struct {
|
||||
store *storage.Store
|
||||
idxMgr index.TenantIndexer
|
||||
logger *slog.Logger
|
||||
queue chan Job
|
||||
done chan struct{}
|
||||
wg sync.WaitGroup
|
||||
workers int
|
||||
langs []string
|
||||
}
|
||||
|
||||
// Options holds the tunables for NewWorker. Any field left at its zero value
// is replaced with the default noted on that field.
type Options struct {
	// QueueSize is the capacity of the job queue (default 1000).
	QueueSize int
	// Workers is the number of processing goroutines (default 2).
	Workers int
	// Langs is the list of Tesseract language codes (default ["deu", "eng"]).
	Langs []string
	// Logger receives the worker's log output (default slog.Default()).
	Logger *slog.Logger
}
|
||||
|
||||
// NewWorker constructs a worker that reads mails from store, runs OCR on
|
||||
// supported attachments, and pushes the combined text into the per-tenant
|
||||
// Manticore index via idxMgr. The store and idxMgr must be non-nil.
|
||||
func NewWorker(store *storage.Store, idxMgr index.TenantIndexer, opts Options) *Worker {
|
||||
if opts.QueueSize <= 0 {
|
||||
opts.QueueSize = 1000
|
||||
}
|
||||
if opts.Workers <= 0 {
|
||||
opts.Workers = 2
|
||||
}
|
||||
if len(opts.Langs) == 0 {
|
||||
opts.Langs = []string{"deu", "eng"}
|
||||
}
|
||||
if opts.Logger == nil {
|
||||
opts.Logger = slog.Default()
|
||||
}
|
||||
return &Worker{
|
||||
store: store,
|
||||
idxMgr: idxMgr,
|
||||
logger: opts.Logger,
|
||||
queue: make(chan Job, opts.QueueSize),
|
||||
done: make(chan struct{}),
|
||||
workers: opts.Workers,
|
||||
langs: opts.Langs,
|
||||
}
|
||||
}
|
||||
|
||||
// Submit enqueues a job. Drops with a warning if the queue is full so the
|
||||
// caller (mail intake) is never blocked.
|
||||
func (w *Worker) Submit(mailID string, tenantID *int64) {
|
||||
if mailID == "" {
|
||||
return
|
||||
}
|
||||
select {
|
||||
case w.queue <- Job{MailID: mailID, TenantID: tenantID}:
|
||||
default:
|
||||
w.logger.Warn("ocr worker: queue full, dropping job", "mail_id", mailID)
|
||||
}
|
||||
}
|
||||
|
||||
// QueueLen returns the current number of pending jobs.
|
||||
func (w *Worker) QueueLen() int { return len(w.queue) }
|
||||
|
||||
// Start launches w.workers goroutines that consume the queue until Stop is
|
||||
// called or ctx is cancelled.
|
||||
func (w *Worker) Start(ctx context.Context) {
|
||||
if !IsAvailable() {
|
||||
w.logger.Warn("ocr worker: tesseract/pdftotext not on PATH — OCR disabled at runtime")
|
||||
}
|
||||
for i := 0; i < w.workers; i++ {
|
||||
w.wg.Add(1)
|
||||
go w.run(ctx, i)
|
||||
}
|
||||
w.logger.Info("ocr worker: started", "workers", w.workers, "queue", cap(w.queue))
|
||||
}
|
||||
|
||||
// Stop drains the remaining queue and waits for all goroutines to exit.
|
||||
func (w *Worker) Stop() {
|
||||
close(w.done)
|
||||
w.wg.Wait()
|
||||
w.logger.Info("ocr worker: stopped")
|
||||
}
|
||||
|
||||
func (w *Worker) run(ctx context.Context, id int) {
|
||||
defer w.wg.Done()
|
||||
for {
|
||||
select {
|
||||
case job, ok := <-w.queue:
|
||||
if !ok {
|
||||
return
|
||||
}
|
||||
w.process(ctx, job)
|
||||
case <-w.done:
|
||||
// Drain remaining jobs so nothing in-flight is lost.
|
||||
for {
|
||||
select {
|
||||
case job, ok := <-w.queue:
|
||||
if !ok {
|
||||
return
|
||||
}
|
||||
w.process(ctx, job)
|
||||
default:
|
||||
return
|
||||
}
|
||||
}
|
||||
case <-ctx.Done():
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (w *Worker) process(ctx context.Context, job Job) {
|
||||
logger := w.logger.With("mail_id", job.MailID, "tenant_id", job.TenantID)
|
||||
|
||||
if w.store.OCREnabled(ctx, job.TenantID) == false {
|
||||
_ = w.store.SetOCRStatus(ctx, job.MailID, "disabled")
|
||||
return
|
||||
}
|
||||
|
||||
raw, err := w.store.Load(job.MailID)
|
||||
if err != nil {
|
||||
logger.Warn("ocr worker: load failed", "err", err)
|
||||
_ = w.store.SetOCRStatus(ctx, job.MailID, "failed")
|
||||
return
|
||||
}
|
||||
|
||||
pm, err := mailparser.Parse(raw)
|
||||
if err != nil {
|
||||
logger.Warn("ocr worker: parse failed", "err", err)
|
||||
_ = w.store.SetOCRStatus(ctx, job.MailID, "failed")
|
||||
return
|
||||
}
|
||||
|
||||
if len(pm.Attachments) == 0 {
|
||||
_ = w.store.SetOCRStatus(ctx, job.MailID, "skipped")
|
||||
return
|
||||
}
|
||||
|
||||
var combined strings.Builder
|
||||
processed := 0
|
||||
for _, a := range pm.Attachments {
|
||||
text, err := ExtractText(ctx, a.Data, a.ContentType, a.Filename, w.langs)
|
||||
if err != nil {
|
||||
if errors.Is(err, ErrUnsupported) || errors.Is(err, ErrEncrypted) ||
|
||||
errors.Is(err, ErrTooLarge) || errors.Is(err, ErrUnavailable) {
|
||||
logger.Debug("ocr worker: attachment skipped",
|
||||
"filename", a.Filename, "reason", err)
|
||||
continue
|
||||
}
|
||||
logger.Warn("ocr worker: extract failed",
|
||||
"filename", a.Filename, "err", err)
|
||||
continue
|
||||
}
|
||||
if t := strings.TrimSpace(text); t != "" {
|
||||
combined.WriteString(t)
|
||||
combined.WriteString("\n\n")
|
||||
processed++
|
||||
}
|
||||
}
|
||||
|
||||
if processed == 0 {
|
||||
_ = w.store.SetOCRStatus(ctx, job.MailID, "skipped")
|
||||
return
|
||||
}
|
||||
|
||||
idx := w.idxMgr.ForTenant(job.TenantID)
|
||||
updater, ok := idx.(index.AttachmentTextUpdater)
|
||||
if !ok {
|
||||
logger.Warn("ocr worker: indexer does not support AttachmentTextUpdater — text dropped")
|
||||
_ = w.store.SetOCRStatus(ctx, job.MailID, "failed")
|
||||
return
|
||||
}
|
||||
if err := updater.UpdateAttachmentText(job.MailID, combined.String()); err != nil {
|
||||
logger.Warn("ocr worker: index update failed", "err", err)
|
||||
_ = w.store.SetOCRStatus(ctx, job.MailID, "failed")
|
||||
return
|
||||
}
|
||||
|
||||
_ = w.store.SetOCRStatus(ctx, job.MailID, "done")
|
||||
logger.Info("ocr worker: indexed", "attachments", processed, "chars", combined.Len())
|
||||
}
|
||||
Reference in New Issue
Block a user