feat(PROJ-35): OCR & Anhang-Volltext-Indexierung

Asynchrone OCR fuer PDF- und Bild-Anhaenge via tesseract + poppler-utils.
Extrahierter Text wird in Manticore (attachment_text) gespeichert und ist
ueber die normale Volltextsuche auffindbar.

- internal/ocr: ExtractText + Worker (queue + drain)
- internal/storage/ocr.go: SetOCRStatus, OCREnabled, GetMailsByOCRStatus
- emails.ocr_status (pending|done|failed|skipped|disabled)
- tenants.ocr_enabled (Default TRUE, opt-out)
- Manticore: attachment_text-Feld + UpdateAttachmentText
- Boot-resume: pending Jobs nach Restart automatisch in die Queue
- CLI: archivmail ocr-reprocess --tenant N --status pending|failed|all
- update.sh: tesseract-ocr + poppler-utils optional installieren
This commit is contained in:
sysops
2026-05-08 22:11:17 +02:00
parent 2a91f6e249
commit 0bda21033e
11 changed files with 926 additions and 25 deletions
+265
View File
@@ -0,0 +1,265 @@
// Package ocr extracts plain text from email attachments (PDFs and images)
// using locally installed tools — pdftotext (poppler-utils) and tesseract.
//
// PROJ-35: All extraction happens on the host. No external services.
package ocr
import (
"bytes"
"context"
"errors"
"fmt"
"os"
"os/exec"
"path/filepath"
"strings"
"time"
)
// Limits and thresholds used by the extraction pipeline.
const (
	// MaxAttachmentSize is the largest attachment, in bytes, that ExtractText
	// accepts (50 MiB per spec); bigger input is rejected with ErrTooLarge.
	MaxAttachmentSize = 50 << 20

	// DefaultTimeout bounds a tool run whenever the caller's context carries
	// no deadline of its own.
	DefaultTimeout = time.Minute

	// pdfMinTextLen is the minimum length of whitespace-trimmed pdftotext
	// output for a PDF to count as native text; anything shorter is treated
	// as a scan and handed to the OCR fallback.
	pdfMinTextLen = 32
)
// Sentinel errors returned by ExtractText and the helpers it delegates to.
// They are returned unwrapped, so callers can match them with errors.Is.
var (
// ErrUnsupported: the attachment is neither a recognised PDF nor a
// recognised image type.
ErrUnsupported = errors.New("ocr: unsupported attachment type")
// ErrTooLarge: the attachment is bigger than MaxAttachmentSize.
ErrTooLarge = errors.New("ocr: attachment exceeds size limit")
// ErrEncrypted: pdftotext reported that the PDF needs a password.
ErrEncrypted = errors.New("ocr: attachment is password protected")
// ErrUnavailable: a binary required for this attachment (tesseract, or
// pdftoppm for scanned PDFs) is not on PATH.
ErrUnavailable = errors.New("ocr: tesseract or pdftotext not available")
)
// IsAvailable reports whether the host can extract any text at all, i.e.
// whether pdftotext or tesseract can be found on PATH.
// pdftotext on its own covers PDFs with an embedded text layer; tesseract is
// what makes images and scanned PDFs readable.
func IsAvailable() bool {
	for _, tool := range []string{"pdftotext", "tesseract"} {
		if _, err := exec.LookPath(tool); err == nil {
			return true
		}
	}
	return false
}
// ToolStatus records, per supporting binary, whether it was found on PATH.
type ToolStatus struct {
	HasPdftotext bool // pdftotext: text layer of native PDFs
	HasTesseract bool // tesseract: OCR engine
	HasPdftoppm  bool // pdftoppm: renders PDF pages to images for OCR
}

// CheckTools looks up each supporting binary on PATH and returns the result
// as a ToolStatus.
func CheckTools() ToolStatus {
	onPath := func(name string) bool {
		_, err := exec.LookPath(name)
		return err == nil
	}

	var st ToolStatus
	st.HasPdftotext = onPath("pdftotext")
	st.HasTesseract = onPath("tesseract")
	st.HasPdftoppm = onPath("pdftoppm")
	return st
}
// ExtractText returns the plain text found in one attachment.
//
// contentType is the attachment's MIME type; filename supplies the extension
// that is consulted when the MIME type is not one of the recognised PDF or
// image types. langs lists the Tesseract languages to use (e.g.
// ["deu","eng"]).
//
// Results:
//   - the extracted text, which may be empty (always empty for empty data)
//   - ErrTooLarge when data is bigger than MaxAttachmentSize
//   - ErrUnsupported when the attachment is neither a PDF nor an image
//   - ErrEncrypted when the PDF is password-protected
//   - any other error reported by the underlying tool
func ExtractText(ctx context.Context, data []byte, contentType, filename string, langs []string) (string, error) {
	switch size := len(data); {
	case size == 0:
		// Nothing to extract; not an error.
		return "", nil
	case size > MaxAttachmentSize:
		return "", ErrTooLarge
	}

	switch classify(contentType, filename) {
	case kindPDF:
		return extractPDF(ctx, data, langs)
	case kindImage:
		return extractImage(ctx, data, filename, langs)
	}
	return "", ErrUnsupported
}
// ── classification ────────────────────────────────────────────────────────
// fileKind is the coarse attachment category that selects the extraction path.
type fileKind int

const (
	kindUnknown fileKind = iota // not a format this package handles
	kindPDF
	kindImage
)

// classify decides which extraction path an attachment takes.
// The MIME type wins when it is one of the known PDF or image types (matched
// case-insensitively, parameters after ';' ignored); otherwise the lowercased
// filename extension decides. Anything else is kindUnknown.
func classify(contentType, filename string) fileKind {
	mime := strings.ToLower(strings.TrimSpace(contentType))
	if semi := strings.IndexByte(mime, ';'); semi >= 0 {
		mime = strings.TrimSpace(mime[:semi])
	}
	if kind := kindFromMIME(mime); kind != kindUnknown {
		return kind
	}
	return kindFromExt(strings.ToLower(filepath.Ext(filename)))
}

// kindFromMIME maps a normalised (lowercase, parameter-free) MIME type to a
// fileKind, or kindUnknown when the type is not recognised.
func kindFromMIME(mime string) fileKind {
	switch mime {
	case "application/pdf", "application/x-pdf":
		return kindPDF
	case "image/jpeg", "image/jpg", "image/png",
		"image/tiff", "image/x-tiff",
		"image/bmp", "image/x-bmp",
		"image/webp":
		return kindImage
	default:
		return kindUnknown
	}
}

// kindFromExt maps a lowercase extension (including the leading dot) to a
// fileKind, or kindUnknown when the extension is not recognised.
func kindFromExt(ext string) fileKind {
	switch ext {
	case ".pdf":
		return kindPDF
	case ".jpg", ".jpeg", ".png", ".tif", ".tiff", ".bmp", ".webp":
		return kindImage
	default:
		return kindUnknown
	}
}
// ── PDF extraction (pdftotext → tesseract fallback) ───────────────────────
// extractPDF pulls text out of a PDF, preferring its embedded text layer and
// falling back to OCR of the rendered pages.
//
// data is written to a private temp directory that is removed on return.
// When pdftotext is installed it runs first; output of at least pdfMinTextLen
// trimmed bytes is returned as-is. Shorter output is taken as a sign of a
// scanned document and the pages are OCRed via pdftoppm + tesseract.
//
// Short but non-blank pdftotext output is kept as a fallback: it is returned
// when the OCR tools are not installed, or when OCR runs cleanly but finds no
// text. This keeps small native PDFs indexable on hosts that only have
// pdftotext instead of reporting them as ErrUnavailable.
//
// Returns ErrEncrypted for password-protected PDFs, ErrUnavailable when no
// installed tool produced any text, and otherwise whatever error the OCR
// fallback reports.
func extractPDF(ctx context.Context, data []byte, langs []string) (string, error) {
	tmp, err := os.MkdirTemp("", "archivmail-ocr-*")
	if err != nil {
		return "", fmt.Errorf("ocr: tempdir: %w", err)
	}
	defer os.RemoveAll(tmp)

	pdfPath := filepath.Join(tmp, "in.pdf")
	if err := os.WriteFile(pdfPath, data, 0o600); err != nil {
		return "", fmt.Errorf("ocr: write pdf: %w", err)
	}

	// nativeText holds pdftotext output that was too short to be trusted as
	// the full document text but may still be the best result available.
	var nativeText string
	if _, err := exec.LookPath("pdftotext"); err == nil {
		text, ptErr := runPdftotext(ctx, pdfPath)
		if errors.Is(ptErr, ErrEncrypted) {
			return "", ErrEncrypted
		}
		// Any other pdftotext failure leaves text empty and is deliberately
		// not fatal: the OCR fallback below gets its chance.
		if len(strings.TrimSpace(text)) >= pdfMinTextLen {
			return text, nil
		}
		nativeText = text
	}
	hasNative := strings.TrimSpace(nativeText) != ""

	// The OCR fallback needs both the page renderer and the OCR engine.
	_, tessErr := exec.LookPath("tesseract")
	_, ppmErr := exec.LookPath("pdftoppm")
	if tessErr != nil || ppmErr != nil {
		if hasNative {
			return nativeText, nil
		}
		return "", ErrUnavailable
	}

	ocrText, err := ocrPDFViaImages(ctx, tmp, pdfPath, langs)
	if err != nil {
		return "", err
	}
	if hasNative && strings.TrimSpace(ocrText) == "" {
		// OCR found nothing; the short text layer is better than no text.
		return nativeText, nil
	}
	return ocrText, nil
}
// runPdftotext runs `pdftotext -layout <pdfPath> -` and returns what the tool
// writes to stdout.
//
// The -q flag is deliberately not passed: it makes pdftotext suppress its
// error messages, and the stderr text is the only signal used here to
// recognise password-protected files. stderr is captured into a buffer and
// only inspected when the run fails, so the extra diagnostics never reach the
// process output.
//
// Returns ErrEncrypted when the run fails and stderr mentions an incorrect
// password or encryption; any other failure is wrapped together with the
// trimmed stderr text. The run is bounded by ctx's deadline, or by
// DefaultTimeout when ctx has none.
func runPdftotext(ctx context.Context, pdfPath string) (string, error) {
	cctx, cancel := withDefaultTimeout(ctx)
	defer cancel()

	var out, errBuf bytes.Buffer
	cmd := exec.CommandContext(cctx, "pdftotext", "-layout", pdfPath, "-")
	cmd.Stdout = &out
	cmd.Stderr = &errBuf
	if err := cmd.Run(); err != nil {
		stderr := strings.TrimSpace(errBuf.String())
		lower := strings.ToLower(stderr)
		if strings.Contains(lower, "incorrect password") || strings.Contains(lower, "encrypted") {
			return "", ErrEncrypted
		}
		return "", fmt.Errorf("ocr: pdftotext: %w (%s)", err, stderr)
	}
	return out.String(), nil
}
// ocrPDFViaImages renders every page of pdfPath to a 200 dpi PNG inside dir
// (via pdftoppm) and runs tesseract over each page image, returning the page
// texts concatenated with a newline after each page.
//
// Page-level OCR is best effort: a page that fails is skipped as long as at
// least one other page succeeds. When no page could be OCRed at all, the first
// error encountered is returned, so a broken tesseract setup or an exhausted
// deadline is reported as a failure instead of as an empty, successful result.
//
// pdftoppm and all tesseract runs share one deadline: ctx's own, or
// DefaultTimeout when ctx has none.
func ocrPDFViaImages(ctx context.Context, dir, pdfPath string, langs []string) (string, error) {
	cctx, cancel := withDefaultTimeout(ctx)
	defer cancel()

	prefix := filepath.Join(dir, "page")
	var errBuf bytes.Buffer
	cmd := exec.CommandContext(cctx, "pdftoppm", "-r", "200", "-png", pdfPath, prefix)
	cmd.Stderr = &errBuf
	if err := cmd.Run(); err != nil {
		return "", fmt.Errorf("ocr: pdftoppm: %w (%s)", err, strings.TrimSpace(errBuf.String()))
	}

	// pdftoppm names its output <prefix>-<page number>.png.
	pages, err := filepath.Glob(prefix + "-*.png")
	if err != nil || len(pages) == 0 {
		return "", fmt.Errorf("ocr: pdftoppm produced no pages")
	}

	var (
		combined strings.Builder
		firstErr error
		okPages  int
	)
	for _, p := range pages {
		if ctxErr := cctx.Err(); ctxErr != nil {
			// Deadline hit or caller cancelled: every further run would fail
			// immediately, so stop here.
			if firstErr == nil {
				firstErr = fmt.Errorf("ocr: tesseract: %w", ctxErr)
			}
			break
		}
		text, err := tesseractFile(cctx, p, langs)
		if err != nil {
			if firstErr == nil {
				firstErr = err
			}
			continue
		}
		okPages++
		combined.WriteString(text)
		combined.WriteString("\n")
	}
	if okPages == 0 {
		// pages is non-empty here, so firstErr is set.
		return "", firstErr
	}
	return combined.String(), nil
}
// ── image extraction ──────────────────────────────────────────────────────
// extractImage OCRs a single image attachment with tesseract.
//
// The bytes are written to a private temp directory (removed on return) under
// the name "img" plus the extension of filename, or ".bin" when filename has
// none. Returns ErrUnavailable when tesseract is not on PATH. The tesseract
// run is bounded by ctx's deadline, or by DefaultTimeout when ctx has none.
func extractImage(ctx context.Context, data []byte, filename string, langs []string) (string, error) {
	if _, lookErr := exec.LookPath("tesseract"); lookErr != nil {
		return "", ErrUnavailable
	}

	workDir, err := os.MkdirTemp("", "archivmail-ocr-*")
	if err != nil {
		return "", fmt.Errorf("ocr: tempdir: %w", err)
	}
	defer os.RemoveAll(workDir)

	suffix := ".bin"
	if e := filepath.Ext(filename); e != "" {
		suffix = e
	}
	imgPath := filepath.Join(workDir, "img"+suffix)
	if err = os.WriteFile(imgPath, data, 0o600); err != nil {
		return "", fmt.Errorf("ocr: write image: %w", err)
	}

	runCtx, stop := withDefaultTimeout(ctx)
	defer stop()
	return tesseractFile(runCtx, imgPath, langs)
}
// tesseractFile runs tesseract on the image at path and returns the text it
// prints to stdout.
//
// The language list is passed with -l unless joinLangs yields an empty string.
// The run always uses page segmentation mode 3 and
// preserve_interword_spaces=1. On failure the error wraps the run error
// together with tesseract's trimmed stderr output. No timeout is added here;
// the run lives as long as ctx does.
func tesseractFile(ctx context.Context, path string, langs []string) (string, error) {
	args := make([]string, 0, 8)
	args = append(args, path, "stdout")
	if spec := joinLangs(langs); spec != "" {
		args = append(args, "-l", spec)
	}
	args = append(args, "--psm", "3", "-c", "preserve_interword_spaces=1")

	var stdout, stderr bytes.Buffer
	cmd := exec.CommandContext(ctx, "tesseract", args...)
	cmd.Stdout, cmd.Stderr = &stdout, &stderr

	runErr := cmd.Run()
	if runErr == nil {
		return stdout.String(), nil
	}
	return "", fmt.Errorf("ocr: tesseract: %w (%s)", runErr, strings.TrimSpace(stderr.String()))
}
// joinLangs builds the value for tesseract's -l option by joining langs with
// '+'. An empty or nil list selects the default "deu+eng".
func joinLangs(langs []string) string {
	spec := "deu+eng"
	if len(langs) > 0 {
		spec = strings.Join(langs, "+")
	}
	return spec
}
// withDefaultTimeout makes sure a tool run has a deadline.
// A ctx that already carries one is returned unchanged, paired with a no-op
// cancel function; otherwise a child context limited to DefaultTimeout is
// returned. Callers can defer the returned cancel function in both cases.
func withDefaultTimeout(ctx context.Context) (context.Context, context.CancelFunc) {
	_, hasDeadline := ctx.Deadline()
	if !hasDeadline {
		return context.WithTimeout(ctx, DefaultTimeout)
	}
	noop := func() {}
	return ctx, noop
}