6d835aefac
Mit systemd ProtectSystem=strict ist /tmp fuer den Service read-only. ocr.SetTempDir(storage_path/ocr-tmp) nutzt einen RW-Pfad innerhalb der ohnehin freigegebenen ReadWritePaths.
285 lines
8.2 KiB
Go
285 lines
8.2 KiB
Go
// Package ocr extracts plain text from email attachments (PDFs and images)
|
|
// using locally installed tools — pdftotext (poppler-utils) and tesseract.
|
|
//
|
|
// PROJ-35: All extraction happens on the host. No external services.
|
|
package ocr
|
|
|
|
import (
|
|
"bytes"
|
|
"context"
|
|
"errors"
|
|
"fmt"
|
|
"os"
|
|
"os/exec"
|
|
"path/filepath"
|
|
"strings"
|
|
"time"
|
|
)
|
|
|
|
// Limits and thresholds used by the extraction pipeline.
const (
	// MaxAttachmentSize is the largest attachment, in bytes, that
	// ExtractText accepts (50 MiB per spec); bigger inputs are rejected
	// with ErrTooLarge.
	MaxAttachmentSize = 50 << 20

	// DefaultTimeout bounds a single OCR run when the caller's context
	// carries no deadline of its own.
	DefaultTimeout = time.Minute

	// pdfMinTextLen is the minimum length of trimmed pdftotext output for a
	// PDF to count as native text. Shorter output is treated as a scan and
	// the PDF is routed through the OCR fallback.
	pdfMinTextLen = 32
)
|
|
|
|
// Sentinel errors reported by the extraction functions. Compare with
// errors.Is rather than by message.
var (
	// ErrUnsupported means the attachment is neither a PDF nor one of the
	// recognised image formats.
	ErrUnsupported = errors.New("ocr: unsupported attachment type")

	// ErrTooLarge means the attachment is bigger than MaxAttachmentSize.
	ErrTooLarge = errors.New("ocr: attachment exceeds size limit")

	// ErrEncrypted means the PDF could not be read because it is password
	// protected.
	ErrEncrypted = errors.New("ocr: attachment is password protected")

	// ErrUnavailable means a required external tool is not on PATH.
	ErrUnavailable = errors.New("ocr: tesseract or pdftotext not available")
)
|
|
|
|
// tempDirRoot is the parent directory under which per-run OCR scratch
// directories are created. Callers may override it via SetTempDir (e.g. when
// the systemd unit restricts /tmp). The empty string means "use the OS
// default" as chosen by os.MkdirTemp.
var tempDirRoot = ""

// SetTempDir overrides the parent directory used for temporary OCR work.
// When dir is empty, the OS default ($TMPDIR or /tmp) is used.
// Safe to call once at startup; not safe for concurrent reconfiguration.
func SetTempDir(dir string) {
	tempDirRoot = dir
}

// mkTempDir creates a fresh "archivmail-ocr-*" scratch directory and returns
// its path. The caller is responsible for removing it.
//
// When a custom root is configured it is created on demand. A failure to
// create the root is returned to the caller instead of being discarded, so
// the reported error names the actual cause (for example a read-only parent)
// rather than a follow-up MkdirTemp failure.
func mkTempDir() (string, error) {
	if tempDirRoot == "" {
		return os.MkdirTemp("", "archivmail-ocr-*")
	}
	if err := os.MkdirAll(tempDirRoot, 0o755); err != nil {
		return "", fmt.Errorf("create temp root %q: %w", tempDirRoot, err)
	}
	return os.MkdirTemp(tempDirRoot, "archivmail-ocr-*")
}
|
|
|
|
// IsAvailable reports whether at least one extraction tool can be found on
// PATH: pdftotext (text extraction from native PDFs) or tesseract (OCR for
// images and scanned PDFs).
func IsAvailable() bool {
	for _, tool := range [...]string{"pdftotext", "tesseract"} {
		if _, err := exec.LookPath(tool); err == nil {
			return true
		}
	}
	return false
}
|
|
|
|
// ToolStatus records, per supporting binary, whether it was found on PATH.
type ToolStatus struct {
	HasPdftotext bool // pdftotext was found
	HasTesseract bool // tesseract was found
	HasPdftoppm  bool // pdftoppm was found
}

// CheckTools looks up each supporting binary on PATH and returns the result
// as a ToolStatus.
func CheckTools() ToolStatus {
	onPath := func(name string) bool {
		_, err := exec.LookPath(name)
		return err == nil
	}

	var st ToolStatus
	st.HasPdftotext = onPath("pdftotext")
	st.HasTesseract = onPath("tesseract")
	st.HasPdftoppm = onPath("pdftoppm")
	return st
}
|
|
|
|
// ExtractText extracts plain text from a single attachment.
|
|
// contentType is the MIME type (lowercased), filename is used to derive an
|
|
// extension when contentType is missing or generic. langs is the Tesseract
|
|
// language list (e.g. ["deu","eng"]).
|
|
//
|
|
// Returns:
|
|
// - extracted text (may be empty when nothing was found)
|
|
// - ErrUnsupported when the format cannot be processed
|
|
// - ErrTooLarge when data exceeds MaxAttachmentSize
|
|
// - ErrEncrypted when the PDF is password-protected
|
|
// - other errors from the underlying tool
|
|
func ExtractText(ctx context.Context, data []byte, contentType, filename string, langs []string) (string, error) {
|
|
if len(data) == 0 {
|
|
return "", nil
|
|
}
|
|
if len(data) > MaxAttachmentSize {
|
|
return "", ErrTooLarge
|
|
}
|
|
|
|
kind := classify(contentType, filename)
|
|
switch kind {
|
|
case kindPDF:
|
|
return extractPDF(ctx, data, langs)
|
|
case kindImage:
|
|
return extractImage(ctx, data, filename, langs)
|
|
default:
|
|
return "", ErrUnsupported
|
|
}
|
|
}
|
|
|
|
// ── classification ────────────────────────────────────────────────────────
|
|
|
|
// fileKind is the coarse attachment category that selects the extraction
// path.
type fileKind int

const (
	kindUnknown fileKind = iota // not a format this package handles
	kindPDF                     // PDF document
	kindImage                   // raster image handed to tesseract
)

// classify decides how an attachment should be processed.
//
// The MIME type is checked first: parameters after ';' are dropped, and the
// remainder is trimmed and lowercased. If it is not a recognised PDF or image
// type, the lowercased extension of filename is checked instead. Anything
// else is kindUnknown.
func classify(contentType, filename string) fileKind {
	mime := strings.SplitN(contentType, ";", 2)[0]
	mime = strings.ToLower(strings.TrimSpace(mime))

	if mime == "application/pdf" || mime == "application/x-pdf" {
		return kindPDF
	}
	switch mime {
	case "image/jpeg", "image/jpg", "image/png",
		"image/tiff", "image/x-tiff",
		"image/bmp", "image/x-bmp",
		"image/webp":
		return kindImage
	}

	switch strings.ToLower(filepath.Ext(filename)) {
	case ".pdf":
		return kindPDF
	case ".jpg", ".jpeg", ".png", ".tif", ".tiff", ".bmp", ".webp":
		return kindImage
	default:
		return kindUnknown
	}
}
|
|
|
|
// ── PDF extraction (pdftotext → tesseract fallback) ───────────────────────
|
|
|
|
func extractPDF(ctx context.Context, data []byte, langs []string) (string, error) {
|
|
tmp, err := mkTempDir()
|
|
if err != nil {
|
|
return "", fmt.Errorf("ocr: tempdir: %w", err)
|
|
}
|
|
defer os.RemoveAll(tmp)
|
|
|
|
pdfPath := filepath.Join(tmp, "in.pdf")
|
|
if err := os.WriteFile(pdfPath, data, 0o600); err != nil {
|
|
return "", fmt.Errorf("ocr: write pdf: %w", err)
|
|
}
|
|
|
|
if _, err := exec.LookPath("pdftotext"); err == nil {
|
|
text, ptErr := runPdftotext(ctx, pdfPath)
|
|
if ptErr != nil {
|
|
if errors.Is(ptErr, ErrEncrypted) {
|
|
return "", ErrEncrypted
|
|
}
|
|
// Fall through to OCR fallback on other errors.
|
|
}
|
|
if len(strings.TrimSpace(text)) >= pdfMinTextLen {
|
|
return text, nil
|
|
}
|
|
}
|
|
|
|
if _, err := exec.LookPath("tesseract"); err != nil {
|
|
return "", ErrUnavailable
|
|
}
|
|
if _, err := exec.LookPath("pdftoppm"); err != nil {
|
|
return "", ErrUnavailable
|
|
}
|
|
|
|
return ocrPDFViaImages(ctx, tmp, pdfPath, langs)
|
|
}
|
|
|
|
func runPdftotext(ctx context.Context, pdfPath string) (string, error) {
|
|
cctx, cancel := withDefaultTimeout(ctx)
|
|
defer cancel()
|
|
|
|
var out, errBuf bytes.Buffer
|
|
cmd := exec.CommandContext(cctx, "pdftotext", "-layout", "-q", pdfPath, "-")
|
|
cmd.Stdout = &out
|
|
cmd.Stderr = &errBuf
|
|
if err := cmd.Run(); err != nil {
|
|
stderr := errBuf.String()
|
|
if strings.Contains(strings.ToLower(stderr), "incorrect password") ||
|
|
strings.Contains(strings.ToLower(stderr), "encrypted") {
|
|
return "", ErrEncrypted
|
|
}
|
|
return "", fmt.Errorf("ocr: pdftotext: %w (%s)", err, strings.TrimSpace(stderr))
|
|
}
|
|
return out.String(), nil
|
|
}
|
|
|
|
func ocrPDFViaImages(ctx context.Context, dir, pdfPath string, langs []string) (string, error) {
|
|
cctx, cancel := withDefaultTimeout(ctx)
|
|
defer cancel()
|
|
|
|
prefix := filepath.Join(dir, "page")
|
|
cmd := exec.CommandContext(cctx, "pdftoppm", "-r", "200", "-png", pdfPath, prefix)
|
|
if err := cmd.Run(); err != nil {
|
|
return "", fmt.Errorf("ocr: pdftoppm: %w", err)
|
|
}
|
|
|
|
pages, err := filepath.Glob(prefix + "-*.png")
|
|
if err != nil || len(pages) == 0 {
|
|
return "", fmt.Errorf("ocr: pdftoppm produced no pages")
|
|
}
|
|
|
|
var combined strings.Builder
|
|
for _, p := range pages {
|
|
text, err := tesseractFile(cctx, p, langs)
|
|
if err != nil {
|
|
continue
|
|
}
|
|
combined.WriteString(text)
|
|
combined.WriteString("\n")
|
|
}
|
|
return combined.String(), nil
|
|
}
|
|
|
|
// ── image extraction ──────────────────────────────────────────────────────
|
|
|
|
func extractImage(ctx context.Context, data []byte, filename string, langs []string) (string, error) {
|
|
if _, err := exec.LookPath("tesseract"); err != nil {
|
|
return "", ErrUnavailable
|
|
}
|
|
tmp, err := mkTempDir()
|
|
if err != nil {
|
|
return "", fmt.Errorf("ocr: tempdir: %w", err)
|
|
}
|
|
defer os.RemoveAll(tmp)
|
|
|
|
ext := filepath.Ext(filename)
|
|
if ext == "" {
|
|
ext = ".bin"
|
|
}
|
|
in := filepath.Join(tmp, "img"+ext)
|
|
if err := os.WriteFile(in, data, 0o600); err != nil {
|
|
return "", fmt.Errorf("ocr: write image: %w", err)
|
|
}
|
|
cctx, cancel := withDefaultTimeout(ctx)
|
|
defer cancel()
|
|
return tesseractFile(cctx, in, langs)
|
|
}
|
|
|
|
func tesseractFile(ctx context.Context, path string, langs []string) (string, error) {
|
|
args := []string{path, "stdout"}
|
|
if l := joinLangs(langs); l != "" {
|
|
args = append(args, "-l", l)
|
|
}
|
|
args = append(args, "--psm", "3", "-c", "preserve_interword_spaces=1")
|
|
|
|
var out, errBuf bytes.Buffer
|
|
cmd := exec.CommandContext(ctx, "tesseract", args...)
|
|
cmd.Stdout = &out
|
|
cmd.Stderr = &errBuf
|
|
if err := cmd.Run(); err != nil {
|
|
return "", fmt.Errorf("ocr: tesseract: %w (%s)", err, strings.TrimSpace(errBuf.String()))
|
|
}
|
|
return out.String(), nil
|
|
}
|
|
|
|
// joinLangs builds the value for tesseract's -l option by joining the
// language codes with "+".
//
// Entries are trimmed and blank ones are dropped, so input such as
// ["deu", ""] or [" deu ", "eng"] cannot produce a malformed specification
// like "deu+" or " deu +eng". When no usable entry remains (including a nil
// or empty list) the default "deu+eng" is returned.
func joinLangs(langs []string) string {
	cleaned := make([]string, 0, len(langs))
	for _, l := range langs {
		if l = strings.TrimSpace(l); l != "" {
			cleaned = append(cleaned, l)
		}
	}
	if len(cleaned) == 0 {
		return "deu+eng"
	}
	return strings.Join(cleaned, "+")
}
|
|
|
|
func withDefaultTimeout(ctx context.Context) (context.Context, context.CancelFunc) {
|
|
if _, ok := ctx.Deadline(); ok {
|
|
return ctx, func() {}
|
|
}
|
|
return context.WithTimeout(ctx, DefaultTimeout)
|
|
}
|