archivmail/internal/ocr/ocr.go

// Package ocr extracts plain text from email attachments (PDFs and images)
// using locally installed tools — pdftotext (poppler-utils) and tesseract.
//
// PROJ-35: All extraction happens on the host. No external services.
package ocr

import (
	"bytes"
	"context"
	"errors"
	"fmt"
	"os"
	"os/exec"
	"path/filepath"
	"strings"
	"time"
)

const (
	// MaxAttachmentSize is the largest attachment, in bytes, that ExtractText
	// accepts (50 MiB per spec). Larger inputs are rejected with ErrTooLarge.
	MaxAttachmentSize = 50 * 1024 * 1024
	// DefaultTimeout is the deadline applied to external tool runs when the
	// caller's context does not already carry one.
	DefaultTimeout = 60 * time.Second
	// pdfMinTextLen is the minimum length, in bytes after trimming whitespace,
	// that pdftotext output must reach to be accepted. Anything shorter is
	// treated as a scanned PDF and sent through the OCR fallback.
	pdfMinTextLen = 32
)

// Sentinel errors returned by ExtractText. Compare with errors.Is.
var (
	// ErrUnsupported means neither the content type nor the file extension
	// identifies a PDF or a supported image format.
	ErrUnsupported = errors.New("ocr: unsupported attachment type")
	// ErrTooLarge means the attachment is bigger than MaxAttachmentSize.
	ErrTooLarge    = errors.New("ocr: attachment exceeds size limit")
	// ErrEncrypted means pdftotext reported the PDF as encrypted or
	// password protected.
	ErrEncrypted   = errors.New("ocr: attachment is password protected")
	// ErrUnavailable means a binary needed for the requested extraction
	// (tesseract, or pdftoppm for the scanned-PDF path) is not on PATH.
	ErrUnavailable = errors.New("ocr: tesseract or pdftotext not available")
)

// tempDirRoot is the parent directory for temporary OCR work. Callers may
// override it (e.g. when the systemd unit restricts /tmp). The empty string
// means "use the OS default", which os.MkdirTemp resolves on its own.
var tempDirRoot string

// SetTempDir overrides the parent directory used for temporary OCR work.
// When dir is empty, the OS default ($TMPDIR or /tmp) is used.
// Safe to call once at startup; not safe for concurrent reconfiguration.
func SetTempDir(dir string) {
	tempDirRoot = dir
}

// mkTempDir creates a fresh, uniquely named working directory below
// tempDirRoot (or below the OS default temp directory when tempDirRoot is
// empty) and returns its path. The caller is responsible for removing it.
//
// A configured root that does not exist yet is created on demand. If that
// fails, the returned error wraps the underlying failure and names the root.
func mkTempDir() (string, error) {
	if tempDirRoot != "" {
		// Report a broken root here rather than letting os.MkdirTemp fail
		// below with an error about the generated child path, which hides
		// the actual misconfiguration.
		if err := os.MkdirAll(tempDirRoot, 0o755); err != nil {
			return "", fmt.Errorf("ocr: create temp root %q: %w", tempDirRoot, err)
		}
	}
	// os.MkdirTemp treats an empty dir as "use the default temp directory",
	// so one call covers both the configured and the default case.
	return os.MkdirTemp(tempDirRoot, "archivmail-ocr-*")
}

// IsAvailable reports whether at least one extraction tool is on PATH.
// pdftotext on its own is enough for PDFs with an embedded text layer;
// tesseract is what enables OCR for images and scanned PDFs.
func IsAvailable() bool {
	for _, tool := range []string{"pdftotext", "tesseract"} {
		if _, err := exec.LookPath(tool); err == nil {
			return true
		}
	}
	return false
}

// ToolStatus records which of the supporting binaries were found on PATH.
type ToolStatus struct {
	HasPdftotext bool // pdftotext was found
	HasTesseract bool // tesseract was found
	HasPdftoppm  bool // pdftoppm was found
}

// CheckTools looks up each supporting binary on PATH and reports the
// result per tool.
func CheckTools() ToolStatus {
	onPath := func(tool string) bool {
		_, err := exec.LookPath(tool)
		return err == nil
	}

	var status ToolStatus
	status.HasPdftotext = onPath("pdftotext")
	status.HasTesseract = onPath("tesseract")
	status.HasPdftoppm = onPath("pdftoppm")
	return status
}

// ExtractText extracts plain text from a single attachment.
//
// contentType is the attachment's MIME type; it is matched
// case-insensitively and any parameters after ';' are ignored. filename
// supplies the extension used for classification when contentType is not
// one of the recognized PDF or image types. langs is the Tesseract language
// list (e.g. ["deu","eng"]).
//
// Empty data short-circuits to ("", nil) before any classification.
//
// Returns:
//   - the extracted text (may be empty when nothing was found)
//   - ErrTooLarge when data exceeds MaxAttachmentSize
//   - ErrUnsupported when the format cannot be processed
//   - ErrEncrypted when the PDF is password-protected
//   - other errors from the underlying tool
func ExtractText(ctx context.Context, data []byte, contentType, filename string, langs []string) (string, error) {
	switch size := len(data); {
	case size == 0:
		return "", nil
	case size > MaxAttachmentSize:
		return "", ErrTooLarge
	}

	switch classify(contentType, filename) {
	case kindPDF:
		return extractPDF(ctx, data, langs)
	case kindImage:
		return extractImage(ctx, data, filename, langs)
	}
	return "", ErrUnsupported
}

// ── classification ────────────────────────────────────────────────────────

// fileKind is the coarse attachment category that selects the extraction
// path.
type fileKind int

const (
	kindUnknown fileKind = iota // not a type this package can process
	kindPDF                     // PDF document
	kindImage                   // raster image handed to tesseract
)

// classify decides how an attachment should be processed.
//
// The MIME type wins when it is one of the recognized PDF or image types
// (compared case-insensitively, parameters after ';' dropped). Any other
// content type, including an empty one, defers to the lower-cased extension
// of filename. kindUnknown is returned when neither matches.
func classify(contentType, filename string) fileKind {
	mime := strings.ToLower(strings.TrimSpace(contentType))
	if semi := strings.IndexByte(mime, ';'); semi != -1 {
		mime = strings.TrimSpace(mime[:semi])
	}

	if kind := kindFromMIME(mime); kind != kindUnknown {
		return kind
	}
	return kindFromExt(strings.ToLower(filepath.Ext(filename)))
}

// kindFromMIME maps a normalized (lower-case, parameter-free) MIME type to
// its fileKind, or kindUnknown when it is not recognized.
func kindFromMIME(mime string) fileKind {
	switch mime {
	case "application/pdf", "application/x-pdf":
		return kindPDF
	case "image/jpeg", "image/jpg", "image/png", "image/tiff",
		"image/x-tiff", "image/bmp", "image/x-bmp", "image/webp":
		return kindImage
	default:
		return kindUnknown
	}
}

// kindFromExt maps a lower-case file extension (including the leading dot)
// to its fileKind, or kindUnknown when it is not recognized.
func kindFromExt(ext string) fileKind {
	switch ext {
	case ".pdf":
		return kindPDF
	case ".jpg", ".jpeg", ".png", ".tif", ".tiff", ".bmp", ".webp":
		return kindImage
	default:
		return kindUnknown
	}
}

// ── PDF extraction (pdftotext → tesseract fallback) ───────────────────────

// extractPDF extracts text from PDF bytes, preferring the embedded text
// layer and falling back to OCR for scans.
//
// data is written to a private temporary directory that is removed on
// return. When pdftotext is on PATH and yields at least pdfMinTextLen bytes
// of trimmed text, that output is returned unchanged. Otherwise the PDF is
// treated as a scan and rendered and recognized via pdftoppm + tesseract.
//
// A text-layer result that is non-blank but shorter than pdfMinTextLen is
// kept as a fallback: it is returned when the OCR tools are missing, or when
// OCR fails or recognizes nothing, rather than being thrown away.
//
// Returns ErrEncrypted when pdftotext reports a protected document, and
// ErrUnavailable when OCR is required but tesseract or pdftoppm is not
// installed and there is no text-layer output to fall back on.
func extractPDF(ctx context.Context, data []byte, langs []string) (string, error) {
	tmp, err := mkTempDir()
	if err != nil {
		return "", fmt.Errorf("ocr: tempdir: %w", err)
	}
	defer os.RemoveAll(tmp)

	pdfPath := filepath.Join(tmp, "in.pdf")
	if err := os.WriteFile(pdfPath, data, 0o600); err != nil {
		return "", fmt.Errorf("ocr: write pdf: %w", err)
	}

	// layerText holds text-layer output that was too short to be trusted on
	// its own but is still better than returning nothing.
	var layerText string
	if _, err := exec.LookPath("pdftotext"); err == nil {
		text, ptErr := runPdftotext(ctx, pdfPath)
		switch {
		case errors.Is(ptErr, ErrEncrypted):
			return "", ErrEncrypted
		case ptErr != nil:
			// Any other pdftotext failure is non-fatal: try OCR instead.
		default:
			trimmed := strings.TrimSpace(text)
			if len(trimmed) >= pdfMinTextLen {
				return text, nil
			}
			if trimmed != "" {
				layerText = text
			}
		}
	}

	_, tessErr := exec.LookPath("tesseract")
	_, ppmErr := exec.LookPath("pdftoppm")
	if tessErr != nil || ppmErr != nil {
		if layerText != "" {
			return layerText, nil
		}
		return "", ErrUnavailable
	}

	ocrText, ocrErr := ocrPDFViaImages(ctx, tmp, pdfPath, langs)
	if ocrErr == nil && strings.TrimSpace(ocrText) != "" {
		return ocrText, nil
	}
	// OCR failed or found nothing. Prefer the short text layer, unless the
	// caller's own context is done, in which case the error is reported.
	if layerText != "" && ctx.Err() == nil {
		return layerText, nil
	}
	return ocrText, ocrErr
}

// runPdftotext runs `pdftotext -layout` on pdfPath and returns the text it
// writes to stdout.
//
// The run is bounded by the caller's deadline, or by DefaultTimeout when the
// context has none. On a non-zero exit, stderr is inspected: output that
// mentions an incorrect password or encryption yields ErrEncrypted; anything
// else is returned as an error wrapping the exec failure together with the
// tool's stderr.
func runPdftotext(ctx context.Context, pdfPath string) (string, error) {
	cctx, cancel := withDefaultTimeout(ctx)
	defer cancel()

	var out, errBuf bytes.Buffer
	// Deliberately no -q: pdftotext documents that flag as suppressing all
	// messages and errors, which would leave stderr empty and make the
	// encryption check below unable to ever match. stderr is captured in a
	// buffer, so the extra diagnostics do not reach the process's own output.
	cmd := exec.CommandContext(cctx, "pdftotext", "-layout", pdfPath, "-")
	cmd.Stdout = &out
	cmd.Stderr = &errBuf
	if err := cmd.Run(); err != nil {
		stderr := strings.TrimSpace(errBuf.String())
		lower := strings.ToLower(stderr)
		if strings.Contains(lower, "incorrect password") || strings.Contains(lower, "encrypted") {
			return "", ErrEncrypted
		}
		return "", fmt.Errorf("ocr: pdftotext: %w (%s)", err, stderr)
	}
	return out.String(), nil
}

// ocrPDFViaImages renders pdfPath to PNG pages with pdftoppm (200 dpi) inside
// dir and runs tesseract on each page, concatenating the per-page text with
// a trailing newline after every page.
//
// Rendering and recognition share one deadline: the caller's, or
// DefaultTimeout when the context has none.
//
// Individual page failures are tolerated as long as at least one page is
// recognized. If no page succeeds, the first tesseract error is returned so
// that a total failure is not mistaken for an empty document.
func ocrPDFViaImages(ctx context.Context, dir, pdfPath string, langs []string) (string, error) {
	cctx, cancel := withDefaultTimeout(ctx)
	defer cancel()

	prefix := filepath.Join(dir, "page")
	var ppmStderr bytes.Buffer
	cmd := exec.CommandContext(cctx, "pdftoppm", "-r", "200", "-png", pdfPath, prefix)
	cmd.Stderr = &ppmStderr
	if err := cmd.Run(); err != nil {
		return "", fmt.Errorf("ocr: pdftoppm: %w (%s)", err, strings.TrimSpace(ppmStderr.String()))
	}

	pages, err := filepath.Glob(prefix + "-*.png")
	if err != nil || len(pages) == 0 {
		return "", fmt.Errorf("ocr: pdftoppm produced no pages")
	}

	var (
		combined  strings.Builder
		firstErr  error
		succeeded int
	)
	for _, p := range pages {
		text, err := tesseractFile(cctx, p, langs)
		if err != nil {
			if firstErr == nil {
				firstErr = err
			}
			if cctx.Err() != nil {
				// Deadline hit or caller cancelled: every remaining page
				// would fail the same way, so stop spawning processes.
				break
			}
			continue
		}
		succeeded++
		combined.WriteString(text)
		combined.WriteString("\n")
	}
	if succeeded == 0 {
		return "", fmt.Errorf("ocr: none of %d page(s) could be recognized: %w", len(pages), firstErr)
	}
	return combined.String(), nil
}

// ── image extraction ──────────────────────────────────────────────────────

// extractImage runs tesseract over a single image attachment and returns the
// recognized text.
//
// data is written to a private temporary directory that is removed on
// return. The temp file keeps the extension of filename only when that
// extension is short and purely alphanumeric; otherwise ".bin" is used.
// The run is bounded by the caller's deadline, or by DefaultTimeout when the
// context has none.
//
// Returns ErrUnavailable when tesseract is not on PATH.
func extractImage(ctx context.Context, data []byte, filename string, langs []string) (string, error) {
	if _, err := exec.LookPath("tesseract"); err != nil {
		return "", ErrUnavailable
	}
	tmp, err := mkTempDir()
	if err != nil {
		return "", fmt.Errorf("ocr: tempdir: %w", err)
	}
	defer os.RemoveAll(tmp)

	// filename comes from the attachment and is untrusted. An extension
	// containing a NUL byte or other odd characters, or an over-long one,
	// would make the write below fail for an otherwise valid image, so only
	// plain short extensions are carried over.
	const maxExtLen = 16
	ext := filepath.Ext(filename)
	plain := len(ext) >= 2 && len(ext) <= maxExtLen
	for i := 1; plain && i < len(ext); i++ {
		c := ext[i]
		plain = (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9')
	}
	if !plain {
		ext = ".bin"
	}

	in := filepath.Join(tmp, "img"+ext)
	if err := os.WriteFile(in, data, 0o600); err != nil {
		return "", fmt.Errorf("ocr: write image: %w", err)
	}
	cctx, cancel := withDefaultTimeout(ctx)
	defer cancel()
	return tesseractFile(cctx, in, langs)
}

// tesseractFile runs the tesseract CLI on the image at path and returns what
// it prints to stdout.
//
// The language list is passed via -l unless joinLangs yields an empty string.
// The invocation always adds `--psm 3` and `-c preserve_interword_spaces=1`.
// On failure the error wraps the exec error and includes tesseract's trimmed
// stderr. No timeout is added here; ctx is used as given.
func tesseractFile(ctx context.Context, path string, langs []string) (string, error) {
	argv := []string{path, "stdout"}
	langSpec := joinLangs(langs)
	if langSpec != "" {
		argv = append(argv, "-l", langSpec)
	}
	argv = append(argv, "--psm", "3", "-c", "preserve_interword_spaces=1")

	var stdout, stderr bytes.Buffer
	proc := exec.CommandContext(ctx, "tesseract", argv...)
	proc.Stdout, proc.Stderr = &stdout, &stderr

	runErr := proc.Run()
	if runErr == nil {
		return stdout.String(), nil
	}
	return "", fmt.Errorf("ocr: tesseract: %w (%s)", runErr, strings.TrimSpace(stderr.String()))
}

// joinLangs renders langs in the "a+b+c" form expected by tesseract's -l
// flag. A nil or empty list selects the default "deu+eng".
func joinLangs(langs []string) string {
	if len(langs) > 0 {
		return strings.Join(langs, "+")
	}
	return "deu+eng"
}

// withDefaultTimeout makes sure the returned context carries a deadline.
// A context that already has one is returned unchanged together with a
// no-op cancel func; otherwise a child context limited to DefaultTimeout is
// created. Callers should always invoke the returned cancel func.
func withDefaultTimeout(ctx context.Context) (context.Context, context.CancelFunc) {
	_, hasDeadline := ctx.Deadline()
	if !hasDeadline {
		return context.WithTimeout(ctx, DefaultTimeout)
	}
	noop := func() {}
	return ctx, noop
}