// Package ocr extracts plain text from email attachments (PDFs and images) // using locally installed tools — pdftotext (poppler-utils) and tesseract. // // PROJ-35: All extraction happens on the host. No external services. package ocr import ( "bytes" "context" "errors" "fmt" "os" "os/exec" "path/filepath" "strings" "time" ) const ( // MaxAttachmentSize skips files larger than this (50 MiB per spec). MaxAttachmentSize = 50 * 1024 * 1024 // DefaultTimeout caps a single OCR run. DefaultTimeout = 60 * time.Second // pdfMinTextLen below this we treat the PDF as a scan and fall back to OCR. pdfMinTextLen = 32 ) // Errors returned by Extract. var ( ErrUnsupported = errors.New("ocr: unsupported attachment type") ErrTooLarge = errors.New("ocr: attachment exceeds size limit") ErrEncrypted = errors.New("ocr: attachment is password protected") ErrUnavailable = errors.New("ocr: tesseract or pdftotext not available") ) // tempDirRoot may be overridden by callers (e.g. when the systemd unit // restricts /tmp). Empty string means "use the OS default" via os.MkdirTemp. var tempDirRoot = "" // SetTempDir overrides the parent directory used for temporary OCR work. // When dir is empty, the OS default ($TMPDIR or /tmp) is used. // Safe to call once at startup; not safe for concurrent reconfiguration. func SetTempDir(dir string) { tempDirRoot = dir } func mkTempDir() (string, error) { if tempDirRoot != "" { _ = os.MkdirAll(tempDirRoot, 0o755) return os.MkdirTemp(tempDirRoot, "archivmail-ocr-*") } return os.MkdirTemp("", "archivmail-ocr-*") } // IsAvailable reports whether at least one OCR tool is on PATH. // pdftotext alone enables text extraction from native PDFs; // tesseract enables OCR for images and scanned PDFs. func IsAvailable() bool { _, errPDF := exec.LookPath("pdftotext") _, errTess := exec.LookPath("tesseract") return errPDF == nil || errTess == nil } // ToolStatus reports which tools are present on the host. type ToolStatus struct { HasPdftotext bool HasTesseract bool HasPdftoppm bool } // CheckTools probes the system for the supporting binaries. func CheckTools() ToolStatus { _, errPDF := exec.LookPath("pdftotext") _, errTess := exec.LookPath("tesseract") _, errPpm := exec.LookPath("pdftoppm") return ToolStatus{ HasPdftotext: errPDF == nil, HasTesseract: errTess == nil, HasPdftoppm: errPpm == nil, } } // ExtractText extracts plain text from a single attachment. // contentType is the MIME type (lowercased), filename is used to derive an // extension when contentType is missing or generic. langs is the Tesseract // language list (e.g. ["deu","eng"]). // // Returns: // - extracted text (may be empty when nothing was found) // - ErrUnsupported when the format cannot be processed // - ErrTooLarge when data exceeds MaxAttachmentSize // - ErrEncrypted when the PDF is password-protected // - other errors from the underlying tool func ExtractText(ctx context.Context, data []byte, contentType, filename string, langs []string) (string, error) { if len(data) == 0 { return "", nil } if len(data) > MaxAttachmentSize { return "", ErrTooLarge } kind := classify(contentType, filename) switch kind { case kindPDF: return extractPDF(ctx, data, langs) case kindImage: return extractImage(ctx, data, filename, langs) default: return "", ErrUnsupported } } // ── classification ──────────────────────────────────────────────────────── type fileKind int const ( kindUnknown fileKind = iota kindPDF kindImage ) func classify(contentType, filename string) fileKind { ct := strings.ToLower(strings.TrimSpace(contentType)) if i := strings.Index(ct, ";"); i >= 0 { ct = strings.TrimSpace(ct[:i]) } switch ct { case "application/pdf", "application/x-pdf": return kindPDF case "image/jpeg", "image/jpg", "image/png", "image/tiff", "image/x-tiff", "image/bmp", "image/x-bmp", "image/webp": return kindImage } ext := strings.ToLower(filepath.Ext(filename)) switch ext { case ".pdf": return kindPDF case ".jpg", ".jpeg", ".png", ".tif", ".tiff", ".bmp", ".webp": return kindImage } return kindUnknown } // ── PDF extraction (pdftotext → tesseract fallback) ─────────────────────── func extractPDF(ctx context.Context, data []byte, langs []string) (string, error) { tmp, err := mkTempDir() if err != nil { return "", fmt.Errorf("ocr: tempdir: %w", err) } defer os.RemoveAll(tmp) pdfPath := filepath.Join(tmp, "in.pdf") if err := os.WriteFile(pdfPath, data, 0o600); err != nil { return "", fmt.Errorf("ocr: write pdf: %w", err) } if _, err := exec.LookPath("pdftotext"); err == nil { text, ptErr := runPdftotext(ctx, pdfPath) if ptErr != nil { if errors.Is(ptErr, ErrEncrypted) { return "", ErrEncrypted } // Fall through to OCR fallback on other errors. } if len(strings.TrimSpace(text)) >= pdfMinTextLen { return text, nil } } if _, err := exec.LookPath("tesseract"); err != nil { return "", ErrUnavailable } if _, err := exec.LookPath("pdftoppm"); err != nil { return "", ErrUnavailable } return ocrPDFViaImages(ctx, tmp, pdfPath, langs) } func runPdftotext(ctx context.Context, pdfPath string) (string, error) { cctx, cancel := withDefaultTimeout(ctx) defer cancel() var out, errBuf bytes.Buffer cmd := exec.CommandContext(cctx, "pdftotext", "-layout", "-q", pdfPath, "-") cmd.Stdout = &out cmd.Stderr = &errBuf if err := cmd.Run(); err != nil { stderr := errBuf.String() if strings.Contains(strings.ToLower(stderr), "incorrect password") || strings.Contains(strings.ToLower(stderr), "encrypted") { return "", ErrEncrypted } return "", fmt.Errorf("ocr: pdftotext: %w (%s)", err, strings.TrimSpace(stderr)) } return out.String(), nil } func ocrPDFViaImages(ctx context.Context, dir, pdfPath string, langs []string) (string, error) { cctx, cancel := withDefaultTimeout(ctx) defer cancel() prefix := filepath.Join(dir, "page") cmd := exec.CommandContext(cctx, "pdftoppm", "-r", "200", "-png", pdfPath, prefix) if err := cmd.Run(); err != nil { return "", fmt.Errorf("ocr: pdftoppm: %w", err) } pages, err := filepath.Glob(prefix + "-*.png") if err != nil || len(pages) == 0 { return "", fmt.Errorf("ocr: pdftoppm produced no pages") } var combined strings.Builder for _, p := range pages { text, err := tesseractFile(cctx, p, langs) if err != nil { continue } combined.WriteString(text) combined.WriteString("\n") } return combined.String(), nil } // ── image extraction ────────────────────────────────────────────────────── func extractImage(ctx context.Context, data []byte, filename string, langs []string) (string, error) { if _, err := exec.LookPath("tesseract"); err != nil { return "", ErrUnavailable } tmp, err := mkTempDir() if err != nil { return "", fmt.Errorf("ocr: tempdir: %w", err) } defer os.RemoveAll(tmp) ext := filepath.Ext(filename) if ext == "" { ext = ".bin" } in := filepath.Join(tmp, "img"+ext) if err := os.WriteFile(in, data, 0o600); err != nil { return "", fmt.Errorf("ocr: write image: %w", err) } cctx, cancel := withDefaultTimeout(ctx) defer cancel() return tesseractFile(cctx, in, langs) } func tesseractFile(ctx context.Context, path string, langs []string) (string, error) { args := []string{path, "stdout"} if l := joinLangs(langs); l != "" { args = append(args, "-l", l) } args = append(args, "--psm", "3", "-c", "preserve_interword_spaces=1") var out, errBuf bytes.Buffer cmd := exec.CommandContext(ctx, "tesseract", args...) cmd.Stdout = &out cmd.Stderr = &errBuf if err := cmd.Run(); err != nil { return "", fmt.Errorf("ocr: tesseract: %w (%s)", err, strings.TrimSpace(errBuf.String())) } return out.String(), nil } func joinLangs(langs []string) string { if len(langs) == 0 { return "deu+eng" } return strings.Join(langs, "+") } func withDefaultTimeout(ctx context.Context) (context.Context, context.CancelFunc) { if _, ok := ctx.Deadline(); ok { return ctx, func() {} } return context.WithTimeout(ctx, DefaultTimeout) }