From 6d835aefac4c954e77ea137809ada213562eac11 Mon Sep 17 00:00:00 2001 From: sysops Date: Fri, 8 May 2026 22:19:20 +0200 Subject: [PATCH] fix(PROJ-35): OCR Tempdir auf storage_dir umleiten Mit systemd ProtectSystem=strict ist /tmp fuer den Service read-only. ocr.SetTempDir(storage_path/ocr-tmp) nutzt einen RW-Pfad innerhalb der ohnehin freigegebenen ReadWritePaths. --- cmd/archivmail/cmd_ocr_reprocess.go | 2 ++ cmd/archivmail/main.go | 3 +++ internal/ocr/ocr.go | 23 +++++++++++++++++++++-- 3 files changed, 26 insertions(+), 2 deletions(-) diff --git a/cmd/archivmail/cmd_ocr_reprocess.go b/cmd/archivmail/cmd_ocr_reprocess.go index 4e0d72f..f0c102b 100644 --- a/cmd/archivmail/cmd_ocr_reprocess.go +++ b/cmd/archivmail/cmd_ocr_reprocess.go @@ -75,6 +75,8 @@ func runOCRReprocess(args []string) { logger.Warn("ocr tools not on PATH — install tesseract-ocr + poppler-utils", "pdftotext", ts.HasPdftotext, "tesseract", ts.HasTesseract, "pdftoppm", ts.HasPdftoppm) } + // Keep scratch space inside the storage dir (matches the daemon's setup). + ocr.SetTempDir(cfg.Storage.StorePath + "/ocr-tmp") ctx := context.Background() diff --git a/cmd/archivmail/main.go b/cmd/archivmail/main.go index fd7f9d9..203e260 100644 --- a/cmd/archivmail/main.go +++ b/cmd/archivmail/main.go @@ -174,6 +174,9 @@ func main() { // PROJ-35: OCR-Worker — extracts text from PDF/image attachments and feeds // it back into the per-tenant Manticore index. Non-blocking submit so the // mail intake pipeline is never delayed. + // systemd's ProtectSystem=strict restricts /tmp; route OCR scratch space + // into the storage-dir which is guaranteed-RW. + ocr.SetTempDir(cfg.Storage.StorePath + "/ocr-tmp") ocrWorker := ocr.NewWorker(mailStore, idxMgr, ocr.Options{ Workers: 2, QueueSize: 1000, diff --git a/internal/ocr/ocr.go b/internal/ocr/ocr.go index 047ef53..81a085a 100644 --- a/internal/ocr/ocr.go +++ b/internal/ocr/ocr.go @@ -33,6 +33,25 @@ var ( ErrUnavailable = errors.New("ocr: tesseract or pdftotext not available") ) +// tempDirRoot may be overridden by callers (e.g. when the systemd unit +// restricts /tmp). Empty string means "use the OS default" via os.MkdirTemp. +var tempDirRoot = "" + +// SetTempDir overrides the parent directory used for temporary OCR work. +// When dir is empty, the OS default ($TMPDIR or /tmp) is used. +// Safe to call once at startup; not safe for concurrent reconfiguration. +func SetTempDir(dir string) { + tempDirRoot = dir +} + +func mkTempDir() (string, error) { + if tempDirRoot != "" { + _ = os.MkdirAll(tempDirRoot, 0o755) + return os.MkdirTemp(tempDirRoot, "archivmail-ocr-*") + } + return os.MkdirTemp("", "archivmail-ocr-*") +} + // IsAvailable reports whether at least one OCR tool is on PATH. // pdftotext alone enables text extraction from native PDFs; // tesseract enables OCR for images and scanned PDFs. @@ -128,7 +147,7 @@ func classify(contentType, filename string) fileKind { // ── PDF extraction (pdftotext → tesseract fallback) ─────────────────────── func extractPDF(ctx context.Context, data []byte, langs []string) (string, error) { - tmp, err := os.MkdirTemp("", "archivmail-ocr-*") + tmp, err := mkTempDir() if err != nil { return "", fmt.Errorf("ocr: tempdir: %w", err) } @@ -214,7 +233,7 @@ func extractImage(ctx context.Context, data []byte, filename string, langs []str if _, err := exec.LookPath("tesseract"); err != nil { return "", ErrUnavailable } - tmp, err := os.MkdirTemp("", "archivmail-ocr-*") + tmp, err := mkTempDir() if err != nil { return "", fmt.Errorf("ocr: tempdir: %w", err) }