fix(PROJ-35): OCR Tempdir auf storage_dir umleiten

Mit systemd ProtectSystem=strict ist /tmp für den Service read-only.
ocr.SetTempDir(<storage_path>/ocr-tmp) legt die OCR-Tempdateien deshalb in
einem beschreibbaren Pfad innerhalb der ohnehin freigegebenen ReadWritePaths ab.
This commit is contained in:
sysops
2026-05-08 22:19:20 +02:00
parent a252ad6f0e
commit 6d835aefac
3 changed files with 26 additions and 2 deletions
+2
View File
@@ -75,6 +75,8 @@ func runOCRReprocess(args []string) {
logger.Warn("ocr tools not on PATH — install tesseract-ocr + poppler-utils",
"pdftotext", ts.HasPdftotext, "tesseract", ts.HasTesseract, "pdftoppm", ts.HasPdftoppm)
}
// Keep scratch space inside the storage dir (matches the daemon's setup).
ocr.SetTempDir(cfg.Storage.StorePath + "/ocr-tmp")
ctx := context.Background()
+3
View File
@@ -174,6 +174,9 @@ func main() {
// PROJ-35: OCR-Worker — extracts text from PDF/image attachments and feeds
// it back into the per-tenant Manticore index. Non-blocking submit so the
// mail intake pipeline is never delayed.
// systemd's ProtectSystem=strict restricts /tmp; route OCR scratch space
// into the storage-dir which is guaranteed-RW.
ocr.SetTempDir(cfg.Storage.StorePath + "/ocr-tmp")
ocrWorker := ocr.NewWorker(mailStore, idxMgr, ocr.Options{
Workers: 2,
QueueSize: 1000,
+21 -2
View File
@@ -33,6 +33,25 @@ var (
ErrUnavailable = errors.New("ocr: tesseract or pdftotext not available")
)
// tempDirRoot is the parent directory under which OCR scratch directories
// are created. Its zero value (the empty string) leaves the location to
// os.MkdirTemp, i.e. the OS default. Callers may override it, for example
// when the systemd unit restricts /tmp.
var tempDirRoot string
// SetTempDir selects the parent directory in which temporary OCR working
// directories are created. Passing the empty string restores the OS default
// ($TMPDIR or /tmp).
//
// The value is stored in a plain package variable without synchronization:
// call it once during startup, not concurrently with running OCR work.
func SetTempDir(dir string) {
	tempDirRoot = dir
}
// mkTempDir creates a fresh, uniquely named scratch directory
// ("archivmail-ocr-*") and returns its path.
//
// When tempDirRoot is empty the directory is created in the OS default temp
// location. Otherwise tempDirRoot is created on demand (including missing
// parents) and the scratch directory is placed inside it.
//
// An error is returned if either the root or the scratch directory cannot be
// created; the returned path is empty in that case.
func mkTempDir() (string, error) {
	if tempDirRoot == "" {
		return os.MkdirTemp("", "archivmail-ocr-*")
	}
	// Report a failure to create the root itself instead of discarding it:
	// otherwise the caller only sees a secondary MkdirTemp error on a child
	// path, which hides that the configured root is the actual problem.
	if err := os.MkdirAll(tempDirRoot, 0o755); err != nil {
		return "", fmt.Errorf("create temp root %q: %w", tempDirRoot, err)
	}
	return os.MkdirTemp(tempDirRoot, "archivmail-ocr-*")
}
// IsAvailable reports whether at least one OCR tool is on PATH.
// pdftotext alone enables text extraction from native PDFs;
// tesseract enables OCR for images and scanned PDFs.
@@ -128,7 +147,7 @@ func classify(contentType, filename string) fileKind {
// ── PDF extraction (pdftotext → tesseract fallback) ───────────────────────
func extractPDF(ctx context.Context, data []byte, langs []string) (string, error) {
tmp, err := os.MkdirTemp("", "archivmail-ocr-*")
tmp, err := mkTempDir()
if err != nil {
return "", fmt.Errorf("ocr: tempdir: %w", err)
}
@@ -214,7 +233,7 @@ func extractImage(ctx context.Context, data []byte, filename string, langs []str
if _, err := exec.LookPath("tesseract"); err != nil {
return "", ErrUnavailable
}
tmp, err := os.MkdirTemp("", "archivmail-ocr-*")
tmp, err := mkTempDir()
if err != nil {
return "", fmt.Errorf("ocr: tempdir: %w", err)
}