fix(PROJ-35): OCR Tempdir auf storage_dir umleiten
Mit systemd ProtectSystem=strict ist /tmp fuer den Service read-only. ocr.SetTempDir(storage_path/ocr-tmp) nutzt einen RW-Pfad innerhalb der ohnehin freigegebenen ReadWritePaths.
This commit is contained in:
@@ -75,6 +75,8 @@ func runOCRReprocess(args []string) {
|
||||
logger.Warn("ocr tools not on PATH — install tesseract-ocr + poppler-utils",
|
||||
"pdftotext", ts.HasPdftotext, "tesseract", ts.HasTesseract, "pdftoppm", ts.HasPdftoppm)
|
||||
}
|
||||
// Keep scratch space inside the storage dir (matches the daemon's setup).
|
||||
ocr.SetTempDir(cfg.Storage.StorePath + "/ocr-tmp")
|
||||
|
||||
ctx := context.Background()
|
||||
|
||||
|
||||
@@ -174,6 +174,9 @@ func main() {
|
||||
// PROJ-35: OCR-Worker — extracts text from PDF/image attachments and feeds
|
||||
// it back into the per-tenant Manticore index. Non-blocking submit so the
|
||||
// mail intake pipeline is never delayed.
|
||||
// systemd's ProtectSystem=strict makes /tmp read-only; route OCR scratch space
|
||||
// into the storage-dir which is guaranteed-RW.
|
||||
ocr.SetTempDir(cfg.Storage.StorePath + "/ocr-tmp")
|
||||
ocrWorker := ocr.NewWorker(mailStore, idxMgr, ocr.Options{
|
||||
Workers: 2,
|
||||
QueueSize: 1000,
|
||||
|
||||
+21
-2
@@ -33,6 +33,25 @@ var (
|
||||
ErrUnavailable = errors.New("ocr: tesseract or pdftotext not available")
|
||||
)
|
||||
|
||||
// tempDirRoot is the parent directory under which temporary OCR work
// directories are created. Callers may override it through SetTempDir (e.g.
// when the systemd unit makes /tmp unwritable). The empty string means "use
// the OS default", which os.MkdirTemp resolves on its own.
var tempDirRoot string

// SetTempDir overrides the parent directory used for temporary OCR work.
// When dir is empty, the OS default ($TMPDIR or /tmp) is used.
// Safe to call once at startup; not safe for concurrent reconfiguration.
func SetTempDir(dir string) {
	tempDirRoot = dir
}

// mkTempDir creates a fresh "archivmail-ocr-*" scratch directory and returns
// its path.
//
// With no root configured, the directory is created in the OS default
// temporary location. Otherwise the configured root is created on demand
// (including missing parents) and the scratch directory is placed inside it.
// On failure the returned path is empty and the error names the step that
// failed.
func mkTempDir() (string, error) {
	if tempDirRoot == "" {
		return os.MkdirTemp("", "archivmail-ocr-*")
	}
	// Surface the real cause (permissions, read-only mount, a file in the
	// way) instead of letting os.MkdirTemp fail later with a less specific
	// error about the missing parent.
	if err := os.MkdirAll(tempDirRoot, 0o755); err != nil {
		return "", fmt.Errorf("create temp root %q: %w", tempDirRoot, err)
	}
	return os.MkdirTemp(tempDirRoot, "archivmail-ocr-*")
}
|
||||
|
||||
// IsAvailable reports whether at least one OCR tool is on PATH.
|
||||
// pdftotext alone enables text extraction from native PDFs;
|
||||
// tesseract enables OCR for images and scanned PDFs.
|
||||
@@ -128,7 +147,7 @@ func classify(contentType, filename string) fileKind {
|
||||
// ── PDF extraction (pdftotext → tesseract fallback) ───────────────────────
|
||||
|
||||
func extractPDF(ctx context.Context, data []byte, langs []string) (string, error) {
|
||||
tmp, err := os.MkdirTemp("", "archivmail-ocr-*")
|
||||
tmp, err := mkTempDir()
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("ocr: tempdir: %w", err)
|
||||
}
|
||||
@@ -214,7 +233,7 @@ func extractImage(ctx context.Context, data []byte, filename string, langs []str
|
||||
if _, err := exec.LookPath("tesseract"); err != nil {
|
||||
return "", ErrUnavailable
|
||||
}
|
||||
tmp, err := os.MkdirTemp("", "archivmail-ocr-*")
|
||||
tmp, err := mkTempDir()
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("ocr: tempdir: %w", err)
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user