fix(PROJ-35): OCR Tempdir auf storage_dir umleiten
Mit systemd ProtectSystem=strict ist /tmp fuer den Service read-only. ocr.SetTempDir(storage_path/ocr-tmp) nutzt einen RW-Pfad innerhalb der ohnehin freigegebenen ReadWritePaths.
This commit is contained in:
@@ -75,6 +75,8 @@ func runOCRReprocess(args []string) {
|
|||||||
logger.Warn("ocr tools not on PATH — install tesseract-ocr + poppler-utils",
|
logger.Warn("ocr tools not on PATH — install tesseract-ocr + poppler-utils",
|
||||||
"pdftotext", ts.HasPdftotext, "tesseract", ts.HasTesseract, "pdftoppm", ts.HasPdftoppm)
|
"pdftotext", ts.HasPdftotext, "tesseract", ts.HasTesseract, "pdftoppm", ts.HasPdftoppm)
|
||||||
}
|
}
|
||||||
|
// Keep scratch space inside the storage dir (matches the daemon's setup).
|
||||||
|
ocr.SetTempDir(cfg.Storage.StorePath + "/ocr-tmp")
|
||||||
|
|
||||||
ctx := context.Background()
|
ctx := context.Background()
|
||||||
|
|
||||||
|
|||||||
@@ -174,6 +174,9 @@ func main() {
|
|||||||
// PROJ-35: OCR-Worker — extracts text from PDF/image attachments and feeds
|
// PROJ-35: OCR-Worker — extracts text from PDF/image attachments and feeds
|
||||||
// it back into the per-tenant Manticore index. Non-blocking submit so the
|
// it back into the per-tenant Manticore index. Non-blocking submit so the
|
||||||
// mail intake pipeline is never delayed.
|
// mail intake pipeline is never delayed.
|
||||||
|
// systemd's ProtectSystem=strict restricts /tmp; route OCR scratch space
|
||||||
|
// into the storage dir, which is guaranteed to be read-write.
|
||||||
|
ocr.SetTempDir(cfg.Storage.StorePath + "/ocr-tmp")
|
||||||
ocrWorker := ocr.NewWorker(mailStore, idxMgr, ocr.Options{
|
ocrWorker := ocr.NewWorker(mailStore, idxMgr, ocr.Options{
|
||||||
Workers: 2,
|
Workers: 2,
|
||||||
QueueSize: 1000,
|
QueueSize: 1000,
|
||||||
|
|||||||
+21
-2
@@ -33,6 +33,25 @@ var (
|
|||||||
ErrUnavailable = errors.New("ocr: tesseract or pdftotext not available")
|
ErrUnavailable = errors.New("ocr: tesseract or pdftotext not available")
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// tempDirRoot is the parent directory under which OCR scratch directories are
// created. Callers may override it through SetTempDir (e.g. when the systemd
// unit restricts /tmp). The empty string means "use the OS default", i.e.
// whatever os.MkdirTemp picks when it is given an empty dir argument.
var tempDirRoot string

// SetTempDir overrides the parent directory used for temporary OCR work.
// When dir is empty, the OS default ($TMPDIR or /tmp) is used.
// Safe to call once at startup; not safe for concurrent reconfiguration,
// because the underlying package variable is written without synchronization.
func SetTempDir(dir string) {
	tempDirRoot = dir
}

// mkTempDir creates a fresh "archivmail-ocr-*" scratch directory and returns
// its path. The directory is placed below tempDirRoot when one has been
// configured, otherwise in the OS default temp location.
//
// The configured root is created on demand. If it cannot be created, that
// error is returned (wrapped, with the root path) instead of being dropped,
// so the caller sees the real cause rather than a follow-up failure from
// os.MkdirTemp on a missing parent directory.
func mkTempDir() (string, error) {
	if tempDirRoot == "" {
		return os.MkdirTemp("", "archivmail-ocr-*")
	}
	// The root may not exist yet on first use; MkdirAll is a no-op when it
	// already does.
	if err := os.MkdirAll(tempDirRoot, 0o755); err != nil {
		return "", fmt.Errorf("create temp root %q: %w", tempDirRoot, err)
	}
	return os.MkdirTemp(tempDirRoot, "archivmail-ocr-*")
}
|
||||||
|
|
||||||
// IsAvailable reports whether at least one OCR tool is on PATH.
|
// IsAvailable reports whether at least one OCR tool is on PATH.
|
||||||
// pdftotext alone enables text extraction from native PDFs;
|
// pdftotext alone enables text extraction from native PDFs;
|
||||||
// tesseract enables OCR for images and scanned PDFs.
|
// tesseract enables OCR for images and scanned PDFs.
|
||||||
@@ -128,7 +147,7 @@ func classify(contentType, filename string) fileKind {
|
|||||||
// ── PDF extraction (pdftotext → tesseract fallback) ───────────────────────
|
// ── PDF extraction (pdftotext → tesseract fallback) ───────────────────────
|
||||||
|
|
||||||
func extractPDF(ctx context.Context, data []byte, langs []string) (string, error) {
|
func extractPDF(ctx context.Context, data []byte, langs []string) (string, error) {
|
||||||
tmp, err := os.MkdirTemp("", "archivmail-ocr-*")
|
tmp, err := mkTempDir()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return "", fmt.Errorf("ocr: tempdir: %w", err)
|
return "", fmt.Errorf("ocr: tempdir: %w", err)
|
||||||
}
|
}
|
||||||
@@ -214,7 +233,7 @@ func extractImage(ctx context.Context, data []byte, filename string, langs []str
|
|||||||
if _, err := exec.LookPath("tesseract"); err != nil {
|
if _, err := exec.LookPath("tesseract"); err != nil {
|
||||||
return "", ErrUnavailable
|
return "", ErrUnavailable
|
||||||
}
|
}
|
||||||
tmp, err := os.MkdirTemp("", "archivmail-ocr-*")
|
tmp, err := mkTempDir()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return "", fmt.Errorf("ocr: tempdir: %w", err)
|
return "", fmt.Errorf("ocr: tempdir: %w", err)
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user