feat(PROJ-35): OCR & Anhang-Volltext-Indexierung

Asynchrone OCR fuer PDF- und Bild-Anhaenge via tesseract + poppler-utils.
Extrahierter Text wird in Manticore (attachment_text) gespeichert und ist
ueber die normale Volltextsuche auffindbar.

- internal/ocr: ExtractText + Worker (queue + drain)
- internal/storage/ocr.go: SetOCRStatus, OCREnabled, GetMailsByOCRStatus
- emails.ocr_status (pending|done|failed|skipped|disabled)
- tenants.ocr_enabled (Default TRUE, opt-out)
- Manticore: attachment_text-Feld + UpdateAttachmentText
- Boot-resume: pending Jobs nach Restart automatisch in die Queue
- CLI: archivmail ocr-reprocess --tenant N --status pending|failed|all
- update.sh: tesseract-ocr + poppler-utils optional installieren
This commit is contained in:
sysops
2026-05-08 22:11:17 +02:00
parent 2a91f6e249
commit 0bda21033e
11 changed files with 926 additions and 25 deletions
+1
View File
@@ -286,6 +286,7 @@ Commands:
reindex Index neu aufbauen (alle oder pro Mandant)
recompress Bestehende Mails nachträglich gzip-komprimieren
rethread Thread-IDs rückwirkend aus In-Reply-To/References befüllen
ocr-reprocess OCR für Anhänge nachholen (alle oder pro Mandant/Status)
version Version anzeigen
help Diese Hilfe anzeigen
+131
View File
@@ -0,0 +1,131 @@
package main
import (
"context"
"flag"
"log/slog"
"os"
"time"
"archivmail/config"
"archivmail/internal/index"
"archivmail/internal/ocr"
"archivmail/internal/storage"
)
// runOCRReprocess is the entry point of the "ocr-reprocess" subcommand. It
// re-runs OCR for mails selected by ocr_status: matching mails are loaded from
// the DB, queued on a dedicated OCR worker, and the command then blocks until
// that worker has been stopped.
//
// The actual work lives in doOCRReprocess. os.Exit does not run deferred
// functions, so the process is only terminated here, after the helper has
// returned and its deferred cleanups (store and index handles) have run.
//
// Usage:
//
//	archivmail ocr-reprocess --config /etc/archivmail/config.yml
//	archivmail ocr-reprocess --tenant 1 --status failed
//	archivmail ocr-reprocess --status all --limit 1000
func runOCRReprocess(args []string) {
	if code := doOCRReprocess(args); code != 0 {
		os.Exit(code)
	}
}

// doOCRReprocess implements the ocr-reprocess subcommand and returns the
// process exit code (0 on success, 1 on any setup or query failure). Errors
// are logged where they occur; the caller only has to exit with the code.
func doOCRReprocess(args []string) int {
	fs := flag.NewFlagSet("ocr-reprocess", flag.ExitOnError)
	configPath := fs.String("config", "/etc/archivmail/config.yml", "path to config file")
	tenantIDFlag := fs.Int64("tenant", 0, "tenant ID (0 = all tenants)")
	statusFlag := fs.String("status", "pending", "ocr_status filter: pending|done|failed|skipped|disabled|all")
	limitFlag := fs.Int("limit", 0, "max number of mails to process (0 = no limit)")
	// With flag.ExitOnError, Parse terminates the process itself on invalid
	// input, so there is no error left to handle here. No defers are
	// registered yet at this point, so nothing is skipped by that exit.
	_ = fs.Parse(args)

	logger := slog.New(slog.NewTextHandler(os.Stdout, &slog.HandlerOptions{Level: slog.LevelInfo}))

	cfg, err := config.Load(*configPath)
	if err != nil {
		logger.Error("failed to load config", "err", err)
		return 1
	}

	storeCfg := storage.Config{
		Dir:             cfg.Storage.StorePath,
		Keyfile:         cfg.Storage.Keyfile,
		DSN:             cfg.Database.DSN(),
		CompressEnabled: cfg.Storage.Compress,
	}
	mailStore, err := storage.New(storeCfg)
	if err != nil {
		logger.Error("storage init failed", "err", err)
		return 1
	}
	defer mailStore.Close()

	// An empty backend setting falls back to manticore; any other backend is
	// rejected.
	indexBackend := cfg.Index.Backend
	if indexBackend == "" {
		indexBackend = "manticore"
	}
	if indexBackend != "manticore" {
		logger.Error("ocr-reprocess requires the manticore backend", "configured", indexBackend)
		return 1
	}
	dsn := cfg.Index.ManticoreDSN
	if dsn == "" {
		dsn = "manticore@tcp(127.0.0.1:9306)/"
	}
	idxMgr, err := index.NewManticoreTenantManager(dsn)
	if err != nil {
		logger.Error("manticore init failed", "err", err)
		return 1
	}
	defer idxMgr.Close()

	// Missing tools are only reported; the run continues regardless.
	if !ocr.IsAvailable() {
		ts := ocr.CheckTools()
		logger.Warn("ocr tools not on PATH — install tesseract-ocr + poppler-utils",
			"pdftotext", ts.HasPdftotext, "tesseract", ts.HasTesseract, "pdftoppm", ts.HasPdftoppm)
	}

	ctx := context.Background()

	// A nil tenant pointer means "all tenants"; only a positive --tenant
	// value narrows the selection.
	var tenantPtr *int64
	if *tenantIDFlag > 0 {
		t := *tenantIDFlag
		tenantPtr = &t
	}

	mails, err := mailStore.GetMailsByOCRStatus(ctx, *statusFlag, tenantPtr, *limitFlag)
	if err != nil {
		logger.Error("failed to list mails", "err", err)
		return 1
	}
	logger.Info("ocr-reprocess: starting",
		"status", *statusFlag, "tenant", *tenantIDFlag, "count", len(mails))
	if len(mails) == 0 {
		return 0
	}

	// Queue size needs to fit the entire batch so Submit never drops.
	qSize := len(mails) + 16
	worker := ocr.NewWorker(mailStore, idxMgr, ocr.Options{
		Workers:   2,
		QueueSize: qSize,
		Logger:    logger,
	})
	worker.Start(ctx)

	// Reset status to 'pending' before submission so the worker actually picks
	// them up — for callers that want to reprocess 'failed'/'skipped' mails.
	// NOTE(review): with --status all no reset happens, so mails that are not
	// already 'pending' depend on the worker processing them regardless of
	// their stored status — confirm against the worker implementation.
	if *statusFlag != "pending" && *statusFlag != "all" {
		for _, m := range mails {
			if err := mailStore.SetOCRStatus(ctx, m.ID, "pending"); err != nil {
				// Best effort: report the failure and still submit the mail
				// below, instead of dropping the error silently.
				logger.Warn("ocr-reprocess: status reset failed", "id", m.ID, "err", err)
			}
		}
	}

	for _, m := range mails {
		worker.Submit(m.ID, m.TenantID)
	}

	// Periodic progress while waiting for the worker. Ticker.Stop does not
	// close the ticker channel, so the reporter is ended explicitly through
	// done and awaited through reporterDone; this keeps it from outliving the
	// run or logging after the final "complete" line.
	done := make(chan struct{})
	reporterDone := make(chan struct{})
	go func() {
		defer close(reporterDone)
		tick := time.NewTicker(5 * time.Second)
		defer tick.Stop()
		for {
			select {
			case <-tick.C:
				logger.Info("ocr-reprocess: progress", "queue_remaining", worker.QueueLen())
			case <-done:
				return
			}
		}
	}()

	worker.Stop() // waits for all in-flight jobs
	close(done)
	<-reporterDone

	logger.Info("ocr-reprocess: complete", "submitted", len(mails))
	return 0
}
+51 -5
View File
@@ -29,6 +29,7 @@ import (
"archivmail/internal/index"
ldapcfg "archivmail/internal/ldapconfig"
"archivmail/internal/mailer"
"archivmail/internal/ocr"
pop3store "archivmail/internal/pop3"
"archivmail/internal/smtpoutconfig"
"archivmail/internal/smtpd"
@@ -63,6 +64,9 @@ func main() {
case "rethread":
runRethread(os.Args[2:])
return
case "ocr-reprocess":
runOCRReprocess(os.Args[2:])
return
case "version":
fmt.Printf("archivmail %s\n", AppVersion)
for mod, ver := range Modules {
@@ -167,6 +171,39 @@ func main() {
tenantWorker.Start()
defer tenantWorker.Stop()
// PROJ-35: OCR-Worker — extracts text from PDF/image attachments and feeds
// it back into the per-tenant Manticore index. Non-blocking submit so the
// mail intake pipeline is never delayed.
ocrWorker := ocr.NewWorker(mailStore, idxMgr, ocr.Options{
Workers: 2,
QueueSize: 1000,
Logger: logger,
})
ocrWorker.Start(context.Background())
defer ocrWorker.Stop()
if !ocr.IsAvailable() {
ts := ocr.CheckTools()
logger.Warn("ocr tools not fully available — install tesseract-ocr + poppler-utils for full OCR support",
"pdftotext", ts.HasPdftotext, "tesseract", ts.HasTesseract, "pdftoppm", ts.HasPdftoppm)
}
// Boot-resume: re-enqueue all mails still marked ocr_status='pending'.
go func() {
ctx := context.Background()
pending, err := mailStore.GetPendingOCRMails(ctx, nil, 5000)
if err != nil {
logger.Warn("ocr boot-resume: query failed", "err", err)
return
}
if len(pending) == 0 {
return
}
logger.Info("ocr boot-resume: re-enqueueing pending jobs", "count", len(pending))
for _, m := range pending {
ocrWorker.Submit(m.ID, m.TenantID)
}
}()
// User store
users, err := userstore.New(cfg.Database.DSN())
if err != nil {
@@ -299,7 +336,7 @@ func main() {
smtpDaemon.SetIndexCallback(func(raw []byte, id string) {
// Look up the tenant_id for this email from DB metadata.
tenantID, _ := mailStore.GetTenantForMail(context.Background(), id)
submitToWorker(tenantWorker, mailStore, raw, id, tenantID, logger)
submitToWorker(tenantWorker, mailStore, raw, id, tenantID, logger, ocrWorker)
})
// Wire tenant routing into SMTP daemon
if cfg.SMTP.TenantRouting == "domain" {
@@ -363,7 +400,7 @@ func main() {
srv.SetPop3(pop3St, pop3Imp)
// Backfill in background: migrate existing files into DB metadata + re-index
go runBackfill(context.Background(), mailStore, idx, tenantWorker, logger)
go runBackfill(context.Background(), mailStore, idx, tenantWorker, logger, ocrWorker)
// Background integrity verification — runs every 5 minutes
go runIntegrityCheck(context.Background(), mailStore, logger)
@@ -389,7 +426,9 @@ func main() {
// submitToWorker parses a raw email and submits it to the async index worker.
// tenantID may be nil for global context.
func submitToWorker(worker *index.TenantIndexWorker, store *storage.Store, raw []byte, id string, tenantID *int64, logger *slog.Logger) {
// If ocrWorker is non-nil and the mail has attachments, an OCR job is also
// queued (non-blocking).
func submitToWorker(worker *index.TenantIndexWorker, store *storage.Store, raw []byte, id string, tenantID *int64, logger *slog.Logger, ocrWorker *ocr.Worker) {
pm, err := mailparser.Parse(raw)
if err != nil {
logger.Warn("index: parse failed, skipping indexing", "id", id, "err", err)
@@ -422,12 +461,19 @@ func submitToWorker(worker *index.TenantIndexWorker, store *storage.Store, raw [
if err := store.SetIndexedAt(context.Background(), id); err != nil {
logger.Warn("index: set indexed_at failed", "id", id, "err", err)
}
// PROJ-35: hand off to OCR worker for asynchronous attachment processing.
if ocrWorker != nil && len(pm.Attachments) > 0 {
ocrWorker.Submit(id, tenantID)
}
}
// runBackfill walks the store, inserts missing DB metadata, and indexes
// emails that have not yet been indexed. Per-tenant indexing is handled by
// looking up each email's tenant_id from the DB.
func runBackfill(ctx context.Context, store *storage.Store, idx index.Indexer, worker *index.TenantIndexWorker, logger *slog.Logger) {
// ocrWorker is optional; when non-nil, mails with attachments are also
// queued for OCR processing.
func runBackfill(ctx context.Context, store *storage.Store, idx index.Indexer, worker *index.TenantIndexWorker, logger *slog.Logger, ocrWorker *ocr.Worker) {
logger.Info("backfill: starting")
count := 0
@@ -465,7 +511,7 @@ func runBackfill(ctx context.Context, store *storage.Store, idx index.Indexer, w
if !alreadyIndexed {
needIndex++
tenantID, _ := store.GetTenantForMail(ctx, id)
submitToWorker(worker, store, raw, id, tenantID, logger)
submitToWorker(worker, store, raw, id, tenantID, logger, ocrWorker)
}
if count%100 == 0 {