feat(PROJ-35): OCR & Anhang-Volltext-Indexierung
Asynchrone OCR fuer PDF- und Bild-Anhaenge via tesseract + poppler-utils. Extrahierter Text wird in Manticore (attachment_text) gespeichert und ist ueber die normale Volltextsuche auffindbar.

- internal/ocr: ExtractText + Worker (queue + drain)
- internal/storage/ocr.go: SetOCRStatus, OCREnabled, GetMailsByOCRStatus
- emails.ocr_status (pending|done|failed|skipped|disabled)
- tenants.ocr_enabled (Default TRUE, opt-out)
- Manticore: attachment_text-Feld + UpdateAttachmentText
- Boot-resume: pending Jobs nach Restart automatisch in die Queue
- CLI: archivmail ocr-reprocess --tenant N --status pending|failed|all
- update.sh: tesseract-ocr + poppler-utils optional installieren
This commit is contained in:
@@ -286,6 +286,7 @@ Commands:
|
||||
reindex Index neu aufbauen (alle oder pro Mandant)
|
||||
recompress Bestehende Mails nachträglich gzip-komprimieren
|
||||
rethread Thread-IDs rückwirkend aus In-Reply-To/References befüllen
|
||||
ocr-reprocess OCR für Anhänge nachholen (alle oder pro Mandant/Status)
|
||||
version Version anzeigen
|
||||
help Diese Hilfe anzeigen
|
||||
|
||||
|
||||
@@ -0,0 +1,131 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"flag"
|
||||
"log/slog"
|
||||
"os"
|
||||
"time"
|
||||
|
||||
"archivmail/config"
|
||||
"archivmail/internal/index"
|
||||
"archivmail/internal/ocr"
|
||||
"archivmail/internal/storage"
|
||||
)
|
||||
|
||||
// runOCRReprocess re-runs OCR for selected mails. It loads matching IDs from
|
||||
// the DB by ocr_status, queues them on the OCR worker, then waits for the
|
||||
// worker to drain.
|
||||
//
|
||||
// Usage:
|
||||
//
|
||||
// archivmail ocr-reprocess --config /etc/archivmail/config.yml
|
||||
// archivmail ocr-reprocess --tenant 1 --status failed
|
||||
// archivmail ocr-reprocess --status all --limit 1000
|
||||
func runOCRReprocess(args []string) {
|
||||
fs := flag.NewFlagSet("ocr-reprocess", flag.ExitOnError)
|
||||
configPath := fs.String("config", "/etc/archivmail/config.yml", "path to config file")
|
||||
tenantIDFlag := fs.Int64("tenant", 0, "tenant ID (0 = all tenants)")
|
||||
statusFlag := fs.String("status", "pending", "ocr_status filter: pending|done|failed|skipped|disabled|all")
|
||||
limitFlag := fs.Int("limit", 0, "max number of mails to process (0 = no limit)")
|
||||
fs.Parse(args)
|
||||
|
||||
logger := slog.New(slog.NewTextHandler(os.Stdout, &slog.HandlerOptions{Level: slog.LevelInfo}))
|
||||
|
||||
cfg, err := config.Load(*configPath)
|
||||
if err != nil {
|
||||
logger.Error("failed to load config", "err", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
storeCfg := storage.Config{
|
||||
Dir: cfg.Storage.StorePath,
|
||||
Keyfile: cfg.Storage.Keyfile,
|
||||
DSN: cfg.Database.DSN(),
|
||||
CompressEnabled: cfg.Storage.Compress,
|
||||
}
|
||||
mailStore, err := storage.New(storeCfg)
|
||||
if err != nil {
|
||||
logger.Error("storage init failed", "err", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
defer mailStore.Close()
|
||||
|
||||
indexBackend := cfg.Index.Backend
|
||||
if indexBackend == "" {
|
||||
indexBackend = "manticore"
|
||||
}
|
||||
if indexBackend != "manticore" {
|
||||
logger.Error("ocr-reprocess requires the manticore backend", "configured", indexBackend)
|
||||
os.Exit(1)
|
||||
}
|
||||
dsn := cfg.Index.ManticoreDSN
|
||||
if dsn == "" {
|
||||
dsn = "manticore@tcp(127.0.0.1:9306)/"
|
||||
}
|
||||
idxMgr, err := index.NewManticoreTenantManager(dsn)
|
||||
if err != nil {
|
||||
logger.Error("manticore init failed", "err", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
defer idxMgr.Close()
|
||||
|
||||
if !ocr.IsAvailable() {
|
||||
ts := ocr.CheckTools()
|
||||
logger.Warn("ocr tools not on PATH — install tesseract-ocr + poppler-utils",
|
||||
"pdftotext", ts.HasPdftotext, "tesseract", ts.HasTesseract, "pdftoppm", ts.HasPdftoppm)
|
||||
}
|
||||
|
||||
ctx := context.Background()
|
||||
|
||||
var tenantPtr *int64
|
||||
if *tenantIDFlag > 0 {
|
||||
t := *tenantIDFlag
|
||||
tenantPtr = &t
|
||||
}
|
||||
|
||||
mails, err := mailStore.GetMailsByOCRStatus(ctx, *statusFlag, tenantPtr, *limitFlag)
|
||||
if err != nil {
|
||||
logger.Error("failed to list mails", "err", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
logger.Info("ocr-reprocess: starting",
|
||||
"status", *statusFlag, "tenant", *tenantIDFlag, "count", len(mails))
|
||||
if len(mails) == 0 {
|
||||
return
|
||||
}
|
||||
|
||||
// Queue size needs to fit the entire batch so Submit never drops.
|
||||
qSize := len(mails) + 16
|
||||
worker := ocr.NewWorker(mailStore, idxMgr, ocr.Options{
|
||||
Workers: 2,
|
||||
QueueSize: qSize,
|
||||
Logger: logger,
|
||||
})
|
||||
worker.Start(ctx)
|
||||
|
||||
// Reset status to 'pending' before submission so the worker actually picks
|
||||
// them up — for callers that want to reprocess 'failed'/'skipped' mails.
|
||||
if *statusFlag != "pending" && *statusFlag != "all" {
|
||||
for _, m := range mails {
|
||||
_ = mailStore.SetOCRStatus(ctx, m.ID, "pending")
|
||||
}
|
||||
}
|
||||
|
||||
for _, m := range mails {
|
||||
worker.Submit(m.ID, m.TenantID)
|
||||
}
|
||||
|
||||
// Periodic progress while waiting for the queue to drain.
|
||||
tick := time.NewTicker(5 * time.Second)
|
||||
defer tick.Stop()
|
||||
go func() {
|
||||
for range tick.C {
|
||||
logger.Info("ocr-reprocess: progress", "queue_remaining", worker.QueueLen())
|
||||
}
|
||||
}()
|
||||
|
||||
worker.Stop() // waits for all in-flight jobs
|
||||
logger.Info("ocr-reprocess: complete", "submitted", len(mails))
|
||||
}
|
||||
+51
-5
@@ -29,6 +29,7 @@ import (
|
||||
"archivmail/internal/index"
|
||||
ldapcfg "archivmail/internal/ldapconfig"
|
||||
"archivmail/internal/mailer"
|
||||
"archivmail/internal/ocr"
|
||||
pop3store "archivmail/internal/pop3"
|
||||
"archivmail/internal/smtpoutconfig"
|
||||
"archivmail/internal/smtpd"
|
||||
@@ -63,6 +64,9 @@ func main() {
|
||||
case "rethread":
|
||||
runRethread(os.Args[2:])
|
||||
return
|
||||
case "ocr-reprocess":
|
||||
runOCRReprocess(os.Args[2:])
|
||||
return
|
||||
case "version":
|
||||
fmt.Printf("archivmail %s\n", AppVersion)
|
||||
for mod, ver := range Modules {
|
||||
@@ -167,6 +171,39 @@ func main() {
|
||||
tenantWorker.Start()
|
||||
defer tenantWorker.Stop()
|
||||
|
||||
// PROJ-35: OCR-Worker — extracts text from PDF/image attachments and feeds
|
||||
// it back into the per-tenant Manticore index. Non-blocking submit so the
|
||||
// mail intake pipeline is never delayed.
|
||||
ocrWorker := ocr.NewWorker(mailStore, idxMgr, ocr.Options{
|
||||
Workers: 2,
|
||||
QueueSize: 1000,
|
||||
Logger: logger,
|
||||
})
|
||||
ocrWorker.Start(context.Background())
|
||||
defer ocrWorker.Stop()
|
||||
if !ocr.IsAvailable() {
|
||||
ts := ocr.CheckTools()
|
||||
logger.Warn("ocr tools not fully available — install tesseract-ocr + poppler-utils for full OCR support",
|
||||
"pdftotext", ts.HasPdftotext, "tesseract", ts.HasTesseract, "pdftoppm", ts.HasPdftoppm)
|
||||
}
|
||||
|
||||
// Boot-resume: re-enqueue all mails still marked ocr_status='pending'.
|
||||
go func() {
|
||||
ctx := context.Background()
|
||||
pending, err := mailStore.GetPendingOCRMails(ctx, nil, 5000)
|
||||
if err != nil {
|
||||
logger.Warn("ocr boot-resume: query failed", "err", err)
|
||||
return
|
||||
}
|
||||
if len(pending) == 0 {
|
||||
return
|
||||
}
|
||||
logger.Info("ocr boot-resume: re-enqueueing pending jobs", "count", len(pending))
|
||||
for _, m := range pending {
|
||||
ocrWorker.Submit(m.ID, m.TenantID)
|
||||
}
|
||||
}()
|
||||
|
||||
// User store
|
||||
users, err := userstore.New(cfg.Database.DSN())
|
||||
if err != nil {
|
||||
@@ -299,7 +336,7 @@ func main() {
|
||||
smtpDaemon.SetIndexCallback(func(raw []byte, id string) {
|
||||
// Look up the tenant_id for this email from DB metadata.
|
||||
tenantID, _ := mailStore.GetTenantForMail(context.Background(), id)
|
||||
submitToWorker(tenantWorker, mailStore, raw, id, tenantID, logger)
|
||||
submitToWorker(tenantWorker, mailStore, raw, id, tenantID, logger, ocrWorker)
|
||||
})
|
||||
// Wire tenant routing into SMTP daemon
|
||||
if cfg.SMTP.TenantRouting == "domain" {
|
||||
@@ -363,7 +400,7 @@ func main() {
|
||||
srv.SetPop3(pop3St, pop3Imp)
|
||||
|
||||
// Backfill in background: migrate existing files into DB metadata + re-index
|
||||
go runBackfill(context.Background(), mailStore, idx, tenantWorker, logger)
|
||||
go runBackfill(context.Background(), mailStore, idx, tenantWorker, logger, ocrWorker)
|
||||
|
||||
// Background integrity verification — runs every 5 minutes
|
||||
go runIntegrityCheck(context.Background(), mailStore, logger)
|
||||
@@ -389,7 +426,9 @@ func main() {
|
||||
|
||||
// submitToWorker parses a raw email and submits it to the async index worker.
|
||||
// tenantID may be nil for global context.
|
||||
func submitToWorker(worker *index.TenantIndexWorker, store *storage.Store, raw []byte, id string, tenantID *int64, logger *slog.Logger) {
|
||||
// If ocrWorker is non-nil and the mail has attachments, an OCR job is also
|
||||
// queued (non-blocking).
|
||||
func submitToWorker(worker *index.TenantIndexWorker, store *storage.Store, raw []byte, id string, tenantID *int64, logger *slog.Logger, ocrWorker *ocr.Worker) {
|
||||
pm, err := mailparser.Parse(raw)
|
||||
if err != nil {
|
||||
logger.Warn("index: parse failed, skipping indexing", "id", id, "err", err)
|
||||
@@ -422,12 +461,19 @@ func submitToWorker(worker *index.TenantIndexWorker, store *storage.Store, raw [
|
||||
if err := store.SetIndexedAt(context.Background(), id); err != nil {
|
||||
logger.Warn("index: set indexed_at failed", "id", id, "err", err)
|
||||
}
|
||||
|
||||
// PROJ-35: hand off to OCR worker for asynchronous attachment processing.
|
||||
if ocrWorker != nil && len(pm.Attachments) > 0 {
|
||||
ocrWorker.Submit(id, tenantID)
|
||||
}
|
||||
}
|
||||
|
||||
// runBackfill walks the store, inserts missing DB metadata, and indexes
|
||||
// emails that have not yet been indexed. Per-tenant indexing is handled by
|
||||
// looking up each email's tenant_id from the DB.
|
||||
func runBackfill(ctx context.Context, store *storage.Store, idx index.Indexer, worker *index.TenantIndexWorker, logger *slog.Logger) {
|
||||
// ocrWorker is optional; when non-nil, mails with attachments are also
|
||||
// queued for OCR processing.
|
||||
func runBackfill(ctx context.Context, store *storage.Store, idx index.Indexer, worker *index.TenantIndexWorker, logger *slog.Logger, ocrWorker *ocr.Worker) {
|
||||
logger.Info("backfill: starting")
|
||||
|
||||
count := 0
|
||||
@@ -465,7 +511,7 @@ func runBackfill(ctx context.Context, store *storage.Store, idx index.Indexer, w
|
||||
if !alreadyIndexed {
|
||||
needIndex++
|
||||
tenantID, _ := store.GetTenantForMail(ctx, id)
|
||||
submitToWorker(worker, store, raw, id, tenantID, logger)
|
||||
submitToWorker(worker, store, raw, id, tenantID, logger, ocrWorker)
|
||||
}
|
||||
|
||||
if count%100 == 0 {
|
||||
|
||||
Reference in New Issue
Block a user