Files
archivmail/cmd/archivmail/main.go
T
sysops 0bda21033e feat(PROJ-35): OCR & Anhang-Volltext-Indexierung
Asynchrone OCR fuer PDF- und Bild-Anhaenge via tesseract + poppler-utils.
Extrahierter Text wird in Manticore (attachment_text) gespeichert und ist
ueber die normale Volltextsuche auffindbar.

- internal/ocr: ExtractText + Worker (queue + drain)
- internal/storage/ocr.go: SetOCRStatus, OCREnabled, GetMailsByOCRStatus
- emails.ocr_status (pending|done|failed|skipped|disabled)
- tenants.ocr_enabled (Default TRUE, opt-out)
- Manticore: attachment_text-Feld + UpdateAttachmentText
- Boot-resume: pending Jobs nach Restart automatisch in die Queue
- CLI: archivmail ocr-reprocess --tenant N --status pending|failed|all
- update.sh: tesseract-ocr + poppler-utils optional installieren
2026-05-08 22:11:17 +02:00

695 lines
20 KiB
Go

package main
import (
"context"
"crypto/rand"
"crypto/sha256"
"encoding/hex"
"flag"
"fmt"
"io"
"log/slog"
"net/http"
"os"
"os/signal"
"strings"
"syscall"
"time"
"golang.org/x/crypto/hkdf"
"github.com/jackc/pgx/v5/pgxpool"
"archivmail/config"
"archivmail/internal/api"
"archivmail/internal/audit"
"archivmail/internal/auth"
imapstore "archivmail/internal/imap"
"archivmail/internal/imapserver"
"archivmail/internal/index"
ldapcfg "archivmail/internal/ldapconfig"
"archivmail/internal/mailer"
"archivmail/internal/ocr"
pop3store "archivmail/internal/pop3"
"archivmail/internal/smtpoutconfig"
"archivmail/internal/smtpd"
"archivmail/internal/storage"
tenantstore "archivmail/internal/tenantstore"
"archivmail/internal/tokenstore"
"archivmail/internal/userstore"
"archivmail/pkg/mailparser"
)
// main is the process entry point.
//
// If the first argument names a one-shot subcommand (import, import-piler,
// export, migrate-tenants, reindex, recompress, rethread, ocr-reprocess,
// version, help) that subcommand runs and main returns. Otherwise — with or
// without an explicit "serve" argument — main loads the config file, derives
// the JWT and AES keys, wires up storage, index, OCR, the various stores, the
// optional IMAP archive server, the SMTP daemon and the IMAP/POP3 importers,
// starts the background backfill and integrity check, and serves the HTTP API
// until SIGINT or SIGTERM arrives.
//
// NOTE(review): every fatal init error below calls os.Exit(1), which does not
// run deferred functions; stores and workers opened earlier are therefore not
// closed/stopped on those paths.
func main() {
	if len(os.Args) > 1 {
		switch os.Args[1] {
		case "import":
			runImport(os.Args[2:])
			return
		case "import-piler":
			runImportPiler(os.Args[2:])
			return
		case "export":
			runExport(os.Args[2:])
			return
		case "migrate-tenants":
			runMigrateTenants(os.Args[2:])
			return
		case "reindex":
			runReindex(os.Args[2:])
			return
		case "recompress":
			runRecompress(os.Args[2:])
			return
		case "rethread":
			runRethread(os.Args[2:])
			return
		case "ocr-reprocess":
			runOCRReprocess(os.Args[2:])
			return
		case "version":
			fmt.Printf("archivmail %s\n", AppVersion)
			for mod, ver := range Modules {
				fmt.Printf(" %-14s %s\n", mod, ver)
			}
			return
		case "help", "--help", "-h":
			printHelp()
			return
		case "serve":
			// strip "serve" from args so flag.Parse works normally below
			os.Args = append(os.Args[:1], os.Args[2:]...)
		}
	}

	configPath := flag.String("config", "/etc/archivmail/config.yml", "path to config file")
	flag.Parse()

	logger := slog.New(slog.NewTextHandler(os.Stdout, &slog.HandlerOptions{Level: slog.LevelInfo}))

	cfg, err := config.Load(*configPath)
	if err != nil {
		logger.Error("failed to load config", "path", *configPath, "err", err)
		os.Exit(1)
	}

	// SEC-08: Derive separate keys from the master secret to prevent key reuse.
	// jwtSecret is used for JWT token signing only.
	// aesKey is used for AES-256-GCM encryption of stored passwords (IMAP, POP3, LDAP).
	// HKDF is deterministic: same cfg.API.Secret always produces the same derived keys.
	// NOTE: After this change, existing stored IMAP/POP3/LDAP passwords must be
	// re-entered once, as they were encrypted with the old undivided key.
	masterKey := []byte(cfg.API.Secret)
	jwtKeyRaw := make([]byte, 32)
	if _, err := io.ReadFull(hkdf.New(sha256.New, masterKey, []byte("archivmail-jwt-v1"), nil), jwtKeyRaw); err != nil {
		logger.Error("key derivation failed", "err", err)
		os.Exit(1)
	}
	aesKeyRaw := make([]byte, 32)
	if _, err := io.ReadFull(hkdf.New(sha256.New, masterKey, []byte("archivmail-aes-v1"), nil), aesKeyRaw); err != nil {
		logger.Error("key derivation failed", "err", err)
		os.Exit(1)
	}
	jwtSecret := hex.EncodeToString(jwtKeyRaw)
	aesKey := hex.EncodeToString(aesKeyRaw)

	// Storage with encryption + DB metadata
	storeCfg := storage.Config{
		Dir:             cfg.Storage.StorePath,
		Keyfile:         cfg.Storage.Keyfile,
		DSN:             cfg.Database.DSN(),
		RetentionDays:   cfg.Storage.RetentionDays,
		CompressEnabled: cfg.Storage.Compress,
	}
	mailStore, err := storage.New(storeCfg)
	if err != nil {
		logger.Error("storage init failed", "err", err)
		os.Exit(1)
	}
	defer mailStore.Close()

	// Index — per-tenant index manager (PROJ-21 Phase 4)
	indexBackend := cfg.Index.Backend
	if indexBackend == "" {
		indexBackend = "manticore"
	}
	batchSize := cfg.Index.BatchSize
	if batchSize <= 0 {
		batchSize = 100
	}
	var idxMgr index.TenantIndexer
	if indexBackend == "manticore" {
		dsn := cfg.Index.ManticoreDSN
		if dsn == "" {
			dsn = "manticore@tcp(127.0.0.1:9306)/"
		}
		m, err := index.NewManticoreTenantManager(dsn)
		if err != nil {
			logger.Error("manticore index manager init failed", "err", err)
			os.Exit(1)
		}
		idxMgr = m
	} else {
		m, err := index.NewTenantIndexManager(cfg.Index.Path, batchSize, indexBackend)
		if err != nil {
			logger.Error("index manager init failed", "err", err)
			os.Exit(1)
		}
		idxMgr = m
	}
	defer func() { idxMgr.Close() }()

	// Global index reference for backward compatibility (IMAP importer, etc.)
	idx := idxMgr.Global()

	// Async index worker — tenant-aware (routes docs to correct per-tenant index)
	asyncQueueSize := cfg.Index.AsyncQueueSize
	if asyncQueueSize <= 0 {
		asyncQueueSize = 1000
	}
	tenantWorker := index.NewTenantWorker(idxMgr, asyncQueueSize, logger)
	tenantWorker.Start()
	defer tenantWorker.Stop()

	// PROJ-35: OCR-Worker — extracts text from PDF/image attachments and feeds
	// it back into the per-tenant Manticore index. Non-blocking submit so the
	// mail intake pipeline is never delayed.
	ocrWorker := ocr.NewWorker(mailStore, idxMgr, ocr.Options{
		Workers:   2,
		QueueSize: 1000,
		Logger:    logger,
	})
	ocrWorker.Start(context.Background())
	defer ocrWorker.Stop()
	if !ocr.IsAvailable() {
		ts := ocr.CheckTools()
		logger.Warn("ocr tools not fully available — install tesseract-ocr + poppler-utils for full OCR support",
			"pdftotext", ts.HasPdftotext, "tesseract", ts.HasTesseract, "pdftoppm", ts.HasPdftoppm)
	}

	// Boot-resume: re-enqueue all mails still marked ocr_status='pending'.
	// NOTE(review): the third argument (5000) looks like a row limit; if so,
	// pending mails beyond it are not resumed in this pass — confirm.
	go func() {
		ctx := context.Background()
		pending, err := mailStore.GetPendingOCRMails(ctx, nil, 5000)
		if err != nil {
			logger.Warn("ocr boot-resume: query failed", "err", err)
			return
		}
		if len(pending) == 0 {
			return
		}
		logger.Info("ocr boot-resume: re-enqueueing pending jobs", "count", len(pending))
		for _, m := range pending {
			ocrWorker.Submit(m.ID, m.TenantID)
		}
	}()

	// User store
	users, err := userstore.New(cfg.Database.DSN())
	if err != nil {
		logger.Error("userstore init failed", "err", err)
		os.Exit(1)
	}
	defer users.Close()

	// Audit log
	audlog, err := audit.New(cfg.Database.DSN(), cfg.Audit.LogPath, logger)
	if err != nil {
		logger.Error("audit init failed", "err", err)
		os.Exit(1)
	}
	defer audlog.Close()

	// Seed default users on first run. A failure is logged but does not abort
	// startup.
	if err := seedDefaultUsers(users, logger); err != nil {
		logger.Error("seed users failed", "err", err)
	}

	// LDAP config store
	ldapSt, err := ldapcfg.New(cfg.Database.DSN(), aesKey)
	if err != nil {
		logger.Error("ldap config store init failed", "err", err)
		os.Exit(1)
	}
	defer ldapSt.Close()

	// Auth manager (with LDAP fallback + TOTP AES key)
	authMgr := auth.New(users, ldapSt, jwtSecret, aesKey)

	// API server
	apiCfg := config.APIConfig{
		Bind:   cfg.API.Bind,
		Secret: jwtSecret,
	}
	srv := api.New(apiCfg, mailStore, idx, authMgr, users, audlog, logger)
	srv.SetVersion(AppVersion, Modules)
	srv.SetGlobalRetentionDays(cfg.Storage.RetentionDays)
	srv.SetMetrics(cfg.Metrics)

	// PROJ-28: Self-Service Onboarding — mailer + token store + FQDN
	mlr := mailer.New(cfg.SMTPOut)

	// SMTP-Out config store — load from DB, overrides config.yml if present
	// NOTE(review): this store is keyed with the undivided cfg.API.Secret,
	// whereas the LDAP/IMAP/POP3 stores receive the HKDF-derived aesKey.
	// Switching it would invalidate already-stored data, so it is left
	// unchanged here — confirm whether this is intentional.
	smtpOutSt, err := smtpoutconfig.New(cfg.Database.DSN(), cfg.API.Secret)
	if err != nil {
		logger.Error("smtp-out config store init failed", "err", err)
		os.Exit(1)
	}
	defer smtpOutSt.Close()
	srv.SetSMTPOutStore(smtpOutSt)

	// Override config.yml settings with DB config if available
	if dbCfg, err := smtpOutSt.GetWithPassword(context.Background()); err == nil && dbCfg != nil && dbCfg.Enabled {
		mlr.Reload(config.SMTPOutConfig{
			Host:     dbCfg.Host,
			Port:     dbCfg.Port,
			User:     dbCfg.User,
			Password: dbCfg.Password,
			TLS:      dbCfg.TLS,
			From:     dbCfg.From,
		})
		logger.Info("smtp_out: loaded from database")
	}
	srv.SetMailer(mlr)
	srv.SetFQDN(cfg.Server.FQDN)
	if cfg.Server.FQDN == "" {
		logger.Warn("server.fqdn not set — signup/reset links will not work (PROJ-28)")
	}

	tokenPool, err := pgxpool.New(context.Background(), cfg.Database.DSN())
	if err != nil {
		logger.Error("token store pool failed", "err", err)
		os.Exit(1)
	}
	defer tokenPool.Close()
	tokenSt, err := tokenstore.New(tokenPool)
	if err != nil {
		logger.Error("token store init failed", "err", err)
		os.Exit(1)
	}
	srv.SetTokenStore(tokenSt)

	bind := cfg.API.Bind
	if bind == "" {
		bind = fmt.Sprintf(":%d", cfg.Server.APIPort)
	}
	httpServer := &http.Server{
		Addr:    bind,
		Handler: srv,
		// Bound the time a client may take to send request headers so that
		// idle or deliberately slow connections cannot be held open forever.
		// Body read/write timeouts are intentionally not set here, as they
		// could cut off long-running uploads or downloads.
		ReadHeaderTimeout: 10 * time.Second,
	}

	// Tenant store (Multi-Tenancy Phase 1+2) — must be initialised before SMTP daemon
	tenantSt, err := tenantstore.New(cfg.Database.DSN())
	if err != nil {
		logger.Error("tenant store init failed", "err", err)
		os.Exit(1)
	}
	defer tenantSt.Close()
	srv.SetTenants(tenantSt)
	srv.SetIndexManager(idxMgr)

	// PROJ-26: IMAP Archive Server (read-only access for IMAP clients)
	if cfg.IMAPServer.Enabled {
		cfg.IMAPServer.FQDN = cfg.Server.FQDN
		imapSrv := imapserver.New(cfg.IMAPServer, mailStore, users, audlog, authMgr, logger, tenantSt)
		if err := imapSrv.Start(); err != nil {
			logger.Error("IMAP server failed to start", "err", err)
			os.Exit(1)
		}
		defer imapSrv.Stop()
		// imapBind is only used for the log line below.
		imapBind := cfg.IMAPServer.Bind
		if imapBind == "" {
			imapBind = "127.0.0.1:1143"
		}
		logger.Info("IMAP archive server started", "addr", imapBind)
	}

	// Start SMTP daemon with index worker integration
	if cfg.SMTP.Bind == "" {
		cfg.SMTP.Bind = fmt.Sprintf(":%d", cfg.Server.SMTPPort)
	}
	// PROJ-28: FQDN fallback for SMTP EHLO banner
	if cfg.SMTP.Domain == "" && cfg.Server.FQDN != "" {
		cfg.SMTP.Domain = cfg.Server.FQDN
	}
	smtpDaemon := smtpd.New(cfg.SMTP, mailStore, logger)
	smtpDaemon.SetIndexCallback(func(raw []byte, id string) {
		// Look up the tenant_id for this email from DB metadata. A failed
		// lookup is logged (previously it was silently discarded); the mail
		// is still submitted with whatever tenant ID was returned so that
		// intake behaviour is unchanged.
		tenantID, err := mailStore.GetTenantForMail(context.Background(), id)
		if err != nil {
			logger.Warn("smtp: tenant lookup failed", "id", id, "err", err)
		}
		submitToWorker(tenantWorker, mailStore, raw, id, tenantID, logger, ocrWorker)
	})

	// Wire tenant routing into SMTP daemon
	if cfg.SMTP.TenantRouting == "domain" {
		var defaultTenantID *int64
		if cfg.SMTP.DefaultTenantID != 0 {
			id := cfg.SMTP.DefaultTenantID
			defaultTenantID = &id
		}
		smtpDaemon.SetDomainToTenant(func(ctx context.Context, domain string) (*int64, error) {
			t, err := tenantSt.GetByDomain(ctx, domain)
			if err != nil || t == nil {
				return nil, err
			}
			id := t.ID
			return &id, nil
		}, defaultTenantID)
	}
	if err := smtpDaemon.Start(); err != nil {
		logger.Error("SMTP daemon failed to start", "err", err)
		os.Exit(1)
	}
	defer smtpDaemon.Stop()

	// Wire LDAP config store into API server
	srv.SetLDAP(ldapSt)

	// PROJ-23: Per-tenant LDAP config store
	tenantLdapSt, err := ldapcfg.NewTenantStore(cfg.Database.DSN(), aesKey)
	if err != nil {
		logger.Error("tenant ldap store init failed", "err", err)
		os.Exit(1)
	}
	defer tenantLdapSt.Close()
	srv.SetTenantLDAP(tenantLdapSt)
	authMgr.SetTenantLDAP(tenantLdapSt, tenantSt)

	// Wire SMTP daemon into API server for status endpoint
	srv.SetSMTPDaemon(smtpDaemon)

	// IMAP store + importer + scheduler (wired to use async worker)
	imapSt, err := imapstore.New(cfg.Database.DSN(), aesKey)
	if err != nil {
		logger.Error("imap store init failed", "err", err)
		os.Exit(1)
	}
	defer imapSt.Close()
	imapImp := imapstore.NewImporter(imapSt, mailStore, idx, logger)
	imapSched := imapstore.NewScheduler(imapSt, imapImp, logger)
	imapSched.Start()
	defer imapSched.Stop()
	srv.SetImap(imapSt, imapImp, imapSched)

	// POP3 store + importer
	pop3St, err := pop3store.New(cfg.Database.DSN(), aesKey)
	if err != nil {
		logger.Error("pop3 store init failed", "err", err)
		os.Exit(1)
	}
	defer pop3St.Close()
	pop3Imp := pop3store.NewImporter(pop3St, mailStore, idx, logger)
	srv.SetPop3(pop3St, pop3Imp)

	// NOTE(review): both background goroutines below receive
	// context.Background(), so they are never cancelled on shutdown and only
	// end when the process exits.

	// Backfill in background: migrate existing files into DB metadata + re-index
	go runBackfill(context.Background(), mailStore, idx, tenantWorker, logger, ocrWorker)

	// Background integrity verification — runs every 5 minutes
	go runIntegrityCheck(context.Background(), mailStore, logger)

	// Start HTTP API
	go func() {
		logger.Info("starting API server", "addr", bind)
		if err := httpServer.ListenAndServe(); err != nil && err != http.ErrServerClosed {
			logger.Error("API server error", "err", err)
		}
	}()

	// Graceful shutdown: block until SIGINT/SIGTERM, then give in-flight HTTP
	// requests up to 10 seconds to finish. The deferred Stop/Close calls above
	// run afterwards in reverse order of registration.
	quit := make(chan os.Signal, 1)
	signal.Notify(quit, syscall.SIGINT, syscall.SIGTERM)
	<-quit
	logger.Info("shutting down...")
	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()
	if err := httpServer.Shutdown(ctx); err != nil {
		// Typically the timeout expired while connections were still active.
		logger.Error("API server shutdown failed", "err", err)
	}
}
// submitToWorker parses the raw message, turns it into an index document and
// hands that document to the asynchronous tenant index worker.
//
// tenantID may be nil, in which case the document carries no tenant. A message
// that cannot be parsed is logged and skipped entirely. After the submit the
// mail's indexed_at marker is set in the store; a failure to set it is only
// logged. When ocrWorker is non-nil and the mail has at least one attachment,
// an OCR job for the mail is queued as well.
//
// NOTE(review): indexed_at is set right after Submit returns, i.e. possibly
// before the worker has actually written the document — confirm that Submit
// cannot drop documents.
func submitToWorker(worker *index.TenantIndexWorker, store *storage.Store, raw []byte, id string, tenantID *int64, logger *slog.Logger, ocrWorker *ocr.Worker) {
	parsed, parseErr := mailparser.Parse(raw)
	if parseErr != nil {
		logger.Warn("index: parse failed, skipping indexing", "id", id, "err", parseErr)
		return
	}

	hasAttachments := len(parsed.Attachments) > 0

	// Only attachments that carry a filename contribute to the searchable
	// attachment-name field.
	var names []string
	for _, att := range parsed.Attachments {
		if att.Filename == "" {
			continue
		}
		names = append(names, att.Filename)
	}

	worker.Submit(index.MailDocument{
		ID:            id,
		From:          parsed.From,
		To:            strings.Join(parsed.To, ", "),
		Subject:       parsed.Subject,
		Body:          parsed.TextBody,
		AttachNames:   strings.Join(names, " "),
		HasAttachment: hasAttachments,
		Date:          parsed.Date,
		Size:          int64(len(raw)),
		TenantID:      tenantID,
	})

	// Mark as indexed in DB
	if markErr := store.SetIndexedAt(context.Background(), id); markErr != nil {
		logger.Warn("index: set indexed_at failed", "id", id, "err", markErr)
	}

	// PROJ-35: hand off to OCR worker for asynchronous attachment processing.
	if hasAttachments && ocrWorker != nil {
		ocrWorker.Submit(id, tenantID)
	}
}
// runBackfill walks the store, upserts DB metadata for every mail it can load
// and parse, and submits mails that are not yet marked as indexed to the async
// index worker. The tenant for each such mail is looked up from the DB.
//
// ocrWorker is optional; it is passed through to submitToWorker, which queues
// mails with attachments for OCR when it is non-nil. The idx parameter is
// currently unused and kept only for signature compatibility.
//
// Per-mail failures (load, parse, metadata, lookups) are logged and do not
// stop the walk. The walk stops early once ctx is cancelled, which is reported
// as a cancellation rather than as a failure.
func runBackfill(ctx context.Context, store *storage.Store, idx index.Indexer, worker *index.TenantIndexWorker, logger *slog.Logger, ocrWorker *ocr.Worker) {
	logger.Info("backfill: starting")
	count := 0
	needIndex := 0
	errCount := 0
	err := store.WalkStore(ctx, func(id string) error {
		// Stop promptly on cancellation instead of running every remaining
		// mail through DB calls that would fail one by one.
		if err := ctx.Err(); err != nil {
			return err
		}
		count++
		raw, err := store.Load(id)
		if err != nil {
			logger.Warn("backfill: load failed", "id", id, "err", err)
			errCount++
			return nil
		}
		pm, err := mailparser.Parse(raw)
		if err != nil {
			logger.Warn("backfill: parse failed", "id", id, "err", err)
			errCount++
			return nil
		}
		// Upsert metadata into DB
		if err := store.SaveMeta(ctx, id, pm, len(raw)); err != nil {
			logger.Warn("backfill: save meta failed", "id", id, "err", err)
		}
		// Check if already indexed. On a failed check alreadyIndexed stays
		// false, so the mail is (re-)submitted rather than skipped.
		alreadyIndexed, err := store.IsIndexed(ctx, id)
		if err != nil {
			logger.Warn("backfill: check indexed failed", "id", id, "err", err)
		}
		if !alreadyIndexed {
			needIndex++
			// A failed tenant lookup used to be dropped silently; log it so
			// that mails indexed without their tenant can be traced. The
			// submit itself is kept to preserve the previous behaviour.
			tenantID, err := store.GetTenantForMail(ctx, id)
			if err != nil {
				logger.Warn("backfill: tenant lookup failed", "id", id, "err", err)
			}
			submitToWorker(worker, store, raw, id, tenantID, logger, ocrWorker)
		}
		if count%100 == 0 {
			logger.Info("backfill: progress", "processed", count, "need_index", needIndex, "errors", errCount)
		}
		return nil
	})
	if err != nil {
		if ctx.Err() != nil {
			logger.Info("backfill: cancelled", "processed", count, "submitted_for_index", needIndex, "errors", errCount)
			return
		}
		logger.Error("backfill failed", "err", err)
		return
	}
	logger.Info("backfill: complete", "total", count, "submitted_for_index", needIndex, "errors", errCount)
}
// reindexTenant re-indexes all emails belonging to a specific tenant.
// Used during migration when switching from global index to per-tenant indexes.
//
// The mail IDs of the tenant are fetched from the store, each mail is loaded,
// parsed and written synchronously to the tenant's index obtained from mgr.
// Mails that fail to load, parse or index are logged, counted and skipped.
//
// It returns an error if the ID list cannot be fetched or if ctx is cancelled
// before all mails were processed; per-mail failures do not produce an error.
func reindexTenant(ctx context.Context, store *storage.Store, mgr index.TenantIndexer, tenantID int64, logger *slog.Logger) error {
	tid := tenantID
	ids, err := store.GetAllIDsByTenant(ctx, &tid)
	if err != nil {
		return fmt.Errorf("reindex tenant %d: get IDs: %w", tenantID, err)
	}
	logger.Info("reindex tenant: starting", "tenant_id", tenantID, "count", len(ids))
	idx := mgr.ForTenant(&tid)
	indexed := 0
	errCount := 0
	for _, id := range ids {
		// The loop can run for a long time on large tenants; honour
		// cancellation between mails instead of always running to the end.
		if err := ctx.Err(); err != nil {
			logger.Warn("reindex tenant: cancelled", "tenant_id", tenantID, "indexed", indexed, "errors", errCount, "total", len(ids))
			return fmt.Errorf("reindex tenant %d: %w", tenantID, err)
		}
		raw, err := store.Load(id)
		if err != nil {
			logger.Warn("reindex tenant: load failed", "tenant_id", tenantID, "id", id, "err", err)
			errCount++
			continue
		}
		pm, parseErr := mailparser.Parse(raw)
		if parseErr != nil {
			logger.Warn("reindex tenant: parse failed", "tenant_id", tenantID, "id", id, "err", parseErr)
			errCount++
			continue
		}
		// Only attachments with a filename are added to the searchable
		// attachment-name field.
		var attachNames []string
		for _, a := range pm.Attachments {
			if a.Filename != "" {
				attachNames = append(attachNames, a.Filename)
			}
		}
		doc := index.MailDocument{
			ID:            id,
			From:          pm.From,
			To:            strings.Join(pm.To, ", "),
			Subject:       pm.Subject,
			Body:          pm.TextBody,
			AttachNames:   strings.Join(attachNames, " "),
			HasAttachment: len(pm.Attachments) > 0,
			Date:          pm.Date,
			Size:          int64(len(raw)),
			TenantID:      &tid,
		}
		if err := idx.IndexSync(doc); err != nil {
			logger.Warn("reindex tenant: index failed", "tenant_id", tenantID, "id", id, "err", err)
			errCount++
			continue
		}
		indexed++
	}
	logger.Info("reindex tenant: complete", "tenant_id", tenantID, "indexed", indexed, "errors", errCount)
	return nil
}
// runIntegrityCheck verifies all stored emails once at startup and then every
// 5 minutes by calling store.VerifyIntegrity for each ID returned by
// store.GetAllIDs. It blocks until ctx is cancelled and is meant to be run in
// its own goroutine.
//
// Every mail that fails verification, or whose verification returns an error,
// is logged with its ID; a summary line is logged after each complete pass.
func runIntegrityCheck(ctx context.Context, store *storage.Store, logger *slog.Logger) {
	doVerify := func() {
		ids, err := store.GetAllIDs(ctx)
		if err != nil {
			logger.Error("integrity check: get IDs failed", "err", err)
			return
		}
		ok := 0
		fail := 0
		for _, id := range ids {
			// Abort the pass on cancellation so that the remaining mails are
			// not reported as failures merely because ctx is done.
			if ctx.Err() != nil {
				logger.Info("integrity check: cancelled", "checked", ok+fail, "total", len(ids))
				return
			}
			verified, err := store.VerifyIntegrity(ctx, id)
			if err != nil {
				// Previously only counted; log the ID and cause so that the
				// "failed" total in the summary can be investigated.
				fail++
				logger.Warn("integrity check: verify error", "id", id, "err", err)
				continue
			}
			if verified {
				ok++
			} else {
				fail++
				logger.Warn("integrity check: FAILED", "id", id)
			}
		}
		logger.Info("integrity check: complete", "ok", ok, "failed", fail, "total", len(ids))
	}

	// run once at startup, then every 5 minutes
	doVerify()
	ticker := time.NewTicker(5 * time.Minute)
	defer ticker.Stop()
	for {
		select {
		case <-ticker.C:
			doVerify()
		case <-ctx.Done():
			return
		}
	}
}
// seedDefaultUsers creates the default superadmin, admin and auditor accounts
// if no users exist yet. Passwords are randomly generated and printed once to
// stdout — there is no way to recover them afterwards; they must be changed
// immediately after the first login.
//
// It returns an error if listing users, generating a password or creating an
// account fails. If account creation fails part-way, the credentials of the
// accounts that were already created are still printed before the error is
// returned: the next start sees existing users and skips seeding, so those
// passwords would otherwise be lost.
func seedDefaultUsers(users *userstore.Store, logger *slog.Logger) error {
	all, err := users.List("")
	if err != nil {
		return fmt.Errorf("list users: %w", err)
	}
	if len(all) > 0 {
		return nil // already seeded
	}

	// Generate all passwords up front so that a generation failure happens
	// before any account has been created.
	superadminPw, err := randomPassword()
	if err != nil {
		return fmt.Errorf("generate superadmin password: %w", err)
	}
	adminPw, err := randomPassword()
	if err != nil {
		return fmt.Errorf("generate admin password: %w", err)
	}
	auditorPw, err := randomPassword()
	if err != nil {
		return fmt.Errorf("generate auditor password: %w", err)
	}

	defaults := []userstore.CreateUserRequest{
		{Username: "superadmin", Email: "superadmin@archivmail.local", Password: superadminPw, Role: userstore.RoleSuperAdmin},
		{Username: "admin", Email: "admin@archivmail.local", Password: adminPw, Role: userstore.RoleAdmin},
		{Username: "auditor", Email: "auditor@archivmail.local", Password: auditorPw, Role: userstore.RoleAuditor},
	}

	// Create accounts in order and remember which ones succeeded; stop at the
	// first failure.
	var created []userstore.CreateUserRequest
	var createErr error
	for _, req := range defaults {
		if _, err := users.Create(req); err != nil {
			createErr = fmt.Errorf("create default user %s: %w", req.Username, err)
			break
		}
		created = append(created, req)
	}
	if len(created) == 0 {
		return createErr
	}

	// Print credentials prominently — this is the only time they are visible.
	fmt.Println()
	fmt.Println("╔══════════════════════════════════════════════════════════════╗")
	fmt.Println("║ ARCHIVMAIL — ERSTMALIGE EINRICHTUNG ║")
	fmt.Println("║ Initiale Zugangsdaten (NUR EINMAL ANGEZEIGT): ║")
	for _, req := range created {
		fmt.Printf("║ %s : %-47s ║\n", req.Username, req.Password)
	}
	fmt.Println("║ Passwörter sofort nach dem ersten Login ändern! ║")
	fmt.Println("╚══════════════════════════════════════════════════════════════╝")
	fmt.Println()
	logger.Warn("default users created — change passwords immediately!")
	return createErr
}
// randomPassword returns a fresh password made of 16 bytes read from the
// cryptographic random source, encoded as a 32-character lowercase hex string.
// The error from the random source is returned unchanged, together with an
// empty string.
func randomPassword() (string, error) {
	var raw [16]byte
	if _, err := rand.Read(raw[:]); err != nil {
		return "", err
	}
	return hex.EncodeToString(raw[:]), nil
}