feat(PROJ-35): OCR & Anhang-Volltext-Indexierung

Asynchrone OCR fuer PDF- und Bild-Anhaenge via tesseract + poppler-utils.
Extrahierter Text wird in Manticore (attachment_text) gespeichert und ist
ueber die normale Volltextsuche auffindbar.

- internal/ocr: ExtractText + Worker (queue + drain)
- internal/storage/ocr.go: SetOCRStatus, OCREnabled, GetMailsByOCRStatus
- emails.ocr_status (pending|done|failed|skipped|disabled)
- tenants.ocr_enabled (Default TRUE, opt-out)
- Manticore: attachment_text-Feld + UpdateAttachmentText
- Boot-resume: pending Jobs nach Restart automatisch in die Queue
- CLI: archivmail ocr-reprocess --tenant N --status pending|failed|all
- update.sh: tesseract-ocr + poppler-utils optional installieren
This commit is contained in:
sysops
2026-05-08 22:11:17 +02:00
parent 2a91f6e249
commit 0bda21033e
11 changed files with 926 additions and 25 deletions
+20 -10
View File
@@ -7,16 +7,17 @@ import (
// MailDocument is the indexed representation of a stored email.
//
// NOTE(review): in this view the field list appears twice — once without and
// once with AttachmentText. This looks like the old and new side of a diff
// rendered without +/- markers; confirm against the actual file, where only
// the second list (including AttachmentText) is expected to remain.
type MailDocument struct {
ID string
From string
To string
Subject string
Body string
AttachNames string
HasAttachment bool
Date time.Time
Size int64
TenantID *int64 // nil = global / superadmin context
ID string
From string
To string
Subject string
Body string
AttachNames string
AttachmentText string // PROJ-35: OCR-extracted text from PDF/image attachments
HasAttachment bool
Date time.Time
Size int64
TenantID *int64 // nil = global / superadmin context
}
// SearchRequest specifies search parameters.
@@ -53,6 +54,15 @@ type Indexer interface {
Close() error
}
// AttachmentTextUpdater is an optional capability on top of Indexer: an
// indexer that satisfies it can replace just the OCR-extracted attachment
// text of a document that is already indexed.
//
// Not every indexer provides this, so callers are expected to type-assert for
// the interface and fall back gracefully when the assertion fails.
//
// PROJ-35: Manticore implements this; legacy Xapian does not.
type AttachmentTextUpdater interface {
	// UpdateAttachmentText stores text as the attachment text of the document
	// identified by mailID and reports any failure as an error.
	UpdateAttachmentText(mailID string, text string) error
}
// TenantIndexer manages per-tenant Indexer instances.
// Implemented by ManticoreTenantManager (primary) and TenantIndexManager (legacy Xapian).
type TenantIndexer interface {
+73 -5
View File
@@ -105,7 +105,8 @@ func (m *ManticoreTenantManager) Close() error {
// ── manticoreIndex methods ────────────────────────────────────────────────
// ensureTable creates the RT index if it does not yet exist.
// ensureTable creates the RT index if it does not yet exist and applies
// idempotent column additions for schema migrations.
func (idx *manticoreIndex) ensureTable() error {
stmt := fmt.Sprintf(`CREATE TABLE IF NOT EXISTS %s (
mail_id string,
@@ -114,14 +115,46 @@ func (idx *manticoreIndex) ensureTable() error {
to_addr text,
body text,
attachment_names text,
attachment_text text,
has_attachment uint,
date_ts bigint,
size_bytes bigint
) type='rt' morphology='stem_en,lemmatize_de_all'`, idx.table)
_, err := idx.db.Exec(stmt)
if err != nil {
if _, err := idx.db.Exec(stmt); err != nil {
return fmt.Errorf("ensureTable %s: %w", idx.table, err)
}
// PROJ-35: ALTER existing tables to add attachment_text. Manticore lacks
// ALTER IF NOT EXISTS, so we DESC first and only add when missing.
if err := idx.ensureColumn("attachment_text", "text"); err != nil {
return err
}
return nil
}
// ensureColumn makes sure the index table has a column called name.
//
// It lists the table's columns with DESC <table> and returns nil as soon as a
// row whose first value equals name is seen. If the listing finishes without a
// match, the column is created with ALTER TABLE ... ADD COLUMN using typ as
// the column type. Each DESC row is scanned into three values; only the first
// (the column name) is inspected.
//
// Calling it again after the column exists is a no-op apart from the DESC
// query. Query, scan, iteration and ALTER failures are returned wrapped with
// the table name.
func (idx *manticoreIndex) ensureColumn(name, typ string) error {
	descStmt := fmt.Sprintf("DESC %s", idx.table)
	cols, err := idx.db.Query(descStmt)
	if err != nil {
		return fmt.Errorf("desc %s: %w", idx.table, err)
	}
	defer cols.Close()

	// Scan targets are reused across rows; Scan overwrites them each time.
	var colName, colType string
	var colProps sql.NullString
	for cols.Next() {
		if scanErr := cols.Scan(&colName, &colType, &colProps); scanErr != nil {
			return fmt.Errorf("desc scan %s: %w", idx.table, scanErr)
		}
		if colName == name {
			// Column already present: nothing to migrate.
			return nil
		}
	}
	if iterErr := cols.Err(); iterErr != nil {
		return fmt.Errorf("desc rows %s: %w", idx.table, iterErr)
	}

	// Column not found in the listing: add it.
	alterStmt := fmt.Sprintf("ALTER TABLE %s ADD COLUMN %s %s", idx.table, name, typ)
	if _, execErr := idx.db.Exec(alterStmt); execErr != nil {
		return fmt.Errorf("alter %s add %s: %w", idx.table, name, execErr)
	}
	return nil
}
@@ -139,8 +172,8 @@ func (idx *manticoreIndex) IndexSync(doc MailDocument) error {
_, err := idx.db.Exec(
fmt.Sprintf(`REPLACE INTO %s
(id, mail_id, subject, from_addr, to_addr, body, attachment_names, has_attachment, date_ts, size_bytes)
VALUES (?,?,?,?,?,?,?,?,?,?)`, idx.table),
(id, mail_id, subject, from_addr, to_addr, body, attachment_names, attachment_text, has_attachment, date_ts, size_bytes)
VALUES (?,?,?,?,?,?,?,?,?,?,?)`, idx.table),
rowID,
doc.ID,
doc.Subject,
@@ -148,6 +181,7 @@ func (idx *manticoreIndex) IndexSync(doc MailDocument) error {
doc.To,
doc.Body,
doc.AttachNames,
doc.AttachmentText,
hasAttach,
dateTS,
doc.Size,
@@ -158,6 +192,40 @@ func (idx *manticoreIndex) IndexSync(doc MailDocument) error {
return nil
}
// UpdateAttachmentText overwrites the attachment_text of the document indexed
// for mailID and leaves every other stored column as it was. It implements
// index.AttachmentTextUpdater.
//
// Manticore RT indexes do not support UPDATE on text columns, so the change is
// done as a read-modify-write: the current row is selected by its hashed id,
// then written back in full with REPLACE INTO, with text substituted for
// attachment_text.
//
// If the existing row cannot be loaded, the Scan error is wrapped with %w and
// returned, and no REPLACE is issued. That includes the case where the mail
// has not been indexed yet (presumably surfacing as sql.ErrNoRows — confirm
// against the type of idx.db), so the mail must be ingested first.
//
// NOTE(review): the SELECT and the REPLACE are two separate statements here;
// whether a concurrent write to the same row can interleave is not visible
// from this block.
func (idx *manticoreIndex) UpdateAttachmentText(mailID, text string) error {
	const selectFmt = `SELECT mail_id, subject, from_addr, to_addr, body, attachment_names,
has_attachment, date_ts, size_bytes
FROM %s WHERE id = ? LIMIT 1`
	const replaceFmt = `REPLACE INTO %s
(id, mail_id, subject, from_addr, to_addr, body, attachment_names, attachment_text, has_attachment, date_ts, size_bytes)
VALUES (?,?,?,?,?,?,?,?,?,?,?)`

	docID := hashMailID(mailID)

	// Step 1: load the currently stored values so they survive the REPLACE.
	var storedMailID, subject, sender, recipients, bodyText, names string
	var attachFlag uint64
	var dateTS, sizeBytes int64

	current := idx.db.QueryRow(fmt.Sprintf(selectFmt, idx.table), docID)
	loadErr := current.Scan(
		&storedMailID, &subject, &sender, &recipients, &bodyText, &names,
		&attachFlag, &dateTS, &sizeBytes,
	)
	if loadErr != nil {
		return fmt.Errorf("manticore UpdateAttachmentText %s: load row: %w", idx.table, loadErr)
	}

	// Step 2: write the row back with only attachment_text replaced.
	_, writeErr := idx.db.Exec(
		fmt.Sprintf(replaceFmt, idx.table),
		docID, storedMailID, subject, sender, recipients, bodyText, names,
		text, attachFlag, dateTS, sizeBytes,
	)
	if writeErr != nil {
		return fmt.Errorf("manticore UpdateAttachmentText %s: replace: %w", idx.table, writeErr)
	}
	return nil
}
// Delete removes a document by mail ID hash.
func (idx *manticoreIndex) Delete(id string) error {
rowID := hashMailID(id)