feat(PROJ-35): OCR & Anhang-Volltext-Indexierung
Asynchrone OCR fuer PDF- und Bild-Anhaenge via tesseract + poppler-utils. Extrahierter Text wird in Manticore (attachment_text) gespeichert und ist ueber die normale Volltextsuche auffindbar. - internal/ocr: ExtractText + Worker (queue + drain) - internal/storage/ocr.go: SetOCRStatus, OCREnabled, GetMailsByOCRStatus - emails.ocr_status (pending|done|failed|skipped|disabled) - tenants.ocr_enabled (Default TRUE, opt-out) - Manticore: attachment_text-Feld + UpdateAttachmentText - Boot-resume: pending Jobs nach Restart automatisch in die Queue - CLI: archivmail ocr-reprocess --tenant N --status pending|failed|all - update.sh: tesseract-ocr + poppler-utils optional installieren
This commit is contained in:
@@ -105,7 +105,8 @@ func (m *ManticoreTenantManager) Close() error {
|
||||
|
||||
// ── manticoreIndex methods ────────────────────────────────────────────────
|
||||
|
||||
// ensureTable creates the RT index if it does not yet exist.
|
||||
// ensureTable creates the RT index if it does not yet exist and applies
|
||||
// idempotent column additions for schema migrations.
|
||||
func (idx *manticoreIndex) ensureTable() error {
|
||||
stmt := fmt.Sprintf(`CREATE TABLE IF NOT EXISTS %s (
|
||||
mail_id string,
|
||||
@@ -114,14 +115,46 @@ func (idx *manticoreIndex) ensureTable() error {
|
||||
to_addr text,
|
||||
body text,
|
||||
attachment_names text,
|
||||
attachment_text text,
|
||||
has_attachment uint,
|
||||
date_ts bigint,
|
||||
size_bytes bigint
|
||||
) type='rt' morphology='stem_en,lemmatize_de_all'`, idx.table)
|
||||
_, err := idx.db.Exec(stmt)
|
||||
if err != nil {
|
||||
if _, err := idx.db.Exec(stmt); err != nil {
|
||||
return fmt.Errorf("ensureTable %s: %w", idx.table, err)
|
||||
}
|
||||
// PROJ-35: ALTER existing tables to add attachment_text. Manticore lacks
|
||||
// ALTER IF NOT EXISTS, so we DESC first and only add when missing.
|
||||
if err := idx.ensureColumn("attachment_text", "text"); err != nil {
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// ensureColumn checks DESC <table> for the named column and adds it via
|
||||
// ALTER TABLE when missing. Safe to call repeatedly.
|
||||
func (idx *manticoreIndex) ensureColumn(name, typ string) error {
|
||||
rows, err := idx.db.Query(fmt.Sprintf("DESC %s", idx.table))
|
||||
if err != nil {
|
||||
return fmt.Errorf("desc %s: %w", idx.table, err)
|
||||
}
|
||||
defer rows.Close()
|
||||
for rows.Next() {
|
||||
var field, fieldType string
|
||||
var props sql.NullString
|
||||
if err := rows.Scan(&field, &fieldType, &props); err != nil {
|
||||
return fmt.Errorf("desc scan %s: %w", idx.table, err)
|
||||
}
|
||||
if field == name {
|
||||
return nil
|
||||
}
|
||||
}
|
||||
if err := rows.Err(); err != nil {
|
||||
return fmt.Errorf("desc rows %s: %w", idx.table, err)
|
||||
}
|
||||
if _, err := idx.db.Exec(fmt.Sprintf("ALTER TABLE %s ADD COLUMN %s %s", idx.table, name, typ)); err != nil {
|
||||
return fmt.Errorf("alter %s add %s: %w", idx.table, name, err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -139,8 +172,8 @@ func (idx *manticoreIndex) IndexSync(doc MailDocument) error {
|
||||
|
||||
_, err := idx.db.Exec(
|
||||
fmt.Sprintf(`REPLACE INTO %s
|
||||
(id, mail_id, subject, from_addr, to_addr, body, attachment_names, has_attachment, date_ts, size_bytes)
|
||||
VALUES (?,?,?,?,?,?,?,?,?,?)`, idx.table),
|
||||
(id, mail_id, subject, from_addr, to_addr, body, attachment_names, attachment_text, has_attachment, date_ts, size_bytes)
|
||||
VALUES (?,?,?,?,?,?,?,?,?,?,?)`, idx.table),
|
||||
rowID,
|
||||
doc.ID,
|
||||
doc.Subject,
|
||||
@@ -148,6 +181,7 @@ func (idx *manticoreIndex) IndexSync(doc MailDocument) error {
|
||||
doc.To,
|
||||
doc.Body,
|
||||
doc.AttachNames,
|
||||
doc.AttachmentText,
|
||||
hasAttach,
|
||||
dateTS,
|
||||
doc.Size,
|
||||
@@ -158,6 +192,40 @@ func (idx *manticoreIndex) IndexSync(doc MailDocument) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// UpdateAttachmentText partially updates only the attachment_text field of an
|
||||
// already-indexed document. Implements index.AttachmentTextUpdater.
|
||||
//
|
||||
// Manticore RT indexes do not support UPDATE on text columns, so this
|
||||
// re-fetches the full row and issues a REPLACE INTO with all fields preserved
|
||||
// and attachment_text overwritten. Returns sql.ErrNoRows-style nil result if
|
||||
// the document is not yet indexed (mail must be ingested first).
|
||||
func (idx *manticoreIndex) UpdateAttachmentText(mailID, text string) error {
|
||||
rowID := hashMailID(mailID)
|
||||
row := idx.db.QueryRow(fmt.Sprintf(
|
||||
`SELECT mail_id, subject, from_addr, to_addr, body, attachment_names,
|
||||
has_attachment, date_ts, size_bytes
|
||||
FROM %s WHERE id = ? LIMIT 1`, idx.table),
|
||||
rowID,
|
||||
)
|
||||
var (
|
||||
mid, subj, from, to, body, attachNames string
|
||||
hasAttach uint64
|
||||
dateTS, sizeBytes int64
|
||||
)
|
||||
if err := row.Scan(&mid, &subj, &from, &to, &body, &attachNames, &hasAttach, &dateTS, &sizeBytes); err != nil {
|
||||
return fmt.Errorf("manticore UpdateAttachmentText %s: load row: %w", idx.table, err)
|
||||
}
|
||||
if _, err := idx.db.Exec(
|
||||
fmt.Sprintf(`REPLACE INTO %s
|
||||
(id, mail_id, subject, from_addr, to_addr, body, attachment_names, attachment_text, has_attachment, date_ts, size_bytes)
|
||||
VALUES (?,?,?,?,?,?,?,?,?,?,?)`, idx.table),
|
||||
rowID, mid, subj, from, to, body, attachNames, text, hasAttach, dateTS, sizeBytes,
|
||||
); err != nil {
|
||||
return fmt.Errorf("manticore UpdateAttachmentText %s: replace: %w", idx.table, err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// Delete removes a document by mail ID hash.
|
||||
func (idx *manticoreIndex) Delete(id string) error {
|
||||
rowID := hashMailID(id)
|
||||
|
||||
Reference in New Issue
Block a user