0bda21033e
Asynchrone OCR fuer PDF- und Bild-Anhaenge via tesseract + poppler-utils. Extrahierter Text wird in Manticore (attachment_text) gespeichert und ist ueber die normale Volltextsuche auffindbar. - internal/ocr: ExtractText + Worker (queue + drain) - internal/storage/ocr.go: SetOCRStatus, OCREnabled, GetMailsByOCRStatus - emails.ocr_status (pending|done|failed|skipped|disabled) - tenants.ocr_enabled (Default TRUE, opt-out) - Manticore: attachment_text-Feld + UpdateAttachmentText - Boot-resume: pending Jobs nach Restart automatisch in die Queue - CLI: archivmail ocr-reprocess --tenant N --status pending|failed|all - update.sh: tesseract-ocr + poppler-utils optional installieren
84 lines
2.3 KiB
Go
84 lines
2.3 KiB
Go
package index
|
|
|
|
import (
|
|
"fmt"
|
|
"time"
|
|
)
|
|
|
|
// MailDocument is the indexed representation of a stored email.
//
// It is the value handed to Indexer.IndexSync. All fields are plain values
// except TenantID, which is a pointer so that "no tenant" can be expressed
// as nil.
type MailDocument struct {
	ID             string
	From           string
	To             string
	Subject        string
	Body           string
	AttachNames    string
	AttachmentText string // PROJ-35: OCR-extracted text from PDF/image attachments
	HasAttachment  bool
	Date           time.Time
	Size           int64
	TenantID       *int64 // nil = global / superadmin context
}
|
|
|
|
// SearchRequest specifies search parameters.
//
// It is the argument of Indexer.Search. The pointer-typed fields (DateFrom,
// DateTo, HasAttachment) can be left nil; for HasAttachment nil means that no
// filter is applied.
// NOTE(review): DateFrom/DateTo presumably follow the same "nil = no bound"
// rule, and whether Page is 0- or 1-based is not visible here — confirm
// against the Indexer implementations.
type SearchRequest struct {
	Query         string
	From          string
	To            string
	OwnEmail      string
	DateFrom      *time.Time
	DateTo        *time.Time
	HasAttachment *bool  // nil=no filter, true=only with, false=only without
	Sort          string // "relevance", "date_asc", "date_desc" (default: date_desc)
	PageSize      int
	Page          int
}
|
|
|
|
// Hit is a single search result.
//
// Both fields carry JSON tags, so a Hit serializes as
// {"id": ..., "score": ...}.
type Hit struct {
	ID    string  `json:"id"`
	Score float64 `json:"score"`
}
|
|
|
|
// SearchResult holds paginated search results.
|
|
type SearchResult struct {
|
|
Total int
|
|
Hits []Hit
|
|
}
|
|
|
|
// Indexer is the interface for full-text email indexing.
|
|
type Indexer interface {
|
|
IndexSync(doc MailDocument) error
|
|
Search(req SearchRequest) (*SearchResult, error)
|
|
Delete(id string) error
|
|
Close() error
|
|
}
|
|
|
|
// AttachmentTextUpdater is implemented by indexers that support partial
// updates of the OCR-extracted attachment text. Optional add-on to Indexer:
// callers should type-assert and degrade gracefully if not supported.
//
// PROJ-35: Manticore implements this; legacy Xapian does not.
type AttachmentTextUpdater interface {
	// UpdateAttachmentText receives the mail's ID and the attachment text to
	// store for it.
	UpdateAttachmentText(mailID, text string) error
}
|
|
|
|
// TenantIndexer manages per-tenant Indexer instances.
|
|
// Implemented by ManticoreTenantManager (primary) and TenantIndexManager (legacy Xapian).
|
|
type TenantIndexer interface {
|
|
ForTenant(tenantID *int64) Indexer
|
|
Global() Indexer
|
|
Close() error
|
|
}
|
|
|
|
// New creates an Indexer for the specified backend.
|
|
// Deprecated: use NewManticoreTenantManager instead.
|
|
func New(dir string, batchSize int, backend string) (Indexer, error) {
|
|
switch backend {
|
|
case "xapian":
|
|
return newXapian(dir)
|
|
default:
|
|
return nil, fmt.Errorf("unknown index backend: %q (use manticore via NewManticoreTenantManager)", backend)
|
|
}
|
|
}
|