diff --git a/internal/index/index.go b/internal/index/index.go
index faadef8..1173465 100644
--- a/internal/index/index.go
+++ b/internal/index/index.go
@@ -35,9 +35,16 @@ type SearchRequest struct {
 }
 
 // Hit is a single search result.
+//
+// PROJ-44: Snippet and MatchField are populated by the Manticore Search path
+// when a full-text query was provided. They remain empty for filter-only
+// searches (e.g. date range without query) and when the per-hit highlight
+// pass fails — the hit is still returned in that case (no hard error).
 type Hit struct {
-	ID    string  `json:"id"`
-	Score float64 `json:"score"`
+	ID         string  `json:"id"`
+	Score      float64 `json:"score"`
+	Snippet    string  `json:"snippet,omitempty"`     // excerpt with <mark> tags around matches
+	MatchField string  `json:"match_field,omitempty"` // subject|body|attachment_text|attachment_names|from_addr|to_addr
 }
 
 // SearchResult holds paginated search results.
@@ -63,6 +70,14 @@ type AttachmentTextUpdater interface {
 	UpdateAttachmentText(mailID, text string) error
 }
 
+// AttachmentTextReader is implemented by indexers that can return the stored
+// OCR-extracted attachment text for a mail. Optional add-on to Indexer.
+//
+// PROJ-44: Manticore implements this for the /api/mails/{id}/ocr-text endpoint.
+type AttachmentTextReader interface {
+	GetAttachmentText(mailID string) (string, error)
+}
+
 // TenantIndexer manages per-tenant Indexer instances.
 // Implemented by ManticoreTenantManager (primary) and TenantIndexManager (legacy Xapian).
 type TenantIndexer interface {
diff --git a/internal/index/manticore.go b/internal/index/manticore.go
index 7842be7..5a31af8 100644
--- a/internal/index/manticore.go
+++ b/internal/index/manticore.go
@@ -226,6 +226,28 @@ func (idx *manticoreIndex) UpdateAttachmentText(mailID, text string) error {
 	return nil
 }
 
+// GetAttachmentText returns the stored OCR-extracted text for a mail or
+// "" if the document is not (yet) indexed or has no attachment text.
+// Implements index.AttachmentTextReader (PROJ-44).
+//
+// Errors are returned only for unexpected SQL failures — a missing row maps
+// to ("", nil) so callers can treat it the same as "no OCR text available".
+func (idx *manticoreIndex) GetAttachmentText(mailID string) (string, error) {
+	rowID := hashMailID(mailID)
+	var text string
+	err := idx.db.QueryRow(
+		fmt.Sprintf(`SELECT attachment_text FROM %s WHERE id = ? LIMIT 1`, idx.table),
+		rowID,
+	).Scan(&text)
+	if err != nil {
+		if err == sql.ErrNoRows {
+			return "", nil
+		}
+		return "", fmt.Errorf("manticore GetAttachmentText %s: %w", idx.table, err)
+	}
+	return text, nil
+}
+
 // Delete removes a document by mail ID hash.
 func (idx *manticoreIndex) Delete(id string) error {
 	rowID := hashMailID(id)
@@ -357,6 +379,14 @@ func (idx *manticoreIndex) Search(req SearchRequest) (*SearchResult, error) {
 		return nil, fmt.Errorf("manticore Search rows: %w", err)
 	}
 
+	// PROJ-44: per-hit snippet + match_field heuristic. Only meaningful when
+	// the caller supplied a full-text query. Enrichment is best-effort:
+	// failures are logged inside enrichHitsWithSnippets and never drop the
+	// hit or fail the search.
+	if req.Query != "" && len(hits) > 0 {
+		idx.enrichHitsWithSnippets(hits, req.Query)
+	}
+
 	return &SearchResult{Total: total, Hits: hits}, nil
 }
 
diff --git a/internal/index/manticore_snippet.go b/internal/index/manticore_snippet.go
new file mode 100644
index 0000000..f0e89b4
--- /dev/null
+++ b/internal/index/manticore_snippet.go
@@ -0,0 +1,149 @@
+package index
+
+import (
+	"database/sql"
+	"errors"
+	"fmt"
+	"log"
+	"strings"
+)
+
+// matchFieldOrder defines the priority in which fields are probed when
+// determining which one of them caused a hit. Order is taken from PROJ-44:
+// subject > body > attachment_text > attachment_names > from_addr > to_addr.
+//
+// The list is intentionally small — the per-hit cost is one extra Manticore
+// SELECT per probed field until a match is found, plus up to two queries for
+// the snippet itself. With typical page sizes (<=50) that is at most
+// 50 * (6 + 2) = 400 cheap queries per request.
+var matchFieldOrder = []string{
+	"subject",
+	"body",
+	"attachment_text",
+	"attachment_names",
+	"from_addr",
+	"to_addr",
+}
+
+// enrichHitsWithSnippets fills Hit.Snippet and Hit.MatchField for each hit in
+// place. Errors are logged but never propagated — a hit without snippet is
+// still a valid hit (PROJ-44 edge case: "snippet generation fails … no hard
+// error").
+func (idx *manticoreIndex) enrichHitsWithSnippets(hits []Hit, query string) {
+	for i := range hits {
+		field := idx.detectMatchField(hits[i].ID, query)
+		if field != "" {
+			hits[i].MatchField = field
+		}
+
+		snip, err := idx.buildSnippet(hits[i].ID, query, field)
+		if err != nil {
+			log.Printf("manticore snippet: mail=%s err=%v", hits[i].ID, err)
+			continue
+		}
+		hits[i].Snippet = snip
+	}
+}
+
+// detectMatchField runs a small MATCH() probe per field in matchFieldOrder
+// until one returns a row. Returns "" when no single field matches, e.g. when
+// the query terms only match across several fields combined.
+//
+// A probe that fails with anything other than sql.ErrNoRows is treated as "no
+// match in this field" so enrichment stays best-effort; the first such error
+// is logged once per hit if no field matched at all.
+func (idx *manticoreIndex) detectMatchField(mailID, query string) string {
+	rowID := hashMailID(mailID)
+	escaped := escapeManticoreMatch(query)
+	// The statement is the same for every probe; only the MATCH argument
+	// changes, so build it once outside the loop.
+	q := fmt.Sprintf(
+		`SELECT id FROM %s WHERE id = ? AND MATCH(?) LIMIT 1`,
+		idx.table,
+	)
+
+	var probeErr error
+	for _, field := range matchFieldOrder {
+		// SAFETY: field is from a static allow-list above, never user input.
+		matchExpr := fmt.Sprintf("@%s %s", field, escaped)
+		var got int64
+		err := idx.db.QueryRow(q, rowID, matchExpr).Scan(&got)
+		if err == nil {
+			return field
+		}
+		if probeErr == nil && !errors.Is(err, sql.ErrNoRows) {
+			probeErr = fmt.Errorf("field %s: %w", field, err)
+		}
+	}
+	if probeErr != nil {
+		log.Printf("manticore match probe: mail=%s err=%v", mailID, probeErr)
+	}
+	return ""
+}
+
+// buildSnippet returns an excerpt of the matched field with <mark>...</mark>
+// markers around match words via Manticore's CALL SNIPPETS(...) statement.
+// When matchField is empty, "body" is used as a sensible default. It returns
+// ("", nil) when the source column is empty or the row no longer exists.
+//
+// Manticore's SNIPPETS expects the source text as the first argument, the
+// table name as the second, and the query as the third. We fetch the source
+// column for the hit first (small SELECT) and then call SNIPPETS in a second
+// query. Two roundtrips per hit is acceptable for typical page sizes.
+func (idx *manticoreIndex) buildSnippet(mailID, query, matchField string) (string, error) {
+	field := matchField
+	if field == "" {
+		field = "body"
+	}
+	// Whitelist guard — never interpolate user-provided field names.
+	allowed := false
+	for _, f := range matchFieldOrder {
+		if f == field {
+			allowed = true
+			break
+		}
+	}
+	if !allowed {
+		return "", fmt.Errorf("manticore snippet: invalid field %q", field)
+	}
+
+	rowID := hashMailID(mailID)
+	source, err := idx.fetchFieldText(field, rowID)
+	if err != nil {
+		return "", err
+	}
+	source = strings.TrimSpace(source)
+	if source == "" {
+		return "", nil
+	}
+
+	// CALL SNIPPETS(text, table, query, value AS option, ...).
+	// Options are passed in the "value AS name" form of the CALL statement.
+	// Manticore returns a single-column, single-row result.
+	row := idx.db.QueryRow(
+		`CALL SNIPPETS(?, ?, ?, '<mark>' AS before_match, '</mark>' AS after_match, 240 AS limit, 12 AS around)`,
+		source, idx.table, query,
+	)
+	var snippet string
+	if err := row.Scan(&snippet); err != nil {
+		return "", fmt.Errorf("call snippets %s: %w", idx.table, err)
+	}
+	return snippet, nil
+}
+
+// fetchFieldText loads a single text column for one row. field must already
+// be validated against matchFieldOrder by the caller. A missing row yields
+// ("", nil), the same way GetAttachmentText treats it.
+func (idx *manticoreIndex) fetchFieldText(field string, rowID int64) (string, error) {
+	q := fmt.Sprintf(`SELECT %s FROM %s WHERE id = ? LIMIT 1`, field, idx.table)
+	var text string
+	err := idx.db.QueryRow(q, rowID).Scan(&text)
+	switch {
+	case err == nil:
+		return text, nil
+	case errors.Is(err, sql.ErrNoRows):
+		return "", nil
+	default:
+		return "", fmt.Errorf("fetch %s: %w", field, err)
+	}
+}