feat(PROJ-44): Snippet + match_field fuer Suche, GetAttachmentText

Hit-Struct um Snippet + MatchField erweitert. enrichHitsWithSnippets
fuellt diese pro Treffer: detectMatchField probt subject>body>
attachment_text>attachment_names>from_addr>to_addr; buildSnippet ruft
CALL SNIPPETS mit <b>-Markern. Snippet-Fehler droppen den Treffer nicht.

AttachmentTextReader-Interface + Manticore-Implementation
GetAttachmentText liefert den indexierten OCR-Text fuer den neuen
/ocr-text-Endpoint.
This commit is contained in:
sysops
2026-05-10 22:20:52 +02:00
parent 5078830469
commit 7b75433999
3 changed files with 174 additions and 2 deletions
+15
View File
@@ -35,9 +35,16 @@ type SearchRequest struct {
}
// Hit is a single search result.
//
// PROJ-44: Snippet and MatchField are populated by the Manticore Search path
// when the request carried a non-empty full-text query. They remain empty for
// filter-only searches (e.g. date range without query) and when the per-hit
// highlight pass fails — the hit is still returned in that case (no hard
// error). Both fields use omitempty, so empty values are left out of the JSON
// encoding.
type Hit struct {
// ID identifies the hit; the snippet helpers pass it on as the mail ID.
ID string `json:"id"`
Score float64 `json:"score"`
// Snippet is an excerpt of the matched field with <b>...</b> around match
// words. NOTE(review): the excerpt text comes from indexed mail content —
// confirm that consumers sanitize it before rendering it as HTML.
Snippet string `json:"snippet,omitempty"` // HTML-marked excerpt with <b>match</b> tags
// MatchField names the first field, in probe priority order, that matched
// the query on its own; it stays empty when no single field matched.
MatchField string `json:"match_field,omitempty"` // subject|body|attachment_text|attachment_names|from_addr|to_addr
}
// SearchResult holds paginated search results.
@@ -63,6 +70,14 @@ type AttachmentTextUpdater interface {
UpdateAttachmentText(mailID, text string) error
}
// AttachmentTextReader is implemented by indexers that can return the stored
// OCR-extracted attachment text for a mail. Optional add-on to Indexer.
//
// PROJ-44: Manticore implements this for the /api/mails/{id}/ocr-text endpoint.
type AttachmentTextReader interface {
// GetAttachmentText returns the stored attachment text for mailID. The
// Manticore implementation reports a missing document as ("", nil) and
// returns an error only for unexpected query failures.
GetAttachmentText(mailID string) (string, error)
}
// TenantIndexer manages per-tenant Indexer instances.
// Implemented by ManticoreTenantManager (primary) and TenantIndexManager (legacy Xapian).
type TenantIndexer interface {
+30
View File
@@ -226,6 +226,28 @@ func (idx *manticoreIndex) UpdateAttachmentText(mailID, text string) error {
return nil
}
// GetAttachmentText looks up the attachment_text column of the document that
// belongs to mailID and returns its content.
// Implements index.AttachmentTextReader (PROJ-44).
//
// A document that does not exist (sql.ErrNoRows) is not treated as a failure:
// the method answers ("", nil) so callers can handle "not indexed yet" and
// "no OCR text" the same way. Any other query or scan error is wrapped and
// returned together with an empty string.
func (idx *manticoreIndex) GetAttachmentText(mailID string) (string, error) {
	docID := hashMailID(mailID)
	stmt := fmt.Sprintf(`SELECT attachment_text FROM %s WHERE id = ? LIMIT 1`, idx.table)

	var stored string
	switch err := idx.db.QueryRow(stmt, docID).Scan(&stored); err {
	case nil:
		return stored, nil
	case sql.ErrNoRows:
		// Missing row: same outcome as "no OCR text available".
		return "", nil
	default:
		return "", fmt.Errorf("manticore GetAttachmentText %s: %w", idx.table, err)
	}
}
// Delete removes a document by mail ID hash.
func (idx *manticoreIndex) Delete(id string) error {
rowID := hashMailID(id)
@@ -357,6 +379,14 @@ func (idx *manticoreIndex) Search(req SearchRequest) (*SearchResult, error) {
return nil, fmt.Errorf("manticore Search rows: %w", err)
}
// PROJ-44: per-hit snippet + match_field heuristic. Only meaningful when
// the caller supplied a full-text query. enrichHitsWithSnippets returns
// nothing: snippet failures are logged there and the affected hit is kept
// without a snippet rather than dropped.
if req.Query != "" && len(hits) > 0 {
idx.enrichHitsWithSnippets(hits, req.Query)
}
return &SearchResult{Total: total, Hits: hits}, nil
}
+127
View File
@@ -0,0 +1,127 @@
package index
import (
	"database/sql"
	"errors"
	"fmt"
	"log"
	"strings"
)
// matchFieldOrder defines the priority in which fields are probed when
// determining which one of them caused a hit. Order is taken from PROJ-44:
// subject > body > attachment_text > attachment_names > from_addr > to_addr.
//
// The list doubles as the allow-list of column names that buildSnippet
// accepts before a field name is interpolated into SQL.
//
// The list is intentionally small — the per-hit cost is one extra Manticore
// SELECT per probed field until a match is found, i.e. at most 6 probes per
// hit, plus up to two further queries per hit in buildSnippet (column fetch
// and CALL SNIPPETS). For a page of 50 hits that is a worst case of 300
// probes plus 100 snippet queries per request.
// NOTE(review): the "page size <= 50" bound is assumed, not enforced here.
var matchFieldOrder = []string{
"subject",
"body",
"attachment_text",
"attachment_names",
"from_addr",
"to_addr",
}
// enrichHitsWithSnippets walks hits and, for every entry, records which field
// matched the query (Hit.MatchField) and a highlighted excerpt (Hit.Snippet).
// The slice elements are modified in place.
//
// Nothing is returned: a failing snippet build is written to the log and the
// hit simply keeps an empty Snippet (PROJ-44 edge case: snippet generation
// failure must not become a hard error). MatchField is only written when a
// field was actually detected.
func (idx *manticoreIndex) enrichHitsWithSnippets(hits []Hit, query string) {
	for i := range hits {
		hit := &hits[i]

		matched := idx.detectMatchField(hit.ID, query)
		if matched != "" {
			hit.MatchField = matched
		}

		// buildSnippet is called even without a detected field; it decides
		// on its own which column to excerpt in that case.
		excerpt, err := idx.buildSnippet(hit.ID, query, matched)
		if err == nil {
			hit.Snippet = excerpt
			continue
		}
		log.Printf("manticore snippet: mail=%s err=%v", hit.ID, err)
	}
}
// detectMatchField runs a small MATCH() probe per field in matchFieldOrder
// until one returns a row, and returns that field's name. It returns "" when
// no field matches — that can happen for filter-only matches (e.g. date range
// only) or when the query terms require multiple fields combined.
//
// A probe that yields no row (sql.ErrNoRows) is the normal "this field did
// not match" outcome. Any other probe error (connection problem, rejected
// MATCH expression, ...) is not propagated, but the first such error is
// logged once per call so that a broken probe is not mistaken for "no
// match". Probing continues after an error, so the returned field is the
// first one whose probe succeeded.
func (idx *manticoreIndex) detectMatchField(mailID, query string) string {
	rowID := hashMailID(mailID)
	escaped := escapeManticoreMatch(query)

	// The statement text is the same for every probe; only the bound MATCH
	// expression differs, so build it once outside the loop.
	probe := fmt.Sprintf(
		`SELECT id FROM %s WHERE id = ? AND MATCH(?) LIMIT 1`,
		idx.table,
	)

	matched := ""
	var firstErr error
	for _, field := range matchFieldOrder {
		// SAFETY: field is from the static allow-list matchFieldOrder, never
		// user input.
		matchExpr := fmt.Sprintf("@%s %s", field, escaped)

		var got int64
		err := idx.db.QueryRow(probe, rowID, matchExpr).Scan(&got)
		if err == nil {
			matched = field
			break
		}
		// Remember only genuine failures; "no row" just means no match.
		if firstErr == nil && !errors.Is(err, sql.ErrNoRows) {
			firstErr = fmt.Errorf("probe %s: %w", field, err)
		}
	}

	if firstErr != nil {
		log.Printf("manticore match field: mail=%s err=%v", mailID, firstErr)
	}
	return matched
}
// buildSnippet returns an excerpt of the matched field with <b>...</b>
// markers around match words via Manticore's CALL SNIPPETS(...) statement.
// When matchField is empty, "body" is used as a sensible default.
//
// Manticore's SNIPPETS expects the source text as the first argument, the
// table name as the second, and the query as the third; options follow in
// the form `value AS option_name`. We fetch the source column for the hit
// first (small SELECT) and then call SNIPPETS in a second query. Two
// roundtrips per hit is acceptable for typical page sizes.
//
// Returns ("", nil) when the source column is empty or whitespace-only, and
// an error when the field is not in matchFieldOrder, the column fetch fails,
// or the SNIPPETS call fails.
//
// NOTE(review): the excerpt is built from stored mail text and is returned
// with literal <b> markers but without HTML-escaping of the surrounding
// text — confirm that consumers sanitize it before rendering it as HTML.
func (idx *manticoreIndex) buildSnippet(mailID, query, matchField string) (string, error) {
	field := matchField
	if field == "" {
		field = "body"
	}

	// Allow-list guard — the field name is interpolated into SQL by
	// fetchFieldText, so it must never be a user-provided value.
	allowed := false
	for _, f := range matchFieldOrder {
		if f == field {
			allowed = true
			break
		}
	}
	if !allowed {
		return "", fmt.Errorf("manticore snippet: invalid field %q", field)
	}

	rowID := hashMailID(mailID)
	source, err := idx.fetchFieldText(field, rowID)
	if err != nil {
		return "", err
	}
	source = strings.TrimSpace(source)
	if source == "" {
		// Nothing to excerpt; skip the second roundtrip.
		return "", nil
	}

	// CALL SNIPPETS(text, table, query, value AS option, ...).
	// Options must use the `value AS name` form; passing them as plain
	// 'name=value' strings makes them extra positional arguments instead of
	// options. With a single source text the result is one row with one
	// column.
	row := idx.db.QueryRow(
		`CALL SNIPPETS(?, ?, ?, '<b>' AS before_match, '</b>' AS after_match, 240 AS limit, 12 AS around)`,
		source, idx.table, query,
	)
	var snippet string
	if err := row.Scan(&snippet); err != nil {
		return "", fmt.Errorf("call snippets %s: %w", idx.table, err)
	}
	return snippet, nil
}
// fetchFieldText reads the column named field from the row with the given
// rowID and returns its text.
//
// The column name is interpolated into the statement, so callers must have
// checked field against matchFieldOrder beforehand. Every query or scan
// error, including a missing row, is wrapped and returned with an empty
// string.
func (idx *manticoreIndex) fetchFieldText(field string, rowID int64) (string, error) {
	var value string

	stmt := fmt.Sprintf(`SELECT %s FROM %s WHERE id = ? LIMIT 1`, field, idx.table)
	err := idx.db.QueryRow(stmt, rowID).Scan(&value)
	if err != nil {
		return "", fmt.Errorf("fetch %s: %w", field, err)
	}
	return value, nil
}