feat(PROJ-44): Snippet + match_field fuer Suche, GetAttachmentText
Hit-Struct um Snippet + MatchField erweitert. enrichHitsWithSnippets fuellt diese pro Treffer: detectMatchField probt subject > body > attachment_text > attachment_names > from_addr > to_addr; buildSnippet ruft CALL SNIPPETS mit <b>-Markern auf. Snippet-Fehler droppen den Treffer nicht. Das AttachmentTextReader-Interface und die Manticore-Implementierung GetAttachmentText liefern den indexierten OCR-Text fuer den neuen /ocr-text-Endpoint.
This commit is contained in:
+17
-2
@@ -35,9 +35,16 @@ type SearchRequest struct {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Hit is a single search result.
//
// PROJ-44: Snippet and MatchField are filled in by the Manticore search path
// only when a full-text query was supplied. Both stay empty for filter-only
// searches (for example a date range without a query) and when the per-hit
// highlight pass fails; in that case the hit is still returned and no hard
// error is raised.
type Hit struct {
	ID    string  `json:"id"`
	Score float64 `json:"score"`

	// Snippet is an HTML-marked excerpt with <b>match</b> tags.
	Snippet string `json:"snippet,omitempty"`

	// MatchField names the field that matched, one of:
	// subject|body|attachment_text|attachment_names|from_addr|to_addr.
	MatchField string `json:"match_field,omitempty"`
}
|
||||||
|
|
||||||
// SearchResult holds paginated search results.
|
// SearchResult holds paginated search results.
|
||||||
@@ -63,6 +70,14 @@ type AttachmentTextUpdater interface {
|
|||||||
UpdateAttachmentText(mailID, text string) error
|
UpdateAttachmentText(mailID, text string) error
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// AttachmentTextReader is an optional add-on to Indexer for indexers that can
// hand back the stored, OCR-extracted attachment text of a mail.
//
// PROJ-44: Manticore implements this for the /api/mails/{id}/ocr-text endpoint.
type AttachmentTextReader interface {
	// GetAttachmentText returns the stored attachment text for mailID.
	GetAttachmentText(mailID string) (string, error)
}
|
||||||
|
|
||||||
// TenantIndexer manages per-tenant Indexer instances.
|
// TenantIndexer manages per-tenant Indexer instances.
|
||||||
// Implemented by ManticoreTenantManager (primary) and TenantIndexManager (legacy Xapian).
|
// Implemented by ManticoreTenantManager (primary) and TenantIndexManager (legacy Xapian).
|
||||||
type TenantIndexer interface {
|
type TenantIndexer interface {
|
||||||
|
|||||||
@@ -226,6 +226,28 @@ func (idx *manticoreIndex) UpdateAttachmentText(mailID, text string) error {
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// GetAttachmentText returns the stored OCR-extracted text for a mail or
|
||||||
|
// "" if the document is not (yet) indexed or has no attachment text.
|
||||||
|
// Implements index.AttachmentTextReader (PROJ-44).
|
||||||
|
//
|
||||||
|
// Errors are returned only for unexpected SQL failures — a missing row maps
|
||||||
|
// to ("", nil) so callers can treat it the same as "no OCR text available".
|
||||||
|
func (idx *manticoreIndex) GetAttachmentText(mailID string) (string, error) {
|
||||||
|
rowID := hashMailID(mailID)
|
||||||
|
var text string
|
||||||
|
err := idx.db.QueryRow(
|
||||||
|
fmt.Sprintf(`SELECT attachment_text FROM %s WHERE id = ? LIMIT 1`, idx.table),
|
||||||
|
rowID,
|
||||||
|
).Scan(&text)
|
||||||
|
if err != nil {
|
||||||
|
if err == sql.ErrNoRows {
|
||||||
|
return "", nil
|
||||||
|
}
|
||||||
|
return "", fmt.Errorf("manticore GetAttachmentText %s: %w", idx.table, err)
|
||||||
|
}
|
||||||
|
return text, nil
|
||||||
|
}
|
||||||
|
|
||||||
// Delete removes a document by mail ID hash.
|
// Delete removes a document by mail ID hash.
|
||||||
func (idx *manticoreIndex) Delete(id string) error {
|
func (idx *manticoreIndex) Delete(id string) error {
|
||||||
rowID := hashMailID(id)
|
rowID := hashMailID(id)
|
||||||
@@ -357,6 +379,14 @@ func (idx *manticoreIndex) Search(req SearchRequest) (*SearchResult, error) {
|
|||||||
return nil, fmt.Errorf("manticore Search rows: %w", err)
|
return nil, fmt.Errorf("manticore Search rows: %w", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// PROJ-44: per-hit snippet + match_field heuristic. Only meaningful when
// the caller supplied a full-text query. Failures are logged inside
// enrichHitsWithSnippets and never drop the hit or fail the search.
|
||||||
|
if req.Query != "" && len(hits) > 0 {
|
||||||
|
idx.enrichHitsWithSnippets(hits, req.Query)
|
||||||
|
}
|
||||||
|
|
||||||
return &SearchResult{Total: total, Hits: hits}, nil
|
return &SearchResult{Total: total, Hits: hits}, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -0,0 +1,127 @@
|
|||||||
|
package index
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"log"
|
||||||
|
"strings"
|
||||||
|
)
|
||||||
|
|
||||||
|
// matchFieldOrder lists the fields that are probed, highest priority first,
// to decide which field produced a hit. The order comes from PROJ-44:
// subject > body > attachment_text > attachment_names > from_addr > to_addr.
//
// The same list doubles as the allow-list of column names that may be
// interpolated into SQL by buildSnippet.
//
// Keep the list short: detectMatchField issues one extra Manticore SELECT per
// probed field until a field matches. With a page size of 50 and 6 fields
// that is at most 50 x 6 = 300 probe queries per request in the worst case,
// not counting the queries buildSnippet issues for each hit.
var matchFieldOrder = []string{"subject", "body", "attachment_text", "attachment_names", "from_addr", "to_addr"}
|
||||||
|
|
||||||
|
// enrichHitsWithSnippets fills Hit.Snippet and Hit.MatchField for each hit in
|
||||||
|
// place. Errors are logged but never propagated — a hit without snippet is
|
||||||
|
// still a valid hit (PROJ-44 edge case: "Snippet-Generierung schlägt fehl …
|
||||||
|
// kein Hard-Error").
|
||||||
|
func (idx *manticoreIndex) enrichHitsWithSnippets(hits []Hit, query string) {
|
||||||
|
for i := range hits {
|
||||||
|
field := idx.detectMatchField(hits[i].ID, query)
|
||||||
|
if field != "" {
|
||||||
|
hits[i].MatchField = field
|
||||||
|
}
|
||||||
|
|
||||||
|
snip, err := idx.buildSnippet(hits[i].ID, query, field)
|
||||||
|
if err != nil {
|
||||||
|
log.Printf("manticore snippet: mail=%s err=%v", hits[i].ID, err)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
hits[i].Snippet = snip
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// detectMatchField runs a small MATCH() probe per field in matchFieldOrder
|
||||||
|
// until one returns a row. Returns "" when no field matches — that can happen
|
||||||
|
// for filter-only matches (e.g. date range only) or when the query terms
|
||||||
|
// require multiple fields combined.
|
||||||
|
func (idx *manticoreIndex) detectMatchField(mailID, query string) string {
|
||||||
|
rowID := hashMailID(mailID)
|
||||||
|
escaped := escapeManticoreMatch(query)
|
||||||
|
|
||||||
|
for _, field := range matchFieldOrder {
|
||||||
|
// SAFETY: field is from a static allow-list above, never user input.
|
||||||
|
matchExpr := fmt.Sprintf("@%s %s", field, escaped)
|
||||||
|
q := fmt.Sprintf(
|
||||||
|
`SELECT id FROM %s WHERE id = ? AND MATCH(?) LIMIT 1`,
|
||||||
|
idx.table,
|
||||||
|
)
|
||||||
|
var got int64
|
||||||
|
err := idx.db.QueryRow(q, rowID, matchExpr).Scan(&got)
|
||||||
|
if err == nil {
|
||||||
|
return field
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
|
||||||
|
// buildSnippet returns an excerpt of the matched field with <b>...</b>
|
||||||
|
// markers around match words via Manticore's CALL SNIPPETS(...) function.
|
||||||
|
// When matchField is empty, "body" is used as a sensible default.
|
||||||
|
//
|
||||||
|
// Manticore's SNIPPETS expects the source text as the first argument, the
|
||||||
|
// table name as the second, and the query as the third. We fetch the source
|
||||||
|
// column for the hit first (small SELECT) and then call SNIPPETS in a second
|
||||||
|
// query. Two roundtrips per hit is acceptable for typical page sizes.
|
||||||
|
func (idx *manticoreIndex) buildSnippet(mailID, query, matchField string) (string, error) {
|
||||||
|
field := matchField
|
||||||
|
if field == "" {
|
||||||
|
field = "body"
|
||||||
|
}
|
||||||
|
// Whitelist guard — never interpolate user-provided field names.
|
||||||
|
allowed := false
|
||||||
|
for _, f := range matchFieldOrder {
|
||||||
|
if f == field {
|
||||||
|
allowed = true
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if !allowed {
|
||||||
|
return "", fmt.Errorf("manticore snippet: invalid field %q", field)
|
||||||
|
}
|
||||||
|
|
||||||
|
rowID := hashMailID(mailID)
|
||||||
|
source, err := idx.fetchFieldText(field, rowID)
|
||||||
|
if err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
source = strings.TrimSpace(source)
|
||||||
|
if source == "" {
|
||||||
|
return "", nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// CALL SNIPPETS(text, table, query, ...options).
|
||||||
|
// Manticore returns a single-column, single-row result.
|
||||||
|
row := idx.db.QueryRow(
|
||||||
|
`CALL SNIPPETS(?, ?, ?, 'before_match=<b>', 'after_match=</b>', 'limit=240', 'around=12')`,
|
||||||
|
source, idx.table, query,
|
||||||
|
)
|
||||||
|
var snippet string
|
||||||
|
if err := row.Scan(&snippet); err != nil {
|
||||||
|
return "", fmt.Errorf("call snippets %s: %w", idx.table, err)
|
||||||
|
}
|
||||||
|
return snippet, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// fetchFieldText loads a single text column for one row. field must already
|
||||||
|
// be validated against matchFieldOrder by the caller.
|
||||||
|
func (idx *manticoreIndex) fetchFieldText(field string, rowID int64) (string, error) {
|
||||||
|
q := fmt.Sprintf(`SELECT %s FROM %s WHERE id = ? LIMIT 1`, field, idx.table)
|
||||||
|
var text string
|
||||||
|
if err := idx.db.QueryRow(q, rowID).Scan(&text); err != nil {
|
||||||
|
return "", fmt.Errorf("fetch %s: %w", field, err)
|
||||||
|
}
|
||||||
|
return text, nil
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user