a1c4e59fff
mailparser: weitere Layouts (Timezone +02:00 mit Doppelpunkt, ohne Sekunden) storage: GetReceivedAts() für Batch-Lookup von received_at search_handlers: received_at als Fallback wenn pm.Date.IsZero() Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
263 lines
6.7 KiB
Go
263 lines
6.7 KiB
Go
package mailparser
|
|
|
|
import (
|
|
"bytes"
|
|
"encoding/base64"
|
|
"fmt"
|
|
"io"
|
|
"mime"
|
|
"mime/multipart"
|
|
"mime/quotedprintable"
|
|
"net/mail"
|
|
"strings"
|
|
"time"
|
|
)
|
|
|
|
// Attachment is a single MIME part that was classified as an attachment
// while walking a multipart message.
type Attachment struct {
	// Filename is taken from the Content-Disposition "filename" parameter,
	// falling back to the Content-Type "name" parameter, with RFC 2047
	// encoded-words decoded. It may be empty.
	Filename string
	// ContentType is the media type of the part without parameters;
	// "application/octet-stream" is used when the header cannot be parsed.
	ContentType string
	// Data is the part payload after Content-Transfer-Encoding decoding.
	Data []byte
	// Size is the length of Data in bytes.
	Size int
}
|
|
|
|
// ParsedMail holds the structured content of a parsed email message.
|
|
type ParsedMail struct {
|
|
From string
|
|
To []string
|
|
CC []string
|
|
Subject string
|
|
MessageID string
|
|
InReplyTo string // In-Reply-To header (single message-id, no angle brackets)
|
|
References []string // References header (list of message-ids, no angle brackets)
|
|
TextBody string
|
|
HTMLBody string
|
|
Date time.Time
|
|
Attachments []Attachment
|
|
Raw []byte
|
|
}
|
|
|
|
// Parse parses a raw RFC 2822 / MIME email and returns a ParsedMail.
|
|
func Parse(raw []byte) (*ParsedMail, error) {
|
|
msg, err := mail.ReadMessage(bytes.NewReader(raw))
|
|
if err != nil {
|
|
return nil, fmt.Errorf("mailparser: read message: %w", err)
|
|
}
|
|
|
|
pm := &ParsedMail{Raw: raw}
|
|
|
|
// From
|
|
if from := msg.Header.Get("From"); from != "" {
|
|
addrs, err := mail.ParseAddressList(from)
|
|
if err == nil && len(addrs) > 0 {
|
|
pm.From = addrs[0].Address
|
|
} else {
|
|
pm.From = from
|
|
}
|
|
}
|
|
|
|
// To
|
|
if to := msg.Header.Get("To"); to != "" {
|
|
addrs, err := mail.ParseAddressList(to)
|
|
if err == nil {
|
|
for _, a := range addrs {
|
|
pm.To = append(pm.To, a.Address)
|
|
}
|
|
}
|
|
}
|
|
|
|
// CC
|
|
if cc := msg.Header.Get("Cc"); cc != "" {
|
|
addrs, err := mail.ParseAddressList(cc)
|
|
if err == nil {
|
|
for _, a := range addrs {
|
|
pm.CC = append(pm.CC, a.Address)
|
|
}
|
|
}
|
|
}
|
|
|
|
// Subject - decode MIME encoded-words
|
|
pm.Subject = decodeMIMEHeader(msg.Header.Get("Subject"))
|
|
|
|
// Message-ID - strip angle brackets
|
|
msgID := msg.Header.Get("Message-Id")
|
|
pm.MessageID = strings.Trim(msgID, "<>")
|
|
|
|
// In-Reply-To - strip angle brackets
|
|
if irt := msg.Header.Get("In-Reply-To"); irt != "" {
|
|
pm.InReplyTo = strings.Trim(strings.TrimSpace(irt), "<>")
|
|
}
|
|
|
|
// References - space-separated list of message-ids
|
|
if refs := msg.Header.Get("References"); refs != "" {
|
|
for _, r := range strings.Fields(refs) {
|
|
r = strings.Trim(r, "<>")
|
|
if r != "" {
|
|
pm.References = append(pm.References, r)
|
|
}
|
|
}
|
|
}
|
|
|
|
// Date — try go-message parser first, then fallback formats, then zero
|
|
if d, err := msg.Header.Date(); err == nil {
|
|
pm.Date = d
|
|
} else {
|
|
// Some MUAs emit non-standard variants (e.g. "+0100 (CET)" suffix).
|
|
// Try common RFC 2822 / non-standard formats before giving up.
|
|
raw := strings.TrimSpace(msg.Header.Get("Date"))
|
|
// Strip parenthesised timezone comment: "... +0100 (CET)" → "... +0100"
|
|
if idx := strings.LastIndex(raw, "("); idx > 0 {
|
|
raw = strings.TrimSpace(raw[:idx])
|
|
}
|
|
parsed := false
|
|
for _, layout := range []string{
|
|
"Mon, 2 Jan 2006 15:04:05 -0700",
|
|
"Mon, 02 Jan 2006 15:04:05 -0700",
|
|
"2 Jan 2006 15:04:05 -0700",
|
|
"02 Jan 2006 15:04:05 -0700",
|
|
"Mon, 2 Jan 2006 15:04:05 MST",
|
|
"Mon, 02 Jan 2006 15:04:05 MST",
|
|
// Colon in timezone offset (e.g. "+02:00") used by some MTA versions
|
|
"Mon, 2 Jan 2006 15:04:05 -07:00",
|
|
"Mon, 02 Jan 2006 15:04:05 -07:00",
|
|
"2 Jan 2006 15:04:05 -07:00",
|
|
"02 Jan 2006 15:04:05 -07:00",
|
|
// Without seconds
|
|
"Mon, 2 Jan 2006 15:04 -0700",
|
|
"Mon, 02 Jan 2006 15:04 -0700",
|
|
"2 Jan 2006 15:04 -0700",
|
|
// Go stdlib aliases
|
|
time.RFC1123Z,
|
|
time.RFC1123,
|
|
} {
|
|
if t, err := time.Parse(layout, raw); err == nil {
|
|
pm.Date = t
|
|
parsed = true
|
|
break
|
|
}
|
|
}
|
|
if !parsed {
|
|
// Leave pm.Date as zero — storage will use DB DEFAULT NOW()
|
|
pm.Date = time.Time{}
|
|
}
|
|
}
|
|
|
|
// Parse body / MIME parts
|
|
contentType := msg.Header.Get("Content-Type")
|
|
mediaType, params, err := mime.ParseMediaType(contentType)
|
|
if err != nil {
|
|
// No content-type or parse error: treat as plain text
|
|
body, _ := io.ReadAll(msg.Body)
|
|
pm.TextBody = string(body)
|
|
return pm, nil
|
|
}
|
|
|
|
if strings.HasPrefix(mediaType, "multipart/") {
|
|
boundary := params["boundary"]
|
|
if err := parseMultipart(pm, msg.Body, boundary); err != nil {
|
|
return nil, fmt.Errorf("mailparser: multipart: %w", err)
|
|
}
|
|
} else {
|
|
body, _ := io.ReadAll(msg.Body)
|
|
decoded := decodeBody(body, msg.Header.Get("Content-Transfer-Encoding"))
|
|
if strings.Contains(mediaType, "html") {
|
|
pm.HTMLBody = string(decoded)
|
|
} else {
|
|
pm.TextBody = string(decoded)
|
|
}
|
|
}
|
|
|
|
return pm, nil
|
|
}
|
|
|
|
// parseMultipart walks MIME parts and fills text, html, and attachments.
|
|
func parseMultipart(pm *ParsedMail, body io.Reader, boundary string) error {
|
|
mr := multipart.NewReader(body, boundary)
|
|
for {
|
|
part, err := mr.NextPart()
|
|
if err == io.EOF {
|
|
break
|
|
}
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
ct := part.Header.Get("Content-Type")
|
|
mediaType, params, err := mime.ParseMediaType(ct)
|
|
if err != nil {
|
|
mediaType = "application/octet-stream"
|
|
params = map[string]string{}
|
|
}
|
|
|
|
data, _ := io.ReadAll(part)
|
|
cte := part.Header.Get("Content-Transfer-Encoding")
|
|
decoded := decodeBody(data, cte)
|
|
|
|
// Check disposition for attachment
|
|
disp := part.Header.Get("Content-Disposition")
|
|
dispType, dispParams, _ := mime.ParseMediaType(disp)
|
|
filename := dispParams["filename"]
|
|
if filename == "" {
|
|
filename = params["name"]
|
|
}
|
|
filename = decodeMIMEHeader(filename)
|
|
|
|
if strings.HasPrefix(dispType, "attachment") || filename != "" {
|
|
pm.Attachments = append(pm.Attachments, Attachment{
|
|
Filename: filename,
|
|
ContentType: mediaType,
|
|
Data: decoded,
|
|
Size: len(decoded),
|
|
})
|
|
continue
|
|
}
|
|
|
|
// Nested multipart
|
|
if strings.HasPrefix(mediaType, "multipart/") {
|
|
if err := parseMultipart(pm, bytes.NewReader(decoded), params["boundary"]); err != nil {
|
|
return err
|
|
}
|
|
continue
|
|
}
|
|
|
|
switch {
|
|
case strings.Contains(mediaType, "text/plain"):
|
|
pm.TextBody += string(decoded)
|
|
case strings.Contains(mediaType, "text/html"):
|
|
pm.HTMLBody += string(decoded)
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// decodeBody reverses the Content-Transfer-Encoding named by cte and returns
// the decoded bytes.
//
// cte is matched case-insensitively after trimming surrounding whitespace.
// "quoted-printable" and "base64" are decoded; any other value (including the
// empty string) returns data unchanged. If decoding fails, data is returned
// unchanged as well, so callers always get a usable body.
//
// Base64 input is decoded leniently: spaces, tabs, CR and LF anywhere in the
// payload are ignored, and missing trailing "=" padding is tolerated.
func decodeBody(data []byte, cte string) []byte {
	switch strings.ToLower(strings.TrimSpace(cte)) {
	case "quoted-printable":
		decoded, err := io.ReadAll(quotedprintable.NewReader(bytes.NewReader(data)))
		if err == nil {
			return decoded
		}
	case "base64":
		// Drop all whitespace, not just line breaks: stray spaces or tabs
		// would otherwise make the whole payload fail to decode.
		clean := make([]byte, 0, len(data))
		for _, b := range data {
			switch b {
			case ' ', '\t', '\r', '\n':
				continue
			}
			clean = append(clean, b)
		}
		// Strip the padding and use the unpadded encoding so that both
		// padded and unpadded payloads are accepted.
		clean = bytes.TrimRight(clean, "=")
		decoded := make([]byte, base64.RawStdEncoding.DecodedLen(len(clean)))
		n, err := base64.RawStdEncoding.Decode(decoded, clean)
		if err == nil {
			return decoded[:n]
		}
	}
	return data
}
|
|
|
|
// decodeMIMEHeader returns s with any RFC 2047 encoded-words decoded. When
// the header cannot be decoded, s is returned as it was given.
func decodeMIMEHeader(s string) string {
	var dec mime.WordDecoder
	if out, err := dec.DecodeHeader(s); err == nil {
		return out
	}
	return s
}
|