Files
archivmail/pkg/mailparser/parser.go
T
sysops a1c4e59fff fix: Date-Parsing-Fallback für nicht-standard MTA-Datumsformate
mailparser: weitere Layouts (Timezone +02:00 mit Doppelpunkt, ohne Sekunden)
storage: GetReceivedAts() für Batch-Lookup von received_at
search_handlers: received_at als Fallback wenn pm.Date.IsZero()

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-11 23:36:18 +02:00

263 lines
6.7 KiB
Go

package mailparser
import (
"bytes"
"encoding/base64"
"fmt"
"io"
"mime"
"mime/multipart"
"mime/quotedprintable"
"net/mail"
"strings"
"time"
)
// Attachment represents a MIME attachment in a parsed email.
//
// Attachments are collected while walking multipart bodies: a part becomes
// an Attachment when its Content-Disposition starts with "attachment" or
// when a filename could be determined for it.
type Attachment struct {
	// Filename is taken from the Content-Disposition "filename" parameter,
	// falling back to the Content-Type "name" parameter, and is RFC 2047
	// decoded. It may be empty for attachments that carry no name.
	Filename string
	// ContentType is the part's media type without parameters;
	// "application/octet-stream" when the part's Content-Type cannot be parsed.
	ContentType string
	// Data is the part body after Content-Transfer-Encoding decoding.
	Data []byte
	// Size is len(Data), i.e. the decoded size in bytes.
	Size int
}
// ParsedMail holds the structured content of a parsed email message.
type ParsedMail struct {
	// From is the bare address of the first mailbox in the From header, or
	// the raw header value when the header cannot be parsed as an address list.
	From string
	// To holds the bare addresses from the To header; nil when the header is
	// absent or cannot be parsed.
	To []string
	// CC holds the bare addresses from the Cc header; nil when the header is
	// absent or cannot be parsed.
	CC []string
	// Subject is the Subject header with RFC 2047 encoded-words decoded.
	Subject string
	// MessageID is the Message-Id header with surrounding angle brackets trimmed.
	MessageID string
	InReplyTo string // In-Reply-To header (single message-id, no angle brackets)
	References []string // References header (list of message-ids, no angle brackets)
	// TextBody is the plain-text body; for multipart messages it is the
	// concatenation of all text/plain parts that are not attachments.
	TextBody string
	// HTMLBody is the HTML body; for multipart messages it is the
	// concatenation of all text/html parts that are not attachments.
	HTMLBody string
	// Date is the parsed Date header. It is the zero time.Time when the
	// header is missing or matches none of the supported formats, so callers
	// should check Date.IsZero() before using it.
	Date time.Time
	// Attachments lists the parts classified as attachments, in message order.
	Attachments []Attachment
	// Raw is the byte slice that was passed to Parse (stored as-is, not copied).
	Raw []byte
}
// Parse parses a raw RFC 2822 / MIME email and returns a ParsedMail.
//
// Header extraction is best-effort: address headers that cannot be parsed are
// either kept verbatim (From) or left empty (To, Cc), and an unparseable Date
// leaves ParsedMail.Date as the zero time. The body is decoded according to
// its Content-Transfer-Encoding; multipart bodies are walked recursively.
//
// An error is returned only when the message headers cannot be read at all or
// when a multipart body is structurally broken. The returned ParsedMail keeps
// a reference to raw (it is not copied).
func Parse(raw []byte) (*ParsedMail, error) {
	msg, err := mail.ReadMessage(bytes.NewReader(raw))
	if err != nil {
		return nil, fmt.Errorf("mailparser: read message: %w", err)
	}
	pm := &ParsedMail{Raw: raw}

	// From: keep only the first mailbox; fall back to the raw header value so
	// that a malformed sender is still visible.
	if from := msg.Header.Get("From"); from != "" {
		addrs, err := mail.ParseAddressList(from)
		if err == nil && len(addrs) > 0 {
			pm.From = addrs[0].Address
		} else {
			pm.From = from
		}
	}
	pm.To = headerAddresses(msg.Header, "To")
	pm.CC = headerAddresses(msg.Header, "Cc")

	// Subject - decode MIME encoded-words.
	pm.Subject = decodeMIMEHeader(msg.Header.Get("Subject"))

	// Message-ID - strip angle brackets.
	pm.MessageID = strings.Trim(msg.Header.Get("Message-Id"), "<>")

	// In-Reply-To - strip angle brackets.
	if irt := msg.Header.Get("In-Reply-To"); irt != "" {
		pm.InReplyTo = strings.Trim(strings.TrimSpace(irt), "<>")
	}

	// References - whitespace-separated list of message-ids.
	for _, ref := range strings.Fields(msg.Header.Get("References")) {
		if ref = strings.Trim(ref, "<>"); ref != "" {
			pm.References = append(pm.References, ref)
		}
	}

	pm.Date = parseDateLenient(msg.Header)

	// Body / MIME parts.
	cte := msg.Header.Get("Content-Transfer-Encoding")
	mediaType, params, err := mime.ParseMediaType(msg.Header.Get("Content-Type"))
	if err != nil {
		// Missing or unparseable Content-Type: treat the body as plain text.
		// Content-Transfer-Encoding is an independent header, so it still has
		// to be honoured here; otherwise a quoted-printable or base64 body
		// would be stored in its encoded form.
		body, _ := io.ReadAll(msg.Body) // best effort: keep whatever could be read
		pm.TextBody = string(decodeBody(body, cte))
		return pm, nil
	}

	if strings.HasPrefix(mediaType, "multipart/") {
		if err := parseMultipart(pm, msg.Body, params["boundary"]); err != nil {
			return nil, fmt.Errorf("mailparser: multipart: %w", err)
		}
		return pm, nil
	}

	body, _ := io.ReadAll(msg.Body) // best effort: keep whatever could be read
	decoded := decodeBody(body, cte)
	if strings.Contains(mediaType, "html") {
		pm.HTMLBody = string(decoded)
	} else {
		pm.TextBody = string(decoded)
	}
	return pm, nil
}

// headerAddresses returns the bare addresses found in the address-list header
// key, or nil when the header is absent or cannot be parsed.
func headerAddresses(h mail.Header, key string) []string {
	value := h.Get(key)
	if value == "" {
		return nil
	}
	addrs, err := mail.ParseAddressList(value)
	if err != nil {
		return nil
	}
	var out []string
	for _, a := range addrs {
		out = append(out, a.Address)
	}
	return out
}

// parseDateLenient parses the Date header, first with net/mail and then with
// a list of non-standard layouts seen in the wild. It returns the zero
// time.Time when the header is missing or nothing matches, so callers can
// detect the failure with IsZero and substitute their own timestamp.
func parseDateLenient(h mail.Header) time.Time {
	if d, err := h.Date(); err == nil {
		return d
	}

	value := strings.TrimSpace(h.Get("Date"))
	// Strip a trailing parenthesised timezone comment:
	// "... +0100 (CET)" -> "... +0100".
	if idx := strings.LastIndex(value, "("); idx > 0 {
		value = strings.TrimSpace(value[:idx])
	}
	if value == "" {
		return time.Time{}
	}

	layouts := []string{
		"Mon, 2 Jan 2006 15:04:05 -0700",
		"Mon, 02 Jan 2006 15:04:05 -0700",
		"2 Jan 2006 15:04:05 -0700",
		"02 Jan 2006 15:04:05 -0700",
		"Mon, 2 Jan 2006 15:04:05 MST",
		"Mon, 02 Jan 2006 15:04:05 MST",
		// Colon in timezone offset (e.g. "+02:00") used by some MTA versions.
		"Mon, 2 Jan 2006 15:04:05 -07:00",
		"Mon, 02 Jan 2006 15:04:05 -07:00",
		"2 Jan 2006 15:04:05 -07:00",
		"02 Jan 2006 15:04:05 -07:00",
		// Without seconds.
		"Mon, 2 Jan 2006 15:04 -0700",
		"Mon, 02 Jan 2006 15:04 -0700",
		"2 Jan 2006 15:04 -0700",
		// Go stdlib aliases.
		time.RFC1123Z,
		time.RFC1123,
	}
	for _, layout := range layouts {
		if t, err := time.Parse(layout, value); err == nil {
			return t
		}
	}
	return time.Time{}
}
// parseMultipart walks the MIME parts of body, delimited by boundary, and
// fills pm with text, HTML and attachments.
//
// Classification of each part, in order:
//   - Content-Disposition starting with "attachment", or any part for which a
//     filename can be determined, is appended to pm.Attachments;
//   - a nested multipart/* part is walked recursively;
//   - text/plain and text/html parts are appended to pm.TextBody and
//     pm.HTMLBody respectively;
//   - anything else is ignored (it remains available through pm.Raw).
//
// It returns the first error reported by the multipart reader; a clean end of
// the multipart body is not an error.
func parseMultipart(pm *ParsedMail, body io.Reader, boundary string) error {
	mr := multipart.NewReader(body, boundary)
	for {
		part, err := mr.NextPart()
		if err == io.EOF {
			return nil
		}
		if err != nil {
			return err
		}

		ct := part.Header.Get("Content-Type")
		mediaType, params, err := mime.ParseMediaType(ct)
		if err != nil {
			params = map[string]string{}
			if strings.TrimSpace(ct) == "" {
				// A part without a Content-Type header defaults to
				// text/plain (RFC 2045 section 5.2); treating it as opaque
				// binary would silently drop its text from the body.
				mediaType = "text/plain"
			} else {
				// Present but unparseable: the real type is unknown.
				mediaType = "application/octet-stream"
			}
		}

		// Best effort: a read error leaves data truncated rather than
		// failing the whole message.
		data, _ := io.ReadAll(part)
		// NextPart transparently decodes quoted-printable parts and hides
		// their Content-Transfer-Encoding header, so in practice decodeBody
		// only has base64 left to handle here.
		decoded := decodeBody(data, part.Header.Get("Content-Transfer-Encoding"))

		// Determine disposition and filename. A parse failure leaves dispType
		// empty and dispParams nil, which is safe to index.
		dispType, dispParams, _ := mime.ParseMediaType(part.Header.Get("Content-Disposition"))
		filename := dispParams["filename"]
		if filename == "" {
			filename = params["name"]
		}
		filename = decodeMIMEHeader(filename)

		if strings.HasPrefix(dispType, "attachment") || filename != "" {
			pm.Attachments = append(pm.Attachments, Attachment{
				Filename:    filename,
				ContentType: mediaType,
				Data:        decoded,
				Size:        len(decoded),
			})
			continue
		}

		// Nested multipart.
		if strings.HasPrefix(mediaType, "multipart/") {
			if err := parseMultipart(pm, bytes.NewReader(decoded), params["boundary"]); err != nil {
				return err
			}
			continue
		}

		switch {
		case strings.Contains(mediaType, "text/plain"):
			pm.TextBody += string(decoded)
		case strings.Contains(mediaType, "text/html"):
			pm.HTMLBody += string(decoded)
		}
	}
}
// decodeBody decodes data according to the Content-Transfer-Encoding value
// cte (matched case-insensitively, surrounding whitespace ignored).
//
// Supported encodings are "quoted-printable" and "base64". For any other
// value (7bit, 8bit, binary, empty, unknown) data is returned unchanged.
// Decoding is best-effort: if the payload cannot be decoded, the original
// data is returned unchanged rather than an error.
//
// The base64 branch is lenient: ASCII whitespace anywhere in the payload is
// ignored (RFC 2045 section 6.8 requires decoders to skip it) and missing
// trailing "=" padding is tolerated.
func decodeBody(data []byte, cte string) []byte {
	switch strings.ToLower(strings.TrimSpace(cte)) {
	case "quoted-printable":
		decoded, err := io.ReadAll(quotedprintable.NewReader(bytes.NewReader(data)))
		if err == nil {
			return decoded
		}
	case "base64":
		// Drop line breaks as well as stray spaces and tabs; the stdlib
		// decoder only skips CR and LF on its own.
		clean := make([]byte, 0, len(data))
		for _, b := range data {
			switch b {
			case ' ', '\t', '\r', '\n':
				continue
			}
			clean = append(clean, b)
		}

		decoded := make([]byte, base64.StdEncoding.DecodedLen(len(clean)))
		if n, err := base64.StdEncoding.Decode(decoded, clean); err == nil {
			return decoded[:n]
		}

		// Retry without padding for payloads whose "=" padding is missing or
		// incomplete. A separate buffer is required because the unpadded
		// decoded length can exceed the padded estimate computed above.
		unpadded := bytes.TrimRight(clean, "=")
		decoded = make([]byte, base64.RawStdEncoding.DecodedLen(len(unpadded)))
		if n, err := base64.RawStdEncoding.Decode(decoded, unpadded); err == nil {
			return decoded[:n]
		}
	}
	return data
}
// decodeMIMEHeader expands RFC 2047 encoded-words found in the header value s.
// If the value cannot be decoded (for example because it names a charset the
// standard decoder does not handle), s is returned unchanged.
func decodeMIMEHeader(s string) string {
	var wd mime.WordDecoder
	if out, err := wd.DecodeHeader(s); err == nil {
		return out
	}
	return s
}