feat(PROJ-36,PROJ-37): gzip-Kompression + Attachment-Deduplication
Sprint 1: Emails werden vor AES-256-GCM optional gzip-komprimiert (compress: true). Magic-Byte 0x01 als Prefix ermöglicht backward-kompatibles Load() für Legacy-Dateien. Neue DB-Tabelle storage_objects trackt Kompressions-Metadaten. Sprint 2: Attachments werden via SHA-256 dedupliziert — gleicher Anhang in N Mails wird nur einmal gespeichert. Neue Tabellen: attachments, email_attachments. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,109 @@
|
||||
package storage
|
||||
|
||||
import (
|
||||
"context"
|
||||
"crypto/sha256"
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
|
||||
"github.com/archivmail/pkg/mailparser"
|
||||
)
|
||||
|
||||
// saveAttachments deduplicates and stores attachments from a parsed email.
|
||||
// Each unique attachment (by SHA-256 hash) is stored once on disk.
|
||||
// email_attachments links attachments to their email record.
|
||||
func (s *Store) saveAttachments(ctx context.Context, emailID string, pm *mailparser.ParsedMail) error {
|
||||
if s.db == nil || len(pm.Attachments) == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
for _, att := range pm.Attachments {
|
||||
if len(att.Data) == 0 {
|
||||
continue
|
||||
}
|
||||
|
||||
sum := sha256.Sum256(att.Data)
|
||||
hash := fmt.Sprintf("%x", sum[:])
|
||||
|
||||
// Check if this attachment is already stored
|
||||
var attID int64
|
||||
err := s.db.QueryRow(ctx, `SELECT id FROM attachments WHERE hash = $1`, hash).Scan(&attID)
|
||||
if err != nil {
|
||||
// Not found — compress and store
|
||||
toWrite := att.Data
|
||||
compression := "none"
|
||||
if s.compressEnabled {
|
||||
compressed, cerr := compressGzip(att.Data)
|
||||
if cerr == nil && len(compressed) < len(att.Data) {
|
||||
toWrite = compressed
|
||||
compression = "gzip"
|
||||
}
|
||||
}
|
||||
|
||||
attPath := s.attachmentPath(hash)
|
||||
if err := os.MkdirAll(filepath.Dir(attPath), 0o755); err != nil {
|
||||
return fmt.Errorf("storage: attachment mkdir: %w", err)
|
||||
}
|
||||
if _, statErr := os.Stat(attPath); os.IsNotExist(statErr) {
|
||||
if err := os.WriteFile(attPath, toWrite, 0o644); err != nil {
|
||||
return fmt.Errorf("storage: attachment write: %w", err)
|
||||
}
|
||||
}
|
||||
|
||||
// Register in storage_objects
|
||||
var soID int64
|
||||
soErr := s.db.QueryRow(ctx, `
|
||||
INSERT INTO storage_objects (storage_type, path, compression, size_original, size_compressed, checksum)
|
||||
VALUES ('filesystem', $1, $2, $3, $4, $5)
|
||||
RETURNING id
|
||||
`, attPath, compression, int64(len(att.Data)), int64(len(toWrite)), hash).Scan(&soID)
|
||||
|
||||
// Insert attachment record
|
||||
var insertErr error
|
||||
if soErr == nil {
|
||||
insertErr = s.db.QueryRow(ctx, `
|
||||
INSERT INTO attachments (filename, mime_type, size_bytes, hash, storage_id)
|
||||
VALUES ($1, $2, $3, $4, $5)
|
||||
ON CONFLICT (hash) DO UPDATE SET hash = EXCLUDED.hash
|
||||
RETURNING id
|
||||
`, att.Filename, att.ContentType, int64(len(att.Data)), hash, soID).Scan(&attID)
|
||||
} else {
|
||||
insertErr = s.db.QueryRow(ctx, `
|
||||
INSERT INTO attachments (filename, mime_type, size_bytes, hash)
|
||||
VALUES ($1, $2, $3, $4)
|
||||
ON CONFLICT (hash) DO UPDATE SET hash = EXCLUDED.hash
|
||||
RETURNING id
|
||||
`, att.Filename, att.ContentType, int64(len(att.Data)), hash).Scan(&attID)
|
||||
}
|
||||
if insertErr != nil {
|
||||
continue // non-fatal: mail is saved, attachment linking is best-effort
|
||||
}
|
||||
}
|
||||
|
||||
// Link attachment to email
|
||||
_, _ = s.db.Exec(ctx, `
|
||||
INSERT INTO email_attachments (email_id, attachment_id)
|
||||
VALUES ($1, $2)
|
||||
ON CONFLICT DO NOTHING
|
||||
`, emailID, attID)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// LoadAttachment reads and decompresses an attachment by its SHA-256 hash.
|
||||
func (s *Store) LoadAttachment(hash string) ([]byte, error) {
|
||||
path := s.attachmentPath(hash)
|
||||
data, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("storage: attachment not found: %s", hash)
|
||||
}
|
||||
return maybeDecompress(data)
|
||||
}
|
||||
|
||||
// attachmentPath returns the on-disk path for a given attachment hash.
|
||||
// Uses 2-level 2-char prefix sharding: {dir}/attachments/{ab}/{cd}/{hash}
|
||||
func (s *Store) attachmentPath(hash string) string {
|
||||
return filepath.Join(s.dir, "attachments", hash[:2], hash[2:4], hash)
|
||||
}
|
||||
Reference in New Issue
Block a user