feat(PROJ-36,PROJ-37): gzip-Kompression + Attachment-Deduplication

Sprint 1: Emails werden vor der AES-256-GCM-Verschlüsselung optional gzip-komprimiert (compress: true).
Magic-Byte 0x01 als Präfix ermöglicht ein abwärtskompatibles Load() für Legacy-Dateien.
Neue DB-Tabelle storage_objects trackt Kompressions-Metadaten.

Sprint 2: Attachments werden via SHA-256 dedupliziert — ein identischer Anhang, der in N Mails
vorkommt, wird nur einmal gespeichert. Neue Tabellen: attachments, email_attachments.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
sysops
2026-04-05 01:19:51 +02:00
parent fdb25cb16a
commit 27d45f58e8
10 changed files with 279 additions and 45 deletions
+89 -27
View File
@@ -28,19 +28,21 @@ var ErrRetentionLock = errors.New("storage: mail is within retention period")
// Config holds the configuration for initialising a Store.
type Config struct {
Dir string // base directory for file storage
Keyfile string // path to 32-byte AES key file; empty = no encryption
DSN string // PostgreSQL DSN; empty = no DB
RetentionDays int // 0 = no lock; >0 = GoBD retention period in days
Dir string // base directory for file storage
Keyfile string // path to 32-byte AES key file; empty = no encryption
DSN string // PostgreSQL DSN; empty = no DB
RetentionDays int // 0 = no lock; >0 = GoBD retention period in days
CompressEnabled bool // gzip-compress emails and attachments before encryption
}
// Store is a file-based email storage with optional AES-256-GCM encryption
// and optional PostgreSQL metadata.
type Store struct {
dir string
key []byte // nil = no encryption
db *pgxpool.Pool // nil = no DB
retentionDays int // 0 = no lock
dir string
key []byte // nil = no encryption
db *pgxpool.Pool // nil = no DB
retentionDays int // 0 = no lock
compressEnabled bool // gzip before encryption
}
// StoreStats reports total mail count and size in bytes.
@@ -70,7 +72,7 @@ func New(cfg Config) (*Store, error) {
}
}
s := &Store{dir: cfg.Dir, retentionDays: cfg.RetentionDays}
s := &Store{dir: cfg.Dir, retentionDays: cfg.RetentionDays, compressEnabled: cfg.CompressEnabled}
// Load encryption key
if err := s.loadKey(cfg.Keyfile); err != nil {
@@ -99,6 +101,8 @@ func New(cfg Config) (*Store, error) {
// PROJ-33: Stable IMAP UIDs
_, _ = s.db.Exec(ctx, `ALTER TABLE emails ADD COLUMN IF NOT EXISTS uid BIGSERIAL`)
_, _ = s.db.Exec(ctx, `CREATE UNIQUE INDEX IF NOT EXISTS idx_emails_uid ON emails (uid)`)
// 2.0: storage_objects FK on emails
_, _ = s.db.Exec(ctx, `ALTER TABLE emails ADD COLUMN IF NOT EXISTS storage_id BIGINT REFERENCES storage_objects(id)`)
}
return s, nil
@@ -197,7 +201,33 @@ func (s *Store) decrypt(data []byte) ([]byte, error) {
// ── Database schema ───────────────────────────────────────────────────────
func (s *Store) initSchema(ctx context.Context) error {
// storage_objects must exist before emails (FK dependency)
_, err := s.db.Exec(ctx, `
CREATE TABLE IF NOT EXISTS storage_objects (
id BIGSERIAL PRIMARY KEY,
storage_type TEXT NOT NULL DEFAULT 'filesystem',
path TEXT NOT NULL,
compression TEXT NOT NULL DEFAULT 'none',
size_original BIGINT,
size_compressed BIGINT,
checksum CHAR(64),
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
CREATE TABLE IF NOT EXISTS attachments (
id BIGSERIAL PRIMARY KEY,
filename TEXT,
mime_type TEXT,
size_bytes BIGINT,
hash CHAR(64) UNIQUE NOT NULL,
storage_id BIGINT REFERENCES storage_objects(id),
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
CREATE INDEX IF NOT EXISTS idx_attachments_hash ON attachments (hash);
`)
if err != nil {
return err
}
_, err = s.db.Exec(ctx, `
CREATE TABLE IF NOT EXISTS emails (
id TEXT PRIMARY KEY,
received_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
@@ -228,6 +258,12 @@ func (s *Store) initSchema(ctx context.Context) error {
);
CREATE INDEX IF NOT EXISTS idx_email_refs_tenant ON email_refs (tenant_id);
CREATE INDEX IF NOT EXISTS idx_email_refs_email ON email_refs (email_id);
CREATE TABLE IF NOT EXISTS email_attachments (
email_id TEXT NOT NULL REFERENCES emails(id),
attachment_id BIGINT NOT NULL REFERENCES attachments(id),
PRIMARY KEY (email_id, attachment_id)
);
CREATE INDEX IF NOT EXISTS idx_email_attachments_email ON email_attachments (email_id);
`)
return err
}
@@ -290,16 +326,27 @@ func (s *Store) Save(ctx context.Context, raw []byte, _ time.Time, tenantID *int
}
if !fileExists {
// Determine what to write: encrypted or plaintext
// Compress before encryption (if enabled)
toStore := raw
compression := "none"
if s.compressEnabled {
compressed, cerr := compressGzip(raw)
if cerr == nil && len(compressed) < len(raw) {
toStore = compressed
compression = "gzip"
}
}
// Encrypt (if key configured)
var toWrite []byte
if s.key != nil {
encrypted, err := s.encrypt(raw)
encrypted, err := s.encrypt(toStore)
if err != nil {
return "", err
}
toWrite = encrypted
} else {
toWrite = raw
toWrite = toStore
}
if err := os.WriteFile(path, toWrite, 0o644); err != nil {
@@ -308,8 +355,19 @@ func (s *Store) Save(ctx context.Context, raw []byte, _ time.Time, tenantID *int
// Insert metadata into DB
if s.db != nil {
// Register in storage_objects
var storageID *int64
var sid int64
if soErr := s.db.QueryRow(ctx, `
INSERT INTO storage_objects (storage_type, path, compression, size_original, size_compressed, checksum)
VALUES ('filesystem', $1, $2, $3, $4, $5)
RETURNING id
`, path, compression, int64(len(raw)), int64(len(toWrite)), id).Scan(&sid); soErr == nil {
storageID = &sid
}
if parseErr == nil {
if err := s.insertMeta(ctx, id, pm, len(raw), tenantID); err != nil {
if err := s.insertMeta(ctx, id, pm, len(raw), tenantID, storageID); err != nil {
// Race: another goroutine inserted via Message-ID UNIQUE conflict.
// Resolve to the existing record's ID.
if messageID != "" {
@@ -327,7 +385,11 @@ func (s *Store) Save(ctx context.Context, raw []byte, _ time.Time, tenantID *int
// Non-conflict insert error: log but continue (file is written, metadata can be backfilled)
}
} else {
s.insertMetaMinimal(ctx, id, len(raw), tenantID)
s.insertMetaMinimal(ctx, id, len(raw), tenantID, storageID)
}
// Sprint 2: deduplicate and store attachments
if parseErr == nil {
_ = s.saveAttachments(ctx, id, pm)
}
// PROJ-34: Set retention lock.
// Mandanten-Mails: nur wenn der Mandant explizit retention_days > 0 gesetzt hat.
@@ -390,14 +452,14 @@ func (s *Store) Load(id string) ([]byte, error) {
if s.key != nil {
plaintext, err := s.decrypt(data)
if err != nil {
// If decryption fails, the file might be stored unencrypted
// (pre-encryption era). Return as-is for backwards compatibility.
return data, nil
// Pre-encryption era: file stored unencrypted — try decompression anyway.
out, _ := maybeDecompress(data)
return out, nil
}
return plaintext, nil
data = plaintext
}
return data, nil
return maybeDecompress(data)
}
// Delete removes a stored email by its ID, including its DB metadata row.
@@ -574,7 +636,7 @@ func (s *Store) firstAndLastFromFS() (first, last *MailRef, err error) {
// insertMeta inserts parsed email metadata into the emails table.
// Returns an error so the caller can detect UNIQUE-constraint conflicts on message_id.
func (s *Store) insertMeta(ctx context.Context, id string, pm *mailparser.ParsedMail, size int, tenantID *int64) error {
func (s *Store) insertMeta(ctx context.Context, id string, pm *mailparser.ParsedMail, size int, tenantID *int64, storageID *int64) error {
mailTo := strings.Join(pm.To, ", ")
hasAttach := len(pm.Attachments) > 0
@@ -588,20 +650,20 @@ func (s *Store) insertMeta(ctx context.Context, id string, pm *mailparser.Parsed
receivedAt = time.Now()
}
_, err := s.db.Exec(ctx, `
INSERT INTO emails (id, received_at, mail_from, mail_to, subject, size_bytes, has_attach, tenant_id, message_id)
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9)
INSERT INTO emails (id, received_at, mail_from, mail_to, subject, size_bytes, has_attach, tenant_id, message_id, storage_id)
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10)
ON CONFLICT (id) DO NOTHING
`, id, receivedAt, pm.From, mailTo, pm.Subject, int64(size), hasAttach, tenantID, msgID)
`, id, receivedAt, pm.From, mailTo, pm.Subject, int64(size), hasAttach, tenantID, msgID, storageID)
return err
}
// insertMetaMinimal inserts minimal metadata when parsing fails.
func (s *Store) insertMetaMinimal(ctx context.Context, id string, size int, tenantID *int64) {
func (s *Store) insertMetaMinimal(ctx context.Context, id string, size int, tenantID *int64, storageID *int64) {
_, _ = s.db.Exec(ctx, `
INSERT INTO emails (id, received_at, size_bytes, tenant_id)
VALUES ($1, NOW(), $2, $3)
INSERT INTO emails (id, received_at, size_bytes, tenant_id, storage_id)
VALUES ($1, NOW(), $2, $3, $4)
ON CONFLICT (id) DO NOTHING
`, id, int64(size), tenantID)
`, id, int64(size), tenantID, storageID)
}
// SaveMeta upserts metadata for a given email ID. Used by the backfill process.