feat(PROJ-36,PROJ-37): gzip-Kompression + Attachment-Deduplication
Sprint 1: Emails werden vor der AES-256-GCM-Verschlüsselung optional gzip-komprimiert (compress: true). Magic-Byte 0x01 als Prefix ermöglicht backward-kompatibles Load() für Legacy-Dateien. Neue DB-Tabelle storage_objects trackt Kompressions-Metadaten.

Sprint 2: Attachments werden via SHA-256 dedupliziert — gleicher Anhang in N Mails wird nur einmal gespeichert. Neue Tabellen: attachments, email_attachments.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
+89
-27
@@ -28,19 +28,21 @@ var ErrRetentionLock = errors.New("storage: mail is within retention period")
|
||||
|
||||
// Config holds the configuration for initialising a Store.
|
||||
type Config struct {
|
||||
Dir string // base directory for file storage
|
||||
Keyfile string // path to 32-byte AES key file; empty = no encryption
|
||||
DSN string // PostgreSQL DSN; empty = no DB
|
||||
RetentionDays int // 0 = no lock; >0 = GoBD retention period in days
|
||||
Dir string // base directory for file storage
|
||||
Keyfile string // path to 32-byte AES key file; empty = no encryption
|
||||
DSN string // PostgreSQL DSN; empty = no DB
|
||||
RetentionDays int // 0 = no lock; >0 = GoBD retention period in days
|
||||
CompressEnabled bool // gzip-compress emails and attachments before encryption
|
||||
}
|
||||
|
||||
// Store is a file-based email storage with optional AES-256-GCM encryption
|
||||
// and optional PostgreSQL metadata.
|
||||
type Store struct {
|
||||
dir string
|
||||
key []byte // nil = no encryption
|
||||
db *pgxpool.Pool // nil = no DB
|
||||
retentionDays int // 0 = no lock
|
||||
dir string
|
||||
key []byte // nil = no encryption
|
||||
db *pgxpool.Pool // nil = no DB
|
||||
retentionDays int // 0 = no lock
|
||||
compressEnabled bool // gzip before encryption
|
||||
}
|
||||
|
||||
// StoreStats reports total mail count and size in bytes.
|
||||
@@ -70,7 +72,7 @@ func New(cfg Config) (*Store, error) {
|
||||
}
|
||||
}
|
||||
|
||||
s := &Store{dir: cfg.Dir, retentionDays: cfg.RetentionDays}
|
||||
s := &Store{dir: cfg.Dir, retentionDays: cfg.RetentionDays, compressEnabled: cfg.CompressEnabled}
|
||||
|
||||
// Load encryption key
|
||||
if err := s.loadKey(cfg.Keyfile); err != nil {
|
||||
@@ -99,6 +101,8 @@ func New(cfg Config) (*Store, error) {
|
||||
// PROJ-33: Stable IMAP UIDs
|
||||
_, _ = s.db.Exec(ctx, `ALTER TABLE emails ADD COLUMN IF NOT EXISTS uid BIGSERIAL`)
|
||||
_, _ = s.db.Exec(ctx, `CREATE UNIQUE INDEX IF NOT EXISTS idx_emails_uid ON emails (uid)`)
|
||||
// 2.0: storage_objects FK on emails
|
||||
_, _ = s.db.Exec(ctx, `ALTER TABLE emails ADD COLUMN IF NOT EXISTS storage_id BIGINT REFERENCES storage_objects(id)`)
|
||||
}
|
||||
|
||||
return s, nil
|
||||
@@ -197,7 +201,33 @@ func (s *Store) decrypt(data []byte) ([]byte, error) {
|
||||
// ── Database schema ───────────────────────────────────────────────────────
|
||||
|
||||
func (s *Store) initSchema(ctx context.Context) error {
|
||||
// storage_objects must exist before emails (FK dependency)
|
||||
_, err := s.db.Exec(ctx, `
|
||||
CREATE TABLE IF NOT EXISTS storage_objects (
|
||||
id BIGSERIAL PRIMARY KEY,
|
||||
storage_type TEXT NOT NULL DEFAULT 'filesystem',
|
||||
path TEXT NOT NULL,
|
||||
compression TEXT NOT NULL DEFAULT 'none',
|
||||
size_original BIGINT,
|
||||
size_compressed BIGINT,
|
||||
checksum CHAR(64),
|
||||
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
|
||||
);
|
||||
CREATE TABLE IF NOT EXISTS attachments (
|
||||
id BIGSERIAL PRIMARY KEY,
|
||||
filename TEXT,
|
||||
mime_type TEXT,
|
||||
size_bytes BIGINT,
|
||||
hash CHAR(64) UNIQUE NOT NULL,
|
||||
storage_id BIGINT REFERENCES storage_objects(id),
|
||||
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
|
||||
);
|
||||
CREATE INDEX IF NOT EXISTS idx_attachments_hash ON attachments (hash);
|
||||
`)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
_, err = s.db.Exec(ctx, `
|
||||
CREATE TABLE IF NOT EXISTS emails (
|
||||
id TEXT PRIMARY KEY,
|
||||
received_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
|
||||
@@ -228,6 +258,12 @@ func (s *Store) initSchema(ctx context.Context) error {
|
||||
);
|
||||
CREATE INDEX IF NOT EXISTS idx_email_refs_tenant ON email_refs (tenant_id);
|
||||
CREATE INDEX IF NOT EXISTS idx_email_refs_email ON email_refs (email_id);
|
||||
CREATE TABLE IF NOT EXISTS email_attachments (
|
||||
email_id TEXT NOT NULL REFERENCES emails(id),
|
||||
attachment_id BIGINT NOT NULL REFERENCES attachments(id),
|
||||
PRIMARY KEY (email_id, attachment_id)
|
||||
);
|
||||
CREATE INDEX IF NOT EXISTS idx_email_attachments_email ON email_attachments (email_id);
|
||||
`)
|
||||
return err
|
||||
}
|
||||
@@ -290,16 +326,27 @@ func (s *Store) Save(ctx context.Context, raw []byte, _ time.Time, tenantID *int
|
||||
}
|
||||
|
||||
if !fileExists {
|
||||
// Determine what to write: encrypted or plaintext
|
||||
// Compress before encryption (if enabled)
|
||||
toStore := raw
|
||||
compression := "none"
|
||||
if s.compressEnabled {
|
||||
compressed, cerr := compressGzip(raw)
|
||||
if cerr == nil && len(compressed) < len(raw) {
|
||||
toStore = compressed
|
||||
compression = "gzip"
|
||||
}
|
||||
}
|
||||
|
||||
// Encrypt (if key configured)
|
||||
var toWrite []byte
|
||||
if s.key != nil {
|
||||
encrypted, err := s.encrypt(raw)
|
||||
encrypted, err := s.encrypt(toStore)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
toWrite = encrypted
|
||||
} else {
|
||||
toWrite = raw
|
||||
toWrite = toStore
|
||||
}
|
||||
|
||||
if err := os.WriteFile(path, toWrite, 0o644); err != nil {
|
||||
@@ -308,8 +355,19 @@ func (s *Store) Save(ctx context.Context, raw []byte, _ time.Time, tenantID *int
|
||||
|
||||
// Insert metadata into DB
|
||||
if s.db != nil {
|
||||
// Register in storage_objects
|
||||
var storageID *int64
|
||||
var sid int64
|
||||
if soErr := s.db.QueryRow(ctx, `
|
||||
INSERT INTO storage_objects (storage_type, path, compression, size_original, size_compressed, checksum)
|
||||
VALUES ('filesystem', $1, $2, $3, $4, $5)
|
||||
RETURNING id
|
||||
`, path, compression, int64(len(raw)), int64(len(toWrite)), id).Scan(&sid); soErr == nil {
|
||||
storageID = &sid
|
||||
}
|
||||
|
||||
if parseErr == nil {
|
||||
if err := s.insertMeta(ctx, id, pm, len(raw), tenantID); err != nil {
|
||||
if err := s.insertMeta(ctx, id, pm, len(raw), tenantID, storageID); err != nil {
|
||||
// Race: another goroutine inserted via Message-ID UNIQUE conflict.
|
||||
// Resolve to the existing record's ID.
|
||||
if messageID != "" {
|
||||
@@ -327,7 +385,11 @@ func (s *Store) Save(ctx context.Context, raw []byte, _ time.Time, tenantID *int
|
||||
// Non-conflict insert error: log but continue (file is written, metadata can be backfilled)
|
||||
}
|
||||
} else {
|
||||
s.insertMetaMinimal(ctx, id, len(raw), tenantID)
|
||||
s.insertMetaMinimal(ctx, id, len(raw), tenantID, storageID)
|
||||
}
|
||||
// Sprint 2: deduplicate and store attachments
|
||||
if parseErr == nil {
|
||||
_ = s.saveAttachments(ctx, id, pm)
|
||||
}
|
||||
// PROJ-34: Set retention lock.
|
||||
// Mandanten-Mails: nur wenn der Mandant explizit retention_days > 0 gesetzt hat.
|
||||
@@ -390,14 +452,14 @@ func (s *Store) Load(id string) ([]byte, error) {
|
||||
if s.key != nil {
|
||||
plaintext, err := s.decrypt(data)
|
||||
if err != nil {
|
||||
// If decryption fails, the file might be stored unencrypted
|
||||
// (pre-encryption era). Return as-is for backwards compatibility.
|
||||
return data, nil
|
||||
// Pre-encryption era: file stored unencrypted — try decompression anyway.
|
||||
out, _ := maybeDecompress(data)
|
||||
return out, nil
|
||||
}
|
||||
return plaintext, nil
|
||||
data = plaintext
|
||||
}
|
||||
|
||||
return data, nil
|
||||
return maybeDecompress(data)
|
||||
}
|
||||
|
||||
// Delete removes a stored email by its ID, including its DB metadata row.
|
||||
@@ -574,7 +636,7 @@ func (s *Store) firstAndLastFromFS() (first, last *MailRef, err error) {
|
||||
|
||||
// insertMeta inserts parsed email metadata into the emails table.
|
||||
// Returns an error so the caller can detect UNIQUE-constraint conflicts on message_id.
|
||||
func (s *Store) insertMeta(ctx context.Context, id string, pm *mailparser.ParsedMail, size int, tenantID *int64) error {
|
||||
func (s *Store) insertMeta(ctx context.Context, id string, pm *mailparser.ParsedMail, size int, tenantID *int64, storageID *int64) error {
|
||||
mailTo := strings.Join(pm.To, ", ")
|
||||
hasAttach := len(pm.Attachments) > 0
|
||||
|
||||
@@ -588,20 +650,20 @@ func (s *Store) insertMeta(ctx context.Context, id string, pm *mailparser.Parsed
|
||||
receivedAt = time.Now()
|
||||
}
|
||||
_, err := s.db.Exec(ctx, `
|
||||
INSERT INTO emails (id, received_at, mail_from, mail_to, subject, size_bytes, has_attach, tenant_id, message_id)
|
||||
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9)
|
||||
INSERT INTO emails (id, received_at, mail_from, mail_to, subject, size_bytes, has_attach, tenant_id, message_id, storage_id)
|
||||
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10)
|
||||
ON CONFLICT (id) DO NOTHING
|
||||
`, id, receivedAt, pm.From, mailTo, pm.Subject, int64(size), hasAttach, tenantID, msgID)
|
||||
`, id, receivedAt, pm.From, mailTo, pm.Subject, int64(size), hasAttach, tenantID, msgID, storageID)
|
||||
return err
|
||||
}
|
||||
|
||||
// insertMetaMinimal inserts minimal metadata when parsing fails.
|
||||
func (s *Store) insertMetaMinimal(ctx context.Context, id string, size int, tenantID *int64) {
|
||||
func (s *Store) insertMetaMinimal(ctx context.Context, id string, size int, tenantID *int64, storageID *int64) {
|
||||
_, _ = s.db.Exec(ctx, `
|
||||
INSERT INTO emails (id, received_at, size_bytes, tenant_id)
|
||||
VALUES ($1, NOW(), $2, $3)
|
||||
INSERT INTO emails (id, received_at, size_bytes, tenant_id, storage_id)
|
||||
VALUES ($1, NOW(), $2, $3, $4)
|
||||
ON CONFLICT (id) DO NOTHING
|
||||
`, id, int64(size), tenantID)
|
||||
`, id, int64(size), tenantID, storageID)
|
||||
}
|
||||
|
||||
// SaveMeta upserts metadata for a given email ID. Used by the backfill process.
|
||||
|
||||
Reference in New Issue
Block a user