diff --git a/cmd/archivmail/cmd_export.go b/cmd/archivmail/cmd_export.go index c95253f..1897301 100644 --- a/cmd/archivmail/cmd_export.go +++ b/cmd/archivmail/cmd_export.go @@ -65,9 +65,10 @@ func runExport(args []string) { } storeCfg := storage.Config{ - Dir: cfg.Storage.StorePath, - Keyfile: cfg.Storage.Keyfile, - DSN: cfg.Database.DSN(), + Dir: cfg.Storage.StorePath, + Keyfile: cfg.Storage.Keyfile, + DSN: cfg.Database.DSN(), + CompressEnabled: cfg.Storage.Compress, } mailStore, err := storage.New(storeCfg) if err != nil { diff --git a/cmd/archivmail/cmd_import.go b/cmd/archivmail/cmd_import.go index a8f51cc..6309195 100644 --- a/cmd/archivmail/cmd_import.go +++ b/cmd/archivmail/cmd_import.go @@ -57,9 +57,10 @@ func runImport(args []string) { } storeCfg := storage.Config{ - Dir: cfg.Storage.StorePath, - Keyfile: cfg.Storage.Keyfile, - DSN: cfg.Database.DSN(), + Dir: cfg.Storage.StorePath, + Keyfile: cfg.Storage.Keyfile, + DSN: cfg.Database.DSN(), + CompressEnabled: cfg.Storage.Compress, } mailStore, err := storage.New(storeCfg) if err != nil { diff --git a/cmd/archivmail/cmd_import_piler.go b/cmd/archivmail/cmd_import_piler.go index 336454c..8cf97d0 100644 --- a/cmd/archivmail/cmd_import_piler.go +++ b/cmd/archivmail/cmd_import_piler.go @@ -63,9 +63,10 @@ func runImportPiler(args []string) { } storeCfg := storage.Config{ - Dir: cfg.Storage.StorePath, - Keyfile: cfg.Storage.Keyfile, - DSN: cfg.Database.DSN(), + Dir: cfg.Storage.StorePath, + Keyfile: cfg.Storage.Keyfile, + DSN: cfg.Database.DSN(), + CompressEnabled: cfg.Storage.Compress, } mailStore, err := storage.New(storeCfg) if err != nil { diff --git a/cmd/archivmail/cmd_reindex.go b/cmd/archivmail/cmd_reindex.go index 4ea92fb..7e54813 100644 --- a/cmd/archivmail/cmd_reindex.go +++ b/cmd/archivmail/cmd_reindex.go @@ -30,9 +30,10 @@ func runReindex(args []string) { } storeCfg := storage.Config{ - Dir: cfg.Storage.StorePath, - Keyfile: cfg.Storage.Keyfile, - DSN: cfg.Database.DSN(), + Dir: cfg.Storage.StorePath, + Keyfile: cfg.Storage.Keyfile, + DSN: cfg.Database.DSN(), + CompressEnabled: cfg.Storage.Compress, } mailStore, err := storage.New(storeCfg) if err != nil { diff --git a/cmd/archivmail/main.go b/cmd/archivmail/main.go index 0277c4b..bb1bc91 100644 --- a/cmd/archivmail/main.go +++ b/cmd/archivmail/main.go @@ -105,10 +105,11 @@ func main() { // Storage with encryption + DB metadata storeCfg := storage.Config{ - Dir: cfg.Storage.StorePath, - Keyfile: cfg.Storage.Keyfile, - DSN: cfg.Database.DSN(), - RetentionDays: cfg.Storage.RetentionDays, + Dir: cfg.Storage.StorePath, + Keyfile: cfg.Storage.Keyfile, + DSN: cfg.Database.DSN(), + RetentionDays: cfg.Storage.RetentionDays, + CompressEnabled: cfg.Storage.Compress, } mailStore, err := storage.New(storeCfg) if err != nil { diff --git a/config/config.go b/config/config.go index 79f4374..c8b065e 100644 --- a/config/config.go +++ b/config/config.go @@ -63,9 +63,9 @@ type SMTPOutConfig struct { type StorageConfig struct { StorePath string `yaml:"store_path"` AStorePath string `yaml:"astore_path"` - XapianPath string `yaml:"xapian_path"` Keyfile string `yaml:"keyfile"` RetentionDays int `yaml:"retention_days"` // 0 = kein Lock (GoBD-Compliance: z.B. 3650 für 10 Jahre) + Compress bool `yaml:"compress"` // gzip-Kompression vor AES-256-GCM (spart ~40-60% Disk) } // DatabaseConfig holds PostgreSQL connection settings. diff --git a/features/INDEX.md b/features/INDEX.md index bcd4800..825495e 100644 --- a/features/INDEX.md +++ b/features/INDEX.md @@ -52,7 +52,9 @@ | PROJ-34 | Retention-Policy + Löschsperre (GoBD-Compliance) | Deployed | [PROJ-34](PROJ-34-retention-policy.md) | 2026-03-31 | | PROJ-35 | OCR & Anhang-Volltext-Indexierung | Planned | [PROJ-35](PROJ-35-ocr-anhang-volltext.md) | 2026-04-04 | +| PROJ-36 | gzip-Kompression + storage_objects-Tabelle | In Progress | [PROJ-36](PROJ-36-compression-storage-objects.md) | 2026-04-05 | +| PROJ-37 | Attachment-Deduplication (Hash-basiert) | In Progress | [PROJ-37](PROJ-37-attachment-deduplication.md) | 2026-04-05 | -## Next Available ID: PROJ-35 +## Next Available ID: PROJ-38 diff --git a/internal/storage/attachments.go b/internal/storage/attachments.go new file mode 100644 index 0000000..ca0e24e --- /dev/null +++ b/internal/storage/attachments.go @@ -0,0 +1,109 @@ +package storage + +import ( + "context" + "crypto/sha256" + "fmt" + "os" + "path/filepath" + + "github.com/archivmail/pkg/mailparser" +) + +// saveAttachments deduplicates and stores attachments from a parsed email. +// Each unique attachment (by SHA-256 hash) is stored once on disk. +// email_attachments links attachments to their email record. +func (s *Store) saveAttachments(ctx context.Context, emailID string, pm *mailparser.ParsedMail) error { + if s.db == nil || len(pm.Attachments) == 0 { + return nil + } + + for _, att := range pm.Attachments { + if len(att.Data) == 0 { + continue + } + + sum := sha256.Sum256(att.Data) + hash := fmt.Sprintf("%x", sum[:]) + + // Check if this attachment is already stored + var attID int64 + err := s.db.QueryRow(ctx, `SELECT id FROM attachments WHERE hash = $1`, hash).Scan(&attID) + if err != nil { + // Not found — compress and store + toWrite := att.Data + compression := "none" + if s.compressEnabled { + compressed, cerr := compressGzip(att.Data) + if cerr == nil && len(compressed) < len(att.Data) { + toWrite = compressed + compression = "gzip" + } + } + + attPath := s.attachmentPath(hash) + if err := os.MkdirAll(filepath.Dir(attPath), 0o755); err != nil { + return fmt.Errorf("storage: attachment mkdir: %w", err) + } + if _, statErr := os.Stat(attPath); os.IsNotExist(statErr) { + if err := os.WriteFile(attPath, toWrite, 0o644); err != nil { + return fmt.Errorf("storage: attachment write: %w", err) + } + } + + // Register in storage_objects + var soID int64 + soErr := s.db.QueryRow(ctx, ` + INSERT INTO storage_objects (storage_type, path, compression, size_original, size_compressed, checksum) + VALUES ('filesystem', $1, $2, $3, $4, $5) + RETURNING id + `, attPath, compression, int64(len(att.Data)), int64(len(toWrite)), hash).Scan(&soID) + + // Insert attachment record + var insertErr error + if soErr == nil { + insertErr = s.db.QueryRow(ctx, ` + INSERT INTO attachments (filename, mime_type, size_bytes, hash, storage_id) + VALUES ($1, $2, $3, $4, $5) + ON CONFLICT (hash) DO UPDATE SET hash = EXCLUDED.hash + RETURNING id + `, att.Filename, att.ContentType, int64(len(att.Data)), hash, soID).Scan(&attID) + } else { + insertErr = s.db.QueryRow(ctx, ` + INSERT INTO attachments (filename, mime_type, size_bytes, hash) + VALUES ($1, $2, $3, $4) + ON CONFLICT (hash) DO UPDATE SET hash = EXCLUDED.hash + RETURNING id + `, att.Filename, att.ContentType, int64(len(att.Data)), hash).Scan(&attID) + } + if insertErr != nil { + continue // non-fatal: mail is saved, attachment linking is best-effort + } + } + + // Link attachment to email + _, _ = s.db.Exec(ctx, ` + INSERT INTO email_attachments (email_id, attachment_id) + VALUES ($1, $2) + ON CONFLICT DO NOTHING + `, emailID, attID) + } + + return nil +} + +// LoadAttachment reads and decompresses an attachment by its SHA-256 hash. +func (s *Store) LoadAttachment(hash string) ([]byte, error) { + path := s.attachmentPath(hash) + data, err := os.ReadFile(path) + if err != nil { + return nil, fmt.Errorf("storage: attachment not found: %s", hash) + } + return maybeDecompress(data) +} + +// attachmentPath returns the on-disk path for a given attachment hash. +// Uses 2-level 2-char prefix sharding: {dir}/attachments/{ab}/{cd}/{hash} +func (s *Store) attachmentPath(hash string) string { + return filepath.Join(s.dir, "attachments", hash[:2], hash[2:4], hash) +} diff --git a/internal/storage/compress.go b/internal/storage/compress.go new file mode 100644 index 0000000..281fc48 --- /dev/null +++ b/internal/storage/compress.go @@ -0,0 +1,56 @@ +package storage + +import ( + "bytes" + "compress/gzip" + "fmt" + "io" +) + +// Compression magic byte — prepended to compressed content before encryption. +// RFC 2822 email headers always start with printable ASCII (>= 0x20), +// so 0x01 is safe as a marker and will never appear in legacy uncompressed files. +const ( + magicGzip = byte(0x01) +) + +// compressGzip compresses data with gzip and prepends the magic byte. +// Returns an error only on internal gzip failure (effectively never). +func compressGzip(data []byte) ([]byte, error) { + var buf bytes.Buffer + buf.WriteByte(magicGzip) + w, err := gzip.NewWriterLevel(&buf, gzip.BestCompression) + if err != nil { + return nil, fmt.Errorf("storage: gzip writer: %w", err) + } + if _, err := w.Write(data); err != nil { + _ = w.Close() + return nil, fmt.Errorf("storage: gzip write: %w", err) + } + if err := w.Close(); err != nil { + return nil, fmt.Errorf("storage: gzip close: %w", err) + } + return buf.Bytes(), nil +} + +// maybeDecompress inspects the first byte of data and decompresses if needed. +// Legacy files (no magic byte, first byte is printable ASCII) are returned as-is. +func maybeDecompress(data []byte) ([]byte, error) { + if len(data) < 2 { + return data, nil + } + if data[0] != magicGzip { + // Legacy file: no compression — return as-is + return data, nil + } + r, err := gzip.NewReader(bytes.NewReader(data[1:])) + if err != nil { + return nil, fmt.Errorf("storage: gzip reader: %w", err) + } + defer r.Close() + out, err := io.ReadAll(r) + if err != nil { + return nil, fmt.Errorf("storage: gzip decompress: %w", err) + } + return out, nil +} diff --git a/internal/storage/storage.go b/internal/storage/storage.go index 3ce4698..c92ac09 100644 --- a/internal/storage/storage.go +++ b/internal/storage/storage.go @@ -28,19 +28,21 @@ var ErrRetentionLock = errors.New("storage: mail is within retention period") // Config holds the configuration for initialising a Store. type Config struct { - Dir string // base directory for file storage - Keyfile string // path to 32-byte AES key file; empty = no encryption - DSN string // PostgreSQL DSN; empty = no DB - RetentionDays int // 0 = no lock; >0 = GoBD retention period in days + Dir string // base directory for file storage + Keyfile string // path to 32-byte AES key file; empty = no encryption + DSN string // PostgreSQL DSN; empty = no DB + RetentionDays int // 0 = no lock; >0 = GoBD retention period in days + CompressEnabled bool // gzip-compress emails and attachments before encryption } // Store is a file-based email storage with optional AES-256-GCM encryption // and optional PostgreSQL metadata. type Store struct { - dir string - key []byte // nil = no encryption - db *pgxpool.Pool // nil = no DB - retentionDays int // 0 = no lock + dir string + key []byte // nil = no encryption + db *pgxpool.Pool // nil = no DB + retentionDays int // 0 = no lock + compressEnabled bool // gzip before encryption } // StoreStats reports total mail count and size in bytes. @@ -70,7 +72,7 @@ func New(cfg Config) (*Store, error) { } } - s := &Store{dir: cfg.Dir, retentionDays: cfg.RetentionDays} + s := &Store{dir: cfg.Dir, retentionDays: cfg.RetentionDays, compressEnabled: cfg.CompressEnabled} // Load encryption key if err := s.loadKey(cfg.Keyfile); err != nil { @@ -99,6 +101,8 @@ func New(cfg Config) (*Store, error) { // PROJ-33: Stable IMAP UIDs _, _ = s.db.Exec(ctx, `ALTER TABLE emails ADD COLUMN IF NOT EXISTS uid BIGSERIAL`) _, _ = s.db.Exec(ctx, `CREATE UNIQUE INDEX IF NOT EXISTS idx_emails_uid ON emails (uid)`) + // 2.0: storage_objects FK on emails + _, _ = s.db.Exec(ctx, `ALTER TABLE emails ADD COLUMN IF NOT EXISTS storage_id BIGINT REFERENCES storage_objects(id)`) } return s, nil @@ -197,7 +201,33 @@ func (s *Store) decrypt(data []byte) ([]byte, error) { // ── Database schema ─────────────────────────────────────────────────────── func (s *Store) initSchema(ctx context.Context) error { + // storage_objects must exist before emails (FK dependency) _, err := s.db.Exec(ctx, ` + CREATE TABLE IF NOT EXISTS storage_objects ( + id BIGSERIAL PRIMARY KEY, + storage_type TEXT NOT NULL DEFAULT 'filesystem', + path TEXT NOT NULL, + compression TEXT NOT NULL DEFAULT 'none', + size_original BIGINT, + size_compressed BIGINT, + checksum CHAR(64), + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW() + ); + CREATE TABLE IF NOT EXISTS attachments ( + id BIGSERIAL PRIMARY KEY, + filename TEXT, + mime_type TEXT, + size_bytes BIGINT, + hash CHAR(64) UNIQUE NOT NULL, + storage_id BIGINT REFERENCES storage_objects(id), + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW() + ); + CREATE INDEX IF NOT EXISTS idx_attachments_hash ON attachments (hash); + `) + if err != nil { + return err + } + _, err = s.db.Exec(ctx, ` CREATE TABLE IF NOT EXISTS emails ( id TEXT PRIMARY KEY, received_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), @@ -228,6 +258,12 @@ func (s *Store) initSchema(ctx context.Context) error { ); CREATE INDEX IF NOT EXISTS idx_email_refs_tenant ON email_refs (tenant_id); CREATE INDEX IF NOT EXISTS idx_email_refs_email ON email_refs (email_id); + CREATE TABLE IF NOT EXISTS email_attachments ( + email_id TEXT NOT NULL REFERENCES emails(id), + attachment_id BIGINT NOT NULL REFERENCES attachments(id), + PRIMARY KEY (email_id, attachment_id) + ); + CREATE INDEX IF NOT EXISTS idx_email_attachments_email ON email_attachments (email_id); `) return err } @@ -290,16 +326,27 @@ func (s *Store) Save(ctx context.Context, raw []byte, _ time.Time, tenantID *int } if !fileExists { - // Determine what to write: encrypted or plaintext + // Compress before encryption (if enabled) + toStore := raw + compression := "none" + if s.compressEnabled { + compressed, cerr := compressGzip(raw) + if cerr == nil && len(compressed) < len(raw) { + toStore = compressed + compression = "gzip" + } + } + + // Encrypt (if key configured) var toWrite []byte if s.key != nil { - encrypted, err := s.encrypt(raw) + encrypted, err := s.encrypt(toStore) if err != nil { return "", err } toWrite = encrypted } else { - toWrite = raw + toWrite = toStore } if err := os.WriteFile(path, toWrite, 0o644); err != nil { @@ -308,8 +355,19 @@ func (s *Store) Save(ctx context.Context, raw []byte, _ time.Time, tenantID *int // Insert metadata into DB if s.db != nil { + // Register in storage_objects + var storageID *int64 + var sid int64 + if soErr := s.db.QueryRow(ctx, ` + INSERT INTO storage_objects (storage_type, path, compression, size_original, size_compressed, checksum) + VALUES ('filesystem', $1, $2, $3, $4, $5) + RETURNING id + `, path, compression, int64(len(raw)), int64(len(toWrite)), id).Scan(&sid); soErr == nil { + storageID = &sid + } + if parseErr == nil { - if err := s.insertMeta(ctx, id, pm, len(raw), tenantID); err != nil { + if err := s.insertMeta(ctx, id, pm, len(raw), tenantID, storageID); err != nil { // Race: another goroutine inserted via Message-ID UNIQUE conflict. // Resolve to the existing record's ID. if messageID != "" { @@ -327,7 +385,11 @@ func (s *Store) Save(ctx context.Context, raw []byte, _ time.Time, tenantID *int // Non-conflict insert error: log but continue (file is written, metadata can be backfilled) } } else { - s.insertMetaMinimal(ctx, id, len(raw), tenantID) + s.insertMetaMinimal(ctx, id, len(raw), tenantID, storageID) + } + // Sprint 2: deduplicate and store attachments + if parseErr == nil { + _ = s.saveAttachments(ctx, id, pm) } // PROJ-34: Set retention lock. // Mandanten-Mails: nur wenn der Mandant explizit retention_days > 0 gesetzt hat. @@ -390,14 +452,14 @@ func (s *Store) Load(id string) ([]byte, error) { if s.key != nil { plaintext, err := s.decrypt(data) if err != nil { - // If decryption fails, the file might be stored unencrypted - // (pre-encryption era). Return as-is for backwards compatibility. - return data, nil + // Pre-encryption era: file stored unencrypted — try decompression anyway. + out, _ := maybeDecompress(data) + return out, nil } - return plaintext, nil + data = plaintext } - return data, nil + return maybeDecompress(data) } // Delete removes a stored email by its ID, including its DB metadata row. @@ -574,7 +636,7 @@ func (s *Store) firstAndLastFromFS() (first, last *MailRef, err error) { // insertMeta inserts parsed email metadata into the emails table. // Returns an error so the caller can detect UNIQUE-constraint conflicts on message_id. -func (s *Store) insertMeta(ctx context.Context, id string, pm *mailparser.ParsedMail, size int, tenantID *int64) error { +func (s *Store) insertMeta(ctx context.Context, id string, pm *mailparser.ParsedMail, size int, tenantID *int64, storageID *int64) error { mailTo := strings.Join(pm.To, ", ") hasAttach := len(pm.Attachments) > 0 @@ -588,20 +650,20 @@ func (s *Store) insertMeta(ctx context.Context, id string, pm *mailparser.Parsed receivedAt = time.Now() } _, err := s.db.Exec(ctx, ` - INSERT INTO emails (id, received_at, mail_from, mail_to, subject, size_bytes, has_attach, tenant_id, message_id) - VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9) + INSERT INTO emails (id, received_at, mail_from, mail_to, subject, size_bytes, has_attach, tenant_id, message_id, storage_id) + VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10) ON CONFLICT (id) DO NOTHING - `, id, receivedAt, pm.From, mailTo, pm.Subject, int64(size), hasAttach, tenantID, msgID) + `, id, receivedAt, pm.From, mailTo, pm.Subject, int64(size), hasAttach, tenantID, msgID, storageID) return err } // insertMetaMinimal inserts minimal metadata when parsing fails. -func (s *Store) insertMetaMinimal(ctx context.Context, id string, size int, tenantID *int64) { +func (s *Store) insertMetaMinimal(ctx context.Context, id string, size int, tenantID *int64, storageID *int64) { _, _ = s.db.Exec(ctx, ` - INSERT INTO emails (id, received_at, size_bytes, tenant_id) - VALUES ($1, NOW(), $2, $3) + INSERT INTO emails (id, received_at, size_bytes, tenant_id, storage_id) + VALUES ($1, NOW(), $2, $3, $4) ON CONFLICT (id) DO NOTHING - `, id, int64(size), tenantID) + `, id, int64(size), tenantID, storageID) } // SaveMeta upserts metadata for a given email ID. Used by the backfill process.