-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdb_tune_recommendations.sql
More file actions
271 lines (256 loc) · 13.3 KB
/
db_tune_recommendations.sql
File metadata and controls
271 lines (256 loc) · 13.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
-- ==============================================================================
-- MPC/SBN Database Tuning Recommendations
-- ==============================================================================
-- Host: MPC/SBN replica (RHEL 8.6 VM, 251 GB RAM, HDD)
-- Database: mpc_sbn (PostgreSQL 15.2, 446 GB, logical replica)
-- Generated: 2026-02-08
--
-- All current settings are at PostgreSQL defaults except max_connections,
-- shared_buffers (128 MB!), and WAL sizes. This database has never been
-- tuned for its 446 GB / 526M-row workload despite having 251 GB RAM.
--
-- INSTRUCTIONS:
-- 1. Determine VM RAM (free -h)
-- 2. Edit postgresql.conf with the values below
-- 3. Run the ALTER TABLE commands for per-table autovacuum settings
-- 4. Restart PostgreSQL for postgresql.conf changes
-- 5. Run the one-time manual VACUUM commands
--
-- ==============================================================================
-- ==============================================================================
-- FINDING 1: shared_buffers = 128 MB (default) for a 446 GB database
-- ==============================================================================
--
-- This is the single most impactful setting. PostgreSQL default is 128 MB
-- regardless of database size. Nearly every query hits disk.
--
-- EDIT postgresql.conf:
--
-- shared_buffers = '64GB' -- 25% of 251 GB RAM.
-- effective_cache_size = '192GB' -- 75% of RAM (planner hint, not allocation).
-- -- OS buff/cache is currently 242 GB, so
-- -- this is conservative.
--
-- Also set these related memory parameters:
--
-- work_mem = '128MB' -- Currently 4 MB (default); used per sort/hash
-- -- operation in queries. With 251 GB RAM and
-- -- max_connections=100, worst case is 12.5 GB.
--
-- wal_buffers = '64MB' -- Currently 4 MB (auto from shared_buffers).
-- -- Helps write-heavy replication apply.
--
-- huge_pages = 'on' -- Recommended for 64 GB shared_buffers.
-- -- On RHEL 8, configure OS first:
-- -- echo 'vm.nr_hugepages = 33000' > /etc/sysctl.d/postgres.conf
-- -- echo 'vm.hugetlb_shm_group = <postgres_gid>' >> /etc/sysctl.d/postgres.conf
-- -- sysctl --system
--
-- REQUIRES: PostgreSQL restart (sudo systemctl restart postgresql-15)
-- ==============================================================================
-- FINDING 2: maintenance_work_mem = 64 MB for VACUUM on 239 GB table
-- ==============================================================================
--
-- maintenance_work_mem controls how much memory VACUUM and CREATE INDEX can
-- use. At 64 MB, vacuuming obs_sbn is agonizingly slow — the dead-tuple array
-- (6 bytes per tuple ID) holds only ~11M references per pass, requiring roughly
-- eight index-cleanup passes to process 82M dead rows.
--
-- EDIT postgresql.conf:
--
-- maintenance_work_mem = '4GB' -- For manual VACUUM and index builds.
-- -- Note: VACUUM caps its dead-tuple array at
-- -- 1 GB internally (~178M refs per pass) —
-- -- well over the 82M dead rows in obs_sbn,
-- -- so one pass suffices. The full 4 GB still
-- -- benefits CREATE INDEX and REINDEX.
--
-- autovacuum_work_mem = '2GB' -- Separate limit for autovacuum workers.
-- -- Currently -1 (inherits maintenance_work_mem).
-- -- Setting it independently prevents a manual
-- -- VACUUM from starving autovacuum or vice versa.
-- -- With 3 workers: 3 * 2 GB = 6 GB worst case.
-- -- Negligible with 251 GB RAM.
--
-- REQUIRES: PostgreSQL restart for maintenance_work_mem.
-- autovacuum_work_mem can be changed with reload (no restart).
-- ==============================================================================
-- FINDING 3: obs_sbn not vacuumed in 186 days (82M dead rows, 15.5%)
-- ==============================================================================
--
-- The default autovacuum threshold is: 50 + 0.2 * n_live_tup
-- For obs_sbn: 50 + 0.2 * 526M = 105.3M dead rows needed to trigger.
-- Current dead rows: 82M (77.6% of threshold). At current accumulation
-- rate it could be months before autovacuum triggers.
--
-- FIX: Two actions — immediate manual vacuum, then lower the threshold.
--
-- STEP A: Per-table autovacuum settings (run as database owner, no restart)
ALTER TABLE obs_sbn SET (
    -- Vacuum: fire at 1M + 5% of live tuples instead of the default 50 + 20%
    autovacuum_vacuum_threshold     = 1000000,
    autovacuum_vacuum_scale_factor  = 0.05,
    -- Analyze: refresh planner stats at 500K + 2% changed instead of 50 + 10%
    autovacuum_analyze_threshold    = 500000,
    autovacuum_analyze_scale_factor = 0.02,
    -- Disable cost-based throttling so autovacuum runs at full I/O speed here
    autovacuum_vacuum_cost_delay    = 0
);
-- autovacuum_vacuum_cost_delay = 0 is aggressive but appropriate for obs_sbn:
-- the default 2ms delay per 200 cost units makes autovacuum process only
-- ~80 MB/s, requiring ~50 minutes just to scan 239 GB before any cleanup.
-- With delay=0, autovacuum uses full I/O bandwidth and finishes much faster.
-- On a dedicated VM with no user-facing latency requirements, this is safe.
--
-- New trigger threshold: 1,000,000 + 0.05 * 526M = ~27.3M dead rows
-- (instead of the current 105.3M).
-- STEP B: Increase global autovacuum cost limits (in postgresql.conf)
--
-- autovacuum_vacuum_cost_delay = '0' -- Currently 2ms (default).
-- autovacuum_vacuum_cost_limit = '2000' -- Currently 200 (default via
-- -- vacuum_cost_limit). Shared across
-- -- all workers. 2000 = 10x throughput.
--
-- ALTERNATIVE: instead of setting cost_delay=0 globally, set it only on
-- obs_sbn (via ALTER TABLE above) and leave the global at 2ms for smaller
-- tables where aggressive vacuum is unnecessary.
--
-- STEP C: One-time manual VACUUM (run after postgresql.conf changes are applied)
--
-- IMPORTANT: Run this AFTER increasing maintenance_work_mem to 4 GB (Finding 2).
-- At 64 MB, this would take many passes and hours. At 4 GB, one pass.
--
-- Run from psql as database owner (not in a transaction block):
--
-- VACUUM (VERBOSE) obs_sbn;
--
-- Expected duration: 30-90 minutes depending on I/O speed.
-- This is a regular VACUUM, not VACUUM FULL — it does NOT lock the table
-- and does NOT rewrite the table. Replication continues during vacuum.
-- The VERBOSE flag shows progress.
--
-- DO NOT run VACUUM FULL — it rewrites the entire table (239 GB),
-- requires an ACCESS EXCLUSIVE lock, and would halt replication.
-- ==============================================================================
-- FINDING 4: Five tables not vacuumed in 367+ days
-- ==============================================================================
--
-- These tables have low dead-tuple churn relative to their size, so the
-- default 20% threshold never triggers:
--
-- obs_alterations_deletions 1.47M rows, 79K dead (5.4%) 367 days
-- neocp_events 310K rows, 114 dead (0.0%) 367 days
-- neocp_obs_archive 777K rows, 42 dead (0.0%) 367 days
-- numbered_identifications 876K rows, 360 dead (0.0%) 367 days
-- current_identifications 2.04M rows, 57K dead (2.8%) 367 days
--
-- Also stale:
-- primary_objects 1.55M rows, 2.7K dead (0.2%) 214 days
-- obs_alterations_unassociations 536K rows, 89 dead (0.0%) 164 days
--
-- Even with few dead tuples, periodic vacuuming is needed to:
-- - Update the visibility map (enables index-only scans)
-- - Prevent transaction ID wraparound (autovacuum_freeze_max_age)
-- - Keep planner statistics fresh (via auto-analyze)
--
-- FIX: Lower the global scale factor (in postgresql.conf):
--
-- autovacuum_vacuum_scale_factor = '0.05' -- 5% instead of 20%
-- autovacuum_analyze_scale_factor = '0.02' -- 2% instead of 10%
--
-- This makes autovacuum more responsive for all tables. For a dedicated
-- database VM, the modest I/O increase is negligible.
--
-- For the immediate backlog, run manual vacuums (as database owner):
--
-- VACUUM (VERBOSE) obs_alterations_deletions;
-- VACUUM (VERBOSE) current_identifications;
-- VACUUM (VERBOSE) primary_objects;
-- VACUUM (VERBOSE) numbered_identifications;
-- VACUUM (VERBOSE) neocp_obs_archive;
-- VACUUM (VERBOSE) neocp_events;
-- VACUUM (VERBOSE) obs_alterations_unassociations;
--
-- These are small tables (< 350 MB each) — each should complete in seconds.
-- Run ANALYZE afterward to refresh planner statistics:
--
-- ANALYZE obs_alterations_deletions;
-- ANALYZE current_identifications;
-- ANALYZE primary_objects;
-- ANALYZE numbered_identifications;
-- ANALYZE neocp_obs_archive;
-- ANALYZE neocp_events;
-- ANALYZE obs_alterations_unassociations;
-- ==============================================================================
-- ADDITIONAL: Storage-type tuning
-- ==============================================================================
--
-- The replica uses spinning disks (lsblk ROTA=1). Keep defaults:
-- random_page_cost = 4 (already default — correct for HDD)
-- effective_io_concurrency = 1 (already default — correct for HDD)
--
-- If storage is ever migrated to SSD, change to:
-- random_page_cost = 1.1
-- effective_io_concurrency = 200
-- ==============================================================================
-- SUMMARY: postgresql.conf changes for the MPC/SBN replica
-- ==============================================================================
--
-- VM: RHEL 8.6, 251 GB RAM, spinning disks (HDD)
-- Current state: 242 GB in OS page cache, 1.5 GB used by processes
--
-- The OS page cache is already providing excellent read buffering (242 GB),
-- which is why the database functions despite 128 MB shared_buffers.
-- However, PostgreSQL-managed shared_buffers are more efficient: they
-- respect buffer invalidation, track dirty pages, and avoid double-caching.
-- With 251 GB RAM there is ample headroom.
--
-- ## Memory
-- shared_buffers = '64GB' # 25% of 251 GB RAM
-- # PostgreSQL docs recommend 25%
-- # With HDD, large shared_buffers
-- # reduces random reads significantly
-- effective_cache_size = '192GB' # 75% of RAM — tells planner how
-- # much OS cache to expect (not an
-- # allocation). 242 GB is currently
-- # in buff/cache, so 192 GB is
-- # conservative.
-- work_mem = '128MB' # Currently 4 MB. With 251 GB RAM
-- # and max_connections=100, worst
-- # case is 12.5 GB. Eliminates
-- # sort/hash spills to disk for
-- # complex queries.
-- maintenance_work_mem = '4GB' # Currently 64 MB. Allows VACUUM
-- # to process obs_sbn's 82M dead
-- # rows in a single pass.
-- autovacuum_work_mem = '2GB' # Per autovacuum worker.
-- # 3 workers * 2 GB = 6 GB max.
-- wal_buffers = '64MB' # Currently 4 MB. Helps write
-- # throughput during replication
-- # apply batches.
--
-- ## Autovacuum (more responsive)
-- autovacuum_vacuum_scale_factor = 0.05 # Trigger at 5% dead (was 20%)
-- autovacuum_analyze_scale_factor = 0.02 # Re-analyze at 2% changed
-- autovacuum_vacuum_cost_limit = 2000 # 10x default throughput
-- # With HDD, autovacuum I/O cost
-- # is real; 2000 is aggressive but
-- # appropriate for a dedicated VM
-- # with no user-facing latency.
--
-- ## WAL / Checkpoint
-- max_wal_size = '4GB' # Currently 1 GB. Larger WAL
-- # allows longer intervals between
-- # checkpoints, reducing I/O spikes.
-- # Important with HDD.
-- checkpoint_completion_target = 0.9 # Already set (good)
--
-- ## HDD — keep defaults:
-- # random_page_cost = 4 (already correct)
-- # effective_io_concurrency = 1 (already correct)
--
-- ## Huge pages (optional, reduces TLB misses for 64 GB shared_buffers)
-- # In /etc/sysctl.d/postgres.conf:
-- # vm.nr_hugepages = 33000 # 64 GB / 2 MB per page + margin
-- # vm.hugetlb_shm_group = <postgres_gid>
-- # Then: sysctl --system
-- # huge_pages = 'on' in postgresql.conf (change from 'try' to 'on')
--
-- REQUIRES: sudo systemctl restart postgresql-15
--
-- After restart, run the ALTER TABLE and manual VACUUM/ANALYZE commands above.