Skip to content

Commit 87d9434

Browse files
committed
Improve PostgreSQL backup reliability with retries and better error handling
1 parent 8159256 · commit 87d9434

1 file changed

Lines changed: 70 additions & 17 deletions

File tree

scripts/backup-functions.sh

Lines changed: 70 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -398,12 +398,38 @@ EOF
398398

399399
# Reload PostgreSQL configuration
400400
log "INFO" "Reloading PostgreSQL configuration..."
401-
su-exec postgres pg_ctl reload -D "$pgdata"
402-
401+
if ! su-exec postgres pg_ctl reload -D "$pgdata"; then
402+
log "ERROR" "Failed to reload PostgreSQL configuration"
403+
return 1
404+
fi
405+
406+
# Wait for configuration to take effect
407+
log "INFO" "Waiting for configuration reload to take effect..."
408+
sleep 10
409+
403410
# Verify the archive_command was updated
404411
log "INFO" "Verifying archive_command configuration..."
405-
archive_cmd_check=$(su-exec postgres psql -d "$pg_database" -t -c "SHOW archive_command;" 2>/dev/null | sed 's/^[ \t]*//;s/[ \t]*$//')
406-
log "INFO" "Current archive_command: $archive_cmd_check"
412+
local max_retries=5
413+
local retry_count=0
414+
local archive_cmd_check=""
415+
416+
while [ $retry_count -lt $max_retries ]; do
417+
archive_cmd_check=$(su-exec postgres psql -d "$pg_database" -t -c "SHOW archive_command;" 2>/dev/null | sed 's/^[ \t]*//;s/[ \t]*$//')
418+
if [[ "$archive_cmd_check" == *"pgbackrest"* ]]; then
419+
log "INFO" "Archive command updated successfully: $archive_cmd_check"
420+
break
421+
else
422+
log "WARN" "Archive command not yet updated (attempt $((retry_count + 1))/$max_retries): $archive_cmd_check"
423+
sleep 5
424+
retry_count=$((retry_count + 1))
425+
fi
426+
done
427+
428+
if [[ "$archive_cmd_check" != *"pgbackrest"* ]]; then
429+
log "ERROR" "Archive command was not updated after $max_retries attempts"
430+
log "ERROR" "Current archive_command: $archive_cmd_check"
431+
return 1
432+
fi
407433

408434
# Test archive-push manually with a dummy WAL file
409435
log "INFO" "Testing archive-push functionality..."
@@ -427,23 +453,50 @@ EOF
427453

428454
# Force a WAL switch to trigger archiving
429455
log "INFO" "Forcing WAL switch to trigger archiving..."
430-
su-exec postgres psql -d "$pg_database" -c "SELECT pg_switch_wal();" || true
431-
432-
# Wait a moment for archiving to complete
433-
sleep 5
456+
if ! su-exec postgres psql -d "$pg_database" -c "SELECT pg_switch_wal();"; then
457+
log "WARN" "Failed to force WAL switch, but continuing..."
458+
fi
459+
460+
# Wait longer for archiving to complete
461+
log "INFO" "Waiting for WAL archiving to complete..."
462+
sleep 15
434463

435464
# Check if any WAL files have been archived
436465
log "INFO" "Checking for archived WAL files..."
437-
if [ -d "/var/lib/pgbackrest/archive/${stanza_name}" ]; then
438-
archived_count=$(find "/var/lib/pgbackrest/archive/${stanza_name}" -name "*.gz" -o -name "*.lz4" -o -name "*.xz" -o -name "*.bz2" -o -name "*-*" | wc -l)
439-
log "INFO" "Found ${archived_count} archived WAL files"
440-
if [ "$archived_count" -gt 0 ]; then
441-
log "INFO" "WAL archiving is working correctly"
442-
else
443-
log "WARN" "No archived WAL files found yet"
466+
local archive_dir="/var/lib/pgbackrest/archive/${stanza_name}"
467+
local max_wait=60
468+
local wait_time=0
469+
local archived_count=0
470+
471+
while [ $wait_time -lt $max_wait ]; do
472+
if [ -d "$archive_dir" ]; then
473+
archived_count=$(find "$archive_dir" -type f \( -name "*.gz" -o -name "*.lz4" -o -name "*.xz" -o -name "*.bz2" -o -name "*-*" \) | wc -l)
474+
if [ "$archived_count" -gt 0 ]; then
475+
log "INFO" "Found ${archived_count} archived WAL files"
476+
log "INFO" "WAL archiving is working correctly"
477+
break
478+
fi
444479
fi
445-
else
446-
log "WARN" "WAL archive directory does not exist"
480+
481+
if [ $wait_time -eq 0 ]; then
482+
log "INFO" "Waiting for WAL files to be archived..."
483+
fi
484+
485+
sleep 5
486+
wait_time=$((wait_time + 5))
487+
488+
if [ $((wait_time % 15)) -eq 0 ]; then
489+
log "INFO" "Still waiting for WAL archiving... (${wait_time}s/${max_wait}s)"
490+
fi
491+
done
492+
493+
if [ "$archived_count" -eq 0 ]; then
494+
log "WARN" "No archived WAL files found after ${max_wait} seconds"
495+
log "WARN" "This may cause backup failures. Check PostgreSQL logs for archive errors."
496+
497+
# Show PostgreSQL log for debugging
498+
log "INFO" "Recent PostgreSQL log entries:"
499+
tail -20 "$pgdata/log/"*.log 2>/dev/null || log "WARN" "Could not read PostgreSQL logs"
447500
fi
448501

449502
return 0

0 commit comments

Comments
 (0)