0.系统初始优化

#!/bin/bash
set -e

# ------------------------------------------
# Configuration (the only section that should need editing)
# ------------------------------------------
# Cluster nodes: host name -> IP address.
declare -A NODES
NODES["db1"]="10.0.0.101"
NODES["db2"]="10.0.0.102"
NODES["db3"]="10.0.0.103"
NODES["ha1"]="10.0.0.104"
NODES["ha2"]="10.0.0.105"
# Default password given to the postgres OS account.
POSTGRES_PASS='postgres123'
# ------------------------------------------

# ANSI colour escapes, stored unexpanded; `echo -e` interprets them on output.
NC='\033[0m'
RED='\033[0;31m'
YELLOW='\033[1;33m'
GREEN='\033[0;32m'

# Print an informational message prefixed with a green check mark.
log_info() { echo -e "${GREEN}[✓]${NC} $1"; }
# Print a warning message prefixed with a yellow exclamation mark.
log_warn() { echo -e "${YELLOW}[!]${NC} $1"; }
# Print an error message prefixed with a red cross, then exit the script with status 1.
log_error() { echo -e "${RED}[✗]${NC} $1"; exit 1; }

# Copy $1 to "$1.backup.<YYYYmmdd_HHMMSS>" if it exists as a regular file.
# Arguments: $1 - path of the file to back up
# Returns:   0 when the file is missing (nothing to back up) or was copied;
#            cp's status when the copy itself fails.
# A missing file must not yield a non-zero status: the script runs under
# `set -e`, so a failing `[ -f ] && ...` chain as the function's last command
# would terminate the whole script without any message.
backup_file() {
    local target=$1
    if [ ! -f "$target" ]; then
        return 0
    fi
    cp -- "$target" "${target}.backup.$(date +%Y%m%d_%H%M%S)" || return
    log_info "Backup: $target"
}

# Command-line handling. The SSH-related flags are still accepted for
# compatibility, but SSH setup is no longer performed by this script.
FORCE_SSH=""
AUTO_REBOOT=true
until [[ "$#" -eq 0 ]]; do
    case "$1" in
        --no-reboot)
            AUTO_REBOOT=false
            ;;
        --force-ssh)
            log_warn "Parameter --force-ssh is deprecated and ignored."
            ;;
        --skip-ssh)
            log_warn "SSH setup is already disabled by default."
            ;;
        *)
            log_error "Unknown parameter: $1. Use --no-reboot"
            ;;
    esac
    shift
done

# Refuse to continue unless running as root
if [ $EUID -ne 0 ]; then
    log_error "Must run as root"
fi

# ==========================================
# 1. Auto-detect this node's IP and role
# ==========================================
log_info "Auto-detecting node role..."
# Every global-scope IPv4 address configured on this host, one per line.
CURRENT_IPS=$(ip -o -4 addr show scope global | awk '{print $4}' | cut -d/ -f1)

HOSTNAME=""
DETECTED_IP=""
NODE_TYPE="unknown"

for node in "${!NODES[@]}"; do
    node_ip=${NODES[$node]}
    # -F -x: compare as a literal, whole-line string so the dots of the address
    # are not treated as regex wildcards.
    if grep -Fxq -- "$node_ip" <<< "$CURRENT_IPS"; then
        HOSTNAME=$node
        DETECTED_IP=$node_ip
        # Names starting with "db" are database nodes; everything else is haproxy
        if [[ "$node" == db* ]]; then
            NODE_TYPE="database"
        else
            NODE_TYPE="haproxy"
        fi
        break
    fi
done

# No configured node IP is present on this machine: abort
if [ -z "$HOSTNAME" ]; then
    log_error "IP not in list! Current IPs: $CURRENT_IPS. Check script config."
fi

log_info "Detected: ${HOSTNAME} (${DETECTED_IP}) - Type: ${NODE_TYPE}"

# ==========================================
# 2. SSH trust decision (permanently disabled)
# ==========================================
SETUP_SSH=false
log_warn "SSH trust setup is disabled by default."

# ==========================================
# 3. Base system configuration
# ==========================================
log_info "Starting deployment..."

# 3.1 Hostname
hostnamectl set-hostname "$HOSTNAME"
log_info "Hostname set"

# 3.2 /etc/hosts: drop the block written by a previous run, then append a
#     fresh block with one "IP name" line per configured node
backup_file /etc/hosts
sed -i '/# Auto-generated by deploy_prep/,/# End of auto-generated/d' /etc/hosts
{
    echo "# Auto-generated by deploy_prep"
    for node in "${!NODES[@]}"; do
        echo "${NODES[$node]} $node"
    done
    echo "# End of auto-generated"
} >> /etc/hosts
log_info "Hosts file updated"

# 3.3 Turn off the firewall and AppArmor (failures are tolerated)
ufw disable || true
for apparmor_action in stop disable; do
    systemctl "$apparmor_action" apparmor 2>/dev/null || true
done
log_warn "Firewall/AppArmor temp disabled"

# 3.4 Install dependencies (sshpass has been removed from the list)
apt update
build_deps=(
    build-essential libreadline-dev zlib1g-dev libxml2-dev libxslt1-dev
    libssl-dev libpam0g-dev libsystemd-dev git curl chrony
)
apt install -y "${build_deps[@]}"
log_info "Dependencies installed"

# ==========================================
# 4. System tuning (kernel, I/O, time)
# ==========================================
# /proc/meminfo reports MemTotal in kB. The multiplication is done in shell
# integer arithmetic so the byte count never goes through awk's floating-point
# number formatting (some awk builds print large values in exponent notation).
total_mem_kb=$(awk '/^MemTotal:/ {print $2}' /proc/meminfo)
total_mem_bytes=$((total_mem_kb * 1024))
page_size=$(getconf PAGE_SIZE)
cpu_cores=$(nproc)
total_mem_mb=$((total_mem_bytes / 1024**2))
log_info "Hardware: ${total_mem_mb}MB RAM, $cpu_cores Cores"

# shmmax: half of RAM in bytes; shmall: all of RAM expressed in pages
shmmax=$((total_mem_bytes / 2))
shmall=$((total_mem_bytes / page_size))

# The settings are written (not appended) to a dedicated drop-in file, so
# running the script again replaces them instead of stacking duplicate copies
# at the end of /etc/sysctl.conf.
sysctl_dropin=/etc/sysctl.d/99-postgresql.conf
cat > "$sysctl_dropin" <<EOF
# PostgreSQL Optimizations
kernel.shmmax = $shmmax
kernel.shmall = $shmall
kernel.shmmni = 4096
kernel.sem = 250 256000 32 1024
net.core.rmem_max = 16777216
net.core.wmem_max = 16777216
net.ipv4.tcp_wmem = 4096 65536 16777216
net.ipv4.tcp_rmem = 4096 87380 16777216
vm.swappiness = 1
vm.dirty_ratio = 15
vm.dirty_background_ratio = 5
EOF
sysctl -p "$sysctl_dropin"

# Force RemoveIPC=no in logind.conf whether the existing line is commented
# out or not and whatever value it currently carries.
sed -i -E 's/^#?[[:space:]]*RemoveIPC=.*/RemoveIPC=no/' /etc/systemd/logind.conf
systemctl restart systemd-logind

# Kernel command line: add numa=off once, then regenerate the GRUB config
backup_file /etc/default/grub
grep -q "numa=off" /etc/default/grub || {
    sed -i 's/GRUB_CMDLINE_LINUX="/GRUB_CMDLINE_LINUX="numa=off /' /etc/default/grub
    update-grub
}

# I/O scheduler rules: "none" for NVMe and non-rotational sdX devices,
# "mq-deadline" for rotational sdX devices
cat > /etc/udev/rules.d/60-postgresql-io.rules <<'EOF'
ACTION=="add|change", KERNEL=="nvme[0-9]*", ATTR{queue/scheduler}="none"
ACTION=="add|change", KERNEL=="sd[a-z]", ATTR{queue/rotational}=="0", ATTR{queue/scheduler}="none"
ACTION=="add|change", KERNEL=="sd[a-z]", ATTR{queue/rotational}=="1", ATTR{queue/scheduler}="mq-deadline"
EOF
udevadm control --reload-rules && udevadm trigger

# Time sync: replace existing "server" lines with the two Aliyun NTP servers
# NOTE(review): only lines starting with "server" are removed; any "pool"
# directives already in chrony.conf stay in place - confirm that is intended.
backup_file /etc/chrony/chrony.conf
sed -i '/^server/d' /etc/chrony/chrony.conf
printf 'server %s iburst\n' ntp1.aliyun.com ntp2.aliyun.com >> /etc/chrony/chrony.conf
timedatectl set-timezone Asia/Shanghai
systemctl enable --now chrony
chronyc -a makestep

# Resource limits for the postgres account:
#   nofile = 2 x RAM in MB, clamped to [16384, 65536]
#   nproc  = 1024 x CPU cores, clamped to [8192, 32768]
nofile=$((total_mem_mb * 2))
if ((nofile > 65536)); then
    nofile=65536
elif ((nofile < 16384)); then
    nofile=16384
fi
nproc=$((cpu_cores * 1024))
if ((nproc > 32768)); then
    nproc=32768
elif ((nproc < 8192)); then
    nproc=8192
fi
printf 'postgres %s %s %s\n' \
    soft nofile "$nofile" \
    hard nofile "$nofile" \
    soft nproc "$nproc" \
    hard nproc "$nproc" > /etc/security/limits.d/postgres.conf
log_info "System optimized"

# ==========================================
# 5. postgres OS account (SSH trust is no longer configured here)
# ==========================================
log_info "Setting up Postgres user..."

if id "postgres" &>/dev/null; then
    log_info "User 'postgres' already exists."
    # Re-apply the configured password to the existing account
    chpasswd <<< "postgres:$POSTGRES_PASS"
else
    useradd -m -s /bin/bash postgres
    chpasswd <<< "postgres:$POSTGRES_PASS"
    log_info "User 'postgres' created."
fi

log_warn "SSH trust setup skipped (manual configuration required if needed)."

# ==========================================
# Wrap-up and optional reboot
# ==========================================
log_info "All tasks finished successfully."

if [ "$AUTO_REBOOT" != true ]; then
    log_info "已跳过自动重启。"
else
    echo
    for banner_line in \
        "=============================================" \
        "  系统将在 5 秒后自动重启" \
        "  按 Ctrl+C 可取消" \
        "============================================="; do
        log_warn "$banner_line"
    done
    echo

    # Five-second countdown rewritten in place on a single terminal line
    for ((i = 5; i >= 1; i--)); do
        echo -ne "${YELLOW}重启倒计时:$i...${NC}\r"
        sleep 1
    done

    log_info "正在重启..."
    reboot
fi

1.配置免密登录

1.1ssh免密

#!/bin/bash

# ==============================================================================
# Purpose : set up password-less SSH trust between the PostgreSQL cluster
#           nodes (used by repmgr)
# Run as  : postgres (the script must be run as the postgres user)
# Usage   : execute this script on each of the 3 hosts
# ==============================================================================

# 1. Every cluster node as "IP:hostname".
#    Adjust to the real environment and make sure this machine is included.
ALL_NODES=()
ALL_NODES+=("10.0.0.101:db1")
ALL_NODES+=("10.0.0.102:db2")
ALL_NODES+=("10.0.0.103:db3")

# 2. Account used for SSH (normally postgres)
SSH_USER='postgres'

# ------------------------------------------------------------------------------
# Abort unless the invoking user ($USER) is $SSH_USER
# ------------------------------------------------------------------------------
[ "$USER" = "$SSH_USER" ] || {
    echo "错误:请使用 '$SSH_USER' 用户运行此脚本 (例如:su - $SSH_USER)"
    exit 1
}

# ------------------------------------------------------------------------------
# 3. Create the local SSH key pair (only when it does not exist yet)
# ------------------------------------------------------------------------------
SSH_DIR="$HOME/.ssh"
KEY_FILE="$SSH_DIR/id_rsa"

[ -d "$SSH_DIR" ] || {
    mkdir -p "$SSH_DIR"
    chmod 700 "$SSH_DIR"
    echo "已创建 $SSH_DIR 目录"
}

if [ -f "$KEY_FILE" ]; then
    echo "检测到已有密钥:$KEY_FILE (跳过生成)"
else
    echo "正在生成 SSH 密钥对 (无密码短语)..."
    # 4096-bit RSA key with an empty passphrase, generated quietly
    ssh-keygen -t rsa -b 4096 -f "$KEY_FILE" -N "" -q
    echo "密钥生成完成:$KEY_FILE"
fi

# ------------------------------------------------------------------------------
# 4. SSH client settings (host key checking switched off so repmgr never
#    blocks on a host-key prompt). The file is overwritten on every run.
# ------------------------------------------------------------------------------
CONFIG_FILE="$SSH_DIR/config"
echo "正在配置 $CONFIG_FILE ..."
printf '%s\n' \
    'Host *' \
    '    StrictHostKeyChecking no' \
    '    UserKnownHostsFile /dev/null' \
    '    LogLevel ERROR' > "$CONFIG_FILE"
chmod 600 "$CONFIG_FILE"
echo "SSH 配置完成 (已禁用主机密钥检查)"

# ------------------------------------------------------------------------------
# 5. Distribute the public key to every other node
# ------------------------------------------------------------------------------
echo "开始分发公钥到集群节点..."

# Work out which ALL_NODES entry is this machine so that it can be skipped.
# `hostname -I` lists the addresses configured on the network interfaces,
# whereas `hostname -i` resolves the host name and may return a loopback entry
# from /etc/hosts; the latter is kept only as a fallback.
CURRENT_IP=""
local_addrs=" $(hostname -I 2>/dev/null) "
for node in "${ALL_NODES[@]}"; do
    if [[ "$local_addrs" == *" ${node%%:*} "* ]]; then
        CURRENT_IP=${node%%:*}
        break
    fi
done
if [ -z "$CURRENT_IP" ]; then
    CURRENT_IP=$(hostname -i | awk '{print $1}')
fi

# Command executed on the remote node. It reads the public key from stdin and
# appends it to authorized_keys only when that exact line is not already
# present, so repeated runs do not pile up duplicate entries.
remote_install_key='umask 077; mkdir -p ~/.ssh && chmod 700 ~/.ssh && touch ~/.ssh/authorized_keys && chmod 600 ~/.ssh/authorized_keys && { IFS= read -r pubkey || [ -n "$pubkey" ]; } && { grep -qxF -- "$pubkey" ~/.ssh/authorized_keys || printf "%s\n" "$pubkey" >> ~/.ssh/authorized_keys; }'

for node in "${ALL_NODES[@]}"; do
    # Entries have the form "IP:hostname"
    IP=${node%%:*}
    HOSTNAME=${node#*:}

    # Skip the local machine
    if [ "$IP" == "$CURRENT_IP" ]; then
        echo "跳过本机:$HOSTNAME ($IP)"
        continue
    fi

    echo "----------------------------------------"
    echo "正在配置 -> $HOSTNAME ($IP)"

    # On the first run ssh asks for the password of $SSH_USER@IP once.
    # The ssh exit status is tested directly by the `if`.
    if ssh -o ConnectTimeout=5 "$SSH_USER@$IP" \
        "${remote_install_key} && echo 'Success on $HOSTNAME'" \
        < "$KEY_FILE.pub" 2>&1; then
        echo "成功:$HOSTNAME 信任已建立"
    else
        echo "失败:无法连接到 $HOSTNAME,请检查网络或 SSH 服务"
    fi
done

# ------------------------------------------------------------------------------
# 6. Verification
# ------------------------------------------------------------------------------
echo ""
echo "========================================"
echo "配置完成!正在进行连通性测试..."
echo "========================================"

for node in "${ALL_NODES[@]}"; do
    # Entries have the form "IP:hostname"
    IP=${node%%:*}
    HOSTNAME=${node#*:}

    if [ "$IP" == "$CURRENT_IP" ]; then continue; fi

    # BatchMode=yes disables password/passphrase prompts, so the check can only
    # succeed through key authentication; ConnectTimeout keeps an unreachable
    # node from stalling the loop.
    if RESULT=$(ssh -o BatchMode=yes -o ConnectTimeout=5 "$SSH_USER@$IP" "hostname" 2>&1); then
        echo "[OK] $HOSTNAME ($IP) 免密登录成功 (返回主机名:$RESULT)"
    else
        echo "[FAIL] $HOSTNAME ($IP) 免密登录失败"
    fi
done

echo ""
echo "提示:请确保在 3 台主机上都运行了此脚本,以实现双向免密。"

2.安装 install_postgresql.sh 

#!/bin/bash
set -e

# ------------------------------------------
# Tunables
# ------------------------------------------
PGUSER="postgres"
PGDATA="/data/pgsql"
INSTALL_PREFIX="/apps/pgsql"
SRC_DIR="/usr/local/src"
PG_MAJOR="15"
PG_VERSION="15.15"
# Source tarball location (Aliyun mirror)
DOWNLOAD_URL="https://mirrors.aliyun.com/postgresql/source/v${PG_VERSION}/postgresql-${PG_VERSION}.tar.gz"

# ANSI colour escapes (interpreted when printed)
NC='\033[0m'
BLUE='\033[0;34m'
YELLOW='\033[1;33m'
GREEN='\033[0;32m'
RED='\033[0;31m'

# Tagged, coloured message printers; each takes the message text as $1.
log_info() {
    printf '%b\n' "${GREEN}[INFO]${NC} $1"
}
log_warn() {
    printf '%b\n' "${YELLOW}[WARN]${NC} $1"
}
# Prints the message and terminates the script with status 1.
log_error() {
    printf '%b\n' "${RED}[ERROR]${NC} $1"
    exit 1
}
log_step() {
    printf '%b\n' "${BLUE}[STEP]${NC} $1"
}

# ==========================================
# Pre-flight checks
# ==========================================
if [ $EUID -ne 0 ]; then
    log_error "Must run as root"
fi

# An existing psql binary means PostgreSQL was installed before: ask first
if [ -f "${INSTALL_PREFIX}/bin/psql" ]; then
    log_warn "PostgreSQL is already installed at ${INSTALL_PREFIX}"
    read -p "Do you want to re-install? (This will NOT delete data) (yes/no): " confirm
    case "$confirm" in
        yes) ;;
        *)
            log_info "Exiting installation."
            exit 0
            ;;
    esac
fi

# ==========================================
# 1. System dependencies
# ==========================================
log_step "Installing build dependencies..."
apt update
build_packages=(
    build-essential libreadline-dev zlib1g-dev libxml2-dev
    libxslt1-dev libssl-dev libpam0g-dev libsystemd-dev
    git curl wget pkg-config
)
apt install -y "${build_packages[@]}"

# ==========================================
# 2. Directories
# ==========================================
log_step "Preparing directories..."
mkdir -p "${INSTALL_PREFIX}" "${INSTALL_PREFIX}/run" "${PGDATA}" "${SRC_DIR}"

# Create the postgres account when it does not exist yet
id "${PGUSER}" &>/dev/null || {
    useradd -m -s /bin/bash "${PGUSER}"
    log_info "User ${PGUSER} created."
}

# ==========================================
# 3. Download the source
# ==========================================
cd "${SRC_DIR}"
PKG_NAME="postgresql-${PG_VERSION}.tar.gz"

if [ -f "${PKG_NAME}" ]; then
    log_info "Source package already exists, skipping download."
else
    log_step "Downloading PostgreSQL ${PG_VERSION}..."
    # Download into a ".part" file and rename it only after wget succeeds.
    # `wget -O` creates its output file even when the transfer fails, so
    # writing straight to ${PKG_NAME} would leave a truncated tarball that the
    # "already exists" check above accepts on the next run.
    # -c resumes an earlier partial download; 3 tries, 30 s timeout.
    if ! wget -c --tries=3 --timeout=30 "${DOWNLOAD_URL}" -O "${PKG_NAME}.part"; then
        log_error "Download failed. Please check network or URL."
    fi
    mv -f -- "${PKG_NAME}.part" "${PKG_NAME}"
fi

# Unpack into a clean directory
log_step "Extracting source code..."
rm -rf -- "postgresql-${PG_VERSION}" # remove a stale extraction directory
tar -zxf "${PKG_NAME}"
cd "postgresql-${PG_VERSION}"

# ==========================================
# 4. Configure, compile, install
# ==========================================
log_step "Configuring build..."
configure_flags=(
    "--prefix=${INSTALL_PREFIX}"
    --with-systemd
    --with-openssl
    --with-pam
    --with-libxml
    --with-libxslt
)
./configure "${configure_flags[@]}"

# One make job per CPU core reported by nproc
build_jobs=$(nproc)
log_step "Compiling (using ${build_jobs} CPU cores)..."
make "-j${build_jobs}" world

log_step "Installing..."
make install-world

# ==========================================
# 5. Environment variables
# ==========================================
log_step "Configuring environment variables..."
# The ${VAR:+:$VAR} form adds the ":" separator only when LD_LIBRARY_PATH
# already has a value. A plain "dir:$LD_LIBRARY_PATH" leaves a trailing ":"
# when the variable is empty, and the dynamic loader treats an empty path
# element as the current working directory.
cat > /etc/profile.d/pgsql.sh <<EOF
# PostgreSQL Environment
export PATH=${INSTALL_PREFIX}/bin:\$PATH
export LD_LIBRARY_PATH=${INSTALL_PREFIX}/lib\${LD_LIBRARY_PATH:+:\$LD_LIBRARY_PATH}
export PGHOME=${INSTALL_PREFIX}
export PGDATA=${PGDATA}
EOF

# Apply the same settings to the current shell
export PATH=${INSTALL_PREFIX}/bin:$PATH
export LD_LIBRARY_PATH=${INSTALL_PREFIX}/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}

# ==========================================
# 6. Systemd service
# ==========================================
# Writes /etc/systemd/system/postgresql.service, reloads systemd and enables
# the unit (it is not started here).
# The heredoc delimiter is unquoted, so ${PG_MAJOR}, ${PGUSER}, ${PGDATA} and
# ${INSTALL_PREFIX} are expanded by this script while the file is written.
# The escaped \${PGDATA} and \$MAINPID are written literally into the unit:
# PGDATA is supplied by the Environment= line of the unit itself and MAINPID
# by systemd.
log_step "Configuring systemd service..."
cat > /etc/systemd/system/postgresql.service <<EOF
[Unit]
Description=PostgreSQL ${PG_MAJOR} database server
Documentation=man:postgres(1)
After=network.target

[Service]
Type=notify
User=${PGUSER}
Group=${PGUSER}
Environment=PGDATA=${PGDATA}
Environment=PGHOME=${INSTALL_PREFIX}

ExecStart=${INSTALL_PREFIX}/bin/postgres -D \${PGDATA}
ExecReload=/bin/kill -HUP \$MAINPID

KillMode=mixed
KillSignal=SIGINT
TimeoutSec=0
LimitNOFILE=65536
LimitNPROC=32768

# OOM Score adjustment (make it less likely to be killed)
OOMScoreAdjust=-1000

[Install]
WantedBy=multi-user.target
EOF

systemctl daemon-reload
systemctl enable postgresql

# ==========================================
# 7. Final ownership and permissions
# ==========================================
log_step "Setting permissions..."
chown -R "${PGUSER}:${PGUSER}" "${INSTALL_PREFIX}" "${PGDATA}"
chmod 700 "${PGDATA}"

# ==========================================
# Done: print the summary banner
# ==========================================
echo
for summary_line in \
    "==================================================" \
    "  PostgreSQL ${PG_VERSION} installation complete!" \
    "  Install Path: ${INSTALL_PREFIX}" \
    "  Systemd: systemctl start postgresql" \
    "  Next step: Initialize the database (initdb)" \
    "=================================================="; do
    log_info "$summary_line"
done
echo
#!/bin/bash
set -e

# ------------------------------------------
# Tunables
# ------------------------------------------
PG_MAJOR="15"
PG_VERSION="15.15"
PGUSER="postgres"
INSTALL_PREFIX="/apps/pgsql"
PGDATA="/data/pgsql"
SRC_DIR="/usr/local/src"
DOWNLOAD_URL="https://mirrors.aliyun.com/postgresql/source/v${PG_VERSION}/postgresql-${PG_VERSION}.tar.gz"

# ANSI colour escapes
RED='\033[0;31m'; GREEN='\033[0;32m'; YELLOW='\033[1;33m'
BLUE='\033[0;34m'; NC='\033[0m'

# Shared printer: $1 colour escape, $2 tag, $3 message text.
_emit_tagged() {
    echo -e "$1[$2]${NC} $3"
}

log_info() { _emit_tagged "${GREEN}" INFO "$1"; }
log_warn() { _emit_tagged "${YELLOW}" WARN "$1"; }
# Prints the message, then terminates the script with status 1.
log_error() { _emit_tagged "${RED}" ERROR "$1"; exit 1; }
log_step() { _emit_tagged "${BLUE}" STEP "$1"; }

# ==========================================
# Pre-flight checks
# ==========================================
if [[ $EUID -ne 0 ]]; then
    log_error "Must run as root"
fi

# Ask before installing over an existing build
if [[ -f "${INSTALL_PREFIX}/bin/psql" ]]; then
    log_warn "PostgreSQL is already installed at ${INSTALL_PREFIX}"
    read -p "Do you want to re-install? (This will NOT delete data) (yes/no): " confirm
    if [[ "$confirm" != "yes" ]]; then
        log_info "Exiting installation."
        exit 0
    fi
fi

# ==========================================
# 1. System dependencies
# ==========================================
log_step "Installing build dependencies..."
apt update
apt_packages="build-essential libreadline-dev zlib1g-dev libxml2-dev"
apt_packages+=" libxslt1-dev libssl-dev libpam0g-dev libsystemd-dev"
apt_packages+=" git curl wget pkg-config"
# Intentionally unquoted: the list is split into one argument per package
apt install -y ${apt_packages}

# ==========================================
# 2. Directories
# ==========================================
log_step "Preparing directories..."
for required_dir in "${INSTALL_PREFIX}" "${INSTALL_PREFIX}/run" "${PGDATA}" "${SRC_DIR}"; do
    mkdir -p "$required_dir"
done

# Create the postgres account when it does not exist yet
if id "${PGUSER}" &>/dev/null; then
    :
else
    useradd -m -s /bin/bash "${PGUSER}"
    log_info "User ${PGUSER} created."
fi

# ==========================================
# 3. Download the source
# ==========================================
cd "${SRC_DIR}"
PKG_NAME="postgresql-${PG_VERSION}.tar.gz"

if [ -f "${PKG_NAME}" ]; then
    log_info "Source package already exists, skipping download."
else
    log_step "Downloading PostgreSQL ${PG_VERSION}..."
    # `wget -O` creates the output file even when the transfer fails. The
    # download therefore goes to a ".part" file that is renamed only on
    # success, so an interrupted run cannot leave a truncated tarball that the
    # check above would treat as complete next time.
    if ! wget -c --tries=3 --timeout=30 "${DOWNLOAD_URL}" -O "${PKG_NAME}.part"; then
        log_error "Download failed. Please check network or URL."
    fi
    mv -f -- "${PKG_NAME}.part" "${PKG_NAME}"
fi

log_step "Extracting source code..."
# Start from a clean extraction directory
rm -rf -- "postgresql-${PG_VERSION}"
tar -zxf "${PKG_NAME}"
cd "postgresql-${PG_VERSION}"

# ==========================================
# 4. Configure, compile, install
# ==========================================
log_step "Configuring build..."
set -- --with-systemd --with-openssl --with-pam --with-libxml --with-libxslt
./configure --prefix="${INSTALL_PREFIX}" "$@"
# Clear the positional parameters that were borrowed for the flag list
set --

# One make job per CPU core reported by nproc
cpu_count=$(nproc)
log_step "Compiling (using ${cpu_count} CPU cores)..."
make -j"${cpu_count}" world

log_step "Installing..."
make install-world

# ==========================================
# 5. Environment variables
# ==========================================
log_step "Configuring environment variables..."
# ${VAR:+:$VAR} emits the ":" separator only when LD_LIBRARY_PATH is already
# set. Without it an empty variable leaves a trailing ":", i.e. an empty path
# element, which the dynamic loader interprets as the current directory.
cat > /etc/profile.d/pgsql.sh <<EOF
# PostgreSQL Environment
export PATH=${INSTALL_PREFIX}/bin:\$PATH
export LD_LIBRARY_PATH=${INSTALL_PREFIX}/lib\${LD_LIBRARY_PATH:+:\$LD_LIBRARY_PATH}
export PGHOME=${INSTALL_PREFIX}
export PGDATA=${PGDATA}
EOF

# Same settings for the remainder of this script
export PATH=${INSTALL_PREFIX}/bin:$PATH
export LD_LIBRARY_PATH=${INSTALL_PREFIX}/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}

# ==========================================
# 6. [Core optimization] Systemd service, robust variant
# ==========================================
# Writes /etc/systemd/system/postgresql.service as a Type=forking unit driven
# by pg_ctl, then reloads systemd and enables the unit (it is not started).
# The heredoc delimiter is unquoted: every ${...} below is expanded by this
# script, so the unit file contains absolute paths only. The "#" lines inside
# the heredoc are written into the unit file verbatim.
# NOTE(review): no PIDFile= is set for this Type=forking unit, so systemd has
# to work out the main PID itself - confirm this is acceptable.
log_step "Configuring systemd service (Optimized for stability)..."
cat > /etc/systemd/system/postgresql.service <<EOF
[Unit]
Description=PostgreSQL ${PG_MAJOR} database server
Documentation=man:postgres(1)
After=network.target

[Service]
# 优化点1:使用 forking + pg_ctl,最稳健的组合
Type=forking
User=${PGUSER}
Group=${PGUSER}

# 优化点2:写死绝对路径,不依赖变量解析
Environment=PATH=${INSTALL_PREFIX}/bin:/usr/bin:/bin

# 优化点3:使用 pg_ctl,自带等待启动、优雅关闭
# -s: 静默模式
# -w: 等待数据库完全启动才返回
# -t 300: 超时时间 300 秒
ExecStart=${INSTALL_PREFIX}/bin/pg_ctl start -D ${PGDATA} -s -w -t 300
ExecStop=${INSTALL_PREFIX}/bin/pg_ctl stop -D ${PGDATA} -s -m fast
ExecReload=${INSTALL_PREFIX}/bin/pg_ctl reload -D ${PGDATA} -s

# 优化点4:资源限制
TimeoutSec=300
LimitNOFILE=65536
LimitNPROC=32768

# 优化点5:OOM 分数调整,让数据库不容易被 OOM Killer 杀掉
OOMScoreAdjust=-1000

[Install]
WantedBy=multi-user.target
EOF

systemctl daemon-reload
systemctl enable postgresql

# ==========================================
# 7. Final ownership and permissions
# ==========================================
log_step "Setting permissions..."
chown -R "${PGUSER}:${PGUSER}" "${INSTALL_PREFIX}" "${PGDATA}"
chmod 700 "${PGDATA}"

# ==========================================
# Done: print the summary and the next steps
# ==========================================
summary_lines=(
    "=================================================="
    "  PostgreSQL ${PG_VERSION} installation complete!"
    "  Install Path: ${INSTALL_PREFIX}"
    "  Data Path:    ${PGDATA}"
    ""
    "  Systemd Commands:"
    "    Start:   systemctl start postgresql"
    "    Stop:    systemctl stop postgresql"
    "    Status:  systemctl status postgresql"
    ""
    "  Next steps (as root):"
    "    1. su - postgres"
    "    2. initdb -D ${PGDATA}"
    "    3. systemctl start postgresql"
    "=================================================="
)
echo
for summary_line in "${summary_lines[@]}"; do
    log_info "$summary_line"
done
echo

3.主库初始化 init_primary_db.sh

#!/bin/bash
set -e

# ------------------------------------------
# Settings and paths (edit here to adapt to another environment)
# ------------------------------------------
PGUSER="postgres"
PGPORT="5432"
PGHOME="/apps/pgsql"
PGDATA="/data/pgsql"
SUBNET="10.0.0.0/24"
REPMGR_PASS="repmgr123"

# Absolute tool paths, so nothing depends on PATH being set up
PG_BIN="${PGHOME}/bin"
INITDB="${PG_BIN}/initdb"
PSQL="${PG_BIN}/psql"
PG_CTL="${PG_BIN}/pg_ctl"

# ANSI colour escapes
NC='\033[0m'
YELLOW='\033[1;33m'
GREEN='\033[0;32m'
RED='\033[0;31m'

# Tagged, coloured message printers; each takes the message text as $1.
log_info() {
    printf '%b\n' "${GREEN}[INFO]${NC} $1"
}
log_warn() {
    printf '%b\n' "${YELLOW}[WARN]${NC} $1"
}
# Prints the message and terminates the script with status 1.
log_error() {
    printf '%b\n' "${RED}[ERROR]${NC} $1"
    exit 1
}

# ==========================================
# Pre-flight checks
# ==========================================
if [ "$EUID" -ne 0 ]; then
    log_error "Must run as root"
fi

# The PostgreSQL binaries must already be installed
if [ ! -f "$INITDB" ]; then
    log_error "PostgreSQL binaries not found at $PGHOME. Did you run the compile step?"
fi

# ==========================================
# 1. Directories
# ==========================================
log_info "Preparing directories..."
# The data directory is created here as well, so the chown/chmod below cannot
# fail (and abort under `set -e`) when it does not exist yet.
mkdir -p "${PGHOME}/run" "${PGDATA}"
chown -R "${PGUSER}:${PGUSER}" "${PGHOME}" "${PGDATA}"
chmod 700 "${PGDATA}"

# Never overwrite an existing cluster without explicit confirmation
if [ -n "$(ls -A "${PGDATA}")" ]; then
    log_warn "Data directory ${PGDATA} is not empty!"
    read -r -p "Do you want to CLEAN ALL DATA and re-initialize? (yes/no): " confirm
    if [ "$confirm" != "yes" ]; then
        log_error "User cancelled. Exiting."
    fi
    log_warn "Cleaning data directory..."
    # `ls -A` above counts hidden entries, but the glob "${PGDATA}/*" does not
    # match them, so they would survive an `rm -rf ${PGDATA}/*`. find removes
    # everything below the directory; ${PGDATA:?} aborts if the variable is
    # ever empty.
    find "${PGDATA:?}" -mindepth 1 -delete
fi

# ==========================================
# 2. Initialize the cluster (initdb)
# ==========================================
log_info "Running initdb..."
# The command is tested directly by the `if`. A separate `[ $? -eq 0 ]` check
# after the command never reaches its failure branch under `set -e`, because
# the script exits as soon as the command fails.
if su - "${PGUSER}" -c "${INITDB} -D ${PGDATA} -E UTF8 --locale=en_US.UTF-8"; then
    log_info "initdb completed successfully."
else
    log_error "initdb failed!"
fi

# ==========================================
# 3. Configure postgresql.conf
# ==========================================
log_info "Configuring postgresql.conf..."

# Back up the original file (copied as ${PGUSER})
su - ${PGUSER} -c "cp ${PGDATA}/postgresql.conf ${PGDATA}/postgresql.conf.backup"

# Append the custom settings to the end of postgresql.conf.
# The heredoc delimiter is unquoted, so ${PGPORT} and ${PGHOME} are expanded
# by this script; the "#" lines inside the heredoc are written to the file.
cat >> ${PGDATA}/postgresql.conf <<EOF
# ==========================================
# Custom Configuration for HA
# ==========================================
listen_addresses = '*'
port = ${PGPORT}
unix_socket_directories = '${PGHOME}/run,/tmp'
unix_socket_permissions = 0700

# WAL Settings
wal_level = replica
max_wal_senders = 10
wal_keep_size = 2GB
hot_standby = on

# Logging
log_destination = 'stderr'
logging_collector = on
log_directory = 'log'
log_filename = 'postgresql-%Y-%m-%d.log'
log_line_prefix = '%t [%p]: [%l-1] user=%u,db=%d,app=%a,client=%h '
EOF

# The append ran as root; hand the file back to ${PGUSER}
chown ${PGUSER}:${PGUSER} ${PGDATA}/postgresql.conf
log_info "postgresql.conf configured."

# ==========================================
# 4. Configure pg_hba.conf
# ==========================================
log_info "Configuring pg_hba.conf..."

# Back up the original file (copied as ${PGUSER})
su - ${PGUSER} -c "cp ${PGDATA}/pg_hba.conf ${PGDATA}/pg_hba.conf.backup"

# Append access rules for the repmgr user and for ${SUBNET}: "trust" for
# local replication connections of repmgr, scram-sha-256 for the host rules.
cat >> ${PGDATA}/pg_hba.conf <<EOF
# ==========================================
# Replication Access
# ==========================================
local   replication     repmgr                                  trust
host    replication     repmgr          ${SUBNET}               scram-sha-256
host    repmgr          repmgr          ${SUBNET}               scram-sha-256

# ==========================================
# General Access
# ==========================================
host    all             all             ${SUBNET}               scram-sha-256
EOF

# The append ran as root; hand the file back to ${PGUSER}
chown ${PGUSER}:${PGUSER} ${PGDATA}/pg_hba.conf
log_info "pg_hba.conf configured."

# ==========================================
# 5. Start the database (systemd)
# ==========================================
log_info "Starting PostgreSQL via systemd..."

# Create a minimal unit file only when none exists yet. \$MAINPID is escaped
# so that it reaches the unit file literally; the other variables are expanded
# by this script.
if [ ! -f "/etc/systemd/system/postgresql.service" ]; then
    log_warn "Systemd service file not found, creating one..."
    cat > /etc/systemd/system/postgresql.service <<EOF
[Unit]
Description=PostgreSQL database server
After=network.target

[Service]
Type=notify
User=${PGUSER}
Group=${PGUSER}
Environment=PGDATA=${PGDATA}
ExecStart=${PGHOME}/bin/postgres -D ${PGDATA}
ExecReload=/bin/kill -HUP \$MAINPID
KillMode=mixed
KillSignal=SIGINT
TimeoutSec=0
LimitNOFILE=65536

[Install]
WantedBy=multi-user.target
EOF
    systemctl daemon-reload
fi

# Start and verify inside one condition. With a bare `systemctl start`, a
# failed start makes `set -e` end the script before the hint about journalctl
# below can be printed.
if systemctl start postgresql && systemctl is-active --quiet postgresql; then
    log_info "PostgreSQL started successfully."
else
    log_error "PostgreSQL failed to start. Check 'journalctl -xeu postgresql' for details."
fi

# ==========================================
# 6. repmgr role and database
# ==========================================
log_info "Creating repmgr user and database..."

# Short pause to give the server time to become fully ready
sleep 2

# psql is run as ${PGUSER} through the unix socket in ${PGHOME}/run
psql_via_socket="${PSQL} -h ${PGHOME}/run -p ${PGPORT}"
for bootstrap_sql in \
    "CREATE USER repmgr WITH SUPERUSER PASSWORD '${REPMGR_PASS}';" \
    "CREATE DATABASE repmgr OWNER repmgr;"; do
    su - ${PGUSER} -c "${psql_via_socket} -c \"${bootstrap_sql}\""
done

log_info "repmgr user created."

# ==========================================
# Done: print the summary
# ==========================================
echo
for summary_line in \
    "=============================================" \
    "  Primary DB (db1) initialization complete!" \
    "  Data Dir: ${PGDATA}" \
    "  Socket Dir: ${PGHOME}/run" \
    "  Access: psql -h ${PGHOME}/run" \
    "============================================="; do
    log_info "$summary_line"
done
echo
#!/bin/bash
set -e

# ------------------------------------------
# Settings and paths (edit here to adapt to another environment)
# ------------------------------------------
PGHOME="/apps/pgsql"
PGDATA="/data/pgsql"
PGUSER="postgres"
PGPORT="5432"
SUBNET="10.0.0.0/24"
# IMPORTANT: change this password for production use!
REPMGR_PASS="repmgr123"

# Absolute tool paths, so nothing depends on PATH being set up
PG_ISREADY="${PGHOME}/bin/pg_isready"
PG_CTL="${PGHOME}/bin/pg_ctl"
PSQL="${PGHOME}/bin/psql"
INITDB="${PGHOME}/bin/initdb"

# ANSI colour escapes
RED='\033[0;31m'; GREEN='\033[0;32m'; YELLOW='\033[1;33m'; NC='\033[0m'

# Tagged, coloured message printers; each takes the message text as $1.
log_info() {
    local message=$1
    echo -e "${GREEN}[INFO]${NC} ${message}"
}
log_warn() {
    local message=$1
    echo -e "${YELLOW}[WARN]${NC} ${message}"
}
# Prints the message and terminates the script with status 1.
log_error() {
    local message=$1
    echo -e "${RED}[ERROR]${NC} ${message}"
    exit 1
}

# ==========================================
# Pre-flight checks
# ==========================================
if [ "$EUID" -ne 0 ]; then
    log_error "Must run as root"
fi

if [ ! -f "$INITDB" ]; then
    log_error "PostgreSQL binaries not found at $PGHOME. Did you run the compile step?"
fi

# ==========================================
# 1. Directories
# ==========================================
log_info "Preparing directories..."
# Only the socket directory and the (empty) data directory are created here.
# Creating ${PGDATA}/log at this point would make the emptiness check below
# report "not empty" on every run, including the very first one.
mkdir -p "${PGHOME}/run" "${PGDATA}"
# The data directory must belong to ${PGUSER}, because initdb runs as that user
chown -R "${PGUSER}:${PGUSER}" "${PGHOME}" "${PGDATA}"
chmod 700 "${PGDATA}"

# Never overwrite an existing cluster without explicit confirmation
if [ -n "$(ls -A "${PGDATA}" 2>/dev/null)" ]; then
    log_warn "Data directory ${PGDATA} is not empty!"
    read -r -p "Do you want to CLEAN ALL DATA and re-initialize? (yes/no): " confirm
    if [ "$confirm" != "yes" ]; then
        log_error "User cancelled. Exiting."
    fi
    log_warn "Cleaning data directory..."
    # Removes hidden entries too (the glob "${PGDATA}/*" skips them);
    # ${PGDATA:?} aborts if the variable is ever empty.
    find "${PGDATA:?}" -mindepth 1 -delete
fi

# ==========================================
# 2. Initialize the cluster (initdb)
# ==========================================
log_info "Running initdb..."
# A failing initdb ends the script here through `set -e`
su - "${PGUSER}" -c "${INITDB} -D ${PGDATA} -E UTF8 --locale=en_US.UTF-8"
log_info "initdb completed successfully."

# ==========================================
# 3. [Optimization] Configure postgresql.conf by replacing parameters in
#    place, so no setting ends up defined twice
# ==========================================

#######################################
# Set one parameter in a postgresql.conf style file.
# The existing assignment - active or commented out with "#" - is replaced by
# "NAME = VALUE"; when the file contains no such line the assignment is
# appended instead.
# "|" is the sed delimiter and the value is escaped first, so values that
# contain "/" (for example the socket directory path) do not break the
# expression.
# Arguments: $1 - configuration file
#            $2 - parameter name
#            $3 - value exactly as it should appear (including any quotes)
# Returns:   0 on success, non-zero if the file is missing or sed fails
#######################################
set_pg_param() {
    local conf_file=$1 name=$2 value=$3
    local escaped
    if [ ! -f "$conf_file" ]; then
        echo "set_pg_param: no such file: $conf_file" >&2
        return 1
    fi
    # Escape the characters that are special in the replacement part of s|||
    escaped=$(printf '%s' "$value" | sed -e 's/[\\&|]/\\&/g')
    sed -i "s|^#*${name} = .*|${name} = ${escaped}|" "$conf_file" || return
    if ! grep -q "^${name} = " "$conf_file"; then
        printf '%s = %s\n' "$name" "$value" >> "$conf_file"
    fi
}

log_info "Configuring postgresql.conf..."
su - "${PGUSER}" -c "cp ${PGDATA}/postgresql.conf ${PGDATA}/postgresql.conf.backup.$(date +%s)"

pg_conf_file="${PGDATA}/postgresql.conf"

# Connection settings
set_pg_param "$pg_conf_file" listen_addresses "'*'"
set_pg_param "$pg_conf_file" port "${PGPORT}"
set_pg_param "$pg_conf_file" unix_socket_directories "'${PGHOME}/run,/tmp'"
set_pg_param "$pg_conf_file" unix_socket_permissions "0700"

# WAL settings
set_pg_param "$pg_conf_file" wal_level "replica"
set_pg_param "$pg_conf_file" max_wal_senders "10"
set_pg_param "$pg_conf_file" wal_keep_size "2GB"
set_pg_param "$pg_conf_file" hot_standby "on"
set_pg_param "$pg_conf_file" wal_log_hints "on" # in preparation for pg_rewind

# Logging settings
set_pg_param "$pg_conf_file" log_destination "'stderr'"
set_pg_param "$pg_conf_file" logging_collector "on"
set_pg_param "$pg_conf_file" log_directory "'log'"
set_pg_param "$pg_conf_file" log_filename "'postgresql-%Y-%m-%d.log'"
set_pg_param "$pg_conf_file" log_line_prefix "'%t [%p]: [%l-1] user=%u,db=%d,app=%a,client=%h '"

chown "${PGUSER}:${PGUSER}" "$pg_conf_file"
log_info "postgresql.conf configured (using sed replacement)."

# ==========================================
# 4. Configure pg_hba.conf
# ==========================================
log_info "Configuring pg_hba.conf..."
su - ${PGUSER} -c "cp ${PGDATA}/pg_hba.conf ${PGDATA}/pg_hba.conf.backup.$(date +%s)"

# Remove only a block that this script wrote earlier, identified by its own
# BEGIN/END marker lines.
# A sed range keyed on a generic "# ====..." line is unsafe: the title
# underline at the top of the stock pg_hba.conf is assumed to match the same
# pattern, and with no second match the range extends to the end of the file,
# deleting the default access rules as well.
sed -i '/^# BEGIN init_primary_db managed rules$/,/^# END init_primary_db managed rules$/d' ${PGDATA}/pg_hba.conf

cat >> ${PGDATA}/pg_hba.conf <<EOF
# BEGIN init_primary_db managed rules
# ==========================================
# Replication Access
# ==========================================
local   replication     repmgr                                  trust
host    replication     repmgr          ${SUBNET}               scram-sha-256
host    repmgr          repmgr          ${SUBNET}               scram-sha-256

# ==========================================
# General Access
# ==========================================
host    all             all             ${SUBNET}               scram-sha-256
# END init_primary_db managed rules
EOF

chown ${PGUSER}:${PGUSER} ${PGDATA}/pg_hba.conf
log_info "pg_hba.conf configured."

# ==========================================
# 5. [Core optimization] Deploy the robust systemd unit and start the service
# ==========================================
log_info "Checking & deploying systemd service..."

# Whether or not a unit file already exists, back it up (if present) and
# deploy this script's version over it.
if [ -f "/etc/systemd/system/postgresql.service" ]; then
    log_warn "Existing systemd service file found, backing up..."
    cp /etc/systemd/system/postgresql.service /etc/systemd/system/postgresql.service.backup.$(date +%s)
fi

# Unit definition: Type=forking, driven by pg_ctl, absolute paths only.
# The heredoc delimiter is unquoted, so ${PGUSER}, ${PGHOME} and ${PGDATA}
# are expanded by this script while the file is written.
cat > /etc/systemd/system/postgresql.service <<EOF
[Unit]
Description=PostgreSQL database server
After=network.target

[Service]
Type=forking
User=${PGUSER}
Group=${PGUSER}
Environment=PATH=${PGHOME}/bin:/usr/bin:/bin
ExecStart=${PGHOME}/bin/pg_ctl start -D ${PGDATA} -s -w -t 300
ExecStop=${PGHOME}/bin/pg_ctl stop -D ${PGDATA} -s -m fast
ExecReload=${PGHOME}/bin/pg_ctl reload -D ${PGDATA} -s
TimeoutSec=300
LimitNOFILE=65536
LimitNPROC=32768
OOMScoreAdjust=-1000

[Install]
WantedBy=multi-user.target
EOF

systemctl daemon-reload
# Enabling is best effort: errors are discarded and do not stop the script
systemctl enable postgresql 2>/dev/null || true

log_info "Starting PostgreSQL via systemd..."
# Under `set -e` a failed start ends the script at this line
systemctl start postgresql

# ==========================================
# 6. [Optimization] Poll pg_isready until the database accepts connections
# ==========================================
log_info "Waiting for database to become ready..."
# Probe command, run as ${PGUSER} against the unix socket in ${PGHOME}/run
ready_probe="${PG_ISREADY} -h ${PGHOME}/run -p ${PGPORT} -q"

# Up to 30 probes, two seconds apart
attempt=1
while ((attempt <= 30)); do
    if su - ${PGUSER} -c "${ready_probe}"; then
        log_info "Database is ready."
        break
    fi
    log_warn "Waiting... ($attempt/30)"
    sleep 2
    attempt=$((attempt + 1))
done

# Final probe decides between continuing and aborting
su - ${PGUSER} -c "${ready_probe}" ||
    log_error "Database did not become ready within 60 seconds. Check logs."

# ==========================================
# 7. Create the repmgr role and database (idempotent)
# ==========================================
# The catalog is queried first, so "already exists" is only reported when it is
# true. Any other failure (connection refused, permission problem, bad SQL) is
# shown to the operator and aborts the script instead of being hidden behind
# "2>/dev/null" and mislabelled as "already exists".
log_info "Creating repmgr user and database..."

# psql invocation over the local socket directory, run as the postgres OS user.
PSQL_LOCAL="${PSQL} -h ${PGHOME}/run -p ${PGPORT}"

repmgr_role_check=$(su - "${PGUSER}" -c "${PSQL_LOCAL} -tAc \"SELECT 1 FROM pg_roles WHERE rolname='repmgr'\"") \
    || log_error "Could not query pg_roles. Check the psql error above."
# grep -x tolerates extra lines a login shell might print around the result.
if grep -qx 1 <<< "${repmgr_role_check}"; then
    log_warn "User repmgr already exists, skipping creation."
else
    su - "${PGUSER}" -c "${PSQL_LOCAL} -c \"CREATE USER repmgr WITH SUPERUSER PASSWORD '${REPMGR_PASS}';\"" \
        || log_error "Failed to create user repmgr. Check the psql error above."
fi

repmgr_db_check=$(su - "${PGUSER}" -c "${PSQL_LOCAL} -tAc \"SELECT 1 FROM pg_database WHERE datname='repmgr'\"") \
    || log_error "Could not query pg_database. Check the psql error above."
if grep -qx 1 <<< "${repmgr_db_check}"; then
    log_warn "Database repmgr already exists, skipping creation."
else
    su - "${PGUSER}" -c "${PSQL_LOCAL} -c \"CREATE DATABASE repmgr OWNER repmgr;\"" \
        || log_error "Failed to create database repmgr. Check the psql error above."
fi

log_info "repmgr user/database configured."

# ==========================================
# Done: print a summary of the data directory, socket directory and how to
# connect / inspect the service.
# ==========================================
echo
log_info "============================================="
log_info "  Primary DB (db1) initialization complete!"
log_info "  Data Dir:   ${PGDATA}"
log_info "  Socket Dir: ${PGHOME}/run"
log_info "  Access:     psql -h ${PGHOME}/run"
log_info ""
log_info "  Systemd Status: systemctl status postgresql"
log_info "============================================="
echo

4.安装repmgr

4.1主库配置

#!/bin/bash
set -euo pipefail

# ================= Configuration =================
# Base paths
PGSQL_BIN_PATH="/apps/pgsql/bin"
PG_DATA_DIR="/data/pgsql"
PG_BACKUP_DIR="/data/backup"       # [Improvement 1] Backup directory defined explicitly, kept outside PGDATA
REPMGR_ETC_DIR="/apps/repmgr/etc"
REPMGR_LOG_DIR="/apps/repmgr/log"

# Identity of this (primary) node
PRIMARY_PHYSICAL_IP="10.0.0.101"
PRIMARY_NODE_NAME="db1"
PRIMARY_NODE_ID=1

# Credentials (must be identical on every node of the cluster)
REPMGR_PASSWORD="postgres123"
REPMGR_USER="repmgr"
REPMGR_DB="repmgr"
PRIMARY_PORT=5432

# Cluster node list (used to generate the pg_hba.conf entries)
CLUSTER_NODES=("10.0.0.101" "10.0.0.102" "10.0.0.103")

# Service management
PG_SERVICE_NAME="postgresql"
POSTGRES_USER="postgres"
# =========================================

# Paths derived from the settings above
PG_CONF="${PG_DATA_DIR}/postgresql.conf"
PG_HBA="${PG_DATA_DIR}/pg_hba.conf"
REPMGR_CONF="${REPMGR_ETC_DIR}/repmgr.conf"
REPMGR_PATH="${PGSQL_BIN_PATH}/repmgr"

# Colour escape sequences used by the log helpers
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m'

# log_info MESSAGE: print MESSAGE behind a green [INFO] tag.
log_info() {
    printf '%b\n' "${GREEN}[INFO]${NC} $1"
}

# log_warn MESSAGE: print MESSAGE behind a yellow [WARN] tag.
log_warn() {
    printf '%b\n' "${YELLOW}[WARN]${NC} $1"
}

# log_error MESSAGE: print MESSAGE behind a red [ERROR] tag, then exit with status 1.
log_error() {
    printf '%b\n' "${RED}[ERROR]${NC} $1"
    exit 1
}

# ================= Pre-flight checks =================
log_info "=========================================="
log_info "  repmgr 主库初始化 (${PRIMARY_NODE_NAME})"
log_info "=========================================="

# Each guard aborts through log_error (which exits) when its requirement is not met.
if [ "$EUID" -ne 0 ]; then
    log_error "请使用 root 用户执行"
fi
[ -f "${PGSQL_BIN_PATH}/pg_config" ] || log_error "未找到 pg_config,请检查 PostgreSQL 安装"
[ -d "${PG_DATA_DIR}" ] || log_error "数据目录不存在: ${PG_DATA_DIR}"
# [Improvement 2] The repmgr binary must already be installed
[ -f "${REPMGR_PATH}" ] || log_error "未找到 repmgr 二进制文件: ${REPMGR_PATH},请先编译安装"

# [Improvement 3] Warn about sub-directories of PGDATA that are not on the
# list of expected names below.
log_info "检查数据目录纯净度..."
for entry in "${PG_DATA_DIR}"/*; do
    [ -d "${entry}" ] || continue
    case "${entry##*/}" in
        base|global|pg_commit_ts|pg_dynshmem|pg_logical|pg_multixact|pg_notify|pg_replslot|pg_serial|pg_snapshots|pg_stat|pg_stat_tmp|pg_subtrans|pg_tblspc|pg_twophase|pg_wal|pg_xact|log)
            ;;
        *)
            log_warn "发现非标准目录: ${entry}"
            log_warn "为避免从库克隆失败,建议将其移出 ${PG_DATA_DIR}"
            ;;
    esac
done

# ================= Environment preparation =================
log_info "[1/6] 准备目录与环境..."
# Create the backup, repmgr config and repmgr log directories, hand them to the
# postgres OS user, and restrict the backup and log directories to that user.
mkdir -p ${PG_BACKUP_DIR} ${REPMGR_ETC_DIR} ${REPMGR_LOG_DIR}
chown -R ${POSTGRES_USER}: ${PG_BACKUP_DIR} ${REPMGR_ETC_DIR} ${REPMGR_LOG_DIR}
chmod 700 ${PG_BACKUP_DIR} ${REPMGR_LOG_DIR}

# [Improvement 4] Write .pgpass (the primary needs it too, for later administration).
# The file is overwritten, not appended to; its single entry covers any host,
# port and database for the repmgr user.
log_info "配置 .pgpass..."
cat > /home/${POSTGRES_USER}/.pgpass << EOF
*:*:*:${REPMGR_USER}:${REPMGR_PASSWORD}
EOF
chown ${POSTGRES_USER}: /home/${POSTGRES_USER}/.pgpass
chmod 0600 /home/${POSTGRES_USER}/.pgpass

# ================= Database configuration =================
# Create the repmgr role and database idempotently. The catalog is consulted
# first so that "already exists" is only reported when true; every other error
# is shown and aborts the run instead of being swallowed by "2>/dev/null".
log_info "[2/6] 创建 repmgr 用户与数据库..."

# psql over the default local socket, executed as the postgres OS user.
LOCAL_PSQL="${PGSQL_BIN_PATH}/psql -p ${PRIMARY_PORT}"

role_check=$(su - "${POSTGRES_USER}" -c "${LOCAL_PSQL} -tAc \"SELECT 1 FROM pg_roles WHERE rolname='${REPMGR_USER}'\"") \
    || log_error "无法查询 pg_roles,请检查本地 PostgreSQL 连接"
# grep -x tolerates extra lines a login shell might print around the result.
if grep -qx 1 <<< "${role_check}"; then
    log_warn "用户已存在,跳过创建"
else
    su - "${POSTGRES_USER}" -c "${LOCAL_PSQL} -c \"CREATE USER ${REPMGR_USER} WITH SUPERUSER PASSWORD '${REPMGR_PASSWORD}';\"" \
        || log_error "创建用户 ${REPMGR_USER} 失败"
fi

db_check=$(su - "${POSTGRES_USER}" -c "${LOCAL_PSQL} -tAc \"SELECT 1 FROM pg_database WHERE datname='${REPMGR_DB}'\"") \
    || log_error "无法查询 pg_database,请检查本地 PostgreSQL 连接"
if grep -qx 1 <<< "${db_check}"; then
    log_warn "数据库已存在,跳过创建"
else
    su - "${POSTGRES_USER}" -c "${PGSQL_BIN_PATH}/createdb -p ${PRIMARY_PORT} -O ${REPMGR_USER} ${REPMGR_DB}" \
        || log_error "创建数据库 ${REPMGR_DB} 失败"
fi

log_info "[3/6] 修改 postgresql.conf..."
# [Improvement 5] Back up postgresql.conf once, before the first modification;
# later runs keep the original backup untouched.
if [ ! -f "${PG_CONF}.bak.repmgr" ]; then
    cp "${PG_CONF}" "${PG_CONF}.bak.repmgr"
    log_info "已备份 postgresql.conf 为 ${PG_CONF}.bak.repmgr"
fi

# set_conf KEY VALUE
#   Set "KEY = VALUE" in the file named by the global PG_CONF.
#   - A line that already assigns KEY (active, or commented out with leading
#     '#') is rewritten in place; otherwise the setting is appended.
#   - KEY must be followed by optional blanks and '=', so a longer parameter
#     sharing the same prefix (e.g. wal_level vs. wal_level_x) is left alone.
#   - VALUE is escaped for the sed replacement, so it may contain '/', '&',
#     '|' or '\' (e.g. an archive_command).
# Arguments: $1 - parameter name (plain identifier), $2 - value as it should
#            appear in the file, including any quotes.
set_conf() {
    local key=$1
    local value=$2
    local line_re="^#*[[:space:]]*${key}[[:space:]]*="
    local sed_value

    # Escape the characters that are special on the right-hand side of s|..|..|
    sed_value=$(printf '%s' "${value}" | sed -e 's/[\\&|]/\\&/g')

    if grep -q "${line_re}" "${PG_CONF}"; then
        sed -i "s|${line_re}.*|${key} = ${sed_value}|" "${PG_CONF}"
    else
        printf '%s\n' "${key} = ${value}" >> "${PG_CONF}"
    fi
}
# Replication-related parameters written to postgresql.conf. Note that the
# shared_preload_libraries value is set to exactly 'repmgr' (whatever value the
# matched line held before is replaced).
set_conf "wal_level" "logical"
set_conf "max_wal_senders" "10"
set_conf "max_replication_slots" "10"
set_conf "wal_keep_size" "1024"
set_conf "shared_preload_libraries" "'repmgr'"
set_conf "wal_log_hints" "on"

log_info "[4/6] 配置 pg_hba.conf..."
# The marker line makes this step idempotent: the entries are appended only if
# the marker is not yet present in pg_hba.conf.
HBA_MARKER="# repmgr cluster auto config"
if ! grep -qF "${HBA_MARKER}" "${PG_HBA}"; then
    # [Improvement 6] Back up pg_hba.conf once before modifying it
    if [ ! -f "${PG_HBA}.bak.repmgr" ]; then
        cp ${PG_HBA} ${PG_HBA}.bak.repmgr
        log_info "已备份 pg_hba.conf 为 ${PG_HBA}.bak.repmgr"
    fi
    
    # For every cluster node allow the repmgr user to open replication
    # connections and to connect to the repmgr database, using scram-sha-256.
    echo -e "\n${HBA_MARKER}" >> "${PG_HBA}"
    for node_ip in "${CLUSTER_NODES[@]}"; do
        echo "host    replication     ${REPMGR_USER}    ${node_ip}/32        scram-sha-256" >> "${PG_HBA}"
        echo "host    ${REPMGR_DB}    ${REPMGR_USER}    ${node_ip}/32        scram-sha-256" >> "${PG_HBA}"
    done
fi
# [Improvement 7] Set the owner of pg_hba.conf explicitly
chown ${POSTGRES_USER}: ${PG_HBA}

# ================= Register the primary node =================
log_info "[5/6] 生成配置并注册主节点..."
# Generate repmgr.conf for this node. The heredoc delimiter is unquoted, so the
# shell variables are expanded now; %n in follow_command is left as-is in the file.
cat > ${REPMGR_CONF} << EOF
node_id=${PRIMARY_NODE_ID}
node_name='${PRIMARY_NODE_NAME}'
conninfo='host=${PRIMARY_PHYSICAL_IP} port=${PRIMARY_PORT} user=${REPMGR_USER} dbname=${REPMGR_DB} password=${REPMGR_PASSWORD} connect_timeout=2'
pg_bindir='${PGSQL_BIN_PATH}'
data_directory='${PG_DATA_DIR}'
use_replication_slots=yes
failover=automatic
promote_command='${REPMGR_PATH} standby promote -f ${REPMGR_CONF} --log-to-file'
follow_command='${REPMGR_PATH} standby follow -f ${REPMGR_CONF} --log-to-file --upstream-node-id=%n'
log_file='${REPMGR_LOG_DIR}/repmgrd.log'
log_level=INFO
EOF
# The file contains the password, so it is owned by postgres and mode 600.
chown ${POSTGRES_USER}: ${REPMGR_CONF}
chmod 600 ${REPMGR_CONF}

# Restart so that the postgresql.conf changes made earlier are applied.
log_info "正在重启 PostgreSQL 以应用配置..."
systemctl restart ${PG_SERVICE_NAME}

# Probe with pg_isready up to 30 times, 2 seconds apart; the 30th failed probe
# aborts the script through log_error.
log_info "等待数据库启动..."
for i in {1..30}; do
    su - "${POSTGRES_USER}" -c "${PGSQL_BIN_PATH}/pg_isready -q" && break
    [ ${i} -eq 30 ] && log_error "数据库启动超时"
    sleep 2
done

log_info "正在注册主节点..."
su - ${POSTGRES_USER} -c "${REPMGR_PATH} -f ${REPMGR_CONF} primary register"

# ================= Final verification =================
log_info "[6/6] 部署完成,验证状态..."
log_info "=========================================="
su - ${POSTGRES_USER} -c "${REPMGR_PATH} -f ${REPMGR_CONF} cluster show"
log_info "=========================================="
log_info ""
log_info "【配置说明】"
log_info "✓ 主库节点名: ${PRIMARY_NODE_NAME} (ID: ${PRIMARY_NODE_ID})"
log_info "✓ 备份目录: ${PG_BACKUP_DIR} (已移出 PGDATA)"
log_info "✓ 日志文件: ${REPMGR_LOG_DIR}/repmgrd.log"
log_info ""

4.2 从库配置

#!/bin/bash
set -euo pipefail

# ================= Configuration (path layout) =================
REPMGR_VERSION="5.5.0"
# [Key change] repmgr is installed straight into the PostgreSQL directory
PGSQL_BIN_PATH="/apps/pgsql/bin"
PG_DATA_DIR="/data/pgsql"
# The backup directory lives outside PGDATA, matching the primary and witness
# scripts. This script creates the directory before cloning, so placing it
# under PGDATA would put a foreign directory into the data directory that was
# just verified to be empty.
PG_BACKUP_DIR="/data/backup"

# repmgr's configuration and logs stay in their own directories for easier management
REPMGR_ETC_DIR="/apps/repmgr/etc"
REPMGR_LOG_DIR="/apps/repmgr/log"

# Node identities
PRIMARY_PHYSICAL_IP="10.0.0.101"
STANDBY_PHYSICAL_IP="10.0.0.102"
STANDBY_NODE_NAME="db2"
STANDBY_NODE_ID=2
WITNESS_IP="10.0.0.103"

# Credentials and connection settings for the repmgr user
REPMGR_PASSWORD="postgres123"
REPMGR_USER="repmgr"
REPMGR_DB="repmgr"
PRIMARY_PORT=5432

PG_SERVICE_NAME="postgresql"
POSTGRES_USER="postgres"

# ================= Derived paths and tools =================
PG_CONF="${PG_DATA_DIR}/postgresql.conf"
PG_HBA="${PG_DATA_DIR}/pg_hba.conf"
REPMGR_CONF="${REPMGR_ETC_DIR}/repmgr.conf"
PG_CONFIG="${PGSQL_BIN_PATH}/pg_config"
# [Key change] repmgr now lives in PostgreSQL's bin directory
REPMGR_PATH="${PGSQL_BIN_PATH}/repmgr"
PATH="${PGSQL_BIN_PATH}:${PATH}"

# Colour escape sequences used by the log helpers
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m'

# Shared formatter: _log_line COLOUR TAG MESSAGE prints "<COLOUR>[TAG]<reset> MESSAGE".
_log_line() {
    echo -e "${1}[${2}]${NC} ${3}"
}

# Informational message with a green [INFO] tag.
log_info() { _log_line "${GREEN}" "INFO" "$1"; }
# Warning message with a yellow [WARN] tag.
log_warn() { _log_line "${YELLOW}" "WARN" "$1"; }
# Error message with a red [ERROR] tag; terminates the script with status 1.
log_error() {
    _log_line "${RED}" "ERROR" "$1"
    exit 1
}

# ================= 0. Strict pre-flight checks =================
log_info "=========================================="
log_info "  repmgr 从库一键部署 (PG目录集成版)"
log_info "=========================================="

[ "$EUID" -ne 0 ] && log_error "请用 root 执行"
[ ! -f "${PG_CONFIG}" ] && log_error "未找到 pg_config: ${PG_CONFIG}"
[ ! -d "${PGSQL_BIN_PATH}" ] && log_error "PostgreSQL 二进制目录不存在: ${PGSQL_BIN_PATH}"

# A service that is not running (or not installed) is not an error here.
log_info "正在停止从库 PostgreSQL 服务..."
systemctl stop "${PG_SERVICE_NAME}" 2>/dev/null || true

# The clone target must be empty. The script never deletes data on its own;
# it prints the cleanup command and stops.
if [ -d "${PG_DATA_DIR}" ]; then
    if [ -n "$(ls -A "${PG_DATA_DIR}" 2>/dev/null)" ]; then
        # log_error exits immediately, so the cleanup hint has to be printed
        # before it; otherwise the operator never sees it.
        log_warn "为了安全,请手动清理: rm -rf ${PG_DATA_DIR}/*"
        log_error "数据目录 ${PG_DATA_DIR} 不为空!"
    fi
else
    mkdir -p "${PG_DATA_DIR}"
    chown -R "${POSTGRES_USER}:" "${PG_DATA_DIR}"
fi

# Prepare repmgr's config and log directories plus the backup directory
mkdir -p "${REPMGR_ETC_DIR}" "${REPMGR_LOG_DIR}" "${PG_BACKUP_DIR}"
chown -R "${POSTGRES_USER}:" "${REPMGR_ETC_DIR}" "${REPMGR_LOG_DIR}" "${PG_BACKUP_DIR}"
chmod 700 "${PG_DATA_DIR}" "${REPMGR_LOG_DIR}"

# ================= 1. Install build dependencies =================
log_info "[1/10] 安装依赖..."
# apt-get is used instead of apt: apt warns that its command-line interface is
# not stable and not meant for scripts.
apt-get update -qq && apt-get install -y -qq flex bison make gcc libpq-dev libcurl4-openssl-dev libjson-c-dev

# ================= 2. [Key change] Build repmgr into the PostgreSQL tree =================
log_info "[2/10] 编译安装 repmgr ${REPMGR_VERSION} (集成至 PG 目录)..."
PKG_NAME="repmgr-${REPMGR_VERSION}.tar.gz"
DOWNLOAD_URL="https://githubfast.com/EnterpriseDB/repmgr/releases/download/v${REPMGR_VERSION}/${PKG_NAME}"

cd /tmp

# Download to a ".part" file and rename only on success. "wget -O" creates the
# output file even when the transfer fails, so writing straight to ${PKG_NAME}
# would leave an empty/partial tarball that every later run would reuse.
# "-s" additionally rejects a zero-byte leftover from such an earlier failure.
if [ ! -s "${PKG_NAME}" ]; then
    log_info "正在下载 ${PKG_NAME} ..."
    if ! wget -q --timeout=30 --tries=3 "${DOWNLOAD_URL}" -O "${PKG_NAME}.part"; then
        rm -f "${PKG_NAME}.part"
        log_error "下载失败,请检查网络"
    fi
    mv -f "${PKG_NAME}.part" "${PKG_NAME}"
fi

log_info "正在解压源码..."
rm -rf "repmgr-${REPMGR_VERSION}"
tar -zxf "${PKG_NAME}" || log_error "解压失败"
cd "repmgr-${REPMGR_VERSION}" || log_error "无法进入源码目录"

# [Key] No --prefix is given; exporting PG_CONFIG is how this script points the
# build at the PostgreSQL installation.
export PG_CONFIG="${PG_CONFIG}"

log_info "正在配置 (configure)..."
# Note: no --prefix here; the result is verified against ${REPMGR_PATH} below.
./configure || log_error "Configure 失败"

log_info "正在编译 (make)..."
make -j"$(nproc)" || log_error "编译失败"

log_info "正在安装 (make install)..."
make install || log_error "安装失败"

# [Key] Verify that the binary really landed in the expected bin directory
if [ ! -f "${REPMGR_PATH}" ]; then
    log_error "repmgr 未安装到预期路径: ${REPMGR_PATH}"
fi
log_info "✓ repmgr 安装完成: ${REPMGR_PATH}"

# ================= 3. Environment variables (simplified) =================
log_info "[3/10] 配置环境变量..."
# repmgr sits in PostgreSQL's bin directory, so having that directory on PATH
# is normally all that is needed. The marker line keeps the append idempotent.
BASHRC_MARKER="# repmgr & PostgreSQL Environment"
if ! grep -qF "${BASHRC_MARKER}" /home/${POSTGRES_USER}/.bashrc; then
    # \$PATH is escaped so it is written literally and expanded at login time;
    # the other variables are expanded now.
    cat >> /home/${POSTGRES_USER}/.bashrc << EOF

${BASHRC_MARKER}
export PATH=${PGSQL_BIN_PATH}:\$PATH
export PGDATA=${PG_DATA_DIR}
EOF
fi

# ================= 4. Write .pgpass =================
# Overwrites the file with two entries for the repmgr user: one for the repmgr
# database and one for the "replication" pseudo-database, on any host and port.
log_info "[4/10] 配置 .pgpass..."
cat > /home/${POSTGRES_USER}/.pgpass << EOF
*:*:${REPMGR_DB}:${REPMGR_USER}:${REPMGR_PASSWORD}
*:*:replication:${REPMGR_USER}:${REPMGR_PASSWORD}
EOF
chown ${POSTGRES_USER}: /home/${POSTGRES_USER}/.pgpass
chmod 0600 /home/${POSTGRES_USER}/.pgpass

# ================= 5. Verify connectivity to the primary =================
log_info "[5/10] 校验主库连通性..."
# Run "SELECT 1" against the primary as the repmgr user; no password is given
# on the command line.
if ! su - ${POSTGRES_USER} -c "${PGSQL_BIN_PATH}/psql -h ${PRIMARY_PHYSICAL_IP} -p ${PRIMARY_PORT} -U ${REPMGR_USER} -d ${REPMGR_DB} -c 'SELECT 1' -t -q"; then
    log_error "无法连接主库 ${PRIMARY_PHYSICAL_IP}!"
    # Not reached: log_error already exits with status 1.
    exit 1
fi
log_info "✓ 主库连接正常"

# ================= 6. Generate repmgr.conf =================
log_info "[6/10] 生成 repmgr.conf..."
# The heredoc delimiter is unquoted: shell variables are expanded now, and the
# '#' lines inside the heredoc are written into repmgr.conf as comments.
cat > ${REPMGR_CONF} << EOF
node_id=${STANDBY_NODE_ID}
node_name='${STANDBY_NODE_NAME}'
conninfo='host=${STANDBY_PHYSICAL_IP} port=${PRIMARY_PORT} user=${REPMGR_USER} dbname=${REPMGR_DB} password=${REPMGR_PASSWORD} connect_timeout=2'
# 【优化】pg_bindir 现在也统一指向 PG 目录
pg_bindir='${PGSQL_BIN_PATH}'
data_directory='${PG_DATA_DIR}'
use_replication_slots=yes
failover=automatic
# 【优化】命令路径直接使用 PG bin 目录
promote_command='${REPMGR_PATH} standby promote -f ${REPMGR_CONF} --log-to-file'
follow_command='${REPMGR_PATH} standby follow -f ${REPMGR_CONF} --log-to-file --upstream-node-id=%n'
log_file='${REPMGR_LOG_DIR}/repmgrd.log'
log_level=INFO
EOF
# The file contains the password, so it is owned by postgres and mode 600.
chown ${POSTGRES_USER}: ${REPMGR_CONF}
chmod 600 ${REPMGR_CONF}

# ================= 7. Clone the primary's data =================
log_info "[7/10] 开始克隆主库数据(这可能需要几分钟)..."
# The clone runs as the postgres user, so the data directory must belong to it.
chown -R ${POSTGRES_USER}: ${PG_DATA_DIR}

# The command is assembled as one string because it is handed to "su -c";
# the backslash-newlines inside the double quotes are line continuations.
CLONE_CMD="${REPMGR_PATH} -f ${REPMGR_CONF} standby clone \
    -h ${PRIMARY_PHYSICAL_IP} -p ${PRIMARY_PORT} -U ${REPMGR_USER} -d ${REPMGR_DB} \
    --fast-checkpoint --force"

log_info "执行克隆命令: ${CLONE_CMD}"
if ! su - "${POSTGRES_USER}" -c "${CLONE_CMD}"; then
    log_error "主库克隆失败!请检查上述报错"
    # Not reached: log_error already exits with status 1.
    exit 1
fi

# Create standby.signal (owned by postgres) if the clone did not leave one behind.
[ -f "${PG_DATA_DIR}/standby.signal" ] || (touch "${PG_DATA_DIR}/standby.signal" && chown ${POSTGRES_USER}: "${PG_DATA_DIR}/standby.signal")
log_info "✓ 主库数据克隆完成"

# ================= 8. Check shared_preload_libraries =================
# The step used to only print its title. It now inspects the cloned
# postgresql.conf and warns when no active shared_preload_libraries line
# mentions repmgr. This is a warning only; startup and registration continue.
log_info "[8/10] 检查 shared_preload_libraries..."
if grep -Eq "^[[:space:]]*shared_preload_libraries[[:space:]]*=.*repmgr" "${PG_CONF}" 2>/dev/null; then
    log_info "✓ shared_preload_libraries 已包含 repmgr"
else
    log_warn "${PG_CONF} 中的 shared_preload_libraries 未包含 repmgr,请在启动 repmgrd 前修正"
fi

# ================= 9. Start the standby & register it =================
log_info "[9/10] 启动从库并注册节点..."
# Remove a postmaster.pid possibly left in the data directory; the service was
# stopped at the beginning of this script.
rm -f "${PG_DATA_DIR}/postmaster.pid"

if ! systemctl start "${PG_SERVICE_NAME}"; then
    log_error "从库启动失败!排查命令: journalctl -n 50 -u ${PG_SERVICE_NAME}"
fi

# Probe with pg_isready up to 60 times, 2 seconds apart; the 60th failed probe
# aborts the script through log_error.
log_info "等待从库启动完成..."
for i in {1..60}; do
    if su - "${POSTGRES_USER}" -c "${PGSQL_BIN_PATH}/pg_isready -q"; then
        log_info "✓ PostgreSQL 从库启动成功"
        break
    fi
    [ "${i}" -eq 60 ] && log_error "从库启动超时,请检查 PostgreSQL 日志"
    sleep 2
done

log_info "正在注册从库节点到集群..."
su - "${POSTGRES_USER}" -c "${REPMGR_PATH} -f ${REPMGR_CONF} standby register \
    -h ${PRIMARY_PHYSICAL_IP} -p ${PRIMARY_PORT} -U ${REPMGR_USER} -d ${REPMGR_DB} \
    --upstream-node-id=1 --force"

# ================= 10. Final verification =================
log_info "[10/10] 部署完成,验证状态..."
log_info "=========================================="
su - "${POSTGRES_USER}" -c "${REPMGR_PATH} -f ${REPMGR_CONF} cluster show"
log_info "=========================================="
log_info ""
log_info "【配置说明】"
log_info "✓ 从库节点名: ${STANDBY_NODE_NAME} (ID: ${STANDBY_NODE_ID})"
log_info "✓ 连接主库: ${PRIMARY_PHYSICAL_IP}"
log_info "✓ 日志文件: ${REPMGR_LOG_DIR}/repmgrd.log"
log_info ""

4.3选举节点配置

#!/bin/bash
set -euo pipefail

# ================= Configuration =================
PGSQL_BIN_PATH="/apps/pgsql/bin"
PG_DATA_DIR="/data/pgsql"
PG_BACKUP_DIR="/data/backup"       # [Improvement 1] Same backup directory convention as the other nodes
REPMGR_ETC_DIR="/apps/repmgr/etc"
REPMGR_LOG_DIR="/apps/repmgr/log"

# Node identities
PRIMARY_PHYSICAL_IP="10.0.0.101"
WITNESS_PHYSICAL_IP="10.0.0.103"
WITNESS_NODE_NAME="witness"
WITNESS_NODE_ID=3

# Credentials and connection settings for the repmgr user
REPMGR_PASSWORD="postgres123"
REPMGR_USER="repmgr"
REPMGR_DB="repmgr"
PRIMARY_PORT=5432

PG_SERVICE_NAME="postgresql"
POSTGRES_USER="postgres"
# =========================================

# Paths derived from the settings above
PG_CONF="${PG_DATA_DIR}/postgresql.conf"
PG_HBA="${PG_DATA_DIR}/pg_hba.conf"
REPMGR_CONF="${REPMGR_ETC_DIR}/repmgr.conf"
REPMGR_PATH="${PGSQL_BIN_PATH}/repmgr"

# Colour escape sequences used by the log helpers
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m'

# Print an informational line tagged [INFO] in green.
log_info() {
    local text=$1
    echo -e "${GREEN}[INFO]${NC} ${text}"
}

# Print a warning line tagged [WARN] in yellow.
log_warn() {
    local text=$1
    echo -e "${YELLOW}[WARN]${NC} ${text}"
}

# Print an error line tagged [ERROR] in red and exit with status 1.
log_error() {
    local text=$1
    echo -e "${RED}[ERROR]${NC} ${text}"
    exit 1
}

# ================= 0. Pre-flight checks =================
log_info "=========================================="
log_info "  repmgr 见证节点部署 (${WITNESS_NODE_NAME})"
log_info "=========================================="

[ "$EUID" -ne 0 ] && log_error "请使用 root 用户执行"
[ ! -f "${PGSQL_BIN_PATH}/pg_config" ] && log_error "未找到 pg_config"
# [Improvement 2] The repmgr binary is NOT required at this point: step 1 of
# this script compiles and installs it. A fatal check here would stop the
# script on a fresh node before the build could ever run.
if [ ! -f "${REPMGR_PATH}" ]; then
    log_info "尚未安装 repmgr (${REPMGR_PATH}),将在步骤 1 中编译安装"
fi

# A service that is not running (or not installed) is not an error here.
log_info "正在停止本地 PostgreSQL 服务..."
systemctl stop "${PG_SERVICE_NAME}" 2>/dev/null || true

mkdir -p "${PG_BACKUP_DIR}" "${REPMGR_ETC_DIR}" "${REPMGR_LOG_DIR}"
chown -R "${POSTGRES_USER}:" "${PG_BACKUP_DIR}" "${REPMGR_ETC_DIR}" "${REPMGR_LOG_DIR}"
chmod 700 "${PG_BACKUP_DIR}" "${REPMGR_LOG_DIR}"

# [Improvement 3] Write .pgpass up front (no dependency on step order, and
# handy for later administration). The file is overwritten with one entry that
# covers any host, port and database for the repmgr user.
log_info "配置 .pgpass..."
cat > "/home/${POSTGRES_USER}/.pgpass" << EOF
*:*:*:${REPMGR_USER}:${REPMGR_PASSWORD}
EOF
chown "${POSTGRES_USER}:" "/home/${POSTGRES_USER}/.pgpass"
chmod 0600 "/home/${POSTGRES_USER}/.pgpass"

# ================= 1. Build and install repmgr =================
log_info "[1/6] 编译安装 repmgr..."
REPMGR_VERSION="5.5.0"
PKG_NAME="repmgr-${REPMGR_VERSION}.tar.gz"
DOWNLOAD_URL="https://githubfast.com/EnterpriseDB/repmgr/releases/download/v${REPMGR_VERSION}/${PKG_NAME}"

cd /tmp
# Download to a ".part" file and rename only on success. "wget -O" creates the
# output file even when the transfer fails, so writing straight to ${PKG_NAME}
# would leave an empty/partial tarball that every later run would reuse.
# "-s" additionally rejects a zero-byte leftover from such an earlier failure.
if [ ! -s "${PKG_NAME}" ]; then
    if ! wget -q --timeout=30 --tries=3 "${DOWNLOAD_URL}" -O "${PKG_NAME}.part"; then
        rm -f "${PKG_NAME}.part"
        log_error "下载 ${PKG_NAME} 失败,请检查网络"
    fi
    mv -f "${PKG_NAME}.part" "${PKG_NAME}"
fi
rm -rf "repmgr-${REPMGR_VERSION}"
tar -zxf "${PKG_NAME}" || log_error "解压 ${PKG_NAME} 失败"
cd "repmgr-${REPMGR_VERSION}"

# Exporting PG_CONFIG is how this script points the build at the PostgreSQL
# installation. Only stdout of the build is silenced; errors stay visible and
# abort the script through "set -e".
export PG_CONFIG="${PGSQL_BIN_PATH}/pg_config"
./configure > /dev/null
make -s -j"$(nproc)" > /dev/null
make -s install > /dev/null

# Confirm the binary landed where the rest of this script expects it.
[ -f "${REPMGR_PATH}" ] || log_error "repmgr 未安装到预期路径: ${REPMGR_PATH}"
log_info "✓ repmgr 安装完成"

# ================= 2. Initialise the local database =================
log_info "[2/6] 初始化见证节点本地数据库..."
# A missing "base" sub-directory is taken to mean "no cluster initialised yet".
if [ ! -d "${PG_DATA_DIR}/base" ]; then
    log_info "数据目录为空,正在初始化新集群..."
    mkdir -p "${PG_DATA_DIR}"
    chown -R "${POSTGRES_USER}:" "${PG_DATA_DIR}"
    chmod 700 "${PG_DATA_DIR}"

    # Superuser password file for initdb. mktemp creates it with an
    # unpredictable name and owner-only permissions, and the password is
    # written only after ownership and mode are final. The EXIT trap removes
    # the file even when initdb fails and "set -e" aborts the script.
    PWFILE=$(mktemp /tmp/initdb_pwfile.XXXXXX)
    trap 'rm -f "${PWFILE}"' EXIT
    chown "${POSTGRES_USER}:" "${PWFILE}"
    chmod 600 "${PWFILE}"
    printf '%s\n' "${REPMGR_PASSWORD}" > "${PWFILE}"

    su - "${POSTGRES_USER}" -c "${PGSQL_BIN_PATH}/initdb -D ${PG_DATA_DIR} -A scram-sha-256 --pwfile=${PWFILE}"

    rm -f "${PWFILE}"
    trap - EXIT

    # [Improvement 4] Back up postgresql.conf before changing it, then make the
    # server listen on localhost plus the witness address and on the shared port.
    cp "${PG_CONF}" "${PG_CONF}.bak.repmgr"
    sed -i "s/^#*listen_addresses = .*/listen_addresses = 'localhost, ${WITNESS_PHYSICAL_IP}'/" "${PG_CONF}"
    sed -i "s/^#*port = .*/port = ${PRIMARY_PORT}/" "${PG_CONF}"
else
    log_info "数据目录已存在,跳过初始化步骤"
fi

# ================= 3. Start the local PostgreSQL =================
log_info "[3/6] 启动见证节点本地 PostgreSQL..."
# Drop a postmaster.pid possibly left behind, then start through systemd.
rm -f "${PG_DATA_DIR}/postmaster.pid"
systemctl start "${PG_SERVICE_NAME}"

log_info "等待数据库启动..."
# Probe once per second; the 30th failed probe aborts through log_error.
failed_probes=0
until su - "${POSTGRES_USER}" -c "${PGSQL_BIN_PATH}/pg_isready -q"; do
    failed_probes=$((failed_probes + 1))
    if [ "${failed_probes}" -ge 30 ]; then
        log_error "启动超时"
    fi
    sleep 1
done
log_info "✓ 本地 PostgreSQL 启动成功"

# ================= 4. Local permissions and users =================
log_info "[4/6] 配置本地 repmgr 用户与 pg_hba..."

# 1. Edit pg_hba.conf first (root edits the file; no database connection involved)
# [Improvement 5] Back up pg_hba.conf once before modifying it
if [ ! -f "${PG_HBA}.bak.repmgr" ]; then
    cp ${PG_HBA} ${PG_HBA}.bak.repmgr
fi

# The marker line makes the append idempotent. For each listed node the repmgr
# user may open replication connections and connect to the repmgr database.
HBA_MARKER="# repmgr cluster auto config"
if ! grep -qF "${HBA_MARKER}" "${PG_HBA}"; then
    echo -e "\n${HBA_MARKER}" >> "${PG_HBA}"
    for node_ip in "10.0.0.101" "10.0.0.102" "10.0.0.103"; do
        echo "host    replication     ${REPMGR_USER}    ${node_ip}/32        scram-sha-256" >> "${PG_HBA}"
        echo "host    ${REPMGR_DB}    ${REPMGR_USER}    ${node_ip}/32        scram-sha-256" >> "${PG_HBA}"
    done
fi
chown ${POSTGRES_USER}: ${PG_HBA}

# 2. [Core] All database work happens in one postgres login shell fed through
#    a heredoc, authenticating via PGPASSWORD. The delimiter is unquoted, so the
#    variables are expanded by this script before the text reaches that shell.
#    The role and the database are created only if missing, the role's password
#    is (re)set every time, and finally the configuration is reloaded.
su - ${POSTGRES_USER} <<EOF
set -e
export PGPASSWORD="${REPMGR_PASSWORD}"

${PGSQL_BIN_PATH}/psql -tAc "SELECT 1 FROM pg_roles WHERE rolname='${REPMGR_USER}'" | grep -q 1 || \
    ${PGSQL_BIN_PATH}/psql -c "CREATE USER ${REPMGR_USER} WITH SUPERUSER LOGIN;"
${PGSQL_BIN_PATH}/psql -c "ALTER USER ${REPMGR_USER} WITH PASSWORD '${REPMGR_PASSWORD}';"
${PGSQL_BIN_PATH}/psql -tAc "SELECT 1 FROM pg_database WHERE datname='${REPMGR_DB}'" | grep -q 1 || \
    ${PGSQL_BIN_PATH}/psql -c "CREATE DATABASE ${REPMGR_DB} OWNER ${REPMGR_USER};"

${PGSQL_BIN_PATH}/psql -c 'SELECT pg_reload_conf();'
EOF

# ================= 5. Connect to the primary and register =================
log_info "[5/6] 校验主库连通性..."
# Run "SELECT 1" against the primary as the repmgr user; abort if it fails.
if ! su - ${POSTGRES_USER} -c "${PGSQL_BIN_PATH}/psql -h ${PRIMARY_PHYSICAL_IP} -p ${PRIMARY_PORT} -U ${REPMGR_USER} -d ${REPMGR_DB} -c 'SELECT 1' -t -q"; then
    log_error "无法连接主库"
fi
log_info "✓ 主库连接正常"

log_info "[6/6] 生成配置并注册见证节点..."
# Generate repmgr.conf for the witness (node_type='witness'); conninfo points
# at the witness's own address. Variables are expanded when the file is written.
cat > ${REPMGR_CONF} << EOF
node_id=${WITNESS_NODE_ID}
node_name='${WITNESS_NODE_NAME}'
conninfo='host=${WITNESS_PHYSICAL_IP} port=${PRIMARY_PORT} user=${REPMGR_USER} dbname=${REPMGR_DB} password=${REPMGR_PASSWORD} connect_timeout=2'
pg_bindir='${PGSQL_BIN_PATH}'
data_directory='${PG_DATA_DIR}'
node_type='witness'
log_file='${REPMGR_LOG_DIR}/repmgrd.log'
log_level=INFO
EOF
# The file contains the password, so it is owned by postgres and mode 600.
chown ${POSTGRES_USER}: ${REPMGR_CONF}
chmod 600 ${REPMGR_CONF}

# Register against the primary (connection options name the primary).
log_info "正在向主库注册见证节点..."
su - ${POSTGRES_USER} -c "${REPMGR_PATH} -f ${REPMGR_CONF} witness register -h ${PRIMARY_PHYSICAL_IP} -p ${PRIMARY_PORT} -U ${REPMGR_USER} -d ${REPMGR_DB} --force"

# ================= Done =================
log_info "=========================================="
log_info "  见证节点部署完成!"
log_info "=========================================="
su - ${POSTGRES_USER} -c "${REPMGR_PATH} -f ${REPMGR_CONF} cluster show"

4.4部署验证

su - postgres 
postgres@db3:~$ /apps/pgsql/bin/repmgr cluster show -f /apps/repmgr/etc/repmgr.conf
 ID | Name | Role    | Status    | Upstream | Location | Priority | Timeline | Connection string                                                               
----+------+---------+-----------+----------+----------+----------+----------+----------------------------------------------------------------------------------
 1  | db1  | primary | * running |          | default  | 100      | 1        | host=10.0.0.101 user=repmgr dbname=repmgr password=postgres123 connect_timeout=2
 2  | db2  | standby |   running | db1      | default  | 100      | 1        | host=10.0.0.102 user=repmgr dbname=repmgr password=postgres123 connect_timeout=2
 3  | db3  | witness | * running | db1      | default  | 0        | n/a      | host=10.0.0.103 user=repmgr dbname=repmgr password=postgres123 connect_timeout=2

5. repmgrd 守护进程一键配置脚本(db1/2/3分别执行)

#!/bin/bash
set -euo pipefail

# ==========================================
# [Settings the operator must review]
# ==========================================
INSTALL_PREFIX="/apps/repmgr"
PGSQL_BIN_PATH="/apps/pgsql/bin"
PGDATA="/data/pgsql"
POSTGRES_USER="postgres"
PG_SERVICE_NAME="postgresql"
PG_BACKUP_DIR="/data/backup"

# [Key fix] repmgr connection settings that were previously missing
REPMGR_USER="repmgr"
REPMGR_PASSWORD="postgres123" # Overridden later by the password found in repmgr.conf, if any
REPMGR_DB="repmgr"
# ==========================================
# [Do not modify anything below]
# ==========================================

# Derived paths
REPMGR_BIN="${PGSQL_BIN_PATH}/repmgr"
REPMGRD_BIN="${PGSQL_BIN_PATH}/repmgrd"
REPMGR_CONF_FILE="${INSTALL_PREFIX}/etc/repmgr.conf"
REPMGR_LOG_DIR="${INSTALL_PREFIX}/log"
REPMGRD_LOG="${REPMGR_LOG_DIR}/repmgrd.log"
REPMGRD_PID="${REPMGR_LOG_DIR}/repmgrd.pid"
SERVICE_FILE="/etc/systemd/system/repmgrd.service"
PG_CONF="${PGDATA}/postgresql.conf"
PSQL_PATH="${PGSQL_BIN_PATH}/psql"
PG_ISREADY="${PGSQL_BIN_PATH}/pg_isready"

# Colour definitions
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m'

# log_info MESSAGE: green [INFO] line.
log_info() {
    printf '%b\n' "${GREEN}[INFO]${NC} $1"
}

# log_warn MESSAGE: yellow [WARN] line.
log_warn() {
    printf '%b\n' "${YELLOW}[WARN]${NC} $1"
}

# log_error MESSAGE: red [ERROR] line, then exit with status 1.
log_error() {
    printf '%b\n' "${RED}[ERROR]${NC} $1"
    exit 1
}

# ==========================================
# Core helper: wait until PostgreSQL accepts connections
# ==========================================
# wait_pg_ready [TIMEOUT]
#   Probes with pg_isready (as the postgres OS user) once per second, at most
#   TIMEOUT times (default 30). Returns 0 on the first successful probe;
#   otherwise ends through log_error, which exits the script.
wait_pg_ready() {
    local timeout=${1:-30}
    local tick=1

    log_info "等待 PostgreSQL 服务就绪,超时时间 ${timeout} 秒..."
    while (( tick <= timeout )); do
        if su - "${POSTGRES_USER}" -c "${PG_ISREADY} -q" 2>/dev/null; then
            log_info "PostgreSQL 服务就绪,耗时 ${tick} 秒"
            return 0
        fi
        log_warn "等待中... (${tick}/${timeout})"
        sleep 1
        tick=$((tick + 1))
    done
    log_error "PostgreSQL 启动超时!"
}

# ==========================================
# Core helper: check and enforce shared_preload_libraries
# ==========================================

# _preload_has_repmgr LIST
#   Succeeds when the comma-separated LIST contains the entry "repmgr".
#   Whitespace is removed first, so "a, repmgr" and "a,repmgr" both match.
_preload_has_repmgr() {
    local compact=${1//[[:space:]]/}
    [[ ",${compact}," == *",repmgr,"* ]]
}

# _show_preload_libs
#   Prints the running server's shared_preload_libraries value, trimmed by
#   xargs; prints nothing when the query fails. The statement is passed with
#   -c, so no temporary SQL file is needed.
_show_preload_libs() {
    local quoted_pass
    quoted_pass=$(printf '%q' "${REPMGR_PASSWORD:-}")
    su - "${POSTGRES_USER}" -c "PGPASSWORD=${quoted_pass} ${PSQL_PATH} -t -A -q -c 'SHOW shared_preload_libraries;' 2>/dev/null" | xargs || true
}

# ensure_shared_preload_libraries
#   Makes sure the running server has repmgr in shared_preload_libraries.
#   1. If the live value already lists repmgr: return 0, nothing is touched.
#   2. Otherwise, if no active line in postgresql.conf lists repmgr, back the
#      file up under PG_BACKUP_DIR and add repmgr (keeping an existing value).
#   3. Restart PostgreSQL, wait for it, and verify the live value again;
#      a failed verification ends through log_error.
#   Globals read: POSTGRES_USER, REPMGR_PASSWORD, PSQL_PATH, PG_CONF,
#                 PG_BACKUP_DIR, PG_SERVICE_NAME, PGDATA, RED, YELLOW, NC.
ensure_shared_preload_libraries() {
    local current_val file_val backup_file
    # An active (uncommented) assignment of the parameter.
    local active_re='^[[:space:]]*shared_preload_libraries[[:space:]]*='

    log_info "正在检查 shared_preload_libraries 内存状态..."

    current_val=$(_show_preload_libs)
    if _preload_has_repmgr "${current_val}"; then
        log_info "shared_preload_libraries 已在内存中生效: ${current_val}"
        return 0
    fi

    log_warn "内存中未检测到 repmgr,检查配置文件..."

    # Only an active line counts; a commented-out mention of repmgr does not.
    if ! grep -q "${active_re}.*repmgr" "${PG_CONF}"; then
        log_warn "配置文件中未设置,正在修改..."
        mkdir -p "${PG_BACKUP_DIR}"
        backup_file="${PG_BACKUP_DIR}/postgresql.conf.bak.repmgr.$(date +%s)"
        cp "${PG_CONF}" "${backup_file}"
        log_info "配置已备份至: ${backup_file}"

        if grep -q "^#*shared_preload_libraries" "${PG_CONF}"; then
            # Value between the first pair of single quotes on the active line
            # (empty when the parameter is only present commented out).
            file_val=$(grep "${active_re}" "${PG_CONF}" | tail -n 1 | cut -d"'" -f2 || true)
            if [ -z "${file_val:-}" ]; then
                sed -i "s|^#*shared_preload_libraries[[:space:]]*=.*|shared_preload_libraries = 'repmgr'|" "${PG_CONF}"
            else
                # Rewrite the whole active line; '|' as delimiter keeps
                # library paths containing '/' from breaking the expression.
                sed -i "s|${active_re}.*|shared_preload_libraries = '${file_val}, repmgr'|" "${PG_CONF}"
            fi
        else
            echo "shared_preload_libraries = 'repmgr'" >> "${PG_CONF}"
        fi
    fi

    echo ""
    echo -e "${RED}==========================================${NC}"
    echo -e "${RED}  ⚠️  重要:必须重启 PostgreSQL${NC}"
    echo -e "${RED}==========================================${NC}"
    echo -e "${YELLOW}配置已修改,为了确保 repmgrd 能正常启动,${NC}"
    echo -e "${YELLOW}数据库将在 5 秒后自动重启...${NC}"
    echo ""
    sleep 5

    log_info "正在重启 PostgreSQL (服务名: ${PG_SERVICE_NAME})..."
    systemctl stop "${PG_SERVICE_NAME}"
    # Remove postmaster.pid only when no process named exactly "postgres" is
    # running as the postgres user. "ps | grep postgres" would also match
    # unrelated command lines and can misreport under "set -o pipefail".
    if ! pgrep -u "${POSTGRES_USER}" -x postgres > /dev/null; then
        rm -f "${PGDATA}/postmaster.pid"
    fi
    if ! systemctl start "${PG_SERVICE_NAME}"; then
        log_error "PostgreSQL 重启失败!请手动检查后重试"
    fi

    wait_pg_ready 30

    current_val=$(_show_preload_libs)
    if ! _preload_has_repmgr "${current_val}"; then
        log_error "重启后仍未生效!当前值: ${current_val}"
    fi
    log_info "shared_preload_libraries 生效确认完成"
}

# ==========================================
# [-1/8 Environment cleanup]
# ==========================================
# Cleanup runs when the first argument is --force-clean or when the operator
# answers "yes" (any letter case) at the prompt; otherwise it is skipped.
echo -e "${RED}==========================================${NC}"
echo -e "${RED}  ⚠️  警告:repmgrd 服务强制清理模式${NC}"
echo -e "${RED}==========================================${NC}"

PERFORM_CLEAN=false

if [[ $# -gt 0 ]] && [[ "$1" == "--force-clean" ]]; then
    PERFORM_CLEAN=true
else
    # read returns non-zero at end-of-input (no terminal, closed stdin). Under
    # "set -e" that would end the script silently, so a failed read is treated
    # as "no confirmation" instead. -r keeps backslashes literal.
    confirm_clean=""
    read -r -p "确认要清理 repmgrd 环境吗? (输入 YES/yes 确认): " confirm_clean || confirm_clean=""
    if [[ "${confirm_clean}" =~ ^[Yy][Ee][Ss]$ ]]; then
        PERFORM_CLEAN=true
    fi
fi

if [ "${PERFORM_CLEAN}" = true ]; then
    log_warn "正在执行清理..."
    # Every step is best-effort: a service that is absent or not running is fine.
    systemctl stop repmgrd 2>/dev/null || true
    systemctl disable repmgrd 2>/dev/null || true
    pkill -9 repmgrd 2>/dev/null || true
    rm -f "${REPMGRD_PID}" /tmp/repmgrd.pid "${SERVICE_FILE}" /etc/logrotate.d/repmgrd
    if [ -d "${REPMGR_LOG_DIR}" ]; then
        rm -f "${REPMGR_LOG_DIR}"/*.log "${REPMGR_LOG_DIR}"/*.pid
    fi
    systemctl daemon-reload
    log_info "清理完成,5秒后开始部署..."
    sleep 5
else
    log_info "跳过清理,继续部署..."
fi

# ==========================================
# 0. Pre-flight checks
# ==========================================
log_info "=========================================="
log_info "  repmgrd 服务一键部署 (生产稳定版)"
log_info "=========================================="

# Each guard aborts through log_error (which exits) when its requirement is not met.
[ "$EUID" -ne 0 ] && log_error "请用 root 执行"
[ ! -f "${REPMGR_CONF_FILE}" ] && log_error "repmgr.conf 不存在: ${REPMGR_CONF_FILE},请先部署主/从/见证节点"
[ ! -f "${PG_CONF}" ] && log_error "PG 配置文件不存在: ${PG_CONF}"
[ ! -f "${PSQL_PATH}" ] && log_error "psql 不存在: ${PSQL_PATH}"
[ ! -f "${REPMGR_BIN}" ] && log_error "repmgr 二进制不存在: ${REPMGR_BIN}"
[ ! -f "${REPMGRD_BIN}" ] && log_error "repmgrd 二进制不存在: ${REPMGRD_BIN}"

# Take the password from repmgr.conf (overrides the default set above): the
# text after "password=" up to the next quote or space, from lines containing
# "conninfo". "|| true" keeps a missing password from aborting under pipefail.
if grep -q "conninfo" "${REPMGR_CONF_FILE}"; then
    TMP_PASS=$(grep "conninfo" "${REPMGR_CONF_FILE}" | grep -oP "password=\K[^' ]+" || true)
    if [ -n "${TMP_PASS:-}" ]; then
        REPMGR_PASSWORD="${TMP_PASS}"
        log_info "已从 repmgr.conf 自动获取数据库密码"
    fi
fi

# Ensure the backup and log directories exist; the whole install prefix and the
# backup directory are handed to the postgres OS user.
mkdir -p ${PG_BACKUP_DIR} ${REPMGR_LOG_DIR}
chown -R ${POSTGRES_USER}: ${PG_BACKUP_DIR} ${INSTALL_PREFIX}
chmod 700 ${REPMGR_LOG_DIR}

# Write .pgpass up front. The file is overwritten with a single entry covering
# any host, port and database for the repmgr user.
log_info "提前配置 .pgpass (优先执行)..."
cat > /home/${POSTGRES_USER}/.pgpass << EOF
*:*:*:${REPMGR_USER}:${REPMGR_PASSWORD}
EOF
chown ${POSTGRES_USER}: /home/${POSTGRES_USER}/.pgpass
chmod 0600 /home/${POSTGRES_USER}/.pgpass

log_info "预检查通过"

# ==========================================
# 1. Clean up the previous repmgrd instance
# ==========================================
log_info "[1/8] 清理旧环境..."
# Best-effort stop; a repmgrd service that does not exist yet is fine.
systemctl stop repmgrd 2>/dev/null || true
rm -f "${REPMGRD_PID}" /tmp/repmgrd.pid

# ==========================================
# 2. Make sure PostgreSQL is running first
# ==========================================
log_info "[2/8] 检查 PostgreSQL 运行状态..."
if ! systemctl is-active --quiet "${PG_SERVICE_NAME}"; then
    log_info "PostgreSQL 未运行,正在启动..."
    # Remove postmaster.pid only when no process named exactly "postgres" is
    # running as the postgres user. "ps aux | grep postgres" would also match
    # unrelated command lines (for instance a "su - postgres" session), and
    # under "set -o pipefail" the early exit of "grep -q" can make the pipeline
    # report failure even though a match was found.
    if ! pgrep -u "${POSTGRES_USER}" -x postgres > /dev/null; then
        rm -f "${PGDATA}/postmaster.pid"
    fi
    if ! systemctl start "${PG_SERVICE_NAME}"; then
        log_error "启动失败"
    fi
    wait_pg_ready 30
else
    log_info "PostgreSQL 运行状态正常"
fi

# ==========================================
# 3. Check the preload parameter and make sure it is in effect
# ==========================================
log_info "[3/8] 检查并确保 shared_preload_libraries 生效..."
ensure_shared_preload_libraries

# ==========================================
# 4. Complete the logging settings in repmgr.conf
# ==========================================
log_info "[4/8] 补全 repmgr.conf 日志配置..."
# Drop any monitoring_history= / monitor_interval= lines from repmgr.conf.
sed -i '/^monitoring_history=/d' "${REPMGR_CONF_FILE}"
sed -i '/^monitor_interval=/d' "${REPMGR_CONF_FILE}"

# Append log_file/log_level only when no log_file line exists yet.
if ! grep -q "^log_file" "${REPMGR_CONF_FILE}"; then
    echo "" >> "${REPMGR_CONF_FILE}"
    echo "log_file='${REPMGRD_LOG}'" >> "${REPMGR_CONF_FILE}"
    echo "log_level=INFO" >> "${REPMGR_CONF_FILE}"
    chown ${POSTGRES_USER}: ${REPMGR_CONF_FILE}
    log_info "已添加基础日志配置"
fi

# ==========================================
# 5. Generate the systemd unit file
# ==========================================
log_info "[5/8] 生成 systemd 服务文件..."
# The heredoc delimiter is unquoted: the shell variables are expanded now,
# while \$MAINPID is escaped so systemd substitutes it at runtime.
# StartLimitIntervalSec/StartLimitBurst are placed in [Unit]: that is the
# section systemd reads StartLimitIntervalSec from, and in [Service] the
# setting is ignored, leaving the restart rate limit at its defaults.
cat > "${SERVICE_FILE}" << EOF
[Unit]
Description=PostgreSQL Replication Manager Daemon
After=network.target ${PG_SERVICE_NAME}.service
Requires=${PG_SERVICE_NAME}.service
StartLimitIntervalSec=60s
StartLimitBurst=5

[Service]
Type=forking
User=${POSTGRES_USER}
Group=${POSTGRES_USER}
ExecStart=${REPMGRD_BIN} -f ${REPMGR_CONF_FILE} --pid-file ${REPMGRD_PID}
ExecStop=/bin/kill -QUIT \$MAINPID
PIDFile=${REPMGRD_PID}
Environment=PATH=${PGSQL_BIN_PATH}:/usr/local/bin:/usr/bin:/bin
Restart=always
RestartSec=5

[Install]
WantedBy=multi-user.target
EOF

# ==========================================
# 6. Configure log rotation
# ==========================================
log_info "[6/8] 配置日志轮转规则..."
# Daily rotation keeping 7 compressed generations of the repmgrd log. After a
# rotation, SIGHUP is sent to the PID stored in the repmgrd pid file (if that
# file exists). \$(...) is escaped so it is evaluated by logrotate's shell at
# rotation time rather than when this file is written.
cat > /etc/logrotate.d/repmgrd << EOF
${REPMGRD_LOG} {
    daily
    rotate 7
    compress
    delaycompress
    missingok
    notifempty
    create 0640 ${POSTGRES_USER} ${POSTGRES_USER}
    sharedscripts
    postrotate
        [ -f ${REPMGRD_PID} ] && kill -HUP \$(cat ${REPMGRD_PID}) 2>/dev/null || true
    endscript
}
EOF

# ==========================================
# 7. Start the service
# ==========================================
log_info "[7/8] 启动 repmgrd 服务..."
systemctl daemon-reload
systemctl enable repmgrd

if ! systemctl start repmgrd; then
    log_error "服务启动命令执行失败"
fi

# Give the daemon a moment before checking that it is still running.
sleep 3

# ==========================================
# 8. Final verification
# ==========================================
log_info "[8/8] 验证服务状态..."
if ! systemctl is-active --quiet repmgrd; then
    # log_error exits on its first call, so the troubleshooting hints are
    # printed as warnings first and the fatal message comes last.
    log_warn "服务启动失败,请尝试手动前台运行查看报错:"
    log_warn "  su - postgres"
    log_warn "  ${REPMGRD_BIN} -f ${REPMGR_CONF_FILE} --verbose"
    log_error "repmgrd 服务未处于运行状态"
fi

log_info "=========================================="
log_info "  🎉 repmgrd 服务配置并启动成功!"
log_info "=========================================="
log_info "配置文件: ${REPMGR_CONF_FILE}"
log_info "日志文件: ${REPMGRD_LOG}"
log_info ""
log_info "常用管理命令:"
log_info "  查看服务状态: systemctl status repmgrd"
log_info "  查看集群状态: su - postgres -c 'repmgr cluster show -f ${REPMGR_CONF_FILE}'"
log_info "  查看日志: tail -f ${REPMGRD_LOG}"
log_info ""
log_info "无人值守清理命令: bash $0 --force-clean"

6.手动切换测试

# Run on db2 (currently a standby).
# Performs a repmgr switchover as the postgres user: per the output shown
# below, the local node is promoted to primary and the current primary is
# demoted to standby; --siblings-follow makes the remaining sibling nodes
# follow the new primary.
# NOTE(review): --force is passed unconditionally; check the repmgr
# documentation for which warnings it suppresses before using it in production.

su - postgres -c '
/apps/pgsql/bin/repmgr standby switchover \
  -f /apps/repmgr/etc/repmgr.conf \
  --siblings-follow \
  --force
'


NOTICE: executing switchover on node "db2" (ID: 2)
NOTICE: attempting to pause repmgrd on 3 nodes
NOTICE: local node "db2" (ID: 2) will be promoted to primary; current primary "db1" (ID: 1) will be demoted to standby
NOTICE: stopping current primary node "db1" (ID: 1)
NOTICE: issuing CHECKPOINT on node "db1" (ID: 1) 
DETAIL: executing server command "/apps/pgsql/bin/pg_ctl  -D '/data/pgsql' -W -m fast stop"
INFO: checking for primary shutdown; 1 of 60 attempts ("shutdown_check_timeout")
INFO: checking for primary shutdown; 2 of 60 attempts ("shutdown_check_timeout")
NOTICE: current primary has been cleanly shut down at location 0/30086F0
NOTICE: promoting standby to primary
DETAIL: promoting server "db2" (ID: 2) using pg_promote()
NOTICE: waiting up to 60 seconds (parameter "promote_check_timeout") for promotion to complete
NOTICE: STANDBY PROMOTE successful
DETAIL: server "db2" (ID: 2) was successfully promoted to primary
NOTICE: node "db2" (ID: 2) promoted to primary, node "db1" (ID: 1) demoted to standby
NOTICE: executing STANDBY FOLLOW on 1 of 1 siblings
INFO:  node 3 received notification to follow node 2
INFO: STANDBY FOLLOW successfully executed on all reachable sibling nodes
NOTICE: switchover was successful
DETAIL: node "db2" is now primary and node "db1" is attached as standby
NOTICE: STANDBY SWITCHOVER has completed successfully











root@db1:~# su - postgres -c '/apps/pgsql/bin/repmgr cluster show -f /apps/repmgr/etc/repmgr.conf'
 ID | Name | Role    | Status    | Upstream | Location | Priority | Timeline | Connection string                                                               
----+------+---------+-----------+----------+----------+----------+----------+----------------------------------------------------------------------------------
 1  | db1  | standby |   running | db2      | default  | 100      | 3        | host=10.0.0.101 user=repmgr dbname=repmgr connect_timeout=2                     
 2  | db2  | primary | * running |          | default  | 100      | 4        | host=10.0.0.102 user=repmgr dbname=repmgr connect_timeout=2                     
 3  | db3  | witness | * running | db2      | default  | 0        | n/a      | host=10.0.0.103 user=repmgr dbname=repmgr password=postgres123 connect_timeout=2








# Run on db1 to switch back: the same switchover command, executed on db1
# (now a standby), so that db1 becomes the primary again.

su - postgres -c '
/apps/pgsql/bin/repmgr standby switchover \
  -f /apps/repmgr/etc/repmgr.conf \
  --siblings-follow \
  --force
'

7.配置 haproxy+keepalived

7.1在 db1、db2、ha1、ha2 上执行

# Install the common dependencies used by the following steps.
# apt-get is used instead of apt because apt's command-line interface is not
# guaranteed to be stable for scripted use; "update" takes no -y flag.
apt-get update
apt-get install -y socat keepalived haproxy psmisc ethtool

7.2db1 & db2 数据库节点:优化版健康检查服务

#!/bin/bash

# Create the role-detection script. The heredoc delimiter is quoted, so the
# body is written verbatim with no expansion at install time.
# NOTE(review): the HAProxy check in section 7.3 connects to TCP port 23267 on
# db1/db2, but nothing in this snippet starts a listener on that port.
# Presumably a socat/xinetd-style service that runs this script per connection
# is still required; confirm before relying on the health check.
cat > /usr/local/bin/pg_check_role.sh << 'EOF'
#!/bin/bash
# Prints MASTER (no trailing newline) when the local PostgreSQL instance is
# not in recovery, and STANDBY in every other case, including any error.
PGSQL_BIN_PATH="/apps/pgsql/bin"
export PATH=${PGSQL_BIN_PATH}:/usr/bin:/bin

# Query through the local Unix socket directory /tmp, so the check does not
# depend on the network. -X skips any psqlrc and -w forbids a password prompt,
# so the check can neither be altered by user config nor hang waiting for input.
# The fallback is attached to the assignment itself: placed after a pipeline
# it would only see the exit status of the last stage and never fire when
# psql fails.
if ! is_recovery=$(psql -X -w -U postgres -d postgres -h /tmp -t -c "SELECT pg_is_in_recovery();" 2>/dev/null); then
    is_recovery="unknown"
fi
# Strip all whitespace from psql's output.
is_recovery=${is_recovery//[[:space:]]/}

# Only an exact "f" counts as primary; anything else is reported as STANDBY.
if [ "$is_recovery" = "f" ]; then
    printf '%s' "MASTER"
else
    printf '%s' "STANDBY"
fi
EOF

# Permissions: the script must be executable by the postgres user.
chmod +x /usr/local/bin/pg_check_role.sh
chown postgres:postgres /usr/local/bin/pg_check_role.sh

# Manual verification.
su - postgres -c "/usr/local/bin/pg_check_role.sh"
# Expected output: MASTER on the primary, STANDBY on a standby.

7.3ha1 & ha2 HA 节点: HAProxy 配置

在 /etc/haproxy/haproxy.cfg 文件末尾追加以下配置:

vim /etc/haproxy/haproxy.cfg
# -----------------------------------------------------------------------------
# PostgreSQL TCP settings (tuned for long-lived connections / connection pools)
# -----------------------------------------------------------------------------
defaults postgres
    mode tcp
    option tcplog
    option dontlognull
    timeout connect 5s
    timeout client  1h
    timeout server  1h
    # Explicit health-check parameters: probe every 2s, 2 successes bring a
    # server up, 3 failures take it down; sessions on a server that is marked
    # down are shut down.
    default-server inter 2s rise 2 fall 3 on-marked-down shutdown-sessions

# -----------------------------------------------------------------------------
# Core: PostgreSQL write entry point on the VIP (forwards to the primary only)
# -----------------------------------------------------------------------------
listen postgres_write
    # Bind directly to the VIP that Keepalived manages.
    # NOTE(review): on the node that does not currently hold 10.0.0.113 this
    # bind presumably fails unless net.ipv4.ip_nonlocal_bind=1 is set (or the
    # bind line uses "transparent") - confirm the sysctl on ha1/ha2.
    bind 10.0.0.113:5432
    mode tcp
    option tcplog

    # Primary detection: a server passes the check only if the service on
    # port 23267 answers with a string containing MASTER.
    option tcp-check
    tcp-check connect port 23267
    tcp-check expect string MASTER

    # Backend database nodes. Neither is flagged "backup"; whichever node
    # passes the MASTER check receives the traffic.
    server db1 10.0.0.101:5432 check port 23267
    server db2 10.0.0.102:5432 check port 23267

7.4配置keepalived 

7.4.1配置ha1 keepalived

root@ha1:~# cat /etc/keepalived/keepalived.conf
global_defs {
    router_id HARBOR_LB_02    # [kept from base config] original router_id retained
    script_user root
    enable_script_security
}

# ============== [Optimized] Check scripts ==============
# 1. HAProxy process check (shared by all instances):
#    asks systemd whether the haproxy unit is active, every 2 seconds.
vrrp_script chk_haproxy {
    script "/usr/bin/systemctl is-active --quiet haproxy"
    interval 2
    fall 2
    rise 1
}

# 2. [New] Physical link check on the NIC (used by the PG instance).
# NOTE(review): this command relies on a shell pipe. keepalived 2.x appears to
# exec script commands directly rather than through a shell, in which case "|"
# and "grep" would be passed to ethtool as arguments and the check would
# always fail - verify on the deployed version, or move the pipeline into an
# executable script file and reference that instead.
vrrp_script chk_network {
    script "/usr/bin/ethtool ens33 | grep -q 'Link detected: yes'"
    interval 2
    fall 2
    rise 2
}

# 3. [Fixed] MinIO API check (repairs the syntax error of the base config by
#    chaining the four probes with ||): succeeds if any of 10.0.0.106-109
#    returns a body containing "status":"success" on /minio/health/live.
# NOTE(review): MinIO's liveness endpoint is documented to signal health via
# the HTTP status code; confirm it really returns a body containing
# "status":"success", otherwise this check can never pass (curl -f would be
# the usual alternative). The command also depends on shell pipes and ||, so
# confirm that this keepalived version runs it through a shell.
vrrp_script chk_minio_api {
    script "/usr/bin/curl -s --connect-timeout 2 http://10.0.0.106:9000/minio/health/live | grep -q '\"status\":\"success\"' || /usr/bin/curl -s --connect-timeout 2 http://10.0.0.107:9000/minio/health/live | grep -q '\"status\":\"success\"' || /usr/bin/curl -s --connect-timeout 2 http://10.0.0.108:9000/minio/health/live | grep -q '\"status\":\"success\"' || /usr/bin/curl -s --connect-timeout 2 http://10.0.0.109:9000/minio/health/live | grep -q '\"status\":\"success\"'"
    interval 2
    weight -15
    fall 2
    rise 2
}

# 4. MinIO Console check (kept from base config): probes
#    /minio/health/ready on local port 9001 and expects "status":"success".
# NOTE(review): this targets 127.0.0.1 on the HA node itself - presumably
# HAProxy exposes the console on port 9001 locally; confirm. The same caveats
# as chk_minio_api apply (response body content, shell pipe).
vrrp_script chk_minio_console {
    script "/usr/bin/curl -s --connect-timeout 2 http://127.0.0.1:9001/minio/health/ready | grep -q '\"status\":\"success\"'"
    interval 2
    weight -15
    fall 2
    rise 2
}

# ============== 【保留基座】注释掉的历史配置 ==============
#vrrp_script chk_k8s_api {
#    script "/usr/bin/curl -k -s --connect-timeout 2 https://10.0.0.101:6443/healthz | grep -q ok || /usr/bin/curl -k -s --connect-timeout 2 https://10.0.0.102:6443/healthz | grep -q ok || /usr/bin/curl -k -s --connect-timeout 2 https://10.0.0.103:6443/healthz | grep -q ok"
#    interval 2
#    weight -15
#    fall 2
#    rise 2
#}
#
#vrrp_script chk_harbor_local {
#    script "/usr/bin/curl -s --connect-timeout 2 http://127.0.0.1:8080/api/v2.0/health | grep -q healthy"
#    interval 2
#    weight -15
#    fall 2
#    rise 2
#}

# ============== [Optimized] MinIO instance: preemptive mode ==============
# NOTE(review): the two MinIO checks each carry weight -15, so with both
# failing this node's priority of 120 drops only to 90 - equal to, not below,
# ha2's configured priority of 90. Confirm these checks can actually trigger
# a failover.
vrrp_instance VI_MINIO {
    state MASTER                    # [optimized] primary node is explicitly MASTER
    interface ens33
    virtual_router_id 53
    priority 120                    # [optimized] priority raised to 120 to avoid flapping
    advert_int 1
    # [optimized] nopreempt removed, so preemption is enabled
    
    authentication {
        auth_type PASS
        auth_pass minivip1          # [kept from base config] password unchanged
    }

    virtual_ipaddress {
        10.0.0.112/32 dev ens33 label ens33:minio01  # [kept from base config] VIP unchanged
    }

    track_script {
        chk_haproxy
        chk_minio_api              # [kept from base config] uses the repaired full check
        chk_minio_console
    }

    # haproxy is restarted on every transition to MASTER or BACKUP.
    notify_master "/usr/bin/systemctl restart haproxy"
    notify_backup "/usr/bin/systemctl restart haproxy"
}

# ============== [New] PostgreSQL instance: non-preemptive mode ==============
vrrp_instance VI_1 {
    state BACKUP                    # [optimized] PG uses BACKUP + nopreempt
    interface ens33
    virtual_router_id 54
    priority 100
    advert_int 1
    nopreempt                       # [optimized] PG stays non-preemptive

    authentication {
        auth_type PASS
        # NOTE(review): keepalived documents that only the first 8 characters
        # of auth_pass are used - confirm the truncation is acceptable.
        auth_pass PG_HA@2026        # PG VRRP password
    }

    virtual_ipaddress {
        10.0.0.113/24 dev ens33 label ens33:pgvip  # PG VIP
    }

    track_script {
        chk_haproxy
        chk_network                 # [optimized] adds the NIC link check
    }

    # haproxy is restarted on every transition to MASTER or BACKUP.
    # NOTE(review): VI_MINIO restarts the same haproxy service on its own
    # transitions, which presumably interrupts established PostgreSQL
    # connections through this node - confirm this is intended.
    notify_master "/usr/bin/systemctl restart haproxy"
    notify_backup "/usr/bin/systemctl restart haproxy"
}

# ============== 【保留基座】注释掉的历史实例 ==============
#vrrp_instance VI_K8S {
#    state MASTER
#    interface ens33
#    virtual_router_id 51
#    priority 100
#    advert_int 1
#    authentication {
#        auth_type PASS
#        auth_pass k8svip01
#    }
#virtual_ipaddress {
#        10.0.0.110/32 dev ens33 label ens33:k8s01
#    }
#    track_script {
#        chk_haproxy
#        chk_k8s_api
#    }
#notify_master "/usr/bin/systemctl restart haproxy"
#    notify_backup "/usr/bin/systemctl restart haproxy"
#}
#vrrp_instance VI_HARBOR {
#    state MASTER
#    interface ens33
#    virtual_router_id 52
#    priority 100
#    advert_int 1
#    authentication {
#        auth_type PASS
#        auth_pass habrvip1
#    }
#    virtual_ipaddress {
#        10.0.0.111/32 dev ens33 label ens33:habr01
#    }
#    track_script {
#        chk_haproxy
#        chk_harbor_local
#    }
#    notify_master "/usr/bin/systemctl restart haproxy"
#    notify_backup "/usr/bin/systemctl restart haproxy"
#}

7.4.2 配置 ha2 keepalived(如果不需要 Minio,可以删除其中 Minio 相关的配置)

root@ha2:~# cat /etc/keepalived/keepalived.conf
global_defs {
    # [kept from base config] same router_id as ha1.
    # NOTE(review): router_id is meant to identify the individual machine;
    # consider giving each node a distinct value.
    router_id HARBOR_LB_02
    script_user root
    enable_script_security
}

# ============== [Optimized] Check scripts ==============
# 1. HAProxy process check (shared by all instances):
#    asks systemd whether the haproxy unit is active, every 2 seconds.
vrrp_script chk_haproxy {
    script "/usr/bin/systemctl is-active --quiet haproxy"
    interval 2
    fall 2
    rise 1
}

# 2. [New] Physical link check on the NIC (used by the PG instance).
# NOTE(review): this command relies on a shell pipe. keepalived 2.x appears to
# exec script commands directly rather than through a shell, in which case "|"
# and "grep" would be passed to ethtool as arguments and the check would
# always fail - verify on the deployed version, or move the pipeline into an
# executable script file and reference that instead.
vrrp_script chk_network {
    script "/usr/bin/ethtool ens33 | grep -q 'Link detected: yes'"
    interval 2
    fall 2
    rise 2
}

# 3. [Fixed] MinIO API check (repairs the syntax error of the base config by
#    adding the missing || and --connect-timeout): succeeds if any of
#    10.0.0.106-109 returns a body containing "status":"success" on
#    /minio/health/live.
# NOTE(review): MinIO's liveness endpoint is documented to signal health via
# the HTTP status code; confirm it really returns a body containing
# "status":"success", otherwise this check can never pass (curl -f would be
# the usual alternative). The command also depends on shell pipes and ||, so
# confirm that this keepalived version runs it through a shell.
vrrp_script chk_minio_api {
    script "/usr/bin/curl -s --connect-timeout 2 http://10.0.0.106:9000/minio/health/live | grep -q '\"status\":\"success\"' || /usr/bin/curl -s --connect-timeout 2 http://10.0.0.107:9000/minio/health/live | grep -q '\"status\":\"success\"' || /usr/bin/curl -s --connect-timeout 2 http://10.0.0.108:9000/minio/health/live | grep -q '\"status\":\"success\"' || /usr/bin/curl -s --connect-timeout 2 http://10.0.0.109:9000/minio/health/live | grep -q '\"status\":\"success\"'"
    interval 2
    weight -15
    fall 2
    rise 2
}

# 4. MinIO Console check (kept from base config): probes
#    /minio/health/ready on local port 9001 and expects "status":"success".
# NOTE(review): this targets 127.0.0.1 on the HA node itself - presumably
# HAProxy exposes the console on port 9001 locally; confirm. The same caveats
# as chk_minio_api apply (response body content, shell pipe).
vrrp_script chk_minio_console {
    script "/usr/bin/curl -s --connect-timeout 2 http://127.0.0.1:9001/minio/health/ready | grep -q '\"status\":\"success\"'"
    interval 2
    weight -15
    fall 2
    rise 2
}

# ============== 【保留基座】注释掉的历史配置 ==============
#vrrp_script chk_k8s_api {
#    script "/usr/bin/curl -k -s --connect-timeout 2 https://10.0.0.101:6443/healthz | grep -q ok || /usr/bin/curl -k -s --connect-timeout 2 https://10.0.0.102:6443/healthz | grep -q ok || /usr/bin/curl -k -s --connect-timeout 2 https://10.0.0.103:6443/healthz | grep -q ok"
#    interval 2
#    weight -15
#    fall 2
#    rise 2
#}
#
#vrrp_script chk_harbor_local {
#    script "/usr/bin/curl -s --connect-timeout 2 http://127.0.0.1:8080/api/v2.0/health | grep -q healthy"
#    interval 2
#    weight -15
#    fall 2
#    rise 2
#}

# ============== [Optimized] MinIO instance: standby node (BACKUP + lower priority) ==============
vrrp_instance VI_MINIO {
    state BACKUP                    # [optimized] standby node is BACKUP
    interface ens33
    virtual_router_id 53
    priority 90                     # [optimized] priority 90, 30 below ha1, so this node does not preempt
    advert_int 1
    # [optimized] nopreempt intentionally not set, matching ha1's preemptive mode
    
    authentication {
        auth_type PASS
        auth_pass minivip1          # [kept from base config] password unchanged
    }

    virtual_ipaddress {
        10.0.0.112/32 dev ens33 label ens33:minio01  # [kept from base config] VIP unchanged
    }

    track_script {
        chk_haproxy
        chk_minio_api              # [kept from base config] uses the repaired full check
        chk_minio_console
    }

    # haproxy is restarted on every transition to MASTER or BACKUP.
    notify_master "/usr/bin/systemctl restart haproxy"
    notify_backup "/usr/bin/systemctl restart haproxy"
}

# ============== [New] PostgreSQL instance: standby node (non-preemptive mode) ==============
vrrp_instance VI_1 {
    state BACKUP                    # [optimized] PG standby node stays BACKUP
    interface ens33
    virtual_router_id 54
    priority 90                     # [optimized] priority 90, 10 below the primary HA node
    advert_int 1
    nopreempt                       # [optimized] PG stays non-preemptive, even when its priority is higher

    authentication {
        auth_type PASS
        # NOTE(review): keepalived documents that only the first 8 characters
        # of auth_pass are used - confirm the truncation is acceptable.
        auth_pass PG_HA@2026        # PG VRRP password
    }

    virtual_ipaddress {
        10.0.0.113/24 dev ens33 label ens33:pgvip  # PG VIP
    }

    track_script {
        chk_haproxy
        chk_network                 # [optimized] adds the NIC link check
    }

    # haproxy is restarted on every transition to MASTER or BACKUP.
    # NOTE(review): VI_MINIO restarts the same haproxy service on its own
    # transitions, which presumably interrupts established PostgreSQL
    # connections through this node - confirm this is intended.
    notify_master "/usr/bin/systemctl restart haproxy"
    notify_backup "/usr/bin/systemctl restart haproxy"
}

# ============== 【保留基座】注释掉的历史实例 ==============
#vrrp_instance VI_K8S {
#    state BACKUP
#    interface ens33
#    virtual_router_id 51
#    priority 90
#    advert_int 1
#    authentication {
#        auth_type PASS
#        auth_pass k8svip01
#    }
#virtual_ipaddress {
#        10.0.0.110/32 dev ens33 label ens33:k8s01
#    }
#    track_script {
#        chk_haproxy
#        chk_k8s_api
#    }
#notify_master "/usr/bin/systemctl restart haproxy"
#    notify_backup "/usr/bin/systemctl restart haproxy"
#}
#vrrp_instance VI_HARBOR {
#    state BACKUP
#    interface ens33
#    virtual_router_id 52
#    priority 90
#    advert_int 1
#    authentication {
#        auth_type PASS
#        auth_pass habrvip1
#    }
#    virtual_ipaddress {
#        10.0.0.111/32 dev ens33 label ens33:habr01
#    }
#    track_script {
#        chk_haproxy
#        chk_harbor_local
#    }
#    notify_master "/usr/bin/systemctl restart haproxy"
#    notify_backup "/usr/bin/systemctl restart haproxy"
#}

7.5接下来:进行最终的功能验证测试

我们来分别测试 Minio 的抢占模式和 PostgreSQL 的非抢占模式,确保两者都按预期工作。


测试一:Minio 抢占模式(自动抢回)

  1. 在 ha2 上抓日志

    tail -f /var/log/syslog | grep -i 'VI_MINIO'
    
  2. 在 ha1 上停止 haproxy(模拟 Minio 入口故障):

    systemctl stop haproxy
    
  3. 观察
    • 等待 5 秒,Minio VIP (10.0.0.112) 会飘到 ha2。
  4. 在 ha1 上恢复 haproxy

    systemctl start haproxy
    
  5. 观察
    • 等待 5 秒,Minio VIP 会自动从 ha2 抢回 ha1(因为是抢占模式)。

测试二:PostgreSQL 非抢占模式(稳定优先)

  1. 在 ha2 上抓日志

    tail -f /var/log/syslog | grep -i 'VI_1'
    
  2. 在 ha1 上停止 haproxy(模拟 PG 入口故障):

    systemctl stop haproxy
    
  3. 观察
    • 等待 5 秒,PG VIP (10.0.0.113) 会飘到 ha2。
  4. 在 ha1 上恢复 haproxy

    systemctl start haproxy
    
  5. 观察
    • PG VIP 不会自动抢回 ha1(因为是非抢占模式),这是正常的!
  6. (可选)手动回切 PG VIP:如果想让 PG VIP 回到 ha1,在 ha2 上执行:

    systemctl restart keepalived
    

最终状态总结

表格

| VIP | 模式 | 当前位置 | 行为 |
| --- | --- | --- | --- |
| 10.0.0.112 (Minio) | 抢占模式 | ha1 | ha1 故障恢复后自动抢回 |
| 10.0.0.113 (PostgreSQL) | 非抢占模式 | ha1 | 谁故障谁释放,恢复后不自动抢回,需手动回切 |

Logo

腾讯云面向开发者汇聚海量精品云计算使用和开发经验,营造开放的云计算技术生态圈。

更多推荐