Harbor(外置db+redis) + K8s 高可用集群实施手册(独立LB + Calico + kubeasz版)二(harbor内置db确认版本二进制安装高可用pgsql15.15集群)脚本版
本文详细介绍了 PostgreSQL 高可用集群的部署过程,主要包括以下内容:(1)系统初始化优化:配置内核参数、IO 调度策略、时间同步等系统级优化;(2)配置免密登录:为 postgres 用户建立节点间的 SSH 互信;(3)安装 PostgreSQL:从源码编译安装 PostgreSQL 15.15 版本,配置环境变量和 systemd 服务;(4)主库初始化:初始化主库数据目录,配置 postgresql.conf 和 pg_hba.conf,创建 repmgr 用户和数据库;(5)配置 repmgr:在主库上准备 repmgr 所需的目录、.pgpass 等运行环境。
·
0.系统初始优化
#!/bin/bash
# Node preparation script: detects which cluster node it is running on and
# applies hostname, /etc/hosts, package and kernel/OS tuning settings.
# `set -e` makes the script stop at the first command that fails.
set -e
# ==========================================
# Configuration area (the only part that should need editing)
# ==========================================
# Hostname -> IP address map for every node in the cluster.
# Entries whose name starts with "db" are later classified as database
# nodes; every other entry is classified as an HAProxy node.
declare -A NODES=(
["db1"]="10.0.0.101"
["db2"]="10.0.0.102"
["db3"]="10.0.0.103"
["ha1"]="10.0.0.104"
["ha2"]="10.0.0.105"
)
# Password assigned to the postgres OS user further down.
# NOTE(review): hard-coded credential; change it before any real deployment.
POSTGRES_PASS="postgres123"
# ==========================================
# ANSI colour sequences; they are expanded by `echo -e` in the helpers below.
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
RED='\033[0;31m'
NC='\033[0m'

# log_info MSG  - print a green check-mark line.
log_info() { echo -e "${GREEN}[✓]${NC} $1"; }
# log_warn MSG  - print a yellow warning line.
log_warn() { echo -e "${YELLOW}[!]${NC} $1"; }
# log_error MSG - print a red error line and terminate the script with status 1.
log_error() { echo -e "${RED}[✗]${NC} $1"; exit 1; }

# backup_file PATH
#   Copy PATH to PATH.backup.<YYYYmmdd_HHMMSS> when PATH is a regular file.
#   Returns 0 when there is nothing to back up. The previous
#   `[ -f ] && cp && log` chain returned 1 for a missing file, which made the
#   whole script abort under `set -e`. A failing copy still returns non-zero.
backup_file() {
  local src=$1
  if [ ! -f "$src" ]; then
    return 0
  fi
  cp "$src" "${src}.backup.$(date +%Y%m%d_%H%M%S)" || return 1
  log_info "Backup: $src"
}
# Command-line handling. The SSH-related switches are still accepted for
# backward compatibility but only produce a warning; --no-reboot is the one
# switch that changes behaviour.
FORCE_SSH=""
AUTO_REBOOT=true
while [ "$#" -ne 0 ]; do
  if [[ "$1" == "--force-ssh" ]]; then
    log_warn "Parameter --force-ssh is deprecated and ignored."
  elif [[ "$1" == "--skip-ssh" ]]; then
    log_warn "SSH setup is already disabled by default."
  elif [[ "$1" == "--no-reboot" ]]; then
    AUTO_REBOOT=false
  else
    log_error "Unknown parameter: $1. Use --no-reboot"
  fi
  shift
done

# Everything below modifies system configuration, so root is required.
if (( EUID != 0 )); then
  log_error "Must run as root"
fi
# ==========================================
# 1. Auto-detect IP and role
# ==========================================
# Compare every globally scoped IPv4 address of this machine with the NODES
# map. The first match decides the hostname, and the "db" name prefix decides
# whether the node is a database or an HAProxy node.
log_info "Auto-detecting node role..."
CURRENT_IPS=$(ip -o -4 addr show scope global | awk '{split($4, a, "/"); print a[1]}')
HOSTNAME=""
DETECTED_IP=""
NODE_TYPE="unknown"
for node in "${!NODES[@]}"; do
  node_ip=${NODES[$node]}
  # -x -F: whole-line, fixed-string comparison, so the dots in the address
  # are not interpreted as regex wildcards.
  if grep -qxF -- "$node_ip" <<<"$CURRENT_IPS"; then
    HOSTNAME=$node
    DETECTED_IP=$node_ip
    if [[ "$node" == db* ]]; then
      NODE_TYPE="database"
    else
      NODE_TYPE="haproxy"
    fi
    break
  fi
done
if [ -z "$HOSTNAME" ]; then
  log_error "IP not in list! Current IPs: $CURRENT_IPS. Check script config."
fi
log_info "Detected: ${HOSTNAME} (${DETECTED_IP}) - Type: ${NODE_TYPE}"
# ==========================================
# 2. SSH trust decision (permanently disabled)
# ==========================================
SETUP_SSH=false
log_warn "SSH trust setup is disabled by default."

# ==========================================
# 3. Basic system configuration
# ==========================================
log_info "Starting deployment..."

# 3.1 Hostname
hostnamectl set-hostname "$HOSTNAME"
log_info "Hostname set"

# 3.2 /etc/hosts: drop the block written by a previous run, then append a
#     fresh one delimited by the same begin/end marker comments.
backup_file /etc/hosts
sed -i '/# Auto-generated by deploy_prep/,/# End of auto-generated/d' /etc/hosts
{
  echo "# Auto-generated by deploy_prep"
  for node in "${!NODES[@]}"; do
    echo "${NODES[$node]} $node"
  done
  echo "# End of auto-generated"
} >> /etc/hosts
log_info "Hosts file updated"

# 3.3 Firewall / AppArmor: best effort, failures are deliberately ignored.
ufw disable || true
for apparmor_action in stop disable; do
  systemctl "$apparmor_action" apparmor 2>/dev/null || true
done
log_warn "Firewall/AppArmor temp disabled"

# 3.4 Build and runtime dependencies (sshpass is intentionally not installed).
apt update
required_packages=(
  build-essential libreadline-dev zlib1g-dev libxml2-dev libxslt1-dev
  libssl-dev libpam0g-dev libsystemd-dev git curl chrony
)
apt install -y "${required_packages[@]}"
log_info "Dependencies installed"
# ==========================================
# 4. System tuning (kernel, I/O, time)
# ==========================================
# MemTotal is reported in kB. The multiplication is done in bash because some
# awk builds print large products in floating-point notation, which would
# break the integer arithmetic below.
mem_total_kb=$(awk '/^MemTotal:/ {print $2}' /proc/meminfo)
total_mem_bytes=$((mem_total_kb * 1024))
page_size=$(getconf PAGE_SIZE)
cpu_cores=$(nproc)
total_mem_mb=$((total_mem_bytes / 1024**2))
log_info "Hardware: ${total_mem_mb}MB RAM, $cpu_cores Cores"

# shmmax: half of RAM in bytes; shmall: all of RAM expressed in pages.
shmmax=$((total_mem_bytes / 2))
shmall=$((total_mem_bytes / page_size))

backup_file /etc/sysctl.conf
# Remove the block written by a previous run so repeated runs do not keep
# appending duplicates. Only done when the end marker exists, so a block
# without an end marker never causes unrelated lines to be deleted.
if [ -f /etc/sysctl.conf ] && grep -q '^# End of PostgreSQL Optimizations$' /etc/sysctl.conf; then
  sed -i '/^# PostgreSQL Optimizations$/,/^# End of PostgreSQL Optimizations$/d' /etc/sysctl.conf
fi
cat >> /etc/sysctl.conf <<EOF
# PostgreSQL Optimizations
kernel.shmmax = $shmmax
kernel.shmall = $shmall
kernel.shmmni = 4096
kernel.sem = 250 256000 32 1024
net.core.rmem_max = 16777216
net.core.wmem_max = 16777216
net.ipv4.tcp_wmem = 4096 65536 16777216
net.ipv4.tcp_rmem = 4096 87380 16777216
vm.swappiness = 1
vm.dirty_ratio = 15
vm.dirty_background_ratio = 5
# End of PostgreSQL Optimizations
EOF
sysctl -p

# Keep System V IPC objects when the postgres user logs out. The pattern also
# matches an already uncommented RemoveIPC line, not only the commented default.
sed -i -E 's/^#?RemoveIPC=.*/RemoveIPC=no/' /etc/systemd/logind.conf
systemctl restart systemd-logind

# Disable NUMA on the kernel command line (takes effect after the reboot).
backup_file /etc/default/grub
if ! grep -q "numa=off" /etc/default/grub; then
  sed -i 's/GRUB_CMDLINE_LINUX="/GRUB_CMDLINE_LINUX="numa=off /' /etc/default/grub
  update-grub
fi

# I/O scheduler per device class: "none" for NVMe and non-rotational sdX,
# "mq-deadline" for rotational sdX.
cat > /etc/udev/rules.d/60-postgresql-io.rules <<'EOF'
ACTION=="add|change", KERNEL=="nvme[0-9]*", ATTR{queue/scheduler}="none"
ACTION=="add|change", KERNEL=="sd[a-z]", ATTR{queue/rotational}=="0", ATTR{queue/scheduler}="none"
ACTION=="add|change", KERNEL=="sd[a-z]", ATTR{queue/rotational}=="1", ATTR{queue/scheduler}="mq-deadline"
EOF
udevadm control --reload-rules && udevadm trigger

# Time synchronisation: replace existing "server" lines with the Aliyun servers.
backup_file /etc/chrony/chrony.conf
sed -i '/^server/d' /etc/chrony/chrony.conf
cat >> /etc/chrony/chrony.conf <<EOF
server ntp1.aliyun.com iburst
server ntp2.aliyun.com iburst
EOF
timedatectl set-timezone Asia/Shanghai
# `enable --now` does not restart a daemon that is already running (the
# package install normally starts it), so restart explicitly to load the
# edited configuration.
systemctl enable chrony
systemctl restart chrony
chronyc -a makestep

# Per-user limits for postgres, scaled with the hardware and clamped:
#   nofile = 2 x RAM(MB)  within [16384, 65536]
#   nproc  = 1024 x cores within [8192, 32768]
nofile=$(( total_mem_mb * 2 > 65536 ? 65536 : (total_mem_mb * 2 < 16384 ? 16384 : total_mem_mb * 2) ))
nproc=$(( cpu_cores * 1024 > 32768 ? 32768 : (cpu_cores * 1024 < 8192 ? 8192 : cpu_cores * 1024) ))
cat > /etc/security/limits.d/postgres.conf <<EOF
postgres soft nofile $nofile
postgres hard nofile $nofile
postgres soft nproc $nproc
postgres hard nproc $nproc
EOF
log_info "System optimized"
# ==========================================
# 5. postgres OS user (SSH trust is no longer configured here)
# ==========================================
# The password is (re)applied in both branches, so an existing account ends
# up with the configured password as well.
log_info "Setting up Postgres user..."
if id "postgres" &>/dev/null; then
  log_info "User 'postgres' already exists."
  chpasswd <<<"postgres:$POSTGRES_PASS"
else
  useradd -m -s /bin/bash postgres
  chpasswd <<<"postgres:$POSTGRES_PASS"
  log_info "User 'postgres' created."
fi
log_warn "SSH trust setup skipped (manual configuration required if needed)."
# ==========================================
# Finish and reboot
# ==========================================
log_info "All tasks finished successfully."
if [ "$AUTO_REBOOT" != true ]; then
  log_info "已跳过自动重启。"
else
  echo
  log_warn "============================================="
  log_warn " 系统将在 5 秒后自动重启"
  log_warn " 按 Ctrl+C 可取消"
  log_warn "============================================="
  echo
  # Five-second countdown redrawn on one line, leaving time to press Ctrl+C.
  for (( seconds_left = 5; seconds_left >= 1; seconds_left-- )); do
    echo -ne "${YELLOW}重启倒计时:$seconds_left...${NC}\r"
    sleep 1
  done
  log_info "正在重启..."
  reboot
fi
1.配置免密登录
1.1ssh免密
#!/bin/bash
# ==============================================================================
# Purpose : set up password-less SSH between the PostgreSQL cluster nodes
#           (used by repmgr)
# Run as  : the postgres user
# Usage   : run this script once on each of the 3 hosts
# ==============================================================================
# 1. All cluster nodes as "IP:hostname" pairs.
#    Adjust to the real environment; the list must include the current machine.
ALL_NODES=(
  "10.0.0.101:db1"
  "10.0.0.102:db2"
  "10.0.0.103:db3"
)
# 2. Account used for the SSH connections (normally postgres).
SSH_USER="postgres"
# ------------------------------------------------------------------------------
# Refuse to run under any other account.
# ------------------------------------------------------------------------------
[[ "$USER" == "$SSH_USER" ]] || {
  echo "错误:请使用 '$SSH_USER' 用户运行此脚本 (例如:su - $SSH_USER)"
  exit 1
}
# ------------------------------------------------------------------------------
# 3. Create the local SSH key pair unless one already exists
# ------------------------------------------------------------------------------
SSH_DIR="$HOME/.ssh"
KEY_FILE="$SSH_DIR/id_rsa"
[[ -d "$SSH_DIR" ]] || {
  mkdir -p "$SSH_DIR"
  chmod 700 "$SSH_DIR"
  echo "已创建 $SSH_DIR 目录"
}
if [[ -f "$KEY_FILE" ]]; then
  echo "检测到已有密钥:$KEY_FILE (跳过生成)"
else
  echo "正在生成 SSH 密钥对 (无密码短语)..."
  ssh-keygen -t rsa -b 4096 -f "$KEY_FILE" -N "" -q
  echo "密钥生成完成:$KEY_FILE"
fi
# ------------------------------------------------------------------------------
# 4. SSH client configuration
#    Host-key checking is switched off for every host so repmgr never blocks on
#    an interactive host-key prompt. The file is overwritten on each run.
# ------------------------------------------------------------------------------
CONFIG_FILE="$SSH_DIR/config"
echo "正在配置 $CONFIG_FILE ..."
printf '%s\n' \
  'Host *' \
  'StrictHostKeyChecking no' \
  'UserKnownHostsFile /dev/null' \
  'LogLevel ERROR' > "$CONFIG_FILE"
chmod 600 "$CONFIG_FILE"
echo "SSH 配置完成 (已禁用主机密钥检查)"
# ------------------------------------------------------------------------------
# 5. Distribute the public key to every other node
# ------------------------------------------------------------------------------
echo "开始分发公钥到集群节点..."
# Work out which ALL_NODES entry is this machine. `hostname -I` lists the
# addresses configured on the interfaces, whereas `hostname -i` resolves the
# host name and can return a loopback address such as 127.0.1.1, in which case
# the local node would not be skipped. `hostname -i` is kept as a fallback.
CURRENT_IP=""
local_addrs=" $(hostname -I 2>/dev/null) "
for node in "${ALL_NODES[@]}"; do
  candidate_ip=${node%%:*}
  if [[ "$local_addrs" == *" $candidate_ip "* ]]; then
    CURRENT_IP=$candidate_ip
    break
  fi
done
if [[ -z "$CURRENT_IP" ]]; then
  CURRENT_IP=$(hostname -i | awk '{print $1}')
fi

# Remote side: read the key from stdin and append it only when that exact line
# is not present yet, so re-running the script does not pile up duplicates in
# authorized_keys.
remote_install='mkdir -p ~/.ssh && chmod 700 ~/.ssh && key=$(cat) && touch ~/.ssh/authorized_keys && { grep -qxF -- "$key" ~/.ssh/authorized_keys || printf "%s\n" "$key" >> ~/.ssh/authorized_keys; } && chmod 600 ~/.ssh/authorized_keys'

for node in "${ALL_NODES[@]}"; do
  IP=${node%%:*}
  HOSTNAME=${node#*:}
  # Skip the local machine.
  if [[ "$IP" == "$CURRENT_IP" ]]; then
    echo "跳过本机:$HOSTNAME ($IP)"
    continue
  fi
  echo "----------------------------------------"
  echo "正在配置 -> $HOSTNAME ($IP)"
  # The key is piped over a plain ssh session instead of ssh-copy-id.
  # On the first run ssh asks once for the password of $SSH_USER@IP.
  if ssh -o ConnectTimeout=5 "$SSH_USER@$IP" \
       "$remote_install && echo 'Success on $HOSTNAME'" < "$KEY_FILE.pub" 2>&1; then
    echo "成功:$HOSTNAME 信任已建立"
  else
    echo "失败:无法连接到 $HOSTNAME,请检查网络或 SSH 服务"
  fi
done
# ------------------------------------------------------------------------------
# 6. Verification
# ------------------------------------------------------------------------------
echo ""
echo "========================================"
echo "配置完成!正在进行连通性测试..."
echo "========================================"
for node in "${ALL_NODES[@]}"; do
  IP=${node%%:*}
  HOSTNAME=${node#*:}
  if [[ "$IP" == "$CURRENT_IP" ]]; then
    continue
  fi
  # BatchMode=yes forbids any password prompt, so the check can only succeed
  # through key authentication. Without it ssh may fall back to an interactive
  # password and the result would be reported as a successful key login.
  # ConnectTimeout keeps an unreachable node from stalling the loop.
  if RESULT=$(ssh -o BatchMode=yes -o ConnectTimeout=5 "$SSH_USER@$IP" "hostname" 2>&1); then
    echo "[OK] $HOSTNAME ($IP) 免密登录成功 (返回主机名:$RESULT)"
  else
    echo "[FAIL] $HOSTNAME ($IP) 免密登录失败"
  fi
done
echo ""
echo "提示:请确保在 3 台主机上都运行了此脚本,以实现双向免密。"
2.安装 install_postgresql.sh
#!/bin/bash
# Builds PostgreSQL from source and registers a systemd service for it.
set -e
# ==========================================
# Settings
# ==========================================
PG_VERSION="15.15"
PG_MAJOR="15"
INSTALL_PREFIX="/apps/pgsql"
PGDATA="/data/pgsql"
PGUSER="postgres"
# Source tarball on the Aliyun mirror.
DOWNLOAD_URL="https://mirrors.aliyun.com/postgresql/source/v${PG_VERSION}/postgresql-${PG_VERSION}.tar.gz"
SRC_DIR="/usr/local/src"

# ANSI colours; the escape sequences are expanded by printf's %b below.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m'

# Tagged, coloured one-line loggers. log_error also ends the script (status 1).
log_info() {
  printf '%b\n' "${GREEN}[INFO]${NC} $1"
}
log_warn() {
  printf '%b\n' "${YELLOW}[WARN]${NC} $1"
}
log_error() {
  printf '%b\n' "${RED}[ERROR]${NC} $1"
  exit 1
}
log_step() {
  printf '%b\n' "${BLUE}[STEP]${NC} $1"
}
# ==========================================
# Pre-flight checks
# ==========================================
if [ "$EUID" -ne 0 ]; then
  log_error "Must run as root"
fi
# An existing psql binary means a previous install; ask before building again.
if [ -f "${INSTALL_PREFIX}/bin/psql" ]; then
  log_warn "PostgreSQL is already installed at ${INSTALL_PREFIX}"
  read -p "Do you want to re-install? (This will NOT delete data) (yes/no): " confirm
  [ "$confirm" = "yes" ] || {
    log_info "Exiting installation."
    exit 0
  }
fi
# ==========================================
# 1. Build dependencies
# ==========================================
log_step "Installing build dependencies..."
build_deps=(
  build-essential libreadline-dev zlib1g-dev libxml2-dev
  libxslt1-dev libssl-dev libpam0g-dev libsystemd-dev
  git curl wget pkg-config
)
apt update
apt install -y "${build_deps[@]}"
# ==========================================
# 2. Directories and service account
# ==========================================
log_step "Preparing directories..."
mkdir -p "${INSTALL_PREFIX}" "${INSTALL_PREFIX}/run" "${PGDATA}" "${SRC_DIR}"
# Create the postgres account only when it is missing.
id "${PGUSER}" &>/dev/null || {
  useradd -m -s /bin/bash "${PGUSER}"
  log_info "User ${PGUSER} created."
}
# ==========================================
# 3. Download and unpack the source
# ==========================================
cd "${SRC_DIR}"
PKG_NAME="postgresql-${PG_VERSION}.tar.gz"
# A tarball left behind by an interrupted download must not be mistaken for a
# complete one, so verify the gzip stream before trusting an existing file.
if [ -f "${PKG_NAME}" ] && ! gzip -t "${PKG_NAME}" 2>/dev/null; then
  log_warn "Existing ${PKG_NAME} is incomplete or corrupt, downloading again."
  rm -f -- "${PKG_NAME}"
fi
if [ -f "${PKG_NAME}" ]; then
  log_info "Source package already exists, skipping download."
else
  log_step "Downloading PostgreSQL ${PG_VERSION}..."
  # Download to a .part file (resumable with -c, 3 tries) and rename only on
  # success. Testing wget inside `if` keeps `set -e` from exiting before the
  # error message is printed.
  if ! wget -c --tries=3 --timeout=30 "${DOWNLOAD_URL}" -O "${PKG_NAME}.part"; then
    log_error "Download failed. Please check network or URL."
  fi
  mv -- "${PKG_NAME}.part" "${PKG_NAME}"
fi
# Unpack into a clean directory.
log_step "Extracting source code..."
rm -rf "postgresql-${PG_VERSION}"
tar -zxf "${PKG_NAME}"
cd "postgresql-${PG_VERSION}"
# ==========================================
# 4. Configure, compile, install
# ==========================================
log_step "Configuring build..."
./configure --prefix="${INSTALL_PREFIX}" \
  --with-systemd \
  --with-openssl \
  --with-pam \
  --with-libxml \
  --with-libxslt
log_step "Compiling (using $(nproc) CPU cores)..."
make -j"$(nproc)" world
log_step "Installing..."
make install-world
# ==========================================
# 5. Environment variables
# ==========================================
# LD_LIBRARY_PATH is extended with ${VAR:+:$VAR} so that no trailing ":" is
# produced when the variable was empty; an empty entry makes the dynamic
# loader search the current working directory.
log_step "Configuring environment variables..."
cat > /etc/profile.d/pgsql.sh <<EOF
# PostgreSQL Environment
export PATH=${INSTALL_PREFIX}/bin:\$PATH
export LD_LIBRARY_PATH=${INSTALL_PREFIX}/lib\${LD_LIBRARY_PATH:+:\$LD_LIBRARY_PATH}
export PGHOME=${INSTALL_PREFIX}
export PGDATA=${PGDATA}
EOF
# Apply to the current shell as well.
export PATH="${INSTALL_PREFIX}/bin:${PATH}"
export LD_LIBRARY_PATH="${INSTALL_PREFIX}/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
# ==========================================
# 6. systemd service
# ==========================================
# The unit runs the postmaster directly (Type=notify; the build above uses
# --with-systemd). OOMScoreAdjust=-1000 is inherited by every child process,
# so PG_OOM_ADJUST_FILE / PG_OOM_ADJUST_VALUE are set as the PostgreSQL
# documentation advises: backends reset their own score to 0 and only the
# postmaster stays protected from the OOM killer.
log_step "Configuring systemd service..."
cat > /etc/systemd/system/postgresql.service <<EOF
[Unit]
Description=PostgreSQL ${PG_MAJOR} database server
Documentation=man:postgres(1)
After=network.target
[Service]
Type=notify
User=${PGUSER}
Group=${PGUSER}
Environment=PGDATA=${PGDATA}
Environment=PGHOME=${INSTALL_PREFIX}
Environment=PG_OOM_ADJUST_FILE=/proc/self/oom_score_adj
Environment=PG_OOM_ADJUST_VALUE=0
ExecStart=${INSTALL_PREFIX}/bin/postgres -D \${PGDATA}
ExecReload=/bin/kill -HUP \$MAINPID
KillMode=mixed
KillSignal=SIGINT
TimeoutSec=0
LimitNOFILE=65536
LimitNPROC=32768
# OOM Score adjustment (make it less likely to be killed)
OOMScoreAdjust=-1000
[Install]
WantedBy=multi-user.target
EOF
systemctl daemon-reload
systemctl enable postgresql
# ==========================================
# 7. Ownership and permissions
# ==========================================
log_step "Setting permissions..."
chown -R "${PGUSER}:${PGUSER}" "${INSTALL_PREFIX}" "${PGDATA}"
chmod 700 "${PGDATA}"
# ==========================================
# Done
# ==========================================
echo
log_info "=================================================="
log_info " PostgreSQL ${PG_VERSION} installation complete!"
log_info " Install Path: ${INSTALL_PREFIX}"
log_info " Systemd: systemctl start postgresql"
log_info " Next step: Initialize the database (initdb)"
log_info "=================================================="
echo
#!/bin/bash
# Second variant of the source installer; its systemd unit drives the server
# through pg_ctl (Type=forking).
set -e
# ==========================================
# Settings
# ==========================================
PG_VERSION="15.15"
PG_MAJOR="15"
INSTALL_PREFIX="/apps/pgsql"
PGDATA="/data/pgsql"
PGUSER="postgres"
DOWNLOAD_URL="https://mirrors.aliyun.com/postgresql/source/v${PG_VERSION}/postgresql-${PG_VERSION}.tar.gz"
SRC_DIR="/usr/local/src"

# ANSI colours, expanded by printf's %b in the loggers.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m'

# Coloured loggers; log_error additionally exits with status 1.
log_info() {
  printf '%b\n' "${GREEN}[INFO]${NC} $1"
}
log_warn() {
  printf '%b\n' "${YELLOW}[WARN]${NC} $1"
}
log_error() {
  printf '%b\n' "${RED}[ERROR]${NC} $1"
  exit 1
}
log_step() {
  printf '%b\n' "${BLUE}[STEP]${NC} $1"
}
# ==========================================
# Pre-flight checks
# ==========================================
if (( EUID != 0 )); then
  log_error "Must run as root"
fi
# Ask before rebuilding over an existing installation.
if [[ -f "${INSTALL_PREFIX}/bin/psql" ]]; then
  log_warn "PostgreSQL is already installed at ${INSTALL_PREFIX}"
  read -p "Do you want to re-install? (This will NOT delete data) (yes/no): " confirm
  if [[ "$confirm" != "yes" ]]; then
    log_info "Exiting installation."
    exit 0
  fi
fi
# ==========================================
# 1. Build dependencies
# ==========================================
log_step "Installing build dependencies..."
apt update
pkg_list=(
  build-essential libreadline-dev zlib1g-dev libxml2-dev
  libxslt1-dev libssl-dev libpam0g-dev libsystemd-dev
  git curl wget pkg-config
)
apt install -y "${pkg_list[@]}"
# ==========================================
# 2. Directories and service account
# ==========================================
log_step "Preparing directories..."
for dir in "${INSTALL_PREFIX}" "${INSTALL_PREFIX}/run" "${PGDATA}" "${SRC_DIR}"; do
  mkdir -p "$dir"
done
# Create the postgres account only when it does not exist yet.
if ! id "${PGUSER}" &>/dev/null; then
  useradd -m -s /bin/bash "${PGUSER}"
  log_info "User ${PGUSER} created."
fi
# ==========================================
# 3. Download and unpack the source
# ==========================================
cd "${SRC_DIR}"
PKG_NAME="postgresql-${PG_VERSION}.tar.gz"
# Do not reuse a truncated tarball from an interrupted run: check the gzip
# stream and discard the file when it is damaged.
if [ -f "${PKG_NAME}" ] && ! gzip -t "${PKG_NAME}" 2>/dev/null; then
  log_warn "Existing ${PKG_NAME} is incomplete or corrupt, downloading again."
  rm -f -- "${PKG_NAME}"
fi
if [ -f "${PKG_NAME}" ]; then
  log_info "Source package already exists, skipping download."
else
  log_step "Downloading PostgreSQL ${PG_VERSION}..."
  # Fetch into a .part file and rename only after wget succeeded. The explicit
  # test makes the error message reachable despite `set -e`.
  if ! wget -c --tries=3 --timeout=30 "${DOWNLOAD_URL}" -O "${PKG_NAME}.part"; then
    log_error "Download failed. Please check network or URL."
  fi
  mv -- "${PKG_NAME}.part" "${PKG_NAME}"
fi
log_step "Extracting source code..."
rm -rf "postgresql-${PG_VERSION}"
tar -zxf "${PKG_NAME}"
cd "postgresql-${PG_VERSION}"
# ==========================================
# 4. Configure, compile, install
# ==========================================
log_step "Configuring build..."
./configure --prefix="${INSTALL_PREFIX}" \
  --with-systemd \
  --with-openssl \
  --with-pam \
  --with-libxml \
  --with-libxslt
log_step "Compiling (using $(nproc) CPU cores)..."
make -j"$(nproc)" world
log_step "Installing..."
make install-world
# ==========================================
# 5. Environment variables
# ==========================================
# ${VAR:+:$VAR} avoids a trailing ":" in LD_LIBRARY_PATH when it was empty;
# an empty entry would add the current directory to the library search path.
log_step "Configuring environment variables..."
cat > /etc/profile.d/pgsql.sh <<EOF
# PostgreSQL Environment
export PATH=${INSTALL_PREFIX}/bin:\$PATH
export LD_LIBRARY_PATH=${INSTALL_PREFIX}/lib\${LD_LIBRARY_PATH:+:\$LD_LIBRARY_PATH}
export PGHOME=${INSTALL_PREFIX}
export PGDATA=${PGDATA}
EOF
export PATH="${INSTALL_PREFIX}/bin:${PATH}"
export LD_LIBRARY_PATH="${INSTALL_PREFIX}/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
# ==========================================
# 6. systemd service driven by pg_ctl
# ==========================================
# Type=forking unit with all paths written out in full. OOMScoreAdjust=-1000
# is inherited by every process started from the unit, so
# PG_OOM_ADJUST_FILE / PG_OOM_ADJUST_VALUE are exported as the PostgreSQL
# documentation advises: backends reset their own score to 0 and only the
# postmaster stays exempt from the OOM killer.
log_step "Configuring systemd service (Optimized for stability)..."
cat > /etc/systemd/system/postgresql.service <<EOF
[Unit]
Description=PostgreSQL ${PG_MAJOR} database server
Documentation=man:postgres(1)
After=network.target
[Service]
# 优化点1:使用 forking + pg_ctl,最稳健的组合
Type=forking
User=${PGUSER}
Group=${PGUSER}
# 优化点2:写死绝对路径,不依赖变量解析
Environment=PATH=${INSTALL_PREFIX}/bin:/usr/bin:/bin
Environment=PG_OOM_ADJUST_FILE=/proc/self/oom_score_adj
Environment=PG_OOM_ADJUST_VALUE=0
# 优化点3:使用 pg_ctl,自带等待启动、优雅关闭
# -s: 静默模式
# -w: 等待数据库完全启动才返回
# -t 300: 超时时间 300 秒
ExecStart=${INSTALL_PREFIX}/bin/pg_ctl start -D ${PGDATA} -s -w -t 300
ExecStop=${INSTALL_PREFIX}/bin/pg_ctl stop -D ${PGDATA} -s -m fast
ExecReload=${INSTALL_PREFIX}/bin/pg_ctl reload -D ${PGDATA} -s
# 优化点4:资源限制
TimeoutSec=300
LimitNOFILE=65536
LimitNPROC=32768
# 优化点5:OOM 分数调整,让数据库不容易被 OOM Killer 杀掉
OOMScoreAdjust=-1000
[Install]
WantedBy=multi-user.target
EOF
systemctl daemon-reload
systemctl enable postgresql
# ==========================================
# 7. Ownership and permissions
# ==========================================
log_step "Setting permissions..."
chown -R "${PGUSER}:${PGUSER}" "${INSTALL_PREFIX}" "${PGDATA}"
chmod 700 "${PGDATA}"
# ==========================================
# Done
# ==========================================
echo
log_info "=================================================="
log_info " PostgreSQL ${PG_VERSION} installation complete!"
log_info " Install Path: ${INSTALL_PREFIX}"
log_info " Data Path: ${PGDATA}"
log_info ""
log_info " Systemd Commands:"
log_info " Start: systemctl start postgresql"
log_info " Stop: systemctl stop postgresql"
log_info " Status: systemctl status postgresql"
log_info ""
log_info " Next steps (as root):"
log_info " 1. su - postgres"
log_info " 2. initdb -D ${PGDATA}"
log_info " 3. systemctl start postgresql"
log_info "=================================================="
echo
3.主库初始化 init_primary_db.sh
#!/bin/bash
# Initialises the primary database: initdb, configuration files, service
# start-up and the repmgr role/database.
set -e
# ==========================================
# Settings and paths (edit here to adapt to the environment)
# ==========================================
PGHOME="/apps/pgsql"
PGDATA="/data/pgsql"
PGUSER="postgres"
PGPORT="5432"
REPMGR_PASS="repmgr123"
SUBNET="10.0.0.0/24"
# Absolute paths to the binaries, so nothing depends on PATH.
INITDB="${PGHOME}/bin/initdb"
PSQL="${PGHOME}/bin/psql"
PG_CTL="${PGHOME}/bin/pg_ctl"

# ANSI colours, expanded by printf's %b in the loggers.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m'

# Coloured loggers; log_error additionally exits with status 1.
log_info() {
  printf '%b\n' "${GREEN}[INFO]${NC} $1"
}
log_warn() {
  printf '%b\n' "${YELLOW}[WARN]${NC} $1"
}
log_error() {
  printf '%b\n' "${RED}[ERROR]${NC} $1"
  exit 1
}
# ==========================================
# Pre-flight checks
# ==========================================
if [ "$EUID" -ne 0 ]; then
  log_error "Must run as root"
fi
# The PostgreSQL binaries must already be installed.
if [ ! -f "$INITDB" ]; then
  log_error "PostgreSQL binaries not found at $PGHOME. Did you run the compile step?"
fi
# ==========================================
# 1. Directory preparation
# ==========================================
log_info "Preparing directories..."
# PGDATA is created here as well, so the chown/chmod below cannot fail on a
# missing directory.
mkdir -p "${PGHOME}/run" "${PGDATA}"
chown -R "${PGUSER}:${PGUSER}" "${PGHOME}" "${PGDATA}"
chmod 700 "${PGDATA}"
# Refuse to overwrite an existing data directory without confirmation.
if [ -n "$(ls -A "${PGDATA}")" ]; then
  log_warn "Data directory ${PGDATA} is not empty!"
  read -r -p "Do you want to CLEAN ALL DATA and re-initialize? (yes/no): " confirm
  if [ "$confirm" != "yes" ]; then
    log_error "User cancelled. Exiting."
  fi
  log_warn "Cleaning data directory..."
  # NOTE(review): nothing here stops a running server before its data
  # directory is wiped - confirm the service is down first.
  # find removes hidden entries too; `rm -rf dir/*` leaves dotfiles behind and
  # initdb then refuses the directory as non-empty. ${PGDATA:?} aborts rather
  # than expanding to "/" should the variable ever be empty.
  find "${PGDATA:?}" -mindepth 1 -delete
fi
# ==========================================
# 2. Initialise the cluster (initdb)
# ==========================================
log_info "Running initdb..."
# The command is tested directly: with `set -e` a separate `$?` check after
# the command is never reached on failure, so the error message would never
# be printed.
if su - "${PGUSER}" -c "${INITDB} -D ${PGDATA} -E UTF8 --locale=en_US.UTF-8"; then
  log_info "initdb completed successfully."
else
  log_error "initdb failed!"
fi
# ==========================================
# 3. postgresql.conf
# ==========================================
log_info "Configuring postgresql.conf..."
pg_conf_file="${PGDATA}/postgresql.conf"
# Keep a copy of the generated file, then append the overrides at the end.
su - "${PGUSER}" -c "cp ${pg_conf_file} ${pg_conf_file}.backup"
cat <<EOF >> "${pg_conf_file}"
# ==========================================
# Custom Configuration for HA
# ==========================================
listen_addresses = '*'
port = ${PGPORT}
unix_socket_directories = '${PGHOME}/run,/tmp'
unix_socket_permissions = 0700
# WAL Settings
wal_level = replica
max_wal_senders = 10
wal_keep_size = 2GB
hot_standby = on
# Logging
log_destination = 'stderr'
logging_collector = on
log_directory = 'log'
log_filename = 'postgresql-%Y-%m-%d.log'
log_line_prefix = '%t [%p]: [%l-1] user=%u,db=%d,app=%a,client=%h '
EOF
chown "${PGUSER}:${PGUSER}" "${pg_conf_file}"
log_info "postgresql.conf configured."
# ==========================================
# 4. pg_hba.conf
# ==========================================
log_info "Configuring pg_hba.conf..."
pg_hba_file="${PGDATA}/pg_hba.conf"
su - "${PGUSER}" -c "cp ${pg_hba_file} ${pg_hba_file}.backup"
# Replication and general access rules for the cluster subnet, appended after
# the entries generated by initdb.
cat <<EOF >> "${pg_hba_file}"
# ==========================================
# Replication Access
# ==========================================
local replication repmgr trust
host replication repmgr ${SUBNET} scram-sha-256
host repmgr repmgr ${SUBNET} scram-sha-256
# ==========================================
# General Access
# ==========================================
host all all ${SUBNET} scram-sha-256
EOF
chown "${PGUSER}:${PGUSER}" "${pg_hba_file}"
log_info "pg_hba.conf configured."
# ==========================================
# 5. Start the database through systemd
# ==========================================
log_info "Starting PostgreSQL via systemd..."
# Create a minimal unit only when none exists yet.
if [ ! -f "/etc/systemd/system/postgresql.service" ]; then
  log_warn "Systemd service file not found, creating one..."
  cat > /etc/systemd/system/postgresql.service <<EOF
[Unit]
Description=PostgreSQL database server
After=network.target
[Service]
Type=notify
User=${PGUSER}
Group=${PGUSER}
Environment=PGDATA=${PGDATA}
ExecStart=${PGHOME}/bin/postgres -D ${PGDATA}
ExecReload=/bin/kill -HUP \$MAINPID
KillMode=mixed
KillSignal=SIGINT
TimeoutSec=0
LimitNOFILE=65536
[Install]
WantedBy=multi-user.target
EOF
  systemctl daemon-reload
fi
# Both the start command and the follow-up state check are tested explicitly:
# with `set -e` a bare `systemctl start` that fails would end the script
# before the hint about journalctl is printed.
if systemctl start postgresql && systemctl is-active --quiet postgresql; then
  log_info "PostgreSQL started successfully."
else
  log_error "PostgreSQL failed to start. Check 'journalctl -xeu postgresql' for details."
fi
# ==========================================
# 6. repmgr role and database
# ==========================================
log_info "Creating repmgr user and database..."
# Short pause to give the server time to accept connections.
sleep 2
# Each statement is run as the postgres OS user over the local socket.
bootstrap_statements=(
  "CREATE USER repmgr WITH SUPERUSER PASSWORD '${REPMGR_PASS}';"
  "CREATE DATABASE repmgr OWNER repmgr;"
)
for statement in "${bootstrap_statements[@]}"; do
  su - "${PGUSER}" -c "${PSQL} -h ${PGHOME}/run -p ${PGPORT} -c \"${statement}\""
done
log_info "repmgr user created."
# ==========================================
# Done
# ==========================================
echo
summary_lines=(
  "============================================="
  " Primary DB (db1) initialization complete!"
  " Data Dir: ${PGDATA}"
  " Socket Dir: ${PGHOME}/run"
  " Access: psql -h ${PGHOME}/run"
  "============================================="
)
for summary_line in "${summary_lines[@]}"; do
  log_info "$summary_line"
done
echo
#!/bin/bash
# Second variant of the primary initialisation script (sed-based
# configuration, pg_ctl-driven service, readiness polling).
set -e
# ==========================================
# Settings and paths (edit here to adapt to the environment)
# ==========================================
PGHOME="/apps/pgsql"
PGDATA="/data/pgsql"
PGUSER="postgres"
PGPORT="5432"
# IMPORTANT: change this password for production use.
REPMGR_PASS="repmgr123"
SUBNET="10.0.0.0/24"
# Absolute paths to the binaries, so nothing depends on PATH.
INITDB="${PGHOME}/bin/initdb"
PSQL="${PGHOME}/bin/psql"
PG_CTL="${PGHOME}/bin/pg_ctl"
PG_ISREADY="${PGHOME}/bin/pg_isready"

# ANSI colours, expanded by printf's %b in the loggers.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m'

# Coloured loggers; log_error additionally exits with status 1.
log_info() {
  printf '%b\n' "${GREEN}[INFO]${NC} $1"
}
log_warn() {
  printf '%b\n' "${YELLOW}[WARN]${NC} $1"
}
log_error() {
  printf '%b\n' "${RED}[ERROR]${NC} $1"
  exit 1
}
# ==========================================
# Pre-flight checks
# ==========================================
if [ "$EUID" -ne 0 ]; then
  log_error "Must run as root"
fi
if [ ! -f "$INITDB" ]; then
  log_error "PostgreSQL binaries not found at $PGHOME. Did you run the compile step?"
fi
# ==========================================
# 1. Directory preparation
# ==========================================
log_info "Preparing directories..."
# Only PGDATA itself is created. Creating ${PGDATA}/log at this point would
# make the emptiness check below fire on every run, including a fresh
# install. NOTE(review): the log directory is expected to be created by the
# server once logging_collector is on - confirm on first start.
mkdir -p "${PGHOME}/run" "${PGDATA}"
chown -R "${PGUSER}:${PGUSER}" "${PGHOME}"
# initdb runs as ${PGUSER}, so the data directory must belong to that user.
chown -R "${PGUSER}:${PGUSER}" "${PGDATA}"
chmod 700 "${PGDATA}"
# Ask before wiping a data directory that already has content.
if [ -n "$(ls -A "${PGDATA}" 2>/dev/null)" ]; then
  log_warn "Data directory ${PGDATA} is not empty!"
  read -r -p "Do you want to CLEAN ALL DATA and re-initialize? (yes/no): " confirm
  if [ "$confirm" != "yes" ]; then
    log_error "User cancelled. Exiting."
  fi
  log_warn "Cleaning data directory..."
  # find removes hidden entries too (a `dir/*` glob skips dotfiles, which
  # makes initdb reject the directory). ${PGDATA:?} guards against an empty
  # variable turning this into a delete of "/".
  find "${PGDATA:?}" -mindepth 1 -delete
fi
# ==========================================
# 2. Initialise the cluster (initdb)
# ==========================================
log_info "Running initdb..."
su - "${PGUSER}" -c "${INITDB} -D ${PGDATA} -E UTF8 --locale=en_US.UTF-8"
log_info "initdb completed successfully."
# ==========================================
# 3. postgresql.conf (parameters replaced in place rather than appended)
# ==========================================
log_info "Configuring postgresql.conf..."
su - "${PGUSER}" -c "cp ${PGDATA}/postgresql.conf ${PGDATA}/postgresql.conf.backup.$(date +%s)"

# set_pg_param NAME VALUE
#   Rewrite the "NAME = ..." line of postgresql.conf (commented or not) to
#   "NAME = VALUE"; append the setting when no such line exists.
#   "|" is used as the sed delimiter and the value is escaped, because values
#   such as the socket directory contain "/" - with "/" as delimiter sed
#   rejects the expression and `set -e` aborts the script.
set_pg_param() {
  local name=$1 value=$2
  local conf="${PGDATA}/postgresql.conf"
  local escaped
  escaped=$(printf '%s' "$value" | sed -e 's/[\\&|]/\\&/g')
  if grep -q "^#*${name} = " "$conf"; then
    sed -i "s|^#*${name} = .*|${name} = ${escaped}|" "$conf"
  else
    printf '%s = %s\n' "$name" "$value" >> "$conf"
  fi
}

# Connection settings
set_pg_param listen_addresses "'*'"
set_pg_param port "${PGPORT}"
set_pg_param unix_socket_directories "'${PGHOME}/run,/tmp'"
set_pg_param unix_socket_permissions "0700"
# WAL / replication settings
set_pg_param wal_level "replica"
set_pg_param max_wal_senders "10"
set_pg_param wal_keep_size "2GB"
set_pg_param hot_standby "on"
set_pg_param wal_log_hints "on" # preparation for pg_rewind
# Logging
set_pg_param log_destination "'stderr'"
set_pg_param logging_collector "on"
set_pg_param log_directory "'log'"
set_pg_param log_filename "'postgresql-%Y-%m-%d.log'"
set_pg_param log_line_prefix "'%t [%p]: [%l-1] user=%u,db=%d,app=%a,client=%h '"

chown "${PGUSER}:${PGUSER}" "${PGDATA}/postgresql.conf"
log_info "postgresql.conf configured (using sed replacement)."
# ==========================================
# 4. pg_hba.conf
# ==========================================
log_info "Configuring pg_hba.conf..."
su - "${PGUSER}" -c "cp ${PGDATA}/pg_hba.conf ${PGDATA}/pg_hba.conf.backup.$(date +%s)"
# Remove the block written by an earlier run before appending a fresh one.
# Distinct BEGIN/END markers are required for this: when the start and end
# pattern are the same banner line, the range only deletes the three-line
# banners and leaves the rule lines in place, so they would be duplicated.
sed -i '/^# BEGIN deploy-managed HA rules$/,/^# END deploy-managed HA rules$/d' "${PGDATA}/pg_hba.conf"
cat >> "${PGDATA}/pg_hba.conf" <<EOF
# BEGIN deploy-managed HA rules
# ==========================================
# Replication Access
# ==========================================
local replication repmgr trust
host replication repmgr ${SUBNET} scram-sha-256
host repmgr repmgr ${SUBNET} scram-sha-256
# ==========================================
# General Access
# ==========================================
host all all ${SUBNET} scram-sha-256
# END deploy-managed HA rules
EOF
chown "${PGUSER}:${PGUSER}" "${PGDATA}/pg_hba.conf"
log_info "pg_hba.conf configured."
# ==========================================
# 5. Deploy the pg_ctl-based systemd unit and start the server
# ==========================================
log_info "Checking & deploying systemd service..."
# An existing unit is always backed up and then replaced.
if [ -f "/etc/systemd/system/postgresql.service" ]; then
  log_warn "Existing systemd service file found, backing up..."
  cp /etc/systemd/system/postgresql.service "/etc/systemd/system/postgresql.service.backup.$(date +%s)"
fi
# Type=forking + pg_ctl + absolute paths. OOMScoreAdjust=-1000 is inherited
# by all child processes, so PG_OOM_ADJUST_FILE / PG_OOM_ADJUST_VALUE are set
# as the PostgreSQL documentation advises: backends reset their own score to
# 0 and only the postmaster stays protected from the OOM killer.
cat > /etc/systemd/system/postgresql.service <<EOF
[Unit]
Description=PostgreSQL database server
After=network.target
[Service]
Type=forking
User=${PGUSER}
Group=${PGUSER}
Environment=PATH=${PGHOME}/bin:/usr/bin:/bin
Environment=PG_OOM_ADJUST_FILE=/proc/self/oom_score_adj
Environment=PG_OOM_ADJUST_VALUE=0
ExecStart=${PGHOME}/bin/pg_ctl start -D ${PGDATA} -s -w -t 300
ExecStop=${PGHOME}/bin/pg_ctl stop -D ${PGDATA} -s -m fast
ExecReload=${PGHOME}/bin/pg_ctl reload -D ${PGDATA} -s
TimeoutSec=300
LimitNOFILE=65536
LimitNPROC=32768
OOMScoreAdjust=-1000
[Install]
WantedBy=multi-user.target
EOF
systemctl daemon-reload
# Enabling is best effort; a failure here does not stop the initialisation.
systemctl enable postgresql 2>/dev/null || true
log_info "Starting PostgreSQL via systemd..."
systemctl start postgresql
# ==========================================
# 6. Poll with pg_isready until the server accepts connections
# ==========================================
log_info "Waiting for database to become ready..."
# Up to 30 probes, 2 seconds apart, over the local socket.
readiness_probe="${PG_ISREADY} -h ${PGHOME}/run -p ${PGPORT} -q"
attempt=1
while [ "$attempt" -le 30 ]; do
  if su - "${PGUSER}" -c "${readiness_probe}"; then
    log_info "Database is ready."
    break
  fi
  log_warn "Waiting... ($attempt/30)"
  sleep 2
  attempt=$((attempt + 1))
done
# One final probe decides whether the loop ended by success or by exhaustion.
if ! su - "${PGUSER}" -c "${readiness_probe}"; then
  log_error "Database did not become ready within 60 seconds. Check logs."
fi
# ==========================================
# 7. repmgr role and database
# ==========================================
log_info "Creating repmgr user and database..."

# repmgr_sql SQL
#   Run SQL as the postgres OS user over the local socket and print bare
#   result rows. The statement is sent on stdin, so the password does not
#   appear in the process list, and ON_ERROR_STOP makes a failing statement
#   produce a non-zero exit status.
repmgr_sql() {
  su - "${PGUSER}" -c "${PSQL} -X -q -At -v ON_ERROR_STOP=1 -h ${PGHOME}/run -p ${PGPORT}" <<<"$1"
}

# Existence is looked up in the catalogs instead of treating every psql
# failure as "already exists" with stderr discarded; a genuine error
# (connection, permissions, syntax) is now visible and stops the script.
if [[ "$(repmgr_sql "SELECT 1 FROM pg_roles WHERE rolname = 'repmgr';")" == "1" ]]; then
  log_warn "User repmgr already exists, skipping creation."
else
  repmgr_sql "CREATE USER repmgr WITH SUPERUSER PASSWORD '${REPMGR_PASS}';" >/dev/null
fi
if [[ "$(repmgr_sql "SELECT 1 FROM pg_database WHERE datname = 'repmgr';")" == "1" ]]; then
  log_warn "Database repmgr already exists, skipping creation."
else
  repmgr_sql "CREATE DATABASE repmgr OWNER repmgr;" >/dev/null
fi
log_info "repmgr user/database configured."
# ==========================================
# Done
# ==========================================
echo
log_info "============================================="
log_info " Primary DB (db1) initialization complete!"
log_info " Data Dir: ${PGDATA}"
log_info " Socket Dir: ${PGHOME}/run"
log_info " Access: psql -h ${PGHOME}/run"
log_info ""
log_info " Systemd Status: systemctl status postgresql"
log_info "============================================="
echo
4.安装repmgr
4.1主库配置
#!/bin/bash
# repmgr set-up for the primary node.
set -euo pipefail
# ================= Configuration =================
# Base paths
PGSQL_BIN_PATH="/apps/pgsql/bin"
PG_DATA_DIR="/data/pgsql"
PG_BACKUP_DIR="/data/backup" # backup directory kept outside PGDATA
REPMGR_ETC_DIR="/apps/repmgr/etc"
REPMGR_LOG_DIR="/apps/repmgr/log"
# Node identity
PRIMARY_PHYSICAL_IP="10.0.0.101"
PRIMARY_NODE_NAME="db1"
PRIMARY_NODE_ID=1
# Credentials (must be identical on every node of the cluster)
# NOTE(review): the primary initialisation scripts in this document create the
# repmgr role with the password "repmgr123", while this value is
# "postgres123" - confirm which one is intended.
REPMGR_PASSWORD="postgres123"
REPMGR_USER="repmgr"
REPMGR_DB="repmgr"
PRIMARY_PORT=5432
# Cluster node list (used for automatic pg_hba configuration)
CLUSTER_NODES=("10.0.0.101" "10.0.0.102" "10.0.0.103")
# Service management
PG_SERVICE_NAME="postgresql"
POSTGRES_USER="postgres"
# =================================================
# Derived paths
PG_CONF="${PG_DATA_DIR}/postgresql.conf"
PG_HBA="${PG_DATA_DIR}/pg_hba.conf"
REPMGR_CONF="${REPMGR_ETC_DIR}/repmgr.conf"
REPMGR_PATH="${PGSQL_BIN_PATH}/repmgr"

# ANSI colours, expanded by printf's %b in the loggers.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m'

# Coloured loggers; log_error additionally exits with status 1.
log_info() {
  printf '%b\n' "${GREEN}[INFO]${NC} $1"
}
log_warn() {
  printf '%b\n' "${YELLOW}[WARN]${NC} $1"
}
log_error() {
  printf '%b\n' "${RED}[ERROR]${NC} $1"
  exit 1
}
# ================= Pre-flight checks =================
log_info "=========================================="
log_info " repmgr 主库初始化 (${PRIMARY_NODE_NAME})"
log_info "=========================================="
if [ "$EUID" -ne 0 ]; then
  log_error "请使用 root 用户执行"
fi
if [ ! -f "${PGSQL_BIN_PATH}/pg_config" ]; then
  log_error "未找到 pg_config,请检查 PostgreSQL 安装"
fi
if [ ! -d "${PG_DATA_DIR}" ]; then
  log_error "数据目录不存在: ${PG_DATA_DIR}"
fi
# [Opt 2] the repmgr binary must already be installed on the primary
if [ ! -f "${REPMGR_PATH}" ]; then
  log_error "未找到 repmgr 二进制文件: ${REPMGR_PATH},请先编译安装"
fi
# [Opt 3] warn about sub-directories of PGDATA that are not part of a
# standard PostgreSQL data directory layout (warning only, never fatal)
log_info "检查数据目录纯净度..."
for entry in "${PG_DATA_DIR}"/*; do
  [ -d "${entry}" ] || continue
  case "$(basename "${entry}")" in
    base|global|pg_commit_ts|pg_dynshmem|pg_logical|pg_multixact|pg_notify|pg_replslot|pg_serial|pg_snapshots|pg_stat|pg_stat_tmp|pg_subtrans|pg_tblspc|pg_twophase|pg_wal|pg_xact|log)
      ;;
    *)
      log_warn "发现非标准目录: ${entry}"
      log_warn "为避免从库克隆失败,建议将其移出 ${PG_DATA_DIR}"
      ;;
  esac
done
# ================= Environment preparation =================
log_info "[1/6] 准备目录与环境..."
mkdir -p "${PG_BACKUP_DIR}" "${REPMGR_ETC_DIR}" "${REPMGR_LOG_DIR}"
chown -R "${POSTGRES_USER}": "${PG_BACKUP_DIR}" "${REPMGR_ETC_DIR}" "${REPMGR_LOG_DIR}"
chmod 700 "${PG_BACKUP_DIR}" "${REPMGR_LOG_DIR}"
# [Opt 4] configure .pgpass (the primary needs it too, for later administration)
log_info "配置 .pgpass..."
# Look the home directory up in the passwd database instead of assuming
# /home/<user>; fall back to the historical path when the lookup yields nothing.
POSTGRES_HOME=$(getent passwd "${POSTGRES_USER}" | cut -d: -f6 || true)
POSTGRES_HOME="${POSTGRES_HOME:-/home/${POSTGRES_USER}}"
PGPASS_FILE="${POSTGRES_HOME}/.pgpass"
# Create the file under a restrictive umask so the password is never readable
# by other users, not even for the instant before chmod runs.
(
  umask 077
  cat > "${PGPASS_FILE}" << EOF
*:*:*:${REPMGR_USER}:${REPMGR_PASSWORD}
EOF
)
chown "${POSTGRES_USER}": "${PGPASS_FILE}"
chmod 0600 "${PGPASS_FILE}"
# ================= Database configuration =================
log_info "[2/6] 创建 repmgr 用户与数据库..."
# Probe the catalogs first and create only what is missing. The previous
# "2>/dev/null || warn" form reported every failure (including a refused
# connection) as "already exists" and carried on.
if su - "${POSTGRES_USER}" -c "${PGSQL_BIN_PATH}/psql -p ${PRIMARY_PORT} -tAc \"SELECT 1 FROM pg_roles WHERE rolname='${REPMGR_USER}'\"" | grep -q 1; then
  log_warn "用户已存在,跳过创建"
else
  su - "${POSTGRES_USER}" -c "${PGSQL_BIN_PATH}/psql -p ${PRIMARY_PORT} -c \"CREATE USER ${REPMGR_USER} WITH SUPERUSER PASSWORD '${REPMGR_PASSWORD}';\"" \
    || log_error "创建用户 ${REPMGR_USER} 失败,请检查数据库连接与上方报错"
fi
if su - "${POSTGRES_USER}" -c "${PGSQL_BIN_PATH}/psql -p ${PRIMARY_PORT} -tAc \"SELECT 1 FROM pg_database WHERE datname='${REPMGR_DB}'\"" | grep -q 1; then
  log_warn "数据库已存在,跳过创建"
else
  su - "${POSTGRES_USER}" -c "${PGSQL_BIN_PATH}/createdb -p ${PRIMARY_PORT} -O ${REPMGR_USER} ${REPMGR_DB}" \
    || log_error "创建数据库 ${REPMGR_DB} 失败,请检查数据库连接与上方报错"
fi
log_info "[3/6] 修改 postgresql.conf..."
# [Opt 5] keep a one-time backup of postgresql.conf before editing it
if [ ! -f "${PG_CONF}.bak.repmgr" ]; then
  cp "${PG_CONF}" "${PG_CONF}.bak.repmgr"
  log_info "已备份 postgresql.conf 为 ${PG_CONF}.bak.repmgr"
fi
#######################################
# Set "key = value" in the PostgreSQL configuration file.
# An existing line for the key (active or commented out with leading '#')
# is rewritten in place; otherwise the setting is appended.
# Globals:   PG_CONF (read) - path of the file to edit
# Arguments: $1 - parameter name
#            $2 - value, written verbatim (quote it yourself if needed)
# Limitation: the value is interpolated into a sed replacement, so it must
#             not contain '/' or '&'.
#######################################
set_conf() {
  local key=$1
  local value=$2
  # The key must be followed by '=' or whitespace so that it only matches
  # the whole parameter name; e.g. "ssl" must not rewrite "ssl_cert_file".
  local pattern="^#*${key}([[:space:]]*=|[[:space:]]+)"
  if grep -Eq -- "${pattern}" "${PG_CONF}"; then
    sed -i -E "s/${pattern}.*/${key} = ${value}/" "${PG_CONF}"
  else
    echo "${key} = ${value}" >> "${PG_CONF}"
  fi
}
# Replication parameters required by repmgr, applied in the listed order.
# Each entry is "name=value"; the value is passed to set_conf verbatim.
REPLICATION_SETTINGS=(
  "wal_level=logical"
  "max_wal_senders=10"
  "max_replication_slots=10"
  "wal_keep_size=1024"
  "shared_preload_libraries='repmgr'"
  "wal_log_hints=on"
)
for setting in "${REPLICATION_SETTINGS[@]}"; do
  set_conf "${setting%%=*}" "${setting#*=}"
done
log_info "[4/6] 配置 pg_hba.conf..."
# The marker line makes the pg_hba.conf edit idempotent across reruns.
HBA_MARKER="# repmgr cluster auto config"
if ! grep -qF "${HBA_MARKER}" "${PG_HBA}"; then
  # [Opt 6] one-time backup of pg_hba.conf before the first modification
  if [ ! -f "${PG_HBA}.bak.repmgr" ]; then
    cp ${PG_HBA} ${PG_HBA}.bak.repmgr
    log_info "已备份 pg_hba.conf 为 ${PG_HBA}.bak.repmgr"
  fi
  # Append the marker followed by a replication rule and a repmgr-database
  # rule for every cluster node.
  {
    printf '\n%s\n' "${HBA_MARKER}"
    for node_ip in "${CLUSTER_NODES[@]}"; do
      printf 'host replication %s %s/32 scram-sha-256\n' "${REPMGR_USER}" "${node_ip}"
      printf 'host %s %s %s/32 scram-sha-256\n' "${REPMGR_DB}" "${REPMGR_USER}" "${node_ip}"
    done
  } >> "${PG_HBA}"
fi
# [Opt 7] set the owner of pg_hba.conf explicitly
chown ${POSTGRES_USER}: ${PG_HBA}
# ================= Register the primary node =================
log_info "[5/6] 生成配置并注册主节点..."
# Write repmgr.conf for this node. The heredoc delimiter is unquoted, so every
# ${...} below is expanded now and the file contains literal values.
# NOTE(review): conninfo embeds the repmgr password in clear text even though a
# .pgpass file is also written by this script — confirm both are wanted.
cat > ${REPMGR_CONF} << EOF
node_id=${PRIMARY_NODE_ID}
node_name='${PRIMARY_NODE_NAME}'
conninfo='host=${PRIMARY_PHYSICAL_IP} port=${PRIMARY_PORT} user=${REPMGR_USER} dbname=${REPMGR_DB} password=${REPMGR_PASSWORD} connect_timeout=2'
pg_bindir='${PGSQL_BIN_PATH}'
data_directory='${PG_DATA_DIR}'
use_replication_slots=yes
failover=automatic
promote_command='${REPMGR_PATH} standby promote -f ${REPMGR_CONF} --log-to-file'
follow_command='${REPMGR_PATH} standby follow -f ${REPMGR_CONF} --log-to-file --upstream-node-id=%n'
log_file='${REPMGR_LOG_DIR}/repmgrd.log'
log_level=INFO
EOF
# The file holds a password: hand it to the postgres user and restrict it to 0600.
chown ${POSTGRES_USER}: ${REPMGR_CONF}
chmod 600 ${REPMGR_CONF}
# Restart so that the postgresql.conf / pg_hba.conf changes take effect.
log_info "正在重启 PostgreSQL 以应用配置..."
systemctl restart "${PG_SERVICE_NAME}"
log_info "等待数据库启动..."
# Poll pg_isready: up to 30 attempts, 2 seconds apart, then give up.
attempt=1
until su - "${POSTGRES_USER}" -c "${PGSQL_BIN_PATH}/pg_isready -q"; do
  if [ "${attempt}" -ge 30 ]; then
    log_error "数据库启动超时"
  fi
  sleep 2
  attempt=$((attempt + 1))
done
log_info "正在注册主节点..."
su - "${POSTGRES_USER}" -c "${REPMGR_PATH} -f ${REPMGR_CONF} primary register"
# ================= Final verification =================
SEPARATOR="=========================================="
log_info "[6/6] 部署完成,验证状态..."
log_info "${SEPARATOR}"
su - "${POSTGRES_USER}" -c "${REPMGR_PATH} -f ${REPMGR_CONF} cluster show"
log_info "${SEPARATOR}"
# Summary of the resulting configuration
for summary_line in \
  "" \
  "【配置说明】" \
  "✓ 主库节点名: ${PRIMARY_NODE_NAME} (ID: ${PRIMARY_NODE_ID})" \
  "✓ 备份目录: ${PG_BACKUP_DIR} (已移出 PGDATA)" \
  "✓ 日志文件: ${REPMGR_LOG_DIR}/repmgrd.log" \
  ""; do
  log_info "${summary_line}"
done
4.2 从库配置
#!/bin/bash
set -euo pipefail
# ================= Configuration (paths) =================
REPMGR_VERSION="5.5.0"
# repmgr is installed straight into the PostgreSQL installation directory
PGSQL_BIN_PATH="/apps/pgsql/bin"
PG_DATA_DIR="/data/pgsql"
# Keep backups OUTSIDE PGDATA, as the primary and witness scripts do. A backup
# directory inside PGDATA makes the data directory non-empty, which trips this
# script's own empty-directory check on a rerun and is removed by
# "repmgr standby clone --force".
PG_BACKUP_DIR="/data/backup"
# repmgr configuration and logs stay in their own directories
REPMGR_ETC_DIR="/apps/repmgr/etc"
REPMGR_LOG_DIR="/apps/repmgr/log"
PRIMARY_PHYSICAL_IP="10.0.0.101"
STANDBY_PHYSICAL_IP="10.0.0.102"
STANDBY_NODE_NAME="db2"
STANDBY_NODE_ID=2
WITNESS_IP="10.0.0.103"
REPMGR_PASSWORD="postgres123"
REPMGR_USER="repmgr"
REPMGR_DB="repmgr"
PRIMARY_PORT=5432
PG_SERVICE_NAME="postgresql"
POSTGRES_USER="postgres"
# ================= Derived paths and tools =================
PG_CONF="${PG_DATA_DIR}/postgresql.conf"
PG_HBA="${PG_DATA_DIR}/pg_hba.conf"
REPMGR_CONF="${REPMGR_ETC_DIR}/repmgr.conf"
PG_CONFIG="${PGSQL_BIN_PATH}/pg_config"
# repmgr lives in the PostgreSQL bin directory
REPMGR_PATH="${PGSQL_BIN_PATH}/repmgr"
PATH="${PGSQL_BIN_PATH}:${PATH}"
# ANSI colors for log output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m'
# Logging helpers: each prints its first argument with a colored level tag.
log_info() { echo -e "${GREEN}[INFO]${NC} $1"; }
log_warn() { echo -e "${YELLOW}[WARN]${NC} $1"; }
# log_error terminates the script with status 1 after printing the message.
log_error() { echo -e "${RED}[ERROR]${NC} $1"; exit 1; }
# ================= 0. Strict pre-flight checks =================
log_info "=========================================="
log_info " repmgr 从库一键部署 (PG目录集成版)"
log_info "=========================================="
[ "$EUID" -ne 0 ] && log_error "请用 root 执行"
[ ! -f "${PG_CONFIG}" ] && log_error "未找到 pg_config: ${PG_CONFIG}"
[ ! -d "${PGSQL_BIN_PATH}" ] && log_error "PostgreSQL 二进制目录不存在: ${PGSQL_BIN_PATH}"
log_info "正在停止从库 PostgreSQL 服务..."
# Best effort: the service may not exist or may not be running yet.
systemctl stop "${PG_SERVICE_NAME}" 2>/dev/null || true
if [ -d "${PG_DATA_DIR}" ]; then
  # Refuse to touch a data directory that already holds files.
  if [ -n "$(ls -A "${PG_DATA_DIR}" 2>/dev/null)" ]; then
    # log_error exits immediately, so the clean-up hint has to be printed
    # first; previously it sat after log_error and was never shown.
    log_warn "为了安全,请手动清理: rm -rf ${PG_DATA_DIR}/*"
    log_error "数据目录 ${PG_DATA_DIR} 不为空!"
  fi
else
  mkdir -p "${PG_DATA_DIR}"
  chown -R "${POSTGRES_USER}": "${PG_DATA_DIR}"
fi
# Prepare the repmgr configuration, log and backup directories
mkdir -p "${REPMGR_ETC_DIR}" "${REPMGR_LOG_DIR}" "${PG_BACKUP_DIR}"
chown -R "${POSTGRES_USER}": "${REPMGR_ETC_DIR}" "${REPMGR_LOG_DIR}" "${PG_BACKUP_DIR}"
chmod 700 "${PG_DATA_DIR}" "${REPMGR_LOG_DIR}"
# ================= 1. Install build dependencies =================
log_info "[1/10] 安装依赖..."
# Two separate, individually checked commands: in "update && install" a
# failing update is not caught by "set -e" and the install is silently skipped.
apt-get update -qq || log_error "apt-get update 失败,请检查软件源"
apt-get install -y -qq flex bison make gcc libpq-dev libcurl4-openssl-dev libjson-c-dev \
  || log_error "依赖安装失败"
# ================= 2. Build repmgr into the PostgreSQL directory =================
log_info "[2/10] 编译安装 repmgr ${REPMGR_VERSION} (集成至 PG 目录)..."
PKG_NAME="repmgr-${REPMGR_VERSION}.tar.gz"
DOWNLOAD_URL="https://githubfast.com/EnterpriseDB/repmgr/releases/download/v${REPMGR_VERSION}/${PKG_NAME}"
cd /tmp || log_error "无法进入 /tmp"
# Download to a ".part" file and rename only on success. "wget -O" creates the
# target even when the transfer fails, so a failed run used to leave an
# empty/partial tarball that the next run mistook for a finished download.
# "-s" additionally rejects an empty leftover from such an earlier run.
if [ ! -s "${PKG_NAME}" ]; then
  log_info "正在下载 ${PKG_NAME} ..."
  if ! wget -q --timeout=30 --tries=3 "${DOWNLOAD_URL}" -O "${PKG_NAME}.part"; then
    rm -f "${PKG_NAME}.part"
    log_error "下载失败,请检查网络"
  fi
  mv -f "${PKG_NAME}.part" "${PKG_NAME}"
fi
log_info "正在解压源码..."
rm -rf "repmgr-${REPMGR_VERSION}"
tar -zxf "${PKG_NAME}" || log_error "解压失败"
cd "repmgr-${REPMGR_VERSION}" || log_error "无法进入源码目录"
# No --prefix: the build locates the PostgreSQL installation through PG_CONFIG
export PG_CONFIG="${PG_CONFIG}"
log_info "正在配置 (configure)..."
./configure || log_error "Configure 失败"
log_info "正在编译 (make)..."
make -j"$(nproc)" || log_error "编译失败"
log_info "正在安装 (make install)..."
make install || log_error "安装失败"
# Verify that the binary really landed where the rest of the script expects it
if [ ! -f "${REPMGR_PATH}" ]; then
  log_error "repmgr 未安装到预期路径: ${REPMGR_PATH}"
fi
log_info "✓ repmgr 安装完成: ${REPMGR_PATH}"
# ================= 3. Environment variables =================
log_info "[3/10] 配置环境变量..."
# repmgr sits in the PostgreSQL bin directory, so putting that directory on
# PATH is enough. The marker line keeps the append idempotent.
BASHRC_MARKER="# repmgr & PostgreSQL Environment"
POSTGRES_BASHRC="/home/${POSTGRES_USER}/.bashrc"
if ! grep -qF "${BASHRC_MARKER}" "${POSTGRES_BASHRC}"; then
  printf '%s\n' \
    "${BASHRC_MARKER}" \
    "export PATH=${PGSQL_BIN_PATH}:\$PATH" \
    "export PGDATA=${PG_DATA_DIR}" >> "${POSTGRES_BASHRC}"
fi
# ================= 4. .pgpass =================
log_info "[4/10] 配置 .pgpass..."
# One entry for the repmgr database and one for replication connections.
POSTGRES_PGPASS="/home/${POSTGRES_USER}/.pgpass"
printf '%s\n' \
  "*:*:${REPMGR_DB}:${REPMGR_USER}:${REPMGR_PASSWORD}" \
  "*:*:replication:${REPMGR_USER}:${REPMGR_PASSWORD}" > "${POSTGRES_PGPASS}"
chown ${POSTGRES_USER}: "${POSTGRES_PGPASS}"
chmod 0600 "${POSTGRES_PGPASS}"
# ================= 5. Verify connectivity to the primary =================
log_info "[5/10] 校验主库连通性..."
# Run a trivial query on the primary as the repmgr user; abort on failure.
PRIMARY_PROBE_CMD="${PGSQL_BIN_PATH}/psql -h ${PRIMARY_PHYSICAL_IP} -p ${PRIMARY_PORT} -U ${REPMGR_USER} -d ${REPMGR_DB} -c 'SELECT 1' -t -q"
su - ${POSTGRES_USER} -c "${PRIMARY_PROBE_CMD}" || log_error "无法连接主库 ${PRIMARY_PHYSICAL_IP}!"
log_info "✓ 主库连接正常"
# ================= 6. Generate repmgr.conf =================
log_info "[6/10] 生成 repmgr.conf..."
# Unquoted heredoc: variables are expanded when the file is written. The two
# '#' lines inside the heredoc end up as comments in repmgr.conf.
cat > ${REPMGR_CONF} << EOF
node_id=${STANDBY_NODE_ID}
node_name='${STANDBY_NODE_NAME}'
conninfo='host=${STANDBY_PHYSICAL_IP} port=${PRIMARY_PORT} user=${REPMGR_USER} dbname=${REPMGR_DB} password=${REPMGR_PASSWORD} connect_timeout=2'
# 【优化】pg_bindir 现在也统一指向 PG 目录
pg_bindir='${PGSQL_BIN_PATH}'
data_directory='${PG_DATA_DIR}'
use_replication_slots=yes
failover=automatic
# 【优化】命令路径直接使用 PG bin 目录
promote_command='${REPMGR_PATH} standby promote -f ${REPMGR_CONF} --log-to-file'
follow_command='${REPMGR_PATH} standby follow -f ${REPMGR_CONF} --log-to-file --upstream-node-id=%n'
log_file='${REPMGR_LOG_DIR}/repmgrd.log'
log_level=INFO
EOF
# The file contains the password: owned by postgres, mode 0600.
chown ${POSTGRES_USER}: ${REPMGR_CONF}
chmod 600 ${REPMGR_CONF}
# ================= 7. Clone the primary =================
log_info "[7/10] 开始克隆主库数据(这可能需要几分钟)..."
chown -R "${POSTGRES_USER}": "${PG_DATA_DIR}"
CLONE_CMD="${REPMGR_PATH} -f ${REPMGR_CONF} standby clone -h ${PRIMARY_PHYSICAL_IP} -p ${PRIMARY_PORT} -U ${REPMGR_USER} -d ${REPMGR_DB} --fast-checkpoint --force"
log_info "执行克隆命令: ${CLONE_CMD}"
if ! su - "${POSTGRES_USER}" -c "${CLONE_CMD}"; then
  log_error "主库克隆失败!请检查上述报错"
fi
# Make sure the instance starts as a standby even if the clone did not leave
# a standby.signal file behind.
if [ ! -f "${PG_DATA_DIR}/standby.signal" ]; then
  touch "${PG_DATA_DIR}/standby.signal"
  chown "${POSTGRES_USER}": "${PG_DATA_DIR}/standby.signal"
fi
log_info "✓ 主库数据克隆完成"
# ================= 8. Check shared_preload_libraries =================
log_info "[8/10] 检查 shared_preload_libraries..."
# This step used to log the check without performing it. Inspect the cloned
# postgresql.conf and warn (non-fatal) when no active line mentions repmgr.
if grep -Eq "^[[:space:]]*shared_preload_libraries[[:space:]]*=.*repmgr" "${PG_CONF}" 2>/dev/null; then
  log_info "✓ shared_preload_libraries 已包含 repmgr"
else
  log_warn "${PG_CONF} 中的 shared_preload_libraries 未包含 repmgr,repmgrd 启动前请先修正并重启 PostgreSQL"
fi
# ================= 9. Start the standby and register it =================
log_info "[9/10] 启动从库并注册节点..."
# Remove a pid file possibly left in the data directory before starting.
rm -f "${PG_DATA_DIR}/postmaster.pid"
systemctl start "${PG_SERVICE_NAME}" \
  || log_error "从库启动失败!排查命令: journalctl -n 50 -u ${PG_SERVICE_NAME}"
log_info "等待从库启动完成..."
# Poll pg_isready: up to 60 attempts, 2 seconds apart, then give up.
attempt=1
until su - "${POSTGRES_USER}" -c "${PGSQL_BIN_PATH}/pg_isready -q"; do
  if [ "${attempt}" -ge 60 ]; then
    log_error "从库启动超时,请检查 PostgreSQL 日志"
  fi
  sleep 2
  attempt=$((attempt + 1))
done
log_info "✓ PostgreSQL 从库启动成功"
log_info "正在注册从库节点到集群..."
REGISTER_CMD="${REPMGR_PATH} -f ${REPMGR_CONF} standby register -h ${PRIMARY_PHYSICAL_IP} -p ${PRIMARY_PORT} -U ${REPMGR_USER} -d ${REPMGR_DB} --upstream-node-id=1 --force"
su - "${POSTGRES_USER}" -c "${REGISTER_CMD}"
# ================= 10. Final verification =================
SEPARATOR="=========================================="
log_info "[10/10] 部署完成,验证状态..."
log_info "${SEPARATOR}"
su - "${POSTGRES_USER}" -c "${REPMGR_PATH} -f ${REPMGR_CONF} cluster show"
log_info "${SEPARATOR}"
# Summary of the resulting configuration
for summary_line in \
  "" \
  "【配置说明】" \
  "✓ 从库节点名: ${STANDBY_NODE_NAME} (ID: ${STANDBY_NODE_ID})" \
  "✓ 连接主库: ${PRIMARY_PHYSICAL_IP}" \
  "✓ 日志文件: ${REPMGR_LOG_DIR}/repmgrd.log" \
  ""; do
  log_info "${summary_line}"
done
4.3选举节点配置
#!/bin/bash
set -euo pipefail
# ================= Configuration =================
# Everything a user normally edits for the witness node lives in this section.
PGSQL_BIN_PATH="/apps/pgsql/bin"
PG_DATA_DIR="/data/pgsql"
PG_BACKUP_DIR="/data/backup" # [Opt 1] same backup directory convention as the other nodes
REPMGR_ETC_DIR="/apps/repmgr/etc"
REPMGR_LOG_DIR="/apps/repmgr/log"
PRIMARY_PHYSICAL_IP="10.0.0.101"
WITNESS_PHYSICAL_IP="10.0.0.103"
WITNESS_NODE_NAME="witness"
WITNESS_NODE_ID=3
REPMGR_PASSWORD="postgres123"
REPMGR_USER="repmgr"
REPMGR_DB="repmgr"
PRIMARY_PORT=5432
PG_SERVICE_NAME="postgresql"
POSTGRES_USER="postgres"
# =========================================
# Paths derived from the settings above
PG_CONF="${PG_DATA_DIR}/postgresql.conf"
PG_HBA="${PG_DATA_DIR}/pg_hba.conf"
REPMGR_CONF="${REPMGR_ETC_DIR}/repmgr.conf"
REPMGR_PATH="${PGSQL_BIN_PATH}/repmgr"
# ANSI colors for log output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m'
# Logging helpers: each prints its first argument with a colored level tag.
log_info() { echo -e "${GREEN}[INFO]${NC} $1"; }
log_warn() { echo -e "${YELLOW}[WARN]${NC} $1"; }
# log_error terminates the script with status 1 after printing the message.
log_error() { echo -e "${RED}[ERROR]${NC} $1"; exit 1; }
# ================= 0. Pre-flight checks =================
log_info "=========================================="
log_info " repmgr 见证节点部署 (${WITNESS_NODE_NAME})"
log_info "=========================================="
[ "$EUID" -ne 0 ] && log_error "请使用 root 用户执行"
[ ! -f "${PGSQL_BIN_PATH}/pg_config" ] && log_error "未找到 pg_config"
# The repmgr binary is NOT required at this point: this script builds and
# installs it in step 1. The former fatal check here made the script abort on
# every fresh witness node before it could reach that build step.
if [ ! -f "${REPMGR_PATH}" ]; then
  log_info "尚未安装 repmgr (${REPMGR_PATH}),将在步骤 1 中编译安装"
fi
log_info "正在停止本地 PostgreSQL 服务..."
# Best effort: the service may not exist or may not be running yet.
systemctl stop "${PG_SERVICE_NAME}" 2>/dev/null || true
mkdir -p "${PG_BACKUP_DIR}" "${REPMGR_ETC_DIR}" "${REPMGR_LOG_DIR}"
chown -R "${POSTGRES_USER}": "${PG_BACKUP_DIR}" "${REPMGR_ETC_DIR}" "${REPMGR_LOG_DIR}"
chmod 700 "${PG_BACKUP_DIR}" "${REPMGR_LOG_DIR}"
# [Opt 3] configure .pgpass up front (independent of step order, handy later)
log_info "配置 .pgpass..."
# Look the home directory up in the passwd database instead of assuming
# /home/<user>; fall back to the historical path when the lookup yields nothing.
POSTGRES_HOME=$(getent passwd "${POSTGRES_USER}" | cut -d: -f6 || true)
POSTGRES_HOME="${POSTGRES_HOME:-/home/${POSTGRES_USER}}"
PGPASS_FILE="${POSTGRES_HOME}/.pgpass"
# Create the file under a restrictive umask so the password is never readable
# by other users, not even for the instant before chmod runs.
(
  umask 077
  cat > "${PGPASS_FILE}" << EOF
*:*:*:${REPMGR_USER}:${REPMGR_PASSWORD}
EOF
)
chown "${POSTGRES_USER}": "${PGPASS_FILE}"
chmod 0600 "${PGPASS_FILE}"
# ================= 1. Build and install repmgr =================
log_info "[1/6] 编译安装 repmgr..."
REPMGR_VERSION="5.5.0"
PKG_NAME="repmgr-${REPMGR_VERSION}.tar.gz"
DOWNLOAD_URL="https://githubfast.com/EnterpriseDB/repmgr/releases/download/v${REPMGR_VERSION}/${PKG_NAME}"
# Same build dependencies as the standby deployment script; without them
# ./configure fails on a freshly installed witness host.
apt-get update -qq || log_error "apt-get update 失败,请检查软件源"
apt-get install -y -qq flex bison make gcc libpq-dev libcurl4-openssl-dev libjson-c-dev \
  || log_error "依赖安装失败"
cd /tmp || log_error "无法进入 /tmp"
# Download to a ".part" file and rename only on success. "wget -O" creates the
# target even when the transfer fails, so a failed run used to leave an
# empty/partial tarball that the next run mistook for a finished download.
if [ ! -s "${PKG_NAME}" ]; then
  if ! wget -q --timeout=30 --tries=3 "${DOWNLOAD_URL}" -O "${PKG_NAME}.part"; then
    rm -f "${PKG_NAME}.part"
    log_error "下载失败,请检查网络"
  fi
  mv -f "${PKG_NAME}.part" "${PKG_NAME}"
fi
rm -rf "repmgr-${REPMGR_VERSION}"
tar -zxf "${PKG_NAME}" || log_error "解压失败"
cd "repmgr-${REPMGR_VERSION}" || log_error "无法进入源码目录"
# No --prefix: the build locates the PostgreSQL installation through PG_CONFIG
export PG_CONFIG="${PGSQL_BIN_PATH}/pg_config"
# stdout stays quiet as before, but every failing stage now names itself
# instead of letting "set -e" end the script without a message.
./configure > /dev/null || log_error "Configure 失败"
make -s -j"$(nproc)" > /dev/null || log_error "编译失败"
make -s install > /dev/null || log_error "安装失败"
# Verify that the binary really landed where the rest of the script expects it
if [ ! -f "${REPMGR_PATH}" ]; then
  log_error "repmgr 未安装到预期路径: ${REPMGR_PATH}"
fi
log_info "✓ repmgr 安装完成"
# ================= 2. Initialise the local database =================
log_info "[2/6] 初始化见证节点本地数据库..."
# A missing "base" sub-directory is taken to mean the cluster was never initialised.
if [ ! -d "${PG_DATA_DIR}/base" ]; then
  log_info "数据目录为空,正在初始化新集群..."
  mkdir -p "${PG_DATA_DIR}"
  chown -R "${POSTGRES_USER}": "${PG_DATA_DIR}"
  chmod 700 "${PG_DATA_DIR}"
  # The superuser password is handed to initdb through a file. mktemp gives an
  # unpredictable name and creates the file with mode 0600, unlike the former
  # "/tmp/initdb_pwfile_<epoch>" path.
  PWFILE=$(mktemp /tmp/initdb_pwfile.XXXXXX) || log_error "无法创建临时密码文件"
  printf '%s\n' "${REPMGR_PASSWORD}" > "${PWFILE}"
  chown "${POSTGRES_USER}": "${PWFILE}"
  chmod 600 "${PWFILE}"
  # Remove the password file on the failure path as well.
  if ! su - "${POSTGRES_USER}" -c "${PGSQL_BIN_PATH}/initdb -D ${PG_DATA_DIR} -A scram-sha-256 --pwfile=${PWFILE}"; then
    rm -f "${PWFILE}"
    log_error "initdb 初始化失败"
  fi
  rm -f "${PWFILE}"
  # [Opt 4] back up postgresql.conf before editing it
  cp "${PG_CONF}" "${PG_CONF}.bak.repmgr"
  # Listen on localhost plus the witness address, on the cluster-wide port.
  sed -i "s/^#*listen_addresses = .*/listen_addresses = 'localhost, ${WITNESS_PHYSICAL_IP}'/" "${PG_CONF}"
  sed -i "s/^#*port = .*/port = ${PRIMARY_PORT}/" "${PG_CONF}"
else
  log_info "数据目录已存在,跳过初始化步骤"
fi
# ================= 3. Start the local PostgreSQL =================
log_info "[3/6] 启动见证节点本地 PostgreSQL..."
# Remove a pid file possibly left in the data directory before starting.
rm -f "${PG_DATA_DIR}/postmaster.pid"
systemctl start "${PG_SERVICE_NAME}"
log_info "等待数据库启动..."
# Poll pg_isready: up to 30 attempts, 1 second apart, then give up.
attempt=1
until su - "${POSTGRES_USER}" -c "${PGSQL_BIN_PATH}/pg_isready -q"; do
  if [ "${attempt}" -ge 30 ]; then
    log_error "启动超时"
  fi
  sleep 1
  attempt=$((attempt + 1))
done
log_info "✓ 本地 PostgreSQL 启动成功"
# ================= 4. Local permissions and role =================
log_info "[4/6] 配置本地 repmgr 用户与 pg_hba..."
# Part 1: edit pg_hba.conf as root (file edit only, no database connection).
# [Opt 5] one-time backup of pg_hba.conf before the first modification
if [ ! -f "${PG_HBA}.bak.repmgr" ]; then
  cp ${PG_HBA} ${PG_HBA}.bak.repmgr
fi
# The marker line makes the pg_hba.conf edit idempotent across reruns.
HBA_MARKER="# repmgr cluster auto config"
if ! grep -qF "${HBA_MARKER}" "${PG_HBA}"; then
  {
    printf '\n%s\n' "${HBA_MARKER}"
    for node_ip in "10.0.0.101" "10.0.0.102" "10.0.0.103"; do
      printf 'host replication %s %s/32 scram-sha-256\n' "${REPMGR_USER}" "${node_ip}"
      printf 'host %s %s %s/32 scram-sha-256\n' "${REPMGR_DB}" "${REPMGR_USER}" "${node_ip}"
    done
  } >> "${PG_HBA}"
fi
chown ${POSTGRES_USER}: ${PG_HBA}
# Part 2: all database work happens in one postgres-user shell, authenticated
# through PGPASSWORD. It creates the role and database only when missing,
# (re)sets the role password, and reloads the server configuration.
su - ${POSTGRES_USER} <<EOF
set -e
export PGPASSWORD="${REPMGR_PASSWORD}"
${PGSQL_BIN_PATH}/psql -tAc "SELECT 1 FROM pg_roles WHERE rolname='${REPMGR_USER}'" | grep -q 1 || \
${PGSQL_BIN_PATH}/psql -c "CREATE USER ${REPMGR_USER} WITH SUPERUSER LOGIN;"
${PGSQL_BIN_PATH}/psql -c "ALTER USER ${REPMGR_USER} WITH PASSWORD '${REPMGR_PASSWORD}';"
${PGSQL_BIN_PATH}/psql -tAc "SELECT 1 FROM pg_database WHERE datname='${REPMGR_DB}'" | grep -q 1 || \
${PGSQL_BIN_PATH}/psql -c "CREATE DATABASE ${REPMGR_DB} OWNER ${REPMGR_USER};"
${PGSQL_BIN_PATH}/psql -c 'SELECT pg_reload_conf();'
EOF
# ================= 5. Connect to the primary and register =================
log_info "[5/6] 校验主库连通性..."
# Run a trivial query on the primary as the repmgr user; abort on failure.
PRIMARY_PROBE_CMD="${PGSQL_BIN_PATH}/psql -h ${PRIMARY_PHYSICAL_IP} -p ${PRIMARY_PORT} -U ${REPMGR_USER} -d ${REPMGR_DB} -c 'SELECT 1' -t -q"
su - ${POSTGRES_USER} -c "${PRIMARY_PROBE_CMD}" || log_error "无法连接主库"
log_info "✓ 主库连接正常"
log_info "[6/6] 生成配置并注册见证节点..."
# Unquoted heredoc: variables are expanded when the file is written.
cat > ${REPMGR_CONF} << EOF
node_id=${WITNESS_NODE_ID}
node_name='${WITNESS_NODE_NAME}'
conninfo='host=${WITNESS_PHYSICAL_IP} port=${PRIMARY_PORT} user=${REPMGR_USER} dbname=${REPMGR_DB} password=${REPMGR_PASSWORD} connect_timeout=2'
pg_bindir='${PGSQL_BIN_PATH}'
data_directory='${PG_DATA_DIR}'
node_type='witness'
log_file='${REPMGR_LOG_DIR}/repmgrd.log'
log_level=INFO
EOF
# The file contains the password: owned by postgres, mode 0600.
chown ${POSTGRES_USER}: ${REPMGR_CONF}
chmod 600 ${REPMGR_CONF}
log_info "正在向主库注册见证节点..."
WITNESS_REGISTER_CMD="${REPMGR_PATH} -f ${REPMGR_CONF} witness register -h ${PRIMARY_PHYSICAL_IP} -p ${PRIMARY_PORT} -U ${REPMGR_USER} -d ${REPMGR_DB} --force"
su - ${POSTGRES_USER} -c "${WITNESS_REGISTER_CMD}"
# ================= Done =================
SEPARATOR="=========================================="
log_info "${SEPARATOR}"
log_info " 见证节点部署完成!"
log_info "${SEPARATOR}"
su - ${POSTGRES_USER} -c "${REPMGR_PATH} -f ${REPMGR_CONF} cluster show"
4.4部署验证
su - postgres
postgres@db3:~$ /apps/pgsql/bin/repmgr cluster show -f /apps/repmgr/etc/repmgr.conf
ID | Name | Role | Status | Upstream | Location | Priority | Timeline | Connection string
----+------+---------+-----------+----------+----------+----------+----------+----------------------------------------------------------------------------------
1 | db1 | primary | * running | | default | 100 | 1 | host=10.0.0.101 user=repmgr dbname=repmgr password=postgres123 connect_timeout=2
2 | db2 | standby | running | db1 | default | 100 | 1 | host=10.0.0.102 user=repmgr dbname=repmgr password=postgres123 connect_timeout=2
3 | db3 | witness | * running | db1 | default | 0 | n/a | host=10.0.0.103 user=repmgr dbname=repmgr password=postgres123 connect_timeout=2
5. repmgrd 守护进程一键配置脚本(db1/2/3分别执行)
#!/bin/bash
set -euo pipefail
# ==========================================
# [Settings the user must review]
# ==========================================
INSTALL_PREFIX="/apps/repmgr"
PGSQL_BIN_PATH="/apps/pgsql/bin"
PGDATA="/data/pgsql"
POSTGRES_USER="postgres"
PG_SERVICE_NAME="postgresql"
PG_BACKUP_DIR="/data/backup"
# repmgr connection details used by this script
REPMGR_USER="repmgr"
REPMGR_PASSWORD="postgres123" # default only; replaced by the password found in repmgr.conf, if any
REPMGR_DB="repmgr"
# ==========================================
# [Do not edit below this line]
# ==========================================
# Paths derived from the settings above
REPMGR_BIN="${PGSQL_BIN_PATH}/repmgr"
REPMGRD_BIN="${PGSQL_BIN_PATH}/repmgrd"
REPMGR_CONF_FILE="${INSTALL_PREFIX}/etc/repmgr.conf"
REPMGR_LOG_DIR="${INSTALL_PREFIX}/log"
REPMGRD_LOG="${REPMGR_LOG_DIR}/repmgrd.log"
REPMGRD_PID="${REPMGR_LOG_DIR}/repmgrd.pid"
SERVICE_FILE="/etc/systemd/system/repmgrd.service"
PG_CONF="${PGDATA}/postgresql.conf"
PSQL_PATH="${PGSQL_BIN_PATH}/psql"
PG_ISREADY="${PGSQL_BIN_PATH}/pg_isready"
# ANSI colors for log output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m'
# Logging helpers: each prints its first argument with a colored level tag.
log_info() { echo -e "${GREEN}[INFO]${NC} $1"; }
log_warn() { echo -e "${YELLOW}[WARN]${NC} $1"; }
# log_error terminates the script with status 1 after printing the message.
log_error() { echo -e "${RED}[ERROR]${NC} $1"; exit 1; }
# ==========================================
# 核心函数:等待PostgreSQL就绪
# ==========================================
#######################################
# Block until the local PostgreSQL server accepts connections.
# Polls pg_isready (run as the postgres user) once per second.
# Globals:   POSTGRES_USER, PG_ISREADY (read)
# Arguments: $1 - maximum number of attempts, one per second (default 30)
# Returns:   0 as soon as the server is ready; on timeout log_error
#            terminates the script.
#######################################
wait_pg_ready() {
  local timeout=${1:-30}
  # Keep the loop counter local so callers' variables named "i" are untouched.
  local i
  log_info "等待 PostgreSQL 服务就绪,超时时间 ${timeout} 秒..."
  for ((i = 1; i <= timeout; i++)); do
    if su - "${POSTGRES_USER}" -c "${PG_ISREADY} -q" 2>/dev/null; then
      log_info "PostgreSQL 服务就绪,耗时 ${i} 秒"
      return 0
    fi
    log_warn "等待中... (${i}/${timeout})"
    sleep 1
  done
  log_error "PostgreSQL 启动超时!"
}
# ==========================================
# 核心函数:检查并确保 shared_preload_libraries 生效
# ==========================================
#######################################
# Print the shared_preload_libraries value of the running server with all
# whitespace removed (so "a, repmgr" becomes "a,repmgr"). Prints nothing when
# the query fails.
# Globals: POSTGRES_USER, PSQL_PATH, REPMGR_PASSWORD (read)
#######################################
_live_preload_libraries() {
  su - "${POSTGRES_USER}" -c "PGPASSWORD=${REPMGR_PASSWORD:-} ${PSQL_PATH} -tAq -c 'SHOW shared_preload_libraries;' 2>/dev/null" \
    | tr -d '[:space:]' || true
}

#######################################
# Make sure the running PostgreSQL server has "repmgr" loaded through
# shared_preload_libraries. If it has not, add repmgr to postgresql.conf when
# no active line mentions it (after taking a backup), restart PostgreSQL and
# verify the result.
# Globals:   PG_CONF, PG_BACKUP_DIR, PGDATA, PG_SERVICE_NAME, POSTGRES_USER,
#            PSQL_PATH, REPMGR_PASSWORD, RED, YELLOW, NC (read)
# Returns:   0 when repmgr is loaded; otherwise log_error terminates the script.
#######################################
ensure_shared_preload_libraries() {
  local current_val backup_file file_val
  log_info "正在检查 shared_preload_libraries 内存状态..."
  current_val=$(_live_preload_libraries)
  # Whitespace is already stripped, so a plain comma-delimited membership test
  # also recognises "pg_stat_statements, repmgr". The old test missed that
  # form, which is exactly what this function writes when it appends repmgr.
  if [[ ",${current_val}," == *",repmgr,"* ]]; then
    log_info "shared_preload_libraries 已在内存中生效: ${current_val}"
    return 0
  fi
  log_warn "内存中未检测到 repmgr,检查配置文件..."
  # Only an active (non-commented) line counts as "configured".
  if ! grep -Eq "^[[:space:]]*shared_preload_libraries.*repmgr" "${PG_CONF}"; then
    log_warn "配置文件中未设置,正在修改..."
    mkdir -p "${PG_BACKUP_DIR}"
    backup_file="${PG_BACKUP_DIR}/postgresql.conf.bak.repmgr.$(date +%s)"
    cp "${PG_CONF}" "${backup_file}"
    log_info "配置已备份至: ${backup_file}"
    if grep -q "^#*shared_preload_libraries" "${PG_CONF}"; then
      # Value between the first pair of single quotes on the last active line.
      file_val=$(grep "^shared_preload_libraries" "${PG_CONF}" | tail -n 1 | cut -d"'" -f2 || true)
      if [ -z "${file_val}" ]; then
        # Only a commented-out or empty setting exists: set it outright.
        sed -i "s/^#*shared_preload_libraries[[:space:]]*=.*/shared_preload_libraries = 'repmgr'/" "${PG_CONF}"
      else
        # Append to the existing list. The back-reference keeps the current
        # value out of the sed expression, so '/' or '&' in it cannot break it.
        sed -i "s/^\(shared_preload_libraries[[:space:]]*=[[:space:]]*'[^']*\)'/\1,repmgr'/" "${PG_CONF}"
      fi
    else
      echo "shared_preload_libraries = 'repmgr'" >> "${PG_CONF}"
    fi
  fi
  echo ""
  echo -e "${RED}==========================================${NC}"
  echo -e "${RED} ⚠️ 重要:必须重启 PostgreSQL${NC}"
  echo -e "${RED}==========================================${NC}"
  echo -e "${YELLOW}配置已修改,为了确保 repmgrd 能正常启动,${NC}"
  echo -e "${YELLOW}数据库将在 5 秒后自动重启...${NC}"
  echo ""
  sleep 5
  log_info "正在重启 PostgreSQL (服务名: ${PG_SERVICE_NAME})..."
  systemctl stop "${PG_SERVICE_NAME}"
  # Drop a stale pid file only when no process named exactly "postgres" is
  # left; "ps | grep postgres" also matched unrelated processes.
  if ! pgrep -x postgres > /dev/null 2>&1; then
    rm -f "${PGDATA}/postmaster.pid" 2>/dev/null || true
  fi
  if ! systemctl start "${PG_SERVICE_NAME}"; then
    log_error "PostgreSQL 重启失败!请手动检查后重试"
  fi
  wait_pg_ready 30
  current_val=$(_live_preload_libraries)
  if [[ ",${current_val}," != *",repmgr,"* ]]; then
    log_error "重启后仍未生效!当前值: ${current_val}"
  fi
  log_info "shared_preload_libraries 生效确认完成"
}
# ==========================================
# [-1/8 Optional clean-up of a previous repmgrd setup]
# Runs when the script is called with --force-clean or when the operator
# answers "yes" (case-insensitive) at the prompt.
# ==========================================
echo -e "${RED}==========================================${NC}"
echo -e "${RED} ⚠️ 警告:repmgrd 服务强制清理模式${NC}"
echo -e "${RED}==========================================${NC}"
PERFORM_CLEAN=false
if [[ $# -gt 0 ]] && [[ "$1" == "--force-clean" ]]; then
  PERFORM_CLEAN=true
else
  # "read" returns non-zero at end of input (stdin closed or not a terminal),
  # which used to abort the whole script under "set -e". Treat that as "no".
  # -r keeps backslashes in the answer literal.
  confirm_clean=""
  read -r -p "确认要清理 repmgrd 环境吗? (输入 YES/yes 确认): " confirm_clean || confirm_clean=""
  if [[ "${confirm_clean:-}" =~ ^[Yy][Ee][Ss]$ ]]; then
    PERFORM_CLEAN=true
  fi
fi
if [ "${PERFORM_CLEAN}" = true ]; then
  log_warn "正在执行清理..."
  # Every step is best effort: the unit or the process may not exist.
  systemctl stop repmgrd 2>/dev/null || true
  systemctl disable repmgrd 2>/dev/null || true
  # Force-kill whatever repmgrd process is still around after the stop above.
  pkill -9 repmgrd 2>/dev/null || true
  rm -f "${REPMGRD_PID}" /tmp/repmgrd.pid "${SERVICE_FILE}" /etc/logrotate.d/repmgrd
  if [ -d "${REPMGR_LOG_DIR}" ]; then
    rm -f "${REPMGR_LOG_DIR}"/*.log "${REPMGR_LOG_DIR}"/*.pid
  fi
  systemctl daemon-reload
  log_info "清理完成,5秒后开始部署..."
  sleep 5
else
  log_info "跳过清理,继续部署..."
fi
# ==========================================
# 0. Pre-flight checks
# ==========================================
log_info "=========================================="
log_info " repmgrd 服务一键部署 (生产稳定版)"
log_info "=========================================="
if [ "$EUID" -ne 0 ]; then
  log_error "请用 root 执行"
fi
if [ ! -f "${REPMGR_CONF_FILE}" ]; then
  log_error "repmgr.conf 不存在: ${REPMGR_CONF_FILE},请先部署主/从/见证节点"
fi
if [ ! -f "${PG_CONF}" ]; then
  log_error "PG 配置文件不存在: ${PG_CONF}"
fi
if [ ! -f "${PSQL_PATH}" ]; then
  log_error "psql 不存在: ${PSQL_PATH}"
fi
if [ ! -f "${REPMGR_BIN}" ]; then
  log_error "repmgr 二进制不存在: ${REPMGR_BIN}"
fi
if [ ! -f "${REPMGRD_BIN}" ]; then
  log_error "repmgrd 二进制不存在: ${REPMGRD_BIN}"
fi
# Prefer the password stored in repmgr.conf (conninfo "password=...") over
# the default defined at the top of this script.
if grep -q "conninfo" "${REPMGR_CONF_FILE}"; then
  TMP_PASS=$(grep "conninfo" "${REPMGR_CONF_FILE}" | grep -oP "password=\K[^' ]+" || true)
  if [[ -n "${TMP_PASS:-}" ]]; then
    REPMGR_PASSWORD="${TMP_PASS}"
    log_info "已从 repmgr.conf 自动获取数据库密码"
  fi
fi
mkdir -p ${PG_BACKUP_DIR} ${REPMGR_LOG_DIR}
chown -R ${POSTGRES_USER}: ${PG_BACKUP_DIR} ${INSTALL_PREFIX}
chmod 700 ${REPMGR_LOG_DIR}
# Write .pgpass before anything below connects to the database
log_info "提前配置 .pgpass (优先执行)..."
POSTGRES_PGPASS="/home/${POSTGRES_USER}/.pgpass"
printf '%s\n' "*:*:*:${REPMGR_USER}:${REPMGR_PASSWORD}" > "${POSTGRES_PGPASS}"
chown ${POSTGRES_USER}: "${POSTGRES_PGPASS}"
chmod 0600 "${POSTGRES_PGPASS}"
log_info "预检查通过"
# ==========================================
# 1. Stop any previous repmgrd instance
# ==========================================
log_info "[1/8] 清理旧环境..."
systemctl stop repmgrd 2>/dev/null || true
rm -f ${REPMGRD_PID} /tmp/repmgrd.pid
# ==========================================
# 2. Make sure PostgreSQL itself is running
# ==========================================
log_info "[2/8] 检查 PostgreSQL 运行状态..."
if systemctl is-active --quiet ${PG_SERVICE_NAME}; then
  log_info "PostgreSQL 运行状态正常"
else
  log_info "PostgreSQL 未运行,正在启动..."
  # Drop the pid file only when no process listing mentions "postgres".
  if ! ps aux | grep -v grep | grep -q 'postgres'; then
    rm -f ${PGDATA}/postmaster.pid 2>/dev/null || true
  fi
  systemctl start ${PG_SERVICE_NAME} || log_error "启动失败"
  wait_pg_ready 30
fi
# ==========================================
# 3. Make sure shared_preload_libraries contains repmgr
# ==========================================
log_info "[3/8] 检查并确保 shared_preload_libraries 生效..."
ensure_shared_preload_libraries
# ==========================================
# 4. Complete the logging settings in repmgr.conf
# ==========================================
log_info "[4/8] 补全 repmgr.conf 日志配置..."
# Remove any monitoring_history / monitor_interval lines from the file.
sed -i -e '/^monitoring_history=/d' -e '/^monitor_interval=/d' "${REPMGR_CONF_FILE}"
# Add log_file / log_level only when no log_file line exists yet.
if ! grep -q "^log_file" "${REPMGR_CONF_FILE}"; then
  printf '\n%s\n%s\n' "log_file='${REPMGRD_LOG}'" "log_level=INFO" >> "${REPMGR_CONF_FILE}"
  chown ${POSTGRES_USER}: ${REPMGR_CONF_FILE}
  log_info "已添加基础日志配置"
fi
# ==========================================
# 5. Generate the systemd unit file
# ==========================================
log_info "[5/8] 生成 systemd 服务文件..."
# StartLimitIntervalSec= and StartLimitBurst= are [Unit] settings; placed in
# [Service] systemd ignores them as unknown keys, so the restart rate limit
# never applied. \$MAINPID is escaped so that systemd, not this shell,
# expands it.
cat > "${SERVICE_FILE}" << EOF
[Unit]
Description=PostgreSQL Replication Manager Daemon
After=network.target ${PG_SERVICE_NAME}.service
Requires=${PG_SERVICE_NAME}.service
StartLimitIntervalSec=60s
StartLimitBurst=5
[Service]
Type=forking
User=${POSTGRES_USER}
Group=${POSTGRES_USER}
ExecStart=${REPMGRD_BIN} -f ${REPMGR_CONF_FILE} --pid-file ${REPMGRD_PID}
ExecStop=/bin/kill -QUIT \$MAINPID
PIDFile=${REPMGRD_PID}
Environment=PATH=${PGSQL_BIN_PATH}:/usr/local/bin:/usr/bin:/bin
Restart=always
RestartSec=5
[Install]
WantedBy=multi-user.target
EOF
# ==========================================
# 6. Configure log rotation
# ==========================================
log_info "[6/8] 配置日志轮转规则..."
# Daily rotation, 7 generations kept; after rotating, repmgrd is sent SIGHUP
# (best effort, only if its pid file exists).
cat > /etc/logrotate.d/repmgrd << EOF
${REPMGRD_LOG} {
daily
rotate 7
compress
delaycompress
missingok
notifempty
create 0640 ${POSTGRES_USER} ${POSTGRES_USER}
sharedscripts
postrotate
[ -f ${REPMGRD_PID} ] && kill -HUP \$(cat ${REPMGRD_PID}) 2>/dev/null || true
endscript
}
EOF
# ==========================================
# 7. Start the service
# ==========================================
log_info "[7/8] 启动 repmgrd 服务..."
systemctl daemon-reload
systemctl enable repmgrd
if ! systemctl start repmgrd; then
  log_error "服务启动命令执行失败"
fi
# Give the daemon a moment to either settle or die before checking it.
sleep 3
# ==========================================
# 8. Final verification
# ==========================================
log_info "[8/8] 验证服务状态..."
if ! systemctl is-active --quiet repmgrd; then
  # log_error exits immediately, so the troubleshooting hints must be printed
  # first; previously they followed the first log_error and were never shown.
  # --daemonize=false keeps repmgrd in the foreground so its errors are visible.
  log_warn "请尝试手动前台运行查看报错:"
  log_warn " su - postgres"
  log_warn " ${REPMGRD_BIN} -f ${REPMGR_CONF_FILE} --daemonize=false --verbose"
  log_error "服务启动失败"
fi
log_info "=========================================="
log_info " 🎉 repmgrd 服务配置并启动成功!"
log_info "=========================================="
log_info "配置文件: ${REPMGR_CONF_FILE}"
log_info "日志文件: ${REPMGRD_LOG}"
log_info ""
log_info "常用管理命令:"
log_info " 查看服务状态: systemctl status repmgrd"
log_info " 查看集群状态: su - postgres -c 'repmgr cluster show -f ${REPMGR_CONF_FILE}'"
log_info " 查看日志: tail -f ${REPMGRD_LOG}"
log_info ""
log_info "无人值守清理命令: bash $0 --force-clean"
6.手动切换测试
#db2上执行
su - postgres -c '
/apps/pgsql/bin/repmgr standby switchover \
-f /apps/repmgr/etc/repmgr.conf \
--siblings-follow \
--force
'
NOTICE: executing switchover on node "db2" (ID: 2)
NOTICE: attempting to pause repmgrd on 3 nodes
NOTICE: local node "db2" (ID: 2) will be promoted to primary; current primary "db1" (ID: 1) will be demoted to standby
NOTICE: stopping current primary node "db1" (ID: 1)
NOTICE: issuing CHECKPOINT on node "db1" (ID: 1)
DETAIL: executing server command "/apps/pgsql/bin/pg_ctl -D '/data/pgsql' -W -m fast stop"
INFO: checking for primary shutdown; 1 of 60 attempts ("shutdown_check_timeout")
INFO: checking for primary shutdown; 2 of 60 attempts ("shutdown_check_timeout")
NOTICE: current primary has been cleanly shut down at location 0/30086F0
NOTICE: promoting standby to primary
DETAIL: promoting server "db2" (ID: 2) using pg_promote()
NOTICE: waiting up to 60 seconds (parameter "promote_check_timeout") for promotion to complete
NOTICE: STANDBY PROMOTE successful
DETAIL: server "db2" (ID: 2) was successfully promoted to primary
NOTICE: node "db2" (ID: 2) promoted to primary, node "db1" (ID: 1) demoted to standby
NOTICE: executing STANDBY FOLLOW on 1 of 1 siblings
INFO: node 3 received notification to follow node 2
INFO: STANDBY FOLLOW successfully executed on all reachable sibling nodes
NOTICE: switchover was successful
DETAIL: node "db2" is now primary and node "db1" is attached as standby
NOTICE: STANDBY SWITCHOVER has completed successfully
root@db1:~# su - postgres -c '/apps/pgsql/bin/repmgr cluster show -f /apps/repmgr/etc/repmgr.conf'
ID | Name | Role | Status | Upstream | Location | Priority | Timeline | Connection string
----+------+---------+-----------+----------+----------+----------+----------+----------------------------------------------------------------------------------
1 | db1 | standby | running | db2 | default | 100 | 3 | host=10.0.0.101 user=repmgr dbname=repmgr connect_timeout=2
2 | db2 | primary | * running | | default | 100 | 4 | host=10.0.0.102 user=repmgr dbname=repmgr connect_timeout=2
3 | db3 | witness | * running | db2 | default | 0 | n/a | host=10.0.0.103 user=repmgr dbname=repmgr password=postgres123 connect_timeout=2
#db1上执行 切换回来
su - postgres -c '
/apps/pgsql/bin/repmgr standby switchover \
-f /apps/repmgr/etc/repmgr.conf \
--siblings-follow \
--force
'
7.配置 haproxy+keepalived
7.1在 db1、db2、ha1、ha2 上执行
# 安装通用依赖
apt update -y
apt install -y socat keepalived haproxy psmisc ethtool
7.2db1 & db2 数据库节点:优化版健康检查服务
#!/bin/bash
# Generate the role-detection script. The heredoc delimiter is quoted ('EOF'),
# so the body is written to the file verbatim, without any expansion here.
# The generated script prints "MASTER" when pg_is_in_recovery() returns "f"
# and "STANDBY" in every other case, including a failed connection.
# NOTE(review): the generated script connects through the socket directory
# /tmp ("-h /tmp"); confirm this matches unix_socket_directories on these nodes.
cat > /usr/local/bin/pg_check_role.sh << 'EOF'
#!/bin/bash
# 优化点:本地socket连接、无网络依赖、错误兜底、纯文本输出
PGSQL_BIN_PATH="/apps/pgsql/bin"
export PATH=${PGSQL_BIN_PATH}:/usr/bin:/bin
# 本地socket连接,不依赖网络,适配postgres用户peer认证
is_recovery=$(psql -U postgres -d postgres -h /tmp -t -c "SELECT pg_is_in_recovery();" 2>/dev/null | tr -d '[:space:]' || echo "unknown")
# 严格匹配主库状态,异常返回STANDBY
if [ "$is_recovery" = "f" ]; then
echo -n "MASTER"
else
echo -n "STANDBY"
fi
EOF
# Permissions: make the script executable and owned by the postgres user
chmod +x /usr/local/bin/pg_check_role.sh
chown postgres:postgres /usr/local/bin/pg_check_role.sh
# Manual test run as the postgres user
su - postgres -c "/usr/local/bin/pg_check_role.sh"
# Expected output: MASTER on the primary, STANDBY on a standby
7.3ha1 & ha2 HA 节点: HAProxy 配置
在 /etc/haproxy/haproxy.cfg 末尾追加以下配置(保留文件中已有的 global 及其他配置):
vim /etc/haproxy/haproxy.cfg
# -----------------------------------------------------------------------------
# PostgreSQL TCP settings (tuned for long-lived connections / connection pools)
# -----------------------------------------------------------------------------
defaults postgres
mode tcp
option tcplog
option dontlognull
timeout connect 5s
timeout client 1h
timeout server 1h
# Explicit health-check cadence: probe every 2s, 2 successes bring a server up,
# 3 failures take it down; sessions on a server marked down are shut down.
default-server inter 2s rise 2 fall 3 on-marked-down shutdown-sessions
# -----------------------------------------------------------------------------
# PostgreSQL write entry point on the VIP (traffic goes to the primary only)
# -----------------------------------------------------------------------------
listen postgres_write
# Binds the VIP address 10.0.0.113, which the keepalived instance VI_1 manages.
# NOTE(review): on the node that does not currently hold the VIP this bind
# presumably requires net.ipv4.ip_nonlocal_bind=1 - confirm it is set.
bind 10.0.0.113:5432
mode tcp
option tcplog
# Primary detection: a server passes only when its check port answers "MASTER".
option tcp-check
tcp-check connect port 23267
tcp-check expect string MASTER
# Backend database nodes; neither carries the "backup" flag, so whichever node
# passes the check receives the traffic.
# NOTE(review): a responder serving pg_check_role.sh must be listening on
# TCP 23267 on db1/db2 - confirm it is deployed.
server db1 10.0.0.101:5432 check port 23267
server db2 10.0.0.102:5432 check port 23267
7.4配置keepalived
7.4.1配置ha1 keepalived
root@ha1:~# cat /etc/keepalived/keepalived.conf
# Global keepalived settings for ha1.
global_defs {
router_id HARBOR_LB_02 # Kept from the base config. NOTE(review): ha2 uses the same router_id; it is meant to identify the machine - consider a unique value per node.
script_user root
enable_script_security
}
# ============== Health-check scripts ==============
# 1. HAProxy process check (shared by all VRRP instances).
# Runs every 2s; 2 consecutive failures mark it down, 1 success marks it up.
# No weight is set on this script.
vrrp_script chk_haproxy {
script "/usr/bin/systemctl is-active --quiet haproxy"
interval 2
fall 2
rise 1
}
# 2. NIC link check (tracked by the PostgreSQL VRRP instance).
# keepalived splits the script string into arguments and executes it directly,
# without a shell, so the former "ethtool ens33 | grep ..." was not run as a
# pipeline. The kernel carrier flag gives the same answer with one command:
# /sys/class/net/ens33/carrier reads "1" while the link is up.
vrrp_script chk_network {
    script "/usr/bin/grep -qx 1 /sys/class/net/ens33/carrier"
    interval 2
    fall 2
    rise 2
}
# 3. MinIO API check: passes when at least one MinIO node reports live.
# The "||" chain needs a shell, and keepalived runs scripts without one, so the
# command is wrapped in an explicit /bin/bash -c.
# curl -f turns a non-2xx answer into a non-zero exit; the liveness endpoint
# signals health through the HTTP status, so no body matching is done.
vrrp_script chk_minio_api {
    script "/bin/bash -c '/usr/bin/curl -sf -o /dev/null --connect-timeout 2 http://10.0.0.106:9000/minio/health/live || /usr/bin/curl -sf -o /dev/null --connect-timeout 2 http://10.0.0.107:9000/minio/health/live || /usr/bin/curl -sf -o /dev/null --connect-timeout 2 http://10.0.0.108:9000/minio/health/live || /usr/bin/curl -sf -o /dev/null --connect-timeout 2 http://10.0.0.109:9000/minio/health/live'"
    interval 2
    weight -15
    fall 2
    rise 2
}
# 4. MinIO console check through the local port 9001.
# A single curl call replaces the former "curl | grep" pipeline, which
# keepalived cannot run because it executes scripts without a shell.
# curl -f makes the check fail on any non-2xx response.
# NOTE(review): confirm that /minio/health/ready is actually served on the
# console port 9001 in this deployment.
vrrp_script chk_minio_console {
    script "/usr/bin/curl -sf -o /dev/null --connect-timeout 2 http://127.0.0.1:9001/minio/health/ready"
    interval 2
    weight -15
    fall 2
    rise 2
}
# ============== 【保留基座】注释掉的历史配置 ==============
#vrrp_script chk_k8s_api {
# script "/usr/bin/curl -k -s --connect-timeout 2 https://10.0.0.101:6443/healthz | grep -q ok || /usr/bin/curl -k -s --connect-timeout 2 https://10.0.0.102:6443/healthz | grep -q ok || /usr/bin/curl -k -s --connect-timeout 2 https://10.0.0.103:6443/healthz | grep -q ok"
# interval 2
# weight -15
# fall 2
# rise 2
#}
#
#vrrp_script chk_harbor_local {
# script "/usr/bin/curl -s --connect-timeout 2 http://127.0.0.1:8080/api/v2.0/health | grep -q healthy"
# interval 2
# weight -15
# fall 2
# rise 2
#}
# ============== MinIO instance on ha1: preempting ==============
# Holds VIP 10.0.0.112 on ens33. nopreempt is not set, so this instance may
# take the VIP back from the peer.
# NOTE(review): both notify hooks restart haproxy, which also serves the
# PostgreSQL VIP on this node - confirm the resulting connection drop is acceptable.
vrrp_instance VI_MINIO {
state MASTER # Primary node is explicitly declared MASTER
interface ens33
virtual_router_id 53
priority 120 # Raised to 120
advert_int 1
# nopreempt intentionally absent: preemption is enabled
authentication {
auth_type PASS
auth_pass minivip1 # Kept from the base config
}
virtual_ipaddress {
10.0.0.112/32 dev ens33 label ens33:minio01 # Kept from the base config
}
track_script {
chk_haproxy
chk_minio_api # MinIO API liveness across the four MinIO nodes
chk_minio_console
}
notify_master "/usr/bin/systemctl restart haproxy"
notify_backup "/usr/bin/systemctl restart haproxy"
}
# ============== PostgreSQL instance on ha1: non-preempting ==============
# Holds VIP 10.0.0.113 on ens33. Declared BACKUP with nopreempt.
vrrp_instance VI_1 {
state BACKUP # PostgreSQL uses BACKUP + nopreempt
interface ens33
virtual_router_id 54
priority 100
advert_int 1
nopreempt # Keep the PostgreSQL VIP non-preempting
authentication {
auth_type PASS
auth_pass PG_HA@2026 # NOTE(review): 10 characters; keepalived documents PASS auth as using only the first 8 - confirm
}
virtual_ipaddress {
10.0.0.113/24 dev ens33 label ens33:pgvip # PostgreSQL VIP
}
track_script {
chk_haproxy
chk_network # NIC link check
}
notify_master "/usr/bin/systemctl restart haproxy"
notify_backup "/usr/bin/systemctl restart haproxy"
}
# ============== 【保留基座】注释掉的历史实例 ==============
#vrrp_instance VI_K8S {
# state MASTER
# interface ens33
# virtual_router_id 51
# priority 100
# advert_int 1
# authentication {
# auth_type PASS
# auth_pass k8svip01
# }
#virtual_ipaddress {
# 10.0.0.110/32 dev ens33 label ens33:k8s01
# }
# track_script {
# chk_haproxy
# chk_k8s_api
# }
#notify_master "/usr/bin/systemctl restart haproxy"
# notify_backup "/usr/bin/systemctl restart haproxy"
#}
#vrrp_instance VI_HARBOR {
# state MASTER
# interface ens33
# virtual_router_id 52
# priority 100
# advert_int 1
# authentication {
# auth_type PASS
# auth_pass habrvip1
# }
# virtual_ipaddress {
# 10.0.0.111/32 dev ens33 label ens33:habr01
# }
# track_script {
# chk_haproxy
# chk_harbor_local
# }
# notify_master "/usr/bin/systemctl restart haproxy"
# notify_backup "/usr/bin/systemctl restart haproxy"
#}
7.4.2 配置 ha2 keepalived(不需要 MinIO 的环境,可以删除其中 MinIO 相关的检测脚本和 VI_MINIO 实例)
root@ha2:~# cat /etc/keepalived/keepalived.conf
# Global keepalived settings for ha2.
global_defs {
router_id HARBOR_LB_02 # Kept from the base config. NOTE(review): ha1 uses the same router_id; it is meant to identify the machine - consider a unique value per node.
script_user root
enable_script_security
}
# ============== Health-check scripts ==============
# 1. HAProxy process check (shared by all VRRP instances).
# Runs every 2s; 2 consecutive failures mark it down, 1 success marks it up.
# No weight is set on this script.
vrrp_script chk_haproxy {
script "/usr/bin/systemctl is-active --quiet haproxy"
interval 2
fall 2
rise 1
}
# 2. NIC link check (tracked by the PostgreSQL VRRP instance).
# keepalived splits the script string into arguments and executes it directly,
# without a shell, so the former "ethtool ens33 | grep ..." was not run as a
# pipeline. The kernel carrier flag gives the same answer with one command:
# /sys/class/net/ens33/carrier reads "1" while the link is up.
vrrp_script chk_network {
    script "/usr/bin/grep -qx 1 /sys/class/net/ens33/carrier"
    interval 2
    fall 2
    rise 2
}
# 3. MinIO API check: passes when at least one MinIO node reports live.
# The "||" chain needs a shell, and keepalived runs scripts without one, so the
# command is wrapped in an explicit /bin/bash -c.
# curl -f turns a non-2xx answer into a non-zero exit; the liveness endpoint
# signals health through the HTTP status, so no body matching is done.
vrrp_script chk_minio_api {
    script "/bin/bash -c '/usr/bin/curl -sf -o /dev/null --connect-timeout 2 http://10.0.0.106:9000/minio/health/live || /usr/bin/curl -sf -o /dev/null --connect-timeout 2 http://10.0.0.107:9000/minio/health/live || /usr/bin/curl -sf -o /dev/null --connect-timeout 2 http://10.0.0.108:9000/minio/health/live || /usr/bin/curl -sf -o /dev/null --connect-timeout 2 http://10.0.0.109:9000/minio/health/live'"
    interval 2
    weight -15
    fall 2
    rise 2
}
# 4. MinIO console check through the local port 9001.
# A single curl call replaces the former "curl | grep" pipeline, which
# keepalived cannot run because it executes scripts without a shell.
# curl -f makes the check fail on any non-2xx response.
# NOTE(review): confirm that /minio/health/ready is actually served on the
# console port 9001 in this deployment.
vrrp_script chk_minio_console {
    script "/usr/bin/curl -sf -o /dev/null --connect-timeout 2 http://127.0.0.1:9001/minio/health/ready"
    interval 2
    weight -15
    fall 2
    rise 2
}
# ============== 【保留基座】注释掉的历史配置 ==============
#vrrp_script chk_k8s_api {
# script "/usr/bin/curl -k -s --connect-timeout 2 https://10.0.0.101:6443/healthz | grep -q ok || /usr/bin/curl -k -s --connect-timeout 2 https://10.0.0.102:6443/healthz | grep -q ok || /usr/bin/curl -k -s --connect-timeout 2 https://10.0.0.103:6443/healthz | grep -q ok"
# interval 2
# weight -15
# fall 2
# rise 2
#}
#
#vrrp_script chk_harbor_local {
# script "/usr/bin/curl -s --connect-timeout 2 http://127.0.0.1:8080/api/v2.0/health | grep -q healthy"
# interval 2
# weight -15
# fall 2
# rise 2
#}
# ============== MinIO instance on ha2: standby (BACKUP, lower priority) ==============
# Standby for VIP 10.0.0.112 on ens33; the ha1 instance is configured with priority 120.
# NOTE(review): both notify hooks restart haproxy, which also serves the
# PostgreSQL VIP on this node - confirm the resulting connection drop is acceptable.
vrrp_instance VI_MINIO {
state BACKUP # Standby node is declared BACKUP
interface ens33
virtual_router_id 53
priority 90 # 30 below ha1's 120
advert_int 1
# nopreempt is not set here, matching the preempting setup on ha1
authentication {
auth_type PASS
auth_pass minivip1 # Kept from the base config
}
virtual_ipaddress {
10.0.0.112/32 dev ens33 label ens33:minio01 # Kept from the base config
}
track_script {
chk_haproxy
chk_minio_api # MinIO API liveness across the four MinIO nodes
chk_minio_console
}
notify_master "/usr/bin/systemctl restart haproxy"
notify_backup "/usr/bin/systemctl restart haproxy"
}
# ============== PostgreSQL instance on ha2: standby, non-preempting ==============
# Standby for VIP 10.0.0.113 on ens33. Declared BACKUP with nopreempt.
vrrp_instance VI_1 {
state BACKUP # PostgreSQL standby node stays BACKUP
interface ens33
virtual_router_id 54
priority 90 # 10 below ha1's 100
advert_int 1
nopreempt # Keep the PostgreSQL VIP non-preempting
authentication {
auth_type PASS
auth_pass PG_HA@2026 # NOTE(review): 10 characters; keepalived documents PASS auth as using only the first 8 - confirm
}
virtual_ipaddress {
10.0.0.113/24 dev ens33 label ens33:pgvip # PostgreSQL VIP
}
track_script {
chk_haproxy
chk_network # NIC link check
}
notify_master "/usr/bin/systemctl restart haproxy"
notify_backup "/usr/bin/systemctl restart haproxy"
}
# ============== 【保留基座】注释掉的历史实例 ==============
#vrrp_instance VI_K8S {
# state BACKUP
# interface ens33
# virtual_router_id 51
# priority 90
# advert_int 1
# authentication {
# auth_type PASS
# auth_pass k8svip01
# }
#virtual_ipaddress {
# 10.0.0.110/32 dev ens33 label ens33:k8s01
# }
# track_script {
# chk_haproxy
# chk_k8s_api
# }
#notify_master "/usr/bin/systemctl restart haproxy"
# notify_backup "/usr/bin/systemctl restart haproxy"
#}
#vrrp_instance VI_HARBOR {
# state BACKUP
# interface ens33
# virtual_router_id 52
# priority 90
# advert_int 1
# authentication {
# auth_type PASS
# auth_pass habrvip1
# }
# virtual_ipaddress {
# 10.0.0.111/32 dev ens33 label ens33:habr01
# }
# track_script {
# chk_haproxy
# chk_harbor_local
# }
# notify_master "/usr/bin/systemctl restart haproxy"
# notify_backup "/usr/bin/systemctl restart haproxy"
#}
7.5接下来:进行最终的功能验证测试
我们来分别测试 Minio 的抢占模式和 PostgreSQL 的非抢占模式,确保两者都按预期工作。
测试一:Minio 抢占模式(自动抢回)
- 在 ha2 上抓日志:
  tail -f /var/log/syslog | grep -i 'VI_MINIO'
- 在 ha1 上停止 haproxy(模拟 Minio 入口故障):
  systemctl stop haproxy
- 观察:
  - 等待约 5 秒,Minio VIP (10.0.0.112) 会漂移到 ha2。
- 在 ha1 上恢复 haproxy:
  systemctl start haproxy
- 观察:
  - 等待约 5 秒,Minio VIP 会自动从 ha2 抢回 ha1(因为是抢占模式)。
测试二:PostgreSQL 非抢占模式(稳定优先)
- 在 ha2 上抓日志:
  tail -f /var/log/syslog | grep -i 'VI_1'
- 在 ha1 上停止 haproxy(模拟 PG 入口故障):
  systemctl stop haproxy
- 观察:
  - 等待约 5 秒,PG VIP (10.0.0.113) 会漂移到 ha2。
- 在 ha1 上恢复 haproxy:
  systemctl start haproxy
- 观察:
  - PG VIP 不会自动抢回 ha1(因为是非抢占模式),这是正常的!
- (可选)手动回切 PG VIP:如果想让 PG VIP 回到 ha1,在 ha2 上执行:
  systemctl restart keepalived
最终状态总结
表格
| VIP | 模式 | 当前位置 | 行为 |
|---|---|---|---|
| 10.0.0.112 (Minio) | 抢占模式 | ha1 | ha1 故障恢复后自动抢回 |
| 10.0.0.113 (PostgreSQL) | 非抢占模式 | ha1 | 谁故障谁释放,恢复后不自动抢回,需手动回切 |
更多推荐
所有评论(0)