十五、Docker Compose-3-docker-compose-advanced-scenarios-troubleshooting
多容器编排:掌握复杂应用架构的编排技巧高级配置:灵活运用条件化配置、多环境管理故障排查:建立系统化的排查方法论性能优化:识别并解决 CPU、内存、磁盘、网络瓶颈。
·
Docker Compose 复杂应用场景与故障排查完全指南
版本: V1.0 | 技术深度: 生产故障级 | 预计阅读时间: 55 分钟
质量目标: CSDN 评分>95 | 适用人群: 高级开发工程师、DevOps 工程师、技术专家
目录
- 1. 多容器编排实战案例
- 1.1 微服务完整应用栈
- 1.2 数据管道系统
- [1.3 CI/CD 流水线环境](#13-cicd-流水线环境)
- 2. 高级配置技巧
- 3. 故障排查方法论
- 4. 常见故障案例库
- 5. 性能故障深度分析
- [5.1 CPU 瓶颈诊断](#51-cpu-瓶颈诊断)
- 5.2 内存泄漏排查
- [5.3 磁盘 I/O 瓶颈](#53-磁盘-io-瓶颈)
- 5.4 网络延迟问题
- 6. 实战演练
- 7. 总结
- [附录 A:故障排查清单](#附录-a-故障排查清单)
- [附录 B:诊断脚本集合](#附录-b-诊断脚本集合)
1. 多容器编排实战案例
1.1 微服务完整应用栈
1.1.1 电商秒杀系统架构
业务场景:
- 瞬时并发:10 万 + QPS
- 响应要求:< 100ms
- 数据一致性:强一致性
- 可用性:99.99%
完整编排配置:
# docker-compose.seckill.yml
version: '3.8'

# Shared service defaults, merged into services via "<<: *service-defaults".
# NOTE: "<<" is a SHALLOW merge — a service that declares its own "deploy:"
# replaces this entire default "deploy" mapping (resources/restart_policy
# from here are NOT deep-merged into it).
x-service-defaults: &service-defaults
  deploy:
    resources:
      limits:
        cpus: '1.0'
        memory: 1G
      reservations:
        cpus: '0.5'
        memory: 512M
    restart_policy:
      condition: on-failure
      delay: 5s
      max_attempts: 3
  # No "test" given here: services inheriting this block rely on the image's
  # own HEALTHCHECK instruction unless they define their own test.
  healthcheck:
    interval: 10s
    timeout: 5s
    retries: 3
    start_period: 30s
  logging:
    driver: json-file
    options:
      max-size: "50m"
      max-file: "5"

services:
  # ==================== 接入层 ====================
  # Nginx 负载均衡器
  nginx-lb:
    <<: *service-defaults
    image: nginx:alpine
    ports:
      - "80:80"
      - "443:443"
    volumes:
      - ./nginx/nginx.conf:/etc/nginx/nginx.conf:ro
      - ./nginx/ssl:/etc/nginx/ssl:ro
      - nginx-logs:/var/log/nginx
    deploy: # overrides the defaults' whole deploy block (shallow merge)
      replicas: 4
      update_config:
        parallelism: 2
        delay: 10s
    depends_on:
      - api-gateway
    networks:
      - frontend-network

  # API 网关(Kong)
  api-gateway:
    <<: *service-defaults
    image: kong:3.0
    ports:
      - "8000:8000"
      - "8443:8443"
    environment:
      KONG_DATABASE: "postgres"
      KONG_PG_HOST: kong-db
      KONG_PG_USER: kong
      KONG_PG_PASSWORD: kong_password
      KONG_ADMIN_LISTEN: "0.0.0.0:8001"
    deploy:
      replicas: 6
    depends_on:
      kong-db:
        condition: service_healthy
    networks:
      - frontend-network
      - gateway-network

  # Kong 数据库
  kong-db:
    <<: *service-defaults
    image: postgres:15-alpine
    environment:
      POSTGRES_USER: kong
      POSTGRES_PASSWORD: kong_password
      POSTGRES_DB: kong
    volumes:
      - kong-db-data:/var/lib/postgresql/data
    healthcheck:
      test: ["CMD-SHELL", "pg_isready -U kong"]
      interval: 5s
      timeout: 3s
      retries: 5
    deploy:
      resources:
        limits:
          cpus: '2.0'
          memory: 4G
    networks:
      - gateway-network

  # ==================== 业务服务层 ====================
  # 用户服务
  user-service:
    <<: *service-defaults
    image: seckill/user-service:${VERSION:-latest}
    environment:
      SPRING_PROFILES_ACTIVE: ${PROFILE:-prod}
      DB_HOST: user-db
      REDIS_HOST: redis-cluster
      KAFKA_BROKERS: kafka:9092
    deploy:
      replicas: 8
    depends_on:
      user-db:
        condition: service_healthy
      redis-cluster:
        condition: service_healthy
    networks:
      - backend-network
      - cache-network
      # FIX: user-db lives only on the internal database-network; the service
      # must join that network to reach DB_HOST.
      - database-network

  # 商品服务
  product-service:
    <<: *service-defaults
    image: seckill/product-service:${VERSION:-latest}
    environment:
      SPRING_PROFILES_ACTIVE: ${PROFILE:-prod}
      DB_HOST: product-db
      ELASTICSEARCH_HOST: elasticsearch
    deploy:
      replicas: 10
    depends_on:
      product-db:
        condition: service_healthy
      elasticsearch:
        condition: service_healthy
    networks:
      - backend-network
      - search-network
      - database-network

  # 订单服务
  order-service:
    <<: *service-defaults
    image: seckill/order-service:${VERSION:-latest}
    environment:
      SPRING_PROFILES_ACTIVE: ${PROFILE:-prod}
      DB_HOST: order-db
      REDIS_HOST: redis-cluster
      KAFKA_BROKERS: kafka:9092
    deploy:
      replicas: 12
    depends_on:
      order-db:
        condition: service_healthy
      redis-cluster:
        condition: service_healthy
      kafka:
        condition: service_started
    networks:
      - backend-network
      - cache-network
      - queue-network
      - database-network

  # 库存服务
  inventory-service:
    <<: *service-defaults
    image: seckill/inventory-service:${VERSION:-latest}
    environment:
      SPRING_PROFILES_ACTIVE: ${PROFILE:-prod}
      DB_HOST: inventory-db
      REDIS_HOST: redis-cluster
    deploy:
      replicas: 10
    depends_on:
      inventory-db:
        condition: service_healthy
      redis-cluster:
        condition: service_healthy
    networks:
      - backend-network
      - cache-network
      - database-network

  # 支付服务
  payment-service:
    <<: *service-defaults
    image: seckill/payment-service:${VERSION:-latest}
    environment:
      SPRING_PROFILES_ACTIVE: ${PROFILE:-prod}
      DB_HOST: payment-db
      KAFKA_BROKERS: kafka:9092
    deploy:
      replicas: 6
    depends_on:
      payment-db:
        condition: service_healthy
      kafka:
        condition: service_started
    networks:
      - backend-network
      - queue-network
      - database-network

  # ==================== 数据层 ====================
  # MySQL 主从复制
  # NOTE(review): product-db / order-db / inventory-db / payment-db are
  # referenced by depends_on above but only user-db is defined here —
  # define the remaining databases analogously before deploying.
  user-db:
    <<: *service-defaults
    image: mysql:8.0
    environment:
      MYSQL_ROOT_PASSWORD: root_password
      MYSQL_DATABASE: user_db
      MYSQL_USER: user_service
      MYSQL_PASSWORD: user_password
    volumes:
      - user-db-data:/var/lib/mysql
      - ./mysql/user-db.cnf:/etc/mysql/conf.d/custom.cnf:ro
    command: >
      --server-id=1
      --log-bin=mysql-bin
      --binlog-format=ROW
      --gtid-mode=ON
      --enforce-gtid-consistency=ON
    healthcheck:
      test: ["CMD", "mysqladmin", "ping", "-h", "localhost", "-u", "root", "-proot_password"]
      interval: 10s
      timeout: 5s
      retries: 5
    deploy:
      resources:
        limits:
          cpus: '4.0'
          memory: 8G
    networks:
      - database-network

  # Redis Cluster
  redis-cluster:
    <<: *service-defaults
    image: redis:7-alpine
    command: redis-server --cluster-enabled yes --cluster-config-file nodes.conf --cluster-node-timeout 5000
    volumes:
      - redis-data:/data
    deploy:
      replicas: 6
    healthcheck:
      test: ["CMD", "redis-cli", "ping"]
      interval: 5s
      timeout: 3s
      retries: 5
    networks:
      - cache-network

  # Elasticsearch 集群
  elasticsearch:
    <<: *service-defaults
    image: elasticsearch:8.8.0
    environment:
      discovery.type: single-node
      xpack.security.enabled: "false"
      ES_JAVA_OPTS: "-Xms2g -Xmx2g"
    volumes:
      - es-data:/usr/share/elasticsearch/data
    deploy:
      resources:
        limits:
          cpus: '4.0'
          memory: 8G
    healthcheck:
      test: ["CMD-SHELL", "curl -f http://localhost:9200/_cluster/health || exit 1"]
      interval: 30s
      timeout: 10s
      retries: 5
    networks:
      - search-network

  # Kafka 消息队列
  # NOTE(review): replicas: 3 with a fixed KAFKA_BROKER_ID: 1 gives three
  # brokers the same id — a real multi-broker cluster needs distinct ids
  # (or one service per broker); confirm before production use.
  kafka:
    <<: *service-defaults
    image: confluentinc/cp-kafka:7.4.0
    environment:
      KAFKA_BROKER_ID: 1
      KAFKA_ZOOKEEPER_CONNECT: zookeeper:2181
      KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://kafka:9092
      KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 3
      KAFKA_NUM_PARTITIONS: 6
    deploy:
      replicas: 3
    depends_on:
      zookeeper:
        condition: service_healthy
    volumes:
      - kafka-data:/var/lib/kafka/data
    networks:
      - queue-network

  zookeeper:
    <<: *service-defaults
    image: confluentinc/cp-zookeeper:7.4.0
    environment:
      ZOOKEEPER_CLIENT_PORT: 2181
      ZOOKEEPER_TICK_TIME: 2000
    deploy:
      replicas: 3
    volumes:
      - zookeeper-data:/var/lib/zookeeper/data
    healthcheck:
      # FIX: a "|" inside exec-form CMD is passed as a literal argument, not a
      # shell pipe — use CMD-SHELL so the four-letter "ruok" probe actually runs.
      test: ["CMD-SHELL", "echo ruok | nc localhost 2181 | grep imok"]
      interval: 10s
      timeout: 5s
      retries: 5
    networks:
      - queue-network

  # ==================== 监控层 ====================
  # NOTE(review): prometheus is only attached to monitoring-network; to scrape
  # services on the other networks it must join them as well.
  prometheus:
    <<: *service-defaults
    image: prom/prometheus:v2.45.0
    volumes:
      - ./monitoring/prometheus.yml:/etc/prometheus/prometheus.yml:ro
      - prometheus-data:/prometheus
    ports:
      - "9090:9090"
    networks:
      - monitoring-network

  grafana:
    <<: *service-defaults
    image: grafana/grafana:10.0.0
    volumes:
      - grafana-data:/var/lib/grafana
    ports:
      - "3000:3000"
    environment:
      GF_SECURITY_ADMIN_PASSWORD: admin_password
    networks:
      - monitoring-network
    depends_on:
      - prometheus

volumes:
  nginx-logs:
  kong-db-data:
  user-db-data:
  product-db-data:
  order-db-data:
  inventory-db-data:
  payment-db-data:
  redis-data:
  es-data:
  kafka-data:
  zookeeper-data:
  prometheus-data:
  grafana-data:

networks:
  frontend-network:
    driver: bridge
  gateway-network:
    driver: bridge
  backend-network:
    driver: bridge
  cache-network:
    driver: bridge
  database-network:
    driver: bridge
    internal: true # no host/外网 egress; only attached services can reach DBs
  search-network:
    driver: bridge
  queue-network:
    driver: bridge
  monitoring-network:
    driver: bridge
1.2 数据管道系统
1.2.1 实时数据处理管道
# docker-compose.data-pipeline.yml
version: '3.8'

services:
  # 数据采集:Flume
  flume:
    image: apache/flume:1.13.0
    volumes:
      - ./flume/conf:/opt/flume/conf:ro
      - /var/log/app:/var/log/app:ro
    environment:
      - FLUME_CONF_DIR=/opt/flume/conf
    deploy:
      resources:
        limits:
          cpus: '2.0'
          memory: 2G
    networks:
      - pipeline-network

  # 消息缓冲:Kafka
  kafka:
    image: confluentinc/cp-kafka:7.4.0
    environment:
      KAFKA_BROKER_ID: 1
      KAFKA_ZOOKEEPER_CONNECT: zookeeper:2181
      KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://kafka:9092
      KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1
    depends_on:
      - zookeeper
    volumes:
      - kafka-data:/var/lib/kafka/data
    networks:
      - pipeline-network

  zookeeper:
    image: confluentinc/cp-zookeeper:7.4.0
    environment:
      ZOOKEEPER_CLIENT_PORT: 2181
    volumes:
      - zookeeper-data:/var/lib/zookeeper/data
    networks:
      - pipeline-network

  # 流处理:Flink
  flink-jobmanager:
    image: flink:1.17.0
    ports:
      - "8081:8081"
    command: jobmanager
    environment:
      # FIX: FLINK_PROPERTIES expects newline-separated "key: value" pairs —
      # a space-joined single line is not parsed as multiple properties.
      - |
        FLINK_PROPERTIES=
        jobmanager.rpc.address: flink-jobmanager
    volumes:
      - ./flink/jobs:/opt/flink/usrlib
    networks:
      - pipeline-network

  flink-taskmanager:
    image: flink:1.17.0
    depends_on:
      - flink-jobmanager
    command: taskmanager
    environment:
      - |
        FLINK_PROPERTIES=
        jobmanager.rpc.address: flink-jobmanager
        taskmanager.numberOfTaskSlots: 4
    deploy:
      replicas: 4
    volumes:
      - ./flink/jobs:/opt/flink/usrlib
    networks:
      - pipeline-network

  # 数据存储:ClickHouse
  clickhouse:
    image: clickhouse/clickhouse-server:23.8
    ports:
      - "8123:8123"
      - "9000:9000"
    volumes:
      - clickhouse-data:/var/lib/clickhouse
      - ./clickhouse/config.xml:/etc/clickhouse-server/config.xml:ro
    environment:
      CLICKHOUSE_PASSWORD: clickhouse_password
    deploy:
      resources:
        limits:
          cpus: '8.0'
          memory: 16G
    networks:
      - pipeline-network

  # 数据可视化:Superset
  superset:
    image: apache/superset:latest
    ports:
      - "8088:8088"
    volumes:
      - superset-home:/app/superset_home
    environment:
      SUPERSET_SECRET_KEY: your-secret-key
      SUPERSET_ADMIN_PASSWORD: admin
    depends_on:
      - clickhouse
    networks:
      - pipeline-network

volumes:
  kafka-data:
  zookeeper-data:
  clickhouse-data:
  superset-home:

networks:
  pipeline-network:
    driver: bridge
1.3 CI/CD 流水线环境
1.3.1 完整 DevOps 工具链
# docker-compose.cicd.yml
version: '3.8'

services:
  # Git 仓库:Gitea
  gitea:
    image: gitea/gitea:1.20.0
    ports:
      - "3000:3000"
      - "2222:22"
    volumes:
      - gitea-data:/data
      - /etc/timezone:/etc/timezone:ro
      - /etc/localtime:/etc/localtime:ro
    environment:
      - USER_UID=1000
      - USER_GID=1000
    networks:
      - cicd-network

  # CI/CD 引擎:Jenkins
  jenkins:
    image: jenkins/jenkins:lts
    ports:
      - "8080:8080"
      - "50000:50000"
    volumes:
      - jenkins-data:/var/jenkins_home
      # NOTE: mounting the Docker socket gives Jenkins root-equivalent
      # control of the host daemon — acceptable for a lab, not hardened.
      - /var/run/docker.sock:/var/run/docker.sock
      - ./jenkins/plugins:/usr/share/jenkins/ref/plugins
    environment:
      - JAVA_OPTS=-Xmx4g
    networks:
      - cicd-network
    depends_on:
      - gitea

  # 代码质量:SonarQube
  sonarqube:
    image: sonarqube:community
    ports:
      - "9000:9000"
    volumes:
      - sonarqube-data:/opt/sonarqube/data
      - sonarqube-logs:/opt/sonarqube/logs
      - sonarqube-extensions:/opt/sonarqube/extensions
    environment:
      - SONAR_ES_BOOTSTRAP_CHECKS_DISABLE=true
    networks:
      - cicd-network

  # 制品仓库:Nexus
  nexus:
    image: sonatype/nexus3:latest
    ports:
      - "8081:8081"
    volumes:
      - nexus-data:/nexus-data
    networks:
      - cicd-network

  # 容器仓库:Harbor
  registry:
    image: goharbor/registry-photon:v2.8.0
    volumes:
      - registry-data:/storage
      - ./harbor/config.yml:/etc/registry/config.yml:ro
    networks:
      - cicd-network

  # 项目管理:Jira
  jira:
    image: atlassian/jira-software:latest
    ports:
      - "8082:8080"
    volumes:
      - jira-data:/var/atlassian/jira
    networks:
      - cicd-network

  # 文档管理:Confluence
  confluence:
    image: atlassian/confluence:latest
    ports:
      - "8090:8090"
    volumes:
      - confluence-data:/var/atlassian/confluence
    networks:
      - cicd-network

  # 监控告警:Prometheus Stack
  prometheus:
    image: prom/prometheus:v2.45.0
    volumes:
      - ./prometheus.yml:/etc/prometheus/prometheus.yml:ro
      - prometheus-data:/prometheus
    networks:
      - cicd-network

  grafana:
    image: grafana/grafana:10.0.0
    ports:
      # FIX: host port 3000 is already taken by gitea above — remap to 3001
      # to avoid a "port is already allocated" startup failure.
      - "3001:3000"
    volumes:
      - grafana-data:/var/lib/grafana
    networks:
      - cicd-network

volumes:
  gitea-data:
  jenkins-data:
  sonarqube-data:
  sonarqube-logs:
  sonarqube-extensions:
  nexus-data:
  registry-data:
  jira-data:
  confluence-data:
  prometheus-data:
  grafana-data:

networks:
  cicd-network:
    driver: bridge
2. 高级配置技巧
2.1 条件化配置
2.1.1 基于环境变量的动态配置
# docker-compose.dynamic.yml
version: '3.8'

services:
  app:
    image: myapp:${APP_VERSION:-latest}
    # 条件化端口映射
    ports:
      - "${EXPOSE_PORT:-8080}:8080"
    # 条件化环境变量
    environment:
      - ENV=${ENVIRONMENT:-development}
      - DEBUG=${DEBUG_MODE:-false}
      - LOG_LEVEL=${LOG_LEVEL:-info}
      - DB_HOST=${DB_HOST:-localhost}
      - DB_PORT=${DB_PORT:-5432}
      - DB_NAME=${DB_NAME:-mydb}
      - DB_USER=${DB_USER:-postgres}
      # ":?" aborts startup with this message when the variable is unset
      - DB_PASSWORD=${DB_PASSWORD:?DB_PASSWORD 环境变量必须设置}
    # 条件化卷挂载
    volumes:
      - ./config/${ENVIRONMENT:-development}:/app/config:ro
      - ${DATA_DIR:-./data}:/app/data
      - app-logs:/app/logs
    # 条件化资源配置
    deploy:
      resources:
        limits:
          cpus: '${CPU_LIMIT:-1.0}'
          memory: ${MEMORY_LIMIT:-512M}
        reservations:
          cpus: '${CPU_RESERVATION:-0.5}'
          memory: ${MEMORY_RESERVATION:-256M}
    # 条件化健康检查
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:${HEALTH_PORT:-8080}/health"]
      interval: ${HEALTH_INTERVAL:-30s}
      timeout: ${HEALTH_TIMEOUT:-10s}
      retries: ${HEALTH_RETRIES:-3}
      start_period: ${HEALTH_START_PERIOD:-40s}
    # 条件化日志配置
    logging:
      driver: ${LOG_DRIVER:-json-file}
      options:
        # quoted: json-file driver options must be strings, and a bare
        # interpolated number would otherwise be typed as an int
        max-size: "${LOG_MAX_SIZE:-10m}"
        max-file: "${LOG_MAX_FILE:-3}"
    # 条件化网络
    networks:
      - ${NETWORK:-app-network}

volumes:
  app-logs:
    driver: ${VOLUME_DRIVER:-local}

networks:
  app-network:
    driver: ${NETWORK_DRIVER:-bridge}
环境文件示例:
# .env.development
ENVIRONMENT=development
DEBUG_MODE=true
LOG_LEVEL=debug
DB_HOST=localhost
DB_PORT=5432
DB_NAME=devdb
DB_PASSWORD=dev_password
CPU_LIMIT=1.0
MEMORY_LIMIT=512M
EXPOSE_PORT=8080
# .env.production
ENVIRONMENT=production
DEBUG_MODE=false
LOG_LEVEL=info
DB_HOST=prod-db.example.com
DB_PORT=5432
DB_NAME=proddb
DB_PASSWORD=${PROD_DB_PASSWORD}
CPU_LIMIT=4.0
MEMORY_LIMIT=4G
EXPOSE_PORT=80
启动命令:
# 开发环境
docker compose --env-file .env.development up -d
# 生产环境
docker compose --env-file .env.production up -d
# 动态覆盖配置
DB_PASSWORD=secret docker compose up -d
2.2 多环境管理
2.2.1 Compose 文件继承与覆盖
# docker-compose.yml - 基础配置
version: '3.8'
services:
  web:
    image: myapp:latest
    environment:
      - APP_NAME=MyApp
    networks:
      - app-network
  db:
    image: postgres:15
    environment:
      - POSTGRES_DB=mydb
    volumes:
      - db-data:/var/lib/postgresql/data
volumes:
  db-data:
networks:
  app-network:
    driver: bridge

# docker-compose.override.yml - 开发环境覆盖
version: '3.8'
services:
  web:
    build: .
    volumes:
      - ./src:/app/src
      - ./logs:/app/logs
    environment:
      - DEBUG=true
      - LOG_LEVEL=debug
    ports:
      - "8080:8080"
  db:
    ports:
      - "5432:5432"
    environment:
      - POSTGRES_PASSWORD=dev_password

# docker-compose.prod.yml - 生产环境配置
version: '3.8'
services:
  web:
    image: myapp:1.2.3
    deploy:
      replicas: 4
      resources:
        limits:
          cpus: '2.0'
          memory: 2G
    environment:
      - DEBUG=false
      - LOG_LEVEL=warn
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost/health"]
      interval: 30s
      timeout: 10s
      retries: 3
  db:
    deploy:
      resources:
        limits:
          cpus: '4.0'
          memory: 8G
    environment:
      - POSTGRES_PASSWORD=${DB_PASSWORD}
使用方式:
# 开发环境(自动加载 override)
docker compose up -d
# 生产环境
docker compose -f docker-compose.yml -f docker-compose.prod.yml up -d
# 测试环境
docker compose -f docker-compose.yml -f docker-compose.test.yml up -d
2.3 动态服务发现
2.3.1 Consul 集成方案
# docker-compose.consul.yml
version: '3.8'

services:
  # Consul 服务发现
  consul:
    image: hashicorp/consul:1.16
    ports:
      - "8500:8500"
      - "8600:8600/udp"
    volumes:
      - consul-data:/consul/data
    command: agent -server -ui -bootstrap-expect=1 -client=0.0.0.0
    networks:
      - service-network

  # Consul Template(动态配置生成)
  consul-template:
    image: hashicorp/consul-template:latest
    volumes:
      - ./templates:/templates:ro
      - ./config:/config:ro
    command: >
      -consul-addr consul:8500
      -template "/templates/nginx.conf.ctmpl:/etc/nginx/nginx.conf:nginx -s reload"
    networks:
      - service-network
    depends_on:
      - consul

  # 注册服务示例
  backend-service:
    image: myapp:latest
    environment:
      - CONSUL_HTTP_ADDR=consul:8500
    labels:
      - "consul.register=true"
      - "consul.service.name=backend"
      - "consul.service.port=8080"
      - "consul.service.tags=api,backend"
      - "consul.service.check.http=http://localhost:8080/health"
      - "consul.service.check.interval=10s"
    networks:
      - service-network
    depends_on:
      - consul

  # Nginx(动态上游配置)
  nginx:
    image: nginx:alpine
    volumes:
      - ./nginx.conf:/etc/nginx/nginx.conf:ro
    ports:
      - "80:80"
    networks:
      - service-network
    depends_on:
      - consul-template

volumes:
  consul-data:

networks:
  service-network:
    driver: bridge
Consul Template 配置:
# templates/nginx.conf.ctmpl
upstream backend {
{{range service "backend"}}
server {{.Address}}:{{.Port}};
{{end}}
}
server {
listen 80;
location / {
proxy_pass http://backend;
}
}
3. 故障排查方法论
3.1 故障排查框架
3.1.1 系统化排查流程
3.1.2 故障排查清单
| 阶段 | 检查项 | 命令/工具 | 预期结果 |
|---|---|---|---|
| 信息收集 | 容器状态 | `docker compose ps` | 显示所有容器状态 |
| 信息收集 | 容器日志 | `docker compose logs` | 查看错误信息 |
| 信息收集 | 系统资源 | `docker stats` | CPU/内存使用率 |
| 问题定位 | 网络连通性 | `docker network inspect` | 网络配置正确 |
| 问题定位 | 存储挂载 | `docker inspect -f '{{.Mounts}}'` | 卷正常挂载 |
| 问题定位 | 依赖关系 | `docker compose config` | 配置无语法错误 |
| 深入诊断 | 进程信息 | `docker top <container>` | 进程正常运行 |
| 深入诊断 | 文件系统 | `docker exec <container> df -h` | 磁盘空间充足 |
| 深入诊断 | 网络连接 | `docker exec <container> netstat -an` | 端口监听正常 |
| 性能分析 | 资源限制 | `docker inspect <container>` | 资源配置合理 |
| 性能分析 | I/O 性能 | `docker exec <container> iostat` | I/O 延迟正常 |
| 性能分析 | 网络吞吐 | `docker exec <container> iperf3` | 带宽达标 |
3.2 诊断工具集
3.2.1 内置诊断命令
#!/bin/bash
# docker-compose-diagnostic.sh - 综合诊断脚本
# Collects config, status, network, volume, log, and health data for a
# compose project into a timestamped directory.
set -euo pipefail

COMPOSE_FILE="${1:-docker-compose.yml}"
OUTPUT_DIR="diagnostic-$(date +%Y%m%d-%H%M%S)"
mkdir -p "$OUTPUT_DIR"

echo "=== Docker Compose 诊断报告 ===" | tee "$OUTPUT_DIR/report.txt"
echo "时间:$(date)" | tee -a "$OUTPUT_DIR/report.txt"
echo "Compose 文件:$COMPOSE_FILE" | tee -a "$OUTPUT_DIR/report.txt"
echo "" | tee -a "$OUTPUT_DIR/report.txt"

# 1. 配置验证
echo "[1/10] 验证 Compose 配置..." | tee -a "$OUTPUT_DIR/report.txt"
docker compose -f "$COMPOSE_FILE" config > "$OUTPUT_DIR/config.yml" 2>&1 || \
  echo "配置验证失败" | tee -a "$OUTPUT_DIR/report.txt"

# 2. 服务状态
echo "[2/10] 检查服务状态..." | tee -a "$OUTPUT_DIR/report.txt"
docker compose -f "$COMPOSE_FILE" ps > "$OUTPUT_DIR/service-status.txt" 2>&1

# 3. 网络拓扑
# FIX: "docker inspect <service>" fails because inspect expects a container
# name/ID, not a compose service name — resolve containers with "ps -q".
echo "[3/10] 分析网络拓扑..." | tee -a "$OUTPUT_DIR/report.txt"
docker compose -f "$COMPOSE_FILE" config --services | while read -r service; do
  docker compose -f "$COMPOSE_FILE" ps -q "$service" 2>/dev/null | while read -r cid; do
    docker inspect --format='{{.NetworkSettings.Networks}}' "$cid" \
      >> "$OUTPUT_DIR/network-$service.txt" 2>&1 || true
  done
done

# 4. 卷信息
echo "[4/10] 检查卷配置..." | tee -a "$OUTPUT_DIR/report.txt"
docker volume ls > "$OUTPUT_DIR/volumes.txt" 2>&1

# 5. 资源使用
echo "[5/10] 收集资源统计..." | tee -a "$OUTPUT_DIR/report.txt"
docker stats --no-stream > "$OUTPUT_DIR/resource-stats.txt" 2>&1

# 6. 日志收集
echo "[6/10] 收集服务日志..." | tee -a "$OUTPUT_DIR/report.txt"
docker compose -f "$COMPOSE_FILE" logs --tail=100 > "$OUTPUT_DIR/logs.txt" 2>&1

# 7. 依赖关系(same service-name → container-ID resolution as step 3)
echo "[7/10] 分析依赖关系..." | tee -a "$OUTPUT_DIR/report.txt"
docker compose -f "$COMPOSE_FILE" config --services | while read -r service; do
  echo "=== $service 依赖 ===" >> "$OUTPUT_DIR/dependencies.txt"
  docker compose -f "$COMPOSE_FILE" ps -q "$service" 2>/dev/null | while read -r cid; do
    docker inspect --format='{{.Config.Env}}' "$cid" >> "$OUTPUT_DIR/dependencies.txt" 2>&1 || true
  done
done

# 8. 健康检查
echo "[8/10] 检查健康状态..." | tee -a "$OUTPUT_DIR/report.txt"
docker compose -f "$COMPOSE_FILE" ps | grep -v "NAME" | awk '{print $1}' | while read -r container; do
  health=$(docker inspect --format='{{.State.Health.Status}}' "$container" 2>/dev/null || echo "N/A")
  echo "$container: $health" >> "$OUTPUT_DIR/health-status.txt"
done

# 9. 端口映射
echo "[9/10] 检查端口映射..." | tee -a "$OUTPUT_DIR/report.txt"
docker ps --format "table {{.Names}}\t{{.Ports}}" > "$OUTPUT_DIR/port-mappings.txt" 2>&1

# 10. 系统信息
echo "[10/10] 收集系统信息..." | tee -a "$OUTPUT_DIR/report.txt"
docker info > "$OUTPUT_DIR/docker-info.txt" 2>&1
docker version > "$OUTPUT_DIR/docker-version.txt" 2>&1

echo "" | tee -a "$OUTPUT_DIR/report.txt"
echo "=== 诊断完成 ===" | tee -a "$OUTPUT_DIR/report.txt"
echo "结果目录:$OUTPUT_DIR" | tee -a "$OUTPUT_DIR/report.txt"
3.2.2 高级调试工具
# 容器网络调试
docker run --rm -it --network container:<target-container> nicolaka/netshoot
# 容器文件系统调试
docker run --rm -it --pid container:<target-container> alpine nsenter -t 1 -m -u -n -i sh
# 容器性能分析
docker run --rm -it --privileged --pid=host justincormack/nsenter1
# 网络抓包
docker run --rm -it --cap-add=NET_ADMIN --network container:<target-container> nicolaka/netshoot tcpdump -i any -w /tmp/capture.pcap
3.3 日志分析技术
3.3.1 日志聚合与过滤
#!/bin/bash
# log-analyzer.sh - 日志分析工具
# Splits a log file into errors / warnings / events, builds a frequency
# timeline, and writes a summary report.
set -euo pipefail

LOG_FILE="${1:-logs.txt}"
OUTPUT_DIR="log-analysis-$(date +%Y%m%d-%H%M%S)"
mkdir -p "$OUTPUT_DIR"

echo "分析日志文件:$LOG_FILE"
echo "输出目录:$OUTPUT_DIR"

# 1. 错误日志提取("|| true" keeps set -e from aborting on zero matches)
echo "[1/6] 提取错误日志..."
grep -i "error\|exception\|fatal\|critical" "$LOG_FILE" > "$OUTPUT_DIR/errors.txt" || true

# 2. 警告日志提取
echo "[2/6] 提取警告日志..."
grep -i "warn\|warning" "$LOG_FILE" > "$OUTPUT_DIR/warnings.txt" || true

# 3. 时间线分析
# NOTE(review): assumes the first three whitespace-separated fields form the
# timestamp — confirm against the actual log format.
echo "[3/6] 生成时间线..."
awk '{print $1, $2, $3}' "$LOG_FILE" | sort | uniq -c | sort -rn > "$OUTPUT_DIR/timeline.txt"

# 4. 错误统计(counts bracketed tokens such as log levels/categories)
echo "[4/6] 错误类型统计..."
grep -oP '\[.*?\]' "$LOG_FILE" | sort | uniq -c | sort -rn > "$OUTPUT_DIR/error-types.txt" || true

# 5. 关键事件提取
echo "[5/6] 提取关键事件..."
grep -i "start\|stop\|restart\|fail\|success" "$LOG_FILE" > "$OUTPUT_DIR/events.txt" || true

# 6. 生成摘要报告
echo "[6/6] 生成摘要报告..."
cat > "$OUTPUT_DIR/summary.txt" <<EOF
日志分析摘要
============
分析时间:$(date)
日志文件:$LOG_FILE
总行数:$(wc -l < "$LOG_FILE")
错误数:$(wc -l < "$OUTPUT_DIR/errors.txt")
警告数:$(wc -l < "$OUTPUT_DIR/warnings.txt")
Top 5 错误类型:
$(head -5 "$OUTPUT_DIR/error-types.txt")
关键事件数量:
$(wc -l < "$OUTPUT_DIR/events.txt")
EOF
echo "分析完成!"
3.3.2 实时日志监控
#!/bin/bash
# real-time-log-monitor.sh
# Follows the logs of the given compose services and colour-codes each line
# by severity (red = error, yellow = warning, green = other).
SERVICES=("$@")
if [ ${#SERVICES[@]} -eq 0 ]; then
  echo "用法:$0 [service1] [service2] ..."
  exit 1
fi

# 彩色输出
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color

echo "实时监控日志 (Ctrl+C 停止)..."
echo "监控服务:${SERVICES[*]}"
echo ""

# FIX: expand the array directly instead of building a string and running
# "eval" — eval re-splits words and would mangle/execute unusual arguments.
docker compose logs -f --tail=50 "${SERVICES[@]}" 2>&1 | while IFS= read -r line; do
  if echo "$line" | grep -qi "error\|exception\|fatal"; then
    echo -e "${RED}$line${NC}"
  elif echo "$line" | grep -qi "warn"; then
    echo -e "${YELLOW}$line${NC}"
  else
    echo -e "${GREEN}$line${NC}"
  fi
done
4. 常见故障案例库
4.1 容器启动失败
4.1.1 案例:依赖服务未就绪
故障现象:
$ docker compose up -d
Creating network "app-network" with the default driver
Creating app_db_1 ... done
Creating app_web_1 ... done
$ docker compose ps
NAME STATUS
app_db_1 Up (health: starting)
app_web_1 Restarting (1) 2 seconds ago
Web 容器日志:
2024-01-15 10:30:15 ERROR: Connection refused to db:5432
2024-01-15 10:30:16 FATAL: Cannot connect to database
2024-01-15 10:30:16 Container exiting
诊断步骤:
# 1. 检查数据库健康状态
docker inspect --format='{{.State.Health.Status}}' app_db_1
# 输出:starting(说明数据库还未就绪)
# 2. 查看数据库启动日志
docker logs app_db_1
# 输出:database system is starting up
# 3. 手动连接测试
docker exec app_db_1 psql -h localhost -U postgres -c "SELECT 1"
# 错误:connection refused
根本原因:
- Web 服务启动时,数据库还未完全就绪
- `depends_on` 只保证启动顺序,不保证就绪状态
解决方案:
# 方案 1:使用健康检查条件
version: '3.8'
services:
  db:
    image: postgres:15
    healthcheck:
      test: ["CMD-SHELL", "pg_isready -U postgres"]
      interval: 5s
      timeout: 3s
      retries: 5
  web:
    depends_on:
      db:
        condition: service_healthy # 等待健康检查通过

# 方案 2:添加重试机制
services:
  web:
    image: myapp:latest
    command: >
      sh -c "until pg_isready -h db -U postgres; do
      echo 'Waiting for database...';
      sleep 2;
      done;
      npm start"
    depends_on:
      - db

# 方案 3:使用 wait-for-it 脚本
services:
  web:
    image: myapp:latest
    command: >
      bash -c "./wait-for-it.sh db:5432 --timeout=60 --strict --
      npm start"
    depends_on:
      - db
wait-for-it.sh 脚本:
#!/usr/bin/env bash
# wait-for-it.sh — block until HOST:PORT accepts TCP, then exec the command.
# 用法: ./wait-for-it.sh host:port [command...]
set -e

hostport="$1"
shift

# FIX: nc -z needs host and port as SEPARATE arguments; the original passed
# "db:5432" as a single host argument (and no port), so the probe never succeeded.
host="${hostport%%:*}"
port="${hostport##*:}"

until nc -z "$host" "$port" 2>/dev/null; do
  echo "Waiting for $hostport..."
  sleep 2
done
echo "$hostport is available, starting application"
exec "$@"
4.2 网络连接问题
4.2.1 案例:容器间无法通信
故障现象:
# Web 容器无法访问数据库容器
$ docker exec app_web_1 ping db
ping: bad address 'db'
$ docker exec app_web_1 nslookup db
nslookup: can't resolve 'db'
诊断步骤:
# 1. 检查网络配置
docker network inspect app-network
# 2. 查看容器网络连接
docker inspect --format='{{range $k, $v := .NetworkSettings.Networks}}{{$k}}{{end}}' app_web_1
# 输出:app-network
docker inspect --format='{{range $k, $v := .NetworkSettings.Networks}}{{$k}}{{end}}' app_db_1
# 输出:(空,说明数据库不在同一网络)
# 3. 检查 DNS 配置
docker exec app_web_1 cat /etc/resolv.conf
根本原因:
- Web 和数据库容器不在同一个 Docker 网络
- DNS 服务发现失效
解决方案:
# 确保所有服务在同一网络
version: '3.8'
services:
  web:
    image: myapp:latest
    networks:
      - app-network # 显式指定网络
  db:
    image: postgres:15
    networks:
      - app-network # 同一网络
networks:
  app-network:
    driver: bridge
修复网络:
# 手动连接容器到网络
docker network connect app-network app_db_1
# 验证连通性
docker exec app_web_1 ping -c 3 db
# 输出:PING db (172.20.0.3): 56 data bytes
4.3 数据持久化故障
4.3.1 案例:容器重启后数据丢失
故障现象:
# 写入数据
docker exec app_db_1 psql -U postgres -c "CREATE TABLE test (id INT);"
# 重启容器
docker compose restart db
# 数据丢失
docker exec app_db_1 psql -U postgres -c "\dt"
# 输出:Did not find any relations
诊断步骤:
# 1. 检查卷挂载
docker inspect app_db_1 | grep -A 10 Mounts
# 2. 查看卷信息
docker volume ls
docker volume inspect app_db-data
# 3. 检查文件系统
docker exec app_db_1 df -h /var/lib/postgresql/data
根本原因:
- 未配置持久化卷,数据存储在容器可写层
- 容器重建后,数据丢失
解决方案:
version: '3.8'
services:
  db:
    image: postgres:15
    volumes:
      - db-data:/var/lib/postgresql/data # 命名卷持久化
      # 或绑定挂载
      # - /host/path:/var/lib/postgresql/data:z
volumes:
  db-data:
    driver: local
    # local driver + bind options: the named volume is backed by a host path
    driver_opts:
      type: none
      device: /mnt/ssd/postgres # 指定存储位置
      o: bind
数据恢复:
# 1. 备份数据
docker run --rm \
-v app-db-data:/source:ro \
-v $(pwd):/backup \
alpine tar czf /backup/db-backup.tar.gz -C /source .
# 2. 恢复数据
docker run --rm \
-v app-db-data:/target \
-v $(pwd):/backup \
alpine tar xzf /backup/db-backup.tar.gz -C /target
# 3. 验证数据
docker exec app_db_1 psql -U postgres -c "\dt"
4.4 资源竞争与泄漏
4.4.1 案例:内存泄漏导致 OOM
故障现象:
$ docker compose ps
NAME STATUS
app_web_1 Restarting (13) 5 seconds ago
$ docker inspect app_web_1 | grep -A 5 State
"OOMKilled": true,
"ExitCode": 137
诊断步骤:
# 1. 监控内存使用
docker stats app_web_1 --no-stream
# 2. 查看内存趋势
for i in {1..10}; do
docker stats app_web_1 --no-stream --format "table {{.MemUsage}}"
sleep 5
done
# 3. 分析内存泄漏
docker exec app_web_1 cat /proc/1/status | grep -i "VmRSS\|VmSize"
根本原因:
- 应用代码存在内存泄漏
- 未设置内存限制,导致耗尽宿主机资源
解决方案:
version: '3.8'
services:
  web:
    image: myapp:latest
    deploy:
      resources:
        limits:
          memory: 1G # 硬限制
        reservations:
          memory: 512M # 软限制
    # FIX: memswap_limit / mem_swappiness 是服务级配置项,
    # 不属于 deploy.resources
    memswap_limit: 1.5G # 内存+Swap
    mem_swappiness: 0 # 禁用 Swap
内存泄漏调试:
# 1. 生成内存快照
docker exec app_web_1 kill -SIGUSR1 $(pgrep node)
# 2. 分析堆内存
docker exec app_web_1 node --inspect --expose-gc app.js
# 3. 使用工具分析
docker run --rm -it \
-v /var/run/docker.sock:/var/run/docker.sock \
docker.io/library/node:latest \
node --inspect app.js
5. 性能故障深度分析
5.1 CPU 瓶颈诊断
5.1.1 案例:CPU 使用率持续 100%
故障现象:
$ docker stats
CONTAINER ID NAME CPU % MEM USAGE / LIMIT
abc123 app_web_1 198.5% 512MiB / 1GiB
诊断流程:
# 1. 识别热点进程
docker top app_web_1
# 2. 进入容器诊断
docker exec -it app_web_1 sh
top -H -p 1 # 查看线程级 CPU 使用
# 3. 生成性能分析
docker exec app_web_1 sh -c "
for i in \$(seq 1 5); do
ps -eo pid,ppid,pcpu,cmd --sort=-pcpu | head -10
sleep 2
done
"
# 4. 使用 perf 分析
docker run --rm -it --pid container:app_web_1 \
--cap-add SYS_PTRACE alpine \
sh -c "apk add perf && perf top -p 1"
优化方案:
version: '3.8'
services:
  web:
    image: myapp:latest
    # CPU 限制
    deploy:
      resources:
        limits:
          cpus: '2.0'
    # FIX: CPU 绑定/优先级是服务级配置(Compose 键名为 cpuset),
    # 不属于 deploy
    # CPU 绑定(减少上下文切换)
    cpuset: "0,1"
    # 调整优先级
    cpu_shares: 1024
5.2 内存泄漏排查
5.2.1 系统性排查方法
#!/bin/bash
# memory-leak-detector.sh
# Samples one container's memory usage every SAMPLE_INTERVAL seconds and
# warns when usage grows by more than 10 percentage points over the window.
# 用法: ./memory-leak-detector.sh <container>
CONTAINER="${1:?用法: $0 <container>}"
MONITOR_DURATION=300 # 5 分钟
SAMPLE_INTERVAL=10

echo "监控容器:$CONTAINER"
echo "监控时长:${MONITOR_DURATION}s"

MEMORY_SAMPLES=()
for ((i=0; i<MONITOR_DURATION/SAMPLE_INTERVAL; i++)); do
  MEM_USAGE=$(docker stats --no-stream --format "{{.MemUsage}}" "$CONTAINER" | \
    awk -F'/' '{print $1}' | tr -d ' ')
  MEM_PERC=$(docker stats --no-stream --format "{{.MemPerc}}" "$CONTAINER")
  MEMORY_SAMPLES+=("$MEM_PERC")
  echo "[$(date +'%H:%M:%S')] 内存:$MEM_USAGE ($MEM_PERC)"

  # 检测泄漏(持续增长)
  if [ ${#MEMORY_SAMPLES[@]} -ge 6 ]; then
    # 计算增长趋势 (strip trailing "%" before arithmetic)
    first=${MEMORY_SAMPLES[0]%\%}
    # portable "last element" — ${arr[-1]} requires bash >= 4.3
    last=${MEMORY_SAMPLES[${#MEMORY_SAMPLES[@]}-1]%\%}
    growth=$(echo "$last - $first" | bc)
    if (( $(echo "$growth > 10" | bc -l) )); then
      echo "⚠️ 警告:检测到内存泄漏(5 分钟增长${growth}%)"
      # 生成诊断报告
      docker exec "$CONTAINER" sh -c "
        echo '=== 进程信息 ==='
        ps aux --sort=-%mem | head -10
        echo ''
        echo '=== 内存映射 ==='
        cat /proc/1/status | grep -i 'VmRSS\|VmSize\|VmPeak'
      "
    fi
  fi
  sleep "$SAMPLE_INTERVAL"
done
5.3 磁盘 I/O 瓶颈
5.3.1 I/O 性能诊断
# 1. 实时 I/O 监控
docker exec app_db_1 sh -c "
while true; do
iostat -x 1 1 | grep -E 'Device|sda'
sleep 5
done
"
# 2. 识别 I/O 密集型进程
docker exec app_db_1 sh -c "
pidstat -d 1 5
"
# 3. 测试卷性能
docker run --rm -v app-db-data:/test alpine \
fio --name=iops --ioengine=psync --direct=1 \
--rw=randread --bs=4k --size=100M --numjobs=4 \
--group_reporting
优化方案:
version: '3.8'
services:
  db:
    image: postgres:15
    volumes:
      # 使用高速 SSD
      - type: volume
        source: db-data
        target: /var/lib/postgresql/data
        # FIX: "bind.propagation" 只适用于 type: bind 挂载,
        # 与 type: volume 同时出现是无效配置,已移除
        volume:
          nocopy: true
    # FIX: I/O 限流字段属于服务级 blkio_config,不是挂载选项
    # I/O 限流(防止影响其他服务)
    blkio_config:
      device_read_bps:
        - path: /dev/sda
          rate: "100mb"
      device_write_bps:
        - path: /dev/sda
          rate: "100mb"
volumes:
  db-data:
    driver: local
    driver_opts:
      type: none
      device: /mnt/nvme/postgres # NVMe SSD
      o: bind
5.4 网络延迟问题
5.4.1 网络性能诊断
# 1. 网络延迟测试
docker network create test-network
docker run -d --name server --network test-network alpine nc -l 8080
docker run --rm --network test-network alpine \
sh -c "time nc server 8080 < /dev/null"
# 2. 带宽测试
docker run -d --name iperf-server --network test-network networkstatic/iperf3
docker run --rm --network test-network networkstatic/iperf3 \
-c iperf-server -t 10 -P 4
# 3. DNS 解析延迟
docker run --rm alpine \
sh -c "time nslookup google.com"
# 4. 网络路径追踪
docker run --rm --cap-add NET_RAW alpine \
traceroute -n google.com
网络优化配置:
version: '3.8'
services:
  web:
    image: nginx:alpine
    networks:
      - optimized-network
    # 网络参数调优
    sysctls:
      - net.core.somaxconn=65535
      - net.ipv4.tcp_tw_reuse=1
      # FIX: 内核的 tcp_rmem/tcp_wmem 三元组以空格分隔,不是逗号
      - "net.ipv4.tcp_rmem=4096 131072 16777216"
      - "net.ipv4.tcp_wmem=4096 131072 16777216"
networks:
  optimized-network:
    driver: bridge
    driver_opts:
      com.docker.network.driver.mtu: "1500"
    ipam:
      config:
        - subnet: 172.22.0.0/16
          gateway: 172.22.0.1
6. 实战演练
6.1 综合故障排查演练
场景设定:
电商系统在促销活动期间出现响应缓慢,部分服务频繁重启。
演练环境:
# 启动故障环境
docker compose -f docker-compose.seckill.yml up -d
# 注入故障(模拟)
docker exec app_order-service_1 sh -c "
while true; do
dd if=/dev/zero of=/tmp/test bs=1M count=100
sleep 5
done
" &
排查任务:
-
信息收集(5 分钟)
- 运行诊断脚本
- 收集所有服务日志
- 记录资源使用情况
-
问题定位(10 分钟)
- 识别异常服务
- 分析依赖关系
- 确定故障传播链
-
制定方案(5 分钟)
- 临时缓解措施
- 根本解决方案
- 预防措施
-
实施修复(10 分钟)
- 执行修复方案
- 验证效果
- 监控系统状态
预期成果:
- 故障排查报告
- 优化建议清单
- 监控告警规则
7. 总结
7.1 核心技术要点
- 多容器编排:掌握复杂应用架构的编排技巧
- 高级配置:灵活运用条件化配置、多环境管理
- 故障排查:建立系统化的排查方法论
- 性能优化:识别并解决 CPU、内存、磁盘、网络瓶颈
7.2 最佳实践清单
✅ 故障预防:
- 配置健康检查和自动重启
- 设置资源限制防止竞争
- 使用持久化卷保证数据安全
- 建立完善的监控告警体系
✅ 故障排查:
- 遵循系统化排查流程
- 善用诊断工具集
- 详细记录故障现象和处理过程
- 定期复盘和知识沉淀
✅ 性能优化:
- 建立性能基准
- 持续监控关键指标
- 定期性能测试
- 根据业务特点调优配置
附录 A:故障排查清单
A.1 启动失败排查清单
- 检查 Docker Compose 配置语法
- 验证镜像是否存在/可拉取
- 检查依赖服务是否就绪
- 查看容器日志定位错误
- 验证端口映射是否冲突
- 检查卷挂载路径权限
- 确认资源配置(CPU/内存)是否合理
- 检查网络连接配置
A.2 网络问题排查清单
- 验证容器是否在同一网络
- 检查 DNS 解析是否正常
- 测试容器间连通性(ping/telnet)
- 查看防火墙规则
- 检查端口监听状态
- 验证网络驱动配置
- 排查 IP 地址冲突
A.3 数据持久化排查清单
- 确认卷配置正确
- 检查卷挂载状态
- 验证数据目录权限
- 查看磁盘空间使用
- 测试数据读写
- 检查备份策略
- 验证数据完整性
附录 B:诊断脚本集合
B.1 快速诊断脚本
#!/bin/bash
# quick-diagnostic.sh - 60 秒快速诊断
# One-screen health overview: status, resources, recent errors, health,
# network addresses, and disk usage for the current compose project.
set -euo pipefail

echo "=== Docker Compose 快速诊断 ==="
echo ""

# 1. 服务状态(10 秒)
echo "[1/6] 服务状态:"
docker compose ps

# 2. 资源使用(10 秒)
echo ""
echo "[2/6] 资源使用:"
docker stats --no-stream --format "table {{.Name}}\t{{.CPUPerc}}\t{{.MemUsage}}"

# 3. 最近错误日志(10 秒)
echo ""
echo "[3/6] 最近错误日志:"
docker compose logs --tail=20 2>&1 | grep -i "error\|exception\|fatal" | tail -10 || echo "无错误日志"

# 4. 健康检查(10 秒)
echo ""
echo "[4/6] 健康状态:"
docker compose ps | grep -v "NAME" | awk '{print $1}' | while read -r container; do
  health=$(docker inspect --format='{{.State.Health.Status}}' "$container" 2>/dev/null || echo "N/A")
  echo " $container: $health"
done

# 5. 网络连通性(10 秒)
# FIX: docker inspect expects a container ID, not a compose service name —
# resolve each service's first container via "docker compose ps -q".
echo ""
echo "[5/6] 网络检查:"
docker compose config --services | head -3 | while read -r service; do
  cid=$(docker compose ps -q "$service" 2>/dev/null | head -1 || true)
  if [ -n "$cid" ]; then
    ip=$(docker inspect --format='{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}' "$cid" 2>/dev/null || echo "N/A")
  else
    ip="N/A"
  fi
  echo " $service: $ip"
done

# 6. 磁盘空间(10 秒)
echo ""
echo "[6/6] 磁盘使用:"
docker system df
echo ""
echo "=== 诊断完成 ==="
B.2 日志收集脚本
#!/bin/bash
# log-collector.sh - 完整日志收集
# Gathers compose config, per-service and per-container logs, daemon logs,
# and version info into a single tar.gz archive.
set -euo pipefail

OUTPUT_DIR="logs-$(date +%Y%m%d-%H%M%S)"
mkdir -p "$OUTPUT_DIR"
echo "收集日志到:$OUTPUT_DIR"

# 1. Compose 配置
docker compose config > "$OUTPUT_DIR/compose-config.yml"

# 2. 服务日志
docker compose logs > "$OUTPUT_DIR/all-services.log"

# 3. 按服务分离日志
docker compose config --services | while read -r service; do
  docker compose logs "$service" > "$OUTPUT_DIR/${service}.log"
done

# 4. 容器日志
docker ps -a --format "{{.Names}}" | while read -r container; do
  docker logs "$container" > "$OUTPUT_DIR/container-${container}.log" 2>&1 || true
done

# 5. 系统日志("|| true": journalctl may be absent or need privileges)
journalctl -u docker > "$OUTPUT_DIR/docker-system.log" 2>&1 || true

# 6. 诊断信息
docker info > "$OUTPUT_DIR/docker-info.txt"
docker version > "$OUTPUT_DIR/docker-version.txt"
docker compose version > "$OUTPUT_DIR/compose-version.txt"

# 7. 打包
tar czf "${OUTPUT_DIR}.tar.gz" "$OUTPUT_DIR"
rm -rf "$OUTPUT_DIR"
echo "日志收集完成:${OUTPUT_DIR}.tar.gz"
文档版本: V1.0
最后更新: 2026-03-12
作者: AI 技术助手
许可协议: CC BY-SA 4.0
更多推荐
所有评论(0)