Docker Compose 复杂应用场景与故障排查完全指南

版本: V1.0 | 技术深度: 生产故障级 | 预计阅读时间: 55 分钟
质量目标: CSDN 评分>95 | 适用人群: 高级开发工程师、DevOps 工程师、技术专家


目录


1. 多容器编排实战案例

1.1 微服务完整应用栈

1.1.1 电商秒杀系统架构

业务场景

  • 瞬时并发:10 万 + QPS
  • 响应要求:< 100ms
  • 数据一致性:强一致性
  • 可用性:99.99%

完整编排配置

# docker-compose.seckill.yml
version: '3.8'

# Shared defaults, pulled into each service via "<<: *service-defaults".
# NOTE(review): YAML merge keys are SHALLOW — any service that declares its
# own top-level "deploy:" (e.g. nginx-lb, kong-db) replaces this entire
# deploy block and silently loses the resource limits and restart policy
# below. Confirm whether per-service deploy overrides were meant to keep
# these defaults.
x-service-defaults: &service-defaults
  deploy:
    resources:
      limits:
        cpus: '1.0'
        memory: 1G
      reservations:
        cpus: '0.5'
        memory: 512M
    # deploy.restart_policy / deploy.replicas are honored in Swarm mode;
    # plain "docker compose up" ignores restart_policy.
    restart_policy:
      condition: on-failure
      delay: 5s
      max_attempts: 3
  # No "test" key here: each service falls back to the HEALTHCHECK baked
  # into its image (if any); these values only tune the probe schedule.
  healthcheck:
    interval: 10s
    timeout: 5s
    retries: 3
    start_period: 30s
  logging:
    driver: json-file
    options:
      max-size: "50m"
      max-file: "5"

services:
  # ==================== 接入层 ====================
  # Nginx 负载均衡器
  nginx-lb:
    <<: *service-defaults
    image: nginx:alpine
    ports:
      - "80:80"
      - "443:443"
    volumes:
      - ./nginx/nginx.conf:/etc/nginx/nginx.conf:ro
      - ./nginx/ssl:/etc/nginx/ssl:ro
      - nginx-logs:/var/log/nginx
    deploy:
      replicas: 4
      update_config:
        parallelism: 2
        delay: 10s
    depends_on:
      - api-gateway
    networks:
      - frontend-network

  # API 网关(Kong)
  api-gateway:
    <<: *service-defaults
    image: kong:3.0
    ports:
      - "8000:8000"
      - "8443:8443"
    environment:
      KONG_DATABASE: "postgres"
      KONG_PG_HOST: kong-db
      KONG_PG_USER: kong
      KONG_PG_PASSWORD: kong_password
      KONG_ADMIN_LISTEN: "0.0.0.0:8001"
    deploy:
      replicas: 6
    depends_on:
      kong-db:
        condition: service_healthy
    networks:
      - frontend-network
      - gateway-network

  # Kong 数据库
  kong-db:
    <<: *service-defaults
    image: postgres:15-alpine
    environment:
      POSTGRES_USER: kong
      POSTGRES_PASSWORD: kong_password
      POSTGRES_DB: kong
    volumes:
      - kong-db-data:/var/lib/postgresql/data
    healthcheck:
      test: ["CMD-SHELL", "pg_isready -U kong"]
      interval: 5s
      timeout: 3s
      retries: 5
    deploy:
      resources:
        limits:
          cpus: '2.0'
          memory: 4G
    networks:
      - gateway-network

  # ==================== 业务服务层 ====================
  # 用户服务
  user-service:
    <<: *service-defaults
    image: seckill/user-service:${VERSION:-latest}
    environment:
      SPRING_PROFILES_ACTIVE: ${PROFILE:-prod}
      DB_HOST: user-db
      REDIS_HOST: redis-cluster
      KAFKA_BROKERS: kafka:9092
    deploy:
      replicas: 8
    depends_on:
      user-db:
        condition: service_healthy
      redis-cluster:
        condition: service_healthy
    networks:
      - backend-network
      - cache-network

  # 商品服务
  product-service:
    <<: *service-defaults
    image: seckill/product-service:${VERSION:-latest}
    environment:
      SPRING_PROFILES_ACTIVE: ${PROFILE:-prod}
      DB_HOST: product-db
      ELASTICSEARCH_HOST: elasticsearch
    deploy:
      replicas: 10
    depends_on:
      product-db:
        condition: service_healthy
      elasticsearch:
        condition: service_healthy
    networks:
      - backend-network
      - search-network

  # 订单服务
  order-service:
    <<: *service-defaults
    image: seckill/order-service:${VERSION:-latest}
    environment:
      SPRING_PROFILES_ACTIVE: ${PROFILE:-prod}
      DB_HOST: order-db
      REDIS_HOST: redis-cluster
      KAFKA_BROKERS: kafka:9092
    deploy:
      replicas: 12
    depends_on:
      order-db:
        condition: service_healthy
      redis-cluster:
        condition: service_healthy
      kafka:
        condition: service_started
    networks:
      - backend-network
      - cache-network
      - queue-network

  # 库存服务
  inventory-service:
    <<: *service-defaults
    image: seckill/inventory-service:${VERSION:-latest}
    environment:
      SPRING_PROFILES_ACTIVE: ${PROFILE:-prod}
      DB_HOST: inventory-db
      REDIS_HOST: redis-cluster
    deploy:
      replicas: 10
    depends_on:
      inventory-db:
        condition: service_healthy
      redis-cluster:
        condition: service_healthy
    networks:
      - backend-network
      - cache-network

  # 支付服务
  payment-service:
    <<: *service-defaults
    image: seckill/payment-service:${VERSION:-latest}
    environment:
      SPRING_PROFILES_ACTIVE: ${PROFILE:-prod}
      DB_HOST: payment-db
      KAFKA_BROKERS: kafka:9092
    deploy:
      replicas: 6
    depends_on:
      payment-db:
        condition: service_healthy
      kafka:
        condition: service_started
    networks:
      - backend-network
      - queue-network

  # ==================== 数据层 ====================
  # MySQL 主从复制
  user-db:
    <<: *service-defaults
    image: mysql:8.0
    environment:
      MYSQL_ROOT_PASSWORD: root_password
      MYSQL_DATABASE: user_db
      MYSQL_USER: user_service
      MYSQL_PASSWORD: user_password
    volumes:
      - user-db-data:/var/lib/mysql
      - ./mysql/user-db.cnf:/etc/mysql/conf.d/custom.cnf:ro
    command: >
      --server-id=1
      --log-bin=mysql-bin
      --binlog-format=ROW
      --gtid-mode=ON
      --enforce-gtid-consistency=ON
    healthcheck:
      test: ["CMD", "mysqladmin", "ping", "-h", "localhost", "-u", "root", "-proot_password"]
      interval: 10s
      timeout: 5s
      retries: 5
    deploy:
      resources:
        limits:
          cpus: '4.0'
          memory: 8G
    networks:
      - database-network

  # Redis Cluster
  # NOTE(review): --cluster-enabled only prepares cluster mode; the 6
  # replicas still need "redis-cli --cluster create ..." to actually form a
  # cluster, and all replicas mount the SAME "redis-data" volume, so their
  # nodes.conf files will collide. Confirm the intended topology (typically
  # one service per node, each with its own volume).
  redis-cluster:
    <<: *service-defaults
    image: redis:7-alpine
    command: redis-server --cluster-enabled yes --cluster-config-file nodes.conf --cluster-node-timeout 5000
    volumes:
      - redis-data:/data
    deploy:
      replicas: 6
    healthcheck:
      test: ["CMD", "redis-cli", "ping"]
      interval: 5s
      timeout: 3s
      retries: 5
    networks:
      - cache-network

  # Elasticsearch 集群
  elasticsearch:
    <<: *service-defaults
    image: elasticsearch:8.8.0
    environment:
      discovery.type: single-node
      xpack.security.enabled: "false"
      "ES_JAVA_OPTS": "-Xms2g -Xmx2g"
    volumes:
      - es-data:/usr/share/elasticsearch/data
    deploy:
      resources:
        limits:
          cpus: '4.0'
          memory: 8G
    healthcheck:
      test: ["CMD-SHELL", "curl -f http://localhost:9200/_cluster/health || exit 1"]
      interval: 30s
      timeout: 10s
      retries: 5
    networks:
      - search-network

  # Kafka message queue
  # NOTE(review): with deploy.replicas: 3 every replica gets the same fixed
  # KAFKA_BROKER_ID and the same advertised listener, so a real 3-broker
  # cluster cannot form (duplicate broker ids). Confirm whether three
  # explicit services (kafka-1/kafka-2/kafka-3) with distinct ids and
  # volumes were intended; the replicas also share one "kafka-data" volume.
  kafka:
    <<: *service-defaults
    image: confluentinc/cp-kafka:7.4.0
    environment:
      KAFKA_BROKER_ID: 1
      KAFKA_ZOOKEEPER_CONNECT: zookeeper:2181
      KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://kafka:9092
      KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 3
      KAFKA_NUM_PARTITIONS: 6
    deploy:
      replicas: 3
    depends_on:
      zookeeper:
        condition: service_healthy
    volumes:
      - kafka-data:/var/lib/kafka/data
    networks:
      - queue-network

  zookeeper:
    <<: *service-defaults
    image: confluentinc/cp-zookeeper:7.4.0
    environment:
      ZOOKEEPER_CLIENT_PORT: 2181
      ZOOKEEPER_TICK_TIME: 2000
    deploy:
      # NOTE(review): 3 replicas share one volume and have no
      # ZOOKEEPER_SERVER_ID, so they run as independent standalone nodes,
      # not an ensemble — confirm intended topology.
      replicas: 3
    volumes:
      - zookeeper-data:/var/lib/zookeeper/data
    healthcheck:
      # The exec ("CMD") form does not spawn a shell, so the original
      # ["CMD", "echo", "ruok", "|", "nc", ...] passed "|" and "nc" as
      # literal arguments to echo — the probe always succeeded without
      # contacting ZooKeeper. CMD-SHELL is required for the pipe, and
      # grepping for "imok" verifies the actual four-letter-word reply.
      test: ["CMD-SHELL", "echo ruok | nc localhost 2181 | grep -q imok"]
      interval: 10s
      timeout: 5s
      retries: 5
    networks:
      - queue-network

  # ==================== 监控层 ====================
  prometheus:
    <<: *service-defaults
    image: prom/prometheus:v2.45.0
    volumes:
      - ./monitoring/prometheus.yml:/etc/prometheus/prometheus.yml:ro
      - prometheus-data:/prometheus
    ports:
      - "9090:9090"
    networks:
      - monitoring-network

  grafana:
    <<: *service-defaults
    image: grafana/grafana:10.0.0
    volumes:
      - grafana-data:/var/lib/grafana
    ports:
      - "3000:3000"
    environment:
      GF_SECURITY_ADMIN_PASSWORD: admin_password
    networks:
      - monitoring-network
    depends_on:
      - prometheus

volumes:
  nginx-logs:
  kong-db-data:
  user-db-data:
  product-db-data:
  order-db-data:
  inventory-db-data:
  payment-db-data:
  redis-data:
  es-data:
  kafka-data:
  zookeeper-data:
  prometheus-data:
  grafana-data:

networks:
  frontend-network:
    driver: bridge
  gateway-network:
    driver: bridge
  backend-network:
    driver: bridge
  cache-network:
    driver: bridge
  database-network:
    driver: bridge
    internal: true
  search-network:
    driver: bridge
  queue-network:
    driver: bridge
  monitoring-network:
    driver: bridge

1.2 数据管道系统

1.2.1 实时数据处理管道
# docker-compose.data-pipeline.yml
version: '3.8'

services:
  # 数据采集:Flume
  flume:
    image: apache/flume:1.13.0
    volumes:
      - ./flume/conf:/opt/flume/conf:ro
      - /var/log/app:/var/log/app:ro
    environment:
      - FLUME_CONF_DIR=/opt/flume/conf
    deploy:
      resources:
        limits:
          cpus: '2.0'
          memory: 2G
    networks:
      - pipeline-network

  # 消息缓冲:Kafka
  kafka:
    image: confluentinc/cp-kafka:7.4.0
    environment:
      KAFKA_BROKER_ID: 1
      KAFKA_ZOOKEEPER_CONNECT: zookeeper:2181
      KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://kafka:9092
      KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1
    depends_on:
      - zookeeper
    volumes:
      - kafka-data:/var/lib/kafka/data
    networks:
      - pipeline-network

  zookeeper:
    image: confluentinc/cp-zookeeper:7.4.0
    environment:
      ZOOKEEPER_CLIENT_PORT: 2181
    volumes:
      - zookeeper-data:/var/lib/zookeeper/data
    networks:
      - pipeline-network

  # 流处理:Flink
  flink-jobmanager:
    image: flink:1.17.0
    ports:
      - "8081:8081"
    command: jobmanager
    environment:
      - FLINK_PROPERTIES=jobmanager.rpc.address:flink-jobmanager
    volumes:
      - ./flink/jobs:/opt/flink/usrlib
    networks:
      - pipeline-network

  flink-taskmanager:
    image: flink:1.17.0
    depends_on:
      - flink-jobmanager
    command: taskmanager
    environment:
      - FLINK_PROPERTIES=jobmanager.rpc.address:flink-jobmanager taskmanager.numberOfTaskSlots:4
    deploy:
      replicas: 4
    volumes:
      - ./flink/jobs:/opt/flink/usrlib
    networks:
      - pipeline-network

  # 数据存储:ClickHouse
  clickhouse:
    image: clickhouse/clickhouse-server:23.8
    ports:
      - "8123:8123"
      - "9000:9000"
    volumes:
      - clickhouse-data:/var/lib/clickhouse
      - ./clickhouse/config.xml:/etc/clickhouse-server/config.xml:ro
    environment:
      CLICKHOUSE_PASSWORD: clickhouse_password
    deploy:
      resources:
        limits:
          cpus: '8.0'
          memory: 16G
    networks:
      - pipeline-network

  # 数据可视化:Superset
  superset:
    image: apache/superset:latest
    ports:
      - "8088:8088"
    volumes:
      - superset-home:/app/superset_home
    environment:
      SUPERSET_SECRET_KEY: your-secret-key
      SUPERSET_ADMIN_PASSWORD: admin
    depends_on:
      - clickhouse
    networks:
      - pipeline-network

volumes:
  kafka-data:
  zookeeper-data:
  clickhouse-data:
  superset-home:

networks:
  pipeline-network:
    driver: bridge

1.3 CI/CD 流水线环境

1.3.1 完整 DevOps 工具链
# docker-compose.cicd.yml
version: '3.8'

services:
  # Git 仓库:Gitea
  gitea:
    image: gitea/gitea:1.20.0
    ports:
      - "3000:3000"
      - "2222:22"
    volumes:
      - gitea-data:/data
      - /etc/timezone:/etc/timezone:ro
      - /etc/localtime:/etc/localtime:ro
    environment:
      - USER_UID=1000
      - USER_GID=1000
    networks:
      - cicd-network

  # CI/CD 引擎:Jenkins
  jenkins:
    image: jenkins/jenkins:lts
    ports:
      - "8080:8080"
      - "50000:50000"
    volumes:
      - jenkins-data:/var/jenkins_home
      - /var/run/docker.sock:/var/run/docker.sock
      - ./jenkins/plugins:/usr/share/jenkins/ref/plugins
    environment:
      - JAVA_OPTS=-Xmx4g
    networks:
      - cicd-network
    depends_on:
      - gitea

  # 代码质量:SonarQube
  sonarqube:
    image: sonarqube:community
    ports:
      - "9000:9000"
    volumes:
      - sonarqube-data:/opt/sonarqube/data
      - sonarqube-logs:/opt/sonarqube/logs
      - sonarqube-extensions:/opt/sonarqube/extensions
    environment:
      - SONAR_ES_BOOTSTRAP_CHECKS_DISABLE=true
    networks:
      - cicd-network

  # 制品仓库:Nexus
  nexus:
    image: sonatype/nexus3:latest
    ports:
      - "8081:8081"
    volumes:
      - nexus-data:/nexus-data
    networks:
      - cicd-network

  # 容器仓库:Harbor
  registry:
    image: goharbor/registry-photon:v2.8.0
    volumes:
      - registry-data:/storage
      - ./harbor/config.yml:/etc/registry/config.yml:ro
    networks:
      - cicd-network

  # 项目管理:Jira
  jira:
    image: atlassian/jira-software:latest
    ports:
      - "8082:8080"
    volumes:
      - jira-data:/var/atlassian/jira
    networks:
      - cicd-network

  # 文档管理:Confluence
  confluence:
    image: atlassian/confluence:latest
    ports:
      - "8090:8090"
    volumes:
      - confluence-data:/var/atlassian/confluence
    networks:
      - cicd-network

  # 监控告警:Prometheus Stack
  prometheus:
    image: prom/prometheus:v2.45.0
    volumes:
      - ./prometheus.yml:/etc/prometheus/prometheus.yml:ro
      - prometheus-data:/prometheus
    networks:
      - cicd-network

  grafana:
    image: grafana/grafana:10.0.0
    ports:
      # Host port 3000 is already published by the gitea service in this
      # stack; binding Grafana to 3000 as well makes the second container
      # fail with "port is already allocated". Publish on 3001 instead
      # (the container-side port stays 3000).
      - "3001:3000"
    volumes:
      - grafana-data:/var/lib/grafana
    networks:
      - cicd-network

volumes:
  gitea-data:
  jenkins-data:
  sonarqube-data:
  sonarqube-logs:
  sonarqube-extensions:
  nexus-data:
  registry-data:
  jira-data:
  confluence-data:
  prometheus-data:
  grafana-data:

networks:
  cicd-network:
    driver: bridge

2. 高级配置技巧

2.1 条件化配置

2.1.1 基于环境变量的动态配置
# docker-compose.dynamic.yml
version: '3.8'

services:
  app:
    image: myapp:${APP_VERSION:-latest}

    # Conditional port mapping
    ports:
      - "${EXPOSE_PORT:-8080}:8080"

    # Conditional environment variables
    environment:
      - ENV=${ENVIRONMENT:-development}
      - DEBUG=${DEBUG_MODE:-false}
      - LOG_LEVEL=${LOG_LEVEL:-info}
      - DB_HOST=${DB_HOST:-localhost}
      - DB_PORT=${DB_PORT:-5432}
      - DB_NAME=${DB_NAME:-mydb}
      - DB_USER=${DB_USER:-postgres}
      - DB_PASSWORD=${DB_PASSWORD:?DB_PASSWORD 环境变量必须设置}

    # Conditional volume mounts
    volumes:
      - ./config/${ENVIRONMENT:-development}:/app/config:ro
      - ${DATA_DIR:-./data}:/app/data
      - app-logs:/app/logs

    # Conditional resource limits
    deploy:
      resources:
        limits:
          cpus: '${CPU_LIMIT:-1.0}'
          memory: ${MEMORY_LIMIT:-512M}
        reservations:
          cpus: '${CPU_RESERVATION:-0.5}'
          memory: ${MEMORY_RESERVATION:-256M}

    # Conditional health check
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:${HEALTH_PORT:-8080}/health"]
      interval: ${HEALTH_INTERVAL:-30s}
      timeout: ${HEALTH_TIMEOUT:-10s}
      retries: ${HEALTH_RETRIES:-3}
      start_period: ${HEALTH_START_PERIOD:-40s}

    # Conditional logging.
    # json-file options must be strings: an unquoted "${LOG_MAX_FILE:-3}"
    # interpolates to the integer 3 and Compose rejects it with
    # "logging.options.max-file must be a string" — hence the quotes.
    logging:
      driver: ${LOG_DRIVER:-json-file}
      options:
        max-size: "${LOG_MAX_SIZE:-10m}"
        max-file: "${LOG_MAX_FILE:-3}"

    # Conditional network membership (the name must match a key under
    # the top-level "networks:" section below)
    networks:
      - ${NETWORK:-app-network}

volumes:
  app-logs:
    driver: ${VOLUME_DRIVER:-local}

networks:
  app-network:
    driver: ${NETWORK_DRIVER:-bridge}

环境文件示例

# .env.development
ENVIRONMENT=development
DEBUG_MODE=true
LOG_LEVEL=debug
DB_HOST=localhost
DB_PORT=5432
DB_NAME=devdb
DB_PASSWORD=dev_password
CPU_LIMIT=1.0
MEMORY_LIMIT=512M
EXPOSE_PORT=8080

# .env.production
ENVIRONMENT=production
DEBUG_MODE=false
LOG_LEVEL=info
DB_HOST=prod-db.example.com
DB_PORT=5432
DB_NAME=proddb
DB_PASSWORD=${PROD_DB_PASSWORD}
CPU_LIMIT=4.0
MEMORY_LIMIT=4G
EXPOSE_PORT=80

启动命令

# 开发环境
docker compose --env-file .env.development up -d

# 生产环境
docker compose --env-file .env.production up -d

# 动态覆盖配置
DB_PASSWORD=secret docker compose up -d

2.2 多环境管理

2.2.1 Compose 文件继承与覆盖
# docker-compose.yml - 基础配置
version: '3.8'

services:
  web:
    image: myapp:latest
    environment:
      - APP_NAME=MyApp
    networks:
      - app-network

  db:
    image: postgres:15
    environment:
      - POSTGRES_DB=mydb
    volumes:
      - db-data:/var/lib/postgresql/data

volumes:
  db-data:

networks:
  app-network:
    driver: bridge
# docker-compose.override.yml - 开发环境覆盖
version: '3.8'

services:
  web:
    build: .
    volumes:
      - ./src:/app/src
      - ./logs:/app/logs
    environment:
      - DEBUG=true
      - LOG_LEVEL=debug
    ports:
      - "8080:8080"

  db:
    ports:
      - "5432:5432"
    environment:
      - POSTGRES_PASSWORD=dev_password
# docker-compose.prod.yml - 生产环境配置
version: '3.8'

services:
  web:
    image: myapp:1.2.3
    deploy:
      replicas: 4
      resources:
        limits:
          cpus: '2.0'
          memory: 2G
    environment:
      - DEBUG=false
      - LOG_LEVEL=warn
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost/health"]
      interval: 30s
      timeout: 10s
      retries: 3

  db:
    deploy:
      resources:
        limits:
          cpus: '4.0'
          memory: 8G
    environment:
      - POSTGRES_PASSWORD=${DB_PASSWORD}

使用方式

# 开发环境(自动加载 override)
docker compose up -d

# 生产环境
docker compose -f docker-compose.yml -f docker-compose.prod.yml up -d

# 测试环境
docker compose -f docker-compose.yml -f docker-compose.test.yml up -d

2.3 动态服务发现

2.3.1 Consul 集成方案
# docker-compose.consul.yml
version: '3.8'

services:
  # Consul service discovery
  consul:
    image: hashicorp/consul:1.16
    ports:
      - "8500:8500"
      - "8600:8600/udp"
    volumes:
      - consul-data:/consul/data
    command: agent -server -ui -bootstrap-expect=1 -client=0.0.0.0
    networks:
      - service-network

  # Consul Template (renders configs from the service catalog)
  # NOTE(review): the rendered /etc/nginx/nginx.conf lives inside THIS
  # container, while the nginx service below mounts its own ./nginx.conf
  # from the host — the generated file never reaches nginx. The
  # "nginx -s reload" hook also executes in this container, where no nginx
  # runs. A shared volume plus an out-of-band reload (docker exec / SIGHUP
  # via the Docker API) is needed for the loop to work — confirm wiring.
  consul-template:
    image: hashicorp/consul-template:latest
    volumes:
      - ./templates:/templates:ro
      - ./config:/config:ro
    command: >
      -consul consul:8500
      -template "/templates/nginx.conf.ctmpl:/etc/nginx/nginx.conf:nginx -s reload"
    networks:
      - service-network
    depends_on:
      - consul

  # Example service to register
  # NOTE(review): container labels are inert metadata — Consul does not
  # read them by itself; a registrator sidecar (e.g. gliderlabs/registrator)
  # is required for label-driven registration.
  backend-service:
    image: myapp:latest
    environment:
      - CONSUL_HTTP_ADDR=consul:8500
    labels:
      - "consul.register=true"
      - "consul.service.name=backend"
      - "consul.service.port=8080"
      - "consul.service.tags=api,backend"
      - "consul.service.check.http=http://localhost:8080/health"
      - "consul.service.check.interval=10s"
    networks:
      - service-network
    depends_on:
      - consul

  # Nginx (dynamic upstream configuration)
  nginx:
    image: nginx:alpine
    volumes:
      - ./nginx.conf:/etc/nginx/nginx.conf:ro
    ports:
      - "80:80"
    networks:
      - service-network
    depends_on:
      - consul-template

volumes:
  consul-data:

networks:
  service-network:
    driver: bridge

Consul Template 配置

# templates/nginx.conf.ctmpl
upstream backend {
{{range service "backend"}}
    server {{.Address}}:{{.Port}};
{{end}}
}

server {
    listen 80;
    
    location / {
        proxy_pass http://backend;
    }
}

3. 故障排查方法论

3.1 故障排查框架

3.1.1 系统化排查流程

故障报告 → 信息收集 → 问题定位 → 判断问题类型,按类型进入对应排查分支:

  • 启动失败 → 容器启动排查:查看容器日志 → 检查依赖服务 → 验证配置正确性
  • 网络问题 → 网络连通性排查:测试 DNS 解析 → 检查网络配置 → 验证防火墙规则
  • 数据问题 → 存储持久化排查:检查卷挂载 → 验证数据完整性 → 查看磁盘空间
  • 性能问题 → 资源瓶颈排查:监控资源使用 → 性能基准对比 → 识别瓶颈点

各分支汇合后的统一收尾流程:

制定解决方案 → 实施修复 → 验证效果 → 文档记录 → 故障复盘

3.1.2 故障排查清单
| 阶段 | 检查项 | 命令/工具 | 预期结果 |
|------|--------|-----------|----------|
| 信息收集 | 容器状态 | docker compose ps | 显示所有容器状态 |
| 信息收集 | 容器日志 | docker compose logs | 查看错误信息 |
| 信息收集 | 系统资源 | docker stats | CPU/内存使用率 |
| 问题定位 | 网络连通性 | docker network inspect | 网络配置正确 |
| 问题定位 | 存储挂载 | docker inspect -f '{{.Mounts}}' | 卷正常挂载 |
| 问题定位 | 依赖关系 | docker compose config | 配置无语法错误 |
| 深入诊断 | 进程信息 | docker top <container> | 进程正常运行 |
| 深入诊断 | 文件系统 | docker exec <container> df -h | 磁盘空间充足 |
| 深入诊断 | 网络连接 | docker exec <container> netstat -an | 端口监听正常 |
| 性能分析 | 资源限制 | docker inspect <container> | 资源配置合理 |
| 性能分析 | I/O 性能 | docker exec <container> iostat | I/O 延迟正常 |
| 性能分析 | 网络吞吐 | docker exec <container> iperf3 | 带宽达标 |

3.2 诊断工具集

3.2.1 内置诊断命令
#!/bin/bash
# docker-compose-diagnostic.sh — collect a full diagnostic snapshot of a
# Compose project: config validation, service status, network topology,
# volumes, resource stats, logs, health, port mappings, system info.
#
# Usage: docker-compose-diagnostic.sh [compose-file]   (default docker-compose.yml)

set -euo pipefail

COMPOSE_FILE="${1:-docker-compose.yml}"
OUTPUT_DIR="diagnostic-$(date +%Y%m%d-%H%M%S)"

mkdir -p "$OUTPUT_DIR"

echo "=== Docker Compose 诊断报告 ===" | tee "$OUTPUT_DIR/report.txt"
echo "时间:$(date)" | tee -a "$OUTPUT_DIR/report.txt"
echo "Compose 文件:$COMPOSE_FILE" | tee -a "$OUTPUT_DIR/report.txt"
echo "" | tee -a "$OUTPUT_DIR/report.txt"

# 1. Validate the Compose configuration
echo "[1/10] 验证 Compose 配置..." | tee -a "$OUTPUT_DIR/report.txt"
docker compose -f "$COMPOSE_FILE" config > "$OUTPUT_DIR/config.yml" 2>&1 || \
    echo "配置验证失败" | tee -a "$OUTPUT_DIR/report.txt"

# 2. Service status
echo "[2/10] 检查服务状态..." | tee -a "$OUTPUT_DIR/report.txt"
docker compose -f "$COMPOSE_FILE" ps > "$OUTPUT_DIR/service-status.txt" 2>&1

# 3. Network topology.
# Inspect container IDs from "ps -q", not service names: containers carry
# a project prefix (e.g. app_web_1), so "docker inspect <service>" fails
# and the original loop produced no data.
echo "[3/10] 分析网络拓扑..." | tee -a "$OUTPUT_DIR/report.txt"
docker compose -f "$COMPOSE_FILE" ps -q | while read -r cid; do
    name=$(docker inspect --format='{{.Name}}' "$cid" | tr -d '/')
    docker inspect --format='{{json .NetworkSettings.Networks}}' "$cid" \
        > "$OUTPUT_DIR/network-$name.txt" 2>&1 || true
done

# 4. Volume inventory
echo "[4/10] 检查卷配置..." | tee -a "$OUTPUT_DIR/report.txt"
docker volume ls > "$OUTPUT_DIR/volumes.txt" 2>&1

# 5. Resource usage (single snapshot)
echo "[5/10] 收集资源统计..." | tee -a "$OUTPUT_DIR/report.txt"
docker stats --no-stream > "$OUTPUT_DIR/resource-stats.txt" 2>&1

# 6. Recent logs from every service
echo "[6/10] 收集服务日志..." | tee -a "$OUTPUT_DIR/report.txt"
docker compose -f "$COMPOSE_FILE" logs --tail=100 > "$OUTPUT_DIR/logs.txt" 2>&1

# 7. Per-container environment (dependency hints) — same container-ID fix
echo "[7/10] 分析依赖关系..." | tee -a "$OUTPUT_DIR/report.txt"
docker compose -f "$COMPOSE_FILE" ps -q | while read -r cid; do
    name=$(docker inspect --format='{{.Name}}' "$cid" | tr -d '/')
    echo "=== $name 依赖 ===" >> "$OUTPUT_DIR/dependencies.txt"
    docker inspect --format='{{.Config.Env}}' "$cid" >> "$OUTPUT_DIR/dependencies.txt" 2>&1 || true
done

# 8. Health status per container (N/A when no healthcheck is defined)
echo "[8/10] 检查健康状态..." | tee -a "$OUTPUT_DIR/report.txt"
docker compose -f "$COMPOSE_FILE" ps -q | while read -r cid; do
    name=$(docker inspect --format='{{.Name}}' "$cid" | tr -d '/')
    health=$(docker inspect --format='{{.State.Health.Status}}' "$cid" 2>/dev/null || echo "N/A")
    echo "$name: $health" >> "$OUTPUT_DIR/health-status.txt"
done

# 9. Published port mappings
echo "[9/10] 检查端口映射..." | tee -a "$OUTPUT_DIR/report.txt"
docker ps --format "table {{.Names}}\t{{.Ports}}" > "$OUTPUT_DIR/port-mappings.txt" 2>&1

# 10. Host / daemon information
echo "[10/10] 收集系统信息..." | tee -a "$OUTPUT_DIR/report.txt"
docker info > "$OUTPUT_DIR/docker-info.txt" 2>&1
docker version > "$OUTPUT_DIR/docker-version.txt" 2>&1

echo "" | tee -a "$OUTPUT_DIR/report.txt"
echo "=== 诊断完成 ===" | tee -a "$OUTPUT_DIR/report.txt"
echo "结果目录:$OUTPUT_DIR" | tee -a "$OUTPUT_DIR/report.txt"
3.2.2 高级调试工具
# 容器网络调试
docker run --rm -it --network container:<target-container> nicolaka/netshoot

# 容器文件系统调试
docker run --rm -it --pid container:<target-container> alpine nsenter -t 1 -m -u -n -i sh

# 容器性能分析
docker run --rm -it --privileged --pid=host justincormack/nsenter1

# 网络抓包
docker run --rm -it --cap-add=NET_ADMIN --network container:<target-container> nicolaka/netshoot tcpdump -i any -w /tmp/capture.pcap

3.3 日志分析技术

3.3.1 日志聚合与过滤
#!/bin/bash
# log-analyzer.sh — offline log analysis: splits a captured log file into
# error/warning/event subsets and produces a summary report.
#
# Usage: log-analyzer.sh [log-file]   (defaults to logs.txt)

set -euo pipefail

LOG_FILE="${1:-logs.txt}"
OUTPUT_DIR="log-analysis-$(date +%Y%m%d-%H%M%S)"

mkdir -p $OUTPUT_DIR

echo "分析日志文件:$LOG_FILE"
echo "输出目录:$OUTPUT_DIR"

# 1. Extract error-level lines ("|| true" keeps "set -e" from aborting
# when grep finds no matches; the output file is still created by the
# redirection, so the summary's "wc -l" below never fails).
echo "[1/6] 提取错误日志..."
grep -i "error\|exception\|fatal\|critical" $LOG_FILE > $OUTPUT_DIR/errors.txt || true

# 2. Extract warning-level lines
echo "[2/6] 提取警告日志..."
grep -i "warn\|warning" $LOG_FILE > $OUTPUT_DIR/warnings.txt || true

# 3. Timeline histogram.
# NOTE(review): assumes the first three whitespace-separated fields of
# each line form the timestamp — confirm against the actual log format.
echo "[3/6] 生成时间线..."
awk '{print $1, $2, $3}' $LOG_FILE | sort | uniq -c | sort -rn > $OUTPUT_DIR/timeline.txt

# 4. Error-type frequency (counts bracketed tags such as "[ERROR]";
# requires GNU grep for -P)
echo "[4/6] 错误类型统计..."
grep -oP '\[.*?\]' $LOG_FILE | sort | uniq -c | sort -rn > $OUTPUT_DIR/error-types.txt || true

# 5. Key lifecycle events
echo "[5/6] 提取关键事件..."
grep -i "start\|stop\|restart\|fail\|success" $LOG_FILE > $OUTPUT_DIR/events.txt || true

# 6. Summary report
echo "[6/6] 生成摘要报告..."
cat > $OUTPUT_DIR/summary.txt <<EOF
日志分析摘要
============
分析时间:$(date)
日志文件:$LOG_FILE
总行数:$(wc -l < $LOG_FILE)
错误数:$(wc -l < $OUTPUT_DIR/errors.txt)
警告数:$(wc -l < $OUTPUT_DIR/warnings.txt)

Top 5 错误类型:
$(head -5 $OUTPUT_DIR/error-types.txt)

关键事件数量:
$(wc -l < $OUTPUT_DIR/events.txt)
EOF

echo "分析完成!"
3.3.2 实时日志监控
#!/bin/bash
# real-time-log-monitor.sh — follow the logs of the given compose services
# and colour-code each line by severity (red=error, yellow=warn, green=rest).
#
# Usage: real-time-log-monitor.sh service1 [service2 ...]

SERVICES=("$@")

if [ ${#SERVICES[@]} -eq 0 ]; then
    echo "用法:$0 [service1] [service2] ..."
    exit 1
fi

# ANSI colour codes
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color

echo "实时监控日志 (Ctrl+C 停止)..."
echo "监控服务:${SERVICES[*]}"
echo ""

# Stream logs and highlight keywords.
# The services are passed directly as arguments: the original built a
# command string and ran it through "eval", which breaks (or allows
# injection) on names containing shell metacharacters. "IFS= read -r"
# preserves leading whitespace and backslashes in log lines.
docker compose logs -f --tail=50 "${SERVICES[@]}" 2>&1 | while IFS= read -r line; do
    if echo "$line" | grep -qi "error\|exception\|fatal"; then
        echo -e "${RED}$line${NC}"
    elif echo "$line" | grep -qi "warn"; then
        echo -e "${YELLOW}$line${NC}"
    else
        echo -e "${GREEN}$line${NC}"
    fi
done

4. 常见故障案例库

4.1 容器启动失败

4.1.1 案例:依赖服务未就绪

故障现象

$ docker compose up -d
Creating network "app-network" with the default driver
Creating app_db_1 ... done
Creating app_web_1 ... done

$ docker compose ps
NAME           STATUS
app_db_1       Up (health: starting)
app_web_1      Restarting (1) 2 seconds ago

Web 容器日志

2024-01-15 10:30:15 ERROR: Connection refused to db:5432
2024-01-15 10:30:16 FATAL: Cannot connect to database
2024-01-15 10:30:16 Container exiting

诊断步骤

# 1. 检查数据库健康状态
docker inspect --format='{{.State.Health.Status}}' app_db_1
# 输出:starting(说明数据库还未就绪)

# 2. 查看数据库启动日志
docker logs app_db_1
# 输出:database system is starting up

# 3. 手动连接测试
docker exec app_db_1 psql -h localhost -U postgres -c "SELECT 1"
# 错误:connection refused

根本原因

  • Web 服务启动时,数据库还未完全就绪
  • depends_on 只保证启动顺序,不保证就绪状态

解决方案

# 方案 1:使用健康检查条件
version: '3.8'

services:
  db:
    image: postgres:15
    healthcheck:
      test: ["CMD-SHELL", "pg_isready -U postgres"]
      interval: 5s
      timeout: 3s
      retries: 5

  web:
    depends_on:
      db:
        condition: service_healthy  # 等待健康检查通过
# 方案 2:添加重试机制
services:
  web:
    image: myapp:latest
    command: >
      sh -c "until pg_isready -h db -U postgres; do
               echo 'Waiting for database...';
               sleep 2;
             done;
             npm start"
    depends_on:
      - db
# 方案 3:使用 wait-for-it 脚本
services:
  web:
    image: myapp:latest
    command: >
      bash -c "./wait-for-it.sh db:5432 --timeout=60 --strict --
               npm start"
    depends_on:
      - db

wait-for-it.sh 脚本

#!/usr/bin/env bash
# wait-for-it.sh — block until a TCP endpoint accepts connections, then
# exec the remaining arguments as a command.
#
# Usage: wait-for-it.sh host:port [command args...]
# NOTE: extra flags like --timeout/--strict are NOT supported by this
# minimal version; everything after host:port is treated as the command.

set -e

hostport="$1"
shift
cmd=("$@")

# Split "host:port": nc needs them as two separate arguments. The original
# passed the whole "db:5432" string as the hostname, which never resolves,
# so the loop spun forever.
host="${hostport%%:*}"
port="${hostport##*:}"

until nc -z "$host" "$port" 2>&1; do
  echo "Waiting for $hostport..."
  sleep 2
done

echo "$hostport is available, starting application"
# Exec the command as an array — the original's unquoted `exec $cmd`
# re-split the string on whitespace and broke arguments containing spaces.
exec "${cmd[@]}"

4.2 网络连接问题

4.2.1 案例:容器间无法通信

故障现象

# Web 容器无法访问数据库容器
$ docker exec app_web_1 ping db
ping: bad address 'db'

$ docker exec app_web_1 nslookup db
nslookup: can't resolve 'db'

诊断步骤

# 1. 检查网络配置
docker network inspect app-network

# 2. 查看容器网络连接
docker inspect --format='{{range $k, $v := .NetworkSettings.Networks}}{{$k}}{{end}}' app_web_1
# 输出:app-network

docker inspect --format='{{range $k, $v := .NetworkSettings.Networks}}{{$k}}{{end}}' app_db_1
# 输出:(空,说明数据库不在同一网络)

# 3. 检查 DNS 配置
docker exec app_web_1 cat /etc/resolv.conf

根本原因

  • Web 和数据库容器不在同一个 Docker 网络
  • DNS 服务发现失效

解决方案

# 确保所有服务在同一网络
version: '3.8'

services:
  web:
    image: myapp:latest
    networks:
      - app-network  # 显式指定网络
  
  db:
    image: postgres:15
    networks:
      - app-network  # 同一网络

networks:
  app-network:
    driver: bridge

修复网络

# 手动连接容器到网络
docker network connect app-network app_db_1

# 验证连通性
docker exec app_web_1 ping -c 3 db
# 输出:PING db (172.20.0.3): 56 data bytes

4.3 数据持久化故障

4.3.1 案例:容器重启后数据丢失

故障现象

# 写入数据
docker exec app_db_1 psql -U postgres -c "CREATE TABLE test (id INT);"

# 重启容器
docker compose restart db

# 数据丢失
docker exec app_db_1 psql -U postgres -c "\dt"
# 输出:Did not find any relations

诊断步骤

# 1. 检查卷挂载
docker inspect app_db_1 | grep -A 10 Mounts

# 2. 查看卷信息
docker volume ls
docker volume inspect app_db-data

# 3. 检查文件系统
docker exec app_db_1 df -h /var/lib/postgresql/data

根本原因

  • 未配置持久化卷,数据存储在容器可写层
  • 容器重建后,数据丢失

解决方案

version: '3.8'

services:
  db:
    image: postgres:15
    volumes:
      - db-data:/var/lib/postgresql/data  # 命名卷持久化
      # 或绑定挂载
      # - /host/path:/var/lib/postgresql/data:z

volumes:
  db-data:
    driver: local
    driver_opts:
      type: none
      device: /mnt/ssd/postgres  # 指定存储位置
      o: bind

数据恢复

# 1. 备份数据
docker run --rm \
  -v app-db-data:/source:ro \
  -v $(pwd):/backup \
  alpine tar czf /backup/db-backup.tar.gz -C /source .

# 2. 恢复数据
docker run --rm \
  -v app-db-data:/target \
  -v $(pwd):/backup \
  alpine tar xzf /backup/db-backup.tar.gz -C /target

# 3. 验证数据
docker exec app_db_1 psql -U postgres -c "\dt"

4.4 资源竞争与泄漏

4.4.1 案例:内存泄漏导致 OOM

故障现象

$ docker compose ps
NAME           STATUS
app_web_1      Restarting (13) 5 seconds ago

$ docker inspect app_web_1 | grep -A 5 State
"OOMKilled": true,
"ExitCode": 137

诊断步骤

# 1. 监控内存使用
docker stats app_web_1 --no-stream

# 2. 查看内存趋势
for i in {1..10}; do
  docker stats app_web_1 --no-stream --format "table {{.MemUsage}}"
  sleep 5
done

# 3. 分析内存泄漏
docker exec app_web_1 cat /proc/1/status | grep -i "VmRSS\|VmSize"

根本原因

  • 应用代码存在内存泄漏
  • 未设置内存限制,导致耗尽宿主机资源

解决方案

version: '3.8'

services:
  web:
    image: myapp:latest
    deploy:
      resources:
        limits:
          memory: 1G  # 硬限制
        reservations:
          memory: 512M  # 软限制
    memswap_limit: 1.5G  # 内存+Swap
    mem_swappiness: 0  # 禁用 Swap

内存泄漏调试

# 1. 生成内存快照
docker exec app_web_1 kill -SIGUSR1 $(pgrep node)

# 2. 分析堆内存
docker exec app_web_1 node --inspect --expose-gc app.js

# 3. 使用工具分析
docker run --rm -it \
  -v /var/run/docker.sock:/var/run/docker.sock \
  docker.io/library/node:latest \
  node --inspect app.js

5. 性能故障深度分析

5.1 CPU 瓶颈诊断

5.1.1 案例:CPU 使用率持续 100%

故障现象

$ docker stats
CONTAINER ID   NAME        CPU %     MEM USAGE / LIMIT
abc123         app_web_1   198.5%    512MiB / 1GiB

诊断流程

# 1. 识别热点进程
docker top app_web_1

# 2. 进入容器诊断
docker exec -it app_web_1 sh
top -H -p 1  # 查看线程级 CPU 使用

# 3. 生成性能分析
docker exec app_web_1 sh -c "
  for i in \$(seq 1 5); do
    ps -eo pid,ppid,pcpu,cmd --sort=-pcpu | head -10
    sleep 2
  done
"

# 4. 使用 perf 分析
docker run --rm -it --pid container:app_web_1 \
  --cap-add SYS_PTRACE alpine \
  sh -c "apk add perf && perf top -p 1"

优化方案

version: '3.8'

services:
  web:
    image: myapp:latest
    # CPU 限制
    deploy:
      resources:
        limits:
          cpus: '2.0'
    # CPU 绑定(减少上下文切换)
    cpuset_cpus: "0,1"
    # 调整优先级
    cpu_shares: 1024

5.2 内存泄漏排查

5.2.1 系统性排查方法
#!/bin/bash
# memory-leak-detector.sh — periodically sample a container's memory usage
# and warn when it grows steadily across a sliding window of samples.
#
# Usage: memory-leak-detector.sh <container>
# Requires: docker, bc

CONTAINER="$1"
MONITOR_DURATION=300   # total monitoring time in seconds
SAMPLE_INTERVAL=10     # seconds between samples
WINDOW=6               # samples compared when estimating growth (~60s)
THRESHOLD=10           # percentage-point growth that triggers a warning

echo "监控容器:$CONTAINER"
echo "监控时长:${MONITOR_DURATION}s"

MEMORY_SAMPLES=()

for ((i = 0; i < MONITOR_DURATION / SAMPLE_INTERVAL; i++)); do
    MEM_USAGE=$(docker stats --no-stream --format "{{.MemUsage}}" "$CONTAINER" | \
                awk -F'/' '{print $1}' | tr -d ' ')

    MEM_PERC=$(docker stats --no-stream --format "{{.MemPerc}}" "$CONTAINER")

    MEMORY_SAMPLES+=("$MEM_PERC")

    echo "[$(date +'%H:%M:%S')] 内存:$MEM_USAGE ($MEM_PERC)"

    # Leak heuristic: compare the newest sample with the one WINDOW samples
    # back. (The original always compared against the very first sample of
    # the run, so one early spike kept the warning firing on every later
    # iteration, and its message claimed "5 minutes" for a ~60s window.)
    count=${#MEMORY_SAMPLES[@]}
    if [ "$count" -ge "$WINDOW" ]; then
        first=${MEMORY_SAMPLES[count - WINDOW]%\%}   # strip trailing '%'
        last=${MEMORY_SAMPLES[count - 1]%\%}
        growth=$(echo "$last - $first" | bc)

        if (( $(echo "$growth > $THRESHOLD" | bc -l) )); then
            echo "⚠️  警告:检测到内存泄漏(最近 ${WINDOW} 个采样内增长${growth}%)"

            # Capture an in-container snapshot for later analysis
            docker exec "$CONTAINER" sh -c "
                echo '=== 进程信息 ==='
                ps aux --sort=-%mem | head -10
                echo ''
                echo '=== 内存映射 ==='
                cat /proc/1/status | grep -i 'VmRSS\|VmSize\|VmPeak'
            "
        fi
    fi

    sleep $SAMPLE_INTERVAL
done

5.3 磁盘 I/O 瓶颈

5.3.1 I/O 性能诊断
# 1. 实时 I/O 监控
docker exec app_db_1 sh -c "
  while true; do
    iostat -x 1 1 | grep -E 'Device|sda'
    sleep 5
  done
"

# 2. 识别 I/O 密集型进程
docker exec app_db_1 sh -c "
  pidstat -d 1 5
"

# 3. 测试卷性能
docker run --rm -v app-db-data:/test alpine \
  fio --name=iops --ioengine=psync --direct=1 \
  --rw=randread --bs=4k --size=100M --numjobs=4 \
  --group_reporting

优化方案

version: '3.8'

services:
  db:
    image: postgres:15
    volumes:
      # Long-syntax named volume backed by a fast SSD (see driver_opts
      # below). The original also set "bind.propagation: cached" here,
      # but bind options are only valid on type: bind mounts and make a
      # type: volume mount fail schema validation, so it is removed.
      - type: volume
        source: db-data
        target: /var/lib/postgresql/data
        volume:
          nocopy: true

    # I/O throttling so the database cannot starve co-located services.
    # The Compose specification places device rate limits under
    # blkio_config — bare service-level device_read_bps/device_write_bps
    # keys are not part of the schema and are rejected.
    blkio_config:
      device_read_bps:
        - path: /dev/sda
          rate: '100mb'
      device_write_bps:
        - path: /dev/sda
          rate: '100mb'

volumes:
  db-data:
    driver: local
    driver_opts:
      type: none
      device: /mnt/nvme/postgres  # NVMe SSD
      o: bind

5.4 网络延迟问题

5.4.1 网络性能诊断
# 1. 网络延迟测试
# BusyBox nc 监听需要 -l -p <端口> 的写法,"nc -l 8080" 会被当成地址参数
docker network create test-network
docker run -d --name server --network test-network alpine nc -l -p 8080
docker run --rm --network test-network alpine \
  sh -c "time nc server 8080 < /dev/null"

# 2. 带宽测试(镜像入口即 iperf3,客户端直接追加参数)
docker run -d --name iperf-server --network test-network networkstatic/iperf3
docker run --rm --network test-network networkstatic/iperf3 \
  -c iperf-server -t 10 -P 4

# 3. DNS 解析延迟
docker run --rm alpine \
  sh -c "time nslookup google.com"

# 4. 网络路径追踪(ICMP 探测需要 NET_RAW 能力)
docker run --rm --cap-add NET_RAW alpine \
  traceroute -n google.com

网络优化配置

version: '3.8'

services:
  web:
    image: nginx:alpine
    networks:
      - optimized-network

    # 网络参数调优(均为网络命名空间内的 sysctl,容器级设置合法;
    # tcp_rmem/tcp_wmem 按命名空间隔离需要内核 >= 4.15)
    sysctls:
      - net.core.somaxconn=65535
      - net.ipv4.tcp_tw_reuse=1
      - net.ipv4.tcp_rmem=4096,131072,16777216
      - net.ipv4.tcp_wmem=4096,131072,16777216

networks:
  optimized-network:
    driver: bridge
    driver_opts:
      # driver_opts 的值要求是字符串,裸写 1500 会被 YAML 解析成整数,
      # 部分 Compose 版本会直接报类型错误
      com.docker.network.driver.mtu: "1500"
    ipam:
      config:
        - subnet: 172.22.0.0/16
          gateway: 172.22.0.1

6. 实战演练

6.1 综合故障排查演练

场景设定
电商系统在促销活动期间出现响应缓慢,部分服务频繁重启。

演练环境

# 启动故障环境
docker compose -f docker-compose.seckill.yml up -d

# 注入故障(模拟磁盘 I/O 压力)
# 用 docker exec -d 让命令在容器内后台运行;原写法末尾的 & 只是把本机的
# docker exec 客户端放到 shell 后台,终端关闭后注入即停止。
# 清理:docker exec app_order-service_1 pkill dd
docker exec -d app_order-service_1 sh -c "
  while true; do
    dd if=/dev/zero of=/tmp/test bs=1M count=100
    sleep 5
  done
"

排查任务

  1. 信息收集(5 分钟)

    • 运行诊断脚本
    • 收集所有服务日志
    • 记录资源使用情况
  2. 问题定位(10 分钟)

    • 识别异常服务
    • 分析依赖关系
    • 确定故障传播链
  3. 制定方案(5 分钟)

    • 临时缓解措施
    • 根本解决方案
    • 预防措施
  4. 实施修复(10 分钟)

    • 执行修复方案
    • 验证效果
    • 监控系统状态

预期成果

  • 故障排查报告
  • 优化建议清单
  • 监控告警规则

7. 总结

7.1 核心技术要点

  1. 多容器编排:掌握复杂应用架构的编排技巧
  2. 高级配置:灵活运用条件化配置、多环境管理
  3. 故障排查:建立系统化的排查方法论
  4. 性能优化:识别并解决 CPU、内存、磁盘、网络瓶颈

7.2 最佳实践清单

故障预防

  • 配置健康检查和自动重启
  • 设置资源限制防止竞争
  • 使用持久化卷保证数据安全
  • 建立完善的监控告警体系

故障排查

  • 遵循系统化排查流程
  • 善用诊断工具集
  • 详细记录故障现象和处理过程
  • 定期复盘和知识沉淀

性能优化

  • 建立性能基准
  • 持续监控关键指标
  • 定期性能测试
  • 根据业务特点调优配置

附录 A:故障排查清单

A.1 启动失败排查清单

  • 检查 Docker Compose 配置语法
  • 验证镜像是否存在/可拉取
  • 检查依赖服务是否就绪
  • 查看容器日志定位错误
  • 验证端口映射是否冲突
  • 检查卷挂载路径权限
  • 确认资源配置(CPU/内存)是否合理
  • 检查网络连接配置

A.2 网络问题排查清单

  • 验证容器是否在同一网络
  • 检查 DNS 解析是否正常
  • 测试容器间连通性(ping/telnet)
  • 查看防火墙规则
  • 检查端口监听状态
  • 验证网络驱动配置
  • 排查 IP 地址冲突

A.3 数据持久化排查清单

  • 确认卷配置正确
  • 检查卷挂载状态
  • 验证数据目录权限
  • 查看磁盘空间使用
  • 测试数据读写
  • 检查备份策略
  • 验证数据完整性

附录 B:诊断脚本集合

B.1 快速诊断脚本

#!/bin/bash
# quick-diagnostic.sh - 60 秒快速诊断
# 依次输出:服务状态、资源使用、错误日志、健康状态、网络信息、磁盘使用。

set -euo pipefail

echo "=== Docker Compose 快速诊断 ==="
echo ""

# 1. 服务状态(10 秒)
echo "[1/6] 服务状态:"
docker compose ps

# 2. 资源使用(10 秒)
echo ""
echo "[2/6] 资源使用:"
docker stats --no-stream --format "table {{.Name}}\t{{.CPUPerc}}\t{{.MemUsage}}"

# 3. 最近错误日志(10 秒)
echo ""
echo "[3/6] 最近错误日志:"
docker compose logs --tail=20 2>&1 | grep -i "error\|exception\|fatal" | tail -10 || echo "无错误日志"

# 4. 健康检查(10 秒)
# 用 ps -q 取容器 ID,避免解析表格输出(表头/列宽随版本变化不可靠);
# 未配置 healthcheck 的容器 inspect 模板报错,回退显示 N/A
echo ""
echo "[4/6] 健康状态:"
docker compose ps -q | while read -r cid; do
    name=$(docker inspect --format='{{.Name}}' "$cid" | sed 's|^/||')
    health=$(docker inspect --format='{{.State.Health.Status}}' "$cid" 2>/dev/null || echo "N/A")
    echo "  $name: $health"
done

# 5. 网络连通性(10 秒)
# docker inspect 只接受容器名/ID,不能直接传服务名——先用
# docker compose ps -q <service> 把服务解析成容器 ID
echo ""
echo "[5/6] 网络检查:"
docker compose config --services | head -3 | while read -r service; do
    cid=$(docker compose ps -q "$service" 2>/dev/null | head -1)
    if [ -n "$cid" ]; then
        ip=$(docker inspect --format='{{range .NetworkSettings.Networks}}{{.IPAddress}} {{end}}' "$cid")
    else
        ip="N/A"
    fi
    echo "  $service: $ip"
done

# 6. 磁盘空间(10 秒)
echo ""
echo "[6/6] 磁盘使用:"
docker system df

echo ""
echo "=== 诊断完成 ==="

B.2 日志收集脚本

#!/bin/bash
# log-collector.sh - 完整日志收集
# 把 Compose 配置、各服务/容器日志与 Docker 诊断信息打包成一个 tar.gz。

set -euo pipefail

OUTPUT_DIR="logs-$(date +%Y%m%d-%H%M%S)"
mkdir -p "$OUTPUT_DIR"

echo "收集日志到:$OUTPUT_DIR"

# 1. Compose 配置(插值、合并后的最终配置)
docker compose config > "$OUTPUT_DIR/compose-config.yml"

# 2. 服务日志
docker compose logs > "$OUTPUT_DIR/all-services.log"

# 3. 按服务分离日志
# set -e 下单个服务失败会中断整个收集,这里用 || true 保证尽量收齐
docker compose config --services | while read -r service; do
    docker compose logs "$service" > "$OUTPUT_DIR/${service}.log" || true
done

# 4. 容器日志
docker ps -a --format "{{.Names}}" | while read -r container; do
    docker logs "$container" > "$OUTPUT_DIR/container-${container}.log" 2>&1 || true
done

# 5. 系统日志(非 systemd 主机上允许失败)
journalctl -u docker > "$OUTPUT_DIR/docker-system.log" 2>&1 || true

# 6. 诊断信息
docker info > "$OUTPUT_DIR/docker-info.txt"
docker version > "$OUTPUT_DIR/docker-version.txt"
docker compose version > "$OUTPUT_DIR/compose-version.txt"

# 7. 打包并清理临时目录
tar czf "${OUTPUT_DIR}.tar.gz" "$OUTPUT_DIR"
rm -rf "$OUTPUT_DIR"

echo "日志收集完成:${OUTPUT_DIR}.tar.gz"

文档版本: V1.0
最后更新: 2026-03-12
作者: AI 技术助手
许可协议: CC BY-SA 4.0

Logo

腾讯云面向开发者汇聚海量精品云计算使用和开发经验,营造开放的云计算技术生态圈。

更多推荐