ES故障处理之-too many files open
本文记录一次 Elasticsearch 因进程打开文件数(nofile)达到上限("Too many open files")导致分片分配失败的故障排查与处理过程:先通过 allocation explain 定位原因,再调整系统 nofile 限制并重启节点,最后手动触发分片重新分配恢复集群。
curl -u remote_monitoring_user:remotemonitoringuser123 -k "http://192.168.1.1:9200/_cluster/health"
curl -u remote_monitoring_user:remotemonitoringuser123 -k "http://192.168.1.1:9200/_cat/shards?v&h=index,shard,prirep,state,node,unassigned.reason&s=state"
curl -u remote_monitoring_user:remotemonitoringuser123 -k "http://192.168.1.1:9200/_cluster/allocation/explain"
{
"index":"log-platform-20260420",
"shard":1,
"primary":false,
"current_state":"unassigned",
"unassigned_info":{
"reason":"ALLOCATION_FAILED",
"at":"2026-04-19T23:37:55.984Z",
"failed_allocation_attempts":1,
"details":"failed shard on node [j4vZ7bM_QzOaqkYNveOF1g]: failed to perform indices:data/write/bulk[s] on replica [log-platform-20260420][1], node[j4vZ7bM_QzOaqkYNveOF1g], [R], s[STARTED], a[id=NEfARmg0Q_Kmi5cN2lyIfg], failure IndexShardClosedException[CurrentState[CLOSED] Primary closed.]",
"last_allocation_status":"no_attempt"
},
"can_allocate":"no",
"allocate_explanation":"cannot allocate because allocation is not permitted to any of the nodes",
"node_allocation_decisions":[
{
"node_id":"G8CcSTiEQCC6s3XAUqybtA",
"node_name":"master_data_02",
"transport_address":"192.168.1.2:9300",
"node_attributes":{
"ml.machine_memory":"67449708544",
"ml.max_open_jobs":"20",
"xpack.installed":"true",
"transform.node":"true"
},
"node_decision":"no",
"deciders":[
{
"decider":"replica_after_primary_active",
"decision":"NO",
"explanation":"primary shard for this replica is not yet active"
},
{
"decider":"throttling",
"decision":"NO",
"explanation":"primary shard for this replica is not yet active"
}
]
},
{
"node_id":"j4vZ7bM_QzOaqkYNveOF1g",
"node_name":"master_data_03",
"transport_address":"192.168.1.3:9300",
"node_attributes":{
"ml.machine_memory":"67449708544",
"ml.max_open_jobs":"20",
"xpack.installed":"true",
"transform.node":"true"
},
"node_decision":"no",
"deciders":[
{
"decider":"replica_after_primary_active",
"decision":"NO",
"explanation":"primary shard for this replica is not yet active"
},
{
"decider":"throttling",
"decision":"NO",
"explanation":"primary shard for this replica is not yet active"
}
]
},
{
"node_id":"zOU7QHdXSiG4iVlh5voNDA",
"node_name":"master_data_01",
"transport_address":"192.168.1.1:9300",
"node_attributes":{
"ml.machine_memory":"67449708544",
"xpack.installed":"true",
"transform.node":"true",
"ml.max_open_jobs":"20"
},
"node_decision":"no",
"deciders":[
{
"decider":"replica_after_primary_active",
"decision":"NO",
"explanation":"primary shard for this replica is not yet active"
},
{
"decider":"throttling",
"decision":"NO",
"explanation":"primary shard for this replica is not yet active"
}
]
}
]
}
curl -u remote_monitoring_user:remotemonitoringuser123 -k "http://192.168.1.1:9200/_cluster/allocation/explain" -H 'Content-Type: application/json' -d '
{
"index": "log-platform-20260420",
"shard": 1,
"primary": true
}'
{
"index":"log-platform-20260420",
"shard":1,
"primary":true,
"current_state":"unassigned",
"unassigned_info":{
"reason":"ALLOCATION_FAILED",
"at":"2026-04-19T23:38:01.168Z",
"failed_allocation_attempts":3,
"details":"failed shard on node [G8CcSTiEQCC6s3XAUqybtA]: failed recovery, failure RecoveryFailedException[[log-platform-20260420][1]: Recovery failed on {master_data_02}{G8CcSTiEQCC6s3XAUqybtA}{hui2XAFAS3G_BiqBMQGpmA}{192.168.1.2}{192.168.1.2:9300}{cdhilmrstw}{ml.machine_memory=67449708544, xpack.installed=true, transform.node=true, ml.max_open_jobs=20}]; nested: IndexShardRecoveryException[failed to recover from gateway]; nested: EngineCreationFailureException[failed to open reader on writer]; nested: FileSystemException[/data/es_9200/data/nodes/0/indices/5vG9du0wS2qZkzo6mAOORA/1/index/_nk.fdx: Too many open files]; ",
"last_allocation_status":"no_valid_shard_copy"
},
"can_allocate":"no_valid_shard_copy",
"allocate_explanation":"cannot allocate because all found copies of the shard are either stale or corrupt",
"node_allocation_decisions":[
{
"node_id":"G8CcSTiEQCC6s3XAUqybtA",
"node_name":"master_data_02",
"transport_address":"192.168.1.2:9300",
"node_attributes":{
"ml.machine_memory":"67449708544",
"ml.max_open_jobs":"20",
"xpack.installed":"true",
"transform.node":"true"
},
"node_decision":"no",
"store":{
"in_sync":true,
"allocation_id":"j7odwUERTZ-3BYmm926lMA",
"store_exception":{
"type":"corrupt_index_exception",
"reason":"failed engine (reason: [corrupt file (source: [start])]) (resource=preexisting_corruption)",
"caused_by":{
"type":"i_o_exception",
"reason":"failed engine (reason: [corrupt file (source: [start])])",
"caused_by":{
"type":"corrupt_index_exception",
"reason":"checksum passed (3583f834). possibly transient resource issue, or a Lucene or JVM bug (resource=BufferedChecksumIndexInput(NIOFSIndexInput(path="/data/es_9200/data/nodes/0/indices/5vG9du0wS2qZkzo6mAOORA/1/index/_nk.fdm")))"
}
}
}
}
},
{
"node_id":"j4vZ7bM_QzOaqkYNveOF1g",
"node_name":"master_data_03",
"transport_address":"192.168.1.3:9300",
"node_attributes":{
"ml.machine_memory":"67449708544",
"ml.max_open_jobs":"20",
"xpack.installed":"true",
"transform.node":"true"
},
"node_decision":"no",
"store":{
"in_sync":false,
"allocation_id":"NEfARmg0Q_Kmi5cN2lyIfg"
}
},
{
"node_id":"zOU7QHdXSiG4iVlh5voNDA",
"node_name":"master_data_01",
"transport_address":"192.168.1.1:9300",
"node_attributes":{
"ml.machine_memory":"67449708544",
"xpack.installed":"true",
"transform.node":"true",
"ml.max_open_jobs":"20"
},
"node_decision":"no",
"store":{
"found":false
}
}
]
}
vim /etc/security/limits.conf
* soft nofile 655360
* hard nofile 655360
* soft nproc 655350
* hard nproc 655350
* soft memlock unlimited
* hard memlock unlimited
#需要重启ES,让 nofile配置的值生效。
cat /proc/658/limits | grep "open files"
Max open files 655360 655360 files
cat /proc/11401/limits | grep "open files"
Max open files 65536 65536 files
cat /proc/27058/limits | grep "open files"
Max open files 65536 65536 files
#分片重新分配:使用 allocate_empty_primary 以空分片形式强制分配主分片;注意该命令会丢弃此分片上已有的数据,accept_data_loss=true 即表示接受这一点。
curl -XPOST -u elastic:elastic -k "http://192.168.1.1:9200/_cluster/reroute" -H 'Content-Type: application/json' -d '
{
"commands": [
{
"allocate_empty_primary": {
"index": "log-platform-20260420",
"shard": 1,
"node": "master_data_01",
"accept_data_loss": true
}
}
]
}'
#修复正常。
curl -u remote_monitoring_user:remotemonitoringuser123 -k "http://192.168.1.1:9200/_cat/health"
1776649799 01:49:59 es-elk-common-gm01 green 3 3 1672 836 2 0 0 0 - 100.0%
#依次重启其余节点,让 nofile 配置生效。确认 log-platform-20260420 索引的数据没有丢失。
curl -u remote_monitoring_user:remotemonitoringuser123 -k "http://192.168.1.1:9200/_cat/indices" |grep log-platform-20260420
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 82524  100 82524    0     0   182k      0 --:--:-- --:--:-- --:--:--  183k
yellow open log-platform-20260420 5vG9du0wS2qZkzo6mAOORA 3 1 8597809 0 3.9gb 3.8gb
更多推荐
所有评论(0)