# Check overall cluster health first.
curl -u remote_monitoring_user:remotemonitoringuser123 -k "http://192.168.1.1:9200/_cluster/health"
# List shards sorted by state to surface the unassigned ones.
curl -u remote_monitoring_user:remotemonitoringuser123 -k "http://192.168.1.1:9200/_cat/shards?v&h=index,shard,prirep,state,node,unassigned.reason&s=state"
# With no request body, allocation explain reports the first unassigned shard it finds:
curl -u remote_monitoring_user:remotemonitoringuser123 -k "http://192.168.1.1:9200/_cluster/allocation/explain"
{
    "index":"log-platform-20260420",
    "shard":1,
    "primary":false,
    "current_state":"unassigned",
    "unassigned_info":{
        "reason":"ALLOCATION_FAILED",
        "at":"2026-04-19T23:37:55.984Z",
        "failed_allocation_attempts":1,
        "details":"failed shard on node [j4vZ7bM_QzOaqkYNveOF1g]: failed to perform indices:data/write/bulk[s] on replica [log-platform-20260420][1], node[j4vZ7bM_QzOaqkYNveOF1g], [R], s[STARTED], a[id=NEfARmg0Q_Kmi5cN2lyIfg], failure IndexShardClosedException[CurrentState[CLOSED] Primary closed.]",
        "last_allocation_status":"no_attempt"
    },
    "can_allocate":"no",
    "allocate_explanation":"cannot allocate because allocation is not permitted to any of the nodes",
    "node_allocation_decisions":[
        {
            "node_id":"G8CcSTiEQCC6s3XAUqybtA",
            "node_name":"master_data_02",
            "transport_address":"192.168.1.2:9300",
            "node_attributes":{
                "ml.machine_memory":"67449708544",
                "ml.max_open_jobs":"20",
                "xpack.installed":"true",
                "transform.node":"true"
            },
            "node_decision":"no",
            "deciders":[
                {
                    "decider":"replica_after_primary_active",
                    "decision":"NO",
                    "explanation":"primary shard for this replica is not yet active"
                },
                {
                    "decider":"throttling",
                    "decision":"NO",
                    "explanation":"primary shard for this replica is not yet active"
                }
            ]
        },
        {
            "node_id":"j4vZ7bM_QzOaqkYNveOF1g",
            "node_name":"master_data_03",
            "transport_address":"192.168.1.3:9300",
            "node_attributes":{
                "ml.machine_memory":"67449708544",
                "ml.max_open_jobs":"20",
                "xpack.installed":"true",
                "transform.node":"true"
            },
            "node_decision":"no",
            "deciders":[
                {
                    "decider":"replica_after_primary_active",
                    "decision":"NO",
                    "explanation":"primary shard for this replica is not yet active"
                },
                {
                    "decider":"throttling",
                    "decision":"NO",
                    "explanation":"primary shard for this replica is not yet active"
                }
            ]
        },
        {
            "node_id":"zOU7QHdXSiG4iVlh5voNDA",
            "node_name":"master_data_01",
            "transport_address":"192.168.1.1:9300",
            "node_attributes":{
                "ml.machine_memory":"67449708544",
                "xpack.installed":"true",
                "transform.node":"true",
                "ml.max_open_jobs":"20"
            },
            "node_decision":"no",
            "deciders":[
                {
                    "decider":"replica_after_primary_active",
                    "decision":"NO",
                    "explanation":"primary shard for this replica is not yet active"
                },
                {
                    "decider":"throttling",
                    "decision":"NO",
                    "explanation":"primary shard for this replica is not yet active"
                }
            ]
        }
    ]
}



Both deciders reject the replica for the same reason: its primary is not active. So the next step is to explain the primary copy of shard 1 directly:
curl -u remote_monitoring_user:remotemonitoringuser123 -k "http://192.168.1.1:9200/_cluster/allocation/explain" -H 'Content-Type: application/json' -d '
{
  "index": "log-platform-20260420",
  "shard": 1,
  "primary": true
}'
{
    "index":"log-platform-20260420",
    "shard":1,
    "primary":true,
    "current_state":"unassigned",
    "unassigned_info":{
        "reason":"ALLOCATION_FAILED",
        "at":"2026-04-19T23:38:01.168Z",
        "failed_allocation_attempts":3,
        "details":"failed shard on node [G8CcSTiEQCC6s3XAUqybtA]: failed recovery, failure RecoveryFailedException[[log-platform-20260420][1]: Recovery failed on {master_data_02}{G8CcSTiEQCC6s3XAUqybtA}{hui2XAFAS3G_BiqBMQGpmA}{192.168.1.2}{192.168.1.2:9300}{cdhilmrstw}{ml.machine_memory=67449708544, xpack.installed=true, transform.node=true, ml.max_open_jobs=20}]; nested: IndexShardRecoveryException[failed to recover from gateway]; nested: EngineCreationFailureException[failed to open reader on writer]; nested: FileSystemException[/data/es_9200/data/nodes/0/indices/5vG9du0wS2qZkzo6mAOORA/1/index/_nk.fdx: Too many open files]; ",
        "last_allocation_status":"no_valid_shard_copy"
    },
    "can_allocate":"no_valid_shard_copy",
    "allocate_explanation":"cannot allocate because all found copies of the shard are either stale or corrupt",
    "node_allocation_decisions":[
        {
            "node_id":"G8CcSTiEQCC6s3XAUqybtA",
            "node_name":"master_data_02",
            "transport_address":"192.168.1.2:9300",
            "node_attributes":{
                "ml.machine_memory":"67449708544",
                "ml.max_open_jobs":"20",
                "xpack.installed":"true",
                "transform.node":"true"
            },
            "node_decision":"no",
            "store":{
                "in_sync":true,
                "allocation_id":"j7odwUERTZ-3BYmm926lMA",
                "store_exception":{
                    "type":"corrupt_index_exception",
                    "reason":"failed engine (reason: [corrupt file (source: [start])]) (resource=preexisting_corruption)",
                    "caused_by":{
                        "type":"i_o_exception",
                        "reason":"failed engine (reason: [corrupt file (source: [start])])",
                        "caused_by":{
                            "type":"corrupt_index_exception",
                            "reason":"checksum passed (3583f834). possibly transient resource issue, or a Lucene or JVM bug (resource=BufferedChecksumIndexInput(NIOFSIndexInput(path="/data/es_9200/data/nodes/0/indices/5vG9du0wS2qZkzo6mAOORA/1/index/_nk.fdm")))"
                        }
                    }
                }
            }
        },
        {
            "node_id":"j4vZ7bM_QzOaqkYNveOF1g",
            "node_name":"master_data_03",
            "transport_address":"192.168.1.3:9300",
            "node_attributes":{
                "ml.machine_memory":"67449708544",
                "ml.max_open_jobs":"20",
                "xpack.installed":"true",
                "transform.node":"true"
            },
            "node_decision":"no",
            "store":{
                "in_sync":false,
                "allocation_id":"NEfARmg0Q_Kmi5cN2lyIfg"
            }
        },
        {
            "node_id":"zOU7QHdXSiG4iVlh5voNDA",
            "node_name":"master_data_01",
            "transport_address":"192.168.1.1:9300",
            "node_attributes":{
                "ml.machine_memory":"67449708544",
                "xpack.installed":"true",
                "transform.node":"true",
                "ml.max_open_jobs":"20"
            },
            "node_decision":"no",
            "store":{
                "found":false
            }
        }
    ]
}
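
The primary's explain output pins down the root cause: recovery on master_data_02 failed with a FileSystemException ("Too many open files"), after which that copy was flagged as corrupt; the copy on master_data_03 is stale (in_sync:false), and master_data_01 holds no copy at all. Before raising the limit, it can help to confirm how many descriptors the Elasticsearch process actually holds. A minimal check, with an illustrative PID:

# Count the open descriptors held by the ES process (substitute the real PID).
lsof -p 658 | wc -l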

# Raise the file-descriptor (and related) limits on every ES node.
vim /etc/security/limits.conf
* soft nofile 655360
* hard nofile 655360
* soft nproc 655350
* hard nproc 655350
* soft memlock unlimited
* hard memlock unlimited

# Elasticsearch must be restarted for the new nofile value to take effect.
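
Note that /etc/security/limits.conf is applied by pam_limits to login sessions. If Elasticsearch runs as a systemd service, the limit must be set on the unit instead; a minimal sketch, assuming the service name is elasticsearch:

# Create a drop-in override and add the limit under [Service]:
systemctl edit elasticsearch
#   [Service]
#   LimitNOFILE=655360
systemctl restart elasticsearch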

# The restarted node's ES process (PID 658) picks up the new limit; the other two still run with the old value:
cat /proc/658/limits | grep "open files"
Max open files            655360               655360               files     
cat /proc/11401/limits | grep "open files"
Max open files            65536                65536                files  
cat /proc/27058/limits | grep "open files"
Max open files            65536                65536                files   
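
The effective limit can also be verified cluster-wide through the nodes stats API instead of reading /proc on each host; for example, with the same monitoring credentials:

# Every node should report max_file_descriptors of 655360 once it has been restarted.
curl -u remote_monitoring_user:remotemonitoringuser123 -k "http://192.168.1.1:9200/_nodes/stats/process?filter_path=**.max_file_descriptors"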


# Reassign the shard: no valid copy survives, so force an empty primary onto master_data_01.
curl -XPOST -u elastic:elastic -k "http://192.168.1.1:9200/_cluster/reroute" -H 'Content-Type: application/json' -d  '
{
  "commands": [
    {
      "allocate_empty_primary": {
        "index": "log-platform-20260420",
        "shard": 1,
        "node": "master_data_01",
        "accept_data_loss": true
      }
    }
  ]
}'
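
allocate_empty_primary starts the shard with no data, which is why accept_data_loss is mandatory. When a stale but intact copy still exists, as master_data_03 had here (in_sync:false), allocate_stale_primary is the gentler command: it promotes that copy and loses only the operations it missed. A sketch of that alternative:

curl -XPOST -u elastic:elastic -k "http://192.168.1.1:9200/_cluster/reroute" -H 'Content-Type: application/json' -d '
{
  "commands": [
    {
      "allocate_stale_primary": {
        "index": "log-platform-20260420",
        "shard": 1,
        "node": "master_data_03",
        "accept_data_loss": true
      }
    }
  ]
}'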

# Recovery succeeded; the cluster is green again:
curl -u remote_monitoring_user:remotemonitoringuser123 -k "http://192.168.1.1:9200/_cat/health"
1776649799 01:49:59 es-elk-common-gm01 green 3 3 1672 836 2 0 0 0 - 100.0%
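
To confirm the repaired shard specifically, the earlier _cat/shards query can be scoped to the one index; shard 1's primary should now show STARTED:

curl -u remote_monitoring_user:remotemonitoringuser123 -k "http://192.168.1.1:9200/_cat/shards/log-platform-20260420?v"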

# Restart the remaining nodes one at a time so the new nofile limit takes effect everywhere. The log-platform-20260420 index lost no data:
curl -s -u remote_monitoring_user:remotemonitoringuser123 -k "http://192.168.1.1:9200/_cat/indices" | grep log-platform-20260420
yellow open log-platform-20260420 5vG9du0wS2qZkzo6mAOORA 3 1   8597809      0    3.9gb    3.8gb
