ES故障处理之-too many files open
本文记录一次 Elasticsearch 因进程打开文件数(nofile)达到上限("Too many open files")导致分片分配失败的故障排查与处理过程:先通过 allocation explain 定位原因,再调整系统 nofile 限制并重启节点,最后手动触发分片重新分配恢复集群。
curl -u remote_monitoring_user:remotemonitoringuser123 -k "http://192.168.1.1:9200/_cluster/health"
curl -u remote_monitoring_user:remotemonitoringuser123 -k "http://192.168.1.1:9200/_cat/shards?v&h=index,shard,prirep,state,node,unassigned.reason&s=state"
curl -u remote_monitoring_user:remotemonitoringuser123 -k "http://192.168.1.1:9200/_cluster/allocation/explain"
{
"index":"log-platform-20260420",
"shard":1,
"primary":false,
"current_state":"unassigned",
"unassigned_info":{
"reason":"ALLOCATION_FAILED",
"at":"2026-04-19T23:37:55.984Z",
"failed_allocation_attempts":1,
"details":"failed shard on node [j4vZ7bM_QzOaqkYNveOF1g]: failed to perform indices:data/write/bulk[s] on replica [log-platform-20260420][1], node[j4vZ7bM_QzOaqkYNveOF1g], [R], s[STARTED], a[id=NEfARmg0Q_Kmi5cN2lyIfg], failure IndexShardClosedException[CurrentState[CLOSED] Primary closed.]",
"last_allocation_status":"no_attempt"
},
"can_allocate":"no",
"allocate_explanation":"cannot allocate because allocation is not permitted to any of the nodes",
"node_allocation_decisions":[
{
"node_id":"G8CcSTiEQCC6s3XAUqybtA",
"node_name":"master_data_02",
"transport_address":"192.168.1.2:9300",
"node_attributes":{
"ml.machine_memory":"67449708544",
"ml.max_open_jobs":"20",
"xpack.installed":"true",
"transform.node":"true"
},
"node_decision":"no",
"deciders":[
{
"decider":"replica_after_primary_active",
"decision":"NO",
"explanation":"primary shard for this replica is not yet active"
},
{
"decider":"throttling",
"decision":"NO",
"explanation":"primary shard for this replica is not yet active"
}
]
},
{
"node_id":"j4vZ7bM_QzOaqkYNveOF1g",
"node_name":"master_data_03",
"transport_address":"192.168.1.3:9300",
"node_attributes":{
"ml.machine_memory":"67449708544",
"ml.max_open_jobs":"20",
"xpack.installed":"true",
"transform.node":"true"
},
"node_decision":"no",
"deciders":[
{
"decider":"replica_after_primary_active",
"decision":"NO",
"explanation":"primary shard for this replica is not yet active"
},
{
"decider":"throttling",
"decision":"NO",
"explanation":"primary shard for this replica is not yet active"
}
]
},
{
"node_id":"zOU7QHdXSiG4iVlh5voNDA",
"node_name":"master_data_01",
"transport_address":"192.168.1.1:9300",
"node_attributes":{
"ml.machine_memory":"67449708544",
"xpack.installed":"true",
"transform.node":"true",
"ml.max_open_jobs":"20"
},
"node_decision":"no",
"deciders":[
{
"decider":"replica_after_primary_active",
"decision":"NO",
"explanation":"primary shard for this replica is not yet active"
},
{
"decider":"throttling",
"decision":"NO",
"explanation":"primary shard for this replica is not yet active"
}
]
}
]
}
curl -u remote_monitoring_user:remotemonitoringuser123 -k "http://192.168.1.1:9200/_cluster/allocation/explain" -H 'Content-Type: application/json' -d '
{
"index": "log-platform-20260420",
"shard": 1,
"primary": true
}'
{
"index":"log-platform-20260420",
"shard":1,
"primary":true,
"current_state":"unassigned",
"unassigned_info":{
"reason":"ALLOCATION_FAILED",
"at":"2026-04-19T23:38:01.168Z",
"failed_allocation_attempts":3,
"details":"failed shard on node [G8CcSTiEQCC6s3XAUqybtA]: failed recovery, failure RecoveryFailedException[[log-platform-20260420][1]: Recovery failed on {master_data_02}{G8CcSTiEQCC6s3XAUqybtA}{hui2XAFAS3G_BiqBMQGpmA}{192.168.1.2}{192.168.1.2:9300}{cdhilmrstw}{ml.machine_memory=67449708544, xpack.installed=true, transform.node=true, ml.max_open_jobs=20}]; nested: IndexShardRecoveryException[failed to recover from gateway]; nested: EngineCreationFailureException[failed to open reader on writer]; nested: FileSystemException[/data/es_9200/data/nodes/0/indices/5vG9du0wS2qZkzo6mAOORA/1/index/_nk.fdx: Too many open files]; ",
"last_allocation_status":"no_valid_shard_copy"
},
"can_allocate":"no_valid_shard_copy",
"allocate_explanation":"cannot allocate because all found copies of the shard are either stale or corrupt",
"node_allocation_decisions":[
{
"node_id":"G8CcSTiEQCC6s3XAUqybtA",
"node_name":"master_data_02",
"transport_address":"192.168.1.2:9300",
"node_attributes":{
"ml.machine_memory":"67449708544",
"ml.max_open_jobs":"20",
"xpack.installed":"true",
"transform.node":"true"
},
"node_decision":"no",
"store":{
"in_sync":true,
"allocation_id":"j7odwUERTZ-3BYmm926lMA",
"store_exception":{
"type":"corrupt_index_exception",
"reason":"failed engine (reason: [corrupt file (source: [start])]) (resource=preexisting_corruption)",
"caused_by":{
"type":"i_o_exception",
"reason":"failed engine (reason: [corrupt file (source: [start])])",
"caused_by":{
"type":"corrupt_index_exception",
"reason":"checksum passed (3583f834). possibly transient resource issue, or a Lucene or JVM bug (resource=BufferedChecksumIndexInput(NIOFSIndexInput(path="/data/es_9200/data/nodes/0/indices/5vG9du0wS2qZkzo6mAOORA/1/index/_nk.fdm")))"
}
}
}
}
},
{
"node_id":"j4vZ7bM_QzOaqkYNveOF1g",
"node_name":"master_data_03",
"transport_address":"192.168.1.3:9300",
"node_attributes":{
"ml.machine_memory":"67449708544",
"ml.max_open_jobs":"20",
"xpack.installed":"true",
"transform.node":"true"
},
"node_decision":"no",
"store":{
"in_sync":false,
"allocation_id":"NEfARmg0Q_Kmi5cN2lyIfg"
}
},
{
"node_id":"zOU7QHdXSiG4iVlh5voNDA",
"node_name":"master_data_01",
"transport_address":"192.168.1.1:9300",
"node_attributes":{
"ml.machine_memory":"67449708544",
"xpack.installed":"true",
"transform.node":"true",
"ml.max_open_jobs":"20"
},
"node_decision":"no",
"store":{
"found":false
}
}
]
}
vim /etc/security/limits.conf
* soft nofile 655360
* hard nofile 655360
* soft nproc 655350
* hard nproc 655350
* soft memlock unlimited
* hard memlock unlimited
#需要重启ES,让 nofile配置的值生效。
cat /proc/658/limits | grep "open files"
Max open files 655360 655360 files
cat /proc/11401/limits | grep "open files"
Max open files 65536 65536 files
cat /proc/27058/limits | grep "open files"
Max open files 65536 65536 files
#分片重新分配:使用 allocate_empty_primary 以空分片形式强制分配主分片;注意该命令会丢弃此分片上已有的数据,accept_data_loss=true 即表示接受这一点。
curl -XPOST -u elastic:elastic -k "http://192.168.1.1:9200/_cluster/reroute" -H 'Content-Type: application/json' -d '
{
"commands": [
{
"allocate_empty_primary": {
"index": "log-platform-20260420",
"shard": 1,
"node": "master_data_01",
"accept_data_loss": true
}
}
]
}'
#修复正常。
curl -u remote_monitoring_user:remotemonitoringuser123 -k "http://192.168.1.1:9200/_cat/health"
1776649799 01:49:59 es-elk-common-gm01 green 3 3 1672 836 2 0 0 0 - 100.0%
#依次重启其余节点,让 nofile 配置生效。确认 log-platform-20260420 索引的数据没有丢失。
curl -u remote_monitoring_user:remotemonitoringuser123 -k "http://192.168.1.1:9200/_cat/indices" |grep log-platform-20260420
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 82524  100 82524    0     0   182k      0 --:--:-- --:--:-- --:--:--  183k
yellow open log-platform-20260420 5vG9du0wS2qZkzo6mAOORA 3 1 8597809 0 3.9gb 3.8gb
更多推荐
所有评论(0)