阿里云安全恶意程序检测-排名295
赛题说明:本题目提供的数据来自文件(windows 可执行程序)经过沙箱程序模拟运行后的API指令序列,全为windows二进制可执行程序,经过脱敏处理。本题目提供的样本数据均来自互联网。其中恶意文件的类型有感染型病毒、木马程序、挖矿程序、DDOS木马、勒索病毒等,数据总计6亿条。具体请移步:阿里云安全恶意程序检测。数据说明。简单思路:数据量过大,改变数据类型减少内存使用;交叉验证;lgb效果还不错。具体……
·
赛题说明
本题目提供的数据来自文件(windows 可执行程序)经过沙箱程序模拟运行后的API指令序列,全为windows二进制可执行程序,经过脱敏处理。
本题目提供的样本数据均来自互联网。其中恶意文件的类型有感染型病毒、木马程序、挖矿程序、DDOS木马、勒索病毒等,数据总计6亿条。
具体请移步:阿里云安全恶意程序检测
数据说明
简单思路:
数据量过大,改变数据类型减少内存使用
交叉验证
lgb效果还不错

具体代码:
from tqdm import tqdm_notebook
class _Data_Preprocess:
    """Reduce a DataFrame's memory footprint by downcasting numeric columns.

    For every integer or float column the observed min/max are compared
    against the representable range of progressively wider NumPy dtypes, and
    the column is cast to the narrowest dtype whose range contains both.
    """

    def __init__(self):
        # Cache the representable range of each candidate dtype once.
        self.int8_max = np.iinfo(np.int8).max
        self.int8_min = np.iinfo(np.int8).min
        self.int16_max = np.iinfo(np.int16).max
        self.int16_min = np.iinfo(np.int16).min
        self.int32_max = np.iinfo(np.int32).max
        self.int32_min = np.iinfo(np.int32).min
        self.int64_max = np.iinfo(np.int64).max
        self.int64_min = np.iinfo(np.int64).min
        # For finfo, ``min`` is the most negative finite value of the dtype.
        self.float16_max = np.finfo(np.float16).max
        self.float16_min = np.finfo(np.float16).min
        self.float32_max = np.finfo(np.float32).max
        self.float32_min = np.finfo(np.float32).min
        self.float64_max = np.finfo(np.float64).max
        self.float64_min = np.finfo(np.float64).min

    def _get_type(self, min_val, max_val, types):
        """Return the narrowest NumPy dtype able to hold ``[min_val, max_val]``.

        Args:
            min_val: Smallest value observed in the column.
            max_val: Largest value observed in the column.
            types: ``'int'`` or ``'float'``; selects the dtype family.

        Returns:
            ``np.int8``/``np.int16``/``np.int32`` for integers,
            ``np.float16``/``np.float32``/``np.float64`` for floats, or
            ``None`` when nothing fits (e.g. values needing int64, NaN bounds)
            or ``types`` is neither ``'int'`` nor ``'float'``.
        """
        if types == 'int':
            if max_val <= self.int8_max and min_val >= self.int8_min:
                return np.int8
            # The original condition was the chained comparison
            # ``max_val <= int16_max <= max_val``, which only holds when
            # max_val == 32767, so int16 was effectively never selected.
            if max_val <= self.int16_max and min_val >= self.int16_min:
                return np.int16
            if max_val <= self.int32_max and min_val >= self.int32_min:
                return np.int32
            # Wider than int32: leave the column as it is.
            return None
        if types == 'float':
            # NOTE(review): the choice is made on range only; float16 keeps
            # roughly three significant decimal digits, so this downcast can
            # lose precision even when the values fit.
            if max_val <= self.float16_max and min_val >= self.float16_min:
                return np.float16
            if max_val <= self.float32_max and min_val >= self.float32_min:
                return np.float32
            if max_val <= self.float64_max and min_val >= self.float64_min:
                return np.float64
            return None
        return None

    def _memory_process(self, df):
        """Downcast every int/float column of ``df`` in place and return it.

        Prints the memory usage (in GB) before and after processing. A column
        whose conversion raises is reported and left unchanged.

        Args:
            df: The pandas DataFrame to shrink; it is modified in place.

        Returns:
            The same ``df`` object with downcast columns.
        """
        init_memory = df.memory_usage().sum() / 1024 ** 2 / 1024
        print('Original data occupies {} GB memory.'.format(init_memory))
        for col in tqdm_notebook(df.columns):
            try:
                dtype_name = str(df[col].dtypes)
                if 'float' in dtype_name:
                    kind = 'float'
                elif 'int' in dtype_name:
                    kind = 'int'
                else:
                    # Non-numeric column: nothing to downcast.
                    continue
                max_val = df[col].max()
                min_val = df[col].min()
                trans_types = self._get_type(min_val, max_val, kind)
                if trans_types is not None:
                    df[col] = df[col].astype(trans_types)
            except Exception:
                # Best effort: keep going with the remaining columns, but do
                # not swallow KeyboardInterrupt / SystemExit as a bare
                # ``except:`` would.
                print(' Can not do any process for column, {}.'.format(col))
        afterprocess_memory = df.memory_usage().sum() / 1024 ** 2 / 1024
        print('After processing, the data occupies {} GB memory.'.format(afterprocess_memory))
        return df
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import lightgbm as lgb
import warnings
warnings.filterwarnings('ignore')
import os
# Work from the competition data directory so the relative CSV names resolve.
os.chdir(r'E:\项目文件\阿里云安全恶意程序检测')
# Load the raw training and test tables.
train, test = (
    pd.read_csv(csv_name)
    for csv_name in ('security_train.csv', 'security_test.csv')
)
def simple_sts_features(df):
    """Build per-file count and distinct-count features.

    Groups ``df`` by ``file_id`` and, for each of the ``api``, ``tid`` and
    ``index`` columns, records the number of values and the number of
    distinct values per file.

    Args:
        df: DataFrame with ``file_id``, ``api``, ``tid`` and ``index`` columns.

    Returns:
        DataFrame with one row per ``file_id`` (sorted ascending) and the
        columns ``file_id_<col>_count`` / ``file_id_<col>_nunique``.
    """
    # Sorting the ids puts the rows in the same order as the groupby output,
    # so the positional ``.values`` assignments below line up.
    per_file = pd.DataFrame({'file_id': df['file_id'].unique()}).sort_values('file_id')
    grouped = df.groupby('file_id')
    for column in ('api', 'tid', 'index'):
        column_groups = grouped[column]
        per_file['file_id_{}_count'.format(column)] = column_groups.count().values
        per_file['file_id_{}_nunique'.format(column)] = column_groups.nunique().values
    return per_file
# Count / distinct-count feature tables for the training and test sets.
simple_train_fea1, simple_test_fea1 = (
    simple_sts_features(raw) for raw in (train, test)
)
def simple_numerical_sts_features(df):
    """Build per-file summary statistics of the ``tid`` and ``index`` columns.

    Args:
        df: DataFrame with ``file_id``, ``tid`` and ``index`` columns.

    Returns:
        DataFrame with one row per ``file_id`` (sorted ascending) holding the
        mean, min, std and max of ``tid`` and of ``index`` for that file, in
        columns named ``file_id_<col>_<stat>``.
    """
    # Sorted ids match the (sorted) groupby output row for row, which is what
    # makes the positional ``.values`` assignments correct.
    stats = pd.DataFrame({'file_id': df['file_id'].unique()}).sort_values('file_id')
    grouped = df.groupby('file_id')
    for column in ('tid', 'index'):
        column_groups = grouped[column]
        for stat in ('mean', 'min', 'std', 'max'):
            feature_name = 'file_id_{}_{}'.format(column, stat)
            stats[feature_name] = getattr(column_groups, stat)().values
    return stats
# Numeric summary feature tables for the training and test sets.
simple_train_fea2, simple_test_fea2 = (
    simple_numerical_sts_features(raw) for raw in (train, test)
)

# Collapse the raw rows to one row per distinct (file_id, label) pair for
# training and one row per distinct file_id for the test set.
train_label = train[['file_id', 'label']].drop_duplicates()
test_submit = test[['file_id']].drop_duplicates()

### Build the training and test sets
# Left-join both feature tables onto the per-file rows.
train_data = (
    train_label
    .merge(simple_train_fea1, on='file_id', how='left')
    .merge(simple_train_fea2, on='file_id', how='left')
)
test_submit = (
    test_submit
    .merge(simple_test_fea1, on='file_id', how='left')
    .merge(simple_test_fea2, on='file_id', how='left')
)
def lgb_logloss(preds, data):
    """Custom LightGBM eval metric: per-class binary log loss, summed over classes.

    For every sample the loss adds ``log(p)`` for the true class and
    ``log(1 - p)`` for every other class; the result is negated and averaged
    over the samples.

    Args:
        preds: Predicted class probabilities. Either a flat 1-D array laid
            out class by class (all samples of class 0, then class 1, ...),
            or a 2-D array of shape ``(n_samples, n_classes)``.
            NOTE(review): which layout LightGBM passes depends on the
            installed version -- confirm against its ``feval`` documentation.
        data: Object exposing ``get_label()`` (the LightGBM dataset being
            evaluated); labels are class indices.

    Returns:
        Tuple ``(metric_name, value, is_higher_better)`` as expected from a
        LightGBM ``feval`` callable.
    """
    labels_ = np.asarray(data.get_label())
    n_samples = len(labels_)

    scores = np.asarray(preds, dtype=float)
    if scores.ndim == 2:
        # (n_samples, n_classes) -> one row per class.
        class_probs = scores.T
    else:
        # Take the class count from the prediction size, not from the labels
        # present in this fold: a fold that lacks a class would otherwise
        # slice the flat array with the wrong number of classes.
        class_probs = scores.reshape(-1, n_samples)

    # Keep log() finite when a probability is exactly 0 or 1.
    eps = 1e-15
    class_probs = np.clip(class_probs, eps, 1 - eps)

    # is_true_class[j, i] is True when sample i belongs to class j.
    class_ids = np.arange(class_probs.shape[0])
    is_true_class = class_ids[:, None] == labels_[None, :]
    log_lik = np.where(is_true_class, np.log(class_probs), np.log(1 - class_probs))

    return 'loss is: ', -1 * (log_lik.sum() / n_samples), False
### Model validation
# Every column except the target and the file id is a model input.
train_features = [col for col in train_data.columns if col not in ['label', 'file_id']]
# From here on train_label is the name of the target column (it previously
# held the file_id/label DataFrame).
train_label = 'label'
from sklearn.model_selection import KFold
params = {
    'task': 'train',
    'num_leaves': 255,
    'objective': 'multiclass',
    'num_class': 8,
    'min_data_in_leaf': 50,
    'learning_rate': 0.05,
    'feature_fraction': 0.85,
    'bagging_fraction': 0.85,
    'bagging_freq': 5,
    'max_bin': 128
}
folds = KFold(n_splits=5, shuffle=True, random_state=15)
# One slot per training sample (one row of train_data per file), not per raw
# API-call row of `train`.
oof = np.zeros(len(train_data))
predict_res = 0
models = []
# NOTE(review): `models` is reset to [] before the next training loop in this
# file, so the boosters trained here only produce the validation log output.
# NOTE(review): verbose_eval / early_stopping_rounds are keyword arguments of
# older lgb.train signatures; newer LightGBM releases expect callbacks
# instead -- confirm against the installed version.
for fold_, (trn_idx, val_idx) in enumerate(folds.split(train_data)):
    print("fold n°{}".format(fold_))
    trn_data = lgb.Dataset(train_data.iloc[trn_idx][train_features], label=train_data.iloc[trn_idx][train_label].values)
    val_data = lgb.Dataset(train_data.iloc[val_idx][train_features], label=train_data.iloc[val_idx][train_label].values)
    # Only the held-out fold is monitored here.
    clf = lgb.train(params, trn_data, num_boost_round=2000, valid_sets=val_data, verbose_eval=50,
                    early_stopping_rounds=100, feval=lgb_logloss)
    models.append(clf)
from sklearn.model_selection import KFold
# 8-class LightGBM configuration for the 5-fold training run below.
params = {
    'task': 'train',
    'num_leaves': 255,
    'objective': 'multiclass',
    'num_class': 8,
    'min_data_in_leaf': 50,
    'learning_rate': 0.05,
    'feature_fraction': 0.85,
    'bagging_fraction': 0.85,
    'bagging_freq': 5,
    'max_bin': 128
}
folds = KFold(n_splits=5, shuffle=True, random_state=15)
# One slot per training sample (one row of train_data per file), not per raw
# API-call row of `train`.
oof = np.zeros(len(train_data))
# Start from an empty list: these are the boosters used for prediction later.
models = []
predict_res = 0
# NOTE(review): verbose_eval / early_stopping_rounds are keyword arguments of
# older lgb.train signatures; newer LightGBM releases expect callbacks
# instead -- confirm against the installed version.
for fold_, (trn_idx, val_idx) in enumerate(folds.split(train_data)):
    print("fold n°{}".format(fold_))
    trn_data = lgb.Dataset(train_data.iloc[trn_idx][train_features], label=train_data.iloc[trn_idx][train_label].values)
    val_data = lgb.Dataset(train_data.iloc[val_idx][train_features], label=train_data.iloc[val_idx][train_label].values)
    # Both the training part and the held-out fold are passed as valid_sets,
    # so the metric is reported for each of them.
    clf = lgb.train(params, trn_data, num_boost_round=2000, valid_sets=[trn_data, val_data], verbose_eval=50,
                    early_stopping_rounds=100, feval=lgb_logloss)
    models.append(clf)
# Feature importances as reported by `clf`, i.e. the booster most recently
# assigned by the training code above, sorted from most to least important.
feature_importance = pd.DataFrame({
    'fea_name': train_features,
    'fea_imp': clf.feature_importance(),
}).sort_values('fea_imp', ascending=False)

# Bar chart of importance per feature.
plt.figure(figsize=[20, 10])
sns.barplot(x=feature_importance['fea_name'], y=feature_importance['fea_imp'])
# Average the class-probability predictions of all trained fold models.
pred_res = 0
# Divide by the number of models actually trained instead of a hard-coded 5,
# so the average stays correct if the number of folds changes.
fold = len(models)
test_features = test_submit[train_features]
for model in models:
    pred_res += model.predict(test_features) * 1.0 / fold

# One probability column per predicted class: prob0, prob1, ...
prob_cols = ['prob{}'.format(class_id) for class_id in range(pred_res.shape[1])]
for class_id, prob_col in enumerate(prob_cols):
    test_submit[prob_col] = pred_res[:, class_id]

# Write the submission file: file_id followed by the class probabilities.
test_submit[['file_id'] + prob_cols].to_csv('baseline.csv', index=False)
喜欢记得一键三连
更多推荐
所有评论(0)