1. Background


Significance testing of the regression equation and its regression coefficients.

2. The statsmodels library

The statsmodels library can fit both logistic and linear regressions, and its summary gives the significance-test results directly. The report shown in Section 3.1 below is exactly what we want to reproduce.
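For reference, logistic regression works the same way in statsmodels. Here is a minimal sketch with synthetic data (the coefficients 0.5, 1.2, -0.8 below are made up for illustration) showing that sm.Logit produces the same style of table, with z statistics in place of t:

import numpy as np
import pandas as pd
import statsmodels.api as sm
 
# synthetic demo data; the true coefficients are hypothetical
rng = np.random.default_rng(0)
X_logit = pd.DataFrame(rng.normal(size=(500, 2)), columns=['x1', 'x2'])
logits = 0.5 + 1.2 * X_logit['x1'] - 0.8 * X_logit['x2']
y_logit = (rng.uniform(size=500) < 1 / (1 + np.exp(-logits))).astype(int)
 
X_logit = sm.add_constant(X_logit)  # add the constant (intercept) column
logit_results = sm.Logit(y_logit, X_logit).fit(disp=0)
print(logit_results.summary())  # coef, std err, z, P>|z|, [0.025, 0.975]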

3. Computation

If we build the logistic regression with sklearn, there is no way to output this report directly, so we need to compute the entries of the table ourselves.
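Everything in that table comes from a handful of standard quantities. With $n$ samples and $p$ features (plus a constant), the overall F test of the regression equation and the per-coefficient t tests are:

$$SSR=\sum_i(\hat y_i-\bar y)^2,\qquad SSE=\sum_i(y_i-\hat y_i)^2$$

$$F=\frac{SSR/p}{SSE/(n-p-1)}\sim F(p,\,n-p-1)$$

$$\hat\sigma^2=\frac{SSE}{n-p-1},\qquad \operatorname{se}(\hat\beta_j)=\sqrt{\hat\sigma^2\,\big[(X^\top X)^{-1}\big]_{jj}},\qquad t_j=\frac{\hat\beta_j}{\operatorname{se}(\hat\beta_j)}\sim t(n-p-1)$$

The function in Section 3.2 implements exactly these formulas.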

3.1 First compute with statsmodels, as a reference

import pandas as pd
from sklearn.datasets import load_boston  # note: removed in scikit-learn >= 1.2; pin an older version to run this demo
from scipy import stats
import statsmodels.api as sm
import numpy as np
 
''' demo data '''
boston = load_boston()
X = pd.DataFrame(boston.data, columns=boston.feature_names)
y = pd.DataFrame(boston.target, columns=['target'])
 
X = sm.add_constant(X)  # add the constant (intercept) column
model = sm.OLS(y, X)
results = model.fit()
y_pred = pd.DataFrame(model.predict(results.params, X),
                      columns=['pred'])
print(results.summary())

Output:

                           OLS Regression Results                            
==============================================================================
Dep. Variable:                 target   R-squared:                       0.741
Model:                            OLS   Adj. R-squared:                  0.734
Method:                 Least Squares   F-statistic:                     108.1
Date:                Thu, 04 Nov 2021   Prob (F-statistic):          6.72e-135
Time:                        11:07:53   Log-Likelihood:                -1498.8
No. Observations:                 506   AIC:                             3026.
Df Residuals:                     492   BIC:                             3085.
Df Model:                          13                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         36.4595      5.103      7.144      0.000      26.432      46.487
CRIM          -0.1080      0.033     -3.287      0.001      -0.173      -0.043
ZN             0.0464      0.014      3.382      0.001       0.019       0.073
INDUS          0.0206      0.061      0.334      0.738      -0.100       0.141
CHAS           2.6867      0.862      3.118      0.002       0.994       4.380
NOX          -17.7666      3.820     -4.651      0.000     -25.272     -10.262
RM             3.8099      0.418      9.116      0.000       2.989       4.631
AGE            0.0007      0.013      0.052      0.958      -0.025       0.027
DIS           -1.4756      0.199     -7.398      0.000      -1.867      -1.084
RAD            0.3060      0.066      4.613      0.000       0.176       0.436
TAX           -0.0123      0.004     -3.280      0.001      -0.020      -0.005
PTRATIO       -0.9527      0.131     -7.283      0.000      -1.210      -0.696
B              0.0093      0.003      3.467      0.001       0.004       0.015
LSTAT         -0.5248      0.051    -10.347      0.000      -0.624      -0.425
==============================================================================
Omnibus:                      178.041   Durbin-Watson:                   1.078
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              783.126
Skew:                           1.521   Prob(JB):                    8.84e-171
Kurtosis:                       8.281   Cond. No.                     1.51e+04
==============================================================================
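Every number in this table is also exposed as an attribute on the results object, which gives us a ground truth to check the manual computation against:

print(results.params)       # coef
print(results.bse)          # std err
print(results.tvalues)      # t
print(results.pvalues)      # P>|t|
print(results.conf_int())   # [0.025, 0.975]
print(results.fvalue, results.f_pvalue)  # F-statistic and Prob (F-statistic)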

3.2 Doing it with our own code

The computation function:

def func_sign_testing_LRCoef(X, y_pred, y_true, Coef):
    '''
    Significance testing of logistic-regression coefficients
    :param X: raw data, rows are samples, columns are features (note: if Coef has one
              more entry than X has columns, add a constant column to the data first,
              e.g. X.insert(0, 'const', 1))
    :param y_pred: the model's predictions
    :param y_true: the true targets
    :param Coef: the fitted regression coefficients
    :return: a report containing 'coef', 'std error', 'tval', 'pval', '[0.025', '0.975]'
    '''
 
    n, p = X.shape
    p = p - 1  # p is the number of features (excluding the constant column)
 
    y_pred = np.squeeze(y_pred)
    y_true = np.squeeze(y_true)
    Coef = np.squeeze(Coef)
 
    # regression and residual sums of squares
    SSR = np.dot(y_pred - y_true.mean(), y_pred - y_true.mean())
    SSE = np.dot(y_pred - y_true, y_pred - y_true)
 
    # significance of the regression equation as a whole (F test)
    f_val = (SSR / p) / (SSE / (n - p - 1))
    f_pval = stats.f.sf(f_val, p, n - p - 1)
 
    print("LR F", f_val)
    print("LR p", f_pval)
 
    # significance test for each individual coefficient (t test)
    ttest_result = pd.DataFrame(None, index=X.columns,
                                columns=['coef', 'std error', 'tval', 'pval', '[0.025', '0.975]'])
    ttest_result.loc[:, 'coef'] = Coef
    sigma2 = SSE / (n - p - 1)  # unbiased estimate of the error variance
    S = np.array(np.linalg.inv(np.dot(X.T, X)))
    for i, col in enumerate(X.columns):
        std_error = np.sqrt(sigma2 * S[i][i])
        # the t value in the summary table is just the coefficient divided by its
        # std error: t = b / std error
        tval = Coef[i] / std_error
        ttest_result.loc[col, 'std error'] = std_error
        ttest_result.loc[col, 'tval'] = tval
        pval = stats.t.sf(np.abs(tval), df=(n - p - 1)) * 2
        ttest_result.loc[col, 'pval'] = pval
        # statsmodels builds the confidence interval from the t distribution;
        # with this many degrees of freedom the normal approximation is nearly identical
        [lower_a, upper_a] = stats.t.ppf(q=[0.025, 0.975], df=n - p - 1,
                                         loc=Coef[i], scale=std_error)
        ttest_result.loc[col, '[0.025'] = lower_a
        ttest_result.loc[col, '0.975]'] = upper_a
    return ttest_result

Calling the function:

boston = load_boston()
X = pd.DataFrame(boston.data, columns=boston.feature_names)
y = pd.DataFrame(boston.target, columns=['target'])
X = sm.add_constant(X)  # add the constant (intercept) column
model = sm.OLS(y, X)
results = model.fit()
y_pred = pd.DataFrame(model.predict(results.params, X),
                      columns=['pred'])
 
 
# pass the trained coefficients (results.params.values) into the function
report = func_sign_testing_LRCoef(X, y_pred, y, results.params.values)
print(report)
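Since the motivating case is a sklearn model that has no summary report, here is a minimal sketch of that workflow. LinearRegression stands in for the sklearn model, and the np.allclose checks against statsmodels are an addition for verification, not part of the original recipe:

from sklearn.linear_model import LinearRegression
 
# fit with sklearn on the raw features (no constant column)
X_raw = pd.DataFrame(boston.data, columns=boston.feature_names)
lr = LinearRegression().fit(X_raw, y)
 
# assemble the coefficient vector in the same order as the columns of the
# design matrix (constant first), which is what the function expects
coef = np.r_[np.squeeze(lr.intercept_), np.squeeze(lr.coef_)]
y_pred_sk = lr.predict(X_raw)
 
X_const = sm.add_constant(X_raw)
report_sk = func_sign_testing_LRCoef(X_const, y_pred_sk, y, coef)
print(report_sk)
 
# sanity check: the hand-rolled table should match statsmodels
assert np.allclose(report_sk['std error'].astype(float), results.bse)
assert np.allclose(report_sk['tval'].astype(float), results.tvalues)
assert np.allclose(report_sk['pval'].astype(float), results.pvalues)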