盒马购物数据分析(数据处理篇)
重复数据占比:0.19%
·
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
plt.style.use('ggplot')
plt.rcParams['font.family']='Microsoft Yahei'
plt.rcParams['axes.unicode_minus']=False
1 读取数据并查看基本信息
# Load the raw order data. The frame must be bound to `df`, the name every
# later step reads; the previous binding to `f` left `df` undefined.
df = pd.read_csv('./order_data.csv')
# Summarise missing values per column: absolute count and share of all rows.
missing_data = df.isnull().sum()
# Reuse the counts computed above instead of scanning the frame a second time.
missing_percent = missing_data / len(df) * 100
missing_df = pd.DataFrame(
    {
        '缺失数量': missing_data,
        '缺失百分比(%)': missing_percent.round(2),
    }
)
# Columns with the most missing values come first.
missing_df = missing_df.sort_values(by='缺失数量', ascending=False)
# Only show the columns that actually contain missing values.
print(missing_df.loc[missing_df['缺失数量'] > 0])

# Count fully duplicated rows and, when any exist, report their share of the data.
duplicate_count = df.duplicated().sum()
print(f"重复数据数量:{duplicate_count}")
if duplicate_count > 0:
    # This line must be indented under the `if`: unindented, the script does
    # not parse. A positive count also means len(df) > 0, so the division
    # below cannot hit zero.
    print("重复数据占比:{:.2f}%".format((duplicate_count / len(df)) * 100))
输出:
重复数据数量:10
重复数据占比:0.19%
# Remove duplicate rows; the first occurrence of each is kept (pandas default).
df_clean = df.drop_duplicates()
# Display how many duplicated rows remain after the drop.
df_clean.duplicated().sum()
# Remove every row that has at least one missing value.
df_clean = df_clean.dropna(how='any', axis='index')
# Display the per-column count of missing values after the drop.
df_clean.isna().sum()
# Convert the order-time and payment-time columns to pandas datetimes.
for time_col in ('下单时间', '付款时间'):
    df_clean[time_col] = pd.to_datetime(df_clean[time_col])
# Print the frame summary so the resulting dtypes can be inspected.
df_clean.info()

# Payment delay: time between placing the order and paying, in minutes.
df_clean['支付延迟分钟'] = df_clean['付款时间'] - df_clean['下单时间']
df_clean['支付延迟分钟'] = df_clean['支付延迟分钟'].dt.total_seconds() / 60

# Discount: order amount minus paid amount, and that difference as a
# percentage of the order amount, rounded to two decimals.
df_clean['折扣金额'] = df_clean['订单金额'] - df_clean['付款金额']
df_clean['折扣率'] = df_clean['折扣金额'] / df_clean['订单金额'] * 100
df_clean['折扣率'] = df_clean['折扣率'].round(2)

# Calendar features taken from the order timestamp. Columns are created in
# the same order as before, so the saved CSV layout is unchanged.
order_dt = df_clean['下单时间'].dt
df_clean['下单日期'] = order_dt.date
df_clean['下单小时'] = order_dt.hour
df_clean['下单星期'] = order_dt.day_name()
df_clean['下单月份'] = order_dt.month
df_clean.head(2)

# Save the cleaned data without the index column.
df_clean.to_csv('./order_data_cleaned.csv', index=False, encoding='utf-8')
更多推荐
所有评论(0)