本文共计515个文字,预计阅读时间需要3分钟。
# Missing-value analysis + data visualization + code example
#
# Preview snippet: import the analysis stack and load the two CSV files that
# the rest of the walkthrough works on.
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Silence all library warnings so the printed output stays readable.
warnings.filterwarnings('ignore')

# File names must be string literals; the bare `train.csv` / `test.csv`
# would be evaluated as attribute access on undefined names (NameError).
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
缺失值分析处理
可视化&数据分析
# Code
#
# End-to-end walkthrough: load the CSV files, fill or drop missing values,
# then plot how the `Survived` column varies with several other columns.
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Silence all library warnings so the printed output stays readable.
warnings.filterwarnings('ignore')

train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
gender = pd.read_csv("gender.csv")
print(train.head())
print(test.head())
print(gender.head())

# Stack train and test into one frame so missing values are handled once.
data = pd.concat([train, test], ignore_index=True)
# NOTE: bare expressions such as the next line only display a result in an
# interactive notebook/REPL; they have no visible effect in a plain script.
data
# DataFrame.info() prints its report itself and returns None, so it is not
# wrapped in print() (that would emit a stray "None").
data.info()
print(data.isnull().sum())  # Cabin has many missing values and can simply be dropped

# ---- Missing-value handling ----
data.describe()  # inspect the Age statistics; blanks are filled with the mean age
data.Age = data.Age.fillna(data.Age.mean())  # fill Age with its mean
data

# Rows with a missing Fare, then fill them with the median Fare of passengers
# with Embarked == 'S' and Pclass == 3.
# NOTE(review): presumably the row(s) missing Fare belong to that group - confirm.
data[data.Fare.isnull()]
data.Fare = data.Fare.fillna(
    data[(data.Embarked == 'S') & (data.Pclass == 3)].Fare.median()
)

# Rows with a missing Embarked, plus per-(Pclass, Embarked) fare medians and
# counts of non-null Survived values, inspected before choosing the fill value.
data[data.Embarked.isnull()]
data.groupby(by=['Pclass', 'Embarked']).Fare.median()
data.groupby(by=['Pclass', 'Embarked']).Survived.count()
# NOTE(review): 'C' is presumably chosen from the medians above - confirm.
data.Embarked = data.Embarked.fillna('C')
data.iloc[61]  # presumably one of the rows that had Embarked missing - verify

# The author reports that nearly 70% of Cabin values are missing; drop the column.
data = data.drop('Cabin', axis=1)
data
print(data.isnull().sum())

# ---- Visualization ----
sns.barplot(x='Pclass', y='Survived', data=data)

# Distribution of Pclass split by the value of Survived, on a new, larger figure.
plt.subplots(figsize=(15, 8))
# `fill=True` replaces the deprecated `shade=True` keyword of sns.kdeplot.
sns.kdeplot(data.loc[(data['Survived'] == 0), 'Pclass'],
            fill=True, color='red', label='NotSurvived')
sns.kdeplot(data.loc[(data['Survived'] == 1), 'Pclass'],
            fill=True, color='blue', label='Survived')
labels = ['1', '2', '3']
plt.xticks(sorted(data.Pclass.unique()), labels)
plt.legend()  # make the two curve labels visible
plt.show()

# Author's observation on the Sex plot: more women survived than men.
# Each plot gets its own figure; otherwise, when run as a script, all four
# would be drawn on top of each other on the same axes and never displayed.
for column in ('Sex', 'Parch', 'SibSp', 'Embarked'):
    plt.figure()
    sns.barplot(x=column, y='Survived', data=data)
plt.show()