您的位置:首页 > 其它

Kaggle实战——泰坦尼克生存预测大赛

2018-01-10 09:06 375 查看


In [6]:

import csv
import numpy as np
csv_file_object = csv.reader(open('D:/In/kaggle/Titanic/train.csv', 'rt'))

data=[]
for row in csv_file_object:
data.append(row)
#data = np.array(data)
print (data[0])
print (np.array(data)[0])


['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']
['PassengerId' 'Survived' 'Pclass' 'Name' 'Sex' 'Age' 'SibSp' 'Parch'
'Ticket' 'Fare' 'Cabin' 'Embarked']


In [ ]:

'''
#数据处理
import numpy as np
import pandas as pd
#绘图
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
#各种模型、数据处理方法
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.metrics import precision_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold, learning_curve
import warnings
warnings.filterwarnings('ignore')
'''


In [2]:

print (data[:3])    #list是一维的,array是二维的
print (np.array(data)[:3])
print (np.array(data)[:15,5])
#print (data[0:15,5])
data=np.array(data)
type(data)         #data此时为一个二维的数组


[['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'], ['1', '0', '3', 'Braund, Mr. Owen Harris', 'male', '22', '1', '0', 'A/5 21171', '7.25', '', 'S'], ['2', '1', '1', 'Cumings, Mrs. John Bradley (Florence Briggs Thayer)', 'female', '38', '1', '0', 'PC 17599', '71.2833', 'C85', 'C']]
[['PassengerId' 'Survived' 'Pclass' 'Name' 'Sex' 'Age' 'SibSp' 'Parch'
'Ticket' 'Fare' 'Cabin' 'Embarked']
['1' '0' '3' 'Braund, Mr. Owen Harris' 'male' '22' '1' '0' 'A/5 21171'
'7.25' '' 'S']
['2' '1' '1' 'Cumings, Mrs. John Bradley (Florence Briggs Thayer)'
'female' '38' '1' '0' 'PC 17599' '71.2833' 'C85' 'C']]
['Age' '22' '38' '26' '35' '35' '' '54' '2' '27' '14' '4' '58' '20' '39']


Out[2]:

numpy.ndarray


In [3]:

print(data[1:6,5])
print(data[1:6,5].astype(int))
print(data[1:6,5].astype(int).mean())


['22' '38' '26' '35' '35']
[22 38 26 35 35]
31.2


In [1]:

import pandas as pd
%matplotlib inline
df=pd.read_csv('D:/In/kaggle/Titanic/train.csv')
df_test=pd.read_csv('D:/In/kaggle/Titanic/test.csv')
print(df.info())
print(df[['Age','Sex','Pclass']][:10])
print(df[df['Age']>60][['Survived','Pclass','Sex','Age']])
df[df['Age'].isnull()][:10]   #只显示年龄为空的数据


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB
None
Age     Sex  Pclass
0  22.0    male       3
1  38.0  female       1
2  26.0  female       3
3  35.0  female       1
4  35.0    male       3
5   NaN    male       3
6  54.0    male       1
7   2.0    male       3
8  27.0  female       3
9  14.0  female       2
Survived  Pclass     Sex   Age
33          0       2    male  66.0
54          0       1    male  65.0
96          0       1    male  71.0
116         0       3    male  70.5
170         0       1    male  61.0
252         0       1    male  62.0
275         1       1  female  63.0
280         0       3    male  65.0
326         0       3    male  61.0
438         0       1    male  64.0
456         0       1    male  65.0
483         1       3  female  63.0
493         0       1    male  71.0
545         0       1    male  64.0
555         0       1    male  62.0
570         1       2    male  62.0
625         0       1    male  61.0
630         1       1    male  80.0
672         0       2    male  70.0
745         0       1    male  70.0
829         1       1  female  62.0
851         0       3    male  74.0


Out[1]:

PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
5603Moran, Mr. JamesmaleNaN003308778.4583NaNQ
171812Williams, Mr. Charles EugenemaleNaN0024437313.0000NaNS
192013Masselmani, Mrs. FatimafemaleNaN0026497.2250NaNC
262703Emir, Mr. Farred ChehabmaleNaN0026317.2250NaNC
282913O'Dwyer, Miss. Ellen "Nellie"femaleNaN003309597.8792NaNQ
293003Todoroff, Mr. LaliomaleNaN003492167.8958NaNS
313211Spencer, Mrs. William Augustus (Marie Eugenie)femaleNaN10PC 17569146.5208B78C
323313Glynn, Miss. Mary AgathafemaleNaN003356777.7500NaNQ
363713Mamee, Mr. HannamaleNaN0026777.2292NaNC
424303Kraeff, Mr. TheodormaleNaN003492537.8958NaNC
In [2]:

import seaborn as sns
import matplotlib.pyplot as plt
sns.set(context="paper", font="monospace")
sns.set(style="white")
f, ax = plt.subplots(figsize=(10,6))
train_corr = df.drop('PassengerId',axis=1).corr()
sns.heatmap(train_corr, ax=ax, vmax=.9, square=True)
ax.set_xticklabels(train_corr.index, size=15)
ax.set_yticklabels(train_corr.columns[::1], size=15)
ax.set_title('train feature corr', fontsize=20)


Out[2]:

<matplotlib.text.Text at 0x18f56089128>




In [5]:

for i in range(1,4):
print (i, len (df[ (df['Sex']=='male')&(df['Pclass'] == i) ]) )   #输出不同等级仓中男士的数量


1 122
2 108
3 347


In [6]:

import pylab as p
df['Age'].dropna().hist(range=(0,100),bins=19,alpha=0.8),p.show()   #bins代表直方柱的个数 alpha控制颜色深浅




Out[6]:

(<matplotlib.axes._subplots.AxesSubplot at 0x1d7d360a550>, None)


In [7]:

from scipy import stats
fig,axes=plt.subplots(2,1,figsize=(8,6))
sns.set_style('darkgrid')#设置风格主题
sns.distplot(df.Age.fillna(-20),rug=True,color='b',ax=axes[0])#rug强度(齿)
ax0=axes[0]
ax0.set_xlabel('')

ax1=axes[1]
ax1.set_title('age survived distribution')
k1=sns.distplot(df[df.Survived==0].Age.fillna(-20),hist=False,color='r',ax=ax1,label='dead')#罹难的年龄分布
k2=sns.distplot(df[df.Survived==1].Age.fillna(-20),hist=False,color='g',ax=ax1,label='alive')#存活的年龄分布
ax1.set_xlabel('')#x坐标轴名字

ax1.legend(fontsize=16)#小朋友和中青年比较容易存活


Out[7]:

<matplotlib.legend.Legend at 0x227fa3b1320>




In [8]:

f,ax=plt.subplots(figsize=(8,3))
ax.set_title('Sex Age dist',size=20)
sns.distplot(df[df.Sex=='female'].dropna().Age,hist=False,color='pink',label='female')
sns.distplot(df[df.Sex=='male'].dropna().Age,hist=False,color='blue',label='male')
ax.legend(fontsize=15)#训练集中的男女年龄分布 男性中老年较多 女性较年轻


Out[8]:

<matplotlib.legend.Legend at 0x227fa2ded68>




In [16]:

f,ax=plt.subplots(figsize=(8,3))
plt.ylim(0.0,0.03)
ax.set_title('Pclass Age dist',size=20)
sns.distplot(df[df.Pclass==1].dropna().Age,hist=False,color='pink',label='P1')
sns.distplot(df[df.Pclass==2].dropna().Age,hist=False,color='blue',label='P2')
sns.distplot(df[df.Pclass==3].dropna().Age,hist=False,color='green',label='P3')
ax.legend(fontsize=15)#不同仓级年龄分布


Out[16]:

<matplotlib.legend.Legend at 0x227fba1cac8>




In [35]:

y_dead=df[df.Survived==0].groupby('Pclass')['Survived'].count()
y_alive=df[df.Survived==1].groupby('Pclass')['Survived'].count()
pos=[1,2,3]#横轴id
ax=plt.figure(figsize=(8,4)).add_subplot(111)
ax.bar(pos,y_dead,color='r',alpha=0.6,label='dead')
ax.bar(pos,y_alive,color='g',bottom=y_dead,alpha=0.6,label='alive')
ax.legend(fontsize=16,loc='best')
ax.set_xticks(pos)
ax.set_xticklabels(['Pclass%d'%(i) for i in range(1,4)],size=15)#x坐标轴信息
ax.set_title('Pclass Survived count',size=20)#不同仓级存活情况


Out[35]:

<matplotlib.text.Text at 0x227fd256400>




In [29]:

pos=range(0,6)
age_list=[]
for Pclass_ in range(1,4):
for Survived_ in range(0,2):
age_list.append(df[(df.Pclass==Pclass_)&(df.Survived==Survived_)].Age.values)
#三个仓级的存亡年龄
fig,axes=plt.subplots(3,1,figsize=(10,6))
sns.set_style('darkgrid')#设置风格主题
#plt.ylim(0.0,0.06)
#print(axes)
print(len(age_list))
i_Pclass=1
for ax in axes:
if i_Pclass==1:
ax.set_ylim(0.0, 0.03)#设置y轴范围
sns.distplot(age_list[i_Pclass*2-2],hist=False,ax=ax,label='Pclass:%d,survived:0'%(i_Pclass),color='r')
sns.distplot(age_list[i_Pclass*2-1],hist=False,ax=ax,label='Pclass:%d,survived:1'%(i_Pclass),color='g')
i_Pclass+=1
ax.set_xlabel('age',size=15)
ax.legend(fontsize=15)


6


D:\Anaconda3\lib\site-packages\statsmodels\nonparametric\kde.py:454: RuntimeWarning: invalid value encountered in greater
X = X[np.logical_and(X>clip[0], X<clip[1])] # won't work for two columns.
D:\Anaconda3\lib\site-packages\statsmodels\nonparametric\kde.py:454: RuntimeWarning: invalid value encountered in less
X = X[np.logical_and(X>clip[0], X<clip[1])] # won't work for two columns.




In [33]:

#性别
print(df.Sex.value_counts())
print('******************************')
print(df.groupby('Sex')['Survived'].mean())#男女存活率


male      577
female    314
Name: Sex, dtype: int64
******************************
Sex
female    0.742038
male      0.188908
Name: Survived, dtype: float64


In [36]:

ax=plt.figure(figsize=(10,4)).add_subplot(111)
sns.violinplot(x='Sex',y='Age',hue='Survived',data=df.dropna(),split=True)#小提琴图
ax.set_xlabel('Sex',size=20)
ax.set_xticklabels(['Female','male'],size=18)
ax.set_ylabel('Age',size=20)
ax.legend(fontsize=25,loc='best')#男女存亡年龄分布


Out[36]:

<matplotlib.legend.Legend at 0x18f568b1f60>




In [42]:

label=[]
for sex_i in ['female','male']:
for pclass_i in range(1,4):
label.append('sex:%s,Pclass:%d'%(sex_i,pclass_i))

pos=range(6)
fig=plt.figure(figsize=(16,4))
ax=fig.add_subplot(111)
ax.bar(pos,df[df['Survived']==0].groupby(['Sex','Pclass'])['Survived'].count().values,
color='r',
alpha=0.5,
align='center',
tick_label=label,
label='dead')
ax.bar(pos,
df[df['Survived']==1].groupby(['Sex','Pclass'])['Survived'].count().values,
bottom=df[df['Survived']==0].groupby(['Sex','Pclass'])['Survived'].count().values,
color='g',
alpha=0.5,
align='center',
tick_label=label,
label='alive')
ax.tick_params(labelsize=15)
ax.set_title('sex_pclass_survived',size=30)
ax.legend(fontsize=15,loc='best')#相同性别情况下,仓级越高越容易存活


Out[42]:

<matplotlib.legend.Legend at 0x18f56eb40f0>




In [69]:

#Fare费用
fig=plt.figure(figsize=(8,6))
ax=plt.subplot2grid((2,2),(0,0),colspan=2)#角标
#fig,ax=plt.subplots(1,1,figsize=(8,6))
ax.tick_params(labelsize=15)
ax.set_title('Fare dist',size=20)
ax.set_ylabel('dist',size=20)
sns.kdeplot(df.Fare,ax=ax)
sns.distplot(df.Fare,hist=True,ax=ax)
ax.legend(fontsize=15)
pos=range(0,400,50)
ax.set_xticks(pos)
ax.set_xlim([0,200])
ax.set_xlabel('')

#fig,ax1=plt.subplots(1,1,figsize=(8,6))
ax1=plt.subplot2grid((2,2),(1,0),colspan=2)
ax1.set_title('Fare Pclass dist',size=20)
for i in range(1,4):
sns.kdeplot(df[df.Pclass==i].Fare,ax=ax1,label='Pclass %d'%(i))#不同仓级的票价分布
ax1.set_xlim([0,200])
ax1.set_ylim([0,0.15])
ax1.legend(fontsize=15)#船票价分布
plt.tight_layout()#间距松紧




In [70]:

fig=plt.figure(figsize=(8,3))
ax1=fig.add_subplot(111)
sns.kdeplot(df[df.Survived==0].Fare,ax=ax1,label='dead',color='r')
sns.kdeplot(df[df.Survived==1].Fare,ax=ax1,label='alive',color='g')
#sns.distplot(df[df.Survived==0].Fare,ax=ax1,color='r')
#sns.distplot(df[df.Survived==1].Fare,ax=ax1,color='g')
ax1.set_xlim([0,300])
ax1.legend(fontsize=15)
ax1.set_title('Fare survived',size=20)
ax1.set_xlabel('Fare',size=15)#存亡票价分布


Out[70]:

<matplotlib.text.Text at 0x18f5b446400>




In [73]:

fig=plt.figure(figsize=(8,4))
ax1=fig.add_subplot(211)
sns.countplot(df.SibSp)#计数
ax1.set_title('SibSp',size=20)
ax2=fig.add_subplot(212,sharex=ax1)
sns.countplot(df.Parch)
ax2.set_title('Parch',size=20)#表亲和直亲
#plt.tight_layout()


Out[73]:

<matplotlib.text.Text at 0x18f5ba0a2e8>




In [76]:

fig=plt.figure(figsize=(10,6))
ax1=fig.add_subplot(311)
df.groupby('SibSp')['Survived'].mean().plot(kind='bar',ax=ax1)#存活率
ax1.set_title('Sibsp Survived Rate',size=16)
ax1.set_xlabel('')

ax2=fig.add_subplot(312)
df.groupby('Parch')['Survived'].mean().plot(kind='bar',ax=ax2)
ax2.set_title('Parch Survived Rate',size=16)
ax2.set_xlabel('')

ax3=fig.add_subplot(313)
df.groupby(df.SibSp+df.Parch)['Survived'].mean().plot(kind='bar',ax=ax3)
ax3.set_title('Parch+Sibsp Survived Rate',size=16)
#plt.tight_layout()


Out[76]:

<matplotlib.text.Text at 0x18f5cf92d30>




In [85]:

#上船地点
plt.style.use('ggplot')#美化
ax=plt.figure(figsize=(8,3)).add_subplot(111)
pos=[1,2,3]
y1=df[df.Survived==0].groupby('Embarked')['Survived'].count().sort_index().values#确保存亡的一一对应
print(y1)
y2=df[df.Survived==1].groupby('Embarked')['Survived'].count().sort_index().values
ax.bar(pos,y1,color='r',alpha=0.4,align='center',label='dead')
ax.bar(pos,y2,color='g',alpha=0.4,align='center',label='alive',bottom=y1)
ax.set_xticks(pos)
ax.set_xticklabels(['C','Q','S'])
ax.legend(fontsize=15,loc='best')
ax.set_title('Embarked survived count',size=18)


[ 75  47 427]


Out[85]:

<matplotlib.text.Text at 0x18f5d3b8e10>




In [94]:

#C地存活概率较高
#不同的上船地点
ax=plt.figure(figsize=(8,3)).add_subplot(111)
ax.set_xlim([-20,80])
ax.set_ylim([0.0,0.03])
sns.kdeplot(df[df.Embarked=='C'].Age.fillna(-10),ax=ax,label='C',color='r')
sns.kdeplot(df[df.Embarked=='Q'].Age.fillna(-10),ax=ax,label='Q',color='b')
sns.kdeplot(df[df.Embarked=='S'].Age.fillna(-10),ax=ax,label='S',color='g')
ax.legend(fontsize=18)
ax.set_title('Embarked Age Dist',size=18)
#plt.tight_layout()
#Q上岸的年龄缺失比较多
#C和S上岸的年龄分布较相似,但是C的分布更扁平小孩和老人的占比更高


Out[94]:

<matplotlib.text.Text at 0x18f5da920b8>




In [103]:

#不同仓位不同地点
y1=df[df.Survived==0].groupby(['Embarked','Pclass'])['Survived'].count().reset_index()['Survived'].values
print(y1)
y2=df[df.Survived==1].groupby(['Embarked','Pclass'])['Survived'].count().reset_index()['Survived'].values

ax=plt.figure(figsize=(8,3)).add_subplot(111)
pos=range(9)
ax.bar(pos,y1,align='center',alpha=0.5,color='r',label='dead')
ax.bar(pos,y2,align='center',bottom=y1,alpha=0.5,color='g',label='alive')

ax.set_xticks(pos)
xticklabels=[]
for embarked_val in ['C','Q','S']:
for pclass_val in range(1,4):
xticklabels.append('%s/%d'%(embarked_val,pclass_val))

ax.set_xticklabels(xticklabels,size=15)
ax.legend(fontsize=15,loc='best')#C地的存活率似乎更高


[ 26   8  41   1   1  45  53  88 286]


Out[103]:

<matplotlib.legend.Legend at 0x18f5b53ecf8>




In [123]:

#Cabin船舱号
print(df['Cabin'].isnull().value_counts())
df.groupby(df['Cabin'].isnull())['Survived'].mean()
#船舱号为空的存活率低,可以作为一个特征


True     687
False    204
Name: Cabin, dtype: int64


Out[123]:

Cabin
False    0.666667
True     0.299854
Name: Survived, dtype: float64


In [148]:

print(df[df['PassengerId']==28]['Cabin'])
print(len(df.loc[27,'Cabin']))
df[df.Cabin.apply(lambda x:len(x) if (x is not np.nan) else 0)>4].head(10)#返回Cabin大于4个字符的(有多个船舱的)


27    C23 C25 C27
Name: Cabin, dtype: object
11


Out[148]:

PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
272801Fortune, Mr. Charles Alexandermale19.003219950263.0000C23 C25 C27S
757603Moen, Mr. Sigurd Hansenmale25.00003481237.6500F G73S
888911Fortune, Miss. Mabel Helenfemale23.003219950263.0000C23 C25 C27S
979811Greenfield, Mr. William Bertrammale23.0001PC 1775963.3583D10 D12C
11811901Baxter, Mr. Quigg Edmondmale24.0001PC 17558247.5208B58 B60C
12812913Peter, Miss. AnnafemaleNaN11266822.3583F E69C
29729801Allison, Miss. Helen Lorainefemale2.0012113781151.5500C22 C26S
29930011Baxter, Mrs. James (Helene DeLaudeniere Chaput)female50.0001PC 17558247.5208B58 B60C
30530611Allison, Master. Hudson Trevormale0.9212113781151.5500C22 C26S
31131211Ryerson, Miss. Emily Boriefemale18.0022PC 17608262.3750B57 B59 B63 B66C
In [149]:

#不同船舱的存亡统计
df['Cabin_Zone']=df.Cabin.fillna('0').str.split(' ').apply(lambda x: x[0][0])
df.groupby(by='Cabin_Zone')['Survived'].agg(['mean','count'])
#不同船舱的存亡率不一样


Out[149]:

meancount
Cabin_Zone
00.299854687
A0.46666715
B0.74468147
C0.59322059
D0.75757633
E0.75000032
F0.61538513
G0.5000004
T0.0000001
In [155]:

#船票Ticket
print(df.Ticket.head())
print(len(df.Ticket.unique()))#船票有重复的
df[df.Ticket=='110152']


0           A/5 21171
1            PC 17599
2    STON/O2. 3101282
3              113803
4              373450
Name: Ticket, dtype: object
681


Out[155]:

PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarkedCabin_Zone
25725811Cherry, Miss. Gladysfemale30.00011015286.5B77SB
50450511Maioni, Miss. Robertafemale16.00011015286.5B79SB
75976011Rothes, the Countess. of (Lucy Noel Martha Dye...female33.00011015286.5B77SB
In [165]:

#船票有重复的
print(df[df.Cabin=='B77'])

#有些船票有英文,有些则没有,使用正则!!!!!!
import re
def find_e_word(x):
pattern=re.compile('[a-z]|[A-Z]')
try:
re.search(pattern,x).group()
return 1
except:
return 0

df['Ticket_e']=df.Ticket.apply(lambda x: find_e_word(x))
df.groupby('Ticket_e')['Survived'].mean()
#存活率没区别


PassengerId  Survived  Pclass  \
257          258         1       1
759          760         1       1

Name     Sex   Age  SibSp  \
257                               Cherry, Miss. Gladys  female  30.0      0
759  Rothes, the Countess. of (Lucy Noel Martha Dye...  female  33.0      0

Parch  Ticket  Fare Cabin Embarked Cabin_Zone  Ticket_e
257      0  110152  86.5   B77        S          B         0
759      0  110152  86.5   B77        S          B         0


Out[165]:

Ticket_e
0 0.3842661 0.382609
Name: Survived, dtype: float64


In [174]:

#名字Name
print(df.Name.apply(lambda x: x.split(',')[1].split('.')[0]).value_counts())
df.Name.apply(lambda x: x.split(',')[1].split('.')[1]).value_counts()[:8]


 Mr              517
Miss 182
Mrs 125
Master 40
Dr 7
Rev 6Major 2
Col 2
Mlle 2
Capt 1
the Countess 1
Don 1
Ms 1
Lady 1
Mme 1
Jonkheer 1
Sir 1
Name: Name, dtype: int64


Out[174]:

 John             9
James 7
Mary 6William 6Ivan 4
Bertha 4
William Henry 4
William John 4
Name: Name, dtype: int64


In [ ]:

#--------------------------------


In [183]:

df.head()


Out[183]:

PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarkedCabin_ZoneTicket_e
0103Braund, Mr. Owen Harrismale22.010A/5 211717.2500NaNS01
1211Cumings, Mrs. John Bradley (Florence Briggs Th...female38.010PC 1759971.2833C85CC1
2313Heikkinen, Miss. Lainafemale26.000STON/O2. 31012827.9250NaNS01
3411Futrelle, Mrs. Jacques Heath (Lily May Peel)female35.01011380353.1000C123SC0
4503Allen, Mr. William Henrymale35.0003734508.0500NaNS00
In [177]:

#查看数据缺失情况
print(df.isnull().sum())
df_test.isnull().sum()


PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
Cabin_Zone       0
Ticket_e         0
dtype: int64


Out[177]:

PassengerId      0
Pclass 0
Name 0
Sex 0
Age 86SibSp 0
Parch 0
Ticket 0
Fare 1
Cabin 327
Embarked 0
dtype: int64


In [179]:

df[df['Embarked'].isnull()]


Out[179]:

PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarkedCabin_ZoneTicket_e
616211Icard, Miss. Ameliefemale38.00011357280.0B28NaNB0
82983011Stone, Mrs. George Nelson (Martha Evelyn)female62.00011357280.0B28NaNB0
In [181]:

print(df['Embarked'].value_counts())
print(df[df['Pclass']==1].Embarked.value_counts())
df.Embarked.fillna('S',inplace=True)
#上船地点填充


S    644
C    168
Q     77
Name: Embarked, dtype: int64


Out[181]:

S    127
C     85
Q      2
Name: Embarked, dtype: int64


In [234]:

#Cabin缺失值的处理,方法一
df['Cabin_e']=df['Cabin'].isnull().map({True:0,False:1})
df_test['Cabin_e']=df_test['Cabin'].isnull().map({True:0,False:1})
#df=df.drop(['Cabin_e'],axis=1)

#df['Cabin_e']=df['Cabin'].isnull().map(lambda x:0 if x is True else 1)方法二!!!!!!
#df_test['Cabin_e']
#df.head()#df=df.drop(['Cabin_e'],axis=1)

"""方法三
import re
def Cabin_isnull(x):
pattern=re.compile("\d$")
try:
re.search(pattern,x).group()
return 1
except:
return 0
df['Cabin_e']=df['Cabin'].apply(lambda x: Cabin_isnull(x))
df.head()"""
df_test.head()


Out[234]:

PassengerIdPclassNameSexAgeSibSpParchTicketFareCabinEmbarkedCabin_e
08923Kelly, Mr. Jamesmale34.5003309117.8292NaNQ0
18933Wilkes, Mrs. James (Ellen Needs)female47.0103632727.0000NaNS0
28942Myles, Mr. Thomas Francismale62.0002402769.6875NaNQ0
38953Wirz, Mr. Albertmale27.0003151548.6625NaNS0
48963Hirvonen, Mrs. Alexander (Helga E Lindqvist)female22.011310129812.2875NaNS0
In [8]:

df['Gender']=5
df['Gender']=df['Sex'].map(lambda x:x[0].upper())
print(df.head())
df['Gender']=df['Sex'].map({'male':1,'female':0}) #并行化执行
df.head()


PassengerId  Survived  Pclass  \
0            1         0       3
1            2         1       1
2            3         1       3
3            4         1       1
4            5         0       3

Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1
2                             Heikkinen, Miss. Laina  female  26.0      0
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1
4                           Allen, Mr. William Henry    male  35.0      0

Parch            Ticket     Fare Cabin Embarked Gender
0      0         A/5 21171   7.2500   NaN        S      M
1      0          PC 17599  71.2833   C85        C      F
2      0  STON/O2. 3101282   7.9250   NaN        S      F
3      0            113803  53.1000  C123        S      F
4      0            373450   8.0500   NaN        S      M


Out[8]:

PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarkedGender
0103Braund, Mr. Owen Harrismale22.010A/5 211717.2500NaNS1
1211Cumings, Mrs. John Bradley (Florence Briggs Th...female38.010PC 1759971.2833C85C0
2313Heikkinen, Miss. Lainafemale26.000STON/O2. 31012827.9250NaNS0
3411Futrelle, Mrs. Jacques Heath (Lily May Peel)female35.01011380353.1000C123S0
4503Allen, Mr. William Henrymale35.0003734508.0500NaNS1
In [236]:

#Age年龄离散化,以5岁为一个周期
def age_map(x):
if x<10:
return '10-'
if x<60:
return '%d-%d'%(x//5*5,x//5*5+5)
elif x>=60:
return '60+'
else:
return 'Null'
df['Age_map']=df['Age'].apply(lambda x: age_map(x))
df_test['Age_map']=df_test['Age'].apply(lambda x:age_map(x))
df.groupby('Age_map')['Survived'].agg(['count','mean'])#不同年龄层的存亡情况


Out[236]:

countmean
Age_map
10-620.612903
10-15160.437500
15-20860.395349
20-251140.342105
25-301060.358491
30-35950.421053
35-40720.458333
40-45480.375000
45-50410.390244
50-55320.437500
55-60160.375000
60+260.269231
Null1770.293785
In [247]:

#test中的Fare缺失
print(df_test[df_test['Fare'].isnull()])
df_test.loc[df_test.Fare.isnull(),'Fare']=df_test[(df_test['Pclass']==3)&(df_test['Embarked']=='S')&(df_test['Sex']=='male')].dropna().Fare.mean()
df_test[df_test['PassengerId']==1044]


Empty DataFrame
Columns: [PassengerId, Pclass, Name, Sex, Age, SibSp, Parch, Ticket, Fare, Cabin, Embarked, Cabin_e, Age_map]
Index: []


Out[247]:

PassengerIdPclassNameSexAgeSibSpParchTicketFareCabinEmbarkedCabin_eAge_map
15210443Storey, Mr. Thomasmale60.50037017.65NaNS060+
In [266]:

#数据归一化以加速模型收敛,Fare分布太宽
import sklearn.preprocessing as preprocessing
scaler=preprocessing.StandardScaler()
fare_scale_param=scaler.fit(df['Fare'].values.reshape(-1,1))

df['Fare_e']=fare_scale_param.transform(df['Fare'].values.reshape(-1,1))
df_test['Fare_e']=fare_scale_param.transform(df_test['Fare'].values.reshape(-1,1))
print(df.head())
df_test.head()


PassengerId  Survived  Pclass  \
0            1         0       3
1            2         1       1
2            3         1       3
3            4         1       1
4            5         0       3

Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1
2                             Heikkinen, Miss. Laina  female  26.0      0
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1
4                           Allen, Mr. William Henry    male  35.0      0

Parch            Ticket     Fare Cabin Embarked Cabin_Zone  Ticket_e  \
0      0         A/5 21171   7.2500   NaN        S          0         1
1      0          PC 17599  71.2833   C85        C          C         1
2      0  STON/O2. 3101282   7.9250   NaN        S          0         1
3      0            113803  53.1000  C123        S          C         0
4      0            373450   8.0500   NaN        S          0         0

Cabin_e Age_map    Fare_e
0        0   20-25 -0.502445
1        1   35-40  0.786845
2        0   25-30 -0.488854
3        1   35-40  0.420730
4        0   35-40 -0.486337


Out[266]:

PassengerIdPclassNameSexAgeSibSpParchTicketFareCabinEmbarkedCabin_eAge_mapFare_e
08923Kelly, Mr. Jamesmale34.5003309117.8292NaNQ030-35-0.490783
18933Wilkes, Mrs. James (Ellen Needs)female47.0103632727.0000NaNS045-50-0.507479
28942Myles, Mr. Thomas Francismale62.0002402769.6875NaNQ060+-0.453367
38953Wirz, Mr. Albertmale27.0003151548.6625NaNS025-30-0.474005
48963Hirvonen, Mrs. Alexander (Helga E Lindqvist)female22.011310129812.2875NaNS020-25-0.401017
In [267]:

#部分变量哑编码,onehot独热码
df_x = pd.concat([df[['SibSp','Parch','Fare']], pd.get_dummies(df[['Pclass','Sex','Cabin','Embarked','Age_map']])],axis=1)#按照列黏连
df_y = df.Survived
df_test_x = pd.concat([df_test[['SibSp','Parch','Fare']], pd.get_dummies(df_test[['Pclass', 'Sex','Cabin','Embarked', 'Age_map']])],axis=1)
print(df_x.head())
df_test_x.head()


SibSp  Parch     Fare  Pclass  Sex_female  Sex_male  Cabin_A10  Cabin_A14  \
0      1      0   7.2500       3           0         1          0          0
1      1      0  71.2833       1           1         0          0          0
2      0      0   7.9250       3           1         0          0          0
3      1      0  53.1000       1           1         0          0          0
4      0      0   8.0500       3           0         1          0          0

Cabin_A16  Cabin_A19      ...       Age_map_20-25  Age_map_25-30  \
0          0          0      ...                   1              0
1          0          0      ...                   0              0
2          0          0      ...                   0              1
3          0          0      ...                   0              0
4          0          0      ...                   0              0

Age_map_30-35  Age_map_35-40  Age_map_40-45  Age_map_45-50  Age_map_50-55  \
0              0              0              0              0              0
1              0              1              0              0              0
2              0              0              0              0              0
3              0              1              0              0              0
4              0              1              0              0              0

Age_map_55-60  Age_map_60+  Age_map_Null
0              0            0             0
1              0            0             0
2              0            0             0
3              0            0             0
4              0            0             0

[5 rows x 169 columns]


Out[267]:

SibSpParchFarePclassSex_femaleSex_maleCabin_A11Cabin_A18Cabin_A21Cabin_A29...Age_map_20-25Age_map_25-30Age_map_30-35Age_map_35-40Age_map_40-45Age_map_45-50Age_map_50-55Age_map_55-60Age_map_60+Age_map_Null
0007.82923010000...0010000000
1107.00003100000...0000010000
2009.68752010000...0000000010
3008.66253010000...0100000000
41112.28753100000...1000000000
5 rows × 98 columns

In [ ]:

#缺失年龄填补


In [9]:

M=np.random.randn(2,3)      #i代表性别 j代表仓等级
for i in range(2):
for j in range(3):
M[i][j]=df[(df['Pclass']==j+1) & (df['Gender']==i)]['Age'].median()     #取中位数
M


Out[9]:

array([[ 35. ,  28. ,  21.5],
[ 40. ,  30. ,  25. ]])


In [10]:

df['AgeFill']=df['Age']
print(len(df[df['Age'].isnull()][['Gender','Pclass','Age','AgeFill']]))
print(df[df['Age'].isnull()][['Gender','Pclass','Age','AgeFill']].head())
for i in range(2):
for j in range(3):
df.loc[df[(df['Pclass']==j+1) & (df['Gender']==i) & (df['Age'].isnull())].index,['AgeFill']]=M[i][j]    #.index取索引值
df[df['Age'].isnull()][['Gender','Pclass','Age','AgeFill']].head()


177
Gender  Pclass  Age  AgeFill
5        1       3  NaN      NaN
17       1       2  NaN      NaN
19       0       3  NaN      NaN
26       1       3  NaN      NaN
28       0       3  NaN      NaN


Out[10]:

GenderPclassAgeAgeFill
513NaN25.0
1712NaN30.0
1903NaN21.5
2613NaN25.0
2803NaN21.5
In [11]:

#--------------特征工程
df['familysize']=df['SibSp']+df['Parch']
df['Pclass*AgeFill']=df['Pclass']*df['AgeFill']


In [12]:

for i in df.dtypes:
print (i)


int64
int64
int64
object
object
float64
int64
int64
object
float64
object
object
int64
float64
int64
float64


In [13]:

#df=df.drop(['Pclass*Age'],axis=1)  丢弃一列
print(df.dtypes)
df.dtypes[df.dtypes.map(lambda x: x=='object')]


PassengerId         int64
Survived            int64
Pclass              int64
Name               object
Sex                object
Age               float64
SibSp               int64
Parch               int64
Ticket             object
Fare              float64
Cabin              object
Embarked           object
Gender              int64
AgeFill           float64
familysize          int64
Pclass*AgeFill    float64
dtype: object


Out[13]:

Name        object
Sex         object
Ticket      object
Cabin       object
Embarked    object
dtype: object


In [14]:

df=df.drop(['Name','Sex','Age','Ticket','Cabin','Embarked'],axis=1)
df.head()


Out[14]:

PassengerIdSurvivedPclassSibSpParchFareGenderAgeFillfamilysizePclass*AgeFill
0103107.2500122.0166.0
12111071.2833038.0138.0
2313007.9250026.0078.0
34111053.1000035.0135.0
4503008.0500135.00105.0
In [15]:

from sklearn.cross_validation import train_test_split
from sklearn.linear_model.logistic import LogisticRegression
from sklearn import preprocessing
import seaborn as sns
feature_names=['Pclass','SibSp','Parch','Fare','Gender','AgeFill','familysize','Pclass*AgeFill']
X=df[feature_names]
Y=df['Survived']
X_train,X_test,y_train,y_test=train_test_split(X,Y,test_size=0.3,random_state=0) #7:3拆分
lr_model=LogisticRegression()
lr_model.fit(X_train,y_train)
y_pred_score=lr_model.predict_proba(X_test)
y_pred_score[:10]


D:\Anaconda3\lib\site-packages\sklearn\cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
"This module will be removed in 0.20.", DeprecationWarning)


Out[15]:

array([[ 0.857523  ,  0.142477  ],
[ 0.86057444,  0.13942556],
[ 0.9224654 ,  0.0775346 ],
[ 0.10549417,  0.89450583],
[ 0.41466724,  0.58533276],
[ 0.59148554,  0.40851446],
[ 0.0714005 ,  0.9285995 ],
[ 0.06661603,  0.93338397],
[ 0.6030931 ,  0.3969069 ],
[ 0.28987151,  0.71012849]])


In [16]:

#from sklearn.metrics import roc_curve
import sklearn
import matplotlib.pyplot as plt
fpr,tpr,thresholds=sklearn.metrics.roc_curve(y_test,y_pred_score[:,1])#注意阈值
roc_auc=sklearn.metrics.auc(fpr,tpr)
plt.title('Receiver Operating Characteristic')
plt.plot(fpr,tpr,'b',label='AUC = %0.2f'%roc_auc)
plt.legend(loc='lower right')
plt.plot([0,1],[0,1],'r--')
plt.xlim([-0.1,1.0])
plt.ylim([-0.1,1.01])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()


D:\Anaconda3\lib\site-packages\matplotlib\legend.py:326: UserWarning: Unrecognized location "lower right". Falling back on "best"; valid locations are
best
upper right
upper left
lower left
lower right
right
center left
center right
lower center
upper center
center

six.iterkeys(self.codes))))




In [17]:

df=df.dropna()    #有空值存在的所有行
train_data=df.values
train_data


Out[17]:

array([[   1. ,    0. ,    3. , ...,   22. ,    1. ,   66. ],
[   2. ,    1. ,    1. , ...,   38. ,    1. ,   38. ],
[   3. ,    1. ,    3. , ...,   26. ,    0. ,   78. ],
...,
[ 889. ,    0. ,    3. , ...,   21.5,    3. ,   64.5],
[ 890. ,    1. ,    1. , ...,   26. ,    0. ,   26. ],
[ 891. ,    0. ,    3. , ...,   32. ,    0. ,   96. ]])


In [18]:

#使用网格搜索最佳模型参数!!!!!!  X_train,X_test,y_train,y_test
from sklearn.model_selection import GridSearchCV
base_line_model = LogisticRegression()
param = {'penalty':['l1','l2'],
'C':[0.1, 0.5, 1.0,5.0]}
grd = GridSearchCV(estimator=base_line_model, param_grid=param, cv=5, n_jobs=3)
grd.fit(X_train,y_train)
grd.best_estimator_


Out[18]:

LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
verbose=0, warm_start=False)


In [27]:

from sklearn.model_selection import learning_curve
from sklearn.utils import shuffle
X_train, y_train = shuffle(X_train, y_train)
def plot_learning_curve(clf, title, X, y, ylim=None, cv=None, n_jobs=3, train_sizes=np.linspace(.05, 1., 5)):
train_sizes, train_scores, test_scores = learning_curve(
clf, X, y, train_sizes=train_sizes)
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)

ax = plt.figure().add_subplot(111)
ax.set_title(title)
if ylim is not None:
ax.ylim(*ylim)
ax.set_xlabel(u"train_num_of_samples")
ax.set_ylabel(u"score")

ax.fill_between(train_sizes, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std,
alpha=0.1, color="b")
ax.fill_between(train_sizes, test_scores_mean - test_scores_std, test_scores_mean + test_scores_std,
alpha=0.1, color="r")
ax.plot(train_sizes, train_scores_mean, 'o-', color="b", label=u"train score")
ax.plot(train_sizes, test_scores_mean, 'o-', color="r", label=u"testCV score")

ax.legend(loc="best")

midpoint = ((train_scores_mean[-1] + train_scores_std[-1]) + (test_scores_mean[-1] - test_scores_std[-1])) / 2
diff = (train_scores_mean[-1] + train_scores_std[-1]) - (test_scores_mean[-1] - test_scores_std[-1])
return midpoint, diff

plot_learning_curve(grd, u"learning_rate", X_train, y_train)


D:\Anaconda3\lib\site-packages\sklearn\model_selection\_split.py:597: Warning: The least populated class in y has only 2 members, which is too few. The minimum number of members in any class cannot be less than n_splits=5.
% (min_groups, self.n_splits)), Warning)


Out[27]:

(0.79962671354141879, 0.069011468383558872)


In [28]:

plt.show()




In [44]:

#from sklearn.metrics import roc_curve
import sklearn
import matplotlib.pyplot as plt
fpr,tpr,thresholds=sklearn.metrics.roc_curve(y_test,grd.predict_proba(X_test)[:,1],pos_label=1)#grd可以预测
roc_auc=sklearn.metrics.auc(fpr,tpr)
plt.title('Receiver Operating Characteristic')
plt.plot(fpr,tpr,'b',label='AUC = %0.2f'%roc_auc)
plt.legend(loc='lower right')
plt.plot([0,1],[0,1],'r--')
plt.xlim([-0.1,1.0])
plt.ylim([-0.1,1.01])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()


D:\Anaconda3\lib\site-packages\matplotlib\legend.py:326: UserWarning: Unrecognized location "lower right". Falling back on "best"; valid locations are
best
upper right
upper left
lower left
lower right
right
center left
center right
lower center
upper center
center

six.iterkeys(self.codes))))




In [45]:

#使用grd模型生成预测结果 并存入CSV
#df_test=pd.read_csv('D:/In/kaggle/Titanic/test.csv')
gender_submission = pd.DataFrame({'PassengerId':X_test.index,'Survived':grd.predict(X_test)})
gender_submission.to_csv('C://Users//zhangshuai_lc//submission_first.csv', index=None)
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息