您的位置:首页 > 其它

Titanic: Machine Learning from Disaster

2017-02-06 13:33 337 查看
我使用逻辑回归模型做分类,只使用了 7 个特征。很显然分类效果很差,准确率只有 43.54%。先附上自己的渣代码,后续再优化。

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from pydoc import describe
from plot_test import weight

def sigmoid(inX):
    """Return the logistic function 1 / (1 + exp(-inX)), element-wise.

    Args:
        inX: A scalar, sequence, ndarray or np.matrix of numbers. Values are
            converted to float, so object-dtype input holding numbers works.

    Returns:
        The logistic values, each in [0, 1], with the shape of the input.
        np.matrix input yields an np.matrix result.
    """
    # asanyarray (not asarray) keeps the np.matrix subclass, so callers that
    # rely on `*` meaning matrix product keep working; it also accepts plain
    # scalars and lists, which have no .astype method.
    inX = np.asanyarray(inX, dtype=float)
    # log(1 + exp(-x)) evaluated through logaddexp does not overflow for
    # large |x|, unlike a direct np.exp(-x).
    return np.exp(-np.logaddexp(0.0, -inX))

def gradAscent(dataMat, classLabel, alpha=0.001, maxCycles=500):
    """Fit logistic-regression weights by batch gradient ascent.

    Args:
        dataMat: (m, n) feature matrix (ndarray or np.matrix); any intercept
            column must already be included by the caller.
        classLabel: m labels, given as a column, a row or a flat sequence.
        alpha: Step size applied to every update.
        maxCycles: Number of full-batch updates to perform.

    Returns:
        An (n, 1) np.matrix of weights, initialised to ones and updated
        `maxCycles` times.
    """
    # Work on plain float ndarrays with explicit .dot so the result does not
    # depend on the inputs being np.matrix (where `*` is a matrix product).
    features = np.asarray(dataMat, dtype=float)
    # Force a column vector: a flat label array would otherwise broadcast
    # against the (m, 1) predictions into an (m, m) error term.
    labels = np.asarray(classLabel, dtype=float).reshape(-1, 1)
    weights = np.ones((features.shape[1], 1))
    for _ in range(maxCycles):
        predicted = sigmoid(features.dot(weights))
        error = labels - predicted
        weights = weights + alpha * features.T.dot(error)
    # Callers multiply the result with `*`, so hand back an np.matrix.
    return np.asmatrix(weights)

def _buildFeatureMatrix(frame):
    """Encode one passenger DataFrame as a float matrix with an intercept."""
    frame = frame.copy()
    # Missing numeric values are replaced by the column median of this frame.
    frame['Age'] = frame['Age'].fillna(frame['Age'].median())
    frame['Fare'] = frame['Fare'].fillna(frame['Fare'].median())
    frame['Sex'] = frame['Sex'].map({'male': 1, 'female': 0})
    # Missing ports default to 'S' before the integer encoding.
    frame['Embarked'] = frame['Embarked'].fillna('S').map(
        {'S': 0, 'C': 1, 'Q': 2})
    columns = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
    features = frame[columns].values.astype(float)
    intercept = np.ones((features.shape[0], 1))
    return np.asmatrix(np.hstack((intercept, features)))


def getData(train, test):
    """Load the train and test CSV files and encode them as matrices.

    Both files get the same treatment: missing Age and Fare are filled with
    that file's own median, Sex becomes 1 (male) / 0 (female), and Embarked
    becomes 0 (S) / 1 (C) / 2 (Q) with missing values treated as 'S'.

    Args:
        train: Path or file-like object for the training CSV. Must contain
            Survived plus the seven feature columns.
        test: Path or file-like object for the test CSV with the same seven
            feature columns.

    Returns:
        A tuple (data_mat, test_mat, label_mat) of np.matrix objects:
        data_mat is (m, 8) and test_mat is (m2, 8), each with a leading
        column of ones followed by Pclass, Sex, Age, SibSp, Parch, Fare and
        Embarked; label_mat is the (m, 1) Survived column.
    """
    train_frame = pd.read_csv(train, header=0)
    test_frame = pd.read_csv(test, header=0)

    label_mat = np.asmatrix(train_frame['Survived'].values).T
    data_mat = _buildFeatureMatrix(train_frame)
    test_mat = _buildFeatureMatrix(test_frame)
    return data_mat, test_mat, label_mat

# Train on train.csv, then write one "PassengerId,Survived" line per test row
# to re.csv.
data, test, label = getData('train.csv', 'test.csv')
weights = gradAscent(data, label)

# Score every test row with a single matrix product instead of row by row.
probabilities = np.asarray(sigmoid(test * weights), dtype=float).ravel()

# The test rows are assumed to carry consecutive PassengerIds starting here.
first_passenger_id = 892

# `with` closes the file even if a write fails part-way through.
with open('re.csv', 'w') as f:
    for k, probability in enumerate(probabilities):
        print(k)
        # Threshold at 0.5; a bare int() would truncate every probability
        # below 1.0 down to 0.
        t = 1 if probability > 0.5 else 0
        f.write(str(first_passenger_id + k) + ',' + str(t) + '\n')
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签:  机器学习