您的位置:首页 > 编程语言 > Python开发

一、机器学习系统设计笔记之python机器学习入门

2015-10-10 11:24 711 查看
一,学习Numpy

如果用 from numpy import * 的方式导入,numpy.array 会遮挡 Python 标准库自带的 array 模块,所以应该使用 import numpy as np 的方式导入:

>>> import numpy
>>> numpy.version.full_version
'1.9.3'
>>> import numpy as np
>>> a = np.array([0,1,2,3,4,5])
>>> a
array([0, 1, 2, 3, 4, 5])
>>> a.ndim
1
>>> a.shape
(6,)


将这个数组转换到二维矩阵:

>>> b = a.reshape((3,2))
>>> b
array([[0, 1],
[2, 3],
[4, 5]])
>>> b.ndim
2
>>> b.shape
(3, 2)
>>>


numpy在所有可能之处都避免复制操作,如:

>>> b[1][0]=100
>>> b
array([[  0,   1],
[100,   3],
[  4,   5]])
>>> a
array([  0,   1, 100,   3,   4,   5])


如果需要真正的副本,应该这样:

>>> c =a.reshape((3,2)).copy()
>>> c
array([[  0,   1],
[100,   3],
[  4,   5]])
>>> c[0][0]=100
>>> c
array([[100,   1],
[100,   3],
[  4,   5]])
>>> a
array([  0,   1, 100,   3,   4,   5])
>>> b
array([[  0,   1],
[100,   3],
[  4,   5]])
>>>


numpy对数组的操作传递到每个元素上:

>>> a*2
array([  0,   2, 200,   6,   8,  10])
>>> a**2
array([    0,     1, 10000,     9,    16,    25])
>>>


1、索引:
除了正常的列表索引,numpy还允许将数组当做索引使用

>>> a[np.array([2,3,4])]
array([100,   3,   4])
>>> a>4
array([False, False,  True, False, False,  True], dtype=bool)
>>> a[a>4]=4
>>> a
array([0, 1, 4, 3, 4, 4])
>>>


鉴于经常修剪异常值,可以用专门的函数处理:

>>> a
array([0, 1, 4, 3, 4, 4])
>>> a.clip(0,3)
array([0, 1, 3, 3, 3, 3])
>>>


2、处理不存在的值:

用 numpy.NAN 标记的元素表示它不是一个真实的数值(即缺失值):

>>> c=np.array([1,2,np.NAN,3,4])
>>> c
array([  1.,   2.,  nan,   3.,   4.])
>>> np.isnan(c)
array([False, False,  True, False, False], dtype=bool)
>>> c[~np.isnan(c)]
array([ 1.,  2.,  3.,  4.])
>>> np.mean(c[~np.isnan(c)])
2.5
>>>


3、运行时间比较

import timeit

# Every statement below is executed this many times by timeit.
RUNS = 1000
# Shared setup for both NumPy variants, so array creation is not timed.
NP_SETUP = "import numpy as np;na=np.arange(1000)"

# Pure Python: sum of squares through a generator expression.
normal_py_sec = timeit.timeit("sum(x*x for x in range(1000))", number=RUNS)
# NumPy element-wise multiply, but reduced with Python's built-in sum().
naive_np_sec = timeit.timeit("sum(na*na)", setup=NP_SETUP, number=RUNS)
# The same quantity computed with a single NumPy dot product.
good_np_sec = timeit.timeit("na.dot(na)", setup=NP_SETUP, number=RUNS)

print("Normal Python :%f sec" % normal_py_sec)
print("Naive Numpy:%f sec" % naive_np_sec)
print("Good Numpy:%f sec" % good_np_sec)


结果如下:

Normal Python :0.101323 sec
Naive Numpy:0.083080 sec
Good Numpy:0.001787 sec

二、实例应用

1、读取数据

使用 genfromtxt() 很容易读取数据(该函数由 NumPy 提供,旧版 SciPy 只是以 sp.genfromtxt 的别名形式导出它)

import numpy as np
import scipy as sp

# Load the tab-separated traffic log from the current working directory.
# genfromtxt is a NumPy function; the old sp.genfromtxt alias is deprecated.
# The file name is "web_traffic.tsv" (the underscore before "tsv" was a typo).
data = np.genfromtxt("web_traffic.tsv", delimiter="\t")
2、预处理和数据清洗

1)、获取x、y方向坐标

# Column 0 holds the x coordinates, column 1 the y coordinates.
x, y = data[:, 0], data[:, 1]


2)、清除无效数据

查看无效值的数量

import numpy as np

# Count the NaN entries in y: isnan() yields booleans and True sums as 1.
np.sum(np.isnan(y))


3)、画出数据

def showData():
    """Load web_traffic.tsv, drop rows with a missing y value, and plot it.

    Reads the tab-separated file from the current working directory, shows a
    scatter plot of column 1 against column 0 with one x tick per week, and
    finally prints the raw (unfiltered) data array.
    """
    # Imported locally so that this snippet runs on its own.
    import matplotlib.pyplot as plt
    import numpy as np

    data = np.genfromtxt("web_traffic.tsv", delimiter="\t")
    x = data[:, 0]
    y = data[:, 1]

    # Build the mask once from the unfiltered y so x and y stay aligned.
    valid = ~np.isnan(y)
    x = x[valid]
    y = y[valid]

    plt.scatter(x, y)
    plt.title("web traffic over the last month")
    plt.xlabel("Time")
    plt.ylabel("Hits/hour")
    # One tick every 7 * 24 x-units, i.e. one per week if x counts hours.
    plt.xticks([w * 7 * 24 for w in range(10)],
               ['week %i' % w for w in range(10)])
    plt.autoscale(tight=True)
    plt.grid()
    plt.show()
    print(data)



3、选择正确的模型和算法

1)、近似误差

def error(f, x, y):
    """Return the sum of squared differences between ``f(x)`` and ``y``.

    Args:
        f: Callable model; it is invoked once as ``f(x)``.
        x: Input value(s) handed to the model.
        y: Observed value(s) the prediction is compared against.
    """
    # Imported locally so that this snippet runs on its own.
    import numpy as np

    return np.sum((f(x) - y) ** 2)


2)、一阶模型

import scipy as  sp
import numpy as np
import matplotlib.pyplot as plt

def showData():
    """Plot the web traffic data together with a straight-line (degree 1) fit.

    Reads web_traffic.tsv from the current working directory, drops rows with
    a missing y value, fits a first-order polynomial, draws the fitted line
    and the scatter plot, shows the figure, and prints the raw data array.
    """
    data = np.genfromtxt("web_traffic.tsv", delimiter="\t")
    x = data[:, 0]
    y = data[:, 1]

    # Build the mask once from the unfiltered y so x and y stay aligned.
    valid = ~np.isnan(y)
    x = x[valid]
    y = y[valid]

    # Only the coefficients are used, so the full=True diagnostics are dropped.
    fp1 = np.polyfit(x, y, 1)
    print("Model parameters %s" % fp1)
    f1 = np.poly1d(fp1)

    # Evaluate the model on a dense grid from 0 to the last x value.
    fx = np.linspace(0, x[-1], 1000)
    plt.plot(fx, f1(fx), linewidth=4)
    plt.legend(["d=%i" % f1.order], loc="upper left")

    plt.scatter(x, y)
    plt.title("web traffic over the last month")
    plt.xlabel("Time")
    plt.ylabel("Hits/hour")
    # One tick every 7 * 24 x-units, i.e. one per week if x counts hours.
    plt.xticks([w * 7 * 24 for w in range(10)],
               ['week %i' % w for w in range(10)])
    plt.autoscale(tight=True)
    plt.grid()
    plt.show()
    print(data)



3)、二阶模型

import scipy as  sp
import numpy as np
import matplotlib.pyplot as plt

def showData():
    """Plot the web traffic data with both a degree-1 and a degree-2 fit.

    Reads web_traffic.tsv from the current working directory, drops rows with
    a missing y value, fits first- and second-order polynomials, draws both
    curves plus the scatter plot, shows the figure, and prints the raw data.
    """
    data = np.genfromtxt("web_traffic.tsv", delimiter="\t")
    x = data[:, 0]
    y = data[:, 1]

    # Build the mask once from the unfiltered y so x and y stay aligned.
    valid = ~np.isnan(y)
    x = x[valid]
    y = y[valid]

    # Evaluate both models on a dense grid from 0 to the last x value.
    fx = np.linspace(0, x[-1], 1000)

    # Only the coefficients are used, so the full=True diagnostics are dropped.
    fp1 = np.polyfit(x, y, 1)
    print("Model parameters %s" % fp1)
    f1 = np.poly1d(fp1)
    plt.plot(fx, f1(fx), linewidth=4)

    f2p = np.polyfit(x, y, 2)
    f2 = np.poly1d(f2p)
    plt.plot(fx, f2(fx), linewidth=4)
    print(f2.order)

    # A single legend call with one label per curve, in plotting order.
    # Labels must be a list: a bare string would be split into characters.
    plt.legend(["d=%i" % f1.order, "d=%i" % f2.order], loc="upper left")

    plt.scatter(x, y)
    plt.title("web traffic over the last month")
    plt.xlabel("Time")
    plt.ylabel("Hits/hour")
    # One tick every 7 * 24 x-units, i.e. one per week if x counts hours.
    plt.xticks([w * 7 * 24 for w in range(10)],
               ['week %i' % w for w in range(10)])
    plt.autoscale(tight=True)
    plt.grid()
    plt.show()
    print(data)




4)、组合模型

# -*- coding: utf-8 -*-

import os

import matplotlib.pyplot as plt
import numpy as np
import scipy as sp

# The data file is looked up in a "data" directory next to this script.
data_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), "data")
data = np.genfromtxt(os.path.join(data_dir, "web_traffic.tsv"), delimiter="\t")
print(data[:10])

# Line colors and styles; plot_models() pairs them with the models in order.
colors = ['g', 'k', 'b', 'm', 'r']
linestyles = ['-', '-.', '--', ':', '-']

x = data[:, 0]
y = data[:, 1]
print("Number of invalid entries:", np.sum(np.isnan(y)))

# Drop every row whose y value is NaN; the mask comes from the unfiltered y
# so that x and y stay aligned.
valid = ~np.isnan(y)
x = x[valid]
y = y[valid]

# plot input data

def plot_models(x, y, models, fname, mx=None, ymax=None, xmin=None):
    """Scatter-plot the data, overlay the given models, and save the figure.

    Args:
        x, y: Data points drawn as a scatter plot.
        models: Sequence of callables with an ``order`` attribute (such as
            ``poly1d`` objects), or a falsy value to plot the data alone.
            Each model is paired with an entry of the module-level
            ``linestyles`` and ``colors`` lists, so at most that many
            curves are drawn.
        fname: Path the figure is saved to.
        mx: x values at which the models are evaluated; defaults to 1000
            evenly spaced points between 0 and ``x[-1]``.
        ymax: Upper y-axis limit; ignored when falsy.
        xmin: Lower x-axis limit; ignored when falsy.
    """
    # Start from an empty figure so successive calls do not pile up.
    plt.clf()
    plt.scatter(x, y, s=10)
    plt.title("Web traffic over the last month")
    plt.xlabel("Time")
    plt.ylabel("Hits/hour")
    # One tick every 7 * 24 x-units, i.e. one per week if x counts hours.
    plt.xticks(
        [w * 7 * 24 for w in range(10)], ['week %i' % w for w in range(10)])

    if models:
        if mx is None:
            mx = np.linspace(0, x[-1], 1000)
        for model, style, color in zip(models, linestyles, colors):
            plt.plot(mx, model(mx), linestyle=style, linewidth=2, c=color)

        # Must stay inside the guard: models may be None (data-only plot).
        plt.legend(["d=%i" % m.order for m in models], loc="upper left")

    plt.autoscale(tight=True)
    plt.ylim(ymin=0)
    if ymax:
        plt.ylim(ymax=ymax)
    if xmin:
        plt.xlim(xmin=xmin)
    plt.grid(True, linestyle='-', color='0.75')
    plt.savefig(fname)

# first look at the data
plot_models(x, y, None, os.path.join("..", "1400_01_01.png"))

# create and plot models
# full=True additionally returns fit diagnostics; res is reported below.
fp1, res, rank, sv, rcond = np.polyfit(x, y, 1, full=True)
print("Model parameters: %s" % fp1)
print("Error of the model:", res)
f1 = np.poly1d(fp1)
f2 = np.poly1d(np.polyfit(x, y, 2))
f3 = np.poly1d(np.polyfit(x, y, 3))
f10 = np.poly1d(np.polyfit(x, y, 10))
f100 = np.poly1d(np.polyfit(x, y, 100))

plot_models(x, y, [f1], os.path.join("..", "1400_01_02.png"))
plot_models(x, y, [f1, f2], os.path.join("..", "1400_01_03.png"))
plot_models(
    x, y, [f1, f2, f3, f10, f100], os.path.join("..", "1400_01_04.png"))

# fit and plot a model using the knowledge about inflection point
# 3.5 * 7 * 24 evaluates to the float 588.0, and slice indices must be
# integers, so the split position is converted explicitly.
inflection = int(3.5 * 7 * 24)
xa = x[:inflection]
ya = y[:inflection]
xb = x[inflection:]
yb = y[inflection:]

# One straight line for the data before the split and one for the data after.
fa = np.poly1d(np.polyfit(xa, ya, 1))
fb = np.poly1d(np.polyfit(xb, yb, 1))

plot_models(x, y, [fa, fb], os.path.join("..", "1400_01_05.png"))

def error(f, x, y):
    """Return the sum of squared differences between ``f(x)`` and ``y``.

    Args:
        f: Callable model; it is invoked once as ``f(x)``.
        x: Input value(s) handed to the model.
        y: Observed value(s) the prediction is compared against.
    """
    return np.sum((f(x) - y) ** 2)

# The five polynomial models that were fitted on the complete data set.
full_data_models = (f1, f2, f3, f10, f100)

print("Errors for the complete data set:")
for f in full_data_models:
    squared_error = error(f, x, y)
    print("Error d=%i: %f" % (f.order, squared_error))

print("Errors for only the time after inflection point")
for f in full_data_models:
    squared_error = error(f, xb, yb)
    print("Error d=%i: %f" % (f.order, squared_error))

# Two-line model: each line is scored on its own part of the data.
inflection_error = error(fa, xa, ya) + error(fb, xb, yb)
print("Error inflection=%f" % inflection_error)

# extrapolating into the future
# The models are evaluated on 100 points spanning weeks 0 through 6.
plot_models(
    x, y, [f1, f2, f3, f10, f100], os.path.join("..", "1400_01_06.png"),
    mx=np.linspace(0 * 7 * 24, 6 * 7 * 24, 100),
    ymax=10000, xmin=0 * 7 * 24)

print("Trained only on data after inflection point")
# fb is already the degree-1 fit on the data after the inflection point.
fb1 = fb
fb2 = np.poly1d(np.polyfit(xb, yb, 2))
fb3 = np.poly1d(np.polyfit(xb, yb, 3))
fb10 = np.poly1d(np.polyfit(xb, yb, 10))
fb100 = np.poly1d(np.polyfit(xb, yb, 100))

print("Errors for only the time after inflection point")
for f in [fb1, fb2, fb3, fb10, fb100]:
    print("Error d=%i: %f" % (f.order, error(f, xb, yb)))

plot_models(
    x, y, [fb1, fb2, fb3, fb10, fb100], os.path.join("..", "1400_01_07.png"),
    mx=np.linspace(0 * 7 * 24, 6 * 7 * 24, 100),
    ymax=10000, xmin=0 * 7 * 24)

# separating training from testing data
# 30% of the post-inflection points are held out for testing.
frac = 0.3
split_idx = int(frac * len(xb))
# permutation(n) shuffles the indices 0..n-1; sorting each part afterwards
# keeps the selected points in their original order.
shuffled = np.random.permutation(len(xb))
test = sorted(shuffled[:split_idx])
train = sorted(shuffled[split_idx:])
fbt1 = np.poly1d(np.polyfit(xb[train], yb[train], 1))
fbt2 = np.poly1d(np.polyfit(xb[train], yb[train], 2))
fbt3 = np.poly1d(np.polyfit(xb[train], yb[train], 3))
fbt10 = np.poly1d(np.polyfit(xb[train], yb[train], 10))
fbt100 = np.poly1d(np.polyfit(xb[train], yb[train], 100))

print("Test errors for only the time after inflection point")
# The models are scored only on the held-out test points.
for f in [fbt1, fbt2, fbt3, fbt10, fbt100]:
    print("Error d=%i: %f" % (f.order, error(f, xb[test], yb[test])))

plot_models(
    x, y, [fbt1, fbt2, fbt3, fbt10, fbt100], os.path.join("..",
                                                          "1400_01_08.png"),
    mx=np.linspace(0 * 7 * 24, 6 * 7 * 24, 100),
    ymax=10000, xmin=0 * 7 * 24)

from scipy.optimize import fsolve

print(fbt2)
# Subtracting the target turns "where does the model reach 100000?" into
# a root-finding problem.
shifted_model = fbt2 - 100000
print(shifted_model)
# Search from the starting guess x = 800, then divide by 7 * 24 to express
# the root in weeks.
reached_max = fsolve(shifted_model, 800) / (7 * 24)
print("100,000 hits/hour expected at week %f" % reached_max[0])




5)、训练与测试

6)、验证
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: