一、机器学习系统设计笔记之python机器学习入门
2015-10-10 11:24
711 查看
一,学习Numpy
因为numpy.array会遮挡python自带的数组模块,应该使用以下方式:
将这个数组转换到二维矩阵:
numpy在所有可能之处都避免复制操作,如:
如果需要真正的副本,应该这样:
numpy对数组的操作传递到每个元素上:
1、索引:
除了正常的列表索引,numpy还允许将数组当做索引使用
鉴于经常修剪异常值,可以用专门的函数处理:
2、处理不存在的值:
numpy.NAN标记的表示它不是真实的数值
3、运行时比较
结果如下:
Normal Python :0.101323 sec
Naive Numpy:0.083080 sec
Good Numpy:0.001787 sec
二、实例应用
1、读取数据
使用scipy的genfromtxt()很容易读取数据
1)、获取x、y方向坐标
2)、清除无效数据
查看无效值多少
3)、画出数据
3、选择正确的模型和算法
1)、近似误差
2)、一阶模型
3)、二阶模型
4)、组合模型
5)、训练与测试
6)、验证
因为numpy.array会遮挡python自带的数组模块,应该使用以下方式:
>>> import numpy
>>> numpy.version.full_version
'1.9.3'
>>> import numpy as np
>>> a = np.array([0,1,2,3,4,5])
>>> a
array([0, 1, 2, 3, 4, 5])
>>> a.ndim
1
>>> a.shape
(6,)
将这个数组转换到二维矩阵:
>>> b = a.reshape((3,2))
>>> b
array([[0, 1],
       [2, 3],
       [4, 5]])
>>> b.ndim
2
>>> b.shape
(3, 2)
numpy在所有可能之处都避免复制操作,如:
>>> b[1][0]=100
>>> b
array([[  0,   1],
       [100,   3],
       [  4,   5]])
>>> a
array([  0,   1, 100,   3,   4,   5])
如果需要真正的副本,应该这样:
>>> c =a.reshape((3,2)).copy()
>>> c
array([[  0,   1],
       [100,   3],
       [  4,   5]])
>>> c[0][0]=100
>>> c
array([[100,   1],
       [100,   3],
       [  4,   5]])
>>> a
array([  0,   1, 100,   3,   4,   5])
>>> b
array([[  0,   1],
       [100,   3],
       [  4,   5]])
numpy对数组的操作传递到每个元素上:
>>> a*2
array([  0,   2, 200,   6,   8,  10])
>>> a**2
array([    0,     1, 10000,     9,    16,    25])
1、索引:
除了正常的列表索引,numpy还允许将数组当做索引使用
>>> a[np.array([2,3,4])]
array([100,   3,   4])
>>> a>4
array([False, False,  True, False, False,  True], dtype=bool)
>>> a[a>4]=4
>>> a
array([0, 1, 4, 3, 4, 4])
鉴于经常修剪异常值,可以用专门的函数处理:
>>> a
array([0, 1, 4, 3, 4, 4])
>>> a.clip(0,3)
array([0, 1, 3, 3, 3, 3])
2、处理不存在的值:
numpy.NAN标记的表示它不是真实的数值
>>> c=np.array([1,2,np.NAN,3,4])
>>> c
array([  1.,   2.,  nan,   3.,   4.])
>>> np.isnan(c)
array([False, False,  True, False, False], dtype=bool)
>>> c[~np.isnan(c)]
array([ 1.,  2.,  3.,  4.])
>>> np.mean(c[~np.isnan(c)])
2.5
3、运行时比较
# Benchmark three ways of computing the sum of squares of 0..999.
# Each statement is executed 1000 times and the total wall time is reported.
import timeit

# Pure Python: a generator expression fed to the builtin sum().
normal_py_sec = timeit.timeit(
    'sum(x*x for x in range(1000))',
    number=1000)

# NumPy does the multiplication, but the builtin sum() still walks the
# resulting array one element at a time.
naive_np_sec = timeit.timeit(
    'sum(na*na)',
    setup="import numpy as np;na=np.arange(1000)",
    number=1000)

# The whole computation is a single NumPy call (dot product with itself).
good_np_sec = timeit.timeit(
    'na.dot(na)',
    setup="import numpy as np;na=np.arange(1000)",
    number=1000)

print("Normal Python :%f sec" % normal_py_sec)
print("Naive Numpy:%f sec" % naive_np_sec)
print("Good Numpy:%f sec" % good_np_sec)
结果如下:
Normal Python :0.101323 sec
Naive Numpy:0.083080 sec
Good Numpy:0.001787 sec
二、实例应用
1、读取数据
使用scipy的genfromtxt()很容易读取数据
import scipy as sp
data=sp.genfromtxt("web_traffic.tsv",delimiter="\t")
2、预处理和数据清洗
1)、获取x、y方向坐标
# Split the loaded table into its two columns (one statement per line;
# the two assignments cannot share a line without a separator).
x = data[:, 0]  # first column
y = data[:, 1]  # second column
2)、清除无效数据
查看无效值多少
# Count the invalid samples: isnan() yields True for every NaN entry of y,
# and summing that boolean mask gives the number of such entries.
sp.sum(sp.isnan(y))
3)、画出数据
def showData():
    """Load the web-traffic table, drop invalid rows and show a scatter plot.

    Reads the tab-separated file ``web_traffic.tsv`` from the current
    working directory, discards every row whose second column is NaN,
    displays the remaining points with matplotlib and finally prints the
    raw (unfiltered) table.
    """
    # Imported here so the function works on its own: this snippet has no
    # import header, and the NumPy aliases formerly exposed as ``scipy.*``
    # (genfromtxt, isnan, sum) are no longer available in current SciPy.
    import numpy as np
    import matplotlib.pyplot as plt

    data = np.genfromtxt("web_traffic.tsv", delimiter="\t")
    x = data[:, 0]
    y = data[:, 1]

    # Build the mask once, before y is overwritten, and apply it to both
    # columns so they stay aligned.
    valid = ~np.isnan(y)
    x = x[valid]
    y = y[valid]

    plt.scatter(x, y)
    plt.title("web traffic over the last month")
    plt.xlabel("Time")
    plt.ylabel("Hits/hour")
    # One tick every 7 * 24 x-units, labelled as weeks
    # (presumably x is measured in hours -- confirm against the data file).
    plt.xticks([w * 7 * 24 for w in range(10)],
               ['week %i' % w for w in range(10)])
    plt.autoscale(tight=True)
    plt.grid()
    plt.show()
    print(data)
3、选择正确的模型和算法
1)、近似误差
def error(f,x,y):
    """Return the sum of squared differences between ``f(x)`` and ``y``.

    f: callable applied to ``x`` (for example a fitted ``poly1d`` model).
    x: input values handed to ``f``.
    y: reference values compared against ``f(x)``.
    """
    # numpy.sum replaces the ``scipy.sum`` alias, which was only a
    # re-export of the NumPy function and is gone from current SciPy.
    import numpy as np
    return np.sum((f(x) - y) ** 2)
2)、一阶模型
import scipy as sp
import numpy as np
import matplotlib.pyplot as plt


def showData():
    """Fit a straight line to the web-traffic data and plot it over the points.

    Reads ``web_traffic.tsv`` from the current working directory, drops
    the rows whose second column is NaN, fits a degree-1 polynomial,
    prints its coefficients, shows the fitted line together with the
    scatter plot and finally prints the raw (unfiltered) table.
    """
    # NumPy functions are called through ``np``: the identically named
    # ``scipy.*`` aliases are no longer available in current SciPy.
    data = np.genfromtxt("web_traffic.tsv", delimiter="\t")
    x = data[:, 0]
    y = data[:, 1]

    # Build the mask once, before y is overwritten, so both columns are
    # filtered with the same rows.
    valid = ~np.isnan(y)
    x = x[valid]
    y = y[valid]

    # Degree-1 least-squares fit; only the coefficients are needed here.
    fp1 = np.polyfit(x, y, 1)
    print("Model parameters %s" % fp1)
    f1 = np.poly1d(fp1)

    # Evaluate the model on 1000 evenly spaced points from 0 to the last x.
    fx = np.linspace(0, x[-1], 1000)
    plt.plot(fx, f1(fx), linewidth=4)
    plt.legend(["d=%i" % f1.order], loc="upper left")

    plt.scatter(x, y)
    plt.title("web traffic over the last month")
    plt.xlabel("Time")
    plt.ylabel("Hits/hour")
    plt.xticks([w * 7 * 24 for w in range(10)],
               ['week %i' % w for w in range(10)])
    plt.autoscale(tight=True)
    plt.grid()
    plt.show()
    print(data)
3)、二阶模型
import scipy as sp
import numpy as np
import matplotlib.pyplot as plt


def showData():
    """Fit degree-1 and degree-2 polynomials to the web-traffic data and plot both.

    Reads ``web_traffic.tsv`` from the current working directory, drops
    the rows whose second column is NaN, fits both models, prints the
    degree-1 coefficients and the order of the degree-2 model, shows both
    curves over the scatter plot and finally prints the raw table.
    """
    # NumPy functions are called through ``np``: the identically named
    # ``scipy.*`` aliases are no longer available in current SciPy.
    data = np.genfromtxt("web_traffic.tsv", delimiter="\t")
    x = data[:, 0]
    y = data[:, 1]

    # Build the mask once, before y is overwritten, so both columns are
    # filtered with the same rows.
    valid = ~np.isnan(y)
    x = x[valid]
    y = y[valid]

    fp1 = np.polyfit(x, y, 1)
    print("Model parameters %s" % fp1)
    f1 = np.poly1d(fp1)

    # Both models are evaluated on the same 1000 evenly spaced points.
    fx = np.linspace(0, x[-1], 1000)
    plt.plot(fx, f1(fx), linewidth=4)

    f2p = np.polyfit(x, y, 2)
    f2 = np.poly1d(f2p)
    plt.plot(fx, f2(fx), linewidth=4)
    print(f2.order)

    # A single legend call with one label per plotted curve, in plotting
    # order. Passing a bare string here would be iterated character by
    # character ("d", "=", "2") and would also replace an earlier legend.
    plt.legend(["d=%i" % f.order for f in (f1, f2)], loc="upper left")

    plt.scatter(x, y)
    plt.title("web traffic over the last month")
    plt.xlabel("Time")
    plt.ylabel("Hits/hour")
    plt.xticks([w * 7 * 24 for w in range(10)],
               ['week %i' % w for w in range(10)])
    plt.autoscale(tight=True)
    plt.grid()
    plt.show()
    print(data)
4)、组合模型
# -*- coding: utf-8 -*-
# Fits polynomial models of several degrees to the web-traffic data,
# saves one PNG per experiment and estimates when 100,000 hits/hour
# would be reached.
import os

import matplotlib.pyplot as plt
import numpy as np
import scipy as sp
from scipy.optimize import fsolve

# NumPy functions are called through ``np`` below: the identically named
# ``scipy.*`` aliases are no longer available in current SciPy.
data_dir = os.path.join(
    os.path.dirname(os.path.realpath(__file__)), ".", "data")
data = np.genfromtxt(os.path.join(data_dir, "web_traffic.tsv"), delimiter="\t")
print(data[:10])

# Line colors and styles, paired with the models in order by plot_models();
# zip() stops at the shortest sequence, so at most five models are drawn.
colors = ['g', 'k', 'b', 'm', 'r']
linestyles = ['-', '-.', '--', ':', '-']

x = data[:, 0]
y = data[:, 1]
print("Number of invalid entries:", np.sum(np.isnan(y)))
# Build the mask once, before y is overwritten, so both columns are
# filtered with the same rows.
valid = ~np.isnan(y)
x = x[valid]
y = y[valid]


def plot_models(x, y, models, fname, mx=None, ymax=None, xmin=None):
    """Scatter-plot the data, overlay the given models and save the figure.

    x, y:   data points to scatter.
    models: iterable of callables exposing ``.order`` (e.g. ``poly1d``),
            or None/empty to plot the data only.
    fname:  path of the image file to write.
    mx:     x values at which the models are evaluated; defaults to 1000
            evenly spaced points from 0 to ``x[-1]``.
    ymax:   optional upper y-axis limit.
    xmin:   optional lower x-axis limit.
    """
    plt.clf()
    plt.scatter(x, y, s=10)
    plt.title("Web traffic over the last month")
    plt.xlabel("Time")
    plt.ylabel("Hits/hour")
    plt.xticks(
        [w * 7 * 24 for w in range(10)],
        ['week %i' % w for w in range(10)])

    if models:
        if mx is None:
            mx = np.linspace(0, x[-1], 1000)
        for model, style, color in zip(models, linestyles, colors):
            plt.plot(mx, model(mx), linestyle=style, linewidth=2, c=color)
        plt.legend(["d=%i" % m.order for m in models], loc="upper left")

    plt.autoscale(tight=True)
    # bottom/top/left are the supported keyword names; the ymin/ymax/xmin
    # aliases were removed from Matplotlib.
    plt.ylim(bottom=0)
    # Compare against None so that an explicit limit of 0 is honoured.
    if ymax is not None:
        plt.ylim(top=ymax)
    if xmin is not None:
        plt.xlim(left=xmin)
    plt.grid(True, linestyle='-', color='0.75')
    plt.savefig(fname)


def error(f, x, y):
    """Return the sum of squared differences between ``f(x)`` and ``y``."""
    return np.sum((f(x) - y) ** 2)


# first look at the data
plot_models(x, y, None, os.path.join("..", "1400_01_01.png"))

# create and plot models
fp1, res, rank, sv, rcond = np.polyfit(x, y, 1, full=True)
print("Model parameters: %s" % fp1)
print("Error of the model:", res)
f1 = np.poly1d(fp1)
f2 = np.poly1d(np.polyfit(x, y, 2))
f3 = np.poly1d(np.polyfit(x, y, 3))
f10 = np.poly1d(np.polyfit(x, y, 10))
f100 = np.poly1d(np.polyfit(x, y, 100))

plot_models(x, y, [f1], os.path.join("..", "1400_01_02.png"))
plot_models(x, y, [f1, f2], os.path.join("..", "1400_01_03.png"))
plot_models(
    x, y, [f1, f2, f3, f10, f100], os.path.join("..", "1400_01_04.png"))

# fit and plot a model using the knowledge about inflection point
# (slice indices must be integers, hence the int() around the product)
inflection = int(3.5 * 7 * 24)
xa = x[:inflection]
ya = y[:inflection]
xb = x[inflection:]
yb = y[inflection:]

fa = np.poly1d(np.polyfit(xa, ya, 1))
fb = np.poly1d(np.polyfit(xb, yb, 1))

plot_models(x, y, [fa, fb], os.path.join("..", "1400_01_05.png"))

print("Errors for the complete data set:")
for f in [f1, f2, f3, f10, f100]:
    print("Error d=%i: %f" % (f.order, error(f, x, y)))

print("Errors for only the time after inflection point")
for f in [f1, f2, f3, f10, f100]:
    print("Error d=%i: %f" % (f.order, error(f, xb, yb)))

print("Error inflection=%f" % (error(fa, xa, ya) + error(fb, xb, yb)))

# extrapolating into the future
plot_models(
    x, y, [f1, f2, f3, f10, f100],
    os.path.join("..", "1400_01_06.png"),
    mx=np.linspace(0 * 7 * 24, 6 * 7 * 24, 100),
    ymax=10000, xmin=0 * 7 * 24)

print("Trained only on data after inflection point")
fb1 = fb
fb2 = np.poly1d(np.polyfit(xb, yb, 2))
fb3 = np.poly1d(np.polyfit(xb, yb, 3))
fb10 = np.poly1d(np.polyfit(xb, yb, 10))
fb100 = np.poly1d(np.polyfit(xb, yb, 100))

print("Errors for only the time after inflection point")
for f in [fb1, fb2, fb3, fb10, fb100]:
    print("Error d=%i: %f" % (f.order, error(f, xb, yb)))

plot_models(
    x, y, [fb1, fb2, fb3, fb10, fb100],
    os.path.join("..", "1400_01_07.png"),
    mx=np.linspace(0 * 7 * 24, 6 * 7 * 24, 100),
    ymax=10000, xmin=0 * 7 * 24)

# separating training from testing data
frac = 0.3
split_idx = int(frac * len(xb))
# Random ordering of the indices 0..len(xb)-1; the first ``frac`` share
# becomes the test set, the rest the training set.
shuffled = np.random.permutation(len(xb))
test = sorted(shuffled[:split_idx])
train = sorted(shuffled[split_idx:])
fbt1 = np.poly1d(np.polyfit(xb[train], yb[train], 1))
fbt2 = np.poly1d(np.polyfit(xb[train], yb[train], 2))
fbt3 = np.poly1d(np.polyfit(xb[train], yb[train], 3))
fbt10 = np.poly1d(np.polyfit(xb[train], yb[train], 10))
fbt100 = np.poly1d(np.polyfit(xb[train], yb[train], 100))

print("Test errors for only the time after inflection point")
for f in [fbt1, fbt2, fbt3, fbt10, fbt100]:
    print("Error d=%i: %f" % (f.order, error(f, xb[test], yb[test])))

plot_models(
    x, y, [fbt1, fbt2, fbt3, fbt10, fbt100],
    os.path.join("..", "1400_01_08.png"),
    mx=np.linspace(0 * 7 * 24, 6 * 7 * 24, 100),
    ymax=10000, xmin=0 * 7 * 24)

# Solve fbt2(x) = 100000 starting the search at x = 800, then convert the
# root to weeks by dividing by 7 * 24.
print(fbt2)
print(fbt2 - 100000)
reached_max = fsolve(fbt2 - 100000, 800) / (7 * 24)
print("100,000 hits/hour expected at week %f" % reached_max[0])
5)、训练与测试
6)、验证
相关文章推荐
- Python基本语法
- 【转】 Python subprocess模块学习总结
- Python待完善
- Python资料收集
- python得爬虫关键词
- spark机器学习中安装ipython步骤
- python常用的自省函数
- 机器学习算法的本质(Python和R准则)
- python继承中重载问题:私有函数不能被子类重写
- 【转】关于python文件操作
- Python的13大图形库
- Python调用jieba分词中的中文编码问题
- LeetCode----Combination Sum III
- Python thread
- 关于python的字符串逆序输出
- python----设置默认编码
- Python正则表达式操作指南
- python 面向对象编程案例01
- KNN算法 手写识别 python
- 写一个段落python代码推理list深浅