Python数据分析实战 | 数据挖掘入门
2017-12-15 09:24
746 查看
一般概念
平台:Windows、Linux;科学计算工具:Anaconda
Numpy-数据结构基础
关键词:开源、数据计算;拓展功能:ndarray 多维操作、线性代数
#encoding=UTF-8
"""NumPy basics demo: ndarray creation, random sampling, ufuncs, linear algebra."""
import numpy as np
from numpy.linalg import inv, solve  # explicit imports instead of star import


def startMain():
    """Walk through core NumPy features, printing each result.

    Returns None; all output goes to stdout.
    """
    # 1 -- ndarray basics
    lst = [[1, 3, 5], [2, 4, 6]]
    print(type(lst))
    np_lst = np.array(lst)
    print(type(np_lst))
    # `np.float` was deprecated in NumPy 1.20 and removed in 1.24;
    # the builtin `float` is the correct spelling (maps to float64).
    np_lst = np.array(lst, dtype=float)
    print(np_lst.shape)
    print(np_lst.ndim)
    print(np_lst.dtype)
    print(np_lst.itemsize)
    print(np_lst.size)
    # 2 -- common constructors and random sampling
    print(np.zeros([2, 4]))
    print(np.ones([3, 5]))
    print("rand:")
    print(np.random.rand(2, 4))
    print(np.random.rand())
    print("randint:")  # random integers in [1, 10)
    print(np.random.randint(1, 10, 3))
    print("randn:")  # standard normal distribution
    print(np.random.randn(2, 4))
    print("choice:")
    print(np.random.choice([10, 20, 30]))
    # 3 -- array operations / ufuncs
    print(np.arange(1, 11).reshape(2, 5))
    print("Exp:")
    print(np.exp(lst))
    print("Exp2:")
    print(np.exp2(lst))
    print("sqrt:")
    print(np.sqrt(lst))
    print("sin:")
    print(np.sin(lst))
    print("log:")
    print(np.log(lst))
    # 4 -- linear algebra
    print(np.eye(3))
    print(np.log(lst))
    lst = np.array([[1, 2], [3, 4]])
    print("inv")
    print(inv(lst))  # also available: transpose, det, eig
    y = np.array([[5.], [7.]])
    print("solve:")
    print(solve(lst, y))  # solve lst @ x = y


if __name__ == '__main__':
    startMain()
Matplotlib-丰富的可视化套件
关键词:绘图库。Scipy-强大的科学计算方法(矩阵分析、数理分析、信号分析)
# 关键词:数值计算库
#encoding=utf-8
"""SciPy tour: numerical integration, optimization, interpolation, linear algebra."""
import numpy as np
import matplotlib.pyplot as plt  # explicit import instead of `from pylab import *`
from scipy.integrate import quad, dblquad, nquad


def main3():
    """Run SciPy demos for integration, optimization, interpolation and linalg.

    Side effects: prints results to stdout and opens a matplotlib window.
    """
    # 1 -- integration
    print(quad(lambda x: np.exp(-x), 0, np.inf))
    print(dblquad(lambda t, x: np.exp(-x * t) / t ** 3,
                  0, np.inf, lambda x: 1, lambda x: np.inf))

    def f(x, y):
        return x * y

    def bound_y():
        return [0, 0.5]

    def bound_x(y):
        # inner bound may depend on the outer variable
        return [0, 1 - 2 * y]

    print(nquad(f, [bound_x, bound_y]))

    # 2 -- optimization
    from scipy.optimize import minimize

    def rosen(x):
        # Rosenbrock function: classic non-convex test problem
        return sum(100.0 * (x[1:] - x[:-1] ** 2.0) ** 2.0 + (1 - x[:-1]) ** 2.0)

    x0 = np.array([1.3, 0.7, 0.8, 1.9, 1.2])
    # Nelder-Mead's absolute-x tolerance option is spelled `xatol`;
    # the old `xtol` spelling is deprecated in modern SciPy.
    res = minimize(rosen, x0, method="nelder-mead",
                   options={"xatol": 1e-8, "disp": True})
    print("ROSE MINI:", res.x)

    def func(x):
        return 2 * x[0] * x[1] + 2 * x[0] - x[0] ** 2 - 2 * x[1] ** 2

    def func_deriv(x):
        # analytic gradient of func
        dfdx0 = -2 * x[0] + 2 * x[1] + 2
        dfdx1 = 2 * x[0] - 4 * x[1]
        return np.array([dfdx0, dfdx1])

    cons = ({"type": "eq",
             "fun": lambda x: np.array([x[0] ** 3 - x[1]]),
             "jac": lambda x: np.array([3.0 * (x[0] ** 2.0), -1.0])},
            {"type": "ineq",
             "fun": lambda x: np.array([x[1] - 1]),
             "jac": lambda x: np.array([0.0, 1.0])})
    res = minimize(func, [-1.0, 1.0], jac=func_deriv,
                   constraints=cons, method="SLSQP", options={"disp": True})
    print("RESTRICT:", res)

    from scipy.optimize import root

    def fun(x):
        return x + 2 * np.cos(x)

    sol = root(fun, 0.1)
    print("ROOT:", sol.x, sol.fun)

    # 3 -- interpolation
    x = np.linspace(0, 1, 10)
    y = np.sin(2 * np.pi * x)
    from scipy.interpolate import interp1d
    li = interp1d(x, y, kind="cubic")
    x_new = np.linspace(0, 1, 50)
    y_new = li(x_new)
    plt.figure()
    plt.plot(x, y, "r")
    plt.plot(x_new, y_new, "k")
    plt.show()
    print(y_new)

    # 4 -- linear algebra
    from scipy import linalg as lg
    arr = np.array([[1, 2], [3, 4]])
    print("Det:", lg.det(arr))
    print("Inv:", lg.inv(arr))
    b = np.array([6, 14])
    print("Sol:", lg.solve(arr, b))
    print("Eig:", lg.eig(arr))
    # matrix decompositions
    print("LU:", lg.lu(arr))
    print("QR:", lg.qr(arr))
    print("SVD:", lg.svd(arr))


if __name__ == "__main__":
    main3()
Pandas-基础数据分析套件
# 关键词:数据分析库
#encoding=utf-8
"""pandas tour: Series/DataFrame basics, selection, missing data, stats, reshape, I/O."""
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt  # explicit import instead of `from pylab import *`


def main3():
    """Run pandas demos, printing each intermediate result.

    Side effects: prints to stdout, shows a plot, reads/writes ./date/ files.
    """
    # data structures
    s = pd.Series([i * 2 for i in range(1, 11)])
    print(type(s))
    date = pd.date_range("20170525", periods=8)
    df = pd.DataFrame(np.random.randn(8, 5), index=date, columns=list("ABCDE"))
    print(df)
    # basics
    print(df.head(3))
    print(df.tail(3))
    print(df.index)
    print(df.values)
    print(df.T)
    # DataFrame.sort() was removed in pandas 0.20; sort_values is the replacement.
    print(df.sort_values(by="C"))
    print(df.sort_index(axis=1, ascending=False))
    print(df.describe())
    # selection
    print(type(df["A"]))
    print(df[:3])
    print(df["20170526":"20170528"])
    print(df.loc[date[0]])
    print(df.loc["20170525":"20170529", ["B", "D"]])
    print(df.at[date[0], "C"])
    print(df.iloc[1:3, 2:4])
    print(df.iloc[1, 4])
    print(df.iat[1, 4])
    # combined boolean filter; the original printed a stray `[df.A<0]` list by mistake
    print(df[(df.B > 0) & (df.A < 0)])
    print(df[df > 0])
    print(df[df["E"].isin([0, 1])])
    # setting values
    s1 = pd.Series(list(range(10, 18)), index=pd.date_range("20170525", periods=8))
    df["F"] = s1
    print(df)
    df.at[date[0], "A"] = 0
    print(df)
    df.iat[1, 1] = 1
    df.loc[:, "D"] = np.array([4] * len(df))
    print(df)
    df2 = df.copy()
    df2[df2 > 0] = -df2  # flip every positive entry's sign
    print(df2)
    # missing values
    df1 = df.reindex(index=date[:4], columns=list("ABCD") + ["G"])
    df1.loc[date[0]:date[1], "G"] = 1
    print(df1)
    print(df1.dropna())
    print(df1.fillna(value=2))
    # statistics
    print(df.mean())
    print(df.var())
    s = pd.Series([1, 2, 3, np.nan, 5, 6, 9, 10], index=date)
    print(s)
    print(s.shift(2))  # shift values down two positions
    print(s.diff())
    print(s.value_counts())
    print(df.apply(np.cumsum))  # cumulative sum down each column
    print(df.apply(lambda x: x.max() - x.min()))
    # concat / merge / group
    pieces = [df[:3], df[-3:]]
    print(pd.concat(pieces))
    left = pd.DataFrame({"key": ["x", "y"], "value": [1, 2]})
    right = pd.DataFrame({"key": ["x", "y"], "value": [3, 4]})
    print("LEFT:", left)
    print("RIGHT:", right)
    print(pd.merge(left, right, on="key", how="outer"))
    df3 = pd.DataFrame({"A": ["a", "b", "c", "b"], "B": list(range(4))})
    print(df3.groupby("A").sum())
    # reshape
    import datetime
    df4 = pd.DataFrame({'A': ['one', 'one', 'two', 'three'] * 6,
                        'B': ['a', 'b', 'c'] * 8,
                        'C': ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 4,
                        'D': np.random.randn(24),
                        'E': np.random.randn(24),
                        'F': [datetime.datetime(2017, i, 1) for i in range(1, 13)]
                           + [datetime.datetime(2017, i, 15) for i in range(1, 13)]})
    print(pd.pivot_table(df4, values="D", index=["A", "B"], columns=["C"]))
    # time series
    t_exam = pd.date_range("20170301", periods=10, freq="S")
    print(t_exam)
    # graph
    ts = pd.Series(np.random.randn(1000), index=pd.date_range("20170301", periods=1000))
    ts = ts.cumsum()
    ts.plot()
    plt.show()
    # file I/O
    # NOTE(review): the directory is spelled "date" — presumably "data"; confirm
    # against the actual project layout before renaming.
    df6 = pd.read_csv("./date/test.csv")
    print(df6)
    df7 = pd.read_excel("./date/test.xlsx", "Sheet1")
    print("Excel", df7)
    df6.to_csv("./date/test2.csv")
    df7.to_excel("./date/test2.xlsx")


if __name__ == "__main__":
    main3()
Scikit-learn-强大的数据分析建模库
关键词:数据挖掘建模、机器学习。机器学习:因子–>结果
结果:不打标记–>无监督学习(例如聚类)
打标记–>监督学习
有限离散–>分类
连续–>回归
决策树:监督学习 树形结构
原理:信息熵
#encoding=utf-8
"""scikit-learn demo: decision-tree classification on the iris dataset."""
import numpy as np
import pandas as pd


def main5():
    """Load iris, fit an entropy decision tree, evaluate it, export the tree graph.

    Side effects: prints metrics to stdout and writes ./date/tree.dot.
    """
    # pre-processing
    from sklearn.datasets import load_iris
    iris = load_iris()
    print(iris)
    print(len(iris["data"]))
    # The original fused the import and the assignment into one invalid
    # statement; also `sklearn.cross_validation` was removed in scikit-learn
    # 0.20 — train_test_split now lives in sklearn.model_selection.
    from sklearn.model_selection import train_test_split
    train_data, test_data, train_target, test_target = train_test_split(
        iris.data, iris.target, test_size=0.2, random_state=1)
    # model
    from sklearn import tree
    clf = tree.DecisionTreeClassifier(criterion="entropy")
    clf.fit(train_data, train_target)
    y_pred = clf.predict(test_data)
    # verify
    from sklearn import metrics
    print(metrics.accuracy_score(y_true=test_target, y_pred=y_pred))
    print(metrics.confusion_matrix(y_true=test_target, y_pred=y_pred))
    # export the fitted tree in Graphviz dot format
    with open("./date/tree.dot", "w") as fw:
        tree.export_graphviz(clf, out_file=fw)


if __name__ == "__main__":
    main5()
相关文章推荐
- python数据分析与挖掘实战 第七章 拓展思考
- python数据分析和挖掘实战
- python数据分析与挖掘实战 第六章 拓展思考
- python数据分析与挖掘学习笔记(6)-电商网站数据分析及商品自动推荐实战与关联规则算法
- python数据挖掘与分析实战 第5章 一处错误
- Python数据分析与实战挖掘
- Python数据分析与挖掘实战pdf
- python数据分析与挖掘实战-4
- Python数据分析与挖掘实战(数据预处理)
- python数据分析与挖掘学习笔记(7)-交通路标自动识别实战与神经网络算法
- 『Python数据分析与挖掘实战』第五章:挖掘建模
- 利用Python爬虫爬取淘宝商品做数据挖掘分析实战篇,超详细教程
- Python数据分析与挖掘实战代码纠错 代码3-3
- Python数据分析与挖掘实战—挖掘建模
- python数据分析与挖掘实战-第六章拓展偷漏税用户识别
- Python数据分析与挖掘实战(开发流程及常用库安装)
- python数据分析与挖掘项目实战记录
- python数据分析与挖掘学习笔记(6)-电商网站数据分析及商品自动推荐实战与关联规则算法
- python 数据分析与挖掘实战
- Python数据分析与挖掘实战学习笔记(二)