
Python Data Analysis in Practice | An Introduction to Data Mining


General Concepts

Platforms: Windows, Linux

Scientific computing toolkit: Anaconda

NumPy - The Foundation for Data Structures

Keywords: open source, numerical computing extension

Features: the ndarray, multi-dimensional array operations, linear algebra

# encoding=utf-8
import numpy as np
from numpy.linalg import inv, solve

def startMain():
    # 1 -- basic ndarray properties
    lst = [[1, 3, 5], [2, 4, 6]]
    print(type(lst))
    np_lst = np.array(lst)
    print(type(np_lst))
    np_lst = np.array(lst, dtype=float)  # np.float is deprecated; use the built-in float
    print(np_lst.shape)
    print(np_lst.ndim)
    print(np_lst.dtype)
    print(np_lst.itemsize)
    print(np_lst.size)
    # 2 -- common array constructors and random arrays
    print(np.zeros([2, 4]))
    print(np.ones([3, 5]))
    print("rand:")
    print(np.random.rand(2, 4))
    print(np.random.rand())
    print("randint:")  # random integers
    print(np.random.randint(1, 10, 3))
    print("randn:")  # standard normal distribution
    print(np.random.randn(2, 4))
    print("choice:")
    print(np.random.choice([10, 20, 30]))
    # 3 -- element-wise array operations
    print(np.arange(1, 11).reshape(2, 5))
    print("Exp:")
    print(np.exp(lst))
    print("Exp2:")
    print(np.exp2(lst))
    print("sqrt:")
    print(np.sqrt(lst))
    print("sin:")
    print(np.sin(lst))
    print("log:")
    print(np.log(lst))
    # 4 -- linear algebra
    print(np.eye(3))
    lst = np.array([[1, 2], [3, 4]])
    print("inv:")
    print(inv(lst))
    # related routines: transpose, det, eig
    y = np.array([[5.], [7.]])
    print("solve:")
    print(solve(lst, y))

if __name__ == '__main__':
    startMain()


Matplotlib - A Rich Visualization Suite

Keywords: plotting library
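The original notes give no Matplotlib code, so here is a minimal sketch (added for illustration, not part of the original post; the data, figure, and function name are made up, while the plt calls are the standard matplotlib.pyplot API):

# encoding=utf-8
import numpy as np
import matplotlib.pyplot as plt

def main_plot():
    # plot sin(x) and cos(x) over one period on a single figure
    x = np.linspace(0, 2 * np.pi, 100)
    plt.plot(x, np.sin(x), "r", label="sin")
    plt.plot(x, np.cos(x), "k", label="cos")
    plt.legend()
    plt.show()

if __name__ == "__main__":
    main_plot()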

SciPy - Powerful Scientific Computing Methods (matrix analysis, mathematical analysis, signal analysis)

Keywords: numerical computing library

# encoding=utf-8
import numpy as np
from pylab import figure, plot, show
from scipy.integrate import quad, dblquad, nquad

def main3():
    # 1 -- integration
    print(quad(lambda x: np.exp(-x), 0, np.inf))
    print(dblquad(lambda t, x: np.exp(-x * t) / t**3, 0, np.inf, lambda x: 1, lambda x: np.inf))

    def f(x, y):
        return x * y

    def bound_y():
        return [0, 0.5]

    def bound_x(y):
        return [0, 1 - 2 * y]

    print(nquad(f, [bound_x, bound_y]))

    # 2 -- optimization
    from scipy.optimize import minimize

    def rosen(x):
        return sum(100.0 * (x[1:] - x[:-1]**2.0)**2.0 + (1 - x[:-1])**2.0)

    x0 = np.array([1.3, 0.7, 0.8, 1.9, 1.2])
    # the Nelder-Mead tolerance option is now spelled "xatol" (older SciPy used "xtol")
    res = minimize(rosen, x0, method="nelder-mead", options={"xatol": 1e-8, "disp": True})
    print("ROSE MINI:", res.x)
    # constrained optimization (SLSQP): maximize 2*x0*x1 + 2*x0 - x0**2 - 2*x1**2
    # by minimizing its negative, subject to x0**3 - x1 == 0 and x1 - 1 >= 0
    def func(x):
        return -(2 * x[0] * x[1] + 2 * x[0] - x[0]**2 - 2 * x[1]**2)

    def func_deriv(x):
        dfdx0 = -(-2 * x[0] + 2 * x[1] + 2)
        dfdx1 = -(2 * x[0] - 4 * x[1])
        return np.array([dfdx0, dfdx1])

    cons = ({"type": "eq", "fun": lambda x: np.array([x[0]**3 - x[1]]),
             "jac": lambda x: np.array([3.0 * (x[0]**2.0), -1.0])},
            {"type": "ineq", "fun": lambda x: np.array([x[1] - 1]),
             "jac": lambda x: np.array([0.0, 1.0])})
    res = minimize(func, [-1.0, 1.0], jac=func_deriv, constraints=cons,
                   method="SLSQP", options={"disp": True})
    print("RESTRICT:", res)
    from scipy.optimize import root

    def fun(x):
        return x + 2 * np.cos(x)

    sol = root(fun, 0.1)
    print("ROOT:", sol.x, sol.fun)

    # 3 -- interpolation
    x = np.linspace(0, 1, 10)
    y = np.sin(2 * np.pi * x)
    from scipy.interpolate import interp1d
    li = interp1d(x, y, kind="cubic")
    x_new = np.linspace(0, 1, 50)
    y_new = li(x_new)
    figure()
    plot(x, y, "r")
    plot(x_new, y_new, "k")
    show()
    print(y_new)

    # 4 -- linear algebra
    from scipy import linalg as lg
    arr = np.array([[1, 2], [3, 4]])
    print("Det:", lg.det(arr))
    print("Inv:", lg.inv(arr))
    b = np.array([6, 14])
    print("Sol:", lg.solve(arr, b))
    print("Eig:", lg.eig(arr))

    # matrix decompositions
    print("LU:", lg.lu(arr))
    print("QR:", lg.qr(arr))
    print("SVD:", lg.svd(arr))

if __name__ == "__main__":
    main3()


Pandas - A Fundamental Data Analysis Toolkit

Keywords: data analysis library

# encoding=utf-8
import numpy as np
import pandas as pd
from pylab import show

def main3():
    # data structures
    s = pd.Series([i * 2 for i in range(1, 11)])
    print(type(s))
    date = pd.date_range("20170525", periods=8)
    df = pd.DataFrame(np.random.randn(8, 5), index=date, columns=list("ABCDE"))
    print(df)

    # basics
    print(df.head(3))
    print(df.tail(3))
    print(df.index)
    print(df.values)
    print(df.T)
    print(df.sort_values(by="C"))  # DataFrame.sort() was removed; use sort_values()
    print(df.sort_index(axis=1, ascending=False))
    print(df.describe())

    # selection
    print(type(df["A"]))
    print(df[:3])
    print(df["20170526":"20170528"])
    print(df.loc[date[0]])
    print(df.loc["20170525":"20170529", ["B", "D"]])
    print(df.at[date[0], "C"])
    print(df.iloc[1:3, 2:4])
    print(df.iloc[1, 4])
    print(df.iat[1, 4])
    print(df[(df.B > 0) & (df.A < 0)])  # combine boolean conditions with &
    print(df[df > 0])
    print(df[df["E"].isin([0, 1])])

    # setting values
    s1 = pd.Series(list(range(10, 18)), index=pd.date_range("20170525", periods=8))
    df["F"] = s1
    print(df)
    df.at[date[0], "A"] = 0
    print(df)
    df.iat[1, 1] = 1
    df.loc[:, "D"] = np.array([4] * len(df))
    print(df)
    df2 = df.copy()
    df2[df2 > 0] = -df2
    print(df2)

    # missing values
    df1 = df.reindex(index=date[:4], columns=list("ABCD") + ["G"])
    df1.loc[date[0]:date[1], "G"] = 1
    print(df1)
    print(df1.dropna())
    print(df1.fillna(value=2))

    # statistics
    print(df.mean())
    print(df.var())
    s = pd.Series([1, 2, 3, np.nan, 5, 6, 9, 10], index=date)
    print(s)
    print(s.shift(2))  # shift the values down by two positions
    print(s.diff())
    print(s.value_counts())
    print(df.apply(np.cumsum))  # cumulative sum down each column
    print(df.apply(lambda x: x.max() - x.min()))

    # concatenation, merging and grouping
    pieces = [df[:3], df[-3:]]
    print(pd.concat(pieces))
    left = pd.DataFrame({"key": ["x", "y"], "value": [1, 2]})
    right = pd.DataFrame({"key": ["x", "y"], "value": [3, 4]})
    print("LEFT:", left)
    print("RIGHT:", right)
    print(pd.merge(left, right, on="key", how="outer"))
    df3 = pd.DataFrame({"A": ["a", "b", "c", "b"], "B": list(range(4))})
    print(df3.groupby("A").sum())

    # reshaping with a pivot table
    import datetime
    df4 = pd.DataFrame({"A": ["one", "one", "two", "three"] * 6,
                        "B": ["a", "b", "c"] * 8,
                        "C": ["foo", "foo", "foo", "bar", "bar", "bar"] * 4,
                        "D": np.random.randn(24),
                        "E": np.random.randn(24),
                        "F": [datetime.datetime(2017, i, 1) for i in range(1, 13)] +
                             [datetime.datetime(2017, i, 15) for i in range(1, 13)]})
    print(pd.pivot_table(df4, values="D", index=["A", "B"], columns=["C"]))
    # time series
    t_exam = pd.date_range("20170301", periods=10, freq="S")
    print(t_exam)

    # graph
    ts = pd.Series(np.random.randn(1000), index=pd.date_range("20170301", periods=1000))
    ts = ts.cumsum()
    ts.plot()
    show()

    # file I/O (assumes ./date/test.csv and ./date/test.xlsx already exist)
    df6 = pd.read_csv("./date/test.csv")
    print(df6)
    df7 = pd.read_excel("./date/test.xlsx", "Sheet1")
    print("Excel:", df7)
    df6.to_csv("./date/test2.csv")
    df7.to_excel("./date/test2.xlsx")

if __name__ == "__main__":
    main3()


Scikit-learn - A Powerful Library for Data Analysis and Modeling

Keywords: data-mining modeling, machine learning

Machine learning: features (factors) -> outcomes

Outcomes without labels -> unsupervised learning (e.g. clustering)

Outcomes with labels -> supervised learning

Labels that are finite and discrete -> classification

Labels that are continuous -> regression

Decision tree: supervised learning with a tree-shaped structure

Principle: information entropy (illustrated by the sketch below)
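As a quick worked example of the entropy idea (a sketch added here, not part of the original notes): the information entropy of a label set is H = -sum(p_i * log2(p_i)), and a tree grown with criterion="entropy" prefers splits that lower it.

# encoding=utf-8
import numpy as np

def entropy(labels):
    # H = -sum(p_i * log2(p_i)) over the frequencies of the distinct labels
    _, counts = np.unique(labels, return_counts=True)
    p = counts / counts.sum()
    return -np.sum(p * np.log2(p))

if __name__ == "__main__":
    print(entropy([0, 0, 1, 1]))  # 1.0 bit: a maximally mixed two-class node
    print(entropy([0, 0, 0, 1]))  # ~0.811 bits: a purer node has lower entropy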

# encoding=utf-8
import numpy as np
import pandas as pd

def main5():

    # Pre-processing: load the iris dataset
    from sklearn.datasets import load_iris
    iris = load_iris()
    print(iris)
    print(len(iris["data"]))

    # sklearn.cross_validation has been removed; train_test_split lives in sklearn.model_selection
    from sklearn.model_selection import train_test_split
    train_data, test_data, train_target, test_target = train_test_split(
        iris.data, iris.target, test_size=0.2, random_state=1)

    # Model: a decision tree split on information entropy
    from sklearn import tree
    clf = tree.DecisionTreeClassifier(criterion="entropy")
    clf.fit(train_data, train_target)
    y_pred = clf.predict(test_data)

    # Verify: accuracy and confusion matrix on the held-out test set
    from sklearn import metrics
    print(metrics.accuracy_score(y_true=test_target, y_pred=y_pred))
    print(metrics.confusion_matrix(y_true=test_target, y_pred=y_pred))

    # export the tree in Graphviz .dot format
    with open("./date/tree.dot", "w") as fw:
        tree.export_graphviz(clf, out_file=fw)

if __name__ == "__main__":
    main5()