Python 对站点类型聚类
2018-03-19 11:59
99 查看
有 10 个 Excel 文件,记录了每个站点各个时段的进站、出站人数。对每个站点统计以下四个变量:上下班时间段进站日均人数、上下班时间段出站日均人数、非上下班时间段进站日均人数、非上下班时间段出站日均人数。然后使用这 4 个变量做 K-means 聚类。
最后的聚类结果:
[0 2 0 2 2 1 2 2 1 2 1 0 0 2 2 1 1 2 2 0 0 0 2 1 1 2 0 2 2 2 2 2 2 2 0 2 0
0 0 1 2 2 1 2 1 2 2 0 0 0 2 2 2 1 2 2 0 0 2 0 0 0 0 0 0 0 0 0 0 1 0 0 0 2
2 2 0 0 0 0 0 0 0 0 2 0 2 0 0 0 2 0 2 0 2 0 0 2 2 1 0 0 0 2 1 2 0 2 2 2 0
2 2 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 2
0 0 0 0 0 0 0 0 2]
# coding=utf-8
"""Cluster stations by their peak / off-peak entry and exit passenger counts.

Ten Excel workbooks are read, each is reduced to four per-station features
(off-peak in, off-peak out, peak in, peak out), and the combined feature
table is clustered with K-means (k=3). The resulting labels are printed.
"""
import datetime as dt
import random
import warnings
from datetime import date

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from numpy import nan as NA
from pandas import Series, DataFrame
from sklearn.cluster import KMeans
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor

warnings.filterwarnings("ignore")

# Hourly columns summed into the off-peak (non-commute) total.
# NOTE(review): '6:00-7:00' appears in neither list, so that hour is not
# counted at all. This matches the original script; confirm whether it
# should be added to the off-peak columns.
OFF_PEAK_COLUMNS = [
    '4:00-5:00',
    '5:00-6:00',
    '10:00-11:00',
    '11:00-12:00',
    '12:00-13:00',
    '13:00-14:00',
    '14:00-15:00',
    '15:00-16:00',
    '19:00-20:00',
    '20:00-21:00',
    '21:00-22:00',
    '22:00-23:00',
    '23:00-24:00',
    '24:00-1:00',
    '1:00-2:00',
    '2:00-3:00',
    '3:00-4:00',
]

# Hourly columns summed into the peak (commute) total.
PEAK_COLUMNS = [
    '7:00-8:00',
    '8:00-9:00',
    '9:00-10:00',
    '16:00-17:00',
    '17:00-18:00',
    '18:00-19:00',
]


def GetExcelData(Path, IDFlag):
    """Read one workbook and return the four features for each station.

    The last three rows of the sheet (totals / statistics) are discarded.
    The remaining rows are consumed in pairs: row ``2*i`` and row ``2*i + 1``
    together describe station ``i``. According to the original comments the
    first row of a pair holds entries ("in") and the second holds exits
    ("out") -- presumably the sheet alternates them; verify against the
    workbooks. A trailing unpaired row, if any, is ignored.

    Args:
        Path: Path of the Excel file, passed to ``pandas.read_excel``.
        IDFlag: String prefix for the generated station IDs; station ``i``
            (0-based) gets the ID ``IDFlag + str(i + 1)``.

    Returns:
        A DataFrame with one row per station and the columns
        ``ID``, ``V1`` (off-peak in), ``V2`` (off-peak out),
        ``V3`` (peak in) and ``V4`` (peak out).

    Raises:
        KeyError: If one of the expected hourly columns is missing.
    """
    data = pd.read_excel(Path)
    # Drop the 3 summary rows; clamp at 0 so a sheet with fewer than 3 rows
    # yields an empty frame instead of a surprising negative slice.
    data = data.iloc[:max(len(data) - 3, 0)]

    # skipna=False keeps the NaN propagation of the original `a + b + ...`.
    off_peak = data[OFF_PEAK_COLUMNS].sum(axis=1, skipna=False)
    peak = data[PEAK_COLUMNS].sum(axis=1, skipna=False)

    station_count = len(data) // 2
    in_rows = slice(0, 2 * station_count, 2)
    out_rows = slice(1, 2 * station_count, 2)

    # Build the frame in one step instead of filling it cell by cell through
    # chained assignment, which is unreliable under pandas copy-on-write.
    return pd.DataFrame({
        'ID': [IDFlag + str(i + 1) for i in range(station_count)],
        'V1': off_peak.iloc[in_rows].values,
        'V2': off_peak.iloc[out_rows].values,
        'V3': peak.iloc[in_rows].values,
        'V4': peak.iloc[out_rows].values,
    })


data1 = GetExcelData(u'2015xxxx/1.xls', 'a')
print(data1.shape)
print(data1)
data2 = GetExcelData(u'2015xxxx/2.xls', 'b')
data3 = GetExcelData(u'2015xxxx/3.xls', 'c')
data4 = GetExcelData(u'2015xxxx/4.xls', 'd')
data5 = GetExcelData(u'2015xxxx/5.xls', 'e')
data6 = GetExcelData(u'2015xxxx/6.xls', 'f')
data7 = GetExcelData(u'2015xxxx/7.xls', 'g')
# Was 'j', which collided with the IDs generated for workbook 10.
data8 = GetExcelData(u'2015xxxx/8.xls', 'h')
data9 = GetExcelData(u'2015xxxx/9.xls', 'i')
data10 = GetExcelData(u'2015xxxx/10.xls', 'j')

print(data1.shape)

# Stack the ten per-workbook tables and renumber the index.
# DataFrame.append was removed in pandas 2.0, so pd.concat is used instead.
data = pd.concat(
    [data1, data2, data3, data4, data5, data6, data7, data8, data9, data10],
    ignore_index=True,
)

X = data[['V1', 'V2', 'V3', 'V4']].values
# Fixed seed so the cluster numbering is reproducible between runs.
estimator = KMeans(n_clusters=3, random_state=0)
estimator.fit(X)
label_pred = estimator.labels_  # cluster label of each station, in row order
print(label_pred)
最后的聚类结果:
[0 2 0 2 2 1 2 2 1 2 1 0 0 2 2 1 1 2 2 0 0 0 2 1 1 2 0 2 2 2 2 2 2 2 0 2 0
0 0 1 2 2 1 2 1 2 2 0 0 0 2 2 2 1 2 2 0 0 2 0 0 0 0 0 0 0 0 0 0 1 0 0 0 2
2 2 0 0 0 0 0 0 0 0 2 0 2 0 0 0 2 0 2 0 2 0 0 2 2 1 0 0 0 2 1 2 0 2 2 2 0
2 2 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 2
0 0 0 0 0 0 0 0 2]
相关文章推荐
- Python集合(set)类型的操作
- python 元类型编程, 单例模式SingleTon的一种实现方式
- Python数据类型 之 int
- Python之路(三)Python数据类型
- Python数据类型转换及一些操作
- python继承,判断类型,多态
- python判断变量类型
- 005 Python前端之元素类型
- Python一天入门2:数据类型
- 廖雪峰老师——Python入门(Python变量和数据类型 )
- python基本数据类型
- python如何查看编码类型
- Python核心数据类型——列表与字典
- python各种类型转换-int,str,char,float,ord,hex,oct等
- Python的bool类型
- python 数据类型
- Python的类型和变量
- python post content-type:multipart/form-data 类型的表单数据
- 第二课——python中的数据类型
- Python 动态类型 变量基本上是动态变量