python爬取链家数据
2016-12-07 00:31
253 查看
V1版
注:通过sublime新建.py文件无法command+B 执行,可能与本身环境变量未设定有关,spyder新建的可执行import sys reload(sys) sys.setdefaultencoding("utf-8") #设定编码要放在最上面。之前放在引用库的最下面,然后加到pandas的DataFrame老是出现中文字符为问号的乱码 import pandas as pd import urllib2 import urllib import time import re from bs4 import BeautifulSoup #读取指定网页的内容 myurl="http://hz.lianjia.com/ershoufang/pg"+str(1) req = urllib2.Request(myurl) myResponse = urllib2.urlopen(req) myPage = myResponse.read() unmyPage = myPage.decode('utf-8') #转换为Unicode类型 #匹配房源的总价 c1=re.findall('<div.*?class="totalPrice".*?><span>(.*?)</span>(.*?)</div>',unmyPage,re.S) totalPrice=[] for item in c1: newitem=item[0]+item[1] newitem=str(newitem) totalPrice.append(newitem) #匹配房源信息 c2=re.findall('data-el="region">(.*?)</div>',unmyPage,re.S) houseinfo=[] for item in c2: #item=item.encode('utf-8') #print isinstance(item,str) houseinfo.append(item) #匹配房源关注度 c3=re.findall('<span.*?class="starIcon"></span>(.*?)</div>',unmyPage,re.S) followinfo=[] for item in c3: followinfo.append(item) house=pd.DataFrame({'totalprice':totalPrice,'houseinfo':houseinfo, 'followinfo':followinfo}) print type(house['totalprice'][0]) #str print type(house['houseinfo'][0]) #unicode print type(house['followinfo'][0]) #unicode print house.head()
参考字符串编码
http://ajucs.com/2015/11/10/Python-character-encoding-explained.html
http://wklken.me/posts/2013/08/31/python-extra-coding-intro.html
V2版
import sys reload(sys) sys.setdefaultencoding("utf-8") import pandas as pd import urllib2 import urllib import time import re from bs4 import BeautifulSoup totalPrice=[] houseinfo=[] followinfo=[] for i in range(1,3): #读取指定网页的内容 myurl="http://hz.lianjia.com/ershoufang/pg"+str(i) req = urllib2.Request(myurl) myResponse = urllib2.urlopen(req) myPage = myResponse.read() unmyPage = myPage.decode('utf-8') #匹配房源的总价 c1=re.findall('<div.*?class="totalPrice".*?><span>(.*?)</span>(.*?)</div>',unmyPage,re.S) for item in c1: newitem=item[0]+item[1] totalPrice.append(newitem) #print newitem #匹配房源信息 c2=re.findall('data-el="region">(.*?)</div>',unmyPage,re.S) for item in c2: item=re.sub('</a>','',item) #将</a>替换掉 houseinfo.append(item) #匹配房源关注度 c3=re.findall('<span.*?class="starIcon"></span>(.*?)</div>',unmyPage,re.S) for item in c3: followinfo.append(item) house=pd.DataFrame({'totalprice':totalPrice,'houseinfo':houseinfo, 'followinfo':followinfo}) #print house,house.index #使用pandas对数据进行清洗 houseinfo_split = pd.DataFrame((x.split('|') for x in house.houseinfo),index=house.index, columns=['小区','户型','面积','朝向','装修','电梯']) print houseinfo_split
V3版
# -*- coding: utf-8 -*-
# V3: scrape pages 1-5 of Lianjia (Hangzhou) second-hand listings, clean the
# data with pandas and draw three bar charts (house-type, floor-area and
# follower-count distributions).
# NOTE: the three figures cover one another -- close a window to reveal the
# next chart; "finished" only appears after all of them are closed.
import sys
reload(sys)
sys.setdefaultencoding("utf-8")  # must come first, otherwise Chinese text
                                 # renders as '?' in pandas DataFrames
import pandas as pd
import urllib2
import urllib
import time
import re
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import KMeans

totalPrice = []
houseinfo = []
followinfo = []

# 2016-12-08: page 7 contains a detached villa whose houseinfo layout differs
# from the other listings, so only pages 1-5 are scraped for now.
for i in range(1, 6):
    # Fetch page i and decode it to unicode.
    myurl = "http://hz.lianjia.com/ershoufang/pg" + str(i)
    req = urllib2.Request(myurl)
    myResponse = urllib2.urlopen(req)
    myPage = myResponse.read()
    unmyPage = myPage.decode('utf-8')

    # Total price of each listing (number + unit captured separately),
    # re-encoded to utf-8 byte strings for the later str operations.
    c1 = re.findall('<div.*?class="totalPrice".*?><span>(.*?)</span>(.*?)</div>', unmyPage, re.S)
    for item in c1:
        newitem = item[0] + item[1]
        newitem = newitem.encode('utf-8')
        totalPrice.append(newitem)
        #print newitem

    # House info; drop the stray closing anchor tag left by the regex.
    c2 = re.findall('data-el="region">(.*?)</div>', unmyPage, re.S)
    for item in c2:
        item = re.sub('</a>', '', item)
        item = item.encode('utf-8')
        houseinfo.append(item)
        #print item

    # Follower information.
    c3 = re.findall('<span.*?class="starIcon"></span>(.*?)</div>', unmyPage, re.S)
    for item in c3:
        item = item.encode('utf-8')
        followinfo.append(item)

house = pd.DataFrame({'totalprice': totalPrice,
                      'houseinfo': houseinfo,
                      'followinfo': followinfo})
#print house, house.index

# ---- Cleaning with pandas ----
# Split houseinfo on '|' into named columns and merge back by index.
houseinfo_split = pd.DataFrame((x.split('|') for x in house.houseinfo),
                               index=house.index,
                               columns=['xiaoqu','huxing','mianji','chaoxiang','zhuangxiu','dianti'])
house = pd.merge(house, houseinfo_split, right_index=True, left_index=True)

# Split followinfo on '/' into named columns and merge back by index.
followinfo_split = pd.DataFrame((x.split('/') for x in house.followinfo),
                                index=house.index,
                                columns=['guanzhu','daikan','fabu'])
house = pd.merge(house, followinfo_split, right_index=True, left_index=True)
#print house.head()

# ---- Chart 1: distribution of house types ----
huxing = house.groupby('huxing')['huxing'].agg(len)
#print huxing
plt.rc('font', family='STXihei', size=11)
a = np.array([1,2,3,4,5,6,7,8,9,10,11,12])
plt.barh([1,2,3,4,5,6,7,8,9,10,11,12], huxing, color='#052B6C', alpha=0.8,
         align='center', edgecolor='white')
plt.ylabel('户型')
plt.xlabel('数量')
plt.xlim(0,150)
plt.ylim(0,15)
plt.title('房源户型分布情况')
plt.legend(['数量'], loc='upper right')
plt.grid(color='#95a5a6', linestyle='--', linewidth=1, axis='y', alpha=0.4)
plt.yticks(a, ('1室0厅','1室1厅','2室0厅','2室1厅','2室2厅','3室1厅','3室2厅','4室1厅','4室2厅','4室3厅','5室2厅','6室3厅'))
plt.show()

# ---- Chart 2: distribution of floor area ----
# Split "NNN平米" on '平' to isolate the numeric part.
mianji_num_split = pd.DataFrame((x.split('平') for x in house.mianji),
                                index=house.index, columns=['mianji_num','mi'])
#print mianji_num_split.head()
# Merge the split columns back into the main table.
house = pd.merge(house, mianji_num_split, right_index=True, left_index=True)
# Trim whitespace, then convert to float for binning.
house['mianji_num'] = house['mianji_num'].map(str.strip)
house['mianji_num'] = house['mianji_num'].astype(float)
# Inspect the overall range of areas.
house['mianji_num'].max(), house['mianji_num'].min()
# Bucket the areas into labelled groups (adds a new column).
bins = [0, 50, 100, 150, 200, 250, 300]
group_mianji = ['小于50', '50-100', '100-150', '150-200','200-250','250-300']
house['group_mianji'] = pd.cut(house['mianji_num'], bins, labels=group_mianji)
# Count listings per area group.
group_mianji = house.groupby('group_mianji')['group_mianji'].agg(len)
#print group_mianji
plt.rc('font', family='STXihei', size=15)
a = np.array([1,2,3,4,5,6])
plt.barh([1,2,3,4,5,6], group_mianji, color='#052B6C', alpha=0.8,
         align='center', edgecolor='white')
plt.ylabel('面积分组')
plt.xlabel('数量')
plt.title('房源面积分布')
plt.legend(['数量'], loc='upper right')
plt.grid(color='#95a5a6', linestyle='--', linewidth=1, axis='y', alpha=0.4)
plt.yticks(a, ('小于50', '50-100', '100-150', '150-200','200-250','250-300'))
plt.show()

# ---- Chart 3: distribution of follower counts ----
# "NN人关注" -> number before '人'; "NNN万" -> number before '万' (the unit
# column ends up empty).
guanzhu_num_split = pd.DataFrame((x.split('人') for x in house.guanzhu),
                                 index=house.index, columns=['guanzhu_num','ren'])
totalprice_num_split = pd.DataFrame((x.split('万') for x in house.totalprice),
                                    index=house.index, columns=['totalprice_num','wan'])
# Merge the split columns back into the main table.
house = pd.merge(house, guanzhu_num_split, right_index=True, left_index=True)
house = pd.merge(house, totalprice_num_split, right_index=True, left_index=True)
# Trim whitespace before the numeric conversion.
house['guanzhu_num'] = house['guanzhu_num'].map(str.strip)
house['totalprice_num'] = house['totalprice_num'].map(str.strip)
# BUGFIX: the original assigned to ['guanzhu_num','totalprice'] -- a
# column-name mismatch that wrote the converted prices into the raw string
# 'totalprice' column and left 'totalprice_num' as strings. Convert both
# *_num columns in place instead.
house[['guanzhu_num','totalprice_num']] = house[['guanzhu_num','totalprice_num']].astype(float)
# Inspect the overall range of follower counts.
house['guanzhu_num'].min(), house['guanzhu_num'].max()
# Bucket the follower counts into labelled groups.
bins = [0, 100, 200, 300, 400]
group_guanzhu = ['小于100', '100-200', '200-300', '300-400']
house['group_guanzhu'] = pd.cut(house['guanzhu_num'], bins, labels=group_guanzhu)
group_guanzhu = house.groupby('group_guanzhu')['group_guanzhu'].agg(len)
plt.rc('font', family='STXihei', size=15)
a = np.array([1,2,3,4])
plt.barh([1,2,3,4], group_guanzhu, color='#052B6C', alpha=0.8,
         align='center', edgecolor='white')
plt.ylabel('关注度分组')
plt.xlabel('数量')
plt.xlim(0,400)
plt.title('房源关注度分布')
plt.legend(['数量'], loc='upper right')
plt.grid(color='#95a5a6', linestyle='--', linewidth=1, axis='y', alpha=0.4)
plt.yticks(a, ('小于100', '100-200', '200-300', '300-400'))
plt.show()
相关文章推荐
- Python爬虫:获取链家,搜房,大众点评的数据
- Python爬取链家的数据并绘制热力图
- Python数据爬虫,爬链家的二手房信息
- [置顶] 【python 爬虫】链家天津租房在售房源数据爬虫
- Python爬取链家地铁房数据
- python爬虫抓取链家租房数据
- 【Python爬虫系列】Python 爬取上海链家二手房数据
- python爬取链家新房数据
- [Python]Python/PHP如何查询sql server中NTEXT类型数据
- Python 起步:数据类型一数字
- [Python]Python/PHP如何查询sql server中NTEXT类型数据
- python 数据加密代码
- python天天进步(6)--网络编程之数据传输TCP
- [Python]Python/PHP如何查询sql server中NTEXT类型数据
- Python例程:使用adodbapi存取二进制数据
- 用 Python 的输入输出功能读取和写入数据
- python 宝典 笔记 第十二章 存储数据和对象 (各种对象转换成字符串)
- python 数据加密代码
- python操作数据时编码问题
- Delphi中使用python脚本读取Excel数据