您的位置:首页 > 编程语言 > Python开发

Python爬取链家地铁房数据

2017-07-21 10:21 661 查看
#coding=gbk
#因为涉及到中文,utf-8会报错
### 环境:Python 3.6###
import requests
import re
import pandas as pd
import csv
from bs4 import BeautifulSoup
def generate_allurl(user_in_nub):
    """Yield the URL of each listing page, in ascending page order.

    Args:
        user_in_nub: Exclusive upper bound on the page number. It is passed
            through ``int()``, so a numeric string is accepted as well.
            Pages ``1 .. int(user_in_nub) - 1`` are produced, so pass 35 to
            get 34 pages.

    Yields:
        str: The page URL with the page number substituted into ``pg{}``.
    """
    url = 'https://bj.lianjia.com/ditiefang/li647/pg{}/'
    # range() excludes its stop value, hence the "limit - 1" page count.
    for url_next in range(1, int(user_in_nub)):
        yield url.format(url_next)
def main(page_limit=35, csv_path='D:/dst8.csv'):
    """Download the listing pages, extract one row per house, and write a CSV.

    For every URL produced by ``generate_allurl(page_limit)`` the page is
    fetched. Responses whose status code is not 200 are skipped. On a
    successful page the house codes are pulled out of the raw HTML with a
    regular expression and paired, by position, with the text of the
    ``.unitPrice`` and ``.subway`` elements found by BeautifulSoup.

    Args:
        page_limit: Value handed to ``generate_allurl``. The default of 35
            yields pages 1-34.
        csv_path: Destination of the CSV file (written without the index
            column).

    Raises:
        IndexError: If a page has fewer ``.unitPrice`` or ``.subway``
            elements than matched house codes (rows are paired by position).
    """
    # Compiled once: the pattern is the same for every page.
    house_code_re = re.compile('<li.*?class="clear">.*?<a.*?class="img.*?".*?data-housecode="(.*?)"')

    rows = []
    for page_url in generate_allurl(page_limit):
        print("页码"+page_url)
        # Without a timeout a stalled connection would block the run forever.
        res = requests.get(page_url, timeout=30)
        if res.status_code != 200:
            continue

        soup = BeautifulSoup(res.text, 'lxml')
        house_codes = house_code_re.findall(res.text)
        # Number of listings matched on this page.
        print(len(house_codes))

        # Query the document once per page instead of once per listing.
        unit_prices = soup.select('.unitPrice')
        subways = soup.select('.subway')
        for idx, code in enumerate(house_codes):
            rows.append({
                '编号': code + '号',
                '单价': unit_prices[idx].text,
                '地铁': subways[idx].text,
            })
        # Debug dump of everything collected so far.
        print(rows)

    print(len(rows))
    # Write the collected rows to the CSV file.
    pd.DataFrame(rows).to_csv(csv_path, index=False)
# Run the scraper only when this file is executed as a script.
if __name__ == '__main__':
    main()
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: