您的位置:首页 > 编程语言 > Python开发

简单python脚本 爬取杭州链家二手房 房价信息

2020-02-02 00:56 316 查看

爬取链家房价信息

主要使用以下库:

requests
BeautifulSoup

相关问题

当爬取次数太多时,可能会遇到 IP 被封或者要求输入验证码的情况。这个脚本没有处理这些情况(只是自己随手用的脚本,我也没有找到特别好的解决方案),大家可以自行解决,常见的思路有:

  1. 更换UA
  2. 使用代理
  3. 使用自动打码的玩意或者自动识别验证码

整个脚本比较简单:拼接出相应的 URL,用 requests 拿到页面内容,再用 BeautifulSoup 把相关信息逐项解析出来,写入 CSV 文件(可以直接用 Excel 打开)。

当然,各位不要频繁爬取数据,否则涉及到爬虫问题就尴尬了

具体脚本如下:

# -*- coding: UTF-8 -*-
import json
import urllib2, urllib
import cookielib
import csv
import codecs
import random, time
import cStringIO
import sys
import re
from bs4 import BeautifulSoup
from PIL import Image
import socket, os
import requests

# Areas of Hangzhou to scrape, as {display name: Lianjia URL slug}.
# The display name is only printed as progress output; the slug is spliced
# into the listing URL and into the name of the output CSV file.
#
# Alternative mappings that can be swapped in (all districts, or one area):
# areas_rent = {'江干':'jianggan','余杭':'yuhang','西湖':'xihu','萧山':'xiaoshan','滨江':'binjiangb',
#        '拱墅':'gongshu','下城':'xiacheng','上城':'shangcheng','富阳':'fuyang','临安':'linan',
#         '桐庐':'tonglu','淳安':'chunan','建德':'jiande'}
# areas_rent = {'丁桥': 'dingqiao'}
# areas_rent = {'景芳': 'jingfang1'}
# areas_rent = {'未来科技城':'weilaikejicheng'}
areas_rent = {'闲林': 'xianlin1'}

# The three dicts below hold the fields collected for one listing. Every
# field starts as the placeholder '无' ("none") and is overwritten with the
# scraped value when the detail page provides it.

# Headline figures: total price, unit price, community name, area,
# construction time.
tmp_total = {
    '总价': '无',
    '单价': '无',
    '小区名称': '无',
    '所在区域': '无',
    '建楼时间': '无',
}

# Fields read from the "base" section of the detail page (layout, floor,
# areas, orientation, decoration, elevator, ...).
tmp1 = {
    '房屋户型': '无',
    '所在楼层': '无',
    '建筑面积': '无',
    '户型结构': '无',
    '套内面积': '无',
    '建筑类型': '无',
    '房屋朝向': '无',
    '建筑结构': '无',
    '装修情况': '无',
    '梯户比例': '无',
    '配备电梯': '无',
    '产权年限': '无',
    '用水类型': '无',
    '用电类型': '无',
}

# Fields read from the "transaction" section of the detail page (listing
# date, ownership, last sale, usage, mortgage information, ...).
tmp2 = {
    '挂牌时间': '无',
    '交易权属': '无',
    '上次交易': '无',
    '房屋用途': '无',
    '房屋年限': '无',
    '产权所属': '无',
    '抵押信息': '无',
    '房本备件': '无',
    '房协编码': '无',
}

# Browser User-Agent strings; one is picked at random for every request.
_USER_AGENTS = (
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.89 Safari/537.36',
    'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
    'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0',
    'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; .NET CLR 3.0.30729; .NET CLR 3.5.30729; InfoPath.3; rv:11.0) like Gecko',
)


def gethtml(url, proxy_ran=None, timeout=10):
    """Download *url* and return the page parsed with BeautifulSoup.

    Args:
        url: Address of the page to fetch.
        proxy_ran: Optional proxies mapping handed to ``requests.get`` as
            ``proxies``. ``None`` (the default) means no explicit proxies.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot hang the whole run.

    Returns:
        A ``BeautifulSoup`` document built with the ``lxml`` parser.

    Raises:
        requests.exceptions.RequestException: if the request fails or
            times out.
    """
    my_headers = {'User-Agent': random.choice(_USER_AGENTS)}
    reponse = requests.get(url, headers=my_headers, proxies=proxy_ran, timeout=timeout)
    # Decode the body with the encoding detected from its content so that
    # the text is rendered correctly.
    reponse.encoding = reponse.apparent_encoding
    return BeautifulSoup(reponse.text, "lxml")

# Matches the class attribute of the anchor that links a search result to
# its detail page (the attribute may carry one extra trailing character).
_DETAIL_LINK_CLASS = re.compile("noresultRecommend img.?")


def _clean_utf8(text):
    """Strip surrounding whitespace, drop newlines and UTF-8 encode *text*."""
    return text.strip().replace('\n', '').encode('utf-8')


def gethouseinfo(soup, areas):
    """Scrape every listing on one result page and write one CSV row each.

    For each result item the detail page is downloaded with ``gethtml`` and
    its price, community, area, "base" and "transaction" fields are
    collected. Fields the page does not provide keep the placeholder from
    the module-level ``tmp_total`` / ``tmp1`` / ``tmp2`` templates. The
    templates are copied for every listing so that a value scraped for one
    listing can never leak into the row of the next one.

    Rows are written through the module-level ``writer``, which must exist
    before this function is called.

    Args:
        soup: Parsed search-result page.
        areas: Unused; kept so existing callers keep working.
    """
    for houseinfo in soup.find_all("li", class_="clear LOGCLICKDATA"):
        # Locate the URL of the listing's detail page.
        link = houseinfo.find("a", class_=_DETAIL_LINK_CLASS)
        if link is None:
            # Result tile without a detail link: nothing to scrape.
            continue
        house_url = link.get('href')
        print(">>>house_url>>>" + house_url)
        soup2 = gethtml(house_url)

        # Fresh per-listing copies of the placeholder templates.
        total = dict(tmp_total)
        base = dict(tmp1)
        trans = dict(tmp2)

        total_info = soup2.find("div", class_="price")
        total['总价'] = _clean_utf8(total_info.find("span", class_="total").string)
        total['单价'] = _clean_utf8(total_info.find("span", class_="unitPriceValue").get_text())
        community_info = soup2.find("div", class_="communityName")
        total['小区名称'] = _clean_utf8(community_info.find("a", class_="info").string)
        area_info = soup2.find("div", class_="areaName")
        total['所在区域'] = _clean_utf8(area_info.find("span", class_="info").get_text())
        build_info = soup2.find("div", class_="houseInfo")
        total['建楼时间'] = _clean_utf8(
            build_info.find("div", class_="area").find("div", class_="subInfo").string)

        # "base" section: the value is the node right after the label span.
        for item in soup2.find("div", class_="base").find_all("li"):
            label = item.find("span", class_="label")
            base[_clean_utf8(label.string)] = _clean_utf8(label.next_sibling.string)

        # "transaction" section: the value sits two siblings after the label.
        for item in soup2.find("div", class_="transaction").find_all("li"):
            label = item.find("span", class_="label")
            trans[_clean_utf8(label.string)] = _clean_utf8(
                label.next_sibling.next_sibling.string)

        # Values are UTF-8 encoded byte strings so that the CSV shows
        # Chinese text; the order matches the header row of the CSV.
        datas = [
            total['小区名称'], total['所在区域'], total['总价'], total['单价'], base['房屋户型'],
            base['建筑面积'], base['套内面积'], base['所在楼层'], base['装修情况'], base['产权年限'],
            total['建楼时间'], trans['交易权属'], trans['房屋用途'], trans['房屋年限'], trans['挂牌时间'],
            base['房屋朝向'], base['配备电梯'], base['梯户比例'],
        ]
        writer.writerow(datas)
        # Pause between detail-page requests to avoid hammering the site.
        time.sleep(1)

def insertinfo(areas):
    """Walk every result page of each area and scrape its listings.

    For each area the first result page is fetched to read the total page
    count from the pagination element's ``page-data`` JSON attribute; then
    every page is fetched in turn and handed to ``gethouseinfo``. An area
    whose first page has no pagination element is reported and skipped
    instead of aborting the run with an ``AttributeError``.

    Errors raised while fetching or parsing a page are not caught here and
    propagate to the caller.

    Args:
        areas: Mapping of display name (printed only) to the URL slug of
            the area.
    """
    for area_k, area_v in areas.items():
        print(">>>>>>>>>>>>>>查找%s地区信息<<<<<<<<<<<<<<" % area_k)
        # Listing URLs look like
        # https://hz.lianjia.com/ershoufang/yuhang/pg1l2l3bp150ep210/
        base_url = 'https://hz.lianjia.com/ershoufang/' + area_v + '/'
        # Fixed search filter appended to every URL.
        # NOTE(review): presumably room-count and price-range filters -- confirm.
        search_filter = 'l1l2l3bp150ep210/'

        url = base_url + search_filter
        print("++++_____++++" + url)
        soup = gethtml(url)
        page_box = soup.find('div', class_='page-box house-lst-page-box')
        if page_box is None:
            # No pagination element on the page, so the page count is unknown.
            print("未找到分页信息, 跳过: " + url)
            continue
        total_page = json.loads(page_box.get('page-data'))["totalPage"]
        print("总页数: %d" % total_page)

        for i in range(1, total_page + 1):
            url = base_url + 'pg%d' % i + search_filter
            print("++++_____++++>" + url)
            soup2 = gethtml(url)
            gethouseinfo(soup2, areas)
            print("完成第 %d 页" % i)
            # Pause between result pages to avoid hammering the site.
            time.sleep(3)

# Script entry point: scrape every configured area into one CSV file named
# after the first slug in ``areas_rent``. The ``with`` block guarantees the
# file is flushed and closed even when scraping fails part-way, so the rows
# collected so far are kept.
with open('house_info_%s.csv' % areas_rent.values()[0], 'wb') as csvfile:
    # Start the file with a UTF-8 byte-order mark.
    # NOTE(review): presumably so spreadsheet tools detect the encoding and
    # show the Chinese text correctly -- confirm.
    csvfile.write(codecs.BOM_UTF8)
    # ``writer`` must stay a module-level name: gethouseinfo() writes its
    # rows through it.
    writer = csv.writer(csvfile)
    writer.writerow(['     小区名称     ', '    所在区域    ', '  总价  ', '  单价  ', '房屋户型', '建筑面积', '套内面积', '所在楼层', '装修情况', '产权年限','   建楼时间   ', '交易权属', '房屋用途', '房屋年限', '  挂牌时间  ', '房屋朝向', '配备电梯', '梯户比例'])
    print(u"查找房源信息")
    insertinfo(areas_rent)
print(u"数据读取完成。")
  • 点赞
  • 收藏
  • 分享
  • 文章举报
消逝的异次元 发布了3 篇原创文章 · 获赞 0 · 访问量 841 私信 关注
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: