您的位置:首页 > 编程语言 > Python开发

【Python】爬虫-获取五级行政区划(2018)

2019-07-07 09:38 148 查看
版权声明:本文为博主原创文章,遵循 CC 4.0 by-sa 版权协议,转载请附上原文出处链接和本声明。 本文链接:https://blog.csdn.net/weixin_43826590/article/details/94963563

更新于2019-07-07,不足之处还望批评指正

目前尚存在的问题:

(1)连接重置后尝试重连的问题

(2)字符集为GBK

最终效果(注意字符集)

以下是源码:

[code]# -*- coding: UTF-8 -*-
"""
获取统计用区划代码和城乡划分代码
2018年度
http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/
"""
import re
import requests
import time
from bs4 import BeautifulSoup

# Browser-like User-Agent so stats.gov.cn does not reject the requests.
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36"
}
# Output file for the generated INSERT statements.
# Fix: mode was 'r+', which raises FileNotFoundError when the file does not
# exist yet and leaves stale trailing bytes when a shorter run overwrites a
# longer one; 'w' creates/truncates. An explicit utf-8 encoding avoids the
# locale-dependent default (often GBK on Chinese Windows — the charset issue
# the author lists as a known problem).
f = open('getArea_sql.txt', 'w', encoding='utf-8')

def getItem(itemData, dataArray, parentRequestUrl, type):
    """Build one area record from a table row and write its INSERT statement.

    Args:
        itemData: parent record dict; its 'code' becomes parent_code.
        dataArray: the row's cell tags (BeautifulSoup); [0] holds the code
            (and, below village level, the child-page link), [1] the name
            ([2] at village level, where an urban/rural classification cell
            sits in between).
        parentRequestUrl: URL of the page the row came from; its last path
            segment is replaced to build the child-page URL.
        type: administrative level 1-5 (parameter name kept for caller
            compatibility even though it shadows the builtin).

    Returns:
        dict with name, parentCode, type, code and (for type != 5) href.
    """
    item = {}
    # Village rows carry an extra urban/rural class cell before the name.
    name_idx = 2 if type == 5 else 1
    item['name'] = str(dataArray[name_idx].get_text())
    # Base of the child-page URL: everything up to the last '/'.
    base = re.findall('(.*)/', parentRequestUrl)
    if type != 5:  # villages are leaves: no further page to descend into
        item['href'] = base[0] + '/' + dataArray[0].get('href')
    # Parent code
    item['parentCode'] = itemData.get('code')
    # Level
    item['type'] = type
    # Province/city/county codes are 6 digits; town/village codes are 12.
    code_len = 6 if type <= 3 else 12
    item['code'] = str(dataArray[0].get_text())[0:code_len]
    # Escape embedded single quotes so the generated SQL stays valid.
    safe_name = item['name'].replace("'", "''")
    f.write('insert into tb_sys_area (code, name, level, parent_code) values (\'%s\', \'%s\', %s, \'%s\')' % (
        item['code'], safe_name, item['type'], item['parentCode']
    ) + ";\n")
    return item

# 获取BeautifulSoup
def getSoup(requestUrl):
# 数据量太大,有abort的可能
try:
htmls = requests.get(requestUrl, headers=headers)
except requests.exceptions.ConnectionError:
time.sleep(5)
htmls = requests.get(requestUrl, headers=headers)
htmls.encoding = 'GBK'
soup = BeautifulSoup(htmls.text, 'html.parser')
return soup

# 循环处理
def loopItem(label, labelClass, labelChild, item, requestUrl, type, lists):
for link in soup.find_all(label, labelClass):
array = link.find_all(labelChild, class_='')
if not len(array):
continue
itemData = getItem(item, array, requestUrl, type)
lists.append(itemData)

# Level 1: provinces, scraped from the 2018 index page.
BASE_URL = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/'
requestProvinceUrl = BASE_URL + 'index.html'
soup = getSoup(requestProvinceUrl)

provinceList = []
for link in soup.find_all('a', class_=''):
    # Fix: the original recomputed an unused `requestCityUrl` via re.findall
    # on every iteration — dead code, removed.
    href = str(link.get('href'))
    item = {
        # Name
        'name': str(link.get_text()),
        # Next-level page URL
        'href': BASE_URL + href,
        # Provinces are roots of the hierarchy
        'parentCode': '0',
        # Level
        'type': '1',
        # Province pages are named "<2-digit code>.html"; pad to 6 digits.
        'code': href.split('.')[0] + '0000',
    }
    provinceList.append(item)
    f.write('insert into tb_sys_area (code, name, level, parent_code) values (\'%s\', \'%s\', %s, \'%s\')' % (
        item['code'], item['name'], item['type'], item['parentCode']
    ) + ";\n")

def _crawl_children(parents, rowClass, childLabel, level):
    """Fetch each parent's page and collect its child-area records.

    Replaces four copy-pasted loops (city/county/town/village). Assigns the
    module-level `soup` before each loopItem call, preserving the original
    global-state contract that loopItem depends on.
    """
    global soup
    children = []
    for parent in parents:
        url = str(parent.get('href'))
        soup = getSoup(url)
        loopItem('tr', rowClass, childLabel, parent, url, level, children)
    return children

# Levels 2-5: cities, counties, towns, villages.
# Village rows carry no <a> link, so their cells are plain <td> tags.
cityList = _crawl_children(provinceList, 'citytr', 'a', 2)
countyList = _crawl_children(cityList, 'countytr', 'a', 3)
townList = _crawl_children(countyList, 'towntr', 'a', 4)
villageList = _crawl_children(townList, 'villagetr', 'td', 5)

# Fix: the original never closed the output file; flush and release it.
f.close()

 

内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: