您的位置:首页 > 编程语言 > Python开发

Python解析HTML实例

2016-11-14 21:27 176 查看
# coding:utf-8
import urllib.request
import re
import xlwt
import time
from xlwt import Workbook
from html.parser import HTMLParser
from tempfile import TemporaryFile
# Directory the result workbook is written into.
inforst = 'C:/Users/玲玲/PycharmProjects/untitled/***/'
# strftime pattern used to timestamp the output file name.
ISOTIMEFORMAT = '%Y-%m-%d %X'
# Matches a run of one or more characters in the range U+4E00..U+9FA5.
zhPattern = re.compile('[\u4e00-\u9fa5]+')
info = ''
# Workbook that receives the scraped rows (the header row is written later).
book = xlwt.Workbook()
# Cell style with text wrapping switched on.
style = xlwt.easyxf('align: wrap on')
'''''
HTMLParser的成员函数:

handle_startendtag  处理自闭合标签(开始即结束),比如<xx/>
handle_starttag     处理开始标签,比如<xx>
handle_endtag       处理结束标签,比如</xx>
handle_charref      处理特殊字符串,就是以&#开头的,一般是内码表示的字符
handle_entityref    处理一些特殊字符,以&开头的,比如 &nbsp;
handle_data         处理数据,就是<xx>data</xx>中间的那些数据
handle_comment      处理注释
handle_decl         处理<!开头的,比如<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
handle_pi           处理形如<?instruction>的东西

'''
class myHtmlParser(HTMLParser):
    """HTML parser that collects store data while a page is fed to it.

    Four lists are filled as tags are seen:

    * ``scores``    -- values of ``<img>`` attributes matched by ``'title'``
      that contain ``'店面'``.
    * ``names``     -- ``<a>`` attribute values matched by ``'onclick'``, with
      the first six and last three characters removed, de-duplicated and
      kept only when ``zhPattern`` finds a match in them.
    * ``addresses`` -- stripped, non-empty text seen while both ``pflag`` and
      ``showflag`` are 1.
    * ``areacodes`` -- ``<option>`` attribute values matched by ``'value'``
      that do not contain ``'-1'``.

    NOTE(review): the text this class was recovered from had lost all of its
    indentation, so the nesting inside ``handle_starttag`` is a
    reconstruction -- confirm it against the intended behaviour.
    """

    def __init__(self):
        """Set up the flags and the empty result lists."""
        super().__init__()
        self.pflag = 0       # becomes 1 once a <p> or <br> has been seen
        self.showflag = 1    # 1 while text should be collected as address data
        self.scores = []
        self.names = []
        self.addresses = []
        self.areacodes = []

    @staticmethod
    def _values_matching(attrs, keyword):
        """Return the values of attributes whose name or value contains *keyword*.

        ``attrs`` is the list of ``(name, value)`` pairs handed to
        ``handle_starttag``.  Attributes written without a value (for
        example ``selected``) arrive with ``None`` as their value; they are
        skipped here, because running ``in`` against ``None`` raises
        ``TypeError``.  Each attribute contributes at most once, even when
        both its name and its value contain the keyword.
        """
        return [
            value
            for name, value in attrs
            if value is not None and (keyword in name or keyword in value)
        ]

    def handle_starttag(self, tag, attrs):
        """Update the flags and result lists for one opening tag."""
        if tag in ('p', 'br'):
            self.pflag = 1
            self.showflag = 1
        elif tag == 'img':  # store score
            for value in self._values_matching(attrs, 'title'):
                if '店面' in value:
                    self.scores.append(value)
            self.showflag = 0
        elif tag == 'a':  # full store name
            for value in self._values_matching(attrs, 'onclick'):
                # Drop the first six and the last three characters.
                candidate = value[6:-3]
                # De-duplicate and discard entries without a zhPattern match.
                if candidate not in self.names and zhPattern.search(candidate):
                    self.names.append(candidate)
            self.showflag = 0
        elif tag == 'option':  # area
            for value in self._values_matching(attrs, 'value'):
                if '-1' not in value:
                    self.areacodes.append(value)
        else:
            self.showflag = 0

    def handle_data(self, data):
        """Record stripped, non-empty text while both flags are 1."""
        text = str(data).strip()
        if self.pflag == 1 and text != '' and self.showflag == 1:
            self.addresses.append(text)

    def get_scores(self):
        """Return the list of collected score strings."""
        return self.scores

    def get_names(self):
        """Return the list of collected store names."""
        return self.names

    def get_addresses(self):
        """Return the list of collected address text fragments."""
        return self.addresses

    def get_areacodes(self):
        """Return the list of collected area codes."""
        return self.areacodes
def fetch_html(url):
    """Download *url* and return the response body decoded as UTF-8.

    The response is used as a context manager so that it is closed as soon
    as the body has been read.
    """
    with urllib.request.urlopen(urllib.request.Request(url)) as response:
        return response.read().decode('utf-8')


def merge_address_lines(lines):
    """Combine text fragments into one entry per store.

    A fragment containing ``'>'`` starts a new entry; if an entry that
    already contains ``'>'`` is still pending, it is emitted first.  A
    fragment without ``'>'`` is appended to the pending entry after a space
    and the entry is emitted.  Whatever is pending at the end (possibly the
    empty string) is always emitted last.

    NOTE(review): the nesting of this logic was reconstructed from text that
    had lost its indentation -- confirm against the intended behaviour.
    NOTE(review): the unconditional final append yields an empty trailing
    entry when the last fragment had no ``'>'``; kept as in the original.
    """
    merged = []
    pending = ''
    for line in lines:
        if '>' in line:
            if '>' in pending:
                # The previous entry never received a second fragment.
                merged.append(pending)
                pending = ''
            pending = pending + line
        else:
            merged.append(pending + ' ' + line)
            pending = ''
    merged.append(pending)
    return merged


def split_store_name(name):
    """Return ``(province, city)`` taken from a hyphen-separated store name.

    The province is the text before the first ``'-'``; the city is the text
    between the first and the last ``'-'``.  A name without any ``'-'``
    yields two empty strings instead of raising ``ValueError``.
    """
    first = name.find('-')
    if first == -1:
        return '', ''
    last = name.rfind('-')
    return name[:first], name[first + 1:last]


if __name__ == '__main__':
    # Header row and column widths of the output sheet.
    sheet1 = book.add_sheet('数据')
    header_row = sheet1.row(0)
    header_row.write(0, '所属省份')
    header_row.write(1, '所属城市')
    header_row.write(2, '店铺名称')
    header_row.write(3, '联系方式')
    header_row.write(4, '店面综合评价')
    for col, width in enumerate((3000, 3000, 15000, 20000, 10000)):
        sheet1.col(col).width = width

    # The first request is only used to read the list of area codes.
    m = myHtmlParser()
    m.feed(fetch_html('http://www.***.com.cn/Find_***_store/index.html?province=360000&city=-1'))
    areacodes = m.get_areacodes()

    # First sheet row of the province currently being written.
    t_rownum = 1

    # NOTE(review): the first collected area code is skipped, as in the
    # original loop (range(1, len(areacodes))) -- confirm this is intended.
    for areacode in areacodes[1:]:
        m = myHtmlParser()
        m.feed(fetch_html('http://www.***.com.cn/Find_***_store/index.html?province=' + areacode + '&city=-1'))
        scores = m.get_scores()
        names = m.get_names()
        addressesT = merge_address_lines(m.get_addresses())

        # Scores: the first six characters of each value are dropped.
        for offset, score in enumerate(scores):
            sheet1.row(t_rownum + offset).write(4, score[6:], style)

        # Province, city and full store name.
        for offset, name in enumerate(names):
            province, city = split_store_name(name)
            row = sheet1.row(t_rownum + offset)
            row.write(0, province, style)
            row.write(1, city, style)
            row.write(2, name, style)

        # Merged address entries: the first character of each is dropped.
        for offset, address in enumerate(addressesT):
            sheet1.row(t_rownum + offset).write(3, address[1:], style)

        # Start the next province below the longest column, so rows that
        # were already written are never written to again.
        t_rownum += max(len(scores), len(names), len(addressesT))

    systime = str(time.strftime(ISOTIMEFORMAT, time.localtime())).replace(':', '')
    book.save(inforst + '数据' + systime + '.xls')
    # The original also saved into a temporary file; the file is now closed
    # explicitly instead of being left open.
    with TemporaryFile() as scratch:
        book.save(scratch)
    m.close()
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签:  python