Python解析HTML实例
2016-11-14 21:27
176 查看
# coding:utf-8
"""Scrape store listings region by region and export them to an Excel sheet.

The script downloads a store-finder page, collects the region codes found in
its <option> tags, then downloads one page per region and writes the store
names, contact/address text and rating text into an xlwt workbook.
"""
import re
import time
import urllib.request
from html.parser import HTMLParser
from tempfile import TemporaryFile

import xlwt
from xlwt import Workbook

info = ''
# Matches any run of CJK ideographs; used to drop names without Chinese text.
zhPattern = re.compile(u'[\u4e00-\u9fa5]+')
# Output directory for the generated workbook.
inforst = 'C:/Users/玲玲/PycharmProjects/untitled/***/'
# Workbook that receives the scraped rows.
book = Workbook()
# Cell style with automatic line wrapping.
style = xlwt.easyxf('align: wrap on')
# Time format used to build the output file name.
ISOTIMEFORMAT = '%Y-%m-%d %X'

# HTMLParser callback methods, for reference:
#   handle_startendtag  handles a combined start/end tag
#   handle_starttag     handles a start tag, e.g. <xx>
#   handle_endtag       handles an end tag, e.g. </xx>
#   handle_charref      handles character references, i.e. those starting
#                       with &#, usually a numeric code for the character
#   handle_entityref    handles entity references starting with &,
#                       e.g. &nbsp;
#   handle_data         handles data, i.e. the text in <xx>data</xx>
#   handle_comment      handles comments
#   handle_decl         handles declarations starting with <!, e.g.
#                       <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01
#                       Transitional//EN"
#   handle_pi           handles things of the form <?instruction>


class myHtmlParser(HTMLParser):
    """Collect ratings, store names, address text and region codes.

    Results accumulate in ``scores``, ``names``, ``addresses`` and
    ``areacodes`` while HTML is fed to the parser.
    """

    def __init__(self):
        """Initialise the state flags and the result lists."""
        HTMLParser.__init__(self)
        self.pflag = 0      # becomes 1 once the first <p>/<br> has been seen
        self.showflag = 1   # 1 while text should be captured by handle_data
        self.scores = []
        self.names = []
        self.addresses = []
        self.areacodes = []

    def handle_starttag(self, tag, attrs):
        """Record data carried in tag attributes and update capture flags.

        ``attrs`` is a list of ``(name, value)`` pairs; ``value`` is ``None``
        for attributes written without a value, so it is checked before any
        substring test.

        NOTE(review): the source this was recovered from had lost its
        indentation. The placement of the ``showflag = 0`` assignments and
        of the final ``else`` is the most coherent reading - confirm against
        the live page if the captured text looks wrong.
        """
        if tag == 'p' or tag == 'br':
            self.pflag = 1
            self.showflag = 1
        elif tag == 'img':
            # Store rating: kept when the title text mentions '店面'.
            for name, value in attrs:
                if name == 'title' and value and '店面' in value:
                    self.scores.append(value)
            self.showflag = 0
        elif tag == 'a':
            # Full store name, cut out of the onclick attribute.
            for name, value in attrs:
                if name == 'onclick' and value:
                    # Drops the first 6 and last 3 characters of the handler
                    # text (presumably a fixed call wrapper - confirm).
                    store = value[6:-3]
                    # De-duplicate and skip entries without Chinese text.
                    if store not in self.names and zhPattern.search(store):
                        self.names.append(store)
            self.showflag = 0
        elif tag == 'option':
            # Region code: every option value that does not contain '-1'.
            for name, value in attrs:
                if name == 'value' and value is not None and '-1' not in value:
                    self.areacodes.append(value)
        else:
            self.showflag = 0

    def handle_data(self, data):
        """Capture non-blank text (store contact details and address)."""
        text = str(data).strip()
        if self.pflag == 1 and self.showflag == 1 and text != '':
            self.addresses.append(text)

    def get_scores(self):
        """Return the collected rating strings."""
        return self.scores

    def get_names(self):
        """Return the collected store names."""
        return self.names

    def get_addresses(self):
        """Return the collected address/contact text fragments."""
        return self.addresses

    def get_areacodes(self):
        """Return the collected region codes."""
        return self.areacodes


def _fetch_page(url):
    """Download ``url``, parse it and return the populated parser."""
    parser = myHtmlParser()
    req = urllib.request.Request(url)
    # The context manager closes the HTTP response once it has been read.
    with urllib.request.urlopen(req) as fd:
        parser.feed(fd.read().decode('utf-8'))
    # Flush any buffered input before the results are read.
    parser.close()
    return parser


def _merge_addresses(fragments):
    """Join raw text fragments into one string per store.

    A fragment containing '>' starts a new entry; a fragment without it
    completes the pending entry. A pending entry that is followed by another
    '>' fragment is emitted on its own.
    """
    merged = []
    pending = ''
    for fragment in fragments:
        if '>' in fragment:
            if '>' in pending:
                merged.append(pending)
                pending = ''
            pending = pending + fragment
        else:
            merged.append(pending + ' ' + fragment)
            pending = ''
    # The last entry may have no contact details. It is appended even when
    # empty, exactly as before, so the sheet layout is unchanged.
    merged.append(pending)
    return merged


def _split_store_name(name):
    """Return ``(province, city)`` taken from a dash-separated store name.

    The province is the text before the first '-', the city the text between
    the first and the last '-'. Both are empty when the name has no '-'.
    """
    first = name.find('-')
    if first == -1:
        return '', ''
    last = name.rfind('-')
    return name[:first], name[first + 1:last]


def main():
    """Scrape every region page and save the results as an .xls file."""
    # Sheet layout: header row and column widths.
    sheet1 = book.add_sheet('数据')
    header = sheet1.row(0)
    titles = ('所属省份', '所属城市', '店铺名称', '联系方式', '店面综合评价')
    for col, title in enumerate(titles):
        header.write(col, title)
    for col, width in enumerate((3000, 3000, 15000, 20000, 10000)):
        sheet1.col(col).width = width

    # Fetch the region codes from one seed page.
    seed = _fetch_page('http://www.***.com.cn/Find_***_store/index.html?province=360000&city=-1')
    areacodes = seed.get_areacodes()

    start_row = 1  # first free sheet row for the region being written
    # The first collected code is skipped, as in the original loop
    # (reason not visible here - TODO confirm).
    for areacode in areacodes[1:]:
        url = 'http://www.***.com.cn/Find_***_store/index.html?province=' + areacode + '&city=-1'
        page = _fetch_page(url)

        # Ratings go to column 4; the first six characters of each title
        # are dropped.
        score_row = start_row
        for score in page.get_scores():
            sheet1.row(score_row).write(4, score[6:], style)
            score_row = score_row + 1

        # Province, city and full store name go to columns 0-2.
        name_row = start_row
        for name in page.get_names():
            province, city = _split_store_name(str(name))
            row = sheet1.row(name_row)
            row.write(0, province, style)
            row.write(1, city, style)
            row.write(2, name, style)
            name_row = name_row + 1

        # Merged address/contact text goes to column 3, minus its first
        # character.
        address_row = start_row
        for address in _merge_addresses(page.get_addresses()):
            sheet1.row(address_row).write(3, address[1:], style)
            address_row = address_row + 1

        # Continue below the longest column so the next region can never
        # write into cells that are already filled.
        start_row = max(score_row, name_row, address_row)

    systime = str(time.strftime(ISOTIMEFORMAT, time.localtime())).replace(':', '')
    book.save(inforst + '数据' + systime + '.xls')
    # Second save to a throwaway temporary file, retained from the original
    # script; the context manager closes the handle.
    with TemporaryFile() as tmp:
        book.save(tmp)


if __name__ == '__main__':
    main()
相关文章推荐
- Python HTMLParser模块解析html获取url实例
- python 解析html基础 HTMLParser库,方法,及代码实例
- Python中使用HTMLParser解析html实例
- python解析html提取数据,并生成word文档实例解析
- Python中使用HTMLParser解析html实例
- Python 网页解析HTMLParse的实例详解
- Python写爬虫——抓取网页并解析HTML
- XML WebService完全实例详细解析【转:http://www.cnblogs.com/xqzhao/archive/2008/01/18/1044574.html】
- Python 解析 html 文件
- Python网络编程 HTML解析
- Python网页解析:BeautifulSoup vs lxml.html
- python模块之HTMLParser: 解析html,获取url
- 用python解析html
- python解析xml文档实例
- Python实例讲解 -- 解析xml
- 黄聪:使用Python中的HTMLParser、cookielib抓取和解析网页、从HTML文档中提取链接、图像、文本、Cookies(二)
- Python写爬虫——抓取网页并解析HTML
- python解析html之BeautifulSoup
- 用python解析html[SGMLParser]
- python模块之HTMLParser: 解析html,获取url