Python正则表达式
2015-09-09 21:16
736 查看
代码实例:
# -*- coding: utf-8 -*-
#Author:qiang
import re
import os
import csv
class HTML_Process(object):
    """Extract meter-reading records from an HTML page and save them as CSV.

    Constructing an instance immediately parses ``page`` with regular
    expressions and writes every complete record to ``output_file.txt``
    in the current directory.

    The script targets Python 2: ``save_data`` opens its output file in
    binary mode, which is what the Python 2 ``csv`` module expects.
    """

    # Keys of the per-record dictionaries, in the same order as the values
    # captured from the page and as the columns of the CSV header.
    FIELD_NAMES = (
        "user_number",          # user number
        "user_type",            # user type
        "meter_point_name",     # metering point name
        "asset_number",         # asset number
        "factory_number",       # factory number
        "display_type",         # reading type
        "last_display",         # previous reading
        "current_display",      # current reading
        "comprehensive_rate",   # composite multiplier
        "last_power",           # previously recorded energy
        "current_power",        # current energy
        "status",               # meter-reading status
        "abnormal_assort",      # meter-reading anomaly category
        "data_from",            # meter-reading data source
        "user_address",         # electricity service address
    )

    def __init__(self, page):
        """Parse ``page`` (the HTML text) and write the result to disk."""
        self.page = page
        self.record_dict, self.record_arr = self.deal_page()
        self.save_data()
        print("\nfinish")

    def save_data(self, output_path="output_file.txt"):
        """Write the header row and every parsed record to a CSV file.

        ``output_path`` defaults to the file name the script has always
        used, so existing callers are unaffected.
        """
        header = ["用户编号", "用户类型", "计量点名称", "资产编号", "出厂编号", "示数类型", "上次示数", "本次示数", "综合倍率",
                  "上次抄见电量", "本次电量", "抄表状态", "抄表异常分类", "抄表数据来源", "用电地址"]
        # "wb" because the Python 2 csv module writes to binary-mode files;
        # the with-block closes the file even if a row fails to serialize.
        with open(output_path, "wb") as output_file_object:
            writer = csv.writer(output_file_object)
            writer.writerow(header)
            writer.writerows(self.record_arr)

    def deal_page(self):
        """Split the page into records and return the complete ones.

        Returns a ``(record_dict, record_arr)`` tuple: a list of dicts keyed
        by ``FIELD_NAMES`` and a parallel list of plain value lists. Records
        that do not yield exactly 15 values are skipped.
        """
        # One record is the text between a queryConsInfo link and the next "<a".
        page_patt = re.compile(r"green..href=.javascript:queryConsInfo(.*?)<a")
        # A non-empty table cell; cells whose content starts with "<" are ignored.
        record_patt = re.compile(r"<td noWrap>([^<].*?)</td>")
        # The link text that follows the closing ")" of the javascript call.
        user_number_patt = re.compile(r"\).>(.*?)</a>")

        record_dict = []
        record_arr = []
        for record in page_patt.findall(self.page):
            print(record)
            # The user number comes first, followed by the table cells.
            result_arr = user_number_patt.findall(record) + record_patt.findall(record)
            if len(result_arr) != len(self.FIELD_NAMES):
                continue
            record_dict.append(dict(zip(self.FIELD_NAMES, result_arr)))
            record_arr.append(result_arr)
            # Echo the record to the screen, each value followed by "-".
            print("".join(str(field) + "-" for field in result_arr))

        count = len(record_arr)
        print("total records :" + str(count))
        if count == 0:
            print("maybe will modify regular expression or check the page is ok")
        return record_dict, record_arr
def get_Page(path="monInfo.txt"):
    """Return the text of the input page, or None if it cannot be read.

    ``path`` defaults to ``monInfo.txt`` in the current folder. When the
    file cannot be opened or read, a message is printed and None is
    returned so the caller can decide what to do.
    """
    try:
        # The with-block closes the file even if read() fails.
        with open(path) as file_object:
            return file_object.read()
    except IOError:
        # Only I/O failures are expected here; anything else should surface.
        print("no [%s] file in current folder" % path)
        return None
if __name__ == "__main__":
    # Script entry point: read the input page, extract the records and
    # write them out through HTML_Process.
    try:
        page = get_Page()
        # get_Page() returns None (after printing a message) when the input
        # file is missing, so there is nothing to process in that case.
        if page is not None:
            HTML_Process(page)
    except Exception as exc:
        # Still best effort, but report the failure instead of hiding it.
        print("error: %s" % exc)
    # "pause" is a Windows console command; it keeps the window open so the
    # printed output can be read before the console closes.
    os.system("pause")
# -*- coding: utf-8 -*-
#Author:qiang
import re
import os
import csv
class HTML_Process(object):
    """Extract meter-reading records from an HTML page and save them as CSV.

    Constructing an instance immediately parses ``page`` with regular
    expressions and writes every complete record to ``output_file.txt``
    in the current directory.

    The script targets Python 2: ``save_data`` opens its output file in
    binary mode, which is what the Python 2 ``csv`` module expects.
    """

    # Keys of the per-record dictionaries, in the same order as the values
    # captured from the page and as the columns of the CSV header.
    FIELD_NAMES = (
        "user_number",          # user number
        "user_type",            # user type
        "meter_point_name",     # metering point name
        "asset_number",         # asset number
        "factory_number",       # factory number
        "display_type",         # reading type
        "last_display",         # previous reading
        "current_display",      # current reading
        "comprehensive_rate",   # composite multiplier
        "last_power",           # previously recorded energy
        "current_power",        # current energy
        "status",               # meter-reading status
        "abnormal_assort",      # meter-reading anomaly category
        "data_from",            # meter-reading data source
        "user_address",         # electricity service address
    )

    def __init__(self, page):
        """Parse ``page`` (the HTML text) and write the result to disk."""
        self.page = page
        self.record_dict, self.record_arr = self.deal_page()
        self.save_data()
        print("\nfinish")

    def save_data(self, output_path="output_file.txt"):
        """Write the header row and every parsed record to a CSV file.

        ``output_path`` defaults to the file name the script has always
        used, so existing callers are unaffected.
        """
        header = ["用户编号", "用户类型", "计量点名称", "资产编号", "出厂编号", "示数类型", "上次示数", "本次示数", "综合倍率",
                  "上次抄见电量", "本次电量", "抄表状态", "抄表异常分类", "抄表数据来源", "用电地址"]
        # "wb" because the Python 2 csv module writes to binary-mode files;
        # the with-block closes the file even if a row fails to serialize.
        with open(output_path, "wb") as output_file_object:
            writer = csv.writer(output_file_object)
            writer.writerow(header)
            writer.writerows(self.record_arr)

    def deal_page(self):
        """Split the page into records and return the complete ones.

        Returns a ``(record_dict, record_arr)`` tuple: a list of dicts keyed
        by ``FIELD_NAMES`` and a parallel list of plain value lists. Records
        that do not yield exactly 15 values are skipped.
        """
        # One record is the text between a queryConsInfo link and the next "<a".
        page_patt = re.compile(r"green..href=.javascript:queryConsInfo(.*?)<a")
        # A non-empty table cell; cells whose content starts with "<" are ignored.
        record_patt = re.compile(r"<td noWrap>([^<].*?)</td>")
        # The link text that follows the closing ")" of the javascript call.
        user_number_patt = re.compile(r"\).>(.*?)</a>")

        record_dict = []
        record_arr = []
        for record in page_patt.findall(self.page):
            print(record)
            # The user number comes first, followed by the table cells.
            result_arr = user_number_patt.findall(record) + record_patt.findall(record)
            if len(result_arr) != len(self.FIELD_NAMES):
                continue
            record_dict.append(dict(zip(self.FIELD_NAMES, result_arr)))
            record_arr.append(result_arr)
            # Echo the record to the screen, each value followed by "-".
            print("".join(str(field) + "-" for field in result_arr))

        count = len(record_arr)
        print("total records :" + str(count))
        if count == 0:
            print("maybe will modify regular expression or check the page is ok")
        return record_dict, record_arr
def get_Page(path="monInfo.txt"):
    """Return the text of the input page, or None if it cannot be read.

    ``path`` defaults to ``monInfo.txt`` in the current folder. When the
    file cannot be opened or read, a message is printed and None is
    returned so the caller can decide what to do.
    """
    try:
        # The with-block closes the file even if read() fails.
        with open(path) as file_object:
            return file_object.read()
    except IOError:
        # Only I/O failures are expected here; anything else should surface.
        print("no [%s] file in current folder" % path)
        return None
if __name__ == "__main__":
    # Script entry point: read the input page, extract the records and
    # write them out through HTML_Process.
    try:
        page = get_Page()
        # get_Page() returns None (after printing a message) when the input
        # file is missing, so there is nothing to process in that case.
        if page is not None:
            HTML_Process(page)
    except Exception as exc:
        # Still best effort, but report the failure instead of hiding it.
        print("error: %s" % exc)
    # "pause" is a Windows console command; it keeps the window open so the
    # printed output can be read before the console closes.
    os.system("pause")
相关文章推荐
- python文件操作
- python正则表达式示例
- Python 生成指定范围内多个不重复随机数的两种方法
- JAVA Python学习博客
- python 写的str 解决dos2unix问题
- python问题:IndentationError: expected an indented block
- python安装学习
- Coursera-An Introduction to Interactive Programming in Python (Part 1)-Mini-project #3 —"Stopwatch: The Game"
- bae python-worker 爬虫 接下来工作
- BAE python-worker 服务
- BAE 上部署python-worker 爬虫遇到的问题
- python实现在 Mac 10.9 远程桌面截屏抓取
- Json概述以及python对json的相关操作
- python 发送邮件实例
- python 发送邮件实例
- 使用GDB出现ImportError: No module named 'libstdcxx' 错误提示的解决方案
- python下取得父文件夹绝对路径的方法
- 配置Python开发环境
- 零基础学python-13.4 文件上使用列表解析与列表解析扩展
- 零基础学python-13.4 文件上使用列表解析与列表解析扩展