您的位置:首页 > 编程语言 > Python开发

Python 使用xpath匹配html内容并生成CSV文件

2020-01-11 17:39 169 查看
#-- coding: utf-8 --
import os
import re
import csv
from lxml import html
# Build the CSV file.
def get_list_dir():
    """Export every article in the article directory to one CSV file.

    Writes a header row, then one row for each directory entry for which
    ``get_article_content`` returns a result. Entries it cannot parse
    (falsy return value) are skipped.
    """
    dir_aim = "D:/Python/PythonProjects/TestDemo/article"
    csv_path = 'D:/Python/PythonProjects/TestDemo/article/31530942.csv'
    csv_name = os.path.basename(csv_path)
    headers = ('标题', '内容', '来源', '时间', '作者')

    # Open the output once instead of re-opening it in append mode per row.
    # newline='' is required by the csv module; without it every row is
    # followed by an empty line on Windows.
    with open(csv_path, 'w', encoding='utf-8-sig', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(headers)
        # Sorted so the row order is reproducible between runs.
        for filename in sorted(os.listdir(dir_aim)):
            # The CSV lives inside the directory being scanned; never feed
            # the output file back in as if it were an article.
            if filename == csv_name:
                continue
            article = get_article_content(filename)
            if not article:
                continue
            writer.writerow(article)
# Read an HTML file, parse it with XPath and return the extracted fields.
def _first_group(pattern, text):
    """Return the stripped first capture group of *pattern* in *text*, or ""."""
    found = re.search(pattern, text)
    return found.group(1).strip() if found else ""


def get_article_content(fileName):
    """Extract the article fields from one HTML file in the article directory.

    Args:
        fileName: Name of the file, relative to the article directory.

    Returns:
        A tuple ``(title, content, source, time, editor)`` of strings; a field
        that is not found is ``""``. Returns ``None`` when the file cannot be
        read or parsed (the error is printed).
    """
    try:
        with open('D:/Python/PythonProjects/TestDemo/article/' + fileName, 'r', encoding='utf-8') as f:
            raw_html = f.read()
        ht = html.etree.HTML(raw_html)

        title = "".join(ht.xpath("/html/body/div[5]/h1/text()"))
        body = "".join(ht.xpath("/html/body/div[5]/div[2]/p/text()")).replace("\n", "")
        # Source and time share one text node, so evaluate the XPath once.
        meta = "".join(ht.xpath("/html/body/div[5]/div[1]/text()"))
        footer = "".join(ht.xpath("/html/body/div[5]/div[3]/text()"))

        # (\S*) takes the token up to the next whitespace OR the end of the
        # text, so a value that is the last thing in the node is not lost and
        # no trailing whitespace leaks into the result.
        source = _first_group(r"来源:(\S*)", meta)
        # The time value may contain spaces: take the rest of the line instead
        # of anchoring on the end of the whole text.
        published = _first_group(r"时间:([^\r\n]*)", meta)
        editor = _first_group(r"责编:(\S*)", footer)
        return (title, body, source, published, editor)
    except Exception as e:
        # Deliberately best-effort: a file that cannot be read or parsed is
        # reported and skipped by the caller rather than aborting the run.
        print(f"{fileName}: {e}")
        return None
# Run the export.
# NOTE(review): this call is at module level, so it also runs when the file
# is imported - consider an `if __name__ == "__main__":` guard.
get_list_dir()

说明:open() 的 'a+' 模式表示以追加方式打开文件(可读可写),新写入的内容会添加到文件末尾,不会覆盖已有内容。

  • 点赞
  • 收藏
  • 分享
  • 文章举报
ALEX2205 发布了4 篇原创文章 · 获赞 1 · 访问量 32 私信 关注
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: 
相关文章推荐