您的位置:首页 > 其它

爬虫小案例:使用 XPath 爬取笑话

2018-07-21 08:50 39 查看

 

[code]import requests
from lxml import etree

# Append scraped text to the output file.
def write_file(art, path="笑话.txt"):
    """Append ``art`` to a UTF-8 encoded text file.

    The text is written verbatim: no newline or other separator is added,
    so callers that want one entry per line must include the line break
    themselves.

    Args:
        art: Text to append.
        path: Destination file. Defaults to "笑话.txt" relative to the
            current working directory; the file is created when missing.
    """
    # Plain append mode is enough: the handle is never read from.
    with open(path, "a", encoding="utf-8") as f:
        f.write(art)

# Parse a listing page and save the jokes of every article it links to.
def parse_html(html, headers=None, timeout=10):
    """Follow the article links found in ``html`` and store their jokes.

    Every href matched under ``div.list_title`` is resolved against
    ``http://www.jokeji.cn`` and downloaded; each text node found under
    ``span#text110/p`` is printed and handed to ``write_file``.

    Args:
        html: Markup of a listing page, already decoded to ``str``.
        headers: Optional HTTP headers for the article requests. When
            omitted, a desktop-browser User-Agent is sent.
        timeout: Seconds to wait for each article request before giving up.

    An article that cannot be downloaded (connection error, timeout or
    HTTP error status) is reported on stdout and skipped, so one bad link
    does not abort the rest of the page.
    """
    # Function-scope import keeps this block self-contained.
    from urllib.parse import urljoin

    if not html:
        # Nothing to parse.
        return

    if headers is None:
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36"
        }

    content = etree.HTML(html)
    a_lists = content.xpath('//div[@class="list_title"]/ul/li/b/a/@href')
    for a in a_lists:
        # Example target:
        # "http://www.jokeji.cn/jokehtml/%E5%86%B7%E7%AC%91%E8%AF%9D/201806212319307.htm"
        # urljoin also copes with absolute hrefs and hrefs lacking a leading "/".
        url = urljoin("http://www.jokeji.cn", a)
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
            response.raise_for_status()
        except requests.RequestException as exc:
            print("skip", url, exc)
            continue

        # Decode the body as gb2312 instead of the charset requests guessed.
        response.encoding = "gb2312"
        page = response.text
        if not page:
            continue

        info = etree.HTML(page)
        art_lists = info.xpath('//span[@id="text110"]/p/text()')
        for art in art_lists:
            print(art)
            write_file(art)

def main(pages=10):
    """Crawl the joke listing pages and process each one with ``parse_html``.

    Listing pages are requested from
    ``http://www.jokeji.cn/list_<n>.htm`` for ``n`` from 1 to ``pages``.

    Args:
        pages: Number of listing pages to crawl, starting at page 1.

    A listing page that cannot be downloaded (connection error, timeout or
    HTTP error status) is reported on stdout and skipped.
    """
    # Built once: the headers are the same for every request.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36"
    }
    for page in range(1, pages + 1):
        url = "http://www.jokeji.cn/list_" + str(page) + ".htm"
        try:
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()
        except requests.RequestException as exc:
            print("skip", url, exc)
            continue

        # If the encoding is unknown, print(response.encoding) shows what
        # requests detected; here it is overridden with gb2312.
        response.encoding = "gb2312"
        parse_html(response.text)

# Start the crawl only when run as a script, not when imported as a module.
if __name__ == '__main__':
    main()

 

阅读更多
内容来自用户分享和网络整理,不保证内容的准确性。如有侵权内容,可联系管理员处理。点击这里给我发消息
标签: