您的位置:首页 > 编程语言 > Python开发

Python 小说内容抓取

2017-10-21 12:30 459 查看
环境:Python 2.7,PyCharm 2017.2

代码如下:

# encoding=utf8
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
import urllib
import urllib2
from bs4 import BeautifulSoup

def _fetch_soup(url, headers):
    """Download ``url`` with the given request headers and parse it with lxml.

    The HTTP response is closed even when reading it raises, so sockets are
    not leaked across the many chapter requests made by the script.
    """
    response = urllib2.urlopen(urllib2.Request(url, headers=headers))
    try:
        html = response.read()
    finally:
        response.close()
    return BeautifulSoup(html, 'lxml')


if __name__ == '__main__':
    # Local import: io.open gives an explicitly UTF-8 text file on Python 2,
    # so writing unicode no longer depends on the process default encoding.
    import io

    url = 'http://www.136book.com/huaqiangu/'
    # Send a browser-like User-Agent with every request.
    head = {
        'User-Agent': 'Mozilla/5.0 (Linux; Android 4.1.1; Nexus 7 Build/JRO03D) AppleWebKit/535.19 '
                      '(KHTML, like Gecko) Chrome/18.0.1025.166  Safari/535.19',
    }

    soup = _fetch_soup(url, head)
    # The chapter list is the first <div> following div#book_detail.box1.
    soup_texts = soup.find('div', id='book_detail', class_='box1').find_next('div')

    # Open the output file; the context manager closes it even if a
    # request or parse step fails part-way through.
    with io.open('D:/huaqianguo.txt', 'w', encoding='utf-8') as f:
        # Walk the chapter entries and follow each chapter link.
        for link in soup_texts.ol.children:
            # Text nodes (any whitespace, not only a lone '\n') have no `.a`
            # attribute, and a tag without an <a> yields None: skip both.
            anchor = getattr(link, 'a', None)
            if anchor is None:
                continue
            download_url = anchor.get('href')
            if not download_url:
                continue

            download_soup = _fetch_soup(download_url, head)
            content = download_soup.find('div', id='content')
            # Extract the chapter text; a page without div#content yields an
            # empty body instead of aborting the whole download.
            chapter_text = content.text if content is not None else u''

            # Write the chapter title.
            f.write(link.text + u'\n\n')
            # Write the chapter content.
            f.write(chapter_text)
            f.write(u'\n\n')
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: