您的位置:首页 > 编程语言 > Python开发

【Python】网络爬虫(静态网站)实例

2018-10-12 21:01 363 查看
版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/m0_37811192/article/details/83032620

本爬虫的特点:

1.目标:静态网站

2.级数:二级

3.线程:单线程(未采用同步,为了避免顺序错乱,因此采用单线程)

4.结果:爬取一部网络小说,将分散的各章节合并成一个txt文本文件

 

获取网页函数:

def get_url(url):
    """Fetch *url* and return the decoded HTML text, or None on any failure.

    The response encoding is overridden with the encoding sniffed from the
    body (``apparent_encoding``) because the target site declares the wrong
    charset in its HTTP headers.
    """
    try:
        # Timeout so a dead server cannot hang the crawl forever
        # (the original call had no timeout).
        response = requests.get(url, timeout=30)
        # Trust the charset detected from the content, not the header.
        response.encoding = response.apparent_encoding
        if response.status_code == 200:
            return response.text
        print("url Error:", url)
    except RequestException:
        print("URL RequestException Error:", url)
    return None

解析保存函数:

def parse_url(html, skip=2416):
    """Parse the chapter index page and append every chapter to one txt file.

    Parameters:
        html: index-page HTML (may be None when the fetch failed).
        skip: number of leading chapters to skip, for resuming a crawl.
              Default 2416 preserves the original hard-coded resume point.
    """
    if not html:
        print("parse_url: no HTML to parse")
        return
    chapter_re = re.compile(r'<td class="L"><a href="(.*?)">(.*?)</a></td>', re.S)
    items = re.findall(chapter_re, html)
    base_re = re.compile(r'<meta property="og:url" content="(.*?)"/>', re.S)
    base = re.findall(base_re, html)
    total = len(items)
    print(items)
    print(total)
    # os.path.join fixes the original `sys.path[0] + "name"` which lacked a
    # path separator and wrote the file next to (not inside) the script dir.
    out_path = os.path.join(sys.path[0], "凡人修仙传.txt")
    # Open once with `with`: the original only opened the file when
    # count % 100 == 1, so after the resume skip (first count 2417) the
    # very first file.write hit an unbound `file` name.
    with open(out_path, "a", encoding="utf-8") as out:
        for count, (href, title) in enumerate(items, start=1):
            if count <= skip:
                continue
            this_url = base[0] + href
            # Replace undecodable characters so the write never raises.
            essay = get_book(this_url, title).replace("\ufffd", "*")
            out.write(essay)
            print("下载到第 " + str(count) + "章", (href, title),
                  count / total * 100, "%")

完整代码:

[code]import requests
from requests.exceptions import RequestException
import re
import sys
from multiprocessing import Pool
import sqlite3
import os

def get_url(url):
    """Fetch *url* and return the decoded HTML text, or None on any failure.

    The response encoding is overridden with the encoding sniffed from the
    body (``apparent_encoding``) because the target site declares the wrong
    charset in its HTTP headers.
    """
    try:
        # Timeout so a dead server cannot hang the crawl forever
        # (the original call had no timeout).
        response = requests.get(url, timeout=30)
        # Trust the charset detected from the content, not the header.
        response.encoding = response.apparent_encoding
        if response.status_code == 200:
            return response.text
        print("url Error:", url)
    except RequestException:
        print("URL RequestException Error:", url)
    return None

def parse_url(html, skip=2416):
    """Parse the chapter index page and append every chapter to one txt file.

    Parameters:
        html: index-page HTML (may be None when the fetch failed).
        skip: number of leading chapters to skip, for resuming a crawl.
              Default 2416 preserves the original hard-coded resume point.
    """
    if not html:
        print("parse_url: no HTML to parse")
        return
    chapter_re = re.compile(r'<td class="L"><a href="(.*?)">(.*?)</a></td>', re.S)
    items = re.findall(chapter_re, html)
    base_re = re.compile(r'<meta property="og:url" content="(.*?)"/>', re.S)
    base = re.findall(base_re, html)
    total = len(items)
    print(items)
    print(total)
    # os.path.join fixes the original `sys.path[0] + "name"` which lacked a
    # path separator and wrote the file next to (not inside) the script dir.
    out_path = os.path.join(sys.path[0], "凡人修仙传.txt")
    # Open once with `with`: the original only opened the file when
    # count % 100 == 1, so after the resume skip (first count 2417) the
    # very first file.write hit an unbound `file` name.
    with open(out_path, "a", encoding="utf-8") as out:
        for count, (href, title) in enumerate(items, start=1):
            if count <= skip:
                continue
            this_url = base[0] + href
            # Replace undecodable characters so the write never raises.
            essay = get_book(this_url, title).replace("\ufffd", "*")
            out.write(essay)
            print("下载到第 " + str(count) + "章", (href, title),
                  count / total * 100, "%")

def get_book(url, title):
    """Download one chapter page and return ``title + body`` as plain text.

    Returns the title plus an inline error note when the page cannot be
    fetched or parsed; the original raised TypeError when get_url returned
    None and IndexError when the page had no <dd id="contents"> element,
    killing the whole crawl on one bad chapter.
    """
    data = "\n" + str(title) + "\n"
    html = get_url(url)
    if html is None:
        return data + "[download failed: " + url + "]\n"
    pattern = re.compile(r'<dd id="contents">(.*?)</dd>', re.S)
    matches = re.findall(pattern, html)
    if not matches:
        return data + "[no content found: " + url + "]\n"
    body = str(matches[0])
    # Convert the site's HTML whitespace markup back to plain text.
    return data + body.replace("&nbsp;", " ").replace("<br />", "\n")

if __name__ == '__main__':
    # Entry point: fetch the chapter index page, then crawl every chapter.
    index_url = "https://www.x23us.com/html/0/328/"
    index_html = get_url(index_url)
    parse_url(index_html)

阅读更多
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: