Python爬虫多线程爬搜索引擎
2017-11-27 17:39
302 查看
爬搜索引擎的信息要注意page和key的变化,还有正则表达式一定要正确
爬下面的URL: http://weixin.sogou.com/weixin?type=2&query=
后面再跟page信息
一共三个线程:第一个负责把URL存到队列中去,第二个负责从队列读取URL、提取需要的信息并储存,第三个在队列为空时结束程序
import queue
import threading
import urllib.request
import urllib.error
import re
import time
# Shared work queue: GetUrl (producer) puts article URLs in, GetConnect (consumer) takes them out.
urlqueue = queue.Queue()
# Fetch an HTML document
def GetData(url):
    """Download *url* and return its body decoded as UTF-8.

    Returns an empty string on any error so callers can safely run
    regexes over the result; the original fell through returning
    ``None``, which made ``re.findall(pattern, data)`` raise TypeError.
    """
    try:
        # Send the browser-like User-Agent per request instead of calling
        # install_opener(), which mutates process-wide global state.
        headers = {"User-Agent":
                   "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36"}
        req = urllib.request.Request(url, headers=headers)
        with urllib.request.urlopen(req) as resp:
            return resp.read().decode('utf-8')
    except urllib.error.URLError as e:
        if hasattr(e, 'code'):
            print(e.code)
        if hasattr(e, 'reason'):
            print(e.reason)
        time.sleep(10)  # back off longer on network/HTTP failures
    except Exception as e:
        print("exception:" + str(e))
        time.sleep(1)
    return ""  # fix: explicit empty result on failure instead of implicit None
# thread1
# Thread 1: producer -- builds search-result page URLs for the keyword,
# scrapes the article links out of each result page and feeds them into
# the shared queue for GetConnect to consume.
class GetUrl(threading.Thread):
    def __init__(self, key, pagestart, pageend, urlqueue):
        """key: search keyword; pagestart/pageend: inclusive page range;
        urlqueue: shared queue the consumer thread reads from."""
        threading.Thread.__init__(self)
        self.key = key
        self.pagestart = pagestart
        self.pageend = pageend
        self.urlqueue = urlqueue

    def run(self):
        keycode = urllib.request.quote(self.key)
        pagecode = urllib.request.quote("&page=")
        for page in range(self.pagestart, self.pageend + 1):
            url = "http://weixin.sogou.com/weixin?type=2&query=" + keycode + pagecode + str(page)
            data = GetData(url)
            if not data:
                # Fetch failed (GetData already logged it) -- skip the page
                # instead of crashing the thread on a None/empty result.
                continue
            listurlpattern = '<div class="txt-box">.*?(http://.*?)"'
            for page_url in re.compile(listurlpattern, re.S).findall(data):
                # Sogou escapes '&' as '&amp;' inside href attributes; undo it.
                self.urlqueue.put(page_url.replace("amp;", ""))
            # fix: removed urlqueue.task_done() -- task_done() pairs with
            # get() on the consumer side; calling it after put() corrupts
            # the queue's unfinished-task count and can raise ValueError.
# Thread 2: consumer -- pulls article URLs off the queue, scrapes the
# title and body out of each article page, and appends them to 1.html.
class GetConnect(threading.Thread):
    def __init__(self, urlqueue):
        threading.Thread.__init__(self)
        self.urlqueue = urlqueue

    def run(self):
        html1 = '''
<html>
<head>
<title>微信文章</title>
</head>
<body>
'''
        # Rewrite the output file with the document header, then append bodies.
        with open("1.html", 'wb') as fh:
            fh.write(html1.encode('utf-8'))
        i = 1
        with open("1.html", 'ab') as fh:
            while True:
                try:
                    # fix: the original blocking get() never returned once the
                    # producer finished, so the loop -- and the HTML footer
                    # below -- were unreachable. Time out and stop instead.
                    url = self.urlqueue.get(timeout=60)
                except queue.Empty:
                    break
                try:
                    print(url)
                    data = GetData(url)
                    if not data:
                        continue  # fetch failed; GetData already logged it
                    titlepat = '<title>(.*?)</title>'
                    contentpat = 'id="js_content">(.*?)id="js_sg_bar"'
                    title = re.compile(titlepat, re.S).findall(data)
                    content = re.compile(contentpat, re.S).findall(data)
                    # Fall back to "no" when the page doesn't match the patterns.
                    thistitle = title[0] if title else "no"
                    thiscontent = content[0] if content else "no"
                    dataall = "<p>标题是:" + thistitle + "</p><p>内容是:" + thiscontent + "</p><br>"
                    fh.write(dataall.encode('utf-8'))
                    print("第" + str(i) + "个网页处理")
                    i += 1
                except urllib.error.URLError as e:
                    # fix: was urllib.request.URLError; URLError's canonical
                    # home is urllib.error.
                    if hasattr(e, 'code'):
                        print(e.code)
                    if hasattr(e, 'reason'):
                        print(e.reason)
                    time.sleep(10)
                except Exception as e:
                    print("exception:" + str(e))
                    time.sleep(1)
        html2 = '''
</body>
</html>
'''
        with open("1.html", 'ab') as fh:
            fh.write(html2.encode('utf-8'))
# Thread 3: watchdog -- periodically checks the shared queue and ends
# the whole program once it has drained.
class Conrl(threading.Thread):
    def __init__(self, urlqueue):
        threading.Thread.__init__(self)
        self.urlqueue = urlqueue

    def run(self):
        while True:
            print("程序执行ing")
            time.sleep(60)
            if self.urlqueue.empty():
                print("执行完毕")
                # fix: exit() only raises SystemExit in *this* thread, so the
                # original never actually stopped the process (the other
                # non-daemon threads kept it alive). Force-terminate instead.
                import os
                os._exit(0)
# Script entry point: crawl result pages 1-2 of the Sogou Weixin search
# for the keyword "IT" with the producer/consumer/watchdog threads.
# Guarded so importing this module does not start network traffic.
if __name__ == "__main__":
    key = "IT"
    pagestart = 1
    pageend = 2
    thread1 = GetUrl(key, pagestart, pageend, urlqueue)
    thread1.start()
    thread2 = GetConnect(urlqueue)
    thread2.start()
    thread3 = Conrl(urlqueue)
    thread3.start()
爬下面的URL: http://weixin.sogou.com/weixin?type=2&query=
后面再跟page信息
一共三个线程:第一个负责把URL存到队列中去,第二个负责从队列读取URL、提取需要的信息并储存,第三个在队列为空时结束程序
import queue
import threading
import urllib.request
import urllib.error
import re
import time
# Shared work queue: GetUrl (producer) puts article URLs in, GetConnect (consumer) takes them out.
urlqueue = queue.Queue()
# Fetch an HTML document
def GetData(url):
    """Download *url* and return its body decoded as UTF-8.

    Returns an empty string on any error so callers can safely run
    regexes over the result; the original fell through returning
    ``None``, which made ``re.findall(pattern, data)`` raise TypeError.
    """
    try:
        # Send the browser-like User-Agent per request instead of calling
        # install_opener(), which mutates process-wide global state.
        headers = {"User-Agent":
                   "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36"}
        req = urllib.request.Request(url, headers=headers)
        with urllib.request.urlopen(req) as resp:
            return resp.read().decode('utf-8')
    except urllib.error.URLError as e:
        if hasattr(e, 'code'):
            print(e.code)
        if hasattr(e, 'reason'):
            print(e.reason)
        time.sleep(10)  # back off longer on network/HTTP failures
    except Exception as e:
        print("exception:" + str(e))
        time.sleep(1)
    return ""  # fix: explicit empty result on failure instead of implicit None
# thread1
# Thread 1: producer -- builds search-result page URLs for the keyword,
# scrapes the article links out of each result page and feeds them into
# the shared queue for GetConnect to consume.
class GetUrl(threading.Thread):
    def __init__(self, key, pagestart, pageend, urlqueue):
        """key: search keyword; pagestart/pageend: inclusive page range;
        urlqueue: shared queue the consumer thread reads from."""
        threading.Thread.__init__(self)
        self.key = key
        self.pagestart = pagestart
        self.pageend = pageend
        self.urlqueue = urlqueue

    def run(self):
        keycode = urllib.request.quote(self.key)
        pagecode = urllib.request.quote("&page=")
        for page in range(self.pagestart, self.pageend + 1):
            url = "http://weixin.sogou.com/weixin?type=2&query=" + keycode + pagecode + str(page)
            data = GetData(url)
            if not data:
                # Fetch failed (GetData already logged it) -- skip the page
                # instead of crashing the thread on a None/empty result.
                continue
            listurlpattern = '<div class="txt-box">.*?(http://.*?)"'
            for page_url in re.compile(listurlpattern, re.S).findall(data):
                # Sogou escapes '&' as '&amp;' inside href attributes; undo it.
                self.urlqueue.put(page_url.replace("amp;", ""))
            # fix: removed urlqueue.task_done() -- task_done() pairs with
            # get() on the consumer side; calling it after put() corrupts
            # the queue's unfinished-task count and can raise ValueError.
# Thread 2: consumer -- pulls article URLs off the queue, scrapes the
# title and body out of each article page, and appends them to 1.html.
class GetConnect(threading.Thread):
    def __init__(self, urlqueue):
        threading.Thread.__init__(self)
        self.urlqueue = urlqueue

    def run(self):
        html1 = '''
<html>
<head>
<title>微信文章</title>
</head>
<body>
'''
        # Rewrite the output file with the document header, then append bodies.
        with open("1.html", 'wb') as fh:
            fh.write(html1.encode('utf-8'))
        i = 1
        with open("1.html", 'ab') as fh:
            while True:
                try:
                    # fix: the original blocking get() never returned once the
                    # producer finished, so the loop -- and the HTML footer
                    # below -- were unreachable. Time out and stop instead.
                    url = self.urlqueue.get(timeout=60)
                except queue.Empty:
                    break
                try:
                    print(url)
                    data = GetData(url)
                    if not data:
                        continue  # fetch failed; GetData already logged it
                    titlepat = '<title>(.*?)</title>'
                    contentpat = 'id="js_content">(.*?)id="js_sg_bar"'
                    title = re.compile(titlepat, re.S).findall(data)
                    content = re.compile(contentpat, re.S).findall(data)
                    # Fall back to "no" when the page doesn't match the patterns.
                    thistitle = title[0] if title else "no"
                    thiscontent = content[0] if content else "no"
                    dataall = "<p>标题是:" + thistitle + "</p><p>内容是:" + thiscontent + "</p><br>"
                    fh.write(dataall.encode('utf-8'))
                    print("第" + str(i) + "个网页处理")
                    i += 1
                except urllib.error.URLError as e:
                    # fix: was urllib.request.URLError; URLError's canonical
                    # home is urllib.error.
                    if hasattr(e, 'code'):
                        print(e.code)
                    if hasattr(e, 'reason'):
                        print(e.reason)
                    time.sleep(10)
                except Exception as e:
                    print("exception:" + str(e))
                    time.sleep(1)
        html2 = '''
</body>
</html>
'''
        with open("1.html", 'ab') as fh:
            fh.write(html2.encode('utf-8'))
# Thread 3: watchdog -- periodically checks the shared queue and ends
# the whole program once it has drained.
class Conrl(threading.Thread):
    def __init__(self, urlqueue):
        threading.Thread.__init__(self)
        self.urlqueue = urlqueue

    def run(self):
        while True:
            print("程序执行ing")
            time.sleep(60)
            if self.urlqueue.empty():
                print("执行完毕")
                # fix: exit() only raises SystemExit in *this* thread, so the
                # original never actually stopped the process (the other
                # non-daemon threads kept it alive). Force-terminate instead.
                import os
                os._exit(0)
# Script entry point: crawl result pages 1-2 of the Sogou Weixin search
# for the keyword "IT" with the producer/consumer/watchdog threads.
# Guarded so importing this module does not start network traffic.
if __name__ == "__main__":
    key = "IT"
    pagestart = 1
    pageend = 2
    thread1 = GetUrl(key, pagestart, pageend, urlqueue)
    thread1.start()
    thread2 = GetConnect(urlqueue)
    thread2.start()
    thread3 = Conrl(urlqueue)
    thread3.start()
相关文章推荐
- python多线程、异步、多进程+异步爬虫
- python实现爬虫统计学校BBS男女比例(二)多线程爬虫
- 第三百五十节,Python分布式爬虫打造搜索引擎Scrapy精讲—selenium模块是一个python操作浏览器软件的一个模块,可以实现js动态网页请求
- python爬虫实战--selenium验证码保存+多线程多标签+自动点击+完整代码
- python queue和多线程的爬虫 与 JoinableQueue和多进程的爬虫
- 第三百五十七节,Python分布式爬虫打造搜索引擎Scrapy精讲—利用开源的scrapy-redis编写分布式爬虫代码
- 毕业设计中怎样用python写一个搜索引擎的分布式爬虫---异样的美感
- 第三百三十九节,Python分布式爬虫打造搜索引擎Scrapy精讲—Scrapy启动文件的配置—xpath表达式
- 第三百六十二节,Python分布式爬虫打造搜索引擎Scrapy精讲—elasticsearch(搜索引擎)基本的索引和文档CRUD操作、增、删、改、查
- Python多线程爬虫获取电影下载链接
- 通过爬虫、lucene和python.web实现网页搜索引擎
- python 爬虫练习 多线程的运用
- 第三百四十一节,Python分布式爬虫打造搜索引擎Scrapy精讲—编写spiders爬虫文件循环抓取内容—meta属性返回指定值给回调函数—Scrapy内置图片下载器
- 第三百四十三节,Python分布式爬虫打造搜索引擎Scrapy精讲—scrapy模拟登陆和知乎倒立文字验证码识别
- 多线程获取豆瓣网页的网络爬虫(Python实现)
- python多线程爬虫抓取网页
- 一个简单的多线程Python爬虫(一)
- 第三百六十九节,Python分布式爬虫打造搜索引擎Scrapy精讲—elasticsearch(搜索引擎)用Django实现搜索功能
- 第三百七十节,Python分布式爬虫打造搜索引擎Scrapy精讲—elasticsearch(搜索引擎)用Django实现搜索结果分页