您的位置:首页 > 编程语言 > Python开发

Python爬虫多线程爬搜索引擎

2017-11-27 17:39 302 查看
爬搜索引擎的信息要注意page和key的变化,还有正则表达式一定要正确

爬下面的URL:    http://weixin.sogou.com/weixin?type=2&query=
后面再跟page信息

一共三个线程:第一个负责把URL存到队列中去;第二个负责从队列中取出URL,读取需要的信息并储存;第三个在检测到队列为空时结束程序

import queue
import threading
import urllib.request
import urllib.error
import re
import time

# Shared FIFO queue: GetUrl (producer) puts article URLs in, GetConnect (consumer) takes them out.
urlqueue = queue.Queue()

def GetData(url):
    """Fetch *url* and return the page as decoded UTF-8 text.

    Returns the HTML body as a str on success, or None when the request
    fails (the error is printed and the function sleeps briefly so the
    caller can retry).  Callers must handle a None result.
    """
    try:
        # Use a local opener with a browser User-Agent.  The original
        # called urllib.request.install_opener() on every request, which
        # mutates process-global state each call for no benefit.
        headers = ("User-Agent",
                   "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
                   "(KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36")
        opener = urllib.request.build_opener()
        opener.addheaders = [headers]
        data = opener.open(url).read().decode('utf-8')
        return data
    except urllib.error.URLError as e:
        if hasattr(e, 'code'):
            print(e.code)
        if hasattr(e, 'reason'):
            print(e.reason)
        time.sleep(10)  # back off longer on network-level failures
    except Exception as e:
        print("exception:" + str(e))
        time.sleep(1)
    return None  # explicit: every error path yields None

# Thread 1 (producer): builds search-result page URLs for the keyword,
# scrapes the article links out of each page and feeds them to the queue.
class GetUrl(threading.Thread):
    def __init__(self, key, pagestart, pageend, urlqueue):
        """key: search keyword; pages pagestart..pageend (inclusive) are
        crawled; every scraped article URL is put on *urlqueue*."""
        threading.Thread.__init__(self)
        self.key = key
        self.pagestart = pagestart
        self.pageend = pageend
        self.urlqueue = urlqueue

    def run(self):
        keycode = urllib.request.quote(self.key)
        for page in range(self.pagestart, self.pageend + 1):
            # BUG FIX: the original also percent-encoded "&page=" via
            # quote(), turning the separator into literal query text
            # ("...%26page%3D1").  Only the keyword needs quoting.
            url = ("http://weixin.sogou.com/weixin?type=2&query="
                   + keycode + "&page=" + str(page))
            data = GetData(url)
            if not data:
                continue  # fetch failed; GetData returned None
            listurlpattern = '<div class="txt-box">.*?(http://.*?)"'
            page_urls = re.compile(listurlpattern, re.S).findall(data)
            for page_url in page_urls:
                # the result page escapes "&" as "&amp;" inside hrefs
                self.urlqueue.put(page_url.replace("amp;", ""))
        # NOTE: the original called urlqueue.task_done() here.  That is a
        # consumer-side call (it pairs with get()), so it was removed.

# Thread 2 (consumer): pulls article URLs off the queue, extracts each
# page's title and body, and appends them to the report file 1.html.
class GetConnect(threading.Thread):

    def __init__(self, urlqueue):
        """urlqueue: the shared queue that GetUrl fills with article URLs."""
        threading.Thread.__init__(self)
        self.urlqueue = urlqueue

    def run(self):
        html1 = '''
<html>
<head>
<title>微信文章</title>
</head>
<body>
'''
        # Write the HTML prologue, then reopen in append mode for the body.
        with open("1.html", 'wb') as fh:
            fh.write(html1.encode('utf-8'))
        fh = open("1.html", 'ab')
        i = 1
        while True:
            try:
                # BUG FIX: the original used a bare blocking get(), so this
                # loop never ended once the producer finished and the
                # epilogue below was unreachable.  Time out and stop instead.
                url = self.urlqueue.get(timeout=30)
            except queue.Empty:
                break
            try:
                print(url)
                data = GetData(url)
                if not data:
                    continue  # fetch failed; GetData returned None
                titlepat = '<title>(.*?)</title>'
                contentpat = 'id="js_content">(.*?)id="js_sg_bar"'
                title = re.compile(titlepat, re.S).findall(data)
                content = re.compile(contentpat, re.S).findall(data)
                # Fall back to "no" when a pattern finds nothing.
                thistitle = title[0] if title else "no"
                thiscontent = content[0] if content else "no"
                dataall = "<p>标题是:" + thistitle + "</p><p>内容是:" + thiscontent + "</p><br>"
                fh.write(dataall.encode('utf-8'))
                print("第" + str(i) + "个网页处理")
                i += 1
            except Exception as e:
                # GetData already handles URLError internally, so the
                # original's extra URLError branch here was dead weight.
                print("exception:" + str(e))
                time.sleep(1)
        fh.close()
        html2 = '''
</body>
</html>
'''
        with open("1.html", 'ab') as fh:
            fh.write(html2.encode('utf-8'))

# Thread 3 (watchdog): polls the queue once a minute and stops itself
# once the queue has drained.
class Conrl(threading.Thread):
    def __init__(self, urlqueue):
        """urlqueue: the shared queue whose emptiness signals completion."""
        threading.Thread.__init__(self)
        self.urlqueue = urlqueue

    def run(self):
        while True:
            print("程序执行ing")
            time.sleep(60)
            if self.urlqueue.empty():
                print("执行完毕")
                # BUG FIX: the original called exit(), which raises
                # SystemExit and only ends *this* thread while looking
                # like a whole-program exit.  A plain return says what
                # actually happens.
                return

# Crawl search-result pages 1..2 for the keyword "IT".
key = "IT"
pagestart = 1
pageend = 2

# Wire the producer, the consumer and the watchdog to the shared queue,
# then launch all three worker threads.
thread1 = GetUrl(key, pagestart, pageend, urlqueue)
thread2 = GetConnect(urlqueue)
thread3 = Conrl(urlqueue)
for worker in (thread1, thread2, thread3):
    worker.start()

内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: 
相关文章推荐