您的位置：首页 > 编程语言 > Python开发

python 爬虫——针对query爬取百度百科页面

2016-03-15 22:45 981 查看

Preface：最近因为一些事情需要爬取百度百科的内容，却发现以前学过的东西几乎忘光了——长期不用就会忘记。还好当时留有部分记录；但大部分内容因为没有记录，这次只能重新查找很多资料。所以说，经常做记录很有必要。

coding:

import codecs#用于打开文件夹保证编码格式
import urllib2#用于爬取
import re#用于匹配找到url
import os#用于创建文件夹
from bs4 import BeautifulSoup as bs#用于解析html
from multiprocessing.dummy import Pool as ThreadPool #用于多线程
import time

def scrapy(url):
    """Return the HTML of the first Baidu Baike link found on a search page.

    The page at ``url`` (a Baidu search-result URL) is downloaded and scanned
    for the first ``http://baike.baidu.com/...`` link that is terminated by a
    double quote. That link is then downloaded and its raw HTML returned.

    If the search page contains no such link, "error" is printed, ``url`` is
    appended to ``baike_error.txt`` and a fixed fallback value is returned so
    the caller can keep going instead of crashing.

    NOTE(review): the fallback value is a Baike *URL* (the original author's
    note says it is the page for Wang Han), not HTML, while the success path
    returns HTML. It is kept unchanged for backward compatibility; a caller
    that parses it as HTML will simply find no paragraphs.
    """
    search_page = urllib2.urlopen(url).read()

    # Non-greedy capture up to the closing quote of the href attribute.
    # re.search returns None when nothing matches, unlike findall(...)[0],
    # which raised IndexError and made the fallback branch unreachable.
    match = re.search(r"(http://baike.baidu.com/.+?)\"", search_page)
    if match is None:
        print("error")
        # "with" guarantees the log is closed even if the write fails.
        with open("baike_error.txt", "a") as error_log:
            error_log.write(url + "\n")
        return "http://baike.baidu.com/view/64448.htm"

    baike_url = match.group(1)
    print(baike_url)
    # Download the entity's Baike page; the caller extracts the text.
    return urllib2.urlopen(baike_url).read()

def get_text(html):
    """Extract the paragraph texts from a Baidu Baike page.

    Selects every ``<div class="para" label-module="para">`` element in
    ``html``, flattens each one to plain text, strips embedded newlines and
    returns the non-empty results as a list, in document order.
    """
    soup = bs(html)
    para_divs = soup.find_all(
        "div", class_="para", attrs={"label-module": "para"}
    )
    # get_text() collapses nested markup into plain text; the pages contain
    # many stray line breaks, so those are removed from each paragraph.
    flattened = (div.get_text().replace("\n", "") for div in para_divs)
    return [paragraph for paragraph in flattened if paragraph]

def write_file(query, texts):
    """Write ``texts`` to ``baike_dir/<name>.txt`` as UTF-8, one item per line.

    ``<name>`` is the part of ``query`` before the first literal ``\\s``
    (backslash followed by "s") separator; a query without the separator is
    used whole. The output directory is created on first use.

    Args:
        query: search query, optionally carrying a ``\\s...`` suffix.
        texts: iterable of text lines; byte strings are decoded as UTF-8.
    """
    out_dir = "baike_dir"
    # Create-then-check instead of check-then-create: several worker threads
    # may get here at once, and exists()+mkdir() would race between them.
    try:
        os.mkdir(out_dir)
    except OSError:
        if not os.path.isdir(out_dir):
            raise

    name = query.split("\\s")[0]
    path = out_dir + os.sep + name + ".txt"
    # An explicit encoding is required: without it codecs.open returns a
    # plain file object and non-ASCII text is not reliably writable.
    with codecs.open(path, "w", encoding="utf-8") as out:
        for line in texts:
            if isinstance(line, bytes):
                line = line.decode("utf-8")
            out.write(line + u"\n")

def handle_query(query):
    """Run the whole pipeline for one query: search, fetch, extract, save.

    The literal suffix ``\\s百度百科`` is appended to ``query``, a Baidu
    search URL is built from it, and the result is passed through
    ``scrapy`` -> ``get_text`` -> ``write_file``.

    Args:
        query: the entity name, as unicode or as a UTF-8 encoded byte string.
    """
    import urllib  # Python 2 stdlib; provides quote() for percent-encoding

    # unicode(byte_string) decodes with the ASCII default and raises on
    # non-ASCII names, so byte strings are decoded as UTF-8 explicitly.
    if isinstance(query, bytes):
        query = query.decode("utf-8")
    else:
        query = unicode(query)
    # The separator is a literal backslash + "s"; write_file splits on the
    # same two characters to recover the bare name for the file name.
    query += u"\\s百度百科"

    # The URL declares ie=utf-8, so the query is sent as percent-encoded
    # UTF-8 rather than spliced in as raw non-ASCII characters.
    quoted = urllib.quote(query.encode("utf-8"))
    url = (
        "http://www.baidu.com/s?wd=" + quoted
        + "&pn=00&oq=" + quoted
        + "&tn=baiduhome_pg&ie=utf-8&usm=3&rsv_idx=1&f=8&rsv_bp=1"
    )

    html = scrapy(url)
    texts = get_text(html)
    write_file(query, texts)

# Sample queries; each is searched, scraped and saved by handle_query.
queries = ["汪涵","赵薇","谢娜","何炅"]

# Thread-backed pool (multiprocessing.dummy). The original author suggests
# matching the CPU core count, noting that 8 workers ran faster on 4 cores.
pool = ThreadPool(8)

time3 = time.time()
results = pool.map(handle_query, queries[:4])
pool.close()
pool.join()
time4 = time.time()

# Elapsed wall-clock seconds for the whole batch.
print(time4 - time3)

注：不多写其他的，写注释是个好习惯。

内容来自用户分享和网络整理，不保证内容的准确性，如有侵权内容，可联系管理员处理

标签： 爬虫 python multiprocessing 多线程

相关文章推荐

新的分享

章节导航