
Building a Dedicated Search Engine for My Own Blog -- Crawling the Data

2011-12-12 22:34
cnblogs has its own lucene.net search engine, and there is also Google site search. But even Google does not index my content completely; it is rather picky about what it keeps. So I have long wanted a full-text index of my own blog.

I originally wanted to build a workable, scalable solution based on rake + hbase + whenever + massive_record. Halfway through, the whole project felt like it would take too long, so I set it aside, picked up my old code instead, and patched it up just enough to get something usable first.

What I use here are some scripts from my earlier 15-1688 small-wholesale search engine. Back then the crawl script templates were customized through a web UI; here I simply reuse them as they are.

The whole crawling process breaks down into 4 steps, one script per step:

A. generate the list-page links

B. fetch the list pages

C. extract the detail-page links

D. fetch the detail pages

I will go straight to the code.
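Before the individual scripts, here is a minimal sketch of how the four steps could be chained together. The script file names below are placeholders of my own (the post does not name the files); substitute whatever the scripts are actually saved as.

# Hypothetical driver that runs the four stages in order.
# The script file names are assumptions, not the real ones.
import subprocess

steps = [
    ["python", "a_gen_list_links.py"],         # A: writes list_links.txt
    ["python", "b_fetch_lists.py"],            # B: downloads list pages into ./lists
    ["python", "c_extract_detail_links.py"],   # C: writes extract_detail_links.txt
    ["python", "d_fetch_details.py", "loop"],  # D: downloads detail pages into ./details
]

for cmd in steps:
    print "running:", " ".join(cmd)
    if subprocess.call(cmd) != 0:
        raise SystemExit("step failed: %s" % " ".join(cmd))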

A

#!/usr/bin/env python
#encoding=utf-8
"""
2010.10.15,v0.2
2010.10.07,v0.1
Generate list-page links in batch
"""
import sys,os,time
list_url_template = "http://www.cnblogs.com/lexus/default.html?page=%s"
list_url_start    = 1
list_url_end      = 154
list_links_file   = os.path.join("./","list_links.txt")
g_step=1

def cost_time(func):
    def newFunc(*args, **args2):
        t0 = time.time()
        print "@%s, {%s} start" % (time.strftime("%X", time.localtime()), func.__name__)
        back = func(*args, **args2)
        print "@%s, {%s} end" % (time.strftime("%X", time.localtime()), func.__name__)
        print "@%.3fs taken for {%s}" % (time.time() - t0, func.__name__)
        return back
    return newFunc

@cost_time
def show(list_url_start=list_url_start,\
         list_url_end=list_url_end,\
         list_url_template=list_url_template):
    lines=[]
    for i in xrange(list_url_start,list_url_end+1):
        # use the template argument instead of a hard-coded URL, so the optional
        # third command line argument actually takes effect
        line="%s\n"%(list_url_template%(i*g_step))
        print line.rstrip()
        lines.append(line)
    open(list_links_file,"w").writelines(lines)
    print "total count:%s"%len(lines)
    print "done!"

#import os.path
#print os.path.abspath(".")
if __name__=="__main__":
    l=len(sys.argv)
    if l==1:
        show()
    elif l==2:
        show(list_url_end=int(sys.argv[1]))
    elif l==3:
        show(list_url_start=int(sys.argv[1]),list_url_end=int(sys.argv[2]))
    elif l==4:
        show(list_url_start=int(sys.argv[1]),list_url_end=int(sys.argv[2]),list_url_template=sys.argv[3])
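As a quick sanity check (a minimal sketch, assuming script A was run with its defaults), the generated list_links.txt can be inspected like this:

# Count the generated list-page links and show the first and last one.
links = [l for l in open("list_links.txt").read().split("\n") if l.strip()]
print "links:", len(links)    # 154 with the default range 1..154
print "first:", links[0]      # http://www.cnblogs.com/lexus/default.html?page=1
print "last: ", links[-1]     # http://www.cnblogs.com/lexus/default.html?page=154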
B

#!/usr/bin/env python
#encoding=utf-8
"""
2010.10.16,v0.3
2010.10.09,v0.2
2010.10.07,v0.1
Fetch list pages in batch
"""
from __future__ import with_statement
from __future__ import division

import socket as original_socket
original_socket.setdefaulttimeout(10)
from eventlet.timeout import with_timeout
from eventlet.green import urllib2

import sys
####reload(sys)
####sys.setdefaultencoding('utf-8')

import eventlet
from eventlet import pools
#httplib2 = eventlet.import_patched('httplib2')
#httppool = pools.Pool(create=lambda: httplib2.Http(timeout=90),max_size=20)

import time

import os

import os.path

import stat

import select

import shutil

import re

import gzip
import StringIO

list_list_folder    = os.path.join("./","lists")
list_info_folder    = os.path.join("./","lists_infos")
status_file         = os.path.join("./","lists_infos/status.txt")
error_file          = os.path.join("./","lists_infos/error.txt")
error_file_bak      = os.path.join("./","lists_infos/error.txt.bak")
success_file        = os.path.join("./","lists_infos/success.txt")
list_links_file     = os.path.join("./","list_links.txt")
g_headers={}
g_pool_num          = 5

def init():
    if not os.path.exists(list_list_folder):
        os.mkdir(list_list_folder)
    if not os.path.exists(list_info_folder):
        os.mkdir(list_info_folder)
    print "init done"

def delete(src):
    '''delete files and folders'''
    permission(src)
    if os.path.isfile(src):
        try:
            os.remove(src)
        except:
            pass
    elif os.path.isdir(src):
        for item in os.listdir(src):
            itemsrc=os.path.join(src,item)
            delete(itemsrc)
        try:
            os.rmdir(src)
        except:
            pass

def permission(src):
    os.chmod(src,stat.S_IRWXU|stat.S_IRWXO|stat.S_IRWXG)

def clear():
    delete(list_list_folder)
    delete(list_info_folder)
    print "reset to initial state"

def size(src):
    "check the size of a file or folder"
    r = 0L
    if os.path.isfile(src):
        r=os.path.getsize(src)
    else:
        for root, dirs, files in os.walk(src):
            r += sum([os.path.getsize(os.path.join(root, name)) for name in files])
    l=len(str(r))

    if l>9:
        r=r/1024/1024/1024
        r="%.2f GiB"%r
    elif l>6:
        r=r/1024/1024
        r="%.2f MiB"%r
    elif l>3:
        r=r/1024
        r="%.2f KiB"%r
    print "size of %s: %s"%(src,r)

def status(str):
    "running/stop"
    f=open(status_file,"w")
    f.write(str)
    f.close()

def error(url,ex):
    f=open(error_file,"a")
    f.write("%s\n"%(url,))
    f.close()

def success(url):
    f=open(success_file,"a")
    f.write("%s\n"%url)
    f.close()

def url2filename(url):
    import base64
    return base64.urlsafe_b64encode(url)

def url2filename2(url):
    url=url.strip()
    idx=url.rfind("/")
    r=url[idx+1:]
    if idx==-1 or len(r)==0:
        #raise ValueError("url2filename function parser error")
        print "falling back to the special url2filename"
        r = re.findall(r"\d+", url)[-1]
    return r

def cost_time(func):
    def newFunc(*args, **args2):
        t0 = time.time()
        print "@%s, {%s} start" % (time.strftime("%X", time.localtime()), func.__name__)
        back = func(*args, **args2)
        print "@%s, {%s} end" % (time.strftime("%X", time.localtime()), func.__name__)
        print "@%.3fs taken for {%s}" % (time.time() - t0, func.__name__)
        return back
    return newFunc

def statistics(func):
    def tongji():
        total,successed=0,0
        if os.path.exists(list_links_file):
            total=len(set(open(list_links_file,"r").readlines()))
        print "total lines:%s"%total
        if os.path.exists(success_file):
            successed=len(set(open(success_file,"r").readlines()))
        print "successed lines:%s"%successed
        print "left lines:%s"%(total-successed)
    def newFunc(*args,**args2):
        tongji()
        back = func(*args, **args2)
        tongji()
        return back
    return newFunc

def get_html(url):
    def do(url):
        html=""
        try:
            req = urllib2.Request(url = url,headers = g_headers)
            html = urllib2.urlopen(req).read()
            return html
        except Exception,e:
            print url,"error",e
            error(url,e)
            return None
    rr = with_timeout(10, do, url, timeout_value=None)
    return rr

# unused alternative fetcher kept from the original script template
def get_html22(url):
    import types
    def do(url):
        html=""
        try:
            req = urllib2.Request(url = url,headers = g_headers)
            html = urllib2.urlopen(req).read()
            t=type(html)
            if t==types.StringType or t==types.UnicodeType:
                return html
            else:
                print url,"error======"
                return ""
        except Exception,e1:
            pdata = StringIO.StringIO(rr)  # the lines below gunzip the response
            gzipper = gzip.GzipFile(fileobj = pdata)
            try:
                html = gzipper.read()
                return html
            except Exception,e2:
                print url,e1,e2
                error(url,e1)
                return ""
    rr = with_timeout(10, do, url, timeout_value="")
    return rr

def get_html2(url):
    "gzip-compressed pages cause a fetch error with this variant"
    #print url
    with httppool.item() as http:
        #eventlet.sleep(0)
        resp, content = http.request(url)
        print content
        return content

def save_html2file(filename,html):
    f=open(filename,"w")
    f.write(html)
    f.close()

def save_url2file(url):
    #html=""
    #try:
    #    html=get_html(url)
    #except Exception,e:
    #    print url,"fetch error",e
    #    error(url,e)
    #    return
    html=get_html(url)
    if html is not None and html<>"":
        filename=os.path.join(list_list_folder,url2filename(url))
        save_html2file(filename,html)
        if os.path.getsize(filename)<1024*20:
            error(url,"size smaller than %s"%(1024*20))
            print url,"error"
            return
        success(url)  # successes are the baseline; the rest either failed or have not run yet
        print url,"success"
    else:
        print url,"error"
        error(url,"html is None or empty")

@cost_time
@statistics
def batch_get_html(urls):
    print "starting batch page download"
    pool=eventlet.GreenPool(g_pool_num)
    for url in urls:
        pool.spawn_n(save_url2file,url)
    pool.waitall()
    print "done!"

def process_continue():
    "continue fetching whatever is not yet in success"
    # diff the full link list against the finished part
    done=set(open(success_file,"r").read().split("\n"))
    all=set(open(list_links_file,"r").read().split("\n"))
    left=all-done
    batch_get_html(left)

if __name__=="__main__":
    init()
    l=len(sys.argv)
    if l==1:
        content=""
        if not select.select([sys.stdin,],[],[],0.0)[0]:
            print "load from %s"%list_links_file
            content=open(list_links_file,"r").read()
        else:
            print "load from stdin"
            content=sys.stdin.read()
        urls=content.strip().split("\n")
        #print urls
        batch_get_html(urls)
        size(list_list_folder)
    elif l==2:
        argv=sys.argv[1]
        if argv=="clear":
            clear()
        if argv=="continue":
            process_continue()
    elif l==3:
        argv=sys.argv[1]
        if argv=="load":
            url=sys.argv[2]
            print url
            save_url2file(url)
    print "done!"
C

#!/usr/bin/env python
#encoding=utf-8
"""
2010.10.16,v0.22
2010.10.11,v0.21
2010.10.09,v0.2
2010.10.07,v0.1
Extract detail-page links (and thumbnail links) from the list pages
"""
import sys
import re
import os.path

list_list_folder      = os.path.join("./","lists")
success_file        = os.path.join("./","lists_infos/success.txt")
detail_links_file   = os.path.join("./","extract_detail_links.txt")

#g_pattern=r"""
[^"]*?)\1[\s\S]*?[^"]*?)\3 """
g_pattern=r"""http://www.cnblogs.com/lexus/archive/\d{4}/\d{1,2}/\d{1,2}/\d{1,}\.html)\1[\s\S]*?>(?P[\s\S]*?)<[\s\S]*?/[\s\S]*?a[\s\S]*?>"""
if g_pattern[-2]=='"':
g_pattern=g_pattern[:-2]+'\\"'
else:
g_pattern=g_pattern[:-1]

def url2filename(url):
    import base64
    return base64.urlsafe_b64encode(url)

def url2filename2(url):
    url=url.strip()
    idx=url.rfind("/")
    r=url[idx+1:]
    if idx==-1 or len(r)==0:
        #raise ValueError("url2filename function parser error")
        print "falling back to the special url2filename"
        r = re.findall(r"\d+", url)[-1]
    return r

def delete(src):
    '''delete files and folders'''
    #permission(src)
    if os.path.isfile(src):
        try:
            os.remove(src)
            print "deleted file %s"%src
        except:
            pass
    elif os.path.isdir(src):
        for item in os.listdir(src):
            itemsrc=os.path.join(src,item)
            delete(itemsrc)
        try:
            os.rmdir(src)
            print "deleted folder %s"%src
        except:
            pass

def clear():
    delete(detail_links_file)

def extract_detail_link(url):
    lines=[]
    regex=re.compile(g_pattern)
    file=os.path.join(list_list_folder,url2filename(url))
    subject=open(file,"r").read()
    for match in regex.finditer(subject):
        #line="%s,%s\n"%(match.group("link").replace("&amp;","&"),match.group("img").replace("http:/www","http://www").replace(","," "))
        line="%s,\n"%(match.group("link").replace("&amp;","&"),)
        lines.append(line)
    return lines

def batch_extract_detail_links():
    f=open(detail_links_file,"w")
    urls=open(success_file,"r").read().strip().split("\n")
    total=[]
    for url in urls:
        lines=extract_detail_link(url)
        total.extend(lines)
        print "%s,%s"%(url,len(lines))

    s=set(total)
    f.writelines(s)
    f.close()
    print "done!"
    print "repeat count:%s"%(len(total)-len(s))
    print "total lines:%s"%len(s)

if __name__=="__main__":
    l=len(sys.argv)
    if l==1:
        batch_extract_detail_links()
    elif l==2:
        if sys.argv[1]=="clear":
            clear()
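To see what the pattern extracts, here is a small test that applies the same regex to a hand-written anchor tag; the HTML snippet (and the archive URL in it) is invented purely for illustration:

# Apply the detail-link pattern to a made-up anchor tag and print the named groups.
import re

pattern = r"""<a[\s\S]*?href\s*=\s*("|')(?P<link>http://www.cnblogs.com/lexus/archive/\d{4}/\d{1,2}/\d{1,2}/\d{1,}\.html)\1[\s\S]*?>(?P<title>[\s\S]*?)<[\s\S]*?/[\s\S]*?a[\s\S]*?>"""
html = '<a href="http://www.cnblogs.com/lexus/archive/2011/12/12/2285432.html">some post title</a>'

for m in re.compile(pattern).finditer(html):
    print m.group("link")    # the detail-page URL
    print m.group("title")   # the anchor text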
D

#!/usr/bin/env python
#encoding=utf-8
"""
2010.10.16,v0.13
2010.10.15,v0.12
2010.10.13,v0.11
2010.10.07,v0.1
Fetch detail pages in batch
"""
from __future__ import with_statement
from __future__ import division

import socket as original_socket
original_socket.setdefaulttimeout(10)
from eventlet.timeout import with_timeout
from eventlet.green import urllib2

from urlparse import urljoin
import sys
####reload(sys)
####sys.setdefaultencoding('utf-8')

import eventlet
from eventlet import pools
#httplib2 = eventlet.import_patched('httplib2')
#httppool = pools.Pool(create=lambda: httplib2.Http(timeout=90),max_size=20)

import time

import os

import os.path

import stat

import select

g_host                  = "http://www.cnblogs.com/lexus"
g_data_folder           = os.path.join("./","details")
g_info_folder           = os.path.join("./","details_infos")
g_status_file           = os.path.join("./","details_infos/status.txt")
g_error_file            = os.path.join("./","details_infos/error.txt")
g_success_file          = os.path.join("./","details_infos/success.txt")
g_result_links_file     = os.path.join("./","extract_detail_links.txt")
g_pool_num              = 1
g_headers={}
headers                 = """"""
headers                 = headers.strip().replace("\r\n","\n")
if headers<>"":
for elem in headers.split("\n"):
if elem.strip()=="":
continue
a,b=elem.split(":",1)
a=a.strip()
b=b.strip()
g_headers[a]=b

def init():
    if not os.path.exists(g_data_folder):
        os.mkdir(g_data_folder)
    if not os.path.exists(g_info_folder):
        os.mkdir(g_info_folder)
    print "init done"

def delete(src):
    '''delete files and folders'''
    permission(src)
    if os.path.isfile(src):
        try:
            os.remove(src)
        except:
            pass
    elif os.path.isdir(src):
        for item in os.listdir(src):
            itemsrc=os.path.join(src,item)
            delete(itemsrc)
        try:
            os.rmdir(src)
        except:
            pass

def permission(src):
    os.chmod(src,stat.S_IRWXU|stat.S_IRWXO|stat.S_IRWXG)

def clear():
    delete(g_data_folder)
    delete(g_info_folder)
    print "reset to initial state"

def size(src):
    "check the size of a file or folder"
    r = 0L
    if os.path.isfile(src):
        r=os.path.getsize(src)
    else:
        for root, dirs, files in os.walk(src):
            r += sum([os.path.getsize(os.path.join(root, name)) for name in files])
    l=len(str(r))

    if l>9:
        r=r/1024/1024/1024
        r="%.2f GiB"%r
    elif l>6:
        r=r/1024/1024
        r="%.2f MiB"%r
    elif l>3:
        r=r/1024
        r="%.2f KiB"%r
    print "size of %s: %s"%(src,r)

def status(str):
    "running/stop"
    f=open(g_status_file,"w")
    f.write(str)
    f.close()

def error(url,ex):
    f=open(g_error_file,"a")
    f.write("%s\n"%(url,))
    f.close()

def success(url):
    f=open(g_success_file,"a")
    f.write("%s\n"%url)
    f.close()

def url2filename(url):
    import base64
    return base64.urlsafe_b64encode(url)

def url2filename2(url):
    import re  # re is not imported at module level in this script
    url=url.strip()
    idx=url.rfind("/")
    r=url[idx+1:]
    if idx==-1 or len(r)==0:
        #raise ValueError("url2filename function parser error")
        print "falling back to the special url2filename"
        r = re.findall(r"\d+", url)[-1]
    return r

def statistics(func):
    def tongji():
        total,successed=0,0
        if os.path.exists(g_result_links_file):
            total=len(set(open(g_result_links_file,"r").readlines()))
        print "total lines:%s"%total
        if os.path.exists(g_success_file):
            successed=len(set(open(g_success_file,"r").readlines()))
        print "successed lines:%s"%successed
        print "left lines:%s"%(total-successed)
    def newFunc(*args,**args2):
        tongji()
        back = func(*args, **args2)
        tongji()
        return back
    return newFunc

def cost_time(func):
    def newFunc(*args, **args2):
        t0 = time.time()
        print "@%s, {%s} start" % (time.strftime("%X", time.localtime()), func.__name__)
        back = func(*args, **args2)
        print "@%s, {%s} end" % (time.strftime("%X", time.localtime()), func.__name__)
        print "@%.3fs taken for {%s}" % (time.time() - t0, func.__name__)
        return back
    return newFunc

def get_html(url):
    def do(url):
        html=""
        try:
            req = urllib2.Request(url = url,headers = g_headers)
            html = urllib2.urlopen(req).read()
            return html
        except Exception,e:
            print url,"error",e
            error(url,e)
            return None
    rr = with_timeout(10, do, url, timeout_value=None)
    return rr

def get_html2(url):
    #print url
    with httppool.item() as http:
        #eventlet.sleep(0)
        resp, content = http.request(url,'GET',headers=g_headers)
        #resp, content = http.request(url)
        return content

def save_html2file(filename,html):
    f=open(filename,"w")
    f.write(html)
    f.close()

def save_url2file(url):
    a,b=url.strip().split(",")
    if not a.startswith("http://"):
        a=urljoin(g_host,a)
    #a=a.replace("&amp;","&")
    html=get_html(a)
    if html is not None and html<>"":
        filename=os.path.join(g_data_folder,url2filename(a))
        save_html2file(filename,html)
        if os.path.getsize(filename)<1024*10:
            error(url,"size smaller than %s"%(1024*10))
            print url,"error"
            return
        success(url)  # successes are the baseline; the rest either failed or have not run yet
        print url,"success"
    else:
        print url,"error"
        error(url,"html is None or empty")

def save_url2file2(url):
    a,b=url.strip().split(",")
    if not a.startswith("http://"):
        a=urljoin(g_host,a)
    html=""
    try:
        html=get_html(a)
    except Exception,e:
        print url,e,"fetch error"
        error(url,e)
        return

    if html<>"":
        filename=os.path.join(g_data_folder,url2filename(a))
        save_html2file(filename,html)
        if os.path.getsize(filename)<1024*10:
            error(url,"size smaller than %s"%(1024*10))
            print url,"error"
            return
        success(url)  # successes are the baseline; the rest either failed or have not run yet
        print url,"success"

@cost_time
@statistics
def batch_get_html(urls):
    print "starting batch page download"
    pool=eventlet.GreenPool(g_pool_num)
    for url in urls:
        pool.spawn_n(save_url2file,url)
    pool.waitall()
    size(g_data_folder)
    print "done!"

def count():
    total,successed=set(),set()
    if os.path.exists(g_success_file):
        successed=set(open(g_success_file,"r").read().strip().split("\n"))
    if os.path.exists(g_result_links_file):
        total=set(open(g_result_links_file,"r").read().strip().split("\n"))
    left=total-successed
    return total,successed,left

def process_continue():
    "continue fetching whatever is not yet in success"
    # diff the full link list against the finished part
    total,successed,left=count()
    batch_get_html(left)

def process_forever():
    "loop until everything is done"
    total,successed,left=count()
    print "left"
    while len(left)>0:
        print "unfinished pages remain, running another pass"
        process_continue()
        total,successed,left=count()

if __name__=="__main__":
    init()
    l=len(sys.argv)
    if l==1:
        content=""
        if not select.select([sys.stdin,],[],[],0.0)[0]:
            print "load from %s"%g_result_links_file
            content=open(g_result_links_file,"r").read()
        else:
            print "load from stdin"
            content=sys.stdin.read()
        urls=content.strip().split("\n")
        #print urls
        batch_get_html(urls)
    elif l==2:
        argv=sys.argv[1]
        if argv=="clear":
            clear()
        if argv=="continue":
            process_continue()
        if argv=="loop":
            process_forever()
    elif l==3:
        if sys.argv[1]=="load":
            url=sys.argv[2]
            save_url2file(url)
    print "done!"