
Crawling email addresses with Python

2013-05-02 21:27
Last time I agonized over this for ages; it turns out email addresses can be matched without any encoding conversion at all.
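As a quick illustration of that point, the regex used below can be applied directly to the raw, undecoded HTML string. This is only a minimal sketch; the sample HTML is made up for illustration.

# -*- coding: cp936 -*-
# Minimal sketch: the pattern matches addresses in the raw HTML string,
# so no explicit decode/encode step is needed. The sample HTML is made up.
import re

mailpattern = re.compile(r'[A-Za-z0-9_]+@(?:[A-Za-z0-9]+\.)+[A-Za-z]+')

raw_html = '<p>Contact: webmaster@example.com or sales_01@mail.example.org</p>'
print mailpattern.findall(raw_html)
# ['webmaster@example.com', 'sales_01@mail.example.org']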

Below is a simple breadth-first crawler built around a queue. I'll leave it like this for now; I probably won't touch it again for a while and will revise it when I have time. (Yet another unfinished project...)


# -*- coding: cp936 -*-
import urllib2
import re
from pyquery import PyQuery as pq
from lxml import etree

#mailpattern = re.compile('[^\._:>\\-][\w\.-]+@(?:[A-Za-z0-9]+\.)+[A-Za-z]+')
mailpattern = re.compile('[A-Za-z0-9_]+@(?:[A-Za-z0-9]+\.)+[A-Za-z]+')

htmlcount = 0   # number of pages downloaded so far
maxcount = 3000 # maximum number of pages to crawl
allUrls = set()
allMails = set()
UrlsQlist = []
UrlsQdict = {}
url = "http://www.163.com"
fmails = open("E:/py/crawler/mailresult.txt","a")
furls = open("E:/py/crawler/urlresult.txt","a")

def geturls(data): # extract the urls contained in the html
    urls = set()
    if data:
        d = pq(data)
        label_a = d.find('a') # use pyquery to find the <a> tags
        if label_a:
            label_a_href = d('a').map(lambda i, e: pq(e)('a').attr('href'))
            for u in label_a_href:
                if u[0:10] != "javascript":
                    if u[0:4] == "http":
                        urls.add(u)
                    else:
                        urls.add(url + u)
        #for u in urls:
        #    print u
        return urls
    else:
        return None

def gethtml(url):
    try:
        fp = urllib2.urlopen(url)
    except:
        print "urllib2.urlopen error"
        return None
    else:
        mybytes = fp.read()
        fp.close()
        return mybytes

def savemails(data): # collect the email addresses found in the page
    if data:
        mailResult = mailpattern.findall(data)
        mailResultset = set(mailResult)
        if mailResultset:
            allMails.update(mailResultset)

def savehtml(pagecontent, count):
    if pagecontent != None:
        f = open("E:/py/crawler/html/" + str(count) + ".html", "w")
        f.write(pagecontent)
        f.close()
    else:
        f = open("E:/py/crawler/html/" + str(count) + "error" + ".html", "w")
        f.write("this page empty")
        f.close()

def BFS(firstUrl):
    global htmlcount
    global maxcount
    allUrls.add(firstUrl)
    UrlsQlist = list(allUrls)
    while htmlcount < maxcount and UrlsQlist:  # stop at the page limit or when the queue runs dry
        tempUrl = UrlsQlist.pop(0)  # dequeue from the front: breadth-first order
        myWebStr = gethtml(tempUrl)
        savehtml(myWebStr, htmlcount)
        savemails(myWebStr)
        firstUrls_set = geturls(myWebStr)  # urls found on the current page
        if firstUrls_set != None:
            allUrls.update(firstUrls_set)  # record every url seen so far
            for u in firstUrls_set:
                if u not in UrlsQlist:
                    UrlsQlist.append(u)
        htmlcount = htmlcount + 1

BFS(url)
for u in allMails:
    try:
        fmails.write(u)
        fmails.write('\n')
    except:
        continue
for u in allUrls:
    try:
        furls.write(u)
        furls.write('\n')
    except:
        continue
fmails.close()
furls.close()
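A side note on the queue: UrlsQlist is a plain list, and pop(0) shifts every remaining element, so each dequeue costs O(n). collections.deque gives an O(1) popleft(). A minimal sketch of the same FIFO pattern, assuming nothing else about the crawler changes:

# Minimal FIFO sketch using collections.deque, whose popleft() is O(1),
# versus list.pop(0), which shifts every remaining element (O(n)).
from collections import deque

UrlsQ = deque(["http://www.163.com"])   # seed the queue with the start url
while UrlsQ:
    tempUrl = UrlsQ.popleft()           # dequeue from the front
    print tempUrl
    # download/parse the page here, then enqueue any newly found urls:
    # UrlsQ.extend(new_urls)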


2013.5.13 update

I originally wanted to add multithreading, but after going through a lot of material I had no idea where to start. I'll keep studying it and update this later.
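For reference, one common pattern would be a small pool of worker threads pulling URLs from a thread-safe queue. This is only a rough sketch of that idea and is not part of the crawler below; fetch_page and NUM_WORKERS are made-up names used purely for illustration.

# Rough sketch of a worker-pool approach (not used in the crawler below):
# a few threads share a thread-safe Queue of URLs to download.
import threading
import Queue
import urllib2

task_queue = Queue.Queue()
NUM_WORKERS = 4                     # made-up worker count for illustration

def fetch_page(url):
    # download one page, swallowing errors like the crawler's gethtml()
    try:
        return urllib2.urlopen(url, None, 5).read()
    except:
        return None

def worker():
    while True:
        u = task_queue.get()
        fetch_page(u)               # parsing/saving would go here
        task_queue.task_done()

for _ in range(NUM_WORKERS):
    t = threading.Thread(target=worker)
    t.setDaemon(True)
    t.start()

for u in ["http://www.baidu.com", "http://www.163.com"]:
    task_queue.put(u)
task_queue.join()                   # wait until every queued url is processed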

I also added some URL normalization. The cleaned-up code is as follows:

import urllib2
import re
from pyquery import PyQuery as pq
from lxml import etree
import urlparse
import time

allUrls = set()
allMails = set()
urlsDownlist = []

class mailCrawler:
    def __init__(self, mailExpression, start_url, maxcount):
        '''mailExpression: regex for matching email addresses;
        start_url: the url the crawl starts from;
        maxcount: maximum number of pages to crawl'''
        self.mailpattern = re.compile(mailExpression)
        self.maxcount = maxcount
        self.htmlcount = 0
        self.UrlsQlist = []  # url queue for the breadth-first traversal
        self.url = start_url

    def url_normal(self, url):
        '''normalize a url'''
        scheme, netloc, path, query = urlparse.urlsplit(url)[:4]
        netloc = netloc.lower()
        # url.encode("utf-8")  # no-op in the original: encode() returns a new string that was never used
        if path:
            path = re.sub('/{2,}', '/', path)  # collapse repeated / in the path
            path = re.sub(r'\.$', '', path)    # drop a trailing dot
            path = re.sub('/$', '', path)      # drop a trailing /
            path = re.sub('\s', '', path)      # strip whitespace from the path
        if query:
            return '%s://%s%s?%s' % (scheme, netloc, path or '/', query)
        else:
            return '%s://%s%s' % (scheme, netloc, path)

    def geturls(self, data):
        '''extract the urls contained in the html'''
        urls = set()
        if data:
            d = pq(data)
            label_a = d.find('a')  # use pyquery to find the <a> tags
            if label_a:
                label_a_href = d('a').map(lambda i, e: pq(e)('a').attr('href'))
                for u in label_a_href:
                    if u[0:10] != "javascript" and u[0:6] != "mailto":
                        if u[0:4] == "http":
                            normal_url = self.url_normal(u)
                            urls.add(normal_url)
                        else:
                            normal_url = self.url_normal(self.url + u)
                            urls.add(normal_url)
            return urls
        else:
            return None

    def gethtml(self, url):
        '''download the html with a 5 second timeout'''
        try:
            fp = urllib2.urlopen(url, None, 5)
        except:
            print "urllib2.urlopen error or timeout"
            return None
        else:
            mybytes = fp.read()
            fp.close()
            return mybytes

    def savemails(self, data):
        '''store the addresses found on the page in allMails; the set removes duplicates'''
        global allMails
        if data:
            mailResult = self.mailpattern.findall(data)
            mailResultset = set(mailResult)
            if mailResultset:
                allMails.update(mailResultset)

    def savehtml(self, pagecontent, htmlcount, url):
        '''save the html to a file'''
        if pagecontent != None:
            f = open("E:/py/crawler/html/" + str(htmlcount) + ".html", "w")
            f.write(pagecontent)
            f.close()
        else:
            f = open("E:/py/crawler/html/" + str(htmlcount) + "error" + ".html", "w")
            try:
                f.write(url)
            except:
                f.write("encode error")
            f.close()

    def BFS(self):
        '''breadth-first crawl driven by a queue of urls'''
        global allUrls
        global urlsDownlist
        allUrls.add(self.url)
        self.UrlsQlist = list(allUrls)
        while self.htmlcount < self.maxcount and self.UrlsQlist:  # stop at the page limit or when the queue runs dry
            tempUrl = self.UrlsQlist.pop(0)  # dequeue from the front
            print tempUrl
            urlsDownlist.append(tempUrl)
            myWebStr = self.gethtml(tempUrl)
            self.savehtml(myWebStr, self.htmlcount, tempUrl)
            self.savemails(myWebStr)
            firstUrls_set = self.geturls(myWebStr)  # urls found on the current page
            if firstUrls_set != None:
                for u in firstUrls_set:
                    if u not in allUrls:
                        allUrls.add(u)
                        self.UrlsQlist.append(u)
            self.htmlcount = self.htmlcount + 1

def main():
    reg = r'[A-Za-z0-9_]+@(?:[A-Za-z0-9]+\.)+[A-Za-z]+'
    url = "http://www.baidu.com"
    count = 100
    fmails = open("E:/py/crawler/mailresult.txt", "a")
    furls = open("E:/py/crawler/urlresult.txt", "a")
    fdownUrls = open("E:/py/crawler/urlDownresult.txt", "a")
    newcrawler = mailCrawler(reg, url, count)
    newcrawler.BFS()
    for u in allMails:
        try:
            fmails.write(u)
            fmails.write('\n')
        except:
            continue
    for u in allUrls:
        try:
            furls.write(u)
            furls.write('\n')
        except:
            continue
    for u in urlsDownlist:
        try:
            fdownUrls.write(u)
            fdownUrls.write('\n')
        except:
            continue
    fmails.close()
    furls.close()
    fdownUrls.close()

if __name__ == '__main__':
    main()
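For reference, this is roughly what url_normal does to a link. The sample URLs below are made up, and the snippet assumes the mailCrawler class defined above:

# Rough illustration of url_normal(); the sample URLs are made up.
crawler = mailCrawler(r'[A-Za-z0-9_]+@(?:[A-Za-z0-9]+\.)+[A-Za-z]+',
                      "http://www.baidu.com", 1)
print crawler.url_normal("http://WWW.Example.COM//news//tech/")
# http://www.example.com/news/tech   (netloc lowercased, duplicate and trailing slashes removed)
print crawler.url_normal("http://www.example.com/search?q=mail")
# http://www.example.com/search?q=mail   (query string preserved)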