您的位置:首页 > 编程语言 > Python开发

Python爬虫QQnews

2014-02-26 13:29 239 查看
好久没碰 Python 了,写一个简单的爬虫热热手。

'''
Created on 2014.2.25
for QQnews
@author: accyao
'''
import sys
import urllib2
import urllib
import re
import os
import time
# Python 2 only: `reload` is a builtin there, and reloading `sys` is the usual
# way to make `sys.setdefaultencoding` callable again after interpreter start-up.
# The call order matters: the reload has to happen first.
reload(sys)
# Make implicit str <-> unicode conversions use UTF-8 for the whole process.
sys.setdefaultencoding('utf-8')
# HTTP headers sent with every request; the User-Agent is a desktop Firefox string.
headers = {
'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'
}
# Today's local date as YYYYMMDD; dlNews uses it as the date segment of the article URL.
tm = time.strftime('%Y%m%d',time.localtime(time.time()))
# Echo the date being crawled (runs at import time).
print(tm)
def getTitle(page):
    """Return every ``<title>...</title>`` element found in *page*.

    Args:
        page: HTML source of the page as a string.

    Returns:
        A list of the matched substrings, each still wrapped in its
        ``<title>`` / ``</title>`` tags. The list is empty when the page has
        no title on a single line.
    """
    # Non-greedy so that a second </title> later on the same line does not
    # get swallowed into the first match.
    pattern = r"<title>.*?</title>"
    return re.findall(pattern, page)
def getText(page):
    """Return the indented body paragraphs found in *page*.

    A paragraph is anything between ``<P style="TEXT-INDENT: 2em">`` and
    ``</P>`` on a single line. The match is greedy on purpose: when several
    paragraphs share one line they come back as one combined match, running
    from the first opening tag to the last ``</P>`` on that line.

    Args:
        page: HTML source of the page as a string.

    Returns:
        A list of matched substrings, tags included; empty when nothing
        matches.
    """
    # Raw, single-quoted literal: same regex as before without the escaped quotes.
    pattern = r'<P style="TEXT-INDENT: 2em">.*</P>'
    return re.findall(pattern, page)
def dlNews(idx):
    """Download one QQ news article and save its body text as ``<title>.txt``.

    The URL is ``http://news.qq.com/a/<tm>/<idx zero-padded to 6 digits>.htm``,
    where ``tm`` is the module-level date string, and the request is sent
    with the module-level ``headers``. The file is written to the current
    working directory and an already existing file is left untouched.

    Args:
        idx: Integer index of the article within the day.

    Returns:
        None. HTTP errors are reported by printing their reason; pages
        without a title or without body paragraphs are reported and skipped.
    """
    url = 'http://news.qq.com/a/' + str(tm) + '/' + '%06d' % idx + '.htm'
    request = urllib2.Request(url=url, headers=headers)
    # Only the network call can raise HTTPError, so keep the try body minimal.
    try:
        html = urllib2.urlopen(request).read()
    except urllib2.HTTPError as e:
        print(e.reason)
        return

    titles = getTitle(html)
    if not titles:
        # Previously an uncaught IndexError; skip the page instead.
        print('no <title> found in ' + url)
        return
    title = titles[0].replace('<title>', '').replace('</title>', '')
    # The title comes from a remote page: a '/' in it must not be able to
    # redirect the output into another directory.
    filename = title.replace('/', '_') + '.txt'
    if os.path.isfile(filename):
        return

    # Extract the text BEFORE creating the file so that a page without
    # paragraphs does not leave an empty file behind.
    paragraphs = getText(html)
    if not paragraphs:
        print('no article text found in ' + url)
        return

    cleaned = []
    for paragraph in paragraphs:
        paragraph = paragraph.replace('<P style="TEXT-INDENT: 2em">', '')
        paragraph = paragraph.replace('</P>', '\n')
        # Drop any remaining inline markup (links, spans, ...).
        cleaned.append(re.sub('<[^>]*>', ' ', paragraph))

    # Write every matched paragraph (not just the first) and make sure the
    # file handle is closed even if the write fails.
    with open(filename, 'w') as fl:
        fl.write(''.join(cleaned))
def main(start=1803, stop=1804):
    """Download every article whose index lies in ``range(start, stop)``.

    Args:
        start: First article index to fetch (inclusive).
        stop: Index at which to stop (exclusive).

    The defaults reproduce the original hard-coded range, i.e. only
    article 1803 is fetched.
    """
    for idx in range(start, stop):
        dlNews(idx)


# Kept unguarded so that running (or importing) the module behaves as before.
main()





                                            
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: