您的位置:首页 > 其它

记录一次爬虫经历

2017-02-07 16:25 302 查看
初学 Python，先记录一次爬虫经历，就作为 Python 的入门训练吧。目标网站采用了动态加载技术。

#-*- coding:utf-8 -*-
import requests
import re
import threading
# Module-level configuration shared by the crawler functions.

# Not referenced by the code visible in this script; kept so the name
# remains available to anything that expects it.
offset_for_pc = 0

# Pick codes that work() skips instead of downloading.
forbidden = [
    "xxxxxxx",
    "xxxxxxx",
    "xxxxxx",
    "xxxxxxx",
]

# Request headers sent by getpc() when listing pick codes.
headers_for_pc = {
    'Accept': '*/*',
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'zh-CN,zh;q=0.8',
    'Cookie': 'xxxxx',
    'Host': 'aps.115.com',
    'Referer': 'http://aps.115.com/bridge_2.0.html?xxxxx',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
    'X-Requested-With': 'XMLHttpRequest',
}

# Request headers sent by geturl() when resolving a pick code to a file URL.
headers_for_realurl = {
    'Accept': '*/*',
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'zh-CN,zh;q=0.8',
    'Cookie': 'xxxxx',
    'Host': 'web.api.115.com',
    'Referer': 'http://web.api.115.com/bridge_2.0.html?xxxxxx',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
    'X-Requested-With': 'XMLHttpRequest',
}

# Endpoint queried by getpc() (takes an ``offset`` query parameter).
url_for_pc = "http://aps.115.com/natsort/files.php?xxxxxx"
# Endpoint queried by geturl() (takes a ``pickcode`` query parameter).
url_for_realurl = "http://web.api.115.com/xxxxxx"
def getpc(url,offset):
    """Fetch one listing page and extract the pick codes it contains.

    Sends a GET request to ``url`` with ``offset`` as the ``offset`` query
    parameter, using the module-level ``headers_for_pc`` headers.

    Args:
        url: Listing endpoint to query.
        offset: Value substituted into the ``offset=...`` query string.

    Returns:
        A list with the text captured from every ``"pc":"..."`` pair in the
        response body when the status code is 200 (possibly empty).
        Otherwise prints the status code and returns -1.
    """
    response = requests.get(url, params="offset=%s" % (offset), headers=headers_for_pc)
    if response.status_code != 200:
        # %-formatting inside print(...) gives the same output on Python 2
        # and Python 3.
        print("Sory,Get Pickcodes Fail,ErrorCode: %s" % response.status_code)
        return -1
    return re.findall(r'"pc":"(.*?)"', response.text)
def geturl(url,pickcode):
    """Resolve a pick code to the download URL(s) reported by the server.

    Sends a GET request to ``url`` with ``pickcode`` as the ``pickcode``
    query parameter, using the module-level ``headers_for_realurl`` headers.

    Args:
        url: Endpoint to query.
        pickcode: Pick code string appended as ``pickcode=<pickcode>``.

    Returns:
        A list with the text captured from every ``"file_url":"..."`` pair
        in the response body when the status code is 200 (possibly empty).
        Otherwise prints the status code and returns -1.
    """
    response = requests.get(url, params="pickcode=" + pickcode, headers=headers_for_realurl)
    if response.status_code != 200:
        # %-formatting inside print(...) gives the same output on Python 2
        # and Python 3.
        print("Sory,Get Realurl Fail,Errorcode %s" % response.status_code)
        return -1
    return re.findall(r'"file_url":"(.*?)"', response.text)
def getpic(url,name):
    """Download ``url`` and write the raw response body to the file ``name``.

    Args:
        url: Address to fetch with a plain GET request (no custom headers).
        name: Path of the output file; opened in binary write mode, so an
            existing file of the same name is overwritten.
    """
    # Download before opening the file so a failed request does not leave an
    # empty or truncated file behind.
    content = requests.get(url).content
    # The context manager closes the handle even if the write raises.
    with open("%s" % (name), "wb") as f:
        f.write(content)
def work(offset):
    """Download every non-forbidden file listed on the page at ``offset``.

    Prints the offset, fetches the page's pick codes with ``getpc``, and for
    each pick code not in ``forbidden`` resolves its URL with ``geturl`` and
    saves it with ``getpic`` to a file named after the pick code.

    Args:
        offset: Listing offset; converted to a string before use.
    """
    offset = "%s" % (offset)
    print(offset)
    pcs = getpc(url_for_pc, offset)
    if pcs == -1:
        # getpc already reported the failure.
        return
    for pc in pcs:
        if pc in forbidden:
            continue
        url = geturl(url_for_realurl, pc)
        # geturl returns -1 on an HTTP failure and an empty list when the
        # response carries no "file_url"; skip the entry in both cases rather
        # than failing on url[0] and abandoning the rest of the page.
        if url == -1 or not url:
            continue
        # Backslashes are stripped from the captured URL before downloading.
        getpic(str(url[0]).replace("\\", ""), pc)

# Start one worker thread per listing offset (0, 24, ..., 192).
threads = []
for i in range(0, 197, 24):
    # args must be a tuple, hence the trailing comma; start() runs work() in
    # a new thread, whereas run() would execute it in the calling thread.
    td = threading.Thread(target=work, args=(i,))
    td.start()
    threads.append(td)
# Wait for every worker so "done" is printed only after all pages finish.
for td in threads:
    td.join()
print("done")
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: