您的位置:首页 > 其它

jd.py

2015-08-28 02:59 411 查看
#!/usr/bin/env python
#coding:utf-8
import urllib2,re,sys,os,types
#from bs4 import BeautifulSoup

# Python 2 only: reloading sys makes sys.setdefaultencoding reachable again
# (it is removed from the module at interpreter start-up), then GBK is set
# as the codec for implicit str <-> unicode conversions in this process.
reload(sys);
sys.setdefaultencoding('gbk');

# Region labels (both are the Chinese name of Shanghai). They are not
# referenced by any function visible in this file.
province="上海"
city="上海"
# UTF-8 byte-order mark; presumably meant as the `fileheader` argument of
# writeHeader (that call is currently commented out in the __main__ guard).
fileHeader='\xEF\xBB\xBF'
# '^'-separated column titles: province/municipality, city, district,
# business area, name, address, contact person, contact phone, URL,
# company profile.
colums='省直辖市^城市^行政区^商圈^名称^地址^联系人^联系电话^URL^公司介绍^'

def getCompany(first_page=1, last_page=5):
    """Crawl a range of JD book-ranking pages.

    For every page number in ``first_page``..``last_page`` (inclusive) the
    ranking URL is built, echoed to stdout and handed to ``httpCrawler``
    together with the page number.

    Args:
        first_page: First ranking page to fetch. Defaults to 1.
        last_page: Last ranking page to fetch, inclusive. Defaults to 5.
    """
    for page in range(first_page, last_page + 1):
        url1 = "http://book.jd.com/booktop-4-6929-%s.html" % (page)
        print("\n##################: %s" % url1)
        httpCrawler(url1, page)

def httpCrawler(url, page):
    """Download every book page linked from one ranking page into ``./jd``.

    The ranking page at ``url`` is fetched with ``httpRequest`` and scanned
    for ``<dt class='p-name'>`` entries. Each linked page is then fetched
    and saved as ``jd/<number>.<title>.html``, where ``<number>`` is
    ``(page - 1) * 20 + position`` (position starting at 1) and ``<title>``
    is the link title with every "/" removed.

    Args:
        url: URL of the ranking page to scan.
        page: Page number of ``url``; only used to number the saved files.

    Errors raised by ``httpRequest`` or by the file system propagate to the
    caller.
    """
    # File numbering assumes 20 entries per ranking page.
    items_per_page = 20

    listing = httpRequest(url)
    # Each match is a tuple: (text before the link, href, title).
    matches = re.findall(r'<dt class=\'p-name\'>(.*?)<a href=\'(.*?)\' title="(.*?)" target=\'_blank\'', listing, re.S)
    print(len(matches))
    if not matches:
        return

    # Create the output directory once instead of re-checking per item.
    if not os.path.exists('./jd'):
        os.mkdir(r'./jd')

    first_number = (page - 1) * items_per_page + 1
    for offset, (_, book_url, title) in enumerate(matches):
        print("\ndownload one page: %s \n%s" % (book_url, title))
        book_html = httpRequest(book_url)
        file_name = u'jd/%s.%s' % (first_number + offset, title.replace("/", "")) + '.html'
        # Binary mode keeps the downloaded bytes untouched on every
        # platform; the with-block guarantees the handle is closed.
        with open(file_name, 'wb') as out:
            out.write(book_html)
        print("ok")

def httpRequest(url, timeout=15):
    """Fetch ``url`` with browser-like headers and return the raw body.

    Args:
        url: Address to request.
        timeout: Timeout in seconds passed to ``urllib2.urlopen``.
            Defaults to 15.

    Returns:
        The response body exactly as read from the connection (no decoding
        is applied).

    Exceptions raised by ``urllib2.urlopen`` or by reading the response are
    not caught here and propagate to the caller.
    """
    # No Accept-Encoding header is sent: urllib2 does not decompress
    # responses, so asking for gzip would require decoding the body here.
    req_header = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:38.0) Gecko/20100101 Firefox/38.0',
        'Accept': 'text/html;q=0.9,*/*;q=0.8',
    }
    req = urllib2.Request(url, None, req_header)
    resp = urllib2.urlopen(req, None, timeout)
    try:
        html = resp.read()
        print("resp: %s" % (resp,))
    finally:
        # Always release the connection, even if the read fails.
        resp.close()
    return html

def writeHeader(fileheader, colums):
    """Create ``./58/daikuan.csv`` containing only a header.

    The ``./58`` directory is created if it does not exist. Any existing
    ``daikuan.csv`` is overwritten. No line terminator is written after
    the column titles.

    Args:
        fileheader: Text written first (e.g. a byte-order mark).
        colums: Column-title text written immediately after ``fileheader``.
    """
    if not os.path.exists('./58'):
        os.mkdir(r'./58')
    # The with-block closes the file even if one of the writes fails.
    with open('./58/daikuan.csv', 'w') as f:
        f.write(fileheader)
        f.write(colums)

if __name__ == '__main__':
    # Script entry point: run the JD ranking crawl.
    # The CSV header step is currently switched off:
    # writeHeader(fileHeader, colums)
    getCompany()
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: