jd.py
2015-08-28 02:59
411 查看
#!/usr/bin/env python
# coding: utf-8
"""Download JD.com book top-list pages and save each linked detail page.

For every list page ``http://book.jd.com/booktop-4-6929-<page>.html`` the
script extracts the ``(url, title)`` pairs of the listed books, downloads
each linked page and stores the raw response body as
``jd/<running number>.<title>.html``.

The code runs on Python 2 (the interpreter it was written for) and on
Python 3.
"""
import os
import re
import sys
import types

try:  # Python 2
    import urllib2
except ImportError:  # Python 3: urllib.request offers the same Request/urlopen
    import urllib.request as urllib2

if sys.version_info[0] == 2:
    # NOTE(review): kept only so that printing titles on Python 2 behaves as
    # before; the crawler itself now decodes explicitly with PAGE_ENCODING
    # and no longer depends on the process-wide default encoding.
    reload(sys)
    sys.setdefaultencoding('gbk')

# Encoding used to decode fetched pages.  Taken from the original
# sys.setdefaultencoding('gbk') call -- TODO confirm the site still serves GBK.
PAGE_ENCODING = 'gbk'

LIST_URL_TEMPLATE = "http://book.jd.com/booktop-4-6929-%s.html"

# Only used to compute the running number in saved file names; assumes every
# list page carries 20 entries -- TODO confirm.
ITEMS_PER_PAGE = 20

OUTPUT_DIR = './jd'

# Matches one list entry; group 2 is the link target, group 3 the title.
BOOK_LINK_RE = re.compile(
    r'<dt class=\'p-name\'>(.*?)<a href=\'(.*?)\' title="(.*?)" target=\'_blank\'',
    re.S)

# CSV settings consumed only by writeHeader(); the main guard does not call it.
province = "上海"
city = "上海"
fileHeader = b'\xEF\xBB\xBF'  # UTF-8 byte order mark
# Column titles: province/municipality, city, district, business area, name,
# address, contact person, phone, URL, company introduction.
colums = '省直辖市^城市^行政区^商圈^名称^地址^联系人^联系电话^URL^公司介绍^'


def toBytes(value, encoding='utf-8'):
    """Return ``value`` as bytes, encoding text with ``encoding``."""
    if isinstance(value, bytes):
        return value
    return value.encode(encoding)


def ensureDir(path):
    """Create directory ``path`` unless it already exists.

    Raises:
        OSError: if the directory is missing and cannot be created.
    """
    try:
        os.makedirs(path)
    except OSError:
        # Creating first and checking afterwards avoids the exists()/mkdir()
        # race of the check-then-create pattern.
        if not os.path.isdir(path):
            raise


def parseBookList(content):
    """Extract the listed books from a top-list page.

    Args:
        content: page source, either raw bytes (decoded with PAGE_ENCODING,
            undecodable bytes are replaced) or already decoded text.

    Returns:
        A list of ``(url, title)`` tuples in page order; empty when the page
        contains no matching entry.
    """
    if isinstance(content, bytes):
        content = content.decode(PAGE_ENCODING, 'replace')
    return [(match[1], match[2]) for match in BOOK_LINK_RE.findall(content)]


def buildFileName(page, index, title):
    """Return the output path for the ``index``-th (0-based) book of ``page``.

    The running number is ``(page - 1) * ITEMS_PER_PAGE + index + 1``; "/" is
    removed from the title so it cannot introduce extra path components.
    """
    if isinstance(title, bytes):
        title = title.decode(PAGE_ENCODING, 'replace')
    number = (page - 1) * ITEMS_PER_PAGE + index + 1
    return u'jd/%s.%s.html' % (number, title.replace(u"/", u""))


def getCompany(firstPage=1, lastPage=5):
    """Crawl the list pages ``firstPage`` .. ``lastPage`` (inclusive)."""
    for page in range(firstPage, lastPage + 1):
        url1 = LIST_URL_TEMPLATE % (page)
        print("\n##################: %s" % (url1,))
        httpCrawler(url1, page)


def httpCrawler(url, page):
    """Download every book page linked from the list page at ``url``.

    Args:
        url: address of the list page.
        page: 1-based list page number, used to number the saved files.
    """
    books = parseBookList(httpRequest(url))
    print(len(books))
    if not books:
        return

    ensureDir(OUTPUT_DIR)
    for index, (bookUrl, title) in enumerate(books):
        print("\ndownload one page: %s \n%s" % (bookUrl, title))
        body = httpRequest(bookUrl)
        # Binary mode stores the response bytes unchanged; the with-block
        # closes the file even if the write fails.
        with open(buildFileName(page, index, title), 'wb') as out:
            out.write(body)
        print("ok")


def httpRequest(url):
    """Fetch ``url`` and return the raw response body as bytes.

    Errors raised by ``urlopen`` (for example on timeout) propagate to the
    caller.
    """
    req_header = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:38.0) Gecko/20100101 Firefox/38.0',
        'Accept': 'text/html;q=0.9,*/*;q=0.8',
    }
    req_timeout = 15
    req = urllib2.Request(url, None, req_header)
    resp = urllib2.urlopen(req, None, req_timeout)
    try:
        html = resp.read()
        print("resp: %s" % (resp,))
    finally:
        # try/finally instead of "with": Python 2 responses are not context
        # managers.
        resp.close()
    return html


def writeHeader(fileheader, colums):
    """Write ``fileheader`` followed by ``colums`` to ./58/daikuan.csv.

    Text arguments are encoded as UTF-8; bytes are written unchanged.  No
    line terminator is appended.
    """
    ensureDir('./58')
    with open('./58/daikuan.csv', 'wb') as f:
        f.write(toBytes(fileheader))
        f.write(toBytes(colums))


if __name__ == '__main__':
    getCompany()
相关文章推荐
- IIS安全性配置
- poj 2566 Bound Found(尺取法 好题)
- WingIDE破解 python2.x和python3.x
- IIS7.5安全配置研究
- Oracle 权限 管理大全
- ubuntu git 服务器搭建
- 升级WIN10后80端口被占用
- oracle登陆账户信息
- oracle登陆命令
- JS 的 call apply bind 方法
- Pow(x, n)
- 可以替代Ghost的系统备份/恢复利器Clonezilla
- LightOJ 1072 - Calm Down 【二分】
- python数据结构与算法——哈希表
- Product of Array Except Self
- union和union all的区别
- maven 如何修改仓库repository 位置
- 8行代码教你搞定导航控制器全屏滑动返回效果
- HTML标签深入学习系列(1)——注释标签 <!-- -->
- hdu 4630 No Pain No Game (区间gcd相关x线段树or树状数组)