scrapy google爬虫实例
2016-07-15 11:18
316 查看
#!/usr/bin/python
# -*- coding:utf-8 -*-
import MySQLdb
import re
import sys
import json
from scrapy.spider import Spider
from scrapy.selector import Selector
from scrapy import log
from scrapy import Request
from goog.items import GoogItem

# Matches a literal "<b>" or "</b>" tag.  The previous pattern,
# r'[<b>|</b>]', was a character class and therefore deleted every
# '<', 'b', '>', '|' and '/' character from the suggestion text.
_BOLD_TAG_RE = re.compile(r'</?b>')

# Captures the text between  :[\"  and  \"],  in the raw response body
# (the quotes are backslash-escaped in the body itself).
_BOTTOM_CONTENT_RE = re.compile(r'\:\[\\"(.*?)\\"\]\,')

# Any run of quote / pipe / backslash / comma characters; each run is
# collapsed into a single comma.
_BOTTOM_SEPARATOR_RE = re.compile(r'[\'|\\",]+')


class googSpider(Spider):
    """Spider that requests Google "<keyword> coupon" completions.

    Keywords are read from the ``birds_google_suggest`` MySQL table (second
    column of every row) and one completion request is issued per keyword.
    """

    name = "goog"
    # The requests built below target www.google.com, so that domain has to
    # be allowed as well; "google.co.jp" is kept for backward compatibility.
    allowed_domains = ["google.co.jp", "google.com"]
    start_urls = []

    # Positional arguments for MySQLdb.connect: host, user, password, database.
    db_args = ('127.0.0.1', 'root', '123456', 'test')

    suggest_url_prefix = (
        r"https://www.google.com/complete/search"
        r"?sclient=psy-ab&biw=1845&bih=407&q="
    )

    def _connect(self):
        """Open a new MySQL connection using ``db_args``."""
        return MySQLdb.connect(*self.db_args)

    def _load_keywords(self):
        """Return the second column of every ``birds_google_suggest`` row.

        The cursor and connection are always closed, including on error.

        Raises:
            MySQLdb.Error: if connecting or querying fails.
        """
        conn = self._connect()
        try:
            cursor = conn.cursor()
            try:
                cursor.execute('select * from birds_google_suggest')
                return [row[1] for row in cursor.fetchall()]
            finally:
                cursor.close()
        finally:
            conn.close()

    def start_requests(self):
        """Yield one completion ``Request`` per keyword stored in MySQL.

        All rows are fetched and the connection is closed *before* the first
        request is yielded, so the connection cannot leak if the generator is
        abandoned part-way.  On a MySQL error the error is printed and no
        requests are produced.
        """
        try:
            keywords = self._load_keywords()
        except MySQLdb.Error as e:
            print("Mysql Error %d %s" % (e.args[0], e.args[1]))
            return

        for keyword in keywords:
            url = self.suggest_url_prefix + keyword + r"%20coupon"
            yield Request(
                url,
                callback=self.parse,
                meta={'keyword': keyword, 'level': '0'},
            )

    def parse(self, response):
        """Handle a completion response.

        The body is parsed as JSON; the first element of every entry in
        ``lst[1]`` is taken, stripped of ``<b>``/``</b>`` tags and joined
        into a comma-terminated string.

        NOTE(review): the INSERT into ``tb_brandToken`` was already disabled
        in the original code and is left disabled here, so ``brandToken`` is
        currently computed but not stored.  If it is re-enabled, use a
        parameterized query, e.g.
        ``cursor.execute("insert into `tb_brandToken` values('', %s, %s)",
        (lst[0], brandToken))``.
        """
        lst = json.loads(response.body)
        brandToken = ''.join(
            _BOLD_TAG_RE.sub('', v[0]) + ',' for v in lst[1]
        )

        # The connection is still opened and committed as before, but is now
        # always closed afterwards instead of being leaked.
        conn = self._connect()
        try:
            conn.commit()
        finally:
            conn.close()

    def parse_bottom(self, response):
        """Extract a token list from the response body and store it.

        The text captured by ``_BOTTOM_CONTENT_RE`` is normalised so that
        runs of quote/pipe/backslash/comma characters become single commas,
        then inserted into ``tb_brandBotToken`` together with the keyword
        taken from ``response.meta['keyword']``.  Nothing is written when the
        pattern is not found or the extracted content is empty.
        """
        keyword = response.meta['keyword']

        match = _BOTTOM_CONTENT_RE.search(response.body)
        if match is None:
            # Previously this raised AttributeError on ``None.group``.
            return

        content = _BOTTOM_SEPARATOR_RE.sub(',', match.group(1))
        if not content:
            return

        conn = self._connect()
        try:
            cursor = conn.cursor()
            try:
                # Parameterized query: scraped text must never be spliced
                # into the SQL string.
                cursor.execute(
                    "insert into `tb_brandBotToken` values('', %s, %s)",
                    (keyword, content),
                )
            finally:
                cursor.close()
            conn.commit()
        finally:
            conn.close()
相关文章推荐
- Django中实现文件下载功能
- mongo group by
- google Gson 的用法
- 【Proto文件】Google开源技术 Protobuf 简介与使用
- Django中实现文件上传功能
- ubuntu +pyCharm配置Django
- 英文版Ubuntu16.04安装fcitx-googlepinyin
- 双系统ubuntu+ros indigo安装
- zygote笔记
- 修改u-boot的开机logo及显示过程
- 《Pokémon GO》为什么这么火?我们应该从这些方面来思考
- go语言学习-数组, slice和map
- go语言学习-未分类的一些记录
- Django的搜索路径与Import机制
- Django的模板渲染(render)机制
- Django模板的include机制
- 理解Django里的MTV开发模式
- GO语言range的用法
- pgoneproxy的VIP机制
- go语言beego框架 orm实现insertOrUpdate功能