您的位置:首页 > 编程语言 > Go语言

Scrapy Google 爬虫实例

2016-07-15 11:18 316 查看
#!/usr/bin/python
# -*- coding:utf-8 -*-
import MySQLdb
import re
import sys
import json
from scrapy.spider import Spider
from scrapy.selector import Selector
from scrapy import log
from scrapy import Request
from goog.items import GoogItem

class googSpider(Spider):
    """Spider that requests Google autocomplete suggestions for "<brand> coupon".

    Brand keywords are read from column 1 of every row of the
    ``birds_google_suggest`` MySQL table; one suggestion request is issued
    per row and handled by :meth:`parse`.
    """

    name = "goog"
    # NOTE(review): the requests built in start_requests target
    # www.google.com, which is not listed here -- confirm this is intended.
    allowed_domains = ["google.co.jp"]
    start_urls = []

    # Positional arguments for MySQLdb.connect: host, user, password, database.
    # NOTE(review): credentials are hard-coded, as in the original.
    _DB_ARGS = ('127.0.0.1', 'root', '123456', 'test')

    # Matches the opening/closing bold tags found in suggestion strings.
    _BOLD_TAG_RE = re.compile(r'</?b>')

    # Captures the text between ':[\"' and '\"],' in the response body.
    _BOTTOM_CONTENT_RE = re.compile(r':\[\\"(.*?)\\"\],')

    # Runs of quote, pipe, backslash, double-quote and comma characters.
    _BOTTOM_SEPARATOR_RE = re.compile(r"""['|\\",]+""")

    def _connect(self):
        """Open a new MySQL connection using the spider's fixed settings."""
        return MySQLdb.connect(*self._DB_ARGS)

    def _fetch_brand_rows(self):
        """Return all rows of ``birds_google_suggest``.

        The cursor and connection are always closed before returning, so no
        database handle stays open while the crawl is running.

        Raises:
            MySQLdb.Error: if connecting or querying fails.
        """
        conn = self._connect()
        try:
            cursor = conn.cursor()
            try:
                cursor.execute('select * from birds_google_suggest')
                return cursor.fetchall()
            finally:
                cursor.close()
        finally:
            conn.close()

    def start_requests(self):
        """Yield one autocomplete request per row of ``birds_google_suggest``.

        Each request carries ``meta={'keyword': <column 1>, 'level': '0'}``
        and is handled by :meth:`parse`. A MySQL error is reported on stdout
        and results in no requests being scheduled.
        """
        try:
            rows = self._fetch_brand_rows()
        except MySQLdb.Error as e:
            print("Mysql Error %d %s" % (e.args[0], e.args[1]))
            return

        for row in rows:
            keyword = row[1]
            url = r"https://www.google.com/complete/search?sclient=psy-ab&biw=1845&bih=407&q=" + keyword + r"%20coupon"
            yield Request(url, callback=self.parse,
                          meta={'keyword': keyword, 'level': '0'})
            # NOTE(review): the original also carried a disabled request to
            # "https://www.google.com/search?q=<keyword>+coupon&fp=1&biw=1855"
            # "&bih=428&dpr=1&tch=1&ech=1&psi=..." (spaces replaced by '+'),
            # with callback=self.parse_bottom, dont_filter=True and
            # meta={'keyword': keyword}. It remains disabled, so parse_bottom
            # is never scheduled from here.

    def parse(self, response):
        """Decode an autocomplete response and build its suggestion string.

        The body is parsed as JSON; element 1 is iterated and the first item
        of each entry has its ``<b>``/``</b>`` tags removed. The cleaned
        entries are concatenated, each followed by a comma.
        """
        lst = json.loads(response.body)
        # Only the bold tags are stripped; the text inside them is kept.
        brandToken = ''.join(
            self._BOLD_TAG_RE.sub('', entry[0]) + ',' for entry in lst[1]
        )

        conn = self._connect()
        try:
            # NOTE(review): the insert into `tb_brandToken` was disabled in
            # the original, so brandToken is currently not stored and this
            # commit has nothing to write. When re-enabling, pass lst[0] and
            # brandToken as query parameters, e.g.
            #   cursor.execute("insert into `tb_brandToken` values('',%s,%s)",
            #                  (lst[0], brandToken))
            # instead of concatenating them into the SQL text.
            conn.commit()
        finally:
            conn.close()

    def parse_bottom(self, response):
        """Extract the quoted token list from a search response and store it.

        The first ``:[\\"...\\"],`` span in the body is captured, every run
        of quote/pipe/backslash/comma characters in it is collapsed to a
        single comma, and the result is inserted into ``tb_brandBotToken``
        together with ``response.meta['keyword']``. Responses without such a
        span, or with an empty one, are skipped.
        """
        keyword = response.meta['keyword']

        match = self._BOTTOM_CONTENT_RE.search(response.body)
        if match is None:
            # Previously this raised AttributeError on .group(1).
            self.log("parse_bottom: no token list found in %s" % response.url)
            return

        content = self._BOTTOM_SEPARATOR_RE.sub(',', match.group(1))
        if not content:
            return

        conn = self._connect()
        try:
            cursor = conn.cursor()
            try:
                # Parameterized so scraped text cannot alter the statement.
                cursor.execute(
                    "insert into `tb_brandBotToken` values('',%s,%s)",
                    (keyword, content),
                )
            finally:
                cursor.close()
            conn.commit()
        finally:
            conn.close()
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: