您的位置:首页 > 编程语言 > Python开发

python爬取搜狐论坛笔记

2016-08-26 16:30 274 查看
搜狐论坛搜索页面:http://s.club.sohu.com/?action=search

——————————

改于:2016.09.08

可是!论坛搜索自己跪了!都一个星期了还没修好==!是不是搜狐自己还没发现搜索自己跪了?!==

——————————

今天搞了半天才搞定搜狐的爬虫,坑还蛮多的。特意记下来,方便自己下次捡起来。

首先是搜狐论坛每个板块的 url 前缀不一样,不能直接在取到的 【a/@href】 前面拼接同一个固定前缀。



后来找到了介个,是一个list,写着不同的板块对应的url前缀



我用的是python的scrapy框架,搜狐的论坛上,每一层楼的数据是用get的方式得到的,所以response.xpath('//table[@class="viewpost"]').extract()为空!



没办法了,直接从源码上用正则截取数据

除了正则不知道还有啥方法

table = re.findall('<table class.*table>',response.body)[0].decode('unicode-escape')


Selector(text=table).xpath('//div[@class="wrap"]/p/text()')

# -*- encoding:utf-8 -*-
import scrapy
import re
import time
from spider.items import spiderItem
#import urllib2,urllib
import json
from scrapy.selector import Selector
from w3lib.html import remove_tags
from w3lib.html import replace_tags

url_search = ...  # URL-encoded search keywords (placeholder, filled in by the user)

# Board-name -> host-prefix mapping: each Sohu forum board lives on its own
# subdomain, so thread links must be prefixed with the right host.
with open('sohu_.json', 'r') as f:
    sohu_url = json.load(f)

class MySpider(scrapy.Spider):
    """Crawl Sohu forum (club.sohu.com) threads discovered via the site search.

    Flow: parse() fans out search-result pages -> parse_url() resolves each
    hit to its board-specific host -> parse_page() extracts post text/time.
    """
    name = 'sohu'
    allowed_domains = ['sohu.com']
    start_urls = ['http://club.sohu.com/']

    # The forum shows relative dates (e.g. u"昨天") instead of timestamps for
    # recent posts; map each phrase to a concrete timestamp computed once at
    # class-definition time.  Both the Chinese-numeral and Arabic-numeral
    # spellings occur in the wild.
    time_list = {
        phrase: time.strftime("%Y-%m-%d %H:%M:%S",
                              time.localtime(time.time() - 60 * 60 * 24 * days))
        for phrase, days in [
            (u"昨天", 1), (u"前天", 2),
            (u"三天前", 3), (u"四天前", 4), (u"五天前", 5),
            (u"六天前", 6), (u"七天前", 7),
            (u"3天前", 3), (u"4天前", 4), (u"5天前", 5),
            (u"6天前", 6), (u"7天前", 7),
        ]
    }

    def parse(self, response):
        """Yield one request per (search keyword, results page) pair."""
        search_urls = ['http://s.club.sohu.com/?action=search&type=0&keyword=%s' % kw
                       for kw in url_search]
        page_params = ['&timeauto=1&thread=1&page=%s' % p for p in range(1, 30)]
        for base in search_urls:
            for param in page_params:
                yield scrapy.Request(base + param, callback=self.parse_url)

    def parse_url(self, response):
        """Resolve each search hit to its board-specific host and follow it."""
        urls = response.xpath('//h1/a/@href').extract()
        boards = [b.replace('\n', '').strip()
                  for b in response.xpath('//div[@class="resultItem"]/a[2]/text()').extract()]
        # dict(zip(...)) also deduplicates repeated thread links.
        for url, board in dict(zip(urls, boards)).items():
            # sohu_url maps a board name to its host prefix (loaded at import).
            yield scrapy.Request('http://' + sohu_url[board] + url,
                                 callback=self.parse_page)

    def parse_page(self, response):
        """Extract post content and timestamp from one thread page.

        The post table arrives escaped inside the raw body (so normal
        response.xpath() sees nothing); cut it out with a regex and
        unescape it before running XPath on it.
        """
        item = spiderItem()
        table = re.findall(r'<table class.*table>', response.body)[0].decode('unicode-escape')
        sel = Selector(text=table)

        content = ''.join(sel.xpath('//div[@class="wrap"]/p/text()').extract()).strip()
        for junk in ('\r', '\t', ' ', u'\u3000'):
            content = content.replace(junk, '')
        item['content'] = content

        # Sentinel default, used when no date can be recovered at all.
        # (The original code left item['time'] unset in that case and then
        # crashed with a KeyError.)
        item['time'] = '1999-09-09 09:09:09'
        try:
            time_str = ''.join(sel.xpath('//div[@class="grey"]//div//text()').extract())
            found = re.findall(r'\d{4}-\d{1,2}-\d{1,2} \d{1,2}:\d{1,2}:\d{1,2}', time_str)
            if found:
                # Round-trip through strptime to zero-pad, e.g. "2016-8-9 7:5:3".
                item['time'] = time.strftime("%Y-%m-%d %H:%M:%S",
                                             time.strptime(found[0], "%Y-%m-%d %H:%M:%S"))
            else:
                # No absolute timestamp: fall back to relative-date phrases.
                for phrase, stamp in self.time_list.items():
                    if phrase in time_str:
                        item['time'] = stamp
                        break
        except Exception:
            pass  # malformed page: keep the sentinel default
        yield item
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息