Scraping All ZCOOL User Data with Python

2020-02-17 12:16

Crawling every ZCOOL user's profile, works, images, videos, and more...

1. I used Python 3 with Scrapy. Whenever I crawl a site, I habitually study it for a long time first, thinking about the most convenient and most general way to write the spider. With this site I noticed that each user's URL follows a regular pattern: the user IDs simply auto-increment. There may be some invalid IDs scattered in between, but that doesn't stop us from capturing the complete data set.

2. This site is, in principle, fairly simple: there is no IP blocking and no JavaScript challenge to crack, so the main thing to consider while crawling is simply how to go fast. One thing to keep in mind, though: the site's video links have an expiry time, so watch out for that!
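
Since there is no IP limiting, crawl speed mostly comes down to Scrapy's concurrency settings. Something along these lines in settings.py is plausible; the exact values are my illustration, not from the original post:

# settings.py -- illustrative throughput settings, tune to taste
CONCURRENT_REQUESTS = 32             # no IP ban observed, so raise concurrency
CONCURRENT_REQUESTS_PER_DOMAIN = 32
DOWNLOAD_DELAY = 0                   # no politeness delay needed here
RETRY_ENABLED = True
RETRY_TIMES = 3                      # deleted/invalid user IDs will just 404
ROBOTSTXT_OBEY = False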

3. The first step is constructing the requests. In theory the user IDs need to run from 1 up to roughly 17,000,000. Why that number? Two accounts I registered back then received consecutive, increasing IDs, and the data I had crawled earlier showed the same pattern. Instead of a distributed setup, I simply opened several terminal windows and ran multiple spider instances in parallel, as sketched below.
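
As a minimal sketch of the seeding step: push the whole ID range into the Redis set that the spider pops from. The user-page URL pattern https://www.zcool.com.cn/u/<id> is my assumption based on the auto-incrementing IDs; verify it against the site:

import redis

# Seed the "zhanku_website" set that the spider's start_requests pops from.
# Assumed user-page pattern: https://www.zcool.com.cn/u/<numeric id>
r = redis.StrictRedis(host="127.0.0.1", port=6379, decode_responses=True)
BATCH = 10000
for start in range(1, 17_000_001, BATCH):
    urls = [f"https://www.zcool.com.cn/u/{i}"
            for i in range(start, min(start + BATCH, 17_000_001))]
    r.sadd("zhanku_website", *urls)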

The site has almost no anti-crawling measures, so I am only pasting the spider code below; the two points worth explaining are how to achieve full coverage and how to deal with the expiring video links. The data I crawled covers every piece of information about each user.
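
Because the video links expire, one reasonable approach (my sketch, not the original post's pipeline) is to download each video as soon as it is extracted instead of storing the raw URL, for example from an item pipeline:

import requests

def download_video(url, path):
    """Fetch an expiring video URL right away and stream it to disk."""
    resp = requests.get(url, stream=True, timeout=30)
    resp.raise_for_status()
    with open(path, "wb") as f:
        for chunk in resp.iter_content(chunk_size=1 << 16):
            f.write(chunk)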

Table 1: basic user information.

This run collects the users' own profile information. I crawled profile info and works in separate runs, because a user can have a great many works, and crawling both in one pass makes the request chain too deep.

Table 2: user works.

Below is the code that crawls the works themselves: titles, image links, and videos.

# -*- coding: utf-8 -*-
import re

import redis
import scrapy

from zhanku.items import ZhankuItem


class ZhankuSpider(scrapy.Spider):
    name = 'ZhanKu'
    allowed_domains = ['zcool.com.cn']

    def start_requests(self):
        # User-page URLs are pre-seeded into a Redis set; pop them one by one.
        r = redis.StrictRedis(host="127.0.0.1", port=6379, decode_responses=True)
        while True:
            url = r.spop("zhanku_website")
            if url is None:  # set exhausted
                break
            # dict_data accumulates {work_url: view_count} across the user's list pages
            yield scrapy.Request(url, meta={"dict_data": {}}, callback=self.parse,
                                 dont_filter=True)

    def parse(self, response):
        """Walk the user's work list, recording each work's URL and view count."""
        dict_data = response.meta["dict_data"]
        div_list = response.xpath("//div[@class='work-list-box']/div[@class='card-box']")
        for div in div_list:
            views = div.xpath(".//span[@class='statistics-view']/@title").extract_first()
            try:
                view_num = int(re.search(r"\d+", views).group())
            except (TypeError, AttributeError):
                view_num = 0
            link = div.xpath(".//a[@class='title-content']/@href").extract_first()
            dict_data[link] = view_num
        next_page = response.xpath(
            "//div[@class='pageturning ']//a[@class='laypage_next']/@href").extract_first()
        if next_page:
            yield scrapy.Request("https://www.zcool.com.cn" + next_page,
                                 meta={"dict_data": dict_data}, callback=self.parse)
        else:
            # All list pages consumed; move on to picking the top-10 works.
            yield scrapy.Request(response.url, meta={"dict_data": dict_data},
                                 callback=self.get_top10, dont_filter=True)

    def get_top10(self, response):
        """Keep only the user's ten most-viewed works, then crawl each one."""
        data_dict = response.meta["dict_data"]
        if len(data_dict) > 10:
            top10 = sorted(data_dict.items(), key=lambda x: x[1], reverse=True)[:10]
            data_dict = dict(top10)
        for link in data_dict:
            yield scrapy.Request(link,
                                 meta={"img": [], "video": [], "res": response.url},
                                 callback=self.get_zuopin)

    def get_zuopin(self, response):
        """Extract one work's details: title, stats, tags, images and videos."""
        item = ZhankuItem()
        img_list = response.meta["img"]
        video_list = response.meta["video"]
        website = response.meta["res"]
        try:
            title = response.xpath("//h2/text()").extract_first()
            title = title.replace("\n", "").strip()
        except AttributeError:
            title = None
        create_time = re.findall(r"创建时间:(\d+-\d+-\d+) ", response.text)
        create_time = create_time[0] if create_time else None
        content_tag = ";".join(response.xpath("//span[@class='head-index']/span/a/text()").extract())
        try:
            view_num = response.xpath(
                "//span[@class='head-data-show']//a[@class='see vertical-line']/@title").extract_first()
            view_num = re.search(r"\d+", view_num).group()
        except (TypeError, AttributeError):
            view_num = None
        try:
            news = response.xpath(
                "//span[@class='head-data-show']//a[@class='news vertical-line']/@title").extract_first()
            news = re.search(r"\d+", news).group()
        except (TypeError, AttributeError):
            news = None
        try:
            recommend = response.xpath(
                "//span[@class='head-data-show']//a[@class='recommend-show']/@title").extract_first()
            recommend = re.search(r"\d+", recommend).group()
        except (TypeError, AttributeError):
            recommend = None
        jianjie = response.xpath("//div[@class='work-decoration-title']/p/text()").extract_first()
        img_list.extend(response.xpath("//div[@class='work-details-content']//img/@src").extract())
        # Video URLs only appear inside inline scripts, so pull them out with a
        # regex. Remember: these links expire, download them promptly.
        video_list.extend(re.findall(r"'(https://video\.zcool\.cn.*?)'", response.text))
        item["title"] = title
        item["view_num"] = view_num
        item["news"] = news
        item["create_time"] = create_time
        item["recommend"] = recommend
        item["content_tags"] = content_tag
        item["jianjie"] = jianjie
        item["img_list"] = img_list
        item["video"] = video_list
        item["website"] = website
        next_page = response.xpath("//a[@class='laypage_next']/@href").extract_first()
        if next_page:
            # Multi-page work: keep accumulating images/videos before yielding.
            yield scrapy.Request("https://www.zcool.com.cn" + next_page,
                                 meta={"img": img_list, "video": video_list, "res": website},
                                 callback=self.get_zuopin)
        else:
            yield item
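
For completeness, the ZhankuItem imported above must look roughly like this, reconstructed from the fields the spider assigns (the original items.py is not shown in the post):

import scrapy

class ZhankuItem(scrapy.Item):
    title = scrapy.Field()
    view_num = scrapy.Field()      # view count parsed from the work page
    news = scrapy.Field()          # count parsed from the 'news' badge
    create_time = scrapy.Field()
    recommend = scrapy.Field()
    content_tags = scrapy.Field()  # semicolon-joined tag list
    jianjie = scrapy.Field()       # short description ("简介")
    img_list = scrapy.Field()
    video = scrapy.Field()
    website = scrapy.Field()       # the user page the work came from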