您的位置:首页 > 编程语言 > Python开发

15Python爬虫---爬虫定向爬取腾讯视频网---利刃出击评论

2018-03-09 00:29 501 查看
先贴上代码,后面再补充解析。代码缺陷:没有对评论的回复进行爬取处理。

import urllib.request
import http.cookiejar
import re

# ---------- Plain attribute container ----------
class point():
    """Empty record type used as a simple attribute holder.

    The class defines no fields or methods of its own; callers create an
    instance and attach whatever attributes they need to it.
    """
# ------------------------------------------------

# ---------- Emoji / newline stripping ----------
# Matches runs of emoji and newline characters.
#
# Emoji are matched in two encodings:
#   * as UTF-16 surrogate pairs (what the "unicode_escape" codec yields for
#     "\ud83d\ude00"-style escapes), and
#   * as the real astral code points found in a normally decoded str.
# The low-surrogate ranges are restricted to valid low surrogates so that a
# stray high surrogate can never swallow the ordinary character after it.
emoji_pattern = re.compile(
    r"(?:"
    r"\ud83d[\ude00-\ude4f]|"      # emoticons (surrogate pair)
    r"\ud83c[\udf00-\udfff]|"      # symbols & pictographs, 1 of 2 (surrogate pair)
    r"\ud83d[\udc00-\uddff]|"      # symbols & pictographs, 2 of 2 (surrogate pair)
    r"\ud83d[\ude80-\udeff]|"      # transport & map symbols (surrogate pair)
    r"\ud83c[\udde0-\uddff]|"      # flags / regional indicators (surrogate pair)
    r"[\U0001F600-\U0001F64F]|"    # emoticons
    r"[\U0001F300-\U0001F5FF]|"    # symbols & pictographs
    r"[\U0001F680-\U0001F6FF]|"    # transport & map symbols
    r"[\U0001F1E0-\U0001F1FF]|"    # flags / regional indicators
    r"\n"                          # newlines are stripped as well
    r")+",
    flags=re.UNICODE,
)


def remove_emoji(text):
    """Return *text* with emoji and newline characters removed.

    Args:
        text: The string to clean.

    Returns:
        A copy of ``text`` in which every match of ``emoji_pattern`` has been
        replaced by the empty string.
    """
    return emoji_pattern.sub(r'', text)
# ------------------------------------------------------------------

# Id of the video whose comments are crawled ("利刃出击").
vid = "2457683703"
# Cursor (comment id) at which the first page of comments starts.
comid = "0"
# Number of comments requested per page.
num = "10"
# Request headers that make the crawler look like a regular browser.
# (The request URL itself is assembled later, when a page is fetched.)
headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    # NOTE(review): these are charset names, not content codings; presumably
    # this is meant to avoid receiving a compressed response -- confirm.
    "Accept-Encoding": "gb2312, utf-8",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Cache-Control": "max-age=0",
    "Connection": "keep-alive",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36",
    "Host": "video.coral.qq.com",
}
# Cookie-aware opener, so cookies set by the server are sent back on later requests.
cjar = http.cookiejar.CookieJar()
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cjar))
# addheaders expects a list of (name, value) tuples.
headall = list(headers.items())
opener.addheaders = headall
# Install globally so plain urllib.request.urlopen() calls use this opener.
urllib.request.install_opener(opener)
# Fetch one page of comments.
def craw(vid, comid):
    """Download one page of comments and return it as text.

    The number of comments requested is taken from the module-level ``num``.

    Args:
        vid: Video/article id, as a string, inserted into the URL path.
        comid: Cursor value, as a string, sent as the ``cursor`` parameter.

    Returns:
        The response body decoded with the ``unicode_escape`` codec, so that
        ``\\uXXXX`` escape sequences in the payload become characters.

    Raises:
        urllib.error.URLError: If the request fails.
    """
    url = (
        "https://video.coral.qq.com/varticle/" + vid
        + "/comment/v2?orinum=" + num
        + "&cursor=" + comid
        + "&pageflag=1&oriorder=o"
    )
    # Use a context manager so the HTTP response is always closed.
    with urllib.request.urlopen(url) as response:
        data = response.read().decode('unicode_escape')
    return data

# Regular expressions used to pull fields out of the raw response text.
useridpat = '"userid":"(.*?)","content":'
idpat = '"id":"(.*?)"'
userpat = '{"userid":"(.*?)","head":'
conpat = '"content":"(.*?)"'
# Compile once, outside the page loop.
userid_re = re.compile(useridpat, re.S)
id_re = re.compile(idpat, re.S)
user_re = re.compile(userpat, re.S)
content_re = re.compile(conpat, re.S)
# Comments requested per page; also bounds how many are printed per page.
page_size = int(num)

print("-------------------------利剑出击---------------------------")
for i in range(1, 10):  # crawl at most 9 pages of comments
    print("------------------------------------------------------------")
    print("第 " + str(i) + " 页评论内容")
    # Extract every matching field from this page.
    data = craw(vid, comid)
    useridlist = userid_re.findall(data)  # user id of each comment
    idlist = id_re.findall(data)          # comment ids
    userlist = user_re.findall(data)      # raw user fragments
    conlist = content_re.findall(data)    # comment bodies

    # Turn each raw user fragment (split on '","nick":"' into user id and
    # nickname) into an object carrying .userid and .username.
    uselist = []
    for fragment in userlist:
        user = fragment.split('","nick":"')
        if len(user) < 2:
            # No nickname part in this fragment; nothing to map.
            continue
        obj = point()
        obj.userid = user[0]
        obj.username = remove_emoji(user[1])  # strip emoji and newlines from the nickname
        uselist.append(obj)

    # Print the comments of this page. zip() plus the slices keep this safe
    # when the page holds fewer than page_size comments.
    for comment_userid, raw_content in zip(useridlist[:page_size], conlist[:page_size]):
        # Show the nickname(s) whose user id matches the comment's author.
        for known in uselist:
            if known.userid == str(comment_userid):
                # SECURITY NOTE(review): eval() runs on text scraped from the
                # network. It is kept to preserve the existing output, but it
                # can execute injected code and raises on stray quotes or
                # backslashes -- should be replaced with a safe decoder.
                print("用户名是:" + eval('u"' + known.username + '"'))
        content = remove_emoji(raw_content)  # strip emoji and newlines from the comment
        # SECURITY NOTE(review): same eval() concern as above.
        print("评论内容是:" + eval('u"' + content + '"'))
        print("\n")

    # The id of the last comment on a full page is the cursor for the next
    # page; a short page means there is nothing more to fetch.
    if len(idlist) < page_size:
        break
    comid = idlist[page_size - 1]


结果展示



内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息