您的位置:首页 > 编程语言 > Python开发

python获取自己发的说说内容

2017-08-06 19:59 204 查看
  一、模拟登录

import re
from selenium import webdriver
from time import sleep
from PIL import Image

#定义QQ空间登录函数
def getGTK(cookie):
    """Compute the g_tk request token from the login cookies.

    Starts from 5381 and, for every character of ``cookie['p_skey']``, adds
    ``(hashes << 5) + ord(character)`` to the running value. The final value
    is masked to its low 31 bits.

    Args:
        cookie: Mapping of cookie names to values; must contain ``'p_skey'``.

    Returns:
        The hash as a non-negative int smaller than 2**31.

    Raises:
        KeyError: If ``'p_skey'`` is not present in ``cookie``.
    """
    hashes = 5381
    for letter in cookie['p_skey']:
        hashes += (hashes << 5) + ord(letter)
    return hashes & 0x7fffffff


def QR_login():
    """Log in to QZone through the QR code and collect the session data.

    Opens https://qzone.qq.com/ in PhantomJS, saves a screenshot to
    ``QZone.png`` and displays it so the QR code can be scanned with a phone,
    then reads the cookies and the page source of the resulting page.

    Returns:
        A tuple ``(cookie, gtk, g_qzonetoken)`` where ``cookie`` is a dict of
        cookie names to values, ``gtk`` is the value computed by ``getGTK``
        and ``g_qzonetoken`` is the expression captured from the page source.

    Raises:
        RuntimeError: If the page source does not contain ``g_qzonetoken``.
        KeyError: If the cookies do not contain ``p_skey``.
    """
    # Raw string so the backslashes of the Windows path are taken literally.
    # Point this at your own phantomjs executable.
    browser = webdriver.PhantomJS(executable_path=r"D:\python\phantomjs.exe")
    try:
        browser.get("https://qzone.qq.com/")  # QQ login page
        browser.maximize_window()
        sleep(3)  # give the page three seconds to render
        browser.get_screenshot_as_file('QZone.png')
        Image.open('QZone.png').show()  # scan this QR code with the phone
        sleep(10)  # wait ten seconds for the scan; tune to your network speed
        print(browser.title)
        cookie = {elem['name']: elem['value'] for elem in browser.get_cookies()}
        print('Get the cookie of QQlogin successfully!(共%d个键值对)' % (len(cookie)))
        html = browser.page_source
        print(html)
    finally:
        # Always release the browser process, even when a step above fails.
        browser.quit()

    g_qzonetoken = re.search(
        r'window\.g_qzonetoken = \(function\(\)\{ try\{return (.*?);\} catch\(e\)',
        html)
    if g_qzonetoken is None:
        raise RuntimeError('g_qzonetoken was not found in the page source')
    gtk = getGTK(cookie)
    return (cookie, gtk, g_qzonetoken.group(1))


if __name__ == "__main__":
    QR_login()


 二、评论获取

import requests
import re
import datetime
from time import sleep
from urllib import parse

def comment(my_qq, target_qq, topicid, content, gtk, qzonetoken, cookie):
    """Post one comment on a QZone mood and record the new comment id.

    Sends a form-encoded POST to the ``emotion_cgi_addcomment_ugc`` endpoint
    through the module-level requests session ``s``. When the reply contains
    a comment id, a line ``"<topicid>  <commentid>"`` is appended to
    ``target_qq.txt``.

    Args:
        my_qq: QQ number sent as ``uin``, ``commentUin`` and ``hostuin``.
        target_qq: QQ number sent as ``hostUin``.
        topicid: Value sent as ``topicId``.
        content: Text of the comment.
        gtk: Value sent as the ``g_tk`` query parameter.
        qzonetoken: Value sent as the ``qzonetoken`` query parameter.
        cookie: Cookies passed to the request.

    Returns:
        True if a comment id was found in the reply, otherwise False.
    """
    data = {
        'qzreferrer': 'https://qzs.qq.com/qzone/app/mood_v6/html/index.html#mood&uin=790178228&pfid=2&qz_ver=8&appcanvas=0&qz_style=35&params=&entertime=1498019616488&canvastype=&cdn_use_https=1',
        'uin': my_qq,
        'hostUin': target_qq,
        'topicId': topicid,
        'commentUin': my_qq,
        'content': content,
        'richval': '',
        'richtype': '',
        'inCharset': '',
        'outCharset': '',
        'ref': '',
        'private': '0',
        'with_fwd': '0',
        'to_tweet': '0',
        'hostuin': my_qq,
        'code_version': '1',
        'format': 'fs',
    }
    comment_data = parse.urlencode(data)
    comment_params = {
        'g_tk': gtk,
        'qzonetoken': qzonetoken,
    }
    comment_headers = {
        'Host': 'h5.qzone.qq.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:53.0) Gecko/20100101 Firefox/53.0',
        'Accept': '*/*',
        'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
        'Accept-Encoding': 'gzip, deflate, br',
        'Connection': 'keep-alive',
        'Content-Type': 'application/x-www-form-urlencoded',
        # Length of the encoded body; urlencode() output is pure ASCII, so
        # the character count equals the byte count.
        'Content-Length': str(len(comment_data)),
        'Upgrade-Insecure-Requests': '1',
    }

    res = s.request(
        'POST',
        'https://h5.qzone.qq.com/proxy/domain/taotao.qzone.qq.com/cgi-bin/emotion_cgi_addcomment_ugc',
        params=comment_params, data=comment_data, headers=comment_headers,
        cookies=cookie)
    print(res.status_code)
    res = res.text
    print(res)

    # A reply that carries an id in front of "postTime" is treated as success.
    commentid = re.findall('"id":(.*?),"postTime"', res)
    if not commentid:
        print('评论失败')
        return False

    with open('target_qq.txt', 'a') as f:
        f.write('%s  %s\n' % (topicid, commentid[0]))
    print('评论成功')
    return True

# Script: page through the mood list of ``target_qq`` and post ``content`` as
# a comment on every mood that is found.

# Browser-like request headers for the mood list endpoint.
headers = {
    'Host': 'h5.qzone.qq.com',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:53.0) Gecko/20100101 Firefox/53.0',
    'Accept': '*/*',
    'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
    'Accept-Encoding': 'gzip, deflate, br',
    'Referer': 'https://user.qzone.qq.com/790178228?_t_=0.22746974226377736',
    'Connection': 'keep-alive'
}

# NOTE(review): these are hard-coded session cookies (skey / p_skey look like
# login credentials). They should be obtained at run time instead of being
# kept in the source -- confirm and replace before reuse.
cookie = {'QZ_FE_WEBP_SUPPORT': '0', 'ptcz': '7cac1c7521b1ad8be9b1489f9b0aaba8efe9500f0f5dcb7693a9f693d37a8dff',
          'fnc': '2', 'skey': '@F6CRfhQVd', 'pgv_si': 's493469696', 'ptui_loginuin': '790178228', 'RK': 'gYFn6+IOYo',
          'pt2gguin': 'o0790178228', 'p_uin': 'o0790178228',
          'rv2': '808A93A64B1A6FC5AE6D906AB5E744B38AF1EAA4163EC57A76', 'ptisp': 'ctc',
          'p_skey': '5Iv6LkqOjJH*JPtrq0xqZmVlBNkbKLCRcDasiGGq71w_', '_qpsvr_localtk': '0.6656868932768703',
          'pgv_pvi': '7208859648', '790178228_todaycount': '4', '__Q_w_s_hat_seed': '1',
          '790178228_totalcount': '24703', 'pgv_pvid': '1698820840', 'qz_screen': '1366x768',
          'pt4_token': 'WeiGzJbrn*TO4HO4FFXRdiD3SpXE2UqW2Litsm-TZPw_', 'pgv_info': 'ssid=s6237051136',
          'uin': 'o0790178228', 'Loading': 'Yes',
          'property20': '9D827FD9F839B247CF95AA1787B450E4D22D6C9F2A76DC8C4D27798667EBB92CA7122514560889AF'}

# Placeholders left empty in the source; fill them in before running.
gtk = ""
qzonetoken = ""
s = requests.session()  # shared session, also used by comment()
my_qq = ""
target_qq = ""
content = '加油!'
cnt = 0  # number of comments posted successfully

for page in range(0, 170):
    pos = page * 20  # 20 moods are requested per page
    params = {
        'uin': target_qq,
        'ftype': '0',
        'sort': '0',
        'pos': pos,
        'num': '20',
        'replynum': '100',
        'g_tk': gtk,
        'callback': '_preloadCallback',
        'code_version': '1',
        'format': 'jsonp',
        'need_private_comment': '1',
        'qzonetoken': qzonetoken
    }

    response = s.request('GET', 'https://h5.qzone.qq.com/proxy/domain/taotao.qq.com/cgi-bin/emotion_cgi_msglist_v6',
                         params=params, headers=headers, cookies=cookie)
    print(response.status_code)

    text = response.text
    # A reply without 'lbs' is taken to mean that no moods are left.
    if not re.search('lbs', text):
        print('全部说说评论完成,共添加评论%s条' % cnt)
        break

    # The piece in front of the first '{"certified"' is not a mood record.
    for record in re.split(r'\{"certified"', text)[1:]:
        # Drop the comment list so the tid of the mood itself is matched.
        record = re.sub('"commentlist":.*?"conlist":', '', record)
        tids = re.findall('"t1_termtype":.*?"tid":"(.*?)"', record)
        if not tids:
            continue  # nothing to comment on without a mood id
        topicid = target_qq + '_' + str(tids[0])
        print(topicid)
        counts = comment(my_qq=my_qq, target_qq=target_qq, content=content, topicid=topicid, gtk=gtk,
                         qzonetoken=qzonetoken, cookie=cookie)
        sleep(180)  # pause three minutes between comments
        if counts:
            cnt = cnt + 1


 二、数据抓取存入数据库

import requests
import re
import datetime
import pymysql
import csv
from qq_mood.qq import QRlogin

def _first_match(pattern, text, default=None):
    """Return the first captured group of ``pattern`` in ``text``, or ``default``."""
    found = re.search(pattern, text)
    return found.group(1) if found else default


def parse_mood(i, owner_qq=None):
    """Extract the wanted fields from one mood record of the JSONP reply.

    Args:
        i: Raw text of a single mood record.
        owner_qq: QQ number used as prefix of the mood id. When omitted, the
            module-level variable ``qq`` is used.

    Returns:
        A dict with the keys ``id``, ``Mood_cont``, ``isTransfered``,
        ``name``, ``time``, ``date``, ``tool``, ``pos_x``, ``pos_y``,
        ``idneme`` and ``cmtnum``. Fields that are absent from the record are
        ``None`` (``Mood_cont`` is ``""``, ``pos_x`` / ``pos_y`` are ``0``).
        Returns ``None`` when the record is empty or carries no mood id.
    """
    if owner_qq is None:
        owner_qq = qq  # module-level loop variable of the crawling script

    # Drop the comment list so only fields of the mood itself are matched.
    text = re.sub(r'"commentlist":.*?"conlist":', '', i)
    if not text:
        return None

    tid = _first_match(r'"t1_termtype":.*?"tid":"(.*?)"', text)
    if tid is None:
        return None  # no usable id for this record

    myMood = {
        'id': str(owner_qq) + '_' + tid,
        'isTransfered': False,
        'Mood_cont': "",
        'pos_x': 0,
        'pos_y': 0,
        'time': None,
        'date': None,
    }
    myMood['name'] = _first_match(r'},"name":"(.*?)",', text)

    mood_cont = re.findall(r'\],"content":"(.*?)"', text)
    if len(mood_cont) == 2:
        # Two content fields are treated as a forwarded mood.
        myMood["Mood_cont"] = "评语:" + mood_cont[0] + "--------->转载内容:" + mood_cont[1]
        myMood["isTransfered"] = True
    elif len(mood_cont) == 1:
        myMood["Mood_cont"] = mood_cont[0]

    created_time = _first_match(r'"created_time":(\d+)', text)
    if created_time is not None:
        published = datetime.datetime.fromtimestamp(int(created_time))
        myMood['date'] = published.strftime("%Y-%m-%d")
        myMood['time'] = published.strftime("%H:%M:%S")

    # Name of the tool the mood was published with (e.g. a phone model).
    myMood['tool'] = _first_match(r'"source_name":"(.*?)"', text)

    pos_x = _first_match(r'"pos_x":"(.*?)"', text)
    if pos_x is not None:
        pos_y = _first_match(r'"pos_y":"(.*?)"', text)
        # Empty coordinates keep the default of 0.
        if pos_x:
            myMood['pos_x'] = pos_x
        if pos_y:
            myMood['pos_y'] = pos_y

    # The key spelling 'idneme' is what the insert statement of the crawler reads.
    myMood['idneme'] = _first_match(r'"idname":"(.*?)"', text)
    myMood['cmtnum'] = _first_match(r'"cmtnum":(.*?),', text)
    return myMood
# Script: read QQ numbers from qq.csv, download up to 10 pages of moods for
# each of them and store the parsed fields in the MySQL table ``mood``.

# Take the QQ numbers from the fourth column of qq.csv. The first row is
# skipped and the last 7 characters of every value are cut off (presumably a
# mail suffix such as "@qq.com" -- confirm against the exported file).
friends = []
with open('qq.csv', newline='') as csv_file:
    csv_reader = csv.reader(csv_file)
    next(csv_reader, None)  # skip the first row
    for row in csv_reader:
        if len(row) > 3 and row[3][:-7]:
            friends.append(row[3][:-7])

# Browser-like request headers.
headers = {
    'Host': 'h5.qzone.qq.com',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:53.0) Gecko/20100101 Firefox/53.0',
    'Accept': '*/*',
    'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
    'Accept-Encoding': 'gzip, deflate, br',
    'Referer': 'https://user.qzone.qq.com/790178228?_t_=0.22746974226377736',
    'Connection': 'keep-alive'
}

# Keyword arguments: current PyMySQL releases no longer accept positional ones.
# NOTE(review): MySQL "utf8" stores at most 3 bytes per character; moods with
# emoji may be rejected -- confirm whether utf8mb4 is needed.
conn = pymysql.connect(host='localhost', user='root', password='root', database='test',
                       charset="utf8", use_unicode=True)
cursor = conn.cursor()

# NOTE(review): the column names pox_x / pox_y are kept exactly as written;
# confirm they match the table definition.
insert_sql = '''
    insert into mood(id,content,time,sitename,pox_x,pox_y,tool,comments_num,date,isTransfered,name)
    VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
'''

try:
    # The login function supplies the cookies, g_tk and qzonetoken.
    cookie, gtk, qzonetoken = QRlogin.QR_login()
    s = requests.session()
    for qq in friends:
        for p in range(0, 10):
            pos = p * 20  # 20 moods are requested per page
            params = {
                'uin': qq,
                'ftype': '0',
                'sort': '0',
                'pos': pos,
                'num': '20',
                'replynum': '100',
                'g_tk': gtk,
                'callback': '_preloadCallback',
                'code_version': '1',
                'format': 'jsonp',
                'need_private_comment': '1',
                'qzonetoken': qzonetoken
            }
            response = s.request('GET', 'https://h5.qzone.qq.com/proxy/domain/taotao.qq.com/cgi-bin/emotion_cgi_msglist_v6',
                                 params=params, headers=headers, cookies=cookie)
            print(response.status_code)  # shows whether the request succeeded
            text = response.text
            print(text)
            # A reply without 'lbs' is taken to mean this QQ number is finished.
            if not re.search('lbs', text):
                print('%s说说下载完成' % qq)
                break
            # The piece in front of the first '{"certified"' is not a record.
            for i in re.split(r'\{"certified"', text)[1:]:
                try:
                    myMood = parse_mood(i)
                except (IndexError, KeyError) as exc:
                    print('skipping unparseable mood: %r' % (exc,))
                    continue
                if not myMood:
                    continue
                # One bad row must not stop the crawl, but the failure is
                # reported instead of being discarded silently.
                try:
                    cursor.execute(insert_sql, (
                        myMood.get('id'), myMood.get('Mood_cont'), myMood.get('time'),
                        myMood.get('idneme'), myMood.get('pos_x'), myMood.get('pos_y'),
                        myMood.get('tool'), myMood.get('cmtnum'), myMood.get('date'),
                        myMood.get('isTransfered'), myMood.get('name')))
                    conn.commit()
                except Exception as exc:
                    print('insert failed for %s: %r' % (myMood.get('id'), exc))
finally:
    cursor.close()
    conn.close()

print('说说全部下载完成!')


三、结果展示



引用于: https://zhuanlan.zhihu.com/p/27604277
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签:  python QQ 说说 爬虫