您的位置:首页 > 编程语言 > Python开发

【爬虫】获取 GitHub 仓库提交记录历史的脚本(Python)

2020-12-12 16:22 85 查看

本脚本为第一版,后续会继续扩展。

#! python3

import requests
import time, datetime
import json
from colorama import Fore,Back,Style,init
from bs4 import BeautifulSoup

# Module-level flags.
# process: read by commits() to decide whether per-commit details are printed;
#          main() overwrites it from the user's Y/N answer (1 = print details).
process = 0
# output: not referenced anywhere else in the visible source.
output  = 0

def req(type, addr, data='', **args):
    """Send an HTTP GET or POST request and return the response.

    Args:
        type: HTTP method name, either 'get' or 'post'. The parameter name
            shadows the builtin but is kept so existing callers keep working.
        addr: URL to request.
        data: Body sent with POST requests; not used for GET.
        **args: Extra keyword arguments forwarded to ``requests.get`` /
            ``requests.post``. A timeout of 50 seconds is applied unless the
            caller supplies its own ``timeout``.

    Returns:
        The response object returned by ``requests``, or ``None`` when the
        request failed with a ``requests.exceptions.RequestException``.

    Raises:
        ValueError: If ``type`` is neither 'get' nor 'post'.
    """
    # Reject unknown methods up front instead of falling through to an
    # unbound local at the return statement.
    if type not in ('get', 'post'):
        raise ValueError("unsupported request type: %r" % (type,))

    args.setdefault('timeout', 50)
    try:
        if type == 'get':
            return requests.get(addr, **args)
        return requests.post(addr, data=data, **args)
    except requests.exceptions.RequestException as e:
        # Report the failure and hand back None so callers can tell a failed
        # request apart from a real response.
        print(Fore.BLACK + Back.RED + "[ERROR]   " + "%s [request failed] %s" % (str(addr), e))
        return None

def access(url_addr):
    """Probe each repository URL and crawl the commits of reachable ones.

    Args:
        url_addr: Iterable of dicts. Each dict must have a 'git_addr' key
            holding the repository URL; the dict is passed on to commits().

    Side effects:
        For a reachable repository (HTTP 200), 'git_addr' is rewritten to the
        repository's '/commits/' URL before commits() is called.
        For an unreachable one, 'code' is set to the HTTP status code, or to
        None when no response was obtained at all.
    """
    for repo in url_addr:
        repo_url = repo['git_addr']
        print(repo_url)
        responses = req('get', repo_url)

        # No response object at all: record it and move on to the next repo.
        if responses is None:
            print(Fore.BLACK + Back.RED + "[ERROR]   " + "%s [status] %s" % (str(repo_url), str(None)))
            repo['code'] = None
            continue

        status = responses.status_code
        if status != 200:
            print(Fore.BLACK + Back.RED + "[ERROR]   " + "%s [status] %s" % (str(repo_url), str(status)))
            repo['code'] = status
            continue

        print("[SUCCESS]" + " %s [status] %s" % (str(repo_url), str(status)))
        # Strip a trailing slash first so the result never contains '//commits/'.
        repo['git_addr'] = repo_url.rstrip('/') + '/commits/'
        commits(repo)

def _parse_commit_item(item):
    """Build the commit dict for one <li> element of a commit list.

    Args:
        item: A BeautifulSoup tag for a single commit entry.

    Returns:
        A dict with the keys 'commits_auth', 'commits_time', 'commits_href',
        'commits_summary' and 'commits_description'.
    """
    link = item.div.p.a
    label = link['aria-label']
    # The aria-label carries summary and description fused together; the
    # link's visible text is used as the summary, so its length is the split
    # point between the two.
    split_at = len(link.get_text())
    author = (
        item.div.find('div', class_='d-flex')
        .find('div', class_='f6')
        .find(class_='commit-author')
        .get_text()
    )
    return {
        'commits_auth': author,
        'commits_time': item.find('relative-time')['datetime'],
        'commits_href': "https://github.com" + link['href'],
        'commits_summary': label[:split_at],
        'commits_description': label[split_at:].strip(),
    }


def commits(addr):
    """Crawl the commit-history pages of one repository and print a summary.

    Starting at ``addr['git_addr']``, each page is fetched and parsed, and the
    number of commits per date is printed. When the module-level ``process``
    flag is truthy, the details of every commit are printed as well. Pages are
    followed through the "Older" pagination link until none is left.

    Args:
        addr: Dict with at least 'git_addr' (URL of the first commits page)
            and 'username' (label used in the printed messages).

    Side effects:
        'git_addr' is updated to the URL of each following page.
        'code' is set to the HTTP status code when a page does not return 200,
        or to None when no response was obtained.
    """
    from urllib.parse import urljoin

    # Pages are walked in a loop rather than by recursion, so long histories
    # cannot hit the recursion limit.
    while True:
        url = addr['git_addr']
        responses = req('get', url)
        if responses is None:
            print(Fore.BLACK + Back.RED + "[ERROR]   " + "%s [status] %s" % (str(url), str(None)))
            addr['code'] = None
            return
        if responses.status_code != 200:
            # A non-200 answer is a failure and is labelled as one.
            print(Fore.BLACK + Back.RED + "[ERROR]   " + "%s [status] %s" % (str(url), str(responses.status_code)))
            addr['code'] = responses.status_code
            return

        text = BeautifulSoup(responses.text, "html.parser")

        # Detect an empty repository. The check must run against the page's
        # text; `in` on the soup object itself only tests its child nodes.
        if "This repository is empty." in text.get_text():
            print(Fore.RED + Back.WHITE + "%s 的仓库内容爬取过程中发现告警[This repository is empty.]" % (addr['username']))
            return

        try:
            for day_group in text.find_all(class_='TimelineItem-body'):
                # Drop the first 11 characters of the heading; the remainder
                # is expected to be an English date such as "Nov 26, 2020"
                # (presumably the prefix is "Commits on " - confirm against
                # the live markup).
                date_bar = day_group.find(class_='text-normal').get_text()[11:]
                date = datetime.datetime.strptime(date_bar, '%b %d, %Y').strftime('%Y年%m月%d日')
                if process:
                    print("\n=================[%s]=================" % (str(date)))

                commits_second = 0  # number of commits found for this date
                for item in day_group.ol.find_all('li'):
                    commits_dict = _parse_commit_item(item)
                    commits_second += 1
                    if process:
                        print("\n-----------------[%s]-----------------" % (commits_dict['commits_auth']))
                        print("[提交时间] %s \n[提交代码] %s\n[提交主题] %s\n[提交描述] %s"
                              % (commits_dict['commits_time'], commits_dict['commits_href'],
                                 commits_dict['commits_summary'], commits_dict['commits_description']))
                print(Fore.BLACK + Back.WHITE + "%s 于 %s 共计提交了 %s 次代码" % (addr['username'], date, commits_second))

            # Pagination: a page without a pagination container simply has no
            # further pages; that is not an error.
            pager = text.find(class_='paginate-container')
            next_a = pager.find_all('a') if pager is not None else []
            if not (next_a and next_a[-1].get_text() == 'Older'):
                return
            # urljoin leaves an absolute href untouched and resolves a
            # relative one against the current page URL.
            next_url = urljoin(url, next_a[-1]['href'])
        except Exception as e:
            # Best-effort crawl: report the problem (including its cause) and
            # stop processing this repository.
            print(Fore.RED + Back.WHITE + "%s 的仓库爬取过程中发生错误." % (addr['username']))
            print("[ERROR]   %r" % (e,))
            return

        if next_url == url:
            # Guard against a pagination link that points back at this page.
            return
        print("------next page------")
        addr['git_addr'] = next_url

def main():
    """Interactive entry point: check the key, ask for verbosity, run the crawl.

    Prompts for the crawler key and exits when it does not match. Then asks
    whether commit details should be printed (stored in the module-level
    ``process`` flag), crawls the configured repositories via access(), and
    exits the interpreter after a final prompt.

    Raises:
        SystemExit: Always, either after a wrong key or after the crawl.
    """
    # sys.exit is used instead of the `exit` helper, which is injected by the
    # `site` module and is not guaranteed to exist in every runtime.
    import sys

    global process

    url_addr = [
        {
            'username': 'X1',
            'git_addr': 'https://github.com/litbird0/elevator',  # project URL
            'start': '',
            'commins': [],
        },
        {
            'username': 'X2',
            'git_addr': 'https://github.com/1564820398/cjwc_dianti',
            'start': '',
            'commins': [],
        },
    ]

    expected_key = "mirror"
    if input("请输入爬虫Key:") != expected_key:
        print(Fore.RED + Back.WHITE + "Key错误!")
        # Presumably a pause so the message can be read before the process ends.
        time.sleep(10)
        sys.exit()

    process = 1 if input("是否爬取commits细节(Y/N):").upper() == "Y" else 0

    access(url_addr)
    print("[OK] 爬行结束 ...")
    # The original exits for either answer; the prompt only waits for Enter.
    input("是否关闭当前窗口(Y/N):")
    sys.exit()

if __name__ == "__main__":
    # Run only when executed as a script, not when imported.
    # colorama is initialised with autoreset=True before any coloured output
    # (per colorama's documentation this resets the style after each print).
    init(autoreset=True)
    main()
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: