【爬虫】获取Github仓库提交记录历史的脚本 python
2020-12-12 16:22
85 查看
本脚本为第一版开发;后续会进行扩展
#! python3
"""Crawler that fetches the commit history of GitHub repositories.

First version; to be extended later.
"""
import requests
import time
import datetime
import json
from colorama import Fore, Back, Style, init
from bs4 import BeautifulSoup

# 1 -> print per-commit details while crawling; set from user input in main().
process = 0
output = 0


def req(type, addr, data='', **args):
    """Issue an HTTP request and return the Response, or None on failure.

    type: 'get' or 'post' (any other value also yields None).
    addr: URL to request.
    data/args: accepted for interface compatibility; currently unused.

    BUG FIX: the original swallowed RequestException and then returned the
    never-assigned local, raising UnboundLocalError. Returning None makes
    the failure explicit; callers must check for it.
    """
    responses = None
    try:
        if type == 'get':
            responses = requests.get(addr, timeout=50)
        elif type == 'post':
            responses = requests.post(addr, timeout=50)
    except requests.exceptions.RequestException:
        pass  # deliberate best-effort: caller handles the None result
    return responses


def access(url_addr):
    """Probe each repository URL in *url_addr*; on HTTP 200 crawl its commits.

    Each entry is a dict with at least 'username' and 'git_addr'; on failure
    a 'code' key is added with the HTTP status (0 when the request itself
    failed).
    """
    for i in url_addr:
        print(i['git_addr'])
        responses = req('get', i['git_addr'])
        if responses is None:
            # Network-level failure (req returned None).
            print(Fore.BLACK + Back.RED + "[ERROR] " + "%s [status] %s" % (str(i['git_addr']), "0"))
            i['code'] = 0
        elif responses.status_code == 200:
            print("[SUCCESS]" + " %s [status] %s" % (str(i['git_addr']), str(responses.status_code)))
            i['git_addr'] = i['git_addr'] + '/commits/'
            commits(i)
        else:
            print(Fore.BLACK + Back.RED + "[ERROR] " + "%s [status] %s" % (str(i['git_addr']), str(responses.status_code)))
            i['code'] = responses.status_code


def commits(addr):
    """Crawl one repository's /commits/ pages, following 'Older' pagination.

    Parses GitHub's commit-list HTML: one TimelineItem-body per day, each
    containing an <ol> of commits. Recurses into the next page when an
    'Older' link is present. Mutates addr['git_addr'] while paginating.
    """
    url = addr['git_addr']
    responses = req('get', url)
    if responses is None or responses.status_code != 200:
        code = responses.status_code if responses is not None else 0
        # BUG FIX: the original printed "[SUCCESS]" on this failure path.
        print("[ERROR] %s [status] %s" % (str(url), str(code)))
        addr['code'] = code
        return
    text = BeautifulSoup(responses.text, "html.parser")
    # Empty repository? Check the raw HTML: substring-in-soup-object is not
    # a reliable text search (original did `in text` on the BeautifulSoup).
    if "This repository is empty." in responses.text:
        # BUG FIX: original was print(print(...)), which also emitted "None".
        print(Fore.RED + Back.WHITE + "%s 的仓库内容爬取过程中发现告警[This repository is empty.]" % (addr['username']))
        return
    all_commits = text.find_all(class_='TimelineItem-body')
    try:
        for texts in all_commits:
            # Header reads like "Commits on Nov 26, 2020" -> keep "Nov 26, 2020".
            dateBar = texts.find(class_='text-normal').get_text()[11:]
            # Convert the English date to the local display format.
            date = datetime.datetime.strptime(dateBar, '%b %d, %Y').strftime('%Y年%m月%d日')
            commits_second = 0
            if process:
                print("\n=================[%s]=================" % (str(date)))
            all_commits_find = texts.ol.find_all('li')
            for commits_find in all_commits_find:
                link = commits_find.div.p.a  # hoisted: used four times below
                summary_len = len(link.get_text())
                commits_dict = {
                    'commits_auth': commits_find.div.find('div', class_='d-flex').find('div', class_='f6').find(class_='commit-author').get_text(),
                    'commits_time': commits_find.find('relative-time')['datetime'],
                    'commits_href': "https://github.com" + link['href'],
                    # aria-label holds summary+description fused together;
                    # the anchor text length tells us where to split.
                    'commits_summary': link['aria-label'][:summary_len],
                    'commits_description': link['aria-label'][summary_len:].strip(),
                }
                commits_second += 1
                if process:
                    print("\n-----------------[%s]-----------------" % (commits_dict['commits_auth']))
                    print("[提交时间] %s \n[提交代码] %s\n[提交主题] %s\n[提交描述] %s" % (commits_dict['commits_time'], commits_dict['commits_href'], commits_dict['commits_summary'], commits_dict['commits_description']))
            print(Fore.BLACK + Back.WHITE + "%s 于 %s 共计提交了 %s 次代码" % (addr['username'], date, commits_second))
        # Pagination: the last anchor labelled 'Older' points at the next page.
        next_a = text.find(class_='paginate-container').find_all('a')
        if len(next_a) and next_a[-1].get_text() == 'Older':
            print("------next page------")
            addr['git_addr'] = next_a[-1]['href']
            commits(addr)
    except Exception:
        # BUG FIX: original was print(print(...)), which also emitted "None".
        print(Fore.RED + Back.WHITE + "%s 的仓库爬取过程中发生错误." % (addr['username']))
        return


def main():
    """Entry point: verify the access key, ask for detail mode, crawl repos."""
    global process
    url_addr = [
        {
            'username': 'X1',
            'git_addr': 'https://github.com/litbird0/elevator',  # project address
            'start': '',
            'commins': [],
        },
        {
            'username': 'X2',
            'git_addr': 'https://github.com/1564820398/cjwc_dianti',
            'start': '',
            'commins': [],
        },
    ]
    Webcrawler_key = "mirror"
    if input("请输入爬虫Key:") != Webcrawler_key:
        print(Fore.RED + Back.WHITE + "Key错误!")
        time.sleep(10)  # linger so the operator can read the message
        exit()
    if input("是否爬取commits细节(Y/N):").upper() == "Y":
        process = 1
    else:
        process = 0
    access(url_addr)
    print("[OK] 爬行结束 ...")
    # Both answers terminated the original script; the prompt only pauses it.
    if input("是否关闭当前窗口(Y/N):").upper() == "Y":
        exit()
    exit()


if __name__ == "__main__":
    init(autoreset=True)  # colorama: auto-reset style after each print
    main()
相关文章推荐
- github将一个远程仓库的某个分支放到一个新的仓库中(提交历史记录也导过去)
- Git基础-获取仓库、提交、查看历史、撤销
- Python爬虫获取POJ某个用户的所有提交状态
- 使用UNIX环境下的bash脚本自动化提交仓库到github
- python爬虫获取11选5彩票历史开奖号码
- 用shell脚本获取一个github项目所有文件的历史信息
- Python爬虫实践:获取空气质量历史数据
- python爬虫教程:编写Python脚本来获取mp3文件tag信息的教程
- python爬虫自动提交HDU并获取AC状态(p3+request+Beatifulsoup)
- Python爬虫使用脚本登录Github并查看信息
- 使用python脚本获取docker私有镜像仓库镜像信息
- <Python 2.7>爬虫获取天气历史按月份存为txt文件
- python爬虫获取编码时中文乱码问题
- python3 获取脚本所在路径的方法
- Python 获取脚本路径以及脚本所在文件夹路径
- Intellij IDEA 提交代码到远程GitHub仓库
- python爬虫获取拉钩网在线搜索招聘信息(超实用!)
- python获取当前脚本文件路径
- 在Linux或mac上配置github提交代码到github仓库
- 「docker实战篇」python的docker爬虫技术-python脚本app抓取(13)