Web Scraping with Python: updated code for Chapter 3 of the book
2016-11-24 22:33
# Chapter 3 code update
```python
# Code taken from the author's GitHub
from urllib.request import urlopen
from urllib.parse import urlparse
from bs4 import BeautifulSoup
import re
import datetime
import random

pages = set()
random.seed(datetime.datetime.now())

# Retrieves a list of all internal links found on a page
def getInternalLinks(bsObj, includeUrl):
    includeUrl = urlparse(includeUrl).scheme+"://"+urlparse(includeUrl).netloc
    internalLinks = []
    # Finds all links that begin with a "/"
    for link in bsObj.findAll("a", href=re.compile("^(/|.*"+includeUrl+")")):
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in internalLinks:
                if(link.attrs['href'].startswith("/")):
                    internalLinks.append(includeUrl+link.attrs['href'])
                else:
                    internalLinks.append(link.attrs['href'])
    return internalLinks

# Retrieves a list of all external links found on a page
def getExternalLinks(bsObj, excludeUrl):
    externalLinks = []
    # Finds all links that start with "http" or "www" that do
    # not contain the current URL
    for link in bsObj.findAll("a", href=re.compile("^(http|www)((?!"+excludeUrl+").)*$")):
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in externalLinks:
                externalLinks.append(link.attrs['href'])
    return externalLinks

def getRandomExternalLink(startingPage):
    html = urlopen(startingPage)
    bsObj = BeautifulSoup(html, "html.parser")
    externalLinks = getExternalLinks(bsObj, urlparse(startingPage).netloc)
    if len(externalLinks) == 0:
        print("No external links, looking around the site for one")
        domain = urlparse(startingPage).scheme+"://"+urlparse(startingPage).netloc
        internalLinks = getInternalLinks(bsObj, domain)
        return getRandomExternalLink(internalLinks[random.randint(0, len(internalLinks)-1)])
    else:
        return externalLinks[random.randint(0, len(externalLinks)-1)]

def followExternalOnly(startingSite):
    externalLink = getRandomExternalLink(startingSite)
    print("Random external link is: "+externalLink)
    followExternalOnly(externalLink)

# Collects a list of all external URLs found on the site
allExtLinks = set()
allIntLinks = set()

def getAllExternalLinks(siteUrl):
    html = urlopen(siteUrl)
    domain = urlparse(siteUrl).scheme+"://"+urlparse(siteUrl).netloc
    bsObj = BeautifulSoup(html, "html.parser")
    internalLinks = getInternalLinks(bsObj, domain)
    externalLinks = getExternalLinks(bsObj, domain)
    for link in externalLinks:
        if link not in allExtLinks:
            allExtLinks.add(link)
            print(link)
    for link in internalLinks:
        if link not in allIntLinks:
            allIntLinks.add(link)
            getAllExternalLinks(link)

# Demo 1: random walk that keeps following external links
# (this call recurses indefinitely, so it never returns)
followExternalOnly("http://oreilly.com")

# Demo 2: crawl the whole site and collect every external link found
# (comment out the followExternalOnly call above to reach this part)
allIntLinks.add("http://oreilly.com")
getAllExternalLinks("http://oreilly.com")
```
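
Both demo calls at the bottom start from `http://oreilly.com` and crawl without any bound. To try the helpers on a single page first, a minimal sketch along the following lines can be used; it assumes the imports and functions above are in the same module, and `listLinksOnPage` with the example URL is only illustrative, not part of the book's code:

```python
from urllib.error import HTTPError, URLError

def listLinksOnPage(pageUrl):
    # Fetch a single page and print its internal and external links
    # without recursing into the rest of the site.
    try:
        html = urlopen(pageUrl)
    except (HTTPError, URLError) as e:
        print("Could not fetch "+pageUrl+": "+str(e))
        return
    bsObj = BeautifulSoup(html, "html.parser")
    domain = urlparse(pageUrl).scheme+"://"+urlparse(pageUrl).netloc
    print("Internal links:")
    for link in getInternalLinks(bsObj, domain):
        print("  "+link)
    print("External links:")
    for link in getExternalLinks(bsObj, urlparse(pageUrl).netloc):
        print("  "+link)

listLinksOnPage("http://oreilly.com")
```

Note the design choice in the updated code: `getInternalLinks` and `getExternalLinks` take an already-parsed `bsObj` rather than fetching the URL themselves, so each page is downloaded once and reused for both link lists.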