您的位置:首页 > 编程语言 > Python开发

Python小程序:用广度优先搜索算法查询两个url之间的最短路径

2016-11-28 01:53 597 查看
本文源自人工智能课程某次作业中的一道题。



本人Python小白啊,真的要好好努力了。话不多说,直接上代码。

#encoding:UTF-8
import time
from lxml import etree
import urllib2
import Queue
import requests
requests.packages.urllib3.disable_warnings()

def findUrl(url):
    """Return the href values of all anchor tags found on the page at *url*.

    The page is fetched with a short timeout and parsed as HTML. This is a
    best-effort helper for the crawler: when the request fails or the body
    cannot be parsed, an empty list is returned instead of raising.

    Args:
        url: Address of the page to fetch.

    Returns:
        A list with the non-empty ``href`` attribute of every ``<a>`` element,
        in document order. Empty when the page is unreachable, unparsable or
        contains no usable links.
    """
    try:
        res = requests.get(url, timeout=0.6)  # response of the GET request
        html = etree.HTML(res.text)           # parse the decoded body
    except (requests.RequestException, etree.LxmlError, ValueError):
        # Network problems, invalid URLs and malformed documents are expected
        # while crawling arbitrary links; treat the page as having no links.
        return []

    if html is None:
        # The parser can yield no tree at all (e.g. for an empty body).
        return []

    links = []
    for anchor in html.findall('.//a'):       # every <a> element
        href = anchor.get('href')
        if href:                              # skip anchors without an href
            links.append(href)
    return links

def printPath(parents, startUrl, targetUrl):
    """Print the chain of URLs leading from *startUrl* to *targetUrl*.

    The chain is rebuilt by following ``parents`` backwards from the target.
    The walk stops at *startUrl* or at an entry whose parent is falsy
    (e.g. ``None``), whichever comes first.

    Args:
        parents: Mapping of each URL to the URL it was discovered from.
        startUrl: URL where the search began.
        targetUrl: URL whose path should be printed.

    Returns:
        None. The header line is always printed; the path itself is printed
        only when the chain is intact. If a link in the chain is missing from
        ``parents`` or the chain loops back on itself, nothing more is printed.
    """
    print("\nThe path from %s to %s: " % (startUrl, targetUrl))

    path = []
    seen = set()
    node = targetUrl
    while node:
        # A missing entry or a revisited URL means the map is broken; stop
        # rather than raise or loop forever.
        if node in seen or node not in parents:
            return
        seen.add(node)
        path.append(node)
        if node == startUrl:
            # Reached the origin; do not follow any parent recorded for it.
            break
        node = parents[node]

    path.reverse()                            # collected target-first
    print("\n-> ".join(path))

def search(startUrl, targetUrl):
    """Breadth-first search for a shortest link path between two URLs.

    Starting from *startUrl*, pages are fetched level by level via
    ``findUrl``. As soon as *targetUrl* is discovered, a success message and
    the path (via ``printPath``) are printed and the function returns. If the
    frontier is exhausted without finding the target, it returns silently.

    Args:
        startUrl: URL where the crawl begins.
        targetUrl: URL to look for; compared to hrefs by exact string equality.

    Returns:
        None. All results are reported on standard output.
    """
    # Function-scope import keeps this block self-contained.
    from collections import deque

    parents = {startUrl: None}           # URL -> URL it was first found on
    if startUrl == targetUrl:
        # Trivial case: nothing to crawl.
        print('find %s successfully\n' % targetUrl)
        printPath(parents, startUrl, targetUrl)
        return

    visited = set([startUrl])            # URLs already discovered
    pending = deque([startUrl])          # FIFO frontier of pages to expand

    while pending:
        currentUrl = pending.popleft()
        try:
            print('search in %s ...' % currentUrl)

            # findUrl may return nothing for an unreachable page.
            for url in findUrl(currentUrl) or ():
                if not url or url in visited:
                    continue
                # Record the parent only on first discovery: BFS reaches a
                # URL first along a shortest path, and overwriting the entry
                # later could introduce cycles into the parent map.
                visited.add(url)
                parents[url] = currentUrl
                if url == targetUrl:
                    print('find %s successfully\n' % targetUrl)
                    printPath(parents, startUrl, targetUrl)
                    return
                pending.append(url)
        except Exception as e:
            # Best-effort crawl: one misbehaving page must not abort the
            # whole search, but report it instead of hiding the failure.
            print('skip %r: %r' % (currentUrl, e))

if __name__ == '__main__':
    # Script entry point: run one fixed search and report the wall-clock
    # time it took.
    startTime = time.time()
    search('http://helpdesk.sysu.edu.cn/', 'http://tv.sysu.edu.cn/')
    elapsed = time.time() - startTime
    print("\nCost time: %f s" % elapsed)


查找过程如下:



查询结果如下:



程序提示成功找到目标 URL,打印出最短路径,并显示查询耗时。
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: