
Learning Python web scraping, day 15

2017-08-08 21:15
Today I started by redoing yesterday's exercise 3 from scratch. It still doesn't come out fluently, but it went much better than before.

Today's topic is crawling an entire site: starting from one page and following its links step by step until every page has been collected, building up a site map along the way.

Yesterday's exercise 3

from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import datetime
import random

def randomUrl(articleUrl):
    # Fetch a Wikipedia article and return every in-article link:
    # paths that start with /wiki/ and contain no colon.
    url = 'http://en.wikipedia.org' + articleUrl
    html = urlopen(url)
    bsObj = BeautifulSoup(html, 'html.parser')
    return bsObj.find('div', {'id': 'bodyContent'}).findAll(
        'a', href=re.compile('^(/wiki/)((?!:).)*$'))

# random.seed() wants an int/float/str in Python 3, so seed with a timestamp.
random.seed(datetime.datetime.now().timestamp())
newlinks = randomUrl('/wiki/Kevin_Bacon')
while len(newlinks) > 0:
    # Hop to a randomly chosen link and continue the walk from there.
    link = newlinks[random.randint(0, len(newlinks) - 1)].attrs['href']
    print(link)
    newlinks = randomUrl(link)
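
One quirk of the random walk: the while loop only ends when it lands on a page with no qualifying links, which on Wikipedia essentially never happens. A minimal sketch of a capped walk, reusing randomUrl and the imports from the block above (MAX_STEPS is a parameter I made up for illustration):

# Hypothetical cap so a test run terminates; not part of the original exercise.
MAX_STEPS = 10

steps = 0
newlinks = randomUrl('/wiki/Kevin_Bacon')
while len(newlinks) > 0 and steps < MAX_STEPS:
    link = newlinks[random.randint(0, len(newlinks) - 1)].attrs['href']
    print(link)
    newlinks = randomUrl(link)
    steps += 1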


Exercise 1: a web data collection example (collecting every Wikipedia link, one by one)

from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

datas = set()

def getlinks(linkUrl):
    # Collect every /wiki/ link reachable from linkUrl, recursing into
    # each page we have not seen before. Note: deep recursion like this
    # hits Python's default recursion limit on a site this large.
    global datas
    url = 'http://en.wikipedia.org' + linkUrl
    html = urlopen(url)
    bsObj = BeautifulSoup(html, 'html.parser')
    for link in bsObj.findAll('a', href=re.compile('^(/wiki/)')):
        if 'href' in link.attrs:
            if link.attrs['href'] not in datas:
                newlink = link.attrs['href']
                print(newlink)
                datas.add(newlink)
                getlinks(newlink)
            else:
                print('Already seen this page!')

getlinks('')
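
Because getlinks calls itself once for every new page it finds, a site as large as Wikipedia will hit Python's default recursion limit (around 1000 frames) long before the crawl finishes. A minimal iterative rewrite with an explicit queue sidesteps that; this is my own variant, not from the book:

from collections import deque
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

def crawl(startUrl=''):
    # Breadth-first crawl: take a page off the queue, record its new
    # /wiki/ links, and queue each of them for a later visit.
    seen = set()
    queue = deque([startUrl])
    while queue:
        pageUrl = queue.popleft()
        html = urlopen('http://en.wikipedia.org' + pageUrl)
        bsObj = BeautifulSoup(html, 'html.parser')
        for link in bsObj.findAll('a', href=re.compile('^(/wiki/)')):
            href = link.attrs.get('href')
            if href is not None and href not in seen:
                print(href)
                seen.add(href)
                queue.append(href)
    return seen

crawl('')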


Exercise 2: collecting data across an entire site

from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

datas = set()

def getLinks(pageUrl):
    global datas
    url = 'http://en.wikipedia.org' + pageUrl
    html = urlopen(url)
    bsObj = BeautifulSoup(html, 'html.parser')
    try:
        # Print the page title, its first paragraph, and the edit link.
        print(bsObj.h1.get_text())
        print(bsObj.find(id='mw-content-text').findAll('p')[0])
        print(bsObj.find(id='ca-edit').find('span').find('a').attrs['href'])
    except AttributeError:
        print('This page is missing some of the data we wanted')
    for link in bsObj.findAll('a', href=re.compile('^(/wiki/)')):
        if 'href' in link.attrs:
            if link.attrs['href'] not in datas:
                newlink = link.attrs['href']
                print(newlink)
                print('---------------------')
                datas.add(newlink)
                getLinks(newlink)
            else:
                print('Been here before!')

getLinks('')
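
Since the goal of today's topic is a site map, it seems natural to dump the collected set to disk once the crawl stops. A minimal sketch (sitemap.txt is just a file name I picked):

# Hypothetical output step: write each collected path as one line of a
# plain-text site map, sorted so different runs are easy to compare.
with open('sitemap.txt', 'w') as f:
    for path in sorted(datas):
        f.write('http://en.wikipedia.org' + path + '\n')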


That's it for today: only two exercises, but typing them out without looking at the book was far more challenging than the earlier ones. Finishing them showed me I still have plenty of gaps. Keep going!
Tags: python