您的位置:首页 > 编程语言 > Python开发

Python使用BeautifulSoup进行爬虫

2017-07-21 09:29 399 查看
import urllib.request
import re
from bs4 import BeautifulSoup
def getHtml(url, encoding='utf-8'):
    """Download *url* and return the response body as text.

    Args:
        url: Address passed straight to ``urllib.request.urlopen``.
        encoding: Codec used to decode the raw bytes. Defaults to
            ``'utf-8'``, which is what the original code hard-coded.

    Returns:
        The decoded response body as ``str``.

    Raises:
        urllib.error.URLError: Propagated from ``urlopen`` on failure.
        UnicodeDecodeError: If the body is not valid *encoding*.
    """
    # The response object is a context manager; leaving the block closes
    # the underlying connection even if read()/decode() raises.
    with urllib.request.urlopen(url) as page:
        return page.read().decode(encoding)
def getyema(url):
    """Print and return the pagination links found in an HTML string.

    A pagination link is an ``<a href="...">`` whose visible text is a
    number and whose target ends with ``html``.

    Args:
        url: Despite the name, this is the HTML markup to scan (it is fed
            directly to the regular expression), not an address.

    Returns:
        The list of matching href values, in document order.
    """
    # ``\d+`` rather than ``\d`` so that page labels of 10 and above are
    # recognised as well as single-digit ones.
    page_link_re = re.compile(r'<a href="(.*?)">\d+</a>')
    page_links = [
        href for href in page_link_re.findall(url) if href.endswith('html')
    ]
    for href in page_links:
        print(href)
    return page_links
def getgaishu(html):
    """Print and return the text of every ``<div class="detailc">``.

    Args:
        html: HTML markup to parse.

    Returns:
        A list with the ``get_text()`` result of each matching div, in
        document order. Empty when no such div exists.
    """
    soup = BeautifulSoup(html, "html.parser")
    texts = [div.get_text() for div in soup.find_all('div', class_='detailc')]
    for text in texts:
        print(text)
    return texts

def getXiangguanjibing(html):
    """Print and return ``<dt>`` link texts that contain Chinese characters.

    Scans *html* for ``<dt><a href=... target="_blank" title=...>TEXT</a></dt>``
    entries and keeps only those whose TEXT has at least one character in
    the range U+4E00..U+9FA5.

    Args:
        html: HTML markup to scan.

    Returns:
        The list of retained link texts, in document order.
    """
    entry_re = re.compile(
        r'<dt><a href=".*?" target="_blank" title=".*?">(.*?)</a></dt>'
    )
    # Used to discard entries with no Chinese characters in them.
    chinese_re = re.compile('[\u4e00-\u9fa5]+')
    names = [text for text in entry_re.findall(html) if chinese_re.search(text)]
    for name in names:
        print(name)
    return names
def getbansuizhengzhuang(html):
    """Print and return the link texts of ``/symptom/detail/`` list items.

    Matches ``<li><a href="/symptom/detail/..." target="_blank"
    title="...">TEXT</a></li>`` and collects TEXT.

    Args:
        html: HTML markup to scan.

    Returns:
        The list of captured link texts, in document order.
    """
    item_re = re.compile(
        r'<li><a href="/symptom/detail/.*?" target="_blank" title=".*?">(.*?)</a></li>'
    )
    symptoms = item_re.findall(html)
    for symptom in symptoms:
        print(symptom)
    return symptoms
def getallgaishu(html):
    """Print and return ``(href, text)`` pairs for ``<li ><a ...>`` items.

    The pattern requires the literal ``<li >`` (with a space before ``>``)
    followed directly by an anchor that has only an ``href`` attribute.

    Args:
        html: HTML markup to scan.

    Returns:
        A list of ``(href, link_text)`` tuples, in document order.
    """
    item_re = re.compile(r'<li ><a href="(.*?)">(.*?)</a></li>')
    entries = item_re.findall(html)
    for entry in entries:
        # Each entry is a 2-tuple; it is printed in tuple form.
        print(entry)
    return entries
def getalltxt(html):
    """Print and return the content of ``<meta name="description">`` tags.

    Only tags written exactly as ``<meta name="description"
    content="..."/>`` are matched.

    Args:
        html: HTML markup to scan.

    Returns:
        The list of captured ``content`` values, in document order.
    """
    description_re = re.compile(r'<meta name="description" content="(.*?)"/>')
    descriptions = description_re.findall(html)
    for description in descriptions:
        print(description)
    return descriptions
def getzhengzhuangqiyin(html):
    """Print and return the inner markup of ``<div id="art_content" ...>``.

    The capture stops at the first ``</div>`` after the opening tag, so a
    nested div inside the article truncates the result.

    Args:
        html: HTML markup to scan.

    Returns:
        The list of captured inner-markup strings, in document order.
    """
    # DOTALL lets ``.`` cross line breaks; without it a div whose content
    # spans several lines would never match.
    content_re = re.compile(r'<div id="art_content" .*?>(.*?)</div>', re.DOTALL)
    sections = content_re.findall(html)
    for section in sections:
        print(section)
    return sections
def GetBlogPage(html):
    """Print and return the text of each ``<p>`` inside ``div#art_content``.

    Args:
        html: HTML markup to parse.

    Returns:
        A list with the ``get_text()`` result of every ``<p>`` descendant
        of every ``<div id="art_content">``, in document order.
    """
    soup = BeautifulSoup(html, "html.parser")
    paragraphs = []
    for container in soup.find_all('div', id='art_content'):
        # Read each paragraph directly. The earlier version looked for a
        # <p> nested inside each <p>, which yields nothing for ordinary
        # sibling paragraphs.
        for paragraph in container.find_all('p'):
            text = paragraph.get_text()
            print(text)
            paragraphs.append(text)
    return paragraphs
def onlygetbansuizhengzhuang(html, zonghtml=''):
    """Inspect the second ``ul.catalog02.z_catalog02e`` list on the page.

    For that list, every anchor found in a ``ul.z_border01_head`` under
    the list's parent element is printed, followed by its ``href``.

    Args:
        html: HTML markup to parse.
        zonghtml: Prefix prepended to each header-link href to build a
            full address (presumably the site root -- confirm with the
            caller). Defaults to an empty string so the function can be
            called with the markup alone.

    Returns:
        A ``(names, more_urls)`` tuple: ``names`` holds the text of each
        anchor inside the second list, ``more_urls`` holds
        ``zonghtml + href`` for each printed header link that has an
        href. Both are empty when the page has fewer than two such lists.
    """
    soup = BeautifulSoup(html, "html.parser")
    catalogs = soup.find_all('ul', class_='catalog02 z_catalog02e')
    if len(catalogs) < 2:
        return [], []

    # Only the second matching list is of interest.
    target = catalogs[1]

    more_urls = []
    for head in target.parent.find_all('ul', class_='z_border01_head'):
        for anchor in head.find_all('a'):
            href = anchor.get('href')
            print(anchor)
            print(href)
            # An anchor without href yields None; skip it instead of
            # failing on str + None.
            if href is not None:
                more_urls.append(zonghtml + href)

    names = [anchor.get_text() for anchor in target.find_all('a')]
    return names, more_urls
if __name__ == "__main__":
    # Site root, passed as the link prefix expected by
    # onlygetbansuizhengzhuang(); the call previously omitted it.
    base_url = 'http://zzk.fh21.com.cn'
    page_html = getHtml(base_url + '/symptom/detail/1.html')
    onlygetbansuizhengzhuang(page_html, base_url)
内容来自用户分享和网络整理,不保证内容的准确性;如有侵权内容,可联系管理员处理。点击这里给我发消息
标签: 
相关文章推荐