
Python 3 series: crawler parsing

2017-11-30 15:08
import urllib.request
from bs4 import BeautifulSoup

# Fetch the chapter index of a novel on biquge5200.com; the site serves GBK-encoded pages
url = "http://www.biquge5200.com/52_52542/"
req = urllib.request.Request(url)
response = urllib.request.urlopen(req)
data = response.read().decode("gbk")

# Narrow the document to the chapter list (id="list") and collect every link in it
soup = BeautifulSoup(data, "html.parser")
chapters = soup.find(id='list').find_all('a')

# The first few entries are "latest chapter" shortcuts; skip them and print href + title for the rest
for l in range(9, len(chapters)):
    print(chapters[l].get('href'), chapters[l].text)

url="http://www.biquge5200.com/52_52542/150290199.html"
req = urllib.request.Request(url)
response = urllib.request.urlopen(req)
data = response.read()
data=data.decode("gbk")
soup=BeautifulSoup(data)
soup=BeautifulSoup(str(soup.find(id='content')))
print(soup)
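Putting the two snippets together, a minimal sketch that saves every chapter into one text file could look like the following. The GBK decoding, the id='list' and id='content' selectors, and the skip of the first nine links come from the snippets above; the output file name novel.txt is my own choice, and urljoin is used defensively in case the index links are relative rather than absolute.

import urllib.request
from urllib.parse import urljoin
from bs4 import BeautifulSoup

index_url = "http://www.biquge5200.com/52_52542/"
html = urllib.request.urlopen(urllib.request.Request(index_url)).read().decode("gbk")
chapter_links = BeautifulSoup(html, "html.parser").find(id='list').find_all('a')

with open("novel.txt", "w", encoding="utf-8") as out:
    for a in chapter_links[9:]:
        # Build an absolute chapter URL, fetch the page, and extract the chapter body
        chapter_url = urljoin(index_url, a.get('href'))
        page = urllib.request.urlopen(urllib.request.Request(chapter_url)).read().decode("gbk")
        content = BeautifulSoup(page, "html.parser").find(id='content')
        out.write(a.text + "\n" + content.get_text() + "\n\n")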
import urllib.request
from bs4 import BeautifulSoup

# Walk the joke listing pages on xiaohua.zol.com.cn
# for u in range(1, 1830):
for u in range(0, 1830):
    print(u)
    url = "http://xiaohua.zol.com.cn/new/" + str(u) + ".html"
    req = urllib.request.Request(url)
    response = urllib.request.urlopen(req)
    data = response.read().decode("gbk")

    # Collect every article summary block on the listing page
    soup = BeautifulSoup(data, "html.parser")
    summaries = soup.find(attrs={'class': 'article-list'}).find_all(attrs={'class': 'article-summary'})

    for i in summaries:
        # Follow each summary's link to the full article page
        link = "http://xiaohua.zol.com.cn" + i.find(attrs={'class': 'article-title'}).find_all('a')[0].get('href')
        req = urllib.request.Request(link)
        response = urllib.request.urlopen(req)
        data = response.read().decode("gbk")

        # Category comes from the breadcrumb bar; title and body from their own containers
        page = BeautifulSoup(data, "html.parser")
        cls = page.find(attrs={'class': 'wrapper location clearfix'}).find_all("a")[3].text
        title = page.find(attrs={'class': 'article-title'}).text
        content = page.find(attrs={'class': 'article-text'}).text
        fcontent = cls + "|||" + title + "|||" + content + "\n"
        # print(fcontent)

        # Append one record per joke: category ||| title ||| body
        with open("c:/dz.txt", 'a') as file:
            file.writelines(fcontent.replace(u'\xa0', u' '))
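All three scripts repeat the same request, read, decode("gbk"), BeautifulSoup sequence. As a sketch, that boilerplate can be factored into a small helper; the name fetch_gbk is introduced here for illustration and is not part of the original code.

import urllib.request
from bs4 import BeautifulSoup

def fetch_gbk(url):
    # Download a page, decode it as GBK, and return the parsed document
    req = urllib.request.Request(url)
    data = urllib.request.urlopen(req).read().decode("gbk")
    return BeautifulSoup(data, "html.parser")

# Example: the single-chapter snippet above collapses to one line
print(fetch_gbk("http://www.biquge5200.com/52_52542/150290199.html").find(id='content'))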