您的位置:首页 > 编程语言 > Python开发

【Python爬虫】爬取猫眼电影TOP100榜

2018-10-12 21:27 405 查看
[code]import requests
import bs4
from bs4 import BeautifulSoup
# Base address of the Maoyan TOP100 board; pages are selected via a query parameter.
url = "http://maoyan.com/board/4"

# Output file that receives the scraped text.
path = "I://Users//xieyingchao//Desktop//爬虫//movies.txt"

# Browser-like request headers sent with every page request.
header = {
    "Accept": (
        "text/html,application/xhtml+xml,application/xml;q=0.9,"
        "image/webp,image/apng,*/*;q=0.8"
    ),
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Cache-Control": "max-age=0",
    "Connection": "keep-alive",
    "Host": "maoyan.com",
    "Referer": "http://maoyan.com/board",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/69.0.349"
    ),
}
def GetText(url,header,offset):
    """Fetch one page of the board and return its HTML source.

    Per the original author's notes, the board lists ten films per page,
    pages differ only in the ``offset`` query parameter, and the site
    disallows ordinary crawlers, which is why browser-like headers are sent.

    Args:
        url: Board URL without a query string.
        header: Mapping of HTTP request headers to send.
        offset: Value for the ``offset`` query parameter (str or int).

    Returns:
        The decoded response body, or "" when the request fails or the
        server replies with an error status.
    """
    try:
        # A dict lets requests build the query string, so int offsets work too.
        # The timeout keeps a stalled connection from hanging the whole crawl.
        r = requests.get(url, params={'offset': offset}, headers=header, timeout=10)
        r.raise_for_status()
    except requests.RequestException:
        # Best-effort fetch: network/HTTP failures yield an empty page.
        return ""
    # Decode with the encoding detected from the content itself.
    r.encoding = r.apparent_encoding
    return r.text

def Text2html(text,ulist):
    """Parse page HTML and append the text of every ``<dd>`` tag to ``ulist``.

    Each entry is appended as a one-element list holding the tag's text
    after runs of newlines and padding spaces have been collapsed.

    Args:
        text: HTML source of one board page.
        ulist: List that is extended in place with the extracted entries.
    """
    document = BeautifulSoup(text, 'html.parser')
    # Per the original author's observation, film info sits under <dd> tags.
    for entry in document.find_all('dd'):
        if not isinstance(entry, bs4.element.Tag):
            continue  # only real tags carry film text
        # Chained literal replaces stand in for a regex to strip the
        # surplus newlines and indentation from the tag text.
        cleaned = entry.getText()
        cleaned = cleaned.replace('\n\n\n\n\n\n\n\n', ' ')
        cleaned = cleaned.replace('                ', '')
        cleaned = cleaned.replace('\n\n\n\n', '\n')
        # Nested list: every film is its own inner list.
        ulist.append([cleaned])

def Save2txt(ulist,path):
    """Write all collected entries to a text file.

    Args:
        ulist: List of entries, each itself an iterable of strings; the
            strings are written back to back with no added separators.
        path: Destination file path; an existing file is overwritten.
    """
    # UTF-8 is set explicitly; the original author notes that otherwise the
    # write fails with UnicodeEncodeError.
    # The context manager closes the file even if a write raises.
    with open(path, 'w', encoding='UTF-8') as f:
        for entry in ulist:
            f.writelines(entry)

def main():
    """Fetch ten board pages, save the extracted entries, and print them."""
    movies = []
    # Offsets 0, 10, ..., 90 select the ten pages in turn.
    for start in range(0, 100, 10):
        page_source = GetText(url, header, str(start))
        Text2html(page_source, movies)
    Save2txt(movies, path)
    print(movies)
# Run the crawl only when executed as a script, so importing this module
# does not trigger network requests or file writes.
if __name__ == "__main__":
    main()

爬取结果示例:

阅读更多
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: