您的位置:首页 > 编程语言 > Python开发

Python 爬虫:豆瓣电影Top250,包括电影导演、类型、年份、主演

2018-03-05 14:01 639 查看

抓取结果以 UTF-8 编码写入当前目录下的文本文件 top250.txt。

import codecs
import requests
from bs4 import BeautifulSoup

# HTTP headers sent with every request; a browser-like User-Agent is supplied
# instead of the default one from the requests library.
headers = {
    'User-Agent': 'Mozilla/5.0',
}

# First page of the listing; relative "next page" links are appended to it.
index_url = 'https://movie.douban.com/top250'

def get_html(url, timeout=10):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely on a stalled connection.

    Returns:
        The decoded response body (``Response.text``).

    Raises:
        requests.exceptions.RequestException: On connection errors or when
            the timeout expires.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    return response.text

def create_list(html):
    """Extract movie titles, info text and the next-page URL from one page.

    Args:
        html: HTML text of a listing page.

    Returns:
        A tuple ``(movie_names, movie_info, next_url)``:

        * ``movie_names`` -- text of the first ``span.title`` inside each
          ``div.hd`` (empty string when the span is missing).
        * ``movie_info`` -- text of the first ``<p>`` inside each
          ``div.info`` with spaces removed (empty string when the paragraph
          is missing).
        * ``next_url`` -- ``index_url`` joined with the ``href`` of the link
          inside ``span.next``, or ``None`` when there is no such link.
    """
    soup = BeautifulSoup(html, 'lxml')

    movie_names = []
    for t in soup.find_all('div', 'hd'):
        title = t.find('span', 'title')
        # Keep a placeholder so the list stays index-aligned with the page.
        movie_names.append(title.get_text() if title else '')

    movie_info = []
    for t in soup.find_all('div', 'info'):
        paragraph = t.find('p')
        movie_info.append(
            paragraph.get_text().replace(' ', '') if paragraph else ''
        )

    # find() returns None when nothing matches, so guard each step: the pager
    # span may be absent altogether, or present without a link.
    next_span = soup.find('span', 'next')
    next_page = next_span.find('a') if next_span else None
    if next_page and next_page.get('href'):
        return movie_names, movie_info, index_url + next_page['href']
    return movie_names, movie_info, None

def main():
    """Crawl every listing page and write the results to ``top250.txt``.

    Starting from ``index_url``, each page is fetched with ``get_html`` and
    parsed with ``create_list`` until no next-page URL is returned. For every
    movie two CRLF-terminated lines are written: ``Top <rank> <name>`` and
    the movie's info text. The file is written as UTF-8.
    """
    order = 1  # running rank across all pages
    url = index_url
    with codecs.open('top250.txt', 'wb', encoding='utf-8') as f:
        while url:
            html = get_html(url)
            names, info, url = create_list(html)
            # Pair each title with its info text. zip() stops at the shorter
            # list, so a short or empty page cannot raise IndexError.
            for name, detail in zip(names, info):
                f.write('Top ' + str(order) + ' ' + name + '\r\n')
                f.write(detail + '\r\n')
                order += 1

# Run the crawler only when executed as a script, not when imported.
if __name__ == '__main__':
    main()

 

内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: