参考教程，练习 BeautifulSoup 实例
2016-03-21 14:08
239 查看
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Scrape talk metadata for the PyCon US 2014 category on pyvideo.org.

The category index page is fetched once, every talk URL found on it is put
on a shared queue, and a fixed number of worker threads fetch the individual
talk pages and collect one metadata dict per talk.
"""
import queue
import re
import sys
import threading

import requests
from bs4 import BeautifulSoup

root_url = 'http://pyvideo.org'
index_url = root_url + '/category/50/pycon-us-2014'

# Seconds to wait for the server before giving up on a request; without a
# timeout a stalled connection would block its thread indefinitely.
REQUEST_TIMEOUT = 10
# Number of worker threads started by main().
WORKER_COUNT = 8

q = queue.Queue()   # relative talk URLs waiting to be scraped
result = []         # metadata dicts appended by the worker threads


def get_video_page_urls():
    """Return the ``href`` of every ``<a class="thumbnail">`` on the index page."""
    response = requests.get(index_url, timeout=REQUEST_TIMEOUT)
    soup = BeautifulSoup(response.text, 'lxml')
    return [a.get('href') for a in soup.find_all('a', class_='thumbnail')]


def get_video_msg(video_url):
    """Fetch one talk page and return its sidebar metadata as a dict.

    Args:
        video_url: URL path of the talk, appended to ``root_url``.

    Returns:
        A dict with the keys ``Category``, ``Speakers``, ``Language``,
        ``Recorded`` and ``Video origin``. ``Speakers`` and ``Video origin``
        fall back to ``'Unknown'`` when the matching element is missing.

    Raises:
        AttributeError: if the page has no ``id="sidebar"`` element, or the
            category, language or recording-date element is missing.
    """
    response = requests.get(root_url + video_url, timeout=REQUEST_TIMEOUT)
    soup = BeautifulSoup(response.text, 'lxml')
    tag = soup.find(id='sidebar')

    video_data = {}
    video_data['Category'] = tag.find('a', href=re.compile('category')).string

    # find() returns None when nothing matches, so the chained .get() raises
    # AttributeError; that is the only failure treated as "field missing".
    try:
        video_data['Speakers'] = tag.find('meta', property='author').get('content')
    except AttributeError:
        video_data['Speakers'] = 'Unknown'

    # The visible text sits in the node immediately preceding the <meta> tag.
    video_data['Language'] = tag.find(
        'meta', property='inLanguage').previous_element.strip()
    video_data['Recorded'] = tag.find(
        'meta', property='dateCreated').previous_element.strip()

    try:
        video_data['Video origin'] = tag.find('a', property='embedUrl').get('href')
    except AttributeError:
        video_data['Video origin'] = 'Unknown'

    return video_data


def show_video_stats():
    """Sequentially scrape every talk on the index page and print each dict."""
    for video_url in get_video_page_urls():
        print(get_video_msg(video_url))


class Mythread(threading.Thread):
    """Worker thread that drains the shared queue ``q`` into ``result``."""

    def __init__(self, name):
        threading.Thread.__init__(self)
        self.name = name

    def run(self):
        """Scrape queued URLs until the queue is empty, then exit."""
        while True:
            # A non-blocking get avoids the race between an empty() check and
            # a blocking get(): another worker could take the last item in
            # between and leave this thread waiting forever.
            try:
                url = q.get_nowait()
            except queue.Empty:
                break
            print('thread:', self.name)
            try:
                result.append(get_video_msg(url))
            except Exception as exc:
                # Report and move on so one bad page does not kill the worker.
                print('failed:', url, exc, file=sys.stderr)
            finally:
                # Always acknowledge the item, otherwise q.join() never returns.
                q.task_done()


def main():
    """Queue every talk URL, scrape them with worker threads, print the results."""
    for video_url in get_video_page_urls():
        q.put(video_url)
    for i in range(WORKER_COUNT):
        Mythread(i).start()
    q.join()
    for msg in result:
        print(msg)


if __name__ == '__main__':
    main()
相关文章推荐
- Python动态类型的学习---引用的理解
- Python3写爬虫(四)多线程实现数据爬取
- 垃圾邮件过滤器 python简单实现
- 下载并遍历 names.txt 文件,输出长度最长的回文人名。
- install and upgrade scrapy
- Scrapy的架构介绍
- Centos6 编译安装Python
- 使用Python生成Excel格式的图片
- 让Python文件也可以当bat文件运行
- [Python]推算数独
- Python中zip()函数用法举例
- Python中map()函数浅析
- Python将excel导入到mysql中
- Python在CAM软件Genesis2000中的应用
- 使用Shiboken为C++和Qt库创建Python绑定
- FREEBASIC 编译可被python调用的dll函数示例
- Python 七步捉虫法