您的位置:首页 > 其它

从豆瓣中抓取图片以及音乐

2012-12-06 13:52 211 查看
该脚本可以从豆瓣中抓取图片以及音乐。可以输入自己的正则表达式,但我没有对正则的合法性进行验证。这只是一个简单的文件抓取脚本,只用到了一些基本的模块;看到网上有好多使用 Beautiful Soup 模块的例子,一会儿再研究研究。代码很简单,就不多解释了。


#! /usr/bin/python2.6.6

# -- coding:utf-8 --

import urllib, urllib2, gevent, re

import time, os

from gevent import monkey

monkey.patch_all()

def worker(url):
    """Fetch ``url`` and extract the materials matched by the global ``reg``.

    Side effects (the main script relies on them):
      * creates a directory named after today's date (``YYYY-MM-DD``) under
        the current working directory if it does not exist yet;
      * changes the process working directory into that directory;
      * creates ``images`` and ``radios`` sub-directories inside it.

    The page body is scanned with the module-level compiled pattern ``reg``,
    which must be bound before this function is called.  Each match must
    expose three groups: (1) the material name, (2) the cover URL and
    (3) the audio URL.

    Returns a list of ``(name, path, cover_name, cover_path)`` tuples, where
    ``name``/``cover_name`` are the local file names (``.mp3`` / ``.jpg``)
    and ``path``/``cover_path`` are the URLs to download.

    Any ``Exception`` is reported on stdout and whatever was collected up to
    that point is returned.  Non-``Exception`` errors such as
    ``KeyboardInterrupt`` propagate to the caller.
    """
    # Bound before the ``try`` so the final ``return`` is always valid.
    m_arr = []
    try:
        # One directory per day.
        parent_dir = time.strftime('%Y-%m-%d')
        if not os.path.exists(parent_dir):
            os.mkdir(parent_dir)
        os.chdir(parent_dir)

        # Storage for cover images and for music files respectively.
        for sub_dir in ('images', 'radios'):
            if not os.path.exists(sub_dir):
                os.mkdir(sub_dir)

        response = urllib2.urlopen(url)
        try:
            text = response.read()
        finally:
            # Release the connection even if reading the body fails.
            response.close()

        for g in re.finditer(reg, text):
            title = g.group(1).strip()
            # Backslashes are stripped from the URLs; presumably the page
            # embeds them JSON-escaped (``\/``) -- verify against real input.
            cover_path = g.group(2).replace('\\', '')
            path = g.group(3).replace('\\', '')
            m_arr.append((title + ".mp3", path, title + ".jpg", cover_path))
    except Exception as e:
        print("<<==get regulare text raised exceptin %s==>>" % e)
    # Returning here rather than from a ``finally`` clause keeps
    # KeyboardInterrupt / SystemExit from being silently discarded.
    return m_arr

def grun(path, name):
    """Download the resource at URL ``path`` into the local file ``name``.

    This is a best-effort helper meant to run inside a greenlet: any
    ``Exception`` raised by the download is reported on stdout and then
    dropped, so one failed download does not affect the others.
    Returns ``None``.
    """
    failure_template = "<<==Fetch material %s in %s raised Exception %s==>>"
    try:
        urllib.urlretrieve(path, name)
    except Exception as e:
        print(failure_template % (name, path, e))

if __name__ == '__main__':
    # Ask for the page to scrape and for the pattern that extracts the
    # materials; an empty answer selects the built-in default.
    url = raw_input("input the url to fetch materials ==>>")
    re_pat = raw_input("input the regular expression to fetch materials ==>>")
    if not url:
        url = "http://site.douban.com/dannv/"
    if not re_pat:
        # Default pattern: group 1 = name, group 2 = cover URL,
        # group 3 = raw audio URL.
        re_pat = '{"name":"(.+?)",.+?"cover":"(.+?)",.+?"rawUrl":"(.+?)",.+?}'

    # worker() reads this module-level name, so bind it before the call.
    reg = re.compile(re_pat, re.I)

    # worker() creates the dated directory with its images/ and radios/
    # sub-directories and leaves the process inside the dated directory.
    musicArray = worker(url)

    # Target paths are built explicitly instead of chdir-ing between phases:
    #  * nothing here fails when worker() could not create the directories
    #    (musicArray is empty in that case, so no download is attempted);
    #  * a cover download that outlives the timeout keeps writing to images/
    #    rather than to whatever the working directory has become.
    cover_jobs = [
        gevent.spawn(grun, cover_path, os.path.join('images', cover_name))
        for (name, path, cover_name, cover_path) in musicArray
    ]
    gevent.joinall(cover_jobs, timeout=600)

    # A separate job list, so the second wait covers only the music
    # downloads and does not wait on the cover batch again.
    music_jobs = [
        gevent.spawn(grun, path, os.path.join('radios', name))
        for (name, path, cover_name, cover_path) in musicArray
    ]
    gevent.joinall(music_jobs, timeout=600)
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: