
Batch-downloading audiobooks from www.yousheng8.com with Python

2013-10-27 00:00
downLoadMp3.py downloads audiobooks from 有声吧 (Yousheng8). The site has no batch-download option, so I threw this script together purely for my own amusement; a usage example follows the listing.
downLoadMp3.py
#coding=gbk
'''Created on 2012-9-16

@author: Administrator
'''

import urllib
from HTMLParser import HTMLParser
import os
import re
import getopt
import sys

# Progress callback for urllib.urlretrieve: a = blocks transferred so far,
# b = block size in bytes, c = total file size in bytes.
def cbk(a, b, c):
    per = 100.0 * a * b / c
    if per > 100:
        per = 100
    print '%.2f%%' % per


class MyHTMLParser(HTMLParser):
    # Collects every <a href> on the page, plus the text fragment that
    # mentions how many parts the audiobook has.
    data = ""

    def __init__(self):
        HTMLParser.__init__(self)
        self.links = []

    def handle_starttag(self, tag, attrs):
        if tag == "a":
            for (variable, value) in attrs:
                if variable == "href":
                    self.links.append(value)

    def handle_data(self, data):
        # Keep the text node that contains both "第" and "共" (the part counter).
        if data.find("共") >= 0 and data.find("第") >= 0:
            self.data = data
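
# A rough sketch of what the parser yields once fed a page (the exact counter
# text is an assumption based on how do_download() uses hp.data below; the
# play page is expected to show something like "第1集 共120集", and the second
# number found in that text is taken as the total number of parts):
#   hp = MyHTMLParser()
#   hp.feed(html_code)
#   hp.close()
#   hp.links  ->  every href found on the page
#   hp.data   ->  the "第 ... 共 ..." counter text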

# Zero-pad num to num_width digits, e.g. 7 -> "007" when num_width is 3.
def make_num_str(num, num_width):
    s = "%i" % num
    while len(s) < num_width:
        s = "0" + s
    return s

# Build the name the file is saved under: <basisname><zero-padded num><extendname>.
def make_save_name(basisname, num, extendname, num_width):
    return basisname + make_num_str(num, num_width) + extendname

# Width of the numeric suffix in a URL like "..._001.htm" (here 3 digits).
def parse_num_width(url):
    e = url.find(".htm")
    s = url.rfind("_") + 1
    return e - s

# Replace the numeric suffix of base_url with num, keeping the zero padding.
def make_url(base_url, num):
    num_width = parse_num_width(base_url)
    end = base_url.rfind("_") + 1
    prefix = base_url[0:end]
    return prefix + make_num_str(num, num_width) + ".htm"
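
# A quick illustration of the numbering helpers above, using a made-up URL
# that follows the "..._NNN.htm" pattern the script assumes:
#   parse_num_width("http://www.yousheng8.com/down/book_001.htm")  ->  3
#   make_url("http://www.yousheng8.com/down/book_001.htm", 12)     ->  "http://www.yousheng8.com/down/book_012.htm"
#   make_save_name("mybook", 12, ".mp3", 3)                        ->  "mybook012.mp3"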

def do_download(base_url_play, target_dir):
    # Derive the download-page URL from the play-page URL.
    base_url = re.sub('play', 'down', base_url_play)
    print "base down url %s" % base_url
    extendname = ".mp3"

    # Parse the play page to find out how many parts the book has.
    html_code = urllib.urlopen(base_url_play).read()
    hp = MyHTMLParser()
    hp.feed(html_code)
    hp.close()
    cnt = int(re.findall(r'\d+', hp.data)[1])

    if not os.path.exists(target_dir):
        os.mkdir(target_dir)
    os.chdir(target_dir)
    print "total parts: %d" % cnt

    num_width = parse_num_width(base_url)
    for i in range(1, cnt + 1):
        url = make_url(base_url, i)
        print url
        # Parse each download page and take the first .mp3 link on it.
        html_code = urllib.urlopen(url).read()
        hp = MyHTMLParser()
        hp.feed(html_code)
        hp.close()
        is_find = False
        for link in hp.links:
            if link.find(".mp3") >= 0:
                print link
                is_find = True
                break
        if not is_find:
            print "download ends"
            break
        urllib.urlretrieve(link, make_save_name(target_dir, i, extendname, num_width), cbk)

def print_usage():
    print "Usage: %s [-h|--help] -u|--url <play-page-url> -d|--dir <target-dir>" % sys.argv[0]

if __name__ == "__main__":
    try:
        # Play-page address of the first part, and the directory to save into.
        base_url_play = ""
        target_dir = ""
        # The long options take values too, so they need the trailing "=".
        opts, args = getopt.getopt(sys.argv[1:], "hu:d:", ["help", "url=", "dir="])
        for o, a in opts:
            if o in ("-h", "--help"):
                print_usage()
                sys.exit(0)
            elif o in ("-u", "--url"):
                base_url_play = a
            elif o in ("-d", "--dir"):
                target_dir = a
            else:
                print o, "==>", a
        if base_url_play == "" or target_dir == "":
            print_usage()
            sys.exit(-1)
        print base_url_play, "======>", target_dir
        do_download(base_url_play, target_dir)

    except getopt.GetoptError:
        print "opt error"
        print_usage()
        sys.exit(-1)
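
A quick usage sketch (the play-page address below is made up; the script assumes the real one ends in an "_NNN.htm" style numeric suffix, as parsed by parse_num_width):

python downLoadMp3.py -u http://www.yousheng8.com/play/book_001.htm -d mybook

This rewrites "play" to "down" in the URL to reach the download pages, reads the part counter from the play page, then fetches each numbered download page in turn, pulls the first .mp3 link off it, and saves the file as mybook001.mp3, mybook002.mp3, and so on inside the mybook directory.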

Tags: 有声吧