您的位置:首页 > 编程语言 > Python开发

Python 抓取网页资源小脚本

2012-11-19 23:24 155 查看
#!/usr/bin/env python
# coding: utf-8
import urllib

def filter_src(file_name):
    """Collect the distinct resource paths that produced a 404 in a log file.

    Every line containing the text '404' is split on single spaces and its
    eighth field (index 7) is taken as the requested path.  Each '/static'
    substring is removed from that field and its final character is dropped
    (presumably the closing quote of the request field in the log format this
    script was written for -- TODO confirm against a real log).

    Args:
        file_name: Path of the log file to scan.

    Returns:
        The normalised resource paths, in order of first appearance and
        without duplicates.
    """
    resource_list = []
    # Tracks the *normalised* paths: comparing the raw field against the
    # result list can never match, because the list holds the stripped form.
    seen = set()
    with open(file_name) as log_file:
        for log_line in log_file:
            if '404' not in log_line:
                continue
            fields = log_line.strip().split(' ')
            if len(fields) < 8:
                # '404' occurred in a line that has no path field; skip it
                # rather than failing with IndexError.
                continue
            raw_path = fields[7]
            resource = raw_path.replace('/static', '')[:-1]
            if resource in seen:
                continue
            seen.add(resource)
            print(raw_path)
            resource_list.append(resource)
    print(resource_list)
    return resource_list

def down_src(source_list, base_url="http://www.ttcrm.com", down_path=r"src"):
    """Download each resource in source_list and save it below down_path.

    For every entry, the URL is ``base_url + entry`` and the local file is
    ``down_path + entry``; entries are therefore expected to start with '/'.
    The URL is printed before it is fetched.

    Args:
        source_list: Iterable of resource paths (e.g. '/img/logo.png').
        base_url: Prefix prepended to each path to form the download URL.
        down_path: Local directory prefix prepended to each path.

    Returns:
        None.  Files are written as a side effect.
    """
    import contextlib
    import os
    import shutil
    try:
        from urllib.request import urlopen  # Python 3
    except ImportError:
        from urllib import urlopen  # Python 2

    for source in source_list:
        source_url = base_url + source
        source_path = down_path + source
        print(source_url)

        # The target may live in a sub-directory that does not exist yet.
        target_dir = os.path.dirname(source_path)
        if target_dir and not os.path.isdir(target_dir):
            os.makedirs(target_dir)

        # closing() releases the connection even on Python 2, where the
        # response object is not a context manager.
        with contextlib.closing(urlopen(source_url)) as source_stream:
            # Binary mode keeps images and other non-text resources intact.
            with open(source_path, 'wb') as f_obj:
                # Stream in chunks instead of holding the whole body in memory.
                shutil.copyfileobj(source_stream, f_obj)

if __name__ == '__main__':
    # Script entry point: extract the missing resource paths from the log
    # file 'src.txt' and hand them straight to the downloader.
    down_src(filter_src('src.txt'))


关键点在于:文件必须以二进制方式('wb')打开并写入,否则图片等非文本资源在保存时可能被破坏!

f_obj = open(source_path,'wb')
f_obj.write(source_stram.read())
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: