python爬虫抽取武侠小说
2015-10-13 16:14
609 查看
这几天在手机上看《蜀山剑侠传》,看到一半突然要求付费,索性自己抓取文字吧
就模仿着做了个简单的抽取, 最终结果存在文件里
__author__ = 'allen'
import urllib
import urllib2
import re
import chardet
import os
from bs4 import BeautifulSoup
import sys
print sys.getdefaultencoding()
reload(sys)
sys.setdefaultencoding('utf-8')
def get_page_str(page_num):
if page_num < 10:
return '0' + str(page_num)
return str(page_num)
def get_huanzhu_url(page_num):
return 'http://www.my285.com/wuxia/hzlz/ssj3/' + get_page_str(page_num)+'.htm'
data_save = open('data.txt', 'wb+')
def spider_page(page_num):
url = get_huanzhu_url(page_num)
print(url)
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
headers = { 'User-Agent' : user_agent }
try:
request = urllib2.Request(url,headers = headers)
response = urllib2.urlopen(request)
content = response.read()
content = unicode(content, 'gbk')
content = content.replace('<br>', '')
soup = BeautifulSoup(content, 'html.parser')
tmp = soup.find_all(lambda tag: tag.name=='td' and len(tag.attrs) == 1
and tag.has_key('colspan'))
length = len(tmp)
index = 0;
for data in tmp:
if index >= 1:
break
index = index + 1
data_str = data.string
data_str = data_str.replace('<td colspan="2">', '')
data_str = data_str.replace('</td>', '')
data_save.write(data_str)
return True
except urllib2.URLError, e:
if hasattr(e,"code"):
print e.code
return False
if hasattr(e,"reason"):
print e.reason
return False
page_num = 0
while True:
page_num = page_num + 1
res = spider_page(page_num)
if res == False:
break
print(page_num)
data_save.close()
就模仿着做了个简单的抽取, 最终结果存在文件里
__author__ = 'allen'
import urllib
import urllib2
import re
import chardet
import os
from bs4 import BeautifulSoup
import sys
print sys.getdefaultencoding()
reload(sys)
sys.setdefaultencoding('utf-8')
def get_page_str(page_num):
if page_num < 10:
return '0' + str(page_num)
return str(page_num)
def get_huanzhu_url(page_num):
return 'http://www.my285.com/wuxia/hzlz/ssj3/' + get_page_str(page_num)+'.htm'
data_save = open('data.txt', 'wb+')
def spider_page(page_num):
url = get_huanzhu_url(page_num)
print(url)
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
headers = { 'User-Agent' : user_agent }
try:
request = urllib2.Request(url,headers = headers)
response = urllib2.urlopen(request)
content = response.read()
content = unicode(content, 'gbk')
content = content.replace('<br>', '')
soup = BeautifulSoup(content, 'html.parser')
tmp = soup.find_all(lambda tag: tag.name=='td' and len(tag.attrs) == 1
and tag.has_key('colspan'))
length = len(tmp)
index = 0;
for data in tmp:
if index >= 1:
break
index = index + 1
data_str = data.string
data_str = data_str.replace('<td colspan="2">', '')
data_str = data_str.replace('</td>', '')
data_save.write(data_str)
return True
except urllib2.URLError, e:
if hasattr(e,"code"):
print e.code
return False
if hasattr(e,"reason"):
print e.reason
return False
page_num = 0
while True:
page_num = page_num + 1
res = spider_page(page_num)
if res == False:
break
print(page_num)
data_save.close()
相关文章推荐
- 老李分享:使用 Python 的 Socket 模块开发 UDP 扫描工具
- python学习004--Python运算符优先级
- Python输出中文到文件时的字符编码问题
- Python时间戳和日期
- python学习03--格式化输出
- Python中的并发编程
- Python 使用thrift连接hbase || 远程连接hbase
- python自动化执行脚本
- 搭建Python开发环境
- python学习002
- 举例讲解Python中的身份运算符的使用方法
- poker 游戏实现 (python)
- python module getopt usage
- python学习001
- python装饰器学习笔记
- 如何用Python输出漂亮的xml文件
- 使用python + tornado 做项目demo演示模板
- Python中的字典与成员运算符初步探究
- [python] 多线程编程
- 详解Python中的元组与逻辑运算符