您的位置:首页 > 编程语言 > Python开发

爬取百度贴吧帖子内容的示例:以NBA吧的一个帖子为例,运行环境为Python 3.6,IDE为PyCharm,并使用了更新后的正则表达式。

2017-08-31 20:58 471 查看
import urllib
import re
from urllib import request
# Helper that turns a fragment of post HTML into plain text.
class Tool:
    """Strip or translate HTML tags found in a scraped post body.

    Each class attribute is a compiled pattern; ``replace`` applies them in a
    fixed order, because later patterns (notably ``removeExtraTag``) would
    otherwise swallow tags that earlier steps turn into whitespace.
    """

    # Remove <img> tags and runs of exactly seven spaces.
    removeImg = re.compile(r'<img.*?>| {7}')
    # Remove hyperlink tags (the link text itself is kept).
    removeAddr = re.compile(r'<a.*?>|</a>')
    # Tags that are rendered as a line break.
    replaceLine = re.compile(r'<tr>|<div>|</div>|</p>')
    # Table cells become a tab.
    replaceTD = re.compile(r'<td>')
    # An opening paragraph tag becomes a newline plus a four-space indent.
    replacePara = re.compile(r'<p.*?>')
    # One or two consecutive <br> tags become a single newline.
    replaceBR = re.compile(r'<br><br>|<br>')
    # Any tag still left at the end is dropped.
    removeExtraTag = re.compile(r'<.*?>')

    def replace(self, x):
        """Return ``x`` with HTML tags removed or converted to whitespace.

        Args:
            x: HTML fragment as a string.

        Returns:
            The cleaned text with leading and trailing whitespace stripped.
        """
        # Order matters: specific tags first, the catch-all tag remover last.
        steps = (
            (self.removeImg, ""),
            (self.removeAddr, ""),
            (self.replaceLine, "\n"),
            (self.replaceTD, "\t"),
            (self.replacePara, "\n    "),
            (self.replaceBR, "\n"),
            (self.removeExtraTag, ""),
        )
        for pattern, replacement in steps:
            x = pattern.sub(replacement, x)
        # strip() removes the surplus whitespace left at both ends.
        return x.strip()
class BDTB:
    """Download the posts of one Baidu Tieba thread and append them to a file.

    Attributes:
        baseURL: URL of the thread, without query string.
        seeLZ: Query-string fragment ``?see_lz=<value>`` built from the
            constructor argument.
        tool: ``Tool`` instance used to clean post HTML.
        file: Unused placeholder, kept for compatibility.
        floor: Number written in the next floor separator; starts at 1.
        defaultTitle: Fallback title; not read by any method in this class.
        floorTag: When equal to ``1`` or ``'1'``, a separator line is written
            before every post.
    """

    def __init__(self, baseUrl, seeLZ, floorTag):
        """Store the thread URL and the output options.

        Args:
            baseUrl: Thread URL without query string.
            seeLZ: Value appended as the ``see_lz`` query parameter.
            floorTag: ``1`` / ``'1'`` to write floor separators.
        """
        self.baseURL = baseUrl
        self.seeLZ = '?see_lz=' + str(seeLZ)
        self.tool = Tool()
        self.file = None
        # Floor counter, starts at 1.
        self.floor = 1
        # Default title, intended as a fallback when no title can be parsed.
        self.defaultTitle = u"百度贴吧"
        # Flag controlling whether floor separators are written.
        self.floorTag = floorTag

    def getPage(self, pageNum):
        """Fetch one page of the thread.

        Args:
            pageNum: Page index sent as the ``pn`` query parameter.

        Returns:
            The page decoded as UTF-8 text, or ``None`` when the request
            fails with ``urllib.error.URLError``.
        """
        url = self.baseURL + self.seeLZ + '&pn=' + str(pageNum)
        try:
            req = urllib.request.Request(url)
            # The context manager closes the connection once the body is read.
            with urllib.request.urlopen(req) as response:
                return response.read().decode('utf-8')
        except urllib.error.URLError as e:
            print(u"连接百度贴吧失败,错误原因", getattr(e, "reason", e))
            return None

    def getTitle(self, page):
        """Return the thread title found in ``page``, or ``None`` if absent.

        ``page`` is passed through ``str()``, so ``None`` simply yields no
        match.
        """
        pattern = re.compile(r'<h3 class="core_title_txt.*?>(.*?)</h3>', re.S)
        result = pattern.search(str(page))
        if result:
            return result.group(1).strip()
        return None

    def getPageNum(self, indexPage):
        """Return the text of the first ``<span class="red">`` in ``indexPage``.

        NOTE(review): the caller treats this value as the page count; whether
        the first red span really is the page count depends on the site's
        markup and should be confirmed.

        Returns:
            The stripped span text, or ``None`` when ``indexPage`` is empty,
            ``None``, or contains no such span.
        """
        if not indexPage:
            # getPage() returns None on a network error; avoid a TypeError.
            return None
        pattern = re.compile(r'<span class="red">(.*?)</span>', re.S)
        result = pattern.search(indexPage)
        if result:
            return result.group(1).strip()
        return None

    def getContent(self, page):
        """Extract and clean every post body in ``page``.

        Returns:
            A list with one string per ``post_content_*`` div, each cleaned by
            ``self.tool.replace`` and wrapped in a leading and trailing
            newline. Empty when ``page`` is empty or ``None``.
        """
        if not page:
            return []
        pattern = re.compile(r'<div id="post_content_.*?>(.*?)</div>', re.S)
        return ['\n' + self.tool.replace(item) + '\n'
                for item in pattern.findall(page)]

    def write(self, contents):
        """Append ``contents`` to ``tieba.txt``, one post after another.

        When ``floorTag`` is ``1`` / ``'1'`` a separator line carrying the
        current floor number precedes each post. ``self.floor`` is advanced
        once per post regardless of the flag.
        """
        if not contents:
            # Nothing to write; do not create or touch the file.
            return
        filename = 'tieba.txt'
        # Accept both the string typed at the prompt and an integer flag.
        withSeparator = str(self.floorTag) == '1'
        # Explicit UTF-8 so Chinese text is writable whatever the locale is.
        with open(filename, 'a', encoding='utf-8') as file_object:
            for item in contents:
                if withSeparator:
                    # Separator line between floors.
                    floorLine = "\n" + str(
                        self.floor) + u"-----------------------------------------------------------------------------------------\n"
                    file_object.write(floorLine)
                file_object.write(str(item))
                self.floor += 1

    def start(self):
        """Download every page of the thread and write the posts to disk.

        Prints progress messages, and returns early with a message when the
        page count cannot be determined from the first page.
        """
        indexPage = self.getPage(1)
        pageNum = self.getPageNum(indexPage)
        if pageNum is None:
            print('URL已失效,请重试!!')
            return
        try:
            print("该帖子共有" + str(pageNum) + '页')
            for i in range(1, int(pageNum) + 1):
                print('正在写入第' + str(i) + '页数据')
                # Page 1 was already downloaded to read the page count.
                page = indexPage if i == 1 else self.getPage(i)
                contents = self.getContent(page)
                self.write(contents)
        except IOError as e:
            # Exceptions have no .message attribute in Python 3.
            print("写入异常,原因" + str(e))
        finally:
            print("写入任务完成")

# batb = BDTB(baseURL, 1,1)
# print(batb.getPage(1))
# print(u'请输入帖子网址')
# baseurl  = input(u'请输入帖子网址:')
# seeLZ = input("是否只获取楼主发言,是输入1,否输入0\n")
# floorTag = input("是否写入楼层信息,是输入1,否输入0\n")
# bdtb = BDTB(baseurl, seeLZ)
# bdtb.start()
#baseURL = 'https://tieba.baidu.com/p/' + str(input(u'https://tieba.baidu.com/p/'))
# Thread to scrape; hard-coded to one fixed example post.
baseURL = 'https://tieba.baidu.com/p/3138733512'
# Prompt text asks: "only fetch the original poster's posts? 1 = yes, 0 = no".
seeLZ = input("是否只获取楼主发言,是输入1,否输入0\n")
# Prompt text asks: "write floor information? 1 = yes, 0 = no".
floorTag = input("是否写入楼层信息,是输入1,否输入0\n")
# Both answers are passed on as the raw strings returned by input().
bdtb = BDTB(baseURL,seeLZ,floorTag)
bdtb.start()
内容来自用户分享和网络整理,不保证内容的准确性。如有侵权内容,可联系管理员处理(点击这里给我发消息)。
标签: