您的位置:首页 > 编程语言 > Python开发

python3 [入门基础实战] 爬虫入门之爬取糗事百科

2017-05-24 23:43 701 查看
#encoding=utf8
import requests
from lxml import etree

class QiuShi(object):
    """Scraper for the text-joke listing pages of qiushibaike.com.

    Each listing page is downloaded with ``requests``, parsed with
    ``lxml.etree`` and turned into one row per article, ordered as
    [author, gender, age, content, votes, comments].

    NOTE: rows are only printed; persisting them (e.g. to CSV) is not
    implemented here.
    """

    # Browser-like User-Agent header sent with every request.
    headers = {
        "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36"
    }

    # Base address of the text section; page URLs are derived from it.
    url = 'http://www.qiushibaike.com/text/'

    # Seconds to wait for the server so a stalled connection cannot hang
    # the crawl forever.
    timeout = 10

    def __init__(self):
        """Print the column headings matching the rows built by getInfo."""
        fields = ['作者', '性别', '年龄', '段子内容', '好笑', '评论']
        print(fields)

    def totalUrl(self):
        """Fetch listing pages 1 through 35 in order, printing progress."""
        for page in range(1, 36):
            # Report the real page number; splitting the URL on '/' would
            # yield the literal path segment 'page' instead.
            print(u'正在获取:' + str(page) + u'页')
            self.getInfo(self.url + 'page/{}?s=4985075'.format(page))

    def getInfo(self, url):
        """Download one listing page and print one row per article.

        Args:
            url: Address of the listing page to fetch.

        Returns:
            The list of printed rows, each ordered as
            [author, gender, age, content, votes, comments].
        """
        html = requests.get(
            url, headers=self.headers, timeout=self.timeout
        ).text
        data = etree.HTML(html)

        rows = []
        for info in data.xpath('//*[@class="article block untagged mb15"]'):
            row = self._buildRow(
                info.xpath('div[1]/a[2]/h2/text()'),
                info.xpath(
                    'div[1]/div[@class="articleGender womenIcon"]/text()'
                ),
                info.xpath(
                    'div[1]/div[@class="articleGender manIcon"]/text()'
                ),
                info.xpath('a/div/span/text()'),
                info.xpath('div[2]/span[1]/i/text()'),
                # Search inside this article only; querying the whole
                # document would give every row the first article's count.
                info.xpath('.//*[@class="qiushi_comments"]/i/text()'),
            )
            print(row)
            rows.append(row)
        return rows

    @staticmethod
    def _buildRow(author, womanAge, manAge, content, votes, comments):
        """Assemble one output row from the raw XPath text results.

        Every argument is the (possibly empty) list of strings returned
        by an XPath ``text()`` query; only the first match is used.

        Args:
            author: Matches for the author name.
            womanAge: Matches for the age shown in the female badge.
            manAge: Matches for the age shown in the male badge.
            content: Matches for the joke text.
            votes: Matches for the "funny" vote count.
            comments: Matches for the comment count.

        Returns:
            [author, gender, age, content, votes, comments]. A missing
            author becomes u'匿名用户', a missing gender badge yields
            u'不详' for both gender and age, and any other missing field
            becomes an empty string instead of raising IndexError.
        """
        def first(values, default):
            return values[0] if values else default

        # The badge that matched decides the gender; its text is the age.
        if womanAge:
            gender, age = u'女', womanAge[0]
        elif manAge:
            gender, age = u'男', manAge[0]
        else:
            gender, age = u'不详', u'不详'

        return [
            first(author, u'匿名用户'),
            gender,
            age,
            first(content, u'').strip(),
            first(votes, u''),
            first(comments, u''),
        ]

if __name__ == '__main__':
    # Script entry point: build the scraper (which prints the column
    # headings) and walk every listing page.
    crawler = QiuShi()
    crawler.totalUrl()
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签:  python 糗事百科