
Scrapy Crawler Example: 南方都市报 (Southern Metropolis Daily)

2017-09-11 21:39

1. Directory Structure

Inside the project directory (the __pycache__ cache directories are omitted from the listing; they hold compiled bytecode, roughly the counterpart of Java's compiled .class files), the structure is as follows:
└─ SouthCity                 # project name
   │  scrapy.cfg             # deployment configuration file (generated when the project is created)
   │
   └─ SouthCity              # the project's Python module
      │  items.py            # Items definition
      │  middlewares.py      # Middlewares
      │  pipelines.py        # Pipelines
      │  settings.py         # crawler settings (custom components are configured here)
      │  __init__.py
      │
      └─ spiders             # spider module directory
         │  mpage.py         # spider code
         └─ __init__.py
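
This is the standard skeleton that Scrapy generates. For reference, a sketch of the commands that would produce it (the spider name and start domain are taken from this project):

scrapy startproject SouthCity
cd SouthCity
scrapy genspider mpage epaper.oeeee.com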


2. Crawler Overview

The crawler is made up of three parts:

Item definition (items.py): Scrapy's built-in data-container class, customised with the fields we need.

Crawling part (mpage.py): parses the pages, follows the links, and fills the items.

Storage part (pipelines.py): takes the data out of each item and writes it to the database.

3. Custom Item (items.py)

# -*- coding: utf-8 -*-
import scrapy

# Subclass scrapy.Item and
# define our own data structure.
class ArticalItem(scrapy.Item):
    leading_title = scrapy.Field()  # leading (overline) title
    title = scrapy.Field()          # headline
    subtitle = scrapy.Field()       # subtitle
    link = scrapy.Field()           # article URL
    source = scrapy.Field()         # news source
    writeTime = scrapy.Field()      # time the article was written
    section = scrapy.Field()        # newspaper section
    author = scrapy.Field()         # author
    news = scrapy.Field()           # article body

Register the pipeline in settings.py so that scraped items are handed to it:

ITEM_PIPELINES = {
    'SouthCity.pipelines.MySQLStoreCnblogsPipeline': 301,
}
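
As a side note, a scrapy.Item instance behaves like a dictionary that only accepts the declared fields. A minimal sketch (the values here are made up purely for illustration):

from SouthCity.items import ArticalItem

item = ArticalItem()
item['title'] = 'Sample headline'            # illustrative value only
item['link'] = 'http://example.com/article'  # illustrative value only
print(item['title'])   # field access works like a dict lookup
print(dict(item))      # convert to a plain dict, e.g. before handing off to storage
# item['foo'] = 1      # would raise KeyError because 'foo' is not a declared field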


4. Crawling Part (mpage.py)

#   mpage.py
# Three callbacks, one per crawling step:
#
# parse              collects the URLs of every section of the paper
# parse_section      collects the URLs of every article in the current section
# parse_page         extracts the article details and fills the item defined above

# -*- coding: utf-8 -*-
import scrapy
from bs4 import BeautifulSoup
from datetime import datetime

from SouthCity.items import ArticalItem

nav = {}

class MpageSpider(scrapy.Spider):
    name = 'mpage'
    # allowed_domains = ['http://epaper.oeeee.com/epaper/A/html/']
    start_urls = ['http://epaper.oeeee.com/epaper']

    def parse(self, response):
        html = response.body
        soup = BeautifulSoup(html, 'html.parser')
        paper_div = soup.find('div', 'shortcutbox')
        a = paper_div.find_all('a')

        for i in a:
            href = i.get('href')
            link = response.urljoin(href)  # joins the relative URL automatically
            # link = 'http://epaper.oeeee.com/epaper/' + href[href.find('A'):]  # manual join of the relative URL
            nav[i.text] = link
            try:
                # yield hands one request at a time back to the engine and then resumes,
                # instead of building the whole list in memory at once.
                yield scrapy.Request(link, callback=self.parse_section)
            except:
                continue
        # print(nav)

    def parse_section(self, response):
        html = response.body
        soup = BeautifulSoup(html, 'html.parser')
        paper_div = soup.find('div', 'main-list')
        a = paper_div.find_all('a')
        nav = {}
        for i in a:
            href = i.get('href')
            link = response.urljoin(href)

            nav[i.text] = link
            try:
                yield scrapy.Request(link, callback=self.parse_page)
            except:
                continue
        # print(nav)

    def parse_page(self, response):
        detailbox = []
        artical = '  '

        html = response.body
        soup = BeautifulSoup(html, "html.parser")

        info = soup.find('div', "main-600 fl")
        detail = info.find_all('span')
        # strip the label before the full-width colon, e.g. "来源:南方都市报" -> "南方都市报"
        for dt in detail:
            try:
                dts = dt.text
                dts = dts[dts.find(':') + 1:].strip()
                detailbox.append(dts)
            except:
                detailbox.append(dt.text)

        news = info.find('div', 'text')
        pp = news.find_all('p')
        for p in pp:
            pt = p.text
            pt = pt.strip().replace("\xa0", "")
            artical += pt

        try:
            head1 = info.find('h1').text
            head2 = info.find_all('h2')
        except:
            head1 = ''
            head2 = []

        item = ArticalItem()
        item['leading_title'] = head2[0].text if len(head2) > 0 else ''
        item['title'] = head1
        item['subtitle'] = head2[1].text if len(head2) > 1 else ''
        item['link'] = response.url
        item['writeTime'] = detailbox[1]
        item['source'] = detailbox[0]
        item['section'] = detailbox[3]
        item['author'] = detailbox[4]
        item['news'] = artical
        # print(item)
        yield item
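
To run the spider, call it by name from the directory that contains scrapy.cfg. The optional -o flag writes the scraped items to a feed file, which is handy for checking the parsing before the database pipeline is wired up:

scrapy crawl mpage
scrapy crawl mpage -o articles.json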


5. Storage Part (pipelines.py)

# pipelines.py
# Two parts:
# __init__           sets up the database connection
# process_item       stores each item through that connection

import pymysql
import logging
from hashlib import md5
import datetime
import sys

class MySQLStoreCnblogsPipeline(object):
    # Attributes created in __init__ are instance variables and are visible
    # throughout the class (roughly the equivalent of Java member variables).

    def __init__(self):
        self.connect = pymysql.connect(
            host='localhost',
            db='TESTDB',
            user='pymysql',
            passwd='123123',
            charset='utf8',
            use_unicode=True)
        self.cursor = self.connect.cursor()

    def process_item(self, item, spider):
        now = datetime.datetime.now()
        date = str(now.date())
        date_s = date[:4] + date[5:7] + date[8:]  # equivalent to now.strftime('%Y%m%d')
        print(date_s)                             # date string, e.g. 20170912
        table_name = 'sc_' + date_s               # one table per day

        # SQL to create the day's table
        sql = 'CREATE TABLE sc_%s (leading_title varchar(255), title varchar(255), subtitle varchar(255), link varchar(250) NOT NULL primary key, writeTime varchar(20), source varchar(100), section varchar(50), author varchar(100), news text, updated datetime, img varchar(100))' % date_s

        # SQL to check whether this link has already been stored
        sql_query = "SELECT 1 from sc_%s where link = '%s'" % (date_s, item['link'])

        # SQL to update an existing row
        sql_update = """UPDATE sc_%s set leading_title = '%s',
                title = '%s',
                subtitle = '%s',
                link = '%s',
                writeTime = '%s',
                source = '%s',
                section = '%s',
                author = '%s',
                news = '%s',
                updated = '%s'
                where link = '%s'
                """ % (date_s,
                       item['leading_title'],
                       item['title'],
                       item['subtitle'],
                       item['link'],
                       item['writeTime'],
                       item['source'],
                       item['section'],
                       item['author'],
                       item['news'],
                       now,
                       item['link'])

        # SQL to insert a new row
        sql_insert = """
                insert into sc_%s(leading_title, title, subtitle, link, writeTime, source, section, author, news, updated)
                values('%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s')
                """ % (date_s,
                       item['leading_title'],
                       item['title'],
                       item['subtitle'],
                       item['link'],
                       item['writeTime'],
                       item['source'],
                       item['section'],
                       item['author'],
                       item['news'],
                       now)
        print('checkpoint 1')

        # TODO: keep a flag so the table-existence check is not repeated for every item.
        # Check whether today's table already exists
        self.cursor.execute('show tables')
        tables = self.cursor.fetchall()

        if (table_name,) not in tables:
            try:
                # The table does not exist yet: create it
                self.cursor.execute(sql)
            except Exception as e:
                raise e
        print('checkpoint 2')

        try:
            # Check whether item['link'] is already stored
            self.cursor.execute(sql_query)
            ret = self.cursor.fetchone()
            if ret:
                self.cursor.execute(sql_update)
                print("Updated one row.")
            # item['link'] not found yet: insert it
            else:
                self.cursor.execute(sql_insert)
                print("Inserted one row.")
            print('checkpoint 3')
            self.connect.commit()
            # self.cursor.close()  # closes the cursor but not the connection;
            #                        no problems observed in practice

        except Exception as error:
            logging.warning(error)
        return item
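
One caveat with the pipeline above: the SQL statements are built by string interpolation, so any quote character in a headline or article body will break the statement, and the code is open to SQL injection. A safer variant is to pass the values as query parameters and let pymysql do the escaping. The sketch below covers only the insert case; insert_article is a hypothetical helper, not part of the original project, and the table name still has to be interpolated because placeholders only work for values, not identifiers:

def insert_article(cursor, item, date_s, now):
    # Hypothetical helper: parameterized insert into the sc_<date_s> table.
    # The %%s sequences become literal %s placeholders for pymysql once the
    # table name has been interpolated.
    sql_insert = (
        "INSERT INTO sc_%s (leading_title, title, subtitle, link, writeTime, "
        "source, section, author, news, updated) "
        "VALUES (%%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s)" % date_s
    )
    cursor.execute(sql_insert, (
        item['leading_title'], item['title'], item['subtitle'], item['link'],
        item['writeTime'], item['source'], item['section'], item['author'],
        item['news'], now,
    ))

Inside process_item this could replace the string-built sql_insert branch, followed by self.connect.commit() as before; the query and update statements could be rewritten the same way.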