A Scrapy Crawler Example: Southern Metropolis Daily (南方都市报)
2017-09-11 21:39
1. Directory Structure
Inside the project directory (the `__pycache__` directories are omitted; they hold Python's bytecode cache, roughly the counterpart of Java's compiled .class files), the layout is:

```
SouthCity/                  # project name
│  scrapy.cfg               # deployment config for the Scrapy project (generated when the project is created)
│
└─ SouthCity/               # project package (a Python module, not the spider name)
   │  __init__.py
   │  items.py              # Item definitions
   │  middlewares.py        # middleware components
   │  pipelines.py          # item pipelines
   │  settings.py           # Scrapy settings, where custom components are enabled
   │
   └─ spiders/              # spider package
      │  __init__.py
      └─ mpage.py           # the spider code
```
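For reference, this is the standard skeleton that Scrapy itself generates; assuming Scrapy is installed, a project like this one could be recreated and run with the usual commands:

```
scrapy startproject SouthCity             # creates scrapy.cfg and the SouthCity package
cd SouthCity
scrapy genspider mpage epaper.oeeee.com   # creates spiders/mpage.py with a spider stub
scrapy crawl mpage                        # runs the spider named 'mpage'
```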
2. Crawler Overview
The crawler has three parts:

- Item definition (items.py): a custom data structure built on Scrapy's built-in Item class.
- Crawling (mpage.py): follows the links, parses each article page, and stores the results in an item.
- Storage (pipelines.py): reads the fields back out of the item and writes them to the database.
3. Custom Item
```python
# -*- coding: utf-8 -*-
import scrapy

# Subclass scrapy.Item to define our own data structure.
class ArticalItem(scrapy.Item):
    leading_title = scrapy.Field()   # kicker / leading headline
    title = scrapy.Field()           # headline
    subtitle = scrapy.Field()        # subheadline
    link = scrapy.Field()            # article URL
    source = scrapy.Field()          # news source
    writeTime = scrapy.Field()       # publication time
    section = scrapy.Field()         # newspaper section
    author = scrapy.Field()          # author
    news = scrapy.Field()            # article body
```

Then enable the pipeline in settings.py:

```python
ITEM_PIPELINES = {
    'SouthCity.pipelines.MySQLStoreCnblogsPipeline': 301,
}
```
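A quick illustration of how the item is used downstream (this snippet is mine, not part of the project code): a Scrapy Item behaves like a dict, but only the fields declared above may be assigned.

```python
from SouthCity.items import ArticalItem

item = ArticalItem()
item['title'] = '示例标题'                            # fields are set and read by key, like a dict
item['link'] = 'http://epaper.oeeee.com/epaper/...'  # placeholder value for illustration
print(item['title'], item['link'])
# item['foo'] = 1   # would raise KeyError: only declared Fields are allowed
```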
4. Crawling
```python
# -*- coding: utf-8 -*-
# mpage.py
# Three callbacks, i.e. the three crawling steps:
#   parse          collects the URL of every section of the paper
#   parse_section  collects the URL of every article in the current section
#   parse_page     extracts the article details and puts them into the item
import scrapy
from bs4 import BeautifulSoup

from SouthCity.items import ArticalItem

nav = {}

class MpageSpider(scrapy.Spider):
    name = 'mpage'
    # allowed_domains = ['http://epaper.oeeee.com/epaper/A/html/']
    start_urls = ['http://epaper.oeeee.com/epaper']

    def parse(self, response):
        soup = BeautifulSoup(response.body, 'html.parser')
        paper_div = soup.find('div', 'shortcutbox')
        for i in paper_div.find_all('a'):
            href = i.get('href')
            link = response.urljoin(href)   # builds the absolute URL automatically
            # link = 'http://epaper.oeeee.com/epaper/' + href[href.find('A'):]   # manual join of the relative link
            nav[i.text] = link
            try:
                # yield submits one request at a time and keeps going, instead of
                # building the whole result list in memory and returning it at once.
                yield scrapy.Request(link, callback=self.parse_section)
            except:
                continue
        # print(nav)

    def parse_section(self, response):
        soup = BeautifulSoup(response.body, 'html.parser')
        paper_div = soup.find('div', 'main-list')
        nav = {}
        for i in paper_div.find_all('a'):
            href = i.get('href')
            link = response.urljoin(href)
            nav[i.text] = link
            try:
                yield scrapy.Request(link, callback=self.parse_page)
            except:
                continue
        # print(nav)

    def parse_page(self, response):
        detailbox = []
        artical = ''
        soup = BeautifulSoup(response.body, 'html.parser')
        info = soup.find('div', 'main-600 fl')

        # the <span> elements hold "source: xxx" / "time: xxx" style details;
        # keep only the part after the colon
        for dt in info.find_all('span'):
            try:
                dts = dt.text
                detailbox.append(dts[dts.find(':') + 1:].strip())
            except:
                detailbox.append(dt.text)

        # article body: concatenate all <p> paragraphs
        news = info.find('div', 'text')
        for p in news.find_all('p'):
            artical += p.text.strip().replace('\xa0', '')

        try:
            head1 = info.find('h1').text
            head2 = info.find_all('h2')
        except:
            pass

        item = ArticalItem()
        item['leading_title'] = head2[0].text
        item['title'] = head1
        item['subtitle'] = head2[1].text
        item['link'] = response.url
        item['writeTime'] = detailbox[1]
        item['source'] = detailbox[0]
        item['section'] = detailbox[3]
        item['author'] = detailbox[4]
        item['news'] = artical
        yield item
        # print(item)
```
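One fragile spot worth pointing out: if `info.find('h1')` comes back as None, the bare `except` swallows the error, `head1`/`head2` are never bound, and the item assignments that follow raise NameError; likewise `detailbox[3]` and `detailbox[4]` assume every page carries at least five `<span>` details. Below is a minimal sketch (my own, not from the original post) of a more defensive `parse_page` using the same selectors; it reuses the imports from mpage.py above, and the helper `text_or_empty` is a name I introduce here.

```python
def text_or_empty(tag):
    """Return the stripped text of a BeautifulSoup tag, or '' if the tag is missing."""
    return tag.text.strip() if tag is not None else ''

def parse_page(self, response):
    soup = BeautifulSoup(response.body, 'html.parser')
    info = soup.find('div', 'main-600 fl')
    if info is None:          # not an article page, skip it silently
        return

    # details after the colon, padded so the positional lookups never raise IndexError
    details = [s.text.split(':', 1)[-1].strip() for s in info.find_all('span')]
    details += [''] * (5 - len(details))

    heads = info.find_all('h2')
    body = info.find('div', 'text')

    item = ArticalItem()
    item['leading_title'] = text_or_empty(heads[0] if heads else None)
    item['title'] = text_or_empty(info.find('h1'))
    item['subtitle'] = text_or_empty(heads[1] if len(heads) > 1 else None)
    item['link'] = response.url
    item['source'], item['writeTime'] = details[0], details[1]
    item['section'], item['author'] = details[3], details[4]
    item['news'] = ''.join(p.text.strip().replace('\xa0', '')
                           for p in body.find_all('p')) if body else ''
    yield item
```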
5. Storage
```python
# pipelines.py
# Two parts:
#   __init__        sets up the database connection
#   process_item    uses that connection to store each item
import datetime
import logging

import pymysql


class MySQLStoreCnblogsPipeline(object):

    def __init__(self):
        # Keep the connection and cursor on the instance so process_item can
        # reuse them (roughly the role of member fields in Java).
        self.connect = pymysql.connect(
            host='localhost',
            db='TESTDB',
            user='pymysql',
            passwd='123123',
            charset='utf8',
            use_unicode=True)
        self.cursor = self.connect.cursor()

    def process_item(self, item, spider):
        now = datetime.datetime.now()
        date = str(now.date())
        date_s = date[:4] + date[5:7] + date[8:]   # date string such as 20170912
        table_name = 'sc_' + date_s                # one table per day

        # CREATE TABLE statement
        sql = ('CREATE TABLE SC_%s (leading_title varchar(255), title varchar(255), '
               'subtitle varchar(255), link varchar(250) NOT NULL primary key, '
               'writeTime varchar(20), source varchar(100), section varchar(50), '
               'author varchar(100), news text, updated datetime, img varchar(100))' % date_s)
        # query: does this link already exist?
        sql_query = "SELECT 1 from SC_%s where link = '%s'" % (date_s, item['link'])
        # UPDATE statement
        sql_update = """UPDATE sc_%s set leading_title = '%s', title = '%s', subtitle = '%s',
            link = '%s', writetime = '%s', source = '%s', section = '%s', author = '%s',
            news = '%s', updated = '%s' where link = '%s'""" % (
            date_s, item['leading_title'], item['title'], item['subtitle'], item['link'],
            item['writeTime'], item['source'], item['section'], item['author'],
            item['news'], now, item['link'])
        # INSERT statement
        sql_insert = """insert into sc_%s(leading_title, title, subtitle, link, writeTime,
            source, section, author, news, updated)
            values('%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s')""" % (
            date_s, item['leading_title'], item['title'], item['subtitle'], item['link'],
            item['writeTime'], item['source'], item['section'], item['author'],
            item['news'], now)

        # Create today's table if it does not exist yet.
        # (TODO: this check runs for every item; it could be done once and cached.)
        self.cursor.execute('show tables')
        tables = self.cursor.fetchall()
        if (table_name,) not in tables:
            try:
                self.cursor.execute(sql)
            except Exception as e:
                raise e

        try:
            # Is there already a row for item['link']?
            self.cursor.execute(sql_query)
            ret = self.cursor.fetchone()
            if ret:
                self.cursor.execute(sql_update)
                print("Updated an existing row.")
            else:
                self.cursor.execute(sql_insert)
                print("Inserted a new row.")
            self.connect.commit()
            # self.cursor.close()   # the cursor/connection are never closed explicitly;
            #                       # no problems observed in practice, but not ideal
        except Exception as error:
            logging.warning(error)
        return item
```
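Worth noting: because the values are spliced into the SQL with %-formatting, the insert and update break as soon as a title or the article body contains a single quote, and the statements are open to SQL injection. Below is a minimal sketch (my own; the helper name `insert_item` is hypothetical) of the same insert done with pymysql's parameter binding. Only the table name still has to be interpolated, since placeholders work for values, not identifiers.

```python
import datetime


def insert_item(connect, date_s, item):
    """Insert one crawled article using parameter binding instead of string formatting.

    `connect` is an open pymysql connection, `date_s` the yyyymmdd table suffix.
    """
    sql = (
        "INSERT INTO sc_%s (leading_title, title, subtitle, link, writeTime, "
        "source, section, author, news, updated) "
        "VALUES (%%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s)" % date_s
    )
    with connect.cursor() as cursor:
        # pymysql escapes each value in the tuple, so quotes in the text are safe
        cursor.execute(sql, (
            item['leading_title'], item['title'], item['subtitle'], item['link'],
            item['writeTime'], item['source'], item['section'], item['author'],
            item['news'], datetime.datetime.now(),
        ))
    connect.commit()
```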