Scrapy学习笔记(二)
2015-08-17 23:30
537 查看
抓取Mp4ba电影信息
0x00 创建项目和Spider
scrapy startproject movieproject
scrapy genspider -t crawl mp4ba mp4ba.com
大致结构
├── movieproject
│   ├── agents.py
│   ├── __init__.py
│   ├── items.py
│   ├── log.py
│   ├── middlewares.py
│   ├── mysql.py
│   ├── pipelines.py
│   ├── proxy.py
│   ├── settings.py
│   └── spiders
│       ├── __init__.py
│       └── mp4ba.py
└── scrapy.cfg
mp4ba.py中的代码
# -*- coding: utf-8 -*-
# NOTE: the import lines for CrawlSpider, Rule, LinkExtractor, MovieItem and
# MovieDetailItem are not part of this excerpt.


def _joined_text(node, query):
    """Return every match of *query* under *node* concatenated into one string.

    Yields an empty string when the XPath matches nothing.
    """
    return u''.join(node.xpath(query).extract())


class Mp4baSpider(CrawlSpider):
    """Crawl mp4ba.com list pages and the detail page of each movie.

    List pages (``index.php?...page=N``) are parsed by ``parse_page`` and
    followed further; detail pages (``show.php?hash=...``) are parsed by
    ``parse_detail`` and not followed.
    """

    name = "mp4ba"
    allowed_domains = ["mp4ba.com"]
    start_urls = (
        u'http://www.mp4ba.com/',
    )

    rules = (
        # The dots are escaped so they only match a literal '.', and the
        # stray backslash before "index" is gone: "\i" is not a valid regex
        # escape and is rejected by newer versions of the re module.
        Rule(LinkExtractor(allow=r'index\.php\?.*page=\d*'),
             callback='parse_page', follow=True),
        Rule(LinkExtractor(allow=r'show\.php\?hash=.*'),
             callback='parse_detail', follow=False),
    )

    def parse_page(self, response):
        """Build one MovieItem per row of the list table.

        Returns:
            A list of MovieItem objects (empty when no row matches).
        """
        items = []
        rows = response.xpath("//tbody[@id='data_list']/tr[@class='alt1']")
        for row in rows:
            item = MovieItem()
            item['publish_time'] = _joined_text(row, "td[1]/text()")
            item['category'] = _joined_text(row, "td[2]/a[@href]/text()")
            item['name'] = _joined_text(row, "td[3]/a[@href]/text()").strip()
            item['size'] = _joined_text(row, "td[4]/text()")
            item['download_count'] = _joined_text(
                row, "td[@nowrap]/span[@class='btl_1']/text()")
            item['detail_link'] = _joined_text(row, "td[3]/a[@href]/@href")
            # The key spelling has to match the field declared on MovieItem,
            # which is defined outside this excerpt, so it is left untouched.
            item['torrert_count'] = _joined_text(
                row, "td[@nowrap]/span[@class='bts_1']/text()")
            items.append(item)
        return items

    def parse_detail(self, response):
        """Extract the torrent download link and magnet link of a movie.

        Returns:
            A MovieDetailItem, or None when the page carries neither link.
            A single missing link is stored as an empty string instead of
            raising IndexError and losing the whole item.
        """
        download_links = response.xpath(
            "//p[@class='original download']/a[@id='download']/@href"
        ).extract()
        magnet_links = response.xpath(
            "//p[@class='original magnet']/a[@id='magnet']/@href"
        ).extract()

        if not download_links and not magnet_links:
            self.log(u"no download or magnet link found on %s" % response.url)
            return None

        item = MovieDetailItem()
        item['detail_link'] = response.url
        item['download_link'] = download_links[0] if download_links else u''
        item['magnet_link'] = magnet_links[0] if magnet_links else u''
        return item
pipelines.py
class MovieSavePipeline(object):
    """Item pipeline that writes scraped items to MySQL through Mp4BaSql.

    MovieItem objects go to table ``tbMovieItem`` and MovieDetailItem
    objects to ``tbMovieDetailItem``; any other item is passed through
    untouched.
    """

    def __init__(self):
        # Number of items seen so far, used as a running index in the output.
        self.linecount = 0
        # Create the database helper object.
        self.db = Mp4BaSql()

    def process_item(self, item, spider):
        """Store *item* in the table that matches its type and return it.

        Args:
            item: the scraped item.
            spider: the spider that produced the item (unused).

        Returns:
            The same *item*, so later pipelines can keep processing it.
        """
        self.linecount += 1

        # Exact type comparison is kept on purpose: the item classes are
        # declared elsewhere and their inheritance relationship is not
        # visible here, so isinstance() could route an item to the wrong
        # table.
        item_type = type(item)
        if item_type is MovieItem:
            # print(...) with a single argument behaves the same on
            # Python 2 and Python 3.
            print("%d.\t%s" % (self.linecount, item['name']))
            self.db.insert('tbMovieItem', item)
        elif item_type is MovieDetailItem:
            print(item['download_link'])
            self.db.insert('tbMovieDetailItem', item)
        return item
mysql.py
#!/usr/bin/python
# coding=utf-8
import pymysql

from movieproject.items import MovieItem, MovieDetailItem


class SqlBase(object):
    """Small MySQL helper that creates a database with one table per Item.

    Subclasses customise it by overriding ``database_Name``, ``config`` and
    ``table_Info``.

    NOTE(review): constructing an instance runs ``_init``, which DROPS the
    database named by ``database_Name()`` and recreates it, so all rows
    from a previous run are lost.
    """

    def __init__(self):
        self.dbName = self.database_Name()
        self.tbCurTable = None
        self.conn = None
        self.cur = None
        self._init()
        self._close()

    # ------------------------------------------------------------------
    # Hooks meant to be overridden
    # ------------------------------------------------------------------
    def database_Name(self):
        """Return the name of the database to (re)create and use."""
        return "dbTest"

    def config(self):
        """Return the connection settings as ``(host, user, password)``."""
        # NOTE(review): credentials are hard-coded in source; consider
        # loading them from settings or the environment instead.
        return ('localhost', 'root', 'Xq123456')

    def table_Info(self):
        """Return a mapping of table name -> Item instance describing it."""
        return {}

    # ------------------------------------------------------------------
    # Private helpers: SQL string builders
    # ------------------------------------------------------------------
    def _createdbsql(self, databasename):
        """Return the CREATE DATABASE statement for *databasename*."""
        return "CREATE database if not exists %s character set utf8;" % databasename

    def _createtablesql(self, tablename, item):
        """Return the CREATE TABLE statement derived from *item*'s fields.

        Every field declared on the Item becomes a ``varchar(80)`` column,
        next to an auto-increment ``id`` primary key.
        """
        columns = "".join(
            "%s varchar(80) NOT NULL DEFAULT ''," % field
            for field in item.fields.keys()
        )
        return (
            'CREATE TABLE if not exists ' + tablename
            + ' ( id int(10) unsigned NOT NULL AUTO_INCREMENT,'
            + columns
            + ' PRIMARY KEY (id));'
        )

    def _dropdatabasesql(self, db):
        """Return the DROP DATABASE statement for *db*."""
        return 'DROP DATABASE if exists ' + db

    def _createinsertSql(self, tablename, data):
        """Return a parameterized INSERT statement for *data*'s columns.

        The statement holds one ``%s`` placeholder per key of *data*, in
        ``data.keys()`` order. The values must be passed separately to
        ``cursor.execute`` so the driver escapes them; scraped text that
        contains quotes would otherwise break (or inject into) the SQL.
        """
        columns = list(data.keys())
        placeholders = ",".join(["%s"] * len(columns))
        return (
            "INSERT INTO " + tablename
            + " (" + ",".join(columns) + ")"
            + " VALUES (" + placeholders + ")"
        )

    # ------------------------------------------------------------------
    # Private helpers: connection handling
    # ------------------------------------------------------------------
    def _report_error(self, error):
        """Print a MySQL error without assuming the shape of ``error.args``."""
        args = error.args
        if len(args) >= 2 and isinstance(args[0], int):
            print(u"Mysql Error %d: %s" % (args[0], args[1]))
        else:
            print(u"Mysql Error: %s" % (error,))

    def _connect(self):
        """Open a server connection and a cursor using ``config()``."""
        host, user, password = self.config()
        self.conn = pymysql.connect(host=host, user=user, passwd=password,
                                    port=3306, charset="utf8")
        self.cur = self.conn.cursor()

    def _init(self):
        """Drop and recreate the database, then create every table.

        Errors reported by MySQL are printed, not raised.
        """
        try:
            self._connect()
            # Start from a clean database on every run.
            self.cur.execute(self._dropdatabasesql(self.dbName))
            self.cur.execute(self._createdbsql(self.dbName))
            self.conn.select_db(self.dbName)
            # Build the tables from the declared Items.
            for tablename, item in self.table_Info().items():
                self.cur.execute(self._createtablesql(tablename, item))
        except pymysql.Error as e:
            self._report_error(e)

    def _open(self):
        """Connect and select the database.

        On failure the error is printed and ``self.conn`` / ``self.cur``
        are left as None.
        """
        try:
            self._connect()
            self.conn.select_db(self.dbName)
        except pymysql.Error as e:
            self._report_error(e)
            self._close()

    def _close(self):
        """Close cursor and connection if they exist, then reset both to None.

        Safe to call when the connection was never opened.
        """
        try:
            if self.cur is not None:
                self.cur.close()
        finally:
            try:
                if self.conn is not None:
                    self.conn.close()
            finally:
                self.conn = None
                self.cur = None

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------
    def insert(self, tbName, item):
        """Insert *item* as one row into table *tbName*.

        Args:
            tbName: name of the target table.
            item: dict-like object mapping column name -> value.

        MySQL errors are printed, not raised. The connection is always
        closed afterwards, including when the statement fails.
        """
        self.tbCurTable = tbName
        try:
            self._open()
            if self.cur is None:
                # Connecting failed; _open has already reported the error.
                return
            sql = self._createinsertSql(tbName, item)
            # Same key order as the column list built by _createinsertSql.
            params = [item[column] for column in item.keys()]
            self.cur.execute(sql, params)
            self.conn.commit()
        except pymysql.Error as e:
            self._report_error(e)
        finally:
            self._close()


class Mp4BaSql(SqlBase):
    """SqlBase configured with the two tables used by the mp4ba spider."""

    def table_Info(self):
        """Map each table name to the Item whose fields define its columns."""
        return {
            'tbMovieItem': MovieItem(),
            'tbMovieDetailItem': MovieDetailItem(),
        }
0x01 问题思考
由于目前代码里 proxy.py 中的代理很多已经失效,所以考虑再写一个 Scrapy 爬虫去抓取代理信息。代码:https://git.oschina.net/xuqi1987/scrapy.git
相关文章推荐
- Python动态类型的学习---引用的理解
- Python3写爬虫(四)多线程实现数据爬取
- 垃圾邮件过滤器 python简单实现
- 下载并遍历 names.txt 文件,输出长度最长的回文人名。
- install and upgrade scrapy
- install scrapy with pip and easy_install
- Scrapy的架构介绍
- Centos6 编译安装Python
- 使用Python生成Excel格式的图片
- 让Python文件也可以当bat文件运行
- [Python]推算数独
- Python中zip()函数用法举例
- Python中map()函数浅析
- Python将excel导入到mysql中
- Python在CAM软件Genesis2000中的应用
- 使用Shiboken为C++和Qt库创建Python绑定