您的位置：首页 > 编程语言 > Python开发

Scrapy学习笔记（二）

2015-08-17 23:30 537 查看

抓取Mp4ba电影信息

0x00 创建项目和Spider

scrapy startproject movieproject
scrapy genspider -t crawl mp4ba mp4ba.com

大致结构

├── movieproject
│   ├── agents.py
│   ├── __init__.py
│   ├── items.py
│   ├── log.py
│   ├── middlewares.py
│   ├── mysql.py
│   ├── pipelines.py
│   ├── proxy.py
│   ├── settings.py
│   └── spiders
│       ├── __init__.py
│       └── mp4ba.py
└── scrapy.cfg

mp4ba.py中的代码

# -*- coding: utf-8 -*-

class Mp4baSpider(CrawlSpider):
    """Crawl mp4ba.com listing pages and movie detail pages.

    Links matching ``index.php?...page=N`` are followed and handed to
    :meth:`parse_page`; links matching ``show.php?hash=...`` are handed to
    :meth:`parse_detail` and not followed further.
    """

    name = "mp4ba"
    allowed_domains = ["mp4ba.com"]
    start_urls = (
        u'http://www.mp4ba.com/',
    )

    rules = (
        # The stray backslash in front of "index" was dropped ("\i" is not a
        # valid regex escape) and the literal dots are now escaped.
        Rule(LinkExtractor(allow=r'index\.php\?.*page=\d*'),
             callback='parse_page', follow=True),
        Rule(LinkExtractor(allow=r'show\.php\?hash=.*'),
             callback='parse_detail', follow=False),
    )

    def parse_page(self, response):
        """Build one ``MovieItem`` per row of the listing table.

        Args:
            response: the downloaded listing page.

        Returns:
            A list of ``MovieItem`` objects, one per
            ``tr[@class='alt1']`` row under ``tbody#data_list``.  A field
            whose XPath matches nothing is stored as an empty string.
        """
        items = []
        for row in response.xpath("//tbody[@id='data_list']/tr[@class='alt1']"):
            item = MovieItem()
            item['publish_time'] = u''.join(row.xpath("td[1]/text()").extract())
            item['category'] = u''.join(row.xpath("td[2]/a[@href]/text()").extract())
            item['name'] = u''.join(row.xpath("td[3]/a[@href]/text()").extract()).strip()
            item['size'] = u''.join(row.xpath("td[4]/text()").extract())
            item['download_count'] = u''.join(
                row.xpath("td[@nowrap]/span[@class='btl_1']/text()").extract())
            item['detail_link'] = u''.join(row.xpath("td[3]/a[@href]/@href").extract())
            # NOTE(review): the key spelling is kept as-is; it presumably has
            # to match the field declared in items.py (not visible here).
            item['torrert_count'] = u''.join(
                row.xpath("td[@nowrap]/span[@class='bts_1']/text()").extract())
            items.append(item)
        return items

    def parse_detail(self, response):
        """Extract the download and magnet links from a detail page.

        Args:
            response: the downloaded detail page.

        Returns:
            A ``MovieDetailItem`` holding the page URL and both links.

        Raises:
            IndexError: if the page has no download or magnet anchor.
        """
        item = MovieDetailItem()
        item['detail_link'] = response.url
        item['download_link'] = response.xpath(
            "//p[@class='original download']/a[@id='download']/@href").extract()[0]
        item['magnet_link'] = response.xpath(
            "//p[@class='original magnet']/a[@id='magnet']/@href").extract()[0]
        return item

pipelines.py

class MovieSavePipeline(object):
    """Item pipeline that prints each item and stores it through ``Mp4BaSql``."""

    def __init__(self):
        # Number of items processed so far; printed as a running index.
        self.linecount = 0
        # Database helper object used for the inserts.
        self.db = Mp4BaSql()

    def process_item(self, item, spider):
        """Store *item* in the table that matches its type.

        Args:
            item: the scraped item.
            spider: the spider that produced the item (unused).

        Returns:
            The same *item*, unchanged, whether or not it was stored.
        """
        self.linecount += 1

        # Exact type checks are kept on purpose: the item classes live in
        # items.py and their inheritance relationship is not visible here.
        if type(item) is MovieItem:
            print("%d.\t%s" % (self.linecount, item['name']))
            self.db.insert('tbMovieItem', item)
        elif type(item) is MovieDetailItem:
            print(item['download_link'])
            self.db.insert('tbMovieDetailItem', item)
        return item

mysql.py

#!/usr/bin/python
#coding=utf-8
#

import pymysql
from movieproject.items import MovieItem,MovieDetailItem

class SqlBase(object):
    """Small MySQL helper that keeps one table per item type.

    Subclasses customise it by overriding :meth:`database_Name`,
    :meth:`config` and :meth:`table_Info`.

    Constructing an instance connects to the server, DROPS the database
    named by :meth:`database_Name`, recreates it together with the tables
    from :meth:`table_Info`, and disconnects again.  Each :meth:`insert`
    opens and closes its own connection.
    """

    def __init__(self):
        self.dbName = self.database_Name()
        self.tbCurTable = None  # table name passed to the latest insert()
        self.conn = None
        self.cur = None

        self._init()
        self._close()

    # overridable hooks
    def database_Name(self):
        """Return the name of the database to (re)create and use."""
        return "dbTest"

    def config(self):
        """Return the connection settings as ``(host, user, password)``."""
        # NOTE(review): credentials are hard-coded here; override this hook
        # rather than editing the literal.
        return ('localhost', 'root', 'Xq123456')

    def table_Info(self):
        """Return a mapping ``{table name: item instance}``; empty by default."""
        return {}

    # private functions: SQL strings
    # The tables are created from the declared Items.
    def _createdbsql(self, databasename):
        """Return the ``CREATE DATABASE`` statement for *databasename*."""
        return "CREATE database if not exists %s character set utf8;" % databasename

    def _createtablesql(self, tablename, item):
        """Return the ``CREATE TABLE`` statement for *item*.

        Args:
            tablename: name of the table to create.
            item: object whose ``fields`` mapping supplies the column names.

        Returns:
            SQL creating an auto-increment ``id`` primary key plus one
            ``varchar(80)`` column per field.
        """
        # NOTE(review): 80 characters looks short for download/magnet links;
        # confirm against real data before relying on this width.
        columns = "".join(
            "%s varchar(80) NOT NULL DEFAULT ''," % field
            for field in item.fields.keys()
        )
        return ('CREATE TABLE if not exists ' + tablename
                + ' ( id int(10) unsigned NOT NULL AUTO_INCREMENT,'
                + columns + ' PRIMARY KEY (id));')

    def _dropdatabasesql(self, db):
        """Return the ``DROP DATABASE`` statement for *db*."""
        return 'DROP DATABASE if exists ' + db

    def _createinsertSql(self, tablename, data):
        """Return a parameterized ``INSERT`` statement for *data*.

        The statement holds one ``%s`` placeholder per key of *data*; the
        values must be passed separately to ``cursor.execute`` so the driver
        escapes them.  Splicing the scraped values into the SQL text broke
        on any value containing a quote.

        Args:
            tablename: target table.
            data: mapping of column name to value.
        """
        columns = list(data.keys())
        placeholders = ",".join(["%s"] * len(columns))
        return ("INSERT INTO " + tablename + " (" + ",".join(columns)
                + ") VALUES (" + placeholders + ")")

    # private functions: connection handling
    def _report(self, e):
        """Print a database error without failing on unusual ``args``."""
        if len(e.args) >= 2:
            print(u"Mysql Error %d: %s" % (e.args[0], e.args[1]))
        else:
            print(u"Mysql Error: %s" % (e,))

    def _connect(self):
        """Open a connection and cursor using the values from :meth:`config`."""
        cfg = self.config()
        self.conn = pymysql.connect(host=cfg[0], user=cfg[1], passwd=cfg[2],
                                    port=3306, charset="utf8")
        self.cur = self.conn.cursor()

    def _init(self):
        """Drop and recreate the database and its tables.

        Database errors are printed, not raised; the connection (if any) is
        left open for the caller to close.
        """
        try:
            self._connect()

            # Start from a clean database on every construction.
            self.cur.execute(self._dropdatabasesql(self.dbName))
            self.cur.execute(self._createdbsql(self.dbName))
            self.conn.select_db(self.dbName)

            for name, item in self.table_Info().items():
                self.cur.execute(self._createtablesql(name, item))
        except pymysql.Error as e:
            self._report(e)

    def _open(self):
        """Connect and select the database; on error print it and disconnect."""
        try:
            self._connect()
            self.conn.select_db(self.dbName)
        except pymysql.Error as e:
            self._report(e)
            self._close()

    def _close(self):
        """Close cursor and connection if present and reset both to ``None``."""
        try:
            # Either may still be None when connecting failed.
            if self.cur is not None:
                self.cur.close()
            if self.conn is not None:
                self.conn.close()
        finally:
            self.conn = None
            self.cur = None

    # insert
    def insert(self, tbName, item):
        """Insert *item* as one row of table *tbName*.

        Args:
            tbName: target table name.
            item: mapping of column name to value.

        Database errors are printed, not raised.  The connection opened for
        the insert is always closed again, including after a failure.
        """
        self.tbCurTable = tbName
        self._open()
        if self.cur is None:
            # _open() failed and has already reported the error.
            return
        try:
            self.cur.execute(self._createinsertSql(tbName, item),
                             tuple(item.values()))
            self.conn.commit()
        except pymysql.Error as e:
            self._report(e)
        finally:
            self._close()

class Mp4BaSql(SqlBase):
    """``SqlBase`` specialisation that declares the two movie tables."""

    def table_Info(self):
        """Return the table-name to item-instance mapping for this database."""
        return {
            'tbMovieItem': MovieItem(),
            'tbMovieDetailItem': MovieDetailItem(),
        }

0x01 问题思考

由于目前代码中 proxy.py 里的代理很多已经失效，所以考虑再写一个 Scrapy 爬虫去抓取可用的代理信息。

代码：https://git.oschina.net/xuqi1987/scrapy.git

内容来自用户分享和网络整理，不保证内容的准确性，如有侵权内容，可联系管理员处理

标签： python scrapy

相关文章推荐

新的分享

章节导航