
Python Crawler Learning (Part 2): Implementing the Crawler

2016-11-12 20:02
The previous post worked out the complete path for obtaining the data; this one walks through the corresponding code.

Note: the code explanations and my own takeaways are all kept in the code comments.

The program is made up of the following classes:

Class Car: the car model; stores the information of a single car

Class CarFactory: takes the fetched network data and produces Car models

Class CarSpider: the main crawler class that ties the whole workflow together

Class DataSaver: handles the database operations and data persistence

Class RequestThread: introduced later, when I moved the requests into multiple threads

The code, with comments:

# The CarSpider object is responsible for scraping the data
spider = CarSpider()
cars = spider.getDatas()

# The DataSaver object is responsible for writing to the database
dataSaver = DataSaver()
dataSaver.updateCarsData(cars)
print "finish"
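The post never shows the import block, so for reference, this is the set of standard-library modules the snippets below rely on (the code is Python 2, where urllib2, StringIO and Queue still exist under these names):

# Standard-library imports implied by the code below (Python 2)
import urllib2     # HTTP requests
import gzip        # gzip decompression of compressed responses
import StringIO    # wraps the compressed bytes so GzipFile can read them
import re          # regular expressions for the HTML/JS parsing
import json        # parsing the 'var config = {...}' payload
import threading   # RequestThread
import sqlite3     # DataSaver
from Queue import Queue   # thread-safe queue shared between threads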

CarSpider:

class CarSpider:

    def __init__(self):
        # Instance attributes should be initialized in __init__
        self.allCars = []

    # The public entry point
    def getDatas(self):
        # Get the list of car series grouped by first letter
        carList = self.__getCarSeriesListByInitialChar()
        # Extract the home-page url of each car series from the list pages
        carUrlList = self.__getAllCarSeriesIndexUrl(carList)
        # Build the url of each series' configuration-detail page
        urlQueue = self.__getCarSeriesInfoUrls(carUrlList)
        # Fetch the detail data of every car
        cars = self.__getCarsBycarSeriesInfoUrls(urlQueue)
        return cars

    # gzip decompression (some pages are served gzip-compressed)
    # Private method: the name starts with __
    def __gzipDecode(self, response):
        if response.info().get('Content-Encoding') == 'gzip':
            compressedstream = StringIO.StringIO(response.read())
            gziper = gzip.GzipFile(fileobj=compressedstream)
            result = gziper.read()
            return result
        else:
            return response.read()

    # Get the list of car series grouped by first letter
    def __getCarSeriesListByInitialChar(self):
        dataList = []
        for i in range(65, 91):
            url = 'http://www.autohome.com.cn/grade/carhtml/%c.html' % (chr(i))
            try:
                response = urllib2.urlopen(url)
            except urllib2.HTTPError, e:
                print e.reason
                continue  # skip this letter if the request failed
            zipDecodeData = self.__gzipDecode(response)
            data = zipDecodeData.decode("gbk")  # the response contains Chinese text, so it has to be decoded
            dataList.append(data)
        return dataList

    # Extract the home-page url of each car series from the list pages
    def __getAllCarSeriesIndexUrl(self, dataList):
        carInfoUrlList = []
        # Note the capture group () in the pattern (see the short regex demo after this class)
        pattern = re.compile(r'<h4.*><a href="(.*)">.*</a></h4>', re.M)
        for data in dataList:
            carList = re.findall(pattern, data)
            for carUrl in carList:
                if carUrl.find('greylink') == -1:  # 'greylink' means the series data is incomplete or stale, so discard it
                    carInfoUrlList.append(carUrl)
        return carInfoUrlList

    # Build the url of each series' configuration-detail page
    def __getCarSeriesInfoUrls(self, carUrlList):
        urlQueue = Queue()
        for carUrl in carUrlList:
            # Extract the series id from the series home-page url
            pattern = re.compile(r'[0-9]+', re.M)
            carId = re.search(pattern, carUrl).group()
            # Build the config-detail url according to the pattern worked out in the previous post
            carSeriesInfoUrl = 'http://car.autohome.com.cn/config/series/%d.html' % (int(carId))
            # print carSeriesInfoUrl
            urlQueue.put(carSeriesInfoUrl)
        return urlQueue

    # Fetch the detail data of every car
    def __getCarsBycarSeriesInfoUrls(self, urlQueue):
        # Queue is thread-safe and supports blocking
        outputDataQueue = Queue()
        threads = []
        for i in range(0, 4):
            # I looked this up: because of the GIL, Python threads do not run bytecode in parallel, so putting
            # computation in threads gains little; only the (I/O-bound) requests are moved into threads
            requestThread = RequestThread(i, urlQueue, outputDataQueue)
            threads.append(requestThread)
            # Do not join() here: the loop runs in the current thread, so joining here would block the next iteration
            # requestThread.join()
            requestThread.start()

        for requestThread in threads:
            requestThread.join()

        while outputDataQueue.qsize() != 0:
            # get() returns the (data, url) tuple that RequestThread put; unpack it in a single call
            outputData, seriesInfoUrl = outputDataQueue.get()
            data = outputData.decode("gbk")
            pattern = re.compile(r'var config = ({.*};)', re.M)
            result = re.findall(pattern, data)
            if len(result) > 0:
                infoJsonStr = result[0][0:-1]  # drop the trailing ';'
                # Parse the json
                infoDict = json.loads(infoJsonStr)
                carFactory = CarFactory(infoDict, seriesInfoUrl)
                # CarFactory builds Car models from infoDict
                cars = carFactory.analysisData()
                for car in cars:
                    self.allCars.append(car)
            # After consuming an item taken with get(), call task_done() so the queue can release its block
            outputDataQueue.task_done()
        return self.allCars
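A quick, self-contained illustration of the capture-group idea used above: with a group in the pattern, re.findall returns only the text inside the parentheses rather than the whole match. The HTML and script fragments here are made-up stand-ins, not real autohome pages:

import re

# A made-up fragment shaped like the series list page
html = '<h4 class="red"><a href="/spec/123/">Some Car</a></h4>'
# findall returns the content of the capture group (the href), not the whole <h4> tag
print re.findall(r'<h4.*><a href="(.*)">.*</a></h4>', html)
# ['/spec/123/']

# A made-up fragment shaped like the config page's inline script
script = 'var config = {"result": {"specsList": []}};'
m = re.findall(r'var config = ({.*};)', script)
print m[0][0:-1]   # strip the trailing ';' before json.loads
# {"result": {"specsList": []}}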


CarFactory:

class CarFactory:
    def __init__(self, carsData, carSeriesUrl):
        self.carsData = carsData
        self.carSeriesUrl = carSeriesUrl
        self.cars = []
        self.carsNum = 0

    def __setCarsParam(self, param, values):
        for i in range(self.carsNum):
            car = self.cars[i]
            value = values[i]["value"]
            # Reflection: set the attribute by name (see the short setattr demo after this class)
            setattr(car, param, value)

    def analysisData(self):
        # One car series contains several specs (trims)
        specsList = self.carsData["result"]["specsList"]
        for spec in specsList:
            car = Car()
            car.specid = spec["specid"]
            car.infoUrl = self.carSeriesUrl
            self.cars.append(car)
        self.carsNum = len(self.cars)

        paramTypeItems = self.carsData["result"]["paramtypeitems"]
        for paramTypeItem in paramTypeItems:
            paramTypeName = paramTypeItem["name"]
            paramItems = paramTypeItem["paramitems"]
            if paramTypeName == u"基本参数":  # "basic parameters"
                for param in paramItems:
                    paramName = param["name"]
                    values = param["valueitems"]
                    if paramName == u"厂商":  # manufacturer
                        self.__setCarsParam("family", values)
                    .......

            elif paramTypeName == u"发动机":  # "engine"
                for param in paramItems:
                    paramName = param["name"]
                    values = param["valueitems"]
                    if paramName == u"排量(L)":  # displacement (L)
                        self.__setCarsParam("sv", values)
                    elif paramName == u"最大马力(Ps)":  # max horsepower (Ps)
                        self.__setCarsParam("hpower", values)
                    ..........

            ..........

        ..........

        return self.cars
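The "reflection" used in __setCarsParam is simply setattr: assigning to an attribute whose name is only known at runtime as a string. A minimal sketch (the Point class and its attributes are invented purely for this demo):

class Point:
    def __init__(self):
        self.x = 0
        self.y = 0

p = Point()
# Attribute names arrive as strings (e.g. parsed out of JSON), so p.x = 10 cannot be written literally
for name, value in [("x", 10), ("y", 20)]:
    setattr(p, name, value)   # equivalent to p.x = 10 and p.y = 20
print p.x, p.y                # 10 20
# getattr(p, "x") is the matching read operation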

Car:

# One way to implement an enum in Python
class FuelType:
    Gasoline = 0
    Diesel = 1

class GearType:
    MT = 0
    AUTO = 1
    DCT = 2
    CVT = 3
    ........
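As an aside, Python 3.4+ ships a standard enum module (available on Python 2 via the enum34 backport), which is the more conventional way to write such constants today. The rest of the code here does not use it; this is only a sketch:

# Requires Python 3.4+ or the enum34 backport on Python 2
from enum import Enum

class FuelType(Enum):
    Gasoline = 0
    Diesel = 1

print FuelType.Gasoline          # FuelType.Gasoline
print FuelType.Gasoline.value    # 0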

class Car:

    def __init__(self):
        # Instance variables should be defined in __init__
        # Assigning self.xxx here also triggers __setattr__
        self.specid = 0L  # id, a Python 2 long
        self.family = None  # manufacturer
        self.name = None  # model
        self.price = u"0万"  # price, in units of 10,000 yuan (万)
        self.level = None  # class/segment
        self.maxSpeed = 0  # top speed
        ..........

    # CarFactory assigns attributes via reflection, and Python attributes have no per-attribute setter by
    # default, so all the conversion logic ends up in __setattr__
    def __setattr__(self, key, value):
        if key == 'specid':
            self.__dict__[key] = int(value)
        elif key == 'isTurbo':
            if value.find(u'自然') != -1:  # u'自然' as in 自然吸气, naturally aspirated
                # __dict__ is a dict whose keys are the attribute names and whose values are the attribute values
                self.__dict__[key] = False
            else:
                self.__dict__[key] = True
        elif key == 'price':
            # Python quantifiers are greedy by default (a few languages default to non-greedy): they match as many
            # characters as possible, while non-greedy quantifiers match as few as possible. Appending ? to
            # "*", "?", "+" or "{m,n}" makes it non-greedy. (See the short demo after this class.)
            pattern = re.compile(ur'(.*?)万.*', re.M)
            result = re.findall(pattern, value)
            if len(result) > 0:
                self.__dict__[key] = float(result[0])
            else:
                self.__dict__[key] = float(0)
        elif key == 'fuleType':
            if value.find(u'汽油') != -1:  # u'汽油' = gasoline
                self.__dict__[key] = FuelType.Gasoline
            else:
                self.__dict__[key] = FuelType.Diesel
        elif key == 'gearType':
            pattern = re.compile(ur'手动|MT', re.M)  # u'手动' = manual
            result = re.search(pattern, value)
            if result != None:
                self.__dict__[key] = GearType.MT
            else:
                pattern = re.compile(ur'双离合|DSG|DCT|PDK|tronic|MDKG|power shift', re.M)  # u'双离合' = dual-clutch
                result = re.search(pattern, value)
                if result != None:
                    self.__dict__[key] = GearType.DCT
                else:
                    pattern = re.compile(ur'无极|CVT', re.M)  # continuously variable
                    result = re.search(pattern, value)
                    if result != None:
                        self.__dict__[key] = GearType.CVT
                    else:
                        self.__dict__[key] = GearType.AUTO
        ..........
        else:
            try:
                number = float(value)
                self.__dict__[key] = number
            except StandardError:
                self.__dict__[key] = value
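To make the greedy vs. non-greedy remark in the price branch concrete, here is a small standalone check; the sample price string is made up so that it contains two 万 and the difference becomes visible:

# -*- coding: utf-8 -*-
import re

s = u'12.88万-15.98万'   # made-up sample with two 万

# Greedy: (.*) grabs as much as possible, up to the LAST 万
print re.findall(ur'(.*)万.*', s)[0]    # 12.88万-15.98

# Non-greedy: (.*?) stops at the FIRST 万, which is what Car.price needs before float()
print re.findall(ur'(.*?)万.*', s)[0]   # 12.88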


RequestThread:


class RequestThread(threading.Thread):

    def __init__(self, threadId, queue, outputQueue):
        threading.Thread.__init__(self)
        self.threadId = threadId
        self.queue = queue
        self.outputQueue = outputQueue

    def run(self):
        # Take one unprocessed url from the url queue
        while self.queue.qsize() != 0:
            url = self.queue.get()
            self.queue.task_done()  # get() paired with task_done() tells the queue the item has been taken, so it need not block
            print "%d: url:%s %d\n" % (self.threadId, url, self.queue.qsize())
            try:
                response = urllib2.urlopen(url)
                # print id(response)  # does response repeat???
                # Besides the page data, the series url is returned as well (it is stored back into the database)
                data = self.gzipDecode(response)
                self.outputQueue.put((data, url))

            except urllib2.URLError, e:
                print e.reason

    # Same gzip decompression helper as in CarSpider; run() calls self.gzipDecode(), so the thread needs its own copy
    def gzipDecode(self, response):
        if response.info().get('Content-Encoding') == 'gzip':
            compressedstream = StringIO.StringIO(response.read())
            gziper = gzip.GzipFile(fileobj=compressedstream)
            return gziper.read()
        return response.read()
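One caveat about the loop condition in run(): qsize() followed by get() is not atomic, so with several threads draining the same queue, a thread can in principle see a non-empty queue and then block forever on get() after another thread has taken the last url. A common alternative is a non-blocking get_nowait() paired with the Queue.Empty exception; a minimal sketch under that assumption (drain and handle are invented names, not part of the posted code):

import Queue       # the Python 2 module also defines the Empty exception
import threading

def drain(url_queue, handle):
    # Pull urls until the queue is empty, without relying on qsize()
    while True:
        try:
            url = url_queue.get_nowait()   # non-blocking: raises Queue.Empty when drained
        except Queue.Empty:
            break
        try:
            handle(url)
        finally:
            url_queue.task_done()          # always balance the get with a task_done

q = Queue.Queue()
for u in ['http://example.com/a', 'http://example.com/b']:
    q.put(u)

threads = [threading.Thread(target=drain, args=(q, lambda url: None)) for _ in range(2)]
for t in threads:
    t.start()
q.join()   # unblocks once every put item has been matched by a task_done()

Note that the posted code synchronizes by joining the threads directly, so its task_done() calls are informational; Queue.join(), shown above, is the call that actually waits on task_done().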


DataSaver:

class DataSaver:

    def __init__(self):
        self.db = sqlite3.connect("CarDB.sqlite")
        print self.db

    def updateCarsData(self, cars):
        # Delete all previous rows first: crude but simple
        self.db.execute("delete from Cars")
        for car in cars:
            self.db.execute("insert into Cars (specid,name,family,price,level,maxSpeed,accelerate,sv,hpower,mpower) VALUES (?,?,?,?,?,?,?,?,?,?)",
                            [car.specid, car.name, car.family, car.price, car.level, car.maxSpeed, car.accelerate,
                             car.sv, car.hpower, car.mpower])
        self.db.commit()

        self.db.close()
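A small alternative for the insert loop, if one prefers a single call: sqlite3 connections also expose executemany, which takes the same parameterized statement plus an iterable of parameter tuples. A sketch of a drop-in body for updateCarsData, assuming the same Cars table and Car attributes as above:

rows = [(car.specid, car.name, car.family, car.price, car.level, car.maxSpeed,
         car.accelerate, car.sv, car.hpower, car.mpower) for car in cars]
self.db.execute("delete from Cars")
self.db.executemany(
    "insert into Cars (specid,name,family,price,level,maxSpeed,accelerate,sv,hpower,mpower) "
    "VALUES (?,?,?,?,?,?,?,?,?,?)", rows)
self.db.commit()
self.db.close()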


That is the main code of the crawler. I have only just started with Python, so if there are mistakes or questionable points in this post, please let me know. Thank you.