
(Python crawler) Common Selenium operations, with a Lagou (拉勾网) scraping example

2019-03-09 22:35

A small crawler exercise. I had not used Selenium for a while, so this post revisits it with Lagou (拉勾网) as the example. It is a fairly detailed summary of everyday Selenium operations, in four parts: the first three cover common operations, and the last gives the complete code that scrapes Lagou job listings and stores them, in a simple way, in Redis.
Without further ado: the main goal this time is to practice Selenium, so I give a quick walkthrough of its common operations and skip the installation steps.
This run uses Selenium with chromedriver, and locates elements with XPath, which is simple and easy to use.

  1. First, the imports. These are the usual imports when working with webdriver; the extra imports for adding a proxy come later.
# the usual imports
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
  2. Create the driver. One note here: I had never tried adding a proxy to Selenium before; after reading the source and some searching, I give my recommended way of switching IPs dynamically below.
# create the driver the usual way
option = webdriver.ChromeOptions()
self.driver = webdriver.Chrome(
    # executable_path is the driver path; download the matching driver from the official site
    executable_path='./chromedriver',
    chrome_options=option
)
self.timeout = 10
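
A small optional extra, not part of the original setup: ChromeOptions accepts raw Chrome flags via add_argument, so if you do not need a visible window you can run headless (a sketch; some sites detect headless Chrome, so test against your target first):

# optional: run Chrome without a visible window
option.add_argument('--headless')
option.add_argument('--disable-gpu')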

2.1 Adding a proxy to Selenium needs a couple of extra imports; I recommend reading the source here, it is easy to follow.

# add a proxy to Selenium
from selenium.webdriver.common.proxy import Proxy
from selenium.webdriver.common.proxy import ProxyType
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

option = webdriver.ChromeOptions()
proxy = Proxy(
    {
        'proxyType': ProxyType.MANUAL,
        'httpProxy': 'ip:port'
    }
)
# if you use Firefox or another browser, pick the matching capabilities
desired_capabilities = DesiredCapabilities.CHROME.copy()
proxy.add_to_capabilities(desired_capabilities)
self.driver = webdriver.Chrome(
    executable_path='./chromedriver',
    chrome_options=option,
    desired_capabilities=desired_capabilities
)
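
If one fixed proxy for the whole session is enough, a simpler alternative to the Proxy/DesiredCapabilities route above is to pass it straight to Chrome as a command-line argument (a sketch; 'ip:port' is a placeholder as before):

# set the proxy via a Chrome flag instead of capabilities
option.add_argument('--proxy-server=http://ip:port')
self.driver = webdriver.Chrome(
    executable_path='./chromedriver',
    chrome_options=option
)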

2.2 To switch the proxy again later, wrap it in a method:

def change_proxy(self):
    proxy = Proxy(
        {
            'proxyType': ProxyType.MANUAL,
            'httpProxy': 'ip:port'
        }
    )
    desired_capabilities = DesiredCapabilities.CHROME.copy()
    proxy.add_to_capabilities(desired_capabilities)
    self.driver.start_session(desired_capabilities)
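
One caveat worth flagging (my reading of the webdriver source, so verify against your Selenium version): start_session negotiates a brand-new session with the browser, so cookies and open windows from the old session do not carry over. A typical call site looks like this sketch:

# self.driver.get(url)      # request failed or got blocked
# self.change_proxy()       # fresh session on a new exit IP
# self.driver.get(url)      # retry through the new proxy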
  3. Give the driver a URL and open it in the browser. Since this is Selenium practice, we start from the Baidu page. Below we use XPath to type into the search box, click the search button, show the search results, switch windows, and finally collect the data. Note the return value of WebDriverWait(self.driver, self.timeout).until: its type is WebElement, and a few of its commonly used methods appear here.
    3.1 Type into the search box. WebElement.send_keys(*value) sends a string to an input; passing an absolute file path can also upload a file (see the sketch after this code block).
url = 'https://www.baidu.com/'
self.driver.get(url)
# print(self.driver.title)
# type the search keyword
search_content = WebDriverWait(self.driver, self.timeout).until(
    lambda a: a.find_element_by_xpath('//input[@id="kw"]')
)
# kw = input('Enter a search keyword: ')
search_content.send_keys('拉勾网')
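
As mentioned above, send_keys can also drive a file upload: sending an absolute path to a file input fills it in without opening a file dialog. A minimal sketch (the XPath and path are placeholders, not from the Baidu page):

# upload by sending an absolute path to an <input type="file"> element
upload = self.driver.find_element_by_xpath('//input[@type="file"]')
upload.send_keys('/absolute/path/to/file.png')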

3.2 Click the search button. WebElement.click() simulates a click.

# click search
search_button = WebDriverWait(self.driver, self.timeout).until(
    lambda b: b.find_element_by_xpath('//input[@id="su"]')
)
search_button.click()

3.3 Show the search results: print each result title to have a look. WebElement.text returns the element's text content.

# inspect the search results
search_result = WebDriverWait(self.driver, self.timeout).until(
    lambda c: c.find_elements_by_xpath('//h3[contains(@class,"t")]/a[1]')
)
for item in search_result:
    print(item.text)
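
The lambda style above works fine, but Selenium also ships ready-made wait conditions in expected_conditions; a sketch of the same wait written that way (same XPath, same timeout):

from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC

search_result = WebDriverWait(self.driver, self.timeout).until(
    # resolves once at least one matching element is in the DOM
    EC.presence_of_all_elements_located((By.XPATH, '//h3[contains(@class,"t")]/a[1]'))
)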

3.4 Switch windows, shown here wrapped in a method.

def change_window(self):
    windowhandles = self.driver.window_handles
    self.driver.switch_to.window(windowhandles[-1])
    # print(self.driver.title)
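
The same handle list also lets you go back: handle [0] is the window the session started with, so switching back is symmetric (a minimal sketch, assuming the first window is still open):

def back_to_first_window(self):
    # switch back to the window the session started with
    self.driver.switch_to.window(self.driver.window_handles[0])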

A few other WebElement members also come up a lot; the WebElement API reference covers them fairly completely. A quick sampler follows.
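
For quick reference, the members I reach for most often (standard WebElement API, demonstrated on the Baidu search box):

el = self.driver.find_element_by_xpath('//input[@id="kw"]')
el.send_keys('text')             # type into the element
print(el.text)                   # visible text content
print(el.get_attribute('id'))    # read an HTML attribute
print(el.is_displayed())         # is it visible on the page?
el.clear()                       # empty an input
el.click()                       # click it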
4. Finally, the complete code for this Lagou crawl.

from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.proxy import Proxy
from selenium.webdriver.common.proxy import ProxyType
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import time
import redis

class LaSpider(object):
    def __init__(self):
        # create the driver the usual way
        option = webdriver.ChromeOptions()
        self.driver = webdriver.Chrome(
            # executable_path is the driver path; download the matching driver from the official site
            executable_path='./chromedriver',
            chrome_options=option
        )
        '''
        # add a proxy to Selenium
        option = webdriver.ChromeOptions()
        proxy = Proxy(
            {
                'proxyType': ProxyType.MANUAL,
                'httpProxy': 'ip:port'
            }
        )
        # if you use Firefox or another browser, pick the matching capabilities
        desired_capabilities = DesiredCapabilities.CHROME.copy()
        proxy.add_to_capabilities(desired_capabilities)
        driver = webdriver.Chrome(
            executable_path='./chromedriver',
            chrome_options=option,
            desired_capabilities=desired_capabilities
        )
        '''
        self.timeout = 10
        self.positions = []
        self.page = 1
    def change_proxy(self):
        proxy = Proxy(
            {
                'proxyType': ProxyType.MANUAL,
                'httpProxy': 'ip:port'
            }
        )
        desired_capabilities = DesiredCapabilities.CHROME.copy()
        proxy.add_to_capabilities(desired_capabilities)
        self.driver.start_session(desired_capabilities)
    def baidu_search(self):
        url = 'https://www.baidu.com/'
        self.driver.get(url)
        # print(self.driver.title)
        # type the search keyword
        search_content = WebDriverWait(self.driver, self.timeout).until(
            lambda a: a.find_element_by_xpath('//input[@id="kw"]')
        )
        # kw = input('Enter a search keyword: ')
        search_content.send_keys('拉勾网')
        time.sleep(4)
        # click search
        search_button = WebDriverWait(self.driver, self.timeout).until(
            lambda b: b.find_element_by_xpath('//input[@id="su"]')
        )
        search_button.click()
        time.sleep(5)
        # show the search results
        search_result = WebDriverWait(self.driver, self.timeout).until(
            lambda c: c.find_elements_by_xpath('//h3[contains(@class,"t")]/a[1]')
        )
        # print(type(search_result))
        # for item in search_result:
        #     print(item.text)
        #     print(type(item))
        #     if '官网' in item.text:
        #         time.sleep(5)
        #         item.click()
        #         return
        time.sleep(3)
        # pick the first result entry and click through
        result = WebDriverWait(self.driver, self.timeout).until(
            lambda d: d.find_element_by_xpath('//div[@id="1"]/h3/a')
        )
        result.click()
        time.sleep(10)
    def change_window(self):
        driver_windows = self.driver.window_handles
        self.driver.switch_to.window(driver_windows[-1])
        # print(self.driver.title)
        time.sleep(3)
    def la_choose(self):
        # pick the region
        address_choose = WebDriverWait(self.driver, self.timeout).until(
            lambda a: a.find_element_by_xpath('//div[@id="changeCityBox"]/ul[@class="clearfix"]/li[1]/a')
        )
        address_choose.click()
        time.sleep(4)
        # type the keyword into the search box
        search_content = WebDriverWait(self.driver, self.timeout).until(
            lambda b: b.find_element_by_xpath('//input[@id="search_input"]')
        )
        # kw = input('Enter a search keyword: ')
        search_content.send_keys('python')
        time.sleep(4)
        # click search
        search_button = WebDriverWait(self.driver, self.timeout).until(
            lambda c: c.find_element_by_xpath('//input[@id="search_button"]')
        )
        search_button.click()
        time.sleep(10)
    def fetch_content(self):
        position_list = WebDriverWait(self.driver, self.timeout).until(
            lambda a: a.find_elements_by_xpath('//div[@id="s_position_list"]/ul/li')
        )
        # print(len(position_list))
        for position in position_list:
            position_dic = {}
            try:
                # each <li> carries the job info in data-* attributes
                position_dic['name'] = position.get_attribute('data-positionname')
                position_dic['company'] = position.get_attribute('data-company')
                position_dic['salary'] = position.get_attribute('data-salary')
                # detail page, e.g. https://www.lagou.com/jobs/5670619.html
                positionid = position.get_attribute('data-positionid')
                position_dic['dt_url'] = 'https://www.lagou.com/jobs/{}.html'.format(positionid)
            except:
                pass
            self.positions.append(position_dic)
        time.sleep(10)
    def change_page(self):
        try:
            # note: the trailing space inside the class value is deliberate, keep the XPath as-is
            next_page = WebDriverWait(self.driver, self.timeout).until(
                lambda a: a.find_element_by_xpath('//span[@class="pager_next "]')
            )
            next_page.click()
            self.page += 1
        except:
            self.page = 0
            print('Scrape finished.')
    def save_data_redis(self):
        r_client = redis.StrictRedis(host='127.0.0.1', port=6379, db=2)
        key = 'LGpositions'
        try:
            for job in self.positions:
                r_client.sadd(key, str(job))
        except:
            pass

    def run(self):
        self.baidu_search()
        self.change_window()
        self.la_choose()
        while self.page:
            self.fetch_content()
            # Lagou shows 15 results per page; only page on when a full page was collected
            if len(self.positions) == 15 * self.page:
                self.change_page()
            time.sleep(5)
        self.save_data_redis()
        self.driver.close()

if __name__ == '__main__':
    LaSpider().run()
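
To sanity-check what was stored, the set can be read back with smembers; a minimal sketch against the same database (decode_responses=True returns str instead of bytes):

import redis

r_client = redis.StrictRedis(host='127.0.0.1', port=6379, db=2, decode_responses=True)
for job in r_client.smembers('LGpositions'):
    print(job)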
