您的位置:首页 > 编程语言 > Python开发

【Python】模拟登录并抓取拉勾网信息(selenium+phantomjs)

2017-10-15 18:27 781 查看

环境

python3.5

pip install selenium

phantomjs-2.1.1

pip install pyquery

代码

# -*- coding:utf-8 -*-

import time
import sys
import io
# Re-wrap stdout with the gb18030 encoding to prevent errors when printing
# Chinese text.
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='gb18030')

from pyquery import PyQuery as pq
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

# Set the request header for PhantomJS: start from the stock PhantomJS
# capabilities and override the User-Agent string.
dcap = {
    **DesiredCapabilities.PHANTOMJS,
    "phantomjs.page.settings.userAgent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36",
}

# Module-level browser instance shared by every function below.
driver = webdriver.PhantomJS(
    executable_path=r"C:\Users\DELL\Desktop\Scrapy\phantomjs-2.1.1-windows\bin\phantomjs.exe",
    desired_capabilities=dcap,
)
driver.set_window_size(width=400, height=100)

# Simulate logging in
def login(login_url, username, password):
    """Open the login page, type the credentials and submit the form.

    Uses the module-level ``driver``. Failures are reported on stdout
    instead of being raised, so the caller continues either way.

    Args:
        login_url: URL of the login page to load.
        username: Text typed into the username input.
        password: Text typed into the password input.
    """
    print("begin login...")
    try:
        driver.get(login_url)
        driver.find_element_by_css_selector(
            ".input_item.clearfix[data-propertyname='username'] input"
        ).send_keys(username)
        driver.find_element_by_css_selector(
            ".input_item.clearfix[data-propertyname='password'] input"
        ).send_keys(password)
        driver.find_element_by_css_selector(
            ".input_item.btn_group.clearfix[data-propertyname='submit'] input"
        ).click()
    except Exception as e:
        # A bare ``except:`` would also swallow KeyboardInterrupt/SystemExit
        # and hide the cause; catch Exception only and report what went wrong.
        print("login wrong...")
        print(str(e))

# Simulate a search
def search_position(position_name):
    """Type ``position_name`` into the search box and click the search button.

    Uses the module-level ``driver`` and waits up to 10 seconds for each of
    the two elements (ids ``search_input`` and ``search_button``) to be
    present. Failures are reported on stdout instead of being raised.

    Args:
        position_name: The search keyword to submit.
    """
    print("search position {}".format(position_name))
    try:
        search_input = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.ID, "search_input"))
        )
        search_input.send_keys(position_name)
        search_btn = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.ID, "search_button"))
        )
        search_btn.click()
    except Exception as e:
        # A bare ``except:`` would also swallow KeyboardInterrupt/SystemExit
        # and hide the cause; catch Exception only and report what went wrong.
        print("search wrong...")
        print(str(e))

# Parse the result pages one by one
def parse_html():
    """Print the listed positions page by page, following the "next" control.

    For every list item on the current page this prints the ``data-company``,
    ``data-positionname`` and ``data-salary`` attributes and the href of its
    ``a.position_link``, then clicks the last ``span`` of the pager and waits
    3 seconds before parsing again.

    The original implementation recursed once per page and had no stop
    condition; this version loops instead and stops when the pager no longer
    advances. Any exception ends the crawl and is printed to stdout.
    """
    previous_links = None
    try:
        while True:
            print("begin parse html...")
            next_page_label = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, ".item_con_pager .pager_container span:last-child")
                )
            )
            html = pq(driver.page_source)
            items = list(
                html("#s_position_list .item_con_list li.con_list_item.default_list").items()
            )
            links = [item("a.position_link").attr("href") for item in items]

            # Same links as the previous page means the click did not move
            # the pager (e.g. we are on the last page): stop instead of
            # printing the same page again forever.
            if links == previous_links:
                break
            previous_links = links

            for item, link in zip(items, links):
                print(item.attr("data-company"))
                print(item.attr("data-positionname"))
                print(item.attr("data-salary"))
                print(link)
                print("\n")

            # NOTE(review): assumes the site marks an inactive "next" control
            # with a class name containing "disabled" -- confirm against the
            # live markup. The duplicate-page check above is the fallback.
            css_class = next_page_label.get_attribute("class") or ""
            if "disabled" in css_class:
                break

            next_page_label.click()
            # Fixed pause to give the next page time to render.
            time.sleep(3)
    except Exception as e:
        print(str(e))

if __name__ == "__main__":
    # Script entry point: log in, search for "python" positions, print every
    # result page, and always shut the browser down afterwards.
    login_url = "https://passport.lagou.com/login/login.html?ts=1508055021059&serviceId=lagou&service=https%253A%252F%252Fwww.lagou.com%252F&action=login&signature=101A9F09764AD83E3E2A035A1506AF7A"
    # The two values below look like placeholders ("user name" / "user
    # password"); presumably they are to be replaced with real credentials.
    username = "用户名"
    password = "用户密码"
    try:
        login(login_url, username, password)
        search_position("python")
        parse_html()
    finally:
        # Without quit() the PhantomJS process keeps running after the
        # script exits.
        driver.quit()


效果

内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签:  python
相关文章推荐