python爬虫——用selenium爬取京东商品信息
2019-05-10 09:32
363 查看
python爬虫——用selenium爬取京东商品信息
1.先附上效果图(我偷懒只爬了4页)
2.京东的网址https://www.jd.com/
3.我这里设置为不加载图片,以加快爬取速度;也可以改用Headless(无界面)模式
# Crawl configuration and shared module-level state.
url = 'https://www.jd.com/'
keyword = "python爬虫"  # search keyword typed into the JD search box
data_list = []  # module-level accumulator for the scraped product records

# Chrome is started with image loading disabled (preference value 2) to
# speed up page loads.
options = webdriver.ChromeOptions()
options.add_experimental_option(
    'prefs',
    {'profile.managed_default_content_settings.images': 2},
)
browser = webdriver.Chrome(options=options)

# Shared explicit-wait helper bound to the browser, with a timeout of 50.
wait = WebDriverWait(browser, 50)
4.先找到搜索框并用selenium模拟点击(这里发现京东不需要登录就能看到商品信息)
def search(max_retries=3):
    """Open JD, submit the keyword search and scrape the first result page.

    Args:
        max_retries: Maximum number of attempts before giving up. Values
            below 1 are treated as 1.

    Returns:
        The text of the "total pages" element of the result pager.

    Raises:
        selenium.common.exceptions.TimeoutException: If every attempt
            timed out while waiting for the page elements.
    """
    # Selenium's explicit waits raise TimeoutException, not the builtin
    # TimeoutError, so the builtin would never be caught here.
    from selenium.common.exceptions import TimeoutException

    last_error = None
    for _ in range(max(1, max_retries)):
        browser.get('https://www.jd.com/')
        try:
            # Wait for the search box to be present.
            search_boxes = wait.until(
                EC.presence_of_all_elements_located((By.CSS_SELECTOR, "#key"))
            )
            # Wait for the search button to become clickable.
            submit = wait.until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR, "#search > div > div.form > button")
                )
            )
            search_boxes[0].send_keys(keyword)
            submit.click()
            # The pager's total-page element marks the result page as loaded.
            total = wait.until(
                EC.presence_of_all_elements_located(
                    (By.CSS_SELECTOR,
                     '#J_bottomPage > span.p-skip > em:nth-child(1) > b')
                )
            )
            total_pages = total[0].text
        except TimeoutException as exc:
            last_error = exc
            continue
        # Parse outside the try block so a retry can never append the same
        # page's records twice.
        prase_html(browser.page_source)
        return total_pages
    raise last_error
5.进入了第一页,先写好翻页的函数,需要滑动到底部才能加载后30个商品,总共有60个商品
def next_page(page_number, max_retries=3):
    """Advance to result page ``page_number`` and scrape it.

    Scrolls to the bottom, clicks the "next" button, scrolls again so the
    second batch of products is requested, waits for the 60th product and
    for the highlighted pager entry to show ``page_number``, then hands the
    page source to ``prase_html``.

    Args:
        page_number: The page number expected to be highlighted after the
            page turn.
        max_retries: Maximum number of attempts before giving up. Values
            below 1 are treated as 1.

    Raises:
        selenium.common.exceptions.TimeoutException: If every attempt
            timed out.
    """
    # Selenium's explicit waits raise TimeoutException, not the builtin
    # TimeoutError, so the builtin would never be caught here.
    from selenium.common.exceptions import TimeoutException

    scroll_js = "window.scrollTo(0, document.body.scrollHeight);"
    current_css = "#J_bottomPage > span.p-num > a.curr"
    target = str(page_number)

    last_error = None
    for _ in range(max(1, max_retries)):
        try:
            browser.execute_script(scroll_js)
            time.sleep(random.randint(1, 3))  # random delay between actions

            # On a retry the earlier click may already have turned the page;
            # clicking "next" again would silently skip a page.
            highlighted = browser.find_elements(By.CSS_SELECTOR, current_css)
            already_there = (
                bool(highlighted) and highlighted[0].text.strip() == target
            )
            if not already_there:
                button = wait.until(
                    EC.element_to_be_clickable(
                        (By.CSS_SELECTOR,
                         '#J_bottomPage > span.p-num > a.pn-next > em')
                    )
                )
                button.click()

            # Wait for the first 30 products.
            wait.until(
                EC.presence_of_all_elements_located(
                    (By.CSS_SELECTOR, "#J_goodsList > ul > li:nth-child(30)")
                )
            )
            # Scroll again so the remaining 30 products are loaded.
            browser.execute_script(scroll_js)
            wait.until(
                EC.presence_of_all_elements_located(
                    (By.CSS_SELECTOR, "#J_goodsList > ul > li:nth-child(60)")
                )
            )
            # Confirm the page turn: the highlighted pager entry must show
            # the requested page number.
            wait.until(
                EC.text_to_be_present_in_element(
                    (By.CSS_SELECTOR, current_css), target
                )
            )
        except TimeoutException as exc:
            last_error = exc
            continue
        # Parse outside the try block so a retry can never append the same
        # page's records twice.
        prase_html(browser.page_source)
        return
    raise last_error
6.能正常翻页之后就简单很多了,接下来开始抽取需要的商品信息。注意:搜索不同的关键词时,页面布局可能会有变化,这时需要重新编写定位商品信息的表达式
def prase_html(html):
    """Collect title, price, shop name and comment count of listed products.

    The products are read from the live ``browser`` DOM (every element with
    class ``gl-item``). Each record is printed and appended to the
    module-level ``data_list``. A field whose element is missing is stored
    as ``None`` instead of aborting the whole page.

    Args:
        html: Page source passed by the callers. It is accepted only to keep
            the existing call signature; the data is taken from ``browser``.
    """
    # find_element(s)(By.<strategy>, value) replaces the find_elements_by_*
    # helpers, which newer Selenium releases no longer provide.
    from selenium.webdriver.common.by import By

    def first_text(node, xpath):
        """Return the text of the first XPath match under node, or None."""
        matches = node.find_elements(By.XPATH, xpath)
        return matches[0].text if matches else None

    for item in browser.find_elements(By.CLASS_NAME, 'gl-item'):
        # Key order is kept (title, price, shop_name, comment) because the
        # CSV header is derived from the first record's keys.
        data_dict = {
            "title": first_text(
                item, './/div[@class="p-name p-name-type-2"]//em'),
            "price": first_text(item, './/div[@class="p-price"]//i'),
            "shop_name": first_text(item, './/div[@class="p-shop"]//a'),
            "comment": first_text(item, './/div[@class="p-commit"]//a'),
        }
        print(data_dict)
        data_list.append(data_dict)
7.存储方法
def save_html():
    """Write the scraped records in ``data_list`` to JSON and CSV files.

    Both files are overwritten on each call. Nothing is written when
    ``data_list`` is empty.
    """
    if not data_list:
        # data_list[0] below would raise IndexError on an empty crawl.
        print("没有可保存的数据")
        return

    content = json.dumps(data_list, ensure_ascii=False, indent=2)
    # "w" instead of "a+": appending a second JSON array to an existing file
    # would leave a file that is no longer valid JSON.
    with open("jingdong.json", "w", encoding="utf-8") as f:
        f.write(content)
    print("json文件写入成功")

    with open('jingdong.csv', 'w', encoding='utf-8', newline='') as f:
        # The header comes from the first record's keys.
        writer = csv.DictWriter(f, fieldnames=list(data_list[0]))
        writer.writeheader()
        writer.writerows(data_list)
    print('csv文件写入完成')
8.开始调用
def main(max_pages=4):
    """Run the crawl: search, walk the result pages, then save the data.

    Args:
        max_pages: Upper bound on the number of result pages to scrape.
            Pass ``None`` to scrape every page reported by the site.
    """
    try:
        print("第", 1, "页:")
        total = int(search())
        # Never request more pages than the search actually returned.
        last_page = total if max_pages is None else min(total, max_pages)
        for i in range(2, last_page + 1):
            time.sleep(random.randint(1, 3))  # random delay between pages
            print("第", i, "页:")
            next_page(i)
    finally:
        try:
            # Keep whatever was scraped, even if a later page failed.
            if data_list:
                save_html()
        finally:
            # Always close the browser and its driver process.
            browser.quit()


if __name__ == "__main__":
    main()
遇到的坑
这个问题我弄了很久也没有解决:在 search() 里一加上“滑动到底部”的代码就会报错,我也不知道逻辑错在哪里,所以第一页只能爬到30个商品,后面的页都能爬到60个。希望运行过这段代码、知道原因的小伙伴能告诉我一下。
def search(max_retries=3):
    """Open JD, submit the keyword search and scrape the first result page.

    After the result page is loaded, the page is scrolled to the bottom so
    the second batch of products is requested as well. If that batch does
    not show up in time, the products already present are scraped.

    Args:
        max_retries: Maximum number of attempts before giving up. Values
            below 1 are treated as 1.

    Returns:
        The text of the "total pages" element of the result pager.

    Raises:
        selenium.common.exceptions.TimeoutException: If every attempt
            timed out while waiting for the page elements.
    """
    # Selenium's explicit waits raise TimeoutException, not the builtin
    # TimeoutError, so the builtin would never be caught here.
    from selenium.common.exceptions import TimeoutException

    last_error = None
    for _ in range(max(1, max_retries)):
        browser.get('https://www.jd.com/')
        try:
            search_boxes = wait.until(
                EC.presence_of_all_elements_located((By.CSS_SELECTOR, "#key"))
            )
            submit = wait.until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR, "#search > div > div.form > button")
                )
            )
            search_boxes[0].send_keys(keyword)
            submit.click()
            total = wait.until(
                EC.presence_of_all_elements_located(
                    (By.CSS_SELECTOR,
                     '#J_bottomPage > span.p-skip > em:nth-child(1) > b')
                )
            )
            # Read the text before scrolling.
            # NOTE(review): presumably the pager is re-rendered once the
            # lazy-loaded items arrive, which would make this element
            # reference stale afterwards - unconfirmed, verify on a live page.
            total_pages = total[0].text
        except TimeoutException as exc:
            last_error = exc
            continue

        # Scroll to the bottom so the remaining 30 products are loaded.
        browser.execute_script(
            "window.scrollTo(0, document.body.scrollHeight);")
        try:
            wait.until(
                EC.presence_of_all_elements_located(
                    (By.CSS_SELECTOR, "#J_goodsList > ul > li:nth-child(60)")
                )
            )
        except TimeoutException:
            # Best effort: fall back to the products that did load.
            pass

        prase_html(browser.page_source)
        return total_pages
    raise last_error
9.附上完整代码
"""Scrape product listings from JD.com search results with Selenium.

The script searches for ``keyword``, walks the result pages, collects the
title, price, shop name and comment count of each product, and writes the
records to ``jingdong1.json`` and ``jingdong1.csv``.
"""
import csv
import json
import random
import time

from lxml import etree  # noqa: F401  (import kept from the original script)
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

options = webdriver.ChromeOptions()
# Do not load images, to speed up page loads.
options.add_experimental_option(
    'prefs', {'profile.managed_default_content_settings.images': 2})
browser = webdriver.Chrome(options=options)
wait = WebDriverWait(browser, 50)  # shared explicit-wait helper
url = 'https://www.jd.com/'
data_list = []  # module-level accumulator for the scraped records
keyword = "python爬虫"  # search keyword

_SCROLL_TO_BOTTOM = "window.scrollTo(0, document.body.scrollHeight);"
_CURRENT_PAGE_CSS = "#J_bottomPage > span.p-num > a.curr"
_ITEM_30_CSS = "#J_goodsList > ul > li:nth-child(30)"
_ITEM_60_CSS = "#J_goodsList > ul > li:nth-child(60)"


def search(max_retries=3):
    """Open JD, submit the keyword search and scrape the first result page.

    After the result page is loaded, the page is scrolled to the bottom so
    the second batch of products is requested as well. If that batch does
    not show up in time, the products already present are scraped.

    Args:
        max_retries: Maximum number of attempts before giving up. Values
            below 1 are treated as 1.

    Returns:
        The text of the "total pages" element of the result pager.

    Raises:
        TimeoutException: If every attempt timed out.
    """
    last_error = None
    for _ in range(max(1, max_retries)):
        browser.get(url)
        try:
            # Wait for the search box to be present.
            search_boxes = wait.until(
                EC.presence_of_all_elements_located((By.CSS_SELECTOR, "#key"))
            )
            # Wait for the search button to become clickable.
            submit = wait.until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR, "#search > div > div.form > button")
                )
            )
            search_boxes[0].send_keys(keyword)
            submit.click()
            # The pager's total-page element marks the result page as loaded.
            total = wait.until(
                EC.presence_of_all_elements_located(
                    (By.CSS_SELECTOR,
                     '#J_bottomPage > span.p-skip > em:nth-child(1) > b')
                )
            )
            # Read the text before scrolling.
            # NOTE(review): presumably the pager is re-rendered once the
            # lazy-loaded items arrive, which would make this element
            # reference stale afterwards - unconfirmed, verify on a live page.
            total_pages = total[0].text
        except TimeoutException as exc:
            # Explicit waits raise TimeoutException, not builtin TimeoutError.
            last_error = exc
            continue

        # Scroll to the bottom so the remaining 30 products are loaded.
        browser.execute_script(_SCROLL_TO_BOTTOM)
        try:
            wait.until(
                EC.presence_of_all_elements_located(
                    (By.CSS_SELECTOR, _ITEM_60_CSS))
            )
        except TimeoutException:
            # Best effort: fall back to the products that did load.
            pass

        prase_html(browser.page_source)
        return total_pages
    raise last_error


def next_page(page_number, max_retries=3):
    """Advance to result page ``page_number`` and scrape it.

    Args:
        page_number: The page number expected to be highlighted after the
            page turn.
        max_retries: Maximum number of attempts before giving up. Values
            below 1 are treated as 1.

    Raises:
        TimeoutException: If every attempt timed out.
    """
    target = str(page_number)
    last_error = None
    for _ in range(max(1, max_retries)):
        try:
            browser.execute_script(_SCROLL_TO_BOTTOM)
            time.sleep(random.randint(1, 3))  # random delay between actions

            # On a retry the earlier click may already have turned the page;
            # clicking "next" again would silently skip a page.
            highlighted = browser.find_elements(
                By.CSS_SELECTOR, _CURRENT_PAGE_CSS)
            already_there = (
                bool(highlighted) and highlighted[0].text.strip() == target
            )
            if not already_there:
                button = wait.until(
                    EC.element_to_be_clickable(
                        (By.CSS_SELECTOR,
                         '#J_bottomPage > span.p-num > a.pn-next > em')
                    )
                )
                button.click()

            # Wait for the first 30 products.
            wait.until(
                EC.presence_of_all_elements_located(
                    (By.CSS_SELECTOR, _ITEM_30_CSS))
            )
            # Scroll again so the remaining 30 products are loaded.
            browser.execute_script(_SCROLL_TO_BOTTOM)
            wait.until(
                EC.presence_of_all_elements_located(
                    (By.CSS_SELECTOR, _ITEM_60_CSS))
            )
            # Confirm the page turn: the highlighted pager entry must show
            # the requested page number.
            wait.until(
                EC.text_to_be_present_in_element(
                    (By.CSS_SELECTOR, _CURRENT_PAGE_CSS), target)
            )
        except TimeoutException as exc:
            last_error = exc
            continue
        # Parse outside the try block so a retry can never append the same
        # page's records twice.
        prase_html(browser.page_source)
        return
    raise last_error


def _first_text(node, xpath):
    """Return the text of the first XPath match under node, or None."""
    matches = node.find_elements(By.XPATH, xpath)
    return matches[0].text if matches else None


def prase_html(html):
    """Collect title, price, shop name and comment count of listed products.

    The products are read from the live ``browser`` DOM (every element with
    class ``gl-item``). Each record is printed and appended to
    ``data_list``. A field whose element is missing is stored as ``None``.

    Args:
        html: Page source passed by the callers. It is accepted only to keep
            the existing call signature; the data is taken from ``browser``.
    """
    # find_elements(By.<strategy>, value) replaces the find_elements_by_*
    # helpers, which newer Selenium releases no longer provide.
    for item in browser.find_elements(By.CLASS_NAME, 'gl-item'):
        # Key order is kept (title, price, shop_name, comment) because the
        # CSV header is derived from the first record's keys.
        data_dict = {
            "title": _first_text(
                item, './/div[@class="p-name p-name-type-2"]//em'),
            "price": _first_text(item, './/div[@class="p-price"]//i'),
            "shop_name": _first_text(item, './/div[@class="p-shop"]//a'),
            "comment": _first_text(item, './/div[@class="p-commit"]//a'),
        }
        print(data_dict)
        data_list.append(data_dict)


def save_html():
    """Write the scraped records in ``data_list`` to JSON and CSV files.

    Both files are overwritten on each call. Nothing is written when
    ``data_list`` is empty.
    """
    if not data_list:
        # data_list[0] below would raise IndexError on an empty crawl.
        print("没有可保存的数据")
        return

    content = json.dumps(data_list, ensure_ascii=False, indent=2)
    # "w" instead of "a+": appending a second JSON array to an existing file
    # would leave a file that is no longer valid JSON.
    with open("jingdong1.json", "w", encoding="utf-8") as f:
        f.write(content)
    print("json文件写入成功")

    with open('jingdong1.csv', 'w', encoding='utf-8', newline='') as f:
        # The header comes from the first record's keys.
        writer = csv.DictWriter(f, fieldnames=list(data_list[0]))
        writer.writeheader()
        writer.writerows(data_list)
    print('csv文件写入完成')


def main(max_pages=4):
    """Run the crawl: search, walk the result pages, then save the data.

    Args:
        max_pages: Upper bound on the number of result pages to scrape.
            Pass ``None`` to scrape every page reported by the site.
    """
    try:
        print("第", 1, "页:")
        total = int(search())
        # Never request more pages than the search actually returned.
        last_page = total if max_pages is None else min(total, max_pages)
        for i in range(2, last_page + 1):
            time.sleep(random.randint(1, 3))  # random delay between pages
            print("第", i, "页:")
            next_page(i)
    finally:
        try:
            # Keep whatever was scraped, even if a later page failed.
            save_html()
        finally:
            # Always close the browser and its driver process.
            browser.quit()


if __name__ == "__main__":
    main()
相关文章推荐
- python爬虫(三)selenium爬取京东商品信息
- 利用Python爬虫爬取京东商品的简要信息
- python爬虫实战(一)----------爬取京东商品信息
- Python爬虫二:抓取京东商品列表页面信息
- python爬虫之selenium+chrome 爬去淘宝商品信息
- Python爬虫 --爬取京东商品信息
- python爬虫(7)——获取京东商品评论信息
- 利用Python爬虫爬取京东商品的简要信息
- 【Python】抓取京东列表页商品信息(selenium)
- python爬虫——用selenium爬取淘宝商品信息
- 基于selenium和requests的京东商品信息和评论爬虫
- Python+Selenium+Chrome抓取京东商品信息
- python爬虫selenium+firefox抓取京东商品评论
- 京东商品信息及其价格爬虫
- python利用urllib实现爬取京东网站商品图片的爬虫实例
- 爬虫实践---Selenium-抓取淘宝搜索商品信息
- 使用python+selenium爬取京东商品列表
- python爬虫(6)——获取天猫商品评论信息
- [Python爬虫] 之二十六:Selenium +phantomjs 利用 pyquery抓取智能电视网站图片信息
- Python+Scrapy+Selenium简单爬取淘宝天猫商品信息及评论