python3.x爬虫实战:阿里巴巴网站定向信息抓取
2017-07-13 11:12
846 查看
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Targeted crawler: scrape supplier contact info from an Alibaba-style site.

Drives Chrome via Selenium, searches for KEY_WORD, walks every result
page, opens each supplier's "联系方式" (contact info) tab, and stores the
company name / contact person / mobile number / address in MongoDB.
"""
import re
import time

import pymongo
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

browser = webdriver.Chrome()
wait = WebDriverWait(browser, 10)  # explicit waits time out after 10 s

KEY_WORD = "建筑"
URL = "https://www.XXXXXXXXX.com"

# MongoDB connection settings.
MONGO_URL = 'localhost'
MONGO_DB = 'albbs'
MONGO_TABLE = 'supplier'
client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]

# Compiled once and reused for every page / detail-page parse.
_DIGITS = re.compile(r'(\d+)')


def search():
    """Open the site, pick the supplier search type, and submit KEY_WORD.

    Returns the pagination summary text (contains the total page count)
    after harvesting the first result page. Retries itself on a Selenium
    timeout (page may simply be slow to load).
    """
    try:
        browser.get(URL)
        # Open the search-type dropdown and pick the second entry
        # (supplier search instead of product search).
        dropdown = browser.find_element(
            By.CSS_SELECTOR,
            "#masthead > div.ali-search.fd-right > div.searchtypeContainer > ul")
        dropdown.click()
        browser.find_element(
            By.XPATH, "//*[@id='masthead']/div[2]/div[1]/ul/li[2]").click()
        keyword_box = wait.until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, "#alisearch-keywords")))
        submit = wait.until(EC.element_to_be_clickable(
            (By.CSS_SELECTOR, "#alisearch-submit")))
        keyword_box.send_keys(KEY_WORD)
        submit.click()
        total = wait.until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, "#sw_mod_pagination_form > div > span")))
        get_url()          # harvest the first result page
        time.sleep(2)
        return total.text
    except TimeoutException:
        return search()


def next_page(page_number):
    """Jump to result page *page_number* and harvest its supplier links.

    Retries itself on a Selenium timeout.
    """
    try:
        page_box = wait.until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, "#jumpto")))
        submit = wait.until(EC.element_to_be_clickable(
            (By.CSS_SELECTOR, "#jump-sub")))
        page_box.clear()
        page_box.send_keys(page_number)
        submit.click()
        time.sleep(5)
        # Confirm the pager now displays the requested page number.
        wait.until(EC.text_to_be_present_in_element(
            (By.CSS_SELECTOR,
             "#sw_mod_pagination_content > div > span.page-cur"),
            str(page_number)))
        get_url()
        time.sleep(8)      # crude rate limiting between result pages
    except TimeoutException:
        next_page(page_number)


def get_url():
    """Parse the current result page and visit every supplier detail link."""
    wait.until(EC.presence_of_element_located(
        (By.CSS_SELECTOR, "#sw_mod_searchlist")))
    soup = BeautifulSoup(browser.page_source, 'html.parser')
    for anchor in soup.find_all("a", class_="list-item-title-text"):
        try:
            get_information_url(anchor.attrs['href'])
        except Exception as exc:
            # Best effort: a single broken entry must not abort the page,
            # but do not swallow it silently either.
            print('跳过异常条目:', exc)
            continue


def save_to_mongo(results):
    """Insert one contact-info document into MongoDB."""
    try:
        # Collection.insert was deprecated/removed in PyMongo;
        # insert_one is the supported call.
        if db[MONGO_TABLE].insert_one(results):
            print('存储成功')
    except Exception:
        print('存储异常')


def get_information_url(url):
    """Open a supplier detail page and store its contact information.

    Retries itself on a Selenium timeout.
    """
    try:
        browser.get(url)
        contactinfo = browser.find_element(By.LINK_TEXT, "联系方式")
        contactinfo.click()
        time.sleep(1)
        wait.until(EC.presence_of_element_located(
            (By.CSS_SELECTOR,
             "#site_content > div.grid-main > div > div > div > div.m-content")))
        soup = BeautifulSoup(browser.page_source, 'html.parser')
        company_name = soup.find("h4").text
        contactinfo_name = soup.find("a", class_="membername").text
        mobile_text = soup.find("dl", class_="m-mobilephone").text
        # NOTE(review): storing the phone number as int drops any leading
        # zeros; CN mobile numbers start with 1 so this works in practice.
        mobile_phone = int(_DIGITS.search(mobile_text).group(1))
        addr = soup.find("dd", class_="address").text
        print(company_name)
        print(contactinfo_name)
        print(mobile_phone)
        print(addr)
        contactinfos = {
            "公司名": company_name,
            "联系人": contactinfo_name,
            "手机号码": mobile_phone,
            "地址": addr,
        }
        save_to_mongo(contactinfos)
        time.sleep(8)      # throttle between detail pages
    except TimeoutException:
        get_information_url(url)


def main():
    """Run the search, then walk every remaining result page."""
    total = search()
    # Extract the page count from text such as "共100页".
    total = int(_DIGITS.search(total).group(1))
    print(total)
    for page in range(2, total + 1):
        next_page(page)


if __name__ == '__main__':
    main()
# 自己的学习笔记。(personal study notes)
相关文章推荐
- Python爬虫实战---抓取图书馆借阅信息
- <四>、python爬虫抓取购物网站商品信息--图片价格名称
- Python爬虫框架Scrapy实战之定向批量获取职位招聘信息
- Python爬虫框架Scrapy实战之批量抓取招聘信息
- [Python爬虫] 之二十六:Selenium +phantomjs 利用 pyquery抓取智能电视网站图片信息
- Python爬虫框架Scrapy实战之定向批量获取职位招聘信息
- java爬虫实战(1):抓取信息门户网站中的图片及其他文件并保存至本地
- python网络爬虫实战3——抓取新闻内文相关信息
- Python爬虫实战---抓取图书馆借阅信息
- Python爬虫框架Scrapy实战 - 抓取BOSS直聘招聘信息
- Python爬虫框架Scrapy实战 - 抓取BOSS直聘招聘信息
- Python爬虫实战一之使用Beautiful Soup抓取百度招聘信息并存储excel文件
- Python爬虫框架Scrapy实战之批量抓取招聘信息
- Python爬虫框架Scrapy实战之定向批量获取职位招聘信息
- Python爬虫实战三 | 蓝奏网盘抓取网盘链接信息
- Python爬虫框架Scrapy实战 - 抓取BOSS直聘招聘信息
- Python爬虫框架Scrapy实战教程---定向批量获取职位招聘信息
- Python爬虫框架Scrapy实战之定向批量获取职位招聘信息
- 【极客学院】-python学习笔记-4-单线程爬虫 (提交表单抓取信息,实战练习)
- python 爬虫实战 抓取学校bbs相关板块的发帖信息