您的位置:首页 > 数据库

爬取天眼查企业工商数据并写入数据库(仅供学习参考)

2018-11-28 15:53 363 查看

最近需要一些企业数据来做分析,故写了爬取天眼查企业工商数据的python脚本,用的是selenium,代码如下:

import pymysql
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
import time
import requests
from bs4 import BeautifulSoup
import re
import random
#判断元素是否存在
def isElementExist(element):
flag = True
try:
browser.find_element_by_xpath(element)
return flag

except:
flag = False
return flag

browser=webdriver.Chrome()

# 登录
browser.get("https://www.tianyancha.com/login?from=https%3A%2F%2Fwww.tianyancha.com%2Fusercenter%2Fwatch")
time.sleep(5)
browser.maximize_window()
count=1
browser.find_element_by_xpath('//*[@id="web-content"]/div/div/div/div[2]/div/div[2]/div[2]/div[2]/div[2]/input').send_keys("账号")
browser.find_element_by_xpath(
'//*[@id="web-content"]/div/div/div/div[2]/div/div[2]/div[2]/div[2]/div[3]/input').send_keys("密码")
time.sleep(1)
browser.find_element_by_xpath('//*[@id="web-content"]/div/div/div/div[2]/div/div[2]/div[2]/div[2]/div[5]').click()
# 打开数据库连接
db = pymysql.connect("localhost", "数据库账号", "数据库密码", "my_database")
cursor = db.cursor()
# 使用execute方法执行SQL语句
cursor.execute("select name from company_base_sh ")
# 使用 fetchone() 方法获取一条数据
data = cursor.fetchall()
for row in data:
try:
print(row[0])
time.sleep(1)
qqq = browser.find_element_by_xpath('//*[@id="header-company-search"]').clear()
time.sleep(0.5)
browser.find_element_by_xpath('//*[@id="header-company-search"]').send_keys(row[0])
time.sleep(0.5)
browser.find_element_by_xpath('/html/body/div[1]/div/div[2]/div/div[2]/div[1]/div').click()
time.sleep(random.random()+1)
browser.find_element_by_xpath('//*[@id="web-content"]/div/div[1]/div/div[3]/div/div/div[2]/div[1]/a').click()

time.sleep(random.random()+1.5)
num = browser.window_handles  # 获取当前页句柄
browser.close()
browser.switch_to.window(num[1])  # 在句柄2 上执行下述步骤
count += 1
print(count)
time.sleep(1)
if (browser.find_element_by_xpath('/html/body/div[1]/div/div[3]/div[1]/a').text == "登录/注册"):
browser.find_element_by_xpath('/html/body/div[1]/div/div[3]/div[1]/a').click()
time.sleep(1)
browser.find_element_by_xpath(
'//*[@id="_modal_container"]/div/div/div[2]/div/div/div[3]/div[2]/div[1]').click()
time.sleep(1)
browser.find_element_by_xpath(
'//*[@id="_modal_container"]/div/div/div[2]/div/div/div[3]/div[1]/div[2]/input').send_keys(
"13512118067")
time.sleep(0.5)
browser.find_element_by_xpath(
'//*[@id="_modal_container"]/div/div/div[2]/div/div/div[3]/div[1]/div[3]/input').send_keys(
"1234qwer")
time.sleep(1)
browser.find_element_by_xpath(
'//*[@id="_modal_container"]/div/div/div[2]/div/div/div[3]/div[1]/div[5]').click()
time.sleep(1)

hangye = browser.find_element_by_xpath(
'// *[ @ id = "_container_baseInfo"] / table[2] / tbody / tr[3] / td[4]').text
name = browser.find_element_by_xpath('//*[@id="company_web_top"]/div[2]/div[2]/div[1]/h1').text
tel = browser.find_element_by_xpath(
'//*[@id="company_web_top"]/div[2]/div[2]/div[5]/div[1]/div[1]/span[2]').text
mail = browser.find_element_by_xpath(
'//*[@id="company_web_top"]/div[2]/div[2]/div[5]/div[1]/div[2]/span[2]').text
flag = isElementExist('//*[@id="company_web_top"]/div[2]/div[2]/div[5]/div[2]/div[1]/a')
if flag:
net = browser.find_element_by_xpath('//*[@id="company_web_top"]/div[2]/div[2]/div[5]/div[2]/div[1]/a').text
else:
net = ""

flag = isElementExist('//*[@id="company_web_top"]/div[2]/div[2]/div[6]/div/a')
if flag:
tagurl = browser.find_element_by_xpath(
'//*[@id="company_web_top"]/div[2]/div[2]/div[6]/div/a').get_attribute('href')
else:
tagurl = "https://www.tianyancha.com/brand/b19ff9090"

address = browser.find_element_by_xpath(
'//*[@id="_container_baseInfo"]/table[2]/tbody/tr[8]/td[2]').text
address=address[:-4]
zhucehao = browser.find_element_by_xpath('//*[@id="_container_baseInfo"]/table[2]/tbody/tr[1]/td[2]').text
zuzhijigou = browser.find_element_by_xpath('//*[@id="_container_baseInfo"]/table[2]/tbody/tr[1]/td[4]').text
tongyixinyong = browser.find_element_by_xpath(
'//*[@id="_container_baseInfo"]/table[2]/tbody/tr[2]/td[2]').text
companylx = browser.find_element_by_xpath('//*[@id="_container_baseInfo"]/table[2]/tbody/tr[2]/td[4]').text
jydate = browser.find_element_by_xpath(
'//*[@id="_container_baseInfo"]/table[2]/tbody/tr[4]/td[2]/span').text
membercount = browser.find_element_by_xpath(
'//*[@id="_container_baseInfo"]/table[2]/tbody/tr[5]/td[4]').text
canbaocount = browser.find_element_by_xpath(
'//*[@id="_container_baseInfo"]/table[2]/tbody/tr[7]/td[2]').text

start_html = requests.get(tagurl).content
soup = BeautifulSoup(start_html, "lxml")
companytag = ""
for tags in soup.find_all("div", class_=re.compile("tags")):
for tag in tags.find_all("a"):
companytag = companytag + tag.text + ","
sql = "UPDATE company_base_sh SET tel = '"+tel+"',hangye='"+hangye+"',mail='"+mail+"',net='"+net+"',address='"+address\
+"',zhucehao='"+zhucehao+"',zuzhijigou='"+zuzhijigou+"',tongyixinyong='"+tongyixinyong+"',companylx='"+companylx\
+"',jydate='"+jydate+"',membercount='"+membercount+"',canbaocount='"+canbaocount+"',tag='"+companytag+"',isok=1 WHERE name ='"+row[0]+"'"
try:
# 执行SQL语句
print(sql)
cursor.execute(sql)
# 提交到数据库执行
db.commit()
except:
# 发生错误时回滚
db.rollback()
except:
continue
# 关闭数据库连接
db.close()

 

内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: 
相关文章推荐