您的位置:首页 > 其它

爬虫实战项目--优信二手车--天眼

2018-07-03 10:50 134 查看
import requests, time, random
from fake_useragent import UserAgent
from lxml.html import etree, HTMLParser

from requests.packages.urllib3.exceptions import InsecureRequestWarning, InsecurePlatformWarning
from multiprocessing import Pool

# Suppress the urllib3 warnings emitted for HTTPS requests made with verify=False.
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
requests.packages.urllib3.disable_warnings(InsecurePlatformWarning)
# Shared generator; `ua.random` supplies the User-Agent header of every request below.
ua = UserAgent()

# Counter incremented by parse_detail_page(); get_detail_page() requests a new
# session cookie whenever it is a multiple of 9.
number = 1
# Seed value for the `session_xin` cookie; replaced with the result of get_session_xin().
new_session_xin = 'k8935l0tr72p6dfngdfnuiukoo4n6jfn'
# Seed value for the `XIN_anti_uid` cookie; get_list_page() overwrites it when
# the server sends a new one.
anti_uid = '8F932282-2E08-FA10-DDDC-841EEF3E0BF3'

def get_proxy():
    """Fetch one proxy address from the local proxy-pool service.

    Returns:
        dict: A ``proxies`` mapping suitable for ``requests``, routing both
        plain-HTTP and HTTPS requests through the fetched proxy.

    Raises:
        requests.RequestException: If the pool service at localhost:5010
            cannot be reached within the timeout.
    """
    # The pool answers with a bare "host:port" body; strip stray whitespace so
    # it cannot end up inside the proxy URL.
    address = requests.get('http://localhost:5010/get/', timeout=10).text.strip()
    proxy_url = 'http://' + address
    # requests selects the proxy by the scheme of the *target* URL. Every page
    # fetched by this script is https://, so an 'http'-only mapping would be
    # silently ignored and the request would go out directly.
    return {'http': proxy_url, 'https': proxy_url}

def get_session_xin():
    """Request a fresh ``session_xin`` cookie value from www.xin.com.

    Calls the ``get_wishlist_token`` endpoint with the current module-level
    ``anti_uid`` in the Cookie header and reads ``session_xin`` from the
    response cookies.

    Returns:
        str: The new session value, or the placeholder ``'没有'`` when the
        response did not set one. The value is also printed.
    """
    cookie = 'XIN_bhv_oc=1233; XIN_anti_uid={}; XIN_LOCATION_CITY=%7B%22cityid%22%3A%221001%22%2C%22areaid%22%3A%224%22%2C%22big_areaid%22%3A%222%22%2C%22provinceid%22%3A%2210%22%2C%22cityname%22%3A%22%5Cu90d1%5Cu5dde%22%2C%22ename%22%3A%22zhengzhou%22%2C%22shortname%22%3A%22ZN%22%2C%22service%22%3A%221%22%2C%22near%22%3A%22201%2C501%2C2101%2C2117%2C1010%2C1002%2C601%2C2401%2C901%2C1201%22%2C%22tianrun_code%22%3A%220371%22%2C%22zhigou%22%3A%221%22%2C%22longitude%22%3A%22113.6253680%22%2C%22latitude%22%3A%2234.7465990%22%2C%22direct_rent_support%22%3A%221%22%2C%22salvaged_support%22%3A%221%22%2C%22isshow_c%22%3A%221%22%7D; uid=rBAKEls5vG1giwDiR4LWAg==; NSC_20.eqppmxfc.yjo.dpn=ffffffffaf18140345525d5f4f58455e445a4a423660; XIN_UID_CK=5e21beea-146c-a405-2a32-2df07fc0eac9'.format(anti_uid)
    request_headers = {
        'User-Agent': ua.random,
        'Host': 'www.xin.com',
        'Referer': 'https://www.xin.com/zhengzhou/baoma/',
        'Cookie': cookie,
    }

    token_response = requests.get(
        'https://www.xin.com/search/get_wishlist_token',
        headers=request_headers,
        proxies=get_proxy(),
        verify=False,
    )
    # The new session id arrives through the response's Set-Cookie header.
    fresh_session = token_response.cookies.get('session_xin', '没有')
    print(fresh_session)
    return fresh_session

def get_list_page(page_num):
    """Download one listing page of the Zhengzhou BMW search.

    Args:
        page_num: Page index inserted into the ``/zhengzhou/baoma/i{}`` URL.

    Returns:
        str: The response body of the listing page.

    Side effects:
        Rebinds the module-level ``anti_uid`` when the response carries a
        ``XIN_anti_uid`` cookie, and prints which of the two cases occurred.
    """
    global anti_uid

    # Whole-second Unix timestamp, sent as the Hm_lpvt_* cookie value.
    timestamp = str(time.time()).split('.')[0]
    page_url = 'https://www.xin.com/zhengzhou/baoma/i{}'.format(page_num)
    cookie = 'RELEASE_KEY=; XIN_bhv_oc=1233; XIN_anti_uid={}; XIN_LOCATION_CITY=%7B%22cityid%22%3A%221001%22%2C%22areaid%22%3A%224%22%2C%22big_areaid%22%3A%222%22%2C%22provinceid%22%3A%2210%22%2C%22cityname%22%3A%22%5Cu90d1%5Cu5dde%22%2C%22ename%22%3A%22zhengzhou%22%2C%22shortname%22%3A%22ZN%22%2C%22service%22%3A%221%22%2C%22near%22%3A%22201%2C501%2C2101%2C2117%2C1010%2C1002%2C601%2C2401%2C901%2C1201%22%2C%22tianrun_code%22%3A%220371%22%2C%22zhigou%22%3A%221%22%2C%22longitude%22%3A%22113.6253680%22%2C%22latitude%22%3A%2234.7465990%22%2C%22direct_rent_support%22%3A%221%22%2C%22salvaged_support%22%3A%221%22%2C%22isshow_c%22%3A%221%22%7D; uid=rBAKEls5vG1giwDiR4LWAg==; NSC_20.eqppmxfc.yjo.dpn=ffffffffaf18140345525d5f4f58455e445a4a423660; XIN_UID_CK=5e21beea-146c-a405-2a32-2df07fc0eac9; Hm_lvt_ae57612a280420ca44598b857c8a9712=1530510447; Hm_lpvt_ae57612a280420ca44598b857c8a9712={}; session_xin={}; SEO_REF=https://www.xin.com/zhengzhou/baoma/'.format(anti_uid, timestamp, new_session_xin)
    request_headers = {
        'User-Agent': ua.random,
        'Host': 'www.xin.com',
        'Referer': 'https://www.xin.com/zhengzhou/baoma/',
        'Cookie': cookie,
    }

    page = requests.get(page_url, headers=request_headers, verify=False, proxies=get_proxy())

    # Adopt the uid handed out by the server, if any, for subsequent requests.
    fresh_uid = page.cookies.get('XIN_anti_uid', '')
    if not fresh_uid:
        print('uid 不存在')
    else:
        print('uid = ',fresh_uid)
        anti_uid = fresh_uid

    return page.text

def parse_list_page(list_page):
    """Extract the detail links from a listing page and crawl them.

    Used as the ``apply_async`` callback of ``get_list_page``. Every element
    matched by the CSS selector ``h2 .tit`` contributes its ``href`` (prefixed
    with ``https:``); each resulting URL is fetched by ``get_detail_page`` in
    a 4-process pool and the result is passed on to ``parse_detail_page``.
    The call blocks until all detail tasks of this page have finished.

    Args:
        list_page: HTML text of one listing page.
    """
    workers = Pool(4)

    tree = etree.HTML(list_page, parser=HTMLParser(encoding='utf-8'))
    for anchor in tree.cssselect('h2 .tit'):
        workers.apply_async(
            get_detail_page,
            args=('https:' + anchor.attrib['href'],),
            callback=parse_detail_page,
        )

    # No further tasks will be submitted; wait for the outstanding ones.
    workers.close()
    workers.join()

def get_detail_page(detail_url):
    """Download one car detail page.

    Before the request, the module-level ``number`` counter is checked: when
    it is a multiple of 9 the counter is advanced and a new ``session_xin``
    value is obtained via ``get_session_xin()``, so the session cookie is
    rotated after a handful of detail pages.

    Args:
        detail_url: Absolute URL of the detail page.

    Returns:
        tuple: ``(html_text, detail_url)``, the shape ``parse_detail_page``
        expects as its single argument.
    """
    # NOTE(review): this function is dispatched through Pool.apply_async, so
    # the rebinding of these globals presumably only affects the worker
    # process that runs it - confirm if cross-process rotation is expected.
    global number, new_session_xin

    # Rotate the session first and then fall through to the normal request.
    # The previous implementation recursed here but dropped the recursive
    # result and returned None, which broke the parse_detail_page callback.
    if number % 9 == 0:
        number += 1
        new_session_xin = get_session_xin()

    # Candidate values for the XIN_bhv_oc cookie, shifted by the counter.
    bhv_oc_candidates = [
        base + number
        for base in (1525, 1319, 1262, 1436, 1561, 1452, 1618, 1624,
                     1632, 1631, 1646, 1742, 1814, 1891, 1847, 2286)
    ]
    # Whole-second Unix timestamp, sent as the Hm_lpvt_* cookie value.
    tm = str(time.time()).split('.')[0]

    # Placeholder order in the template: XIN_bhv_oc, XIN_anti_uid, Hm_lpvt_*,
    # session_xin, XIN_bhv_pc. The numeric candidate belongs in XIN_bhv_oc and
    # the uid in XIN_anti_uid, matching the cookies built for the list page
    # and the session request (the two arguments used to be swapped).
    cookie = 'RELEASE_KEY=; XIN_bhv_oc={}; XIN_anti_uid={}; XIN_LOCATION_CITY=%7B%22cityid%22%3A%221001%22%2C%22areaid%22%3A%224%22%2C%22big_areaid%22%3A%222%22%2C%22provinceid%22%3A%2210%22%2C%22cityname%22%3A%22%5Cu90d1%5Cu5dde%22%2C%22ename%22%3A%22zhengzhou%22%2C%22shortname%22%3A%22ZN%22%2C%22service%22%3A%221%22%2C%22near%22%3A%22201%2C501%2C2101%2C2117%2C1010%2C1002%2C601%2C2401%2C901%2C1201%22%2C%22tianrun_code%22%3A%220371%22%2C%22zhigou%22%3A%221%22%2C%22longitude%22%3A%22113.6253680%22%2C%22latitude%22%3A%2234.7465990%22%2C%22direct_rent_support%22%3A%221%22%2C%22salvaged_support%22%3A%221%22%2C%22isshow_c%22%3A%221%22%7D; uid=rBAKEls5vG1giwDiR4LWAg==; NSC_20.eqppmxfc.yjo.dpn=ffffffffaf18140345525d5f4f58455e445a4a423660; XIN_UID_CK=5e21beea-146c-a405-2a32-2df07fc0eac9; Hm_lvt_ae57612a280420ca44598b857c8a9712=1530510447; Hm_lpvt_ae57612a280420ca44598b857c8a9712={}; session_xin={}; SEO_REF=https://www.xin.com/zhengzhou/baoma/; XIN_CARBROWSE_IDS=%5B67720293%5D; XIN_bhv_pc={}; XIN_bhv_expires=1530597119591'.format(random.choice(bhv_oc_candidates), anti_uid, tm, new_session_xin, number)
    headers = {
        'User-Agent': ua.random,
        'Host': 'www.xin.com',
        'Referer': 'https://www.xin.com/zhengzhou/baoma/',
        'Cookie': cookie,
    }
    response = requests.get(detail_url, headers=headers, verify=False, proxies=get_proxy())

    return response.text, detail_url

def parse_detail_page(detail_tuple):
    """Extract and print the title and price of one detail page.

    Used as the ``apply_async`` callback of ``get_detail_page``. An exception
    escaping from such a callback is raised inside the pool's result-handling
    thread, so missing nodes are reported as empty strings instead of raising.

    Args:
        detail_tuple: ``(html_text, detail_url)`` as returned by
            ``get_detail_page``; a falsy value is reported and skipped.

    Side effects:
        Prints ``detail_url, title, price`` and increments the module-level
        ``number`` counter that drives the session rotation.
    """
    global number

    if not detail_tuple:
        print('skip: empty detail result')
        return

    detail_page, detail_url = detail_tuple[0], detail_tuple[1]
    # An empty body cannot be parsed into a tree; treat it as "nothing found".
    detail_obj = None
    if detail_page:
        detail_obj = etree.HTML(detail_page, parser=HTMLParser(encoding='utf-8'))

    title = ''
    price = ''
    if detail_obj is not None:
        # Query once: the fourth text node is preferred as the title, the
        # first (stripped) text node is the fallback.
        title_parts = detail_obj.xpath('//span[@class="cd_m_h_tit"]//text()')
        if len(title_parts) > 3:
            title = title_parts[3]
        elif title_parts:
            title = title_parts[0].strip()

        price_parts = detail_obj.xpath('//span[@class="cd_m_info_jg"]/b/text()')
        if price_parts:
            price = price_parts[0].strip()

    print(detail_url, title, price)

    # The page was requested either way, so it counts towards the rotation.
    number += 1

if __name__ == '__main__':
    # Fan listing pages 1..50 out over four worker processes. Each downloaded
    # page is handed to parse_list_page through the apply_async callback.
    pool = Pool(4)

    for x in range(1, 51):
        print('开始获取第{}页...'.format(x))
        pool.apply_async(
            get_list_page,
            args=(x,),
            callback=parse_list_page,
        )

    # Stop accepting tasks, then block until every submitted page is done.
    pool.close()
    pool.join()

天眼

import time
from multiprocessing import Pool
from urllib.parse import quote

import requests
from fake_useragent import UserAgent
from lxml.html import etree
from requests.packages.urllib3.exceptions import InsecureRequestWarning, InsecurePlatformWarning

# from fontTools.ttLib import TTFont

# Suppress the urllib3 warnings emitted for HTTPS requests made with verify=False.
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
requests.packages.urllib3.disable_warnings(InsecurePlatformWarning)
ua = UserAgent()

# First digit substitution for text rendered with the site's `tyc-num` font.
# Examples: 8935 -> 6048, 8936 -> 6043
number_dict1 = {
    '0': '9',
    '1': '2',
    '2': '1',
    '3': '4',
    '4': '7',
    '5': '8',
    '6': '3',
    '7': '5',
    '8': '6',
    '9': '0',
    '-': '-',
}

# Second digit substitution, tried when the first one does not yield a date
# starting with '2'.
# Examples: 8936-94-90 -> 2017-08-09, 8936-95-84 -> 2017-06-28
number_dict2 = {
    '0': '9',
    '1': '4',
    '2': '5',
    '3': '1',
    '4': '8',
    '5': '6',
    '6': '7',
    '7': '3',
    '8': '2',
    '9': '0',
    '-': '-',
}

# Search keyword; URL-quoted into the list-page query string.
KEY_WORD = '智游'
# response = requests.get('https://static.tianyancha.com/fonts-styles/fonts/49/49631975/tyc-num.woff').text

# Translation tables built once from the substitution dictionaries.
_TABLE_1 = str.maketrans(number_dict1)
_TABLE_2 = str.maketrans(number_dict2)

# Cookie header shared by the list and detail requests; the single `{}` is
# filled with the current Unix timestamp (Hm_lpvt_* value).
# NOTE(review): this embeds a captured login token and a phone number -
# consider loading it from configuration instead of source.
_COOKIE_TEMPLATE = 'TYCID=2b902090793a11e8bbf42fcb3431841d; undefined=2b902090793a11e8bbf42fcb3431841d; Hm_lvt_e92c8d65d92d534b0fc290df538b4758=1530015137,1530061830,1530104465,1530519246; ssuid=4009891320; aliyungf_tc=AQAAAP+boVulnQoAg6cPqxTilju98D0f; csrfToken=yg6QXuv2Dch1Abfr-giP-AH4; Hm_lpvt_e92c8d65d92d534b0fc290df538b4758={}; RTYCID=24412db0b3da41c5be4439ba8f942ce8; bannerFlag=true; token=1675836c554a48fe9bcc18cfc45cb4d0; _utm=788b0bb711164fda9a5e6b1964bb5bf9; tyc-user-info=%257B%2522token%2522%253A%2522eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxMzAzNzY3NzMxOCIsImlhdCI6MTUzMDUzMTAyMSwiZXhwIjoxNTQ2MDgzMDIxfQ.VhZp2799GMlRKWPnleSODWuG2-fC7Prn9LdC0CYIxotinpsOwXgvJxpAfuxJGCmLUEK-90jJvOUOirPeeonrGA%2522%252C%2522integrity%2522%253A%25220%2525%2522%252C%2522state%2522%253A%25220%2522%252C%2522redPoint%2522%253A%25220%2522%252C%2522vipManager%2522%253A%25220%2522%252C%2522vnum%2522%253A%25220%2522%252C%2522onum%2522%253A%25220%2522%252C%2522mobile%2522%253A%252213037677318%2522%257D; auth_token=eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxMzAzNzY3NzMxOCIsImlhdCI6MTUzMDUzMTAyMSwiZXhwIjoxNTQ2MDgzMDIxfQ.VhZp2799GMlRKWPnleSODWuG2-fC7Prn9LdC0CYIxotinpsOwXgvJxpAfuxJGCmLUEK-90jJvOUOirPeeonrGA'


def _request_headers():
    """Build the request headers with a random User-Agent and a fresh timestamp."""
    tm = str(time.time()).split('.')[0]
    return {
        'User-Agent': ua.random,
        'Host': 'www.tianyancha.com',
        'Cookie': _COOKIE_TEMPLATE.format(tm),
    }


def _decode_date(obfuscated):
    """Translate font-obfuscated digits into the real date string.

    The first substitution table is applied; when its result does not start
    with '2' the second table is used instead. Characters missing from a
    table are passed through unchanged. Empty or ``None`` input yields ''.
    """
    if not obfuscated:
        return ''
    decoded = obfuscated.translate(_TABLE_1)
    if not decoded.startswith('2'):
        decoded = obfuscated.translate(_TABLE_2)
    return decoded


def get_proxy():
    """Fetch one proxy address from the local proxy-pool service.

    Returns:
        dict: A ``proxies`` mapping for ``requests`` covering both schemes;
        the crawled URLs are https://, which an 'http'-only mapping would
        leave unproxied.
    """
    address = requests.get('http://localhost:5010/get/', timeout=10).text.strip()
    proxy_url = 'http://' + address
    return {'http': proxy_url, 'https': proxy_url}


def get_list_page(page_num):
    """Download one search-result page for ``KEY_WORD``.

    Args:
        page_num: Page index inserted into the ``/search/p{}`` URL.

    Returns:
        str: The response body of the result page.
    """
    list_url = 'https://www.tianyancha.com/search/p{}?key={}'.format(page_num, quote(KEY_WORD))
    response = requests.get(list_url, headers=_request_headers(), verify=False, proxies=get_proxy())
    return response.text


def parse_list_page(list_page):
    """Read the company rows of a result page and crawl their detail pages.

    Used as the ``apply_async`` callback of ``get_list_page``. Rows lacking
    the legal person, the registered capital or the registration date are
    skipped. The call blocks until all detail tasks have finished.

    Args:
        list_page: HTML text of one search-result page.
    """
    list_html = etree.HTML(list_page, parser=etree.HTMLParser(encoding='utf-8'))
    divs = list_html.cssselect('.search_row_new')
    all_a = list_html.cssselect('.query_name')

    detail_pool = Pool(4)
    # Rows and links are paired by position; zip stops at the shorter list
    # instead of raising when the two selectors match different counts.
    for div, link in zip(divs, all_a):
        detail_url = link.attrib['href']
        try:
            person = div.cssselect('.legalPersonName')[0].text
            zhuceziben = div.xpath('.//span[contains(@title, "人民币")]/text()')[0]
            zhuceshijian = div.xpath('.//span[contains(@title, "-")]/text()')[0]
        except IndexError:
            # Incomplete row: nothing useful to report for it.
            continue
        detail_pool.apply_async(
            get_detail_page,
            args=(detail_url, person, zhuceziben, zhuceshijian),
            callback=parse_detail_page,
        )

    detail_pool.close()
    detail_pool.join()


def get_detail_page(detail_url, person, zhuceziben, zhuceshijian):
    """Download a company detail page.

    Args:
        detail_url: URL taken from the result row's link.
        person: Legal person text from the result row.
        zhuceziben: Registered-capital text from the result row.
        zhuceshijian: Registration-date text from the result row.

    Returns:
        tuple: ``(html_text, detail_url, person, zhuceziben, zhuceshijian)``;
        the row values are passed through so the callback can print them.
    """
    # verify=False mirrors the list-page request.
    response = requests.get(detail_url, headers=_request_headers(), verify=False, proxies=get_proxy())
    return response.text, detail_url, person, zhuceziben, zhuceshijian


def parse_detail_page(detail_tuple):
    """Decode the obfuscated date on a detail page and print the record.

    Used as the ``apply_async`` callback of ``get_detail_page``. When the
    ``.base0910 .tyc-num`` node is missing, the decoded date is printed as an
    empty string.

    Args:
        detail_tuple: The 5-tuple returned by ``get_detail_page``.
    """
    detail_html, detail_url, person, zhuceziben, zhuceshijian = detail_tuple
    detail_obj = etree.HTML(detail_html, parser=etree.HTMLParser(encoding='utf-8'))

    nodes = detail_obj.cssselect('.base0910 .tyc-num') if detail_obj is not None else []
    res_str = _decode_date(nodes[0].text) if nodes else ''

    print(detail_url, person, zhuceziben, zhuceshijian, res_str)


if __name__ == '__main__':
    # Crawl result pages 1..5 with a single worker process.
    pool = Pool(1)
    for x in range(1, 6):
        pool.apply_async(get_list_page, args=(x,), callback=parse_list_page)

    pool.close()
    pool.join()
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: