您的位置：首页 > 编程语言 > Python开发

项目进展:淘宝店铺抓取

2017-11-14 13:19 309 查看

1.概要：

本项目旨在发现淘宝中可能存在的侵犯明星肖像权的行为，目标是抓取各店铺首页的图片。

淘宝店铺首页：https://shopsearch.taobao.com/search?app=shopsearch&q=&imgfile=&commend=all&ssid=s5-e&search_type=shop&sourceId=tb.index&spm=a21bo.2017.201856-taobao-item.2&ie=utf8&initiative_id=tbindexz_20170306

为了缩减数据量，只处理大类别

example（女装）：https://shopsearch.taobao.com/search?app=shopsearch&spm=a230r.7195193.0.0.S9RdIQ&q=%E5%A5%B3%E8%A3%85&tracelog=shopsearchnoqcat&sort=sale-desc

按照销量排行，取了前120个店铺

通过 selenium + PhantomJS 获取 page_source，

通过re模块获取了图片的链接地址

2.代码：

from selenium import webdriver
from bs4 import BeautifulSoup as bs
import re
import urllib
from tkinter import *
import threading

# Shared headless browser (PhantomJS). get_shop_url() and get_img_url() load
# their pages through this single instance.
driver = webdriver.PhantomJS()

# Shop URLs collected so far; main() passes this list to get_shop_url(),
# which appends to it in place.
store_list = []

# Number of shops processed by get_img_url(); also used as the prefix of the
# per-image names it prints ("<total_count>_<image index>").
total_count = 0

# Held by get_img_url() for the whole time it uses the shared driver and
# updates total_count.
mutex = threading.Lock()

def get_item_href():
    """Collect the hot-category links from the Taobao shop-search landing page.

    Opens the landing page in a dedicated PhantomJS instance, reads the
    ``href`` of the first 12 entries of the ``shopsearchindex-hotcat`` list and
    appends ``&sort=sale-desc`` to each one. Every resulting URL is printed.

    Returns:
        list: the 12 category URLs, in page order.

    Raises:
        selenium.common.exceptions.NoSuchElementException: if one of the 12
            list entries is not present on the page.
    """
    # This function uses its own browser rather than the module-level one, so
    # it must also shut it down; otherwise the PhantomJS process is leaked.
    driver = webdriver.PhantomJS()
    try:
        driver.get("https://shopsearch.taobao.com/search?app=shopsearch&q=&imgfile=&commend=all&ssid=s5-e&search_type=shop&sourceId=tb.index&spm=a21bo.2017.201856-taobao-item.2&ie=utf8&initiative_id=tbindexz_20170306")
        href_list = []
        for i in range(12):
            # XPath positions are 1-based, hence i + 1.
            link = driver.find_element_by_xpath('//*[@id="shopsearchindex-hotcat"]/div/div/ul/li[%s]/a'%(i+1))
            href = link.get_attribute('href') + '&sort=sale-desc'
            print(href)
            href_list.append(href)
        return href_list
    finally:
        driver.quit()

def get_shop_url(store_list, start_url):
    """Append the shop URLs found on the first result pages of a category.

    Loads six result pages of ``start_url`` (``&s=`` offsets 0, 20, ..., 100)
    through the module-level ``driver``, extracts every
    ``//shop<digits>.taobao.com`` reference from the page source and appends
    the ones not yet present to ``store_list``.

    Args:
        store_list: list of shop URLs; extended in place.
        start_url: category search URL to which the ``&s=`` offset is added.

    Returns:
        None. The result is delivered through ``store_list``.
    """
    # Dots are escaped so that only the literal host name matches; an
    # unescaped "." would accept any character in those positions.
    shop_pattern = re.compile(r'//shop\d+\.taobao\.com', re.I)

    # Set mirror of store_list for O(1) duplicate checks; the list itself is
    # kept because callers rely on its insertion order.
    seen = set(store_list)

    for page in range(6):
        driver.get(start_url + '&s=%s' % (page * 20))
        for match in shop_pattern.findall(driver.page_source):
            shop_url = get_total_url(match)
            if shop_url not in seen:
                seen.add(shop_url)
                store_list.append(shop_url)

def get_img_url(shop_url):
    """Find the .jpg images on a shop's front page and report them.

    Loads ``shop_url`` in the module-level ``driver``, extracts every
    ``gdp.alicdn.com`` and ``img.alicdn.com`` ``.jpg`` URL from the page source
    and prints one ``download <shop index>_<image index>.jpeg`` line per image.
    The module-level ``total_count`` is incremented once per call.

    The whole call runs under ``mutex`` because it uses the shared browser and
    the shared counter.

    Args:
        shop_url: absolute URL of the shop front page.

    Returns:
        None.
    """
    global total_count
    # "with" guarantees the lock is released even when the page load raises;
    # a bare acquire()/release() pair would leave it held and block every
    # later caller forever.
    with mutex:
        # Very large viewport; presumably so that lazily loaded images are
        # rendered into the page source -- TODO confirm.
        driver.set_window_size(25600,14400)
        driver.get(shop_url)
        page_source = driver.page_source

        # "(?:https?:)?" makes the scheme optional as a unit (a character
        # class like "[https:]?" only matches a single stray character and
        # yields URLs starting with "://"). The path part stops at whitespace,
        # quotes and angle brackets so one match cannot run across several
        # attributes on the same line.
        img_urls = []
        for host in ('gdp', 'img'):
            pattern = r'''(?:https?:)?//%s\.alicdn\.com/[^\s"'<>]*?\.jpg''' % host
            img_urls += re.findall(pattern, page_source, re.I)

        for count, raw_url in enumerate(img_urls):
            # Absolute form of the image URL; this is what a download needs.
            img_url = get_total_url(raw_url)
            store_name = "%s_%s" % (total_count, count)
            # NOTE: the actual download is intentionally disabled. To enable
            # it, fetch img_url with urllib.request.urlretrieve into a file
            # named "<store_name>.jpeg" under
            # C:\Users\Administrator\Pictures\test and handle OSError there.
            print("download %s.jpeg"%store_name)
        total_count += 1

def get_total_url(url):
    """Return ``url`` with an ``https:`` scheme put in front when it has none.

    A URL starting with ``//`` gets ``https:`` prepended, one starting with a
    single ``/`` gets ``https:/`` prepended, and anything else is returned
    unchanged.
    """
    # Order matters: "//" must be tested before "/" because every string that
    # starts with "//" also starts with "/".
    prefix_table = (
        ('//', 'https:'),
        ('/', 'https:/'),
    )
    for marker, scheme in prefix_table:
        if url.startswith(marker):
            return scheme + url
    return url

def print_url(store_list):
    """Print all shop URLs on a single line, each one followed by a comma."""
    # Build the whole line first, then emit it without a trailing newline.
    joined = "".join("%s," % (entry,) for entry in store_list)
    print(joined, end="")

def main():
    """Run the crawl: categories -> shop URLs -> front-page images.

    Collects the category links, fills the module-level ``store_list`` with
    the shop URLs of every category, then processes each shop with
    ``get_img_url``. The module-level ``driver`` is shut down when the crawl
    ends, whether it finished or failed.
    """
    try:
        for start_url in get_item_href():
            get_shop_url(store_list, start_url)

        for shop_url in store_list:
            print(shop_url)
            # Each shop runs in its own thread that is joined immediately, so
            # shops are still processed one at a time; an exception raised
            # inside the worker stays in that thread and does not stop the
            # loop.
            worker = threading.Thread(target=get_img_url, args=(shop_url,))
            worker.start()
            worker.join()
    finally:
        # Without this the PhantomJS process outlives the script.
        driver.quit()

# Script entry point: the crawl starts as soon as this statement is executed.
main()

内容来自用户分享和网络整理，不保证内容的准确性，如有侵权内容，可联系管理员处理

标签： python 淘宝网图片

相关文章推荐

新的分享

章节导航