项目进展:淘宝店铺抓取
2017-11-14 13:19
309 查看
1.概要:
本项目旨在发现淘宝上可能存在的侵犯明星肖像权的行为,目标是获取各店铺首页的图片
淘宝店铺首页:https://shopsearch.taobao.com/search?app=shopsearch&q=&imgfile=&commend=all&ssid=s5-e&search_type=shop&sourceId=tb.index&spm=a21bo.2017.201856-taobao-item.2&ie=utf8&initiative_id=tbindexz_20170306
为了缩减数据量,只处理大类别
example(女装):https://shopsearch.taobao.com/search?app=shopsearch&spm=a230r.7195193.0.0.S9RdIQ&q=%E5%A5%B3%E8%A3%85&tracelog=shopsearchnoqcat&sort=sale-desc
每个大类别按照销量排行,取前120个店铺(共6页,每页20个)
通过selenium+phantomjs获取page_source,
通过re模块提取图片的链接地址
2.代码:
from selenium import webdriver
from bs4 import BeautifulSoup as bs
import re
import urllib
from tkinter import *
import threading
# Shared headless PhantomJS browser; get_shop_url and get_img_url load pages through it.
driver = webdriver.PhantomJS()
# Shop URLs collected by get_shop_url (it skips URLs that are already present).
store_list = []
# Incremented by get_img_url; used as the first half of each printed image name.
total_count = 0
# Held by get_img_url while it uses the shared driver and updates total_count.
mutex = threading.Lock()
def get_item_href():
    """Collect the sales-sorted search URL of each hot shop category.

    Opens the Taobao shop-search landing page in a PhantomJS session that is
    private to this call, reads the ``href`` of the first 12 entries of the
    ``shopsearchindex-hotcat`` list and appends ``&sort=sale-desc`` to each.
    Every resulting URL is printed as it is collected.

    Returns:
        list: the 12 category URLs, in page order.
    """
    category_xpath = '//*[@id="shopsearchindex-hotcat"]/div/div/ul/li[%s]/a'
    driver = webdriver.PhantomJS()
    try:
        driver.get("https://shopsearch.taobao.com/search?app=shopsearch&q=&imgfile=&commend=all&ssid=s5-e&search_type=shop&sourceId=tb.index&spm=a21bo.2017.201856-taobao-item.2&ie=utf8&initiative_id=tbindexz_20170306")
        href_list = []
        # XPath positions are 1-based, hence 1..12.
        for position in range(1, 13):
            link = driver.find_element_by_xpath(category_xpath % position)
            href = link.get_attribute('href') + '&sort=sale-desc'
            print(href)
            href_list.append(href)
        return href_list
    finally:
        # This session is local to the function; quit it so the PhantomJS
        # process does not outlive the call.
        driver.quit()
def get_shop_url(store_list, start_url):
    """Add the shops found on the first six result pages to *store_list*.

    Each page is loaded through the module-level ``driver`` and its source is
    scanned for ``//shop<digits>.taobao.com`` hosts.

    Args:
        store_list: list of shop URLs, extended in place; URLs that are
            already present are not added again.
        start_url: category search URL; ``&s=<offset>`` is appended to it to
            select the result page.
    """
    # Dots are escaped so that only a literal "shop<digits>.taobao.com" host
    # matches; an unescaped "." would accept any character in its place.
    shop_pattern = re.compile(r'//shop\d+\.taobao\.com', re.I)
    for page_index in range(6):
        # The offset grows by 20 per page: 0, 20, ..., 100.
        driver.get(start_url + '&s=%s' % (page_index * 20))
        for match in shop_pattern.findall(driver.page_source):
            shop_url = get_total_url(match)
            if shop_url not in store_list:
                store_list.append(shop_url)
def get_img_url(shop_url):
    """Find the ``.jpg`` image URLs on a shop's home page and report them.

    Loads *shop_url* in the module-level ``driver``, extracts every
    ``gdp.alicdn.com`` and ``img.alicdn.com`` JPEG link from the page source
    and prints one ``download <shop>_<image>.jpeg`` line per link, where
    ``<shop>`` is the current value of the global ``total_count`` and
    ``<image>`` is the link's index. The actual download is disabled.
    ``total_count`` is incremented once per call.

    Args:
        shop_url: absolute URL of the shop's home page.
    """
    global total_count
    # "(?:https?:)?" is an optional scheme. The previous "[https:]?" was a
    # character class that captured only one stray character (":" or "s"),
    # yielding URLs like "://img.alicdn.com/..." that get_total_url cannot fix.
    patterns = (
        r'(?:https?:)?//gdp\.alicdn\.com/.*?\.jpg',
        r'(?:https?:)?//img\.alicdn\.com/.*?\.jpg',
    )
    # "with" guarantees the lock is released even if the driver raises;
    # otherwise every later call would block forever.
    with mutex:
        # NOTE(review): the huge window is presumably meant to make
        # lazily-loaded images render -- confirm.
        driver.set_window_size(25600, 14400)
        driver.get(shop_url)
        page_source = driver.page_source
        img_urls = []
        for pattern in patterns:
            img_urls.extend(re.findall(pattern, page_source, re.I))
        download_path = r'C:\Users\Administrator\Pictures\test'
        for count, raw_url in enumerate(img_urls):
            img_url = get_total_url(raw_url)
            # Best effort per image: a failure on one must not stop the rest.
            try:
                store_name = "%s_%s" % (total_count, count)
                # Download left disabled, as in the original script:
                # urllib.request.urlretrieve(img_url, download_path + "%s.jpeg" % store_name)
                print("download %s.jpeg" % store_name)
            except Exception as e:
                print(e)
        total_count += 1
def get_total_url(url):
    """Return *url* with an ``https`` prefix added when it starts with a slash.

    ``//host/path`` becomes ``https://host/path``; a value with a single
    leading slash gets ``https:/`` prepended; anything else is returned
    unchanged.
    """
    if not url.startswith('/'):
        return url
    prefix = 'https:' if url.startswith('//') else 'https:/'
    return prefix + url
def print_url(store_list):
    """Print every entry of *store_list* on one line, each followed by a comma."""
    joined = ''.join(str(entry) + ',' for entry in store_list)
    print(joined, end='')
def main():
    """Run the whole crawl: categories, then shops, then shop images.

    Fills the module-level ``store_list`` with the shops of every category
    returned by ``get_item_href`` and then runs ``get_img_url`` for each shop.
    The shared module-level ``driver`` is quit when the crawl ends, whether
    it finished normally or failed.
    """
    try:
        for start_url in get_item_href():
            get_shop_url(store_list, start_url)
        for shop_url in store_list:
            print(shop_url)
            worker = threading.Thread(target=get_img_url, args=(shop_url,))
            worker.start()
            # Joined right away, so shops are processed one at a time.
            worker.join()
    finally:
        # Without this the PhantomJS process behind the shared driver keeps
        # running after the script is done.
        driver.quit()
# Start the crawl; this call runs whenever the module is loaded.
main()
本项目旨在发现淘宝上可能存在的侵犯明星肖像权的行为,目标是获取各店铺首页的图片
淘宝店铺首页:https://shopsearch.taobao.com/search?app=shopsearch&q=&imgfile=&commend=all&ssid=s5-e&search_type=shop&sourceId=tb.index&spm=a21bo.2017.201856-taobao-item.2&ie=utf8&initiative_id=tbindexz_20170306
为了缩减数据量,只处理大类别
example(女装):https://shopsearch.taobao.com/search?app=shopsearch&spm=a230r.7195193.0.0.S9RdIQ&q=%E5%A5%B3%E8%A3%85&tracelog=shopsearchnoqcat&sort=sale-desc
每个大类别按照销量排行,取前120个店铺(共6页,每页20个)
通过selenium+phantomjs获取page_source,
通过re模块提取图片的链接地址
2.代码:
from selenium import webdriver
from bs4 import BeautifulSoup as bs
import re
import urllib
from tkinter import *
import threading
# Shared headless PhantomJS browser; get_shop_url and get_img_url load pages through it.
driver = webdriver.PhantomJS()
# Shop URLs collected by get_shop_url (it skips URLs that are already present).
store_list = []
# Incremented by get_img_url; used as the first half of each printed image name.
total_count = 0
# Held by get_img_url while it uses the shared driver and updates total_count.
mutex = threading.Lock()
def get_item_href():
    """Collect the sales-sorted search URL of each hot shop category.

    Opens the Taobao shop-search landing page in a PhantomJS session that is
    private to this call, reads the ``href`` of the first 12 entries of the
    ``shopsearchindex-hotcat`` list and appends ``&sort=sale-desc`` to each.
    Every resulting URL is printed as it is collected.

    Returns:
        list: the 12 category URLs, in page order.
    """
    category_xpath = '//*[@id="shopsearchindex-hotcat"]/div/div/ul/li[%s]/a'
    driver = webdriver.PhantomJS()
    try:
        driver.get("https://shopsearch.taobao.com/search?app=shopsearch&q=&imgfile=&commend=all&ssid=s5-e&search_type=shop&sourceId=tb.index&spm=a21bo.2017.201856-taobao-item.2&ie=utf8&initiative_id=tbindexz_20170306")
        href_list = []
        # XPath positions are 1-based, hence 1..12.
        for position in range(1, 13):
            link = driver.find_element_by_xpath(category_xpath % position)
            href = link.get_attribute('href') + '&sort=sale-desc'
            print(href)
            href_list.append(href)
        return href_list
    finally:
        # This session is local to the function; quit it so the PhantomJS
        # process does not outlive the call.
        driver.quit()
def get_shop_url(store_list, start_url):
    """Add the shops found on the first six result pages to *store_list*.

    Each page is loaded through the module-level ``driver`` and its source is
    scanned for ``//shop<digits>.taobao.com`` hosts.

    Args:
        store_list: list of shop URLs, extended in place; URLs that are
            already present are not added again.
        start_url: category search URL; ``&s=<offset>`` is appended to it to
            select the result page.
    """
    # Dots are escaped so that only a literal "shop<digits>.taobao.com" host
    # matches; an unescaped "." would accept any character in its place.
    shop_pattern = re.compile(r'//shop\d+\.taobao\.com', re.I)
    for page_index in range(6):
        # The offset grows by 20 per page: 0, 20, ..., 100.
        driver.get(start_url + '&s=%s' % (page_index * 20))
        for match in shop_pattern.findall(driver.page_source):
            shop_url = get_total_url(match)
            if shop_url not in store_list:
                store_list.append(shop_url)
def get_img_url(shop_url):
    """Find the ``.jpg`` image URLs on a shop's home page and report them.

    Loads *shop_url* in the module-level ``driver``, extracts every
    ``gdp.alicdn.com`` and ``img.alicdn.com`` JPEG link from the page source
    and prints one ``download <shop>_<image>.jpeg`` line per link, where
    ``<shop>`` is the current value of the global ``total_count`` and
    ``<image>`` is the link's index. The actual download is disabled.
    ``total_count`` is incremented once per call.

    Args:
        shop_url: absolute URL of the shop's home page.
    """
    global total_count
    # "(?:https?:)?" is an optional scheme. The previous "[https:]?" was a
    # character class that captured only one stray character (":" or "s"),
    # yielding URLs like "://img.alicdn.com/..." that get_total_url cannot fix.
    patterns = (
        r'(?:https?:)?//gdp\.alicdn\.com/.*?\.jpg',
        r'(?:https?:)?//img\.alicdn\.com/.*?\.jpg',
    )
    # "with" guarantees the lock is released even if the driver raises;
    # otherwise every later call would block forever.
    with mutex:
        # NOTE(review): the huge window is presumably meant to make
        # lazily-loaded images render -- confirm.
        driver.set_window_size(25600, 14400)
        driver.get(shop_url)
        page_source = driver.page_source
        img_urls = []
        for pattern in patterns:
            img_urls.extend(re.findall(pattern, page_source, re.I))
        download_path = r'C:\Users\Administrator\Pictures\test'
        for count, raw_url in enumerate(img_urls):
            img_url = get_total_url(raw_url)
            # Best effort per image: a failure on one must not stop the rest.
            try:
                store_name = "%s_%s" % (total_count, count)
                # Download left disabled, as in the original script:
                # urllib.request.urlretrieve(img_url, download_path + "%s.jpeg" % store_name)
                print("download %s.jpeg" % store_name)
            except Exception as e:
                print(e)
        total_count += 1
def get_total_url(url):
    """Return *url* with an ``https`` prefix added when it starts with a slash.

    ``//host/path`` becomes ``https://host/path``; a value with a single
    leading slash gets ``https:/`` prepended; anything else is returned
    unchanged.
    """
    if not url.startswith('/'):
        return url
    prefix = 'https:' if url.startswith('//') else 'https:/'
    return prefix + url
def print_url(store_list):
    """Print every entry of *store_list* on one line, each followed by a comma."""
    joined = ''.join(str(entry) + ',' for entry in store_list)
    print(joined, end='')
def main():
    """Run the whole crawl: categories, then shops, then shop images.

    Fills the module-level ``store_list`` with the shops of every category
    returned by ``get_item_href`` and then runs ``get_img_url`` for each shop.
    The shared module-level ``driver`` is quit when the crawl ends, whether
    it finished normally or failed.
    """
    try:
        for start_url in get_item_href():
            get_shop_url(store_list, start_url)
        for shop_url in store_list:
            print(shop_url)
            worker = threading.Thread(target=get_img_url, args=(shop_url,))
            worker.start()
            # Joined right away, so shops are processed one at a time.
            worker.join()
    finally:
        # Without this the PhantomJS process behind the shared driver keeps
        # running after the script is done.
        driver.quit()
# Start the crawl; this call runs whenever the module is loaded.
main()
相关文章推荐
- Python开源爬虫项目代码:抓取淘宝、京东、QQ、知网数据--转
- PHP抓取淘宝店铺等级、评分
- [置顶] 【python 淘宝爬虫】python 淘宝店铺名称,旺旺,销售量 抓取
- PHP抓取淘宝店铺等级、评分
- 纯css实现超宽图片全屏居中(兼容淘宝店铺)
- 淘宝店铺用ps+dw装修的步骤
- 结对编程项目进展——第三周
- 武汉项目进展情况
- 淘宝店铺(宝贝描述模板)克隆攻略
- 使用CURL抓取淘宝页面
- 配置管理员(CM)如何接手一个正在进展中的项目?
- 关于“淘宝爆款”的数据抓取与数据分析
- 淘宝店铺将导航和店招修改成通栏
- 项目进展日志2
- [Symbian项目进展]和两个师兄讨论
- 个人项目四则运算生成程序进展——第三周
- 常用HTML技术 淘宝店铺装修
- 项目进展
- PHP抓取淘宝商品的用户晒单评论+图片+搜索商品列表实例
- 淘宝开放API菜鸟教程——根据卖家昵称获取卖家店铺ID