
Scraping "Tech" and "Entertainment" Articles from Toutiao (今日头条)

2018-03-09 16:52
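This script scrapes articles from Toutiao's "Tech" and "Entertainment" channels. Because the _signature request parameter could not be cracked, the channel feeds are rendered with selenium (PhantomJS) to collect article URLs; the article pages themselves are then fetched in parallel with requests, filtered down to common Chinese characters, and appended to articles.csv as "tag,text" rows.
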
import multiprocessing
import random
import re
import time

import requests
from bs4 import BeautifulSoup
from selenium import webdriver

# Append results to 'articles.csv'
fw = open('articles.csv', 'a', encoding='utf8')
# Request headers
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.62 Safari/537.36'
}
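
# The original post calls get_common_hanzi() below without showing its body.
# A minimal sketch, assuming the ~3000 common characters are stored in a
# local text file (common_hanzi.txt is a hypothetical name); a set behaves
# the same as the original string under the `c in hanzi` test, but with
# O(1) lookups:
def get_common_hanzi(path='common_hanzi.txt'):
    with open(path, encoding='utf8') as f:
        # Keep every non-whitespace character from the file
        return {c for c in f.read() if not c.isspace()}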

# Get the ~3000 most common Chinese characters
hanzi = get_common_hanzi()  # "啊阿埃挨哎唉哀皑癌蔼矮艾碍爱隘鞍氨安俺......"

# Parse an article detail page
def parse_page(url, tag):
    res = requests.get(url, headers=headers)
    try:
        # The article body is embedded in the page source as a `content:`
        # field, so pull it out with a regex
        text = re.findall(r'content:(.*\;\'\,)?', res.text, re.S)[0]
    except:
        print(url, tag, res.headers)
        return
    result = list()
    # Keep only common Chinese characters; drop punctuation and other
    # special characters
    for c in text:
        if c in hanzi:
            result.append(c)
    fw.write('{},{}\n'.format(tag, ''.join(result)))
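
# A quick sanity check of parse_page on a single article before launching the
# full crawl (the group id below is hypothetical, not a real article):
#
#   parse_page('https://www.toutiao.com/group/1234567890/', 'news_tech')
#   fw.flush()  # push the row out to articles.csv immediately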

# Collect article URLs. Since the _signature parameter could not be cracked,
# selenium is used to render the channel page instead
def crawl_news(url):
    driver = webdriver.PhantomJS(executable_path='utils/phantomjs.exe')
    tag = url.split('/')[-2]
    driver.implicitly_wait(5)  # seconds
    driver.get(url)
    time.sleep(5)
    # Scroll to the bottom 100 times to trigger lazy loading of the feed
    for _ in range(100):
        time.sleep(3)
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    soup = BeautifulSoup(driver.page_source, 'lxml')
    items = soup.select('.wcommonFeed ul .item')
    # print(len(items))
    results = list()
    for item in items:
        try:
            href = item.find('a', class_='link title')['href']
            if href.find('/group/') != -1:
                results.append(['https://www.toutiao.com{}'.format(href), tag])
        except:
            pass
    driver.close()
    return results
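
# Note: PhantomJS has since been deprecated and removed from Selenium. A
# rough headless-Chrome equivalent of the driver setup above (an untested
# sketch assuming a matching chromedriver on PATH, not from the original
# post):
#
#   from selenium.webdriver.chrome.options import Options
#   opts = Options()
#   opts.add_argument('--headless')
#   driver = webdriver.Chrome(options=opts)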

def crawl_begin():
    # The "Tech" and "Entertainment" channels
    news_url = ['https://www.toutiao.com/ch/news_tech/', 'https://www.toutiao.com/ch/news_entertainment/']
    items = list()
    for url in news_url:
        items.extend(crawl_news(url=url))
    print(len(items))
    # Shuffle the order of items
    random.shuffle(items)
    # Use a process pool
    pool = multiprocessing.Pool(processes=4)
    for item in items:
        pool.apply_async(parse_page, (item[0], item[1]))
    # Call close() before join(), otherwise join() raises an error; after
    # close() no new tasks can be submitted to the pool, and join() waits
    # for all child processes to finish
    pool.close()
    pool.join()
    print("done.")

if __name__ == '__main__':
    crawl_begin()
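
Each row of articles.csv comes out as "tag,text", with the channel slug (news_tech or news_entertainment) as the tag and the article body reduced to common Chinese characters, so the file can be used directly as a labeled corpus. One caveat: each worker process in the pool holds its own append-mode handle to articles.csv, so rows written concurrently by different workers may interleave on some platforms.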