您的位置：首页 > 编程语言 > Python开发

Python 3.5 + BeautifulSoup 4.4 QQ（扣扣）国内新闻爬虫

2016-11-19 17:22 483 查看

Python代码

#!/usr/bin/python3

# -*- coding: UTF-8 -*-

'''

Created on 2016年11月18日



@author: baoyou curiousby@163.com

'''





 下载

#http://ssdfz001.iteye.com/blog/2228685



import urllib.request

import urllib.parse

import os, sys

import codecs

import bs4

from bs4 import BeautifulSoup

import re

import urllib.request, urllib.parse, http.cookiejar





# Root URL of the site; article hrefs taken from the listing page are
# combined with it to form the full article URL.
base_url = 'http://news.qq.com/'
# Listing page that is crawled by default.
url = 'http://news.qq.com/c/816guonei_1.htm'

# Directory under which all downloaded material is stored.
save_path = 'C:/Users/cmcc-B100036/Desktop/'
# Names of the per-article sub-directories for the image and the text file.
save_img = 'img'
save_txt = 'text'

# Extraction regex for one news entry. Groups:
#   1 - article href, 2 - thumbnail src, 3 - title text, 4 - everything after.
# The previous pattern used `[^</a>]*` (a character class that also rejected
# the letters "a" and "/") and `[^]*` (not valid in Python's `re`).
# NOTE(review): not referenced by the functions visible in this file.
reg = (
    r'<a target="_blank" class="pic" href="([^"]*)">'
    r'<img class="picto" src="([^"]*)"></a>'
    r'<a target="_blank" class="linkto" href="[^"]*">([^<]*)</a>'
    r'([\s\S]*)'
)

# Browser-like request headers.
# NOTE(review): not referenced by the functions visible in this file. The
# Accept-Encoding value advertises gzip; a caller that sends these headers
# through urllib must decompress the body itself.
heads = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'zh-CN,zh;q=0.8',
    'Cache-Control': 'max-age=0',
    'Host': 'news.qq.com',
    'Proxy-Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36',
}



#获取网页信息

def getHtml(url, encoding='gbk'):
    """Fetch ``url`` and return the response body decoded as text.

    Args:
        url: Address to open with ``urllib.request.urlopen``.
        encoding: Codec used to decode the raw response bytes. Defaults to
            ``'gbk'``, the codec this function previously hard-coded.

    Returns:
        The decoded response body as ``str``.

    Raises:
        urllib.error.URLError: If the URL cannot be opened.
        UnicodeDecodeError: If the body is not valid in ``encoding``.
    """
    # The context manager closes the response even when read() raises.
    with urllib.request.urlopen(url) as response:
        raw = response.read()
    return raw.decode(encoding)



#获取新闻列表

def getList(url):
    """Return the news entries found on the listing page at ``url``.

    The page is downloaded with ``getHtml`` and parsed with BeautifulSoup's
    ``html.parser``; every ``<div class="Q-tpList">`` element is one entry.

    Args:
        url: Address of the listing page.

    Returns:
        A plain ``list`` of the matching BeautifulSoup tags; empty when the
        page contains no such element.
    """
    soup = BeautifulSoup(getHtml(url), 'html.parser')
    # list() copies the search result into a plain list, as the former
    # append loop did, without shadowing the builtin name.
    return list(soup.find_all('div', class_='Q-tpList'))

#获取文本信息到本地

def loadText(contents):
    """Save every news entry in ``contents`` to disk by calling ``load``.

    Args:
        contents: Iterable of entries, as returned by ``getList``.

    Returns:
        None.
    """
    for content in contents:
        load(content)

#下载资源

def load(content):
    """Save one news entry (URL, image, title, summary) below ``save_path``.

    ``content`` is searched for an ``<a class="pic">`` link (article href and
    thumbnail ``<img>``), an ``<a class="linkto">`` link (title) and the first
    ``<p>`` (summary). A directory derived from the article href is created
    below the module-level ``save_path``. The thumbnail is downloaded to
    ``<dir>/<save_img>/img.png`` and one tab-separated record
    (article URL, image URL, title, summary) is written as UTF-8 to
    ``<dir>/<save_txt>/text.txt``.

    Args:
        content: BeautifulSoup tag for one entry, as returned by ``getList``.

    Returns:
        None. An entry without a ``pic`` link (with href) or without a
        ``linkto`` link is skipped and nothing is written for it.
    """
    pic_link = content.find('a', class_='pic')
    title_link = content.find('a', class_='linkto')
    if pic_link is None or title_link is None or not pic_link.get('href'):
        # Without the href there is nothing to name the output directory
        # after; skip the entry instead of failing on None.
        return

    urlsuffix = pic_link['href']
    # urljoin keeps absolute hrefs as they are and resolves relative ones
    # against base_url; plain concatenation produced a doubled host or a
    # doubled slash for absolute / root-relative hrefs.
    detailurl = urllib.parse.urljoin(base_url, urlsuffix)
    detailtitle = title_link.get_text()

    thumbnail = pic_link.img
    detailimg = ''
    if thumbnail is not None and thumbnail.get('src'):
        # Also resolves protocol-relative ("//host/...") image addresses.
        detailimg = urllib.parse.urljoin(base_url, thumbnail['src'])

    summary = content.find('p')
    detailcontent = summary.get_text() if summary is not None else ''

    # Derive the directory from the path component of the href only, so a
    # scheme, host or query string never ends up in a file-system path.
    rel_path = urllib.parse.urlparse(urlsuffix).path
    stem, ext = os.path.splitext(rel_path)
    if ext.lower() in ('.htm', '.html'):
        rel_path = stem
    # The href comes from downloaded HTML: drop empty, "." and ".." segments
    # so the result cannot point outside save_path.
    segments = [seg for seg in rel_path.split('/') if seg not in ('', '.', '..')]
    news_dir = os.path.join(save_path, *segments)

    newstext = os.path.join(news_dir, save_txt)
    newsimg = os.path.join(news_dir, save_img)
    # exist_ok replaces the former exists()/makedirs() pairs, which could
    # fail if the directory appeared between the check and the creation.
    for directory in (news_dir, newstext, newsimg):
        os.makedirs(directory, 0o755, exist_ok=True)

    if detailimg:
        urllib.request.urlretrieve(detailimg, os.path.join(newsimg, 'img.png'))
    with codecs.open(os.path.join(newstext, 'text.txt'), 'w', 'utf-8') as fp:
        fp.write('\t'.join((detailurl, detailimg, detailtitle, detailcontent)))



if __name__ == "__main__":
    # Script entry point: crawl one listing page and save every entry on it.
    # An optional first command-line argument overrides the module-level
    # default `url`; previously the same address was hard-coded here again.
    print('---------------------start--------------------------------------')
    target_url = sys.argv[1] if len(sys.argv) > 1 else url
    contents = getList(target_url)
    loadText(contents)
    print('---------------------end---------------------------------------')

内容来自用户分享和网络整理，不保证内容的准确性，如有侵权内容，可联系管理员处理

标签： java 二维码 java开发扫描源码

相关文章推荐

新的分享

章节导航

python3.5 beautiful4.4 扣扣国内新闻 爬虫

python3.5 beautiful4.4 扣扣国内新闻爬虫