您的位置:首页 > 编程语言 > Python开发

python——利用python通过浏览器打开博客页面

2017-03-02 12:31 176 查看
自动打开浏览器,并打开指定的网页内容

PS:本程序缺陷, 只能打开一页的博客内容,翻页内容待扩展

# coding:utf-8

import webbrowser as web
import os
import time
import random
import urllib2
from bs4 import BeautifulSoup

def getPage(): #伪装成浏览器登陆,获取网页源代码
url = 'http://blog.csdn.net/qiqiyingse?viewmode=contents'
baseurl='http://blog.csdn.net'
urlList=[]
headers = {'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
req = urllib2.Request(url=url,headers=headers)
try:
html = urllib2.urlopen(req).read()
except urllib2.HTTPError,e:
print e.code
print e.reason
#将网页内容传给BeautifulSoup解析
page = BeautifulSoup(html,'lxml')
items = page.find_all('div',class_ ='list_item list_view')#找到每一个文章item
for item in items:
content=item.find('a')
url=content.get('href')#找到每一个文章的连接
url=baseurl+url#拼接成可访问的地址
urlList.append(url)
return urlList
urls=getPage()#获取到博客地址的url列表

i=0;
while i<len(urls):#根据文章列表的url决定循环次数,也是打开的网页个数
webs.open_new_tab(urls[i])#通过默认浏览器打开网页
i=i+1
time.sleep(1)#打开后,暂停1s,防止电脑卡死
else:
time.sleep(2)
os.system('taskkill /f /I.M Chrome.exe')#全部打开一遍之后,将浏览器关闭,防止进程过多,电脑死掉
print u"close broswer %d times "%(j)


修改一下代码

实现如下功能:

统计一下个人名下博客文章的数量,并将其按照访问顺序从小到大排序

#coding:utf-8
import urllib2,re,time,random,os
from bs4 import BeautifulSoup
import webbrowser as web
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

def getPage(): #伪装成浏览器登陆,获取网页源代码
url = 'http://blog.csdn.net/qiqiyingse?viewmode=contents'
baseurl='http://blog.csdn.net'
contentList=[]
sortlist=[]
sortlist1=[]
urlList=[]
headers = {'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
req = urllib2.Request(url=url,headers=headers)
try:
html = urllib2.urlopen(req).read()
except urllib2.HTTPError,e:
print e.code
print e.reason
fd=open('counter.txt','w')
page = BeautifulSoup(html,'lxml')
items = page.find_all('div',class_ ='list_item list_view')
print u'总共有文章%d 篇' % len(items)
for item in items:
aa={}
content=item.find('a')

contemtUrl=baseurl+content.get('href')
#print contemtUrl

read_time=item.find('span',class_ ='link_view')
readtime=str(read_time.text.strip())
#print readtime

readtimeNumber = int(filter(str.isdigit, readtime))
#print readtimeNumber
sortlist1.append(readtimeNumber)
#time.sleep(2)
aa['indexs']=readtimeNumber
aa['content']=content.text.strip()
aa['read_time']=readtime
aa['contemtUrl']=contemtUrl
sortlist.append(aa)
sortlist1.sort()
print sortlist1

for i in sortlist1:
for a in sortlist:
if int(i) == int(a['indexs']):
totalcontent=a['content']+'\t'+a['read_time']+'\t'+a['contemtUrl']
print totalcontent
fd.write(totalcontent)
fd.write('\n')
urlList.append(a['contemtUrl'])
contentList.append(totalcontent)
fd.close()
return urlList
urls=getPage()


下面是一个小功能:随机选取 3~10 轮,每轮用浏览器依次打开前 5 篇文章,然后关闭浏览器进程

# Open the first 5 articles in browser tabs for a random number of
# rounds (3..10), killing Chrome between rounds so browser processes
# do not pile up.
count = random.randint(3, 10)
j = 0
while j < count:
    for i in range(5):
        # BUG FIX: the webbrowser API is open_new_tab, not open_new_tb.
        web.open_new_tab(urls[i])
        time.sleep(1)  # pause between tabs
    os.system('taskkill /f /IM Chrome.exe')  # close the browser this round
    j = j + 1
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: