【实现】[抓取知网论文标题摘要] 终于能用Sublime直接编译运行python了,可是却无法导入bs4 | BeautifulSoup
2018-03-10 21:55
796 查看
本文封装的实例下载:https://download.csdn.net/download/qq_19741181/10279675
文章主要参考:http://blog.csdn.net/Eastmount/article/details/78534119?locationNum=6&fps=1
-----------------------------------------------------------
(C:\ProgramData\Anaconda3) C:\Users\d>pip install --upgrade beautifulsoup4
Requirement already up-to-date: beautifulsoup4 in c:\programdata\anaconda3\lib\site-packages
(C:\ProgramData\Anaconda3) C:\Users\d>
----跟着网上升级试了试 :参考:http://blog.csdn.net/sinat_26599509/article/details/50609646------------------
>>> import time
>>> import re
>>> import urllib
>>> import bs4
>>> from bs4 import BeautifulSoup
>>> if __name__ == '__main__':
... url = "http://search.cnki.net/Search.aspx?q=python&rank=relevant&cluster=all&val=&p=0"
... content = urllib.urlopen(url).read()
... soup = BeautifulSoup(content,"html.parser")
... wz_tab = soup.find_all("div",class_="wz_tab")
... num = 0
... for tab in wz_tab:
... title = tab.find("h3")
... print(title.get_text())
... urls = tab.find("h3").find_all("a")
... flag = 0
... for u in urls:
... if flag==0:
... print(u.get('href'))
... flag += 1
... abstract = tab.find(attrs={"class":"width715"}).get_text()
... print(abstract)
... other = tab.find(attrs={"class":"year-count"})
... content = other.get_text().split("\n")
... cb_from = other.find_all("span")
... flag = 0
... for u in cb_from:
... if flag==0:
... print(u.get("title"))
... flag += 1
... mode = re.compile(r'\d+\.?\d*')
... number = mode.findall(content[0])
... print(number[0])
... mode = re.compile(r'\d+\.?\d*')
... number = mode.findall(content[1])
... if len(number)==1:
... print(number[0])
... elif len(number)==2:
... print(number[0], number[1])
... num = num + 1
...
Traceback (most recent call last):
File "<stdin>", line 3, in <module>
AttributeError: module 'urllib' has no attribute 'urlopen'
>>>
-------------------出问题了 = = ----------------------------------
原来是因为,Python 3 中 urlopen 被移到了 urllib.request 模块,需要 import urllib.request 并改用 urllib.request.urlopen
>>> import urllib.request
>>> if __name__ == '__main__':
... url = "http://search.cnki.net/Search.aspx?q=python&rank=relevant&cluster=all&val=&p=0"
... content = urllib.request.urlopen(url).read()
... soup = BeautifulSoup(content,"html.parser")
... wz_tab = soup.find_all("div",class_="wz_tab")
... num = 0
... for tab in wz_tab:
... title = tab.find("h3")
... print(title.get_text())
... urls = tab.find("h3").find_all("a")
... flag = 0
... for u in urls:
... if flag==0:
... print(u.get('href'))
... flag += 1
... abstract = tab.find(attrs={"class":"width715"}).get_text()
... print(abstract)
... other = tab.find(attrs={"class":"year-count"})
... content = other.get_text().split("\n")
... cb_from = other.find_all("span")
... flag = 0
... for u in cb_from:
... if flag==0:
... print(u.get("title"))
... flag += 1
... mode = re.compile(r'\d+\.?\d*')
... number = mode.findall(content[0])
... print(number[0])
... mode = re.compile(r'\d+\.?\d*')
... number = mode.findall(content[1])
... if len(number)==1:
... print(number[0])
... elif len(number)==2:
... print(number[0], number[1])
... num = num + 1
...
全系统模拟器配置与仿真控制机制设计 http://epub.cnki.net/grid2008/brief/detailj.aspx?filename=1017738945.nh&dbname=CMFD201801
模拟器能够在硬件设计的同时进行软件的开发,实现软硬件交互设计,更好地满足系统的性能要求,缩短开发周期,加快产品上市。但是由于System C/C++等系统语言缺乏灵活性,使不同组件进行相连时,不仅繁琐,而且容易出错。而Python具有语言简洁、灵活、可扩展性、解释 性等特点,将Python语言用于全系统模拟器的配置以及控...
----------------问题来了,怎么导出txt?-----------------------------
在运行时 python test.py >./test.txt 参考文章:https://zhidao.baidu.com/question/412287743.html---------------可是还是不能使用bs4,怎么办?-------------------------
参考文章:http://blog.csdn.net/u010358168/article/details/62040603 | 升级bs4
(C:\ProgramData\Anaconda3) C:\Users\d>pip3 install -upgrade beautifulsoup4
Usage:
pip install [options] <requirement specifier> [package-index-options] ...
pip install [options] -r <requirements file> [package-index-options] ...
pip install [options] [-e] <vcs project url> ...
pip install [options] [-e] <local project path> ...
pip install [options] <archive url/path> ...
no such option: -u
(C:\ProgramData\Anaconda3) C:\Users\d>
-------------------------
于是我就到spyder里面run了--------------------------------------------------
-----------------------------------------------------------
>>> f = open('E:/8484.txt','a+',encoding = 'utf-8')
>>> print(title.get_text,abstract,file = f)
>>> print(abstract,file = f)
>>> f.close()
>>>
>>> f = open('E:/8484.txt','a+',encoding = 'utf-8')
>>> print(title.get_text,abstract,file = f)
>>> f.close()
>>>
-----------------------------------------------------------# -*- coding: utf-8 -*-
import time
import re
import urllib.request
from bs4 import BeautifulSoup

# Scrape one CNKI search-result page: for every result block print the
# title, the first detail URL, the abstract, the publishing journal,
# the year, and the download/citation counts.
if __name__ == '__main__':
    url = "http://search.cnki.net/Search.aspx?q=python&rank=relevant&cluster=all&val=&p=0"
    # Python 3: urlopen lives in urllib.request (urllib.urlopen was removed;
    # the original Python 2 call raised AttributeError here)
    content = urllib.request.urlopen(url).read()
    soup = BeautifulSoup(content, "html.parser")
    # Locate the result blocks holding title + abstract
    wz_tab = soup.find_all("div", class_="wz_tab")
    num = 0
    for tab in wz_tab:
        # Title
        title = tab.find("h3")
        print(title.get_text())
        urls = tab.find("h3").find_all("a")
        # Detail hyperlink: only the first <a> is the article URL
        flag = 0
        for u in urls:
            if flag == 0:
                print(u.get('href'))
            flag += 1
        # Abstract
        abstract = tab.find(attrs={"class": "width715"}).get_text()
        print(abstract)
        # Other metadata (journal, year, counts)
        other = tab.find(attrs={"class": "year-count"})
        content = other.get_text().split("\n")
        # The visible text cannot be split reliably on double spaces,
        # e.g. 《怀化学院学报》 2017年 第09期, so read the journal name from
        # the span's title attribute instead:
        # <span title="北方文学(下旬)">《北方文学(下旬)》 2017年 第06期</span>
        cb_from = other.find_all("span")
        flag = 0
        for u in cb_from:
            if flag == 0:  # journal name from the title attribute
                print(u.get("title"))
            flag += 1
        mode = re.compile(r'\d+\.?\d*')
        number = mode.findall(content[0])
        print(number[0])  # year
        # Download count / citation count
        number = mode.findall(content[1])
        if len(number) == 1:
            print(number[0])
        elif len(number) == 2:
            print(number[0], number[1])
        # The original paste garbled this line into "num = nu / ca10 / m + 1"
        num = num + 1
---------------------------------------------------------------------# -*- coding: utf-8 -*-
import time
import re
import sys
import codecs
import urllib
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

# Selenium version: drive Firefox through the CNKI search page and print
# title, abstract, journal, year and download/citation counts per result.
# Ported from Python 2 print statements to Python 3 print() calls so it
# runs under the Anaconda3 environment used in this post.
if __name__ == '__main__':
    url = "http://search.cnki.net/Search.aspx?q=python&rank=relevant&cluster=all&val=&p=0"
    driver = webdriver.Firefox()
    driver.get(url)
    # Titles
    content = driver.find_elements_by_xpath("//div[@class='wz_content']/h3")
    # Abstracts
    abstracts = driver.find_elements_by_xpath("//div[@class='width715']")
    # Journal + year
    other = driver.find_elements_by_xpath("//span[@class='year-count']/span[1]")
    mode = re.compile(r'\d+\.?\d*')
    # Download count / citation count
    num = driver.find_elements_by_xpath("//span[@class='count']")
    # The four element lists are parallel; walk them by a shared index
    i = 0
    for tag in content:
        print(tag.text)
        print(abstracts[i].text)
        print(other[i].get_attribute("title"))
        number = mode.findall(other[i].text)
        print(number[0])  # year
        number = mode.findall(num[i].text)
        # Counts may be missing on the page, e.g. (100) () — typo "确实"
        # in the original comment meant "缺失" (missing)
        if len(number) == 1:
            print(number[0])
        elif len(number) == 2:
            print(number[0], number[1])
        print('')
        i = i + 1
        tag.click()
        time.sleep(1)
--------------------------------------------------------# -*- coding: utf-8 -*-
import time
import re
import sys
import codecs
import urllib
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

# Selenium version with window switching: for each search result, click
# through to the detail page (which opens a new window), print the author /
# summary info there, close it, and return to the main window.
# Ported from Python 2 print statements to Python 3 print() calls, and the
# deprecated driver.switch_to_window(...) replaced by driver.switch_to.window(...).
if __name__ == '__main__':
    url = "http://search.cnki.net/Search.aspx?q=python&rank=relevant&cluster=all&val=&p=0"
    driver = webdriver.Firefox()
    driver.get(url)
    # Titles
    content = driver.find_elements_by_xpath("//div[@class='wz_content']/h3")
    # Abstracts
    abstracts = driver.find_elements_by_xpath("//div[@class='width715']")
    # Journal + year
    other = driver.find_elements_by_xpath("//span[@class='year-count']/span[1]")
    mode = re.compile(r'\d+\.?\d*')
    # Download count / citation count
    num = driver.find_elements_by_xpath("//span[@class='count']")
    # Remember the main window handle so we can come back after each click
    now_handle = driver.current_window_handle
    # The element lists are parallel; walk them by a shared index
    i = 0
    for tag in content:
        print(tag.text)
        print(abstracts[i].text)
        print(other[i].get_attribute("title"))
        number = mode.findall(other[i].text)
        print(number[0])  # year
        number = mode.findall(num[i].text)
        # Counts may be missing on the page, e.g. (100) ()
        if len(number) == 1:
            print(number[0])
        elif len(number) == 2:
            print(number[0], number[1])
        print('')
        i = i + 1
        tag.click()
        time.sleep(2)
        # The click opened a second window; collect all handles and switch
        # to every handle that is not the main window
        all_handles = driver.window_handles
        for handle in all_handles:
            if handle != now_handle:
                # Show which handle we are about to select
                print(handle)
                driver.switch_to.window(handle)
                time.sleep(1)
                print(u'弹出界面信息')
                print(driver.current_url)
                print(driver.title)
                # Author / summary block on the detail page
                elem_sub = driver.find_element_by_xpath("//div[@class='summary pad10']")
                print(u"作者", elem_sub.text)
                print('')
                # Close the popup window
                driver.close()
        # Print the main handle and return to it before the next result
        print(now_handle)
        driver.switch_to.window(now_handle)
py抓取知网论文摘要
-----------------------------------------------------------文章主要参考:http://blog.csdn.net/Eastmount/article/details/78534119?locationNum=6&fps=1
-----------------------------------------------------------
(C:\ProgramData\Anaconda3) C:\Users\d>pip install --upgrade beautifulsoup4
Requirement already up-to-date: beautifulsoup4 in c:\programdata\anaconda3\lib\site-packages
(C:\ProgramData\Anaconda3) C:\Users\d>
----跟着网上升级试了试 :参考:http://blog.csdn.net/sinat_26599509/article/details/50609646------------------
>>> import time
>>> import re
>>> import urllib
>>> import bs4
>>> from bs4 import BeautifulSoup
>>> if __name__ == '__main__':
... url = "http://search.cnki.net/Search.aspx?q=python&rank=relevant&cluster=all&val=&p=0"
... content = urllib.urlopen(url).read()
... soup = BeautifulSoup(content,"html.parser")
... wz_tab = soup.find_all("div",class_="wz_tab")
... num = 0
... for tab in wz_tab:
... title = tab.find("h3")
... print(title.get_text())
... urls = tab.find("h3").find_all("a")
... flag = 0
... for u in urls:
... if flag==0:
... print(u.get('href'))
... flag += 1
... abstract = tab.find(attrs={"class":"width715"}).get_text()
... print(abstract)
... other = tab.find(attrs={"class":"year-count"})
... content = other.get_text().split("\n")
... cb_from = other.find_all("span")
... flag = 0
... for u in cb_from:
... if flag==0:
... print(u.get("title"))
... flag += 1
... mode = re.compile(r'\d+\.?\d*')
... number = mode.findall(content[0])
... print(number[0])
... mode = re.compile(r'\d+\.?\d*')
... number = mode.findall(content[1])
... if len(number)==1:
... print(number[0])
... elif len(number)==2:
... print(number[0], number[1])
... num = num + 1
...
Traceback (most recent call last):
File "<stdin>", line 3, in <module>
AttributeError: module 'urllib' has no attribute 'urlopen'
>>>
-------------------出问题了 = = ----------------------------------
原来是因为,Python 3 中 urlopen 被移到了 urllib.request 模块,需要 import urllib.request 并改用 urllib.request.urlopen
>>> import urllib.request
>>> if __name__ == '__main__':
... url = "http://search.cnki.net/Search.aspx?q=python&rank=relevant&cluster=all&val=&p=0"
... content = urllib.request.urlopen(url).read()
... soup = BeautifulSoup(content,"html.parser")
... wz_tab = soup.find_all("div",class_="wz_tab")
... num = 0
... for tab in wz_tab:
... title = tab.find("h3")
... print(title.get_text())
... urls = tab.find("h3").find_all("a")
... flag = 0
... for u in urls:
... if flag==0:
... print(u.get('href'))
... flag += 1
... abstract = tab.find(attrs={"class":"width715"}).get_text()
... print(abstract)
... other = tab.find(attrs={"class":"year-count"})
... content = other.get_text().split("\n")
... cb_from = other.find_all("span")
... flag = 0
... for u in cb_from:
... if flag==0:
... print(u.get("title"))
... flag += 1
... mode = re.compile(r'\d+\.?\d*')
... number = mode.findall(content[0])
... print(number[0])
... mode = re.compile(r'\d+\.?\d*')
... number = mode.findall(content[1])
... if len(number)==1:
... print(number[0])
... elif len(number)==2:
... print(number[0], number[1])
... num = num + 1
...
全系统模拟器配置与仿真控制机制设计 http://epub.cnki.net/grid2008/brief/detailj.aspx?filename=1017738945.nh&dbname=CMFD201801
模拟器能够在硬件设计的同时进行软件的开发,实现软硬件交互设计,更好地满足系统的性能要求,缩短开发周期,加快产品上市。但是由于System C/C++等系统语言缺乏灵活性,使不同组件进行相连时,不仅繁琐,而且容易出错。而Python具有语言简洁、灵活、可扩展性、解释 性等特点,将Python语言用于全系统模拟器的配置以及控...
----------------问题来了,怎么导出txt?-----------------------------
在运行时 python test.py >./test.txt 参考文章:https://zhidao.baidu.com/question/412287743.html---------------可是还是不能使用bs4,怎么办?-------------------------
参考文章:http://blog.csdn.net/u010358168/article/details/62040603 | 升级bs4
(C:\ProgramData\Anaconda3) C:\Users\d>pip3 install -upgrade beautifulsoup4
Usage:
pip install [options] <requirement specifier> [package-index-options] ...
pip install [options] -r <requirements file> [package-index-options] ...
pip install [options] [-e] <vcs project url> ...
pip install [options] [-e] <local project path> ...
pip install [options] <archive url/path> ...
no such option: -u
(C:\ProgramData\Anaconda3) C:\Users\d>
-------------------------
于是我就到spyder里面run了--------------------------------------------------
-----------------------------------------------------------
>>> f = open('E:/8484.txt','a+',encoding = 'utf-8')
>>> print(title.get_text,abstract,file = f)
>>> print(abstract,file = f)
>>> f.close()
>>>
>>> f = open('E:/8484.txt','a+',encoding = 'utf-8')
>>> print(title.get_text,abstract,file = f)
>>> f.close()
>>>
-----------------------------------------------------------# -*- coding: utf-8 -*-
import time
import re
import urllib.request
from bs4 import BeautifulSoup

# Scrape one CNKI search-result page: for every result block print the
# title, the first detail URL, the abstract, the publishing journal,
# the year, and the download/citation counts.
if __name__ == '__main__':
    url = "http://search.cnki.net/Search.aspx?q=python&rank=relevant&cluster=all&val=&p=0"
    # Python 3: urlopen lives in urllib.request (urllib.urlopen was removed;
    # the original Python 2 call raised AttributeError here)
    content = urllib.request.urlopen(url).read()
    soup = BeautifulSoup(content, "html.parser")
    # Locate the result blocks holding title + abstract
    wz_tab = soup.find_all("div", class_="wz_tab")
    num = 0
    for tab in wz_tab:
        # Title
        title = tab.find("h3")
        print(title.get_text())
        urls = tab.find("h3").find_all("a")
        # Detail hyperlink: only the first <a> is the article URL
        flag = 0
        for u in urls:
            if flag == 0:
                print(u.get('href'))
            flag += 1
        # Abstract
        abstract = tab.find(attrs={"class": "width715"}).get_text()
        print(abstract)
        # Other metadata (journal, year, counts)
        other = tab.find(attrs={"class": "year-count"})
        content = other.get_text().split("\n")
        # The visible text cannot be split reliably on double spaces,
        # e.g. 《怀化学院学报》 2017年 第09期, so read the journal name from
        # the span's title attribute instead:
        # <span title="北方文学(下旬)">《北方文学(下旬)》 2017年 第06期</span>
        cb_from = other.find_all("span")
        flag = 0
        for u in cb_from:
            if flag == 0:  # journal name from the title attribute
                print(u.get("title"))
            flag += 1
        mode = re.compile(r'\d+\.?\d*')
        number = mode.findall(content[0])
        print(number[0])  # year
        # Download count / citation count
        number = mode.findall(content[1])
        if len(number) == 1:
            print(number[0])
        elif len(number) == 2:
            print(number[0], number[1])
        # The original paste garbled this line into "num = nu / ca10 / m + 1"
        num = num + 1
---------------------------------------------------------------------# -*- coding: utf-8 -*-
import time
import re
import sys
import codecs
import urllib
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

# Selenium version: drive Firefox through the CNKI search page and print
# title, abstract, journal, year and download/citation counts per result.
# Ported from Python 2 print statements to Python 3 print() calls so it
# runs under the Anaconda3 environment used in this post.
if __name__ == '__main__':
    url = "http://search.cnki.net/Search.aspx?q=python&rank=relevant&cluster=all&val=&p=0"
    driver = webdriver.Firefox()
    driver.get(url)
    # Titles
    content = driver.find_elements_by_xpath("//div[@class='wz_content']/h3")
    # Abstracts
    abstracts = driver.find_elements_by_xpath("//div[@class='width715']")
    # Journal + year
    other = driver.find_elements_by_xpath("//span[@class='year-count']/span[1]")
    mode = re.compile(r'\d+\.?\d*')
    # Download count / citation count
    num = driver.find_elements_by_xpath("//span[@class='count']")
    # The four element lists are parallel; walk them by a shared index
    i = 0
    for tag in content:
        print(tag.text)
        print(abstracts[i].text)
        print(other[i].get_attribute("title"))
        number = mode.findall(other[i].text)
        print(number[0])  # year
        number = mode.findall(num[i].text)
        # Counts may be missing on the page, e.g. (100) () — typo "确实"
        # in the original comment meant "缺失" (missing)
        if len(number) == 1:
            print(number[0])
        elif len(number) == 2:
            print(number[0], number[1])
        print('')
        i = i + 1
        tag.click()
        time.sleep(1)
--------------------------------------------------------# -*- coding: utf-8 -*-
import time
import re
import sys
import codecs
import urllib
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

# Selenium version with window switching: for each search result, click
# through to the detail page (which opens a new window), print the author /
# summary info there, close it, and return to the main window.
# Ported from Python 2 print statements to Python 3 print() calls, and the
# deprecated driver.switch_to_window(...) replaced by driver.switch_to.window(...).
if __name__ == '__main__':
    url = "http://search.cnki.net/Search.aspx?q=python&rank=relevant&cluster=all&val=&p=0"
    driver = webdriver.Firefox()
    driver.get(url)
    # Titles
    content = driver.find_elements_by_xpath("//div[@class='wz_content']/h3")
    # Abstracts
    abstracts = driver.find_elements_by_xpath("//div[@class='width715']")
    # Journal + year
    other = driver.find_elements_by_xpath("//span[@class='year-count']/span[1]")
    mode = re.compile(r'\d+\.?\d*')
    # Download count / citation count
    num = driver.find_elements_by_xpath("//span[@class='count']")
    # Remember the main window handle so we can come back after each click
    now_handle = driver.current_window_handle
    # The element lists are parallel; walk them by a shared index
    i = 0
    for tag in content:
        print(tag.text)
        print(abstracts[i].text)
        print(other[i].get_attribute("title"))
        number = mode.findall(other[i].text)
        print(number[0])  # year
        number = mode.findall(num[i].text)
        # Counts may be missing on the page, e.g. (100) ()
        if len(number) == 1:
            print(number[0])
        elif len(number) == 2:
            print(number[0], number[1])
        print('')
        i = i + 1
        tag.click()
        time.sleep(2)
        # The click opened a second window; collect all handles and switch
        # to every handle that is not the main window
        all_handles = driver.window_handles
        for handle in all_handles:
            if handle != now_handle:
                # Show which handle we are about to select
                print(handle)
                driver.switch_to.window(handle)
                time.sleep(1)
                print(u'弹出界面信息')
                print(driver.current_url)
                print(driver.title)
                # Author / summary block on the detail page
                elem_sub = driver.find_element_by_xpath("//div[@class='summary pad10']")
                print(u"作者", elem_sub.text)
                print('')
                # Close the popup window
                driver.close()
        # Print the main handle and return to it before the next result
        print(now_handle)
        driver.switch_to.window(now_handle)
相关文章推荐
- python基于BeautifulSoup实现抓取网页指定内容的方法
- 内存数据库内核开发 工作日志(内存索引实现原理)(附红黑树实现清晰完整直接可编译运行代码)(十)
- AHK 程序/脚本仅能在程序内部启动而无法直接双击运行的实现
- python介绍——优缺点/实现/运行/导入模块
- python的BeautifulSoup实现抓取网页数据
- python基于BeautifulSoup实现抓取网页指定内容的方法
- 关于利用Python无法直接抓取全部网易云音乐评论时怎么解决
- 动手实现Android源码(AOSP)的下载、编译、运行、导入、调试
- 编译pyc文件——python无法运行其它版本编译的pyc文件的情况
- Python实现抓取百度搜索结果页的网站标题信息
- 有关利用python获取网页, 以及KDD近几年论文标题与摘要链接
- [置顶] 【Android】AndroidStudio导入别人的项目报错。编译通过,无法运行解决方案
- Python实现抓取百度搜索结果页的网站标题信息
- sublime text3刚从官网下载安装后,Ctrl+B无法直接运行Python文件问题--解决方法
- py单词翻译小工具(python抓取html实现)
- .NET 4.0 无法直接运行 .NET 2.0 程序的问题
- Python 跟 ArcGIS断了联系……Model的py脚本无法运行
- c#直接调用ssis包实现Sql Server的数据导入功能
- 一个关于VS2008在其他未安装VS2008的电脑上无法运行编译的程序的问题
- asp.net 2.0实现EXCEL直接导入数据库