
[Implementation] [Scraping CNKI paper titles and abstracts] Finally got Sublime to build and run Python directly, but bs4 | BeautifulSoup won't import

2018-03-10 21:55
Download the packaged example from this article: https://download.csdn.net/download/qq_19741181/10279675

Scraping CNKI paper abstracts with Python

-----------------------------------------------------------
Main reference for this article: http://blog.csdn.net/Eastmount/article/details/78534119?locationNum=6&fps=1
-----------------------------------------------------------
(C:\ProgramData\Anaconda3) C:\Users\d>pip install --upgrade beautifulsoup4
Requirement already up-to-date: beautifulsoup4 in c:\programdata\anaconda3\lib\site-packages

(C:\ProgramData\Anaconda3) C:\Users\d>
---- Tried the upgrade suggested online. Reference: http://blog.csdn.net/sinat_26599509/article/details/50609646 ------------------
>>> import time
>>> import re
>>> import urllib
>>> import bs4
>>> from bs4 import BeautifulSoup

>>> if __name__ == '__main__':
...   url = "http://search.cnki.net/Search.aspx?q=python&rank=relevant&cluster=all&val=&p=0"
...   content = urllib.urlopen(url).read()
...   soup = BeautifulSoup(content,"html.parser")
...   wz_tab = soup.find_all("div",class_="wz_tab")
...   num = 0
...   for tab in wz_tab:
...     title = tab.find("h3")
...     print(title.get_text())
...     urls = tab.find("h3").find_all("a")
...     flag = 0
...     for u in urls:
...       if flag==0:
...         print(u.get('href'))
...         flag += 1
...     abstract = tab.find(attrs={"class":"width715"}).get_text()
...     print(abstract)
...     other = tab.find(attrs={"class":"year-count"})
...     content = other.get_text().split("\n")
...     cb_from = other.find_all("span")
...     flag = 0
...     for u in cb_from:
...       if flag==0:
...         print(u.get("title"))
...         flag += 1
...     mode = re.compile(r'\d+\.?\d*')
...     number = mode.findall(content[0])
...     print(number[0])
...     mode = re.compile(r'\d+\.?\d*')
...     number = mode.findall(content[1])
...     if len(number)==1:
...       print(number[0])
...     elif len(number)==2:
...       print(number[0], number[1])
...     num = num + 1
...
Traceback (most recent call last):
  File "<stdin>", line 3, in <module>
AttributeError: module 'urllib' has no attribute 'urlopen'

>>>
------------------- Something went wrong = = ----------------------------------
It turns out that in Python 3, urlopen lives in urllib.request, which had not been imported.
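In Python 2 the same function was urllib.urlopen, which is why older sample code fails under Anaconda3. For a script that might run under either interpreter, a small compatibility shim covers both cases; this is just a sketch, and the session below simply imports urllib.request directly:

try:
    from urllib.request import urlopen  # Python 3
except ImportError:
    from urllib import urlopen          # Python 2 fallback

html = urlopen("http://search.cnki.net/Search.aspx?q=python").read()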
>>> import urllib.request

>>> if __name__ == '__main__':
...   url = "http://search.cnki.net/Search.aspx?q=python&rank=relevant&cluster=all&val=&p=0"
...   content = urllib.request.urlopen(url).read()
...   soup = BeautifulSoup(content,"html.parser")
...   wz_tab = soup.find_all("div",class_="wz_tab")
...   num = 0
...   for tab in wz_tab:
...     title = tab.find("h3")
...     print(title.get_text())
...     urls = tab.find("h3").find_all("a")
...     flag = 0
...     for u in urls:
...       if flag==0:
...         print(u.get('href'))
...         flag += 1
...     abstract = tab.find(attrs={"class":"width715"}).get_text()
...     print(abstract)
...     other = tab.find(attrs={"class":"year-count"})
...     content = other.get_text().split("\n")
...     cb_from = other.find_all("span")
...     flag = 0
...     for u in cb_from:
...       if flag==0:
...         print(u.get("title"))
...         flag += 1
...     mode = re.compile(r'\d+\.?\d*')
...     number = mode.findall(content[0])
...     print(number[0])
...     mode = re.compile(r'\d+\.?\d*')
...     number = mode.findall(content[1])
...     if len(number)==1:
...       print(number[0])
...     elif len(number)==2:
...       print(number[0], number[1])
...     num = num + 1
...

Design of Full-System Simulator Configuration and Simulation Control Mechanisms  http://epub.cnki.net/grid2008/brief/detailj.aspx?filename=1017738945.nh&dbname=CMFD201801
A simulator allows software to be developed while the hardware is still being designed, enabling hardware/software co-design, better meeting system performance requirements, shortening the development cycle, and speeding time to market. But system-level languages such as SystemC/C++ lack flexibility, so wiring different components together is tedious and error-prone. Python, being concise, flexible, extensible, and interpreted, can be applied to configuring the full-system simulator and con...

---------------- Next problem: how do I export to txt? -----------------------------
Run it as: python test.py > ./test.txt. Reference: https://zhidao.baidu.com/question/412287743.html --------------- But bs4 still won't import. What now? -------------------------
Reference: http://blog.csdn.net/u010358168/article/details/62040603 | upgrading bs4
(C:\ProgramData\Anaconda3) C:\Users\d>pip3 install -upgrade beautifulsoup4

Usage:
  pip install [options] <requirement specifier> [package-index-options] ...
  pip install [options] -r <requirements file> [package-index-options] ...
  pip install [options] [-e] <vcs project url> ...
  pip install [options] [-e] <local project path> ...
  pip install [options] <archive url/path> ...

no such option: -u

(C:\ProgramData\Anaconda3) C:\Users\d>
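pip parsed -upgrade as the unknown short option -u; the flag needs two dashes:

pip3 install --upgrade beautifulsoup4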
-------------------------

So I went and ran it in Spyder instead --------------------------------------------------

-----------------------------------------------------------
>>> f = open('E:/8484.txt','a+',encoding = 'utf-8')
>>> print(title.get_text(),abstract,file = f)
>>> print(abstract,file = f)
>>> f.close()
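The same write can be folded into the scraping loop itself, so every record lands in the file as it is parsed, instead of redirecting stdout in the shell. A minimal sketch, assuming the soup and wz_tab variables from the session further above (note that get_text() must be called with parentheses, otherwise the bound method object is written instead of the text):

with open('E:/8484.txt', 'a+', encoding='utf-8') as f:
    for tab in wz_tab:
        title = tab.find("h3").get_text()                           # the () call returns the text
        abstract = tab.find(attrs={"class": "width715"}).get_text()
        print(title, abstract, file=f)                              # one record per line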


-----------------------------------------------------------
# -*- coding: utf-8 -*-
import time
import re
import urllib.request
from bs4 import BeautifulSoup

# Main
if __name__ == '__main__':

    url = "http://search.cnki.net/Search.aspx?q=python&rank=relevant&cluster=all&val=&p=0"
    content = urllib.request.urlopen(url).read()
    soup = BeautifulSoup(content, "html.parser")

    # Locate the paper entries
    wz_tab = soup.find_all("div", class_="wz_tab")
    num = 0
    for tab in wz_tab:
        # Title
        title = tab.find("h3")
        print(title.get_text())
        urls = tab.find("h3").find_all("a")
        # Detail-page hyperlink
        flag = 0
        for u in urls:
            if flag == 0:  # only take the first URL
                print(u.get('href'))
                flag += 1
        # Abstract
        abstract = tab.find(attrs={"class": "width715"}).get_text()
        print(abstract)
        # Other information
        other = tab.find(attrs={"class": "year-count"})
        content = other.get_text().split("\n")
        """
        The journal and issue cannot be split on the double space,
        e.g. 《怀化学院学报》  2017年 第09期, so the journal name is taken
        from the span's title attribute instead:
        <span title="北方文学(下旬)">《北方文学(下旬)》  2017年 第06期</span>
        """
        # Journal + year
        cb_from = other.find_all("span")
        flag = 0
        for u in cb_from:
            if flag == 0:  # take the title attribute
                print(u.get("title"))
                flag += 1
        mode = re.compile(r'\d+\.?\d*')
        number = mode.findall(content[0])
        print(number[0])  # year

        # Download count, citation count
        mode = re.compile(r'\d+\.?\d*')
        number = mode.findall(content[1])
        if len(number) == 1:
            print(number[0])
        elif len(number) == 2:
            print(number[0], number[1])

        num = num + 1
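One caveat with plain urllib: it sends a default User-Agent such as Python-urllib/3.x, which some sites reject or throttle. Attaching a browser-like header via urllib.request.Request is a common precaution; a sketch, under the assumption that CNKI filters on the User-Agent (the header string itself is just an example):

import urllib.request

req = urllib.request.Request(
    url,
    headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}  # example UA string
)
content = urllib.request.urlopen(req).read()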
---------------------------------------------------------------------
# -*- coding: utf-8 -*-
import time
import re
import sys
import codecs
import urllib
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

# Main
if __name__ == '__main__':

    url = "http://search.cnki.net/Search.aspx?q=python&rank=relevant&cluster=all&val=&p=0"
    driver = webdriver.Firefox()
    driver.get(url)
    # Titles
    content = driver.find_elements_by_xpath("//div[@class='wz_content']/h3")
    # Abstracts
    abstracts = driver.find_elements_by_xpath("//div[@class='width715']")
    # Journal + year
    other = driver.find_elements_by_xpath("//span[@class='year-count']/span[1]")
    mode = re.compile(r'\d+\.?\d*')
    # Download count, citation count
    num = driver.find_elements_by_xpath("//span[@class='count']")

    # Extract the content
    i = 0
    for tag in content:
        print(tag.text)
        print(abstracts[i].text)
        print(other[i].get_attribute("title"))
        number = mode.findall(other[i].text)
        print(number[0])  # year
        number = mode.findall(num[i].text)
        if len(number) == 1:  # a count may be missing, e.g. (100) ()
            print(number[0])
        elif len(number) == 2:
            print(number[0], number[1])
        print('')

        i = i + 1
        tag.click()
        time.sleep(1)
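The p=0 at the end of the search URL looks like a page parameter, so further result pages can presumably be fetched by varying it. A hypothetical sketch reusing the driver from the script above (the assumption that p counts pages from 0, and the range of 3 pages, are illustrative, not confirmed by the source):

base = "http://search.cnki.net/Search.aspx?q=python&rank=relevant&cluster=all&val=&p={}"
for p in range(3):  # assumption: p selects the result page
    driver.get(base.format(p))
    for h3 in driver.find_elements_by_xpath("//div[@class='wz_content']/h3"):
        print(h3.text)
    time.sleep(1)  # be polite between page loads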
--------------------------------------------------------
# -*- coding: utf-8 -*-
import time
import re
import sys
import codecs
import urllib
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

# Main
if __name__ == '__main__':

    url = "http://search.cnki.net/Search.aspx?q=python&rank=relevant&cluster=all&val=&p=0"
    driver = webdriver.Firefox()
    driver.get(url)
    # Titles
    content = driver.find_elements_by_xpath("//div[@class='wz_content']/h3")
    # Abstracts
    abstracts = driver.find_elements_by_xpath("//div[@class='width715']")
    # Journal + year
    other = driver.find_elements_by_xpath("//span[@class='year-count']/span[1]")
    mode = re.compile(r'\d+\.?\d*')
    # Download count, citation count
    num = driver.find_elements_by_xpath("//span[@class='count']")

    # Remember the handle of the current (main) window
    now_handle = driver.current_window_handle

    # Extract the content
    i = 0
    for tag in content:
        print(tag.text)
        print(abstracts[i].text)
        print(other[i].get_attribute("title"))
        number = mode.findall(other[i].text)
        print(number[0])  # year
        number = mode.findall(num[i].text)
        if len(number) == 1:  # a count may be missing, e.g. (100) ()
            print(number[0])
        elif len(number) == 2:
            print(number[0], number[1])
        print('')

        i = i + 1
        tag.click()
        time.sleep(2)

        # After the click, collect all window handles
        all_handles = driver.window_handles

        # Two windows are now open; switch to the one that is not the main window
        for handle in all_handles:
            if handle != now_handle:
                # Print the handle being switched to
                print(handle)
                driver.switch_to_window(handle)
                time.sleep(1)

                print('Popup window info')
                print(driver.current_url)
                print(driver.title)

                # Grab the summary block from the detail page
                elem_sub = driver.find_element_by_xpath("//div[@class='summary pad10']")
                print("Author:", elem_sub.text)
                print('')

                # Close the current (detail) window
                driver.close()

        # Print the main window handle
        print(now_handle)
        driver.switch_to_window(now_handle)  # return to the main window for the next click
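The fixed time.sleep() calls are fragile: if the popup opens slowly, the handle list is read too early. Selenium's explicit waits can block until the second window actually exists. A sketch using WebDriverWait and expected_conditions (available in Selenium 3+; assumes the same driver, tag, and now_handle as above):

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

tag.click()
WebDriverWait(driver, 10).until(EC.number_of_windows_to_be(2))  # wait for the popup to open
new_handle = [h for h in driver.window_handles if h != now_handle][0]
driver.switch_to_window(new_handle)  # same call style as the script above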