Python3.5 ‘wb’与‘w’区别以及写入excel的常见错误
2018-03-16 08:54
453 查看
Python3.5 ‘wb’与‘w’区别以及写入excel的常见错误
望共同进步
转载请注明地址:http://blog.csdn.net/weixin_39701039/article/details/79576549
"r" 以读方式打开,只能读文件,如果文件不存在,会发生异常
"w" 以写方式打开,只能写文件, 如果文件不存在,创建该文件;如果文件已存在,先清空,再打开文件
"rb" 以二进制读方式打开,只能读文件 , 如果文件不存在,会发生异常
"wb" 以二进制写方式打开,只能写文件, 如果文件不存在,创建该文件;如果文件已存在,先清空,再打开文件这里结合前面写的 Python3.5 爬虫之由浅入深(三、html转excel)来看看'w'和'wb'的区别,已经延伸的说说爬取文件成伪excel时遇到的问题;
# python3.5.1
# NOTE(review): indentation was lost when this snippet was extracted from the
# page; restored here so the script actually parses.
import re
import requests
import time
import os
from bs4 import BeautifulSoup

path = r'G:\任务20180312'
url = 'http://tjj.suqian.gov.cn/stjj/ndsj/201609/d9bbdb1109cf497e80e59c56ce216ce0.shtml'
url_prefix = 'http://tjj.suqian.gov.cn/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
}

def get_Soup(url):
    """Fetch *url* and return the response body parsed as a BeautifulSoup tree."""
    response = requests.get(url, headers=headers, timeout=120)
    response.encoding = 'utf-8'  # force UTF-8 decoding of the page
    res = response.text
    soup = BeautifulSoup(res, 'html.parser')
    return soup

soup = get_Soup(url)
# print(soup)
# table is a <class 'bs4.element.Tag'>
table = soup('table', style="WIDTH: 542pt; BORDER-COLLAPSE: collapse")[0]
print(type(table))
result1 = str(table)  # serialize the <table> tag to a string
# print(result1)
tbody = soup('tbody')[0]  # tbody is a <class 'bs4.element.Tag'>
result2 = str(tbody)  # serialize the <tbody> tag to a string
# NOTE(review): 'w' without encoding= uses the platform default codec (GBK on
# Chinese Windows); this open is the article's intentional demonstration of
# the resulting UnicodeEncodeError. Kept as-is on purpose.
with open(r'G:\任务20180312\test\html_excel/test1.xls', 'w') as f1:
    f1.write(result1)
#结果:
因为如果在Windows下运行,对于Unicode字符,需要print出来的话,由于本地系统是Windows中的cmd,默认codepage是CP936,即GBK的编码,所以python解释器需要先将上述的Unicode字符编码为GBK,然后再在cmd中显示出来。但是由于该Unicode字符串中包含一些GBK中无法显示的字符,导致此时提示“'gbk' codec can't encode”的错误。这个时候我们可以在with open(..)括号里加入编码方式'utf-8',如下代码:
# coding:utf-8
# python3.5.1
# NOTE(review): indentation was lost in extraction and a stray artifact line
# ("eca5") had been injected into the code; both fixed so the script parses.
import re
import requests
import time
import os
from bs4 import BeautifulSoup

path = r'G:\任务20180312'
url = 'http://tjj.suqian.gov.cn/stjj/ndsj/201609/d9bbdb1109cf497e80e59c56ce216ce0.shtml'
url_prefix = 'http://tjj.suqian.gov.cn/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
}

def get_Soup(url):
    """Fetch *url* and return the response body parsed as a BeautifulSoup tree."""
    response = requests.get(url, headers=headers, timeout=120)
    response.encoding = 'utf-8'  # force UTF-8 decoding of the page
    res = response.text
    soup = BeautifulSoup(res, 'html.parser')
    return soup

soup = get_Soup(url)
# print(soup)
# table is a <class 'bs4.element.Tag'>
table = soup('table', style="WIDTH: 542pt; BORDER-COLLAPSE: collapse")[0]
print(type(table))
result1 = str(table)  # serialize the <table> tag to a string
# print(result1)
tbody = soup('tbody')[0]  # tbody is a <class 'bs4.element.Tag'>
result2 = str(tbody)  # serialize the <tbody> tag to a string
# Explicit encoding='utf-8' avoids the platform-default (GBK) codec error
# demonstrated earlier in the article.
with open(r'G:\任务20180312\test\html_excel/test1.xls', 'w', encoding='utf-8') as f1:
    f1.write(result1)
#结果:
# python3.5.1
# NOTE(review): indentation was lost when this snippet was extracted from the
# page; restored here so the script actually parses.
import re
import requests
import time
import os
from bs4 import BeautifulSoup

path = r'G:\任务20180312'
url = 'http://tjj.suqian.gov.cn/stjj/ndsj/201609/d9bbdb1109cf497e80e59c56ce216ce0.shtml'
url_prefix = 'http://tjj.suqian.gov.cn/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
}

def get_Soup(url):
    """Fetch *url* and return the response body parsed as a BeautifulSoup tree."""
    response = requests.get(url, headers=headers, timeout=120)
    response.encoding = 'utf-8'  # force UTF-8 decoding of the page
    res = response.text
    soup = BeautifulSoup(res, 'html.parser')
    return soup

soup = get_Soup(url)
# print(soup)
# table is a <class 'bs4.element.Tag'>
table = soup('table', style="WIDTH: 542pt; BORDER-COLLAPSE: collapse")[0]
print(type(table))
result1 = str(table)  # serialize the <table> tag to a string
# print(result1)
tbody = soup('tbody')[0]  # tbody is a <class 'bs4.element.Tag'>
result2 = str(tbody)  # serialize the <tbody> tag to a string
# NOTE(review): binary mode plus encoding= is invalid and raises
# "ValueError: binary mode doesn't take an encoding argument"; this open is
# the article's intentional demonstration of that error. Kept as-is on purpose.
with open(r'G:\任务20180312\test\html_excel/test1.xls', 'wb', encoding='utf-8') as f1:
    f1.write(result1)
因为'wb'是以二进制写入文件,而result1是字符串(str),所以报错,写入文件为0kb,即没有结果
这里可以把 result1 转变为字节串 bytes(result1)
如下:
注意,这里with open(..)括号里没有encoding=部分了,因为二进制模式不能再进行编码了,不然会报错ValueError: binary mode doesn't take an encoding argument
bytes(result1,encoding='utf-8')这里是因为转字符串为二进制需要编码方式
# python3.5.1
# NOTE(review): indentation was lost when this snippet was extracted from the
# page; restored here so the script actually parses.
import re
import requests
import time
import os
from bs4 import BeautifulSoup

path = r'G:\任务20180312'
url = 'http://tjj.suqian.gov.cn/stjj/ndsj/201609/d9bbdb1109cf497e80e59c56ce216ce0.shtml'
url_prefix = 'http://tjj.suqian.gov.cn/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
}

def get_Soup(url):
    """Fetch *url* and return the response body parsed as a BeautifulSoup tree."""
    response = requests.get(url, headers=headers, timeout=120)
    response.encoding = 'utf-8'  # force UTF-8 decoding of the page
    res = response.text
    soup = BeautifulSoup(res, 'html.parser')
    return soup

soup = get_Soup(url)
# print(soup)
# table is a <class 'bs4.element.Tag'>
table = soup('table', style="WIDTH: 542pt; BORDER-COLLAPSE: collapse")[0]
print(type(table))
result1 = str(table)  # serialize the <table> tag to a string
# print(result1)
tbody = soup('tbody')[0]  # tbody is a <class 'bs4.element.Tag'>
result2 = str(tbody)  # serialize the <tbody> tag to a string
# Write both variants so the article can compare the full <table> markup
# (result1) against the bare <tbody> markup (result2).
with open(r'G:\任务20180312\test\html_excel/test1.xls', 'w', encoding='utf-8') as f1:
    f1.write(result1)
with open(r'G:\任务20180312\test\html_excel/test2.xls', 'w', encoding='utf-8') as f2:
    f2.write(result2)
#结果:
那现在我们发现区别在于result1和result2
右键网页打开源代码,来查看区别:
区别在于result1比result2少了些代码(因为我们存入的文件形式为伪excel,所以这个是有关系的),现在有html工具(这里我用的editplus)分别将这两部分代码以浏览器形式打开:
PS:所以我们要把带样式的代码也抓取下来,建议可以看看html5和css,了解一下望有所帮助,望采纳!!
望共同进步
转载请注明地址:http://blog.csdn.net/weixin_39701039/article/details/79576549
"r" 以读方式打开,只能读文件,如果文件不存在,会发生异常
"w" 以写方式打开,只能写文件, 如果文件不存在,创建该文件;如果文件已存在,先清空,再打开文件
"rb" 以二进制读方式打开,只能读文件 , 如果文件不存在,会发生异常
"wb" 以二进制写方式打开,只能写文件, 如果文件不存在,创建该文件;如果文件已存在,先清空,再打开文件这里结合前面写的 Python3.5 爬虫之由浅入深(三、html转excel)来看看'w'和'wb'的区别,已经延伸的说说爬取文件成伪excel时遇到的问题;
一:UnicodeEncodeError: 'gbk' codec can't encode character '\xa0' in position 13785: illegal multibyte sequence
# coding:utf-8
# python3.5.1
# NOTE(review): indentation was lost when this snippet was extracted from the
# page; restored here so the script actually parses.
import re
import requests
import time
import os
from bs4 import BeautifulSoup

path = r'G:\任务20180312'
url = 'http://tjj.suqian.gov.cn/stjj/ndsj/201609/d9bbdb1109cf497e80e59c56ce216ce0.shtml'
url_prefix = 'http://tjj.suqian.gov.cn/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
}

def get_Soup(url):
    """Fetch *url* and return the response body parsed as a BeautifulSoup tree."""
    response = requests.get(url, headers=headers, timeout=120)
    response.encoding = 'utf-8'  # force UTF-8 decoding of the page
    res = response.text
    soup = BeautifulSoup(res, 'html.parser')
    return soup

soup = get_Soup(url)
# print(soup)
# table is a <class 'bs4.element.Tag'>
table = soup('table', style="WIDTH: 542pt; BORDER-COLLAPSE: collapse")[0]
print(type(table))
result1 = str(table)  # serialize the <table> tag to a string
# print(result1)
tbody = soup('tbody')[0]  # tbody is a <class 'bs4.element.Tag'>
result2 = str(tbody)  # serialize the <tbody> tag to a string
# NOTE(review): 'w' without encoding= uses the platform default codec (GBK on
# Chinese Windows); this open is the article's intentional demonstration of
# the resulting UnicodeEncodeError. Kept as-is on purpose.
with open(r'G:\任务20180312\test\html_excel/test1.xls', 'w') as f1:
    f1.write(result1)
#结果:
因为如果在Windows下运行,对于Unicode字符,需要print出来的话,由于本地系统是Windows中的cmd,默认codepage是CP936,即GBK的编码,所以python解释器需要先将上述的Unicode字符编码为GBK,然后再在cmd中显示出来。但是由于该Unicode字符串中包含一些GBK中无法显示的字符,导致此时提示“'gbk' codec can't encode”的错误。这个时候我们可以在with open(..)括号里加入编码方式'utf-8',如下代码:
# coding:utf-8
# python3.5.1
# NOTE(review): indentation was lost in extraction and a stray artifact line
# ("eca5") had been injected into the code; both fixed so the script parses.
import re
import requests
import time
import os
from bs4 import BeautifulSoup

path = r'G:\任务20180312'
url = 'http://tjj.suqian.gov.cn/stjj/ndsj/201609/d9bbdb1109cf497e80e59c56ce216ce0.shtml'
url_prefix = 'http://tjj.suqian.gov.cn/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
}

def get_Soup(url):
    """Fetch *url* and return the response body parsed as a BeautifulSoup tree."""
    response = requests.get(url, headers=headers, timeout=120)
    response.encoding = 'utf-8'  # force UTF-8 decoding of the page
    res = response.text
    soup = BeautifulSoup(res, 'html.parser')
    return soup

soup = get_Soup(url)
# print(soup)
# table is a <class 'bs4.element.Tag'>
table = soup('table', style="WIDTH: 542pt; BORDER-COLLAPSE: collapse")[0]
print(type(table))
result1 = str(table)  # serialize the <table> tag to a string
# print(result1)
tbody = soup('tbody')[0]  # tbody is a <class 'bs4.element.Tag'>
result2 = str(tbody)  # serialize the <tbody> tag to a string
# Explicit encoding='utf-8' avoids the platform-default (GBK) codec error
# demonstrated earlier in the article.
with open(r'G:\任务20180312\test\html_excel/test1.xls', 'w', encoding='utf-8') as f1:
    f1.write(result1)
#结果:
二:ValueError: binary mode doesn't take an encoding argument
# Code as follows:
# coding:utf-8
# python3.5.1
# NOTE(review): indentation was lost when this snippet was extracted from the
# page; restored here so the script actually parses.
import re
import requests
import time
import os
from bs4 import BeautifulSoup

path = r'G:\任务20180312'
url = 'http://tjj.suqian.gov.cn/stjj/ndsj/201609/d9bbdb1109cf497e80e59c56ce216ce0.shtml'
url_prefix = 'http://tjj.suqian.gov.cn/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
}

def get_Soup(url):
    """Fetch *url* and return the response body parsed as a BeautifulSoup tree."""
    response = requests.get(url, headers=headers, timeout=120)
    response.encoding = 'utf-8'  # force UTF-8 decoding of the page
    res = response.text
    soup = BeautifulSoup(res, 'html.parser')
    return soup

soup = get_Soup(url)
# print(soup)
# table is a <class 'bs4.element.Tag'>
table = soup('table', style="WIDTH: 542pt; BORDER-COLLAPSE: collapse")[0]
print(type(table))
result1 = str(table)  # serialize the <table> tag to a string
# print(result1)
tbody = soup('tbody')[0]  # tbody is a <class 'bs4.element.Tag'>
result2 = str(tbody)  # serialize the <tbody> tag to a string
# NOTE(review): binary mode plus encoding= is invalid and raises
# "ValueError: binary mode doesn't take an encoding argument"; this open is
# the article's intentional demonstration of that error. Kept as-is on purpose.
with open(r'G:\任务20180312\test\html_excel/test1.xls', 'wb', encoding='utf-8') as f1:
    f1.write(result1)
因为'wb'是以二进制写入文件,而result1是字符串(str),所以报错,写入文件为0kb,即没有结果
这里可以把 result1 转变为字节串 bytes(result1)
如下:
#coding:utf-8 #python3.5.1 import re import requests import time import os from bs4 import BeautifulSoup path = r'G:\任务20180312' url = 'http://tjj.suqian.gov.cn/stjj/ndsj/201609/d9bbdb1109cf497e80e59c56ce216ce0.shtml' url_prefix = 'http://tjj.suqian.gov.cn/' headers = { 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36' } def get_Soup(url): response = requests.get(url,headers = headers,timeout = 120) response.encoding = 'utf-8' res = response.text soup = BeautifulSoup(res,'html.parser') return soup soup = get_Soup(url) #print(soup) table = soup('table',style="WIDTH: 542pt; BORDER-COLLAPSE: collapse")[0] #table类型为<class 'bs4.element.Tag'> print(type(table)) #转变为字符串 result1 = str(table) #print(result1) tbody = soup('tbody')[0] #tbody类型为<class 'bs4.element.Tag'> result2 = str(tbody) #转变为字符串 with open(r'G:\任务20180312\test\html_excel/test1.xls','wb') as f1: f1.write(bytes(result1,encoding='utf-8'))#结果:
注意,这里with open(..)括号里没有encoding=部分了,因为二进制模式不能再进行编码了,不然会报错ValueError: binary mode doesn't take an encoding argument
bytes(result1,encoding='utf-8')这里是因为转字符串为二进制需要编码方式
三:得到的文件不是我们想要的表格形式,而是一堆字符串
# coding:utf-8
# python3.5.1
# NOTE(review): indentation was lost when this snippet was extracted from the
# page; restored here so the script actually parses.
import re
import requests
import time
import os
from bs4 import BeautifulSoup

path = r'G:\任务20180312'
url = 'http://tjj.suqian.gov.cn/stjj/ndsj/201609/d9bbdb1109cf497e80e59c56ce216ce0.shtml'
url_prefix = 'http://tjj.suqian.gov.cn/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
}

def get_Soup(url):
    """Fetch *url* and return the response body parsed as a BeautifulSoup tree."""
    response = requests.get(url, headers=headers, timeout=120)
    response.encoding = 'utf-8'  # force UTF-8 decoding of the page
    res = response.text
    soup = BeautifulSoup(res, 'html.parser')
    return soup

soup = get_Soup(url)
# print(soup)
# table is a <class 'bs4.element.Tag'>
table = soup('table', style="WIDTH: 542pt; BORDER-COLLAPSE: collapse")[0]
print(type(table))
result1 = str(table)  # serialize the <table> tag to a string
# print(result1)
tbody = soup('tbody')[0]  # tbody is a <class 'bs4.element.Tag'>
result2 = str(tbody)  # serialize the <tbody> tag to a string
# Write both variants so the article can compare the full <table> markup
# (result1) against the bare <tbody> markup (result2).
with open(r'G:\任务20180312\test\html_excel/test1.xls', 'w', encoding='utf-8') as f1:
    f1.write(result1)
with open(r'G:\任务20180312\test\html_excel/test2.xls', 'w', encoding='utf-8') as f2:
    f2.write(result2)
#结果:
那现在我们发现区别在于result1和result2
右键网页打开源代码,来查看区别:
区别在于result1比result2少了些代码(因为我们存入的文件形式为伪excel,所以这个是有关系的),现在有html工具(这里我用的editplus)分别将这两部分代码以浏览器形式打开:
PS:所以我们要把带样式的代码也抓取下来,建议可以看看html5和css,了解一下望有所帮助,望采纳!!
相关文章推荐
- 【转】支付宝API:return_url 和 notify_url 的区别及其操作以及常见错误提示
- 支付宝API:return_url 和 notify_url 的区别及其操作以及常见错误提示
- python 处理Excel 常见问题- 写入Excel
- 支付宝API:return_url 和 notify_url 的区别及其操作以及常见错误提示
- Python2.7 以及 Python 3.5的实例方法,类方法,静态方法之间的区别及调用关系
- 支付宝API:return_url 和 notify_url 的区别及其操作以及常见错误提示
- selenium+python关于登录的脚本代码,使用了读取excel以及向excel中写入测试结果的方法
- python 常见错误类型以及继承关系
- Windows上Python3.5安装Scrapy(lxml) 以及与twisted有关错误的解决
- python 在excel文件中写入date日期数据,以及读取excel日期数据,如何在python中正确显示date日期。
- selenium+python关于登录的脚本代码,使用了读取excel以及向excel中写入测试结果的方法
- Python中列表和字典的区别以及适用的场景
- 【牛刀小试】——Repeater控件常见错误及几种相似控件的区别
- python-多语言功能-读excel文件并写入json,解决json输出unicode
- Windows7 Python-3.6 安装PyCrypto(pycrypto 2.6.1)出现错误以及解决方法
- [学习笔记]Python_常见错误
- c语言中两种常见代码编写错误,“写入位置 0x00000000 时发生访问冲突”和“语法错误 : 缺少“;”(在“{”的前面)”
- Python中,添加写入数据到已经存在的Excel的xls文件,即打开excel文件,写入新数据
- extmail 服务器搭建以及常见安装错误
- C# 操作Excel常见错误