您的位置:首页 > 编程语言 > Python开发

python使用codecs模块进行文件操作-读写中英文字符

2017-07-19 09:45 811 查看

摘自:

python使用codecs模块进行文件操作

以下是将源文件转换为带 BOM 的 UTF-8 编码的 Python 脚本:

import sys
import os
import codecs

"""
Usage: ConvertCp.py SrcDir DstDir

e.g.
If your source folder is "D:/Test/NonUnicode" and your destination folder is "D:/Test/utf8", run the following command:

python  ConvertCp.py "D:/Test/NonUnicode" "D:/Test/utf8"

"""

# Candidate source encodings. ConvertFileEncoding tries them in this order and
# uses the first one that decodes the whole file without error, so the order
# is significant. Each code page is listed exactly once: retrying an encoding
# that already failed can never succeed.
codePageList = (
    "utf-8",
    "cp1251",
    "cp855",
    "cp1252",
    "cp1250",
    "cp1254",
    "cp936",
    "cp950",
    "cp932",
    "cp949",
    "cp874",
    "cp1253",
)

# File extensions that are eligible for conversion. ConvertFileEncoding
# lower-cases a file's extension before looking it up here, so every entry
# is written in lower case and includes the leading dot.
fileExtFilter = (
    ".cpp", ".c", ".cxx",
    ".h", ".hpp", ".hxx",
    ".cc", ".inl",
)

def FileIsBomUtf8Encoding(filePath):
    """
    Report whether the file starts with the UTF-8 byte order mark (BOM).

    @param filePath     path of the file to inspect.
    @return             True if the file begins with the UTF-8 BOM bytes;
                        False otherwise, including when the file cannot be
                        opened or read (a message is printed in that case).
    """
    try:
        # Binary mode so the raw bytes are inspected. Only the first
        # len(BOM) bytes are needed, so the rest of the file is not read.
        with open(filePath, "rb") as f:
            head = f.read(len(codecs.BOM_UTF8))
    except IOError:
        print ("open file %s failed." % (os.path.basename(filePath)))
        return False

    # codecs.BOM_UTF8 is a byte string, so it compares correctly with data
    # read in "rb" mode; a text literal would never match on Python 3.
    return head == codecs.BOM_UTF8

#----------------------------------------------------------------------
def ConvertFileEncoding(sourceFilePath, targetFilePath, targetEncoding = "utf_8"):
    """
    Convert one text file from a legacy ("ANSI") code page into 'targetEncoding'.

    The source encoding is guessed by trying every entry of codePageList in
    order; the first one that decodes the whole file without error is used.
    Files whose extension is not in fileExtFilter, and files that already
    start with a UTF-8 BOM, are skipped.

    @param sourceFilePath       source file path.
    @param targetFilePath       target file path; a missing parent directory is created.
    @param targetEncoding       target file encoding. When the name starts with
                                "utf" and ends with "8", a BOM is written first.
    @return                     True if the target file was written; False if the
                                file was skipped or could not be read, decoded,
                                or written.
    """

    # Filter by file extension (case-insensitive).
    filePathExt = os.path.splitext(sourceFilePath)[1]
    if filePathExt.lower() not in fileExtFilter:
        return False

    # A file that already carries a UTF-8 BOM needs no conversion.
    if FileIsBomUtf8Encoding(sourceFilePath):
        return False

    # Read the source content with the first code page that decodes it.
    # The 'with' block closes the handle on every path, including failures.
    content = None
    for cp in codePageList:
        try:
            with codecs.open(sourceFilePath, mode = "r", encoding = cp) as sourceFile:
                content = sourceFile.read()
        except UnicodeDecodeError:
            # Wrong guess: try the next code page.
            continue
        except IOError:
            print ("open file %s failed." % (os.path.basename(sourceFilePath)))
            return False
        break

    if content is None:
        print ("File \"%s\" is not valid encoding." % (sourceFilePath))
        return False

    # Drop a BOM that survived decoding so the output never carries two:
    # either the real U+FEFF character, or the three BOM bytes decoded one
    # character per byte by a single-byte code page.
    bomText = codecs.BOM_UTF8.decode("utf_8")
    misdecodedBom = codecs.BOM_UTF8.decode("latin_1")
    if content.startswith(bomText):
        content = content[len(bomText):]
    elif content.startswith(misdecodedBom):
        content = content[len(misdecodedBom):]

    # Ensure the target directory exists. dirname() is empty for a bare file
    # name, and os.makedirs("") would raise, so that case is skipped.
    targetPathDir = os.path.dirname(targetFilePath)
    if targetPathDir and not os.path.exists(targetPathDir):
        os.makedirs(targetPathDir)

    loweredEncoding = targetEncoding.lower()
    writeBom = loweredEncoding.startswith("utf") and loweredEncoding.endswith("8")

    # Write the converted content.
    try:
        with codecs.open(targetFilePath, mode = "w", encoding = targetEncoding) as targetFile:
            if writeBom:
                targetFile.write(bomText)
            targetFile.write(content)
    except UnicodeError:
        # The content cannot be represented in targetEncoding: skip this file
        # and remove the partial output. The 'with' block has already closed
        # the file, so the removal is safe.
        print ("convert file: \"%s\" failure" % (sourceFilePath) )
        os.remove(targetFilePath)
        return False
    except IOError:
        print ("open file %s failed." % (targetFilePath))
        return False

    return True

if __name__=='__main__':
    # Script entry point: convert every eligible file under the source folder
    # into the same relative location under the target folder.
    # With fewer than two command-line arguments the built-in default folders
    # below are used instead.
    if len(sys.argv) <= 2:
        sSourceDir = r"D:\\trunk"
        sTargetDir = r"D:\\\trunk_new"
    else:
        sSourceDir = sys.argv[1]
        sTargetDir = sys.argv[2]

    for root, dirs, files in os.walk(sSourceDir):
        # Map the walked directory onto the target tree via its path relative
        # to the source root. A plain str.replace() would also rewrite any
        # later occurrence of the source folder text inside the path, and
        # would glue names together when the source folder ends in a separator.
        relativeDir = os.path.relpath(root, sSourceDir)
        if relativeDir == os.curdir:
            targetDir = sTargetDir
        else:
            targetDir = os.path.join(sTargetDir, relativeDir)

        for fileName in files:
            sourcePath = os.path.join(root, fileName)
            targetPath = os.path.join(targetDir, fileName)
            ConvertFileEncoding(sourcePath, targetPath)
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签:  python codecs