Python 转化文件编码为 UTF8
编译器处理源文件时,如果文件的实际编码与编译器预期的编码不一致,文件中的中文就会变成乱码,并可能因此引发各种编译错误。
所以写了个 python 脚本来检测原文件编码并转换为目标编码,以下代码以目标编码为 utf-8 为例:
需要先安装 chardet(pip install chardet),详情:https://pypi.python.org/pypi/chardet
Python2
import codecs
import os
import sys
import shutil
import re
import chardet
# Root directory whose tree is searched for files to convert.
convertdir = "E:\\code\\GitCode\\test-demo\\src\\main\\java"

# File-name suffixes that select the files to convert.
convertfiletypes = [".java", ".h", ".hpp"]
def convert_encoding(filename, target_encoding):
    """Re-encode *filename* in place as *target_encoding*.

    The file is read as raw bytes, its current encoding is guessed with
    chardet, and the decoded text is written back using *target_encoding*.
    Nothing is written when the encoding cannot be guessed or when the
    file already uses the target encoding.

    NOTE: no backup copy is made; the original file is overwritten.

    Args:
        filename: Path of the file to convert.
        target_encoding: Codec name to write the file with, e.g. 'utf-8'.

    Raises:
        LookupError: If either encoding name is not a known codec.
        IOError: If the file cannot be read or written.
    """
    # Read raw bytes so chardet sees the data exactly as stored on disk
    # (text mode on Windows would also translate line endings).
    with open(filename, 'rb') as src:
        content = src.read()

    source_encoding = chardet.detect(content)['encoding']
    # chardet reports None when it cannot guess (e.g. for an empty file);
    # decoding with None would raise, so leave such files untouched.
    if source_encoding is None:
        return

    # Compare canonical codec names so spelling/case variants of the same
    # encoding match, and so an unknown target name fails *before* the
    # file is truncated for writing.
    if codecs.lookup(source_encoding).name == codecs.lookup(target_encoding).name:
        return

    print("%s %s" % (source_encoding, filename))
    # 'ignore' drops bytes that are invalid in the guessed encoding
    # instead of aborting the conversion (best effort, may lose data).
    text = content.decode(source_encoding, 'ignore')
    with codecs.open(filename, 'w', encoding=target_encoding) as dst:
        dst.write(text)
def main():
    """Convert every matching file below ``convertdir`` to UTF-8.

    Walks the directory tree rooted at ``convertdir`` and calls
    ``convert_encoding`` on each file whose lowercased name ends with one
    of the suffixes in ``convertfiletypes``. A failure on one file is
    reported and does not stop the remaining files from being processed.
    """
    # str.endswith accepts a tuple, so one test covers every suffix and a
    # file is never converted twice when suffixes overlap.
    suffixes = tuple(convertfiletypes)
    for root, _dirs, files in os.walk(convertdir):
        for f in files:
            if not f.lower().endswith(suffixes):
                continue
            filename = os.path.join(root, f)
            try:
                convert_encoding(filename, 'utf-8')
            except Exception as e:
                # Best effort: report which file failed and why, then
                # carry on with the next one.
                print("%s %s" % (filename, e))


if __name__ == '__main__':
    main()
Python3
import codecs
import os
import sys
import shutil
import re
import chardet
# Root directory whose tree is searched for files to convert.
convertdir = "E:\\code\\GitCode\\test-demo\\src\\main\\java"

# File-name suffixes that select the files to convert.
convertfiletypes = [".java", ".h", ".hpp"]
def convert_encoding(filename, target_encoding):
    """Re-encode *filename* in place as *target_encoding*.

    The file is read as raw bytes, its current encoding is guessed with
    chardet, and the decoded text is written back using *target_encoding*.
    Nothing is written when the encoding cannot be guessed or when the
    file already uses the target encoding.

    NOTE: no backup copy is made; the original file is overwritten.

    Args:
        filename: Path of the file to convert.
        target_encoding: Codec name to write the file with, e.g. 'utf-8'.

    Raises:
        LookupError: If either encoding name is not a known codec.
        OSError: If the file cannot be read or written.
    """
    # Read raw bytes: chardet needs undecoded input, and opening in text
    # mode would decode with the platform default codec and may fail.
    with open(filename, 'rb') as src:
        content = src.read()

    source_encoding = chardet.detect(content)['encoding']
    # chardet reports None when it cannot guess (e.g. for an empty file);
    # decoding with None would raise, so leave such files untouched.
    if source_encoding is None:
        return

    # Compare canonical codec names so spelling/case variants of the same
    # encoding match, and so an unknown target name fails *before* the
    # file is truncated for writing.
    if codecs.lookup(source_encoding).name == codecs.lookup(target_encoding).name:
        return

    print(source_encoding, filename)
    # 'ignore' drops bytes that are invalid in the guessed encoding
    # instead of aborting the conversion (best effort, may lose data).
    text = content.decode(source_encoding, 'ignore')
    with codecs.open(filename, 'w', encoding=target_encoding) as dst:
        dst.write(text)
def main():
    """Convert every matching file below ``convertdir`` to UTF-8.

    Walks the directory tree rooted at ``convertdir`` and calls
    ``convert_encoding`` on each file whose lowercased name ends with one
    of the suffixes in ``convertfiletypes``. A failure on one file is
    reported and does not stop the remaining files from being processed.
    """
    # str.endswith accepts a tuple, so one test covers every suffix and a
    # file is never converted twice when suffixes overlap.
    suffixes = tuple(convertfiletypes)
    for root, _dirs, files in os.walk(convertdir):
        for f in files:
            if not f.lower().endswith(suffixes):
                continue
            filename = os.path.join(root, f)
            try:
                convert_encoding(filename, 'utf-8')
            except Exception as e:
                # Best effort: report which file failed and why, then
                # carry on with the next one.
                print(filename, e)


if __name__ == '__main__':
    main()
问题处理
若报错:
'gbk' codec can't decode byte 0xae in position 758: illegal multibyte sequence
出现这种错误,通常是因为待解码的数据本身并不是 gbk 编码,却被按 gbk 去解码。比如文件内容实际是 utf-8 编码,而 Python3 以文本模式打开文件且未指定 encoding 时,会使用系统默认编码(中文 Windows 上为 gbk)来解码,结果必然出错。
查阅资料后发现,解决办法是在打开文件时给模式加上 'b'(即以二进制方式读取):这样读到的是未经解码的原始字节,不会触发上述解码错误,之后再交给 chardet 检测编码并按检测结果解码即可。
content = codecs.open(filename, 'rb').read()
相关文章