Python 转化文件编码为 UTF8

发表于 2020-10-10 分类于 Python 阅读次数： Valine：

使用编译器时，如果源文件的编码与编译器预期的编码不一致，文件中的中文就会显示为乱码，并由此引发各种错误。

所以写了个 python 脚本来检测原文件编码并转换为目标编码，以下代码以目标编码为 utf-8 为例：

需要安装 chardet，详情：https://pypi.python.org/pypi/chardet

Python2

import codecs
import os
import sys
import shutil
import re
import chardet

# Root directory; main() walks it with os.walk and converts matching files.
convertdir = "E:\\code\\GitCode\\test-demo\\src\\main\\java"

# File-name suffixes selecting which files get converted. main() compares
# them against the lower-cased file name, so entries should be lower-case.
convertfiletypes = [".java", ".h", ".hpp"]

def convert_encoding(filename, target_encoding):
    # Backup the origin file.
    # convert file from the source encoding to target encoding
    content = codecs.open(filename, 'r').read()
    source_encoding = chardet.detect(content)['encoding']
    if source_encoding != 'utf-8':
        print source_encoding, filename
        content = content.decode(source_encoding, 'ignore') #.encode(source_encoding)
        codecs.open(filename, 'w', encoding=target_encoding).write(content)

def main():
    for root, dirs, files in os.walk(convertdir):
        for f in files:
            for filetype in convertfiletypes:
                if f.lower().endswith(filetype):
                    filename = os.path.join(root, f)
                    try:
                        convert_encoding(filename, 'utf-8')
                    except Exception, e:
                        print filename

if __name__ == '__main__':
    main()

Python3

import codecs
import os
import sys
import shutil
import re
import chardet

# Root directory; main() walks it with os.walk and converts matching files.
convertdir = "E:\\code\\GitCode\\test-demo\\src\\main\\java"

# File-name suffixes selecting which files get converted. main() compares
# them against the lower-cased file name, so entries should be lower-case.
convertfiletypes = [".java", ".h", ".hpp"]

def convert_encoding(filename, target_encoding):
    """Re-encode ``filename`` in place as ``target_encoding``.

    The file's current encoding is guessed with ``chardet``. Files whose
    detected encoding already equals ``target_encoding`` are left untouched,
    as are files for which chardet reports no encoding (e.g. empty files).

    NOTE: the file is overwritten and NO backup copy is made. Bytes that
    cannot be decoded with the detected encoding are silently dropped
    (``'ignore'``), so a wrong detection can lose data.

    :param filename: path of the file to convert.
    :param target_encoding: codec name to write the file with, e.g. 'utf-8'.
    :raises LookupError: if either encoding name is unknown to Python.
    :raises OSError: if the file cannot be read or written.
    """
    # Binary mode: chardet needs raw bytes, and no implicit decoding with the
    # platform default codec (e.g. gbk) can fail while reading.
    with open(filename, 'rb') as source:
        content = source.read()

    source_encoding = chardet.detect(content)['encoding']
    if source_encoding is None:
        # chardet could not make a guess; there is nothing safe to convert.
        return

    # Compare normalized codec names so aliases/case differences
    # ('UTF8', 'utf_8', ...) are not mistaken for a different encoding.
    if codecs.lookup(source_encoding).name == codecs.lookup(target_encoding).name:
        return

    print(source_encoding, filename)
    text = content.decode(source_encoding, 'ignore')
    # codecs.open with an encoding writes in binary mode, so line endings
    # are preserved; the with-block guarantees the data is flushed and closed.
    with codecs.open(filename, 'w', encoding=target_encoding) as target:
        target.write(text)

def main():
    """Convert every matching file under ``convertdir`` to UTF-8.

    Walks ``convertdir`` recursively and calls ``convert_encoding`` once for
    each file whose lower-cased name ends with one of ``convertfiletypes``.
    A failure on one file is reported and does not stop the run.
    """
    # str.endswith accepts a tuple, so each file is tested (and converted)
    # exactly once even if several configured suffixes would match it.
    suffixes = tuple(convertfiletypes)
    for root, dirs, files in os.walk(convertdir):
        for f in files:
            if not f.lower().endswith(suffixes):
                continue
            filename = os.path.join(root, f)
            try:
                convert_encoding(filename, 'utf-8')
            except Exception as e:
                # Best effort: report which file failed and why, then go on.
                print(filename, e)

if __name__ == '__main__':
    main()

问题处理

若报错：

'gbk' codec can't decode byte 0xae in position 758: illegal multibyte sequence

出现这种错误，通常是因为待处理的内容本身并不是 gbk 编码，却被按 gbk 去解码。比如文件实际是 utf-8 编码，而读取时却用 gbk（例如中文 Windows 上的默认编码）去解码，结果必然出错。

查阅资料后发现，读取文件时在打开模式中加入参数 'b'（即以二进制方式读取）就不会报这个错误：此时读到的是未经解码的原始字节，可以先交给 chardet 检测编码，再按检测结果解码。

content = codecs.open(filename, 'rb').read()