#include <iostream>
#include <fstream>
#include <string>
#include <sstream>
using namespace std;
// Result codes produced by the text-encoding detection routines in this file.
enum TEXT_TYPE
{
    TEXT_ANSI     = 0, // no BOM and not recognised as UTF-8 (legacy code page)
    TEXT_UTF8     = 1, // UTF-8 without a byte order mark
    TEXT_UTF8_BOM = 2, // UTF-8 with a leading byte order mark
    TEXT_UTF16_LE = 3, // UTF-16 little endian (file starts with FF FE)
    TEXT_UTF16_BE = 4, // UTF-16 big endian (file starts with FE FF)
    TEXT_UNKNOW   = 5  // encoding could not be determined (e.g. file not opened)
};
// Checks whether a file looks like UTF-8 text without a byte order mark.
//
// The whole file is read as raw bytes and validated against the well-formed
// UTF-8 byte sequences of RFC 3629 (1 to 4 bytes per code point).
//
// @param file_name  Path of the file to inspect.
// @return true only if every byte sequence is well-formed UTF-8 AND at least
//         one multi-byte sequence is present. A pure-ASCII (or empty) file
//         returns false on purpose: it carries no evidence of being UTF-8.
//         Also returns false (after printing a message) if the file cannot
//         be opened.
bool check_utf8_without_bom(const std::string &file_name)
{
    // Binary mode: text mode may translate line endings or stop at Ctrl-Z on
    // some platforms, which would change the bytes being validated.
    std::ifstream file_in(file_name, std::ios::in | std::ios::binary);
    if (!file_in.is_open())
    {
        std::cout << "打开文件失败" << std::endl;
        return false;
    }
    std::stringstream buffer;
    buffer << file_in.rdbuf();
    file_in.close();
    const std::string text = buffer.str();

    bool has_multibyte = false;  // saw at least one non-ASCII sequence
    int pending = 0;             // continuation bytes still expected
    unsigned char lo = 0x80;     // allowed range for the next continuation byte
    unsigned char hi = 0xBF;

    for (const char raw : text)
    {
        const unsigned char ch = static_cast<unsigned char>(raw);

        if (pending > 0)
        {
            // Continuation bytes are 10xxxxxx; the first one after certain
            // lead bytes is restricted further (see below).
            if (ch < lo || ch > hi)
            {
                return false;
            }
            lo = 0x80;
            hi = 0xBF;
            --pending;
            continue;
        }

        if (ch < 0x80)
        {
            continue;  // plain ASCII
        }
        has_multibyte = true;

        if (ch >= 0xC2 && ch <= 0xDF)
        {
            pending = 1;
        }
        else if (ch >= 0xE0 && ch <= 0xEF)
        {
            pending = 3 - 1;
            if (ch == 0xE0)
            {
                lo = 0xA0;  // reject overlong 3-byte encodings
            }
            else if (ch == 0xED)
            {
                hi = 0x9F;  // reject UTF-16 surrogates U+D800..U+DFFF
            }
        }
        else if (ch >= 0xF0 && ch <= 0xF4)
        {
            pending = 4 - 1;
            if (ch == 0xF0)
            {
                lo = 0x90;  // reject overlong 4-byte encodings
            }
            else if (ch == 0xF4)
            {
                hi = 0x8F;  // reject code points above U+10FFFF
            }
        }
        else
        {
            // 0x80..0xBF: stray continuation byte.
            // 0xC0, 0xC1: overlong 2-byte leads.
            // 0xF5..0xFF: never valid in UTF-8 (includes old 5/6-byte leads).
            return false;
        }
    }

    // A sequence cut off at end of file is malformed.
    return pending == 0 && has_multibyte;
}
// Detects the text encoding of a file from its leading bytes.
//
// Detection rules:
//   FF FE          -> UTF-16 little endian
//   FE FF          -> UTF-16 big endian
//   EF BB BF       -> UTF-8 with BOM
//   anything else  -> no BOM; the content is validated by
//                     check_utf8_without_bom(): TEXT_UTF8 if it passes,
//                     otherwise TEXT_ANSI (no format marker; for Chinese
//                     text this is typically GB2312).
//
// @param file_name  Path of the file to inspect.
// @return The detected TEXT_TYPE, or TEXT_UNKNOW (after printing a message)
//         if the file cannot be opened.
TEXT_TYPE check_text_encode(const std::string &file_name)
{
    // Zero-initialised so that short or empty files never expose
    // indeterminate bytes to the comparisons below.
    unsigned char head[3] = {0, 0, 0};
    std::streamsize got = 0;
    {
        std::ifstream file_in(file_name, std::ios::in | std::ios::binary);
        if (!file_in.is_open())
        {
            std::cout << "打开文件失败" << std::endl;
            return TEXT_UNKNOW;
        }
        file_in.read(reinterpret_cast<char *>(head), sizeof(head));
        // gcount() tells how many bytes were really read (0..3).
        got = file_in.gcount();
    }   // stream closed here, before the file is reopened for content checks

    if (got >= 2 && head[0] == 0xFF && head[1] == 0xFE)
    {
        return TEXT_UTF16_LE;
    }
    if (got >= 2 && head[0] == 0xFE && head[1] == 0xFF)
    {
        return TEXT_UTF16_BE;
    }
    // The UTF-8 BOM is three bytes; all three must match.
    if (got >= 3 && head[0] == 0xEF && head[1] == 0xBB && head[2] == 0xBF)
    {
        return TEXT_UTF8_BOM;
    }

    return check_utf8_without_bom(file_name) ? TEXT_UTF8 : TEXT_ANSI;
}
int main(int argc, char* argv[])
{
string str1 = "E:\\Book\\ANSI.txt";
string str2 = "E:\\Book\\UTF8.txt";
string str3 = "E:\\Book\\UTF8_BOM.txt";
TEXT_TYPE txttype1 = check_text_encode(str1);
TEXT_TYPE txttype2 = check_text_encode(str2);
TEXT_TYPE txttype3 = check_text_encode(str3);
getchar();
return 0;
}
// 参考链接:
// https://www.jb51.net/article/128576.htm
// https://www.cnblogs.com/Toya/p/11433441.html
// 原文链接: https://www.cnblogs.com/rcg714786690/p/14246669.html
// 欢迎关注
// 微信关注下方公众号,第一时间获取干货硬货;公众号内回复【pdf】免费获取数百本计算机经典书籍
// 原创文章受到原创版权保护。转载请注明出处:https://www.ccppcoding.com/archives/207023
// 非原创文章文中已经注明原地址,如有侵权,联系删除
// 关注公众号【高性能架构探索】,第一时间获取最新文章
// 转载文章受原作者版权保护。转载请注明原作者出处!