只有本站会员才能查看附件,请 登录
分享一下,多日以来的学习成果。
东拼西凑,把网上几个程序拼在一起,终于达到了自己需要的效果。。。。
C、C++、CLI混合使用,再加上Libiconv终于实现了控制台输出多种格式文本不乱码。。。
说实话,难吗???真的一点也不难!!!!
繁琐倒是有一些繁琐……
为什么,这么久才完成???
心浮气躁,根本静不下来。。。。又是多个新问题加在一起。。。。
如果,还能有当初开始学习C语言的心境。。。
这些问题加在一起,三天之内一定能解决。。。。
心就是静不下来,心浮气躁。
如果,心能静也不至于这点知识用这么久才学会。。。。。
Libiconv库文件
只有本站会员才能查看附件,请 登录
测试使用的文本文件
只有本站会员才能查看附件,请 登录

#include <iostream>
#include <fstream>
#include <string>
#include <sstream>
#include<locale.h>
#include"iconv.h"
#pragma comment(lib, "libiconv.lib")
#pragma warning(disable:4996)
using namespace std;
typedef unsigned int Uint32;
/// Text encodings distinguished by the detection code in this file.
/// Enumerators are numbered consecutively starting at 0.
enum TEXT_TYPE
{
    TEXT_ANSI = 0,  ///< no BOM and not recognised as UTF-8
    TEXT_UTF8,      ///< UTF-8 without a BOM
    TEXT_UTF8_BOM,  ///< UTF-8 with a BOM
    TEXT_UTF16_LE,  ///< UTF-16, little-endian BOM (FF FE)
    TEXT_UTF16_BE,  ///< UTF-16, big-endian BOM (FE FF)
    TEXT_UNKNOW     ///< the file could not be opened
};
class File {
public:
File() { _ptr_file = nullptr; }
File(const char* fn, const char* opr) {
open(fn, opr);
}
~File() { close(); }
//读取文件到内存
static char* read_file(Uint32& size, const char* fn) {
File tmp_file;
if (tmp_file.open(fn, "rb")) { return size = 0, nullptr; }
size = tmp_file.get_file_size();
char* buf = (char*)malloc(size + 4);///这里多分配几个字节做字符串结尾
if (!buf) { return size = 0, nullptr; }
memset(buf, 0, size + 4); ///内存填充0
size = tmp_file.read_byte(buf, size);
return buf;
}
///获取文本文件类型
///文件类型:utf-8返回0
/// utf8 bom 返回1
/// ucs-2 BE大端字节序返回2
/// USC-2 LE小端字节序返回3
/// 文件不存在返回 -1
///
static int get_file_type(const char* fn) {
File temp_file;
if (temp_file.open(fn, "rb")) { return -1; }
if (temp_file.get_file_size() <= 2) { return 0; }
char en_buf[3] = { 0 };
temp_file.read_byte(en_buf, 3);
if (0 == memcmp(en_buf, "\xEF\xBB\xBF", 3)) { return 1; }
else if (0 == memcmp(en_buf, "\xFE\xFF", 2)) { return 2; }
else if (0 == memcmp(en_buf, "\xFF\xFE", 2)) { return 3; }
return 0;
}
int open(const char* fn, const char* opr) {
close();
_ptr_file = fopen(fn, opr);
if (!_ptr_file) { return-1; }
return 0;
}
void close() {
if (!_ptr_file) { return; }
fclose((FILE*)_ptr_file);
}
///获取文件大小
Uint32 get_file_size() {
int cur_pos = ftell((FILE*)_ptr_file);
fseek((FILE*)_ptr_file, 0L, SEEK_END);
Uint32 ret = ftell((FILE*)_ptr_file);
fseek((FILE*)_ptr_file, cur_pos, SEEK_SET);
return ret;
}
///读取size个字节到dst///
Uint32 read_byte(char* dst, Uint32 size) {
Uint32 min_size = min(size, get_file_size());
fread(dst, 1, min_size, (FILE*)_ptr_file);
return min_size;
}
private:
void* _ptr_file;
//DISALLOW_COPY_AND_ASSIGN(File)
private:
File(File&);
File& operator=(File&);
};
/// Returns true when `text` is made only of well-formed UTF-8 sequences and
/// contains at least one non-ASCII byte.
/// Accepted lead bytes are C2..DF (2 bytes), E0..EF (3 bytes) and F0..F4 (4 bytes);
/// the first continuation byte is range-checked so that overlong encodings,
/// surrogates and values above U+10FFFF are rejected.
static bool is_multibyte_utf8(const std::string& text)
{
    typedef std::string::size_type index_t;
    const index_t len = text.size();
    bool saw_non_ascii = false;

    index_t i = 0;
    while (i < len)
    {
        const unsigned char lead = static_cast<unsigned char>(text[i]);
        if (lead < 0x80)  // 0x00-0x7F is plain ASCII
        {
            ++i;
            continue;
        }
        saw_non_ascii = true;

        index_t extra = 0;         // number of continuation bytes that must follow
        unsigned char lo = 0x80;   // allowed range of the first continuation byte
        unsigned char hi = 0xBF;
        if (lead >= 0xC2 && lead <= 0xDF)
        {
            extra = 1;
        }
        else if (lead >= 0xE0 && lead <= 0xEF)
        {
            extra = 2;
            if (lead == 0xE0) { lo = 0xA0; }       // below A0 would be an overlong form
            else if (lead == 0xED) { hi = 0x9F; }  // above 9F would be a surrogate
        }
        else if (lead >= 0xF0 && lead <= 0xF4)
        {
            extra = 3;
            if (lead == 0xF0) { lo = 0x90; }       // below 90 would be an overlong form
            else if (lead == 0xF4) { hi = 0x8F; }  // above 8F would exceed U+10FFFF
        }
        else
        {
            // Stray continuation byte (80..BF), overlong lead (C0, C1) or F5..FF.
            return false;
        }

        if (len - i <= extra)
        {
            return false;  // sequence is cut off by the end of the data
        }
        for (index_t k = 1; k <= extra; ++k)
        {
            const unsigned char cont = static_cast<unsigned char>(text[i + k]);
            if (cont < lo || cont > hi)
            {
                return false;
            }
            // Only the first continuation byte has a narrowed range.
            lo = 0x80;
            hi = 0xBF;
        }
        i += extra + 1;
    }
    return saw_non_ascii;
}

/// Checks whether a file looks like UTF-8 text without a BOM.
/// @param file_name path of the file to inspect.
/// @return true when the whole file is well-formed UTF-8 and contains at least one
///         multi-byte character; false for pure ASCII, for any malformed sequence,
///         and when the file cannot be opened (a message is printed in that case).
bool check_utf8_without_bom(const std::string& file_name)
{
    // Binary mode: the scan must see the raw bytes, without newline translation.
    std::ifstream file_in(file_name, std::ios::in | std::ios::binary);
    if (!file_in.is_open())
    {
        std::cout << "打开文件失败" << std::endl;
        return false;
    }
    std::stringstream buffer;
    buffer << file_in.rdbuf();
    file_in.close();
    return is_multibyte_utf8(buffer.str());
}
/// Detects the text encoding of a file from its leading bytes.
///
///   FF FE      -> TEXT_UTF16_LE
///   FE FF      -> TEXT_UTF16_BE
///   EF BB BF   -> TEXT_UTF8_BOM
///   otherwise  -> TEXT_UTF8 when check_utf8_without_bom() accepts the file,
///                 else TEXT_ANSI
///
/// @param file_name path of the file to inspect.
/// @return the detected type, or TEXT_UNKNOW when the file cannot be opened
///         (a message is printed in that case).
TEXT_TYPE check_text_encode(const std::string& file_name)
{
    std::ifstream file_in(file_name, std::ios::binary);
    if (!file_in.is_open())
    {
        std::cout << "打开文件失败" << std::endl;
        return TEXT_UNKNOW;
    }

    // Read up to three bytes and remember how many really arrived, so files
    // shorter than a BOM are never judged on bytes that were not read.
    unsigned char bom[3] = { 0, 0, 0 };
    file_in.read(reinterpret_cast<char*>(bom), sizeof(bom));
    const std::streamsize got = file_in.gcount();
    file_in.close();

    if (got >= 2 && bom[0] == 0xFF && bom[1] == 0xFE)
    {
        return TEXT_UTF16_LE;
    }
    if (got >= 2 && bom[0] == 0xFE && bom[1] == 0xFF)
    {
        return TEXT_UTF16_BE;
    }
    // All three BOM bytes must match before the file is treated as UTF-8 with BOM.
    if (got >= 3 && bom[0] == 0xEF && bom[1] == 0xBB && bom[2] == 0xBF)
    {
        return TEXT_UTF8_BOM;
    }
    return check_utf8_without_bom(file_name) ? TEXT_UTF8 : TEXT_ANSI;
}
/// Converts big-endian 16-bit code units to little-endian, in place.
/// For each of the first `size` elements of `src`, the two low-order bytes are
/// exchanged; any higher bits of the element are cleared by the masks.
void big2little(wchar_t* src, Uint32 size) {
    for (wchar_t* const stop = src + size; src != stop; ++src) {
        const unsigned int high_to_low = ((*src) & 0xff00) >> 8;
        const unsigned int low_to_high = ((*src) & 0x00ff) << 8;
        *src = high_to_low | low_to_high;
    }
}
int code_convert(char* to_chatset, char* from_charset, const char* inbuf, size_t inlen, char* outbuf, rsize_t outlen)
{
iconv_t cd, err = (iconv_t)-1;
cd = iconv_open(to_chatset, from_charset);
if (cd == err)return -1;
int ret = iconv(cd, &inbuf, &inlen, &outbuf, &outlen);
if (ret == -1)return -1;
iconv_close(cd);
return outlen;
}
void ReadFile(string filepath)
{
Uint32 size;
char* buffer = nullptr;
char* newbuffer = nullptr;
Uint32 newsize;
TEXT_TYPE type = check_text_encode(filepath);
if (type != TEXT_UNKNOW)
{
size = 0;
buffer = File::read_file(size, filepath.c_str());
_wsetlocale(LC_ALL, L"chinese");
switch (type)
{
case TEXT_ANSI:
cout << std::string(buffer);
break;
case TEXT_UTF8:
newsize = 2 * size;
newbuffer = new char[newsize]{0};
code_convert((char*)"GBK", (char*)"UTF-8", buffer, size, newbuffer, newsize);
cout << std::string(newbuffer);
free(newbuffer);
break;
case TEXT_UTF8_BOM:
newsize = 2 * size;
newbuffer = new char[newsize] {0};
code_convert((char*)"GBK", (char*)"UTF-8", buffer+3, size, newbuffer, newsize);
cout << std::string(newbuffer);
free(newbuffer);
break;
case TEXT_UTF16_LE:
wcout << std::wstring((wchar_t*)buffer + 1);
break;
case TEXT_UTF16_BE:
big2little((wchar_t*)buffer + 1, size / 2 - 1);
wcout << std::wstring((wchar_t*)buffer + 1);
break;
default:
break;
}
free(buffer);
}
}
/// Entry point: prints the five sample files, one per supported encoding,
/// in a fixed order. Command-line arguments are not used.
int main(int argc, char* argv[])
{
    const char* const sample_paths[] = {
        "C:\\Users\\Administrator\\Desktop\\TestFile\\ANSI.txt",
        "C:\\Users\\Administrator\\Desktop\\TestFile\\UTF-8.txt",
        "C:\\Users\\Administrator\\Desktop\\TestFile\\UTF-8 BOM.txt",
        "C:\\Users\\Administrator\\Desktop\\TestFile\\UTF-16 LE.txt",
        "C:\\Users\\Administrator\\Desktop\\TestFile\\UTF-16 BE.txt",
    };
    for (const char* path : sample_paths)
    {
        ReadFile(path);
    }
    return 0;
}
#include <fstream>
#include <string>
#include <sstream>
#include<locale.h>
#include"iconv.h"
#pragma comment(lib, "libiconv.lib")
#pragma warning(disable:4996)
using namespace std;
typedef unsigned int Uint32;
/// Text encodings distinguished by the detection code in this file.
/// Enumerators are numbered consecutively starting at 0.
enum TEXT_TYPE
{
    TEXT_ANSI = 0,  ///< no BOM and not recognised as UTF-8
    TEXT_UTF8,      ///< UTF-8 without a BOM
    TEXT_UTF8_BOM,  ///< UTF-8 with a BOM
    TEXT_UTF16_LE,  ///< UTF-16, little-endian BOM (FF FE)
    TEXT_UTF16_BE,  ///< UTF-16, big-endian BOM (FE FF)
    TEXT_UNKNOW     ///< the file could not be opened
};
class File {
public:
File() { _ptr_file = nullptr; }
File(const char* fn, const char* opr) {
open(fn, opr);
}
~File() { close(); }
//读取文件到内存
static char* read_file(Uint32& size, const char* fn) {
File tmp_file;
if (tmp_file.open(fn, "rb")) { return size = 0, nullptr; }
size = tmp_file.get_file_size();
char* buf = (char*)malloc(size + 4);///这里多分配几个字节做字符串结尾
if (!buf) { return size = 0, nullptr; }
memset(buf, 0, size + 4); ///内存填充0
size = tmp_file.read_byte(buf, size);
return buf;
}
///获取文本文件类型
///文件类型:utf-8返回0
/// utf8 bom 返回1
/// ucs-2 BE大端字节序返回2
/// USC-2 LE小端字节序返回3
/// 文件不存在返回 -1
///
static int get_file_type(const char* fn) {
File temp_file;
if (temp_file.open(fn, "rb")) { return -1; }
if (temp_file.get_file_size() <= 2) { return 0; }
char en_buf[3] = { 0 };
temp_file.read_byte(en_buf, 3);
if (0 == memcmp(en_buf, "\xEF\xBB\xBF", 3)) { return 1; }
else if (0 == memcmp(en_buf, "\xFE\xFF", 2)) { return 2; }
else if (0 == memcmp(en_buf, "\xFF\xFE", 2)) { return 3; }
return 0;
}
int open(const char* fn, const char* opr) {
close();
_ptr_file = fopen(fn, opr);
if (!_ptr_file) { return-1; }
return 0;
}
void close() {
if (!_ptr_file) { return; }
fclose((FILE*)_ptr_file);
}
///获取文件大小
Uint32 get_file_size() {
int cur_pos = ftell((FILE*)_ptr_file);
fseek((FILE*)_ptr_file, 0L, SEEK_END);
Uint32 ret = ftell((FILE*)_ptr_file);
fseek((FILE*)_ptr_file, cur_pos, SEEK_SET);
return ret;
}
///读取size个字节到dst///
Uint32 read_byte(char* dst, Uint32 size) {
Uint32 min_size = min(size, get_file_size());
fread(dst, 1, min_size, (FILE*)_ptr_file);
return min_size;
}
private:
void* _ptr_file;
//DISALLOW_COPY_AND_ASSIGN(File)
private:
File(File&);
File& operator=(File&);
};
/// Returns true when `text` is made only of well-formed UTF-8 sequences and
/// contains at least one non-ASCII byte.
/// Accepted lead bytes are C2..DF (2 bytes), E0..EF (3 bytes) and F0..F4 (4 bytes);
/// the first continuation byte is range-checked so that overlong encodings,
/// surrogates and values above U+10FFFF are rejected.
static bool is_multibyte_utf8(const std::string& text)
{
    typedef std::string::size_type index_t;
    const index_t len = text.size();
    bool saw_non_ascii = false;

    index_t i = 0;
    while (i < len)
    {
        const unsigned char lead = static_cast<unsigned char>(text[i]);
        if (lead < 0x80)  // 0x00-0x7F is plain ASCII
        {
            ++i;
            continue;
        }
        saw_non_ascii = true;

        index_t extra = 0;         // number of continuation bytes that must follow
        unsigned char lo = 0x80;   // allowed range of the first continuation byte
        unsigned char hi = 0xBF;
        if (lead >= 0xC2 && lead <= 0xDF)
        {
            extra = 1;
        }
        else if (lead >= 0xE0 && lead <= 0xEF)
        {
            extra = 2;
            if (lead == 0xE0) { lo = 0xA0; }       // below A0 would be an overlong form
            else if (lead == 0xED) { hi = 0x9F; }  // above 9F would be a surrogate
        }
        else if (lead >= 0xF0 && lead <= 0xF4)
        {
            extra = 3;
            if (lead == 0xF0) { lo = 0x90; }       // below 90 would be an overlong form
            else if (lead == 0xF4) { hi = 0x8F; }  // above 8F would exceed U+10FFFF
        }
        else
        {
            // Stray continuation byte (80..BF), overlong lead (C0, C1) or F5..FF.
            return false;
        }

        if (len - i <= extra)
        {
            return false;  // sequence is cut off by the end of the data
        }
        for (index_t k = 1; k <= extra; ++k)
        {
            const unsigned char cont = static_cast<unsigned char>(text[i + k]);
            if (cont < lo || cont > hi)
            {
                return false;
            }
            // Only the first continuation byte has a narrowed range.
            lo = 0x80;
            hi = 0xBF;
        }
        i += extra + 1;
    }
    return saw_non_ascii;
}

/// Checks whether a file looks like UTF-8 text without a BOM.
/// @param file_name path of the file to inspect.
/// @return true when the whole file is well-formed UTF-8 and contains at least one
///         multi-byte character; false for pure ASCII, for any malformed sequence,
///         and when the file cannot be opened (a message is printed in that case).
bool check_utf8_without_bom(const std::string& file_name)
{
    // Binary mode: the scan must see the raw bytes, without newline translation.
    std::ifstream file_in(file_name, std::ios::in | std::ios::binary);
    if (!file_in.is_open())
    {
        std::cout << "打开文件失败" << std::endl;
        return false;
    }
    std::stringstream buffer;
    buffer << file_in.rdbuf();
    file_in.close();
    return is_multibyte_utf8(buffer.str());
}
/// Detects the text encoding of a file from its leading bytes.
///
///   FF FE      -> TEXT_UTF16_LE
///   FE FF      -> TEXT_UTF16_BE
///   EF BB BF   -> TEXT_UTF8_BOM
///   otherwise  -> TEXT_UTF8 when check_utf8_without_bom() accepts the file,
///                 else TEXT_ANSI
///
/// @param file_name path of the file to inspect.
/// @return the detected type, or TEXT_UNKNOW when the file cannot be opened
///         (a message is printed in that case).
TEXT_TYPE check_text_encode(const std::string& file_name)
{
    std::ifstream file_in(file_name, std::ios::binary);
    if (!file_in.is_open())
    {
        std::cout << "打开文件失败" << std::endl;
        return TEXT_UNKNOW;
    }

    // Read up to three bytes and remember how many really arrived, so files
    // shorter than a BOM are never judged on bytes that were not read.
    unsigned char bom[3] = { 0, 0, 0 };
    file_in.read(reinterpret_cast<char*>(bom), sizeof(bom));
    const std::streamsize got = file_in.gcount();
    file_in.close();

    if (got >= 2 && bom[0] == 0xFF && bom[1] == 0xFE)
    {
        return TEXT_UTF16_LE;
    }
    if (got >= 2 && bom[0] == 0xFE && bom[1] == 0xFF)
    {
        return TEXT_UTF16_BE;
    }
    // All three BOM bytes must match before the file is treated as UTF-8 with BOM.
    if (got >= 3 && bom[0] == 0xEF && bom[1] == 0xBB && bom[2] == 0xBF)
    {
        return TEXT_UTF8_BOM;
    }
    return check_utf8_without_bom(file_name) ? TEXT_UTF8 : TEXT_ANSI;
}
/// Converts big-endian 16-bit code units to little-endian, in place.
/// For each of the first `size` elements of `src`, the two low-order bytes are
/// exchanged; any higher bits of the element are cleared by the masks.
void big2little(wchar_t* src, Uint32 size) {
    for (wchar_t* const stop = src + size; src != stop; ++src) {
        const unsigned int high_to_low = ((*src) & 0xff00) >> 8;
        const unsigned int low_to_high = ((*src) & 0x00ff) << 8;
        *src = high_to_low | low_to_high;
    }
}
int code_convert(char* to_chatset, char* from_charset, const char* inbuf, size_t inlen, char* outbuf, rsize_t outlen)
{
iconv_t cd, err = (iconv_t)-1;
cd = iconv_open(to_chatset, from_charset);
if (cd == err)return -1;
int ret = iconv(cd, &inbuf, &inlen, &outbuf, &outlen);
if (ret == -1)return -1;
iconv_close(cd);
return outlen;
}
void ReadFile(string filepath)
{
Uint32 size;
char* buffer = nullptr;
char* newbuffer = nullptr;
Uint32 newsize;
TEXT_TYPE type = check_text_encode(filepath);
if (type != TEXT_UNKNOW)
{
size = 0;
buffer = File::read_file(size, filepath.c_str());
_wsetlocale(LC_ALL, L"chinese");
switch (type)
{
case TEXT_ANSI:
cout << std::string(buffer);
break;
case TEXT_UTF8:
newsize = 2 * size;
newbuffer = new char[newsize]{0};
code_convert((char*)"GBK", (char*)"UTF-8", buffer, size, newbuffer, newsize);
cout << std::string(newbuffer);
free(newbuffer);
break;
case TEXT_UTF8_BOM:
newsize = 2 * size;
newbuffer = new char[newsize] {0};
code_convert((char*)"GBK", (char*)"UTF-8", buffer+3, size, newbuffer, newsize);
cout << std::string(newbuffer);
free(newbuffer);
break;
case TEXT_UTF16_LE:
wcout << std::wstring((wchar_t*)buffer + 1);
break;
case TEXT_UTF16_BE:
big2little((wchar_t*)buffer + 1, size / 2 - 1);
wcout << std::wstring((wchar_t*)buffer + 1);
break;
default:
break;
}
free(buffer);
}
}
/// Entry point: prints the five sample files, one per supported encoding,
/// in a fixed order. Command-line arguments are not used.
int main(int argc, char* argv[])
{
    const char* const sample_paths[] = {
        "C:\\Users\\Administrator\\Desktop\\TestFile\\ANSI.txt",
        "C:\\Users\\Administrator\\Desktop\\TestFile\\UTF-8.txt",
        "C:\\Users\\Administrator\\Desktop\\TestFile\\UTF-8 BOM.txt",
        "C:\\Users\\Administrator\\Desktop\\TestFile\\UTF-16 LE.txt",
        "C:\\Users\\Administrator\\Desktop\\TestFile\\UTF-16 BE.txt",
    };
    for (const char* path : sample_paths)
    {
        ReadFile(path);
    }
    return 0;
}
[此帖子已经被作者于2022-9-16 01:20编辑过]