您的位置：首页 > 其它

WideCharToMultiByte

2014-03-31 20:36 375 查看

1. 了解下这个 API

//z 2014-03-25 08:18:41 IS2120@BG57IV3 T3343244181.K.F1434403198[T1,L68,R2,V15]

// Converts the null-terminated UTF-16 string `in` to the system ANSI code
// page (CP_ACP) and stores the result, including its terminating null,
// in `out`.
//
//   in     - source wide string (null-terminated)
//   out    - destination buffer
//   cchout - size of `out`, in bytes
//
// Calls ErrorExit when the arguments are unusable or the conversion fails
// (for example because `out` is too small).
void UnicodeToAnsi(WCHAR *in, char *out, int cchout)
{
    int len = 0;

    // A zero-sized destination would switch WideCharToMultiByte into its
    // "report required size" mode, which returns non-zero without writing
    // anything to `out`; treat that (and null pointers) as a failure.
    if (in != NULL && out != NULL && cchout > 0)
    {
        // -1 tells the API the source is null-terminated and that the
        // terminator must be converted too. This replaces wcslen(in)+1,
        // which silently narrowed a size_t to int.
        len = WideCharToMultiByte(CP_ACP, 0, in, -1, out, cchout, NULL, NULL);
    }

    if (!len)
        ErrorExit("Unicode to ANSI conversion failed");
}

//z 2014-04-14 22:04:51 IS2120@BG57IV3 T1381068076.K.F1547169058[T4,L105,R3,V66]

2. 一个例子，将文件自动转换为 utf-8

// ChangeFileEncoding.cpp : 定义控制台应用程序的入口点。
//

#include "stdafx.h"
#include "ChangeFileEncoding.h"
#include <string>

#ifdef _DEBUG
#define new DEBUG_NEW
#endif

// 唯一的应用程序对象

CWinApp theApp;

using namespace std;

void recursiveFile(CString strFileType);
void convertGBToUTF8(CString strWritePath, const char* gb2312);

// Console entry point.
//
// Initializes MFC and then converts every .h/.cpp file found under the
// directory named by argv[1] to UTF-8 (see recursiveFile).
//
// Returns 0 on success, 1 when MFC cannot be initialized or when no
// directory was supplied on the command line.
int _tmain(int argc, TCHAR* argv[], TCHAR* envp[])
{
    int nRetCode = 0;

    // Initialize MFC and report an error on failure
    if (!AfxWinInit(::GetModuleHandle(NULL), NULL, ::GetCommandLine(), 0))
    {
        // TODO: change the error code to suit your needs
        _tprintf(_T("错误: MFC 初始化失败\n"));
        nRetCode = 1;
    }
    else if (argc < 2)
    {
        // argv[1] does not exist: explain the expected usage instead of
        // reading past the end of the argument list.
        _tprintf(_T("usage : \n    %s dir\n    dir [sample] : c:\\src\n"), argv[0]);
        nRetCode = 1;
    }
    else
    {
        // The single argument is the root directory of the source tree
        TCHAR *lpszDirName = argv[1];
        CString strFileType;
        strFileType.Format(_T("%s\\*.*"), lpszDirName);

        // Walk the tree; .h and .cpp files that are not UTF-8 yet get converted
        recursiveFile(strFileType);
    }

    return nRetCode;
}

// Recursively walks the files matched by the wildcard pattern strFileType
// (e.g. "c:\\src\\*.*"). Every file whose name ends in ".cpp" or ".h" and
// that does not already start with a UTF-8 BOM is read completely and
// handed to convertGBToUTF8, which rewrites it in place.
void recursiveFile( CString strFileType)
{
    CFileFind finder;
    BOOL bMore = finder.FindFile(strFileType);   // start the enumeration
    while (bMore)
    {
        // Advance first: the finder's accessors describe the entry selected
        // by the most recent FindNextFile call.
        bMore = finder.FindNextFile();
        if (finder.IsDots())                     // skip "." and ".."
            continue;

        const CString strFoundFile = finder.GetFilePath();
        if (finder.IsDirectory())
        {
            // Descend into the sub-directory. The explicit cast keeps the
            // CString from being passed through "..." as an object.
            CString strNextFileType;
            strNextFileType.Format(_T("%s\\*.*"), static_cast<LPCTSTR>(strFoundFile));
            recursiveFile(strNextFileType);
            continue;
        }

        // Only header and .cpp files are converted
        const bool bIsSource = strFoundFile.Right(4) == _T(".cpp")
                            || strFoundFile.Right(2) == _T(".h");
        if (!bIsSource)
            continue;

        CFile fileReader(strFoundFile, CFile::modeRead);

        // A file that already carries the UTF-8 BOM is left alone. The
        // number of bytes actually read is checked so that files shorter
        // than three bytes are never judged on uninitialized data.
        BYTE head[3] = { 0, 0, 0 };
        const UINT nHeadRead = fileReader.Read(head, sizeof(head));
        if (nHeadRead == sizeof(head)
            && head[0] == 0xef && head[1] == 0xbb && head[2] == 0xbf)
        {
            fileReader.Close();
            continue;
        }
        fileReader.SeekToBegin();

        // Slurp the whole file in fixed-size chunks. A stack buffer replaces
        // the former new[] / scalar delete pair.
        char buf[256];
        std::string strContent;
        UINT nReadLength = 0;
        while ((nReadLength = fileReader.Read(buf, sizeof(buf))) != 0)
        {
            strContent.append(buf, nReadLength);
        }
        fileReader.Close();

        convertGBToUTF8(strFoundFile, strContent.c_str());
    }
    finder.Close();
}

// Rewrites the file strWritePath as UTF-8 with a BOM.
//
//   strWritePath - path of the file to (re)create
//   gb2312       - null-terminated file content in the system ANSI code
//                  page (CP_ACP)
//
// The text is converted ANSI -> UTF-16 -> UTF-8. The target file is only
// opened after both conversions succeeded, so a failed conversion leaves
// the existing file untouched. On any failure the function returns without
// writing.
void convertGBToUTF8(CString strWritePath, const char* gb2312)
{
    if (gb2312 == NULL)
        return;

    // Step 1: ANSI -> UTF-16. With a source length of -1 the API counts and
    // converts the terminating null as well.
    const int wideLen = MultiByteToWideChar(CP_ACP, 0, gb2312, -1, NULL, 0);
    if (wideLen <= 0)
        return;
    std::wstring wide(wideLen, L'\0');
    if (MultiByteToWideChar(CP_ACP, 0, gb2312, -1, &wide[0], wideLen) == 0)
        return;

    // Step 2: UTF-16 -> UTF-8, again including the terminating null.
    const int utf8Len = WideCharToMultiByte(CP_UTF8, 0, wide.c_str(), -1, NULL, 0, NULL, NULL);
    if (utf8Len <= 0)
        return;
    std::string utf8(utf8Len, '\0');
    const int written = WideCharToMultiByte(CP_UTF8, 0, wide.c_str(), -1, &utf8[0], utf8Len, NULL, NULL);
    if (written <= 0)
        return;

    CFile fp;
    if (!fp.Open(strWritePath, CFile::modeCreate|CFile::modeWrite|CFile::typeBinary, NULL))
        return;

    const unsigned char aryBOM[]  = {0xEF, 0xBB, 0xBF};
    fp.Write(aryBOM, sizeof(aryBOM));

    // `written` counts the terminating null; it must not end up in the file.
    if (written > 1)
        fp.Write(utf8.data(), written - 1);
    fp.Close();
}

//z 2014-04-14 22:04:51 IS2120@BG57IV3 T1381068076.K.F1547169058[T4,L105,R3,V66]

/article/8148088.html

//z 2014-05-06 12:00:46 L.239'43154 BG57IV3@XCL T1109932947.K.F253293061 [T409,L5358,R263,V7006]

3. v2

// ConvertZ.cpp : 定义控制台应用程序的入口点。
//

#include "stdafx.h"
#include "ConvertZ.h"
#include <string>

using namespace std;

#ifdef _DEBUG
#define new DEBUG_NEW
#endif

// 唯一的应用程序对象
CWinApp theApp;

void recursiveFile(CString strFileType);
void convertGBToUTF8(CString strWritePath, const char* gb2312);

// Console entry point.
//
// Initializes MFC and then converts every C/C++ source file found under the
// directory named by argv[1] to UTF-8 (see recursiveFile). When the program
// is not started with exactly one argument a usage text is printed instead.
//
// Returns 1 when MFC cannot be initialized, 0 otherwise (including the
// usage path).
int _tmain(int argc, TCHAR* argv[], TCHAR* envp[])
{
    int nRetCode = 0;

    // Initialize MFC and report an error on failure
    if (!AfxWinInit(::GetModuleHandle(NULL), NULL, ::GetCommandLine(), 0))
    {
        // TODO: change the error code to suit your needs
        _tprintf(_T("错误: MFC 初始化失败\n"));
        nRetCode = 1;
    }
    else
    {
        if(argc != 2)
        {
            // The program path is passed as an argument, never as the
            // format string itself: a '%' inside argv[0] must not be
            // interpreted by _tprintf.
            _tprintf(_T("usage : \n    %s dir\n    dir [sample] : c:\\src\n"), argv[0]);
            return nRetCode;
        }

        // The single argument is the root directory of the source tree
        TCHAR *lpszDirName = argv[1];
        CString strFileType;
        strFileType.Format(_T("%s\\*.*"), lpszDirName);

        // Walk the tree; source files that are not UTF-8 yet get converted
        recursiveFile(strFileType);
    }

    return nRetCode;
}

bool isSrcType(const CString strFileType)
{
CString strExt_R4 = strFileType.Right(4);
CString strExt_R2 = strFileType.Right(2);

if ((strExt_R4.CompareNoCase(_T(".cpp")) == 0)
|| (strExt_R2.CompareNoCase(_T(".c")) == 0)
|| (strExt_R2.CompareNoCase(_T(".h")) == 0)
|| (strExt_R4.CompareNoCase(_T(".cxx")) == 0)
|| (strExt_R4.CompareNoCase(_T(".hpp")) == 0)
)
{
return true;
}

return false;
}

// Recursively walks the files matched by the wildcard pattern strFileType
// (e.g. "c:\\src\\*.*"). Every file accepted by isSrcType that does not
// already start with a UTF-8 BOM is read completely and handed to
// convertGBToUTF8, which rewrites it in place.
void recursiveFile( CString strFileType)
{
    CFileFind finder;
    BOOL bMore = finder.FindFile(strFileType);   // start the enumeration
    while (bMore)
    {
        // Advance first: the finder's accessors describe the entry selected
        // by the most recent FindNextFile call.
        bMore = finder.FindNextFile();
        if (finder.IsDots())                     // skip "." and ".."
            continue;

        const CString strFoundFile = finder.GetFilePath();
        if (finder.IsDirectory())
        {
            // Descend into the sub-directory. The explicit cast keeps the
            // CString from being passed through "..." as an object.
            CString strNextFileType;
            strNextFileType.Format(_T("%s\\*.*"), static_cast<LPCTSTR>(strFoundFile));
            recursiveFile(strNextFileType);
            continue;
        }

        // Only C/C++ source and header files are converted
        if (!isSrcType(strFoundFile))
            continue;

        CFile fileReader(strFoundFile, CFile::modeRead|CFile::typeBinary);

        // A file that already carries the UTF-8 BOM is left alone. The
        // number of bytes actually read is checked so that files shorter
        // than three bytes are never judged on uninitialized data.
        BYTE head[3] = { 0, 0, 0 };
        const UINT nHeadRead = fileReader.Read(head, sizeof(head));
        if (nHeadRead == sizeof(head)
            && head[0] == 0xef && head[1] == 0xbb && head[2] == 0xbf)
        {
            fileReader.Close();
            continue;
        }
        fileReader.SeekToBegin();

        // Slurp the whole file in fixed-size chunks. A stack buffer replaces
        // the former new[] / scalar delete pair.
        char buf[256];
        std::string strContent;
        UINT nReadLength = 0;
        while ((nReadLength = fileReader.Read(buf, sizeof(buf))) != 0)
        {
            strContent.append(buf, nReadLength);
        }
        fileReader.Close();

        convertGBToUTF8(strFoundFile, strContent.c_str());
    }
    finder.Close();
}

// Rewrites the file strWritePath as UTF-8 with a BOM.
//
//   strWritePath - path of the file to (re)create
//   gb2312       - null-terminated file content in the system ANSI code
//                  page (CP_ACP)
//
// The text is converted ANSI -> UTF-16 -> UTF-8 using explicit lengths, so
// no terminating null is produced or written. Empty content yields a file
// that contains only the BOM. The target file is opened only after the
// conversion succeeded; on any failure the function returns without
// touching it.
void convertGBToUTF8(CString strWritePath, const char* gb2312)
{
    std::string utf8;

    const int ngblen = (gb2312 != NULL) ? static_cast<int>(strlen(gb2312)) : 0;
    if (ngblen > 0)
    {
        // Step 1: ANSI -> UTF-16
        const int wideLen = MultiByteToWideChar(CP_ACP, 0, gb2312, ngblen, NULL, 0);
        if (wideLen <= 0)
            return;
        std::wstring wide(wideLen, L'\0');
        if (MultiByteToWideChar(CP_ACP, 0, gb2312, ngblen, &wide[0], wideLen) == 0)
            return;

        // Step 2: UTF-16 -> UTF-8
        const int utf8Len = WideCharToMultiByte(CP_UTF8, 0, wide.data(), wideLen, NULL, 0, NULL, NULL);
        if (utf8Len <= 0)
            return;
        utf8.resize(utf8Len);
        if (WideCharToMultiByte(CP_UTF8, 0, wide.data(), wideLen, &utf8[0], utf8Len, NULL, NULL) == 0)
            return;
    }

    CFile fp;
    if (!fp.Open(strWritePath, CFile::modeCreate|CFile::modeWrite|CFile::typeBinary, NULL))
        return;

    const unsigned char aryBOM[]  = {0xEF, 0xBB, 0xBF};
    fp.Write(aryBOM, sizeof(aryBOM));
    if (!utf8.empty())
        fp.Write(utf8.data(), static_cast<UINT>(utf8.size()));
    fp.Close();
}

//z 2014-05-22 16:55:50 L.223'25450 BG57IV3 T427209771 .K.F253293061 [T484,L6693,R325,V8206]

Simple Character Encoding Detection

By Ghosuwa
Wogomon, 23 Oct 2013

4.75 (3 votes)

Rate:

vote
1vote
2vote
3vote
4vote
5

Introduction

One very commonly asked question in programming is how to detect the character encoding of a string. Well, I'm going to share a cool method I came up with that can detect if a string is UTF-8, UTF-16BE, UTF-16LE, UTF-32BE, or UTF-32LE in just 4 lines of code.

Explanation

We'll be working with null-terminated strings, so the first rule is that we must terminate all strings with a quadruple null, regardless of encoding. You may wish to add a definition such as the following:

Collapse | Copy
Code

// Three explicit NUL bytes; together with the NUL that every string literal
// already ends with, a literal followed by NT ends in a quadruple null.
#define NT "\0\0\0"

// Stored as an array rather than a pointer to the literal: a string literal
// must not be bound to a non-const char *, and the array still converts to
// the char * that String_GetEncoding expects.
char exampleString[] = "This is UTF-8" NT;

Next is an explanation of how the checking works.

Collapse | Copy
Code

1.===== If a string doesn't contain nulls, it's UTF-8
:
else
:
2:===== If a string doesn't contain double nulls, it's UTF-16
:--.
: 3:== If the nulls are on odd-numbered indices, it's UTF-16LE
:  :
: else
:  :
: 4'== The string defaults to UTF-16BE
:
else
:
5:===== If the index modulo 4 is 0 and the character is greater than
:      0x7F, the string is UTF-32LE. This is because the range of
:      UTF-32 only goes up to 0x7FFFFFFF, meaning approximately 22%
:      of the characters that can be represented will validate that
:      the string is not big endian; including a BOM.
:
else
:
6'===== The string defaults to UTF-32BE

The Code

We check every byte until we reach a quadruple null:

Collapse | Copy
Code

/*
 * Guesses the encoding of `string`, which must end in four consecutive
 * zero bytes.
 *
 * Returns the number of indicator bits that were collected while scanning:
 *   0 = UTF-8, 1 = UTF-16BE, 2 = UTF-16LE, 3 = UTF-32BE, 4 = UTF-32LE
 *
 * Indicator bits:
 *   1 - a zero byte was seen
 *   2 - a zero byte was seen at an odd index
 *   4 - a zero byte was directly followed by another zero byte
 *   8 - a byte above 0x7F was seen while bits 1 and 2 were both clear
 */
int String_GetEncoding(char *string)
{
    unsigned pos = 0;
    unsigned bits = 0;
    unsigned mask;
    int result = 0;

    /* Scan until the quadruple null terminator is reached. */
    for (;;)
    {
        unsigned byte;

        if (!(string[pos] | string[pos + 1] | string[pos + 2] | string[pos + 3]))
            break;

        byte = string[pos];
        ++pos;                      /* pos now indexes the following byte */

        if (byte)
        {
            if (bits % 4 == 0 && byte > 0x7F)
                bits |= 8;
        }
        else
        {
            bits |= 1;
            if ((pos & 1) == 0)     /* the zero byte sat at an odd index */
                bits |= 2;
            if (string[pos] == 0)   /* two zero bytes in a row */
                bits |= 4;
        }
    }

    /* The result is the count of indicator bits that are set. */
    for (mask = 1; mask <= 8; mask <<= 1)
    {
        if (bits & mask)
            ++result;
    }
    return result;
}

The output:

Collapse | Copy
Code

0  = UTF-8
1  = UTF-16BE
2  = UTF-16LE
3  = UTF-32BE
4  = UTF-32LE

Notes

Since UTF-32 encoding can contain several null bytes, its byte order checking is done through an alternative method that doesn't work 100% of the time, e.g., if all the characters are within the ASCII range and there isn't a BOM, it'll return UTF-32BE when
it might actually be UTF-32LE.

This isn't really a big issue since UTF-32 is never used for storage, so chances are anyone that might use it will already know the byte ordering without having to check. However, if you're OCD, you could perform an additional check by treating UTF-32BE as UTF-16 and determining that string's byte ordering.

License

This article, along with any associated source code and files, is licensed under The
Code Project Open License (CPOL)

About the Author

Ghosuwa
Wogomon

United States

IsTextUnicode function

Determines if a buffer is likely to contain a form of Unicode text.

Syntax

C++

BOOL IsTextUnicode(
_In_         const VOID *lpv,
_In_         int iSize,
_Inout_opt_  LPINT lpiResult
);

IMultiLanguage2::DetectInputCodepage method

2 out of 4 rated this helpful

Detects the code page of the given string.

Syntax

C++

HRESULT DetectInputCodepage(
[in]       DWORD dwFlag,
[in]       DWORD dwPrefWinCodePage,
[in]       __wchar_t *pSrcStr,
[in, out]  INT *pcSrcSize,
[in, out]  DetectEncodingInfo *lpEncoding,
[in, out]  INT *pnScores
);

Parameters

dwFlag [in]

One of the MLDETECTCP-defined bit flag values that specify the type
of incoming source text. Setting the bit flags helps the detection engines produce more accurate results.
dwPrefWinCodePage [in]

The preferred Windows code page. If this value is set to zero, this API returns all possible encodings. Otherwise, it lists only those encodings related to this parameter.
pSrcStr [in]

The source string for which the client wants to detect the code page.
pcSrcSize [in, out]

The address of the buffer that stores the size of pSrcStr, in bytes. When this method is successful, it returns the number of bytes processed to this buffer.
lpEncoding [in, out]

A pointer to an array of DetectEncodingInfo structures where the detection
information is returned.
pnScores [in, out]

A pointer to a buffer that contains the number of DetectEncodingInfo structures
allocated in lpEncoding. When this method is successful, this parameter returns the number of elements of lpEncoding that are filled in.

Return value

Returns one of the following values.

Return code	Description
S_OK	Success.
S_FALSE	The method cannot determine the code page of the input stream.
E_FAIL	An error occurred.

Remarks

The caller is responsible for allocating and freeing the lpEncoding array.

Requirements

Minimum supported client	Windows XP
Minimum supported server	Windows 2000 Server
Header	Mlang.h
IDL	Mlang.idl
DLL	Mlang.dll

Detect encoding of a string in C/C++

Assuming you know the length of the input array, you can make the following guesses:

First, check to see if the first few bytes match any well-known byte order marks (BOM) for Unicode. If they do, you're done!

Next, search for '\0' before the last byte. If you find one, you might be dealing with UTF-16 or UTF-32. If you find multiple consecutive '\0's, it's probably UTF-32.

If any character is in the range 0x80 to 0xff, it's certainly not ASCII or UTF-7. If you are restricting your input to some variant of Unicode, you can assume it's UTF-8. Otherwise, you have to do some guessing to determine which multi-byte character set it is. That will not be fun.

At this point it is either: ASCII, UTF-7, Base64, or ranges of UTF-16 or UTF-32 that just happen to not use the top bit and do not have any null characters.

share|improve
this answer

answered Sep 23 '11 at 1:42

MSN

29.8k23661

It's not an easy problem to solve, and generally relies on heuristics to take a best guess at what the input encoding is, which can be tripped up by relatively innocuous inputs - for example, take a look at this
Wikipedia article and The
Notepad file encoding Redux for more details.

If you're looking for a Windows-only solution with minimal dependencies, you can look at using a combination of IsTextUnicode and
MLang's DetectInputCodePage to
attempt character set detection.

If you are looking for portability, but don't mind taking on a fairly large dependency in the form of ICU, then you can make use of its character set detection routines to achieve the same thing in a portable manner.

share|improve
this answer

answered Sep 23 '11 at 1:49

russw_uk

53134

The Notepad file encoding problem, redux

RATE THIS

17 Apr 2007 10:00 AM

65

About every ten months, somebody new discovers the Notepad file
encoding problem. Let's see what else there is to say about it.
First of all, can we change Notepad's detection algorithm? The problem is that there are a lot of different text files out there. Let's look just at the ones that Notepad supports.

8-bit ANSI (of which 7-bit ASCII is a subset). These have no BOM; they just dive right in with bytes of text. They are also probably the most common type of text file.
UTF-8. These usually begin with a BOM but not always.
Unicode big-endian (UTF-16BE). These usually begin with a BOM but not always.
Unicode little-endian (UTF-16LE). These usually begin with a BOM but not always.

If a BOM is found, then life is easy, since the BOM tells you what encoding the file uses. The problem is when there is no BOM. Now you have to guess, and when you guess, you can guess wrong. For example, consider this file:

D0 AE

Depending on which encoding you assume, you get very different results.

If you assume 8-bit ANSI (with code page 1252), then the file consists of the two characters

U+00D0 U+00AE

, or "Ð®". Sure this looks strange, but maybe it's part of the
word VATNIÐ® which might be the name of an Icelandic hotel.
If you assume UTF-8, then the file consists of the single Cyrillic character

U+042E

, or "Ю".
If you assume Unicode big-endian, then the file consists of the Korean Hangul syllable

U+D0AE

, or "킮".
If you assume Unicode little-endian, then the file consists of the Korean Hangul syllable

U+AED0

, or "껐".

Okay, so this file can be interpreted in four different ways. Are you going to use the "try to guess" algorithm from IsTextUnicode? (Michael Kaplan has some thoughts on this subject.) If so, then you are right where Notepad is today. Notice that all four interpretations are linguistically plausible.
Some people might say that the rule should be "All files without a BOM are 8-bit ANSI." In that case, you're going to misinterpret all the files that use UTF-8 or UTF-16 and don't have a BOM. Note that the Unicode standard even advises against using
a BOM for UTF-8, so you're already throwing out everybody who follows the recommendation.
Okay, given that the Unicode folks recommend against using a BOM for UTF-8, maybe your rule is "All files without a BOM are UTF-8." Well, that messes up all 8-bit ANSI files that use characters above 127.
Maybe you're willing to accept that ambiguity, and use the rule, "If the file looks like valid UTF-8, then use UTF-8; otherwise use 8-bit ANSI, but under no circumstances should you treat the file as UTF-16LE or UTF-16BE." In other
words, "never auto-detect UTF-16". First, you still have ambiguous cases, like the file above, which could be either 8-bit ANSI or UTF-8. And second, you are going to be flat-out wrong when you run into a Unicode file that lacks a BOM, since you're going to
misinterpret it as either UTF-8 or (more likely) 8-bit ANSI. You might decide that programs that generate UTF-16 files without a BOM are broken, but that doesn't mean that they don't exist. For example,

cmd /u /c dir >results.txt

This generates a UTF-16LE file without a BOM. If you poke around your Windows directory, you'll probably find other Unicode files without a BOM. (For example, I found COM+.log.) These files still "worked" under the old IsTextUnicode algorithm, but now they are unreadable. Maybe you consider that an acceptable loss.
The point is that no matter how you decide to resolve the ambiguity, somebody will win and somebody else will lose. And then people can start experimenting with the "losers" to find one that makes your algorithm look stupid for choosing
"incorrectly".

//////////////////////////////////////////////////////////////////////////
//
// FILE: utf8conv.h
//
// Header file defining helper functions for converting strings
// between Unicode UTF-8 and UTF-16.
//
// UTF-8 is stored in std::string; UTF-16 is stored in std::wstring.
//
// This code just uses Win32 Platform SDK and C++ standard library;
// so it can be used also with the Express editions of Visual Studio.
//
//
// February 4th, 2011
//
// by Giovanni Dicanio <gdicanio@mvps.org>
//
//////////////////////////////////////////////////////////////////////////

#pragma once

//------------------------------------------------------------------------
//                              INCLUDES
//------------------------------------------------------------------------

#include <stdarg.h>     // variable argument lists...
#include <stdio.h>      // ...and vsprintf_s

#include <exception>    // std::exception
#include <string>       // STL stringclasses

#include <Windows.h>    // Win32 Platform SDK main header

namespace utf8util {

//------------------------------------------------------------------------
// Exception class representing an error occurred during UTF-8 conversion.
//------------------------------------------------------------------------
class utf8_error
    : public std::exception
{
public:

    // Constructs an utf8_error with a message string that can use a
    // printf-like syntax for formatting. Messages that do not fit into the
    // internal buffer are truncated.
    explicit utf8_error(const char * format, ...);

    // Override from std::exception::what().
    // Returns the formatted, null-terminated error message.
    const char * what() const noexcept override;

    //
    // IMPLEMENTATION
    //
private:
    char m_message[512];    // buffer for error message
};

inline utf8_error::utf8_error(const char * format, ...)
{
    m_message[0] = '\0';

    // Format error message in buffer. vsnprintf never writes past the given
    // size, so an over-long message is cut short instead of being treated
    // as a fatal error.
    va_list args;
    va_start(args, format);
    const int written = vsnprintf(m_message, sizeof(m_message), format, args);
    va_end(args);

    if (written < 0)
        m_message[0] = '\0';                    // formatting failed: no message
    m_message[sizeof(m_message) - 1] = '\0';    // always terminated
}

inline const char * utf8_error::what() const noexcept
{
    return m_message;
}

//------------------------------------------------------------------------

//------------------------------------------------------------------------
// Converts a string from UTF-8 to UTF-16.
// On error, can throw an utf8_error exception.
//
// An empty input yields an empty result. The conversion is requested with
// default flags (0).
//------------------------------------------------------------------------
inline std::wstring utf16_from_utf8(const std::string& utf8)
{
    //
    // Special case of empty input string
    //
    if (utf8.empty())
        return std::wstring();

    //
    // The Win32 API takes the source length as an int. Refuse inputs whose
    // size does not survive that narrowing instead of converting garbage.
    //
    const int utf8_length = static_cast<int>(utf8.length());
    if (utf8_length <= 0
        || static_cast<std::string::size_type>(utf8_length) != utf8.length())
    {
        throw utf8_error(
            "Can't convert string from UTF-8 to UTF-16 (input string is too long).");
    }

    //
    // Get length (in wchar_t's) of resulting UTF-16 string
    //
    const int utf16_length = ::MultiByteToWideChar(
        CP_UTF8,            // convert from UTF-8
        0,                  // default flags
        utf8.data(),        // source UTF-8 string
        utf8_length,        // length (in chars) of source UTF-8 string
        NULL,               // unused - no conversion done in this step
        0                   // request size of destination buffer, in wchar_t's
        );
    if (utf16_length == 0)
    {
        // Error
        DWORD error = ::GetLastError();
        throw utf8_error(
            "Can't get length of UTF-16 string (MultiByteToWideChar set last error to %lu).",
            error);
    }

    //
    // Allocate destination buffer for UTF-16 string
    //
    std::wstring utf16;
    utf16.resize(utf16_length);

    //
    // Do the conversion from UTF-8 to UTF-16
    //
    if ( ! ::MultiByteToWideChar(
        CP_UTF8,            // convert from UTF-8
        0,                  // default flags
        utf8.data(),        // source UTF-8 string
        utf8_length,        // length (in chars) of source UTF-8 string
        &utf16[0],          // destination buffer
        utf16_length        // size of destination buffer, in wchar_t's
        ) )
    {
        // Error
        DWORD error = ::GetLastError();
        throw utf8_error(
            "Can't convert string from UTF-8 to UTF-16 (MultiByteToWideChar set last error to %lu).",
            error);
    }

    //
    // Return resulting UTF-16 string
    //
    return utf16;
}

//------------------------------------------------------------------------
// Converts a string from UTF-16 to UTF-8.
// On error, can throw an utf8_error exception.
//
// An empty input yields an empty result. The conversion is requested with
// default flags (0).
//------------------------------------------------------------------------
inline std::string utf8_from_utf16(const std::wstring& utf16)
{
    //
    // Special case of empty input string
    //
    if (utf16.empty())
        return std::string();

    //
    // The Win32 API takes the source length as an int. Refuse inputs whose
    // size does not survive that narrowing instead of converting garbage.
    //
    const int utf16_length = static_cast<int>(utf16.length());
    if (utf16_length <= 0
        || static_cast<std::wstring::size_type>(utf16_length) != utf16.length())
    {
        throw utf8_error(
            "Can't convert string from UTF-16 to UTF-8 (input string is too long).");
    }

    //
    // Get length (in chars) of resulting UTF-8 string
    //
    const int utf8_length = ::WideCharToMultiByte(
        CP_UTF8,            // convert to UTF-8
        0,                  // default flags
        utf16.data(),       // source UTF-16 string
        utf16_length,       // source string length, in wchar_t's,
        NULL,               // unused - no conversion required in this step
        0,                  // request buffer size
        NULL, NULL          // unused
        );
    if (utf8_length == 0)
    {
        // Error
        DWORD error = ::GetLastError();
        throw utf8_error(
            "Can't get length of UTF-8 string (WideCharToMultiByte set last error to %lu).",
            error);
    }

    //
    // Allocate destination buffer for UTF-8 string
    //
    std::string utf8;
    utf8.resize(utf8_length);

    //
    // Do the conversion from UTF-16 to UTF-8
    //
    if ( ! ::WideCharToMultiByte(
        CP_UTF8,                // convert to UTF-8
        0,                      // default flags
        utf16.data(),           // source UTF-16 string
        utf16_length,           // source string length, in wchar_t's,
        &utf8[0],               // destination buffer
        utf8_length,            // destination buffer size, in chars
        NULL, NULL              // unused
        ) )
    {
        // Error
        DWORD error = ::GetLastError();
        throw utf8_error(
            "Can't convert string from UTF-16 to UTF-8 (WideCharToMultiByte set last error to %lu).",
            error);
    }

    //
    // Return resulting UTF-8 string
    //
    return utf8;
}

} // namespace utf8util

//////////////////////////////////////////////////////////////////////////

//////////////////////////////////////////////////////////////////////////
//
// FILE: TestUTF8Conversion.cpp
//
// Defines the entry point for the console test application.
//
// By Giovanni Dicanio <gdicanio@mvps.org>
//
//////////////////////////////////////////////////////////////////////////

#include "stdafx.h"       // precompiled headers
#include "utf8conv.h"     // UTF-8 conversion helpers

using namespace std;
using namespace utf8util;

//------------------------------------------------------------------------
// Some tests for UTF-8 <-> UTF-16 conversion.
//------------------------------------------------------------------------
void test()
{
//
// Test a simple UTF-16 <-> UTF-8 conversion
//

// Source UTF-16 string
wstringutf16(L"Euro sign (U+20AC): \x20AC");

// Convert from UTF-16 to UTF-8
stringutf8 = utf8_from_utf16(utf16);

// Convert back from UTF-8 to UTF-16
wstringutf16_new = utf16_from_utf8(utf8);

// Check conversion result
if (utf16_new != utf16)
throw runtime_error("UTF-16 <-> UTF-8 conversion failed.");

//
// Test with empty strings
//
if (! utf16_from_utf8("").empty())
throw runtime_error("Empty UTF-8 stringnot converted to empty UTF-16 string.");

if (! utf8_from_utf16(L"").empty())
throw runtime_error("Empty UTF-16 stringnot converted to empty UTF-8 string.");

//
// Test with invalid UTF-8 bytes
//

// 0xC0 0xAF UTF-8 sequence is discussed in "Writing Secure Code"
// (Chapter 11, "How UTF-8 Encodes Data", page 380)
char utf8_invalid[] = "UTF-8 invalid sequence: \xC0\xAF";
wstringutf16_invalid = utf16_from_utf8(utf8_invalid);
//
// Unicode UTF-16 'REPLACEMENT CHARACTER' (U+FFFD)
// is used for the invalid UTF-8 bytes.
//
// http://www.fileformat.info/info/unicode/char/fffd/index.htm //
}

//------------------------------------------------------------------------
// Entry-point.
//------------------------------------------------------------------------
int wmain(int argc, wchar_t* argv[])
{
static const int ok = 0;
static const int fail = 1;
int exit_code = ok;

try
{
cout << "*** Testing UTF-8 <-> UTF-16 Conversion ***" << endl;
test();
cout << "All right." << endl;
}
catch(const exception & e)
{
cerr << "*** ERROR: " << e.what() << endl;
exit_code = fail;
}

return exit_code;
}

//////////////////////////////////////////////////////////////////////////

内容来自用户分享和网络整理，不保证内容的准确性，如有侵权内容，可联系管理员处理

标签：

相关文章推荐

新的分享

章节导航

WideCharToMultiByte

Simple Character Encoding Detection

Introduction

Explanation

The Code

Notes

License

About the Author

IsTextUnicode function

Syntax

Syntax

Parameters

Return value

Remarks

Requirements

See also

Detect encoding of a string in C/C++

The Notepad file encoding problem, redux

Detect
encoding of a stringin C/C++