您的位置:首页 > 编程语言 > Java开发

Java读取UTF-8格式txt文件第一行出现乱码——问号“?”及解决

2016-03-04 09:37 781 查看
今天导入文件时,第一行开头出现了乱码问号“?”。经核查,这是 UTF-8 文件开头的 BOM(字节顺序标记)导致的;经过查找,找到了一个比较好的解决方法。

使用一个更强大点的工具类(可以支持UTF-8/UTF-16LE/UTF-16BE/UTF-32LE/UTF-32BE):

参考地址:http://koti.mbnet.fi/akini/java/unicodereader/, 下载其中两个文件:UnicodeInputStream 和 UnicodeReader

附上这两个类的代码:
/**

version: 1.1 / 2007-01-25

- changed BOM recognition ordering (longer boms first)

Original pseudocode : Thomas Weidenfeller

Implementation tweaked: Aki Nieminen
http://www.unicode.org/unicode/faq/utf_bom.html
BOMs in byte length ordering:

00 00 FE FF = UTF-32, big-endian

FF FE 00 00 = UTF-32, little-endian

EF BB BF = UTF-8,

FE FF = UTF-16, big-endian

FF FE = UTF-16, little-endian

Win2k Notepad:

Unicode format = UTF-16LE

***/

import java.io.*;

/**
 * Input stream that recognizes a leading Unicode byte order mark (BOM).
 *
 * <p>The BOM is detected and skipped only if {@link #getEncoding()} is called
 * before any of the {@code read(...)} methods or {@link #close()}; reading
 * first disables detection and passes every byte through unchanged.
 *
 * <p>Usage pattern:
 * <pre>{@code
 * String enc = "ISO-8859-1"; // or null to use the system default
 * FileInputStream fis = new FileInputStream(file);
 * UnicodeInputStream uin = new UnicodeInputStream(fis, enc);
 * enc = uin.getEncoding(); // check and skip possible BOM bytes
 * InputStreamReader in;
 * if (enc == null) in = new InputStreamReader(uin);
 * else in = new InputStreamReader(uin, enc);
 * }</pre>
 */
public class UnicodeInputStream extends InputStream {

    /** Length in bytes of the longest BOM that is recognized (UTF-32). */
    private static final int BOM_SIZE = 4;

    PushbackInputStream internalIn;

    boolean isInited = false;

    String defaultEnc;

    String encoding;

    /**
     * Wraps {@code in} so that a leading BOM can be detected.
     *
     * @param in         stream to read from
     * @param defaultEnc encoding reported by {@link #getEncoding()} when no BOM
     *                   is found; may be {@code null}
     */
    public UnicodeInputStream(InputStream in, String defaultEnc) {
        // The pushback buffer must hold every look-ahead byte that turns out
        // not to belong to a BOM.
        internalIn = new PushbackInputStream(in, BOM_SIZE);
        this.defaultEnc = defaultEnc;
    }

    /**
     * Returns the default encoding given to the constructor.
     *
     * @return the default encoding, possibly {@code null}
     */
    public String getDefaultEncoding() {
        return defaultEnc;
    }

    /**
     * Returns the encoding indicated by the BOM, running BOM detection first
     * if the stream has not been initialized yet.
     *
     * @return the detected encoding name, or the default encoding when no BOM
     *         was found; {@code null} when the stream was read or closed
     *         before detection ran
     * @throws IllegalStateException if reading the look-ahead bytes fails; the
     *                               underlying {@link IOException} is the cause
     */
    public String getEncoding() {
        if (!isInited) {
            try {
                init();
            } catch (IOException ex) {
                // Chain the real I/O failure. The previous code passed the new
                // exception to its own initCause, which is rejected as
                // self-causation and masked the actual error.
                throw new IllegalStateException("Init method failed.", ex);
            }
        }
        return encoding;
    }

    /**
     * Reads ahead up to four bytes and checks for a BOM. Bytes that are not
     * part of a BOM are pushed back to the stream; only BOM bytes are skipped.
     *
     * @throws IOException if reading from or pushing back to the stream fails
     */
    protected void init() throws IOException {
        if (isInited) {
            return;
        }

        byte[] bom = new byte[BOM_SIZE];

        // A single read may legally return fewer bytes than requested, so keep
        // reading until the look-ahead buffer is full or the stream ends.
        int n = 0;
        while (n < BOM_SIZE) {
            int r = internalIn.read(bom, n, BOM_SIZE - n);
            if (r <= 0) {
                break;
            }
            n += r;
        }

        // Longer BOMs are tested first. Each test also requires that enough
        // bytes were actually read, so unread (zero) slots of the buffer are
        // never mistaken for data.
        int bomLength;
        if (n >= 4 && bom[0] == (byte) 0x00 && bom[1] == (byte) 0x00
                && bom[2] == (byte) 0xFE && bom[3] == (byte) 0xFF) {
            encoding = "UTF-32BE";
            bomLength = 4;
        } else if (n >= 4 && bom[0] == (byte) 0xFF && bom[1] == (byte) 0xFE
                && bom[2] == (byte) 0x00 && bom[3] == (byte) 0x00) {
            encoding = "UTF-32LE";
            bomLength = 4;
        } else if (n >= 3 && bom[0] == (byte) 0xEF && bom[1] == (byte) 0xBB
                && bom[2] == (byte) 0xBF) {
            encoding = "UTF-8";
            bomLength = 3;
        } else if (n >= 2 && bom[0] == (byte) 0xFE && bom[1] == (byte) 0xFF) {
            encoding = "UTF-16BE";
            bomLength = 2;
        } else if (n >= 2 && bom[0] == (byte) 0xFF && bom[1] == (byte) 0xFE) {
            encoding = "UTF-16LE";
            bomLength = 2;
        } else {
            // Unicode BOM not found: push back everything that was read.
            encoding = defaultEnc;
            bomLength = 0;
        }

        if (n > bomLength) {
            internalIn.unread(bom, bomLength, n - bomLength);
        }

        isInited = true;
    }

    /**
     * Closes the wrapped stream. BOM detection is disabled afterwards.
     *
     * @throws IOException if closing the wrapped stream fails
     */
    @Override
    public void close() throws IOException {
        isInited = true;
        internalIn.close();
    }

    /**
     * Reads the next byte. If BOM detection has not run yet it is disabled, so
     * a BOM (if any) is returned as ordinary data.
     *
     * @return the next byte as an int in the range 0-255, or -1 at end of stream
     * @throws IOException if the wrapped stream fails
     */
    @Override
    public int read() throws IOException {
        isInited = true;
        return internalIn.read();
    }

    /**
     * Reads up to {@code len} bytes in one call to the wrapped stream instead
     * of the inherited byte-at-a-time loop. Like {@link #read()}, this
     * disables BOM detection if it has not run yet.
     *
     * @param b   destination buffer
     * @param off offset in {@code b} at which to start storing bytes
     * @param len maximum number of bytes to read
     * @return the number of bytes read, or -1 at end of stream
     * @throws IOException if the wrapped stream fails
     */
    @Override
    public int read(byte[] b, int off, int len) throws IOException {
        isInited = true;
        return internalIn.read(b, off, len);
    }

}

package Test;

/**

version: 1.1 / 2007-01-25

- changed BOM recognition ordering (longer boms first)

Original pseudocode : Thomas Weidenfeller

Implementation tweaked: Aki Nieminen
http://www.unicode.org/unicode/faq/utf_bom.html
BOMs:

00 00 FE FF = UTF-32, big-endian

FF FE 00 00 = UTF-32, little-endian

EF BB BF = UTF-8,

FE FF = UTF-16, big-endian

FF FE = UTF-16, little-endian

Win2k Notepad:

Unicode format = UTF-16LE

***/

import java.io.*;

/**
 * Generic Unicode text reader that uses a leading byte order mark (BOM) to
 * identify the encoding. If no BOM is found, the given default encoding is
 * used, or the system default when that is {@code null}.
 */
public class UnicodeReader extends Reader {

    /** Length in bytes of the longest BOM that is recognized (UTF-32). */
    private static final int BOM_SIZE = 4;

    PushbackInputStream internalIn;

    /** Decoding reader; stays {@code null} until {@link #init()} has run. */
    InputStreamReader internalIn2 = null;

    String defaultEnc;

    /**
     * Creates a reader over {@code in}. Nothing is read until the first call
     * to {@link #init()} or {@code read(...)}.
     *
     * @param in         input stream to be read
     * @param defaultEnc default encoding if the stream does not have a BOM;
     *                   give {@code null} to use the system-level default
     */
    public UnicodeReader(InputStream in, String defaultEnc) {
        // The pushback buffer must hold every look-ahead byte that turns out
        // not to belong to a BOM.
        internalIn = new PushbackInputStream(in, BOM_SIZE);
        this.defaultEnc = defaultEnc;
    }

    /**
     * Returns the default encoding given to the constructor.
     *
     * @return the default encoding, possibly {@code null}
     */
    public String getDefaultEncoding() {
        return defaultEnc;
    }

    /**
     * Gets the stream encoding, or {@code null} if the reader is
     * uninitialized. Call {@link #init()} or a {@code read} method to
     * initialize it.
     *
     * @return the encoding name as reported by the underlying
     *         {@link InputStreamReader#getEncoding()}, or {@code null}
     */
    public String getEncoding() {
        if (internalIn2 == null) {
            return null;
        }
        return internalIn2.getEncoding();
    }

    /**
     * Reads ahead up to four bytes and checks for a BOM. Bytes that are not
     * part of a BOM are pushed back to the stream; only BOM bytes are skipped.
     * Afterwards the decoding reader is created with the detected encoding.
     *
     * @throws IOException if reading fails or the encoding is not supported
     */
    protected void init() throws IOException {
        if (internalIn2 != null) {
            return;
        }

        byte[] bom = new byte[BOM_SIZE];

        // A single read may legally return fewer bytes than requested, so keep
        // reading until the look-ahead buffer is full or the stream ends.
        int n = 0;
        while (n < BOM_SIZE) {
            int r = internalIn.read(bom, n, BOM_SIZE - n);
            if (r <= 0) {
                break;
            }
            n += r;
        }

        // Longer BOMs are tested first. Each test also requires that enough
        // bytes were actually read, so unread (zero) slots of the buffer are
        // never mistaken for data.
        String encoding;
        int bomLength;
        if (n >= 4 && bom[0] == (byte) 0x00 && bom[1] == (byte) 0x00
                && bom[2] == (byte) 0xFE && bom[3] == (byte) 0xFF) {
            encoding = "UTF-32BE";
            bomLength = 4;
        } else if (n >= 4 && bom[0] == (byte) 0xFF && bom[1] == (byte) 0xFE
                && bom[2] == (byte) 0x00 && bom[3] == (byte) 0x00) {
            encoding = "UTF-32LE";
            bomLength = 4;
        } else if (n >= 3 && bom[0] == (byte) 0xEF && bom[1] == (byte) 0xBB
                && bom[2] == (byte) 0xBF) {
            encoding = "UTF-8";
            bomLength = 3;
        } else if (n >= 2 && bom[0] == (byte) 0xFE && bom[1] == (byte) 0xFF) {
            encoding = "UTF-16BE";
            bomLength = 2;
        } else if (n >= 2 && bom[0] == (byte) 0xFF && bom[1] == (byte) 0xFE) {
            encoding = "UTF-16LE";
            bomLength = 2;
        } else {
            // Unicode BOM not found: push back everything that was read.
            encoding = defaultEnc;
            bomLength = 0;
        }

        if (n > bomLength) {
            internalIn.unread(bom, bomLength, n - bomLength);
        }

        // Use the detected/given encoding, or the platform default when none.
        if (encoding == null) {
            internalIn2 = new InputStreamReader(internalIn);
        } else {
            internalIn2 = new InputStreamReader(internalIn, encoding);
        }
    }

    /**
     * Closes the reader and the underlying stream.
     *
     * <p>If the reader was never initialized, the underlying stream is closed
     * directly without reading from it first, so a failing or blocking stream
     * can still be released.
     *
     * @throws IOException if closing fails
     */
    @Override
    public void close() throws IOException {
        if (internalIn2 != null) {
            internalIn2.close();
        } else {
            internalIn.close();
        }
    }

    /**
     * Reads characters into a portion of an array, running BOM detection on
     * the first call.
     *
     * @param cbuf destination buffer
     * @param off  offset at which to start storing characters
     * @param len  maximum number of characters to read
     * @return the number of characters read, or -1 at end of stream
     * @throws IOException if reading fails
     */
    @Override
    public int read(char[] cbuf, int off, int len) throws IOException {
        init();
        return internalIn2.read(cbuf, off, len);
    }

}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: