您的位置:首页 > 编程语言 > Java开发

Java 识别文件编码格式(不依赖第三方 JAR 包)

2015-08-31 23:14 567 查看

Java 识别文件编码格式的代码

大部分转自:http://blog.csdn.net/txtdown0909/article/details/7933054

package com.caoyong.file;

import java.io.BufferedReader;

import java.io.File;

import java.io.FileInputStream;

import java.io.FileNotFoundException;

import java.io.IOException;

import java.io.InputStreamReader;

import java.util.ArrayList;

import java.util.List;

/**
 * File reading helpers.
 *
 * <p>Both helpers are best-effort: an I/O failure is reported with
 * {@code printStackTrace()} and an empty result is returned instead of an
 * exception being thrown. Every stream opened here is closed before returning.
 *
 * @author cy
 */
public class FileOperateUtil
{
    /** Smallest buffer allocated when reading a file into memory. */
    private static final int MIN_BUFFER_SIZE = 1024;

    /**
     * Reads the whole content of a file into a byte array.
     *
     * @param file the file to read
     * @return the file content, or an empty array if the file cannot be opened or read
     */
    public static byte[] readFileToByteArray(File file)
    {
        FileInputStream fis = null;
        try
        {
            fis = new FileInputStream(file);
            // available() is only an estimate, so it merely sizes the first buffer;
            // the loop below keeps reading until end of stream is actually reported.
            byte[] buffer = new byte[Math.max(fis.available(), MIN_BUFFER_SIZE)];
            int used = 0;
            while (true)
            {
                if (used == buffer.length)
                {
                    // Probe with a single byte so a file that exactly fills the
                    // buffer does not force a pointless doubling.
                    int probe = fis.read();
                    if (probe < 0)
                    {
                        break;
                    }
                    buffer = resize(buffer, grownCapacity(buffer.length, file));
                    buffer[used++] = (byte) probe;
                }
                int count = fis.read(buffer, used, buffer.length - used);
                if (count < 0)
                {
                    break;
                }
                used += count;
            }
            return resize(buffer, used);
        }
        catch (IOException e)
        {
            // Also covers FileNotFoundException, which is an IOException.
            e.printStackTrace();
        }
        finally
        {
            if (fis != null)
            {
                try
                {
                    fis.close();
                }
                catch (IOException e)
                {
                    e.printStackTrace();
                }
            }
        }
        return new byte[0];
    }

    /**
     * Reads a text file line by line.
     *
     * @param file        the file to read
     * @param charsetName name of the charset used to decode the file
     * @return the lines read so far, without line terminators; empty if the file
     *         cannot be opened or the charset is not supported
     */
    public static List<String> readFileToList(File file, String charsetName)
    {
        List<String> lines = new ArrayList<String>();
        FileInputStream fis = null;
        BufferedReader br = null;
        try
        {
            fis = new FileInputStream(file);
            br = new BufferedReader(new InputStreamReader(fis, charsetName));
            String line;
            while ((line = br.readLine()) != null)
            {
                lines.add(line);
            }
        }
        catch (IOException e)
        {
            // Covers FileNotFoundException and UnsupportedEncodingException too.
            e.printStackTrace();
        }
        finally
        {
            try
            {
                if (br != null)
                {
                    // Closing the reader closes the underlying stream as well.
                    br.close();
                }
                else if (fis != null)
                {
                    // The reader was never created (e.g. unsupported charset).
                    fis.close();
                }
            }
            catch (IOException e)
            {
                e.printStackTrace();
            }
        }
        return lines;
    }

    /** Returns the next buffer capacity, failing if doubling would overflow an int. */
    private static int grownCapacity(int current, File file) throws IOException
    {
        if (current > Integer.MAX_VALUE / 2)
        {
            throw new IOException("File is too large to read into a byte array: " + file);
        }
        return current * 2;
    }

    /** Returns {@code src} itself if it already has {@code newLength}, otherwise a truncated or zero-padded copy. */
    private static byte[] resize(byte[] src, int newLength)
    {
        if (src.length == newLength)
        {
            return src;
        }
        byte[] dst = new byte[newLength];
        System.arraycopy(src, 0, dst, 0, Math.min(src.length, newLength));
        return dst;
    }
}

package com.caoyong.file;

import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.ByteArrayOutputStream;
import java.io.Closeable;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.nio.ByteBuffer;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.Charset;
import java.nio.charset.CodingErrorAction;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * Guesses the text encoding of a file and reads it accordingly, using only the JDK.
 *
 * <p>The possible results are UTF-8 with BOM, UTF-16 little-endian with BOM
 * (reported as {@code "Unicode"}), UTF-16 big-endian with BOM, UTF-8 without
 * BOM and, as the fallback for everything else, GBK. The original author tested
 * this with text files only; other kinds of files are unverified.
 *
 * <p>Errors are handled best-effort: I/O failures are reported with
 * {@code printStackTrace()} and an empty or {@code null} result is returned.
 *
 * @author cy
 */
public class JudgeFileOperation
{
    /** Result of {@link #codeString(String)} for valid UTF-8 content that has no BOM. */
    private static final String UTF8_NO_BOM = "utf-8无bom";

    /** Result of {@link #codeString(String)} when no BOM is found and the content is not valid UTF-8. */
    private static final String GBK = "gbk";

    /** Charset names that have a BOM entry, in the order in which they are probed. */
    private static final String[] BOM_CHARSETS = { "UTF-8", "Unicode", "UTF-16BE" };

    /** Length in bytes of the longest BOM in the table (the UTF-8 one). */
    private static final int LONGEST_BOM = 3;

    /** BOM of each supported charset as an upper-case hex string; built once instead of per call. */
    private static final Map<String, String> BOM_HEX = createBomTable();

    /**
     * Reads a text file line by line after detecting its encoding with
     * {@link #codeString(String)}. A leading BOM is removed from the first line.
     *
     * @param path path of the file to read
     * @return the lines of the file without line terminators; empty if the file
     *         cannot be read or its encoding cannot be determined
     */
    public static List<String> readFileToList(String path)
    {
        List<String> lines = new ArrayList<String>();
        String detected = codeString(path);
        if (detected == null)
        {
            // Detection failed and has already reported the I/O error; going on
            // would hand a null charset name to InputStreamReader.
            return lines;
        }

        boolean haveBom = !UTF8_NO_BOM.equals(detected) && !GBK.equals(detected);
        // The "no BOM" marker is not a charset name, so map it back to plain utf-8.
        String charsetName = UTF8_NO_BOM.equals(detected) ? "utf-8" : detected;
        System.out.println("codeString:" + charsetName + " haveBom:" + haveBom);

        InputStream in = null;
        BufferedReader reader = null;
        try
        {
            in = new FileInputStream(path);
            reader = new BufferedReader(new InputStreamReader(in, charsetName));
            String line;
            while ((line = reader.readLine()) != null)
            {
                if (haveBom)
                {
                    // A BOM can only appear at the very start of the file.
                    line = removeBom(line, charsetName);
                    haveBom = false;
                }
                lines.add(line);
            }
        }
        catch (IOException e)
        {
            // Covers FileNotFoundException and UnsupportedEncodingException too.
            e.printStackTrace();
        }
        finally
        {
            if (reader != null)
            {
                // Closing the reader closes the underlying stream as well.
                closeQuietly(reader);
            }
            else
            {
                closeQuietly(in);
            }
        }
        return lines;
    }

    /**
     * Returns the BOM of a charset as an upper-case hex string.
     *
     * @param key charset name; one of {@code "UTF-8"}, {@code "Unicode"} or
     *            {@code "UTF-16BE"} (case-sensitive)
     * @return the BOM in hex, e.g. {@code "EFBBBF"}, or {@code null} for any other key
     */
    public static String getRefData(String key)
    {
        return BOM_HEX.get(key);
    }

    /**
     * Removes a leading byte order mark from a decoded line.
     *
     * <p>Notepad on Windows, for instance, writes {@code EF BB BF} at the start
     * of files saved as UTF-8. Known BOMs: {@code 00 00 FE FF} = UTF-32
     * big-endian, {@code FF FE 00 00} = UTF-32 little-endian, {@code EF BB BF} =
     * UTF-8, {@code FE FF} = UTF-16 big-endian, {@code FF FE} = UTF-16
     * little-endian. Only the charsets known to {@link #getRefData(String)} are
     * handled here.
     *
     * <p>The line is re-encoded with {@code codeString} and the BOM is stripped
     * only when the encoded bytes actually start with it.
     *
     * @param line       the decoded line, normally the first line of a file
     * @param codeString charset name the line was decoded with
     * @return the line without its BOM, or the line unchanged when it has no BOM,
     *         the charset has no BOM entry, or the charset is not supported
     */
    public static String removeBom(String line, String codeString)
    {
        byte[] bom = bomBytes(codeString);
        if (line == null || bom == null)
        {
            return line;
        }
        try
        {
            byte[] encoded = line.getBytes(codeString);
            if (startsWith(encoded, bom))
            {
                return new String(encoded, bom.length, encoded.length - bom.length, codeString);
            }
        }
        catch (UnsupportedEncodingException e)
        {
            e.printStackTrace();
        }
        return line;
    }

    /**
     * Determines the encoding of a file. Five results are possible: UTF-8 with
     * BOM, UTF-8 without BOM, Unicode (UTF-16 little-endian BOM), UTF-16BE and gbk.
     *
     * @param fileName path of the file to inspect
     * @return {@code "UTF-8"}, {@code "Unicode"} or {@code "UTF-16BE"} when the
     *         matching BOM is present; otherwise {@code "utf-8无bom"} if the whole
     *         content is valid UTF-8, else {@code "gbk"}; {@code null} if the file
     *         cannot be read
     */
    public static String codeString(String fileName)
    {
        File file = new File(fileName);
        try
        {
            byte[] head = readPrefix(file, LONGEST_BOM);
            for (String charsetName : BOM_CHARSETS)
            {
                // The full BOM is compared, so EF BB without the trailing BF
                // is not mistaken for UTF-8.
                if (startsWith(head, bomBytes(charsetName)))
                {
                    return charsetName;
                }
            }
            return isValidUtf8(readAllBytes(file)) ? UTF8_NO_BOM : GBK;
        }
        catch (IOException e)
        {
            // Also covers FileNotFoundException, which is an IOException.
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Tells whether a BOM-less file is UTF-8 (as opposed to GBK) by strictly
     * decoding its raw bytes. The check works on bytes only, so it does not
     * depend on the line terminators used in the file.
     *
     * @param file the file to inspect
     * @return {@code true} if the whole content is well-formed UTF-8 (an empty
     *         file counts as valid); {@code false} if it is not, or if the file
     *         cannot be read
     */
    public static boolean isUTF8(File file)
    {
        try
        {
            return isValidUtf8(readAllBytes(file));
        }
        catch (IOException e)
        {
            e.printStackTrace();
            return false;
        }
    }

    /**
     * Quick check that only looks for the UTF-8 BOM and prints the verdict to
     * standard output.
     *
     * @param path path of the file to inspect
     */
    public static void simpleFileCode(String path)
    {
        File f = new File(path);
        try
        {
            byte[] head = readPrefix(f, LONGEST_BOM);
            if (startsWith(head, bomBytes("UTF-8")))
            {
                System.out.println(f.getName() + "编码为UTF-8");
            }
            else
            {
                System.out.println(f.getName() + "可能是GBK");
            }
        }
        catch (IOException e)
        {
            e.printStackTrace();
        }
    }

    /**
     * Demo entry point: reads {@code resource/unicode.txt} under the working
     * directory and prints its lines, then runs {@link #simpleFileCode(String)}
     * on the same file.
     *
     * @param args unused
     */
    public static void main(String[] args)
    {
        // Swap in gbk.txt or utf8.txt to try the other sample encodings.
        String path = System.getProperty("user.dir") + File.separator
                + "resource" + File.separator + "unicode.txt";
        for (String s : readFileToList(path))
        {
            System.out.println("File content:" + s);
        }
        System.out.println("**simpleFileCode**");
        simpleFileCode(path);
    }

    /** Builds the charset-name to BOM-hex table served by {@link #getRefData(String)}. */
    private static Map<String, String> createBomTable()
    {
        Map<String, String> table = new HashMap<String, String>();
        table.put("UTF-8", "EFBBBF");
        table.put("Unicode", "FFFE");
        table.put("UTF-16BE", "FEFF");
        return table;
    }

    /** Returns the BOM of a charset as raw bytes, or {@code null} if the charset has no entry. */
    private static byte[] bomBytes(String charsetName)
    {
        String hex = getRefData(charsetName);
        if (hex == null)
        {
            return null;
        }
        byte[] bytes = new byte[hex.length() / 2];
        for (int i = 0; i < bytes.length; i++)
        {
            bytes[i] = (byte) Integer.parseInt(hex.substring(2 * i, 2 * i + 2), 16);
        }
        return bytes;
    }

    /** Tells whether {@code data} begins with {@code prefix}; a {@code null} prefix never matches. */
    private static boolean startsWith(byte[] data, byte[] prefix)
    {
        if (prefix == null || data.length < prefix.length)
        {
            return false;
        }
        for (int i = 0; i < prefix.length; i++)
        {
            if (data[i] != prefix[i])
            {
                return false;
            }
        }
        return true;
    }

    /** Tells whether the bytes are well-formed UTF-8, rejecting any malformed sequence. */
    private static boolean isValidUtf8(byte[] content)
    {
        try
        {
            Charset.forName("UTF-8").newDecoder()
                    .onMalformedInput(CodingErrorAction.REPORT)
                    .onUnmappableCharacter(CodingErrorAction.REPORT)
                    .decode(ByteBuffer.wrap(content));
            return true;
        }
        catch (CharacterCodingException e)
        {
            return false;
        }
    }

    /** Reads at most {@code maxBytes} bytes from the start of a file; the result is shorter for shorter files. */
    private static byte[] readPrefix(File file, int maxBytes) throws IOException
    {
        InputStream in = null;
        try
        {
            in = new FileInputStream(file);
            byte[] buffer = new byte[maxBytes];
            int filled = 0;
            while (filled < maxBytes)
            {
                int count = in.read(buffer, filled, maxBytes - filled);
                if (count < 0)
                {
                    break;
                }
                filled += count;
            }
            return Arrays.copyOf(buffer, filled);
        }
        finally
        {
            closeQuietly(in);
        }
    }

    /** Reads the whole content of a file, propagating any I/O failure to the caller. */
    private static byte[] readAllBytes(File file) throws IOException
    {
        InputStream in = null;
        try
        {
            in = new FileInputStream(file);
            ByteArrayOutputStream out = new ByteArrayOutputStream();
            byte[] chunk = new byte[8192];
            int count;
            while ((count = in.read(chunk)) != -1)
            {
                out.write(chunk, 0, count);
            }
            return out.toByteArray();
        }
        finally
        {
            closeQuietly(in);
        }
    }

    /** Closes a resource if it is non-null, reporting but not propagating a failure to close. */
    private static void closeQuietly(Closeable resource)
    {
        if (resource == null)
        {
            return;
        }
        try
        {
            resource.close();
        }
        catch (IOException e)
        {
            e.printStackTrace();
        }
    }
}

代码完整工程下载:http://download.csdn.net/detail/mike_caoyong/9068729
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: