java识别文件编码格式(无三方JAR包)
2015-08-31 23:14
567 查看
java识别文件编码格式代码
大部分转自:http://blog.csdn.net/txtdown0909/article/details/7933054
package com.caoyong.file;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.List;
/**
 * File helper utilities: load a file either as raw bytes or as a list of
 * decoded text lines.
 * <p>
 * Both helpers are best-effort: I/O failures are reported with
 * {@code printStackTrace()} and an empty (or partial) result is returned
 * instead of propagating the exception.
 *
 * @author cy
 */
public class FileOperateUtil
{
    /** Size of the scratch buffer used when draining a stream. */
    private static final int CHUNK_SIZE = 8192;

    /**
     * Reads the complete content of a file into a byte array.
     *
     * @param file the file to read
     * @return every byte of the file, or an empty array if the file cannot be
     *         opened or read
     */
    public static byte[] readFileToByteArray(File file)
    {
        FileInputStream fis = null;
        try
        {
            fis = new FileInputStream(file);
            // Read until EOF: available() is only an estimate and a single
            // read() call may return fewer bytes than requested.
            java.io.ByteArrayOutputStream collected = new java.io.ByteArrayOutputStream();
            byte[] chunk = new byte[CHUNK_SIZE];
            int count;
            while ((count = fis.read(chunk)) != -1)
            {
                collected.write(chunk, 0, count);
            }
            return collected.toByteArray();
        } catch (FileNotFoundException e)
        {
            e.printStackTrace();
        } catch (IOException e)
        {
            e.printStackTrace();
        } finally
        {
            closeQuietly(fis);
        }
        return new byte[0];
    }

    /**
     * Reads a text file line by line using the given charset.
     *
     * @param file        the file to read
     * @param charsetName name of the charset used to decode the file
     * @return the lines of the file without line terminators; the lines read
     *         so far (possibly none) if an I/O error occurs or the charset
     *         is not supported
     */
    public static List<String> readFileToList(File file, String charsetName)
    {
        List<String> list = new ArrayList<String>();
        FileInputStream fis = null;
        BufferedReader br = null;
        try
        {
            fis = new FileInputStream(file);
            br = new BufferedReader(new InputStreamReader(fis, charsetName));
            String line;
            while (null != (line = br.readLine()))
            {
                list.add(line);
            }
        } catch (FileNotFoundException e)
        {
            e.printStackTrace();
        } catch (IOException e)
        {
            // Also covers UnsupportedEncodingException for a bad charsetName.
            e.printStackTrace();
        } finally
        {
            // Closing the reader closes the underlying stream; fall back to
            // the raw stream when the reader was never created.
            if (br != null)
            {
                try
                {
                    br.close();
                } catch (IOException e)
                {
                    e.printStackTrace();
                }
            } else
            {
                closeQuietly(fis);
            }
        }
        return list;
    }

    /** Closes the stream if it was opened, reporting (not throwing) failures. */
    private static void closeQuietly(FileInputStream stream)
    {
        if (stream == null)
        {
            return;
        }
        try
        {
            stream.close();
        } catch (IOException e)
        {
            e.printStackTrace();
        }
    }
}
package com.caoyong.file;
import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CodingErrorAction;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
 * Guesses the text encoding of a file and reads it with that encoding.
 * <p>
 * Only exercised with plain-text files; behaviour on other file types has not
 * been verified.
 *
 * @author cy
 */
public class JudgeFileOperation
{
    /** Value returned by {@link #codeString(String)} for UTF-8 content without a BOM. */
    private static final String UTF8_NO_BOM = "utf-8无bom";

    /** Value returned by {@link #codeString(String)} when nothing else matched. */
    private static final String GBK = "gbk";

    /** Encodings recognised by their byte order mark, in detection order. */
    private static final String[] BOM_ENCODINGS = { "UTF-8", "Unicode", "UTF-16BE" };

    /** Encoding name -> upper-case hex of its byte order mark. */
    private static final Map<String, String> BOM_HEX = new HashMap<String, String>();
    static
    {
        BOM_HEX.put("UTF-8", "EFBBBF");
        BOM_HEX.put("Unicode", "FFFE");
        BOM_HEX.put("UTF-16BE", "FEFF");
    }

    /**
     * Reads a text file line by line after detecting its encoding with
     * {@link #codeString(String)}. A byte order mark at the start of the
     * first line is removed.
     *
     * @param path path of the file to read
     * @return the lines of the file without line terminators; an empty list
     *         if the encoding could not be detected or the file cannot be
     *         opened, and the lines read so far if reading fails midway
     */
    public static List<String> readFileToList(String path)
    {
        String codeString = codeString(path);
        boolean haveBom = !(UTF8_NO_BOM.equals(codeString) || GBK.equals(codeString));
        if (UTF8_NO_BOM.equals(codeString))
        {
            // The marker is not a charset name; map it to the real one.
            codeString = "utf-8";
        }
        System.out.println("codeString:" + codeString + " haveBom:" + haveBom);
        List<String> list = new ArrayList<String>();
        if (codeString == null)
        {
            // Detection failed (already reported); a null charset name would
            // only produce a NullPointerException below.
            return list;
        }
        FileInputStream fis = null;
        BufferedReader reader = null;
        try
        {
            fis = new FileInputStream(path);
            reader = new BufferedReader(new InputStreamReader(fis, codeString));
            String str;
            while ((str = reader.readLine()) != null)
            {
                if (haveBom)
                {
                    // Only the very first line can carry the BOM.
                    list.add(removeBom(str, codeString));
                    haveBom = false;
                } else
                {
                    list.add(str);
                }
            }
        } catch (UnsupportedEncodingException e)
        {
            e.printStackTrace();
        } catch (FileNotFoundException e)
        {
            e.printStackTrace();
        } catch (IOException e)
        {
            e.printStackTrace();
        } finally
        {
            // Closing the reader closes the stream; if the reader was never
            // built (e.g. unsupported charset) close the raw stream instead.
            try
            {
                if (reader != null)
                {
                    reader.close();
                } else if (fis != null)
                {
                    fis.close();
                }
            } catch (IOException e)
            {
                e.printStackTrace();
            }
        }
        return list;
    }

    /**
     * Returns the byte order mark of an encoding as an upper-case hex string.
     *
     * @param key encoding name: {@code "UTF-8"}, {@code "Unicode"} or
     *            {@code "UTF-16BE"}
     * @return the BOM in hex ({@code "EFBBBF"}, {@code "FFFE"} or
     *         {@code "FEFF"}), or {@code null} for any other key
     */
    public static String getRefData(String key)
    {
        return BOM_HEX.get(key);
    }

    /**
     * Removes a leading byte order mark from a decoded line.
     * <p>
     * Windows Notepad, for example, writes {@code EF BB BF} at the start of
     * files saved as UTF-8. Known BOMs: {@code 00 00 FE FF} = UTF-32
     * big-endian, {@code FF FE 00 00} = UTF-32 little-endian,
     * {@code EF BB BF} = UTF-8, {@code FE FF} = UTF-16 big-endian,
     * {@code FF FE} = UTF-16 little-endian. Only the encodings known to
     * {@link #getRefData(String)} are handled here.
     * <p>
     * The BOM is stripped when the first character of the line, encoded with
     * {@code codeString}, is exactly the BOM registered for that encoding.
     *
     * @param line       the decoded line, normally the first line of a file
     * @param codeString name of the encoding the line was decoded with
     * @return the line without its BOM, or the line unchanged when it is
     *         {@code null}, empty, carries no BOM, or the encoding is unknown
     *         or unsupported
     */
    public static String removeBom(String line, String codeString)
    {
        String bomHex = getRefData(codeString);
        if (line == null || line.isEmpty() || bomHex == null)
        {
            return line;
        }
        try
        {
            // Encoding just the first character is enough: every BOM handled
            // here is the encoding of the single character U+FEFF.
            byte[] firstChar = line.substring(0, 1).getBytes(codeString);
            if (Arrays.equals(firstChar, hexToBytes(bomHex)))
            {
                return line.substring(1);
            }
        } catch (UnsupportedEncodingException e)
        {
            e.printStackTrace();
        }
        return line;
    }

    /**
     * Guesses the encoding of a file. Five outcomes are distinguished: UTF-8
     * with BOM, UTF-8 without BOM, "Unicode" (BOM {@code FF FE}), UTF-16BE
     * (BOM {@code FE FF}) and GBK as the fallback.
     *
     * @param fileName path of the file to inspect
     * @return {@code "UTF-8"}, {@code "Unicode"}, {@code "UTF-16BE"}, the
     *         no-BOM UTF-8 marker string, or {@code "gbk"}; {@code null} if
     *         the file cannot be opened or read
     */
    public static String codeString(String fileName)
    {
        File file = new File(fileName);
        byte[] head;
        try
        {
            head = readLeadingBytes(file, 3);
        } catch (FileNotFoundException e)
        {
            e.printStackTrace();
            return null;
        } catch (IOException e)
        {
            e.printStackTrace();
            return null;
        }
        for (String encoding : BOM_ENCODINGS)
        {
            if (startsWith(head, hexToBytes(getRefData(encoding))))
            {
                return encoding;
            }
        }
        // No BOM: tell UTF-8 from the GBK fallback by validating the bytes.
        return isUTF8(file) ? UTF8_NO_BOM : GBK;
    }

    /**
     * Checks whether the whole file is well-formed UTF-8. Used to tell
     * BOM-less UTF-8 apart from GBK.
     * <p>
     * The content is streamed through a strict decoder, so the result does
     * not depend on the line-ending style and the file is not loaded into
     * memory.
     *
     * @param file the file to check
     * @return {@code true} if every byte sequence decodes as UTF-8 (an empty
     *         file qualifies); {@code false} on malformed input or if the
     *         file cannot be read
     */
    public static boolean isUTF8(File file)
    {
        // REPORT makes the decoder fail on bad input instead of silently
        // substituting U+FFFD.
        CharsetDecoder strict = Charset.forName("UTF-8").newDecoder()
                .onMalformedInput(CodingErrorAction.REPORT)
                .onUnmappableCharacter(CodingErrorAction.REPORT);
        InputStreamReader reader = null;
        try
        {
            reader = new InputStreamReader(new FileInputStream(file), strict);
            char[] sink = new char[4096];
            while (reader.read(sink) != -1)
            {
                // Decoded characters are discarded; only validity matters.
            }
            return true;
        } catch (CharacterCodingException e)
        {
            return false;
        } catch (IOException e)
        {
            e.printStackTrace();
            return false;
        } finally
        {
            if (reader != null)
            {
                try
                {
                    reader.close();
                } catch (IOException e)
                {
                    e.printStackTrace();
                }
            }
        }
    }

    /**
     * Simplified check that prints whether the file starts with the UTF-8
     * BOM; anything else is reported as possibly GBK.
     *
     * @param path path of the file to inspect
     */
    public static void simpleFileCode(String path)
    {
        File f = new File(path);
        try
        {
            byte[] head = readLeadingBytes(f, 3);
            if (startsWith(head, hexToBytes(getRefData("UTF-8"))))
            {
                System.out.println(f.getName() + "编码为UTF-8");
            } else
            {
                System.out.println(f.getName() + "可能是GBK");
            }
        } catch (IOException e)
        {
            e.printStackTrace();
        }
    }

    /**
     * Demo entry point: reads {@code resource/unicode.txt} under the working
     * directory and prints its lines, then runs the simplified check.
     *
     * @param args unused
     */
    public static void main(String[] args)
    {
        // Swap the file name for "gbk.txt" or "utf8.txt" to try the other encodings.
        StringBuilder sb = new StringBuilder();
        sb.append(System.getProperty("user.dir"));
        sb.append(File.separator);
        sb.append("resource");
        sb.append(File.separator);
        sb.append("unicode.txt");
        List<String> list = readFileToList(sb.toString());
        for (String s : list)
        {
            System.out.println("File content:" + s);
        }
        System.out.println("**simpleFileCode**");
        simpleFileCode(sb.toString());
    }

    /** Reads at most {@code limit} bytes from the start of the file and closes it. */
    private static byte[] readLeadingBytes(File file, int limit) throws IOException
    {
        InputStream in = new FileInputStream(file);
        try
        {
            byte[] head = new byte[limit];
            int filled = 0;
            while (filled < limit)
            {
                int count = in.read(head, filled, limit - filled);
                if (count < 0)
                {
                    break;
                }
                filled += count;
            }
            return Arrays.copyOf(head, filled);
        } finally
        {
            in.close();
        }
    }

    /** Converts an even-length hex string such as {@code "EFBBBF"} to bytes. */
    private static byte[] hexToBytes(String hex)
    {
        byte[] bytes = new byte[hex.length() / 2];
        for (int i = 0; i < bytes.length; i++)
        {
            bytes[i] = (byte) Integer.parseInt(hex.substring(2 * i, 2 * i + 2), 16);
        }
        return bytes;
    }

    /** Tells whether {@code data} begins with all bytes of {@code prefix}. */
    private static boolean startsWith(byte[] data, byte[] prefix)
    {
        if (data.length < prefix.length)
        {
            return false;
        }
        for (int i = 0; i < prefix.length; i++)
        {
            if (data[i] != prefix[i])
            {
                return false;
            }
        }
        return true;
    }
}
代码完整工程下载:http://download.csdn.net/detail/mike_caoyong/9068729
相关文章推荐
- JAVA笔记之private、构造函数、this
- Java多线程之线程生命周期以及线程各个时期的状态
- java 8 JVM性能优化
- Struts的一些总结
- 【转】Java 集合系列目录(Category)
- web工程中实现spring框架工程详解
- java集合类操作优化经验总结
- Java中如何遍历Map对象的4种方法
- 利用myeclipse的反向工程来创建hbm映射文件
- Spring学习(八)spring整合struts2
- java中的volatile关键字
- [转]Java垃圾回收:GC在什么时候对什么做了什么
- Spring的ioc
- [转]Java垃圾回收:GC在什么时候对什么做了什么
- Java内存简介
- [笔记][Java7并发编程实战手册]4.7-4.8 在执行器中延迟执行或则周期执行任务ScheduledThreadPoolExecutor
- 【android】 eclipse导入sdk源码、javadoc(帮助文档)
- java.lang.IllegalStateException: Missing project All-Projects
- 菜鸟好文推荐(十五)——9个基于Java的搜索引擎框架
- 菜鸟好文推荐(十四)——Java 常见异常及趣味解释