您的位置:首页 > Web前端 > HTML

手机解析HTML,XML,TXT,XHTML,WML等文档

2017-04-10 17:58 603 查看
package Core;

import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.util.Hashtable;

/**
 * Parses HTML, XML, TXT, XHTML, WML and similar documents.
 * Supports CDATA sections and a whitespace-collapsing text extractor.
 *
 * @author fonter (http://fonter.iteye.com)
 */
public class HtmlInputStreamReader {

    /** Message reported when the input ends in the middle of a construct. */
    private static final String UNEXPECTED_EOF = "Unexpected EOF";

    /** Namespace value used for elements that have no namespace. */
    public static final String NO_NAMESPACE = "";

    // Event types returned by getEventType() / next().
    public static final int START_DOCUMENT = 0;
    public static final int END_DOCUMENT = 1;
    public static final int START_TAG = 2;
    public static final int END_TAG = 3;
    public static final int TEXT = 4;
    public static final int CDSECT = 5;
    public static final int ENTITY_REF = 6;
    public static final int IGNORABLE_WHITESPACE = 7;
    public static final int PROCESSING_INSTRUCTION = 8;
    public static final int COMMENT = 9;
    public static final int DOCDECL = 10;

    // Internal markers: LEGACY is what peekType() reports for "<?" / "<!",
    // XML_DECL is what parseLegacy() reports for a parsed XML declaration.
    public static final int LEGACY = 999;
    public static final int XML_DECL = 998;

    /** Characters treated as white space by the text extractor. */
    private static final char[] WHITESPACE = {' ', '\n', '\r', '\t', '\f', '\u200B'};

    // ---- input state ----
    private Reader reader;
    private char[] srcBuf;
    private int srcPos;
    private int srcCount;
    private int srcLength;          // number of characters consumed through read()
    private int[] peek = new int[2]; // look-ahead window (at most two characters)
    private int peekCount;
    private boolean wasCR;          // previous raw character was '\r' (CRLF folding)
    private int line;
    private int column;
    private String encoding;

    // ---- configuration flags ----
    private boolean processNsp = true;
    private boolean relaxed = true;
    private boolean token;

    // ---- current event ----
    private int type;
    private String namespace;
    private String prefix;
    private String name;
    private boolean degenerated;    // current start tag was self-closing ("<x/>")
    private boolean unresolved;     // last entity reference could not be resolved
    private boolean isWhitespace;   // everything pushed so far is <= ' '
    private String error;           // pending message to be delivered as a COMMENT event
    private int stackMismatch = 0;  // number of end tags still to be synthesised

    // ---- text buffer ----
    private char[] txtBuf = new char[128];
    private int txtPos;

    // ---- attributes: four slots each (namespace, prefix, name, value) ----
    private String[] attributes = new String[16];
    private int attributeCount;

    // ---- element stack: four slots per level (namespace, prefix, name, raw tag name) ----
    private String[] elementStack = new String[16];
    private int depth;

    // ---- namespace stack: (prefix, uri) pairs; nspCounts[d] = pairs visible at depth d ----
    private String[] nspStack = new String[8];
    private int[] nspCounts = new int[4];

    // ---- values taken from the XML declaration ----
    private String version;
    private Boolean standalone;

    /** Entity name to replacement text; filled by {@link #setInput(Reader)}. */
    private Hashtable entityMap;

    /**
     * Creates a parser without input. The source buffer is 8192 chars when at
     * least 1 MiB of heap is free and 128 chars otherwise.
     *
     * @throws IOException never thrown by this implementation; kept for callers
     */
    public HtmlInputStreamReader() throws IOException {
        srcBuf = new char[Runtime.getRuntime().freeMemory() >= 1048576 ? 8192 : 128];
    }

    /**
     * Sets the character source and resets the parser to START_DOCUMENT.
     * Passing {@code null} only clears the per-document fields and returns.
     *
     * @param reader the character source, or {@code null}
     * @throws IOException never thrown by this implementation; kept for callers
     */
    public void setInput(Reader reader) throws IOException {
        this.reader = reader;
        line = 1;
        column = 0;
        type = START_DOCUMENT;
        name = null;
        namespace = null;
        degenerated = false;
        attributeCount = -1;
        encoding = null;
        version = null;
        standalone = null;
        srcLength = 0;

        if (reader == null) {
            return;
        }

        srcPos = 0;
        srcCount = 0;
        peekCount = 0;
        depth = 0;

        // Clear state left over from a previously parsed document so that the
        // instance can be reused safely.
        prefix = null;
        error = null;
        stackMismatch = 0;
        wasCR = false;
        unresolved = false;
        txtPos = 0;

        entityMap = new Hashtable();
        entityMap.put("amp", "&");
        entityMap.put("apos", "'");
        entityMap.put("gt", ">");
        entityMap.put("lt", "<");
        entityMap.put("quot", "\"");
        entityMap.put("copy", "\251");
        entityMap.put("reg", "\256");
        entityMap.put("yen", "\245");
    }

    /**
     * Sets a byte source. When {@code _enc} is {@code null} the first four bytes
     * are inspected (byte order marks and a few well-known prefixes) to pick an
     * encoding; if nothing matches, the platform default charset is used.
     *
     * @param is   the byte source; must not be {@code null}
     * @param _enc the charset name, or {@code null} to sniff it
     * @throws IllegalArgumentException if {@code is} is {@code null}
     * @throws IOException if reading fails or the charset is not supported
     */
    public void setInput(InputStream is, String _enc) throws IOException {

        srcPos = 0;
        srcCount = 0;
        String enc = _enc;

        if (is == null) {
            throw new IllegalArgumentException();
        }

        try {
            if (enc == null) {
                // Read up to four bytes and pack them big-endian into chk.
                int chk = 0;

                while (srcCount < 4) {
                    int i = is.read();
                    srcLength++;
                    if (i == -1) {
                        break;
                    }
                    chk = (chk << 8) | i;
                    srcBuf[srcCount++] = (char) i;
                }

                if (srcCount == 4) {
                    switch (chk) {
                        case 0x00000FEFF:   // UTF-32BE byte order mark
                            enc = "UTF-32BE";
                            srcCount = 0;
                            break;

                        case 0x0FFFE0000:   // UTF-32LE byte order mark
                            enc = "UTF-32LE";
                            srcCount = 0;
                            break;

                        case 0x03c:         // '<' in UTF-32BE
                            enc = "UTF-32BE";
                            srcBuf[0] = '<';
                            srcCount = 1;
                            break;

                        case 0x03c000000:   // '<' in UTF-32LE
                            enc = "UTF-32LE";
                            srcBuf[0] = '<';
                            srcCount = 1;
                            break;

                        case 0x0003c003f:   // "<?" in UTF-16BE
                            enc = "UTF-16BE";
                            srcBuf[0] = '<';
                            srcBuf[1] = '?';
                            srcCount = 2;
                            break;

                        // Author's additions to the sniffing table start here.
                        case 0x3c68746d:    // "<htm"
                            enc = "gb2312";
                            // All four bytes are ASCII and already sit in srcBuf;
                            // keep them so the tag name is not truncated to "l".
                            break;

                        case 0xd0a3c3f:     // "\r\n<?"
                            enc = "UTF-8";
                            srcBuf[0] = '<';
                            srcBuf[1] = '?';
                            srcCount = 2;
                            break;
                        // Author's additions end here.

                        case 0x03c003f00:   // "<?" in UTF-16LE
                            enc = "UTF-16LE";
                            srcBuf[0] = '<';
                            srcBuf[1] = '?';
                            srcCount = 2;
                            break;

                        case 0xa0a3c21:     // "\n\n<!"
                            enc = "UTF-8";
                            srcBuf[0] = '<';
                            srcBuf[1] = '!';
                            srcCount = 2;
                            break;

                        case 0x03c3f786d:   // "<?xm": read the declaration and look for encoding="..."
                            while (true) {
                                int i = is.read();
                                srcLength++;
                                if (i == -1) {
                                    break;
                                }
                                if (srcCount == srcBuf.length) {
                                    // A long declaration must not overrun a small buffer.
                                    char[] bigger = new char[srcBuf.length * 2];
                                    System.arraycopy(srcBuf, 0, bigger, 0, srcCount);
                                    srcBuf = bigger;
                                }
                                srcBuf[srcCount++] = (char) i;
                                if (i == '>') {
                                    String s = new String(srcBuf, 0, srcCount);
                                    int i0 = s.indexOf("encoding");
                                    if (i0 != -1) {
                                        while (s.charAt(i0) != '"' && s.charAt(i0) != '\'') {
                                            i0++;
                                        }
                                        char deli = s.charAt(i0++);
                                        int i1 = s.indexOf(deli, i0);
                                        enc = s.substring(i0, i1);
                                    }
                                    if (enc == null) {
                                        enc = "UTF-8";
                                    }
                                    break;
                                }
                            }
                            break;

                        default:
                            if ((chk & 0x0ffff0000) == 0x0FEFF0000) {
                                // UTF-16BE byte order mark followed by one character
                                enc = "UTF-16BE";
                                srcBuf[0] = (char) ((srcBuf[2] << 8) | srcBuf[3]);
                                srcCount = 1;
                            } else if ((chk & 0x0ffff0000) == 0x0fffe0000) {
                                // UTF-16LE byte order mark followed by one character
                                enc = "UTF-16LE";
                                srcBuf[0] = (char) ((srcBuf[3] << 8) | srcBuf[2]);
                                srcCount = 1;
                            } else if ((chk & 0x0ffffff00) == 0x0EFBBBF00) {
                                // UTF-8 byte order mark followed by one byte
                                enc = "UTF-8";
                                srcBuf[0] = srcBuf[3];
                                srcCount = 1;
                            }
                    }
                }
            }

            // setInput(Reader) resets srcCount, so remember how many sniffed
            // characters are waiting in srcBuf and restore the count afterwards.
            int sc = srcCount;
            if (enc == null) {
                setInput(new InputStreamReader(is));
            } else {
                setInput(new InputStreamReader(is, enc));
            }
            // Report the encoding that is actually in use (given or detected).
            encoding = enc;
            srcCount = sc;
        } catch (IOException e) {
            throw e;
        } catch (Exception e) {
            IOException wrapped = new IOException(e.toString());
            wrapped.initCause(e);
            throw wrapped;
        }
    }

    /**
     * Returns the character {@code pos} positions ahead without consuming it,
     * or -1 at end of input. "\r\n" and a lone "\r" are delivered as one '\n'.
     */
    private int peek(int pos) throws IOException {

        while (pos >= peekCount) {
            int nw;

            if (srcBuf.length <= 1) {
                nw = reader.read();
            } else if (srcPos < srcCount) {
                nw = srcBuf[srcPos++];
            } else {
                srcCount = reader.read(srcBuf, 0, srcBuf.length);
                if (srcCount <= 0) {
                    nw = -1;
                } else {
                    nw = srcBuf[0];
                }
                srcPos = 1;
            }

            if (nw == '\r') {
                wasCR = true;
                peek[peekCount++] = '\n';
            } else {
                if (nw == '\n') {
                    // The '\n' of a "\r\n" pair was already delivered for the '\r'.
                    if (!wasCR) {
                        peek[peekCount++] = '\n';
                    }
                } else {
                    peek[peekCount++] = nw;
                }
                wasCR = false;
            }
        }

        return peek[pos];
    }

    /** Classifies the upcoming construct from the next one or two characters. */
    private int peekType() throws IOException {
        switch (peek(0)) {
            case -1:
                return END_DOCUMENT;
            case '&':
                return ENTITY_REF;
            case '<':
                switch (peek(1)) {
                    case '/':
                        return END_TAG;
                    case '?':
                    case '!':
                        return LEGACY;
                    default:
                        return START_TAG;
                }
            default:
                return TEXT;
        }
    }

    /** Reports a recoverable problem; parsing continues. */
    private void error(String desc) {
        exception(desc);
    }

    /** Prints the message to standard output; this parser does not throw on bad markup. */
    private void exception(String desc) {
        System.out.println(desc);
    }

    /**
     * Advances to the next low-level event and stores its kind in the field
     * returned by {@link #getEventType()}.
     *
     * @throws IllegalStateException if no input has been set
     * @throws IOException if the underlying reader fails
     */
    public final void nextImpl() throws IOException {

        if (reader == null) {
            throw new IllegalStateException("No Input specified");
        }

        if (type == END_TAG) {
            depth--;
        }

        while (true) {
            attributeCount = -1;

            // degenerated needs to be handled before error because of possible
            // processor expectations(!)
            if (degenerated) {
                degenerated = false;
                type = END_TAG;
                return;
            }

            if (error != null) {
                for (int i = 0; i < error.length(); i++) {
                    push(error.charAt(i));
                }
                error = null;
                type = COMMENT;
                return;
            }

            // Relaxed mode: synthesise end tags for elements left open, either
            // after a mismatched end tag or when the input ends.
            if (relaxed && (stackMismatch > 0 || (peek(0) == -1 && depth > 0))) {
                int sp = (depth - 1) << 2;
                type = END_TAG;
                namespace = elementStack[sp];
                prefix = elementStack[sp + 1];
                name = elementStack[sp + 2];
                if (stackMismatch != 1) {
                    error = "missing end tag /" + name + " inserted";
                }
                if (stackMismatch > 0) {
                    stackMismatch--;
                }
                return;
            }

            prefix = null;
            name = null;
            namespace = null;

            type = peekType();

            switch (type) {

                case ENTITY_REF:
                    pushEntity();
                    return;

                case START_TAG:
                    parseStartTag(false);
                    return;

                case END_TAG:
                    parseEndTag();
                    return;

                case END_DOCUMENT:
                    return;

                case TEXT:
                    pushText('<', !token);
                    if (depth == 0) {
                        if (isWhitespace) {
                            type = IGNORABLE_WHITESPACE;
                        }
                    }
                    return;

                default:
                    type = parseLegacy(token);
                    if (type != XML_DECL) {
                        return;
                    }
                    // An XML declaration is not reported; keep reading.
            }
        }
    }

    /** Returns the input encoding recorded by setInput or the XML declaration, or {@code null}. */
    public String getInputEncoding() {
        return encoding;
    }

    /**
     * Returns the text collected for the current event, or {@code null} for
     * events below TEXT and for an unresolved entity reference.
     */
    public String getText() {
        if (type < TEXT || (type == ENTITY_REF && unresolved)) {
            return null;
        }
        return get(0);
    }

    /**
     * Text extractor: like {@link #getText()} but with leading and trailing white
     * space removed and inner runs of white space collapsed to one space.
     */
    public String getTextExtractor() {
        if (type < TEXT || (type == ENTITY_REF && unresolved)) {
            return null;
        }
        return appendCollapseWhiteSpace(new StringBuffer(), get(0)).toString();
    }

    /** Returns true if {@code ch} is one of the characters listed in WHITESPACE. */
    public static final boolean isWhiteSpace(final char ch) {
        for (int i = 0; i < WHITESPACE.length; i++) {
            if (ch == WHITESPACE[i]) {
                return true;
            }
        }
        return false;
    }

    /**
     * Appends {@code text} to {@code sb}, skipping leading and trailing white
     * space and replacing each inner run of white space by a single space.
     *
     * @return {@code sb}
     */
    static final StringBuffer appendCollapseWhiteSpace(StringBuffer sb, String text) {
        final int textLength = text.length();
        int i = 0;
        boolean pendingSpace = false;

        // Skip leading white space; nothing is appended for all-white-space input.
        while (true) {
            if (i >= textLength) {
                return sb;
            }
            if (!isWhiteSpace(text.charAt(i))) {
                break;
            }
            i++;
        }

        do {
            final char ch = text.charAt(i++);
            if (isWhiteSpace(ch)) {
                pendingSpace = true;
            } else {
                // The space is only emitted once another visible character
                // follows, which drops trailing white space.
                if (pendingSpace) {
                    sb.append(' ');
                    pendingSpace = false;
                }
                sb.append(ch);
            }
        } while (i < textLength);
        return sb;
    }

    /** Returns the type of the current event. */
    public int getEventType() {
        return type;
    }

    /**
     * Parses an end tag. On a name mismatch the element stack is searched
     * case-insensitively and stackMismatch records how many open elements
     * have to be closed first; an end tag with no match becomes a COMMENT.
     */
    private void parseEndTag() throws IOException {

        read(); // '<'
        read(); // '/'
        name = readName();
        skip();
        read('>');

        if (depth == 0) {
            error("element stack empty");
            type = COMMENT;
            return;
        }

        int sp = (depth - 1) << 2;

        if (!name.equals(elementStack[sp + 3])) {
            error("expected: /" + elementStack[sp + 3] + " read: " + name);

            // become case insensitive in relaxed mode
            int probe = sp;
            while (probe >= 0
                    && !name.toLowerCase().equals(elementStack[probe + 3].toLowerCase())) {
                stackMismatch++;
                probe -= 4;
            }

            if (probe < 0) {
                // No open element with this name: ignore the end tag.
                stackMismatch = 0;
                type = COMMENT;
                return;
            }
        }

        namespace = elementStack[sp];
        prefix = elementStack[sp + 1];
        name = elementStack[sp + 2];
    }

    /**
     * Parses constructs starting with "<?" or "<!": XML declaration, processing
     * instruction, comment, CDATA section or DOCTYPE.
     *
     * @param push whether the content is copied into the text buffer
     * @return the event type, or XML_DECL for an XML declaration
     */
    private int parseLegacy(boolean push) throws IOException {

        String req = "";
        int term;
        int result;
        int prev = 0;

        read(); // <
        int c = read();
        if (c == '?') {
            if ((peek(0) == 'x' || peek(0) == 'X')
                    && (peek(1) == 'm' || peek(1) == 'M')) {

                if (push) {
                    push(peek(0));
                    push(peek(1));
                }
                read();
                read();

                if ((peek(0) == 'l' || peek(0) == 'L') && peek(1) <= ' ') {

                    if (line != 1 || column > 4) {
                        error("PI must not start with xml");
                    }

                    // The declaration's pseudo attributes are parsed like tag attributes.
                    parseStartTag(true);

                    if (attributeCount < 1 || !"version".equals(attributes[2])) {
                        error("version expected");
                    }

                    version = attributes[3];

                    int pos = 1;

                    if (pos < attributeCount && "encoding".equals(attributes[2 + 4])) {
                        encoding = attributes[3 + 4];
                        pos++;
                    }

                    if (pos < attributeCount
                            && "standalone".equals(attributes[4 * pos + 2])) {
                        String st = attributes[3 + 4 * pos];
                        if ("yes".equals(st)) {
                            standalone = Boolean.TRUE;
                        } else if ("no".equals(st)) {
                            standalone = Boolean.FALSE;
                        } else {
                            error("illegal standalone value: " + st);
                        }
                        pos++;
                    }

                    if (pos != attributeCount) {
                        error("illegal xmldecl");
                    }

                    isWhitespace = true;
                    txtPos = 0;

                    return XML_DECL;
                }
            }

            term = '?';
            result = PROCESSING_INSTRUCTION;
        } else if (c == '!') {
            if (peek(0) == '-') {
                result = COMMENT;
                req = "--";
                term = '-';
            } else if (peek(0) == '[') {
                result = CDSECT;
                req = "[CDATA[";
                term = ']';
                push = true; // CDATA content is always collected
            } else {
                result = DOCDECL;
                req = "DOCTYPE";
                term = -1;
            }
        } else {
            error("illegal: <" + c);
            return COMMENT;
        }

        for (int i = 0; i < req.length(); i++) {
            read(req.charAt(i));
        }

        if (result == DOCDECL) {
            parseDoctype(push);
        } else {
            while (true) {
                c = read();
                if (c == -1) {
                    error(UNEXPECTED_EOF);
                    return COMMENT;
                }

                if (push) {
                    push(c);
                }

                // Stop in front of the two-character terminator ("?>", "->" or "]>").
                if ((term == '?' || c == term)
                        && peek(0) == term
                        && peek(1) == '>') {
                    break;
                }

                prev = c;
            }

            if (term == '-' && prev == '-') {
                error("illegal comment delimiter: --->");
            }

            read();
            read();

            // Drop the first terminator character that was pushed above.
            if (push && term != '?') {
                txtPos--;
            }
        }
        return result;
    }

    /**
     * Reads a name. At least one character is always consumed; the text buffer
     * is used as scratch space and restored afterwards.
     */
    private String readName() throws IOException {

        int pos = txtPos;
        int c = peek(0);
        if ((c < 'a' || c > 'z')
                && (c < 'A' || c > 'Z')
                && c != '_'
                && c != ':'
                && c < 0x0c0
                && !relaxed) {
            error("name expected");
        }

        do {
            push(read());
            c = peek(0);
        } while ((c >= 'a' && c <= 'z')
                || (c >= 'A' && c <= 'Z')
                || (c >= '0' && c <= '9')
                || c == '_'
                || c == '-'
                || c == ':'
                || c == '.'
                || c >= 0x0b7);

        String result = get(pos);
        txtPos = pos;
        return result;
    }

    /** Returns the text buffer content from {@code pos} to the current end. */
    private String get(int pos) {
        return new String(txtBuf, pos, txtPos - pos);
    }

    /** Consumes characters up to and including ' ' in value, stopping at end of input. */
    private void skip() throws IOException {
        while (true) {
            int c = peek(0);
            if (c > ' ' || c == -1) {
                break;
            }
            read();
        }
    }

    /**
     * Consumes a DOCTYPE declaration up to its closing '>', tracking nested
     * angle brackets and single-quoted sections.
     */
    private void parseDoctype(boolean push) throws IOException {
        int nesting = 1;
        boolean quoted = false;

        while (true) {
            int i = read();
            switch (i) {

                case -1:
                    error(UNEXPECTED_EOF);
                    return;

                case '\'':
                    quoted = !quoted;
                    break;

                case '<':
                    if (!quoted) {
                        nesting++;
                    }
                    break;

                case '>':
                    if (!quoted) {
                        if ((--nesting) == 0) {
                            return;
                        }
                    }
                    break;
            }
            if (push) {
                push(i);
            }
        }
    }

    /**
     * Copies characters into the text buffer until {@code delimiter} or end of
     * input. A delimiter of ' ' means an unquoted attribute value, which also
     * ends at any character <= ' ' or at '>'.
     *
     * @param delimiter       the terminating character ('<', a quote, or ' ')
     * @param resolveEntities whether "&...;" is expanded; if not, '&' ends the text
     */
    private void pushText(int delimiter, boolean resolveEntities) throws IOException {

        int next = peek(0);
        int cbrCount = 0;

        while (next != -1 && next != delimiter) { // covers eof, '<', '"'

            if (delimiter == ' ') {
                if (next <= ' ' || next == '>') {
                    break;
                }
            }

            if (next == '&') {
                if (!resolveEntities) {
                    break;
                }
                pushEntity();
            } else if (next == '\n' && type == START_TAG) {
                // Line breaks inside attribute values become spaces.
                read();
                push(' ');
            } else {
                push(read());
            }

            if (next == '>' && cbrCount >= 2 && delimiter != ']') {
                error("Illegal: ]]>");
            }

            if (next == ']') {
                cbrCount++;
            } else {
                cbrCount = 0;
            }

            next = peek(0);
        }
    }

    /**
     * Reads an entity reference and pushes its replacement text. Known named
     * entities and numeric references (decimal, "#x" or "#X" hexadecimal) are
     * expanded. Unknown, empty or malformed references are reported through
     * error() and contribute no text. A reference that is not terminated by ';'
     * is left in the text as it was read.
     */
    private void pushEntity() throws IOException {

        push(read()); // &

        int pos = txtPos;

        while (true) {
            int c = read();
            if (c == ';') {
                break;
            }
            if (c < 128
                    && (c < '0' || c > '9')
                    && (c < 'a' || c > 'z')
                    && (c < 'A' || c > 'Z')
                    && c != '_'
                    && c != '-'
                    && c != '#') {
                if (!relaxed) {
                    error("unterminated entity ref");
                }
                if (c != -1) {
                    push(c);
                }
                return;
            }

            push(c);
        }

        String code = get(pos);
        txtPos = pos - 1; // drop "&name" from the buffer again
        if (token && type == ENTITY_REF) {
            name = code;
        }

        String result = null;

        if (code.length() > 0 && code.charAt(0) == '#') {
            int codePoint = parseCharacterReference(code);
            if (codePoint >= 0) {
                unresolved = false;
                pushCodePoint(codePoint);
                return;
            }
            // Malformed numeric reference: handled as unresolved below.
        } else {
            result = (String) entityMap.get(code);
        }

        unresolved = result == null;

        if (unresolved) {
            if (!token) {
                error("unresolved: &" + code + ";");
            }
        } else {
            for (int i = 0; i < result.length(); i++) {
                push(result.charAt(i));
            }
        }
    }

    /**
     * Decodes the body of a numeric character reference such as "#65" or "#x41".
     *
     * @return the code point, or -1 if the digits are missing, malformed or
     *         outside 0..0x10FFFF
     */
    private static int parseCharacterReference(String code) {
        try {
            int value;
            if (code.length() > 1 && (code.charAt(1) == 'x' || code.charAt(1) == 'X')) {
                value = Integer.parseInt(code.substring(2), 16);
            } else {
                value = Integer.parseInt(code.substring(1));
            }
            return (value >= 0 && value <= 0x10FFFF) ? value : -1;
        } catch (NumberFormatException e) {
            return -1;
        }
    }

    /** Pushes a code point, as a UTF-16 surrogate pair when it is above U+FFFF. */
    private void pushCodePoint(int codePoint) {
        if (codePoint > 0xFFFF) {
            int offset = codePoint - 0x10000;
            push(0xD800 + (offset >> 10));
            push(0xDC00 + (offset & 0x3FF));
        } else {
            push(codePoint);
        }
    }

    /**
     * Parses a start tag with its attributes and pushes the element on the
     * element stack.
     *
     * @param xmldecl true when parsing the pseudo attributes of an XML
     *                declaration; in that case parsing ends at "?>" and no
     *                element is pushed
     */
    private void parseStartTag(boolean xmldecl) throws IOException {

        if (!xmldecl) {
            read(); // '<'
        }
        name = readName();
        attributeCount = 0;

        while (true) {
            skip();

            int c = peek(0);

            if (xmldecl) {
                if (c == '?') {
                    read();
                    read('>');
                    return;
                }
            } else {
                if (c == '/') {
                    degenerated = true;
                    read();
                    skip();
                    read('>');
                    break;
                }

                if (c == '>') {
                    read();
                    break;
                }
            }

            if (c == -1) {
                error(UNEXPECTED_EOF);
                return;
            }

            String attrName = readName();

            if (attrName.length() == 0) {
                error("attr name expected");
                break;
            }

            int i = (attributeCount++) << 2;

            attributes = ensureCapacity(attributes, i + 4);

            attributes[i++] = "";
            attributes[i++] = null;
            attributes[i++] = attrName;

            skip();

            if (peek(0) != '=') {
                // HTML-style attribute without a value.
                error("Attr.value missing f. " + attrName);
                attributes[i] = "1";
            } else {
                read('=');
                skip();
                int delimiter = peek(0);

                if (delimiter != '\'' && delimiter != '"') {
                    error("attr value delimiter missing!");
                    delimiter = ' ';
                } else {
                    read();
                }

                // The text buffer is borrowed for the value and restored afterwards.
                int p = txtPos;
                pushText(delimiter, true);
                attributes[i] = get(p);
                txtPos = p;

                if (delimiter != ' ') {
                    read(); // skip endquote
                }
            }
        }

        int sp = depth++ << 2;

        elementStack = ensureCapacity(elementStack, sp + 4);
        elementStack[sp + 3] = name; // raw tag name, used to match the end tag

        if (depth >= nspCounts.length) {
            int[] bigger = new int[depth + 4];
            System.arraycopy(nspCounts, 0, bigger, 0, nspCounts.length);
            nspCounts = bigger;
        }

        nspCounts[depth] = nspCounts[depth - 1];

        if (processNsp) {
            adjustNsp();
        } else {
            namespace = "";
        }

        elementStack[sp] = namespace;
        elementStack[sp + 1] = prefix;
        elementStack[sp + 2] = name;
    }

    /**
     * Applies namespace processing to the current start tag: moves xmlns
     * declarations from the attribute list to the namespace stack, splits
     * prefixed attribute names, and resolves the element's own prefix.
     *
     * @return true if at least one attribute carries a non-xmlns prefix
     */
    private boolean adjustNsp() {

        boolean any = false;

        for (int i = 0; i < attributeCount << 2; i += 4) {

            String attrName = attributes[i + 2];
            int cut = attrName.indexOf(':');
            String attrPrefix;

            if (cut != -1) {
                attrPrefix = attrName.substring(0, cut);
                attrName = attrName.substring(cut + 1);
            } else if (attrName.equals("xmlns")) {
                attrPrefix = attrName;
                attrName = null; // default namespace declaration
            } else {
                continue;
            }

            if (!attrPrefix.equals("xmlns")) {
                any = true;
            } else {
                int j = (nspCounts[depth]++) << 1;

                nspStack = ensureCapacity(nspStack, j + 2);
                nspStack[j] = attrName;
                nspStack[j + 1] = attributes[i + 3];

                if (attrName != null && attributes[i + 3].equals("")) {
                    error("illegal empty namespace");
                }

                // Remove the declaration from the attribute list and revisit
                // the slot that now holds the following attribute.
                System.arraycopy(
                        attributes,
                        i + 4,
                        attributes,
                        i,
                        ((--attributeCount) << 2) - i);

                i -= 4;
            }
        }

        if (any) {
            for (int i = (attributeCount << 2) - 4; i >= 0; i -= 4) {

                String attrName = attributes[i + 2];
                int cut = attrName.indexOf(':');

                if (cut == 0 && !relaxed) {
                    throw new RuntimeException(
                            "illegal attribute name: " + attrName + " at " + this);
                } else if (cut != -1) {
                    String attrPrefix = attrName.substring(0, cut);

                    attrName = attrName.substring(cut + 1);

                    String attrNs = getNamespace(attrPrefix);

                    if (attrNs == null && !relaxed) {
                        throw new RuntimeException(
                                "Undefined Prefix: " + attrPrefix + " in " + this);
                    }

                    // In relaxed mode attrNs may be null for an undeclared prefix.
                    attributes[i] = attrNs;
                    attributes[i + 1] = attrPrefix;
                    attributes[i + 2] = attrName;
                }
            }
        }

        int cut = name.indexOf(':');

        if (cut == 0) {
            error("illegal tag name: " + name);
        }

        if (cut != -1) {
            prefix = name.substring(0, cut);
            name = name.substring(cut + 1);
        }

        this.namespace = getNamespace(prefix);

        if (this.namespace == null) {
            if (prefix != null) {
                error("undefined prefix: " + prefix);
            }
            this.namespace = NO_NAMESPACE;
        }

        return any;
    }

    /**
     * Returns the namespace URI bound to {@code prefix} at the current depth,
     * or {@code null} if there is none. A {@code null} prefix looks up the
     * default namespace.
     */
    public String getNamespace(String prefix) {

        if ("xml".equals(prefix)) {
            return "http://www.w3.org/XML/1998/namespace";
        }
        if ("xmlns".equals(prefix)) {
            return "http://www.w3.org/2000/xmlns/";
        }

        // Search from the innermost declaration outwards.
        for (int i = (getNamespaceCount(depth) << 1) - 2; i >= 0; i -= 2) {
            if (prefix == null) {
                if (nspStack[i] == null) {
                    return nspStack[i + 1];
                }
            } else if (prefix.equals(nspStack[i])) {
                return nspStack[i + 1];
            }
        }
        return null;
    }

    /**
     * Returns the number of namespace declarations visible at {@code depth}.
     *
     * @throws IndexOutOfBoundsException if {@code depth} exceeds the current depth
     */
    public int getNamespaceCount(int depth) {
        if (depth > this.depth) {
            throw new IndexOutOfBoundsException();
        }
        return nspCounts[depth];
    }

    /** Consumes one character and reports an error if it is not {@code c}. */
    private void read(char c) throws IOException {
        int a = read();
        if (a != c) {
            error("expected: '" + c + "' actual: '" + ((char) a) + "'");
        }
    }

    /** Consumes and returns the next character (-1 at end of input), updating line and column. */
    private int read() throws IOException {
        int result;

        if (peekCount == 0) {
            result = peek(0);
        } else {
            result = peek[0];
            peek[0] = peek[1];
        }
        peekCount--;

        column++;
        srcLength++;

        if (result == '\n') {
            line++;
            column = 1;
        }

        return result;
    }

    /** Appends a character to the text buffer, growing it when full. */
    private void push(int c) {

        isWhitespace &= c <= ' ';

        if (txtPos == txtBuf.length) {
            char[] bigger = new char[txtPos * 4 / 3 + 4];
            System.arraycopy(txtBuf, 0, bigger, 0, txtPos);
            txtBuf = bigger;
        }
        txtBuf[txtPos++] = (char) c;
    }

    /** Returns {@code arr}, or a larger copy when it has fewer than {@code required} slots. */
    private String[] ensureCapacity(String[] arr, int required) {
        if (arr.length >= required) {
            return arr;
        }
        String[] bigger = new String[required + 16];
        System.arraycopy(arr, 0, bigger, 0, arr.length);
        return bigger;
    }

    /**
     * Advances to the next event, merging adjacent text-like events into one.
     *
     * @return the event type; every type above TEXT is reported as TEXT
     * @throws IllegalStateException if no input has been set
     * @throws IOException if the underlying reader fails
     */
    public int next() throws IOException {

        txtPos = 0;
        isWhitespace = true;
        int minType = 9999;
        token = false;

        do {
            nextImpl();
            if (type < minType) {
                minType = type;
            }
        } while (minType > ENTITY_REF // ignorable
                || (minType >= TEXT && peekType() >= TEXT));

        type = minType;
        if (type > TEXT) {
            type = TEXT;
        }

        return type;
    }

    /** Returns the number of characters consumed since the input was set. */
    public int getLength() {
        return srcLength;
    }

    /** Returns the name of the current tag, or {@code null} for non-tag events. */
    public String getTagName() {
        return name;
    }

    /**
     * Returns the value of the attribute with the given name on the current
     * start tag.
     *
     * @param namespace the attribute namespace, or {@code null} to match any
     * @param name      the attribute name without prefix
     * @return the value, or {@code null} if there is no such attribute
     */
    public String getAttributeValue(String namespace, String name) {

        for (int i = (attributeCount << 2) - 4; i >= 0; i -= 4) {
            // The stored namespace can be null (undeclared prefix in relaxed
            // mode), so compare from the non-null argument.
            if (attributes[i + 2].equals(name)
                    && (namespace == null || namespace.equals(attributes[i]))) {
                return attributes[i + 3];
            }
        }

        return null;
    }

}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: