MyXML源码分析系列:我自己写的一个XML解析-基于状态图
2013-11-25 00:00
681 查看
项目需要,就自己写了一个解析XML报文的函数,返回值有4种:
1 当前XML不完整,还缺少字符(XML_NO_COMPLETE_LOSS_CHAR)。
2 能找到一个完整的XML节点,并且没有多余的字符剩下(XML_COMPLETE)。
3 能找到一个完整的XML节点,并且有多余的字符还在缓冲区中(XML_NO_COMPLETE_MORE_CHAR)。
4 报文格式错误(XML_STRING_ERROR)。
稍微改造下,添加一些处理节点和属性的代码,
就可以构造成码农版XML解析器。(有兴趣的码农可以拿过去改造下,很简单。)
PS:我项目中的XML是从网络报文中抓取的,不包含换行符和多余的空白字符;
如果是读取文件,就需要考虑这些因素,当然也不难,仔细一点就可以。
废话不多说,上代码:
C语言代码如下:
根据这种思路,就可以写出JSON的解析器,
思路还是如上,根据状态图来考虑即可。
特点是流式解析。
1 当前XML不完整.
2 能找到一个完整的XML,并且没有多余的字符剩下。
3 能找到一个完整的XML节点,并且有多余的字符还在缓冲区中。
4 报文格式错误
稍微改造下,添加一些处理节点和属性的代码,
就可以构造成码农版XML解析器。(有兴趣的码农可以拿过去改造下,很简单。)
PS:我项目中的XML是从网络报文抓取的,没有换行符和多余的空白这些东西的,
如果是读取文件需要考虑这些因素,当然也不难,仔细一点就可以。
废话不多说,上代码:
/* Return codes of im_xml_is_complete(). */
#define XML_STRING_ERROR 0          /* an error occurred; nothing could be measured */
#define XML_COMPLETE 1              /* exactly one complete node, nothing left over */
#define XML_NO_COMPLETE_LOSS_CHAR 2 /* input ends before the node is complete */
#define XML_NO_COMPLETE_MORE_CHAR 3 /* one complete node, followed by more characters */

/*
 * Numbered character positions behind the XML_STATE_RECEIVED_TAG_<N>_CHAR names:
 *
 *   <iq>...</iq>   '<' = 1, '>' = 2, '<' = 3, '/' = 4, '>' = 5
 *   <iq />         '<' = 1, '/' = 6, '>' = 7
 *   from=" "       '=' = 8, opening '"' = 9, closing '"' = 10
 */
#define XML_STATE_CLOSED 0                              /* initial state */
#define XML_STATE_RECEIVED_TAG_FIRST_CHAR 1             /* received character kind 1 (see legend above) */
#define XML_STATE_RECEIVED_TAG_SECOND_CHAR 2
#define XML_STATE_RECEIVED_TAG_THIRD_CHAR 3
#define XML_STATE_RECEIVED_TAG_FORTH_CHAR 4
#define XML_STATE_RECEIVED_TAG_FIFTH_CHAR 5
#define XML_STATE_RECEIVED_TAG_SIXTH_CHAR 6
#define XML_STATE_RECEIVED_TAG_SEVENTH_CHAR 7           /* received character kind 7 */
#define XML_STATE_RECEIVED_TAG_EIGHTH_CHAR 8            /* received character kind 8 */
#define XML_STATE_RECEIVED_TAG_NINTH_CHAR 9             /* received character kind 9 */
#define XML_STATE_RECEIVED_TAG_TENTH_CHAR 10            /* received character kind 10 */
#define XML_STATE_RECEIVED_TAG_START_CONTENT_CHAR 11    /* inside the name of a start tag */
#define XML_STATE_RECEIVED_TAG_END_CONTENT_CHAR 12      /* inside the name of an end tag */
#define XML_STATE_WAITING_FOR_RECEIVE_ATTRIBUTES_CHAR 13 /* waiting for an attribute to start */
#define XML_STATE_RECEIVED_ATTRIBUTE_TAG_CHAR 14        /* inside an attribute name */
#define XML_STATE_RECEIVED_ATTRIBUTE_CONTENT_CHAR 15    /* inside an attribute value */
#define XML_STATE_RECEIVED_INNER_TEXT_CONTENT_CHAR 16   /* inside element text */
C语言代码如下:
//originlength表示其最初的长度,不需要str以\0结尾 STATIC INT32 im_xml_is_complete(INT8* str,UINT32 originlength,UINT32* length,INT8** innertextBegin,INT8** innertextEnd) { UINT32 layer=0;//当前存在的节点的层次 INT32 result=R_ERROR; INT32 state=XML_STATE_CLOSED;//最开始为关闭状态 INT8 c=0; INT8* p=str; UINT32 innertext=0;//0表示还没有登记过,1表示已经登记过 //innertextBegin,innertextEnd可以为空 if(NULL==str || 0==originlength || str[0]!='<' || NULL==length) { return XML_STRING_ERROR; } //im_log(DEBUG_LEVEL,"state: XML_STATE_CLOSED"); //表明是一个有效的字符串 while('\0'!=*p) { c=*p; //im_log(DEBUG_LEVEL,"get char %c",c); if(XML_STATE_CLOSED==state) { if('<'==c) { state=XML_STATE_RECEIVED_TAG_FIRST_CHAR; layer++; } else goto error; } else if(XML_STATE_RECEIVED_TAG_FIRST_CHAR==state) { if('<'==c||'/'==c||'>'==c||' '==c) goto error; else state=XML_STATE_RECEIVED_TAG_START_CONTENT_CHAR; } else if(XML_STATE_RECEIVED_TAG_START_CONTENT_CHAR==state) { if('<'==c) goto error; else if('/'==c) { state=XML_STATE_RECEIVED_TAG_SIXTH_CHAR; } else if('>'==c) state=XML_STATE_RECEIVED_TAG_SECOND_CHAR; else if(' '==c) state=XML_STATE_WAITING_FOR_RECEIVE_ATTRIBUTES_CHAR; else state=XML_STATE_RECEIVED_TAG_START_CONTENT_CHAR; } else if(XML_STATE_RECEIVED_TAG_SIXTH_CHAR==state) { if('>'==c) { //表明当前标签结束了 layer--; //但是有可能存在父亲节点,所以需要根据父亲节点的层次来判断 if(0==layer)//表明没有节点了,到达了closed状态 { //此时已经可以返回了,返回时返回两种状态 //if(*(p+1)=='\0')//表明正好是一个完整的XML if(p+1-str==originlength)//表明正好是一个完整的XML { result=XML_COMPLETE; *length=p-str+1; return result; } else { //表明还存在多余的字符串 //则计算出当前的字符串长度 *length=p-str+1; return XML_NO_COMPLETE_MORE_CHAR; } } else { //表明还存在父节点 state=XML_STATE_RECEIVED_TAG_SECOND_CHAR; } } else goto error; } else if(XML_STATE_WAITING_FOR_RECEIVE_ATTRIBUTES_CHAR==state) { //表明在等待收到属性字符串 if('<'==c||'='==c||'"'==c) goto error; else if('>'==c) { state=XML_STATE_RECEIVED_TAG_SECOND_CHAR; } else if('/'==c) { state=XML_STATE_RECEIVED_TAG_SIXTH_CHAR; } else if(' '==c) //保持当前状态 state=XML_STATE_WAITING_FOR_RECEIVE_ATTRIBUTES_CHAR; else //收到了有效属性字符 
state=XML_STATE_RECEIVED_ATTRIBUTE_TAG_CHAR; } else if(XML_STATE_RECEIVED_TAG_SECOND_CHAR==state) { if('<'==c) { //收到这个字符时,不清楚当前的状态,需要根据后面一个字符来判断 //if(*(p+1)=='\0') if(p+1-str==originlength) { return XML_NO_COMPLETE_LOSS_CHAR; } else if(*(p+1)=='/') { //表明是准备结束的 state=XML_STATE_RECEIVED_TAG_THIRD_CHAR; } else { //确实是起一个新的标签 layer++; state=XML_STATE_RECEIVED_TAG_FIRST_CHAR; } } else if('/'==c ||'>'==c) { goto error; } else { state=XML_STATE_RECEIVED_INNER_TEXT_CONTENT_CHAR; //如果是第一个innertext则登记 if(0==innertext&&NULL!=innertextBegin) { //此时不应该将innertext置为1 *innertextBegin=p; } } } else if(XML_STATE_RECEIVED_INNER_TEXT_CONTENT_CHAR==state) { if('<'==c) { state=XML_STATE_RECEIVED_TAG_THIRD_CHAR; if(0==innertext&&NULL!=innertextEnd) { *innertextEnd=p-1; innertext=1;//表示以后不写了,找到第一个就行了 } } //else if('/'==c||'>'==c) goto error; //支持里面包含/字符 else if('>'==c) goto error; else state=XML_STATE_RECEIVED_INNER_TEXT_CONTENT_CHAR; } else if(XML_STATE_RECEIVED_TAG_THIRD_CHAR==state) { if('/'==c) state=XML_STATE_RECEIVED_TAG_FORTH_CHAR; else goto error; } else if(XML_STATE_RECEIVED_TAG_FORTH_CHAR==state) { //第四种状态 if('<'==c || '/'==c ||'>'==c) goto error; else state=XML_STATE_RECEIVED_TAG_END_CONTENT_CHAR; } else if(XML_STATE_RECEIVED_TAG_END_CONTENT_CHAR==state) { //已经收到了结束字符 if('<'==c || '/'==c || ' '==c) goto error; else if('>'==c) { //表明结束了 layer--; //但是有可能存在父亲节点,所以需要根据父亲节点的层次来判断 if(0==layer)//表明没有节点了,到达了closed状态 { //此时已经可以返回了,返回时返回两种状态 //if(*(p+1)=='\0') if(p+1-str==originlength) { result=XML_COMPLETE; *length=p-str+1; return result; } else { //表明还存在多余的字符串 //则计算出当前的字符串长度 *length=p-str+1; return XML_NO_COMPLETE_MORE_CHAR; } } else { //表明还存在父节点 state=XML_STATE_RECEIVED_TAG_SECOND_CHAR; } } else state=XML_STATE_RECEIVED_TAG_END_CONTENT_CHAR; } else if(XML_STATE_RECEIVED_ATTRIBUTE_TAG_CHAR==state) { if('<'==c||' '==c||'>'==c||'/'==c||'"'==c) goto error; else if('='==c) { state=XML_STATE_RECEIVED_TAG_EIGHTH_CHAR; } else //保持状态 state=XML_STATE_RECEIVED_ATTRIBUTE_TAG_CHAR; } else 
if(XML_STATE_RECEIVED_TAG_EIGHTH_CHAR==state) { //这种情况下只能是收到" if('"'==c) state=XML_STATE_RECEIVED_TAG_NINTH_CHAR; else goto error; } else if(XML_STATE_RECEIVED_TAG_NINTH_CHAR==state) { //已经收到了属性内容的开始的" //这里不对属性字符串的内容做过多限制 if(c=='"') state=XML_STATE_RECEIVED_TAG_TENTH_CHAR; else { state=XML_STATE_RECEIVED_ATTRIBUTE_CONTENT_CHAR; } } else if(XML_STATE_RECEIVED_TAG_TENTH_CHAR==state) { if('<'==c||'='==c||'"'==c) goto error; else if('/'==c) { state=XML_STATE_RECEIVED_TAG_SIXTH_CHAR; } else if('>'==c) state=XML_STATE_RECEIVED_TAG_SECOND_CHAR; else if(' '==c) state=XML_STATE_WAITING_FOR_RECEIVE_ATTRIBUTES_CHAR; else { //错误的 goto error; } } else if(XML_STATE_RECEIVED_ATTRIBUTE_CONTENT_CHAR==state) { if(c=='"') state=XML_STATE_RECEIVED_TAG_TENTH_CHAR; else { state=XML_STATE_RECEIVED_ATTRIBUTE_CONTENT_CHAR; } } if(0)//暂时不打印 { switch(state) { case XML_STATE_CLOSED: im_log(DEBUG_LEVEL,"state:%s","XML_STATE_CLOSED"); break; case XML_STATE_RECEIVED_TAG_FIRST_CHAR: im_log(DEBUG_LEVEL,"state:%s","XML_STATE_RECEIVED_TAG_FIRST_CHAR"); break; case XML_STATE_RECEIVED_TAG_SECOND_CHAR: im_log(DEBUG_LEVEL,"state:%s","XML_STATE_RECEIVED_TAG_SECOND_CHAR"); break; case XML_STATE_RECEIVED_TAG_THIRD_CHAR: im_log(DEBUG_LEVEL,"state:%s","XML_STATE_RECEIVED_TAG_THIRD_CHAR"); break; case XML_STATE_RECEIVED_TAG_FORTH_CHAR: im_log(DEBUG_LEVEL,"state:%s","XML_STATE_RECEIVED_TAG_FORTH_CHAR"); break; case XML_STATE_RECEIVED_TAG_FIFTH_CHAR: im_log(DEBUG_LEVEL,"state:%s","XML_STATE_RECEIVED_TAG_FIFTH_CHAR"); break; case XML_STATE_RECEIVED_TAG_SIXTH_CHAR: im_log(DEBUG_LEVEL,"state:%s","XML_STATE_RECEIVED_TAG_SIXTH_CHAR"); break; case XML_STATE_RECEIVED_TAG_SEVENTH_CHAR: im_log(DEBUG_LEVEL,"state:%s","XML_STATE_RECEIVED_TAG_SEVENTH_CHAR"); break; case XML_STATE_RECEIVED_TAG_EIGHTH_CHAR: im_log(DEBUG_LEVEL,"state:%s","XML_STATE_RECEIVED_TAG_EIGHTH_CHAR"); break; case XML_STATE_RECEIVED_TAG_NINTH_CHAR: im_log(DEBUG_LEVEL,"state:%s","XML_STATE_RECEIVED_TAG_NINTH_CHAR"); break; case 
XML_STATE_RECEIVED_TAG_TENTH_CHAR: im_log(DEBUG_LEVEL,"state:%s","XML_STATE_RECEIVED_TAG_TENTH_CHAR"); break; case XML_STATE_RECEIVED_TAG_START_CONTENT_CHAR: im_log(DEBUG_LEVEL,"state:%s","XML_STATE_RECEIVED_TAG_START_CONTENT_CHAR"); break; case XML_STATE_RECEIVED_TAG_END_CONTENT_CHAR: im_log(DEBUG_LEVEL,"state:%s","XML_STATE_RECEIVED_TAG_END_CONTENT_CHAR"); break; case XML_STATE_WAITING_FOR_RECEIVE_ATTRIBUTES_CHAR: im_log(DEBUG_LEVEL,"state:%s","XML_STATE_WAITING_FOR_RECEIVE_ATTRIBUTES_CHAR"); break; case XML_STATE_RECEIVED_ATTRIBUTE_TAG_CHAR: im_log(DEBUG_LEVEL,"state:%s","XML_STATE_RECEIVED_ATTRIBUTE_TAG_CHAR"); break; case XML_STATE_RECEIVED_ATTRIBUTE_CONTENT_CHAR: im_log(DEBUG_LEVEL,"state:%s","XML_STATE_RECEIVED_ATTRIBUTE_CONTENT_CHAR"); break; case XML_STATE_RECEIVED_INNER_TEXT_CONTENT_CHAR: im_log(DEBUG_LEVEL,"state:%s","XML_STATE_RECEIVED_INNER_TEXT_CONTENT_CHAR"); break; default: break; } } p++; } return XML_NO_COMPLETE_LOSS_CHAR; error: return XML_STRING_ERROR; }
根据这种思路,就可以写出JSON的解析器,
思路还是如上,根据状态图来考虑即可。
特点是流式解析。
相关文章推荐
- MyBatis-3.4.2-源码分析1:解析XML之propertiesElement(root.evalNode("properties"))
- MyBatis-3.4.2-源码分析14:XML解析之sqlElement(context.evalNodes("/mapper/sql"))
- jQuery-1.9.1源码分析系列(三) Sizzle选择器引擎——词法解析
- webkit 源码分析系列--css样式解析
- Android源码解析四大组件系列(一)---Service的启动过程分析
- MySQL系列:innodb源码分析之page结构解析
- Mybatis3源码分析(三):解析mapper的xml配置文件
- jQuery-1.9.1源码分析系列(三) Sizzle选择器引擎——词法解析
- 源码分析 —— AsyncTask 完全解析(基于7.0)
- “基于MFC的第三方XML解析和生成API-------CMarkup类”的一个必须要注意的地方
- 基于rt-thread+lwip源码分析-LWIP的IP层数据处理代码解析(lwip源码解析二)
- mybatis源码解析 - 通过一个简单查询例子分析流程
- 解析从源码分析常见的基于Array的数据结构动态扩容机制的详解
- Spring Ioc 源码分析(一)- XML 解析
- MyBatis-3.4.2-源码分析7:解析XML之settingsElement(settings)
- Spring3.2 中 Bean 定义之基于 XML 配置方式的源码解析
- MyBatis-3.4.2-源码分析6:解析XML之objectWrapperFactoryElement & reflectorFactoryElement
- 基于rt-thread+lwip分析数据是怎么从网卡芯片接收数据到pbuf的(lwip源码解析一)
- MyBatis-3.4.2-源码分析2:解析XML之settingsAsProperties(root.evalNode("settings"))
- MyBatis-3.4.2-源码分析5:解析XML之objectFactoryElement(root.evalNode("objectFactory"))