C语言URL解析器(代码分享) 3ff8
2016-06-12 21:26
330 查看
C语言URL解析器(代码分享)
By qianghaohao(CodeNutter)
本程序可以解析出URL中每个字段的值然后存入结构体
中.也可以调用提供的API只获取需要的部分.
本URL解析器代码来自https://github.com/jwerle/url.h,在此基础上进行了大量的修改,修复了很多bug.总体思路
没有改变,就是把很多细节改了下,现在可以正常使用了.
-->可能还存在bug,如果网友发现了可以及时指正.
感受:花了一天多的时间修复,充分地感受到了IDE对编程的效率的影响,
一个很奇葩的越界导致的问题结果调了半天没找到问题。。。
结果人家五分钟搞定。。。
话不多说,直接上源代码:
接口文件:url_parser.h:
实现文件:url_parser.c:
使用示例:main.c:
使用示例:
By qianghaohao(CodeNutter)
本程序可以解析出URL中每个字段的值然后存入结构体
中.也可以调用提供的API只获取需要的部分.
本URL解析器代码来自https://github.com/jwerle/url.h,在此基础上进行了大量的修改,修复了很多bug.总体思路
没有改变,就是把很多细节改了下,现在可以正常使用了.
-->可能还存在bug,如果网友发现了可以及时指正.
感受:花了一天多的时间修复,充分地感受到了IDE对编程的效率的影响,
一个很奇葩的越界导致的问题结果调了半天没找到问题。。。
结果人家五分钟搞定。。。
话不多说,直接上源代码:
接口文件:url_parser.h:
#ifndef URL_PARSER #define URL_PARSER /** * Dependencies */ #include <stdlib.h> #include <stdio.h> #include <stdbool.h> #include <string.h> /** * url.h version */ #define URL_VERSION 0.0.2 /** * Max length of a url protocol scheme */ #define URL_PROTOCOL_MAX_LENGTH 16 /** * Max length of a url host part */ #define URL_HOSTNAME_MAX_LENGTH 128 /** * Max length of a url tld part */ #define URL_TLD_MAX_LENGTH 16 /** * Max length of a url auth part */ #define URL_AUTH_MAX_LENGTH 32 /** * `url_data` struct that defines parts * of a parsed URL such as host and protocol */ #define URL_MAX_LENGTH 1024 // url结构信息 typedef struct url_data { char *href; char *protocol; char *host; char *auth; char *hostname; char *pathname; char *search; char *path; char *hash; char *query; char *port; } url_data_t; // prototype /** * Parses a url into parts and returns * a `url_data_t *` pointer */ url_data_t * url_parse (char *url); char * url_get_protocol (char *url); char * url_get_auth (char *url); char * url_get_hostname (char *url); char * url_get_host (char *url); char * url_get_pathname (char *url); char * url_get_path (char *url); char * url_get_search (char *url); char * url_get_query (char *url); char * url_get_hash (char *url); char * url_get_port (char *url); void url_free (url_data_t *data); bool url_is_protocol (char *str); bool url_is_ssh (char *str); void url_inspect (char *url); void url_data_inspect (url_data_t *data); #endif
实现文件:url_parser.c:
#include "url_parser.h" /** * URI Schemes * http://en.wikipedia.org/wiki/URI_scheme */ // URL协议头表:用来判断解析出来的协议是否在此表中 char *URL_SCHEMES[] = { // official IANA registered schemes "aaa", "aaas", "about", "acap", "acct", "adiumxtra", "afp", "afs", "aim", "apt", "attachment", "aw", "beshare", "bitcoin", "bolo", "callto", "cap", "chrome", "crome-extension", "com-evenbrite-attendee", "cid", "coap", "coaps","content", "crid", "cvs", "data", "dav", "dict", "lna-playsingle", "dln-playcontainer", "dns", "dtn", "dvb", "ed2k", "facetime", "fax", "feed", "file", "finger", "fish","ftp", "geo", "gg","git", "gizmoproject", "go", "gopher", "gtalk", "h323", "hcp", "http", "https", "iax", "icap", "icon","im", "imap", "info", "ipn", "ipp", "irc", "irc6", "ircs", "iris", "iris.beep", "iris.xpc", "iris.xpcs","iris.lws", "itms", "jabber", "jar", "jms", "keyparc", "lastfm", "ldap", "ldaps", "magnet", "mailserver","mailto", "maps", "market", "message", "mid", "mms", "modem", "ms-help", "mssettings-power", "msnim", "msrp", "msrps", "mtqp", "mumble", "mupdate", "mvn", "news", "nfs", "ni", "nih", "nntp", "notes","oid", "paquelocktoken", "pack", "palm", "paparazzi", "pkcs11", "platform", "pop", "pres", "prospero", "proxy", "psyc","query", "reload", "res", "resource", "rmi", "rsync", "rtmp","rtsp", "secondlife", "service","session", "sftp", "sgn", "shttp", "sieve", "sip", "sips", "skype", "smb", "sms", "snews", "snmp", "soap.beep","soap.beeps", "soldat", "spotify", "ssh", "steam", "svn", "tag", "teamspeak", "tel", "telnet", "tftp", "things","thismessage", "tn3270", "tip", "tv", "udp", "unreal", "urn", "ut2004", "vemmi","ventrilo", "videotex", "view-source", "wais","webcal", "ws", "wss", "wtai", "wyciwyg", "xcon", "xcon-userid", "xfire","xmlrpc.beep", "xmlrpc.beeps", "xmpp", "xri","ymsgr", // unofficial schemes "javascript", "jdbc", "doi" }; char * m_strdup (const char *str) { int n = strlen(str) + 1; char *dup = malloc(n); if (dup) strcpy(dup, str); return dup; } static char * get_part (char *url, const char *format, int l) { bool has = false; char *tmp = malloc(URL_MAX_LENGTH * sizeof(char)); memset(tmp, 0, URL_MAX_LENGTH * sizeof(char)); char *fmt_url = m_strdup(url); char *ret = NULL; if (!tmp || !fmt_url) return NULL; strcpy(tmp, ""); strcpy(fmt_url, ""); // move pointer exactly the amount // of characters in the `prototcol` char // plus 3 characters that represent the `://` // part of the url fmt_url = fmt_url + l; sscanf(fmt_url, format, tmp); // if (0 != strcmp(tmp, tmp_url)) { if (0 != strcmp(tmp, fmt_url)) { has = true; ret = m_strdup(tmp); } // descrement pointer to original // position so it can be free'd fmt_url = fmt_url - l; free(tmp); free(fmt_url); if (!has) { free(ret); return NULL; } else { return ret; } } /************* 解析URL各部分到url_data_t结构体 *************/ url_data_t * url_parse (char *url) { url_data_t *data = malloc(sizeof(url_data_t)); if (!data) return NULL; data->href = url; char *tmp; char *tmp_url = m_strdup(url); bool is_ssh = false; /************ 解析协议头 ***********/ char *protocol = url_get_protocol(tmp_url); if (!protocol) return NULL; // length of protocol plus :// int protocol_len = (int) strlen(protocol) + 3; data->protocol = protocol; is_ssh = url_is_ssh(protocol); /************ 解析用户名 ***********/ char *auth = NULL; int auth_len = 0; if ((tmp = strstr(tmp_url, "@"))) { auth = get_part(tmp_url, "%[^@]", protocol_len); auth_len = strlen(auth); if (auth) auth_len++; } data->auth = auth; char *hostname = NULL; /************ 解析主机名(包括端口号) ***********/ hostname = (is_ssh) ? get_part(tmp_url, "%[^:]", protocol_len + auth_len) : get_part(tmp_url, "%[^/]", protocol_len + auth_len); if (!hostname) return NULL; int hostname_len = (int) strlen(hostname); char *tmp_hostname = m_strdup(hostname); data->hostname = hostname; /************ 解析主机名 ***********/ char *host = malloc(strlen(tmp_hostname) * sizeof(char)); memset(host, 0, strlen(tmp_hostname) * sizeof(char)); sscanf(tmp_hostname, "%[^:]", host); if (!host) return NULL; int host_len = (int) strlen(host); data->host = host; /************ 解析端口号 ***********/ char *port = malloc(URL_PROTOCOL_MAX_LENGTH * (sizeof(char))); memset(port, 0, URL_PROTOCOL_MAX_LENGTH * (sizeof(char))); if (!port) return NULL; tmp_hostname = tmp_hostname + (host_len + 1); sscanf(tmp_hostname, "%s", port); tmp_hostname = tmp_hostname - (host_len + 1); data->port = port; free(tmp_hostname); /************ 解析完整路径名 ***********/ char *tmp_path; tmp_path = (is_ssh) ? get_part(tmp_url, ":%s", protocol_len + auth_len + hostname_len) : get_part(tmp_url, "/%s", protocol_len + auth_len + hostname_len); char *path = malloc(strlen(tmp_path) * sizeof(char) + 1); memset(path, 0, strlen(tmp_path) * sizeof(char) + 1); if (!path) return NULL; char *fmt = (is_ssh)? "%s" : "/%s"; sprintf(path, fmt, tmp_path); data->path = path; free(tmp_path); /************ 解析路径名(不包括参数) ***********/ char *pathname = malloc(strlen(path) * sizeof(char) + 1); memset(pathname, 0, strlen(path) * sizeof(char) + 1); if (!pathname) return NULL; tmp_path = m_strdup(path); sscanf(tmp_path, "%[^? | ^#]", pathname); int pathname_len = strlen(pathname); data->pathname = pathname; /************* 解析搜索参数 ***********/ char *search = malloc(URL_AUTH_MAX_LENGTH * sizeof(search)); memset(search, 0, URL_AUTH_MAX_LENGTH * sizeof(search)); if (!search) return NULL; tmp_path = tmp_path + pathname_len; sscanf(tmp_path, "%[^#]", search); tmp_path = tmp_path - pathname_len; data->search = search; int search_len = strlen(search); free(tmp_path); /************* 解析查询参数 ***********/ char *query = malloc(URL_AUTH_MAX_LENGTH * sizeof(char)); memset(query, 0, URL_AUTH_MAX_LENGTH * sizeof(char)); if (!query) return NULL; sscanf(data->search, "?%s", query); data->query = query; /************* 解析hash值 ***********/ char *hash = malloc(URL_AUTH_MAX_LENGTH * sizeof(char)); memset(hash, 0, URL_AUTH_MAX_LENGTH * sizeof(char)); if (!hash) return NULL; tmp_path = data->path + (pathname_len + search_len); sscanf(tmp_path, "%s", hash); tmp_path = data->path - (pathname_len + search_len); data->hash = hash; free(tmp_path); return data; } bool url_is_protocol (char *str) { int count = sizeof(URL_SCHEMES) / sizeof(URL_SCHEMES[0]); for (int i = 0; i < count; ++i) { if (0 == strcmp(URL_SCHEMES[i], str)) { return true; } } return false; } bool url_is_ssh (char *str) { str = m_strdup(str); if (0 == strcmp(str, "ssh") || 0 == strcmp(str, "git")) { free(str); return true; } return false; } char * url_get_protocol (char *url) { char *protocol = malloc(URL_PROTOCOL_MAX_LENGTH * sizeof(char)); if (!protocol) return NULL; sscanf(url, "%[^://]", protocol); if (url_is_protocol(protocol)) return protocol; return NULL; } char * url_get_auth (char *url) { char *protocol = url_get_protocol(url); if (!protocol) return NULL; int l = (int) strlen(protocol) + 3; return get_part(url, "%[^@]", l); } char * url_get_hostname (char *url) { int l = 3; char *protocol = url_get_protocol(url); char *tmp_protocol = m_strdup(protocol); char *auth = url_get_auth(url); if (!protocol) return NULL; if (auth) l += strlen(auth) + 1; // add one @ symbol if (auth) free(auth); l += (int) strlen(protocol); free(protocol); char * hostname = url_is_ssh(tmp_protocol) ? get_part(url, "%[^:]", l) : get_part(url, "%[^/]", l); free(tmp_protocol); return hostname; } char * url_get_host (char *url) { char *host = malloc(URL_HOSTNAME_MAX_LENGTH * sizeof(char)); char *hostname = url_get_hostname(url); memset(host, 0, URL_HOSTNAME_MAX_LENGTH * sizeof(char)); if (!host || !hostname) return NULL; sscanf(hostname, "%[^:]", host); free(hostname); return host; } char * url_get_pathname (char *url) { char *path = url_get_path(url); char *pathname = malloc(URL_MAX_LENGTH * sizeof(char)); memset(pathname, 0, URL_MAX_LENGTH * sizeof(char)); if (!path || !pathname) return NULL; sscanf(path, "%[^?]", pathname); free(path); return pathname; } char * url_get_path (char *url) { int l = 3; char *tmp_path; char *protocol = url_get_protocol(url); char *auth = url_get_auth(url); char *hostname = url_get_hostname(url); if (!protocol || !hostname) return NULL; bool is_ssh = url_is_ssh(protocol); l += (int) strlen(protocol) + (int) strlen(hostname); if (auth) l+= (int) strlen(auth) +1; // @ symbol tmp_path = (is_ssh) ? get_part(url, ":%s", l) : get_part(url, "/%s", l); char *fmt = (is_ssh)? "%s" : "/%s"; char *path = malloc(strlen(tmp_path) * sizeof(char) + 1); memset(path, 0, strlen(tmp_path) * sizeof(char) + 1); sprintf(path, fmt, tmp_path); if (auth) free(auth); free(protocol); free(hostname); free(tmp_path); return path; } char * url_get_search (char *url) { char *path = url_get_path(url); char *pathname = url_get_pathname(url); char *search = malloc(URL_AUTH_MAX_LENGTH * sizeof(char)); memset(search, 0, URL_AUTH_MAX_LENGTH * sizeof(char)); if (!path || !search) return NULL; path = path + (int)strlen(pathname); sscanf(path, "%[^#]", search); path = path - (int)strlen(pathname); free(path); free(pathname); return search; } char * url_get_query (char *url) { char *search = url_get_search(url); char *query = malloc(URL_AUTH_MAX_LENGTH * sizeof(char)); memset(query, 0, URL_AUTH_MAX_LENGTH * sizeof(char)); if (!search) return NULL; sscanf(search, "?%s", query); free(search); return query; } char * url_get_hash (char *url) { char *hash = malloc(URL_AUTH_MAX_LENGTH * sizeof(char)); memset(hash, 0, URL_AUTH_MAX_LENGTH * sizeof(char)); if (!hash) return NULL; char *path = url_get_path(url); if (!path) ret 3ff0 urn NULL; char *pathname = url_get_pathname(url); if (!pathname) return NULL; char *search = url_get_search(url); int pathname_len = (int) strlen(pathname); int search_len = (int) strlen(search); path = path + (pathname_len + search_len); sscanf(path, "%s", hash); path = path - (pathname_len + search_len); free(pathname); free(path); if (search) free(search); return hash; } char * url_get_port (char *url) { char *port = malloc(URL_PROTOCOL_MAX_LENGTH * sizeof(char)); memset(port, 0, URL_PROTOCOL_MAX_LENGTH * sizeof(char)); char *hostname = url_get_hostname(url); char *host = url_get_host(url); if (!port || !hostname) return NULL; char *tmp_hostname = hostname; tmp_hostname = hostname + (strlen(host) + 1); sscanf(tmp_hostname, "%s", port); free(hostname); return port; } void url_inspect (char *url) { url_data_inspect(url_parse(url)); } void url_data_inspect (url_data_t *data) { printf("#url =>\n"); printf(" .href: \"%s\"\n", data->href); printf(" .protocol: \"%s\"\n", data->protocol); printf(" .host: \"%s\"\n", data->host); printf(" .auth: \"%s\"\n", data->auth); printf(" .hostname: \"%s\"\n", data->hostname); printf(" .pathname: \"%s\"\n", data->pathname); printf(" .search: \"%s\"\n", data->search); printf(" .path: \"%s\"\n", data->path); printf(" .hash: \"%s\"\n", data->hash); printf(" .query: \"%s\"\n", data->query); printf(" .port: \"%s\"\n", data->port); } void url_free (url_data_t *data) { if (!data) return; if (data->auth) free(data->auth); if (data->protocol) free(data->protocol); if (data->hostname) free(data->hostname); if (data->host) free(data->host); if (data->pathname) free(data->pathname); if (data->path) free(data->path); if (data->hash) free(data->hash); if (data->search) free(data->search); if (data->query) free(data->query); }
使用示例:main.c:
/************************************************************************* > File Name: main.c > Author: qianghaohao(Xqiang) > Program: url_parser > Readme: 本URL解析器代码来自https://github.com/jwerle/url.h > 在此基础上进行了大量的修改,修复了很多bug. > 总体思路没有改变,就是把很多细节改了下,现在可以正常使用了. > -->可能还存在bug,如果网友发现了可以及时指正. > Platform: 可以跨平台使用 > Created Time: 2016年06月11日 星期六 18时39分14秒 ************************************************************************/ #include"url_parser.h" url_data_t *url_info = NULL; int main(int argc, char **argv) { if (argc != 2) { fprintf(stderr, "usage:%s <url>", argv[0]); return -1; } url_info = url_parse(argv[1]); if (NULL == url_info) { fprintf(stderr, "%s error...\n", argv[1]); return -1; } url_data_inspect(url_info); // 测试用例:http://www.baidu.com:80/cig-bin/index.html?sdkfj#283sjkdf printf("\n================= API Demo ================\n"); printf("herf:%s\n", argv[1]); char *hostname = url_get_host(argv[1]); printf("hostname:%s\n", hostname); char *path = url_get_path(argv[1]); printf("path:%s\n", path); char *host = url_get_host(argv[1]); printf("host:%s\n", host); char *proto = url_get_protocol(argv[1]); printf("protocol:%s\n", proto); char *auth = url_get_auth(argv[1]); printf("auth:%s\n", auth); char *search = url_get_search(argv[1]); printf("search:%s\n", search); char *hash = url_get_hash(argv[1]); printf("hash:%s\n", hash); char *query = url_get_query(argv[1]); printf("query:%s\n", query); char *port = url_get_port(argv[1]); printf("port:%s\n", port); return 0; }
使用示例:
相关文章推荐
- C语言URL解析器(代码分享)
- C++高级排序算法详解
- c++实现Face++ API的调用
- [C++ 面试基础知识总结] 类
- Emacs中实现C/C++函数列表显示
- 步步为营<三> C语言 语句和表达式
- c++11多线程简介
- C/C++,运算符重载
- GeekBand c++學習筆記之五(對象模型與動態綁定的討論)
- C++在单片机上面的啪啪啪
- 循环队列 作为消息队列
- const 在基本语言中的总结
- 【C++】第1章 在VS2015中用C++编写控制台应用程序
- C++STL 之map
- C语言链接mysql常用函数
- c++ new string数组问题
- 模拟 - 约瑟夫环问题及其变式
- 关于约瑟夫问题的那点事
- C 指向指针的指针