您的位置:首页 > 编程语言 > C语言/C++

C语言URL解析器(代码分享) 3ff8

2016-06-12 21:26 330 查看
C语言URL解析器(代码分享)

By qianghaohao(CodeNutter)

本程序可以解析出URL中每个字段的值然后存入结构体
中.也可以调用提供的API只获取需要的部分.
本URL解析器代码来自https://github.com/jwerle/url.h,在此基础上进行了大量的修改,修复了很多bug.总体思路
没有改变,就是把很多细节改了下,现在可以正常使用了.
-->可能还存在bug,如果网友发现了可以及时指正.

感受:花了一天多的时间修复,充分地感受到了IDE对编程的效率的影响,
一个很奇葩的越界导致的问题结果调了半天没找到问题。。。
结果人家五分钟搞定。。。

话不多说,直接上源代码:

接口文件:url_parser.h:

#ifndef URL_PARSER
#define URL_PARSER

/** * Dependencies */

#include <stdlib.h>
#include <stdio.h>
#include <stdbool.h>
#include <string.h>

/** * url.h version */

#define URL_VERSION 0.0.2

/** * Max length of a url protocol scheme */

#define URL_PROTOCOL_MAX_LENGTH 16

/** * Max length of a url host part */

#define URL_HOSTNAME_MAX_LENGTH 128

/** * Max length of a url tld part */

#define URL_TLD_MAX_LENGTH 16

/** * Max length of a url auth part */

#define URL_AUTH_MAX_LENGTH 32

/** * `url_data` struct that defines parts * of a parsed URL such as host and protocol */

#define URL_MAX_LENGTH 1024

// url结构信息
typedef struct url_data {
char *href;
char *protocol;
char *host;
char *auth;
char *hostname;
char *pathname;
char *search;
char *path;
char *hash;
char *query;
char *port;
} url_data_t;

// prototype

/** * Parses a url into parts and returns * a `url_data_t *` pointer */

url_data_t *
url_parse (char *url);

char *
url_get_protocol (char *url);

char *
url_get_auth (char *url);

char *
url_get_hostname (char *url);

char *
url_get_host (char *url);

char *
url_get_pathname (char *url);

char *
url_get_path (char *url);

char *
url_get_search (char *url);

char *
url_get_query (char *url);

char *
url_get_hash (char *url);

char *
url_get_port (char *url);

void
url_free (url_data_t *data);

bool
url_is_protocol (char *str);

bool
url_is_ssh (char *str);

void
url_inspect (char *url);

void
url_data_inspect (url_data_t *data);

#endif

实现文件:url_parser.c:

#include "url_parser.h"

/**
* URI Schemes
* http://en.wikipedia.org/wiki/URI_scheme */
//  URL协议头表:用来判断解析出来的协议是否在此表中
char *URL_SCHEMES[] = {
// official IANA registered schemes
"aaa", "aaas", "about", "acap", "acct", "adiumxtra", "afp", "afs", "aim", "apt", "attachment", "aw",
"beshare", "bitcoin", "bolo", "callto", "cap", "chrome", "crome-extension", "com-evenbrite-attendee",
"cid", "coap", "coaps","content", "crid", "cvs", "data", "dav", "dict", "lna-playsingle", "dln-playcontainer",
"dns", "dtn", "dvb", "ed2k", "facetime", "fax", "feed", "file", "finger", "fish","ftp", "geo", "gg","git",
"gizmoproject", "go", "gopher", "gtalk", "h323", "hcp", "http", "https", "iax", "icap", "icon","im",
"imap", "info", "ipn", "ipp", "irc", "irc6", "ircs", "iris", "iris.beep", "iris.xpc", "iris.xpcs","iris.lws",
"itms", "jabber", "jar", "jms", "keyparc", "lastfm", "ldap", "ldaps", "magnet", "mailserver","mailto",
"maps", "market", "message", "mid", "mms", "modem", "ms-help", "mssettings-power", "msnim", "msrp",
"msrps", "mtqp", "mumble", "mupdate", "mvn", "news", "nfs", "ni", "nih", "nntp", "notes","oid",
"paquelocktoken", "pack", "palm", "paparazzi", "pkcs11", "platform", "pop", "pres", "prospero", "proxy",
"psyc","query", "reload", "res", "resource", "rmi", "rsync", "rtmp","rtsp",  "secondlife", "service","session",
"sftp", "sgn", "shttp", "sieve", "sip", "sips", "skype", "smb", "sms", "snews", "snmp", "soap.beep","soap.beeps",
"soldat", "spotify", "ssh", "steam", "svn", "tag", "teamspeak", "tel", "telnet", "tftp", "things","thismessage",
"tn3270", "tip", "tv", "udp", "unreal", "urn", "ut2004", "vemmi","ventrilo", "videotex", "view-source", "wais","webcal",
"ws", "wss", "wtai", "wyciwyg", "xcon", "xcon-userid", "xfire","xmlrpc.beep",  "xmlrpc.beeps", "xmpp", "xri","ymsgr",

// unofficial schemes
"javascript", "jdbc", "doi"
};

char *
m_strdup (const char *str) {
int n = strlen(str) + 1;
char *dup = malloc(n);
if (dup) strcpy(dup, str);
return dup;
}

static char *
get_part (char *url, const char *format, int l) {
bool has = false;

char *tmp = malloc(URL_MAX_LENGTH * sizeof(char));
memset(tmp, 0, URL_MAX_LENGTH * sizeof(char));
char *fmt_url = m_strdup(url);
char *ret = NULL;
if (!tmp || !fmt_url)
return NULL;

strcpy(tmp, "");
strcpy(fmt_url, "");

// move pointer exactly the amount
// of characters in the `prototcol` char
// plus 3 characters that represent the `://`
// part of the url
fmt_url = fmt_url + l;
sscanf(fmt_url, format, tmp);

// if (0 != strcmp(tmp, tmp_url)) {
if (0 != strcmp(tmp, fmt_url)) {
has = true;
ret = m_strdup(tmp);
}

// descrement pointer to original
// position so it can be free'd
fmt_url = fmt_url - l;
free(tmp);
free(fmt_url);
if (!has) {
free(ret);
return NULL;
} else {
return ret;
}
}

/************* 解析URL各部分到url_data_t结构体 *************/
url_data_t *
url_parse (char *url) {
url_data_t *data = malloc(sizeof(url_data_t));
if (!data) return NULL;

data->href = url;
char *tmp;
char *tmp_url = m_strdup(url);
bool is_ssh = false;

/************ 解析协议头 ***********/
char *protocol = url_get_protocol(tmp_url);
if (!protocol) return NULL;
// length of protocol plus ://
int protocol_len = (int) strlen(protocol) + 3;
data->protocol = protocol;

is_ssh = url_is_ssh(protocol);

/************ 解析用户名 ***********/
char *auth = NULL;
int auth_len = 0;
if ((tmp = strstr(tmp_url, "@"))) {
auth = get_part(tmp_url, "%[^@]", protocol_len);
auth_len = strlen(auth);
if (auth) auth_len++;
}

data->auth = auth;
char *hostname = NULL;

/************ 解析主机名(包括端口号) ***********/
hostname = (is_ssh)
? get_part(tmp_url, "%[^:]", protocol_len + auth_len)
: get_part(tmp_url, "%[^/]", protocol_len + auth_len);

if (!hostname) return NULL;
int hostname_len = (int) strlen(hostname);
char *tmp_hostname = m_strdup(hostname);
data->hostname = hostname;

/************ 解析主机名 ***********/
char *host = malloc(strlen(tmp_hostname) * sizeof(char));
memset(host, 0, strlen(tmp_hostname) * sizeof(char));
sscanf(tmp_hostname, "%[^:]", host);
if (!host) return NULL;
int host_len = (int) strlen(host);
data->host = host;

/************ 解析端口号 ***********/
char *port = malloc(URL_PROTOCOL_MAX_LENGTH * (sizeof(char)));
memset(port, 0, URL_PROTOCOL_MAX_LENGTH * (sizeof(char)));
if (!port) return NULL;

tmp_hostname = tmp_hostname + (host_len + 1);
sscanf(tmp_hostname, "%s", port);
tmp_hostname = tmp_hostname - (host_len + 1);
data->port = port;
free(tmp_hostname);

/************ 解析完整路径名 ***********/
char *tmp_path;
tmp_path = (is_ssh)
? get_part(tmp_url, ":%s", protocol_len + auth_len + hostname_len)
: get_part(tmp_url, "/%s", protocol_len + auth_len + hostname_len);

char *path = malloc(strlen(tmp_path) * sizeof(char) + 1);
memset(path, 0, strlen(tmp_path) * sizeof(char) + 1);
if (!path) return NULL;
char *fmt = (is_ssh)? "%s" : "/%s";
sprintf(path, fmt, tmp_path);
data->path = path;
free(tmp_path);

/************ 解析路径名(不包括参数) ***********/
char *pathname = malloc(strlen(path) * sizeof(char) + 1);
memset(pathname, 0, strlen(path) * sizeof(char) + 1);

if (!pathname) return NULL;
tmp_path = m_strdup(path);
sscanf(tmp_path, "%[^? | ^#]", pathname);
int pathname_len = strlen(pathname);
data->pathname = pathname;

/************* 解析搜索参数 ***********/
char *search = malloc(URL_AUTH_MAX_LENGTH * sizeof(search));
memset(search, 0, URL_AUTH_MAX_LENGTH * sizeof(search));
if (!search) return NULL;
tmp_path = tmp_path + pathname_len;
sscanf(tmp_path, "%[^#]", search);
tmp_path = tmp_path - pathname_len;
data->search = search;
int search_len = strlen(search);
free(tmp_path);

/************* 解析查询参数 ***********/
char *query = malloc(URL_AUTH_MAX_LENGTH * sizeof(char));
memset(query, 0, URL_AUTH_MAX_LENGTH * sizeof(char));
if (!query) return NULL;
sscanf(data->search, "?%s", query);
data->query = query;

/************* 解析hash值 ***********/
char *hash = malloc(URL_AUTH_MAX_LENGTH * sizeof(char));
memset(hash, 0, URL_AUTH_MAX_LENGTH * sizeof(char));
if (!hash) return NULL;
tmp_path = data->path + (pathname_len + search_len);
sscanf(tmp_path, "%s", hash);
tmp_path = data->path - (pathname_len + search_len);
data->hash = hash;
free(tmp_path);

return data;
}

bool
url_is_protocol (char *str) {
int count = sizeof(URL_SCHEMES) / sizeof(URL_SCHEMES[0]);

for (int i = 0; i < count; ++i) {
if (0 == strcmp(URL_SCHEMES[i], str)) {
return true;
}
}

return false;
}

bool
url_is_ssh (char *str) {
str = m_strdup(str);
if (0 == strcmp(str, "ssh") ||
0 == strcmp(str, "git")) {
free(str);
return true;
}

return false;
}

char *
url_get_protocol (char *url) {
char *protocol = malloc(URL_PROTOCOL_MAX_LENGTH * sizeof(char));
if (!protocol) return NULL;
sscanf(url, "%[^://]", protocol);
if (url_is_protocol(protocol)) return protocol;
return NULL;
}

char *
url_get_auth (char *url) {
char *protocol = url_get_protocol(url);
if (!protocol) return NULL;
int l = (int) strlen(protocol) + 3;
return get_part(url, "%[^@]", l);
}

char *
url_get_hostname (char *url) {
int l = 3;
char *protocol = url_get_protocol(url);
char *tmp_protocol = m_strdup(protocol);
char *auth = url_get_auth(url);

if (!protocol) return NULL;
if (auth) l += strlen(auth) + 1; // add one @ symbol
if (auth) free(auth);

l += (int) strlen(protocol);

free(protocol);

char * hostname = url_is_ssh(tmp_protocol)
? get_part(url, "%[^:]", l)
: get_part(url, "%[^/]", l);
free(tmp_protocol);
return hostname;
}

char *
url_get_host (char *url) {
char *host = malloc(URL_HOSTNAME_MAX_LENGTH * sizeof(char));
char *hostname = url_get_hostname(url);
memset(host, 0, URL_HOSTNAME_MAX_LENGTH * sizeof(char));
if (!host || !hostname) return NULL;

sscanf(hostname, "%[^:]", host);

free(hostname);

return host;
}

char *
url_get_pathname (char *url) {
char *path = url_get_path(url);
char *pathname = malloc(URL_MAX_LENGTH * sizeof(char));
memset(pathname, 0, URL_MAX_LENGTH * sizeof(char));
if (!path || !pathname) return NULL;

sscanf(path, "%[^?]", pathname);

free(path);

return pathname;
}

char *
url_get_path (char *url) {
int l = 3;
char *tmp_path;
char *protocol = url_get_protocol(url);
char *auth = url_get_auth(url);
char *hostname = url_get_hostname(url);

if (!protocol || !hostname)
return NULL;

bool is_ssh = url_is_ssh(protocol);

l += (int) strlen(protocol) + (int) strlen(hostname);

if (auth) l+= (int) strlen(auth) +1; // @ symbol

tmp_path = (is_ssh)
? get_part(url, ":%s", l)
: get_part(url, "/%s", l);

char *fmt = (is_ssh)? "%s" : "/%s";
char *path = malloc(strlen(tmp_path) * sizeof(char) + 1);
memset(path, 0, strlen(tmp_path) * sizeof(char) + 1);
sprintf(path, fmt, tmp_path);

if (auth) free(auth);
free(protocol);
free(hostname);
free(tmp_path);

return path;

}

char *
url_get_search (char *url) {
char *path = url_get_path(url);
char *pathname = url_get_pathname(url);
char *search = malloc(URL_AUTH_MAX_LENGTH * sizeof(char));
memset(search, 0, URL_AUTH_MAX_LENGTH * sizeof(char));
if (!path || !search) return NULL;

path = path + (int)strlen(pathname);
sscanf(path, "%[^#]", search);
path = path - (int)strlen(pathname);

free(path);
free(pathname);

return search;
}

char *
url_get_query (char *url) {
char *search = url_get_search(url);
char *query = malloc(URL_AUTH_MAX_LENGTH * sizeof(char));
memset(query, 0, URL_AUTH_MAX_LENGTH * sizeof(char));
if (!search) return NULL;
sscanf(search, "?%s", query);
free(search);
return query;
}

char *
url_get_hash (char *url) {
char *hash = malloc(URL_AUTH_MAX_LENGTH * sizeof(char));
memset(hash, 0, URL_AUTH_MAX_LENGTH * sizeof(char));
if (!hash) return NULL;

char *path = url_get_path(url);
if (!path) ret
3ff0
urn NULL;

char *pathname = url_get_pathname(url);
if (!pathname) return NULL;
char *search = url_get_search(url);

int pathname_len = (int) strlen(pathname);
int search_len = (int) strlen(search);

path = path + (pathname_len + search_len);
sscanf(path, "%s", hash);
path = path - (pathname_len + search_len);
free(pathname);
free(path);
if (search) free(search);

return hash;
}

char *
url_get_port (char *url) {
char *port = malloc(URL_PROTOCOL_MAX_LENGTH * sizeof(char));
memset(port, 0, URL_PROTOCOL_MAX_LENGTH * sizeof(char));
char *hostname = url_get_hostname(url);
char *host = url_get_host(url);
if (!port || !hostname) return NULL;

char *tmp_hostname = hostname;
tmp_hostname = hostname + (strlen(host) + 1);
sscanf(tmp_hostname, "%s", port);

free(hostname);
return port;
}

void
url_inspect (char *url) {
url_data_inspect(url_parse(url));
}

void
url_data_inspect (url_data_t *data) {
printf("#url =>\n");
printf("    .href: \"%s\"\n",     data->href);
printf("    .protocol: \"%s\"\n", data->protocol);
printf("    .host: \"%s\"\n",     data->host);
printf("    .auth: \"%s\"\n",     data->auth);
printf("    .hostname: \"%s\"\n", data->hostname);
printf("    .pathname: \"%s\"\n", data->pathname);
printf("    .search: \"%s\"\n",   data->search);
printf("    .path: \"%s\"\n",     data->path);
printf("    .hash: \"%s\"\n",     data->hash);
printf("    .query: \"%s\"\n",    data->query);
printf("    .port: \"%s\"\n",     data->port);
}

void
url_free (url_data_t *data) {
if (!data) return;
if (data->auth) free(data->auth);
if (data->protocol) free(data->protocol);
if (data->hostname) free(data->hostname);
if (data->host) free(data->host);
if (data->pathname) free(data->pathname);
if (data->path) free(data->path);
if (data->hash) free(data->hash);
if (data->search) free(data->search);
if (data->query) free(data->query);
}

使用示例:main.c:

/*************************************************************************
> File Name: main.c
> Author: qianghaohao(Xqiang)
> Program: url_parser
> Readme: 本URL解析器代码来自https://github.com/jwerle/url.h
>         在此基础上进行了大量的修改,修复了很多bug.
>         总体思路没有改变,就是把很多细节改了下,现在可以正常使用了.
>    -->可能还存在bug,如果网友发现了可以及时指正.
> Platform: 可以跨平台使用
> Created Time: 2016年06月11日 星期六 18时39分14秒
************************************************************************/

#include"url_parser.h"
url_data_t *url_info = NULL;
int main(int argc, char **argv) {
if (argc != 2) {
fprintf(stderr, "usage:%s <url>", argv[0]);
return -1;
}

url_info = url_parse(argv[1]);
if (NULL == url_info) {
fprintf(stderr, "%s error...\n", argv[1]);
return -1;
}
url_data_inspect(url_info);

//  测试用例:http://www.baidu.com:80/cig-bin/index.html?sdkfj#283sjkdf
printf("\n================= API Demo ================\n");

printf("herf:%s\n", argv[1]);

char *hostname = url_get_host(argv[1]);
printf("hostname:%s\n", hostname);

char *path = url_get_path(argv[1]);
printf("path:%s\n", path);

char *host = url_get_host(argv[1]);
printf("host:%s\n", host);

char *proto = url_get_protocol(argv[1]);
printf("protocol:%s\n", proto);

char *auth = url_get_auth(argv[1]);
printf("auth:%s\n", auth);

char *search = url_get_search(argv[1]);
printf("search:%s\n", search);

char *hash = url_get_hash(argv[1]);
printf("hash:%s\n", hash);

char *query = url_get_query(argv[1]);
printf("query:%s\n", query);

char *port = url_get_port(argv[1]);
printf("port:%s\n", port);
return 0;
}

使用示例:



内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息