The URL and The Parser Implementation
Overview
[RFC 1738] and [RFC 3986] define Uniform Resource Locators (URLs) and the generic URI syntax, respectively. Following these RFCs, we have implemented a URL parser in the C programming language. A URL consists of eight parts: scheme, host, port, path, query, fragment, username, and password. Hence, we define the following structure for storing each part.
struct parsed_url {
char *scheme; /* mandatory */
char *host; /* mandatory */
char *port; /* optional */
char *path; /* optional */
char *query; /* optional */
char *fragment; /* optional */
char *username; /* optional */
char *password; /* optional */
};
Functions
We have implemented two simple functions: one parses a URL and returns the corresponding data in a parsed_url structure, and the other frees the memory of that structure, which is allocated by the parser function.
Thus, the header file url_parser.h is written as follows.
/*_
* Copyright 2010 Scyphus Solutions Co. Ltd. All rights reserved.
*
* Authors:
* Hirochika Asai
*/
#ifndef URL_PARSER_H
#define URL_PARSER_H

/*
 * URL storage
 *
 * Every member is a separately allocated, NUL-terminated string owned
 * by the structure.  Optional members are NULL when the corresponding
 * part does not appear in the URL.
 */
struct parsed_url {
    char *scheme;               /* mandatory */
    char *host;                 /* mandatory */
    char *port;                 /* optional */
    char *path;                 /* optional */
    char *query;                /* optional */
    char *fragment;             /* optional */
    char *username;             /* optional */
    char *password;             /* optional */
};

#ifdef __cplusplus
extern "C" {
#endif

/*
 * Declaration of function prototypes
 */

/*
 * Parse the string url and return its components in a newly allocated
 * structure, or NULL if an error has occurred.  The caller releases
 * the result with parsed_url_free().
 */
struct parsed_url *parse_url(const char *url);

/*
 * Free the memory of a structure returned by parse_url(), including
 * every component string stored in it.
 */
void parsed_url_free(struct parsed_url *purl);

#ifdef __cplusplus
}
#endif

#endif /* URL_PARSER_H */
/*
* Local variables:
* tab-width: 4
* c-basic-offset: 4
* End:
* vim600: sw=4 ts=4 fdm=marker
* vim<600: sw=4 ts=4
*/
The parser implementation
The following code is the implementation of the URL parser. The parser function parse_url returns NULL if an error has occurred.
/*_
* Copyright 2010-2011 Scyphus Solutions Co. Ltd. All rights reserved.
*
* Authors:
* Hirochika Asai
*/
#include "url_parser.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
/*
 * Prototype declarations
 *
 * _is_scheme_char() is a file-local helper that checks whether a
 * character is permitted in a scheme string; it is defined below.
 */
static __inline__ int _is_scheme_char(int);
/*
 * Check whether the character is permitted in scheme string
 *
 * RFC 1738 (section 2.1) and RFC 3986 (section 3.1) allow letters,
 * digits, "+", "-" and "." in a scheme.
 *
 * c: the character to test, as an int.
 * Returns 1 if the character is permitted, 0 otherwise.
 */
static __inline__ int
_is_scheme_char(int c)
{
    /*
     * <ctype.h> functions are only defined for values representable as
     * unsigned char (and EOF); anything else, such as a negative value
     * coming from a signed char, cannot be a scheme character anyway.
     */
    if ( c != (int)(unsigned char)c ) {
        return 0;
    }
    return (isalnum(c) || '+' == c || '-' == c || '.' == c) ? 1 : 0;
}
/*
* See RFC 1738, 3986
*/
struct parsed_url *
parse_url(const char *url)
{
struct parsed_url *purl;
const char *tmpstr;
const char *curstr;
int len;
int i;
int userpass_flag;
int bracket_flag;
/* Allocate the parsed url storage */
purl = malloc(sizeof(struct http_parsed_url));
if ( NULL == purl ) {
return NULL;
}
purl->scheme = NULL;
purl->host = NULL;
purl->port = NULL;
purl->path = NULL;
purl->query = NULL;
purl->fragment = NULL;
purl->username = NULL;
purl->password = NULL;
curstr = url;
/*
* <scheme>:<scheme-specific-part>
* <scheme> := [a-z\+\-\.]+
* upper case = lower case for resiliency
*/
/* Read scheme */
tmpstr = strchr(curstr, ':');
if ( NULL == tmpstr ) {
/* Not found the character */
http_parsed_url_free(purl);
return NULL;
}
/* Get the scheme length */
len = tmpstr - curstr;
/* Check restrictions */
for ( i = 0; i < len; i++ ) {
if ( !_is_scheme_char(curstr[i]) ) {
/* Invalid format */
http_parsed_url_free(purl);
return NULL;
}
}
/* Copy the scheme to the storage */
purl->scheme = malloc(sizeof(char) * (len + 1));
if ( NULL == purl->scheme ) {
http_parsed_url_free(purl);
return NULL;
}
(void)strncpy(purl->scheme, curstr, len);
purl->scheme[len] = '\0';
/* Make the character to lower if it is upper case. */
for ( i = 0; i < len; i++ ) {
purl->scheme[i] = tolower(purl->scheme[i]);
}
/* Skip ':' */
tmpstr++;
curstr = tmpstr;
/*
* //<user>:<password>@<host>:<port>/<url-path>
* Any ":", "@" and "/" must be encoded.
*/
/* Eat "//" */
for ( i = 0; i < 2; i++ ) {
if ( '/' != *curstr ) {
http_parsed_url_free(purl);
return NULL;
}
curstr++;
}
/* Check if the user (and password) are specified. */
userpass_flag = 0;
tmpstr = curstr;
while ( '\0' != *tmpstr ) {
if ( '@' == *tmpstr ) {
/* Username and password are specified */
userpass_flag = 1;
break;
} else if ( '/' == *tmpstr ) {
/* End of <host>:<port> specification */
userpass_flag = 0;
break;
}
tmpstr++;
}
/* User and password specification */
tmpstr = curstr;
if ( userpass_flag ) {
/* Read username */
while ( '\0' != *tmpstr && ':' != *tmpstr && '@' != *tmpstr ) {
tmpstr++;
}
len = tmpstr - curstr;
purl->username = malloc(sizeof(char) * (len + 1));
if ( NULL == purl->username ) {
http_parsed_url_free(purl);
return NULL;
}
(void)strncpy(purl->username, curstr, len);
purl->username[len] = '\0';
/* Proceed current pointer */
curstr = tmpstr;
if ( ':' == *curstr ) {
/* Skip ':' */
curstr++;
/* Read password */
tmpstr = curstr;
while ( '\0' != *tmpstr && '@' != *tmpstr ) {
tmpstr++;
}
len = tmpstr - curstr;
purl->password = malloc(sizeof(char) * (len + 1));
if ( NULL == purl->password ) {
http_parsed_url_free(purl);
return NULL;
}
(void)strncpy(purl->password, curstr, len);
purl->password[len] = '\0';
curstr = tmpstr;
}
/* Skip '@' */
if ( '@' != *curstr ) {
http_parsed_url_free(purl);
return NULL;
}
curstr++;
}
if ( '[' == *curstr ) {
bracket_flag = 1;
} else {
bracket_flag = 0;
}
/* Proceed on by delimiters with reading host */
tmpstr = curstr;
while ( '\0' != *tmpstr ) {
if ( bracket_flag && ']' == *tmpstr ) {
/* End of IPv6 address. */
tmpstr++;
break;
} else if ( !bracket_flag && (':' == *tmpstr || '/' == *tmpstr) ) {
/* Port number is specified. */
break;
}
tmpstr++;
}
len = tmpstr - curstr;
purl->host = malloc(sizeof(char) * (len + 1));
if ( NULL == purl->host || len <= 0 ) {
http_parsed_url_free(purl);
return NULL;
}
(void)strncpy(purl->host, curstr, len);
purl->host[len] = '\0';
curstr = tmpstr;
/* Is port number specified? */
if ( ':' == *curstr ) {
curstr++;
/* Read port number */
tmpstr = curstr;
while ( '\0' != *tmpstr && '/' != *tmpstr ) {
tmpstr++;
}
len = tmpstr - curstr;
purl->port = malloc(sizeof(char) * (len + 1));
if ( NULL == purl->port ) {
http_parsed_url_free(purl);
return NULL;
}
(void)strncpy(purl->port, curstr, len);
purl->port[len] = '\0';
curstr = tmpstr;
}
/* End of the string */
if ( '\0' == *curstr ) {
return purl;
}
/* Skip '/' */
if ( '/' != *curstr ) {
http_parsed_url_free(purl);
return NULL;
}
curstr++;
/* Parse path */
tmpstr = curstr;
while ( '\0' != *tmpstr && '#' != *tmpstr && '?' != *tmpstr ) {
tmpstr++;
}
len = tmpstr - curstr;
purl->path = malloc(sizeof(char) * (len + 1));
if ( NULL == purl->path ) {
http_parsed_url_free(purl);
return NULL;
}
(void)strncpy(purl->path, curstr, len);
purl->path[len] = '\0';
curstr = tmpstr;
/* Is query specified? */
if ( '?' == *curstr ) {
/* Skip '?' */
curstr++;
/* Read query */
tmpstr = curstr;
while ( '\0' != *tmpstr && '#' != *tmpstr ) {
tmpstr++;
}
len = tmpstr - curstr;
purl->query = malloc(sizeof(char) * (len + 1));
if ( NULL == purl->query ) {
http_parsed_url_free(purl);
return NULL;
}
(void)strncpy(purl->query, curstr, len);
purl->query[len] = '\0';
curstr = tmpstr;
}
/* Is fragment specified? */
if ( '#' == *curstr ) {
/* Skip '#' */
curstr++;
/* Read fragment */
tmpstr = curstr;
while ( '\0' != *tmpstr ) {
tmpstr++;
}
len = tmpstr - curstr;
purl->fragment = malloc(sizeof(char) * (len + 1));
if ( NULL == purl->fragment ) {
http_parsed_url_free(purl);
return NULL;
}
(void)strncpy(purl->fragment, curstr, len);
purl->fragment[len] = '\0';
curstr = tmpstr;
}
return purl;
}
/*
 * Free memory of parsed url
 *
 * Frees every component string stored in the structure and then the
 * structure itself.  A NULL argument is ignored.
 */
void
parsed_url_free(struct parsed_url *purl)
{
    if ( NULL == purl ) {
        return;
    }

    /* free() does nothing for NULL, so unset parts need no extra check */
    free(purl->scheme);
    free(purl->host);
    free(purl->port);
    free(purl->path);
    free(purl->query);
    free(purl->fragment);
    free(purl->username);
    free(purl->password);

    free(purl);
}
/*
* Local variables:
* tab-width: 4
* c-basic-offset: 4
* End:
* vim600: sw=4 ts=4 fdm=marker
* vim<600: sw=4 ts=4
*/