URL parser/URI parser ALGOL68

From Rosetta Code

ALGOL 68 URI Parser[edit]

This is a URI parser implemented in Algol 68. The text can be cut-and-paste into an Algol 68 program or saved in a file and included in another program by using the read pragma available in Algol 68, e.g.:

PR read "uriParser.a68" PR
URI u := parse uri( "fred:[email protected]" );
...

ALGOL 68[edit]

# URI parser #
 
# MODE returned by the URI parser #
MODE URI = STRUCT( STRING scheme
, STRING userinfo
, STRING host
, STRING port
, STRING path
, STRING query
, STRING fragment id
, BOOL ok # TRUE if the URI parse was OK #
, STRING error # error message if the parse failed #
);
 
# returns the URI parsed from text #
# ok OF the result will be TRUE if the parse was successful #
# ok OF the result will be FALSE if the parse failed #
# and error OF the result will be a suitable error message #
# the authority is split into the userinfo, host and port fields #
# and not returned as a separate combined field #
PROC parse uri = ( STRING text )URI:
BEGIN
INT pos := 0; # current character position #
INT end pos := 0; # last character position #
 
STRING alphas = "abcdefghijklmnopqrstuvwxyz"
+ "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
;
STRING digits = "0123456789";
STRING sub delims = "!$&'()*+,;=";
STRING unreserved = alphas + digits + "-._~";
STRING hex digits = digits + "abcdefABCDEF";
 
 
# sets the error message of the result and indicates the parse failed #
PROC error = ( STRING message )VOID:
BEGIN
ok OF result := FALSE;
error OF result := message + " (near position " + whole( pos, 0 ) + ")"
END # error # ;
 
# returns TRUE if we have passed the end of text, FALSE otherwise #
PROC at end = BOOL: pos > end pos;
 
# returns the current character from the string #
# or REPR 0 if we have passed the end of the string #
PROC curr char = CHAR: IF at end THEN REPR 0 ELSE text[ pos ] FI;
 
# returns the character n positions after the current one or REPR 0 if there isn't one #
PROC peek = ( INT n )CHAR: IF pos + n > end pos THEN REPR 0 ELSE text[ pos + n ] FI;
 
# returns TRUE if the current character is ch, FALSE otherwise #
PROC have = ( CHAR ch )BOOL: curr char = ch;
 
# returns TRUE if the current character is one of the specified characters, FALSE otherwise #
PROC have one of = ( STRING characters )BOOL: char in string( curr char, NIL, characters );
 
# returns TRUE if ch is a letter (a-z, A-Z only), FALSE otherwise #
PROC is letter = ( CHAR ch )BOOL: char in string( ch, NIL, alphas );
 
# returns TRUE if ch is a hex digit, FALSE otherwise #
PROC is hex = ( CHAR ch )BOOL: char in string( ch, NIL, hex digits );
 
# positions to the next character, if there is one #
PROC next char = VOID: IF at end THEN pos := end pos + 1 ELSE pos +:= 1 FI;
 
# returns and skips over the sequence of chatracters matching the specified characters #
# or hex encoded characters ( "%" followed by 2 hex digits ) #
PROC possibly encoded seq = ( STRING characters )STRING:
BEGIN
STRING result := "";
BOOL ok := TRUE;
WHILE CHAR ch := curr char;
ok AND ( have one of( characters ) OR ch = "%" )
DO
IF ch = "%"
THEN
# should be "%" followed by a hex digit and a hex digit #
IF NOT is hex( peek( 1 ) ) OR NOT is hex( peek( 2 ) )
THEN
# invalid encoded character #
error( "Invalid encoded character" );
ok := FALSE
ELSE
# encoding looks OK #
result +:= curr char;
next char;
result +:= curr char;
next char;
result +:= curr char;
next char
FI
ELSE
# single character element #
result +:= curr char;
next char
FI
OD;
result
END # possibly encoded seq # ;
 
# returns and skips over the sequence of the specified characters starting #
# at the current position, if there is one #
PROC seq = ( STRING characters )STRING:
BEGIN
STRING result := "";
WHILE have one of( characters )
DO
result +:= curr char;
next char
OD;
result
END # seq # ;
 
# returns and skips over the sequence of the specified characters starting #
# at the current position #
# if the sequence is empty, the specified error message is issued #
PROC seq 1 = ( STRING characters, error message )STRING:
BEGIN
STRING result := seq( characters );
IF result = ""
THEN
# empty sequence #
error( "Expected at least one of: """ + characters + """ for " + error message )
FI;
result
END # seq 1 # ;
 
# checks the current character is ch and advances over it if it is #
# if the current character is not ch, an error is indicated #
PROC must be = ( CHAR ch, STRING message )VOID: IF have( ch ) THEN next char ELSE error( message ) FI;
 
# checks we have reached the end of the text and sets an error if we haven't #
PROC must be at end = ( STRING message )VOID: IF NOT at end THEN error( message ) FI;
 
# returns and skips over an IPV6 address - the address format is not validated #
PROC ipv6 address = STRING: seq( hex digits + ":" ) + seq( digits + "." );
 
# ------------ #
# main parsing #
# ============ #
 
URI result := ( "", "", "", "", "", "", "", TRUE, "" );
 
# initialise parsing #
pos := LWB text;
end pos := UPB text;
 
# get the scheme #
IF ok OF result
THEN
scheme OF result := seq 1( alphas + digits + "+-.", "URI scheme" );
IF ok OF result
THEN
# the scheme must start with a letter #
IF NOT is letter( ( scheme OF result )[ 1 ] )
THEN
# scheme didn't start with a-z, A-Z #
error( "URI scheme must start with a letter (a-z, A-Z only)" )
ELSE
# ok so far, there should be a ":" next #
must be( ":", "after the URI scheme" )
FI
FI
FI;
 
# get the path #
IF ok OF result
THEN
# got the scheme OK, get the path #
IF curr char = "/" AND peek( 1 ) = "/"
THEN
# URI has an authority #
# there will optionally be userinfo followed by @ #
# if there is no "@", the element will be the host #
next char;
next char;
# remember the start positioin of the element, incase we need to backtrack #
INT start pos := pos;
userinfo OF result := possibly encoded seq( unreserved + sub delims + ":" );
IF ok OF result
THEN
# got an element OK #
IF have( "@" )
THEN
# there was an "@", so the element we just parsed was the user info #
next char
ELSE
# didn't get any user info, backtrack to parse the text as the host #
userinfo OF result := "";
pos := start pos
FI
FI;
# we should now have the host optionally followed by ":" and the port #
IF have( "[" )
THEN
# host is an IP literal #
next char;
host OF result := ipv6 address;
must be( "]", "following IPV6 address in URI host" )
ELSE
# host is a reg-name or IPV4 address #
# note an IPV4 address matches the reg-name pattern and we do not #
# distinguish between them #
host OF result := possibly encoded seq( unreserved + sub delims )
FI;
# can now have a port - ":" followed by digits #
IF have( ":" )
THEN
# have a port #
next char;
port OF result := seq( digits );
# the port can only be followed by "/", "?" or a hash character #
# as the authority must be followed by a path-abempty #
# and that is followed by optional query and optional fragment id #
IF NOT have one of( "/?#" ) AND NOT at end
THEN
# the port is invalid or followed by extraneous characters #
error( "Invalid URI port" )
FI
FI
FI;
# get the path #
# we expect a possibly empty sequence of segments separated by "/" #
# a segment is a possibly empty sequence of #
# unreserved, sub-delims, %xx characters, ":" or "@" #
# the RFC categorises paths as: #
# path-abempty - begins with "/" or is empty #
# path-absolute - begins with "/" but not "//" #
# path-noscheme - no leading "/" and no ":" in the first segment #
# path-rootless - no leading "/" can have ":" in the first segment #
# path-empty - empty path #
# we do not attempt to distinguish between them #
WHILE path OF result +:= possibly encoded seq( unreserved + sub delims + ":@" );
have( "/" ) AND ok OF result
DO
path OF result +:= "/";
next char
OD
FI;
 
# get the query #
IF have( "?" ) AND ok OF result
THEN
# have a query #
next char;
query OF result := possibly encoded seq( unreserved + sub delims + ":@/?" )
FI;
 
# get the fragment id, if there is one #
IF have( "#" ) AND ok OF result
THEN
# have a fragment id #
next char;
fragment id OF result := possibly encoded seq( unreserved + sub delims + "/?" )
FI;
 
# should have reached the end of the text #
IF ok OF result
THEN
# haven't reached the end of the text #
must be at end( "unexpected text at the end of the URI: " + text[ pos : ] )
FI;
 
result
END # parse uri # ;