URL parser/URI parser ALGOL68
ALGOL 68 URI Parser
This is a URI parser implemented in Algol 68. The text can be cut-and-paste into an Algol 68 program or saved in a file and included in another program by using the read pragma available in Algol 68, e.g.: <lang algol68>PR read "uriParser.a68" PR URI u := parse uri( "fred://harry@wombat.com" ); ...</lang>
ALGOL 68
<lang algol68># URI parser #
- MODE returned by the URI parser #
MODE URI = STRUCT( STRING scheme
, STRING userinfo , STRING host , STRING port , STRING path , STRING query , STRING fragment id , BOOL ok # TRUE if the URI parse was OK # , STRING error # error message if the parse failed # );
- returns the URI parsed from text #
- ok OF the result will be TRUE if the parse was successful #
- ok OF the result will be FALSE if the parse failed #
- and error OF the result will be a suitable error message #
- the authority is split into the userinfo, host and port fields #
- and not returned as a separate combined field #
PROC parse uri = ( STRING text )URI:
BEGIN INT pos := 0; # current character position # INT end pos := 0; # last character position #
STRING alphas = "abcdefghijklmnopqrstuvwxyz" + "ABCDEFGHIJKLMNOPQRSTUVWXYZ" ; STRING digits = "0123456789"; STRING sub delims = "!$&'()*+,;="; STRING unreserved = alphas + digits + "-._~"; STRING hex digits = digits + "abcdefABCDEF";
# sets the error message of the result and indicates the parse failed # PROC error = ( STRING message )VOID: BEGIN ok OF result := FALSE; error OF result := message + " (near position " + whole( pos, 0 ) + ")" END # error # ;
# returns TRUE if we have passed the end of text, FALSE otherwise # PROC at end = BOOL: pos > end pos;
# returns the current character from the string # # or REPR 0 if we have passed the end of the string # PROC curr char = CHAR: IF at end THEN REPR 0 ELSE text[ pos ] FI;
# returns the character n positions after the current one or REPR 0 if there isn't one # PROC peek = ( INT n )CHAR: IF pos + n > end pos THEN REPR 0 ELSE text[ pos + n ] FI;
# returns TRUE if the current character is ch, FALSE otherwise # PROC have = ( CHAR ch )BOOL: curr char = ch;
# returns TRUE if the current character is one of the specified characters, FALSE otherwise # PROC have one of = ( STRING characters )BOOL: char in string( curr char, NIL, characters );
# returns TRUE if ch is a letter (a-z, A-Z only), FALSE otherwise # PROC is letter = ( CHAR ch )BOOL: char in string( ch, NIL, alphas );
# returns TRUE if ch is a hex digit, FALSE otherwise # PROC is hex = ( CHAR ch )BOOL: char in string( ch, NIL, hex digits );
# positions to the next character, if there is one # PROC next char = VOID: IF at end THEN pos := end pos + 1 ELSE pos +:= 1 FI;
# returns and skips over the sequence of chatracters matching the specified characters # # or hex encoded characters ( "%" followed by 2 hex digits ) # PROC possibly encoded seq = ( STRING characters )STRING: BEGIN STRING result := ""; BOOL ok := TRUE; WHILE CHAR ch := curr char; ok AND ( have one of( characters ) OR ch = "%" ) DO IF ch = "%" THEN # should be "%" followed by a hex digit and a hex digit # IF NOT is hex( peek( 1 ) ) OR NOT is hex( peek( 2 ) ) THEN # invalid encoded character # error( "Invalid encoded character" ); ok := FALSE ELSE # encoding looks OK # result +:= curr char; next char; result +:= curr char; next char; result +:= curr char; next char FI ELSE # single character element # result +:= curr char; next char FI OD; result END # possibly encoded seq # ;
# returns and skips over the sequence of the specified characters starting # # at the current position, if there is one # PROC seq = ( STRING characters )STRING: BEGIN STRING result := ""; WHILE have one of( characters ) DO result +:= curr char; next char OD; result END # seq # ;
# returns and skips over the sequence of the specified characters starting # # at the current position # # if the sequence is empty, the specified error message is issued # PROC seq 1 = ( STRING characters, error message )STRING: BEGIN STRING result := seq( characters ); IF result = "" THEN # empty sequence # error( "Expected at least one of: """ + characters + """ for " + error message ) FI; result END # seq 1 # ;
# checks the current character is ch and advances over it if it is # # if the current character is not ch, an error is indicated # PROC must be = ( CHAR ch, STRING message )VOID: IF have( ch ) THEN next char ELSE error( message ) FI;
# checks we have reached the end of the text and sets an error if we haven't # PROC must be at end = ( STRING message )VOID: IF NOT at end THEN error( message ) FI;
# returns and skips over an IPV6 address - the address format is not validated # PROC ipv6 address = STRING: seq( hex digits + ":" ) + seq( digits + "." );
# ------------ # # main parsing # # ============ #
URI result := ( "", "", "", "", "", "", "", TRUE, "" );
# initialise parsing # pos := LWB text; end pos := UPB text;
# get the scheme # IF ok OF result THEN scheme OF result := seq 1( alphas + digits + "+-.", "URI scheme" ); IF ok OF result THEN # the scheme must start with a letter # IF NOT is letter( ( scheme OF result )[ 1 ] ) THEN # scheme didn't start with a-z, A-Z # error( "URI scheme must start with a letter (a-z, A-Z only)" ) ELSE # ok so far, there should be a ":" next # must be( ":", "after the URI scheme" ) FI FI FI;
# get the path # IF ok OF result THEN # got the scheme OK, get the path # IF curr char = "/" AND peek( 1 ) = "/" THEN # URI has an authority # # there will optionally be userinfo followed by @ # # if there is no "@", the element will be the host # next char; next char; # remember the start positioin of the element, incase we need to backtrack # INT start pos := pos; userinfo OF result := possibly encoded seq( unreserved + sub delims + ":" ); IF ok OF result THEN # got an element OK # IF have( "@" ) THEN # there was an "@", so the element we just parsed was the user info # next char ELSE # didn't get any user info, backtrack to parse the text as the host # userinfo OF result := ""; pos := start pos FI FI; # we should now have the host optionally followed by ":" and the port # IF have( "[" ) THEN # host is an IP literal # next char; host OF result := ipv6 address; must be( "]", "following IPV6 address in URI host" ) ELSE # host is a reg-name or IPV4 address # # note an IPV4 address matches the reg-name pattern and we do not # # distinguish between them # host OF result := possibly encoded seq( unreserved + sub delims ) FI; # can now have a port - ":" followed by digits # IF have( ":" ) THEN # have a port # next char; port OF result := seq( digits ); # the port can only be followed by "/", "?" or a hash character # # as the authority must be followed by a path-abempty # # and that is followed by optional query and optional fragment id # IF NOT have one of( "/?#" ) AND NOT at end THEN # the port is invalid or followed by extraneous characters # error( "Invalid URI port" ) FI FI FI; # get the path # # we expect a possibly empty sequence of segments separated by "/" # # a segment is a possibly empty sequence of # # unreserved, sub-delims, %xx characters, ":" or "@" # # the RFC categorises paths as: # # path-abempty - begins with "/" or is empty # # path-absolute - begins with "/" but not "//" # # path-noscheme - no leading "/" and no ":" in the first segment # # path-rootless - no leading "/" can have ":" in the first segment # # path-empty - empty path # # we do not attempt to distinguish between them # WHILE path OF result +:= possibly encoded seq( unreserved + sub delims + ":@" ); have( "/" ) AND ok OF result DO path OF result +:= "/"; next char OD FI;
# get the query # IF have( "?" ) AND ok OF result THEN # have a query # next char; query OF result := possibly encoded seq( unreserved + sub delims + ":@/?" ) FI;
# get the fragment id, if there is one # IF have( "#" ) AND ok OF result THEN # have a fragment id # next char; fragment id OF result := possibly encoded seq( unreserved + sub delims + "/?" ) FI;
# should have reached the end of the text # IF ok OF result THEN # haven't reached the end of the text # must be at end( "unexpected text at the end of the URI: " + text[ pos : ] ) FI;
result END # parse uri # ;</lang>