CloudFlare suffered a massive security issue affecting all of its customers, including Rosetta Code. All passwords not changed since February 19th 2017 have been expired, and session cookie longevity will be reduced until late March.--Michael Mol (talk) 05:15, 25 February 2017 (UTC)

URL parser/URI parser ALGOL68

From Rosetta Code

ALGOL 68 URI Parser[edit]

This is a URI parser implemented in Algol 68. The text can be cut-and-paste into an Algol 68 program or saved in a file and included in another program by using the read pragma available in Algol 68, e.g.:

PR read "uriParser.a68" PR
URI u := parse uri( "fred:[email protected]" );
...

ALGOL 68[edit]

# URI parser #
 
# MODE returned by the URI parser #
MODE URI = STRUCT( STRING scheme
, STRING userinfo
, STRING host
, STRING port
, STRING path
, STRING query
, STRING fragment id
, BOOL ok # TRUE if the URI parse was OK #
, STRING error # error message if the parse failed #
);
 
# returns the URI parsed from text #
# ok OF the result will be TRUE if the parse was successful #
# ok OF the result will be FALSE if the parse failed #
# and error OF the result will be a suitable error message #
# the authority is split into the userinfo, host and port fields #
# and not returned as a separate combined field #
PROC parse uri = ( STRING text )URI:
BEGIN
INT pos := 0; # current character position #
INT end pos := 0; # last character position #
 
STRING alphas = "abcdefghijklmnopqrstuvwxyz"
+ "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
;
STRING digits = "0123456789";
STRING sub delims = "!$&'()*+,;=";
STRING unreserved = alphas + digits + "-._~";
STRING hex digits = digits + "abcdefABCDEF";
 
 
# sets the error message of the result and indicates the parse failed #
PROC error = ( STRING message )VOID:
BEGIN
ok OF result := FALSE;
error OF result := message + " (near position " + whole( pos, 0 ) + ")"
END # error # ;
 
# returns TRUE if we have passed the end of text, FALSE otherwise #
PROC at end = BOOL: pos > end pos;
 
# returns the current character from the string #
# or REPR 0 if we have passed the end of the string #
PROC curr char = CHAR: IF at end THEN REPR 0 ELSE text[ pos ] FI;
 
# returns the character n positions after the current one or REPR 0 if there isn't one #
PROC peek = ( INT n )CHAR: IF pos + n > end pos THEN REPR 0 ELSE text[ pos + n ] FI;
 
# returns TRUE if the current character is ch, FALSE otherwise #
PROC have = ( CHAR ch )BOOL: curr char = ch;
 
# returns TRUE if the current character is one of the specified characters, FALSE otherwise #
PROC have one of = ( STRING characters )BOOL: char in string( curr char, NIL, characters );
 
# returns TRUE if ch is a letter (a-z, A-Z only), FALSE otherwise #
PROC is letter = ( CHAR ch )BOOL: char in string( ch, NIL, alphas );
 
# returns TRUE if ch is a hex digit, FALSE otherwise #
PROC is hex = ( CHAR ch )BOOL: char in string( ch, NIL, hex digits );
 
# positions to the next character, if there is one #
PROC next char = VOID: IF at end THEN pos := end pos + 1 ELSE pos +:= 1 FI;
 
# returns and skips over the sequence of chatracters matching the specified characters #
# or hex encoded characters ( "%" followed by 2 hex digits ) #
PROC possibly encoded seq = ( STRING characters )STRING:
BEGIN
STRING result := "";
BOOL ok := TRUE;
WHILE CHAR ch := curr char;
ok AND ( have one of( characters ) OR ch = "%" )
DO
IF ch = "%"
THEN
# should be "%" followed by a hex digit and a hex digit #
IF NOT is hex( peek( 1 ) ) OR NOT is hex( peek( 2 ) )
THEN
# invalid encoded character #
error( "Invalid encoded character" );
ok := FALSE
ELSE
# encoding looks OK #
result +:= curr char;
next char;
result +:= curr char;
next char;
result +:= curr char;
next char
FI
ELSE
# single character element #
result +:= curr char;
next char
FI
OD;
result
END # possibly encoded seq # ;
 
# returns and skips over the sequence of the specified characters starting #
# at the current position, if there is one #
PROC seq = ( STRING characters )STRING:
BEGIN
STRING result := "";
WHILE have one of( characters )
DO
result +:= curr char;
next char
OD;
result
END # seq # ;
 
# returns and skips over the sequence of the specified characters starting #
# at the current position #
# if the sequence is empty, the specified error message is issued #
PROC seq 1 = ( STRING characters, error message )STRING:
BEGIN
STRING result := seq( characters );
IF result = ""
THEN
# empty sequence #
error( "Expected at least one of: """ + characters + """ for " + error message )
FI;
result
END # seq 1 # ;
 
# checks the current character is ch and advances over it if it is #
# if the current character is not ch, an error is indicated #
PROC must be = ( CHAR ch, STRING message )VOID: IF have( ch ) THEN next char ELSE error( message ) FI;
 
# checks we have reached the end of the text and sets an error if we haven't #
PROC must be at end = ( STRING message )VOID: IF NOT at end THEN error( message ) FI;
 
# returns and skips over an IPV6 address - the address format is not validated #
PROC ipv6 address = STRING: seq( hex digits + ":" ) + seq( digits + "." );
 
# ------------ #
# main parsing #
# ============ #
 
URI result := ( "", "", "", "", "", "", "", TRUE, "" );
 
# initialise parsing #
pos := LWB text;
end pos := UPB text;
 
# get the scheme #
IF ok OF result
THEN
scheme OF result := seq 1( alphas + digits + "+-.", "URI scheme" );
IF ok OF result
THEN
# the scheme must start with a letter #
IF NOT is letter( ( scheme OF result )[ 1 ] )
THEN
# scheme didn't start with a-z, A-Z #
error( "URI scheme must start with a letter (a-z, A-Z only)" )
ELSE
# ok so far, there should be a ":" next #
must be( ":", "after the URI scheme" )
FI
FI
FI;
 
# get the path #
IF ok OF result
THEN
# got the scheme OK, get the path #
IF curr char = "/" AND peek( 1 ) = "/"
THEN
# URI has an authority #
# there will optionally be userinfo followed by @ #
# if there is no "@", the element will be the host #
next char;
next char;
# remember the start positioin of the element, incase we need to backtrack #
INT start pos := pos;
userinfo OF result := possibly encoded seq( unreserved + sub delims + ":" );
IF ok OF result
THEN
# got an element OK #
IF have( "@" )
THEN
# there was an "@", so the element we just parsed was the user info #
next char
ELSE
# didn't get any user info, backtrack to parse the text as the host #
userinfo OF result := "";
pos := start pos
FI
FI;
# we should now have the host optionally followed by ":" and the port #
IF have( "[" )
THEN
# host is an IP literal #
next char;
host OF result := ipv6 address;
must be( "]", "following IPV6 address in URI host" )
ELSE
# host is a reg-name or IPV4 address #
# note an IPV4 address matches the reg-name pattern and we do not #
# distinguish between them #
host OF result := possibly encoded seq( unreserved + sub delims )
FI;
# can now have a port - ":" followed by digits #
IF have( ":" )
THEN
# have a port #
next char;
port OF result := seq( digits );
# the port can only be followed by "/", "?" or a hash character #
# as the authority must be followed by a path-abempty #
# and that is followed by optional query and optional fragment id #
IF NOT have one of( "/?#" ) AND NOT at end
THEN
# the port is invalid or followed by extraneous characters #
error( "Invalid URI port" )
FI
FI
FI;
# get the path #
# we expect a possibly empty sequence of segments separated by "/" #
# a segment is a possibly empty sequence of #
# unreserved, sub-delims, %xx characters, ":" or "@" #
# the RFC categorises paths as: #
# path-abempty - begins with "/" or is empty #
# path-absolute - begins with "/" but not "//" #
# path-noscheme - no leading "/" and no ":" in the first segment #
# path-rootless - no leading "/" can have ":" in the first segment #
# path-empty - empty path #
# we do not attempt to distinguish between them #
WHILE path OF result +:= possibly encoded seq( unreserved + sub delims + ":@" );
have( "/" ) AND ok OF result
DO
path OF result +:= "/";
next char
OD
FI;
 
# get the query #
IF have( "?" ) AND ok OF result
THEN
# have a query #
next char;
query OF result := possibly encoded seq( unreserved + sub delims + ":@/?" )
FI;
 
# get the fragment id, if there is one #
IF have( "#" ) AND ok OF result
THEN
# have a fragment id #
next char;
fragment id OF result := possibly encoded seq( unreserved + sub delims + "/?" )
FI;
 
# should have reached the end of the text #
IF ok OF result
THEN
# haven't reached the end of the text #
must be at end( "unexpected text at the end of the URI: " + text[ pos : ] )
FI;
 
result
END # parse uri # ;