Find URI in text: Difference between revisions

Line 637:

=={{header|REXX}}==

<lang rexx>/*REXX program scans a text (contained within REXX ~~pgm~~) to extract URIs.*/

<lang rexx>/*REXX program scans a text (contained within the REXX program) to extract URIs and IRIs*/

~~text~~='this URI contains an illegal character, parentheses and a misplaced full stop:',

$$= 'this URI contains an illegal character, parentheses and a misplaced full stop:',

'http://en.wikipedia.org/wiki/Erich_Kästner_(camera_designer). (which is handled by http://mediawiki.org/).',

'and another one just to confuse the parser: http://en.wikipedia.org/wiki/-)',

'")" is handled the wrong way by the mediawiki parser.',

'ftp://domain.name/path(balanced_brackets)/foo.html',

'ftp://domain.name/path(balanced_brackets)/ending.in.dot.',

'ftp://domain.name/path(unbalanced_brackets/ending.in.dot.',

'leading junk ftp://domain.name/path/embedded?punct/uation.',

'leading junk ftp://domain.name/dangling_close_paren)',

'if you have other interesting URIs for testing, please add them here:'

@abc='abcdefghijklmnopqrstuvwxyz'; ~~@abcs=@abc||translate~~(~~@abc~~)

@abc= 'abcdefghijklmnopqrstuvwxyz' /*construct lowercase (Latin) alphabet.*/

@abcU= @abc; upper @abcU; @abcs= @abc || @abcU /* " lower & uppercase " */

@scheme=@abcs || 0123456789 || '+-.'

@~~unreserved~~=@abcs || 0123456789 || '-._~'

@scheme= @abcs || 0123456789 || '+-.' /*add decimal digits & some punctuation*/

@unreserved= @abcs || 0123456789 || '-._~' /* " " " " " " */

@reserved=@unreserved"/?#[]@!$&)(*+,;=\'"

@reserved= @unreserved"/?#[]@!$&)(*+,;=\'" /*add other punctuation & special chars*/

t=space(text)' ' /*variable T is a working copy.*/

#=0 /*~~count~~ of ~~URI's~~ ~~found~~ so ~~far.~~ */

$= space($$)' ' /*variable $ is a working copy of $$ */

/*~~▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄~~*/

#= 0 /*the count of URIs found (so far). */

/*▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄*/

do while t\='' /*scan text for multiple URIs. */

~~y=pos(':',t)~~ /*locate a colon in the text body*/

do while $\=''; y= pos(':', $) /*locate a colon (:) in the text body. */

if y==0 then leave /*~~Colon~~ found? No, we're done. */

if y==0 then leave /*Was a colon found? Nope, we're done.*/

if y==1 then do /*handle a bare colon by itself. */

if y==1 then do; parse var $ . $ /*handle a bare colon by itself. */

~~parse~~ ~~var~~ t . t /*~~ignore~~ ~~the~~ ~~bare~~ ~~colon~~ ~~(:).~~ */

iterate /*go and keep scanning for a colon. */

~~iterate~~ /*go & ~~keep~~ ~~scanning for~~ a ~~colon~~.*/

end /* [↑] (a rare special case.) */

~~end~~ /* ~~[↑]~~ a ~~rare~~ ~~special~~ ~~case~~. */

sr= reverse( left($, y - 1) ) /*extract the scheme and reverse it. */

sr=~~reverse(left~~(t,~~y-1)~~) /*~~extract~~ the ~~scheme~~ ~~and~~ ~~reverse~~.*/

se= verify(sr, @scheme) /*locate the end of the scheme. */

se=~~verify~~(sr,~~@scheme~~) /*~~locate~~ ~~the~~ ~~end~~ of ~~the scheme~~. */

$= substr($, y + 1) /*assign an adjusted new text. */

t=~~substr(t,y+1)~~ ~~/*assign~~ an ~~adjusted~~ ~~new~~ ~~text~~. */

if se\==0 then sr= left(sr, se - 1) /*possibly "crop" the scheme name. */

~~if se\=~~=0 ~~then sr=left~~(sr~~,se-1~~) /*~~possibly~~ ~~crop~~ ~~the~~ ~~scheme~~ name. */

s= reverse(sr) /*reverse it again to rectify the name.*/

s=~~reverse~~(sr) /*~~reverse~~ ~~again~~ to ~~rectify~~ ~~name~~. */

he= verify($, @reserved) /*locate the end of the hier─part. */

he=~~verify~~(t,~~@reserved~~) /*~~locate~~ the ~~end~~ of ~~the~~ ~~hier-part~~*/

s= s':'left($, he - 1) /*extract and append the hier─part. */

s=~~s':'left~~(t,he-1) /*~~extract~~ & ~~append~~ ~~the~~ ~~hier-~~part.*/

$= substr($, he) /*assign an adjusted new part of text. */

t=~~substr(t,he)~~ /*~~assign~~ an ~~adjusted~~ ~~new~~ ~~text~~. */

#= # + 1 /*bump the URI counter. */

#=~~#+1~~ /*~~bump~~ the URI ~~counter.~~ */

!.#= s /*assign the URI to an array (!.) */

~~!.#=s~~ /*~~assign~~ the URI to an ~~array.~~ */

end /*while*/ /* [↑] scan the text for URIs. */

/*▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀*/

end /*while t\='' */ /* [↑] scan the text for URIs. */

/~~*▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀*~~/

do k=1 for #; say !.k; end /*stick a fork in it, we're all done. */</lang>

do k=1 for #; say !.k; end /*stick a fork in it, we're done.*/</lang>

'''output'''

<pre>

stop: