Find URI in text: Difference between revisions

Content added Content deleted
m (added whitespace to the task's preamble, added a ;Task:, added whitespace before the TOC.)
m (→{{header|REXX}}: used a template for the output section, added/changed whitespace and comments.)
Line 637: Line 637:


=={{header|REXX}}==
=={{header|REXX}}==
<lang rexx>/*REXX program scans a text (contained within REXX pgm) to extract URIs.*/
<lang rexx>/*REXX program scans a text (contained within the REXX program) to extract URIs and IRIs*/
text='this URI contains an illegal character, parentheses and a misplaced full stop:',
$$= 'this URI contains an illegal character, parentheses and a misplaced full stop:',
'http://en.wikipedia.org/wiki/Erich_Kästner_(camera_designer). (which is handled by http://mediawiki.org/).',
'http://en.wikipedia.org/wiki/Erich_Kästner_(camera_designer). (which is handled by http://mediawiki.org/).',
'and another one just to confuse the parser: http://en.wikipedia.org/wiki/-)',
'and another one just to confuse the parser: http://en.wikipedia.org/wiki/-)',
'")" is handled the wrong way by the mediawiki parser.',
'")" is handled the wrong way by the mediawiki parser.',
'ftp://domain.name/path(balanced_brackets)/foo.html',
'ftp://domain.name/path(balanced_brackets)/foo.html',
'ftp://domain.name/path(balanced_brackets)/ending.in.dot.',
'ftp://domain.name/path(balanced_brackets)/ending.in.dot.',
'ftp://domain.name/path(unbalanced_brackets/ending.in.dot.',
'ftp://domain.name/path(unbalanced_brackets/ending.in.dot.',
'leading junk ftp://domain.name/path/embedded?punct/uation.',
'leading junk ftp://domain.name/path/embedded?punct/uation.',
'leading junk ftp://domain.name/dangling_close_paren)',
'leading junk ftp://domain.name/dangling_close_paren)',
'if you have other interesting URIs for testing, please add them here:'
'if you have other interesting URIs for testing, please add them here:'


@abc='abcdefghijklmnopqrstuvwxyz'; @abcs=@abc||translate(@abc)
@abc= 'abcdefghijklmnopqrstuvwxyz' /*construct lowercase (Latin) alphabet.*/
@abcU= @abc; upper @abcU; @abcs= @abc || @abcU /* " lower & uppercase " */
@scheme=@abcs || 0123456789 || '+-.'
@unreserved=@abcs || 0123456789 || '-._~'
@scheme= @abcs || 0123456789 || '+-.' /*add decimal digits & some punctuation*/
@unreserved= @abcs || 0123456789 || '-._~' /* " " " " " " */
@reserved=@unreserved"/?#[]@!$&)(*+,;=\'"
@reserved= @unreserved"/?#[]@!$&)(*+,;=\'" /*add other punctuation & special chars*/
t=space(text)' ' /*variable T is a working copy.*/
#=0 /*count of URIs found so far. */
$= space($$)' ' /*variable $ is a working copy of $$ */
/*▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄*/
#= 0 /*the count of URIs found (so far).*/
/*▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄*/
do while t\='' /*scan text for multiple URIs. */
y=pos(':',t) /*locate a colon in the text body*/
do while $\=''; y= pos(':', $) /*locate a colon (:) in the text body ($).*/
if y==0 then leave /*Colon found? No, we're done. */
if y==0 then leave /*Was a colon found? Nope, we're done.*/
if y==1 then do /*handle a bare colon by itself. */
if y==1 then do; parse var $ . $ /*handle a bare colon by itself. */
parse var t . t /*ignore the bare colon (:). */
iterate /*go and keep scanning for a colon. */
iterate /*go & keep scanning for a colon.*/
end /* [↑] (a rare special case.) */
end /* [↑] a rare special case. */
sr= reverse( left($, y - 1) ) /*extract the scheme and reverse it. */
sr=reverse(left(t,y-1)) /*extract the scheme and reverse.*/
se= verify(sr, @scheme) /*locate the end of the scheme. */
se=verify(sr,@scheme) /*locate the end of the scheme. */
$= substr($, y + 1) /*assign an adjusted new text. */
t=substr(t,y+1) /*assign an adjusted new text. */
if se\==0 then sr= left(sr, se - 1) /*possibly "crop" the scheme name. */
if se\==0 then sr=left(sr,se-1) /*possibly crop the scheme name. */
s= reverse(sr) /*reverse it again to rectify the name.*/
s=reverse(sr) /*reverse again to rectify name. */
he= verify($, @reserved) /*locate the end of the hier─part. */
he=verify(t,@reserved) /*locate the end of the hier-part*/
s= s':'left($, he - 1) /*extract and append the hier─part. */
s=s':'left(t,he-1) /*extract & append the hier-part.*/
$= substr($, he) /*assign an adjusted new part of text. */
t=substr(t,he) /*assign an adjusted new text. */
#= # + 1 /*bump the URI counter. */
#=#+1 /*bump the URI counter. */
!.#= s /*assign the URI to an array (!.) */
!.#=s /*assign the URI to an array. */
end /*while*/ /* [↑] scan the text for URIs. */
/*▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀*/
end /*while t\='' */ /* [↑] scan the text for URIs. */
/*▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀*/
do k=1 for #; say !.k; end /*stick a fork in it, we're all done. */</lang>
{{out|output|text=&nbsp; when using the internal default inputs:}}
do k=1 for #; say !.k; end /*stick a fork in it, we're done.*/</lang>
'''output'''
<pre>
<pre>
stop:
stop: