Find URI in text: Difference between revisions
m (→{{header|Icon}} and {{header|Unicon}}: tweak comment) |
|||
Line 121: | Line 121: | ||
procedure arbcp(cs,pr,n) #: match arbitrary numbers of (cset|proc,n) |
procedure arbcp(cs,pr,n) #: match arbitrary numbers of (cset|proc,n) |
||
local p,i |
local p,i |
||
/n := 0 |
/n := 0 # for 0* / 1* |
||
runerr(0 > n,205) |
runerr(0 > n,205) |
||
p := &pos |
p := &pos |
||
Line 132: | Line 132: | ||
procedure arbsp(st,pr,n) #: match arbitrary numbers of (string || proc,n) |
procedure arbsp(st,pr,n) #: match arbitrary numbers of (string || proc,n) |
||
local p,i |
local p,i |
||
/n := 0 |
/n := 0 # for 0* / 1* |
||
runerr(0 > n,205) |
runerr(0 > n,205) |
||
p := &pos |
p := &pos |
Revision as of 05:03, 8 January 2012
Write a function to search plain text for URIs.
The function should return a list of URIs found in the text.
The definition of a URI is given in RFC 3986.
For searching URIs in particular "Appendix C. Delimiting a URI in Context" is noteworthy.
Consider the following issues:
. , ; ' ? ( )
are legal characters in a URI, but they are often used in plain text as a delimiter.- a user may type an URI as seen in the browser location-bar with non-ascii characters (which are not legal).
- URIs can be something else besides http:// or https://
sample text:
this URI contains an illegal character, parentheses and a misplaced full stop: http://en.wikipedia.org/wiki/Erich_Kästner_(camera_designer). (which is handled by http://mediawiki.org/). and another one just to confuse the parser: http://en.wikipedia.org/wiki/-) ")" is handled the wrong way by the mediawiki parser. ftp://domain.name/path(balanced_brackets)/foo.html ftp://domain.name/path(balanced_brackets)/ending.in.dot. ftp://domain.name/path(unbalanced_brackets/ending.in.dot. leading junk ftp://domain.name/path/embedded?punct/uation. leading junk ftp://domain.name/dangling_close_paren) if you have other interesting URIs for testing, please add them here:
Regular expressions to solve the task are fine, but alternative approaches are welcome too. (otherwise, this task would degrade into 'fining and applying the best regular expression')
Icon and Unicon
This example follows the RFC very closely (see Talk page for discussion). For better IP parsing see Parse_an_IP_Address.
<lang Icon>procedure main()
every write(findURItext("this URI contains an illegal character, parentheses_ and a misplaced full stop:\n_ http://en.wikipedia.org/wiki/Erich_Kästner_(camera_designer). _ which is handled by http://mediawiki.org/).\n_ and another one just to confuse the parser: _ http://en.wikipedia.org/wiki/-)\n_ \")\" is handled the wrong way by the mediawiki parser.\n_ ftp://domain.name/path(balanced_brackets)/foo.html\n_ ftp://domain.name/path(balanced_brackets)/ending.in.dot.\n_ ftp://domain.name/path(unbalanced_brackets/ending.in.dot.\n_ leading junk ftp://domain.name/path/embedded?punct/uation.\n_ leading junk ftp://domain.name/dangling_close_paren)\n_ if you have other interesting URIs for testing, please add them here:"))
end
$define GENDELIM ':/?#[]@' $define SUBDELIM '!$&()*+,;=\ $define UNRESERVED &letters ++ &digits ++ '-._~' $define RESERVED GENDELIM++SUBDELIM $define HEXDIGITS '0123456789aAbBcCdDeEfF'
procedure findURItext(s) #: generate all syntatically valid URI's from s
local u s ? while tab(upto(&letters)) || (u := URI()) do suspend u
end
procedure URI() #: match longest URI at cursor
static sc2 initial sc2 := &letters ++ &digits ++ '+-.' # scheme suspend ( ( tab(any(&letters)) || (tab(many(sc2)) |="") || =":" ) || # scheme ( (="//" || authority() || arbsp("/",segment)) | # heir ... (="/" || ( path_rootless() |="")) | path_rootless() | ="" ) || ( ( ="?" || queryfrag() ) |="" ) || # query ( ( ="#" || queryfrag() ) |="" ) # fragment )
end
procedure queryfrag() #: match a query or fragment
static pc initial pc := UNRESERVED ++ SUBDELIM ++ ':@/?' suspend arbcp(pc,pctencode)
end
procedure segment(n) #: match a pchar segment
static sc initial sc := UNRESERVED ++ SUBDELIM ++ ':@' suspend arbcp(sc,pctencode,n)
end
procedure segmentnc(n) #: match a pchar--':' segment
static sc initial sc := UNRESERVED ++ SUBDELIM ++ '@' suspend arbcp(sc,pctencode,n)
end
procedure path_rootless() #: match a rootless path
suspend segment(1) || arbsp("/",segment)
end
procedure authority() #: match authority
static uic,rnc initial { rnc := UNRESERVED ++ SUBDELIM # regular name uic := rnc ++ ':' # userinfo } suspend ( (arbcp(uic,pctencode) || ="@") |="") || # userinfo ( IPsimple() | arbcp(rnc,pctencode) ) || # host ( (=":" || tab(many(&digits))) |="")
end
procedure IPsimple() #: match ip address (trickable )
static i4c,i6c,ifc initial { i4c := &digits ++ '.' i6c := HEXDIGITS ++ '.:' ifc := UNRESERVED ++ SUBDELIM ++ ':' } suspend ( ="[" || ( tab(many(i6c)) | ( ="v"||tab(any(HEXDIGITS))||="."||tab(any(ifc))||tab(many(ifc)) ) ) || ="]" ) | tab(many(i4c))
end
procedure arbcp(cs,pr,n) #: match arbitrary numbers of (cset|proc,n)
local p,i /n := 0 # for 0* / 1* runerr(0 > n,205) p := &pos i := 0 while tab(many(cs)) | pr() do i +:= 1 if i >= n then suspend &subject[p:&pos] &pos := p # restore &pos
end
procedure arbsp(st,pr,n) #: match arbitrary numbers of (string || proc,n)
local p,i /n := 0 # for 0* / 1* runerr(0 > n,205) p := &pos i := 0 while =st || pr() do i +:= 1 if i >= n then suspend &subject[p:&pos] &pos := p # restore &pos
end
procedure pctencode() #: match 1 % encoded single byte character
suspend ="%" || tab(any(HEXDIGITS)) || tab(any(HEXDIGITS))
end</lang>
Output:
stop: http://en.wikipedia.org/wiki/Erich_K http://mediawiki.org/). parser: http://en.wikipedia.org/wiki/-) ftp://domain.name/path(balanced_brackets)/foo.html ftp://domain.name/path(balanced_brackets)/ending.in.dot. ftp://domain.name/path(unbalanced_brackets/ending.in.dot. ftp://domain.name/path/embedded?punct/uation. ftp://domain.name/dangling_close_paren) here:
Pike
<lang Pike>string uritext = "this URI contains an illegal character, parentheses and a misplaced full stop:\n" "http://en.wikipedia.org/wiki/Erich_Kästner_(camera_designer). (which is handled by http://mediawiki.org/).\n" "and another one just to confuse the parser: http://en.wikipedia.org/wiki/-)\n";
array find_uris(string uritext) {
array uris=({}); int pos=0; while((pos = search(uritext, "://", pos+1))>0) { int prepos = sizeof(array_sscanf(reverse(uritext[pos-20..pos-1]), "%[a-zA-Z0-9+.-]%s")[0]); int postpos = sizeof(array_sscanf(uritext[pos+3..], "%[^\n\r\t <>\"]%s")[0]);
if ((<'.',',','?','!',';'>)[uritext[pos+postpos+2]]) postpos--; if (uritext[pos-prepos-1]=='(' && uritext[pos+postpos+2]==')') postpos--; if (uritext[pos-prepos-1]=='\ && uritext[pos+postpos+2]=='\) postpos--; uris+= ({ uritext[pos-prepos..pos+postpos+2] }); } return uris;
}
find_uris(uritext); Result: ({ /* 3 elements */
"http://en.wikipedia.org/wiki/Erich_K\303\244stner_(camera_designer)", "http://mediawiki.org/)", "http://en.wikipedia.org/wiki/-)" })</lang>
TXR
<lang txr>@(define path (path))@\
@(local x y)@\ @(cases)@\ (@(path x))@(path y)@(bind path `(@x)@y`)@\ @(or)@\ @{x /[.,;'!?][^ \t\f\v]/}@(path y)@(bind path `@x@y`)@\ @(or)@\ @{x /[^ .,;'!?()\t\f\v]/}@(path y)@(bind path `@x@y`)@\ @(or)@\ @(bind path "")@\ @(end)@\
@(end) @(define url (url))@\
@(local proto domain path)@\ @{proto /[A-Za-z]+/}://@{domain /[^ \/\t\f\v]+/}@\ @(cases)/@(path path)@\ @(bind url `@proto://@domain/@path`)@\ @(or)@\ @(bind url `@proto://@domain`)@\ @(end)@\
@(end) @(collect) @ (all) @line @ (and) @ (coll)@(url url)@(end)@(flatten url) @ (end) @(end) @(output) LINE
URLS
@ (repeat) @line @ (repeat)
@url
@ (end) @ (end) @(end)</lang>
Test file:
$ cat url-data Blah blah http://en.wikipedia.org/wiki/Erich_Kästner_(camera_designer). (Handled by http://mediawiki.org/). Confuse the parser: http://en.wikipedia.org/wiki/-) ftp://domain.name/path(balanced_brackets)/foo.html ftp://domain.name/path(balanced_brackets)/ending.in.dot. ftp://domain.name/path(unbalanced_brackets/ending.in.dot. leading junk ftp://domain.name/path/embedded?punct/uation. leading junk ftp://domain.name/dangling_close_paren)
Run:
$ txr url.txr url-data LINE URLS ---------------------- Blah blah http://en.wikipedia.org/wiki/Erich_Kästner_(camera_designer). (Handled by http://mediawiki.org/). http://en.wikipedia.org/wiki/Erich_Kästner_(camera_designer) http://mediawiki.org/ Confuse the parser: http://en.wikipedia.org/wiki/-) http://en.wikipedia.org/wiki/- ftp://domain.name/path(balanced_brackets)/foo.html ftp://domain.name/path(balanced_brackets)/foo.html ftp://domain.name/path(balanced_brackets)/ending.in.dot. ftp://domain.name/path(balanced_brackets)/ending.in.dot ftp://domain.name/path(unbalanced_brackets/ending.in.dot. ftp://domain.name/path leading junk ftp://domain.name/path/embedded?punct/uation. ftp://domain.name/path/embedded?punct/uation leading junk ftp://domain.name/dangling_close_paren) ftp://domain.name/dangling_close_paren