Web scraping: Difference between revisions
Content added Content deleted
Thundergnat (talk | contribs) (Rename Perl 6 -> Raku, alphabetize, minor clean-up) |
|||
Line 96: | Line 96: | ||
end loop; |
end loop; |
||
end Get_UTC_Time;</lang> |
end Get_UTC_Time;</lang> |
||
=={{header|AutoHotkey}}== |
|||
<lang AutoHotkey>UrlDownloadToFile, http://tycho.usno.navy.mil/cgi-bin/timer.pl, time.html |
|||
FileRead, timefile, time.html |
|||
pos := InStr(timefile, "UTC") |
|||
msgbox % time := SubStr(timefile, pos - 9, 8)</lang> |
|||
=={{header|AWK}}== |
|||
This is inspired by the [http://www.gnu.org/software/gawk/manual/gawkinet/html_node/GETURL.html#GETURL GETURL] example in the gawk manual. |
|||
<tt><lang awk>#! /usr/bin/awk -f |
|||
BEGIN { |
|||
purl = "/inet/tcp/0/tycho.usno.navy.mil/80" |
|||
ORS = RS = "\r\n\r\n" |
|||
print "GET /cgi-bin/timer.pl HTTP/1.0" |& purl |
|||
purl |& getline header |
|||
while ( (purl |& getline ) > 0 ) |
|||
{ |
|||
split($0, a, "\n") |
|||
for(i=1; i <= length(a); i++) |
|||
{ |
|||
if ( a[i] ~ /UTC/ ) |
|||
{ |
|||
sub(/^<BR>/, "", a[i]) |
|||
printf "%s\n", a[i] |
|||
} |
|||
} |
|||
} |
|||
close(purl) |
|||
}</lang></tt> |
|||
=={{header|ALGOL 68}}== |
=={{header|ALGOL 68}}== |
||
Line 213: | Line 181: | ||
<pre>12:30:15</pre> |
<pre>12:30:15</pre> |
||
=={{header|AutoHotkey}}== |
|||
<lang AutoHotkey>UrlDownloadToFile, http://tycho.usno.navy.mil/cgi-bin/timer.pl, time.html |
|||
FileRead, timefile, time.html |
|||
pos := InStr(timefile, "UTC") |
|||
msgbox % time := SubStr(timefile, pos - 9, 8)</lang> |
|||
=={{header|AWK}}== |
|||
This is inspired by the [http://www.gnu.org/software/gawk/manual/gawkinet/html_node/GETURL.html#GETURL GETURL] example in the gawk manual. |
|||
<tt><lang awk>#! /usr/bin/awk -f |
|||
BEGIN { |
|||
purl = "/inet/tcp/0/tycho.usno.navy.mil/80" |
|||
ORS = RS = "\r\n\r\n" |
|||
print "GET /cgi-bin/timer.pl HTTP/1.0" |& purl |
|||
purl |& getline header |
|||
while ( (purl |& getline ) > 0 ) |
|||
{ |
|||
split($0, a, "\n") |
|||
for(i=1; i <= length(a); i++) |
|||
{ |
|||
if ( a[i] ~ /UTC/ ) |
|||
{ |
|||
sub(/^<BR>/, "", a[i]) |
|||
printf "%s\n", a[i] |
|||
} |
|||
} |
|||
} |
|||
close(purl) |
|||
}</lang></tt> |
|||
=={{header|BBC BASIC}}== |
=={{header|BBC BASIC}}== |
||
Line 291: | Line 291: | ||
return 0; |
return 0; |
||
}</lang> |
}</lang> |
||
=={{header|C sharp|C#}}== |
|||
<lang csharp>class Program |
|||
{ |
|||
static void Main(string[] args) |
|||
{ |
|||
WebClient wc = new WebClient(); |
|||
Stream myStream = wc.OpenRead("http://tycho.usno.navy.mil/cgi-bin/timer.pl"); |
|||
string html = ""; |
|||
using (StreamReader sr = new StreamReader(myStream)) |
|||
{ |
|||
while (sr.Peek() >= 0) |
|||
{ |
|||
html = sr.ReadLine(); |
|||
if (html.Contains("UTC")) |
|||
{ |
|||
break; |
|||
} |
|||
} |
|||
} |
|||
Console.WriteLine(html.Remove(0, 4)); |
|||
Console.ReadLine(); |
|||
} |
|||
} |
|||
</lang> |
|||
=={{header|C++}}== |
=={{header|C++}}== |
||
Line 319: | Line 346: | ||
} |
} |
||
}</lang> |
}</lang> |
||
=={{header|C sharp|C#}}== |
|||
<lang csharp>class Program |
|||
{ |
|||
static void Main(string[] args) |
|||
{ |
|||
WebClient wc = new WebClient(); |
|||
Stream myStream = wc.OpenRead("http://tycho.usno.navy.mil/cgi-bin/timer.pl"); |
|||
string html = ""; |
|||
using (StreamReader sr = new StreamReader(myStream)) |
|||
{ |
|||
while (sr.Peek() >= 0) |
|||
{ |
|||
html = sr.ReadLine(); |
|||
if (html.Contains("UTC")) |
|||
{ |
|||
break; |
|||
} |
|||
} |
|||
} |
|||
Console.WriteLine(html.Remove(0, 4)); |
|||
Console.ReadLine(); |
|||
} |
|||
} |
|||
</lang> |
|||
=={{header|Caché ObjectScript}}== |
=={{header|Caché ObjectScript}}== |
||
Line 1,277: | Line 1,277: | ||
end; |
end; |
||
end;</lang> |
end;</lang> |
||
=={{header|Microsoft Small Basic}}== |
|||
<lang vb> |
|||
'Entered by AykayayCiti -- Earl L. Montgomery |
|||
url_name = "http://tycho.usno.navy.mil/cgi-bin/timer.pl" |
|||
url_data = Network.GetWebPageContents(url_name) |
|||
find = "UTC" |
|||
' the length from the UTC to the time is -18 so we need |
|||
' to subtract from the UTC position |
|||
pos = Text.GetIndexOf(url_data,find)-18 |
|||
result = Text.GetSubText(url_data,pos,(18+3)) 'plus 3 to add the UTC |
|||
TextWindow.WriteLine(result) |
|||
'you can eliminate a line of code by putting the |
|||
' GetIndexOf inside the GetSubText |
|||
'result2 = Text.GetSubText(url_data,Text.GetIndexOf(url_data,find)-18,(18+3)) |
|||
'TextWindow.WriteLine(result2)</lang> |
|||
{{out}} |
|||
<pre> |
|||
Mar. 19, 04:19:34 UTC |
|||
Press any key to continue... |
|||
</pre> |
|||
=={{header|mIRC Scripting Language}}== |
=={{header|mIRC Scripting Language}}== |
||
Line 1,302: | Line 1,324: | ||
} |
} |
||
}</lang> |
}</lang> |
||
=={{header|Microsoft Small Basic}}== |
|||
<lang vb> |
|||
'Entered by AykayayCiti -- Earl L. Montgomery |
|||
url_name = "http://tycho.usno.navy.mil/cgi-bin/timer.pl" |
|||
url_data = Network.GetWebPageContents(url_name) |
|||
find = "UTC" |
|||
' the length from the UTC to the time is -18 so we need |
|||
' to subtract from the UTC position |
|||
pos = Text.GetIndexOf(url_data,find)-18 |
|||
result = Text.GetSubText(url_data,pos,(18+3)) 'plus 3 to add the UTC |
|||
TextWindow.WriteLine(result) |
|||
'you can eliminate a line of code by putting the |
|||
' GetIndexOf inside the GetSubText |
|||
'result2 = Text.GetSubText(url_data,Text.GetIndexOf(url_data,find)-18,(18+3)) |
|||
'TextWindow.WriteLine(result2)</lang> |
|||
{{out}} |
|||
<pre> |
|||
Mar. 19, 04:19:34 UTC |
|||
Press any key to continue... |
|||
</pre> |
|||
=={{header|NetRexx}}== |
=={{header|NetRexx}}== |
||
Line 1,507: | Line 1,505: | ||
get($url) =~ /<BR>(.+? UTC)/ |
get($url) =~ /<BR>(.+? UTC)/ |
||
and print "$1\n";</lang> |
and print "$1\n";</lang> |
||
=={{header|Perl 6}}== |
|||
<lang perl6>use HTTP::Client; # https://github.com/supernovus/perl6-http-client/ |
|||
my $site = "http://tycho.usno.navy.mil/cgi-bin/timer.pl"; |
|||
HTTP::Client.new.get($site).content.match(/'<BR>'( .+? <ws> UTC )/)[0].say</lang> |
|||
Note that text between '<' and '>' is interpreted as a regex token, so to match a literal '<BR>' you need to quote it, while <ws> refers to the built-in whitespace token. |
|||
Also, whitespace is ignored by default in Perl 6 regexes. |
|||
=={{header|Phix}}== |
=={{header|Phix}}== |
||
Line 1,701: | Line 1,691: | ||
"https://tycho.usno.navy.mil/cgi-bin/timer.pl") |
"https://tycho.usno.navy.mil/cgi-bin/timer.pl") |
||
</lang> |
</lang> |
||
=={{header|Raku}}== |
|||
(formerly Perl 6) |
|||
<lang perl6>use HTTP::Client; # https://github.com/supernovus/perl6-http-client/ |
|||
my $site = "http://tycho.usno.navy.mil/cgi-bin/timer.pl"; |
|||
HTTP::Client.new.get($site).content.match(/'<BR>'( .+? <ws> UTC )/)[0].say</lang> |
|||
Note that text between '<' and '>' is interpreted as a regex token, so to match a literal '<BR>' you need to quote it, while <ws> refers to the built-in whitespace token. |
|||
Also, whitespace is ignored by default in Raku regexes. |
|||
=={{header|REBOL}}== |
=={{header|REBOL}}== |
||
Line 1,728: | Line 1,727: | ||
print ["Current UTC time:" current]</lang> |
print ["Current UTC time:" current]</lang> |
||
=={{header|Run BASIC}}== |
|||
<lang runbasic>print word$(word$(httpget$("http://tycho.usno.navy.mil/cgi-bin/timer.pl"),1,"UTC"),2,"<BR>")</lang> |
|||
{{out}} |
|||
<pre>May. 09, 16:13:44</pre> |
|||
=={{header|Ruby}}== |
=={{header|Ruby}}== |
||
Line 1,755: | Line 1,749: | ||
puts URI.parse('http://tycho.usno.navy.mil/cgi-bin/timer.pl').read.match(/ (\d{1,2}:\d{1,2}:\d{1,2}) UTC/)[1] |
puts URI.parse('http://tycho.usno.navy.mil/cgi-bin/timer.pl').read.match(/ (\d{1,2}:\d{1,2}:\d{1,2}) UTC/)[1] |
||
</lang> |
</lang> |
||
=={{header|Run BASIC}}== |
|||
<lang runbasic>print word$(word$(httpget$("http://tycho.usno.navy.mil/cgi-bin/timer.pl"),1,"UTC"),2,"<BR>")</lang> |
|||
{{out}} |
|||
<pre>May. 09, 16:13:44</pre> |
|||
=={{header|Scala}}== |
=={{header|Scala}}== |
||
Line 1,839: | Line 1,838: | ||
Oct. 27, 00:20:50 UTC |
Oct. 27, 00:20:50 UTC |
||
</pre> |
</pre> |
||
=={{header|Tcl}}== |
=={{header|Tcl}}== |
||
Line 1,854: | Line 1,852: | ||
<lang tcl>set data [exec curl -s http://tycho.usno.navy.mil/cgi-bin/timer.pl] |
<lang tcl>set data [exec curl -s http://tycho.usno.navy.mil/cgi-bin/timer.pl] |
||
puts [lrange [lsearch -glob -inline [split $data <BR>] *UTC*] 0 3]</lang> |
puts [lrange [lsearch -glob -inline [split $data <BR>] *UTC*] 0 3]</lang> |
||
=={{header|ToffeeScript}}== |
|||
<lang coffeescript>e, page = require('request').get! 'http://tycho.usno.navy.mil/cgi-bin/timer.pl' |
|||
l = line for line in page.body.split('\n') when line.indexOf('UTC')>0 |
|||
console.log l.substr(4,l.length-20)</lang> |
|||
=={{header|TUSCRIPT}}== |
=={{header|TUSCRIPT}}== |
||
Line 1,861: | Line 1,864: | ||
SET utc = FILTER (time,":*UTC*:",-) |
SET utc = FILTER (time,":*UTC*:",-) |
||
</lang> |
</lang> |
||
=={{header|ToffeeScript}}== |
|||
<lang coffeescript>e, page = require('request').get! 'http://tycho.usno.navy.mil/cgi-bin/timer.pl' |
|||
l = line for line in page.body.split('\n') when line.indexOf('UTC')>0 |
|||
console.log l.substr(4,l.length-20)</lang> |
|||
=={{header|TXR}}== |
=={{header|TXR}}== |
||
Line 2,040: | Line 2,037: | ||
Apr. 21, 21:02:03 UTC Universal Time |
Apr. 21, 21:02:03 UTC Universal Time |
||
</pre> |
</pre> |
||
=={{header|Visual Basic .NET}}== |
=={{header|Visual Basic .NET}}== |