Web scraping: Difference between revisions

Content added Content deleted
(Rename Perl 6 -> Raku, alphabetize, minor clean-up)
Line 96: Line 96:
end loop;
end loop;
end Get_UTC_Time;</lang>
end Get_UTC_Time;</lang>

=={{header|AutoHotkey}}==
<lang AutoHotkey>; Save the USNO clock page to time.html, read it back, and show the
; 8 characters that start 9 positions before the first "UTC".
UrlDownloadToFile, http://tycho.usno.navy.mil/cgi-bin/timer.pl, time.html
FileRead, pageText, time.html
utcPos := InStr(pageText, "UTC")
time := SubStr(pageText, utcPos - 9, 8)
MsgBox % time</lang>

=={{header|AWK}}==

This is inspired by the [http://www.gnu.org/software/gawk/manual/gawkinet/html_node/GETURL.html#GETURL GETURL] example in the gawk manual.

<tt><lang awk>#! /usr/bin/gawk -f
# Print every line of the USNO time page that mentions UTC.
# Needs gawk: the "/inet/tcp/..." special file and the |& two-way pipe
# operator are gawk extensions, so the interpreter line names gawk.

BEGIN {
    server = "/inet/tcp/0/tycho.usno.navy.mil/80"
    # A blank line (CRLF CRLF) both terminates the request we print and
    # separates the records we read back, so the first record read is the
    # response header block.
    ORS = RS = "\r\n\r\n"
    print "GET /cgi-bin/timer.pl HTTP/1.0" |& server
    server |& getline header    # consume the response headers
    while ((server |& getline) > 0) {
        # split() returns the number of pieces, so length(array) is not needed.
        n = split($0, line, "\n")
        for (i = 1; i <= n; i++) {
            if (line[i] ~ /UTC/) {
                sub(/^<BR>/, "", line[i])    # drop the leading tag
                printf "%s\n", line[i]
            }
        }
    }
    close(server)
}</lang></tt>


=={{header|ALGOL 68}}==
=={{header|ALGOL 68}}==
Line 213: Line 181:


<pre>12:30:15</pre>
<pre>12:30:15</pre>

=={{header|AutoHotkey}}==
<lang AutoHotkey>; Save the USNO clock page to time.html, read it back, and show the
; 8 characters that start 9 positions before the first "UTC".
UrlDownloadToFile, http://tycho.usno.navy.mil/cgi-bin/timer.pl, time.html
FileRead, pageText, time.html
utcPos := InStr(pageText, "UTC")
time := SubStr(pageText, utcPos - 9, 8)
MsgBox % time</lang>

=={{header|AWK}}==

This is inspired by the [http://www.gnu.org/software/gawk/manual/gawkinet/html_node/GETURL.html#GETURL GETURL] example in the gawk manual.

<tt><lang awk>#! /usr/bin/gawk -f
# Print every line of the USNO time page that mentions UTC.
# Needs gawk: the "/inet/tcp/..." special file and the |& two-way pipe
# operator are gawk extensions, so the interpreter line names gawk.

BEGIN {
    server = "/inet/tcp/0/tycho.usno.navy.mil/80"
    # A blank line (CRLF CRLF) both terminates the request we print and
    # separates the records we read back, so the first record read is the
    # response header block.
    ORS = RS = "\r\n\r\n"
    print "GET /cgi-bin/timer.pl HTTP/1.0" |& server
    server |& getline header    # consume the response headers
    while ((server |& getline) > 0) {
        # split() returns the number of pieces, so length(array) is not needed.
        n = split($0, line, "\n")
        for (i = 1; i <= n; i++) {
            if (line[i] ~ /UTC/) {
                sub(/^<BR>/, "", line[i])    # drop the leading tag
                printf "%s\n", line[i]
            }
        }
    }
    close(server)
}</lang></tt>


=={{header|BBC BASIC}}==
=={{header|BBC BASIC}}==
Line 291: Line 291:
return 0;
return 0;
}</lang>
}</lang>

=={{header|C sharp|C#}}==
<lang csharp>using System;
using System.IO;
using System.Net;

/// <summary>
/// Downloads the USNO time page and prints the first line that mentions UTC.
/// </summary>
class Program
{
    static void Main(string[] args)
    {
        const string url = "http://tycho.usno.navy.mil/cgi-bin/timer.pl";
        string utcLine = null;

        // Dispose the client, the response stream and the reader when done.
        using (WebClient wc = new WebClient())
        using (Stream myStream = wc.OpenRead(url))
        using (StreamReader sr = new StreamReader(myStream))
        {
            // ReadLine() returns null at end of stream. Peek() is not used as the
            // loop condition because it is documented to return -1 when the reader
            // cannot seek, which could end the loop early on a network stream.
            string line;
            while ((line = sr.ReadLine()) != null)
            {
                if (line.Contains("UTC"))
                {
                    utcLine = line;
                    break;
                }
            }
        }

        if (utcLine == null)
        {
            // Nothing matched: report it instead of slicing an unrelated line.
            Console.Error.WriteLine("No line containing \"UTC\" was found.");
        }
        else
        {
            // Drop the leading "<BR>" tag when present.
            Console.WriteLine(utcLine.StartsWith("<BR>", StringComparison.Ordinal)
                ? utcLine.Substring(4)
                : utcLine);
        }

        // Keep the console window open until Enter is pressed.
        Console.ReadLine();
    }
}
</lang>


=={{header|C++}}==
=={{header|C++}}==
Line 319: Line 346:
}
}
}</lang>
}</lang>

=={{header|C sharp|C#}}==
<lang csharp>using System;
using System.IO;
using System.Net;

/// <summary>
/// Downloads the USNO time page and prints the first line that mentions UTC.
/// </summary>
class Program
{
    static void Main(string[] args)
    {
        const string url = "http://tycho.usno.navy.mil/cgi-bin/timer.pl";
        string utcLine = null;

        // Dispose the client, the response stream and the reader when done.
        using (WebClient wc = new WebClient())
        using (Stream myStream = wc.OpenRead(url))
        using (StreamReader sr = new StreamReader(myStream))
        {
            // ReadLine() returns null at end of stream. Peek() is not used as the
            // loop condition because it is documented to return -1 when the reader
            // cannot seek, which could end the loop early on a network stream.
            string line;
            while ((line = sr.ReadLine()) != null)
            {
                if (line.Contains("UTC"))
                {
                    utcLine = line;
                    break;
                }
            }
        }

        if (utcLine == null)
        {
            // Nothing matched: report it instead of slicing an unrelated line.
            Console.Error.WriteLine("No line containing \"UTC\" was found.");
        }
        else
        {
            // Drop the leading "<BR>" tag when present.
            Console.WriteLine(utcLine.StartsWith("<BR>", StringComparison.Ordinal)
                ? utcLine.Substring(4)
                : utcLine);
        }

        // Keep the console window open until Enter is pressed.
        Console.ReadLine();
    }
}
</lang>


=={{header|Caché ObjectScript}}==
=={{header|Caché ObjectScript}}==
Line 1,277: Line 1,277:
end;
end;
end;</lang>
end;</lang>

=={{header|Microsoft Small Basic}}==
<lang vb>
'Entered by AykayayCiti -- Earl L. Montgomery
' Print the date/time stamp that ends with "UTC" on the USNO clock page.
marker = "UTC"
page = Network.GetWebPageContents("http://tycho.usno.navy.mil/cgi-bin/timer.pl")
' The stamp begins 18 characters before the marker, so back up from
' the marker position by that amount.
stampStart = Text.GetIndexOf(page, marker) - 18
' 18 characters of date/time plus the 3 characters of "UTC" itself.
stampLength = 18 + 3
TextWindow.WriteLine(Text.GetSubText(page, stampStart, stampLength))

' You can eliminate the intermediate variables by putting the
' GetIndexOf call inside the GetSubText call:
'TextWindow.WriteLine(Text.GetSubText(page, Text.GetIndexOf(page, marker) - 18, 18 + 3))</lang>
{{out}}
<pre>
Mar. 19, 04:19:34 UTC
Press any key to continue...
</pre>


=={{header|mIRC Scripting Language}}==
=={{header|mIRC Scripting Language}}==
Line 1,302: Line 1,324:
}
}
}</lang>
}</lang>



=={{header|Microsoft Small Basic}}==
<lang vb>
'Entered by AykayayCiti -- Earl L. Montgomery
' Print the date/time stamp that ends with "UTC" on the USNO clock page.
marker = "UTC"
page = Network.GetWebPageContents("http://tycho.usno.navy.mil/cgi-bin/timer.pl")
' The stamp begins 18 characters before the marker, so back up from
' the marker position by that amount.
stampStart = Text.GetIndexOf(page, marker) - 18
' 18 characters of date/time plus the 3 characters of "UTC" itself.
stampLength = 18 + 3
TextWindow.WriteLine(Text.GetSubText(page, stampStart, stampLength))

' You can eliminate the intermediate variables by putting the
' GetIndexOf call inside the GetSubText call:
'TextWindow.WriteLine(Text.GetSubText(page, Text.GetIndexOf(page, marker) - 18, 18 + 3))</lang>
{{out}}
<pre>
Mar. 19, 04:19:34 UTC
Press any key to continue...
</pre>


=={{header|NetRexx}}==
=={{header|NetRexx}}==
Line 1,507: Line 1,505:
get($url) =~ /<BR>(.+? UTC)/
get($url) =~ /<BR>(.+? UTC)/
and print "$1\n";</lang>
and print "$1\n";</lang>

=={{header|Perl 6}}==
<lang perl6>use HTTP::Client; # https://github.com/supernovus/perl6-http-client/
# Capture the shortest text that follows a literal <BR> and ends with <ws> and UTC.
my $stamp = /'<BR>'( .+? <ws> UTC )/;
my $url   = "http://tycho.usno.navy.mil/cgi-bin/timer.pl";
say HTTP::Client.new.get($url).content.match($stamp)[0];</lang>

Note that the string between '<' and '>' refers to regex tokens, so to match a literal '&lt;BR&gt;' you need to quote it, while <ws> refers to the built-in token whitespace.
Also, whitespace is ignored by default in Perl&nbsp;6 regexes.


=={{header|Phix}}==
=={{header|Phix}}==
Line 1,701: Line 1,691:
"https://tycho.usno.navy.mil/cgi-bin/timer.pl")
"https://tycho.usno.navy.mil/cgi-bin/timer.pl")
</lang>
</lang>

=={{header|Raku}}==
(formerly Perl 6)
<lang perl6>use HTTP::Client; # https://github.com/supernovus/perl6-http-client/
# Capture the shortest text that follows a literal <BR> and ends with <ws> and UTC.
my $stamp = /'<BR>'( .+? <ws> UTC )/;
my $url   = "http://tycho.usno.navy.mil/cgi-bin/timer.pl";
say HTTP::Client.new.get($url).content.match($stamp)[0];</lang>

Note that text between '<' and '>' in a regex refers to a regex token, so a literal '&lt;BR&gt;' must be quoted, while <ws> refers to the built-in whitespace token.
Also, whitespace is ignored by default in Raku regexes.


=={{header|REBOL}}==
=={{header|REBOL}}==
Line 1,728: Line 1,727:


print ["Current UTC time:" current]</lang>
print ["Current UTC time:" current]</lang>

=={{header|Run BASIC}}==
<lang runbasic>' Take field 1 of the page split on "UTC", then field 2 of that split on "<BR>".
page$ = httpget$("http://tycho.usno.navy.mil/cgi-bin/timer.pl")
beforeUtc$ = word$(page$, 1, "UTC")
print word$(beforeUtc$, 2, "<BR>")</lang>
{{out}}
<pre>May. 09, 16:13:44</pre>


=={{header|Ruby}}==
=={{header|Ruby}}==
Line 1,755: Line 1,749:
puts URI.parse('http://tycho.usno.navy.mil/cgi-bin/timer.pl').read.match(/ (\d{1,2}:\d{1,2}:\d{1,2}) UTC/)[1]
puts URI.parse('http://tycho.usno.navy.mil/cgi-bin/timer.pl').read.match(/ (\d{1,2}:\d{1,2}:\d{1,2}) UTC/)[1]
</lang>
</lang>

=={{header|Run BASIC}}==
<lang runbasic>' Take field 1 of the page split on "UTC", then field 2 of that split on "<BR>".
page$ = httpget$("http://tycho.usno.navy.mil/cgi-bin/timer.pl")
beforeUtc$ = word$(page$, 1, "UTC")
print word$(beforeUtc$, 2, "<BR>")</lang>
{{out}}
<pre>May. 09, 16:13:44</pre>


=={{header|Scala}}==
=={{header|Scala}}==
Line 1,839: Line 1,838:
Oct. 27, 00:20:50 UTC
Oct. 27, 00:20:50 UTC
</pre>
</pre>



=={{header|Tcl}}==
=={{header|Tcl}}==
Line 1,854: Line 1,852:
<lang tcl>set data [exec curl -s http://tycho.usno.navy.mil/cgi-bin/timer.pl]
<lang tcl>set data [exec curl -s http://tycho.usno.navy.mil/cgi-bin/timer.pl]
puts [lrange [lsearch -glob -inline [split $data <BR>] *UTC*] 0 3]</lang>
puts [lrange [lsearch -glob -inline [split $data <BR>] *UTC*] 0 3]</lang>

=={{header|ToffeeScript}}==
<lang coffeescript># NOTE(review): `get!` is ToffeeScript's async-call form; presumably the two
# names on the left receive the callback's (error, response) arguments -- confirm.
err, response = require('request').get! 'http://tycho.usno.navy.mil/cgi-bin/timer.pl'
# Remember the last line in which 'UTC' occurs past index 0.
for line in response.body.split('\n') when line.indexOf('UTC') > 0
  utcLine = line
# Start at index 4 and take (length - 20) characters.
console.log utcLine.substr(4, utcLine.length - 20)</lang>


=={{header|TUSCRIPT}}==
=={{header|TUSCRIPT}}==
Line 1,861: Line 1,864:
SET utc = FILTER (time,":*UTC*:",-)
SET utc = FILTER (time,":*UTC*:",-)
</lang>
</lang>


=={{header|ToffeeScript}}==
<lang coffeescript># NOTE(review): `get!` is ToffeeScript's async-call form; presumably the two
# names on the left receive the callback's (error, response) arguments -- confirm.
err, response = require('request').get! 'http://tycho.usno.navy.mil/cgi-bin/timer.pl'
# Remember the last line in which 'UTC' occurs past index 0.
for line in response.body.split('\n') when line.indexOf('UTC') > 0
  utcLine = line
# Start at index 4 and take (length - 20) characters.
console.log utcLine.substr(4, utcLine.length - 20)</lang>


=={{header|TXR}}==
=={{header|TXR}}==
Line 2,040: Line 2,037:
Apr. 21, 21:02:03 UTC Universal Time
Apr. 21, 21:02:03 UTC Universal Time
</pre>
</pre>




=={{header|Visual Basic .NET}}==
=={{header|Visual Basic .NET}}==