Web scraping: Difference between revisions

no edit summary
(Undo revision 259816 by AykayayCiti (talk))
No edit summary
Line 26:
<TITLE>What time is it?</TITLE>
<H2> US Naval Observatory Master Clock Time</H2> <H3><PRE>
<BR>Jul. 27, 22:57:22 UTC Universal Time
<BR>Jul. 27, 06:57:22 PM EDT Eastern Time
<BR>Jul. 27, 05:57:22 PM CDT Central Time
<BR>Jul. 27, 04:57:22 PM MDT Mountain Time
<BR>Jul. 27, 03:57:22 PM PDT Pacific Time
<BR>Jul. 27, 02:57:22 PM AKDT Alaska Time
<BR>Jul. 27, 12:57:22 PM HAST Hawaii-Aleutian Time
 
...
Line 50:
: get-time
read-url
/<BR>.*?(\d{2}:\d{2}:\d{2})\sUTC/
tuck r:match if
1 r:@ . cr
Line 187:
<lang dos>
when ScrapeButton.Click do
set ScrapeWeb.Url to SourceTextBox.Text
call ScrapeWeb.Get
 
when ScrapeWeb.GotText url,responseCode,responseType,responseContent do
initialize local Left to split at first text (text: get responseContent, at: PreTextBox.Text)
initialize local Right to "" in
set Right to select list item (list: get Left, index: 2)
set ResultLabel.Text to select list item (list: split at first (text:get Right, at: PostTextBox.Text), index: 1)
</lang>
 
Line 340:
/// Fetch a page over HTTP and collect, from every line that matches a regular
/// expression, the text starting at the match and running up to the first tab.
///
/// pHost  - server name; must look like a dotted host name or the call fails
///          with "Invalid host name."
/// pPath  - request path passed to %Net.HttpRequest.Get
/// pRegEx - regular expression handed to $Locate for each line
/// list   - (output) $List of the extracted pieces, one per matching line
///
/// Returns a %Status: $$$OK on success, otherwise an error status describing
/// the bad host, the failed request, the bad regular expression or the
/// underlying system error.
ClassMethod ExtractHTMLData(pHost As %String = "", pPath As %String = "", pRegEx As %String = "", Output list As %List) As %Status
{
	// implement error handling
	Try {

		// some initialisation
		Set list="", sc=$$$OK

		// check input parameters
		If $Match(pHost, "^([a-zA-Z0-9]([a-zA-Z0-9\-]{0,61}[a-zA-Z0-9])?\.)+[a-zA-Z]{2,6}$")=0 {
			Set sc=$$$ERROR($$$GeneralError, "Invalid host name.")
			Quit
		}

		// create http request and get page
		Set req=##class(%Net.HttpRequest).%New()
		Set req.Server=pHost
		// keep the status returned by Get so a transport failure is reported
		// as such instead of surfacing later as an unrelated error
		Set sc=req.Get(pPath)
		If $$$ISERR(sc) {
			Quit
		}

		// check for success: any 2xx status code is accepted
		If $Extract(req.HttpResponse.StatusCode)'=2 {
			Set sc=$$$ERROR($$$GeneralError, "Page not loaded.")
			Quit
		}

		// prepare the http response stream for line-by-line reading
		Set html=req.HttpResponse.Data
		Set html.LineTerminator=$Char(10)
		Set sc=html.Rewind()

		// scan every line of the response for the pattern
		While 'html.AtEnd {
			Set line=html.ReadLine(, .sc, .eol)
			Set pos=$Locate(line, pRegEx)

			// on a match keep the text from the match position up to the first tab
			If pos {
				Set parse=$Piece($Extract(line, pos, *), $Char(9))
				Set slot=$ListLength(list)+1
				Set $List(list, slot)=parse
			}
		}

	} Catch err {
		// an error has occurred
		If err.Name="<REGULAR EXPRESSION>" {
			Set sc=$$$ERROR($$$GeneralError, "Invalid regular expression.")
		} Else {
			Set sc=$$$ERROR($$$CacheError, $ZError)
		}
	}

	// return status
	Quit sc
}
 
Line 658:
begin
{ The line we're looking for is something like this:
<BR>May. 04. 21:55:19 UTC Universal Time }
 
// Check each line
Line 767:
 
%% Entry point: request the page named by the ?Url macro, apply the ?Match
%% regular expression to the response body and print the captured text.
%% Any failed pattern match below (request error, no regex match) crashes
%% the function with a badmatch.
main() ->
%% start the inets application before issuing the httpc request
inets:start(),
{ok, {_Status, _Header, HTML}} = httpc:request(?Url),
%% all_but_first drops the whole-match entry, leaving only the subpattern
%% captures; exactly one capture is expected here
{match, [Time]} = re:run(HTML, ?Match, [{capture, all_but_first, binary}]),
io:format("~s~n",[Time]).</lang>
 
=={{header|F_Sharp|F#}}==
Line 966:
 
/**
 * Prints the first line of the timer page that contains "UTC", with its
 * leading four characters (the {@code <BR>} tag) removed.
 */
public class WebTime{
	/**
	 * Downloads the page and scans it line by line.
	 *
	 * @param args unused
	 */
	public static void main(String[] args){
		try{
			URL address = new URL(
					"http://tycho.usno.navy.mil/cgi-bin/timer.pl");
			URLConnection conn = address.openConnection();
			// try-with-resources closes the reader (and the underlying
			// connection stream) on every exit path
			try(BufferedReader in = new BufferedReader(
					new InputStreamReader(conn.getInputStream()))){
				String line;
				// readLine() returns null at end of stream; stop there instead
				// of dereferencing null when no line mentions UTC
				while((line = in.readLine()) != null && !line.contains("UTC"));
				if(line == null){
					System.err.println("no UTC line found in response.");
				}else{
					System.out.println(line.substring(4));
				}
			}
		}catch(IOException e){
			System.err.println("error connecting to server.");
			e.printStackTrace();
		}
	}
}
}</lang>
 
Line 1,003:
{{out}}
<lang sh>$ ./Web_scraping.jq
Apr. 21, 05:19:32 UTC Universal Time</lang>
 
=={{header|Julia}}==
Line 1,096:
local(raw_htmlstring = '<TITLE>What time is it?</TITLE>
<H2> US Naval Observatory Master Clock Time</H2> <H3><PRE>
<BR>Jul. 27, 22:57:22 UTC Universal Time
<BR>Jul. 27, 06:57:22 PM EDT Eastern Time
<BR>Jul. 27, 05:57:22 PM CDT Central Time
<BR>Jul. 27, 04:57:22 PM MDT Mountain Time
<BR>Jul. 27, 03:57:22 PM PDT Pacific Time
<BR>Jul. 27, 02:57:22 PM AKDT Alaska Time
<BR>Jul. 27, 12:57:22 PM HAST Hawaii-Aleutian Time
</PRE></H3>
')
Line 1,111:
 
local(
reg_exp = regexp(-find = `<br>(.*?) UTC`, -input = #raw_htmlstring, -ignorecase),
datepart_txt = #reg_exp -> find ? #reg_exp -> matchstring(1) | string
)
 
Line 1,186:
tok = s(ix(k-1)+4:ix(k)-1);
if findstr(tok,'UTC')
disp(tok);
end;
end;</lang>
Line 1,421:
 
// Download the page and print only the text captured between a "<BR>" and
// the following " UTC": the pattern is anchored at both ends, so the whole
// document is replaced by capture group 1.
echo preg_replace(
// "s" lets "." match newlines so the pattern can span the whole page;
// "u" makes PCRE treat pattern and subject as UTF-8
"/^.*<BR>(.*) UTC.*$/su",
"\\1",
file_get_contents('http://tycho.usno.navy.mil/cgi-bin/timer.pl')
);
</lang>
Line 1,558:
=={{header|REBOL}}==
<lang REBOL>REBOL [
Title: "Web Scraping"
Author: oofoe
Date: 2009-12-07
URL: http://rosettacode.org/wiki/Web_Scraping
]
 
Line 1,618:
 
/** Prints the first line of the timer page that contains "UTC", minus its
  * leading four characters (the `<BR>` tag), or "error" when no such line
  * exists.
  *
  * Extends `App` rather than `Application`: the `Application` trait was
  * deprecated and later removed from the standard library, so the original
  * declaration no longer compiles on current Scala versions.
  */
object WebTime extends App {
  val text = Source.fromURL("http://tycho.usno.navy.mil/cgi-bin/timer.pl")
  try {
    // getLines() is lazy; find stops reading at the first matching line
    val utc = text.getLines().find(_.contains("UTC"))
    utc match {
      case Some(s) => println(s.substring(4))
      case _ => println("error")
    }
  } finally {
    // release the underlying stream whether or not a line was found
    text.close()
  }
}
</lang>
Line 1,847:
Debug.Print ReturnValue
MsgBox (ReturnValue)
End Sub</lang>
 
 
Line 1,856:
Mar. 05, 00:57:37 UTC Universal Time
</pre>
 
=={{header|VBScript}}==
<lang vb>Function GetUTC()
    ' Downloads the page at Url and returns the first line containing "UTC"
    ' with its markup removed by StripHttpTags. Returns Empty when no line
    ' contains "UTC".
    '
    ' VBScript has no typed declarations, so the VBA-style "As String"
    ' clause is omitted; with it the script fails to compile.
    Dim Url, arrt, t
    Url = "http://tycho.usno.navy.mil/cgi-bin/timer.pl"
    With CreateObject("MSXML2.XMLHTTP.6.0")
        ' third argument False makes the request synchronous, so
        ' responseText is complete once send returns
        .Open "GET", Url, False
        .send
        arrt = Split(.responseText, vbLf)
    End With
    For Each t In arrt
        If InStr(t, "UTC") Then
            GetUTC = StripHttpTags(t)
            Exit For
        End If
    Next
End Function
Function StripHttpTags(s)
    ' Returns s with every "<...>" run removed (shortest match per tag).
    ' When s contains no such run it is handed back unchanged.
    Dim tagMatcher
    Set tagMatcher = New RegExp
    tagMatcher.Global = True          ' replace all occurrences, not just the first
    tagMatcher.Pattern = "\<.+?\>"
    If Not tagMatcher.Test(s) Then
        StripHttpTags = s
    Else
        StripHttpTags = tagMatcher.Replace(s, "")
    End If
End Function
 
' Script body: write the value returned by GetUTC to standard output,
' then terminate the line.
WScript.StdOut.Write GetUTC
WScript.StdOut.WriteLine</lang>
 
{{Out}}
<pre>
Run getTime Subroutine
Apr. 21, 21:02:03 UTC Universal Time
</pre>
 
 
 
=={{header|Visual Basic .NET}}==
Line 1,894 ⟶ 1,935:
c:=data.seek(Void,0); // start of line
line:=data[c,data.seek(Void,1)-c].text;
line.print(); // the HTML UTC line
 
re:=RegExp(0'|.*(\d\d:\d\d:\d\d)|); // get time
Line 1,901 ⟶ 1,942:
{{out}}
<pre>
<BR>Mar. 18, 06:18:31 UTC Universal Time
06:18:31
</pre>