Web scraping: Difference between revisions
Content added Content deleted
(Undo revision 259816 by AykayayCiti (talk)) |
No edit summary |
||
Line 26: | Line 26: | ||
<TITLE>What time is it?</TITLE> |
<TITLE>What time is it?</TITLE> |
||
<H2> US Naval Observatory Master Clock Time</H2> <H3><PRE> |
<H2> US Naval Observatory Master Clock Time</H2> <H3><PRE> |
||
<BR>Jul. 27, 22:57:22 UTC |
<BR>Jul. 27, 22:57:22 UTC Universal Time |
||
<BR>Jul. 27, 06:57:22 PM EDT |
<BR>Jul. 27, 06:57:22 PM EDT Eastern Time |
||
<BR>Jul. 27, 05:57:22 PM CDT |
<BR>Jul. 27, 05:57:22 PM CDT Central Time |
||
<BR>Jul. 27, 04:57:22 PM MDT |
<BR>Jul. 27, 04:57:22 PM MDT Mountain Time |
||
<BR>Jul. 27, 03:57:22 PM PDT |
<BR>Jul. 27, 03:57:22 PM PDT Pacific Time |
||
<BR>Jul. 27, 02:57:22 PM AKDT |
<BR>Jul. 27, 02:57:22 PM AKDT Alaska Time |
||
<BR>Jul. 27, 12:57:22 PM HAST |
<BR>Jul. 27, 12:57:22 PM HAST Hawaii-Aleutian Time |
||
... |
... |
||
Line 50: | Line 50: | ||
: get-time |
: get-time |
||
read-url |
read-url |
||
/<BR>.*?(\d{2}:\d{2}:\d{2})\sUTC/ |
/<BR>.*?(\d{2}:\d{2}:\d{2})\sUTC/ |
||
tuck r:match if |
tuck r:match if |
||
1 r:@ . cr |
1 r:@ . cr |
||
Line 187: | Line 187: | ||
<lang dos> |
<lang dos> |
||
when ScrapeButton.Click do |
when ScrapeButton.Click do |
||
set ScrapeWeb.Url to SourceTextBox.Text |
|||
call ScrapeWeb.Get |
|||
when ScrapeWeb.GotText url,responseCode,responseType,responseContent do |
when ScrapeWeb.GotText url,responseCode,responseType,responseContent do |
||
initialize local Left to split at first text (text: get responseContent, at: PreTextBox.Text) |
|||
initialize local Right to "" in |
|||
set Right to select list item (list: get Left, index: 2) |
|||
set ResultLabel.Text to select list item (list: split at first (text:get Right, at: PostTextBox.Text), index: 1) |
|||
</lang> |
</lang> |
||
Line 340: | Line 340: | ||
ClassMethod ExtractHTMLData(pHost As %String = "", pPath As %String = "", pRegEx As %String = "", Output list As %List) As %Status |
ClassMethod ExtractHTMLData(pHost As %String = "", pPath As %String = "", pRegEx As %String = "", Output list As %List) As %Status |
||
{ |
{ |
||
// implement error handling |
|||
Try { |
|||
// some initialisation |
|||
Set list="", sc=$$$OK |
|||
⚫ | |||
// check input parameters |
|||
If $Match(pHost, "^([a-zA-Z0-9]([a-zA-Z0-9\-]{0,61}[a-zA-Z0-9])?\.)+[a-zA-Z]{2,6}$")=0 { |
|||
Set sc=$$$ERROR($$$GeneralError, "Invalid host name.") |
|||
Quit |
|||
} |
|||
} |
|||
// create http request and get page |
|||
Set req=##class(%Net.HttpRequest).%New() |
|||
Set req.Server=pHost |
|||
Do req.Get(pPath) |
|||
// check for success |
|||
If $Extract(req.HttpResponse.StatusCode)'=2 { |
|||
Set sc=$$$ERROR($$$GeneralError, "Page not loaded.") |
|||
Quit |
|||
} |
|||
} |
|||
// read http response stream |
|||
Set html=req.HttpResponse.Data |
|||
Set html.LineTerminator=$Char(10) |
|||
Set sc=html.Rewind() |
|||
// read http response stream |
|||
While 'html.AtEnd { |
|||
Set line=html.ReadLine(, .sc, .eol) |
|||
Set pos=$Locate(line, pRegEx) |
|||
If pos { |
|||
Set parse=$Piece($Extract(line, pos, *), $Char(9)) |
|||
Set slot=$ListLength(list)+1 |
|||
Set $List(list, slot)=parse |
|||
} |
|||
} |
|||
} |
|||
} |
|||
} Catch err { |
|||
// an error has occurred |
|||
If err.Name="<REGULAR EXPRESSION>" { |
|||
Set sc=$$$ERROR($$$GeneralError, "Invalid regular expression.") |
|||
} Else { |
|||
Set sc=$$$ERROR($$$CacheError, $ZError) |
|||
} |
|||
} |
|||
} |
|||
// return status |
|||
Quit sc |
|||
} |
} |
||
Line 658: | Line 658: | ||
begin |
begin |
||
{ The line we're looking for is something like this: |
{ The line we're looking for is something like this: |
||
<BR>May. 04. 21:55:19 UTC |
<BR>May. 04. 21:55:19 UTC Universal Time } |
||
// Check each line |
// Check each line |
||
Line 767: | Line 767: | ||
main() -> |
main() -> |
||
inets:start(), |
|||
{ok, {_Status, _Header, HTML}} = httpc:request(?Url), |
|||
{match, [Time]} = re:run(HTML, ?Match, [{capture, all_but_first, binary}]), |
|||
io:format("~s~n",[Time]).</lang> |
|||
=={{header|F_Sharp|F#}}== |
=={{header|F_Sharp|F#}}== |
||
Line 966: | Line 966: | ||
public class WebTime{ |
public class WebTime{ |
||
public static void main(String[] args){ |
|||
try{ |
|||
URL address = new URL( |
|||
"http://tycho.usno.navy.mil/cgi-bin/timer.pl"); |
|||
URLConnection conn = address.openConnection(); |
|||
BufferedReader in = new BufferedReader( |
|||
new InputStreamReader(conn.getInputStream())); |
|||
String line; |
|||
while(!(line = in.readLine()).contains("UTC")); |
|||
System.out.println(line.substring(4)); |
|||
}catch(IOException e){ |
|||
System.err.println("error connecting to server."); |
|||
e.printStackTrace(); |
|||
} |
|||
} |
|||
} |
|||
}</lang> |
}</lang> |
||
Line 1,003: | Line 1,003: | ||
{{out}} |
{{out}} |
||
<lang sh>$ ./Web_scraping.jq |
<lang sh>$ ./Web_scraping.jq |
||
Apr. 21, 05:19:32 UTC |
Apr. 21, 05:19:32 UTC Universal Time</lang> |
||
=={{header|Julia}}== |
=={{header|Julia}}== |
||
Line 1,096: | Line 1,096: | ||
local(raw_htmlstring = '<TITLE>What time is it?</TITLE> |
local(raw_htmlstring = '<TITLE>What time is it?</TITLE> |
||
<H2> US Naval Observatory Master Clock Time</H2> <H3><PRE> |
<H2> US Naval Observatory Master Clock Time</H2> <H3><PRE> |
||
<BR>Jul. 27, 22:57:22 UTC |
<BR>Jul. 27, 22:57:22 UTC Universal Time |
||
<BR>Jul. 27, 06:57:22 PM EDT |
<BR>Jul. 27, 06:57:22 PM EDT Eastern Time |
||
<BR>Jul. 27, 05:57:22 PM CDT |
<BR>Jul. 27, 05:57:22 PM CDT Central Time |
||
<BR>Jul. 27, 04:57:22 PM MDT |
<BR>Jul. 27, 04:57:22 PM MDT Mountain Time |
||
<BR>Jul. 27, 03:57:22 PM PDT |
<BR>Jul. 27, 03:57:22 PM PDT Pacific Time |
||
<BR>Jul. 27, 02:57:22 PM AKDT |
<BR>Jul. 27, 02:57:22 PM AKDT Alaska Time |
||
<BR>Jul. 27, 12:57:22 PM HAST |
<BR>Jul. 27, 12:57:22 PM HAST Hawaii-Aleutian Time |
||
</PRE></H3> |
</PRE></H3> |
||
') |
') |
||
Line 1,111: | Line 1,111: | ||
local( |
local( |
||
reg_exp = regexp(-find = `<br>(.*?) UTC`, -input = #raw_htmlstring, -ignorecase), |
|||
datepart_txt = #reg_exp -> find ? #reg_exp -> matchstring(1) | string |
|||
) |
) |
||
Line 1,186: | Line 1,186: | ||
tok = s(ix(k-1)+4:ix(k)-1); |
tok = s(ix(k-1)+4:ix(k)-1); |
||
if findstr(tok,'UTC') |
if findstr(tok,'UTC') |
||
disp(tok); |
|||
end; |
end; |
||
end;</lang> |
end;</lang> |
||
Line 1,421: | Line 1,421: | ||
echo preg_replace( |
echo preg_replace( |
||
"/^.*<BR>(.*) UTC.*$/su", |
|||
"\\1", |
|||
file_get_contents('http://tycho.usno.navy.mil/cgi-bin/timer.pl') |
|||
); |
); |
||
</lang> |
</lang> |
||
Line 1,558: | Line 1,558: | ||
=={{header|REBOL}}== |
=={{header|REBOL}}== |
||
<lang REBOL>REBOL [ |
<lang REBOL>REBOL [ |
||
Title: "Web Scraping" |
|||
Author: oofoe |
|||
Date: 2009-12-07 |
|||
URL: http://rosettacode.org/wiki/Web_Scraping |
|||
] |
] |
||
Line 1,618: | Line 1,618: | ||
object WebTime extends Application { |
object WebTime extends Application { |
||
// Open the USNO time page as a line-oriented source.
val text = Source.fromURL("http://tycho.usno.navy.mil/cgi-bin/timer.pl")

// Find the first line mentioning UTC, then close the source whether or not
// the read succeeded (the original never closed it).
val utc = try text.getLines.find(_.contains("UTC")) finally text.close()

utc match {
  // Drop the leading 4-character "<BR>" tag before printing.
  case Some(s) => println(s.substring(4))
  case _ => println("error")
}
|||
} |
} |
||
</lang> |
</lang> |
||
Line 1,847: | Line 1,847: | ||
Debug.Print ReturnValue |
Debug.Print ReturnValue |
||
MsgBox (ReturnValue) |
MsgBox (ReturnValue) |
||
End Sub |
End Sub</lang> |
||
Line 1,856: | Line 1,856: | ||
Mar. 05, 00:57:37 UTC Universal Time |
Mar. 05, 00:57:37 UTC Universal Time |
||
</pre> |
</pre> |
||
=={{header|VBScript}}== |
|||
<lang vb>Function GetUTC()
    ' Downloads the US Naval Observatory time page and returns the first
    ' line containing "UTC", with markup removed by StripHttpTags.
    ' The return value is left unassigned (Empty) when no line matches.
    '
    ' VBScript has no typed declarations: the VBA-style "As String" return
    ' clause is a syntax error under Windows Script Host, so it is omitted.
    Dim Url, arrt, t   ' keep working variables local to the function

    Url = "http://tycho.usno.navy.mil/cgi-bin/timer.pl"

    With CreateObject("MSXML2.XMLHTTP.6.0")
        .Open "GET", Url, False   ' False = synchronous request
        .send
        arrt = Split(.responseText, vbLf)
    End With

    For Each t In arrt
        If InStr(t, "UTC") Then
            GetUTC = StripHttpTags(t)
            Exit For
        End If
    Next
End Function
|||
⚫ | |||
Function StripHttpTags(s)
    ' Returns s with every "<...>" tag removed; s is returned unchanged
    ' when it contains no tag.
    Dim tagMatcher
    Set tagMatcher = New RegExp
    tagMatcher.Global = True          ' replace all tags, not just the first
    tagMatcher.Pattern = "\<.+?\>"    ' non-greedy: one tag per match

    ' Default to the input as given; overwrite only when a tag is present.
    StripHttpTags = s
    If tagMatcher.Test(s) Then StripHttpTags = tagMatcher.Replace(s, "")
End Function
|||
' Print the scraped UTC line followed by a line break.
WScript.StdOut.WriteLine GetUTC</lang>
|||
{{Out}} |
|||
<pre> |
|||
Run getTime Subroutine |
|||
Apr. 21, 21:02:03 UTC Universal Time |
|||
</pre> |
|||
=={{header|Visual Basic .NET}}== |
=={{header|Visual Basic .NET}}== |
||
Line 1,894: | Line 1,935: | ||
c:=data.seek(Void,0); // start of line |
c:=data.seek(Void,0); // start of line |
||
line:=data[c,data.seek(Void,1)-c].text; |
line:=data[c,data.seek(Void,1)-c].text; |
||
line.print(); |
line.print(); // the HTML UTC line |
||
re:=RegExp(0'|.*(\d\d:\d\d:\d\d)|); // get time |
re:=RegExp(0'|.*(\d\d:\d\d:\d\d)|); // get time |
||
Line 1,901: | Line 1,942: | ||
{{out}} |
{{out}} |
||
<pre> |
<pre> |
||
<BR>Mar. 18, 06:18:31 UTC |
<BR>Mar. 18, 06:18:31 UTC Universal Time |
||
06:18:31 |
06:18:31 |
||
</pre> |
</pre> |