Web scraping: Difference between revisions

Content added Content deleted
(Undo revision 259816 by AykayayCiti (talk))
No edit summary
Line 26: Line 26:
<TITLE>What time is it?</TITLE>
<TITLE>What time is it?</TITLE>
<H2> US Naval Observatory Master Clock Time</H2> <H3><PRE>
<H2> US Naval Observatory Master Clock Time</H2> <H3><PRE>
<BR>Jul. 27, 22:57:22 UTC Universal Time
<BR>Jul. 27, 22:57:22 UTC Universal Time
<BR>Jul. 27, 06:57:22 PM EDT Eastern Time
<BR>Jul. 27, 06:57:22 PM EDT Eastern Time
<BR>Jul. 27, 05:57:22 PM CDT Central Time
<BR>Jul. 27, 05:57:22 PM CDT Central Time
<BR>Jul. 27, 04:57:22 PM MDT Mountain Time
<BR>Jul. 27, 04:57:22 PM MDT Mountain Time
<BR>Jul. 27, 03:57:22 PM PDT Pacific Time
<BR>Jul. 27, 03:57:22 PM PDT Pacific Time
<BR>Jul. 27, 02:57:22 PM AKDT Alaska Time
<BR>Jul. 27, 02:57:22 PM AKDT Alaska Time
<BR>Jul. 27, 12:57:22 PM HAST Hawaii-Aleutian Time
<BR>Jul. 27, 12:57:22 PM HAST Hawaii-Aleutian Time


...
...
Line 50: Line 50:
: get-time
: get-time
read-url
read-url
/<BR>.*?(\d{2}:\d{2}:\d{2})\sUTC/
/<BR>.*?(\d{2}:\d{2}:\d{2})\sUTC/
tuck r:match if
tuck r:match if
1 r:@ . cr
1 r:@ . cr
Line 187: Line 187:
<lang dos>
<lang dos>
when ScrapeButton.Click do
when ScrapeButton.Click do
set ScrapeWeb.Url to SourceTextBox.Text
set ScrapeWeb.Url to SourceTextBox.Text
call ScrapeWeb.Get
call ScrapeWeb.Get


when ScrapeWeb.GotText url,responseCode,responseType,responseContent do
when ScrapeWeb.GotText url,responseCode,responseType,responseContent do
initialize local Left to split at first text (text: get responseContent, at: PreTextBox.Text)
initialize local Left to split at first text (text: get responseContent, at: PreTextBox.Text)
initialize local Right to "" in
initialize local Right to "" in
set Right to select list item (list: get Left, index: 2)
set Right to select list item (list: get Left, index: 2)
set ResultLabel.Text to select list item (list: split at first (text:get Right, at: PostTextBox.Text), index: 1)
set ResultLabel.Text to select list item (list: split at first (text:get Right, at: PostTextBox.Text), index: 1)
</lang>
</lang>


Line 340: Line 340:
ClassMethod ExtractHTMLData(pHost As %String = "", pPath As %String = "", pRegEx As %String = "", Output list As %List) As %Status
ClassMethod ExtractHTMLData(pHost As %String = "", pPath As %String = "", pRegEx As %String = "", Output list As %List) As %Status
{
{
// implement error handling
// implement error handling
Try {
Try {


// some initialisation
// some initialisation
Set list="", sc=$$$OK
Set list="", sc=$$$OK
// check input parameters
// check input parameters
If $Match(pHost, "^([a-zA-Z0-9]([a-zA-Z0-9\-]{0,61}[a-zA-Z0-9])?\.)+[a-zA-Z]{2,6}$")=0 {
If $Match(pHost, "^([a-zA-Z0-9]([a-zA-Z0-9\-]{0,61}[a-zA-Z0-9])?\.)+[a-zA-Z]{2,6}$")=0 {
Set sc=$$$ERROR($$$GeneralError, "Invalid host name.")
Set sc=$$$ERROR($$$GeneralError, "Invalid host name.")
Quit
Quit
}
}
// create http request and get page
// create http request and get page
Set req=##class(%Net.HttpRequest).%New()
Set req=##class(%Net.HttpRequest).%New()
Set req.Server=pHost
Set req.Server=pHost
Do req.Get(pPath)
Do req.Get(pPath)
// check for success
// check for success
If $Extract(req.HttpResponse.StatusCode)'=2 {
If $Extract(req.HttpResponse.StatusCode)'=2 {
Set sc=$$$ERROR($$$GeneralError, "Page not loaded.")
Set sc=$$$ERROR($$$GeneralError, "Page not loaded.")
Quit
Quit
}
}
// read http response stream
// read http response stream
Set html=req.HttpResponse.Data
Set html=req.HttpResponse.Data
Set html.LineTerminator=$Char(10)
Set html.LineTerminator=$Char(10)
Set sc=html.Rewind()
Set sc=html.Rewind()
// read http response stream
// read http response stream
While 'html.AtEnd {
While 'html.AtEnd {
Set line=html.ReadLine(, .sc, .eol)
Set line=html.ReadLine(, .sc, .eol)
Set pos=$Locate(line, pRegEx)
Set pos=$Locate(line, pRegEx)
If pos {
If pos {
Set parse=$Piece($Extract(line, pos, *), $Char(9))
Set parse=$Piece($Extract(line, pos, *), $Char(9))
Set slot=$ListLength(list)+1
Set slot=$ListLength(list)+1
Set $List(list, slot)=parse
Set $List(list, slot)=parse
}
}
}
}
} Catch err {
} Catch err {
// an error has occurred
// an error has occurred
If err.Name="<REGULAR EXPRESSION>" {
If err.Name="<REGULAR EXPRESSION>" {
Set sc=$$$ERROR($$$GeneralError, "Invalid regular expression.")
Set sc=$$$ERROR($$$GeneralError, "Invalid regular expression.")
} Else {
} Else {
Set sc=$$$ERROR($$$CacheError, $ZError)
Set sc=$$$ERROR($$$CacheError, $ZError)
}
}
}
}
// return status
// return status
Quit sc
Quit sc
}
}


Line 658: Line 658:
begin
begin
{ The line we're looking for is something like this:
{ The line we're looking for is something like this:
<BR>May. 04. 21:55:19 UTC Universal Time }
<BR>May. 04. 21:55:19 UTC Universal Time }


// Check each line
// Check each line
Line 767: Line 767:


main() ->
main() ->
inets:start(),
inets:start(),
{ok, {_Status, _Header, HTML}} = httpc:request(?Url),
{ok, {_Status, _Header, HTML}} = httpc:request(?Url),
{match, [Time]} = re:run(HTML, ?Match, [{capture, all_but_first, binary}]),
{match, [Time]} = re:run(HTML, ?Match, [{capture, all_but_first, binary}]),
io:format("~s~n",[Time]).</lang>
io:format("~s~n",[Time]).</lang>


=={{header|F_Sharp|F#}}==
=={{header|F_Sharp|F#}}==
Line 966: Line 966:


public class WebTime{
public class WebTime{
public static void main(String[] args){
public static void main(String[] args){
try{
try{
URL address = new URL(
URL address = new URL(
"http://tycho.usno.navy.mil/cgi-bin/timer.pl");
"http://tycho.usno.navy.mil/cgi-bin/timer.pl");
URLConnection conn = address.openConnection();
URLConnection conn = address.openConnection();
BufferedReader in = new BufferedReader(
BufferedReader in = new BufferedReader(
new InputStreamReader(conn.getInputStream()));
new InputStreamReader(conn.getInputStream()));
String line;
String line;
while(!(line = in.readLine()).contains("UTC"));
while(!(line = in.readLine()).contains("UTC"));
System.out.println(line.substring(4));
System.out.println(line.substring(4));
}catch(IOException e){
}catch(IOException e){
System.err.println("error connecting to server.");
System.err.println("error connecting to server.");
e.printStackTrace();
e.printStackTrace();
}
}
}
}
}</lang>
}</lang>


Line 1,003: Line 1,003:
{{out}}
{{out}}
<lang sh>$ ./Web_scraping.jq
<lang sh>$ ./Web_scraping.jq
Apr. 21, 05:19:32 UTC Universal Time</lang>
Apr. 21, 05:19:32 UTC Universal Time</lang>


=={{header|Julia}}==
=={{header|Julia}}==
Line 1,096: Line 1,096:
local(raw_htmlstring = '<TITLE>What time is it?</TITLE>
local(raw_htmlstring = '<TITLE>What time is it?</TITLE>
<H2> US Naval Observatory Master Clock Time</H2> <H3><PRE>
<H2> US Naval Observatory Master Clock Time</H2> <H3><PRE>
<BR>Jul. 27, 22:57:22 UTC Universal Time
<BR>Jul. 27, 22:57:22 UTC Universal Time
<BR>Jul. 27, 06:57:22 PM EDT Eastern Time
<BR>Jul. 27, 06:57:22 PM EDT Eastern Time
<BR>Jul. 27, 05:57:22 PM CDT Central Time
<BR>Jul. 27, 05:57:22 PM CDT Central Time
<BR>Jul. 27, 04:57:22 PM MDT Mountain Time
<BR>Jul. 27, 04:57:22 PM MDT Mountain Time
<BR>Jul. 27, 03:57:22 PM PDT Pacific Time
<BR>Jul. 27, 03:57:22 PM PDT Pacific Time
<BR>Jul. 27, 02:57:22 PM AKDT Alaska Time
<BR>Jul. 27, 02:57:22 PM AKDT Alaska Time
<BR>Jul. 27, 12:57:22 PM HAST Hawaii-Aleutian Time
<BR>Jul. 27, 12:57:22 PM HAST Hawaii-Aleutian Time
</PRE></H3>
</PRE></H3>
')
')
Line 1,111: Line 1,111:


local(
local(
reg_exp = regexp(-find = `<br>(.*?) UTC`, -input = #raw_htmlstring, -ignorecase),
reg_exp = regexp(-find = `<br>(.*?) UTC`, -input = #raw_htmlstring, -ignorecase),
datepart_txt = #reg_exp -> find ? #reg_exp -> matchstring(1) | string
datepart_txt = #reg_exp -> find ? #reg_exp -> matchstring(1) | string
)
)


Line 1,186: Line 1,186:
tok = s(ix(k-1)+4:ix(k)-1);
tok = s(ix(k-1)+4:ix(k)-1);
if findstr(tok,'UTC')
if findstr(tok,'UTC')
disp(tok);
disp(tok);
end;
end;
end;</lang>
end;</lang>
Line 1,421: Line 1,421:


echo preg_replace(
echo preg_replace(
"/^.*<BR>(.*) UTC.*$/su",
"/^.*<BR>(.*) UTC.*$/su",
"\\1",
"\\1",
file_get_contents('http://tycho.usno.navy.mil/cgi-bin/timer.pl')
file_get_contents('http://tycho.usno.navy.mil/cgi-bin/timer.pl')
);
);
</lang>
</lang>
Line 1,558: Line 1,558:
=={{header|REBOL}}==
=={{header|REBOL}}==
<lang REBOL>REBOL [
<lang REBOL>REBOL [
Title: "Web Scraping"
Title: "Web Scraping"
Author: oofoe
Author: oofoe
Date: 2009-12-07
Date: 2009-12-07
URL: http://rosettacode.org/wiki/Web_Scraping
URL: http://rosettacode.org/wiki/Web_Scraping
]
]


Line 1,618: Line 1,618:


object WebTime extends Application {
object WebTime extends Application {
val text = Source.fromURL("http://tycho.usno.navy.mil/cgi-bin/timer.pl")
val text = Source.fromURL("http://tycho.usno.navy.mil/cgi-bin/timer.pl")
val utc = text.getLines.find(_.contains("UTC"))
val utc = text.getLines.find(_.contains("UTC"))
utc match {
utc match {
case Some(s) => println(s.substring(4))
case Some(s) => println(s.substring(4))
case _ => println("error")
case _ => println("error")
}
}
}
}
</lang>
</lang>
Line 1,847: Line 1,847:
Debug.Print ReturnValue
Debug.Print ReturnValue
MsgBox (ReturnValue)
MsgBox (ReturnValue)
End Sub
End Sub</lang>




Line 1,856: Line 1,856:
Mar. 05, 00:57:37 UTC Universal Time
Mar. 05, 00:57:37 UTC Universal Time
</pre>
</pre>

=={{header|VBScript}}==
<lang vb>' Fetches the US Naval Observatory time page and returns the line that
' contains "UTC" with its markup tags removed (via StripHttpTags).
' Returns Empty when no line of the response contains "UTC".
' Note: VBScript has no "As <type>" clause -- every value is a Variant --
' so the function header must not declare a return type.
Function GetUTC()
    Url = "http://tycho.usno.navy.mil/cgi-bin/timer.pl"
    With CreateObject("MSXML2.XMLHTTP.6.0")
        ' Third argument False = synchronous request: .send blocks until done.
        .Open "GET", Url, False
        .send
        ' Break the response body into lines on line-feed characters.
        arrt = Split(.responseText, vbLf)
    End With
    For Each t In arrt
        ' InStr returns 0 (False) when "UTC" is absent, a position otherwise.
        If InStr(t, "UTC") Then
            GetUTC = StripHttpTags(t)
            ' Only the first matching line is wanted.
            Exit For
        End If
    Next
End Function
' Returns s with every "<...>" tag removed. When s contains no tag at all,
' s is handed back untouched.
Function StripHttpTags(s)
    Dim tagRe
    Set tagRe = New RegExp
    tagRe.Global = True              ' replace every tag, not just the first
    tagRe.Pattern = "\<.+?\>"        ' lazy match: one tag at a time

    ' Guard clause: nothing to strip.
    If Not tagRe.Test(s) Then
        StripHttpTags = s
        Exit Function
    End If

    StripHttpTags = tagRe.Replace(s, "")
End Function

' Print the scraped UTC line, followed by a newline, on standard output.
WScript.StdOut.WriteLine GetUTC</lang>

{{Out}}
<pre>
Run getTime Subroutine
Apr. 21, 21:02:03 UTC Universal Time
</pre>




=={{header|Visual Basic .NET}}==
=={{header|Visual Basic .NET}}==
Line 1,894: Line 1,935:
c:=data.seek(Void,0); // start of line
c:=data.seek(Void,0); // start of line
line:=data[c,data.seek(Void,1)-c].text;
line:=data[c,data.seek(Void,1)-c].text;
line.print(); // the HTML UTC line
line.print(); // the HTML UTC line


re:=RegExp(0'|.*(\d\d:\d\d:\d\d)|); // get time
re:=RegExp(0'|.*(\d\d:\d\d:\d\d)|); // get time
Line 1,901: Line 1,942:
{{out}}
{{out}}
<pre>
<pre>
<BR>Mar. 18, 06:18:31 UTC Universal Time
<BR>Mar. 18, 06:18:31 UTC Universal Time
06:18:31
06:18:31
</pre>
</pre>