Web scraping: Difference between revisions

Line 26:

<H2> US Naval Observatory Master Clock Time</H2> <H3><PRE>

<BR>Jul. 27, 22:57:22 UTC Universal Time

<BR>Jul. 27, 06:57:22 PM EDT Eastern Time

<BR>Jul. 27, 05:57:22 PM CDT Central Time

<BR>Jul. 27, 04:57:22 PM MDT Mountain Time

<BR>Jul. 27, 03:57:22 PM PDT Pacific Time

<BR>Jul. 27, 02:57:22 PM AKDT Alaska Time

<BR>Jul. 27, 12:57:22 PM HAST Hawaii-Aleutian Time

...

Line 50:

: get-time

read-url

/<BR>.*?(\d{2}:\d{2}:\d{2})\sUTC/

tuck r:match if

1 r:@ . cr

Line 187:

when ScrapeButton.Click do

set ScrapeWeb.Url to SourceTextBox.Text

call ScrapeWeb.Get

when ScrapeWeb.GotText url,responseCode,responseType,responseContent do

initialize local Left to split at first text (text: get responseContent, at: PreTextBox.Text)

initialize local Right to "" in

set Right to select list item (list: get Left, index: 2)

set ResultLabel.Text to select list item (list: split at first (text:get Right, at: PostTextBox.Text), index: 1)

</lang>

Line 340:

/// Requests pPath from pHost over HTTP and scans the response line by line.
/// For every line in which pRegEx matches, the text from the match position
/// up to (not including) the first tab character is appended to the output list.
/// pHost  - host name; must pass the host-name pattern checked below.
/// pPath  - path passed to the HTTP GET request.
/// pRegEx - regular expression handed to $Locate for each response line.
/// list   - output: the collected matches, reset to "" at the start.
/// Returns a %Status: an error status for an invalid host name, a non-2xx
/// response, an invalid regular expression, or any other trapped error.
/// NOTE(review): this excerpt does not show the closing braces of the While
/// loop and the Try block - confirm the structure against the full source.
ClassMethod ExtractHTMLData(pHost As %String = "", pPath As %String = "", pRegEx As %String = "", Output list As %List) As %Status

{

// implement error handling

Try {

// some initialisation

Set list="", sc=$$$OK

⚫

// check input parameters: pHost must be dot-separated labels followed by a 2-6 letter suffix

If $Match(pHost, "^([a-zA-Z0-9]([a-zA-Z0-9\-]{0,61}[a-zA-Z0-9])?\.)+[a-zA-Z]{2,6}$")=0 {

Set sc=$$$ERROR($$$GeneralError, "Invalid host name.")

Quit

}

// create http request and get page

Set req=##class(%Net.HttpRequest).%New()

Set req.Server=pHost

Do req.Get(pPath)

// check for success: any status code whose first character is not 2 is treated as a failure

If $Extract(req.HttpResponse.StatusCode)'=2 {

Set sc=$$$ERROR($$$GeneralError, "Page not loaded.")

Quit

}

// prepare the http response stream: split lines on LF and start from the beginning

Set html=req.HttpResponse.Data

Set html.LineTerminator=$Char(10)

Set sc=html.Rewind()

// scan the http response stream line by line for pRegEx matches

While 'html.AtEnd {

// sc and eol are passed by reference to ReadLine

Set line=html.ReadLine(, .sc, .eol)

// pos is the value returned by $Locate; it is non-zero when pRegEx matched in this line

Set pos=$Locate(line, pRegEx)

If pos {

// keep the text from the match position to the end of the line, cut at the first tab

Set parse=$Piece($Extract(line, pos, *), $Char(9))

// append the extracted text as a new element at the end of list

Set slot=$ListLength(list)+1

Set $List(list, slot)=parse

}

} Catch err {

// an error has occurred

// a bad pRegEx gets its own message; anything else is reported via $ZError

If err.Name="<REGULAR EXPRESSION>" {

Set sc=$$$ERROR($$$GeneralError, "Invalid regular expression.")

} Else {

Set sc=$$$ERROR($$$CacheError, $ZError)

}

// return status

Quit sc

}

Line 658:

begin

{ The line we're looking for is something like this:

<BR>May. 04. 21:55:19 UTC Universal Time }

// Check each line

Line 767:

%% Entry point: starts inets, requests ?Url, and prints the single group
%% that ?Match captures from the response body.
main() ->
    inets:start(),
    {ok, {_, _, Body}} = httpc:request(?Url),
    %% Return only the captured group (not the whole match), as a binary.
    CaptureOpts = [{capture, all_but_first, binary}],
    {match, [Stamp]} = re:run(Body, ?Match, CaptureOpts),
    io:format("~s~n",[Stamp]).</lang>

=={{header|F_Sharp|F#}}==

Line 966:

public class WebTime{

/**
 * Downloads the USNO time page and prints the first line that contains
 * "UTC", with its first four characters (the "&lt;BR&gt;" prefix seen in the
 * sample page) removed.
 *
 * If the response ends before any line contains "UTC", a message is written
 * to standard error instead of failing with a NullPointerException.
 * I/O failures are reported on standard error together with a stack trace.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args){
    try{
        URL address = new URL(
                "http://tycho.usno.navy.mil/cgi-bin/timer.pl");
        URLConnection conn = address.openConnection();
        BufferedReader in = new BufferedReader(
                new InputStreamReader(conn.getInputStream()));
        try{
            String line;
            // readLine() returns null at end of stream; stop there rather
            // than calling contains() on null when no UTC line is present.
            while((line = in.readLine()) != null && !line.contains("UTC"));
            if(line == null){
                System.err.println("no UTC line found in response.");
            }else{
                System.out.println(line.substring(4));
            }
        }finally{
            // Always release the reader and the underlying connection stream.
            in.close();
        }
    }catch(IOException e){
        System.err.println("error connecting to server.");
        e.printStackTrace();
    }
}</lang>

Line 1,003:

<lang sh>$ ./Web_scraping.jq

Apr. 21, 05:19:32 UTC Universal Time</lang>

=={{header|Julia}}==

Line 1,096:

local(raw_htmlstring = '<TITLE>What time is it?</TITLE>

<H2> US Naval Observatory Master Clock Time</H2> <H3><PRE>

<BR>Jul. 27, 22:57:22 UTC Universal Time

<BR>Jul. 27, 06:57:22 PM EDT Eastern Time

<BR>Jul. 27, 05:57:22 PM CDT Central Time

<BR>Jul. 27, 04:57:22 PM MDT Mountain Time

<BR>Jul. 27, 03:57:22 PM PDT Pacific Time

<BR>Jul. 27, 02:57:22 PM AKDT Alaska Time

<BR>Jul. 27, 12:57:22 PM HAST Hawaii-Aleutian Time

</PRE></H3>

')

Line 1,111:

local(

reg_exp = regexp(-find = `<br>(.*?) UTC`, -input = #raw_htmlstring, -ignorecase),

datepart_txt = #reg_exp -> find ? #reg_exp -> matchstring(1) | string

)

Line 1,186:

tok = s(ix(k-1)+4:ix(k)-1);

if findstr(tok,'UTC')

disp(tok);

end;

end;</lang>

Line 1,421:

// Download the time page, then print the result of replacing the whole
// document with the text captured between a "<BR>" and " UTC".
$page = file_get_contents('http://tycho.usno.navy.mil/cgi-bin/timer.pl');
echo preg_replace("/^.*<BR>(.*) UTC.*$/su", "\\1", $page);

</lang>

Line 1,558:

=={{header|REBOL}}==

<lang REBOL>REBOL [

Title: "Web Scraping"

Author: oofoe

Date: 2009-12-07

URL: http://rosettacode.org/wiki/Web_Scraping

]

Line 1,618:

object WebTime extends Application {

val text = Source.fromURL("http://tycho.usno.navy.mil/cgi-bin/timer.pl")

val utc = text.getLines.find(_.contains("UTC"))

utc match {

case Some(s) => println(s.substring(4))

case _ => println("error")

}

</lang>

Line 1,847:

Debug.Print ReturnValue

MsgBox (ReturnValue)

End Sub

End Sub</lang>

Line 1,856:

Mar. 05, 00:57:37 UTC Universal Time

</pre>

=={{header|VBScript}}==

<lang vb>Function GetUTC() As String

Url = "http://tycho.usno.navy.mil/cgi-bin/timer.pl"

With CreateObject("MSXML2.XMLHTTP.6.0")

.Open "GET", Url, False

.send

arrt = Split(.responseText, vbLf)

End With

For Each t In arrt

If InStr(t, "UTC") Then

GetUTC = StripHttpTags(t)

Exit For

End If

⚫

' Returns s with every "<...>" tag removed.
' When s contains no tag, s itself is returned unchanged.
Function StripHttpTags(s)
    Dim tagPattern
    Set tagPattern = New RegExp
    tagPattern.Global = True          ' replace all tags, not just the first
    tagPattern.Pattern = "\<.+?\>"    ' shortest run of characters between < and >

    ' Nothing to strip: hand the input back as is.
    If Not tagPattern.Test(s) Then
        StripHttpTags = s
        Exit Function
    End If

    StripHttpTags = tagPattern.Replace(s, "")
End Function

WScript.StdOut.Write GetUTC

WScript.StdOut.WriteLine</lang>

<pre>

Run getTime Subroutine

Apr. 21, 21:02:03 UTC Universal Time

</pre>

=={{header|Visual Basic .NET}}==

Line 1,894:

Line 1,935:

c:=data.seek(Void,0); // start of line

line:=data[c,data.seek(Void,1)-c].text;

line.print(); // the HTML UTC line

re:=RegExp(0'|.*(\d\d:\d\d:\d\d)|); // get time

Line 1,901:

Line 1,942:

<pre>

<BR>Mar. 18, 06:18:31 UTC Universal Time

06:18:31

</pre>