Web scraping: Difference between revisions

m
→‎{{header|Phix}}: syntax coloured, marked p2js incompatible, updated output
m (→‎{{header|Phix}}: syntax coloured, marked p2js incompatible, updated output)
Line 1,543:
=={{header|Phix}}==
{{libheader|Phix/libcurl}}
<!--<lang Phix>(notonline)-- demo\rosetta\web_scrape.exw>
<span style="color: #000080;font-style:italic;">--
include builtins\libcurl.e
-- demo\rosetta\web_scrape.exw
include builtins\timedate.e
-- ===========================
 
--</span>
--object res = curl_easy_perform_ex("https://tycho.usno.navy.mil/cgi-bin/timer.pl")
<span style="color: #008080;">without</span> <span style="color: #008080;">js</span> <span style="color: #000080;font-style:italic;">-- (libcurl)</span>
object res = curl_easy_perform_ex("https://rosettacode.org/wiki/Talk:Web_scraping")
<span style="color: #008080;">include</span> <span style="color: #000000;">builtins</span><span style="color: #0000FF;">\</span><span style="color: #000000;">libcurl</span><span style="color: #0000FF;">.</span><span style="color: #000000;">e</span>
if string(res) then
<span style="color: #008080;">include</span> <span style="color: #000000;">builtins</span><span style="color: #0000FF;">\</span><span style="color: #004080;">timedate</span><span style="color: #0000FF;">.</span><span style="color: #000000;">e</span>
res = split(res,'\n')
for i=1 to length(res) do
<span style="color: #004080;">object</span> <span style="color: #000000;">res</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">curl_easy_perform_ex</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"https://rosettacode.org/wiki/Talk:Web_scraping"</span><span style="color: #0000FF;">)</span>
integer k = match("UTC",res[i])
<span style="color: #008080;">if</span> <span style="color: #004080;">string</span><span style="color: #0000FF;">(</span><span style="color: #000000;">res</span><span style="color: #0000FF;">)</span> <span style="color: #008080;">then</span>
if k then
<span style="color: #000000;">res</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">split</span><span style="color: #0000FF;">(</span><span style="color: #000000;">res</span><span style="color: #0000FF;">,</span><span style="color: #008000;">'\n'</span><span style="color: #0000FF;">)</span>
-- res = res[i][5..k-2]
<span style="color: #008080;">for</span> <span style="color: #000000;">i</span><span style="color: #0000FF;">=</span><span style="color: #000000;">1</span> <span style="color: #008080;">to</span> <span style="color: #7060A8;">length</span><span style="color: #0000FF;">(</span><span style="color: #000000;">res</span><span style="color: #0000FF;">)</span> <span style="color: #008080;">do</span>
res = res[i][1..k-3]
<span style="color: #004080;">integer</span> <span style="color: #000000;">k</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">match</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"UTC"</span><span style="color: #0000FF;">,</span><span style="color: #000000;">res</span><span style="color: #0000FF;">[</span><span style="color: #000000;">i</span><span style="color: #0000FF;">])</span>
k = match("</a> ",res)
<span style="color: #008080;">if</span> <span style="color: #000000;">k</span> <span style="color: #008080;">then</span>
res = res[k+5..$]
<span style="color: #004080;">string</span> <span style="color: #000000;">line</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">res</span><span style="color: #0000FF;">[</span><span style="color: #000000;">i</span><span style="color: #0000FF;">]</span> <span style="color: #000080;font-style:italic;">-- (debug aid)</span>
exit
<span style="color: #000000;">res</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">line</span><span style="color: #0000FF;">[</span><span style="color: #000000;">1</span><span style="color: #0000FF;">..</span><span style="color: #000000;">k</span><span style="color: #0000FF;">-</span><span style="color: #000000;">3</span><span style="color: #0000FF;">]</span>
end if
<span style="color: #000000;">k</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">rmatch</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"&lt;/a&gt;"</span><span style="color: #0000FF;">,</span><span style="color: #000000;">res</span><span style="color: #0000FF;">)</span>
end for
<span style="color: #000000;">res</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">trim</span><span style="color: #0000FF;">(</span><span style="color: #000000;">res</span><span style="color: #0000FF;">[</span><span style="color: #000000;">k</span><span style="color: #0000FF;">+</span><span style="color: #000000;">5</span><span style="color: #0000FF;">..$])</span>
?res
<span style="color: #008080;">exit</span>
if string(res) then
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
-- timedate td = parse_date_string(res, {"Mmm. d, hh:mm:ss"})
<span style="color: #008080;">end</span> <span style="color: #008080;">for</span>
timedate td = parse_date_string(res, {"hh:mm, d Mmmm yyyy"})
<span style="color: #0000FF;">?</span><span style="color: #000000;">res</span>
-- td[DT_YEAR] = date()[DT_YEAR]
<span style="color: #008080;">if</span> <span style="color: #004080;">string</span><span style="color: #0000FF;">(</span><span style="color: #000000;">res</span><span style="color: #0000FF;">)</span> <span style="color: #008080;">then</span>
-- ?format_timedate(td,"h:mpm Dddd ddth Mmmm")
<span style="color: #004080;">timedate</span> <span style="color: #000000;">td</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">parse_date_string</span><span style="color: #0000FF;">(</span><span style="color: #000000;">res</span><span style="color: #0000FF;">,</span> <span style="color: #0000FF;">{</span><span style="color: #008000;">"hh:mm, d Mmmm yyyy"</span><span style="color: #0000FF;">})</span>
?format_timedate(td,"Dddd Mmmm ddth yyyy h:mpm")
<span style="color: #0000FF;">?</span><span style="color: #7060A8;">format_timedate</span><span style="color: #0000FF;">(</span><span style="color: #000000;">td</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"Dddd Mmmm ddth yyyy h:mpm"</span><span style="color: #0000FF;">)</span>
-- ?format_timedate(date(),"h:mpm Dddd ddth Mmmm")
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
end if
<span style="color: #008080;">else</span>
else
<span style="color: #0000FF;">?{</span><span style="color: #008000;">"some error"</span><span style="color: #0000FF;">,</span><span style="color: #000000;">res</span><span style="color: #0000FF;">,</span><span style="color: #7060A8;">curl_easy_strerror</span><span style="color: #0000FF;">(</span><span style="color: #000000;">res</span><span style="color: #0000FF;">)}</span>
?{"some error",res,curl_easy_strerror(res)}
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
end if<!--</lang>-->
{{out}}
(From the talk page, and right now every page on rc)
(back when that tycho page worked)
<pre>
"20:59, 30 May 2020"
"Apr. 26, 12:38:18"
"Saturday May 30th 2020 8:59pm"
"12:38pm Friday April 26th"
"1:38pm Friday April 26th"
</pre>
The last line differs because it is British Summer Time here.<br>
Note that since that webpage has no year, td[DT_YEAR] will be 0, and without setting it as shown the weekday would also be wrong.
{{out}}
(From the talk page)
<pre>
"20:53, 20 August 2008"
"Wednesday August 20th 2008 8:53pm"
</pre>
 
7,805

edits