Web scraping: Difference between revisions
Content added Content deleted
Line 1,467: | Line 1,467: | ||
Note that the string between '<' and '>' refers to regex tokens, so to match a literal '<BR>' you need to quote it, while <ws> refers to the built-in token whitespace. |
Note that the string between '<' and '>' refers to regex tokens, so to match a literal '<BR>' you need to quote it, while <ws> refers to the built-in token whitespace. |
||
Also, whitespace is ignored by default in Perl 6 regexes. |
Also, whitespace is ignored by default in Perl 6 regexes. |
||
=={{header|Phix}}== |
|||
<lang Phix>-- demo\rosetta\web_scrape.exw |
|||
include builtins\libcurl.e |
|||
include builtins\timedate.e |
|||
-- Overall flow: download the page with libcurl, find the first line containing
-- "UTC", print the text that precedes it, then re-print it in another format.
-- Set up libcurl and create the handle used for this single request.
curl_global_init() |
|||
atom curl = curl_easy_init() |
|||
curl_easy_setopt(curl, CURLOPT_URL, "https://tycho.usno.navy.mil/cgi-bin/timer.pl") |
|||
-- res is declared as object because it is only treated as the page text when
-- it is a string; any other result falls through to the "some error" branch.
object res = curl_easy_perform_ex(curl) |
|||
if string(res) then |
|||
-- Break the response into lines so each one can be searched for "UTC".
res = split(res,'\n') |
|||
for i=1 to length(res) do |
|||
integer k = match("UTC",res[i]) |
|||
if k then |
|||
-- Keep characters 5 up to two before the "UTC" match.
-- NOTE(review): presumably this skips a 4-character prefix such as an HTML
-- tag and the space before "UTC" -- confirm against the live page.
res = res[i][5..k-2] |
|||
-- Only the first matching line is used.
exit |
|||
end if |
|||
end for |
|||
-- Print whatever was extracted (still the list of lines if nothing matched).
?res |
|||
-- res is a string only when a "UTC" line was found and sliced above.
if string(res) then |
|||
-- Parse using a pattern shaped like the sample output ("Apr. 26, 12:24:11").
timedate td = parse_date_string(res, {"Mmm. d, hh:mm:ss"}) |
|||
?format_timedate(td,"h:mpm Mmmm ddth") |
|||
end if |
|||
else |
|||
?"some error" |
|||
end if |
|||
-- Release the handle, then the library-wide libcurl state.
curl_easy_cleanup(curl) |
|||
curl_global_cleanup()</lang> |
|||
{{out}} |
|||
<pre> |
|||
"Apr. 26, 12:24:11" |
|||
"12:24pm April 26th" |
|||
</pre> |
|||
Note that because the webpage does not include a year, td[DT_YEAR] will be 0 (you could of course just set it from date()[DT_YEAR]), and hence the weekday will also be wrong. |
|||
=={{header|PHP}}== |
=={{header|PHP}}== |