Web scraping: Difference between revisions
Content added Content deleted
mNo edit summary |
m (→{{header|Wren}}: Minor changes (including code to skip the site notice) and rerun.) |
||
Line 2,265: | Line 2,265: | ||
This is an embedded program, so we can ask the C host to download the page for us. This task's talk page is used for this purpose because the original URL no longer works. |
This is an embedded program, so we can ask the C host to download the page for us. This task's talk page is used for this purpose because the original URL no longer works. |
||
The code is based in part on the C example; however, as we don't have regex, we use our Pattern module to identify the first occurrence of a UTC date/time. |
The code is based in part on the C example; however, as we don't have regex, we use our Pattern module to identify the first occurrence of a UTC date/time after the site notice. |
||
<syntaxhighlight lang="ecmascript">/* |
<syntaxhighlight lang="ecmascript">/* Web_scraping.wren */ |
||
import "./pattern" for Pattern |
import "./pattern" for Pattern |
||
Line 2,275: | Line 2,275: | ||
var CURLOPT_WRITEDATA = 10001 |
var CURLOPT_WRITEDATA = 10001 |
||
var BUFSIZE = 16384 |
var BUFSIZE = 16384 * 4 |
||
foreign class Buffer { |
foreign class Buffer { |
||
Line 2,306: | Line 2,306: | ||
var html = buffer.value |
var html = buffer.value |
||
var ix = html.indexOf("(UTC)") |
var ix = html.indexOf("(UTC)") |
||
ix = html.indexOf("(UTC)", ix + 1) // skip the site notice |
|||
if (ix == -1) { |
if (ix == -1) { |
||
System.print("UTC time not found.") |
System.print("UTC time not found.") |
||
Line 2,315: | Line 2,316: | ||
<br> |
<br> |
||
We now embed this in the following C program, compile and run it. |
We now embed this in the following C program, compile and run it. |
||
<syntaxhighlight lang="c">/* gcc |
<syntaxhighlight lang="c">/* gcc Web_scraping.c -o Web_scraping -lcurl -lwren -lm */ |
||
#include <stdio.h> |
#include <stdio.h> |
||
Line 2,471: | Line 2,472: | ||
WrenVM* vm = wrenNewVM(&config); |
WrenVM* vm = wrenNewVM(&config); |
||
const char* module = "main"; |
const char* module = "main"; |
||
const char* fileName = " |
const char* fileName = "Web_scraping.wren"; |
||
char *script = readFile(fileName); |
char *script = readFile(fileName); |
||
WrenInterpretResult result = wrenInterpret(vm, module, script); |
WrenInterpretResult result = wrenInterpret(vm, module, script); |
||
Line 2,491: | Line 2,492: | ||
{{out}} |
{{out}} |
||
<pre> |
<pre> |
||
20: |
20:53, 20 August 2008 |
||
</pre> |
</pre> |
||