WiktionaryDumps to words: Difference between revisions
(Added Wren) |
Thundergnat (talk | contribs) m (syntax highlighting fixup automation) |
||
Line 9: | Line 9: | ||
=={{header|C}}== |
=={{header|C}}== |
||
< |
<syntaxhighlight lang="c">#include <stdio.h> |
||
#include <stdlib.h> |
#include <stdlib.h> |
||
#include <stdbool.h> |
#include <stdbool.h> |
||
Line 205: | Line 205: | ||
return 0; |
return 0; |
||
}</ |
}</syntaxhighlight> |
||
{{out}} |
{{out}} |
||
Line 229: | Line 229: | ||
=={{header|Java}}== |
=={{header|Java}}== |
||
< |
<syntaxhighlight lang="java">import org.xml.sax.*; |
||
import org.xml.sax.helpers.DefaultHandler; |
import org.xml.sax.helpers.DefaultHandler; |
||
import org.xml.sax.SAXException; |
import org.xml.sax.SAXException; |
||
Line 288: | Line 288: | ||
} |
} |
||
} |
} |
||
}</ |
}</syntaxhighlight> |
||
{{out}} |
{{out}} |
||
Line 311: | Line 311: | ||
=={{header|Julia}}== |
=={{header|Julia}}== |
||
Uses Regex and a state variable instead of XML parsing. Default setting prints the first 80 French words found. |
Uses Regex and a state variable instead of XML parsing. Default setting prints the first 80 French words found. |
||
< |
<syntaxhighlight lang="julia">using CodecBzip2 |
||
function getwords(io::IO, output::IO; languagemark = "==French==", maxwords = 80) |
function getwords(io::IO, output::IO; languagemark = "==French==", maxwords = 80) |
||
Line 341: | Line 341: | ||
getwords(stream, stdout) # or open a file to write to and use its IO handle instead of stdout |
getwords(stream, stdout) # or open a file to write to and use its IO handle instead of stdout |
||
</ |
</syntaxhighlight>{{out}} |
||
<pre> |
<pre> |
||
gratis |
gratis |
||
Line 430: | Line 430: | ||
Using the library [http://erratique.ch/software/xmlm xmlm]: |
Using the library [http://erratique.ch/software/xmlm xmlm]: |
||
< |
<syntaxhighlight lang="ocaml">let () = |
||
let i = Xmlm.make_input ~strip:true (`Channel stdin) in |
let i = Xmlm.make_input ~strip:true (`Channel stdin) in |
||
let title = ref "" in |
let title = ref "" in |
||
Line 463: | Line 463: | ||
then print_endline !title |
then print_endline !title |
||
end |
end |
||
done</ |
done</syntaxhighlight> |
||
{{out}} |
{{out}} |
||
Line 485: | Line 485: | ||
=={{header|Perl}}== |
=={{header|Perl}}== |
||
{{trans|Raku}} |
{{trans|Raku}} |
||
< |
<syntaxhighlight lang="perl"># 20211214 Perl programming solution |
||
use strict; |
use strict; |
||
Line 529: | Line 529: | ||
} |
} |
||
} |
} |
||
)</ |
)</syntaxhighlight> |
||
{{out}} |
{{out}} |
||
<pre> |
<pre> |
||
Line 542: | Line 542: | ||
Does not rely on wget/bzcat etc. Downloads in 16K or so blocks, unpacks one block at a time in memory, terminates properly when 5 or more words are found.<br> |
Does not rely on wget/bzcat etc. Downloads in 16K or so blocks, unpacks one block at a time in memory, terminates properly when 5 or more words are found.<br> |
||
Tested on Windows, should be fine on Linux as long as you can provide a suitable bz2.so |
Tested on Windows, should be fine on Linux as long as you can provide a suitable bz2.so |
||
<!--< |
<!--<syntaxhighlight lang="phix">(notonline)--> |
||
<span style="color: #008080;">constant</span> <span style="color: #000000;">url</span> <span style="color: #0000FF;">=</span> <span style="color: #008000;">"https://dumps.wikimedia.org/enwiktionary/latest/enwiktionary-latest-pages-articles.xml.bz2"</span> |
<span style="color: #008080;">constant</span> <span style="color: #000000;">url</span> <span style="color: #0000FF;">=</span> <span style="color: #008000;">"https://dumps.wikimedia.org/enwiktionary/latest/enwiktionary-latest-pages-articles.xml.bz2"</span> |
||
Line 631: | Line 631: | ||
<span style="color: #7060A8;">curl_easy_cleanup</span><span style="color: #0000FF;">(</span><span style="color: #000000;">curl</span><span style="color: #0000FF;">)</span> |
<span style="color: #7060A8;">curl_easy_cleanup</span><span style="color: #0000FF;">(</span><span style="color: #000000;">curl</span><span style="color: #0000FF;">)</span> |
||
<span style="color: #7060A8;">printf</span><span style="color: #0000FF;">(</span><span style="color: #000000;">1</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"Total downloaded: %s\n"</span><span style="color: #0000FF;">,{</span><span style="color: #000000;">file_size_k</span><span style="color: #0000FF;">(</span><span style="color: #000000;">tbr</span><span style="color: #0000FF;">)})</span> |
<span style="color: #7060A8;">printf</span><span style="color: #0000FF;">(</span><span style="color: #000000;">1</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"Total downloaded: %s\n"</span><span style="color: #0000FF;">,{</span><span style="color: #000000;">file_size_k</span><span style="color: #0000FF;">(</span><span style="color: #000000;">tbr</span><span style="color: #0000FF;">)})</span> |
||
<!--</ |
<!--</syntaxhighlight>--> |
||
{{out}} |
{{out}} |
||
<pre> |
<pre> |
||
Line 644: | Line 644: | ||
=={{header|Raku}}== |
=={{header|Raku}}== |
||
I misunderstood the data format and now just copy verbatim from Julia entry the processing logics .. |
I misunderstood the data format and now just copy verbatim from Julia entry the processing logics .. |
||
<lang |
<syntaxhighlight lang="raku" line># 20211209 Raku programming solution |
||
use LWP::Simple; |
use LWP::Simple; |
||
Line 710: | Line 710: | ||
my $ua = CustomLWP.new: URL => $URL ; |
my $ua = CustomLWP.new: URL => $URL ; |
||
$ua.CustomRequest>>.say</ |
$ua.CustomRequest>>.say</syntaxhighlight> |
||
{{out}} |
{{out}} |
||
<pre> |
<pre> |
||
Line 728: | Line 728: | ||
Rather than downloading the full 800MB .bz2 file and then decompressing it, we abort the download after receiving no more than the first 512 KB and then decompress that ignoring the resultant BZ_UNEXPECTED_EOF error. This turns out to be enough to find the first 26 French words. |
Rather than downloading the full 800MB .bz2 file and then decompressing it, we abort the download after receiving no more than the first 512 KB and then decompress that ignoring the resultant BZ_UNEXPECTED_EOF error. This turns out to be enough to find the first 26 French words. |
||
< |
<syntaxhighlight lang="ecmascript">/* wiktionary_dumps_to_words.wren */ |
||
import "./pattern" for Pattern |
import "./pattern" for Pattern |
||
Line 789: | Line 789: | ||
gotTextLast = false |
gotTextLast = false |
||
} |
} |
||
}</ |
}</syntaxhighlight> |
||
<br> |
<br> |
||
We now embed this script in the following C program, build and run. |
We now embed this script in the following C program, build and run. |
||
< |
<syntaxhighlight lang="c">/* gcc wiktionary_dumps_to_words.c -o wiktionary_dumps_to_words -lcurl -lbz2 -lwren -lm */ |
||
#include <stdio.h> |
#include <stdio.h> |
||
Line 1,004: | Line 1,004: | ||
free(script); |
free(script); |
||
return 0; |
return 0; |
||
}</ |
}</syntaxhighlight> |
||
{{out}} |
{{out}} |
Revision as of 20:43, 28 August 2022
- Task
Make a file that can be useful with spell checkers like Ispell and Aspell.
Use the wiktionary dump (input) to create a file equivalent to "/usr/share/dict/spanish" (output). The input file is an XML dump of the Wiktionary, a bz2'ed file of about 800MB. The output file should be similar to "/usr/share/dict/spanish": a simple text file containing one word of a given language per line. An example of such a file is available in Ubuntu with the package wspanish.
C
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <string.h>
#include <unistd.h>
#include <expat.h>
#include <pcre.h>
/* printf format modifiers matching how Expat was built (64- vs 32-bit
   line numbers, wide vs narrow XML_Char strings). */
#ifdef XML_LARGE_SIZE
# define XML_FMT_INT_MOD "ll"
#else
# define XML_FMT_INT_MOD "l"
#endif
#ifdef XML_UNICODE_WCHAR_T
# define XML_FMT_STR "ls"
#else
# define XML_FMT_STR "s"
#endif
void reset_char_data_buffer();
void process_char_data_buffer();
/* Which element the parser saw open last; set by start_element and
   consumed when the buffered character data is processed. */
static bool last_tag_is_title;
static bool last_tag_is_text;
/* Compiled "==French==" regex and its pcre_study() data (set in main). */
static pcre *reCompiled;
static pcre_extra *pcreExtra;
/* Expat start-tag handler: flush any pending character data, then note
   whether the newly opened element is a <title> or <text> node. */
void start_element(void *data, const char *element, const char **attribute) {
    process_char_data_buffer();
    reset_char_data_buffer();
    if (!strcmp(element, "title")) last_tag_is_title = true;
    if (!strcmp(element, "text"))  last_tag_is_text  = true;
}
/* Expat end-tag handler: the element's text is now complete, so act on
   the accumulated buffer and start fresh. */
void end_element(void *data, const char *el) {
    (void)data;
    (void)el;
    process_char_data_buffer();
    reset_char_data_buffer();
}
#define TITLE_BUF_SIZE (1024 * 8)
/* Accumulates the character data of the current element; Expat may
   deliver one node's text across several char_data() callbacks. */
static char char_data_buffer[1024 * 64 * 8];
/* Copy of the most recent <title> contents (the candidate word). */
static char title_buffer[TITLE_BUF_SIZE];
static size_t offs;     /* bytes used so far in char_data_buffer */
static bool overflow;   /* true once the current node no longer fits */
/* Begin accumulating a fresh element's character data. */
void reset_char_data_buffer(void) {
    offs = 0;
    overflow = false;
}
/* Expat character-data handler: append this chunk to char_data_buffer,
   warning (once per node) when the text is too large for the buffer. */
void char_data(void *userData, const XML_Char *s, int len) {
    if (overflow) return;
    if (offs + (size_t)len >= sizeof(char_data_buffer)) {
        overflow = true;
        fprintf(stderr, "Warning: buffer overflow\n");
        fflush(stderr);
        return;
    }
    memcpy(char_data_buffer + offs, s, len);
    offs += (size_t)len;
}
void try_match();
/* Act on the buffered character data: keep a NUL-terminated copy of
   <title> text for later printing, and scan <text> bodies for the
   language marker via try_match(). */
void process_char_data_buffer(void) {
    if (offs > 0) {
        char_data_buffer[offs] = '\0';
        if (last_tag_is_title) {
            /* Truncate to the title buffer but ALWAYS NUL-terminate:
               the original copied TITLE_BUF_SIZE bytes with no
               terminator when a title was oversized, leaving
               title_buffer unterminated for the later puts(). */
            size_t n = (offs < TITLE_BUF_SIZE) ? offs : TITLE_BUF_SIZE - 1;
            memcpy(title_buffer, char_data_buffer, n);
            title_buffer[n] = '\0';
            last_tag_is_title = false;
        }
        if (last_tag_is_text) {
            try_match();
            last_tag_is_text = false;
        }
    }
}
/* Run the compiled "==French==" regex over the buffered <text> body and,
   on a match, print the remembered page title (the word) to stdout. */
void try_match()
{
    int subStrVec[80];              /* pcre_exec capture-offset workspace */
    int subStrVecLen;
    int pcreExecRet;
    subStrVecLen = sizeof(subStrVec) / sizeof(int);
    pcreExecRet = pcre_exec(
        reCompiled, pcreExtra,
        char_data_buffer, strlen(char_data_buffer),
        0, 0,
        subStrVec, subStrVecLen);
    if (pcreExecRet < 0) {
        /* Negative return: either a plain no-match (silent) or a real
           PCRE error worth reporting on stderr. */
        switch (pcreExecRet) {
        case PCRE_ERROR_NOMATCH : break;
        case PCRE_ERROR_NULL : fprintf(stderr, "Something was null\n"); break;
        case PCRE_ERROR_BADOPTION : fprintf(stderr, "A bad option was passed\n"); break;
        case PCRE_ERROR_BADMAGIC : fprintf(stderr, "Magic number bad (compiled re corrupt?)\n"); break;
        case PCRE_ERROR_UNKNOWN_NODE : fprintf(stderr, "Something kooky in the compiled re\n"); break;
        case PCRE_ERROR_NOMEMORY : fprintf(stderr, "Ran out of memory\n"); break;
        default : fprintf(stderr, "Unknown error\n"); break;
        }
    } else {
        puts(title_buffer); // print the word
    }
}
#define BUF_SIZE 1024
/* Read the (already bzcat-decompressed) XML dump from stdin, stream it
   through Expat, and print every page title whose <text> body contains
   the ==French== language marker.  Returns 0 on success, exits 1 on
   regex-compile, read, or XML-parse failure.
   (Unused locals `n` and `aLineToMatch` from the original removed.) */
int main(int argc, char *argv[])
{
    char buffer[BUF_SIZE];
    const char *pcreErrorStr;
    int pcreErrorOffset;
    char *aStrRegex;
    // Using PCRE
    aStrRegex = "(.*)(==French==)(.*)"; // search for French language
    reCompiled = pcre_compile(aStrRegex, PCRE_DOTALL | PCRE_UTF8, &pcreErrorStr, &pcreErrorOffset, NULL);
    if (reCompiled == NULL) {
        fprintf(stderr, "ERROR: Could not compile regex '%s': %s\n", aStrRegex, pcreErrorStr);
        exit(1);
    }
    /* pcre_study may legitimately return NULL; only pcreErrorStr
       signals an actual failure. */
    pcreExtra = pcre_study(reCompiled, 0, &pcreErrorStr);
    if (pcreErrorStr != NULL) {
        fprintf(stderr, "ERROR: Could not study regex '%s': %s\n", aStrRegex, pcreErrorStr);
        exit(1);
    }
    // Using Expat parser
    XML_Parser parser = XML_ParserCreate(NULL);
    XML_SetElementHandler(parser, start_element, end_element);
    XML_SetCharacterDataHandler(parser, char_data);
    reset_char_data_buffer();
    while (1) {
        int done;
        int len;
        len = (int)fread(buffer, 1, BUF_SIZE, stdin);
        if (ferror(stdin)) {
            fprintf(stderr, "Read error\n");
            exit(1);
        }
        done = feof(stdin);
        if (XML_Parse(parser, buffer, len, done) == XML_STATUS_ERROR) {
            fprintf(stderr,
                "Parse error at line %" XML_FMT_INT_MOD "u:\n%" XML_FMT_STR "\n",
                XML_GetCurrentLineNumber(parser),
                XML_ErrorString(XML_GetErrorCode(parser)));
            exit(1);
        }
        if (done) break;
    }
    XML_ParserFree(parser);
    pcre_free(reCompiled);
    if (pcreExtra != NULL) {
#ifdef PCRE_CONFIG_JIT
        pcre_free_study(pcreExtra);
#else
        pcre_free(pcreExtra);
#endif
    }
    return 0;
}
- Output:
$ gcc wikt_to_words.c -o wikt_to_words -lpcre -lexpat $ wget --quiet https://dumps.wikimedia.org/enwiktionary/latest/enwiktionary-latest-pages-articles.xml.bz2 -O - | bzcat | \ ./wikt_to_words gratis gratuit livre chien pond pies pie A connotation minute ...
Java
import org.xml.sax.*;
import org.xml.sax.helpers.DefaultHandler;
import org.xml.sax.SAXException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import javax.xml.parsers.ParserConfigurationException;
import java.util.regex.Pattern;
import java.util.regex.Matcher;
/**
 * SAX handler that remembers the last &lt;title&gt; seen and prints it
 * whenever the following &lt;text&gt; body contains "==French==".
 * NOTE(review): SAX may split one text node across several characters()
 * callbacks, so a marker straddling a chunk boundary could be missed —
 * same limitation as the original.
 */
class MyHandler extends DefaultHandler {
    private static final String TITLE = "title";
    private static final String TEXT = "text";
    // Compiled once instead of on every characters() callback
    // (the original recompiled the regex for each chunk of data).
    private static final Pattern FRENCH =
        Pattern.compile(".*==French==.*", Pattern.DOTALL);
    private String lastTag = "";   // qName of the most recently opened element
    private String title = "";     // contents of the most recent <title>

    @Override
    public void characters(char[] ch, int start, int length) throws SAXException {
        switch (lastTag) {
            case TITLE:
                title = new String(ch, start, length);
                break;
            case TEXT:
                String text = new String(ch, start, length);
                if (FRENCH.matcher(text).matches()) {
                    System.out.println(title);  // the word belongs to a French entry
                }
                break;
        }
    }

    @Override
    public void startElement(String uri, String localName, String qName, Attributes attrs) throws SAXException {
        lastTag = qName;
    }

    @Override
    public void endElement(String uri, String localName, String qName) throws SAXException {
        lastTag = "";
    }
}
/** Entry point: SAX-parse the XML dump arriving on standard input. */
public class WiktoWords {
    public static void main(java.lang.String[] args) {
        try {
            SAXParserFactory spFactory = SAXParserFactory.newInstance();
            SAXParser saxParser = spFactory.newSAXParser();
            MyHandler handler = new MyHandler();
            saxParser.parse(new InputSource(System.in), handler);
        } catch (Exception e) {
            // Report the failure instead of exiting silently
            // (the original swallowed the exception entirely).
            System.err.println("Parse failed: " + e);
            System.exit(1);
        }
    }
}
- Output:
$ javac WiktoWords.java $ wget --quiet https://dumps.wikimedia.org/enwiktionary/latest/enwiktionary-latest-pages-articles.xml.bz2 -O - | bzcat | \ java WiktoWords gratis gratuit livre chien pond pies pie A connotation minute ...
Julia
Uses Regex and a state variable instead of XML parsing. Default setting prints the first 80 French words found.
using CodecBzip2
# Scan the decompressed dump line-by-line with a tiny state machine:
# remember each <title>, and emit it to `output` when a `languagemark`
# line follows a <text> opening tag.  Stops after `maxwords` words.
function getwords(io::IO, output::IO; languagemark = "==French==", maxwords = 80)
    titletag, textopen, textclose = "<title>", "<text", "</text>"
    inside_text = false        # did we last see a <text> opening tag?
    nfound = 0
    word = ""
    for ln in eachline(io)
        if occursin(titletag, ln)
            inside_text = false
            m = match(r"<title>([^<]+)</title>", ln)
            word = m === nothing ? "" : m[1]
        elseif occursin(textopen, ln)
            inside_text = true
        elseif occursin(languagemark, ln)
            if inside_text && !isempty(word)
                println(output, word)
                nfound += 1
                nfound >= maxwords && break
            end
            inside_text = false
        elseif occursin(textclose, ln)
            inside_text = false
        end
    end
end
# Dump URL and local cache file; download only if the cache is empty/absent.
const url = "https://dumps.wikimedia.org/enwiktionary/latest/enwiktionary-latest-pages-articles.xml.bz2"
const urlfile = "wikidump.bz2"
stat(urlfile).size == 0 && download(url, urlfile)
# Decompress lazily while scanning — the full dump is never expanded on disk.
const stream = Bzip2DecompressorStream(open(urlfile))
getwords(stream, stdout) # or open a file to write to and use its IO handle instead of stdout
- Output:
gratis gratuit livre chien pond pies pie A connotation minute trade adjective adjectival substantive patronage merchandise eagle fa fable a- abaca abada abalone abandon abattoir abaxial abbatial abdication abdicative abdomen abdominal abdominales abduction aberrance aberrant aberration abhorrent abhorrer abime abject abjection abjuration abjure abjurer ablactation ablation ablative able abluent ablution abolition abominable abomination abord abortive about abracadabra abrase abrasion abrasive abraxas abreuvoir abrogation abrogative abrupt on abscission abscond absconder quiz nu lente été servant robot y absent absenter absolution absorbable
OCaml
Using the library xmlm:
(* Stream-parse the wiktionary XML dump from stdin with xmlm, printing the
   <title> of every page whose <text> contains the ==French== marker. *)
let () =
  let i = Xmlm.make_input ~strip:true (`Channel stdin) in
  let title = ref "" in
  (* Stack of currently-open element names, innermost first. *)
  let tag_path = ref [] in
  let push_tag tag =
    tag_path := tag :: !tag_path
  in
  let pop_tag () =
    match !tag_path with [] -> ()
    | _ :: tl -> tag_path := tl
  in
  (* True when [tag] is the innermost open element. *)
  let last_tag_is tag =
    match !tag_path with [] -> false
    | hd :: _ -> hd = tag
  in
  let reg = Str.regexp_string "==French==" in
  (* Plain substring search: does [s] contain the French section marker? *)
  let matches s =
    try let _ = Str.search_forward reg s 0 in true
    with Not_found -> false
  in
  while not (Xmlm.eoi i) do
    match Xmlm.input i with
    | `Dtd dtd -> ()
    | `El_start ((uri, tag_name), attrs) -> push_tag tag_name
    | `El_end -> pop_tag ()
    | `Data s ->
      if last_tag_is "title"
      then title := s;               (* remember the most recent page title *)
      if last_tag_is "text"
      then begin
        if matches s
        then print_endline !title    (* title belongs to a French entry *)
      end
  done
- Output:
wget --quiet https://dumps.wikimedia.org/enwiktionary/latest/enwiktionary-latest-pages-articles.xml.bz2 -O - | bzcat | \ ocaml str.cma -I $(ocamlfind query xmlm) xmlm.cma to_words.ml gratis gratuit livre chien pond pies pie A connotation minute ...
Perl
# 20211214 Perl programming solution
# Stream the dump over HTTP, bunzipping each chunk and scanning complete
# lines with the same <title>/<text>/==French== state machine as the
# Julia entry.  Prints the first $Target words found, then exits.
use strict;
use warnings;
use LWP::UserAgent;
use Compress::Raw::Bzip2 ;
my $LanguageMark = "==French==";
my $Target = 5; # words
my $URL = 'https://dumps.wikimedia.org/enwiktionary/latest/enwiktionary-latest-pages-articles.xml.bz2';
my %needles; my $plain = my $tail = '';
my $ua = LWP::UserAgent->new;
my $bz = new Compress::Raw::Bunzip2({ -Bufsize => 1, -AppendOutput => 0 });
my $res = $ua->request( HTTP::Request->new(GET => $URL),
    sub { # @_ = Data Chunk, HTTP::Response
        foreach (split '', $_[0]) {
            my $status = $bz->bzinflate($_, substr($plain, 0)) ;
            last if $status == BZ_STREAM_END or $status != BZ_OK ;
        }
        if ( scalar ( my @haystacks = split "\n", $plain)) {
            # First line continues the previous chunk's partial line; the
            # last line may itself be partial, so carry it over as $tail.
            $haystacks[0] = $tail . $haystacks[0];
            $tail = $haystacks[$#haystacks];
            # Parenthesized list assignment: the original "= '', 0" bound
            # only '' (precedence of = over ,), leaving $got_text_last undef.
            my ($title, $got_text_last) = ('', 0);
            foreach ( @haystacks[0..$#haystacks-1] ) {
                if ( /<title>(\w+?)<\/title>/ ) {
                    ($title, $got_text_last) = ($1, 0);
                } elsif ( /<text/ ) {
                    $got_text_last = 1;
                } elsif ( /$LanguageMark/ ) {
                    # Count only when the marker follows a <text> tag and a
                    # real title is in hand.  The original's "$title.defined"
                    # was string concatenation, not a definedness test, and
                    # never filtered empty titles.
                    $needles{$title}++ if $got_text_last and defined $title and $title ne '';
                    if ( keys %needles >= $Target ) {
                        print "$_\n" for sort keys %needles;
                        exit;
                    }
                    $got_text_last = 0;
                } elsif ( /<\/text>/ ) { $got_text_last = 0 }
            }
        }
    }
)
- Output:
chien gratis gratuit livre pond
Phix
Does not rely on wget/bzcat etc. Downloads in 16K or so blocks, unpacks one block at a time in memory, terminates properly when 5 or more words are found.
Tested on Windows, should be fine on Linux as long as you can provide a suitable bz2.so
-- Download the wiktionary dump with libcurl, decompressing bz2 blocks on
-- the fly and stopping once five French words have been printed.
constant url = "https://dumps.wikimedia.org/enwiktionary/latest/enwiktionary-latest-pages-articles.xml.bz2"
include builtins/libcurl.e
include builtins/bzstream.e

bool got_text_last = false
integer wordcount = 0
string titleword = ""

-- Process one decompressed line; returns false to stop the download.
function doline(object line)
    if not string(line) then
        -- (opt: close output file)
        return false
    end if
    integer k = match("<title>", line)
    if k then
        got_text_last = false
        k += length("<title>")
        integer l = match("</title>", line, k)
        titleword = iff(l?line[k..l-1]:"")
    elsif match("<text", line) then
        got_text_last = true
    elsif match("==French==", line) then
        if got_text_last and titleword != "" then
            printf(1,"%s\n", titleword)
            wordcount += 1
            if wordcount >= 5 then
                -- (opt: close output file)
                return false
            end if
        end if
        got_text_last = false
    elsif match("</text>", line) then
        got_text_last = false
    end if
    return true
end function

atom tbr = 0 -- Total Bytes Written
string demiline = ""         -- partial line carried between blocks
constant BLOCKSIZE = 8192
atom outbuf = allocate(BLOCKSIZE)

-- libcurl write callback: feed each compressed chunk through the bz2
-- stream, splitting the inflated output into lines for doline().
function write_callback(atom pData, integer size, integer nmemb, atom pUserdata)
    integer bytes_written = size*nmemb
    tbr += bytes_written
    set_struct_field(id_bzs,p_bzs,"next_in",pData)
    set_struct_field(id_bzs,p_bzs,"avail_in",bytes_written)
    set_struct_field(id_bzs,p_bzs,"next_out",outbuf)
    set_struct_field(id_bzs,p_bzs,"avail_out",BLOCKSIZE)
    while true do
        integer res = BZ2_bzDecompress(),
                avail_in = get_struct_field(id_bzs,p_bzs,"avail_in"),
                avail_out = get_struct_field(id_bzs,p_bzs,"avail_out")
        if avail_out<BLOCKSIZE then
            string block = demiline & peek({outbuf,BLOCKSIZE-avail_out})
            integer linestart = 1
            for i=1 to length(block) do
                if block[i]='\n' then
                    if not doline(block[linestart..i-1]) then
                        BZ2_bzDecompressEnd()
                        return 0 -- terminate download
                    end if
                    linestart = i+1
                end if
            end for
            demiline = block[linestart..$]
            set_struct_field(id_bzs,p_bzs,"next_out",outbuf)
            set_struct_field(id_bzs,p_bzs,"avail_out",BLOCKSIZE)
        end if
        if res=BZ_STREAM_END then
            BZ2_bzDecompressEnd()
            return 0
        end if
        if res!=BZ_OK then ?9/0 end if
        if avail_in=0 then exit end if
    end while
    return bytes_written
end function

constant write_cb = call_back({'+',routine_id("write_callback")})
atom curl = curl_easy_init()
curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_cb)
curl_easy_setopt(curl, CURLOPT_URL, url)
BZ2_bzDecompressInit()
integer res = curl_easy_perform(curl)
curl_easy_cleanup(curl)
printf(1,"Total downloaded: %s\n",{file_size_k(tbr)})
- Output:
gratis gratuit livre chien pond Total downloaded: 239.67KB
Raku
I misunderstood the data format at first, so the processing logic is now copied verbatim from the Julia entry.
# 20211209 Raku programming solution
use LWP::Simple;
use Compress::Bzip2;
use IO::Socket::SSL;

my $LanguageMark = "==French==";
my $Target = 5; # words
my $URL = 'https://dumps.wikimedia.org/enwiktionary/latest/enwiktionary-latest-pages-articles.xml.bz2';

# Subclass LWP::Simple so the HTTPS GET can be driven manually and the
# socket read loop stopped as soon as $Target words are collected.
class CustomLWP is LWP::Simple { has $.URL ;
    method CustomRequest {
        my Blob $resp = Buf.new;
        my $bzip = Compress::Bzip2::Stream.new;
        my ( $tail, %needles ) = '';
        my ($host, $port, $path) = self.parse_url($.URL)[1..3];
        my $sock = IO::Socket::SSL.new: :$host, :$port;
        $sock.print( "GET {$path} HTTP/1.1\r\n" ~ self.stringify_headers( {
            'Connection' => 'close',
            'User-Agent' => "LWP::Simple/{LWP::Simple::<$VERSION>} " ~
                            "Raku/{$*RAKU.compiler.gist}",
            'Host' => $host
        } ) ~ "\r\n" ) or die ; # request string
        # Read until the response headers are complete.
        while !self.got-header($resp) { $resp ~= $sock.read(2048) }
        # Supply of compressed chunks: first the body bytes already read
        # with the headers, then further socket reads until EOF or enough
        # words have been found.
        my $bzip-stream = supply {
            emit self.parse_response($resp)[2]; # $resp_content @ parent class
            loop {
                done if %needles.elems >= $Target ;
                ( my $chunk = $sock.read(4096) ) ?? emit $chunk !! done
            }
        }
        react {
            whenever $bzip-stream -> $crypt {
                # Decompress the chunk and scan complete lines, carrying the
                # trailing partial line over to the next chunk (same state
                # machine as the Julia entry).
                my $plain = ( [~] $bzip.decompress: $crypt ).decode('utf8-c8');
                my @haystacks = $plain.split: "\n";
                @haystacks[0] = $tail ~ @haystacks[0];
                $tail = @haystacks[*-1];
                my ($title,$got_text_last) = '', False ;
                for @haystacks[0..*-2] {
                    if / '<title>' (\w+?) '</title>' / {
                        ($title,$got_text_last) = $0, False;
                    } elsif / '<text' / {
                        $got_text_last = True
                    } elsif / $LanguageMark / {
                        %needles{$title}++ if ( $got_text_last and $title.Bool );
                        last if ( %needles.elems >= $Target ) ;
                        $got_text_last = False;
                    } elsif / '</text>' / { $got_text_last = False }
                }
            }
        }
        return %needles.keys[^$Target]
    }
}

my $ua = CustomLWP.new: URL => $URL ;
$ua.CustomRequest>>.say
- Output:
chien gratuit gratis pond livre
Wren
An embedded program so we can use libcurl and libbzip2.
Rather than downloading the full 800MB .bz2 file and then decompressing it, we abort the download after receiving no more than the first 512 KB and then decompress that ignoring the resultant BZ_UNEXPECTED_EOF error. This turns out to be enough to find the first 26 French words.
/* wiktionary_dumps_to_words.wren */

import "./pattern" for Pattern

// libcurl option ids (the subset used here).
var CURLOPT_URL = 10002
var CURLOPT_FOLLOWLOCATION = 52
var CURLOPT_WRITEFUNCTION = 20011
var CURLOPT_WRITEDATA = 10001

// Download buffer backed by the embedding C program's MemoryStruct.
foreign class Buffer {
    construct new() {} // C will allocate buffer of a suitable size
    foreign value // returns buffer contents as a string after decompression
}

// Thin wrapper over a libcurl easy handle, implemented in C.
foreign class Curl {
    construct easyInit() {}
    foreign easySetOpt(opt, param)
    foreign easyPerform()
    foreign easyCleanup()
}

var curl = Curl.easyInit()

// Fetch url into a Buffer (the C write callback truncates the download)
// and return its decompressed contents as a string.
var getContent = Fn.new { |url|
    var buffer = Buffer.new()
    curl.easySetOpt(CURLOPT_URL, url)
    curl.easySetOpt(CURLOPT_FOLLOWLOCATION, 1)
    curl.easySetOpt(CURLOPT_WRITEFUNCTION, 0) // write function to be supplied by C
    curl.easySetOpt(CURLOPT_WRITEDATA, buffer)
    curl.easyPerform()
    return buffer.value
}

var url = "https://dumps.wikimedia.org/enwiktionary/latest/enwiktionary-latest-pages-articles.xml.bz2"
var content = getContent.call(url)
curl.easyCleanup()

// Same line-oriented state machine as the other entries: remember the
// last <title>, print it when ==French== follows a <text> opening tag.
var lines = content.split("\n")
var title = "<title>"
var txtOpen = "<text"
var txtClose = "</text>"
var langMark = "==French=="
var gotTextLast = false
var titleWord = ""
var p = Pattern.new("<title>[+1^<]<//title>") // captures the text between the title tags
for (line in lines) {
    if (line.indexOf(title) >= 0) {
        gotTextLast = false
        var m = p.find(line)
        titleWord = m ? m.capsText[0] : ""
    } else if (line.indexOf(txtOpen) >= 0) {
        gotTextLast = true
    } else if (line.indexOf(langMark) >= 0) {
        if (gotTextLast && titleWord != "") System.print(titleWord)
        gotTextLast = false
    } else if (line.indexOf(txtClose) >= 0) {
        gotTextLast = false
    }
}
We now embed this script in the following C program, build and run.
/* gcc wiktionary_dumps_to_words.c -o wiktionary_dumps_to_words -lcurl -lbz2 -lwren -lm */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <curl/curl.h>
#include <bzlib.h>
#include "wren.h"
/* Growable byte buffer accumulating the (compressed) HTTP body. */
struct MemoryStruct {
    char *memory;
    size_t size;
};
/* Stop downloading once this many bytes have arrived; the truncated bz2
   stream is still decompressable up to that point. */
const size_t LIMIT = 512 * 1024;
/* C <=> Wren interface functions */
/* libcurl write callback: append the chunk to the MemoryStruct in userp.
   Returning a value different from realsize makes curl abort the
   transfer — how the LIMIT is enforced.  NOTE(review): -1 converts to a
   huge size_t, which is (in practice) never equal to realsize. */
static size_t WriteMemoryCallback(void *contents, size_t size, size_t nmemb, void *userp) {
    size_t realsize = size * nmemb;
    struct MemoryStruct *mem = (struct MemoryStruct *)userp;
    size_t size_needed = mem->size + realsize + 1;
    if (size_needed > LIMIT) return -1; // abort download
    char *ptr = realloc(mem->memory, size_needed);
    if(!ptr) {
        /* out of memory! */
        printf("not enough memory (realloc returned NULL)\n");
        return 0;
    }
    mem->memory = ptr;
    memcpy(&(mem->memory[mem->size]), contents, realsize);
    mem->size += realsize;
    mem->memory[mem->size] = 0;   /* keep the buffer NUL-terminated */
    return realsize;
}
/* Wren allocator for Buffer: the foreign slot holds a MemoryStruct. */
void C_bufferAllocate(WrenVM* vm) {
    struct MemoryStruct *ms = (struct MemoryStruct *)wrenSetSlotNewForeign(vm, 0, 0, sizeof(struct MemoryStruct));
    ms->memory = malloc(1);
    ms->size = 0;
}

/* Wren finalizer for Buffer: release the heap block. */
void C_bufferFinalize(void* data) {
    struct MemoryStruct *ms = (struct MemoryStruct *)data;
    free(ms->memory);
}
/* Wren allocator for Curl: the foreign slot holds a CURL* handle. */
void C_curlAllocate(WrenVM* vm) {
    CURL** pcurl = (CURL**)wrenSetSlotNewForeign(vm, 0, 0, sizeof(CURL*));
    *pcurl = curl_easy_init();
}
/* Buffer.value getter: bunzip the downloaded bytes and hand the result
   to Wren as a string.  BZ_UNEXPECTED_EOF is expected (the download was
   truncated at LIMIT) and deliberately ignored; any other bz2 error is
   reported. */
void C_value(WrenVM* vm) {
    struct MemoryStruct *ms = (struct MemoryStruct *)wrenGetSlotForeign(vm, 0);
    /* decompress string before returning to Wren */
    unsigned int destLen = ms->size * 5; // should be more than enough
    char *dest = malloc((size_t)destLen + 1);   /* +1 for the NUL below */
    if(!dest) {
        printf("not enough memory (malloc returned NULL)\n");
        return;
    }
    int ret = BZ2_bzBuffToBuffDecompress(dest, &destLen, ms->memory, ms->size, 0, 0);
    /* should get a 'compressed data ends unexpectedly' error here which we ignore
       but report any other error */
    if (ret != BZ_UNEXPECTED_EOF && ret != BZ_OK) printf("error number %d occurred", ret);
    /* bzip2 does not NUL-terminate its output, but wrenSetSlotString
       needs a C string — the original read past the end of the data. */
    dest[destLen] = '\0';
    char *ptr = realloc(ms->memory, (size_t)destLen + 1);
    if(!ptr) {
        /* out of memory! */
        printf("not enough memory (realloc returned NULL)\n");
        free(dest);   /* the original leaked dest on this path */
        return;
    }
    ms->memory = ptr;
    memcpy(ms->memory, dest, (size_t)destLen + 1);
    ms->size = destLen;
    wrenSetSlotString(vm, 0, ms->memory);
    free(dest);
}
/* Curl.easyPerform(): run the configured transfer synchronously. */
void C_easyPerform(WrenVM* vm) {
    CURL* curl = *(CURL**)wrenGetSlotForeign(vm, 0);
    curl_easy_perform(curl);
}

/* Curl.easyCleanup(): release the curl handle. */
void C_easyCleanup(WrenVM* vm) {
    CURL* curl = *(CURL**)wrenGetSlotForeign(vm, 0);
    curl_easy_cleanup(curl);
}
/* Curl.easySetOpt(opt, param): dispatch on curl's option-id ranges —
   ids < 10000 take a long, 1xxxx take an object pointer, 2xxxx take a
   function pointer.  Only CURLOPT_WRITEFUNCTION is supported in the
   last range: Wren passes a dummy value and C substitutes its own
   WriteMemoryCallback. */
void C_easySetOpt(WrenVM* vm) {
    CURL* curl = *(CURL**)wrenGetSlotForeign(vm, 0);
    CURLoption opt = (CURLoption)wrenGetSlotDouble(vm, 1);
    if (opt < 10000) {
        long lparam = (long)wrenGetSlotDouble(vm, 2);
        curl_easy_setopt(curl, opt, lparam);
    } else if (opt < 20000) {
        if (opt == CURLOPT_WRITEDATA) {
            struct MemoryStruct *ms = (struct MemoryStruct *)wrenGetSlotForeign(vm, 2);
            curl_easy_setopt(curl, opt, (void *)ms);
        } else if (opt == CURLOPT_URL) {
            const char *url = wrenGetSlotString(vm, 2);
            curl_easy_setopt(curl, opt, url);
        }
    } else if (opt < 30000) {
        if (opt == CURLOPT_WRITEFUNCTION) {
            curl_easy_setopt(curl, opt, &WriteMemoryCallback);
        }
    }
}
/* Map Wren foreign class names in module "main" to their C allocators;
   only Buffer needs a finalizer.  Unknown classes return NULL slots. */
WrenForeignClassMethods bindForeignClass(WrenVM* vm, const char* module, const char* className) {
    WrenForeignClassMethods methods;
    methods.allocate = NULL;
    methods.finalize = NULL;
    if (strcmp(module, "main") == 0) {
        if (strcmp(className, "Buffer") == 0) {
            methods.allocate = C_bufferAllocate;
            methods.finalize = C_bufferFinalize;
        } else if (strcmp(className, "Curl") == 0) {
            methods.allocate = C_curlAllocate;
        }
    }
    return methods;
}
/* Map Wren foreign method signatures in module "main" to the C
   implementations above; NULL means "not found". */
WrenForeignMethodFn bindForeignMethod(
    WrenVM* vm,
    const char* module,
    const char* className,
    bool isStatic,
    const char* signature) {
    if (strcmp(module, "main") == 0) {
        if (strcmp(className, "Buffer") == 0) {
            if (!isStatic && strcmp(signature, "value") == 0) return C_value;
        } else if (strcmp(className, "Curl") == 0) {
            if (!isStatic && strcmp(signature, "easySetOpt(_,_)") == 0) return C_easySetOpt;
            if (!isStatic && strcmp(signature, "easyPerform()") == 0) return C_easyPerform;
            if (!isStatic && strcmp(signature, "easyCleanup()") == 0) return C_easyCleanup;
        }
    }
    return NULL;
}
/* Wren's System.print output goes straight to stdout, unmodified. */
static void writeFn(WrenVM* vm, const char* text) {
    (void)vm;
    fputs(text, stdout);
}
/* Wren error reporter: distinguishes compile errors, stack-trace frames
   and runtime errors when printing diagnostics. */
void errorFn(WrenVM* vm, WrenErrorType errorType, const char* module, const int line, const char* msg) {
    switch (errorType) {
        case WREN_ERROR_COMPILE:
            printf("[%s line %d] [Error] %s\n", module, line, msg);
            break;
        case WREN_ERROR_STACK_TRACE:
            printf("[%s line %d] in %s\n", module, line, msg);
            break;
        case WREN_ERROR_RUNTIME:
            printf("[Runtime Error] %s\n", msg);
            break;
    }
}
/* Read an entire file into a freshly malloc'd, NUL-terminated buffer.
   Returns NULL (with a message on stderr) if the file cannot be opened
   or no memory is available — the original dereferenced a NULL FILE*
   when the file was missing and ignored fread's result.  Caller frees. */
char *readFile(const char *fileName) {
    FILE *f = fopen(fileName, "r");
    if (!f) {
        fprintf(stderr, "could not open %s\n", fileName);
        return NULL;
    }
    fseek(f, 0, SEEK_END);
    long fsize = ftell(f);
    rewind(f);
    char *script = (char *)malloc(fsize + 1);
    if (!script) {
        fclose(f);
        return NULL;
    }
    size_t nread = fread(script, 1, fsize, f);
    fclose(f);
    script[nread] = 0;   /* terminate at what was actually read */
    return script;
}
/* Free a loaded module's source once Wren has finished compiling it. */
static void loadModuleComplete(WrenVM* vm, const char* module, WrenLoadModuleResult result) {
    if( result.source) free((void*)result.source);
}

/* Resolve `import "name"` to the file "name.wren" in the current
   directory; the built-in "random" and "meta" modules are left to Wren. */
WrenLoadModuleResult loadModule(WrenVM* vm, const char* name) {
    WrenLoadModuleResult result = {0};
    if (strcmp(name, "random") != 0 && strcmp(name, "meta") != 0) {
        result.onComplete = loadModuleComplete;
        char fullName[strlen(name) + 6];   /* name + ".wren" + NUL */
        strcpy(fullName, name);
        strcat(fullName, ".wren");
        result.source = readFile(fullName);
    }
    return result;
}
/* Boot a Wren VM wired to the foreign classes above, then run the
   wiktionary_dumps_to_words.wren script which does the actual work. */
int main(int argc, char **argv) {
    WrenConfiguration config;
    wrenInitConfiguration(&config);
    config.writeFn = &writeFn;
    config.errorFn = &errorFn;
    config.bindForeignClassFn = &bindForeignClass;
    config.bindForeignMethodFn = &bindForeignMethod;
    config.loadModuleFn = &loadModule;
    WrenVM* vm = wrenNewVM(&config);
    const char* module = "main";
    const char* fileName = "wiktionary_dumps_to_words.wren";
    char *script = readFile(fileName);
    WrenInterpretResult result = wrenInterpret(vm, module, script);
    switch (result) {
        case WREN_RESULT_COMPILE_ERROR:
            printf("Compile Error!\n");
            break;
        case WREN_RESULT_RUNTIME_ERROR:
            printf("Runtime Error!\n");
            break;
        case WREN_RESULT_SUCCESS:
            break;
    }
    wrenFreeVM(vm);
    free(script);
    return 0;
}
- Output:
gratis gratuit livre chien pond pies pie A connotation minute trade adjective adjectival substantive patronage deal merchandise eagle f fa fable a- abaca abada abalone abandon