WiktionaryDumps to words: Difference between revisions
Content added Content deleted
m (write to a file not to the terminal) |
m (simplify state logic) |
||
Line 312: | Line 312: | ||
=={{header|Julia}}== |
=={{header|Julia}}== |
||
Uses Regex and state |
Uses Regex and a state variable instead of XML parsing. With the default settings, it outputs the first 80 French words found. |
||
<lang julia>using CodecBzip2 |
<lang julia>using CodecBzip2 |
||
function getwords(io::IO, output::IO, languagemark = "==French==", maxwords = 80) |
function getwords(io::IO, output::IO, languagemark = "==French==", maxwords = 80) |
||
title, txopen, txclose = "<title>", "<text", "</text>" |
title, txopen, txclose = "<title>", "<text", "</text>" |
||
got_text_last = false |
|||
wordcount, titleword = 0, "" |
wordcount, titleword = 0, "" |
||
for line in eachline(io) |
for line in eachline(io) |
||
if occursin(title, line) |
if occursin(title, line) |
||
got_text_last = false |
|||
titleword = (m = match(r"<title>([^<]+)</title>", line)) != nothing ? m[1] : "" |
titleword = (m = match(r"<title>([^<]+)</title>", line)) != nothing ? m[1] : "" |
||
elseif occursin(txopen, line) |
elseif occursin(txopen, line) |
||
got_text_last = true |
|||
elseif occursin(languagemark, line) |
elseif occursin(languagemark, line) |
||
if got_text_last && titleword != "" |
if got_text_last && titleword != "" |
||
Line 330: | Line 330: | ||
(wordcount += 1) >= maxwords && break |
(wordcount += 1) >= maxwords && break |
||
end |
end |
||
got_text_last = false |
|||
elseif occursin(txclose, line) |
elseif occursin(txclose, line) |
||
got_text_last = false |
|||
end |
end |
||
end |
end |