WiktionaryDumps to words: Difference between revisions

Content added Content deleted
m (write to a file not to the terminal)
m (simplify state logic)
Line 312: Line 312:


=={{header|Julia}}==
=={{header|Julia}}==
Uses Regex and state variables instead of XML parsing. Default setting prints the first 80 French words found.
Uses a regular expression and a single state variable instead of XML parsing. With the default settings, it prints the first 80 French words found.
<lang julia>using CodecBzip2
<lang julia>using CodecBzip2


function getwords(io::IO, output::IO, languagemark = "==French==", maxwords = 80)
function getwords(io::IO, output::IO, languagemark = "==French==", maxwords = 80)
title, txopen, txclose = "<title>", "<text", "</text>"
title, txopen, txclose = "<title>", "<text", "</text>"
got_title_last, got_text_last = false, false
got_text_last = false
wordcount, titleword = 0, ""
wordcount, titleword = 0, ""
for line in eachline(io)
for line in eachline(io)
if occursin(title, line)
if occursin(title, line)
got_title_last, got_text_last = true, false
got_text_last = false
titleword = (m = match(r"<title>([^<]+)</title>", line)) != nothing ? m[1] : ""
titleword = (m = match(r"<title>([^<]+)</title>", line)) != nothing ? m[1] : ""
elseif occursin(txopen, line)
elseif occursin(txopen, line)
got_title_last, got_text_last = false, true
got_text_last = true
elseif occursin(languagemark, line)
elseif occursin(languagemark, line)
if got_text_last && titleword != ""
if got_text_last && titleword != ""
Line 330: Line 330:
(wordcount += 1) >= maxwords && break
(wordcount += 1) >= maxwords && break
end
end
got_title_last, got_text_last = false, false
got_text_last = false
elseif occursin(txclose, line)
elseif occursin(txclose, line)
got_title_last, got_text_last = false, false
got_text_last = false
end
end
end
end