WiktionaryDumps to words: Difference between revisions

Work in progress Java example
(Warning about future changes)
(Work in progress Java example)
Line 4:
 
Use the [https://dumps.wikimedia.org/enwiktionary/latest/enwiktionary-latest-pages-articles.xml.bz2 wiktionary dump] (input) to create a file equivalent to [http://manpages.ubuntu.com/manpages/bionic/man5/french.5.html "/usr/share/dict/french"] (output). This dump is a large bzip2-compressed XML file of about 800MB. The "/usr/share/dict/french" file is a plain text file containing one French word per line. It is available in Ubuntu in the package '''wfrench'''.
 
=={{header|Java}}==
 
<lang java>import org.xml.sax.*;
import org.xml.sax.helpers.DefaultHandler;
import org.xml.sax.SAXException;
 
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import javax.xml.parsers.ParserConfigurationException;
 
/**
 * SAX handler that prints the title of every page whose wikitext contains a
 * {@code ==French==} heading line.
 *
 * <p>The content of each {@code <title>} and {@code <text>} element is
 * buffered and only evaluated in {@link #endElement}, because a SAX parser is
 * free to deliver the character data of a single element through several
 * {@link #characters} calls. Evaluating each chunk on its own can truncate
 * titles and miss a heading that is split across two chunks.
 */
class MyHandler extends DefaultHandler {
    private static final String TITLE = "title";
    private static final String TEXT = "text";

    /** Heading line that marks the French section of a page. */
    private static final String FRENCH_HEADING = "==French==";

    /** Qualified name of the element currently being read, or "" if none. */
    private String lastTag = "";

    /** Most recently completed {@code <title>} content. */
    private String title = "";

    /** Character data collected for the current title/text element. */
    private final StringBuilder content = new StringBuilder();

    /**
     * Collects character data while inside a {@code <title>} or
     * {@code <text>} element; data of any other element is ignored.
     *
     * @param ch     buffer holding the characters
     * @param start  offset of the first character to read
     * @param length number of characters to read
     */
    @Override
    public void characters(char[] ch, int start, int length) throws SAXException {
        if (TITLE.equals(lastTag) || TEXT.equals(lastTag)) {
            content.append(ch, start, length);
        }
    }

    /**
     * Records the element being entered and starts a fresh buffer for it.
     */
    @Override
    public void startElement(String uri, String localName, String qName, Attributes attrs) throws SAXException {
        lastTag = qName;
        content.setLength(0);
    }

    /**
     * Evaluates the buffered content once the element is complete: a title is
     * remembered, and a text body containing the French heading causes the
     * remembered title to be printed on standard output.
     */
    @Override
    public void endElement(String uri, String localName, String qName) throws SAXException {
        if (TITLE.equals(qName)) {
            title = content.toString();
        } else if (TEXT.equals(qName) && hasFrenchSection(content)) {
            System.out.println(title);
        }
        lastTag = "";
        content.setLength(0);
    }

    /**
     * Tells whether the wikitext contains {@code ==French==} as a whole line.
     * The text is padded with newlines so that a heading on the very first or
     * very last line is found too.
     */
    private static boolean hasFrenchSection(CharSequence wikitext) {
        String padded = "\n" + wikitext + "\n";
        return padded.contains("\n" + FRENCH_HEADING + "\n");
    }
}
 
public class WiktoWords {
/**
 * Parses the XML dump read from standard input with a {@link MyHandler},
 * which prints the matching page titles on standard output.
 *
 * <p>On any failure the cause is reported on standard error and the
 * process exits with status 1.
 *
 * @param args command-line arguments (unused)
 */
public static void main(java.lang.String[] args) {
    try {
        SAXParserFactory spFactory = SAXParserFactory.newInstance();
        SAXParser saxParser = spFactory.newSAXParser();
        MyHandler handler = new MyHandler();
        saxParser.parse(new InputSource(System.in), handler);
    } catch (Exception e) {
        // Top-level boundary: report why parsing stopped instead of
        // exiting silently, then keep the original non-zero exit status.
        System.err.println("WiktoWords: failed to process input: " + e);
        System.exit(1);
    }
}
}</lang>
 
{{out}}
 
<pre>
$ javac WiktoWords.java
$ wget --quiet https://dumps.wikimedia.org/enwiktionary/latest/enwiktionary-latest-pages-articles.xml.bz2 -O - | bzcat | \
java WiktoWords
hélice
pingouin
égoïsme
écholocation
nitroglycérine
croque-mitaine
</pre>
 
=={{header|OCaml}}==