M2Resources.java
package org.wikimedia.search.glent;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.EnumMap;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVParser;
import org.apache.commons.csv.CSVRecord;
import lombok.SneakyThrows;
/**
 * Lazily loads and caches the CSV resources (character confusions and
 * word frequencies) bundled on the classpath.
 *
 * <p>Obtain the shared instance through {@link #getInstance()}. Each
 * resource is parsed at most once per instance, on first access, and the
 * resulting maps are then served from memory.
 */
public final class M2Resources {
    private static final String CONFUSIONS = "confusions-cjk.csv";

    private static M2Resources instance;

    /**
     * Identifies which word frequency resource to load.
     */
    public enum WordFreqType {
        /**
         * Term statistics derived from various sources.
         * TODO: Clarify
         */
        DICTIONARY("word-freq-cjk.csv"),
        /**
         * Counts number of articles in the language Wikipedia
         * that contain the string within their content.
         */
        WIKI("word-freq-cjk-wiki.csv");

        /** Name of the classpath resource holding the statistics. */
        final String resource;

        WordFreqType(String resource) {
            this.resource = resource;
        }
    }

    /**
     * @return The process-wide shared instance, created on first call.
     */
    public static synchronized M2Resources getInstance() {
        if (instance == null) {
            instance = new M2Resources();
        }
        return instance;
    }

    private Map<String, List<String>> confusionsCache;

    private final Map<WordFreqType, Map<String, Map<String, Integer>>> wordFreqCache;

    private M2Resources() {
        wordFreqCache = new EnumMap<>(WordFreqType.class);
    }

    /**
     * Returns the confusions table, loading it on first use.
     *
     * <p>Synchronized so that concurrent callers of the shared instance do
     * not race on the lazily populated cache. The returned map is the cached
     * instance itself, not a copy.
     *
     * @return Map from a source string to an unsorted list of
     *     possible replacement strings. Applies to all CJK languages.
     * @throws IOException if the resource is missing or cannot be read
     */
    public synchronized Map<String, List<String>> confusions() throws IOException {
        if (confusionsCache == null) {
            confusionsCache = loadConfusions();
        }
        return confusionsCache;
    }

    /**
     * Parses the confusions resource, grouping every {@code charTo} value
     * under its {@code charFrom} key in file order.
     */
    private Map<String, List<String>> loadConfusions() throws IOException {
        Map<String, List<String>> res = new HashMap<>();
        // Closing the parser also releases the underlying resource stream.
        try (CSVParser parser = parse(M2Resources.CONFUSIONS)) {
            for (CSVRecord entry : parser) {
                String from = entry.get("charFrom");
                String to = entry.get("charTo");
                res.computeIfAbsent(from, x -> new ArrayList<>()).add(to);
            }
        }
        return res;
    }

    /**
     * Shorthand for {@link #wordFreq(WordFreqType)} with
     * {@link WordFreqType#WIKI}.
     *
     * @return Per-language word frequency statistics. Top level map
     *     keyed by language, second level by word.
     */
    public Map<String, Map<String, Integer>> wordFreq() {
        return wordFreq(WordFreqType.WIKI);
    }

    /**
     * Returns the word frequency statistics of the requested type, loading
     * them on first use.
     *
     * <p>Synchronized because the backing {@link EnumMap} is filled lazily
     * and is not safe for concurrent modification. Although no checked
     * exception is declared, an {@link IOException} raised while loading is
     * propagated to the caller as-is. The returned map is the cached
     * instance itself, not a copy.
     *
     * @param type which statistics to return
     * @return Per-language word frequency statistics. Top level map
     *     keyed by language, second level by word.
     */
    public synchronized Map<String, Map<String, Integer>> wordFreq(WordFreqType type) {
        return wordFreqCache.computeIfAbsent(type, this::loadWordFreq);
    }

    /**
     * Parses the resource behind {@code type} into a
     * language -> word -> frequency map. A later row for the same
     * language and word overwrites an earlier one.
     */
    @SneakyThrows(IOException.class)
    private Map<String, Map<String, Integer>> loadWordFreq(WordFreqType type) {
        Map<String, Map<String, Integer>> res = new HashMap<>();
        // Closing the parser also releases the underlying resource stream.
        try (CSVParser parser = parse(type.resource)) {
            for (CSVRecord entry : parser) {
                String lang = entry.get("lang");
                String word = entry.get("word");
                Integer freq = Integer.valueOf(entry.get("frequency"));
                res.computeIfAbsent(lang, x -> new HashMap<>()).put(word, freq);
            }
        }
        return res;
    }

    /**
     * Opens a classpath resource as a UTF-8 CSV whose first record is the
     * header. The caller owns the returned parser and must close it.
     *
     * @param resource classpath-relative resource name
     * @return parser positioned after the header record
     * @throws IOException if the resource does not exist or cannot be read
     */
    private CSVParser parse(String resource) throws IOException {
        InputStream is = getClass().getClassLoader().getResourceAsStream(resource);
        if (is == null) {
            // getResourceAsStream signals a missing resource with null; fail
            // with a descriptive error instead of a NullPointerException.
            throw new IOException("Classpath resource not found: " + resource);
        }
        Reader reader = new InputStreamReader(is, StandardCharsets.UTF_8);
        try {
            return CSVFormat.DEFAULT.withFirstRecordAsHeader().parse(reader);
        } catch (IOException | RuntimeException e) {
            // Parser construction failed, so nobody else will close the stream.
            try {
                reader.close();
            } catch (IOException closeFailure) {
                e.addSuppressed(closeFailure);
            }
            throw e;
        }
    }
}