// M2Resources.java

package org.wikimedia.search.glent;

import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.EnumMap;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVParser;
import org.apache.commons.csv.CSVRecord;

import lombok.SneakyThrows;


/**
 * Lazily loads and caches the CSV data sets bundled on the classpath:
 * a character confusion table and per-language word frequency tables.
 *
 * <p>Access goes through the process-wide instance returned by
 * {@link #getInstance()}. Each data set is parsed on first request and the
 * resulting map is then kept for the lifetime of the instance. The maps
 * handed out are the cached instances themselves, not copies, so callers
 * should treat them as read-only.
 *
 * <p>Note: only {@link #getInstance()} is synchronized; the lazy caches
 * inside the instance are not guarded by any lock.
 */
public final class M2Resources {
    /** Classpath resource holding the confusion pairs. */
    private static final String CONFUSIONS = "confusions-cjk.csv";
    private static M2Resources instance;

    public enum WordFreqType {
        /**
         * Term statistics derived from various sources.
         * TODO: Clarify
         */
        DICTIONARY("word-freq-cjk.csv"),

        /**
         * Counts number of articles in the language Wikipedia
         * that contain the string within their content.
         */
        WIKI("word-freq-cjk-wiki.csv");

        /** Name of the classpath resource backing this frequency table. */
        final String resource;

        WordFreqType(String resource) {
            this.resource = resource;
        }
    }

    /**
     * @return The shared instance, created on first call.
     */
    public static synchronized M2Resources getInstance() {
        if (instance == null) {
            instance = new M2Resources();
        }
        return instance;
    }

    private Map<String, List<String>> confusionsCache;
    private final Map<WordFreqType, Map<String, Map<String, Integer>>> wordFreqCache;

    private M2Resources() {
        wordFreqCache = new EnumMap<>(WordFreqType.class);
    }

    /**
     * @return Map from a source string to an unsorted list of
     *  possible replacement strings. Loaded on first call and
     *  cached afterwards.
     * @throws IOException if the confusions resource is missing or
     *  cannot be read
     */
    public Map<String, List<String>> confusions() throws IOException {
        if (confusionsCache == null) {
            confusionsCache = loadConfusions();
        }
        return confusionsCache;
    }

    /**
     * Reads the {@code charFrom} / {@code charTo} columns of the confusions
     * resource, grouping all targets under their source string.
     *
     * @return Map from a source string to an unsorted list of
     *  possible replacement strings. Applies to all CJK languages.
     * @throws IOException if the resource is missing or cannot be read
     */
    private Map<String, List<String>> loadConfusions() throws IOException {
        Map<String, List<String>> res = new HashMap<>();
        // Closing the parser also closes the underlying reader and stream.
        try (CSVParser parser = parse(M2Resources.CONFUSIONS)) {
            for (CSVRecord entry : parser) {
                String from = entry.get("charFrom");
                String to = entry.get("charTo");
                res.computeIfAbsent(from, x -> new ArrayList<>()).add(to);
            }
        }
        return res;
    }

    /**
     * Shorthand for {@link #wordFreq(WordFreqType)} with
     * {@link WordFreqType#WIKI}.
     *
     * @return Per-language word frequency statistics. Top level map
     *  keyed by language, second level by word.
     */
    public Map<String, Map<String, Integer>> wordFreq() {
        return wordFreq(WordFreqType.WIKI);
    }

    /**
     * @param type which frequency table to return
     * @return Per-language word frequency statistics for the requested
     *  table. Top level map keyed by language, second level by word.
     *  Loaded on first request per type and cached afterwards. A failure
     *  to read the resource surfaces as an undeclared {@link IOException}.
     */
    public Map<String, Map<String, Integer>> wordFreq(WordFreqType type) {
        return wordFreqCache.computeIfAbsent(type, this::loadWordFreq);
    }

    /**
     * Reads the {@code lang} / {@code word} / {@code frequency} columns of
     * the resource behind {@code type}. When the same language and word
     * occur more than once the last row wins.
     *
     * <p>{@code IOException} is rethrown without being declared so that
     * this method can be used as a {@code Function} in
     * {@link #wordFreq(WordFreqType)}.
     */
    @SneakyThrows(IOException.class)
    private Map<String, Map<String, Integer>> loadWordFreq(WordFreqType type) {
        Map<String, Map<String, Integer>> res = new HashMap<>();
        // Closing the parser also closes the underlying reader and stream.
        try (CSVParser parser = parse(type.resource)) {
            for (CSVRecord entry : parser) {
                String lang = entry.get("lang");
                String word = entry.get("word");
                Integer freq = Integer.valueOf(entry.get("frequency"));
                res.computeIfAbsent(lang, x -> new HashMap<>()).put(word, freq);
            }
        }
        return res;
    }

    /**
     * Opens a classpath resource as UTF-8 CSV whose first record is the
     * header row.
     *
     * @param resource name of the resource, resolved through this class's
     *  class loader
     * @return An open parser; the caller is responsible for closing it.
     * @throws IOException if the resource does not exist or parsing the
     *  header fails
     */
    private CSVParser parse(String resource) throws IOException {
        InputStream is = getClass().getClassLoader().getResourceAsStream(resource);
        if (is == null) {
            // getResourceAsStream signals "not found" with null rather than throwing.
            throw new IOException("Classpath resource not found: " + resource);
        }
        Reader reader = new InputStreamReader(is, StandardCharsets.UTF_8);
        try {
            return CSVFormat.DEFAULT.withFirstRecordAsHeader().parse(reader);
        } catch (IOException | RuntimeException e) {
            // No parser was handed out, so nobody else will close the reader.
            try {
                reader.close();
            } catch (IOException closeFailure) {
                e.addSuppressed(closeFailure);
            }
            throw e;
        }
    }
}