LabelService.java

package org.wikidata.query.rdf.blazegraph.label;

import static com.google.common.base.Preconditions.checkArgument;
import static org.wikidata.query.rdf.blazegraph.BigdataValuesHelper.makeIV;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.Set;

import org.openrdf.model.Literal;
import org.openrdf.model.URI;
import org.openrdf.model.Value;
import org.openrdf.model.impl.LiteralImpl;
import org.openrdf.model.impl.URIImpl;
import org.openrdf.model.vocabulary.RDFS;
import org.wikidata.query.rdf.common.uri.Ontology;
import org.wikidata.query.rdf.common.uri.SchemaDotOrg;
import org.wikidata.query.rdf.common.uri.UrisSchemeFactory;
import org.wikidata.query.rdf.common.uri.UrisScheme;

import com.bigdata.bop.BOp;
import com.bigdata.bop.Constant;
import com.bigdata.bop.IBindingSet;
import com.bigdata.bop.IValueExpression;
import com.bigdata.bop.IVariable;
import com.bigdata.rdf.internal.IV;
import com.bigdata.rdf.lexicon.LexiconRelation;
import com.bigdata.rdf.model.BigdataValue;
import com.bigdata.rdf.sparql.ast.JoinGroupNode;
import com.bigdata.rdf.sparql.ast.StatementPatternNode;
import com.bigdata.rdf.sparql.ast.TermNode;
import com.bigdata.rdf.sparql.ast.VarNode;
import com.bigdata.rdf.sparql.ast.eval.AbstractServiceFactory;
import com.bigdata.rdf.sparql.ast.eval.ServiceParams;
import com.bigdata.rdf.sparql.ast.service.BigdataNativeServiceOptions;
import com.bigdata.rdf.sparql.ast.service.BigdataServiceCall;
import com.bigdata.rdf.sparql.ast.service.IServiceOptions;
import com.bigdata.rdf.sparql.ast.service.ServiceCallCreateParams;
import com.bigdata.rdf.sparql.ast.service.ServiceNode;
import com.bigdata.rdf.sparql.ast.service.ServiceRegistry;
import com.bigdata.rdf.spo.ISPO;
import com.bigdata.rdf.store.AbstractTripleStore;
import com.bigdata.rdf.store.BD;
import com.bigdata.striterator.IChunkedOrderedIterator;
import com.google.common.collect.ImmutableSet;

import cutthecrap.utils.striterators.ICloseableIterator;

/**
 * Implements a "service" that resolves label like things in a way that doesn't
 * change the cardinality of the result set. You can call it like this: <code>
 *  SELECT *
 *  WHERE {
 *    SERVICE wikibase:label {
 *      bd:serviceParam wikibase:language "en,de,fr" .
 *      wd:Q123 rdfs:label ?q123Label .
 *      wd:Q123 skos:altLabel ?q123Alt .
 *      wd:Q123 schema:description ?q123Desc .
 *      wd:Q321 rdfs:label ?q321Label .
 *    }
 *  }
 * </code> or like this:<code>
 *  SELECT ?sLabel ?sAltLabel ?sDescription ?oLabel
 *  WHERE {
 *    ?s wdt:P22 ?o .
 *    SERVICE wikibase:label {
 *      bd:serviceParam wikibase:language "en,de" .
 *      bd:serviceParam wikibase:language "fr" .
 *    }
 *  }
 * </code>
 * <p>
 * If the label isn't available in any of the fallback languages it'll come back
 * as the entityId. Alt labels and descriptions just come back unbound if they
 * don't exist. If multiple values are defined they come back as a comma
 * separated list.
 *
 * <p>
 * This works by resolving the label-like thing per incoming binding, one at a
 * time. It would probably be faster to do something bulkish but we don't do
 * that yet. The code to do the comma separated lists and entityIds is pretty
 * simple once you've resolved the data.
 * <p>
 * The second invocation pattern works using {@code EmptyLabelServiceOptimizer}
 * to inspect the query and automatically build the first form out of the second
 * form by inspecting the query's projection.
 */
public class LabelService extends AbstractServiceFactory {
    /**
     * Options configuring this service as a native Blazegraph service.
     * Returned both by the factory and by every call object it creates.
     */
    private static final BigdataNativeServiceOptions SERVICE_OPTIONS = new BigdataNativeServiceOptions();

    /**
     * The URI under which the service is registered, built from
     * {@code Ontology.LABEL}.
     */
    public static final URI SERVICE_KEY = new URIImpl(Ontology.LABEL);

    /**
     * URI of the service parameter that carries the language list, i.e.
     * {@code Ontology.NAMESPACE + "language"}.
     */
    public static final URIImpl LANGUAGE_PARAM = new URIImpl(Ontology.NAMESPACE + "language");

    /**
     * Auto-language tag from the GUI.
     * It is filtered out of the language list if it seeps into an actual
     * query.
     */
    private static final String AUTO_LANGUAGE = "[AUTO_LANGUAGE]";

    /**
     * Makes the label service known to Blazegraph: adds a fresh factory to
     * the global service registry under {@link #SERVICE_KEY} and whitelists
     * that same URI.
     */
    public static void register() {
        final ServiceRegistry registry = ServiceRegistry.getInstance();
        registry.add(SERVICE_KEY, new LabelService());
        registry.addWhitelistURL(SERVICE_KEY.toString());
    }

    /**
     * Options describing this factory to Blazegraph.
     *
     * @return the shared native service options instance
     */
    @Override
    public IServiceOptions getServiceOptions() {
        return SERVICE_OPTIONS;
    }

    /**
     * Builds the call object for one SERVICE clause.
     * <p>
     * The language fallbacks are read from the service parameters and the
     * list of things to resolve is read from the service's graph pattern.
     * Unbound subjects are not rejected here; they are skipped later, when
     * each binding set is resolved.
     *
     * @param params call site information (triple store and service node)
     * @param serviceParams the parsed {@code bd:serviceParam} statements
     * @return the call that resolves labels for incoming binding sets
     */
    @Override
    public BigdataServiceCall create(ServiceCallCreateParams params, final ServiceParams serviceParams) {
        // TODO this whole class just throws RuntimeException instead of ??
        final AbstractTripleStore store = params.getTripleStore();
        final Map<String, Integer> fallbacks = findLanguageFallbacks(serviceParams);
        final ResolutionContext context = new ResolutionContext(store, fallbacks);
        final List<Resolution> resolutions = findResolutions(params);
        return new LabelServiceCall(context, resolutions);
    }

    /**
     * Tells whether a language tag must be ignored: either it is empty or
     * it is the unsubstituted auto-language placeholder.
     *
     * @param lang the trimmed language tag to check
     * @return {@code true} when the tag must not be used as a fallback
     */
    private boolean isBadLanguage(String lang) {
        return lang.isEmpty() || AUTO_LANGUAGE.equals(lang);
    }

    /**
     * Resolve the language fallbacks from the {@code wikibase:language}
     * service parameters of the query.
     * <p>
     * Every parameter must be a constant literal. A literal may hold a
     * single language code or a comma separated list of codes; surrounding
     * whitespace is ignored. Empty codes and the auto-language placeholder
     * are dropped, and a code that appears more than once keeps the rank of
     * its first appearance.
     * <p>
     * Ranks are dense: the first accepted language gets 0, the next distinct
     * one 1, and so on. Keeping the best language at rank 0 matters because
     * the label lookup uses rank 0 as its "cannot do better" signal.
     *
     * @param params the parsed service parameters
     * @return map from language code to its rank, lower being preferred;
     *         empty when every supplied code was rejected
     * @throws IllegalArgumentException if no language parameter is given, or
     *         if one of them is a variable or not a literal
     */
    private Map<String, Integer> findLanguageFallbacks(final ServiceParams params) {
        final List<TermNode> paramNodes = params.get(LANGUAGE_PARAM);

        checkArgument(paramNodes != null && !paramNodes.isEmpty(),
                "You must provide the label service a list of languages.");

        // TODO there has to be a better data structure for this.
        /*
         * Lucene has tons of things for this, but yeah. Maybe it doesn't
         * matter.
         */
        final Map<String, Integer> fallbacksMap = new HashMap<>();
        for (TermNode term : paramNodes) {
            checkArgument(!term.isVariable(), "not a constant");

            final Value v = term.getValue();

            checkArgument(v instanceof Literal, "not a literal");

            // A value without a comma splits into exactly one element, so
            // single codes and comma lists share the same path.
            for (String candidate : v.stringValue().split(",")) {
                final String language = candidate.trim();
                if (isBadLanguage(language)) {
                    continue;
                }
                // The current size is the next free rank; rejected and
                // duplicate codes therefore never leave gaps.
                fallbacksMap.putIfAbsent(language, fallbacksMap.size());
            }
        }

        return fallbacksMap;
    }

    /**
     * Collects the resolutions requested at a call site by delegating to the
     * overload that works on the call's service node.
     *
     * @param params the call creation parameters
     * @return one resolution per non-parameter statement of the service
     */
    private static List<Resolution> findResolutions(ServiceCallCreateParams params) {
        final ServiceNode serviceNode = params.getServiceNode();
        return findResolutions(serviceNode);
    }

    /**
     * Collects the resolutions described by the statements inside a SERVICE
     * clause. Statements whose subject is the constant
     * {@code bd:serviceParam} configure the service and are left out; every
     * other statement becomes one {@link Resolution}.
     *
     * @param params the service node whose graph pattern is inspected
     * @return the resolutions, in the order the statements appear
     */
    static List<Resolution> findResolutions(final ServiceNode params) {
        final JoinGroupNode group = (JoinGroupNode) params.getGraphPattern();
        final List<Resolution> found = new ArrayList<>(group.args().size());
        for (BOp member : group.args()) {
            final StatementPatternNode pattern = (StatementPatternNode) member;
            final TermNode subject = pattern.s();
            final boolean serviceParam = subject.isConstant() && BD.SERVICE_PARAM.equals(subject.getValue());
            if (!serviceParam) {
                found.add(new Resolution(pattern));
            }
        }
        return found;
    }

    /**
     * One use of the label service inside a particular SPARQL query: it
     * pairs the resolution context with the list of things to resolve and
     * applies them to every binding set it is handed.
     */
    private static class LabelServiceCall implements BigdataServiceCall {
        /*
         * The fields below are deliberately package private: the non-static
         * inner Chunk class reads them directly instead of going through
         * accessors. Nothing leaks because this class is itself private.
         */
        /**
         * Context used to perform the resolutions.
         */
        final ResolutionContext context;
        /**
         * The resolutions applied to each binding set.
         */
        final List<Resolution> resolutions;

        /**
         * Creates a call from an already prepared context and resolution
         * list.
         */
        LabelServiceCall(ResolutionContext context, List<Resolution> resolutions) {
            this.context = context;
            this.resolutions = resolutions;
        }

        @Override
        public IServiceOptions getServiceOptions() {
            return SERVICE_OPTIONS;
        }

        @Override
        public ICloseableIterator<IBindingSet> call(final IBindingSet[] bindingSets) throws Exception {
            return new Chunk(bindingSets);
        }

        /**
         * Iterator over one array of binding sets. Labels are resolved
         * lazily: a binding set is only processed when it is fetched with
         * {@link #next()}.
         */
        private class Chunk implements ICloseableIterator<IBindingSet> {
            /**
             * The binding sets this chunk walks through.
             */
            private final IBindingSet[] bindingSets;
            /**
             * Set once {@link #close()} has been called.
             */
            private boolean closed;
            /**
             * Position of the binding set the next call to next will handle.
             */
            private int position;

            Chunk(IBindingSet[] bindingSets) {
                this.bindingSets = bindingSets;
            }

            @Override
            public boolean hasNext() {
                if (closed) {
                    return false;
                }
                return position < bindingSets.length;
            }

            @Override
            public IBindingSet next() {
                if (closed || position >= bindingSets.length) {
                    throw new NoSuchElementException();
                }

                // Advance first so a failing resolution does not get retried.
                final IBindingSet current = bindingSets[position];
                position++;

                // Resolutions write their results straight into the binding.
                context.binding(current);
                resolutions.forEach(context::resolve);
                return current;
            }

            @Override
            public void remove() {
                throw new UnsupportedOperationException();
            }

            @Override
            public void close() {
                closed = true;
            }
        }
    }

    /**
     * Description of a specific resolution request for the service, taken
     * from one statement pattern of the SERVICE clause: the subject to look
     * up, the predicate naming the kind of label, and the variable in the
     * object position that receives the result. The service can resolve
     * many such requests at once.
     */
    @SuppressWarnings("rawtypes")
    static final class Resolution {
        /**
         * Subject of the service call.
         */
        private final IValueExpression subject;
        /**
         * URI for the label to resolve.
         */
        private final IValueExpression label;
        /**
         * The target variable to which to bind the label.
         */
        private final IVariable target;

        /**
         * Builds the request from a statement pattern.
         *
         * @throws RuntimeException if the object of the pattern is not a
         *         variable
         */
        private Resolution(StatementPatternNode st) {
            subject = st.s().getValueExpression();
            label = st.p().getValueExpression();
            target = getVariableToBind(st);
        }

        /**
         * Subject of the service call.
         */
        public IValueExpression subject() {
            return subject;
        }

        /**
         * URI for the label to resolve.
         */
        public IValueExpression labelType() {
            return label;
        }

        /**
         * The target variable to which to bind the label.
         */
        public IVariable target() {
            return target;
        }

        /**
         * Resolve the variable that needs to be bound from the statement
         * pattern node in the query.
         *
         * @param st the statement pattern being turned into a resolution
         * @return the variable found in the object position
         * @throws RuntimeException if the object position does not hold a
         *         variable
         */
        private static IVariable<IV> getVariableToBind(StatementPatternNode st) {
            final TermNode object = st.o();
            // Test the type up front rather than catching ClassCastException;
            // this also covers a missing object.
            if (!(object instanceof VarNode)) {
                throw new RuntimeException("Expected a variable in the object position to which to bind the language.");
            }
            return ((VarNode) object).getValueExpression();
        }
    }

    /**
     * Context in which Resolutions are resolved. This only goes from subjects
     * and label types to labels. It doesn't go from label types and label
     * values to subjects. That wouldn't be an efficient process anyway even
     * though it is technically possible.
     */
    @SuppressWarnings({"rawtypes", "unchecked"})
    private static class ResolutionContext {
        /**
         * The TripleStore to resolve the BindingSets against.
         */
        private final AbstractTripleStore tripleStore;
        /**
         * The LexiconRelation for the TripleStore we're working with.
         */
        private final LexiconRelation lexiconRelation;
        /**
         * The language fallbacks as a map from language code to order of
         * precedence. Lower values are preferred.
         */
        private final Map<String, Integer> languageFallbacks;
        /**
         * List of labels with the best language. Cleared and rebuilt on
         * every new call to resolve.
         */
        private final List<IV> bestLabels = new ArrayList<>();
        /**
         * The binding currently being resolved.
         */
        private IBindingSet binding;
        /**
         * The subject for this resolution as resolved in this BindingSet.
         */
        private IV resolvedSubject;
        /**
         * The label type for the current resolution as resolved in this
         * BindingSet.
         */
        private IV resolvedLabelType;
        /**
         * The IV that represents rdfs:label. It's built lazily when needed
         * and cached.
         */
        private IV rdfsLabelIv;
        /**
         * The IV that represents schema:description. It's built lazily when
         * needed and cached.
         * TODO: if we get more than two, convert this to a map.
         */
        private IV descriptionIv;

        /**
         * Creates a context bound to one triple store and one set of
         * language fallbacks. The store's lexicon relation is looked up once
         * here and kept for later use.
         */
        ResolutionContext(AbstractTripleStore tripleStore, Map<String, Integer> languageFallbacks) {
            this.tripleStore = tripleStore;
            this.lexiconRelation = tripleStore.getLexiconRelation();
            this.languageFallbacks = languageFallbacks;
        }

        /**
         * Selects the BindingSet that subsequent calls to resolve will read
         * from and write to.
         *
         * @param binding the binding set to work on
         */
        public void binding(IBindingSet binding) {
            this.binding = binding;
        }

        /**
         * Resolves one request against the current BindingSet and, when a
         * label is found or can be made up, binds it to the request's target
         * variable.
         * <p>
         * Nothing happens when the target is already bound, or when the
         * subject or the label type has no value in the current BindingSet.
         */
        public void resolve(Resolution resolution) {
            final IVariable target = resolution.target();
            // Do not overwrite already bound variables
            // Ref: https://phabricator.wikimedia.org/T159723
            // Ref: https://phabricator.wikimedia.org/T170704
            if (binding.get(target) != null) {
                return;
            }

            // Both are stored on the context because the lookup helpers read
            // them from there.
            resolvedSubject = resolveToIvOrError(resolution.subject(), "subject");
            resolvedLabelType = resolveToIvOrError(resolution.labelType(), "label type");
            if (resolvedSubject == null || resolvedLabelType == null) {
                return;
            }

            // TODO this is one at a time - maybe a batch things?
            fillBestLabels();
            final IV label = pickOrBuildBestLabel();
            if (label == null) {
                return;
            }
            binding.set(target, new Constant(label));
        }

        /**
         * Looks up the statements for the resolved subject and label type
         * and collects into bestLabels the literals written in the most
         * preferred language that has any. bestLabels is emptied first, so
         * an empty list afterwards means no usable label exists.
         * <p>
         * For label types that allow only one value the scan stops as soon
         * as a literal in the rank 0 language is seen.
         */
        @SuppressWarnings("checkstyle:npathcomplexity")
        private void fillBestLabels() {
            bestLabels.clear();
            if (languageFallbacks.isEmpty()) {
                return;
            }
            final IChunkedOrderedIterator<ISPO> candidates =
                    tripleStore.getAccessPath(resolvedSubject, resolvedLabelType, null).iterator();
            try {
                final boolean singleValued = uniqueLabelType(resolvedLabelType);
                int winningRank = Integer.MAX_VALUE;
                while (candidates.hasNext()) {
                    final IV object = candidates.next().o();
                    final Integer rank = languageRank(object);
                    if (rank == null) {
                        // Not a literal, no language tag, or unwanted language
                        continue;
                    }
                    if (rank < winningRank) {
                        // Strictly better language: start the list over.
                        winningRank = rank;
                        bestLabels.clear();
                        bestLabels.add(object);
                    } else if (rank == winningRank) {
                        bestLabels.add(object);
                    }
                    if (singleValued && rank == 0) {
                        // Best possible label of a single valued type, done.
                        break;
                    }
                }
            } finally {
                candidates.close();
            }
        }

        /**
         * Rank of the language of a candidate object, or null when the
         * candidate cannot be used as a label: it is not a literal, it has
         * no language tag, or its language is not one of the fallbacks.
         * <p>
         * Literals are hydrated through the lexicon to read their language,
         * which means a term dictionary lookup per candidate.
         */
        private Integer languageRank(IV candidate) {
            if (!candidate.isLiteral()) {
                return null;
            }
            final Literal literal = (Literal) lexiconRelation.getTerm(candidate);
            final String language = literal.getLanguage();
            if (language == null) {
                return null;
            }
            return languageFallbacks.get(language);
        }

        /**
         * Tells whether a label type is treated as single valued, which is
         * the case for rdfs:label and schema:description.
         */
        private boolean uniqueLabelType(IV labeltype) {
            if (rdfsLabelIv().equals(labeltype)) {
                return true;
            }
            return descriptionIv().equals(labeltype);
        }

        /**
         * Turns the content of bestLabels into the single value to bind, so
         * fillBestLabels must have run first.
         * <ul>
         * <li>No label: whatever bestEffortLabel comes up with.
         * <li>Exactly one label: that label, unchanged.
         * <li>Several labels: the result of joinLabels.
         * </ul>
         */
        private IV pickOrBuildBestLabel() {
            final int found = bestLabels.size();
            if (found == 0) {
                // Nothing real to return, fall back to a made up label.
                return bestEffortLabel();
            }
            if (found == 1) {
                // Probably the most common case.
                return bestLabels.get(0);
            }
            return joinLabels();
        }

        /**
         * Wraps a literal into an IV using the value factory of the lexicon
         * relation.
         *
         * @param literal the literal to wrap
         * @return the IV produced by makeIV for that literal
         */
        private IV mock(Literal literal) {
            final IV mocked = makeIV(lexiconRelation.getValueFactory(), literal);
            return mocked;
        }

        /**
         * Returns the IV to which expression evaluates in the current
         * BindingSet.
         *
         * @param expression the expression to evaluate
         * @param nameOfExpression human readable role of the expression,
         *        used in the error message only
         * @return the IV, or {@code null} when the expression has no value
         *         in the current BindingSet
         * @throws RuntimeException if the expression evaluates to something
         *         that is not an IV
         */
        private IV resolveToIvOrError(IValueExpression expression, String nameOfExpression) {
            final Object resolved = expression.get(binding);
            if (resolved == null) {
                // Unbound is not an error: the caller skips the resolution.
                return null;
            }
            if (!(resolved instanceof IV)) {
                throw new RuntimeException(String.format(Locale.ROOT,
                        "Expected %s (%s) to be bound to an IV but it wasn't.", nameOfExpression, expression));
            }
            return (IV) resolved;
        }

        /**
         * The IV that represents rdfs:label, fetched from the triple store's
         * vocabulary on first use and remembered afterwards.
         */
        private IV rdfsLabelIv() {
            IV cached = rdfsLabelIv;
            if (cached == null) {
                cached = tripleStore.getVocabulary().get(RDFS.LABEL);
                rdfsLabelIv = cached;
            }
            return cached;
        }

        /**
         * The IV that represents schema:description, fetched from the triple
         * store's vocabulary on first use and remembered afterwards.
         */
        private IV descriptionIv() {
            IV cached = descriptionIv;
            if (cached == null) {
                cached = tripleStore.getVocabulary().get(new URIImpl(SchemaDotOrg.DESCRIPTION));
                descriptionIv = cached;
            }
            return cached;
        }

        /**
         * The URI scheme to use in this context.
         *
         * @return the scheme supplied by {@code UrisSchemeFactory}
         */
        private UrisScheme uris() {
            return UrisSchemeFactory.getURISystem();
        }

        /**
         * Builds a stand-in label for a subject that has none. When the
         * resolved label type is rdfs:label the subject's URI is converted
         * to an entity id and returned as a plain literal. For every other
         * label type {@code null} is returned, which leaves the target
         * unbound.
         */
        private IV bestEffortLabel() {
            final boolean plainLabelRequested = rdfsLabelIv().equals(resolvedLabelType);
            if (!plainLabelRequested) {
                // Only rdfs:label falls back to the entity id
                return null;
            }
            final BigdataValue subjectValue = resolvedSubject.asValue(lexiconRelation);
            final String subjectUri = subjectValue.stringValue();
            final String entityId = uris().entityURItoId(subjectUri);
            return mock(new LiteralImpl(entityId));
        }

        /**
         * Merges every entry of bestLabels into one literal whose text is
         * the individual texts separated by ", ". The language of the result
         * is the first non-null language met while walking the list.
         * <p>
         * This is expected to be common for alt labels.
         */
        private IV joinLabels() {
            final StringBuilder joined = new StringBuilder();
            String language = null;
            for (int index = 0; index < bestLabels.size(); index++) {
                final Literal literal = (Literal) lexiconRelation.getTerm(bestLabels.get(index));
                // Separator goes before every entry except the first, even
                // when an earlier entry was empty.
                if (index > 0) {
                    joined.append(", ");
                }
                joined.append(literal.stringValue());
                if (language == null) {
                    language = literal.getLanguage();
                }
            }
            return mock(new LiteralImpl(joined.toString(), language));
        }
    }

    /**
     * Reports the variables this service would like to see bound before it
     * runs: the subjects of its resolutions that are variables.
     *
     * @param serviceNode the service node being planned
     * @return immutable set of subject variables, without duplicates
     */
    @Override
    public Set<IVariable<?>> getDesiredBound(final ServiceNode serviceNode) {
        final ImmutableSet.Builder<IVariable<?>> wanted = ImmutableSet.builder();
        for (Resolution resolution : findResolutions(serviceNode)) {
            if (resolution.subject() instanceof IVariable) {
                wanted.add((IVariable<?>) resolution.subject());
            }
        }
        return wanted.build();
    }

}