WikibaseInlineUriFactory.java
package org.wikidata.query.rdf.blazegraph;
import org.wikidata.query.rdf.blazegraph.inline.uri.InlineFixedWidthHexIntegerURIHandler;
import org.wikidata.query.rdf.blazegraph.inline.uri.UndecoratedUuidInlineUriHandler;
import org.wikidata.query.rdf.common.uri.CommonValues;
import org.wikidata.query.rdf.common.uri.PropertyType;
import org.wikidata.query.rdf.common.uri.UrisScheme;
import org.wikidata.query.rdf.common.uri.UrisSchemeFactory;
import com.bigdata.rdf.internal.InlineURIFactory;
import com.bigdata.rdf.internal.InlineURIHandler;
import com.bigdata.rdf.internal.InlineUnsignedIntegerURIHandler;
import com.bigdata.rdf.internal.NormalizingInlineUriHandler;
import com.bigdata.rdf.internal.TrailingSlashRemovingInlineUriHandler;
/**
* Factory building InlineURIHandlers for wikidata.
*
* One thing to consider when working on these is that its way better for write
* (and probably update) performance if all the bits of an entity are grouped
* together in Blazegraph's BTrees. Scattering them causes updates to have to
* touch lots of BTree nodes. {s,p,o}, {p,o,s}, and {o,s,p} are the indexes so
* and {s,p,o} seems most sensitive to scattering.
*
* Another thing to consider is that un-inlined uris are stored as longs which
* take up 9 bytes including the flags byte. And inlined uris are stored as 1
* flag byte, 1 (or 2) uri prefix bytes, and then delegate date type. That means
* that if the delegate data type is any larger than 6 bytes then its a net loss
* on index size using it. So you should avoid longs and uuids. Maybe even
* forbid them entirely.
*/
public class WikibaseInlineUriFactory extends InlineURIFactory {
private static final UrisScheme URIS = UrisSchemeFactory.getURISystem();
public WikibaseInlineUriFactory() {
/*
* Order matters here because some of these are prefixes of each other.
*/
for (PropertyType p: PropertyType.values()) {
addHandler(new InlineUnsignedIntegerURIHandler(URIS.property(p) + "P"));
}
URIS.inlinableEntityInitials().forEach(s -> addHandler(new InlineUnsignedIntegerURIHandler(URIS.entityIdToURI(s))));
// Lexemes TODO: can't really do it because of Forms: L1-F1
/*
* We don't use WikibaseStyleStatementInlineUriHandler because it makes
* things worse!
*/
// These aren't part of wikibase but are common in wikidata
// TODO: add more prefixes?
// VIAF ID
InlineURIHandler viaf = new TrailingSlashRemovingInlineUriHandler(
new InlineUnsignedIntegerURIHandler(CommonValues.VIAF));
addHandler(viaf);
addHandler(new NormalizingInlineUriHandler(viaf, CommonValues.VIAF_HTTP));
// GeoNames ID
addHandler(new TrailingSlashRemovingInlineUriHandler(
new InlineUnsignedIntegerURIHandler(CommonValues.GEONAMES)));
// PubChem ID
addHandler(new InlineUnsignedIntegerURIHandler(CommonValues.PUBCHEM));
// ChemSpider ID
addHandler(new InlineUnsignedIntegerURIHandler(CommonValues.CHEMSPIDER));
/*
* Value nodes are inlined even though they are pretty big (uuids). It
* doesn't seem to effect performance either way.
*
* Statements can't be inlined without losing information or making them
* huge and bloating the index. We could probably rewrite them at the
* munger into something less-uuid-ish.
*
* References aren't uuids - they are sha1s or sha0s or something
* similarly 160 bit wide. 160 bits is too big to fit into a uuid so we
* can't inline that without bloating either.
*/
addHandler(new UndecoratedUuidInlineUriHandler(URIS.value()));
}
public static class V001 extends WikibaseInlineUriFactory {
public V001() {
super();
addHandler(new InlineFixedWidthHexIntegerURIHandler(URIS.reference(), 40));
InlineURIHandler viaf = new TrailingSlashRemovingInlineUriHandler(
new InlineUnsignedIntegerURIHandler(CommonValues.VIAF_HTTP));
addHandler(viaf);
addHandler(new NormalizingInlineUriHandler(viaf, CommonValues.VIAF));
// Entrez Gene ID
addHandler(new InlineUnsignedIntegerURIHandler(CommonValues.GENEID));
}
}
public static class V002 extends V001 {
public V002() {
super();
addHandler(new UndecoratedUuidInlineUriHandler(URIS.wellKnownBNodeIRIPrefix()));
}
}
}