JsonSchemaSparkConverter.java

package org.wikimedia.eventutilities.spark.sql;

import javax.annotation.Nonnull;
import javax.annotation.ParametersAreNonnullByDefault;

import org.apache.spark.sql.types.DataType;
import org.wikimedia.eventutilities.core.event.types.JsonSchemaConverter;

import com.fasterxml.jackson.databind.node.ObjectNode;

/**
 * Static entry points for turning a JSONSchema into a Spark SQL {@link DataType}.
 * <p>
 * A handful of Event Platform specific schema conventions are honoured,
 * most notably the ability to declare Map types in JSONSchema through an
 * additionalProperties schema.
 * </p>
 *
 * <p>
 * Timestamp fields (those with format: date-time in the JSONSchema) may be
 * converted to either TimestampType or StringType.
 * Historically at WMF, Spark was used to create Hive tables, and the old
 * Hive version in use had no support for Timestamp types. Hive supports them
 * now, but until existing tables are migrated, String types should still be used.
 * https://phabricator.wikimedia.org/T278467
 * TimestampType is the default; callers that need StringType must pass
 * {@code true} for the timestampsAsStrings param.
 * </p>
 */
@ParametersAreNonnullByDefault
public final class JsonSchemaSparkConverter {

    /** Converter used when timestampsAsStrings is false (the default conversions). */
    private static final JsonSchemaConverter<DataType> DEFAULT_CONVERTER =
        new JsonSchemaConverter<>(new DataTypeSchemaConversions());

    /** Converter used when timestampsAsStrings is true. */
    private static final JsonSchemaConverter<DataType> STRING_TIMESTAMPS_CONVERTER =
        new JsonSchemaConverter<>(new DataTypeSchemaConversions(true));

    /**
     * Not instantiable: this class only exposes static methods.
     * The explicit private constructor also keeps the maven checkstyle plugin happy.
     * See: https://checkstyle.sourceforge.io/config_design.html#HideUtilityClassConstructor
     */
    private JsonSchemaSparkConverter() {}

    /**
     * Converts the given JSONSchema to a Spark SQL DataType, representing
     * timestamp (format: date-time) fields as TimestampType.
     * <p>
     * Equivalent to calling {@link #toDataType(ObjectNode, boolean)} with
     * {@code timestampsAsStrings} set to {@code false}.
     * </p>
     *
     * @param jsonSchema
     *  The JSONSchema ObjectNode.  It should have at minimum a type.
     *
     * @return
     *  the {@link DataType} that jsonSchema was converted to
     */
    @Nonnull
    public static DataType toDataType(ObjectNode jsonSchema) {
        return toDataType(jsonSchema, false);
    }

    /**
     * Converts the given JSONSchema to a Spark SQL DataType.
     *
     * @param jsonSchema
     *  The JSONSchema ObjectNode.  It should have at minimum a type.
     *
     * @param timestampsAsStrings
     *  If true, timestamp fields (JSONSchema string fields with format: date-time)
     *  will be StringType instead of TimestampType in the converted Spark schema.
     *
     * @return
     *  the {@link DataType} that jsonSchema was converted to
     */
    @Nonnull
    public static DataType toDataType(ObjectNode jsonSchema, boolean timestampsAsStrings) {
        return converterFor(timestampsAsStrings).convert(jsonSchema);
    }

    /**
     * Picks the shared converter instance matching the requested timestamp handling.
     *
     * @param timestampsAsStrings
     *  whether the timestamps-as-strings converter should be selected
     *
     * @return
     *  the converter built with {@code DataTypeSchemaConversions(true)} when
     *  timestampsAsStrings is true, otherwise the one built with the no-arg conversions
     */
    private static JsonSchemaConverter<DataType> converterFor(boolean timestampsAsStrings) {
        return timestampsAsStrings ? STRING_TIMESTAMPS_CONVERTER : DEFAULT_CONVERTER;
    }
}