// JsonSchemaSparkConverter.java
package org.wikimedia.eventutilities.spark.sql;
import javax.annotation.Nonnull;
import javax.annotation.ParametersAreNonnullByDefault;
import org.apache.spark.sql.types.DataType;
import org.wikimedia.eventutilities.core.event.types.JsonSchemaConverter;
import com.fasterxml.jackson.databind.node.ObjectNode;
/**
 * Utility for translating JSONSchemas into Spark SQL types ({@link DataType}).
 *
 * <p>
 * Follows a few Event Platform specific schema conventions; in particular,
 * Map types may be declared in JSONSchema via an additionalProperties schema.
 * </p>
 *
 * <p>
 * Timestamp fields (format: date-time in the JSONSchema) can be converted
 * to either TimestampType or StringType.
 * Historically at WMF, Spark was used to create Hive tables, and the old
 * version of Hive did not support Timestamp types. It does now, but
 * until existing tables are migrated, String types should continue to be used.
 * See https://phabricator.wikimedia.org/T278467
 * TimestampType is the default; to get StringType instead, the
 * timestampsAsStrings param must be set to true.
 * </p>
 */
@ParametersAreNonnullByDefault
public final class JsonSchemaSparkConverter {

    /** Converter built from the default {@link DataTypeSchemaConversions}. */
    private static final JsonSchemaConverter<DataType> DEFAULT_CONVERTER =
        new JsonSchemaConverter<>(new DataTypeSchemaConversions());

    /** Converter built from {@link DataTypeSchemaConversions} with the timestamps-as-strings flag enabled. */
    private static final JsonSchemaConverter<DataType> STRING_TIMESTAMPS_CONVERTER =
        new JsonSchemaConverter<>(new DataTypeSchemaConversions(true));

    /**
     * Not instantiable: this class only exposes static helpers.
     * The explicit private constructor also keeps the maven checkstyle plugin happy.
     * See: https://checkstyle.sourceforge.io/config_design.html#HideUtilityClassConstructor
     */
    private JsonSchemaSparkConverter() {}

    /**
     * Converts the given JSONSchema to a Spark SQL DataType,
     * mapping timestamp (format: date-time) fields to TimestampType.
     *
     * @param jsonSchema
     *  The JSONSchema ObjectNode. This should have at minimum type.
     *
     * @return
     *  jsonSchema converted to
     *  {@link DataType}
     */
    @Nonnull
    public static DataType toDataType(ObjectNode jsonSchema) {
        return DEFAULT_CONVERTER.convert(jsonSchema);
    }

    /**
     * Converts the given JSONSchema to a Spark SQL DataType.
     *
     * @param jsonSchema
     *  The JSONSchema ObjectNode. This should have at minimum type.
     *
     * @param timestampsAsStrings
     *  If true, timestamp fields (JSONSchema string fields with format: date-time)
     *  are converted to StringType instead of TimestampType in the resulting Spark schema.
     *
     * @return
     *  jsonSchema converted to
     *  {@link DataType}
     */
    @Nonnull
    public static DataType toDataType(ObjectNode jsonSchema, boolean timestampsAsStrings) {
        // Both converters are built once at class load; just pick the matching one.
        final JsonSchemaConverter<DataType> selected =
            timestampsAsStrings ? STRING_TIMESTAMPS_CONVERTER : DEFAULT_CONVERTER;
        return selected.convert(jsonSchema);
    }
}