// TopServicesPerSubgraph.scala
package org.wikidata.query.rdf.spark.metrics.queries.subgraphs
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.{col, explode, lit}
import org.wikidata.query.rdf.spark.utils.SubgraphUtils.{extractItem, sparkDfColumnsToMap}
object TopServicesPerSubgraph {

  /** Counts, per subgraph, how many exploded `q_info.services` entries resolve to each service.
    *
    * @param subgraphQueriesInfo all subgraphQueries and their info from processedQueries.
    *                            Expected columns: id, subgraph, query, query_time, query_time_class, ua, q_info
    * @return spark dataframe with columns: subgraph, service_counts: map< string, bigint >
    */
  def getTopServices(subgraphQueriesInfo: DataFrame): DataFrame = {
    // One output row per entry of `q_info.services`; the entry is read back below through its `key` column.
    val explodedServices = subgraphQueriesInfo
      .select(col("subgraph"), explode(col("q_info.services")))

    // Only the key survives this projection, so every exploded entry later contributes exactly 1
    // to the count regardless of the entry's value.
    // NOTE(review): extractItem is defined in SubgraphUtils; presumably it pulls the "NODE_URI"
    // part out of the key - confirm against that helper.
    val servicePerEntry = explodedServices
      .select(col("subgraph"), extractItem(col("key"), lit("NODE_URI")).alias("service"))

    // Row count for every (subgraph, service) pair; the aggregate column is named "count".
    val usageCounts = servicePerEntry
      .groupBy("subgraph", "service")
      .count()

    // NOTE(review): sparkDfColumnsToMap is defined in SubgraphUtils; judging by the documented
    // return shape it folds service -> count into `service_counts`, one row per subgraph - confirm.
    sparkDfColumnsToMap(usageCounts, "service", "count", "service_counts", List("subgraph"))
  }
}