// UaSubgraphCounts.scala
package org.wikidata.query.rdf.spark.metrics.queries.subgraphs
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.{count, countDistinct}
import org.wikidata.query.rdf.spark.utils.SubgraphUtils.sparkDfColumnsToListOfStruct
object UaSubgraphCounts {

  /**
   * Computes how many user agents query a given number of distinct subgraphs.
   *
   * The aggregation runs in two passes:
   *  1. per `ua`, count the distinct `subgraph` values (`subgraph_count`);
   *  2. per `subgraph_count`, count the `ua` values that have it (`ua_count`).
   *
   * The resulting (`subgraph_count`, `ua_count`) rows are then handed to
   * `sparkDfColumnsToListOfStruct` under the name `ua_subgraph_dist`.
   *
   * @param subgraphQueriesInfo all subgraphQueries (queries mapped to subgraphs) and their info
   *                            from processedQueries. Expected columns: id, subgraph, query,
   *                            query_time, query_time_class, ua, q_info. Only `ua` and `subgraph`
   *                            are read by this method.
   * @return spark dataframe with columns: id (dummy id for merging purposes) and
   *         ua_subgraph_dist holding struct< subgraph_count, ua_count > entries.
   *         NOTE(review): the final shape is produced by `sparkDfColumnsToListOfStruct`;
   *         confirm against that helper.
   */
  def getUaSubgraphCounts(subgraphQueriesInfo: DataFrame): DataFrame = {
    // Pass 1: one row per user agent with the number of distinct subgraphs it touched.
    val subgraphsPerUa = subgraphQueriesInfo
      .groupBy("ua")
      .agg(countDistinct("subgraph").as("subgraph_count"))

    // Pass 2: one row per subgraph_count with the number of user agents at that count.
    // count("ua") only counts non-null ua values.
    val uasPerSubgraphCount = subgraphsPerUa
      .groupBy("subgraph_count")
      .agg(count("ua").as("ua_count"))

    // Pack the two columns into the "ua_subgraph_dist" column; the trailing empty list is
    // passed through to the helper unchanged (presumably no extra grouping columns — verify).
    sparkDfColumnsToListOfStruct(
      uasPerSubgraphCount,
      List("subgraph_count", "ua_count"),
      "ua_subgraph_dist",
      List()
    )
  }
}