// NumOfSubgraphsPerQueryDist.scala
package org.wikidata.query.rdf.spark.metrics.queries.subgraphs
import org.apache.spark.sql.DataFrame
import org.wikidata.query.rdf.spark.utils.SubgraphUtils.sparkDfColumnsToListOfStruct
object NumOfSubgraphsPerQueryDist {

  /**
   * Builds the distribution of queries over the number of subgraphs they access.
   *
   * The input rows are grouped by `subgraph_count` and counted; the resulting `count`
   * column is renamed to `query_count`. The two columns are then passed to
   * `sparkDfColumnsToListOfStruct` together with the output column name
   * `query_subgraph_dist`.
   *
   * @param numOfSubgraphsPerQuery per-query subgraph counts.
   *                               Expected columns: id, subgraph_count.
   * @return spark dataframe. Expected columns: id (dummy id for merging purposes),
   *         query_subgraph_dist: struct< subgraph_count, query_count >
   */
  def getNumOfSubgraphsPerQueryDist(numOfSubgraphsPerQuery: DataFrame): DataFrame = {
    // One row per distinct subgraph_count, carrying how many input rows share that value.
    val queryCountBySubgraphCount: DataFrame =
      numOfSubgraphsPerQuery
        .groupBy("subgraph_count")
        .count()
        .withColumnRenamed("count", "query_count")

    // NOTE(review): the trailing empty list is forwarded to the helper unchanged; its exact
    // meaning is defined by SubgraphUtils.sparkDfColumnsToListOfStruct — confirm there.
    sparkDfColumnsToListOfStruct(
      queryCountBySubgraphCount,
      List("subgraph_count", "query_count"),
      "query_subgraph_dist",
      List()
    )
  }
}