// AnalyzeNodeVisitor.scala

package org.wikidata.query.rdf.spark.transform.queries.sparql.visitors

import scala.collection.mutable

import org.apache.jena.graph._
import org.apache.jena.graph.impl.LiteralLabel
import org.apache.jena.shared.PrefixMapping

case class NodeInfo(nodeType: String, nodeValue: String)

class AnalyzeNodeVisitor(
  prefixMapping: PrefixMapping
) extends NodeVisitor {

  val wdNodeCount: mutable.Map[String, Long] = new mutable.HashMap[String, Long]().withDefaultValue(0L)
  val nodeCount: mutable.Map[String, Long] = new mutable.HashMap[String, Long]().withDefaultValue(0L)
  val prefixesCount: mutable.Map[String, Long] = new mutable.HashMap[String, Long]().withDefaultValue(0L)

  val nullRes = Option.empty

  private def incWdNode(s: String): Unit = {
    wdNodeCount(s) = wdNodeCount(s) + 1L
  }

  private def incNode(s: String): Unit ={
    nodeCount(s) = nodeCount(s) + 1L
  }

  private def incPrefix(s: String): Unit = {
    prefixesCount(s) = prefixesCount(s) + 1L
  }

  override def visitAny(nodeAny: Node_ANY): NodeInfo = {
    incNode("NODE_ANY")
    NodeInfo("NODE_ANY", "")
  }

  override def visitBlank(nodeBlank: Node_Blank, blankNodeId: BlankNodeId): NodeInfo = {
    incNode(s"NODE_BLANK[${blankNodeId.getLabelString}]")
    NodeInfo("NODE_BLANK", blankNodeId.getLabelString)
  }

  override def visitLiteral(nodeLiteral: Node_Literal, literalLabel: LiteralLabel): NodeInfo = {
    incNode(s"NODE_LITERAL[${literalLabel.toString()}]")
    NodeInfo("NODE_LITERAL", literalLabel.toString())
  }

  override def visitURI(nodeUri: Node_URI, s: String): NodeInfo = {
    val shortForm = prefixMapping.shortForm(s)
    val (prefix, qname) = {
      val colonIdx = shortForm.indexOf(':')
      if (colonIdx > 0) {
        (Some(shortForm.slice(0, colonIdx)), Some(shortForm.slice(colonIdx + 1, shortForm.length)))
      } else {
        val slashIdx = shortForm.lastIndexOf('/')
        if (slashIdx > 0) {
          (nullRes, Some(shortForm.slice(slashIdx + 1, shortForm.length)))
        } else {
          (nullRes, nullRes)
        }
      }
    }
    if (qname.forall(_.matches("^[QP]\\d+$"))){
      incWdNode(qname.get)
    }
    prefix.foreach(p => incPrefix(p))
    incNode(s"NODE_URI[$shortForm]")
    NodeInfo("NODE_URI", shortForm)
  }

  override def visitVariable(nodeVariable: Node_Variable, s: String): NodeInfo = {
    incNode(s"NODE_VAR[$s]")
    NodeInfo("NODE_VAR", s)
  }
}