Skip to content

Instantly share code, notes, and snippets.

@elbakramer
Last active April 9, 2019 13:07
Show Gist options
  • Select an option

  • Save elbakramer/3b8cac916f3491b2150e24a011e5d5cb to your computer and use it in GitHub Desktop.

Select an option

Save elbakramer/3b8cac916f3491b2150e24a011e5d5cb to your computer and use it in GitHub Desktop.
WIP
import scala.collection.mutable.HashMap
import scala.collection.mutable.ArrayBuffer
sealed trait Symbol[C] extends Ordered[Symbol[C]] {
def isTerminal(): Boolean
}
case class CharacterSymbol[C: Ordering](val character: C) extends Symbol[C] {
def isTerminal(): Boolean = false
def compare(that: CharacterSymbol[C]): Int = Ordering[C].compare(this.character, that.character)
def compare(that: TerminalSymbol[C]): Int = -1
override def compare(that: Symbol[C]): Int = that match {
case that: CharacterSymbol[C] => compare(that)
case that: TerminalSymbol[C] => compare(that)
}
override def toString(): String = character.toString()
}
case class TerminalSymbol[C: Ordering](val index: Int) extends Symbol[C] {
def isTerminal(): Boolean = true
def compare(that: TerminalSymbol[C]): Int = Ordering[Int].compare(this.index, that.index)
def compare(that: CharacterSymbol[C]): Int = 1
override def compare(that: Symbol[C]): Int = that match {
case that: CharacterSymbol[C] => compare(that)
case that: TerminalSymbol[C] => compare(that)
}
override def toString(): String = "$" + index.toString()
}
object Symbols {
def fromString[C: Ordering](string: Seq[C], index: Int): Seq[Symbol[C]] = string.map(c => CharacterSymbol(c)) :+ TerminalSymbol[C](index)
}
class State[C: Ordering] { self =>
private val transitionMap: HashMap[C, Transition[C]] = HashMap.empty[C, Transition[C]]
def transitions: Map[C, Transition[C]] = transitionMap.toMap
def getTransitions: Map[C, Transition[C]] = transitions
def getTransition(c: C): Option[Transition[C]] = transitions.get(c)
def addTransition(c: C, transition: Transition[C]) = transitionMap.put(c, transition)
private var sourceTransitionOption: Option[Transition[C]] = None
def sourceTransition: Option[Transition[C]] = sourceTransitionOption
def getSourceTransition: Option[Transition[C]] = sourceTransition
def setSourceTransition(transition: Transition[C]) = {
sourceTransitionOption = Some(transition)
}
private var suffixLinkOption: Option[State[C]] = None
def suffixLink: Option[State[C]] = suffixLinkOption
def getSuffixLink: Option[State[C]] = suffixLink
def setSuffixLink(suffixLink: State[C]) = {
suffixLinkOption = Some(suffixLink)
}
def parent: Option[State[C]] = sourceTransition.map(_.source)
def children: Seq[State[C]] = transitions.values.map(_.dest).toSeq
def isLeaf: Boolean = transitions.isEmpty
def inInternal: Boolean = !transitions.isEmpty
def isRoot: Boolean = false
def isPerp: Boolean = false
private def statesUptoRoot: Iterator[State[C]] = new BufferedIterator[State[C]] {
private var cursor: State[C] = self
override def head: State[C] = cursor
override def hasNext: Boolean = cursor != null
override def next(): State[C] = {
val currentCursor = cursor
cursor = cursor.sourceTransition.map(_.source).getOrElse(null)
currentCursor
}
}
def depth: Int = statesUptoRoot.size - 1
def prefix: Seq[C] = sourceTransition.map(_.prefix).getOrElse(Seq.empty[C])
def toState: State[C] = this
def toDebugString: String = {
val builder = new StringBuilder("root\n")
def addEntryLines(state: State[C], prefix: String): Unit = {
if (!state.transitions.isEmpty) {
for ((first, transition) <- state.transitions.toSeq.sortBy(_._1)) {
builder ++= prefix + "|-" + transition.label + "\n"
addEntryLines(transition.dest, prefix + "| ")
}
}
}
addEntryLines(this, "")
builder.toString()
}
def printDebugString: Unit = {
println(toDebugString)
}
}
class RootState[C: Ordering] extends State[C] {
override def toString: String = "root"
override def isRoot: Boolean = true
}
class PerpState[C: Ordering](val root: RootState[C]) extends State[C] {
private val transitionToRoot = new Transition(Seq.empty[C], this, 0, 0, root)
override def getTransition(c: C): Option[Transition[C]] = Some(transitionToRoot)
override def toString: String = "perp"
override def isPerp: Boolean = true
}
class SlicedSeq[A](val original: Seq[A], val from: Int, val until: Int) extends Seq[A] {
def sliced: Seq[A] = original.slice(from, until)
override def apply(idx: Int): A = sliced.apply(idx)
override def iterator: Iterator[A] = sliced.iterator
override def length: Int = sliced.length
}
class Transition[C](val sequence: Seq[C], val source: State[C], val start: Int, val end: Int, val dest: State[C])
extends SlicedSeq(sequence, start - 1, end) {
def subsequence: Seq[C] = sequence.slice(start - 1, end)
def prefix: Seq[C] = sequence.slice(0, end)
def label: String = subsequence.map(_.toString).mkString
}
class GeneralizedSuffixTree[C: Ordering] {
private val sequenceBuffer: ArrayBuffer[Seq[C]] = ArrayBuffer.empty[Seq[C]]
def sequences: Seq[Seq[C]] = sequenceBuffer
val root = new RootState[Symbol[C]]()
val perp = new PerpState[Symbol[C]](root)
root.setSuffixLink(perp)
def addSequence(sequence: Seq[C]): Unit = {
val sequenceIndex = sequences.length
sequenceBuffer.append(sequence)
val symbols = Symbols.fromString(sequence, sequenceIndex)
val terminalSymbol = symbols.last
def inf: Int = symbols.length
def t_(i: Int): Symbol[C] = {
symbols(i - 1)
}
def createTransition(sequence: Seq[Symbol[C]], from: State[Symbol[C]], startAndEnd: (Int, Int), to: State[Symbol[C]]): Unit = {
val start = startAndEnd._1
val end = startAndEnd._2
val transition = new Transition(sequence, from, start, end, to)
from.addTransition(transition.head, transition)
to.setSourceTransition(transition)
}
def createSuffixLink(arg: State[Symbol[C]], res: State[Symbol[C]]): Unit = {
arg.setSuffixLink(res)
}
def findTransition(t: Symbol[C], from: State[Symbol[C]]): Option[Transition[Symbol[C]]] = {
from.getTransition(t)
}
def splitTransition(gprime: Transition[Symbol[C]], s: State[Symbol[C]], kp: (Int, Int)): State[Symbol[C]] = {
val k = kp._1
val p = kp._2
val pprime = gprime.end
val kprime = gprime.start
val sprime = gprime.dest
val r = new State[Symbol[C]]()
// s -> sprime => s -> r -> sprime
createTransition(gprime.sequence, r, (kprime + p - k + 1, pprime), sprime)
createTransition(gprime.sequence, s, (kprime, kprime + p - k), r)
r
}
def fprime(s: State[Symbol[C]]): State[Symbol[C]] = {
s.getSuffixLink.get
}
def update(state: State[Symbol[C]], ki: (Int, Int)): (State[Symbol[C]], Int) = {
var s = state
var k = ki._1
var i = ki._2
var sk = (s, k)
var oldr = root.toState
var endPointAndR = testAndSplit(s, (k, i - 1), t_(i))
var endPoint = endPointAndR._1
var r = endPointAndR._2
while (!endPoint) {
val rprime = new State[Symbol[C]]()
createTransition(symbols, r, (i, inf), rprime)
if (oldr != root) {
createSuffixLink(oldr, r)
}
oldr = r
sk = canonize(fprime(s), (k, i - 1))
s = sk._1
k = sk._2
endPointAndR = testAndSplit(s, (k, i - 1), t_(i))
endPoint = endPointAndR._1
r = endPointAndR._2
}
if (oldr != root) {
createSuffixLink(oldr, s)
}
(s, k)
}
def testAndSplit(state: State[Symbol[C]], kp: (Int, Int), t: Symbol[C]): (Boolean, State[Symbol[C]]) = {
var s = state
var k = kp._1
var p = kp._2
if (k <= p) {
var gprime = findTransition(t_(k), s).get
var pprime = gprime.end
var kprime = gprime.start
var sprime = gprime.to
//if (t == t_(kprime + p - k + 1)) {
if (t == gprime(p - k + 1)) {
(true, s)
} else { // split the transition with new state r in between
val r = splitTransition(gprime, s, kp)
(false, r)
}
} else {
val trans = findTransition(t, s)
if (!trans.isDefined) {
(false, s)
} else {
(true, s)
}
}
}
def canonize(state: State[Symbol[C]], kp: (Int, Int)): (State[Symbol[C]], Int) = {
var s = state
var k = kp._1
var p = kp._2
if (p < k) {
(s, k)
} else {
var gprime = findTransition(t_(k), s).get
var pprime = gprime.end
var kprime = gprime.start
var sprime = gprime.dest
while ((pprime - kprime) <= (p - k)) { // while length of transition is leq length of remains
k = k + pprime - kprime + 1 // proceed k to the length of transition
s = sprime // push state to the target of transition
if (k <= p) { // if remains are still left, find the next transition
gprime = findTransition(t_(k), s).get
pprime = gprime.end
kprime = gprime.start
sprime = gprime.dest
}
}
(s, k)
}
}
var s = root.toState
var k = 1
var i = 0
var sk = (s, k)
while (t_(i + 1) != terminalSymbol) {
i = i + 1
sk = update(s, (k, i))
s = sk._1
k = sk._2
sk = canonize(s, (k, i))
s = sk._1
k = sk._2
}
// add terminal symbol
i = i + 1
sk = update(s, (k, i))
s = sk._1
k = sk._2
sk = canonize(s, (k, i))
s = sk._1
k = sk._2
}
def addSequences(sequences: Seq[C]*): Unit = {
for (sequence <- sequences) {
addSequence(sequence)
}
}
}
object Application {
def main(args: Array[String]): Unit = {
{
val sequence = "mississippi"
val tree = new GeneralizedSuffixTree[Char]()
tree.addSequence(sequence)
tree.root.printDebugString
val debugString =
"""
root
|-i
| |-ppi$0
| |-ssi
| | |-ppi$0
| | |-ssippi$0
| |-$0
|-mississippi$0
|-p
| |-i$0
| |-pi$0
|-s
| |-i
| | |-ppi$0
| | |-ssippi$0
| |-si
| | |-ppi$0
| | |-ssippi$0
|-$0
"""
println(debugString.trim == tree.root.toDebugString.trim)
}
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment