Last active
April 9, 2019 13:07
-
-
Save elbakramer/3b8cac916f3491b2150e24a011e5d5cb to your computer and use it in GitHub Desktop.
WIP
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import scala.collection.mutable.HashMap | |
| import scala.collection.mutable.ArrayBuffer | |
| sealed trait Symbol[C] extends Ordered[Symbol[C]] { | |
| def isTerminal(): Boolean | |
| } | |
| case class CharacterSymbol[C: Ordering](val character: C) extends Symbol[C] { | |
| def isTerminal(): Boolean = false | |
| def compare(that: CharacterSymbol[C]): Int = Ordering[C].compare(this.character, that.character) | |
| def compare(that: TerminalSymbol[C]): Int = -1 | |
| override def compare(that: Symbol[C]): Int = that match { | |
| case that: CharacterSymbol[C] => compare(that) | |
| case that: TerminalSymbol[C] => compare(that) | |
| } | |
| override def toString(): String = character.toString() | |
| } | |
| case class TerminalSymbol[C: Ordering](val index: Int) extends Symbol[C] { | |
| def isTerminal(): Boolean = true | |
| def compare(that: TerminalSymbol[C]): Int = Ordering[Int].compare(this.index, that.index) | |
| def compare(that: CharacterSymbol[C]): Int = 1 | |
| override def compare(that: Symbol[C]): Int = that match { | |
| case that: CharacterSymbol[C] => compare(that) | |
| case that: TerminalSymbol[C] => compare(that) | |
| } | |
| override def toString(): String = "$" + index.toString() | |
| } | |
| object Symbols { | |
| def fromString[C: Ordering](string: Seq[C], index: Int): Seq[Symbol[C]] = string.map(c => CharacterSymbol(c)) :+ TerminalSymbol[C](index) | |
| } | |
| class State[C: Ordering] { self => | |
| private val transitionMap: HashMap[C, Transition[C]] = HashMap.empty[C, Transition[C]] | |
| def transitions: Map[C, Transition[C]] = transitionMap.toMap | |
| def getTransitions: Map[C, Transition[C]] = transitions | |
| def getTransition(c: C): Option[Transition[C]] = transitions.get(c) | |
| def addTransition(c: C, transition: Transition[C]) = transitionMap.put(c, transition) | |
| private var sourceTransitionOption: Option[Transition[C]] = None | |
| def sourceTransition: Option[Transition[C]] = sourceTransitionOption | |
| def getSourceTransition: Option[Transition[C]] = sourceTransition | |
| def setSourceTransition(transition: Transition[C]) = { | |
| sourceTransitionOption = Some(transition) | |
| } | |
| private var suffixLinkOption: Option[State[C]] = None | |
| def suffixLink: Option[State[C]] = suffixLinkOption | |
| def getSuffixLink: Option[State[C]] = suffixLink | |
| def setSuffixLink(suffixLink: State[C]) = { | |
| suffixLinkOption = Some(suffixLink) | |
| } | |
| def parent: Option[State[C]] = sourceTransition.map(_.source) | |
| def children: Seq[State[C]] = transitions.values.map(_.dest).toSeq | |
| def isLeaf: Boolean = transitions.isEmpty | |
| def inInternal: Boolean = !transitions.isEmpty | |
| def isRoot: Boolean = false | |
| def isPerp: Boolean = false | |
| private def statesUptoRoot: Iterator[State[C]] = new BufferedIterator[State[C]] { | |
| private var cursor: State[C] = self | |
| override def head: State[C] = cursor | |
| override def hasNext: Boolean = cursor != null | |
| override def next(): State[C] = { | |
| val currentCursor = cursor | |
| cursor = cursor.sourceTransition.map(_.source).getOrElse(null) | |
| currentCursor | |
| } | |
| } | |
| def depth: Int = statesUptoRoot.size - 1 | |
| def prefix: Seq[C] = sourceTransition.map(_.prefix).getOrElse(Seq.empty[C]) | |
| def toState: State[C] = this | |
| def toDebugString: String = { | |
| val builder = new StringBuilder("root\n") | |
| def addEntryLines(state: State[C], prefix: String): Unit = { | |
| if (!state.transitions.isEmpty) { | |
| for ((first, transition) <- state.transitions.toSeq.sortBy(_._1)) { | |
| builder ++= prefix + "|-" + transition.label + "\n" | |
| addEntryLines(transition.dest, prefix + "| ") | |
| } | |
| } | |
| } | |
| addEntryLines(this, "") | |
| builder.toString() | |
| } | |
| def printDebugString: Unit = { | |
| println(toDebugString) | |
| } | |
| } | |
| class RootState[C: Ordering] extends State[C] { | |
| override def toString: String = "root" | |
| override def isRoot: Boolean = true | |
| } | |
| class PerpState[C: Ordering](val root: RootState[C]) extends State[C] { | |
| private val transitionToRoot = new Transition(Seq.empty[C], this, 0, 0, root) | |
| override def getTransition(c: C): Option[Transition[C]] = Some(transitionToRoot) | |
| override def toString: String = "perp" | |
| override def isPerp: Boolean = true | |
| } | |
| class SlicedSeq[A](val original: Seq[A], val from: Int, val until: Int) extends Seq[A] { | |
| def sliced: Seq[A] = original.slice(from, until) | |
| override def apply(idx: Int): A = sliced.apply(idx) | |
| override def iterator: Iterator[A] = sliced.iterator | |
| override def length: Int = sliced.length | |
| } | |
| class Transition[C](val sequence: Seq[C], val source: State[C], val start: Int, val end: Int, val dest: State[C]) | |
| extends SlicedSeq(sequence, start - 1, end) { | |
| def subsequence: Seq[C] = sequence.slice(start - 1, end) | |
| def prefix: Seq[C] = sequence.slice(0, end) | |
| def label: String = subsequence.map(_.toString).mkString | |
| } | |
| class GeneralizedSuffixTree[C: Ordering] { | |
| private val sequenceBuffer: ArrayBuffer[Seq[C]] = ArrayBuffer.empty[Seq[C]] | |
| def sequences: Seq[Seq[C]] = sequenceBuffer | |
| val root = new RootState[Symbol[C]]() | |
| val perp = new PerpState[Symbol[C]](root) | |
| root.setSuffixLink(perp) | |
| def addSequence(sequence: Seq[C]): Unit = { | |
| val sequenceIndex = sequences.length | |
| sequenceBuffer.append(sequence) | |
| val symbols = Symbols.fromString(sequence, sequenceIndex) | |
| val terminalSymbol = symbols.last | |
| def inf: Int = symbols.length | |
| def t_(i: Int): Symbol[C] = { | |
| symbols(i - 1) | |
| } | |
| def createTransition(sequence: Seq[Symbol[C]], from: State[Symbol[C]], startAndEnd: (Int, Int), to: State[Symbol[C]]): Unit = { | |
| val start = startAndEnd._1 | |
| val end = startAndEnd._2 | |
| val transition = new Transition(sequence, from, start, end, to) | |
| from.addTransition(transition.head, transition) | |
| to.setSourceTransition(transition) | |
| } | |
| def createSuffixLink(arg: State[Symbol[C]], res: State[Symbol[C]]): Unit = { | |
| arg.setSuffixLink(res) | |
| } | |
| def findTransition(t: Symbol[C], from: State[Symbol[C]]): Option[Transition[Symbol[C]]] = { | |
| from.getTransition(t) | |
| } | |
| def splitTransition(gprime: Transition[Symbol[C]], s: State[Symbol[C]], kp: (Int, Int)): State[Symbol[C]] = { | |
| val k = kp._1 | |
| val p = kp._2 | |
| val pprime = gprime.end | |
| val kprime = gprime.start | |
| val sprime = gprime.dest | |
| val r = new State[Symbol[C]]() | |
| // s -> sprime => s -> r -> sprime | |
| createTransition(gprime.sequence, r, (kprime + p - k + 1, pprime), sprime) | |
| createTransition(gprime.sequence, s, (kprime, kprime + p - k), r) | |
| r | |
| } | |
| def fprime(s: State[Symbol[C]]): State[Symbol[C]] = { | |
| s.getSuffixLink.get | |
| } | |
| def update(state: State[Symbol[C]], ki: (Int, Int)): (State[Symbol[C]], Int) = { | |
| var s = state | |
| var k = ki._1 | |
| var i = ki._2 | |
| var sk = (s, k) | |
| var oldr = root.toState | |
| var endPointAndR = testAndSplit(s, (k, i - 1), t_(i)) | |
| var endPoint = endPointAndR._1 | |
| var r = endPointAndR._2 | |
| while (!endPoint) { | |
| val rprime = new State[Symbol[C]]() | |
| createTransition(symbols, r, (i, inf), rprime) | |
| if (oldr != root) { | |
| createSuffixLink(oldr, r) | |
| } | |
| oldr = r | |
| sk = canonize(fprime(s), (k, i - 1)) | |
| s = sk._1 | |
| k = sk._2 | |
| endPointAndR = testAndSplit(s, (k, i - 1), t_(i)) | |
| endPoint = endPointAndR._1 | |
| r = endPointAndR._2 | |
| } | |
| if (oldr != root) { | |
| createSuffixLink(oldr, s) | |
| } | |
| (s, k) | |
| } | |
| def testAndSplit(state: State[Symbol[C]], kp: (Int, Int), t: Symbol[C]): (Boolean, State[Symbol[C]]) = { | |
| var s = state | |
| var k = kp._1 | |
| var p = kp._2 | |
| if (k <= p) { | |
| var gprime = findTransition(t_(k), s).get | |
| var pprime = gprime.end | |
| var kprime = gprime.start | |
| var sprime = gprime.to | |
| //if (t == t_(kprime + p - k + 1)) { | |
| if (t == gprime(p - k + 1)) { | |
| (true, s) | |
| } else { // split the transition with new state r in between | |
| val r = splitTransition(gprime, s, kp) | |
| (false, r) | |
| } | |
| } else { | |
| val trans = findTransition(t, s) | |
| if (!trans.isDefined) { | |
| (false, s) | |
| } else { | |
| (true, s) | |
| } | |
| } | |
| } | |
| def canonize(state: State[Symbol[C]], kp: (Int, Int)): (State[Symbol[C]], Int) = { | |
| var s = state | |
| var k = kp._1 | |
| var p = kp._2 | |
| if (p < k) { | |
| (s, k) | |
| } else { | |
| var gprime = findTransition(t_(k), s).get | |
| var pprime = gprime.end | |
| var kprime = gprime.start | |
| var sprime = gprime.dest | |
| while ((pprime - kprime) <= (p - k)) { // while length of transition is leq length of remains | |
| k = k + pprime - kprime + 1 // proceed k to the length of transition | |
| s = sprime // push state to the target of transition | |
| if (k <= p) { // if remains are still left, find the next transition | |
| gprime = findTransition(t_(k), s).get | |
| pprime = gprime.end | |
| kprime = gprime.start | |
| sprime = gprime.dest | |
| } | |
| } | |
| (s, k) | |
| } | |
| } | |
| var s = root.toState | |
| var k = 1 | |
| var i = 0 | |
| var sk = (s, k) | |
| while (t_(i + 1) != terminalSymbol) { | |
| i = i + 1 | |
| sk = update(s, (k, i)) | |
| s = sk._1 | |
| k = sk._2 | |
| sk = canonize(s, (k, i)) | |
| s = sk._1 | |
| k = sk._2 | |
| } | |
| // add terminal symbol | |
| i = i + 1 | |
| sk = update(s, (k, i)) | |
| s = sk._1 | |
| k = sk._2 | |
| sk = canonize(s, (k, i)) | |
| s = sk._1 | |
| k = sk._2 | |
| } | |
| def addSequences(sequences: Seq[C]*): Unit = { | |
| for (sequence <- sequences) { | |
| addSequence(sequence) | |
| } | |
| } | |
| } | |
| object Application { | |
| def main(args: Array[String]): Unit = { | |
| { | |
| val sequence = "mississippi" | |
| val tree = new GeneralizedSuffixTree[Char]() | |
| tree.addSequence(sequence) | |
| tree.root.printDebugString | |
| val debugString = | |
| """ | |
| root | |
| |-i | |
| | |-ppi$0 | |
| | |-ssi | |
| | | |-ppi$0 | |
| | | |-ssippi$0 | |
| | |-$0 | |
| |-mississippi$0 | |
| |-p | |
| | |-i$0 | |
| | |-pi$0 | |
| |-s | |
| | |-i | |
| | | |-ppi$0 | |
| | | |-ssippi$0 | |
| | |-si | |
| | | |-ppi$0 | |
| | | |-ssippi$0 | |
| |-$0 | |
| """ | |
| println(debugString.trim == tree.root.toDebugString.trim) | |
| } | |
| } | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment