Skip to content

Instantly share code, notes, and snippets.

@HaraldKorneliussen
Last active September 22, 2024 20:10
Show Gist options
  • Select an option

  • Save HaraldKorneliussen/6f1e958f917ffed8bf80d2d3a3922913 to your computer and use it in GitHub Desktop.

Select an option

Save HaraldKorneliussen/6f1e958f917ffed8bf80d2d3a3922913 to your computer and use it in GitHub Desktop.
Recursive Run-Length encoding
#!/usr/bin/python
# In the binary case, maybe the length of a b-run should be encoded bit-flipped.
# Then the bit-flip of the compressed string should decode into the bit-flip
# of the uncompressed string, which seems like a neat property.
# It would be even neater if we could also preserve reversal of the string, but then
# we have to rethink a lot. The special casing of the ending sabotages most meaning
# we can squeeze into reversal.
def bijEnc(num, symbols="ab"):
"""
Encode a natural number as a word in the alphabet specified by the symbols string,
so that every natural number has exactly one representation. This is often called
the bijective base encoding of the number.
"""
ret = ''
slen = len(symbols)
while num > 0:
num -= 1
ret = symbols[num % slen] + ret
num = num // slen
return ret
def bijDec(s, symbols="ab"):
"""
Decode a string in the alphabet specified by the symbols string, into a natural number.
Inverse function of bijEnc, provided you use the same symbols string.
"""
acc = 0
for c in s:
acc *= len(symbols)
acc += (symbols.index(c) + 1)
return acc
def rrle(s):
"""
Recursively run-length encode the runs in s
"""
if len(s) < 2:
return s
i = 0
ret = ""
runLetter = s[i]
runLength = 0
while i < len(s):
# How long is the run?
while i < len(s) and s[i] == runLetter:
runLength += 1
i += 1
# Are there more letters to encode? If so, just write out this run
if i < len(s):
toEncode = rrle(bijEnc(runLength))
ret += runLetter + runLetter.join((c for c in toEncode))
runLetter = s[i]
runLength = 0
# We've come to the end of the string. Write out the final run, which requires
# special treatment to be bijective. We can't just write out the run length as
# is, because then the output string would always be even in length, and then
# how would the decoder interpret an odd-length string?
# So instead we halve the run length value, and write out odd-length runs with an
# extra runLetter at the end, and even-length runs without.
oddRun = runLength % 2 == 1
runLength //= 2
if runLength > 0:
toEncode = rrle(bijEnc(runLength))
ret += runLetter + runLetter.join((c for c in toEncode))
if oddRun:
ret += runLetter
return ret
def rrld(s):
"""
Recursively run-length decode the string s
"""
if len(s) < 2:
return s
i = 1
ret = ""
runLetter = s[0]
runLengthString = ""
repeats = 0
while i < len(s):
# extract the string encoding the run length
while i < len(s) and s[i-1] == runLetter:
runLengthString += s[i]
i += 2
# in all but the last run,
if i < len(s):
# recurse on the run length string
# to decode the actual run length as a number
repeats = bijDec(rrld(runLengthString))
# append the run
ret += runLetter * repeats
# prepare the next run
runLetter = s[i-1]
runLengthString = ""
# The final run is decoded as usual
repeats = bijDec(rrld(runLengthString))
# but there may be a final letter to handle:
if i == len(s):
# If it's the same letter as the run was,
# we double the run length and add one to find
# the actual run length
if (runLetter == s[i-1]):
ret += runLetter * (2 * repeats + 1)
# If it's a different letter, it actually started
# a new run of just that one letter. So we append
# the next-to-final run the usual way, and then
# the final "run" of length one.
else:
ret += runLetter * repeats
ret += s[i-1]
else:
# If there was no final letter, we just double the decoded
# run length to find the actual run length.
ret += runLetter * (2 * repeats)
return ret
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment