Last active
September 22, 2024 20:10
-
-
Save HaraldKorneliussen/6f1e958f917ffed8bf80d2d3a3922913 to your computer and use it in GitHub Desktop.
Recursive Run-Length encoding
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/python | |
| # In the binary case, maybe the length of a b-run should be encoded bit-flipped. | |
| # Then the bit-flip of the compressed string should decode into the bit-flip | |
| # of the uncompressed string, which seems like a neat property. | |
| # It would be even neater if we could also preserve reversal of the string, but then | |
| # we have to rethink a lot. The special casing of the ending sabotages most meaning | |
| # we can squeeze into reversal. | |
| def bijEnc(num, symbols="ab"): | |
| """ | |
| Encode a natural number as a word in the alphabet specified by the symbols string, | |
| so that every natural number has exactly one representation. This is often called | |
| the bijective base encoding of the number. | |
| """ | |
| ret = '' | |
| slen = len(symbols) | |
| while num > 0: | |
| num -= 1 | |
| ret = symbols[num % slen] + ret | |
| num = num // slen | |
| return ret | |
| def bijDec(s, symbols="ab"): | |
| """ | |
| Decode a string in the alphabet specified by the symbols string, into a natural number. | |
| Inverse function of bijEnc, provided you use the same symbols string. | |
| """ | |
| acc = 0 | |
| for c in s: | |
| acc *= len(symbols) | |
| acc += (symbols.index(c) + 1) | |
| return acc | |
| def rrle(s): | |
| """ | |
| Recursively run-length encode the runs in s | |
| """ | |
| if len(s) < 2: | |
| return s | |
| i = 0 | |
| ret = "" | |
| runLetter = s[i] | |
| runLength = 0 | |
| while i < len(s): | |
| # How long is the run? | |
| while i < len(s) and s[i] == runLetter: | |
| runLength += 1 | |
| i += 1 | |
| # Are there more letters to encode? If so, just write out this run | |
| if i < len(s): | |
| toEncode = rrle(bijEnc(runLength)) | |
| ret += runLetter + runLetter.join((c for c in toEncode)) | |
| runLetter = s[i] | |
| runLength = 0 | |
| # We've come to the end of the string. Write out the final run, which requires | |
| # special treatment to be bijective. We can't just write out the run length as | |
| # is, because then the output string would always be even in length, and then | |
| # how would the decoder interpret an odd-length string? | |
| # So instead we halve the run length value, and write out odd-length runs with an | |
| # extra runLetter at the end, and even-length runs without. | |
| oddRun = runLength % 2 == 1 | |
| runLength //= 2 | |
| if runLength > 0: | |
| toEncode = rrle(bijEnc(runLength)) | |
| ret += runLetter + runLetter.join((c for c in toEncode)) | |
| if oddRun: | |
| ret += runLetter | |
| return ret | |
| def rrld(s): | |
| """ | |
| Recursively run-length decode the string s | |
| """ | |
| if len(s) < 2: | |
| return s | |
| i = 1 | |
| ret = "" | |
| runLetter = s[0] | |
| runLengthString = "" | |
| repeats = 0 | |
| while i < len(s): | |
| # extract the string encoding the run length | |
| while i < len(s) and s[i-1] == runLetter: | |
| runLengthString += s[i] | |
| i += 2 | |
| # in all but the last run, | |
| if i < len(s): | |
| # recurse on the run length string | |
| # to decode the actual run length as a number | |
| repeats = bijDec(rrld(runLengthString)) | |
| # append the run | |
| ret += runLetter * repeats | |
| # prepare the next run | |
| runLetter = s[i-1] | |
| runLengthString = "" | |
| # The final run is decoded as usual | |
| repeats = bijDec(rrld(runLengthString)) | |
| # but there may be a final letter to handle: | |
| if i == len(s): | |
| # If it's the same letter as the run was, | |
| # we double the run length and add one to find | |
| # the actual run length | |
| if (runLetter == s[i-1]): | |
| ret += runLetter * (2 * repeats + 1) | |
| # If it's a different letter, it actually started | |
| # a new run of just that one letter. So we append | |
| # the next-to-final run the usual way, and then | |
| # the final "run" of length one. | |
| else: | |
| ret += runLetter * repeats | |
| ret += s[i-1] | |
| else: | |
| # If there was no final letter, we just double the decoded | |
| # run length to find the actual run length. | |
| ret += runLetter * (2 * repeats) | |
| return ret |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment