Skip to content

Instantly share code, notes, and snippets.

@ixfd64
Last active March 14, 2026 23:10
Show Gist options
  • Select an option

  • Save ixfd64/c998e81ef7d6158c73c74592461fac62 to your computer and use it in GitHub Desktop.

Select an option

Save ixfd64/c998e81ef7d6158c73c74592461fac62 to your computer and use it in GitHub Desktop.
Python script to compute Huffman and Shannon-Fano-Elias codes, created as part of an assignment for ECE 253 at UC Santa Cruz
# A program that accepts a given text and generates its Huffman code and
# Shannon–Fano–Elias code, as well as additional information.
#
# Created as part of an assignment for ECE 253 at UC Santa Cruz.
import heapq
import math
import textwrap
from collections import Counter
# constants
line_size = 60  # width, in characters, of each printed line of encoded output
# sample input: excerpts adapted from the Wikipedia article on information
# theory and Shannon's "A Mathematical Theory of Communication"
text = """Information theory studies the transmission, processing, extraction, and utilization of information. Abstractly, information can be thought of as the resolution of uncertainty. In the case of communication of information over a noisy channel, this abstract concept was formalized by Claude Shannon in a paper entitled A Mathematical Theory of Communication, in which information is thought of as a set of possible messages, and the goal is to send these messages over a noisy channel, and to have the receiver reconstruct the message with low probability of error, in spite of the channel noise. Shannon main result, the noisy-channel coding theorem showed that, in the limit of many channel uses, the rate of information that is asymptotically achievable is equal to the channel capacity, a quantity dependent merely on the statistics of the channel over which the messages are sent.
Coding theory is concerned with finding explicit methods, called codes, for increasing the efficiency and reducing the error rate of data communication over noisy channels to near the channel capacity. These codes can be roughly subdivided into data compression and error-correction techniques. In the latter case, it took many years to find the methods Shannon work proved were possible.
The recent development of various methods of modulation such as PCM and PPM which exchange bandwidth for signal-to-noise ratio has intensified the interest in a general theory of communication. A basis for such a theory is contained in the important papers of Nyquist and Hartley on this subject."""
# custom node type for Huffman trees
class Node:
    """A Huffman-tree node: a leaf carries a symbol, an internal node carries children."""

    def __init__(self, symbol=None, p=None, left=None, right=None):
        self.symbol = symbol  # the character at a leaf; None for internal nodes
        self.p = p            # total probability mass of this subtree
        self.left = left      # left child (edge labelled "0")
        self.right = right    # right child (edge labelled "1")

    def __lt__(self, other):
        # order nodes by probability so they can sit in a heapq min-heap
        return self.p < other.p
# compute the probability of each character
def get_probs(input_str) -> dict:
    """Return a mapping from each character of input_str to its relative frequency."""
    total = len(input_str)
    return {symbol: count / total for symbol, count in Counter(input_str).items()}
# compute the entropy of a probability distribution
def entropy(p: dict) -> float:
    """Return the Shannon entropy H(X) = -sum_i p_i * log2(p_i) of distribution p."""
    return sum(-p_i * math.log2(p_i) for p_i in p.values())
# Python's built-in bin() only accepts integers, so we have to implement our
# own method to convert a decimal value to binary
def float2bin(num: float, num_bits: int) -> str:
    """Return the first num_bits binary digits of a fraction in [0, 1).

    Works by repeated doubling: each doubling shifts the next binary digit
    into the integer position.
    """
    digits = []
    for _ in range(num_bits):
        num *= 2
        if num >= 1:
            digits.append("1")
            num -= 1
        else:
            digits.append("0")
    return "".join(digits)
# encode the input text using a provided set of codewords
def encode(input_str, codewords) -> str:
    """Concatenate the codeword of every character of input_str into one bit string."""
    pieces = [codewords[symbol] for symbol in input_str]
    return "".join(pieces)
# compute the average code length
def codeword_len(probs, codewords) -> float:
    """Return the expected codeword length, sum_i p_i * len(c_i).

    Args:
        probs: mapping symbol -> probability.
        codewords: mapping symbol -> bit-string codeword (must cover every
            symbol appearing in probs).

    Returns:
        The probability-weighted average length in bits.
    """
    # idiomatic replacement for the original manual accumulator loop
    return sum(p * len(codewords[symbol]) for symbol, p in probs.items())
# helper function to traverse nodes
def traverse(node, codeword="", codewords=None):
    """Walk a Huffman tree, recording each leaf's root-to-leaf path as its codeword.

    Left edges contribute "0" and right edges contribute "1". Returns the
    dict mapping symbol -> codeword.
    """
    if codewords is None:
        # fresh dict per top-level call; avoids the mutable-default pitfall
        codewords = {}
    if node.symbol is None:
        # internal node: extend the bit prefix into both subtrees
        traverse(node.left, codeword + "0", codewords)
        traverse(node.right, codeword + "1", codewords)
    else:
        codewords[node.symbol] = codeword
    return codewords
# generate Huffman code given a set of probabilities
def generate_huffman_code(probs) -> dict:
    """Build a Huffman code for the distribution probs (symbol -> probability).

    Repeatedly merges the two least-probable subtrees using a min-heap, then
    reads the codewords off the finished tree.

    Args:
        probs: mapping symbol -> probability.

    Returns:
        dict mapping each symbol to its bit-string codeword.
    """
    heap = [Node(symbol, p_i) for symbol, p_i in probs.items()]
    heapq.heapify(heap)
    if not heap:
        # empty distribution: nothing to encode (original raised IndexError)
        return {}
    while len(heap) > 1:
        min_prob = heapq.heappop(heap)
        next_min_prob = heapq.heappop(heap)
        heapq.heappush(heap, Node(None, min_prob.p + next_min_prob.p, min_prob, next_min_prob))
    root = heap[0]
    if root.symbol is not None:
        # bug fix: a single-symbol alphabet leaves the root as a leaf, and
        # traverse() would assign it the empty codeword "", which encodes the
        # text to an empty (undecodable) string -- use "0" instead
        return {root.symbol: "0"}
    return traverse(root)
# generate Shannon–Fano–Elias code given a set of probabilities
def generate_sfe_code(probs) -> dict:
    """Build the Shannon–Fano–Elias code for probs (symbol -> probability).

    Each symbol's codeword is the first ceil(log2(1/p)) + 1 binary digits of
    the midpoint of its interval in the cumulative distribution.
    """
    cumulative = 0
    sfe_codewords = {}
    # visit symbols from most to least probable (stable sort keeps tie order)
    for symbol, p in sorted(probs.items(), key=lambda item: item[1], reverse=True):
        num_bits = math.ceil(math.log2(1 / p)) + 1
        sfe_codewords[symbol] = float2bin(cumulative + p / 2, num_bits)
        cumulative += p
    return sfe_codewords
def _print_code_report(header: str, probs: dict, codewords: dict) -> None:
    """Print one coding section: header, average length, and the wrapped encoding of `text`."""
    print(header)
    print(f"Average codeword length: {codeword_len(probs, codewords)}\n")
    # uncomment below lines to print the codeword for each symbol
    # for char in codewords:
    #     if char == ' ':
    #         char_name = "space\t\t"
    #     elif char == '\n':
    #         char_name = "line break\t"
    #     else:
    #         char_name = char + "\t\t\t"
    #     print(f"Symbol: {char_name} codeword: {codewords[char]}")
    print("\nEncoded:")
    for line in textwrap.wrap(encode(text, codewords), line_size):
        print(line)


def main() -> None:
    """Report the entropy of the module-level `text`, then its Huffman and SFE codes."""
    probs = get_probs(text)
    print(f"\nH(X) = {entropy(probs)}")
    _print_code_report("\nHuffman coding\n----", probs, generate_huffman_code(probs))
    _print_code_report("\nShannon-Fano-Elias coding\n----", probs, generate_sfe_code(probs))
# run the demo only when executed as a script, not when imported as a module
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment