Last active
March 14, 2026 23:10
-
-
Save ixfd64/c998e81ef7d6158c73c74592461fac62 to your computer and use it in GitHub Desktop.
Python script to compute Huffman and Shannon-Fano-Elias codes, created as part of an assignment for ECE 253 at UC Santa Cruz
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # A program that accepts a given text and generates its Huffman code and | |
| # Shannon–Fano–Elias code, as well as additional information. | |
| # | |
| # Created as part of an assignment for ECE 253 at UC Santa Cruz. | |
| import heapq | |
| import math | |
| import textwrap | |
| from collections import Counter | |
# constants
# width, in characters, at which printed bit strings are wrapped for display
line_size = 60
# sample input to encode: three paragraphs about information/coding theory
# (NOTE(review): "Shannon main result" / "Shannon work" read like dropped
# apostrophes in the source text; the literal is kept verbatim because the
# program's encoded output depends on its exact bytes)
text = """Information theory studies the transmission, processing, extraction, and utilization of information. Abstractly, information can be thought of as the resolution of uncertainty. In the case of communication of information over a noisy channel, this abstract concept was formalized by Claude Shannon in a paper entitled A Mathematical Theory of Communication, in which information is thought of as a set of possible messages, and the goal is to send these messages over a noisy channel, and to have the receiver reconstruct the message with low probability of error, in spite of the channel noise. Shannon main result, the noisy-channel coding theorem showed that, in the limit of many channel uses, the rate of information that is asymptotically achievable is equal to the channel capacity, a quantity dependent merely on the statistics of the channel over which the messages are sent.
Coding theory is concerned with finding explicit methods, called codes, for increasing the efficiency and reducing the error rate of data communication over noisy channels to near the channel capacity. These codes can be roughly subdivided into data compression and error-correction techniques. In the latter case, it took many years to find the methods Shannon work proved were possible.
The recent development of various methods of modulation such as PCM and PPM which exchange bandwidth for signal-to-noise ratio has intensified the interest in a general theory of communication. A basis for such a theory is contained in the important papers of Nyquist and Hartley on this subject."""
# custom node type for Huffman trees
class Node:
    """A node of a Huffman tree.

    Leaves carry a symbol; internal nodes carry ``symbol=None`` and two
    children.  Every node stores the total probability ``p`` of its subtree.
    """

    def __init__(self, symbol=None, p=None, left=None, right=None):
        self.symbol, self.p = symbol, p
        self.left, self.right = left, right

    def __lt__(self, other):
        # heapq pops the node with the smallest probability first
        return other.p > self.p
# compute the probability of each character
def get_probs(input_str) -> dict:
    """Return ``{char: probability}`` for every character in *input_str*.

    An empty string yields an empty dict instead of raising
    ZeroDivisionError.
    """
    n = len(input_str)
    if n == 0:
        return {}
    freqs = Counter(input_str)
    return {char: count / n for char, count in freqs.items()}
# compute the entropy of a probability distribution
def entropy(p: dict) -> float:
    """Shannon entropy, in bits, of the distribution ``{symbol: prob}``."""
    total = 0
    for prob in p.values():
        total += prob * math.log2(prob)
    return -total
# Python's built-in bin() only accepts integers, so we have to implement our
# own method to convert a decimal value to binary
def float2bin(num: float, num_bits: int) -> str:
    """Return the first *num_bits* binary-fraction digits of *num*.

    Repeatedly doubles the value; each doubling shifts the next fractional
    bit into the integer position.
    """
    digits = []
    remainder = num
    for _ in range(num_bits):
        remainder *= 2
        if remainder < 1:
            digits.append("0")
        else:
            digits.append("1")
            remainder -= 1
    return "".join(digits)
# encode the input text using a provided set of codewords
def encode(input_str, codewords) -> str:
    """Concatenate the codeword of each character of *input_str*."""
    return "".join(map(codewords.__getitem__, input_str))
# compute the average code length
def codeword_len(probs, codewords) -> float:
    """Expected codeword length: sum over symbols of p(s) * len(code(s))."""
    return sum(len(codewords[symbol]) * probs[symbol] for symbol in probs)
# helper function to traverse nodes
def traverse(node, codeword="", codewords=None):
    """Walk the Huffman tree rooted at *node*, collecting leaf codewords.

    Left edges append "0", right edges append "1".  Returns the
    ``{symbol: codeword}`` mapping (also filled in place when *codewords*
    is supplied).
    """
    codewords = {} if codewords is None else codewords
    stack = [(node, codeword)]
    while stack:
        current, prefix = stack.pop()
        if current.symbol is None:
            # internal node: descend into both children
            stack.append((current.right, prefix + "1"))
            stack.append((current.left, prefix + "0"))
        else:
            codewords[current.symbol] = prefix
    return codewords
# generate Huffman code given a set of probabilities
def generate_huffman_code(probs) -> dict:
    """Build a Huffman code for ``{symbol: probability}`` *probs*.

    Repeatedly merges the two least-probable subtrees until a single tree
    remains, then reads codewords off the tree.  Returns
    ``{symbol: codeword}``.

    Edge cases: an empty distribution returns ``{}``; a single-symbol
    alphabet gets codeword "0" (the tree walk alone would assign the empty
    string, which cannot be decoded).
    """
    heap = [Node(symbol, p_i) for symbol, p_i in probs.items()]
    if not heap:
        return {}
    if len(heap) == 1:
        return {heap[0].symbol: "0"}
    heapq.heapify(heap)
    while len(heap) > 1:
        min_prob = heapq.heappop(heap)
        next_min_prob = heapq.heappop(heap)
        # merged node carries the combined probability of its children
        heapq.heappush(heap, Node(None, min_prob.p + next_min_prob.p, min_prob, next_min_prob))
    h_codewords = {}
    traverse(heap[0], codewords=h_codewords)
    return h_codewords
# generate Shannon–Fano–Elias code given a set of probabilities
def generate_sfe_code(probs) -> dict:
    """Build a Shannon–Fano–Elias code for ``{symbol: probability}`` *probs*.

    Each symbol is encoded as the first ``ceil(log2(1/p)) + 1`` fraction
    bits of the midpoint of its interval in the cumulative distribution.
    Symbols are processed in order of decreasing probability.
    """
    ordered = sorted(probs.items(), key=lambda item: item[1], reverse=True)
    cumulative = 0
    sfe_codewords = {}
    for symbol, prob in ordered:
        length = math.ceil(math.log2(1 / prob)) + 1
        # midpoint of [cumulative, cumulative + prob), truncated to `length` bits
        sfe_codewords[symbol] = float2bin(cumulative + prob / 2, length)
        cumulative += prob
    return sfe_codewords
def _print_symbol_table(codewords) -> None:
    """Print one symbol/codeword pair per line (tabs align the columns)."""
    for char in codewords:
        if char == ' ':
            char_name = "space\t\t"
        elif char == '\n':
            char_name = "line break\t"
        else:
            char_name = char + "\t\t\t"
        print(f"Symbol: {char_name} codeword: {codewords[char]}")


def _report(title, probs, codewords, show_table=False) -> None:
    """Print the stats and wrapped encoding of `text` under *codewords*."""
    print(f"\n{title}\n----")
    print(f"Average codeword length: {codeword_len(probs, codewords)}\n")
    if show_table:
        # pass show_table=True to also dump the per-symbol codeword table
        _print_symbol_table(codewords)
    print("\nEncoded:")
    for line in textwrap.wrap(encode(text, codewords), line_size):
        print(line)


def main() -> None:
    """Compute the entropy of `text`, then report its Huffman and
    Shannon-Fano-Elias encodings."""
    p = get_probs(text)
    print(f"\nH(X) = {entropy(p)}")
    _report("Huffman coding", p, generate_huffman_code(p))
    _report("Shannon-Fano-Elias coding", p, generate_sfe_code(p))
| if __name__ == "__main__": | |
| main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment