Last active
March 14, 2026 23:10
-
-
Save ixfd64/c998e81ef7d6158c73c74592461fac62 to your computer and use it in GitHub Desktop.
Python script to compute Huffman and Shannon-Fano-Elias codes, created as part of an assignment for ECE 253 at UC Santa Cruz
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # A program that accepts a given text and generates its Huffman code and | |
| # Shannon–Fano–Elias code, as well as additional information. | |
| # | |
| # Created as part of an assignment for ECE 253 at UC Santa Cruz. | |
| import heapq | |
| import math | |
| import textwrap | |
| from collections import Counter | |
# constants
# width, in characters, at which printed bit strings are wrapped for display
line_size = 60
# sample input to encode: three paragraphs about information/coding theory
# (NOTE(review): "Shannon main result" / "Shannon work" read like dropped
# apostrophes in the source text; the literal is kept verbatim because the
# program's encoded output depends on its exact bytes)
text = """Information theory studies the transmission, processing, extraction, and utilization of information. Abstractly, information can be thought of as the resolution of uncertainty. In the case of communication of information over a noisy channel, this abstract concept was formalized by Claude Shannon in a paper entitled A Mathematical Theory of Communication, in which information is thought of as a set of possible messages, and the goal is to send these messages over a noisy channel, and to have the receiver reconstruct the message with low probability of error, in spite of the channel noise. Shannon main result, the noisy-channel coding theorem showed that, in the limit of many channel uses, the rate of information that is asymptotically achievable is equal to the channel capacity, a quantity dependent merely on the statistics of the channel over which the messages are sent.
Coding theory is concerned with finding explicit methods, called codes, for increasing the efficiency and reducing the error rate of data communication over noisy channels to near the channel capacity. These codes can be roughly subdivided into data compression and error-correction techniques. In the latter case, it took many years to find the methods Shannon work proved were possible.
The recent development of various methods of modulation such as PCM and PPM which exchange bandwidth for signal-to-noise ratio has intensified the interest in a general theory of communication. A basis for such a theory is contained in the important papers of Nyquist and Hartley on this subject."""
# custom node type for Huffman trees
class Node:
    """A node of a Huffman tree.

    Leaves carry a symbol; internal nodes carry ``symbol=None`` and two
    children.  Every node stores the total probability ``p`` of its subtree.
    """

    def __init__(self, symbol=None, p=None, left=None, right=None):
        self.symbol, self.p = symbol, p
        self.left, self.right = left, right

    def __lt__(self, other):
        # heapq pops the node with the smallest probability first
        return other.p > self.p
# compute the probability of each character
def get_probs(input_str) -> dict:
    """Return ``{char: probability}`` for every character in *input_str*.

    An empty string yields an empty dict instead of raising
    ZeroDivisionError.
    """
    n = len(input_str)
    if n == 0:
        return {}
    freqs = Counter(input_str)
    return {char: count / n for char, count in freqs.items()}
# compute the entropy of a probability distribution
def entropy(p: dict) -> float:
    """Shannon entropy, in bits, of the distribution ``{symbol: prob}``."""
    total = 0
    for prob in p.values():
        total += prob * math.log2(prob)
    return -total
# Python's built-in bin() only accepts integers, so we have to implement our
# own method to convert a decimal value to binary
def float2bin(num: float, num_bits: int) -> str:
    """Return the first *num_bits* binary-fraction digits of *num*.

    Repeatedly doubles the value; each doubling shifts the next fractional
    bit into the integer position.
    """
    digits = []
    remainder = num
    for _ in range(num_bits):
        remainder *= 2
        if remainder < 1:
            digits.append("0")
        else:
            digits.append("1")
            remainder -= 1
    return "".join(digits)
# encode the input text using a provided set of codewords
def encode(input_str, codewords) -> str:
    """Concatenate the codeword of each character of *input_str*."""
    return "".join(map(codewords.__getitem__, input_str))
# compute the average code length
def codeword_len(probs, codewords) -> float:
    """Expected codeword length: sum over symbols of p(s) * len(code(s))."""
    return sum(len(codewords[symbol]) * probs[symbol] for symbol in probs)
# helper function to traverse nodes
def traverse(node, codeword="", codewords=None):
    """Walk the Huffman tree rooted at *node*, collecting leaf codewords.

    Left edges append "0", right edges append "1".  Returns the
    ``{symbol: codeword}`` mapping (also filled in place when *codewords*
    is supplied).
    """
    codewords = {} if codewords is None else codewords
    stack = [(node, codeword)]
    while stack:
        current, prefix = stack.pop()
        if current.symbol is None:
            # internal node: descend into both children
            stack.append((current.right, prefix + "1"))
            stack.append((current.left, prefix + "0"))
        else:
            codewords[current.symbol] = prefix
    return codewords
# generate Huffman code given a set of probabilities
def generate_huffman_code(probs) -> dict:
    """Build a Huffman code for ``{symbol: probability}`` *probs*.

    Repeatedly merges the two least-probable subtrees until a single tree
    remains, then reads codewords off the tree.  Returns
    ``{symbol: codeword}``.

    Edge cases: an empty distribution returns ``{}``; a single-symbol
    alphabet gets codeword "0" (the tree walk alone would assign the empty
    string, which cannot be decoded).
    """
    heap = [Node(symbol, p_i) for symbol, p_i in probs.items()]
    if not heap:
        return {}
    if len(heap) == 1:
        return {heap[0].symbol: "0"}
    heapq.heapify(heap)
    while len(heap) > 1:
        min_prob = heapq.heappop(heap)
        next_min_prob = heapq.heappop(heap)
        # merged node carries the combined probability of its children
        heapq.heappush(heap, Node(None, min_prob.p + next_min_prob.p, min_prob, next_min_prob))
    h_codewords = {}
    traverse(heap[0], codewords=h_codewords)
    return h_codewords
# generate Shannon–Fano–Elias code given a set of probabilities
def generate_sfe_code(probs) -> dict:
    """Build a Shannon–Fano–Elias code for ``{symbol: probability}`` *probs*.

    Each symbol is encoded as the first ``ceil(log2(1/p)) + 1`` fraction
    bits of the midpoint of its interval in the cumulative distribution.
    Symbols are processed in order of decreasing probability.
    """
    ordered = sorted(probs.items(), key=lambda item: item[1], reverse=True)
    cumulative = 0
    sfe_codewords = {}
    for symbol, prob in ordered:
        length = math.ceil(math.log2(1 / prob)) + 1
        # midpoint of [cumulative, cumulative + prob), truncated to `length` bits
        sfe_codewords[symbol] = float2bin(cumulative + prob / 2, length)
        cumulative += prob
    return sfe_codewords
def _print_symbol_table(codewords) -> None:
    """Print one symbol/codeword pair per line (tabs align the columns)."""
    for char in codewords:
        if char == ' ':
            char_name = "space\t\t"
        elif char == '\n':
            char_name = "line break\t"
        else:
            char_name = char + "\t\t\t"
        print(f"Symbol: {char_name} codeword: {codewords[char]}")


def _report(title, probs, codewords, show_table=False) -> None:
    """Print the stats and wrapped encoding of `text` under *codewords*."""
    print(f"\n{title}\n----")
    print(f"Average codeword length: {codeword_len(probs, codewords)}\n")
    if show_table:
        # pass show_table=True to also dump the per-symbol codeword table
        _print_symbol_table(codewords)
    print("\nEncoded:")
    for line in textwrap.wrap(encode(text, codewords), line_size):
        print(line)


def main() -> None:
    """Compute the entropy of `text`, then report its Huffman and
    Shannon-Fano-Elias encodings."""
    p = get_probs(text)
    print(f"\nH(X) = {entropy(p)}")
    _report("Huffman coding", p, generate_huffman_code(p))
    _report("Shannon-Fano-Elias coding", p, generate_sfe_code(p))
| if __name__ == "__main__": | |
| main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment