wecsam/infer_spaces.py

## infer_spaces.py
#!/usr/bin/env python3
import heapq
import requests
import sys

def get_words() -> frozenset[str]:
    '''
    Returns the set of valid words.
    '''
    return frozenset(
        requests.get("https://github.com/dwyl/english-words/raw/refs/heads/master/words_alpha.txt").text.splitlines()
    )

def infer_spaces(words_without_spaces: str, words: frozenset[str]) -> list[str]:
    '''
    Given a string of words without spaces in between, splits into words.
    '''
    # There is no point in checking a substring that is longer than the longest possible word.
    # It will never be a word.
    max_word_length = max(len(word) for word in words)

    # To avoid duplicate work, track the indices in the original string at which we have already checked for words.
    visited_positions = set[int]()

    # For each index in the original string that is the end of a word, remember the index that is the start of the word.
    start_of_word_ending_at_position = dict[int, int]()

    # Use a max priority queue to prioritize finding larger words.
    # `heapq` implements a min queue, but all the values inside will be negated.
    negative_positions_of_starts_of_words = [0]
    while negative_positions_of_starts_of_words:
        start = -heapq.heappop(negative_positions_of_starts_of_words)
        if start in visited_positions:
            # This index has already been checked.
            # It must have been added to the queue multiple times before it was popped.
            continue

        visited_positions.add(start)
        if start == len(words_without_spaces):
            break

        for end in range(start + 1, min(len(words_without_spaces), start + max_word_length) + 1):
            if end in visited_positions:
                # There is no point in putting an index in the queue if it will get ignored later.
                # Note that it is still possible for an index to be enqueued multiple times before it is popped.
                continue

            if words_without_spaces[start:end] in words:
                # We found a word!
                start_of_word_ending_at_position[end] = start
                heapq.heappush(negative_positions_of_starts_of_words, -end)

    if len(words_without_spaces) not in visited_positions:
        raise ValueError("The string is not made exclusively of words.")

    # Backtrack to get the list of words.
    result = list[str]()
    end = len(words_without_spaces)
    while end != 0:
        start = start_of_word_ending_at_position[end]
        result.append(words_without_spaces[start:end])
        end = start
    result.reverse()
    return result

if __name__ == "__main__":
    if len(sys.argv) != 2:
        print("Usage:", sys.argv[0], "[words without spaces]")
    else:
        print(*infer_spaces(sys.argv[1].lower(), words=get_words()))
	#!/usr/bin/env python3
	import heapq
	import requests
	import sys

	def get_words() -> frozenset[str]:
	'''
	Returns the set of valid words.
	'''
	return frozenset(
	requests.get("https://github.com/dwyl/english-words/raw/refs/heads/master/words_alpha.txt").text.splitlines()
	)

	def infer_spaces(words_without_spaces: str, words: frozenset[str]) -> list[str]:
	'''
	Given a string of words without spaces in between, splits into words.
	'''
	# There is no point in checking a substring that is longer than the longest possible word.
	# It will never be a word.
	max_word_length = max(len(word) for word in words)

	# To avoid duplicate work, track the indices in the original string at which we have already checked for words.
	visited_positions = set[int]()

	# For each index in the original string that is the end of a word, remember the index that is the start of the word.
	start_of_word_ending_at_position = dict[int, int]()

	# Use a max priority queue to prioritize finding larger words.
	# `heapq` implements a min queue, but all the values inside will be negated.
	negative_positions_of_starts_of_words = [0]
	while negative_positions_of_starts_of_words:
	start = -heapq.heappop(negative_positions_of_starts_of_words)
	if start in visited_positions:
	# This index has already been checked.
	# It must have been added to the queue multiple times before it was popped.
	continue

	visited_positions.add(start)
	if start == len(words_without_spaces):
	break

	for end in range(start + 1, min(len(words_without_spaces), start + max_word_length) + 1):
	if end in visited_positions:
	# There is no point in putting an index in the queue if it will get ignored later.
	# Note that it is still possible for an index to be enqueued multiple times before it is popped.
	continue

	if words_without_spaces[start:end] in words:
	# We found a word!
	start_of_word_ending_at_position[end] = start
	heapq.heappush(negative_positions_of_starts_of_words, -end)

	if len(words_without_spaces) not in visited_positions:
	raise ValueError("The string is not made exclusively of words.")

	# Backtrack to get the list of words.
	result = list[str]()
	end = len(words_without_spaces)
	while end != 0:
	start = start_of_word_ending_at_position[end]
	result.append(words_without_spaces[start:end])
	end = start
	result.reverse()
	return result

	if __name__ == "__main__":
	if len(sys.argv) != 2:
	print("Usage:", sys.argv[0], "[words without spaces]")
	else:
	print(*infer_spaces(sys.argv[1].lower(), words=get_words()))
No results found