Skip to content

Instantly share code, notes, and snippets.

@wecsam
Created March 6, 2026 04:21
Show Gist options
  • Select an option

  • Save wecsam/1dba247adb6a2616404729fafd5c6333 to your computer and use it in GitHub Desktop.

Select an option

Save wecsam/1dba247adb6a2616404729fafd5c6333 to your computer and use it in GitHub Desktop.
Given a string of English words without spaces in between, inserts spaces where they can be inferred.
#!/usr/bin/env python3
import heapq
import requests
import sys
def get_words() -> frozenset[str]:
'''
Returns the set of valid words.
'''
return frozenset(
requests.get("https://github.com/dwyl/english-words/raw/refs/heads/master/words_alpha.txt").text.splitlines()
)
def infer_spaces(words_without_spaces: str, words: frozenset[str]) -> list[str]:
'''
Given a string of words without spaces in between, splits into words.
'''
# There is no point in checking a substring that is longer than the longest possible word.
# It will never be a word.
max_word_length = max(len(word) for word in words)
# To avoid duplicate work, track the indices in the original string at which we have already checked for words.
visited_positions = set[int]()
# For each index in the original string that is the end of a word, remember the index that is the start of the word.
start_of_word_ending_at_position = dict[int, int]()
# Use a max priority queue to prioritize finding larger words.
# `heapq` implements a min queue, but all the values inside will be negated.
negative_positions_of_starts_of_words = [0]
while negative_positions_of_starts_of_words:
start = -heapq.heappop(negative_positions_of_starts_of_words)
if start in visited_positions:
# This index has already been checked.
# It must have been added to the queue multiple times before it was popped.
continue
visited_positions.add(start)
if start == len(words_without_spaces):
break
for end in range(start + 1, min(len(words_without_spaces), start + max_word_length) + 1):
if end in visited_positions:
# There is no point in putting an index in the queue if it will get ignored later.
# Note that it is still possible for an index to be enqueued multiple times before it is popped.
continue
if words_without_spaces[start:end] in words:
# We found a word!
start_of_word_ending_at_position[end] = start
heapq.heappush(negative_positions_of_starts_of_words, -end)
if len(words_without_spaces) not in visited_positions:
raise ValueError("The string is not made exclusively of words.")
# Backtrack to get the list of words.
result = list[str]()
end = len(words_without_spaces)
while end != 0:
start = start_of_word_ending_at_position[end]
result.append(words_without_spaces[start:end])
end = start
result.reverse()
return result
if __name__ == "__main__":
if len(sys.argv) != 2:
print("Usage:", sys.argv[0], "[words without spaces]")
else:
print(*infer_spaces(sys.argv[1].lower(), words=get_words()))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment