Created
March 6, 2026 04:21
-
-
Save wecsam/1dba247adb6a2616404729fafd5c6333 to your computer and use it in GitHub Desktop.
Given a string of English words without spaces in between, inserts spaces where they can be inferred.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| import heapq | |
| import requests | |
| import sys | |
| def get_words() -> frozenset[str]: | |
| ''' | |
| Returns the set of valid words. | |
| ''' | |
| return frozenset( | |
| requests.get("https://github.com/dwyl/english-words/raw/refs/heads/master/words_alpha.txt").text.splitlines() | |
| ) | |
| def infer_spaces(words_without_spaces: str, words: frozenset[str]) -> list[str]: | |
| ''' | |
| Given a string of words without spaces in between, splits into words. | |
| ''' | |
| # There is no point in checking a substring that is longer than the longest possible word. | |
| # It will never be a word. | |
| max_word_length = max(len(word) for word in words) | |
| # To avoid duplicate work, track the indices in the original string at which we have already checked for words. | |
| visited_positions = set[int]() | |
| # For each index in the original string that is the end of a word, remember the index that is the start of the word. | |
| start_of_word_ending_at_position = dict[int, int]() | |
| # Use a max priority queue to prioritize finding larger words. | |
| # `heapq` implements a min queue, but all the values inside will be negated. | |
| negative_positions_of_starts_of_words = [0] | |
| while negative_positions_of_starts_of_words: | |
| start = -heapq.heappop(negative_positions_of_starts_of_words) | |
| if start in visited_positions: | |
| # This index has already been checked. | |
| # It must have been added to the queue multiple times before it was popped. | |
| continue | |
| visited_positions.add(start) | |
| if start == len(words_without_spaces): | |
| break | |
| for end in range(start + 1, min(len(words_without_spaces), start + max_word_length) + 1): | |
| if end in visited_positions: | |
| # There is no point in putting an index in the queue if it will get ignored later. | |
| # Note that it is still possible for an index to be enqueued multiple times before it is popped. | |
| continue | |
| if words_without_spaces[start:end] in words: | |
| # We found a word! | |
| start_of_word_ending_at_position[end] = start | |
| heapq.heappush(negative_positions_of_starts_of_words, -end) | |
| if len(words_without_spaces) not in visited_positions: | |
| raise ValueError("The string is not made exclusively of words.") | |
| # Backtrack to get the list of words. | |
| result = list[str]() | |
| end = len(words_without_spaces) | |
| while end != 0: | |
| start = start_of_word_ending_at_position[end] | |
| result.append(words_without_spaces[start:end]) | |
| end = start | |
| result.reverse() | |
| return result | |
| if __name__ == "__main__": | |
| if len(sys.argv) != 2: | |
| print("Usage:", sys.argv[0], "[words without spaces]") | |
| else: | |
| print(*infer_spaces(sys.argv[1].lower(), words=get_words())) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment