pepkin88/spell.ls

## spell.ls
### Preface ###

# This is a translation to LiveScript of Peter's Norvig script
# for correcting the misspelled words: http://norvig.com/spell-correct.html
# I was trying to minimize the line count, while preserving a relative readability.
# The variable names are analogous to those in the original snippet, as of 2018-02-06.
# All of the code works in LiveScript v1.5.

# Copyright (c) 2018 Marek Pepke
# MIT license: www.opensource.org/licenses/mit-license.php


### Pure code ###

# This submition has only 13 lines of code.
words = (.to-lower-case!match /[a-z]+/g)
Counter = (.reduce _, Object.create null <| (obj, v) -> obj => ..[v] = ~~..[v] + 1)
WORDS = Counter words require(\fs)read-file-sync \big.txt \utf8
edits = (word, depth = 0) -> []
	for i to word.length
		[L, right] = [word.slice 0 i; word.slice i]
		for R in (if depth then edits right, depth - 1 else [right])
			..push L + R.1 + R.0 + R.slice 2 if R.length >= 2
			for c in ['' ...[\a to \z]]
				..push L + c + R, L + c + R.slice 1
known = (.filter (WORDS.) => return unless ..length)
candidates = (word) -> known [word] or known edits word or known edits word, 1 or [word]
correction = (word) -> candidates word.to-lower-case! .reduce (a, b) -> if WORDS[a] > WORDS[b] then a else b


### Annotated code ###

# A basic knowledge of LiveScript and JavaScript is required.
# LiveScript reference: http://livescript.net
# JavaScript reference: https://developer.mozilla.org/en-US/docs/Web/JavaScript

# Using kebab-case, to show a bit of a flavor; equivalent to camelCase.
# Using `/[a-z]+/g` instead of `/\w+/g` to disallow numbers and underscores in words.
words = (.to-lower-case!match /[a-z]+/g)

# A substitute for Python's Counter class.
# Using immediately executed partially applied function, to reorder arguments of `Array::reduce`.
# Building a dictionary on top of an object with no prototype (`Object.create null`),
# to ensure no unwanted side effects, while checking the existence of certain words.
# Using cascade for `obj`, so that `obj` gets returned at the end.
# Using `~~` to coerce non-existent fields to 0.
Counter = (.reduce _, Object.create null <| (obj, v) -> obj => ..[v] = ~~..[v] + 1)

# Using raw word-counts instead of probability, because scale here is not important.
# Using Node.js environment, reading big.txt, from http://norvig.com/big.txt
WORDS = Counter words require(\fs)read-file-sync \big.txt \utf8

# Using a recursive approach for more than one edits away from the desired word.
# Decided not to use `Set`s, as they are tricky to iterate over in the current version of LiveScript.
# Also, the presumed performance loss because of that is unnoticeable.
edits = (word, depth = 0) -> [] # at the end, returning this array, by using cascade
	for i to word.length
		[L, right] = [word.slice 0 i; word.slice i]
		# Recursing over `right`, instead of over `word`
		# - shorter cases, yet full coverage, thus increasing the performance.
		for R in (if depth then edits right, depth - 1 else [right])
			..push L + R.1 + R.0 + R.slice 2 if R.length >= 2 # transpose case
			# Including an empty string in the letter array allows adding deletes to the edits list.
			for c in ['' ...[\a to \z]]
				..push L + c + R, L + c + R.slice 1 # delete, replace and insert cases

# Short-returning a `void` (`undefined`), unless the array after filtering is not empty.
# I would have used `for..when..else`, but currently it is not working like I would have hoped for.
known = (.filter (WORDS.) => return unless ..length)

# Because `known` can return `void`, `or` can work as desired.
candidates = (word) -> known [word] or known edits word or known edits word, 1 or [word]

# Mo need for coercing `WORDS[x]` to an integer,
# as it words from `candidates` are almost always known words, so they exist in `WORDS`;
# The one case, where a candidate is not a known word,
# will be from a single-item array, which won't trigger `reduce`'s callback.
correction = (word) -> candidates word.to-lower-case! .reduce (a, b) -> if WORDS[a] > WORDS[b] then a else b


### Alternative `edits` function ###

# A different version of the `edits` function, incorporating regular expressions,
# which allows for a more clear display of certain word manipulation.
edits = (original-word, depth = 0) -> []
	# This time recursing over whole words, because they won't be split into 2 pieces.
	for word in (if depth then edits original-word, depth - 1 else [original-word])
		for i to word.length
			# Replacing `what` fragment at position of `i` characters from the end.
			# Using cascade for an easy access, acting as a temporary variable.
			((what, with-what) -> ..push word.replace //#what(?=.{#i}$)// with-what)
				.. '(.)(.)' \$2$1 # transpose case
				.. \. '' # delete case
				for c in [\a to \z]
					.. \. c # replace cases
					.. '' c # insert cases


### Minified code ###

# A golfed version of the script, sacrificed the performance to the compactness.
# Some corners were cut, to reach such a low size,
# although the script should work in the same way as the non-abridged version.
# All of this could be in only one line, but I have rearranged the code over a few lines for readability,
# thanks to the fact, that a `;` is the same length as a new line character,
# and a `=>` is the same length as a new line + a tab.
# The `c` function stands for `correction`, it's the main function.
# It is assumed, that a lowercase word is passed to the `c` function.
# More characters could be shaved off, by changing the name of the `big.txt` file,
# changing `/[^a-z]+/` to `/\W+/`, but thus allowing numbers and underscores in the words,
# or by preloading the `fs` module or using functions of a preloaded `Prelude.ls` library,
# as those options are provided by the `lsc` executable.

# Weighs only 344 bytes, as of 2018-02-06.
W=(require(\fs)readFileSync(\big.txt \utf8)toLowerCase!/ /[^a-z]+/)reduce (->it=>-~=..[&1]),{}
e=->[]=>(if&1=>e it,&1-1 else[it])map ->s=it~slice;for,i in 1+it
	(f=->..[*]=s(0 i)+it+s i+&1) [s(i)1]+[s(i)0],2;f '' 1;[\a to\z]map ->[it,it]map f
k=(.filter (W.)=>return if!..0)
c=(w)->(k([w])||k(e w)||k(e w,1)||[w])reduce ->|W[&1]>W[&0]=>&1|_=>&0


### Annotated minified code ###

# Using a split (`/`) operator, in place of the `match` method. Possible empty strings
# at the beginning and the end of the array, but harmless, because of the low weight in the `W` object.
# `-~=..[&1]` - intcasting increment, an unary assignment, which increments the variable,
# equivalent to `..[&1] = -~..[&1]`. An opposite number to a bitwise negation of a value is equal to
# that value incremented by 1. Utilizing the casting to integer (intcasting) side effect, for `void` values.
# Using an ordinary object (`{}`) for the dictionary, instead of `Object.create null`.
# This causes no actual side effects, because:
#   - the only values in the prototype chain are functions,
#     the only all-lowercase function name is `constructor`;
#   - intcasting a function creates 0, which is desirable;
#   - the value is stored in the object itself, not the prototype;
#   - accessing `constructor` results in getting a proper value for this word;
#   - accessing e.g. `tostring` results in getting `void`;
#   - accessing e.g. `toString` breaks the "passing only lowercase arguments to the `c` function" assumption.
W=(require(\fs)readFileSync(\big.txt \utf8)toLowerCase!/ /[^a-z]+/)reduce (->it=>-~=..[&1]),{}

# `for,i in 1+it` - iterating over an extended string by a one character, but using only the index,
# but in reality the mechanism is bugged and this is compiled to
# `for (i$ = 0, len$ = 1 + it.length; i$ < len$; ++i$)`, which will work too.
# Using `..[*]=x` as an equivalent to `..push x`, 1 byte less.
# Safeguarding with `[x]`:
#   1. `[x]+[y]` causes `[x]toString! + [y]toString!`;
#   2. `['string']toString!` evaluates to `'string'`;
#   3. `[void]toString!` evaluates to `''`.
# Although `to` can be a variable name, in which case `to\z` would become `to.z`,
# the `to` in `[\a to\z]` is treated as a keyword, because is in a special context of square brackets,
# therefore it's possible to omit a space there and save 1 character.
# Using `[\a to\z]map ->` instead of a 1-char shorter `for[\a to\z]=>`, because of reusing of the `x$` variable
# by the compiler, if the cascade operator `..` is in the scope of a new function. Possibly a bug.
# `[it,it]map f` is equivalent to `f it,0;f it,1`, 1 byte shorter.
# Cannot use `f it;f it,1`, because `i+&1` is not safeguarded by intcasting, which itself costs 2 bytes.
e=->[]=>(if&1=>e it,&1-1 else[it])map ->s=it~slice;for,i in 1+it
	(f=->..[*]=s(0 i)+it+s i+&1) [s(i)1]+[s(i)0],2;f '' 1;[\a to\z]map ->[it,it]map f

# Checking the truthiness of the first item, instead of checking of the length of the array.
# The only bad case is, when an empty string is a first item, which either is harmless or won't happen.
k=(.filter (W.)=>return if!..0)

# Declaring and using `w` is shorter than using `it`/`&0`.
# `k x` is shorter than `k(x)`, but that requires using the `or` keyword,
# which cannot be easily glued to other symbols, like `||` can.
# Using an implicit switch notation, instead of an `if` expression. Saves 2 bytes.
# Putting `&0` in the else clause, as a safeguard for the condition evaluating to `NaN`. Probably unnecessary.
c=(w)->(k([w])||k(e w)||k(e w,1)||[w])reduce ->|W[&1]>W[&0]=>&1|_=>&0
	### Preface ###

	# This is a translation to LiveScript of Peter's Norvig script
	# for correcting the misspelled words: http://norvig.com/spell-correct.html
	# I was trying to minimize the line count, while preserving a relative readability.
	# The variable names are analogous to those in the original snippet, as of 2018-02-06.
	# All of the code works in LiveScript v1.5.

	# Copyright (c) 2018 Marek Pepke
	# MIT license: www.opensource.org/licenses/mit-license.php



	### Pure code ###

	# This submition has only 13 lines of code.
	words = (.to-lower-case!match /[a-z]+/g)
	Counter = (.reduce _, Object.create null <\| (obj, v) -> obj => ..[v] = ~~..[v] + 1)
	WORDS = Counter words require(\fs)read-file-sync \big.txt \utf8
	edits = (word, depth = 0) -> []
	for i to word.length
	[L, right] = [word.slice 0 i; word.slice i]
	for R in (if depth then edits right, depth - 1 else [right])
	..push L + R.1 + R.0 + R.slice 2 if R.length >= 2
	for c in ['' ...[\a to \z]]
	..push L + c + R, L + c + R.slice 1
	known = (.filter (WORDS.) => return unless ..length)
	candidates = (word) -> known [word] or known edits word or known edits word, 1 or [word]
	correction = (word) -> candidates word.to-lower-case! .reduce (a, b) -> if WORDS[a] > WORDS[b] then a else b



	### Annotated code ###

	# A basic knowledge of LiveScript and JavaScript is required.
	# LiveScript reference: http://livescript.net
	# JavaScript reference: https://developer.mozilla.org/en-US/docs/Web/JavaScript

	# Using kebab-case, to show a bit of a flavor; equivalent to camelCase.
	# Using `/[a-z]+/g` instead of `/\w+/g` to disallow numbers and underscores in words.
	words = (.to-lower-case!match /[a-z]+/g)

	# A substitute for Python's Counter class.
	# Using immediately executed partially applied function, to reorder arguments of `Array::reduce`.
	# Building a dictionary on top of an object with no prototype (`Object.create null`),
	# to ensure no unwanted side effects, while checking the existence of certain words.
	# Using cascade for `obj`, so that `obj` gets returned at the end.
	# Using `~~` to coerce non-existent fields to 0.
	Counter = (.reduce _, Object.create null <\| (obj, v) -> obj => ..[v] = ~~..[v] + 1)

	# Using raw word-counts instead of probability, because scale here is not important.
	# Using Node.js environment, reading big.txt, from http://norvig.com/big.txt
	WORDS = Counter words require(\fs)read-file-sync \big.txt \utf8

	# Using a recursive approach for more than one edits away from the desired word.
	# Decided not to use `Set`s, as they are tricky to iterate over in the current version of LiveScript.
	# Also, the presumed performance loss because of that is unnoticeable.
	edits = (word, depth = 0) -> [] # at the end, returning this array, by using cascade
	for i to word.length
	[L, right] = [word.slice 0 i; word.slice i]
	# Recursing over `right`, instead of over `word`
	# - shorter cases, yet full coverage, thus increasing the performance.
	for R in (if depth then edits right, depth - 1 else [right])
	..push L + R.1 + R.0 + R.slice 2 if R.length >= 2 # transpose case
	# Including an empty string in the letter array allows adding deletes to the edits list.
	for c in ['' ...[\a to \z]]
	..push L + c + R, L + c + R.slice 1 # delete, replace and insert cases

	# Short-returning a `void` (`undefined`), unless the array after filtering is not empty.
	# I would have used `for..when..else`, but currently it is not working like I would have hoped for.
	known = (.filter (WORDS.) => return unless ..length)

	# Because `known` can return `void`, `or` can work as desired.
	candidates = (word) -> known [word] or known edits word or known edits word, 1 or [word]

	# Mo need for coercing `WORDS[x]` to an integer,
	# as it words from `candidates` are almost always known words, so they exist in `WORDS`;
	# The one case, where a candidate is not a known word,
	# will be from a single-item array, which won't trigger `reduce`'s callback.
	correction = (word) -> candidates word.to-lower-case! .reduce (a, b) -> if WORDS[a] > WORDS[b] then a else b



	### Alternative `edits` function ###

	# A different version of the `edits` function, incorporating regular expressions,
	# which allows for a more clear display of certain word manipulation.
	edits = (original-word, depth = 0) -> []
	# This time recursing over whole words, because they won't be split into 2 pieces.
	for word in (if depth then edits original-word, depth - 1 else [original-word])
	for i to word.length
	# Replacing `what` fragment at position of `i` characters from the end.
	# Using cascade for an easy access, acting as a temporary variable.
	((what, with-what) -> ..push word.replace //#what(?=.{#i}$)// with-what)
	.. '(.)(.)' \$2$1 # transpose case
	.. \. '' # delete case
	for c in [\a to \z]
	.. \. c # replace cases
	.. '' c # insert cases



	### Minified code ###

	# A golfed version of the script, sacrificed the performance to the compactness.
	# Some corners were cut, to reach such a low size,
	# although the script should work in the same way as the non-abridged version.
	# All of this could be in only one line, but I have rearranged the code over a few lines for readability,
	# thanks to the fact, that a `;` is the same length as a new line character,
	# and a `=>` is the same length as a new line + a tab.
	# The `c` function stands for `correction`, it's the main function.
	# It is assumed, that a lowercase word is passed to the `c` function.
	# More characters could be shaved off, by changing the name of the `big.txt` file,
	# changing `/[^a-z]+/` to `/\W+/`, but thus allowing numbers and underscores in the words,
	# or by preloading the `fs` module or using functions of a preloaded `Prelude.ls` library,
	# as those options are provided by the `lsc` executable.

	# Weighs only 344 bytes, as of 2018-02-06.
	W=(require(\fs)readFileSync(\big.txt \utf8)toLowerCase!/ /[^a-z]+/)reduce (->it=>-~=..[&1]),{}
	e=->[]=>(if&1=>e it,&1-1 else[it])map ->s=it~slice;for,i in 1+it
	(f=->..[*]=s(0 i)+it+s i+&1) [s(i)1]+[s(i)0],2;f '' 1;[\a to\z]map ->[it,it]map f
	k=(.filter (W.)=>return if!..0)
	c=(w)->(k([w])\|\|k(e w)\|\|k(e w,1)\|\|[w])reduce ->\|W[&1]>W[&0]=>&1\|_=>&0



	### Annotated minified code ###

	# Using a split (`/`) operator, in place of the `match` method. Possible empty strings
	# at the beginning and the end of the array, but harmless, because of the low weight in the `W` object.
	# `-~=..[&1]` - intcasting increment, an unary assignment, which increments the variable,
	# equivalent to `..[&1] = -~..[&1]`. An opposite number to a bitwise negation of a value is equal to
	# that value incremented by 1. Utilizing the casting to integer (intcasting) side effect, for `void` values.
	# Using an ordinary object (`{}`) for the dictionary, instead of `Object.create null`.
	# This causes no actual side effects, because:
	# - the only values in the prototype chain are functions,
	# the only all-lowercase function name is `constructor`;
	# - intcasting a function creates 0, which is desirable;
	# - the value is stored in the object itself, not the prototype;
	# - accessing `constructor` results in getting a proper value for this word;
	# - accessing e.g. `tostring` results in getting `void`;
	# - accessing e.g. `toString` breaks the "passing only lowercase arguments to the `c` function" assumption.
	W=(require(\fs)readFileSync(\big.txt \utf8)toLowerCase!/ /[^a-z]+/)reduce (->it=>-~=..[&1]),{}

	# `for,i in 1+it` - iterating over an extended string by a one character, but using only the index,
	# but in reality the mechanism is bugged and this is compiled to
	# `for (i$ = 0, len$ = 1 + it.length; i$ < len$; ++i$)`, which will work too.
	# Using `..[*]=x` as an equivalent to `..push x`, 1 byte less.
	# Safeguarding with `[x]`:
	# 1. `[x]+[y]` causes `[x]toString! + [y]toString!`;
	# 2. `['string']toString!` evaluates to `'string'`;
	# 3. `[void]toString!` evaluates to `''`.
	# Although `to` can be a variable name, in which case `to\z` would become `to.z`,
	# the `to` in `[\a to\z]` is treated as a keyword, because is in a special context of square brackets,
	# therefore it's possible to omit a space there and save 1 character.
	# Using `[\a to\z]map ->` instead of a 1-char shorter `for[\a to\z]=>`, because of reusing of the `x$` variable
	# by the compiler, if the cascade operator `..` is in the scope of a new function. Possibly a bug.
	# `[it,it]map f` is equivalent to `f it,0;f it,1`, 1 byte shorter.
	# Cannot use `f it;f it,1`, because `i+&1` is not safeguarded by intcasting, which itself costs 2 bytes.
	e=->[]=>(if&1=>e it,&1-1 else[it])map ->s=it~slice;for,i in 1+it
	(f=->..[*]=s(0 i)+it+s i+&1) [s(i)1]+[s(i)0],2;f '' 1;[\a to\z]map ->[it,it]map f

	# Checking the truthiness of the first item, instead of checking of the length of the array.
	# The only bad case is, when an empty string is a first item, which either is harmless or won't happen.
	k=(.filter (W.)=>return if!..0)

	# Declaring and using `w` is shorter than using `it`/`&0`.
	# `k x` is shorter than `k(x)`, but that requires using the `or` keyword,
	# which cannot be easily glued to other symbols, like `\|\|` can.
	# Using an implicit switch notation, instead of an `if` expression. Saves 2 bytes.
	# Putting `&0` in the else clause, as a safeguard for the condition evaluating to `NaN`. Probably unnecessary.
	c=(w)->(k([w])\|\|k(e w)\|\|k(e w,1)\|\|[w])reduce ->\|W[&1]>W[&0]=>&1\|_=>&0
No results found