Skip to content

Instantly share code, notes, and snippets.

@paucus
Created March 13, 2017 02:30
Show Gist options
  • Select an option

  • Save paucus/00177fdb5b77f476e6711665085e9391 to your computer and use it in GitHub Desktop.

Select an option

Save paucus/00177fdb5b77f476e6711665085e9391 to your computer and use it in GitHub Desktop.
%%% assignment 2.20
% Indexing a file
% The aim of this exercise is to index a text file, by line number.
% We can think of the input being a list of text strings, and below we’ve provided an outline Erlang
% module that reads text files into this format, as well as a couple of example files to process.
% The output of the main function should be a list of entries consisting of a word
% and a list of the ranges of lines on which it occurs.
%
% For example, the entry
%
% { "foo" , [{3,5},{7,7},{11,13}] }
%
% means that the word "foo" occurs on lines 3, 4, 5, 7, 11, 12 and 13 in the file.
%
% To take the problem further, you might like to think about these ways of refining the solution.
% * Removing all short words (e.g. words of length less than 3) or all common words.
% (DONE)
% * Sorting the output so that the words occur in lexicographic order.
% (DONE)
% * Normalising the words so that capitalised ("Foo") and non capitalised versions ("foo") of a word are identified.
% (DONE)
% * Normalising so that common endings, plurals etc. are identified.
% * (Harder) Thinking how you could make the data representation more efficient than the one you first chose.
% This might be efficient for lookup only, or for both creation and lookup.
-module(a220).
-include_lib("eunit/include/eunit.hrl").
-created_by("Marcelo Ruiz Camauër").
-export([index/1,do/0,compact_index/1]).
%% index(Filename) -> [{Word, {LineNumbers}}, ...]
%% Reads the file through index:get_file_contents/1 (external module; presumably
%% it returns the file as a list of line strings -- confirm against that module),
%% indexes every line starting at line number 1, and returns the entries sorted
%% by word.
%% Each entry is {Word, {Lines}}: Word is the lower-cased word and Lines is the
%% list of line numbers on which it was seen, in the order encountered (a line
%% number repeats when the word occurs more than once on that line).
%% Use compact_index/1 to turn a Lines list into {From, To} ranges.
index(Filename) ->
    Lines = index:get_file_contents(Filename),
    Unsorted = process_lines(Lines, 1, []),
    lists:keysort(1, Unsorted).
%% process_lines(Lines, LineNo, Index) -> Index'
%% Walks the lines one by one, feeding each line together with its line number
%% to parse_words/3 and threading the growing index through the recursion.
%% When the first argument is anything other than a non-empty list (the empty
%% list included), the index accumulated so far is returned unchanged.
process_lines([Line | Remaining], LineNo, Index) ->
    Updated = parse_words(Line, LineNo, Index),
    process_lines(Remaining, LineNo + 1, Updated);
process_lines(_NothingLeft, _LineNo, Index) ->
    Index.
%% parse_words(Line, CurrentLine, IndexList) -> IndexList'
%% Splits one line into words on whitespace and punctuation separators, then
%% hands the words to insert_words/3, which drops common words and records the
%% rest against CurrentLine.
parse_words(X, CurrentLine, IndexList) ->
    Separators = " .,;-:!?()/*$%#@&\n\t\\",
    insert_words(string:tokens(X, Separators), CurrentLine, IndexList).
%% insert_words(Words, CurrentLine, IndexList) -> IndexList'
%% Records every word of Words as occurring on CurrentLine.
%% Words are lower-cased first; words found in the stop-word list are skipped.
%% A word not yet in the index is appended as {Word, {[CurrentLine]}}; for a
%% word already present, CurrentLine is appended to its existing line list
%% (so a word seen twice on one line gets that line number twice).
insert_words([], _CurrentLine, IndexList) ->
    IndexList;
insert_words([Word | Remaining], CurrentLine, IndexList) ->
    Stopwords = ["a","all","and","any","are","as","be","but","by","in","is","it","not","of","on","or","that","the","this","to"],
    Key = string:to_lower(Word),
    Updated =
        case lists:member(Key, Stopwords) of
            true ->
                % common word: leave the index untouched
                IndexList;
            false ->
                case lists:keyfind(Key, 1, IndexList) of
                    false ->
                        % first sighting: new entry goes at the end of the index
                        lists:append(IndexList, [{Key, {[CurrentLine]}}]);
                    {Found, {Lines}} ->
                        % already indexed: extend its line list in place
                        Entry = {Found, {lists:append(Lines, [CurrentLine])}},
                        lists:keyreplace(Key, 1, IndexList, Entry)
                end
        end,
    insert_words(Remaining, CurrentLine, Updated).
%% compact_index(Lines) -> [{From, To}, ...]
%% Collapses a list of line numbers into inclusive ranges, e.g.
%% [3,4,5,7,11,12,13] -> [{3,5},{7,7},{11,13}].
%% An element extends the current range when it equals the range's end or is
%% exactly one greater; otherwise it opens a new range. The input is expected
%% in ascending order (no sorting is done here). Elements that are already
%% {From, To} tuples are accepted as ranges.
compact_index(Lines) ->
    Step =
        fun(Next, [{From, To} | Closed]) when Next =:= To; Next =:= To + 1 ->
                % repeated or consecutive value: stretch the open range
                [{From, Next} | Closed];
           (Next, Ranges) when is_integer(Next) ->
                % gap (or first element): open a fresh single-line range
                [{Next, Next} | Ranges];
           ({_, _} = Range, Ranges) ->
                [Range | Ranges]
        end,
    % the open range is kept at the head, so the result is built in reverse
    lists:reverse(lists:foldl(Step, [], Lines)).
%% format_index_entry({Word, Lines}) -> string()
%% Renders one index entry as a flat string: the word, a single space, then
%% the line ranges produced by compact_index/1 and formatted by format_index/1.
format_index_entry({E, I}) ->
    Ranges = compact_index(I),
    Rendered = format_index(Ranges),
    lists:flatten([E, " ", Rendered]).
%% format_index(Ranges) -> chardata
%% Formats each {From, To} range with fi/1 and joins the pieces with commas.
format_index(I) ->
    Pieces = lists:map(fun fi/1, I),
    string:join(Pieces, ",").
%% fi({From, To}) -> chardata
%% Formats one range: a single value when both ends are identical,
%% otherwise "From-To".
fi({From, To}) ->
    Start = io_lib:write(From),
    case From =:= To of
        true -> Start;
        false -> [Start, $-, io_lib:write(To)]
    end.
%% formatter(Index) -> done
%% Prints every {Word, {Lines}} entry of the index, one per output line, using
%% format_index_entry/1 for the text. Returns the atom done.
formatter(Entries) ->
    PrintEntry =
        fun({Word, {Lines}}) ->
            io:format("~p ~n", [format_index_entry({Word, Lines})])
        end,
    lists:foreach(PrintEntry, Entries),
    done.
%% do() -> done
%% Convenience entry point: indexes "gettysburg-address.txt" (resolved by
%% index:get_file_contents/1 via index/1) and prints the formatted index.
do() ->
    formatter(index("gettysburg-address.txt")).
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment