Created
March 13, 2017 02:30
-
-
Save paucus/00177fdb5b77f476e6711665085e9391 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| %%% assignment 2.20 | |
| % Indexing a file | |
| % The aim of this exercise is to index a text file, by line number. | |
| % We can think of the input being a list of text strings, and below we’ve provided an outline Erlang | |
| % module that reads text files into this format, as well as a couple of example files to process. | |
| % The output of the main function should be a list of entries consisting of a word | |
| % and a list of the ranges of lines on which it occurs. | |
| % | |
| % For example, the entry | |
| % | |
| % { "foo" , [{3,5},{7,7},{11,13}] } | |
| % | |
| % means that the word "foo" occurs on lines 3, 4, 5, 7, 11, 12 and 13 in the file. | |
| % | |
| % To take the problem further, you might like to think about these ways of refining the solution. | |
| % * Removing all short words (e.g. words of length less than 3) or all common words. | |
| % (DONE) | |
| % * Sorting the output so that the words occur in lexicographic order. | |
| % (DONE) | |
| % * Normalising the words so that capitalised ("Foo") and non capitalised versions ("foo") of a word are identified. | |
| % (DONE) | |
| % * Normalising so that common endings, plurals etc. are identified. | |
| % * (Harder) Thinking how you could make the data representation more efficient than the one you first chose. | |
| % This might be efficient for lookup only, or for both creation and lookup. | |
| -module(a220). | |
| -include_lib("eunit/include/eunit.hrl"). | |
| -created_by("Marcelo Ruiz Camauër"). | |
| -export([index/1,do/0,compact_index/1]). | |
%% @doc Build a word index for the text file Filename.
%%
%% The file is loaded with index:get_file_contents/1 (presumably a list of
%% line strings -- confirm against that module) and its lines are numbered
%% starting at 1.
%%
%% Returns a list of {Word, {Lines}} entries sorted on Word, where Lines is
%% the list of line numbers recorded for that word. Lines is NOT yet
%% collapsed into ranges; pass it to compact_index/1 to obtain
%% [{From, To}, ...] such as [{3,5},{7,7},{11,13}].
index(Filename) ->
    Lines = index:get_file_contents(Filename),
    lists:keysort(1, process_lines(Lines, 1, [])).
%% Walk the lines of the file in order, indexing the words of each line
%% under its line number, and return the accumulated index.
%%
%% LineNo is the number of the line at the head of the list.
process_lines([Line | Rest], LineNo, Index) ->
    process_lines(Rest, LineNo + 1, parse_words(Line, LineNo, Index));
process_lines([], _LineNo, Index) ->
    %% no lines left: the index is complete
    Index;
process_lines(_NotAList, _LineNo, Index) ->
    %% any other input is treated like "nothing left to process"
    Index.
%% Split one line of text into words on whitespace and punctuation, then
%% record those words against CurrentLine in IndexList.
%% Case folding and stop-word removal happen in insert_words/3.
parse_words(X, CurrentLine, IndexList) ->
    Separators = " .,;-:!?()/*$%#@&\n\t\\",
    insert_words(string:tokens(X, Separators), CurrentLine, IndexList).
%% Record every word of Words as occurring on CurrentLine and return the
%% updated index.
%%
%% Each word is lower-cased first, so "Foo" and "foo" share one entry, and
%% words in the stop list are skipped. Index entries have the shape
%% {Word, {Lines}}:
%%   * a word not yet in the index is appended at the end of the index;
%%   * a known word gets CurrentLine appended to its Lines, so a word that
%%     appears twice on one line lists that line twice.
insert_words(Words, CurrentLine, IndexList) ->
    lists:foldl(
        fun(Word, Index) ->
            index_word(string:to_lower(Word), CurrentLine, Index)
        end,
        IndexList,
        Words
    ).

%% Add one (already lower-cased) word to the index unless it is a stop word.
%% The index lookup is only performed for words that are kept.
index_word(Word, Line, Index) ->
    case is_stopword(Word) of
        true -> Index;
        false -> add_occurrence(lists:keyfind(Word, 1, Index), Word, Line, Index)
    end.

%% Apply the result of the index lookup.
%% Appending (rather than prepending) keeps entries in first-seen order and
%% line numbers in ascending order, which compact_index/1 relies on.
add_occurrence(false, Word, Line, Index) ->
    Index ++ [{Word, {[Line]}}];
add_occurrence({Key, {Lines}}, Word, Line, Index) ->
    lists:keyreplace(Word, 1, Index, {Key, {Lines ++ [Line]}}).

%% True for common words that are left out of the index.
is_stopword(Word) ->
    lists:member(Word, [
        "a", "all", "and", "any", "are", "as", "be", "but", "by", "in",
        "is", "it", "not", "of", "on", "or", "that", "the", "this", "to"
    ]).
%% @doc Collapse a list of line numbers into inclusive {From, To} ranges.
%%
%% A number extends the range before it when it equals that range's end
%% (a repeated line) or is exactly one greater (the next line); otherwise
%% it starts a new range. Elements that are already {From, To} tuples are
%% accepted as ranges.
%%
%% Example: [3,4,5,7,11,12,13] gives [{3,5},{7,7},{11,13}].
compact_index(Lines) ->
    merge_ranges(Lines, []).

%% Done holds the finished ranges, most recent first.
merge_ranges([], Done) ->
    lists:reverse(Done);
merge_ranges([Line | Tail], Done) when is_integer(Line) ->
    %% a bare line number becomes a one-line range
    merge_ranges([{Line, Line} | Tail], Done);
merge_ranges([{From, To}, Next | Tail], Done) when Next =:= To; Next =:= To + 1 ->
    %% Next repeats or directly follows the open range: stretch it
    merge_ranges([{From, Next} | Tail], Done);
merge_ranges([{_, _} = Range | Tail], Done) ->
    %% nothing more can join this range: close it
    merge_ranges(Tail, [Range | Done]).
%% Render one index entry as a flat string: the word, a space, then its
%% compacted line ranges -- e.g. {"foo", [3,4,5,7]} becomes "foo 3-5,7".
format_index_entry({E, I}) ->
    Ranges = format_index(compact_index(I)),
    lists:flatten([E, " ", Ranges]).
%% Render a list of {From, To} ranges as comma-separated text.
%% The result may be a nested character list; callers flatten it.
format_index(I) ->
    Rendered = lists:map(fun fi/1, I),
    string:join(Rendered, ",").
%% Render a single range: "N" when it covers one line, "From-To" otherwise.
%% The two-line form is returned as a nested character list.
fi({From, To}) when From =:= To ->
    io_lib:write(From);
fi({From, To}) ->
    [io_lib:write(From), $-, io_lib:write(To)].
%% Print every {Word, {Lines}} index entry on its own line, in list order,
%% and return the atom done.
formatter(Entries) ->
    lists:foreach(
        fun({Word, {Lines}}) ->
            io:format("~p ~n", [format_index_entry({Word, Lines})])
        end,
        Entries
    ),
    done.
%% @doc Demo entry point: index "gettysburg-address.txt" (looked up relative
%% to the current working directory) and print the formatted index.
do() ->
    formatter(index("gettysburg-address.txt")).
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment