paucus/a220.erl

## a220.erl
%%% assignment 2.20

% Indexing a file
% The aim of this exercise is to index a text file, by line number.
% We can think of the input being a list of text strings, and below we’ve provided an outline Erlang
% module that reads text files into this format, as well as a couple of example files to process.
% The output of the main function should be a list of entries consisting of a word
% and a list of the ranges of lines on which it occurs.
%
% For example, the entry
%
% { "foo" , [{3,5},{7,7},{11,13}] }
%
% means that the word "foo" occurs on lines 3, 4, 5, 7, 11, 12 and 13 in the file.
%
% To take the problem further, you might like to think about these ways of refining the solution.
% * Removing all short words (e.g. words of length less than 3) or all common words.
% (DONE)

% * Sorting the output so that the words occur in lexicographic order.
% (DONE)

% * Normalising the words so that capitalised ("Foo") and non capitalised versions ("foo") of a word are identified.
% (DONE)

% * Normalising so that common endings, plurals etc. are identified.
% * (Harder) Thinking how you could make the data representation more efficient than the one you first chose.
% This might be efficient for lookup only, or for both creation and lookup.

-module(a220).
-include_lib("eunit/include/eunit.hrl").
-created_by("Marcelo Ruiz Camauër").

-export([index/1,do/0,compact_index/1]).

%% index(Filename) -> [{Word, {[Line, ...]}}, ...]
%% Reads Filename through index:get_file_contents/1, indexes the returned
%% lines starting at line number 1, and returns the entries sorted by word.
%% Each entry pairs a lowercased word with the list of line numbers recorded
%% for it (the list is wrapped in a 1-tuple). The raw line numbers are NOT
%% collapsed into ranges here; compact_index/1 does that on a line list.
index(Filename) ->
    Lines = index:get_file_contents(Filename),
    lists:keysort(1, process_lines(Lines, 1, [])).

%% Walk the lines of the file, feeding each one to parse_words/3 together
%% with its line number, and thread the growing index through the walk.
%% Anything that is not a non-empty list (the end of the input, or an
%% argument that is not a list at all) ends the walk and yields the index
%% accumulated so far.
process_lines([Line | Remaining], LineNo, Index) ->
    process_lines(Remaining, LineNo + 1, parse_words(Line, LineNo, Index));
process_lines(_Done, _LineNo, Index) ->
    Index.

%% Split one line of text into words on the separator characters below and
%% pass them to insert_words/3, which records them against CurrentLine
%% (and is where common words get dropped).
parse_words(X, CurrentLine, IndexList) ->
    Separators = " .,;-:!?()/*$%#@&\n\t\\",
    insert_words(string:tokens(X, Separators), CurrentLine, IndexList).

%% Record every word of one line in the index and return the updated index.
%% Each word is lowercased first. Stop words are skipped. A word already in
%% the index gets LineNo appended to its line list (so a word occurring twice
%% on a line is listed twice); a new word is added at the end of the index
%% as {Word, {[LineNo]}}.
insert_words([], _LineNo, Index) ->
    Index;
insert_words([Token | Rest], LineNo, Index) ->
    Key = string:to_lower(Token),
    Stopwords = ["a","all","and","any","are","as","be","but","by","in","is","it","not","of","on","or","that","the","this","to"],
    NextIndex =
        case lists:member(Key, Stopwords) of
            true ->
                Index;
            false ->
                Entry =
                    case lists:keyfind(Key, 1, Index) of
                        false -> {Key, {[LineNo]}};
                        {Found, {Seen}} -> {Found, {Seen ++ [LineNo]}}
                    end,
                %% keystore/4 replaces an existing entry in place and
                %% appends at the end when the key is absent.
                lists:keystore(Key, 1, Index, Entry)
        end,
    insert_words(Rest, LineNo, NextIndex).

%% Collapse a list of line numbers into a list of {From, To} ranges.
%% A number equal to the end of the current range, or one greater than it,
%% extends that range; any other number closes it and starts a new one.
%% Example: [3,4,5,7,11,12,13] -> [{3,5},{7,7},{11,13}].
compact_index(Lines) ->
    lists:reverse(compact_index(Lines, [])).

%% Worker for compact_index/1: Closed holds the finished ranges, newest first.
compact_index([], Closed) ->
    Closed;
compact_index([Line | Tail], Closed) when is_integer(Line) ->
    %% A bare line number opens a single-line range.
    compact_index([{Line, Line} | Tail], Closed);
compact_index([{From, To}, Next | Tail], Closed) when Next =:= To; Next =:= To + 1 ->
    compact_index([{From, Next} | Tail], Closed);
compact_index([{_From, _To} = Range | Tail], Closed) ->
    compact_index(Tail, [Range | Closed]).

%% Render one {Word, Lines} entry as a flat string: the word, a space, then
%% its line ranges as produced by compact_index/1 and format_index/1.
format_index_entry({Word, Lines}) ->
    Ranges = compact_index(Lines),
    lists:flatten([Word, $\s, format_index(Ranges)]).

%% Render a list of {From, To} ranges as text, separated by commas.
format_index(I) ->
    Rendered = lists:map(fun fi/1, I),
    string:join(Rendered, ",").

%% Render a single range: a one-line range prints as just that line,
%% any other range prints as From-To.
fi({From, To}) when From =:= To ->
    io_lib:write(From);
fi({From, To}) ->
    [io_lib:write(From), $-, io_lib:write(To)].


%% Print every index entry on its own line (via format_index_entry/1) and
%% return the atom done. Entries must have the shape {Word, {Lines}}.
formatter(Entries) ->
    lists:foreach(
        fun({Word, {Lines}}) ->
            io:format("~p ~n", [format_index_entry({Word, Lines})])
        end,
        Entries
    ),
    done.

%% Demo entry point: index the Gettysburg address file and print the result.
do() ->
    formatter(index("gettysburg-address.txt")).
	%%% assignment 2.20

	% Indexing a file
	% The aim of this exercise is to index a text file, by line number.
	% We can think of the input being a list of text strings, and below we’ve provided an outline Erlang
	% module that reads text files into this format, as well as a couple of example files to process.
	% The output of the main function should be a list of entries consisting of a word
	% and a list of the ranges of lines on which it occurs.
	%
	% For example, the entry
	%
	% { "foo" , [{3,5},{7,7},{11,13}] }
	%
	% means that the word "foo" occurs on lines 3, 4, 5, 7, 11, 12 and 13 in the file.
	%
	% To take the problem further, you might like to think about these ways of refining the solution.
	% * Removing all short words (e.g. words of length less than 3) or all common words.
	% (DONE)

	% * Sorting the output so that the words occur in lexicographic order.
	% (DONE)

	% * Normalising the words so that capitalised ("Foo") and non capitalised versions ("foo") of a word are identified.
	% (DONE)

	% * Normalising so that common endings, plurals etc. are identified.
	% * (Harder) Thinking how you could make the data representation more efficient than the one you first chose.
	% This might be efficient for lookup only, or for both creation and lookup.

	-module(a220).
	-include_lib("eunit/include/eunit.hrl").
	-created_by("Marcelo Ruiz Camauër").

	-export([index/1,do/0,compact_index/1]).

	%% index(Filename) -> [{Word, {[Line, ...]}}, ...]
	%% Reads Filename through index:get_file_contents/1, indexes the returned
	%% lines starting at line number 1, and returns the entries sorted by word.
	%% Each entry pairs a lowercased word with the list of line numbers recorded
	%% for it (the list is wrapped in a 1-tuple). The raw line numbers are NOT
	%% collapsed into ranges here; compact_index/1 does that on a line list.
	index(Filename) ->
	    Lines = index:get_file_contents(Filename),
	    lists:keysort(1, process_lines(Lines, 1, [])).

	%% Walk the lines of the file, feeding each one to parse_words/3 together
	%% with its line number, and thread the growing index through the walk.
	%% Anything that is not a non-empty list (the end of the input, or an
	%% argument that is not a list at all) ends the walk and yields the index
	%% accumulated so far.
	%% (The list pattern uses a plain `|`; the backslash-escaped form that was
	%% here is not valid Erlang.)
	process_lines([Line | Remaining], LineNo, Index) ->
	    process_lines(Remaining, LineNo + 1, parse_words(Line, LineNo, Index));
	process_lines(_Done, _LineNo, Index) ->
	    Index.

	%% Split one line of text into words on the separator characters below and
	%% pass them to insert_words/3, which records them against CurrentLine
	%% (and is where common words get dropped).
	parse_words(X, CurrentLine, IndexList) ->
	    Separators = " .,;-:!?()/*$%#@&\n\t\\",
	    insert_words(string:tokens(X, Separators), CurrentLine, IndexList).

	%% Record every word of one line in the index and return the updated index.
	%% Each word is lowercased first. Stop words are skipped. A word already in
	%% the index gets LineNo appended to its line list (so a word occurring twice
	%% on a line is listed twice); a new word is added at the end of the index
	%% as {Word, {[LineNo]}}.
	insert_words([], _LineNo, Index) ->
	    Index;
	insert_words([Token | Rest], LineNo, Index) ->
	    Key = string:to_lower(Token),
	    Stopwords = ["a","all","and","any","are","as","be","but","by","in","is","it","not","of","on","or","that","the","this","to"],
	    NextIndex =
	        case lists:member(Key, Stopwords) of
	            true ->
	                Index;
	            false ->
	                Entry =
	                    case lists:keyfind(Key, 1, Index) of
	                        false -> {Key, {[LineNo]}};
	                        {Found, {Seen}} -> {Found, {Seen ++ [LineNo]}}
	                    end,
	                %% keystore/4 replaces an existing entry in place and
	                %% appends at the end when the key is absent.
	                lists:keystore(Key, 1, Index, Entry)
	        end,
	    insert_words(Rest, LineNo, NextIndex).

	%% Collapse a list of line numbers into a list of {From, To} ranges.
	%% A number equal to the end of the current range, or one greater than it,
	%% extends that range; any other number closes it and starts a new one.
	%% Example: [3,4,5,7,11,12,13] -> [{3,5},{7,7},{11,13}].
	compact_index(Lines) ->
	    lists:reverse(compact_index(Lines, [])).

	%% Worker for compact_index/1: Closed holds the finished ranges, newest first.
	compact_index([], Closed) ->
	    Closed;
	compact_index([Line | Tail], Closed) when is_integer(Line) ->
	    %% A bare line number opens a single-line range.
	    compact_index([{Line, Line} | Tail], Closed);
	compact_index([{From, To}, Next | Tail], Closed) when Next =:= To; Next =:= To + 1 ->
	    compact_index([{From, Next} | Tail], Closed);
	compact_index([{_From, _To} = Range | Tail], Closed) ->
	    compact_index(Tail, [Range | Closed]).

	%% Render one {Word, Lines} entry as a flat string: the word, a space, then
	%% its line ranges as produced by compact_index/1 and format_index/1.
	format_index_entry({Word, Lines}) ->
	    Ranges = compact_index(Lines),
	    lists:flatten([Word, $\s, format_index(Ranges)]).

	%% Render a list of {From, To} ranges as text, separated by commas.
	format_index(I) ->
	    Rendered = lists:map(fun fi/1, I),
	    string:join(Rendered, ",").

	%% Render a single range: a one-line range prints as just that line,
	%% any other range prints as From-To.
	fi({From, To}) when From =:= To ->
	    io_lib:write(From);
	fi({From, To}) ->
	    [io_lib:write(From), $-, io_lib:write(To)].



	%% Print every index entry on its own line (via format_index_entry/1) and
	%% return the atom done. Entries must have the shape {Word, {Lines}}.
	formatter(Entries) ->
	    lists:foreach(
	        fun({Word, {Lines}}) ->
	            io:format("~p ~n", [format_index_entry({Word, Lines})])
	        end,
	        Entries
	    ),
	    done.

	%% Demo entry point: index the Gettysburg address file and print the result.
	do() ->
	    formatter(index("gettysburg-address.txt")).
No results found