|
from gensim.summarization import summarize |
|
import sys |
|
import os |
|
|
|
def load_file(filename):
    """Read a text file and return its contents as a list of lines.

    Args:
        filename: Path of the file to open in text mode.

    Returns:
        A list holding one string per line of the file, in file order.
        Line terminators are kept; an empty file yields an empty list.
    """
    with open(filename, 'r') as handle:
        # readlines() yields exactly what iterating the handle would.
        return handle.readlines()
|
|
|
# Module-level collection of human-readable messages about files whose
# summaries fall outside the accepted length; filled in by main().
errorList = []
|
|
|
def find_num_of_words(text):
    """Return the number of whitespace-separated words in ``text``.

    Words are counted with ``str.split()``, so any run of spaces, tabs or
    newlines acts as a single separator and leading/trailing whitespace is
    ignored. Counting only ``' '`` characters would under-count by one per
    line and miss words separated by newlines.

    Args:
        text: The string whose words should be counted.

    Returns:
        The word count as an int; ``0`` for an empty or whitespace-only
        string.
    """
    return len(text.split())
|
|
|
def main():
    """Summarize every file in an input folder and write the results out.

    Command line: ``<script> <input_dir> <output_dir>``

    For each regular file in ``input_dir`` the text is loaded with
    ``load_file``, summarized with gensim's ``summarize`` (target of 200
    words) and appended to ``<output_dir>/<filename>.txt``. Files whose
    summary length (per ``find_num_of_words``) is <= 50 or >= 200, or which
    ``summarize`` rejects with ``ValueError``, are recorded in the
    module-level ``errorList``. That list is printed at the end and written
    to ``<output_dir>/ERRORLIST.txt``, one entry per line.

    Exits with a usage message when fewer than two arguments are given.
    """
    if len(sys.argv) < 3:
        sys.exit('usage: {0} <input_dir> <output_dir>'.format(sys.argv[0]))

    # Input and output folders
    input_dir = sys.argv[1]
    output_dir = sys.argv[2]

    # Target summary size handed to summarize()
    word_count = 200

    # Take the listing once, up front, so files written during the run are
    # never picked up as inputs. Sub-directories are skipped because they
    # cannot be opened as text files.
    filenames = [
        name for name in os.listdir(input_dir)
        if os.path.isfile(os.path.join(input_dir, name))
    ]

    for index, filename in enumerate(filenames):
        # Load and summarize one file at a time to keep memory use bounded.
        text = ''.join(load_file(os.path.join(input_dir, filename)))

        try:
            summarized_text = summarize(text, word_count=word_count)
        except ValueError as exc:
            # One unsummarizable input should not abort the whole batch;
            # record it and move on.
            errorList.append(
                "{0} could not be summarized: {1}".format(filename, exc))
        else:
            len_of_words_summarized = find_num_of_words(summarized_text)
            if len_of_words_summarized <= 50 or len_of_words_summarized >= 200:
                error = "{0} has the length {1}".format(
                    filename, len_of_words_summarized)
                errorList.append(error)

            out_path = os.path.join(output_dir, '{0}.txt'.format(filename))
            # NOTE(review): append mode means re-running the script adds a
            # second copy of the summary to an existing file -- confirm this
            # is intended before switching to 'w'.
            with open(out_path, 'a') as out_file:
                out_file.write(summarized_text)

        # Single-line progress indicator (index of the file just handled).
        sys.stdout.write('{0}\r'.format(index))
        sys.stdout.flush()

    print('--------- SUMMARIZATION COMPLETED!!------------\n')
    print('--------- FOLLOWING ARE THE LIST OF ERROR FILEs-----\n')
    print(errorList)

    with open(os.path.join(output_dir, 'ERRORLIST.txt'), 'w') as report:
        # Terminate each entry with a newline so they do not run together.
        report.writelines(error + '\n' for error in errorList)

    print('----- END ------\n')
|
|
|
|
|
# Run the batch summarizer only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
|
|
|
|
|
|
|
|