Last active
June 2, 2025 18:51
-
-
Save SequentialDesign/fe289e240cbb0673f3fb64c1beab1210 to your computer and use it in GitHub Desktop.
word and phrase counter in Ruby
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| require "csv" | |
| require "progress_bar" | |
# Input corpus and the directory that receives the intermediate chunk files.
input_file = "movie_subtitles_en_new.txt"
output_dir = "split_files"

# Create the chunk directory on first run; reuse it on later runs.
Dir.exist?(output_dir) || Dir.mkdir(output_dir)

# configuration
# Phrases (n-grams) are counted for every length in MIN_PHRASE_LEN..MAX_PHRASE_LEN.
MIN_PHRASE_LEN = 2
MAX_PHRASE_LEN = 7
# A phrase must occur at least this many times to be written to the phrase CSVs.
MIN_PHRASE_COUNT = 2
# Read the whole input and erase any byte sequences that are not valid UTF-8.
# File.read is used instead of Kernel#open, which would treat a path starting
# with "|" as a command to run.
sanitized_input = File.read(input_file).encode("UTF-8", invalid: :replace, replace: "")

puts "splitting input file into sentences..."

# Normalization pipeline. The order matters: the case-sensitive substitutions
# ("OK", speaker labels) must run before the text is downcased below.
#
# NOTE(review): the first pattern matches from ANY capital letter through the
# next colon on the same line, so besides speaker labels such as "JOHN:" it can
# also remove ordinary text like "I said:". Confirm against the corpus format
# before tightening it.
text = sanitized_input
  .gsub(/[A-Z].*?:/, '')    # remove a capital letter and everything up to the next colon
  .gsub('--', '')           # remove double hyphens
  .gsub(' - ', ' ')         # drop a free-standing dash but keep the surrounding words apart
  .gsub(/\s*'\s*/, "'")     # fix inconsistent spacing around apostrophes
  .gsub(/\bOK\b/, 'okay')   # replace the stand-alone word "OK" only (not "LOOK", "BOOK", ...)
  .squeeze(' ')             # collapse runs of spaces
  .strip                    # remove leading/trailing whitespace
  .gsub(/\.{3}/, '')        # remove ellipses
  .gsub(/\++\$\++/, '')     # remove "+++$+++"-style field separators

# One entry per input line, lower-cased, with "all right" unified to "alright"
# (whole words only, so "small right" is left alone) and ¡ ¿ " removed.
sentences = text
  .downcase
  .gsub(/\ball right\b/, "alright")
  .delete('¡¿"')
  .split("\n")
# Split the sentences into fixed-size chunk files under output_dir.
chunk_size = 5_000

# Remove chunk files left over from a previous run; otherwise a shorter input
# would leave stale chunk_<n>.txt files behind in the directory.
Dir.children(output_dir).grep(/\Achunk_\d+\.txt\z/).each do |stale|
  File.delete(File.join(output_dir, stale))
end

# Ceiling division gives the exact number of chunks (the previous
# `size / 5_000 + 1` was one too many for exact multiples of the chunk size).
# Clamp to 1 so the bar never gets a zero total.
chunk_total = [(sentences.size + chunk_size - 1) / chunk_size, 1].max
bar = ProgressBar.new(chunk_total)

puts "creating chunk files..."
sentences.each_slice(chunk_size).with_index(1) do |chunk, number|
  File.write(File.join(output_dir, "chunk_#{number}.txt"), chunk.join("\n"))
  bar.increment!
end
# files to generate
word_csv = 'word_counts.csv'
word_clean_csv = 'word_counts_no_commas.csv'
phrase_csv = 'phrase_counts.csv'
phrase_clean_csv = 'phrase_counts_no_commas.csv'

# Seed every output file that does not exist yet with the header matching its
# content: word files get word/count, phrase files get length/phrase/count.
# (Previously word_csv was seeded with the phrase header and word_clean_csv
# was not seeded at all.)
word_header = %w[word count]
phrase_header = %w[length phrase count]

{
  word_csv => word_header,
  word_clean_csv => word_header,
  phrase_csv => phrase_header,
  phrase_clean_csv => phrase_header
}.each do |file, header|
  next if File.exist?(file)

  CSV.open(file, 'w') { |csv| csv << header }
end
puts "processing chunks..."

# Only read chunk_<n>.txt files, in numeric order. Anything else that happens
# to sit in the directory (e.g. .DS_Store) is ignored instead of being counted.
chunk_files = Dir.children(output_dir)
                 .grep(/\Achunk_\d+\.txt\z/)
                 .sort_by { |name| name[/\d+/].to_i }
main_bar = ProgressBar.new(chunk_files.size) # one tick per processed chunk

# initialize counters
word_counts = Hash.new(0)                                    # word => occurrences
phrase_counts = Hash.new { |h, k| h[k] = Hash.new(0) }       # length => { phrase => occurrences }
phrase_counts_clean = Hash.new { |h, k| h[k] = Hash.new(0) } # same, with . , ! ? stripped

# Adds every n-gram of `tokens` (for each configured length) to `counts`.
# each_cons yields nothing when there are fewer than n tokens, so no explicit
# size guard is needed.
count_phrases = lambda do |tokens, counts|
  (MIN_PHRASE_LEN..MAX_PHRASE_LEN).each do |n|
    tokens.each_cons(n) { |gram| counts[n][gram.join(' ')] += 1 }
  end
end

# NOTE(review): each chunk is tokenized as one continuous stream, so phrases
# can span line breaks inside a chunk but never span two chunks. Confirm
# whether phrases are meant to be limited to a single line.
chunk_files.each do |filename|
  text = File.read(File.join(output_dir, filename))

  # word processing
  tokens = text.split
  tokens.each { |word| word_counts[word] += 1 }

  # phrase processing - two versions
  # 1. original version with punctuation
  count_phrases.call(tokens, phrase_counts)

  # 2. clean version: . , ! ? are turned into spaces before tokenizing
  count_phrases.call(text.tr('.,!?', ' ').split, phrase_counts_clean)

  main_bar.increment!
end
# save word counts
puts "\nSaving word counts..."

# Writes a word/count CSV to `path`, most frequent words first.
write_word_csv = lambda do |path, counts|
  CSV.open(path, 'w') do |csv|
    csv << ['word', 'count']
    counts.sort_by { |_word, count| -count }.each { |word, count| csv << [word, count] }
  end
end

write_word_csv.call(word_csv, word_counts)

# save cleaned word counts: commas are removed and the counts of words that
# become identical are merged. Tokens that consisted only of commas are
# skipped so the output never contains an empty-string "word".
cleaned_words = Hash.new(0)
word_counts.each do |word, count|
  stripped = word.delete(',')
  next if stripped.empty?

  cleaned_words[stripped] += count
end

write_word_csv.call(word_clean_csv, cleaned_words)
# Writes one length/phrase/count CSV to `path`. For each configured phrase
# length, only phrases seen at least MIN_PHRASE_COUNT times are kept, ordered
# by descending count.
write_phrase_csv = lambda do |path, counts_by_length|
  CSV.open(path, 'w') do |csv|
    csv << ['length', 'phrase', 'count']
    (MIN_PHRASE_LEN..MAX_PHRASE_LEN).each do |n|
      frequent = counts_by_length[n].select { |_phrase, count| count >= MIN_PHRASE_COUNT }
      ranked = frequent.sort_by { |_phrase, count| -count }
      ranked.each { |phrase, count| csv << [n, phrase, count] }
    end
  end
end

# save phrase counts (with punctuation)
puts "saving phrase counts (original)..."
write_phrase_csv.call(phrase_csv, phrase_counts)

# save cleaned phrase counts (without punctuation)
puts "Saving phrase counts (no commas)..."
write_phrase_csv.call(phrase_clean_csv, phrase_counts_clean)
# final report
# Number of distinct phrases across all configured lengths (before the
# MIN_PHRASE_COUNT filter applied when writing the CSVs).
distinct_phrases = lambda do |counts_by_length|
  (MIN_PHRASE_LEN..MAX_PHRASE_LEN).sum { |n| counts_by_length[n].size }
end
total_phrases = distinct_phrases.call(phrase_counts)
total_phrases_clean = distinct_phrases.call(phrase_counts_clean)

puts "\nall done !"

# Each row is a label padded to a 30-character column followed by its detail.
[
  ["word counts:", "#{word_counts.size} unique words"],
  ["cleaned words:", "#{cleaned_words.size} merged entries"],
  ["phrase counts (original):", "#{total_phrases} phrases"],
  ["phrase counts (no commas):", "#{total_phrases_clean} phrases"],
  ["files created:", "#{word_csv}, #{word_clean_csv}, #{phrase_csv}, #{phrase_clean_csv}"]
].each { |label, detail| puts label.ljust(30) + detail }
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment