Skip to content

Instantly share code, notes, and snippets.

@ianhenrysmith
Created January 28, 2025 18:18
Show Gist options
  • Select an option

  • Save ianhenrysmith/abc91d40345782d97bf071acada5fc23 to your computer and use it in GitHub Desktop.

Select an option

Save ianhenrysmith/abc91d40345782d97bf071acada5fc23 to your computer and use it in GitHub Desktop.
Ruby utility script for splitting a large CSV into several smaller ones (default size: 100k rows)
require 'csv'
require 'fileutils'
require 'optparse'
# example usage: ruby csv_splitter.rb ~/Downloads/accountant_sales_readiness_and_gusto_pro_jan28.csv
# with options
# ruby csv_splitter.rb -c 50000 -o output_folder <filename.csv>
# Splits a CSV file into numbered chunk files of at most +chunk_size+ data rows.
#
# Output files are named "chunk_<i>_of_<total>.csv" inside +output_dir+ (which
# is created if missing). The first line of the input is treated as the header
# row and is repeated at the top of every chunk unless +keep_headers+ is false.
#
# The input is streamed: it is read once to count records and once to write
# chunks, so at most one chunk of rows is held in memory at a time.
#
# @param input_file [String] path to the CSV file to split
# @param output_dir [String] directory that receives the chunk files
# @param chunk_size [Integer] maximum number of data rows per chunk (must be > 0)
# @param keep_headers [Boolean] whether to write the header row into each chunk
# @return [nil]
# @raise [ArgumentError] if +chunk_size+ is not a positive Integer
def split_csv(input_file:, output_dir:, chunk_size: 100_000, keep_headers: true)
  unless chunk_size.is_a?(Integer) && chunk_size.positive?
    raise ArgumentError, "chunk_size must be a positive integer (got #{chunk_size.inspect})"
  end

  FileUtils.mkdir_p(output_dir)

  # Only the first record is needed for the headers; do not load the whole file.
  headers = CSV.foreach(input_file).first || []

  # Count parsed CSV records rather than physical lines, so quoted fields that
  # contain newlines do not inflate the chunk total used in the file names.
  total_rows = CSV.foreach(input_file, headers: true).count
  total_chunks = (total_rows.to_f / chunk_size).ceil

  CSV.foreach(input_file, headers: true).each_slice(chunk_size).with_index(1) do |rows, chunk_index|
    output_file = File.join(output_dir, "chunk_#{chunk_index}_of_#{total_chunks}.csv")

    CSV.open(output_file, 'w') do |csv|
      csv << headers if keep_headers
      rows.each { |row| csv << row }
    end

    # A short slice can only be the trailing remainder of the file.
    label = rows.size < chunk_size ? 'final chunk' : 'chunk'
    puts "Processed #{label} #{chunk_index} of #{total_chunks}"
  end

  nil
end
# Parse command line arguments
# Command-line entry point: runs only when this file is executed directly,
# not when it is required from another script.
if __FILE__ == $PROGRAM_NAME
  options = {
    chunk_size: 100_000,
    keep_headers: true,
    output_dir: 'split_files'
  }

  parser = OptionParser.new do |opts|
    opts.banner = "Usage: ruby csv_splitter.rb [options] input_file"

    opts.on("-c", "--chunk-size SIZE", Integer, "Number of rows per file (default: 100000)") do |size|
      options[:chunk_size] = size
    end

    opts.on("-o", "--output-dir DIR", "Output directory (default: split_files)") do |dir|
      options[:output_dir] = dir
    end

    opts.on("-n", "--no-headers", "Don't include headers in output files") do
      options[:keep_headers] = false
    end

    opts.on("-h", "--help", "Show this help message") do
      puts opts
      exit
    end
  end

  # Report malformed or unknown options as a short message, not a backtrace.
  begin
    parser.parse!
  rescue OptionParser::ParseError => e
    warn "Error: #{e.message}"
    warn "Use --help for usage information"
    exit 1
  end

  if ARGV.empty?
    warn "Error: Input file is required"
    warn "Use --help for usage information"
    exit 1
  end

  input_file = ARGV.first

  # Fail before any output directory is created if the input is unusable.
  unless File.file?(input_file)
    warn "Error: Input file not found: #{input_file}"
    exit 1
  end

  unless options[:chunk_size].positive?
    warn "Error: Chunk size must be a positive integer"
    exit 1
  end

  split_csv(
    input_file: input_file,
    output_dir: options[:output_dir],
    chunk_size: options[:chunk_size],
    keep_headers: options[:keep_headers]
  )
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment