Created
January 28, 2025 18:18
-
-
Save ianhenrysmith/abc91d40345782d97bf071acada5fc23 to your computer and use it in GitHub Desktop.
Ruby utility script for splitting a large CSV file into several smaller ones (default size: 100k rows per file)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| require 'csv' | |
| require 'fileutils' | |
| require 'optparse' | |
| # example usage: ruby csv_splitter.rb ~/Downloads/accountant_sales_readiness_and_gusto_pro_jan28.csv | |
| # with options | |
| # ruby csv_splitter.rb -c 50000 -o output_folder <filename.csv> | |
# Splits a CSV file into numbered chunk files holding at most +chunk_size+
# data rows each.
#
# The first row of +input_file+ is treated as the header row. Output files are
# written to +output_dir+ (created if missing) and are named
# "chunk_<index>_of_<total>.csv". A progress line is printed to stdout after
# each file is written. If the input has no data rows, no files are written.
#
# @param input_file [String] path of the CSV file to split
# @param output_dir [String] directory that receives the chunk files
# @param chunk_size [Integer] maximum number of data rows per chunk file
# @param keep_headers [Boolean] whether each chunk file starts with the header row
# @return [nil]
# @raise [ArgumentError] if +chunk_size+ is not a positive Integer
def split_csv(input_file:, output_dir:, chunk_size: 100_000, keep_headers: true)
  unless chunk_size.is_a?(Integer) && chunk_size.positive?
    raise ArgumentError, "chunk_size must be a positive Integer, got #{chunk_size.inspect}"
  end

  FileUtils.mkdir_p(output_dir)

  # Counting pass. Rows are counted with the CSV parser rather than by physical
  # line, so quoted fields containing newlines (and blank lines, which the
  # parser skips) cannot skew the total used in the output file names. The
  # headers are captured from the first row here, which avoids loading the
  # whole file into memory just to read them.
  headers = nil
  total_rows = 0
  CSV.foreach(input_file, headers: true) do |row|
    headers ||= row.headers
    total_rows += 1
  end
  total_chunks = (total_rows + chunk_size - 1) / chunk_size # integer ceiling

  # Splitting pass: at most one chunk of rows is held in memory at a time.
  rows = CSV.foreach(input_file, headers: true)
  rows.each_slice(chunk_size).with_index(1) do |chunk, chunk_index|
    output_file = File.join(output_dir, "chunk_#{chunk_index}_of_#{total_chunks}.csv")

    CSV.open(output_file, 'w') do |csv|
      csv << headers if keep_headers
      chunk.each { |chunk_row| csv << chunk_row }
    end

    # Only a trailing partial chunk is reported as the "final chunk".
    label = chunk.size < chunk_size ? 'final chunk' : 'chunk'
    puts "Processed #{label} #{chunk_index} of #{total_chunks}"
  end

  nil
end
| # Parse command line arguments | |
# Command-line entry point; runs only when this file is executed directly.
# Parses the options, validates the arguments and delegates to split_csv.
# All usage errors are reported on stderr with exit status 1.
if __FILE__ == $PROGRAM_NAME
  options = {
    chunk_size: 100_000,
    keep_headers: true,
    output_dir: 'split_files'
  }

  parser = OptionParser.new do |opts|
    opts.banner = "Usage: ruby csv_splitter.rb [options] input_file"

    opts.on("-c", "--chunk-size SIZE", Integer, "Number of rows per file (default: 100000)") do |size|
      options[:chunk_size] = size
    end

    opts.on("-o", "--output-dir DIR", "Output directory (default: split_files)") do |dir|
      options[:output_dir] = dir
    end

    opts.on("-n", "--no-headers", "Don't include headers in output files") do
      options[:keep_headers] = false
    end

    opts.on("-h", "--help", "Show this help message") do
      puts opts
      exit
    end
  end

  # Report unknown options / malformed values as a short message instead of
  # letting OptionParser's exception surface as a stack trace.
  begin
    parser.parse!
  rescue OptionParser::ParseError => e
    abort "Error: #{e.message}\nUse --help for usage information"
  end

  abort "Error: Input file is required\nUse --help for usage information" if ARGV.empty?

  input_file = ARGV.first
  # Checked up front so a mistyped path fails before any output directory is created.
  abort "Error: Input file not found: #{input_file}" unless File.file?(input_file)
  abort "Error: Chunk size must be a positive integer" unless options[:chunk_size].positive?

  split_csv(
    input_file: input_file,
    output_dir: options[:output_dir],
    chunk_size: options[:chunk_size],
    keep_headers: options[:keep_headers]
  )
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment