Created
March 10, 2026 18:55
-
-
Save stefanvermaas/3b5180a982c2a55f55a4019b6f7d0628 to your computer and use it in GitHub Desktop.
Automatically detect the column separator in a CSV file
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| require "csv" | |
class CSV
  # Detects which character separates the columns of a CSV file by sampling
  # the first lines of the file and counting candidate separators per line.
  class Separator
    # Separator returned when no candidate appears on every sampled line.
    DEFAULT_SEPARATOR = ","

    # Number of lines read from the start of the file for detection.
    SAMPLE_SIZE = 5

    # Minimum number of data rows (header excluded) needed for detection.
    MINIMUM_ROWS = 2

    # Base class for every error raised by the detector.
    class Error < StandardError; end

    # Raised when the file has too few non-blank rows to compare.
    class DetectionError < Error
      # The message is a constructor default (instead of a #message override)
      # so that `raise DetectionError, "custom"` keeps the custom text.
      def initialize(msg = "unable to detect the separator because there are not enough rows to be certain. At least two rows are required.")
        super
      end
    end

    # Raised when the sampled lines are all blank or the file has no lines.
    class EmptyFileError < Error
      def initialize(msg = "unable to detect the separator because the CSV file is empty. At least two rows are required.")
        super
      end
    end

    # Automatically detects the separator in the first SAMPLE_SIZE lines of a CSV file.
    #
    # A real separator should appear the **same number of times on every line**
    # (since each row has the same number of columns). Candidates with a
    # consistent, non-zero count across all sampled lines are preferred. When no
    # candidate is consistent (e.g. quoted fields containing the separator), the
    # detector falls back to candidates that merely appear on every line. When
    # nothing appears on every line, DEFAULT_SEPARATOR is returned.
    #
    # Candidates are counted with String#count, which treats its argument as a
    # set of characters, so each candidate should be a single character.
    #
    # @param file_path [Pathname, String] path of the CSV file to inspect
    # @param candidates [Array<String>] separators to try, in priority order (ties go to the earliest)
    # @param headers_included [Boolean] whether the first line is a header row
    # @return [String] The detected separator
    # @raise [EmptyFileError] when the sampled lines contain no non-blank line
    # @raise [DetectionError] when fewer than MINIMUM_ROWS data rows are available
    def self.detect(file_path, candidates: [ ",", ";", "\t", "|" ], headers_included: true)
      # Plain Ruby blank check: `present?` only exists when ActiveSupport is loaded.
      lines = File.foreach(file_path).first(SAMPLE_SIZE).reject { |line| line.strip.empty? }

      # If the CSV file is empty, we're unable to detect the separator.
      raise EmptyFileError if lines.empty?

      # At least MINIMUM_ROWS data rows are required; a header line does not count as a row.
      required_lines = MINIMUM_ROWS + (headers_included ? 1 : 0)
      raise DetectionError if lines.size < required_lines

      counts_by_candidate = candidates.to_h do |sep|
        [ sep, lines.map { |line| line.count(sep) } ]
      end

      # Candidates that occur at least once on every sampled line.
      present = counts_by_candidate.select { |_, counts| counts.all?(&:positive?) }

      # Of those, the ones whose count is identical on every line are the strong matches.
      consistent = present.select { |_, counts| counts.uniq.size == 1 }

      pool = consistent.empty? ? present : consistent
      best = pool.max_by { |_, counts| counts.first }

      best ? best.first : DEFAULT_SEPARATOR
    end
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment