Skip to content

Instantly share code, notes, and snippets.

@isayev
Created April 28, 2014 17:15
Show Gist options
  • Select an option

  • Save isayev/11378249 to your computer and use it in GitHub Desktop.

Select an option

Save isayev/11378249 to your computer and use it in GitHub Desktop.
splits file returns the files splitted and the number of files(current_piece)
import csv
import datetime
import os
import pprint
import sys
# splits() splits a CSV file and returns the list of files created and the number of files (current_piece)
import csv
def _open_csv_for_writing(path):
    """Open `path` for writing in the mode the csv module expects."""
    if sys.version_info[0] >= 3:
        # Python 3: newline='' lets csv.writer control line endings itself;
        # without it, Windows output gains an extra carriage return per row.
        return open(path, 'w', newline='')
    # Python 2: the built-in open() has no `newline` argument; keep the
    # mode this function has always used.
    return open(path, 'w')


def splits(filehandler, delimiter=',', row_limit=100000,
           output_name_template='output_%s.csv', output_path='.',
           keep_headers=True):
    """
    Splits a CSV file into multiple pieces.

    Arguments:
        `filehandler`: An open file (or any iterable of lines) handed to
            csv.reader as the input to split.
        `delimiter`: Field delimiter used for both reading and writing.
        `row_limit`: The number of data rows you want in each output file.
            100,000 by default.
        `output_name_template`: A %s-style template for the numbered output
            files; it is formatted with the 1-based piece number.
        `output_path`: Where to stick the output files.
        `keep_headers`: Whether or not to print the headers in each output
            file. When true, the first input row is treated as the header.

    Returns:
        A tuple `(created_files, current_piece)`: the list of output paths
        in creation order and the number of pieces written.

    The first output file is always created, even when the input holds no
    rows. Each output path is printed as its file is created.
    """
    reader = csv.reader(filehandler, delimiter=delimiter)

    # next(reader, None) works on Python 2 and 3 and turns an empty input
    # into "no header" instead of a StopIteration escaping to the caller.
    headers = next(reader, None) if keep_headers else None

    created_files = []
    current_piece = 1
    current_limit = row_limit
    current_out_path = os.path.join(
        output_path,
        output_name_template % current_piece
    )
    current_out_file = _open_csv_for_writing(current_out_path)
    try:
        current_out_writer = csv.writer(current_out_file, delimiter=delimiter)
        if headers is not None:
            current_out_writer.writerow(headers)
        print(current_out_path)
        created_files.append(current_out_path)

        for i, row in enumerate(reader):
            if i + 1 > current_limit:
                # The current piece is full: close it before opening the
                # next one so only one output handle is open at a time.
                current_out_file.close()
                current_piece += 1
                current_limit = row_limit * current_piece
                current_out_path = os.path.join(
                    output_path,
                    output_name_template % current_piece
                )
                current_out_file = _open_csv_for_writing(current_out_path)
                current_out_writer = csv.writer(current_out_file,
                                                delimiter=delimiter)
                print(current_out_path)
                created_files.append(current_out_path)
                if headers is not None:
                    current_out_writer.writerow(headers)
            current_out_writer.writerow(row)
    finally:
        # Closes (and flushes) the last piece, also when an error interrupts
        # the loop; closing an already-closed file is a no-op.
        current_out_file.close()

    return created_files, current_piece
'''
This module is a simple utility for working with a single csv file,
or multiple csv files in a directory.
You can instantiate a csv file by simply doing
>>> from csvgoo2 import CsvFile
>>> c = CsvFile('sample.csv')
and then grab the data by doing
>>> c.fill()
The CsvFile object only grabs the data when you explicitly call the fill() method
in order to save memory. You can also use the empty() method to clear the data once
it's been filled.
'''
#CsvFile USAGE
#c = CsvFile('sample.csv')
#print c.columns
#c.fill()
class CsvFile(object):
    """A single CSV file whose rows are loaded only on request.

    Attributes:
        name: Path of the CSV file, exactly as given to the constructor.
        data: List of row dicts (one per data row); empty until fill()
            is called and again after empty().
        columns: List of column names taken from the header row; empty
            until fill() is called.
    """

    def __init__(self, name):
        self.name = name
        self.data = []
        self.columns = []

    def __str__(self):
        return self.name

    def empty(self):
        """Drop the loaded rows; `columns` is left as it is."""
        self.data = []

    def fill(self):
        """Read the whole file into `data` and its header into `columns`."""
        if sys.version_info[0] >= 3:
            # Python 3: the csv module wants newline='' so it can handle
            # line endings itself; the old 'U' mode was removed in 3.11.
            handle = open(self.name, 'r', newline='')
        else:
            # Python 2: universal-newline mode, as this class always used.
            handle = open(self.name, 'rU')
        try:
            reader = csv.DictReader(handle)
            self.data = list(reader)
            # Take the columns from the header itself, so a file with a
            # header but no data rows works and the order matches the file.
            self.columns = list(reader.fieldnames or [])
        finally:
            handle.close()
'''
The Repo class is similar to the CsvFile object,
but lets you gather all csv files in a directory at once.
The get() method just instantiates a CsvFile object.
'''
class Repo(object):
    """A collection of CsvFile objects, one per .csv file in a directory.

    Attributes:
        items: List of CsvFile objects for the regular files in `pathdir`
            whose names end in '.csv' (case-insensitive), in the order
            os.listdir() returns them.
    """

    def __init__(self, pathdir='.', fillall=False):
        """Collect the CSV files in `pathdir`.

        Arguments:
            `pathdir`: Directory to scan; the current directory by default.
            `fillall`: When true, load every file's data immediately;
                otherwise the items stay empty until filled.
        """
        self.items = []
        for f in os.listdir(pathdir):
            if not f.lower().endswith('.csv'):
                continue
            # os.listdir() yields bare names, so they must be joined with
            # pathdir to be tested and opened from any working directory.
            # For the default '.' the bare name is kept so item names stay
            # what they have always been.
            path = f if pathdir == os.curdir else os.path.join(pathdir, f)
            if os.path.isfile(path):
                self.items.append(CsvFile(path))
        if fillall:
            self.fill_all()

    def fill_all(self):
        """Load the data of every item; `items` keeps its CsvFile objects."""
        for item in self.items:
            item.fill()

    def get(self, filea):
        """Return a new, unfilled CsvFile for `filea` (converted with str())."""
        return CsvFile(str(filea))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment