Created
April 28, 2014 17:15
-
-
Save isayev/11378249 to your computer and use it in GitHub Desktop.
Splits a CSV file into pieces; returns the list of files created and the number of files (current_piece)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import csv | |
| import os | |
| import pprint | |
| import datetime | |
| # splits() splits a CSV file; it returns the list of files created and the number of files (current_piece) | |
| import csv | |
def splits(filehandler, delimiter=',', row_limit=100000,
           output_name_template='output_%s.csv', output_path='.', keep_headers=True):
    """
    Split a CSV file into multiple numbered pieces.

    Arguments:

        `filehandler`: An open file (or any iterable of lines) holding the CSV input.
        `delimiter`: The field delimiter used for both reading and writing.
        `row_limit`: The number of data rows you want in each output file.
            100,000 by default.
        `output_name_template`: A %s-style template for the numbered output files.
        `output_path`: Where to stick the output files.
        `keep_headers`: Whether or not to treat the first input row as headers
            and print it at the top of each output file.

    Returns a tuple `(created_files, current_piece)`: the list of output paths
    in creation order and the number of the last piece written.  The path of
    every piece is printed as it is created.  The first piece is always
    created, even when the input holds no rows at all.
    """
    created_files = []
    reader = csv.reader(filehandler, delimiter=delimiter)
    current_piece = 1
    current_limit = row_limit
    headers = None

    def open_piece(piece_number):
        # Create the output file for one piece, announce it and record it.
        path = os.path.join(output_path, output_name_template % piece_number)
        handle = open(path, 'w')
        print(path)
        created_files.append(path)
        return handle

    current_out_file = open_piece(current_piece)
    try:
        current_out_writer = csv.writer(current_out_file, delimiter=delimiter)
        if keep_headers:
            # next() with a default works on Python 2 and 3 and tolerates an
            # empty input instead of leaking StopIteration to the caller.
            headers = next(reader, None)
            if headers is not None:
                current_out_writer.writerow(headers)
        for i, row in enumerate(reader):
            if i + 1 > current_limit:
                # Finish the current piece before starting the next one so
                # that its buffered rows are flushed and the handle released.
                current_out_file.close()
                current_piece += 1
                current_limit = row_limit * current_piece
                current_out_file = open_piece(current_piece)
                current_out_writer = csv.writer(current_out_file, delimiter=delimiter)
                if headers is not None:
                    current_out_writer.writerow(headers)
            current_out_writer.writerow(row)
    finally:
        # Always close the last piece, even if reading or writing failed.
        current_out_file.close()
    return created_files, current_piece
| ''' | |
| This module is a simple utility for working with a single csv file, | |
| or multiple csv files in a directory. | |
| You can instantiate a csv file by simply doing | |
| >>>import csvgoo2 | |
| >>> c = CsvFile('sample.csv') | |
| and then grab the data by doing | |
| >>>c.fill() | |
| The CsvFile object only grabs the data when you explicitly call the fill() method | |
| in order to save memory. You can also use the empty() method to clear the data once | |
| it has been filled. | |
| ''' | |
| #CsvFile USAGE | |
| #c = CsvFile('sample.csv') | |
| #print c.columns | |
| #c.fill() | |
class CsvFile(object):
    """A single CSV file whose rows are loaded only on demand.

    Attributes:
        name: The path given to the constructor.
        data: The rows as read by ``csv.DictReader`` (empty until ``fill()``).
        columns: The header names in file order (empty until ``fill()``).
    """

    def __init__(self, name):
        """Remember the file path; nothing is read until ``fill()`` is called."""
        self.name = name
        self.data = []
        self.columns = []

    def __str__(self):
        """Return the file path."""
        return self.name

    def empty(self):
        """Drop the loaded rows; ``columns`` is left untouched."""
        self.data = []

    def fill(self):
        """Read the whole file into ``data`` and its header into ``columns``.

        A file with a header but no data rows yields ``data == []`` and the
        header names in ``columns``; a completely empty file yields two empty
        lists.
        """
        import sys

        # 'rU' requested universal newlines on Python 2.  Python 3 text mode
        # does that by default and rejects the 'U' flag from 3.11 onwards.
        mode = 'r' if sys.version_info[0] >= 3 else 'rU'
        with open(self.name, mode) as handle:
            reader = csv.DictReader(handle)
            # fieldnames comes from the header row, so it is available even
            # when there are no data rows, and it keeps the file's order.
            self.columns = list(reader.fieldnames or [])
            self.data = list(reader)
| ''' | |
| The Repo class is similar to the CsvFile object, | |
| but lets you gather all csv files in a directory at once. | |
| The get() method just instantiates a CsvFile object. | |
| ''' | |
class Repo(object):
    """All ``.csv`` files of one directory, each wrapped in a CsvFile.

    Attributes:
        items: One CsvFile per regular file in the directory whose name ends
            in ``.csv`` (case-insensitive), in ``os.listdir`` order.
    """

    def __init__(self, pathdir='.', fillall=False):
        """Collect the CSV files found directly inside `pathdir`.

        Arguments:
            `pathdir`: The directory to scan (not recursive).
            `fillall`: When true, load every file's data right away;
                otherwise the CsvFile objects start out empty.
        """
        self.items = []
        for entry in os.listdir(pathdir):
            if not entry.lower().endswith('.csv'):
                continue
            # Resolve the entry against pathdir so that directories other
            # than the current one work; normpath keeps a bare file name
            # when pathdir is '.'.
            path = os.path.normpath(os.path.join(pathdir, entry))
            if os.path.isfile(path):
                self.items.append(CsvFile(path))
        if fillall:
            self.fill_all()

    def fill_all(self):
        """Load the data of every collected CsvFile."""
        for item in self.items:
            item.fill()

    def get(self, filea):
        """Return a new, unfilled CsvFile for the path `filea`."""
        return CsvFile(str(filea))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment