-
-
Save conradlee/1331132 to your computer and use it in GitHub Desktop.
| #!/usr/bin/env python | |
| import os | |
| import sys | |
| import subprocess | |
| import optparse | |
| import tempfile | |
| # Special feature: can convert files so large that they | |
| # don't fit in memory. Works for weighted/unweighted, | |
| # directed/undirected edges. | |
def _shell_quote(path):
    """Return path single-quoted so a POSIX shell treats it as one literal word."""
    return "'" + path.replace("'", "'\\''") + "'"


def edgelist_to_pajek(input_filename, output_filename="", directed=False, weighted=False, buffer_size=500):
    """
    Convert a whitespace-separated edgelist file into Pajek format.

    Each line of the input file describes one edge:
        node1ID node2ID [weight]
    Node IDs are parsed with int(); the weight column is parsed with
    float() and is only read when ``weighted`` is True. Lines that contain
    only whitespace are skipped.

    The list of unique node IDs is produced by an external
    awk | sort | uniq pipeline writing to a temporary file, so the edge
    list itself is never held in memory; only the mapping from node ID to
    Pajek index is.

    Arguments:
        input_filename:  path of the edgelist file to read.
        output_filename: path of the Pajek file to write; when empty the
                         result is written to stdout.
        directed:        write an "*Arcs" section instead of "*Edges".
        weighted:        require and emit a third (weight) column.
        buffer_size:     value passed to sort's --buffer-size, in megabytes.

    Raises:
        ValueError: if an edge line cannot be parsed (the message names
                    the offending line number and its content).
        Exception:  propagated from run_command if the shell pipeline fails.
    """
    # Sort out I/O
    if output_filename:
        output_file = open(output_filename, "w")
    else:
        output_file = sys.stdout
    try:
        node_idx_map = {}
        # Write vertices section and produce map from original nodeIDs to
        # contiguous integer ids that start from one.
        with Tempfile() as unique_nodes_file:
            quoted_nodes_path = _shell_quote(unique_nodes_file.name)
            # "NF >= 2" keeps blank or truncated lines from contributing
            # empty node IDs; such lines are reported (or skipped, when
            # blank) by the edge pass below.
            unique_nodes_command = (
                "<%s awk 'NF >= 2 { print $1; print $2; }' | sort -n --buffer-size=%dM | uniq >%s"
                % (_shell_quote(input_filename), buffer_size, quoted_nodes_path))
            run_command(unique_nodes_command)
            # wc prints "<count> <filename>"; only the count is needed.
            num_nodes = int(run_command("wc -l %s" % quoted_nodes_path).split()[0])
            output_file.write("*Vertices\t%d\n" % num_nodes)
            with open(unique_nodes_file.name) as nodes_file:
                for idx, line in enumerate(nodes_file):
                    node_id = int(line.rstrip("\n"))
                    pajek_idx = idx + 1  # Pajek indexing starts with 1
                    output_file.write('\t%d "%d"\n' % (pajek_idx, node_id))
                    node_idx_map[node_id] = pajek_idx

        # Now write edges
        if directed:
            output_file.write("*Arcs\n")
        else:
            output_file.write("*Edges\n")
        with open(input_filename) as input_file:
            for i, line in enumerate(input_file):
                fields = line.split()
                if not fields:
                    continue  # whitespace-only line: nothing to convert
                try:
                    if weighted:
                        n1, n2, weight = fields
                        output_file.write("\t%d\t%d\t%0.6f\n" % (node_idx_map[int(n1)],
                                                                 node_idx_map[int(n2)],
                                                                 float(weight)))
                    else:
                        n1, n2 = map(int, fields[:2])
                        output_file.write("\t%d\t%d\n" % (node_idx_map[n1],
                                                          node_idx_map[n2]))
                except ValueError:
                    raise ValueError("Problem parsing input file on line %d, which reads: \n\t%s\nIf you selected the -w option for weighted edegs, make sure this line has an edge weight" % (i + 1, line))
    finally:
        # Close only files opened here; stdout belongs to the caller.
        if output_file is sys.stdout:
            output_file.flush()
        else:
            output_file.close()
def run_command(command):
    """
    Run a command line through the shell and return what it wrote to stdout.

    Hand-rolled because Python 2.6 lacks the subprocess convenience
    functions that were added in Python 2.7.

    Arguments:
        command: the command line, passed to the shell as a single string.

    Returns:
        The captured standard output of the command.

    Raises:
        Exception: if the command exits with a non-zero status.
    """
    process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    # communicate() drains both pipes while waiting for the child to exit.
    # Calling wait() first would deadlock as soon as the child filled one
    # of the pipe buffers.
    stdout, stderr = process.communicate()
    if process.returncode != 0:
        raise Exception("Problem running command: " + command)
    return stdout
class Tempfile(object):
    """
    Context manager that yields a named temporary file and removes it on exit.

    The file is created with delete=False so that other processes can
    write to it by name; on leaving the ``with`` block the handle is
    closed and the file is unlinked.
    """

    def __enter__(self):
        self.file = tempfile.NamedTemporaryFile(delete=False)
        return self.file

    def __exit__(self, exc_type, exc_value, exc_traceback):
        # Close the handle first: it would otherwise leak, and some
        # platforms refuse to remove a file that is still open.
        try:
            self.file.close()
        finally:
            try:
                os.remove(self.file.name)
            except OSError:
                # Best effort: the file may already have been removed.
                pass
if __name__ == "__main__":
    # Command-line entry point: parse the options, then convert the
    # edgelist file named by the first positional argument.
    parser = optparse.OptionParser(usage="Usage: %prog input_filename <options>")
    parser.add_option('-d',
                      help="specifies that edges are directed.",
                      dest="directed",
                      default=False,
                      action="store_true")
    parser.add_option('-w',
                      help="specifies that edges are weighted.",
                      dest="weighted",
                      default=False,
                      action="store_true")
    parser.add_option('-o',
                      "--out_filename",
                      help="Filename for output, which is in pajek format. Default [stdout]",
                      dest="out_filename",
                      type="string",
                      default="")
    parser.add_option('-b',
                      "--buffer_size",
                      help="Size of buffer for sort command to use (in megabytes) Default [%default]",
                      dest="buffer_size",
                      type="int",
                      default=500)
    (opts, args) = parser.parse_args()
    # Take the file name from the parsed positional arguments rather than
    # sys.argv[1], so options may appear before it on the command line.
    if not args:
        parser.error("missing input_filename")
    input_filename = args[0]
    edgelist_to_pajek(input_filename,
                      output_filename=opts.out_filename,
                      directed=opts.directed,
                      weighted=opts.weighted,
                      buffer_size=opts.buffer_size)
Yes, only 3 GB is available to a 32-bit Windows application.
I'll remind you about the script if I don't find any alternative in a week.
Cheers,
Alex.
Thanks for the great program! I could finally do the neighbour detection for all Swedish Twitter accounts :) (see the result at http://twittercensus.se/graph2014 )
Hi
I am a newbie in Pajek. I am importing data from Gephi into Pajek via txt2pajek, but Pajek gives the error "Error: this seems unix file or corrupted etc."
Can anyone help me with this issue?
Thanks in advance!
Is there any code available to create a temporal network using Pajek? I want to convert a network into a temporal network by using dates as time stamps. Basically, I am asking whether there is any code that can turn a CSV file into a Pajek file that includes time stamps.
@semenoffalex
That network shouldn't be a problem on your machine (although perhaps with the 32-bit operating system, Windows can only access 3GB, not four---I'm not sure about this).
So it should indeed be possible to write a relatively simple native python script that converts your edgelist file into the pajek format. I don't have time at the moment, but if you remind me later this week, I'll write it up for you.
Conrad