Created
July 26, 2011 10:38
-
-
Save endpnt/1106469 to your computer and use it in GitHub Desktop.
create large social graphs / .gml from raw email data with igraph
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python | |
| from sys import argv, exit | |
| from os import listdir, path | |
| from email.Parser import Parser | |
| from email.utils import parsedate, parseaddr | |
| import cPickle as pickle | |
| import sys | |
| import itertools | |
| import igraph | |
| import re | |
| re_email = re.compile(r'[a-z0-9!#$%&*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&*+/=?^_`{|}~-]+)*@(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+(?:[A-Z]{2}|com|org|net|edu|gov|mil|biz|info|mobi|name|aero|asia|jobs|museum)\b', re.I) | |
def _load_mails(maildir):
    """Parse the headers of every mail file in *maildir*.

    Nothing is returned; the results are accumulated in the module-level
    containers:

    * ``ids``      -- every Message-ID seen
    * ``refs``     -- every message id found in In-Reply-To / References
    * ``dates``    -- parsed Date headers (time tuples from ``parsedate``)
    * ``contacts`` -- every e-mail address found in To / From / CC
    * ``idmap``    -- Message-ID -> tuple of the ids that mail refers to
    * ``mailmap``  -- one ``(sender, recipient)`` pair per To/CC address

    Only the header section of each file is parsed.  Entries of *maildir*
    that are not regular files (e.g. sub-directories) are skipped.
    """
    parser = Parser()
    for mailname in listdir(maildir):
        # path.join works whether or not *maildir* ends with a separator.
        mailpath = path.join(maildir, mailname)
        if not path.isfile(mailpath):
            continue
        with open(mailpath) as handle:
            mail = parser.parse(handle, headersonly=True)

        # Header lookup via mail[...] is case-insensitive and yields None for
        # a missing header, so "Cc:" / "Message-Id:" spellings are found too
        # (a membership test against mail.keys() is case-sensitive).
        message_id = mail['Message-ID']
        if message_id is not None:
            ids.add(message_id)

        mail_refs = set()
        in_reply_to = mail['In-Reply-To']
        if in_reply_to is not None:
            mail_refs.add(in_reply_to)
        references = mail['References']
        if references is not None:
            # split() handles runs of blanks and the tabs/newlines of folded
            # headers, and never yields empty strings.
            mail_refs.update(references.split())
        refs.update(mail_refs)

        date_header = mail['Date']
        if date_header is not None:
            parsed_date = parsedate(date_header)
            # parsedate() returns None for a date it cannot parse.
            if parsed_date is not None:
                dates.add(parsed_date)

        recipients = set()
        for header_name in ('To', 'CC'):
            header_value = mail[header_name]
            if header_value is not None:
                recipients.update(re_email.findall(header_value))
        contacts.update(recipients)

        sender = None
        from_header = mail['From']
        if from_header is not None:
            senders = re_email.findall(from_header)
            contacts.update(senders)
            if senders:
                # With several addresses in From, the last one wins.
                sender = senders[-1]

        if message_id and mail_refs:
            idmap[message_id] = tuple(mail_refs)
        if sender and recipients:
            mailmap.extend((sender, recipient) for recipient in recipients)
# ---------------------------------------------------------------------------
# Script body: build a weighted contact graph from the mail directories given
# on the command line and write it to '1.gml'.
#
# Intermediate results are pickled to 'mailcache' and 'graphcache' in the
# current directory and reused when present; delete those files to force a
# rebuild.
# NOTE: unpickling can execute arbitrary code -- only reuse cache files that
# this script wrote itself.
# ---------------------------------------------------------------------------
if len(argv) < 2:
    print("Usage: %s MAILDIR [MAILDIR ...]" % argv[0])
    exit(1)

# Accumulators filled in place by _load_mails().
ids = set()
refs = set()
dates = set()
contacts = set()
idmap = {}
mailmap = []

if path.exists('mailcache'):
    print('Loading mail cache...')
    with open('mailcache') as cache_file:
        ids, refs, dates, contacts, idmap, mailmap = pickle.load(cache_file)
else:
    print('Loading mails...')
    for maildir in argv[1:]:
        _load_mails(maildir)
    print('Saving cache.')
    with open('mailcache', 'w') as cache_file:
        pickle.dump((ids, refs, dates, contacts, idmap, mailmap), cache_file)

print("mails: %s " % len(ids))
print("with ref: %s " % len(idmap))
print("contacts: %s " % len(contacts))
print("id map: %s " % len(idmap))
print("mail map: %s " % len(mailmap))

if path.exists('graphcache'):
    print('Loading graph cache...')
    with open('graphcache') as cache_file:
        g = pickle.load(cache_file)
else:
    print('Loading graph...')
    contacts = sorted(contacts)
    # Map every contact address to its numeric vertex id.
    cl = dict((address, index) for index, address in enumerate(contacts))
    # Count the mails per numeric "from > to" pair in a single pass; the count
    # becomes the edge weight.
    edge_weights = {}
    for sender, recipient in mailmap:
        edge = (cl[sender], cl[recipient])
        edge_weights[edge] = edge_weights.get(edge, 0) + 1
    # Split edge and weight (also works when there are no edges at all).
    edges = sorted(edge_weights)
    weight = [edge_weights[edge] for edge in edges]
    # Create one vertex per contact explicitly so the per-vertex attribute
    # list below always has the right length, even when the last contacts
    # take part in no edge.
    g = igraph.Graph(n=len(contacts), edges=edges)
    g.es['weight'] = weight
    g.vs['label'] = contacts
    print('Saving graph cache.')
    with open('graphcache', 'w') as cache_file:
        pickle.dump(g, cache_file)

print(g.summary())

# Filter out the more isolated nodes.
min_degree = 5  # try 2, 5, 20, 100
g.vs['degree'] = g.degree()
g.delete_vertices(g.vs.select(_degree_lt=min_degree))
print(g.summary())
g.write_gml('1.gml')
exit(0)

# TODO: everything below is unreachable until the exit(0) above is removed.
if path.exists('layoutcache'):
    print('Loading layout cache...')
    with open('layoutcache') as cache_file:
        layout = pickle.load(cache_file)
else:
    print('Loading layout...')
    layout = g.layout("kk")
    print('Saving layout cache.')
    with open('layoutcache', 'w') as cache_file:
        pickle.dump(layout, cache_file)
#igraph.plot(g, '1.pdf',layout = layout)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment