Skip to content

Instantly share code, notes, and snippets.

@endpnt
Created July 26, 2011 10:38
Show Gist options
  • Select an option

  • Save endpnt/1106469 to your computer and use it in GitHub Desktop.

Select an option

Save endpnt/1106469 to your computer and use it in GitHub Desktop.
Create large social graphs (.gml) from raw email data with igraph.
#!/usr/bin/env python
from sys import argv, exit
from os import listdir, path
from email.Parser import Parser
from email.utils import parsedate, parseaddr
import cPickle as pickle
import sys
import itertools
import igraph
import re
# Case-insensitive pattern used to pull e-mail addresses out of header values.
# The pieces below concatenate to a single pattern string.
re_email = re.compile(
    # local part: one atom ...
    r'[a-z0-9!#$%&*+/=?^_`{|}~-]+'
    # ... optionally followed by further dot-separated atoms
    r'(?:\.[a-z0-9!#$%&*+/=?^_`{|}~-]+)*'
    r'@'
    # one or more domain labels, each terminated by a dot
    r'(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+'
    # top-level domain: any two letters or one of a fixed list of names
    r'(?:[A-Z]{2}|com|org|net|edu|gov|mil|biz|info|mobi|name|aero|asia|jobs|museum)\b',
    re.I,
)
def _load_mails(maildir):
    """Parse the headers of every mail file in *maildir* into the module tables.

    Only headers are parsed (``headersonly=True``).  For each mail the
    following module-level containers are updated in place:

    * ``ids``      -- the Message-ID value
    * ``refs``     -- the In-Reply-To value and every entry of References
    * ``dates``    -- the Date header as parsed by ``parsedate``
    * ``contacts`` -- every address matched by ``re_email`` in To, From and CC
    * ``idmap``    -- Message-ID -> tuple of that mail's references
    * ``mailmap``  -- one ``(sender, recipient)`` pair per To/CC address

    Header names are looked up through ``Message.get``, which ignores case,
    so ``Cc:`` / ``Message-Id:`` spellings are recognised as well.

    :param maildir: directory containing one mail per file; a trailing
        path separator is optional.
    """
    parser = Parser()
    for mailname in listdir(maildir):
        mailpath = path.join(maildir, mailname)
        # Sub-directories (or other non-files) cannot be parsed as mails.
        if not path.isfile(mailpath):
            continue
        with open(mailpath) as mailfile:
            mail = parser.parse(mailfile, headersonly=True)

        msg_refs = set()

        msg_id = mail.get('Message-ID')
        if msg_id is not None:
            ids.add(msg_id)

        reply_to = mail.get('In-Reply-To')
        if reply_to is not None:
            refs.add(reply_to)
            msg_refs.add(reply_to)

        # split() without arguments copes with folded headers (newline + tab
        # between entries) and never yields empty strings.
        for ref in mail.get('References', '').split():
            refs.add(ref)
            msg_refs.add(ref)

        date_header = mail.get('Date')
        if date_header is not None:
            parsed_date = parsedate(date_header)
            # parsedate() returns None for dates it cannot parse.
            if parsed_date is not None:
                dates.add(parsed_date)

        recipients = set()
        for header in ('To', 'CC'):
            recipients.update(re_email.findall(mail.get(header, '')))
        contacts.update(recipients)

        senders = re_email.findall(mail.get('From', ''))
        contacts.update(senders)
        # When From holds several addresses the last match is the sender.
        sender = senders[-1] if senders else None

        if msg_id and msg_refs:
            idmap[msg_id] = tuple(msg_refs)
        if sender and recipients:
            mailmap.extend((sender, rcpt) for rcpt in recipients)
# At least one mail directory must be given on the command line; every
# argument after the script name is treated as a directory to load.
if len(argv) < 2:
    print("Usage: %s <maildir> [<maildir> ...]" % argv[0])
    exit(1)
# Module-level tables filled in by _load_mails() or restored from 'mailcache'.
ids = set()       # Message-ID header values
refs = set()      # values seen in In-Reply-To / References headers
dates = set()     # parsed Date headers
contacts = set()  # e-mail addresses found in To / From / CC
idmap = {}        # Message-ID -> tuple of the references of that mail
mailmap = []      # (from address, to address) pairs

if path.exists('mailcache'):
    print('Loading mail cache...')
    # NOTE(security): unpickling runs arbitrary code from the file; only use
    # a 'mailcache' that this script wrote itself.
    # Pickle files are opened in binary mode; a cache written in text mode
    # on a platform that translates newlines should be deleted and rebuilt.
    with open('mailcache', 'rb') as cache_file:
        ids, refs, dates, contacts, idmap, mailmap = pickle.load(cache_file)
else:
    print('Loading mails...')
    for maildir in argv[1:]:
        _load_mails(maildir)
    print('Saving cache.')
    with open('mailcache', 'wb') as cache_file:
        pickle.dump((ids, refs, dates, contacts, idmap, mailmap), cache_file)

# Summary of what was loaded (idmap is reported under two labels).
print("mails: %s " % len(ids))
print("with ref: %s " % len(idmap))
print("contacts: %s " % len(contacts))
print("id map: %s " % len(idmap))
print("mail map: %s " % len(mailmap))
# Build the contact graph (or restore it from 'graphcache') and print a summary.
if path.exists('graphcache'):
    print('Loading graph cache...')
    # NOTE(security): unpickling runs arbitrary code from the file; only use
    # a 'graphcache' that this script wrote itself.  Caches written before
    # the 'label' spelling fix still carry the old 'lable' attribute.
    with open('graphcache', 'rb') as cache_file:
        g = pickle.load(cache_file)
else:
    print('Loading graph...')
    contacts = sorted(contacts)
    # create dictionary with ids for every contact email
    cl = dict((v, i) for i, v in enumerate(contacts))
    # create numeric edges from the mail "from > to" mapping and count how
    # often each one occurs in a single pass (the weight of the edge)
    edge_weights = {}
    for f, t in mailmap:
        e = (cl[f], cl[t])
        edge_weights[e] = edge_weights.get(e, 0) + 1
    # split edge and weight; sorting keeps the edge order deterministic and
    # also works when there are no edges at all
    edges = sorted(edge_weights)
    weight = [edge_weights[e] for e in edges]
    # load numeric edges into graph; n is given explicitly so that every
    # contact gets a vertex and the label list lines up with the vertex ids
    # NOTE(review): no directed= argument is passed, so igraph's default
    # applies and the sender > recipient direction is not recorded -- confirm
    # this is intended.
    g = igraph.Graph(n=len(contacts), edges=edges)
    g.es['weight'] = weight
    g.vs['label'] = contacts
    print('Saving graph cache.')
    with open('graphcache', 'wb') as cache_file:
        pickle.dump(g, cache_file)
print(g.summary())
# Drop the more isolated contacts, then export what is left.
# The degree is stored as a vertex attribute before any vertex is removed,
# so the stored values describe the unfiltered graph.
g.vs['degree'] = g.degree()
# Threshold is hard-coded; other values to try: 2, 5, 20, 100
low_degree = g.vs.select(_degree_lt=5)
g.delete_vertices(low_degree)
print(g.summary())
g.write_gml('1.gml')
# The script ends here; nothing below this call is executed.
exit(0)
# TODO: layout / plotting stage.  Not reached at the moment because the
# script calls exit(0) right after writing the GML file.
if path.exists('layoutcache'):
    print('Loading layout cache...')
    # NOTE(security): unpickling runs arbitrary code from the file; only use
    # a 'layoutcache' that this script wrote itself.
    with open('layoutcache', 'rb') as cache_file:
        layout = pickle.load(cache_file)
else:
    print('Loading layout...')
    layout = g.layout("kk")
    print('Saving layout cache.')
    with open('layoutcache', 'wb') as cache_file:
        pickle.dump(layout, cache_file)
#igraph.plot(g, '1.pdf',layout = layout)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment