Skip to content

Instantly share code, notes, and snippets.

@peterbe
Created December 23, 2017 01:32
Show Gist options
  • Select an option

  • Save peterbe/67b9e40af60a1d5bcb1cfb4b2937b088 to your computer and use it in GitHub Desktop.

Select an option

Save peterbe/67b9e40af60a1d5bcb1cfb4b2937b088 to your computer and use it in GitHub Desktop.
# This requires at least Python 3.6
import time
import statistics
import random
def f1(seq): # Raymond Hettinger
hash_ = {}
[hash_.__setitem__(x, 1) for x in seq]
return hash_.keys()
def f2(seq): # *********
# order preserving
checked = []
for e in seq:
if e not in checked:
checked.append(e)
return checked
def f3(seq):
# Not order preserving
keys = {}
for e in seq:
keys[e] = 1
return keys.keys()
def f4(seq): # ********** order preserving
noDupes = []
[noDupes.append(i) for i in seq if not noDupes.count(i)]
return noDupes
def f5(seq, idfun=None): # Alex Martelli ******* order preserving
if idfun is None:
def idfun(x): return x
seen = {}
result = []
for item in seq:
marker = idfun(item)
# in old Python versions:
# if seen.has_key(marker)
# but in new ones:
if marker in seen:
continue
seen[marker] = 1
result.append(item)
return result
def f5b(seq, idfun=None): # Alex Martelli ******* order preserving
if idfun is None:
def idfun(x): return x
seen = {}
result = []
for item in seq:
marker = idfun(item)
# in old Python versions:
# if seen.has_key(marker)
# but in new ones:
if marker not in seen:
seen[marker] = 1
result.append(item)
return result
# def f6(seq):
# # Not order preserving
# return list(Set(seq))
def f7(seq):
# Not order preserving
return list(set(seq))
def f8(seq): # Dave Kirby
# Order preserving
seen = set()
return [x for x in seq if x not in seen and not seen.add(x)]
def f9(seq):
# Not order preserving, even in Py >=3.6
return {}.fromkeys(seq).keys()
def f10(seq, idfun=None): # Andrew Dalke
# Order preserving
return list(_f10(seq, idfun))
def _f10(seq, idfun=None):
seen = set()
if idfun is None:
for x in seq:
if x in seen:
continue
seen.add(x)
yield x
else:
for x in seq:
x = idfun(x)
if x in seen:
continue
seen.add(x)
yield x
def f11(seq): # f10 but simpler
# Order preserving
return list(_f10(seq))
def _f11(seq):
seen = set()
for x in seq:
if x in seen:
continue
seen.add(x)
yield x
def f12(seq):
# Raymond Hettinger
# https://twitter.com/raymondh/status/944125570534621185
return list(dict.fromkeys(seq))
def timing(f, n, a):
t1 = time.clock()
for _ in range(n):
f(a)
t2 = time.clock()
return (t2 - t1) * 1000
def getRandomString(length=10, loweronly=1, numbersonly=0,
lettersonly=0):
""" return a very random string """
_letters = 'abcdefghijklmnopqrstuvwxyz'
if numbersonly:
l = list('0123456789')
elif lettersonly:
l = list(_letters + _letters.upper())
else:
lowercase = _letters+'0123456789'*2
l = list(lowercase + lowercase.upper())
random.shuffle(l)
s = ''.join(l)
if len(s) < length:
s = s + getRandomString(loweronly=1)
s = s[:length]
if loweronly:
return s.lower()
else:
return s
def run():
testdata = {}
for i in range(35):
k = getRandomString(5, lettersonly=1)
v = getRandomString(100)
testdata[k] = v
# print(f1(list('abracadabra')))
# print(f3(list('abracadabra')))
testdata = []
for i in range(20000):
testdata.append(getRandomString(3, loweronly=True))
order_preserving = f2, f4, f5, f5b, f8, f10, f11
order_preserving = f2, f5, f5b, f8, f10, f11, f12
not_order_preserving = f1, f3, f7, f9
testfuncs = order_preserving + not_order_preserving
for i, f in enumerate(order_preserving):
if i:
prev_f = order_preserving[i - 1]
r1 = prev_f(testdata)
r2 = f(testdata)
# print(len(r1), 100 * len(r1) / len(testdata))
assert r1 == r2, '{} != {}'.format(prev_f, f)
for i, f in enumerate(not_order_preserving):
if i:
prev_f = not_order_preserving[i - 1]
r1 = prev_f(testdata)
r2 = f(testdata)
# print(len(r1))
assert set(r1) == set(r2), '{} != {}'.format(prev_f, f)
times = {}
testfuncs = list(testfuncs)
for _ in range(10):
random.shuffle(testfuncs)
for f in testfuncs:
if f not in times:
times[f] = []
times[f].append(timing(f, 100, testdata))
results = []
for f, mss in times.items():
results.append((
f,
f in order_preserving,
statistics.mean(mss),
statistics.median(mss),
))
print(
'FUNCTION'.ljust(15),
'ORDER PRESERVING'.ljust(20),
'MEAN'.ljust(10),
'MEDIAN',
)
results.sort(key=lambda x: (not x[1], x[2]))
# results.sort(key=lambda x: x[0].__name__)
for f, is_order_preserving, mean, median in results:
print(
f.__name__.ljust(15),
('yes' if is_order_preserving else 'no').ljust(20),
'{:.1f}'.format(mean).ljust(10),
'{:.1f}'.format(median),
)
if __name__ == '__main__':
run()
@apieum
Copy link

apieum commented Aug 11, 2025

Some links still reference this and python has changed a lot since then, so I've tested with recent implementations of python if it's still accurate and propose you a new version:

# This requires at least Python 3.6

import time
import statistics
import random
import string
try:
    from tqdm import tqdm
except ImportError:
    print("tqdm not found. Please install it with 'pip install tqdm'")
    # Create a dummy tqdm class if it's not installed
    def tqdm(iterable, *args, **kwargs):
        return iterable


def f1(seq):  # Raymond Hettinger
    hash_ = {}
    [hash_.__setitem__(x, 1) for x in seq]
    return hash_.keys()


def f2(seq):   # ********* slow
    # order preserving
    checked = []
    for e in seq:
        if e not in checked:
            checked.append(e)
    return checked


def f3(seq):
    # Not order preserving
    keys = {}
    for e in seq:
        keys[e] = 1
    return keys.keys()


def f4(seq):  # ********** order preserving
    noDupes = []
    [noDupes.append(i) for i in seq if not noDupes.count(i)]
    return noDupes


def f5(seq, idfun=None):  # Alex Martelli ******* order preserving
    if idfun is None:
        def idfun(x): return x
    seen = {}
    result = []
    for item in seq:
        marker = idfun(item)
        # in old Python versions:
        # if seen.has_key(marker)
        # but in new ones:
        if marker in seen:
            continue
        seen[marker] = 1
        result.append(item)
    return result


def f5b(seq, idfun=None):  # Alex Martelli ******* order preserving
    if idfun is None:
        def idfun(x): return x
    seen = {}
    result = []
    for item in seq:
        marker = idfun(item)
        # in old Python versions:
        # if seen.has_key(marker)
        # but in new ones:
        if marker not in seen:
            seen[marker] = 1
            result.append(item)

    return result


# def f6(seq):
#     # Not order preserving
#     return list(Set(seq))


def f7(seq):
    # Not order preserving
    return list(set(seq))


def f8(seq):  # Dave Kirby
    # Order preserving
    seen = set()
    return [x for x in seq if x not in seen and not seen.add(x)]


def f9(seq):
    # Not order preserving, even in Py >=3.6
    return {}.fromkeys(seq).keys()


def f10(seq, idfun=None):  # Andrew Dalke
    # Order preserving
    return list(_f10(seq, idfun))


def _f10(seq, idfun=None):
    seen = set()
    if idfun is None:
        for x in seq:
            if x in seen:
                continue
            seen.add(x)
            yield x
    else:
        for x in seq:
            x = idfun(x)
            if x in seen:
                continue
            seen.add(x)
            yield x


def f11(seq):  # f10 but simpler
    # Order preserving
    return list(_f10(seq))


def _f11(seq):
    seen = set()
    for x in seq:
        if x in seen:
            continue
        seen.add(x)
        yield x


def f12(seq):
    # Raymond Hettinger
    # https://twitter.com/raymondh/status/944125570534621185
    return list(dict.fromkeys(seq))

def f13(seq):
    seen = set()
    seen_add = seen.add
    return [x for x in seq if not (x in seen or seen_add(x))]

def timing(f, n, a):
    t1 = time.perf_counter()
    for _ in range(n):
        f(a)
    t2 = time.perf_counter()
    return (t2 - t1) * 1000


def getRandomString(length=10, loweronly=1, numbersonly=0,
                    lettersonly=0):
    """ return a very random string """
    if numbersonly:
        population = string.digits
    elif lettersonly:
        population = string.ascii_letters
    else:
        population = string.ascii_lowercase + string.digits
        if not loweronly:
            population += string.ascii_uppercase

    return ''.join(random.choices(population, k=length))


def run():
    testdata = {}

    for i in range(35):
        k = getRandomString(5, lettersonly=1)
        v = getRandomString(100)
        testdata[k] = v

    # print(f1(list('abracadabra')))
    # print(f3(list('abracadabra')))
    testdata = []

    print("Generating test data...")
    for i in tqdm(range(20000), desc="Generating test data"):
        testdata.append(getRandomString(3, loweronly=True))

    order_preserving = f5, f5b, f8, f10, f11, f12, f13

    not_order_preserving = f1, f3, f7, f9
    testfuncs = order_preserving + not_order_preserving

    print("\nVerifying function correctness...")
    for i, f in enumerate(tqdm(order_preserving, desc="Verifying order-preserving")):
        if i:
            prev_f = order_preserving[i - 1]
            r1 = prev_f(testdata)
            r2 = f(testdata)
            # print(len(r1), 100 * len(r1) / len(testdata))
            assert r1 == r2, '{} != {}'.format(prev_f, f)

    for i, f in enumerate(tqdm(not_order_preserving, desc="Verifying non-order-preserving")):
        if i:
            prev_f = not_order_preserving[i - 1]
            r1 = prev_f(testdata)
            r2 = f(testdata)
            # print(len(r1))
            assert set(r1) == set(r2), '{} != {}'.format(prev_f, f)

    times = {}

    testfuncs = list(testfuncs)
    print("\nRunning performance benchmarks...")
    for i in tqdm(range(10), desc="Benchmark Rounds"):
        random.shuffle(testfuncs)
        # Create a nested progress bar for the functions in this round
        inner_loop = tqdm(testfuncs, desc=f"  Round {i+1:2d}/10", leave=False)
        for f in inner_loop:
            # Update the postfix to show the current function
            inner_loop.set_postfix_str(f"func={f.__name__}")
            if f not in times:
                times[f] = []
            times[f].append(timing(f, 100, testdata))

    results = []
    for f, mss in times.items():
        results.append((
            f,
            f in order_preserving,
            statistics.mean(mss),
            statistics.median(mss),
        ))

    print("\n\n" + "="*60)
    print("RESULTS".center(60))
    print("="*60)
    print(
        'FUNCTION'.ljust(15),
        'ORDER PRESERVING'.ljust(20),
        'MEAN (ms)'.ljust(15),
        'MEDIAN (ms)',
    )
    results.sort(key=lambda x: (not x[1], x[2]))
    # results.sort(key=lambda x: x[0].__name__)
    for f, is_order_preserving, mean, median in results:
        print(
            f.__name__.ljust(15),
            ('yes' if is_order_preserving else 'no').ljust(20),
            '{:.1f}'.format(mean).ljust(15),
            '{:.1f}'.format(median),
        )


if __name__ == '__main__':
    run()

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment