Compare sorting algorithms' performance

Measure a relative performance of sorting algorithms implementations.

Plot execution time vs. input sequence length dependencies for various implementation of sorting algorithm and different input sequence types ([example figures]).

Consider three type of input sequences:

ones: sequence of all 1's. Example: {1, 1, 1, 1, 1}
range: ascending sequence, i.e. already sorted. Example: {1, 2, 3, 10, 15}
shuffledrange: sequence with elements randomly distributed. Example: {5, 3, 9, 6, 8}

Consider at least two different sorting function (different algorithms or/and different implementation of the same algorithm). For example, consider Bubble Sort, Insertion sort, Quicksort or/and implementations of Quicksort with different pivot selection mechanisms. Where possible, use existing implementations.

Preliminary subtask:

General steps:

Define sorting routines to be considered.
Define appropriate sequence generators and write timings.
Plot timings.
What conclusions about relative performance of the sorting routines could be made based on the plots?

Python

Interpreter: Python 2.5

Examples of sorting routines

def builtinsort(x):
    x.sort()

def partition(seq, pivot):
   low, middle, up = [], [], []
   for x in seq:
       if x < pivot:
           low.append(x)
       elif x == pivot:
           middle.append(x)
       else:
           up.append(x)
   return low, middle, up
import random
def qsortranpart(seq):
   size = len(seq)
   if size < 2: return seq
   low, middle, up = partition(seq, seq[random.randrange(size)])
   return qsortranpart(low) + middle + qsortranpart(up)

Sequence generators

def ones(n):
    return [1]*n

def reversedrange(n):
    x = range(n)
    x.reverse()
    return x

def shuffledrange(n):
    x = range(n)
    random.shuffle(x)
    return x

Write timings

def write_timings(npoints=10, maxN=10**4, sort_functions=(builtinsort,insertion_sort, qsort), sequence_creators = (ones, range, shuffledrange)):    
   Ns = range(2, maxN, maxN//npoints)
   for sort in sort_functions:
       for make_seq in sequence_creators:
           Ts = map(lambda n: usec(sort, (make_seq(n),)), Ns)
           writedat('%s-%s-%d-%d.xy' % (sort.__name__,  make_seq.__name__, len(Ns), max(Ns)), Ns, Ts)

Where writedat() is defined in the Write float arrays to a text file, usec() - Query Performance, insertion_sort() - Insertion sort, qsort - Quicksort subtasks, correspondingly.

Plot timings

import operator
import numpy, pylab
def plotdd(dictplotdict):
   """See ``plot_timings()`` below."""
   symbols = ('o', '^', 'v', '<', '>', 's', '+', 'x', 'D', 'd',
              '1', '2', '3', '4', 'h', 'H', 'p', '|', '_')
   colors = map(None, 'bgrcmyk') # split string on distinct characters
   for npoints, plotdict in dictplotdict.iteritems():
       for ttle, lst in plotdict.iteritems():            
           pylab.hold(False)                                
           for i, (label, polynom, x, y) in enumerate(sorted(lst,key=operator.itemgetter(0))):
               pylab.plot(x, y, colors[i % len(colors)] + symbols[i % len(symbols)], label='%s %s' % (polynom, label))
               pylab.hold(True)
               y = numpy.polyval(polynom, x)
               pylab.plot(x, y, colors[i % len(colors)], label= '_nolegend_')                
           pylab.legend(loc='upper left')
           pylab.xlabel(polynom.variable)
           pylab.ylabel('log2( time in microseconds )')                
           pylab.title(ttle, verticalalignment='bottom')
           figname = '_%(npoints)03d%(ttle)s' % vars()
           pylab.savefig(figname+'.png')
           pylab.savefig(figname+'.pdf')
           print figname

See Plot x, y arrays and Polynomial Fitting subtasks for a basic usage of pylab.plot() and numpy.polyfit().

import collections, itertools, glob, re
import numpy
def plot_timings():
   makedict = lambda: collections.defaultdict(lambda: collections.defaultdict(list))
   df = makedict()
   ds = makedict()
   # populate plot dictionaries
   for filename in glob.glob('*.xy'):
       m = re.match(r'([^-]+)-([^-]+)-(\d+)-(\d+)\.xy', filename)
       print filename
       assert m, filename
       funcname, seqname, npoints, maxN = m.groups()
       npoints, maxN = int(npoints), int(maxN)        
       a = numpy.fromiter(itertools.imap(float, open(filename).read().split()), dtype='f')
       Ns = a[::2]  # sequences lengths
       Ts = a[1::2] # corresponding times 
       assert len(Ns) == len(Ts) == npoints
       assert max(Ns) <= maxN
       #
       logsafe = numpy.logical_and(Ns>0, Ts>0)
       Ts = numpy.log2(Ts[logsafe])
       Ns = numpy.log2(Ns[logsafe])
       coeffs = numpy.polyfit(Ns, Ts, deg=1)
       poly = numpy.poly1d(coeffs, variable='log2(N)')
       #
       df[npoints][funcname].append((seqname, poly, Ns, Ts))
       ds[npoints][seqname].append((funcname, poly, Ns, Ts))
   # actual plotting
   plotdd(df)
   plotdd(ds) # see ``plotdd()`` above

Figures: log2( time in microseconds ) vs. log2( sequence length )

sort_functions = [
    builtinsort,         # see implementation above
    insertion_sort,      # see Insertion sort
    insertion_sort_lowb, # insertion_sort, where sequential search is replaced
                         #     by lower_bound() function
    qsort,               # see Quicksort
    qsortranlc,          # qsort with randomly choosen pivot
                         #     and the filtering via list comprehension
    qsortranpart,        # qsortranlc with filtering via partition function
    qsortranpartis,      # qsortranpart, where for a small input sequence lengths
    ]                    #     insertion_sort is called
if __name__=="__main__":
   import sys
   sys.setrecursionlimit(10000)
   write_timings(npoints=100, maxN=1024, # 1 <= N <= 2**10 an input sequence length
                 sort_functions=sort_functions,
                 sequence_creators = (ones, range, shuffledrange))
   plot_timings()

Executing above script we get belowed figures.

ones

ones.png (143KiB)

builtinsort     - O(N)
insertion_sort  - O(N)
qsort           - O(N**2)
qsortranpart    - O(N)

range

range.png (145KiB)

builtinsort     - O(N)
insertion_sort  - O(N)
qsort           - O(N**2)
qsortranpart    - O(N*log(N))

shuffled range

shuffledrange.png (152KiB)

builtinsort     - O(N)  
insertion_sort  - O(N**4) ???
qsort           - O(N*log(N))
qsortranpart    - O(N) ???