Compare sorting algorithms' performance
You are encouraged to solve this task according to the task description, using any language you may know.
Measure the relative performance of sorting algorithm implementations.
Plot execution time vs. input sequence length dependencies for various implementations of sorting algorithms and different input sequence types ([example figures]).
Consider three types of input sequences:
- ones: sequence of all 1's. Example: {1, 1, 1, 1, 1}
- range: ascending sequence, i.e. already sorted. Example: {1, 2, 3, 10, 15}
- shuffledrange: sequence with elements randomly distributed. Example: {5, 3, 9, 6, 8}
Consider at least two different sorting functions (different algorithms and/or different implementations of the same algorithm). For example, consider Bubble Sort, Insertion sort, Quicksort and/or implementations of Quicksort with different pivot selection mechanisms. Where possible, use existing implementations.
Preliminary subtask:
- Bubble Sort, Insertion sort, Quicksort, Radix sort, Shell sort
- Query Performance
- Write float arrays to a text file
- Plot x, y arrays
- Polynomial Fitting
General steps:
- Define sorting routines to be considered.
- Define appropriate sequence generators and write timings.
- Plot timings.
- What conclusions about relative performance of the sorting routines could be made based on the plots?
Python
Interpreter: Python 2.5
Examples of sorting routines
def builtinsort(x):
    """Sort the sequence ``x`` in place using its own ``sort()`` method.

    Nothing is returned; the caller's object is modified directly.
    """
    x.sort()
import random


def partition(seq, pivot):
    """Split ``seq`` into three lists relative to ``pivot``.

    Returns a tuple ``(low, middle, up)`` holding, in their original
    relative order, the elements less than, equal to, and greater than
    ``pivot`` respectively.
    """
    low, middle, up = [], [], []
    for x in seq:
        if x < pivot:
            low.append(x)
        elif x == pivot:
            middle.append(x)
        else:
            up.append(x)
    return low, middle, up


def qsortranpart(seq):
    """Quicksort with a randomly chosen pivot and three-way partitioning.

    Returns the elements of ``seq`` in ascending order.  A sequence with
    fewer than two elements is returned as-is (the same object, not a
    copy); otherwise a new list is built and ``seq`` is left unmodified.
    """
    size = len(seq)
    if size < 2:
        return seq
    # Elements equal to the pivot go to ``middle`` and are never recursed
    # on, so every recursive call works on a strictly smaller input.
    low, middle, up = partition(seq, seq[random.randrange(size)])
    return qsortranpart(low) + middle + qsortranpart(up)
Sequence generators
def ones(n):
    """Return a list consisting of ``n`` copies of the integer 1."""
    return n * [1]
def reversedrange(n):
    """Return the list ``[n-1, n-2, ..., 1, 0]`` (empty when ``n <= 0``).

    The descending list is built directly instead of calling
    ``range(n).reverse()``, which only works where ``range`` returns a
    list.
    """
    return list(range(n - 1, -1, -1))
def shuffledrange(n):
    """Return a list holding the integers ``0 .. n-1`` in random order.

    The range is materialised as a list first because ``random.shuffle``
    needs a mutable sequence to shuffle in place.
    """
    x = list(range(n))
    random.shuffle(x)
    return x
Write timings
def write_timings(npoints=10, maxN=10**4,
                  sort_functions=(builtinsort, insertion_sort, qsort),
                  sequence_creators=(ones, range, shuffledrange)):
    """Time every sort function on every kind of input and save the results.

    For each (sort function, sequence creator) pair, one ``.xy`` file is
    written through ``writedat()``.  The file is named
    ``<sort name>-<creator name>-<number of sizes>-<largest size>.xy``.

    npoints           -- approximate number of input sizes to sample
                         (expected to be a positive integer)
    maxN              -- exclusive upper bound on the input size
    sort_functions    -- callables that each take one sequence argument
    sequence_creators -- callables mapping a size ``n`` to an input sequence

    Raises ValueError when no input size falls in ``[2, maxN)``.

    NOTE(review): ``usec()`` and ``writedat()`` are defined outside this
    block; ``usec(f, args)`` is assumed to return the running time of
    ``f(*args)`` -- confirm against their definitions.
    """
    # maxN // npoints is 0 when maxN < npoints, which range() rejects;
    # fall back to sampling every size in that case.
    step = max(1, maxN // npoints)
    Ns = list(range(2, maxN, step))
    if not Ns:
        raise ValueError('no input sizes to measure: maxN=%r is too small'
                         % (maxN,))
    for sort in sort_functions:
        for make_seq in sequence_creators:
            # list(...) gives every sort a real, mutable list even when the
            # creator (e.g. ``range``) returns some other sequence type.
            # The input is built before usec() is called, so its creation
            # is not part of the measurement.
            Ts = [usec(sort, (list(make_seq(n)),)) for n in Ns]
            writedat('%s-%s-%d-%d.xy' % (sort.__name__, make_seq.__name__,
                                         len(Ns), max(Ns)), Ns, Ts)
Here writedat() is defined in the Write float arrays to a text file subtask, usec() in the Query Performance subtask, insertion_sort() in the Insertion sort subtask, and qsort() in the Quicksort subtask.
Plot timings
Library: matplotlib (imported below as pylab). You may see a list of other libraries used on Rosetta Code at Category:Solutions by Library.
Library: NumPy. You may see a list of other libraries used on Rosetta Code at Category:Solutions by Library.
import operator
import numpy, pylab
def plotdd(dictplotdict):
    """See ``plot_timings()`` below.

    ``dictplotdict`` maps ``npoints`` to a dictionary that maps a figure
    title to a list of ``(label, polynom, x, y)`` tuples.  One figure is
    produced per title: each tuple is drawn as data markers plus the line
    given by evaluating ``polynom`` at ``x``.  Every figure is saved as
    ``_<npoints, zero-padded to 3 digits><title>.png`` and ``.pdf`` in the
    current directory, and the file name stem is printed.
    """
    symbols = ('o', '^', 'v', '<', '>', 's', '+', 'x', 'D', 'd',
               '1', '2', '3', '4', 'h', 'H', 'p', '|', '_')
    colors = list('bgrcmyk')  # split string on distinct characters
    for npoints, plotdict in dictplotdict.items():
        for ttle, lst in plotdict.items():
            # Start each figure from a clean slate.  This replaces the
            # hold(False)/hold(True) toggling, which newer matplotlib
            # releases no longer provide.
            pylab.clf()
            # Sort by label so colours/markers are assigned reproducibly.
            for i, (label, polynom, x, y) in enumerate(
                    sorted(lst, key=operator.itemgetter(0))):
                color = colors[i % len(colors)]
                # measured points
                pylab.plot(x, y, color + symbols[i % len(symbols)],
                           label='%s %s' % (polynom, label))
                # fitted line, kept out of the legend
                pylab.plot(x, numpy.polyval(polynom, x), color,
                           label='_nolegend_')
            pylab.legend(loc='upper left')
            # The axis name is taken from the last polynomial plotted.
            pylab.xlabel(polynom.variable)
            pylab.ylabel('log2( time in microseconds )')
            pylab.title(ttle, verticalalignment='bottom')
            figname = '_%(npoints)03d%(ttle)s' % {'npoints': npoints,
                                                  'ttle': ttle}
            pylab.savefig(figname + '.png')
            pylab.savefig(figname + '.pdf')
            print(figname)
See Plot x, y arrays and Polynomial Fitting subtasks for a basic usage of pylab.plot() and numpy.polyfit().
import collections, itertools, glob, re
import numpy
def plot_timings():
    """Read every ``*.xy`` timing file in the current directory and plot it.

    File names must look like ``<funcname>-<seqname>-<npoints>-<maxN>.xy``.
    Each file holds whitespace-separated numbers read as alternating
    sequence length and time.  A straight line is fitted to log2(time)
    against log2(length), and the results are plotted twice through
    ``plotdd()``: once grouped by sorting function and once grouped by
    sequence type.

    Raises AssertionError when a file name does not match the pattern or
    the file contents disagree with the numbers encoded in its name.
    """
    makedict = lambda: collections.defaultdict(
        lambda: collections.defaultdict(list))
    df = makedict()  # npoints -> function name -> list of plot entries
    ds = makedict()  # npoints -> sequence name -> list of plot entries
    # populate plot dictionaries
    for filename in glob.glob('*.xy'):
        m = re.match(r'([^-]+)-([^-]+)-(\d+)-(\d+)\.xy', filename)
        print(filename)
        assert m, filename
        funcname, seqname, npoints, maxN = m.groups()
        npoints, maxN = int(npoints), int(maxN)
        # Close the file explicitly instead of relying on garbage
        # collection (try/finally also works on interpreters without the
        # ``with`` statement).
        datafile = open(filename)
        try:
            tokens = datafile.read().split()
        finally:
            datafile.close()
        a = numpy.fromiter((float(tok) for tok in tokens), dtype='f')
        Ns = a[::2]   # sequence lengths
        Ts = a[1::2]  # corresponding times
        assert len(Ns) == len(Ts) == npoints
        assert max(Ns) <= maxN
        # Keep only the points where both logarithms are defined.
        logsafe = numpy.logical_and(Ns > 0, Ts > 0)
        Ts = numpy.log2(Ts[logsafe])
        Ns = numpy.log2(Ns[logsafe])
        coeffs = numpy.polyfit(Ns, Ts, deg=1)
        poly = numpy.poly1d(coeffs, variable='log2(N)')
        #
        df[npoints][funcname].append((seqname, poly, Ns, Ts))
        ds[npoints][seqname].append((funcname, poly, Ns, Ts))
    # actual plotting
    plotdd(df)
    plotdd(ds)  # see ``plotdd()`` above
Figures: log2( time in microseconds ) vs. log2( sequence length )
# Sorting routines to compare.  Apart from builtinsort and qsortranpart,
# they are defined in the subtasks referenced in the comments.
sort_functions = [
    builtinsort,          # see implementation above
    insertion_sort,       # see Insertion sort
    insertion_sort_lowb,  # insertion_sort, where sequential search is
                          #     replaced by lower_bound() function
    qsort,                # see Quicksort
    qsortranlc,           # qsort with randomly chosen pivot
                          #     and the filtering via list comprehension
    qsortranpart,         # qsortranlc with filtering via partition function
    qsortranpartis,       # qsortranpart, where for small input sequence
                          #     lengths insertion_sort is called
]

if __name__ == "__main__":
    import sys
    # The sort functions listed above include recursive quicksort variants;
    # raise the limit so deep recursion does not abort the run.
    sys.setrecursionlimit(10000)
    write_timings(npoints=100, maxN=1024,  # 1 <= N <= 2**10, an input sequence length
                  sort_functions=sort_functions,
                  sequence_creators=(ones, range, shuffledrange))
    plot_timings()
Executing the above script produces the figures below.
ones
ones.png (143KiB)
builtinsort - O(N) insertion_sort - O(N) qsort - O(N**2) qsortranpart - O(N)
range
range.png (145KiB)
builtinsort - O(N) insertion_sort - O(N) qsort - O(N**2) qsortranpart - O(N*log(N))
shuffled range
shuffledrange.png (152KiB)
builtinsort - O(N) insertion_sort - O(N**4) ??? qsort - O(N*log(N)) qsortranpart - O(N) ???