diff --git a/src/mergesort.h b/src/mergesort.h
index 144d581caa..baa6b7830b 100644
--- a/src/mergesort.h
+++ b/src/mergesort.h
@@ -14,13 +14,35 @@
 #ifndef LMP_MERGESORT
 #define LMP_MERGESORT
 
-/* ---------------------------------------------------------------------- */
+#include <string.h>
 
-// custom upward merge sort implementation which allows to pass a custom
-// pointer to the comparison function for access to class instances.
-// this avoids having to use global variables.
+// custom hybrid upward merge sort implementation with support for passing
+// an opaque pointer to the comparison function, e.g. for access to
+// class members. this avoids having to use global variables.
+// for improved performance, we employ an in-place insertion sort on
+// chunks of up to 32 elements and switch to merge sort from then on.
 
-// part 1. merge two sublists.
+// part 1. insertion sort for pre-sorting of small chunks
+
+static void insertion_sort(int *index, int num, void *ptr,
+                           int (*comp)(int, int, void*))
+{
+  // in-place insertion sort of index[0..num). an entry a is moved
+  // behind an entry b whenever (*comp)(a,b,ptr) returns a value > 0
+
+  for (int pos=1; pos < num; ++pos) {
+    const int key = index[pos];
+    int slot = pos;
+    // shift entries comparing greater than key up by one position
+    while ((slot > 0) && ((*comp)(index[slot-1],key,ptr) > 0)) {
+      index[slot] = index[slot-1];
+      --slot;
+    }
+    index[slot] = key;
+  }
+}
+
+// part 2. merge two sublists
 
 static void do_merge(int *idx, int *buf, int llo, int lhi, int rlo, int rhi,
                      void *ptr, int (*comp)(int, int, void *))
@@ -34,34 +56,65 @@ static void do_merge(int *idx, int *buf, int llo, int lhi, int rlo, int rhi,
     else idx[i++] = buf[r++];
   }
     
-  while(l < lhi) idx[i++] = buf[l++];
-  while(r < rhi) idx[i++] = buf[r++];
+  while (l < lhi) idx[i++] = buf[l++];
+  while (r < rhi) idx[i++] = buf[r++];
 }
 
-// part 2: loop over sublists doubling in size with each iteration
+// part 3: loop over sublists doubling in size with each iteration.
+//         pre-sort sublists with insertion sort for better performance.
 
 static void merge_sort(int *index, int num, void *ptr,
                        int (*comp)(int, int, void *))
 {
   if (num < 2) return;
 
-  int *hold = new int[num];
-  int i,j,k,m;
+  int chunk,i,j;
 
-  i = 1;
-  while (i < num) {
-    memcpy(hold,index,sizeof(int)*num);
-    for (j=0; j < num-1; j += 2*i) {
-      k = j + 2*i;
-      if (k > num) k=num;
-      m = j+i;
-      if (m > num) m=num;
-      do_merge(index,hold,j,m,m,k,ptr,comp);
-    }
-    i *= 2;
+  // sort index[0..num) in place; ptr and comp are passed on unchanged.
+  // first do insertion sort on chunks of up to 32 elements
+
+  chunk = 32;
+  for (i=0; i < num; i += chunk) {
+    j = (i+chunk > num) ? num-i : chunk;
+    insertion_sort(index+i,j,ptr,comp);
   }
 
-  delete[] hold;
+  // already done?
+
+  if (chunk >= num) return;
+
+  // continue with merge sort on the pre-sorted chunks.
+  // we need an extra buffer for temporary storage and two
+  // pointers to operate on, so we can swap the pointers
+  // rather than copying to the hold buffer in each pass
+
+  int *buf = new int[num];
+  int *dest = index;
+  int *hold = buf;
+
+  while (chunk < num) {
+    // swap hold and destination buffer
+
+    int *tmp = dest; dest = hold; hold = tmp;
+
+    // merge from hold array to destination array. use i < num so that
+    // a lone trailing element also gets copied over to the destination
+
+    for (i=0; i < num; i += 2*chunk) {
+      j = i + 2*chunk;
+      if (j > num) j=num;
+      int m = i+chunk;
+      if (m > num) m=num;
+      do_merge(dest,hold,i,m,m,j,ptr,comp);
+    }
+    chunk *= 2;
+  }
+
+  // if the final sorted data is in buf, copy back to index
+
+  if (dest == buf) memcpy(index,buf,sizeof(int)*num);
+
+  delete[] buf;
 }
 
 #endif