* applied to different kinds of sortable objects. Implementation of
* the particular sorting variants is given in tuplesortvariants.c.
* This module works efficiently for both small and large amounts
- * of data. Small amounts are sorted in-memory using qsort(). Large
- * amounts are sorted using temporary files and a standard external sort
+ * of data. Small amounts are sorted in-memory. Large amounts are
+ * sorted using temporary files and a standard external sort
* algorithm.
*
* See Knuth, volume 3, for more than you want to know about external
* Historically, we divided the input into sorted runs using replacement
* selection, in the form of a priority tree implemented as a heap
* (essentially Knuth's Algorithm 5.2.3H), but now we always use quicksort
- * for run generation.
+ * or radix sort for run generation.
*
* The approximate amount of memory allowed for any one sort operation
* is specified in kilobytes by the caller (most pass work_mem). Initially,
* we absorb tuples and simply store them in an unsorted array as long as
* we haven't exceeded workMem. If we reach the end of the input without
- * exceeding workMem, we sort the array using qsort() and subsequently return
+ * exceeding workMem, we sort the array in memory and subsequently return
* tuples just by scanning the tuple array sequentially. If we do exceed
* workMem, we begin to emit tuples into sorted runs in temporary tapes.
- * When tuples are dumped in batch after quicksorting, we begin a new run
+ * When tuples are dumped in batch after in-memory sorting, we begin a new run
* with a new output tape. If we reach the max number of tapes, we write
* subsequent runs on the existing tapes in a round-robin fashion. We will
* need multiple merge passes to finish the merge in that case. After the
static void tuplesort_free(Tuplesortstate *state);
static void tuplesort_updatemax(Tuplesortstate *state);
-/*
- * Specialized comparators that we can inline into specialized sorts. The goal
- * is to try to sort two tuples without having to follow the pointers to the
- * comparator or the tuple.
- *
- * XXX: For now, there is no specialization for cases where datum1 is
- * authoritative and we don't even need to fall back to a callback at all (that
- * would be true for types like int4/int8/timestamp/date, but not true for
- * abbreviations of text or multi-key sorts. There could be! Is it worth it?
- */
-
-/* Used if first key's comparator is ssup_datum_unsigned_cmp */
-static pg_attribute_always_inline int
-qsort_tuple_unsigned_compare(SortTuple *a, SortTuple *b, Tuplesortstate *state)
-{
- int compare;
-
- compare = ApplyUnsignedSortComparator(a->datum1, a->isnull1,
- b->datum1, b->isnull1,
- &state->base.sortKeys[0]);
- if (compare != 0)
- return compare;
-
- /*
- * No need to waste effort calling the tiebreak function when there are no
- * other keys to sort on.
- */
- if (state->base.onlyKey != NULL)
- return 0;
-
- return state->base.comparetup_tiebreak(a, b, state);
-}
-
-/* Used if first key's comparator is ssup_datum_signed_cmp */
-static pg_attribute_always_inline int
-qsort_tuple_signed_compare(SortTuple *a, SortTuple *b, Tuplesortstate *state)
-{
- int compare;
-
- compare = ApplySignedSortComparator(a->datum1, a->isnull1,
- b->datum1, b->isnull1,
- &state->base.sortKeys[0]);
-
- if (compare != 0)
- return compare;
-
- /*
- * No need to waste effort calling the tiebreak function when there are no
- * other keys to sort on.
- */
- if (state->base.onlyKey != NULL)
- return 0;
-
- return state->base.comparetup_tiebreak(a, b, state);
-}
-
-/* Used if first key's comparator is ssup_datum_int32_cmp */
-static pg_attribute_always_inline int
-qsort_tuple_int32_compare(SortTuple *a, SortTuple *b, Tuplesortstate *state)
-{
- int compare;
-
- compare = ApplyInt32SortComparator(a->datum1, a->isnull1,
- b->datum1, b->isnull1,
- &state->base.sortKeys[0]);
-
- if (compare != 0)
- return compare;
-
- /*
- * No need to waste effort calling the tiebreak function when there are no
- * other keys to sort on.
- */
- if (state->base.onlyKey != NULL)
- return 0;
-
- return state->base.comparetup_tiebreak(a, b, state);
-}
/*
* Special versions of qsort just for SortTuple objects. qsort_tuple() sorts
* any variant of SortTuples, using the appropriate comparetup function.
* qsort_ssup() is specialized for the case where the comparetup function
* reduces to ApplySortComparator(), that is single-key MinimalTuple sorts
- * and Datum sorts. qsort_tuple_{unsigned,signed,int32} are specialized for
- * common comparison functions on pass-by-value leading datums.
+ * and Datum sorts.
*/
-#define ST_SORT qsort_tuple_unsigned
-#define ST_ELEMENT_TYPE SortTuple
-#define ST_COMPARE(a, b, state) qsort_tuple_unsigned_compare(a, b, state)
-#define ST_COMPARE_ARG_TYPE Tuplesortstate
-#define ST_CHECK_FOR_INTERRUPTS
-#define ST_SCOPE static
-#define ST_DEFINE
-#include "lib/sort_template.h"
-
-#define ST_SORT qsort_tuple_signed
-#define ST_ELEMENT_TYPE SortTuple
-#define ST_COMPARE(a, b, state) qsort_tuple_signed_compare(a, b, state)
-#define ST_COMPARE_ARG_TYPE Tuplesortstate
-#define ST_CHECK_FOR_INTERRUPTS
-#define ST_SCOPE static
-#define ST_DEFINE
-#include "lib/sort_template.h"
-
-#define ST_SORT qsort_tuple_int32
-#define ST_ELEMENT_TYPE SortTuple
-#define ST_COMPARE(a, b, state) qsort_tuple_int32_compare(a, b, state)
-#define ST_COMPARE_ARG_TYPE Tuplesortstate
-#define ST_CHECK_FOR_INTERRUPTS
-#define ST_SCOPE static
-#define ST_DEFINE
-#include "lib/sort_template.h"
-
#define ST_SORT qsort_tuple
#define ST_ELEMENT_TYPE SortTuple
#define ST_COMPARE_RUNTIME_POINTER
#define ST_DEFINE
#include "lib/sort_template.h"
+/* state for radix sort */
+typedef struct RadixSortInfo
+{
+ union
+ {
+ size_t count;
+ size_t offset;
+ };
+ size_t next_offset;
+} RadixSortInfo;
+
+/*
+ * Threshold below which qsort_tuple() is generally faster than a radix sort.
+ */
+#define QSORT_THRESHOLD 40
+
+
/*
* tuplesort_begin_xxx
*
*/
if (SERIAL(state))
{
- /* Just qsort 'em and we're done */
+ /* Sort in memory and we're done */
tuplesort_sort_memtuples(state);
state->status = TSS_SORTEDINMEM;
}
/*
* Sort all tuples accumulated within the allowed amount of memory for
- * this run using quicksort
+ * this run.
*/
tuplesort_sort_memtuples(state);
state->boundUsed = true;
}
+
+/* radix sort routines */
+
+/*
+ * Retrieve byte from datum, indexed by 'level': 0 for MSB, 7 for LSB
+ */
+static inline uint8
+current_byte(Datum key, int level)
+{
+ int shift = (sizeof(Datum) - 1 - level) * BITS_PER_BYTE;
+
+ return (key >> shift) & 0xFF;
+}
+
/*
- * Sort all memtuples using specialized qsort() routines.
+ * Normalize datum such that unsigned comparison is order-preserving,
+ * taking ASC/DESC into account as well.
+ */
+static inline Datum
+normalize_datum(Datum orig, SortSupport ssup)
+{
+ Datum norm_datum1;
+
+ if (ssup->comparator == ssup_datum_signed_cmp)
+ {
+ norm_datum1 = orig + ((uint64) PG_INT64_MAX) + 1;
+ }
+ else if (ssup->comparator == ssup_datum_int32_cmp)
+ {
+ /*
+ * First truncate to uint32. Technically, we don't need to do this,
+ * but it forces the upper half of the datum to be zero regardless of
+ * sign.
+ */
+ uint32 u32 = DatumGetUInt32(orig) + ((uint32) PG_INT32_MAX) + 1;
+
+ norm_datum1 = UInt32GetDatum(u32);
+ }
+ else
+ {
+ Assert(ssup->comparator == ssup_datum_unsigned_cmp);
+ norm_datum1 = orig;
+ }
+
+ if (ssup->ssup_reverse)
+ norm_datum1 = ~norm_datum1;
+
+ return norm_datum1;
+}
+
+/*
+ * radix_sort_recursive
+ *
+ * Radix sort by (pass-by-value) datum1, diverting to qsort_tuple()
+ * for tiebreaks.
+ *
+ * This is a modification of
+ * ska_byte_sort() from https://github.com/skarupke/ska_sort
+ * The original copyright notice follows:
+ *
+ * Copyright Malte Skarupke 2016.
+ * Distributed under the Boost Software License, Version 1.0.
+ *
+ * Boost Software License - Version 1.0 - August 17th, 2003
+ *
+ * Permission is hereby granted, free of charge, to any person or organization
+ * obtaining a copy of the software and accompanying documentation covered by
+ * this license (the "Software") to use, reproduce, display, distribute,
+ * execute, and transmit the Software, and to prepare derivative works of the
+ * Software, and to permit third-parties to whom the Software is furnished to
+ * do so, all subject to the following:
+ *
+ * The copyright notices in the Software and this entire statement, including
+ * the above license grant, this restriction and the following disclaimer,
+ * must be included in all copies of the Software, in whole or in part, and
+ * all derivative works of the Software, unless such copies or derivative
+ * works are solely in the form of machine-executable object code generated by
+ * a source language processor.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
+ * SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
+ * FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+static void
+radix_sort_recursive(SortTuple *begin, size_t n_elems, int level, Tuplesortstate *state)
+{
+ RadixSortInfo partitions[256] = {0};
+ uint8 remaining_partitions[256];
+ size_t total = 0;
+ int num_partitions = 0;
+ int num_remaining;
+ SortSupport ssup = &state->base.sortKeys[0];
+ size_t start_offset = 0;
+ SortTuple *partition_begin = begin;
+
+ /* count number of occurrences of each byte */
+ for (SortTuple *st = begin; st < begin + n_elems; st++)
+ {
+ uint8 this_partition;
+
+ /* extract the byte for this level from the normalized datum */
+ this_partition = current_byte(normalize_datum(st->datum1, ssup),
+ level);
+
+ /* save it for the permutation step */
+ st->curbyte = this_partition;
+
+ partitions[this_partition].count++;
+
+ CHECK_FOR_INTERRUPTS();
+ }
+
+ /* compute partition offsets */
+ for (int i = 0; i < 256; i++)
+ {
+ size_t count = partitions[i].count;
+
+ if (count != 0)
+ {
+ partitions[i].offset = total;
+ total += count;
+ remaining_partitions[num_partitions] = i;
+ num_partitions++;
+ }
+ partitions[i].next_offset = total;
+ }
+
+ /*
+ * Swap tuples to correct partition.
+ *
+ * In traditional American flag sort, a swap sends the current element to
+ * the correct partition, but the array pointer only advances if the
+ * partner of the swap happens to be an element that belongs in the
+ * current partition. That only requires one pass through the array, but
+ * the disadvantage is we don't know if the pointer can advance until the
+ * swap completes. Here lies the most interesting innovation from the
+ * upstream ska_byte_sort: After initiating the swap, we immediately
+ * proceed to the next element. This makes better use of CPU pipelining,
+ * but also means that we will often need multiple iterations of this
+ * loop. ska_byte_sort() maintains a separate list of which partitions
+ * haven't finished, which is updated every loop iteration. Here we simply
+ * check each partition during every iteration.
+ *
+ * If we started with a single partition, there is nothing to do. If a
+ * previous loop iteration results in only one partition that hasn't been
+ * counted as sorted, we know it's actually sorted and can exit the loop.
+ */
+ num_remaining = num_partitions;
+ while (num_remaining > 1)
+ {
+ /* start the count over */
+ num_remaining = num_partitions;
+
+ for (int i = 0; i < num_partitions; i++)
+ {
+ uint8 idx = remaining_partitions[i];
+
+ for (SortTuple *st = begin + partitions[idx].offset;
+ st < begin + partitions[idx].next_offset;
+ st++)
+ {
+ size_t offset = partitions[st->curbyte].offset++;
+ SortTuple tmp;
+
+ /* swap current tuple with destination position */
+ Assert(offset < n_elems);
+ tmp = *st;
+ *st = begin[offset];
+ begin[offset] = tmp;
+
+ CHECK_FOR_INTERRUPTS();
+ };
+
+ /* Is this partition sorted? */
+ if (partitions[idx].offset == partitions[idx].next_offset)
+ num_remaining--;
+ }
+ }
+
+ /* recurse */
+ for (uint8 *rp = remaining_partitions;
+ rp < remaining_partitions + num_partitions;
+ rp++)
+ {
+ size_t end_offset = partitions[*rp].next_offset;
+ SortTuple *partition_end = begin + end_offset;
+ size_t num_elements = end_offset - start_offset;
+
+ if (num_elements > 1)
+ {
+ if (level < sizeof(Datum) - 1)
+ {
+ if (num_elements < QSORT_THRESHOLD)
+ {
+ qsort_tuple(partition_begin,
+ num_elements,
+ state->base.comparetup,
+ state);
+ }
+ else
+ {
+ radix_sort_recursive(partition_begin,
+ num_elements,
+ level + 1,
+ state);
+ }
+ }
+ else if (state->base.onlyKey == NULL)
+ {
+ /*
+ * We've finished radix sort on all bytes of the pass-by-value
+ * datum (possibly abbreviated), now sort using the tiebreak
+ * comparator.
+ */
+ qsort_tuple(partition_begin,
+ num_elements,
+ state->base.comparetup_tiebreak,
+ state);
+ }
+ }
+
+ start_offset = end_offset;
+ partition_begin = partition_end;
+ }
+}
+
+/*
+ * Entry point for radix_sort_recursive
*
- * Quicksort is used for small in-memory sorts, and external sort runs.
+ * Partition tuples by isnull1, then sort both partitions, using
+ * radix sort on the NOT NULL partition if it's large enough.
+ */
+static void
+radix_sort_tuple(SortTuple *data, size_t n, Tuplesortstate *state)
+{
+ bool nulls_first = state->base.sortKeys[0].ssup_nulls_first;
+ SortTuple *null_start;
+ SortTuple *not_null_start;
+ size_t d1 = 0,
+ d2,
+ null_count,
+ not_null_count;
+
+ /*
+ * Find the first NOT NULL if NULLS FIRST, or first NULL if NULLS LAST.
+ * This also serves as a quick check for the common case where all tuples
+ * are NOT NULL in the first sort key.
+ */
+ while (d1 < n && data[d1].isnull1 == nulls_first)
+ {
+ d1++;
+ CHECK_FOR_INTERRUPTS();
+ }
+
+ /*
+ * If we have more than one tuple left after the quick check, partition
+ * the remainder using branchless cyclic permutation, based on
+ * https://orlp.net/blog/branchless-lomuto-partitioning/
+ */
+ Assert(n > 0);
+ if (d1 < n - 1)
+ {
+ size_t i = d1,
+ j = d1;
+ SortTuple tmp = data[d1]; /* create gap at front */
+
+ while (j < n - 1)
+ {
+ /* gap is at j, move i's element to gap */
+ data[j] = data[i];
+ /* advance j to the first unknown element */
+ j += 1;
+ /* move the first unknown element back to i */
+ data[i] = data[j];
+ /* advance i if this element belongs in the left partition */
+ i += (data[i].isnull1 == nulls_first);
+
+ CHECK_FOR_INTERRUPTS();
+ }
+
+ /* place gap between left and right partitions */
+ data[j] = data[i];
+ /* restore the saved element */
+ data[i] = tmp;
+ /* assign it to the correct partition */
+ i += (data[i].isnull1 == nulls_first);
+
+ /* d1 is now the number of elements in the left partition */
+ d1 = i;
+ }
+
+ d2 = n - d1;
+
+ /* set pointers and counts for each partition */
+ if (nulls_first)
+ {
+ null_start = data;
+ null_count = d1;
+ not_null_start = data + d1;
+ not_null_count = d2;
+ }
+ else
+ {
+ not_null_start = data;
+ not_null_count = d1;
+ null_start = data + d1;
+ null_count = d2;
+ }
+
+ for (SortTuple *st = null_start;
+ st < null_start + null_count;
+ st++)
+ Assert(st->isnull1 == true);
+ for (SortTuple *st = not_null_start;
+ st < not_null_start + not_null_count;
+ st++)
+ Assert(st->isnull1 == false);
+
+ /*
+ * Sort the NULL partition using tiebreak comparator, if necessary.
+ */
+ if (state->base.onlyKey == NULL && null_count > 1)
+ {
+ qsort_tuple(null_start,
+ null_count,
+ state->base.comparetup_tiebreak,
+ state);
+ }
+
+ /*
+ * Sort the NOT NULL partition, using radix sort if large enough,
+ * otherwise fall back to quicksort.
+ */
+ if (not_null_count < QSORT_THRESHOLD)
+ {
+ qsort_tuple(not_null_start,
+ not_null_count,
+ state->base.comparetup,
+ state);
+ }
+ else
+ {
+ bool presorted = true;
+
+ for (SortTuple *st = not_null_start + 1;
+ st < not_null_start + not_null_count;
+ st++)
+ {
+ if (COMPARETUP(state, st - 1, st) > 0)
+ {
+ presorted = false;
+ break;
+ }
+
+ CHECK_FOR_INTERRUPTS();
+ }
+
+ if (presorted)
+ return;
+ else
+ {
+ radix_sort_recursive(not_null_start,
+ not_null_count,
+ 0,
+ state);
+ }
+ }
+}
+
+/* Verify in-memory sort using standard comparator. */
+static void
+verify_memtuples_sorted(Tuplesortstate *state)
+{
+#ifdef USE_ASSERT_CHECKING
+ for (SortTuple *st = state->memtuples + 1;
+ st < state->memtuples + state->memtupcount;
+ st++)
+ Assert(COMPARETUP(state, st - 1, st) <= 0);
+#endif
+}
+
+/*
+ * Sort all memtuples using specialized routines.
+ *
+ * Quicksort or radix sort is used for small in-memory sorts,
+ * and external sort runs.
*/
static void
tuplesort_sort_memtuples(Tuplesortstate *state)
if (state->memtupcount > 1)
{
/*
- * Do we have the leading column's value or abbreviation in datum1,
- * and is there a specialization for its comparator?
+ * Do we have the leading column's value or abbreviation in datum1?
*/
if (state->base.haveDatum1 && state->base.sortKeys)
{
- if (state->base.sortKeys[0].comparator == ssup_datum_unsigned_cmp)
- {
- qsort_tuple_unsigned(state->memtuples,
- state->memtupcount,
- state);
- return;
- }
- else if (state->base.sortKeys[0].comparator == ssup_datum_signed_cmp)
- {
- qsort_tuple_signed(state->memtuples,
- state->memtupcount,
- state);
- return;
- }
- else if (state->base.sortKeys[0].comparator == ssup_datum_int32_cmp)
+ SortSupport ssup = &state->base.sortKeys[0];
+
+ /* Does it compare as an integer? */
+ if (state->memtupcount >= QSORT_THRESHOLD &&
+ (ssup->comparator == ssup_datum_unsigned_cmp ||
+ ssup->comparator == ssup_datum_signed_cmp ||
+ ssup->comparator == ssup_datum_int32_cmp))
{
- qsort_tuple_int32(state->memtuples,
- state->memtupcount,
- state);
+ radix_sort_tuple(state->memtuples,
+ state->memtupcount,
+ state);
+ verify_memtuples_sorted(state);
return;
}
}