* 'hash_exprs'. When multiple expressions are present, the hash values
* returned by each hash function are combined to produce a single hash value.
*
+ * If any hash_expr yields NULL and the corresponding hash operator is strict,
+ * the created ExprState will return NULL. (If the operator is not strict,
+ * we treat NULL values as having a hash value of zero. The hash functions
+ * themselves are always treated as strict.)
+ *
* desc: tuple descriptor for the to-be-hashed expressions
* ops: TupleTableSlotOps for the TupleDesc
* hashfunc_oids: Oid for each hash function to call, one for each 'hash_expr'
- * collations: collation to use when calling the hash function.
- * hash_expr: list of expressions to hash the value of
- * opstrict: array corresponding to the 'hashfunc_oids' to store op_strict()
+ * collations: list of collations to use when calling the hash functions
+ * hash_exprs: list of expressions to hash the value of
+ * opstrict: strictness flag for each hash function's comparison operator
* parent: PlanState node that the 'hash_exprs' will be evaluated at
* init_value: Normally 0, but can be set to other values to seed the hash
* with some other value. Using non-zero is slightly less efficient but can
* be useful.
- * keep_nulls: if true, evaluation of the returned ExprState will abort early
- * returning NULL if the given hash function is strict and the Datum to hash
- * is null. When set to false, any NULL input Datums are skipped.
*/
ExprState *
ExecBuildHash32Expr(TupleDesc desc, const TupleTableSlotOps *ops,
const Oid *hashfunc_oids, const List *collations,
const List *hash_exprs, const bool *opstrict,
- PlanState *parent, uint32 init_value, bool keep_nulls)
+ PlanState *parent, uint32 init_value)
{
ExprState *state = makeNode(ExprState);
ExprEvalStep scratch = {0};
fmgr_info(funcid, finfo);
/*
- * Build the steps to evaluate the hash function's argument have it so
- * the value of that is stored in the 0th argument of the hash func.
+ * Build the steps to evaluate the hash function's argument, placing
+ * the value in the 0th argument of the hash func.
*/
ExecInitExprRec(expr,
state,
scratch.d.hashdatum.fcinfo_data = fcinfo;
scratch.d.hashdatum.fn_addr = finfo->fn_addr;
- scratch.opcode = opstrict[i] && !keep_nulls ? strict_opcode : opcode;
+ scratch.opcode = opstrict[i] ? strict_opcode : opcode;
scratch.d.hashdatum.jumpdone = -1;
ExprEvalPushStep(state, &scratch);
#include "utils/lsyscache.h"
#include "utils/memutils.h"
#include "utils/syscache.h"
+#include "utils/tuplestore.h"
#include "utils/wait_event.h"
static void ExecHashIncreaseNumBatches(HashJoinTable hashtable);
/* must provide our own instrumentation support */
if (node->ps.instrument)
- InstrStopNode(node->ps.instrument, node->hashtable->partialTuples);
+ InstrStopNode(node->ps.instrument, node->hashtable->reportTuples);
/*
* We do not return the hash table directly because it's not a subtype of
HashJoinTable hashtable;
TupleTableSlot *slot;
ExprContext *econtext;
+ double nullTuples = 0;
/*
* get state info from node
econtext = node->ps.ps_ExprContext;
/*
- * Get all tuples from the node below the Hash node and insert into the
- * hash table (or temp files).
+ * Get all tuples from the node below the Hash node and insert the
+ * potentially-matchable ones into the hash table (or temp files). Tuples
+ * that can't possibly match because they have null join keys are dumped
+ * into a separate tuplestore, or just summarily discarded if we don't
+ * need to emit them with null-extension.
*/
for (;;)
{
if (!isnull)
{
+ /* normal case with a non-null join key */
uint32 hashvalue = DatumGetUInt32(hashdatum);
int bucketNumber;
/* It's a skew tuple, so put it into that hash table */
ExecHashSkewTableInsert(hashtable, slot, hashvalue,
bucketNumber);
- hashtable->skewTuples += 1;
}
else
{
}
hashtable->totalTuples += 1;
}
+ else if (node->keep_null_tuples)
+ {
+ /* null join key, but we must save tuple to be emitted later */
+ if (node->null_tuple_store == NULL)
+ node->null_tuple_store = ExecHashBuildNullTupleStore(hashtable);
+ tuplestore_puttupleslot(node->null_tuple_store, slot);
+ nullTuples += 1;
+ }
+ /* else we can discard the tuple immediately */
}
/* resize the hash table if needed (NTUP_PER_BUCKET exceeded) */
if (hashtable->spaceUsed > hashtable->spacePeak)
hashtable->spacePeak = hashtable->spaceUsed;
- hashtable->partialTuples = hashtable->totalTuples;
+ /* Report total number of tuples output (but not those discarded) */
+ hashtable->reportTuples = hashtable->totalTuples + nullTuples;
}
/* ----------------------------------------------------------------
HashJoinTable hashtable;
TupleTableSlot *slot;
ExprContext *econtext;
- uint32 hashvalue;
Barrier *build_barrier;
int i;
for (;;)
{
bool isnull;
+ uint32 hashvalue;
slot = ExecProcNode(outerNode);
if (TupIsNull(slot))
&isnull));
if (!isnull)
+ {
+ /* normal case with a non-null join key */
ExecParallelHashTableInsert(hashtable, slot, hashvalue);
- hashtable->partialTuples++;
+ hashtable->reportTuples++;
+ }
+ else if (node->keep_null_tuples)
+ {
+ /* null join key, but save tuple to be emitted later */
+ if (node->null_tuple_store == NULL)
+ node->null_tuple_store = ExecHashBuildNullTupleStore(hashtable);
+ tuplestore_puttupleslot(node->null_tuple_store, slot);
+ hashtable->reportTuples++;
+ }
+ /* else we can discard the tuple immediately */
}
/*
/*
* We're not yet attached to a batch. We all agree on the dimensions and
- * number of inner tuples (for the empty table optimization).
+ * number of inner tuples. (In parallel mode, totalTuples isn't used in
+ * this module, but we must report it for nodeHashjoin.c's empty-table
+ * optimization.)
*/
hashtable->curbatch = -1;
hashtable->nbuckets = pstate->nbuckets;
Assert(node->plan.qual == NIL);
- /*
- * Delay initialization of hash_expr until ExecInitHashJoin(). We cannot
- * build the ExprState here as we don't yet know the join type we're going
- * to be hashing values for and we need to know that before calling
- * ExecBuildHash32Expr as the keep_nulls parameter depends on the join
- * type.
- */
+ /* these fields will be filled by ExecInitHashJoin() */
hashstate->hash_expr = NULL;
+ hashstate->null_tuple_store = NULL;
+ hashstate->keep_null_tuples = false;
return hashstate;
}
hashtable->nbatch_outstart = nbatch;
hashtable->growEnabled = true;
hashtable->totalTuples = 0;
- hashtable->partialTuples = 0;
+ hashtable->reportTuples = 0;
hashtable->skewTuples = 0;
hashtable->innerBatchFile = NULL;
hashtable->outerBatchFile = NULL;
*/
HashJoinTuple hashTuple;
int hashTupleSize;
- double ntuples = (hashtable->totalTuples - hashtable->skewTuples);
/* Create the HashJoinTuple */
hashTupleSize = HJTUPLE_OVERHEAD + tuple->t_len;
/*
* Increase the (optimal) number of buckets if we just exceeded the
* NTUP_PER_BUCKET threshold, but only when there's still a single
- * batch.
+ * batch. Note that totalTuples - skewTuples is a reliable indicator
+ * of the hash table's size only as long as there's just one batch.
*/
if (hashtable->nbatch == 1 &&
- ntuples > (hashtable->nbuckets_optimal * NTUP_PER_BUCKET))
+ (hashtable->totalTuples - hashtable->skewTuples) >
+ (hashtable->nbuckets_optimal * NTUP_PER_BUCKET))
{
/* Guard against integer overflow and alloc size overflow */
if (hashtable->nbuckets_optimal <= INT_MAX / 2 &&
Assert(hashTuple != hashTuple->next.unshared);
/* Account for space used, and back off if we've used too much */
+ hashtable->skewTuples += 1;
hashtable->spaceUsed += hashTupleSize;
hashtable->spaceUsedSkew += hashTupleSize;
if (hashtable->spaceUsed > hashtable->spacePeak)
hashtable->spaceUsedSkew -= tupleSize;
}
+ /*
+ * We must reduce skewTuples, but totalTuples doesn't change since it
+ * counts both main and skew tuples.
+ */
+ hashtable->skewTuples -= 1;
+
hashTuple = nextHashTuple;
/* allow this loop to be cancellable */
}
}
+/*
+ * ExecHashBuildNullTupleStore
+ *		Build a tuplestore for null-keyed input tuples that must be
+ *		emitted later (works alike for outer and inner tuples).
+ *
+ * In a parallel hash join, each worker keeps its own tuplestore(s) for
+ * these tuples; no cross-worker coordination is needed to decide what to
+ * do with them, so they always live in backend-private storage.
+ */
+Tuplestorestate *
+ExecHashBuildNullTupleStore(HashJoinTable hashtable)
+{
+	Tuplestorestate *tstore;
+	MemoryContext oldcxt;
+
+	/*
+	 * Keep the tuplestore in the hashCxt to ensure it won't go away too
+	 * soon.  Limit its in-memory footprint to work_mem/16 kilobytes so it
+	 * doesn't bloat the node's space consumption too much.
+	 */
+	oldcxt = MemoryContextSwitchTo(hashtable->hashCxt);
+	tstore = tuplestore_begin_heap(false, false, work_mem / 16);
+	MemoryContextSwitchTo(oldcxt);
+	return tstore;
+}
+
/*
* Reserve space in the DSM segment for instrumentation data.
*/
#include "miscadmin.h"
#include "utils/lsyscache.h"
#include "utils/sharedtuplestore.h"
+#include "utils/tuplestore.h"
#include "utils/wait_event.h"
#define HJ_SCAN_BUCKET 3
#define HJ_FILL_OUTER_TUPLE 4
#define HJ_FILL_INNER_TUPLES 5
-#define HJ_NEED_NEW_BATCH 6
+#define HJ_FILL_OUTER_NULL_TUPLES 6
+#define HJ_FILL_INNER_NULL_TUPLES 7
+#define HJ_NEED_NEW_BATCH 8
/* Returns true if doing null-fill on outer relation */
#define HJ_FILL_OUTER(hjstate) ((hjstate)->hj_NullInnerTupleSlot != NULL)
/*
* If the inner relation is completely empty, and we're not
* doing a left outer join, we can quit without scanning the
- * outer relation.
+ * outer relation. (If the inner relation contains only
+ * null-keyed tuples that we need to emit, we'll fall through
+ * and do the outer-relation scan. In principle we could go
+ * emit those tuples then quit, but it would complicate the
+ * state machine logic. The case seems rare enough to not be
+ * worth optimizing.)
*/
- if (hashtable->totalTuples == 0 && !HJ_FILL_OUTER(node))
+ if (hashtable->totalTuples == 0 &&
+ hashNode->null_tuple_store == NULL &&
+ !HJ_FILL_OUTER(node))
{
if (parallel)
{
ExecParallelHashJoinPartitionOuter(node);
BarrierArriveAndWait(build_barrier,
WAIT_EVENT_HASH_BUILD_HASH_OUTER);
- }
- else if (BarrierPhase(build_barrier) == PHJ_BUILD_FREE)
- {
- /*
- * If we attached so late that the job is finished and
- * the batch state has been freed, we can return
- * immediately.
- */
- return NULL;
+ Assert(BarrierPhase(build_barrier) == PHJ_BUILD_RUN);
}
- /* Each backend should now select a batch to work on. */
- Assert(BarrierPhase(build_barrier) == PHJ_BUILD_RUN);
+ /*
+ * Each backend should now select a batch to work on.
+ * However, if we've already collected some null-keyed
+ * tuples, dump them first. (That is critical when we
+ * arrive late enough that no more batches are available;
+ * otherwise we'd fail to dump those tuples at all.)
+ */
hashtable->curbatch = -1;
- node->hj_JoinState = HJ_NEED_NEW_BATCH;
+
+ if (node->hj_NullOuterTupleStore)
+ node->hj_JoinState = HJ_FILL_OUTER_NULL_TUPLES;
+ else if (hashNode->null_tuple_store)
+ node->hj_JoinState = HJ_FILL_INNER_NULL_TUPLES;
+ else
+ node->hj_JoinState = HJ_NEED_NEW_BATCH;
continue;
}
if (parallel)
{
/*
- * Only one process is currently allow to handle
+ * Only one process is currently allowed to handle
* each batch's unmatched tuples, in a parallel
- * join.
+ * join. However, each process must deal with any
+ * null-keyed tuples it found.
*/
if (ExecParallelPrepHashTableForUnmatched(node))
node->hj_JoinState = HJ_FILL_INNER_TUPLES;
+ else if (node->hj_NullOuterTupleStore)
+ node->hj_JoinState = HJ_FILL_OUTER_NULL_TUPLES;
+ else if (hashNode->null_tuple_store)
+ node->hj_JoinState = HJ_FILL_INNER_NULL_TUPLES;
else
node->hj_JoinState = HJ_NEED_NEW_BATCH;
}
}
}
else
- node->hj_JoinState = HJ_NEED_NEW_BATCH;
+ {
+ /* might have outer null-keyed tuples to fill */
+ Assert(hashNode->null_tuple_store == NULL);
+ if (node->hj_NullOuterTupleStore)
+ node->hj_JoinState = HJ_FILL_OUTER_NULL_TUPLES;
+ else
+ node->hj_JoinState = HJ_NEED_NEW_BATCH;
+ }
continue;
}
if (!(parallel ? ExecParallelScanHashTableForUnmatched(node, econtext)
: ExecScanHashTableForUnmatched(node, econtext)))
{
- /* no more unmatched tuples */
- node->hj_JoinState = HJ_NEED_NEW_BATCH;
+ /* no more unmatched tuples, but maybe there are nulls */
+ if (node->hj_NullOuterTupleStore)
+ node->hj_JoinState = HJ_FILL_OUTER_NULL_TUPLES;
+ else if (hashNode->null_tuple_store)
+ node->hj_JoinState = HJ_FILL_INNER_NULL_TUPLES;
+ else
+ node->hj_JoinState = HJ_NEED_NEW_BATCH;
continue;
}
InstrCountFiltered2(node, 1);
break;
+ case HJ_FILL_OUTER_NULL_TUPLES:
+
+ /*
+ * We have finished a batch, but we are doing left/full join,
+ * so any null-keyed outer tuples have to be emitted before we
+ * continue to the next batch.
+ *
+ * (We could delay this till the end of the join, but there
+ * seems little percentage in that.)
+ *
+ * We have to use tuplestore_gettupleslot_force because
+ * hj_OuterTupleSlot may not be able to store a MinimalTuple.
+ */
+ while (tuplestore_gettupleslot_force(node->hj_NullOuterTupleStore,
+ true, false,
+ node->hj_OuterTupleSlot))
+ {
+ /*
+ * Generate a fake join tuple with nulls for the inner
+ * tuple, and return it if it passes the non-join quals.
+ */
+ econtext->ecxt_outertuple = node->hj_OuterTupleSlot;
+ econtext->ecxt_innertuple = node->hj_NullInnerTupleSlot;
+
+ if (otherqual == NULL || ExecQual(otherqual, econtext))
+ return ExecProject(node->js.ps.ps_ProjInfo);
+ else
+ InstrCountFiltered2(node, 1);
+
+ ResetExprContext(econtext);
+
+ /* allow this loop to be cancellable */
+ CHECK_FOR_INTERRUPTS();
+ }
+
+ /* We don't need the tuplestore any more, so discard it. */
+ tuplestore_end(node->hj_NullOuterTupleStore);
+ node->hj_NullOuterTupleStore = NULL;
+
+ /* Fill inner tuples too if it's a full join, else advance. */
+ if (hashNode->null_tuple_store)
+ node->hj_JoinState = HJ_FILL_INNER_NULL_TUPLES;
+ else
+ node->hj_JoinState = HJ_NEED_NEW_BATCH;
+ break;
+
+ case HJ_FILL_INNER_NULL_TUPLES:
+
+ /*
+ * We have finished a batch, but we are doing
+ * right/right-anti/full join, so any null-keyed inner tuples
+ * have to be emitted before we continue to the next batch.
+ *
+ * (We could delay this till the end of the join, but there
+ * seems little percentage in that.)
+ */
+ while (tuplestore_gettupleslot(hashNode->null_tuple_store,
+ true, false,
+ node->hj_HashTupleSlot))
+ {
+ /*
+ * Generate a fake join tuple with nulls for the outer
+ * tuple, and return it if it passes the non-join quals.
+ */
+ econtext->ecxt_outertuple = node->hj_NullOuterTupleSlot;
+ econtext->ecxt_innertuple = node->hj_HashTupleSlot;
+
+ if (otherqual == NULL || ExecQual(otherqual, econtext))
+ return ExecProject(node->js.ps.ps_ProjInfo);
+ else
+ InstrCountFiltered2(node, 1);
+
+ ResetExprContext(econtext);
+
+ /* allow this loop to be cancellable */
+ CHECK_FOR_INTERRUPTS();
+ }
+
+ /*
+ * Ideally we'd discard the tuplestore now, but we can't
+ * because we might need it for rescans.
+ */
+
+ /* Now we can advance to the next batch. */
+ node->hj_JoinState = HJ_NEED_NEW_BATCH;
+ break;
+
case HJ_NEED_NEW_BATCH:
/*
/*
* Build ExprStates to obtain hash values for either side of the join.
- * This must be done here as ExecBuildHash32Expr needs to know how to
- * handle NULL inputs and the required handling of that depends on the
- * jointype. We don't know the join type in ExecInitHash() and we
- * must build the ExprStates before ExecHashTableCreate() so we
+ * Note: must build the ExprStates before ExecHashTableCreate() so we
* properly attribute any SubPlans that exist in the hash expressions
* to the correct PlanState.
*/
/*
* Determine the hash function for each side of the join for the given
- * hash operator.
+ * join operator, and detect whether the join operator is strict.
*/
foreach(lc, node->hashoperators)
{
/*
* Build an ExprState to generate the hash value for the expressions
- * on the outer of the join. This ExprState must finish generating
- * the hash value when HJ_FILL_OUTER() is true. Otherwise,
- * ExecBuildHash32Expr will set up the ExprState to abort early if it
- * finds a NULL. In these cases, we don't need to store these tuples
- * in the hash table as the jointype does not require it.
+ * on the outer side of the join.
*/
hjstate->hj_OuterHash =
ExecBuildHash32Expr(hjstate->js.ps.ps_ResultTupleDesc,
node->hashkeys,
hash_strict,
&hjstate->js.ps,
- 0,
- HJ_FILL_OUTER(hjstate));
+ 0);
/* As above, but for the inner side of the join */
hashstate->hash_expr =
hash->hashkeys,
hash_strict,
&hashstate->ps,
- 0,
- HJ_FILL_INNER(hjstate));
+ 0);
+
+ /* Remember whether we need to save tuples with null join keys */
+ hjstate->hj_KeepNullTuples = HJ_FILL_OUTER(hjstate);
+ hashstate->keep_null_tuples = HJ_FILL_INNER(hjstate);
/*
* Set up the skew table hash function while we have a record of the
* initialize hash-specific info
*/
hjstate->hj_HashTable = NULL;
+ hjstate->hj_NullOuterTupleStore = NULL;
hjstate->hj_FirstOuterTupleSlot = NULL;
hjstate->hj_CurHashValue = 0;
void
ExecEndHashJoin(HashJoinState *node)
{
+ HashState *hashNode = castNode(HashState, innerPlanState(node));
+
+ /*
+ * Free tuple stores if we made them (must do this before
+ * ExecHashTableDestroy deletes hashCxt)
+ */
+ if (node->hj_NullOuterTupleStore)
+ {
+ tuplestore_end(node->hj_NullOuterTupleStore);
+ node->hj_NullOuterTupleStore = NULL;
+ }
+ if (hashNode->null_tuple_store)
+ {
+ tuplestore_end(hashNode->null_tuple_store);
+ hashNode->null_tuple_store = NULL;
+ }
+
/*
* Free hash table
*/
if (!isnull)
{
+ /* normal case with a non-null join key */
/* remember outer relation is not empty for possible rescan */
hjstate->hj_OuterNotEmpty = true;
return slot;
}
+ else if (hjstate->hj_KeepNullTuples)
+ {
+ /* null join key, but we must save tuple to be emitted later */
+ if (hjstate->hj_NullOuterTupleStore == NULL)
+ hjstate->hj_NullOuterTupleStore = ExecHashBuildNullTupleStore(hashtable);
+ tuplestore_puttupleslot(hjstate->hj_NullOuterTupleStore, slot);
+ }
/*
* That tuple couldn't match because of a NULL, so discard it and
&isnull));
if (!isnull)
+ {
+ /* normal case with a non-null join key */
return slot;
+ }
+ else if (hjstate->hj_KeepNullTuples)
+ {
+ /* null join key, but we must save tuple to be emitted later */
+ if (hjstate->hj_NullOuterTupleStore == NULL)
+ hjstate->hj_NullOuterTupleStore = ExecHashBuildNullTupleStore(hashtable);
+ tuplestore_puttupleslot(hjstate->hj_NullOuterTupleStore, slot);
+ }
/*
* That tuple couldn't match because of a NULL, so discard it and
int start_batchno;
int batchno;
+ /*
+ * If we are a very slow worker, MultiExecParallelHash could have observed
+ * build_barrier phase PHJ_BUILD_FREE and not bothered to set up batch
+ * accessors. In that case we must be done.
+ */
+ if (hashtable->batches == NULL)
+ return false;
+
/*
* If we were already attached to a batch, remember not to bother checking
* it again, and detach from it (possibly freeing the hash table if we are
PlanState *outerPlan = outerPlanState(node);
PlanState *innerPlan = innerPlanState(node);
+ /*
+ * We're always going to rescan the outer rel, so drop the associated
+ * null-keys tuplestore; we'll rebuild it during the rescan. (Must do
+ * this before ExecHashTableDestroy deletes hashCxt.)
+ */
+ if (node->hj_NullOuterTupleStore)
+ {
+ tuplestore_end(node->hj_NullOuterTupleStore);
+ node->hj_NullOuterTupleStore = NULL;
+ }
+
/*
* In a multi-batch join, we currently have to do rescans the hard way,
* primarily because batch temp files may have already been released. But
*/
if (node->hj_HashTable != NULL)
{
+ HashState *hashNode = castNode(HashState, innerPlan);
+
+ Assert(hashNode->hashtable == node->hj_HashTable);
+
if (node->hj_HashTable->nbatch == 1 &&
innerPlan->chgParam == NULL)
{
*/
node->hj_OuterNotEmpty = false;
+ /*
+ * Also, rewind inner null-key tuplestore so that we can return
+ * those tuples again.
+ */
+ if (hashNode->null_tuple_store)
+ tuplestore_rescan(hashNode->null_tuple_store);
+
/* ExecHashJoin can skip the BUILD_HASHTABLE step */
node->hj_JoinState = HJ_NEED_NEW_OUTER;
}
else
{
/* must destroy and rebuild hash table */
- HashState *hashNode = castNode(HashState, innerPlan);
- Assert(hashNode->hashtable == node->hj_HashTable);
/* accumulate stats from old hash table, if wanted */
/* (this should match ExecShutdownHash) */
if (hashNode->ps.instrument && !hashNode->hinstrument)
if (hashNode->hinstrument)
ExecHashAccumInstrumentation(hashNode->hinstrument,
hashNode->hashtable);
+
+ /* free inner null-key tuplestore before ExecHashTableDestroy */
+ if (hashNode->null_tuple_store)
+ {
+ tuplestore_end(hashNode->null_tuple_store);
+ hashNode->null_tuple_store = NULL;
+ }
+
/* for safety, be sure to clear child plan node's pointer too */
hashNode->hashtable = NULL;
ExprContext *econtext = hjstate->js.ps.ps_ExprContext;
HashJoinTable hashtable = hjstate->hj_HashTable;
TupleTableSlot *slot;
- uint32 hashvalue;
int i;
Assert(hjstate->hj_FirstOuterTupleSlot == NULL);
for (;;)
{
bool isnull;
+ uint32 hashvalue;
slot = ExecProcNode(outerState);
if (TupIsNull(slot))
if (!isnull)
{
+ /* normal case with a non-null join key */
int batchno;
int bucketno;
bool shouldFree;
if (shouldFree)
heap_free_minimal_tuple(mintup);
}
+ else if (hjstate->hj_KeepNullTuples)
+ {
+ /* null join key, but we must save tuple to be emitted later */
+ if (hjstate->hj_NullOuterTupleStore == NULL)
+ hjstate->hj_NullOuterTupleStore = ExecHashBuildNullTupleStore(hashtable);
+ tuplestore_puttupleslot(hjstate->hj_NullOuterTupleStore, slot);
+ }
+ /* else we can just discard the tuple immediately */
+
CHECK_FOR_INTERRUPTS();
}
{
int plan_node_id = state->js.ps.plan->plan_node_id;
ParallelHashJoinState *pstate;
+ HashState *hashNode;
/* Nothing to do if we failed to create a DSM segment. */
if (pcxt->seg == NULL)
/* Clear any shared batch files. */
SharedFileSetDeleteAll(&pstate->fileset);
+ /* We'd better clear our local null-key tuplestores, too. */
+ if (state->hj_NullOuterTupleStore)
+ {
+ tuplestore_end(state->hj_NullOuterTupleStore);
+ state->hj_NullOuterTupleStore = NULL;
+ }
+ hashNode = (HashState *) innerPlanState(state);
+ if (hashNode->null_tuple_store)
+ {
+ tuplestore_end(hashNode->null_tuple_store);
+ hashNode->null_tuple_store = NULL;
+ }
+
+
/* Reset build_barrier to PHJ_BUILD_ELECT so we can go around again. */
BarrierInit(&pstate->build_barrier, 0);
}
}
}
+/*
+ * tuplestore_gettupleslot_force - exported function to fetch a tuple
+ *
+ * Identical to tuplestore_gettupleslot except that the given slot may be
+ * any kind of slot; ExecForceStoreMinimalTuple performs any conversion.
+ */
+bool
+tuplestore_gettupleslot_force(Tuplestorestate *state, bool forward,
+							  bool copy, TupleTableSlot *slot)
+{
+	MinimalTuple tuple;
+	bool		should_free;
+
+	tuple = (MinimalTuple) tuplestore_gettuple(state, forward, &should_free);
+
+	if (tuple)
+	{
+		if (copy && !should_free)
+		{
+			tuple = heap_copy_minimal_tuple(tuple, 0);
+			should_free = true;
+		}
+		ExecForceStoreMinimalTuple(tuple, slot, should_free);
+		return true;
+	}
+	else
+	{
+		ExecClearTuple(slot);
+		return false;
+	}
+}
+
/*
* tuplestore_advance - exported function to adjust position without fetching
*
const List *collations,
const List *hash_exprs,
const bool *opstrict, PlanState *parent,
- uint32 init_value, bool keep_nulls);
+ uint32 init_value);
extern ExprState *ExecBuildGroupingEqual(TupleDesc ldesc, TupleDesc rdesc,
const TupleTableSlotOps *lops, const TupleTableSlotOps *rops,
int numCols,
* inner batch file. Subsequently, while reading either inner or outer batch
* files, we might find tuples that no longer belong to the current batch;
* if so, we just dump them out to the correct batch file.
+ *
+ * If an input tuple has a null join key, then it cannot match anything from
+ * the other side of the join. Normally we can just discard such a tuple
+ * immediately, but if it comes from the outer side of an outer join then we
+ * must emit it with null-extension of the other side. For various reasons
+ * it's not convenient to do that immediately on seeing the tuple, so we dump
+ * the tuple into a tuplestore and emit it later. (In the unlikely but
+ * supported case of a non-strict join operator, we treat null keys as normal
+ * data.)
* ----------------------------------------------------------------
*/
bool growEnabled; /* flag to shut off nbatch increases */
- double totalTuples; /* # tuples obtained from inner plan */
- double partialTuples; /* # tuples obtained from inner plan by me */
- double skewTuples; /* # tuples inserted into skew tuples */
+ /*
+ * totalTuples is the running total of tuples inserted into either the
+ * main or skew hash tables. reportTuples is the number of tuples that we
+ * want EXPLAIN to show as output from the Hash node (this includes saved
+ * null-keyed tuples as well as those inserted into the hash tables).
+ * skewTuples is the number of tuples present in the skew hash table.
+ */
+ double totalTuples;
+ double reportTuples;
+ double skewTuples;
/*
* These arrays are allocated for the life of the hash join, but only if
int *numbatches,
int *num_skew_mcvs);
extern int ExecHashGetSkewBucket(HashJoinTable hashtable, uint32 hashvalue);
+extern Tuplestorestate *ExecHashBuildNullTupleStore(HashJoinTable hashtable);
extern void ExecHashEstimate(HashState *node, ParallelContext *pcxt);
extern void ExecHashInitializeDSM(HashState *node, ParallelContext *pcxt);
extern void ExecHashInitializeWorker(HashState *node, ParallelWorkerContext *pwcxt);
* hj_NullOuterTupleSlot prepared null tuple for right/right-anti/full
* outer joins
* hj_NullInnerTupleSlot prepared null tuple for left/full outer joins
+ * hj_NullOuterTupleStore tuplestore holding outer tuples that have
+ * null join keys (but must be emitted anyway)
* hj_FirstOuterTupleSlot first tuple retrieved from outer plan
* hj_JoinState current state of ExecHashJoin state machine
+ * hj_KeepNullTuples true to keep outer tuples with null join keys
* hj_MatchedOuter true if found a join match for current outer
* hj_OuterNotEmpty true if outer relation known not empty
* ----------------
TupleTableSlot *hj_HashTupleSlot;
TupleTableSlot *hj_NullOuterTupleSlot;
TupleTableSlot *hj_NullInnerTupleSlot;
+ Tuplestorestate *hj_NullOuterTupleStore;
TupleTableSlot *hj_FirstOuterTupleSlot;
int hj_JoinState;
+ bool hj_KeepNullTuples;
bool hj_MatchedOuter;
bool hj_OuterNotEmpty;
} HashJoinState;
FmgrInfo *skew_hashfunction; /* lookup data for skew hash function */
Oid skew_collation; /* collation to call skew_hashfunction with */
+ Tuplestorestate *null_tuple_store; /* where to put null-keyed tuples */
+ bool keep_null_tuples; /* do we need to save such tuples? */
+
/*
* In a parallelized hash join, the leader retains a pointer to the
* shared-memory stats area in its shared_info field, and then copies the
extern bool tuplestore_gettupleslot(Tuplestorestate *state, bool forward,
bool copy, TupleTableSlot *slot);
+extern bool tuplestore_gettupleslot_force(Tuplestorestate *state, bool forward,
+ bool copy, TupleTableSlot *slot);
+
extern bool tuplestore_advance(Tuplestorestate *state, bool forward);
extern bool tuplestore_skiptuples(Tuplestorestate *state,
explain (costs off)
select * from
(values (1, array[10,20]), (2, array[20,30])) as v1(v1x,v1ys)
-left join (values (1, 10), (2, 20)) as v2(v2x,v2y) on v2x = v1x
+left join (values (1, 10), (2, 20), (2, null)) as v2(v2x,v2y) on v2x = v1x
left join unnest(v1ys) as u1(u1y) on u1y = v2y;
QUERY PLAN
-------------------------------------------------------------
select * from
(values (1, array[10,20]), (2, array[20,30])) as v1(v1x,v1ys)
-left join (values (1, 10), (2, 20)) as v2(v2x,v2y) on v2x = v1x
+left join (values (1, 10), (2, 20), (2, null)) as v2(v2x,v2y) on v2x = v1x
left join unnest(v1ys) as u1(u1y) on u1y = v2y;
v1x | v1ys | v2x | v2y | u1y
-----+---------+-----+-----+-----
1 | {10,20} | 1 | 10 | 10
2 | {20,30} | 2 | 20 | 20
-(2 rows)
+ 2 | {20,30} | 2 | |
+(3 rows)
--
-- test handling of potential equivalence clauses above outer joins
-- estimated size.
create table simple as
select generate_series(1, 20000) AS id, 'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa';
+insert into simple values (null, null);
alter table simple set (parallel_workers = 2);
analyze simple;
-- Make a relation whose size we will under-estimate. We want stats
select count(*) from simple r full outer join simple s using (id);
count
-------
- 20000
+ 20002
(1 row)
rollback to settings;
select count(*) from simple r full outer join simple s using (id);
count
-------
- 20000
+ 20002
(1 row)
rollback to settings;
select count(*) from simple r full outer join simple s using (id);
count
-------
- 20000
+ 20002
(1 row)
rollback to settings;
select count(*) from simple r full outer join simple s using (id);
count
-------
- 20000
+ 20002
(1 row)
rollback to settings;
select count(*) from simple r full outer join simple s on (r.id = 0 - s.id);
count
-------
- 40000
+ 40002
(1 row)
rollback to settings;
select count(*) from simple r full outer join simple s on (r.id = 0 - s.id);
count
-------
- 40000
+ 40002
(1 row)
rollback to settings;
select count(*) from simple r full outer join simple s on (r.id = 0 - s.id);
count
-------
- 40000
+ 40002
(1 row)
rollback to settings;
explain (costs off)
select * from
(values (1, array[10,20]), (2, array[20,30])) as v1(v1x,v1ys)
-left join (values (1, 10), (2, 20)) as v2(v2x,v2y) on v2x = v1x
+left join (values (1, 10), (2, 20), (2, null)) as v2(v2x,v2y) on v2x = v1x
left join unnest(v1ys) as u1(u1y) on u1y = v2y;
select * from
(values (1, array[10,20]), (2, array[20,30])) as v1(v1x,v1ys)
-left join (values (1, 10), (2, 20)) as v2(v2x,v2y) on v2x = v1x
+left join (values (1, 10), (2, 20), (2, null)) as v2(v2x,v2y) on v2x = v1x
left join unnest(v1ys) as u1(u1y) on u1y = v2y;
--
-- estimated size.
create table simple as
select generate_series(1, 20000) AS id, 'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa';
+insert into simple values (null, null);
alter table simple set (parallel_workers = 2);
analyze simple;