initdb required: Refactor hashing
author Peter Geoghegan <pg@heroku.com>
Sat, 15 Mar 2014 10:52:51 +0000 (03:52 -0700)
committer Peter Geoghegan <pg@heroku.com>
Sat, 15 Mar 2014 10:52:51 +0000 (03:52 -0700)
Do some variability and commonality analysis among jsonb_hash and the
jsonb_hash_ops opclass.  Eliminate some redundant code.  Totally remove
the slightly dubious practice of using a CRC32 for hashing.

src/backend/utils/adt/jsonb_gin.c
src/backend/utils/adt/jsonb_op.c
src/backend/utils/adt/jsonb_support.c
src/include/utils/jsonb.h

index 664ce5a5a21cea46a19402586fdfb3419069335c..4fd35a41976d73d97f82789f2c8563c241e1c207 100644 (file)
@@ -14,7 +14,7 @@
 #include "postgres.h"
 
 #include "access/gin.h"
-#include "access/hash.h"
+#include "access/skey.h"
 #include "catalog/pg_collation.h"
 #include "catalog/pg_type.h"
 #include "utils/builtins.h"
@@ -28,7 +28,6 @@ typedef struct PathHashStack
 
 static text *make_text_key(const char *str, int len, char flag);
 static text *make_scalar_text_key(const JsonbValue * v, char flag);
-static void hash_scalar_value(const JsonbValue * v, PathHashStack * stack);
 
 /*
  *
@@ -325,12 +324,6 @@ gin_extract_jsonb_hash(PG_FUNCTION_ARGS)
    tail.hash_state = 0;
    stack = &tail;
 
-   /*
-    * Calculate hashes of all key_1.key_2. ... .key_n.value paths as entries.
-    * The order of array elements doesn't matter, so array keys are empty in
-    * path.  For faster calculation of hashes, use a stack of precalculated
-    * hashes of prefixes.
-    */
    while ((r = JsonbIteratorNext(&it, &v, false)) != WJB_DONE)
    {
        uint32          temphash;
@@ -345,10 +338,10 @@ gin_extract_jsonb_hash(PG_FUNCTION_ARGS)
        /*
         * Keys and values hashed as one.
         *
-        * Note that we don't hash anything that directly reflects the nesting
-        * structure (e.g. whether a structure is an array or object).  It's
-        * generally assumed that per column jsonb values frequently have a
-        * somewhat homogeneous structure.
+        * No particular effort is made to mix in the structure of the Jsonb
+        * itself (number of keys, values and elements notwithstanding), on the
+        * assumption that it's very homogeneous among Json datums in the same
+        * column.
         */
        switch (r)
        {
@@ -362,12 +355,12 @@ gin_extract_jsonb_hash(PG_FUNCTION_ARGS)
            case WJB_KEY:
                /* Calc hash of key and separated into preserved stack item */
                stack->hash_state = stack->next->hash_state;
-               hash_scalar_value(&v, stack);
+               hash_scalar_value(&v, &stack->hash_state);
                break;
            case WJB_VALUE:
            case WJB_ELEM:
                stack->hash_state = stack->next->hash_state;
-               hash_scalar_value(&v, stack);
+               hash_scalar_value(&v, &stack->hash_state);
                temphash = stack->hash_state;
                entries[i++] = temphash;
                break;
@@ -470,39 +463,8 @@ make_scalar_text_key(const JsonbValue * v, char flag)
            item = make_text_key(v->string.val, v->string.len, flag);
            break;
        default:
-           elog(ERROR, "invalid jsonb scalar type: %d", v->type);
+           elog(ERROR, "invalid jsonb scalar type");
    }
 
    return item;
 }
-
-/*
- * Hash a JsonbValue scalar value, and push it on to hashing stack
- */
-static void
-hash_scalar_value(const JsonbValue * v, PathHashStack * stack)
-{
-   switch (v->type)
-   {
-       case jbvNull:
-           stack->hash_state ^= 0x01;
-           break;
-       case jbvBool:
-           stack->hash_state ^= v->boolean? 0x02:0x04;
-           break;
-       case jbvNumeric:
-           /*
-            * A hash value unaffected by trailing zeroes is required.
-            */
-           stack->hash_state ^= DatumGetInt32(DirectFunctionCall1(hash_numeric,
-                                              NumericGetDatum(v->numeric)));
-           break;
-       case jbvString:
-           stack->hash_state ^= hash_any((unsigned char *) v->string.val,
-                                         v->string.len);
-           break;
-       default:
-           elog(ERROR, "invalid jsonb scalar type");
-           break;
-   }
-}
index 263dc5e65c813430d69f6cb51b701fc01d227d8e..b22c8bcbd389e2d8d184f0994306459cd46d5a8f 100644 (file)
@@ -237,7 +237,9 @@ jsonb_cmp(PG_FUNCTION_ARGS)
    PG_RETURN_INT32(res);
 }
 
-/* Hash operator class jsonb hashing function */
+/*
+ * Hash operator class jsonb hashing function
+ */
 Datum
 jsonb_hash(PG_FUNCTION_ARGS)
 {
@@ -245,63 +247,38 @@ jsonb_hash(PG_FUNCTION_ARGS)
    JsonbIterator *it;
    int32       r;
    JsonbValue  v;
-   int         crc;
+   uint32      hash = 0;
 
    if (JB_ROOT_COUNT(jb) == 0)
        PG_RETURN_INT32(0);
 
    it = JsonbIteratorInit(VARDATA(jb));
-   INIT_CRC32(crc);
 
    while ((r = JsonbIteratorNext(&it, &v, false)) != WJB_DONE)
    {
        switch (r)
        {
            case WJB_BEGIN_ARRAY:
-               COMP_CRC32(crc, "ab", 3);
-               COMP_CRC32(crc, &v.array.nElems, sizeof(int));
-               COMP_CRC32(crc, &v.array.scalar, sizeof(bool));
-               break;
            case WJB_BEGIN_OBJECT:
-               COMP_CRC32(crc, "hb", 3);
-               COMP_CRC32(crc, &v.object.nPairs, sizeof(int));
+           case WJB_END_ARRAY:
+           case WJB_END_OBJECT:
+               /*
+                * No particular effort is made to mix in the structure of the
+                * Jsonb itself (number of keys, values and elements
+                * notwithstanding), on the assumption that it's very
+                * homogeneous among Json datums in the same column.
+                */
                break;
            case WJB_KEY:
-               COMP_CRC32(crc, "k", 2);
            case WJB_VALUE:
            case WJB_ELEM:
-               switch (v.type)
-               {
-                   case jbvString:
-                       COMP_CRC32(crc, v.string.val, v.string.len);
-                       break;
-                   case jbvNull:
-                       COMP_CRC32(crc, "N", 2);
-                       break;
-                   case jbvBool:
-                       COMP_CRC32(crc, &v.boolean, sizeof(bool));
-                       break;
-                   case jbvNumeric:
-                       crc ^= DatumGetInt32(DirectFunctionCall1(hash_numeric,
-                                                                NumericGetDatum(v.numeric)));
-                       break;
-                   default:
-                       elog(ERROR, "invalid jsonb iterator type");
-               }
-               break;
-           case WJB_END_ARRAY:
-               COMP_CRC32(crc, "ae", 3);
-               break;
-           case WJB_END_OBJECT:
-               COMP_CRC32(crc, "he", 3);
+               hash_scalar_value(&v, &hash);
                break;
            default:
                elog(ERROR, "invalid JsonbIteratorNext rc: %d", r);
        }
    }
 
-   FIN_CRC32(crc);
-
    PG_FREE_IF_COPY(jb, 0);
-   PG_RETURN_INT32(crc);
+   PG_RETURN_INT32(hash);
 }
index 5c0b671d947e8a75137ee900ce730711dc68372c..bb4ec45a2381f2ee197f64c6acddfcf9218b7318 100644 (file)
@@ -11,6 +11,7 @@
  */
 #include "postgres.h"
 
+#include "access/hash.h"
 #include "miscadmin.h"
 #include "catalog/pg_collation.h"
 #include "catalog/pg_type.h"
@@ -1002,6 +1003,44 @@ arrayToJsonbSortedArray(ArrayType *a)
    return result;
 }
 
+/*
+ * Hash a JsonbValue scalar, folding the result into *hash_state
+ */
+void
+hash_scalar_value(const JsonbValue * v, uint32 * hash_state)
+{
+   uint32      tmp;
+
+   /*
+    * Rotate the running hash left 1 bit, then XOR in this scalar's hash, so
+    * successive keys, values and elements combine position-sensitively.
+    */
+   *hash_state = (*hash_state << 1) | (*hash_state >> 31);
+   switch (v->type)
+   {
+       case jbvNull:
+           *hash_state ^= 0x01;
+           break;
+       case jbvBool:
+           *hash_state ^= v->boolean? 0x02:0x04;
+           break;
+       case jbvNumeric:
+           /* Must be unaffected by trailing zeroes */
+           tmp = DatumGetUInt32(DirectFunctionCall1(hash_numeric,
+                                                    NumericGetDatum(v->numeric)));
+           *hash_state ^= tmp;
+           break;
+       case jbvString:
+           tmp = DatumGetUInt32(hash_any((unsigned char *) v->string.val,
+                                         v->string.len));
+           *hash_state ^= tmp;
+           break;
+       default:
+           elog(ERROR, "invalid jsonb scalar type");
+           break;
+   }
+}
+
 /*
  * Are two scalar JsonbValues a and b equal?
  *
index ef606cef500721df73708e34f1289721f07e7a63..568c78de0972fbb1ecf1a86fbd79f7c220b90bd6 100644 (file)
@@ -295,6 +295,7 @@ extern int JsonbIteratorNext(JsonbIterator **it, JsonbValue *v,
 extern Jsonb *JsonbValueToJsonb(JsonbValue *v);
 extern bool deepContains(JsonbIterator ** val, JsonbIterator ** mContained);
 extern JsonbValue *arrayToJsonbSortedArray(ArrayType *a);
+extern void hash_scalar_value(const JsonbValue * v, uint32 * hash_state);
 
 /* jsonb.c support function */
 extern char *JsonbToCString(StringInfo out, char *in, int estimated_len);