Review Board 1.7.22


HIVE-4435: Column stats: Distinct value estimator should use hash functions that are pairwise independent

Review Request #10841 - Created April 29, 2013 and updated

Shreepadma Venugopalan
trunk
HIVE-4435
Reviewers
hive
ashutoshc, navis
hive-git
Fixes the FM estimator to use hash functions that are pairwise independent.
The estimates are within the expected error after this fix. Tested on TPCH of varying sizes.
ql/src/java/org/apache/hadoop/hive/ql/udf/generic/NumDistinctValueEstimator.java
Revision 69e6f46 New Change
[20] 25 lines
[+20]
26

    
   
26

   
27
public class NumDistinctValueEstimator {
27
public class NumDistinctValueEstimator {
28

    
   
28

   
29
  static final Log LOG = LogFactory.getLog(NumDistinctValueEstimator.class.getName());
29
  static final Log LOG = LogFactory.getLog(NumDistinctValueEstimator.class.getName());
30

    
   
30

   
31
  private final int bitVectorSize = 32;
31
  /* We want a,b,x to come from a finite field of size 0 to k, where k is a prime number.

    
   
32
   * 2^p - 1 is prime for p = 31. Hence bitvectorSize has to be 31. Pick k to be 2^p -1.

    
   
33
   * If a,b,x didn't come from a finite field ax1 + b mod k and ax2 + b mod k will not be pair wise

    
   
34
   * independent. As a consequence, the hash values will not distribute uniformly from 0 to 2^p-1

    
   
35
   * thus introducing errors in the estimates.

    
   
36
   */

    
   
37
  private static final int bitVectorSize = 31;
32
  private int numBitVectors;
38
  private int numBitVectors;
33

    
   
39

   
34
  // Refer to Flajolet-Martin'86 for the value of phi
40
  // Refer to Flajolet-Martin'86 for the value of phi
35
  private final double phi =  0.77351;
41
  private final double phi =  0.77351;
36

    
   
42

   
[+20] [20] 14 lines
[+20] public class NumDistinctValueEstimator {
51
    }
57
    }
52

    
   
58

   
53
    a = new int[numBitVectors];
59
    a = new int[numBitVectors];
54
    b = new int[numBitVectors];
60
    b = new int[numBitVectors];
55

    
   
61

   
56
    aValue = new Random(79798);
62
    /* Use a large prime number as a seed to the random number generator.
57
    bValue = new Random(34115);
63
     * Java's random number generator uses the Linear Congruential Generator to generate random

    
   
64
     * numbers using the following recurrence relation,

    
   
65
     *

    
   
66
     * X(n+1) = (a X(n) + c ) mod m

    
   
67
     *

    
   
68
     *  where X0 is the seed. Java implementation uses m = 2^48. This is problematic because 2^48

    
   
69
     *  is not a prime number and hence the set of numbers from 0 to m don't form a finite field.

    
   
70
     *  If these numbers don't come from a finite field any give X(n) and X(n+1) may not be pair

    
   
71
     *  wise independent.

    
   
72
     *

    
   
73
     *  However, empirically passing in prime numbers as seeds seems to work better than when passing

    
   
74
     *  composite numbers as seeds. Ideally Java's Random should pick m such that m is prime.

    
   
75
     *

    
   
76
     */

    
   
77
    aValue = new Random(99397);

    
   
78
    bValue = new Random(9876413);
58

    
   
79

   
59
    for (int i = 0; i < numBitVectors; i++) {
80
    for (int i = 0; i < numBitVectors; i++) {
60
      int randVal;
81
      int randVal;
61
      /* a and b shouldn't be even; If a and b are even, then none of the values
82
      /* a and b shouldn't be even; If a and b are even, then none of the values
62
       * will set bit 0 thus introducing errors in the estimate. Both a and b can be even
83
       * will set bit 0 thus introducing errors in the estimate. Both a and b can be even
[+20] [20] 11 lines
[+20] public class NumDistinctValueEstimator {
74
      } while (randVal % 2 == 0);
95
      } while (randVal % 2 == 0);
75

    
   
96

   
76
      b[i] = randVal;
97
      b[i] = randVal;
77

    
   
98

   
78
      if (a[i] < 0) {
99
      if (a[i] < 0) {
79
        a[i] = a[i] + (1 << (bitVectorSize -1));
100
        a[i] = a[i] + (1 << bitVectorSize - 1);
80
      }
101
      }
81

    
   
102

   
82
      if (b[i] < 0) {
103
      if (b[i] < 0) {
83
        b[i] = b[i] + (1 << (bitVectorSize -1));
104
        b[i] = b[i] + (1 << bitVectorSize - 1);
84
      }
105
      }
85
    }
106
    }
86
  }
107
  }
87

    
   
108

   
88
  public NumDistinctValueEstimator(String s, int numBitVectors) {
109
  public NumDistinctValueEstimator(String s, int numBitVectors) {
[+20] [20] 106 lines
[+20] [+] private FastBitSet[] deserialize(String s, int numBitVectors) {
195
    }
216
    }
196
    return b;
217
    return b;
197
  }
218
  }
198

    
   
219

   
199
  private int generateHash(long v, int hashNum) {
220
  private int generateHash(long v, int hashNum) {
200
    int mod = 1 << (bitVectorSize - 1) - 1;
221
    int mod = (1<<bitVectorSize) - 1;
201
    long tempHash = a[hashNum] * v + b[hashNum];
222
    long tempHash = a[hashNum] * v  + b[hashNum];
202
    tempHash %= mod;
223
    tempHash %= mod;
203
    int hash = (int) tempHash;
224
    int hash = (int) tempHash;
204

    
   
225

   
205
    /* Hash function should map the long value to 0...2^L-1.
226
    /* Hash function should map the long value to 0...2^L-1.
206
     * Hence hash value has to be non-negative.
227
     * Hence hash value has to be non-negative.
207
     */
228
     */
208
    if (hash < 0) {
229
    if (hash < 0) {
209
      hash = hash + mod + 1;
230
      hash = hash + mod;
210
    }
231
    }
211
    return hash;
232
    return hash;
212
  }
233
  }
213

    
   
234

   
214
  private int generateHashForPCSA(long v) {
235
  private int generateHashForPCSA(long v) {
[+20] [20] 49 lines
[+20] [+] public void addToEstimatorPCSA(long v) {
264

    
   
285

   
265
    // Set bitvector[index] := 1
286
    // Set bitvector[index] := 1
266
    bitVector[hash%numBitVectors].set(index);
287
    bitVector[hash%numBitVectors].set(index);
267
  }
288
  }
268

    
   
289

   

    
   
290

   
269
  public void mergeEstimators(NumDistinctValueEstimator o) {
291
  public void mergeEstimators(NumDistinctValueEstimator o) {
270
    // Bitwise OR the bitvector with the bitvector in the agg buffer
292
    // Bitwise OR the bitvector with the bitvector in the agg buffer
271
    for (int i=0; i<numBitVectors; i++) {
293
    for (int i=0; i<numBitVectors; i++) {
272
      bitVector[i].or(o.getBitVector(i));
294
      bitVector[i].or(o.getBitVector(i));
273
    }
295
    }
[+20] [20] 13 lines
[+20] [+] public long estimateNumDistinctValuesPCSA() {
287

    
   
309

   
288
    numDistinctValues = ((numBitVectors/phi) * Math.pow(2.0, S/numBitVectors));
310
    numDistinctValues = ((numBitVectors/phi) * Math.pow(2.0, S/numBitVectors));
289
    return ((long)numDistinctValues);
311
    return ((long)numDistinctValues);
290
  }
312
  }
291

    
   
313

   
292
  /* We use two estimators - one due to Flajolet-Martin and a modification due to
314
  /* We use the Flajolet-Martin estimator to estimate the number of distinct values.FM uses the
293
   * Alon-Matias-Szegedy. FM uses the location of the least significant zero as an estimate of
315
   * location of the least significant zero as an estimate of log2(phi*ndvs).
294
   * log2(phi*ndvs).

   
295
   * AMS uses the location of the most significant one as an estimate of the log2(ndvs).

   
296
   * We average the two estimators with suitable modifications to obtain an estimate of ndvs.

   
297
   */
316
   */
298
  public long estimateNumDistinctValues() {
317
  public long estimateNumDistinctValues() {
299
    int sumLeastSigZero = 0;
318
    int sumLeastSigZero = 0;
300
    int sumMostSigOne = 0;

   
301
    double avgLeastSigZero;
319
    double avgLeastSigZero;
302
    double avgMostSigOne;

   
303
    double numDistinctValues;
320
    double numDistinctValues;
304

    
   
321

   
305
    for (int i=0; i< numBitVectors; i++) {
322
    for (int i=0; i< numBitVectors; i++) {
306
      int leastSigZero = bitVector[i].nextClearBit(0);
323
      int leastSigZero = bitVector[i].nextClearBit(0);
307
      sumLeastSigZero += leastSigZero;
324
      sumLeastSigZero += leastSigZero;
308
      int mostSigOne = bitVectorSize;

   
309

    
   

   
310
      for (int j=0; j< bitVectorSize; j++) {

   
311
        if (bitVector[i].get(j)) {

   
312
          mostSigOne = j;

   
313
        }

   
314
      }

   
315
      sumMostSigOne += mostSigOne;

   
316
    }
325
    }
317

    
   
326

   
318
    avgLeastSigZero =
327
    avgLeastSigZero =
319
        (double)(sumLeastSigZero/(numBitVectors * 1.0)) - (Math.log(phi)/Math.log(2.0));
328
        (double)(sumLeastSigZero/(numBitVectors * 1.0)) - (Math.log(phi)/Math.log(2.0));
320
    avgMostSigOne = (double)(sumMostSigOne/(numBitVectors * 1.0));
329
    numDistinctValues = Math.pow(2.0, avgLeastSigZero);
321
    numDistinctValues = Math.pow(2.0, (avgMostSigOne + avgLeastSigZero)/2.0);

   
322
    return ((long)(numDistinctValues));
330
    return ((long)(numDistinctValues));
323
  }
331
  }
324
}
332
}
  1. ql/src/java/org/apache/hadoop/hive/ql/udf/generic/NumDistinctValueEstimator.java: Loading...