Review Board 1.7.22


support standard UDFs and generic UDFs in vectorized mode with an adaptor

Review Request #14113 - Created Sept. 12, 2013 and updated

Eric Hanson
vectorization
HIVE-4961
Reviewers
hive
hive-git
Added support for standard UDFs and generic UDFs in vectorized mode with an adaptor. Includes unit tests.
Also did extensive ad-hoc end-to-end tests with a variety of UDFs.
ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java
Revision 3d8ade6 New Change
[20] 28 lines
[+20]
29

    
   
29

   
30
import org.apache.commons.logging.Log;
30
import org.apache.commons.logging.Log;
31
import org.apache.commons.logging.LogFactory;
31
import org.apache.commons.logging.LogFactory;
32
import org.apache.hadoop.hive.ql.exec.ExprNodeEvaluator;
32
import org.apache.hadoop.hive.ql.exec.ExprNodeEvaluator;
33
import org.apache.hadoop.hive.ql.exec.ExprNodeEvaluatorFactory;
33
import org.apache.hadoop.hive.ql.exec.ExprNodeEvaluatorFactory;

    
   
34
import org.apache.hadoop.hive.ql.exec.FunctionInfo;

    
   
35
import org.apache.hadoop.hive.ql.exec.FunctionRegistry;
34
import org.apache.hadoop.hive.ql.exec.UDF;
36
import org.apache.hadoop.hive.ql.exec.UDF;
35
import org.apache.hadoop.hive.ql.exec.vector.expressions.ConstantVectorExpression;
37
import org.apache.hadoop.hive.ql.exec.vector.expressions.ConstantVectorExpression;
36
import org.apache.hadoop.hive.ql.exec.vector.expressions.FilterConstantBooleanVectorExpression;
38
import org.apache.hadoop.hive.ql.exec.vector.expressions.FilterConstantBooleanVectorExpression;
37
import org.apache.hadoop.hive.ql.exec.vector.expressions.FilterExprAndExpr;
39
import org.apache.hadoop.hive.ql.exec.vector.expressions.FilterExprAndExpr;
38
import org.apache.hadoop.hive.ql.exec.vector.expressions.FilterExprOrExpr;
40
import org.apache.hadoop.hive.ql.exec.vector.expressions.FilterExprOrExpr;
39
import org.apache.hadoop.hive.ql.exec.vector.expressions.FilterStringColLikeStringScalar;
41
import org.apache.hadoop.hive.ql.exec.vector.expressions.FilterStringColLikeStringScalar;
40
import org.apache.hadoop.hive.ql.exec.vector.expressions.IdentityExpression;
42
import org.apache.hadoop.hive.ql.exec.vector.expressions.IdentityExpression;
41
import org.apache.hadoop.hive.ql.exec.vector.expressions.SelectColumnIsNotNull;
43
import org.apache.hadoop.hive.ql.exec.vector.expressions.SelectColumnIsNotNull;
42
import org.apache.hadoop.hive.ql.exec.vector.expressions.SelectColumnIsNull;
44
import org.apache.hadoop.hive.ql.exec.vector.expressions.SelectColumnIsNull;
43
import org.apache.hadoop.hive.ql.exec.vector.expressions.SelectColumnIsTrue;
45
import org.apache.hadoop.hive.ql.exec.vector.expressions.SelectColumnIsTrue;
44
import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression;
46
import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression;

    
   
47
import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorUDFAdaptor;

    
   
48
import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorUDFArgDesc;
45
import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorUDFUnixTimeStampLong;
49
import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorUDFUnixTimeStampLong;
46
import org.apache.hadoop.hive.ql.exec.vector.expressions.aggregates.VectorAggregateExpression;
50
import org.apache.hadoop.hive.ql.exec.vector.expressions.aggregates.VectorAggregateExpression;
47
import org.apache.hadoop.hive.ql.exec.vector.expressions.aggregates.VectorUDAFCount;
51
import org.apache.hadoop.hive.ql.exec.vector.expressions.aggregates.VectorUDAFCount;
48
import org.apache.hadoop.hive.ql.exec.vector.expressions.aggregates.VectorUDAFCountStar;
52
import org.apache.hadoop.hive.ql.exec.vector.expressions.aggregates.VectorUDAFCountStar;
49
import org.apache.hadoop.hive.ql.exec.vector.expressions.aggregates.gen.VectorUDAFAvgDouble;
53
import org.apache.hadoop.hive.ql.exec.vector.expressions.aggregates.gen.VectorUDAFAvgDouble;
[+20] [20] 89 lines
[+20] [+] private int getInputColumnIndex(String name) {
139
    } else {
143
    } else {
140
      return columnMap.get(name);
144
      return columnMap.get(name);
141
    }
145
    }
142
  }
146
  }
143

    
   
147

   

    
   
148
  /* Return true if we are running in the planner, and false if we

    
   
149
   * are running in a task.

    
   
150
   */

    
   
151
  /*

    
   
152
  private boolean isPlanner() {

    
   
153

   

    
   
154
    // This relies on the behavior that columnMap is null in the planner.

    
   
155
    return columnMap == null;

    
   
156
  }

    
   
157
  */

    
   
158

   
144
  private class OutputColumnManager {
159
  private class OutputColumnManager {
145
    private final int initialOutputCol;
160
    private final int initialOutputCol;
146
    private int outputColCount = 0;
161
    private int outputColCount = 0;
147

    
   
162

   
148
    OutputColumnManager(int initialOutputCol) {
163
    OutputColumnManager(int initialOutputCol) {
[+20] [20] 92 lines
[+20] [+] public VectorExpression[] getVectorExpressions(List<ExprNodeDesc> exprNodes) throws HiveException {
241
    VectorExpression ve = null;
256
    VectorExpression ve = null;
242
    if (exprDesc instanceof ExprNodeColumnDesc) {
257
    if (exprDesc instanceof ExprNodeColumnDesc) {
243
      ve = getVectorExpression((ExprNodeColumnDesc) exprDesc);
258
      ve = getVectorExpression((ExprNodeColumnDesc) exprDesc);
244
    } else if (exprDesc instanceof ExprNodeGenericFuncDesc) {
259
    } else if (exprDesc instanceof ExprNodeGenericFuncDesc) {
245
      ExprNodeGenericFuncDesc expr = (ExprNodeGenericFuncDesc) exprDesc;
260
      ExprNodeGenericFuncDesc expr = (ExprNodeGenericFuncDesc) exprDesc;

    
   
261
      if (isCustomUDF(expr)) {

    
   
262
        ve = getCustomUDFExpression(expr);

    
   
263
      } else {
246
      ve = getVectorExpression(expr.getGenericUDF(),
264
        ve = getVectorExpression(expr.getGenericUDF(),
247
          expr.getChildExprs());
265
            expr.getChildExprs());

    
   
266
      }
248
    } else if (exprDesc instanceof ExprNodeConstantDesc) {
267
    } else if (exprDesc instanceof ExprNodeConstantDesc) {
249
      ve = getConstantVectorExpression((ExprNodeConstantDesc) exprDesc);
268
      ve = getConstantVectorExpression((ExprNodeConstantDesc) exprDesc);
250
    }
269
    }
251
    if (ve == null) {
270
    if (ve == null) {
252
      throw new HiveException("Could not vectorize expression: "+exprDesc.getName());
271
      throw new HiveException("Could not vectorize expression: "+exprDesc.getName());
253
    }
272
    }
254
    return ve;
273
    return ve;
255
  }
274
  }
256

    
   
275

   

    
   
276
  // Return true if this is a custom UDF or custom GenericUDF.

    
   
277
  // This is for use only in the planner. It will fail in a task.

    
   
278
  public static boolean isCustomUDF(ExprNodeGenericFuncDesc expr) {

    
   
279
    String udfName = expr.getFuncText();

    
   
280
    if (udfName == null) {

    
   
281
      return false;

    
   
282
    }

    
   
283
    FunctionInfo funcInfo = FunctionRegistry.getFunctionInfo(udfName);

    
   
284
    if (funcInfo == null) {

    
   
285
      return false;

    
   
286
    }

    
   
287
    boolean isNativeFunc = funcInfo.isNative();

    
   
288
    return !isNativeFunc;

    
   
289
  }

    
   
290

   
257
  /**
291
  /**
258
   * Handles only the special case of unary operators on a constant.
292
   * Handles only the special case of unary operators on a constant.
259
   * @param exprDesc
293
   * @param exprDesc
260
   * @return The same expression if no folding done, else return the constant
294
   * @return The same expression if no folding done, else return the constant
261
   *         expression.
295
   *         expression.
[+20] [20] 210 lines
[+20] [+] private VectorExpression getVectorExpression(GenericUDFBridge udf,
472
    }
506
    }
473

    
   
507

   
474
    throw new HiveException("Udf: "+udf.getClass().getSimpleName()+", is not supported");
508
    throw new HiveException("Udf: "+udf.getClass().getSimpleName()+", is not supported");
475
  }
509
  }
476

    
   
510

   

    
   
511
  /*

    
   
512
   * Return vector expression for a custom (i.e. not built-in) UDF.

    
   
513
   */

    
   
514
  private VectorExpression getCustomUDFExpression(ExprNodeGenericFuncDesc expr)

    
   
515
      throws HiveException {

    
   
516

   

    
   
517
    //GenericUDFBridge udfBridge = (GenericUDFBridge) expr.getGenericUDF();

    
   
518
    List<ExprNodeDesc> childExprList = expr.getChildExprs();

    
   
519

   

    
   
520
    // argument descriptors

    
   
521
    VectorUDFArgDesc[] argDescs = new VectorUDFArgDesc[expr.getChildExprs().size()];

    
   
522
    for (int i = 0; i < argDescs.length; i++) {

    
   
523
      argDescs[i] = new VectorUDFArgDesc();

    
   
524
    }

    
   
525

   

    
   
526
    // positions of variable arguments (columns or non-constant expressions)

    
   
527
    List<Integer> variableArgPositions = new ArrayList<Integer>();

    
   
528

   

    
   
529
    // Column numbers of batch corresponding to expression result arguments

    
   
530
    List<Integer> exprResultColumnNums = new ArrayList<Integer>();

    
   
531

   

    
   
532
    // Prepare children

    
   
533
    List<VectorExpression> vectorExprs = new ArrayList<VectorExpression>();

    
   
534

   

    
   
535
    for (int i = 0; i < childExprList.size(); i++) {

    
   
536
      ExprNodeDesc child = childExprList.get(i);

    
   
537
      if (child instanceof ExprNodeGenericFuncDesc) {

    
   
538
        VectorExpression e = getVectorExpression(child);

    
   
539
        vectorExprs.add(e);

    
   
540
        variableArgPositions.add(i);

    
   
541
        exprResultColumnNums.add(e.getOutputColumn());

    
   
542
        argDescs[i].setVariable(e.getOutputColumn());

    
   
543
      } else if (child instanceof ExprNodeColumnDesc) {

    
   
544
        variableArgPositions.add(i);

    
   
545
        argDescs[i].setVariable(getInputColumnIndex(((ExprNodeColumnDesc) child).getColumn()));

    
   
546
      } else if (child instanceof ExprNodeConstantDesc) {

    
   
547

   

    
   
548
        // this is a constant

    
   
549
        argDescs[i].setConstant((ExprNodeConstantDesc) child);

    
   
550
      } else {

    
   
551
        throw new HiveException("Unable to vectorize Custom UDF");

    
   
552
      }

    
   
553
    }

    
   
554

   

    
   
555
    // Allocate output column and get column number;

    
   
556
    int outputCol = -1;

    
   
557
    String resultColVectorType;

    
   
558
    String resultType = expr.getTypeInfo().getTypeName();

    
   
559
    if (resultType.equalsIgnoreCase("string")) {

    
   
560
      resultColVectorType = "String";

    
   
561
    } else if (isIntFamily(resultType)) {

    
   
562
      resultColVectorType = "Long";

    
   
563
    } else if (isFloatFamily(resultType)) {

    
   
564
      resultColVectorType = "Double";

    
   
565
    } else if (resultType.equalsIgnoreCase("timestamp")) {

    
   
566
      resultColVectorType = "Long";

    
   
567
    } else {

    
   
568
      throw new HiveException("Unable to vectorize due to unsupported custom UDF return type "

    
   
569
                                + resultType);

    
   
570
    }

    
   
571
    outputCol = ocm.allocateOutputColumn(resultColVectorType);

    
   
572

   

    
   
573
    // Make vectorized operator

    
   
574
    VectorExpression ve;

    
   
575
    ve = new VectorUDFAdaptor(expr, outputCol, resultColVectorType, argDescs);

    
   
576

   

    
   
577
    // Set child expressions

    
   
578
    VectorExpression[] childVEs = null;

    
   
579
    if (exprResultColumnNums.size() != 0) {

    
   
580
      childVEs = new VectorExpression[exprResultColumnNums.size()];

    
   
581
      for (int i = 0; i < childVEs.length; i++) {

    
   
582
        childVEs[i] = vectorExprs.get(i);

    
   
583
      }

    
   
584
    }

    
   
585
    ve.setChildExpressions(childVEs);

    
   
586

   

    
   
587
    // Free output columns if inputs have non-leaf expression trees.

    
   
588
    for (Integer i : exprResultColumnNums) {

    
   
589
      ocm.freeOutputColumn(i);

    
   
590
    }

    
   
591
    return ve;

    
   
592
  }

    
   
593

   

    
   
594
  // return true if this is any kind of float

    
   
595
  public static boolean isFloatFamily(String resultType) {

    
   
596
    return resultType.equalsIgnoreCase("double")

    
   
597
        || resultType.equalsIgnoreCase("float");

    
   
598
  }

    
   
599

   

    
   
600
  // Return true if this data type is handled in the output vector as an integer.

    
   
601
  public static boolean isIntFamily(String resultType) {

    
   
602
    return resultType.equalsIgnoreCase("tinyint")

    
   
603
        || resultType.equalsIgnoreCase("smallint")

    
   
604
        || resultType.equalsIgnoreCase("int")

    
   
605
        || resultType.equalsIgnoreCase("bigint")

    
   
606
        || resultType.equalsIgnoreCase("boolean");

    
   
607
  }

    
   
608

   
477
  /* Return a unary string vector expression. This is used for functions like
609
  /* Return a unary string vector expression. This is used for functions like
478
   * UPPER() and LOWER().
610
   * UPPER() and LOWER().
479
   */
611
   */
480
  private VectorExpression getUnaryStringExpression(String vectorExprClassName,
612
  private VectorExpression getUnaryStringExpression(String vectorExprClassName,
481
      String resultType, // result type name
613
      String resultType, // result type name
[+20] [20] 800 lines
ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/VectorUDFAdaptor.java
New File
 
ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/VectorUDFArgDesc.java
New File
 
ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java
Revision 5b467bb New Change
 
ql/src/java/org/apache/hadoop/hive/ql/parse/TypeCheckProcFactory.java
Revision 048824a New Change
 
ql/src/java/org/apache/hadoop/hive/ql/plan/ExprNodeGenericFuncDesc.java
Revision 52fe47b New Change
 
ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorUDFAdaptor.java
New File
 
ql/src/test/org/apache/hadoop/hive/ql/exec/vector/util/GenericUDFIsNull.java
New File
 
ql/src/test/org/apache/hadoop/hive/ql/exec/vector/util/LongUDF.java
New File
 
ql/src/test/org/apache/hadoop/hive/ql/exec/vector/util/TestUDF.java
New File
 
  1. ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java: Loading...
  2. ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/VectorUDFAdaptor.java: Loading...
  3. ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/VectorUDFArgDesc.java: Loading...
  4. ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java: Loading...
  5. ql/src/java/org/apache/hadoop/hive/ql/parse/TypeCheckProcFactory.java: Loading...
  6. ql/src/java/org/apache/hadoop/hive/ql/plan/ExprNodeGenericFuncDesc.java: Loading...
  7. ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorUDFAdaptor.java: Loading...
  8. ql/src/test/org/apache/hadoop/hive/ql/exec/vector/util/GenericUDFIsNull.java: Loading...
  9. ql/src/test/org/apache/hadoop/hive/ql/exec/vector/util/LongUDF.java: Loading...
  10. ql/src/test/org/apache/hadoop/hive/ql/exec/vector/util/TestUDF.java: Loading...