Review Board 1.7.22


Add Vectorized Substr

Review Request #11106 - Created May 13, 2013 and submitted

Timothy Chen
vectorization
HIVE-4495
Reviewers
hive
hive-git
Add Vectorized Substr

 
ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringSubstrColStart.java
New File

    
   
1
package org.apache.hadoop.hive.ql.exec.vector.expressions;

    
   
2

   

    
   
3
import java.io.UnsupportedEncodingException;

    
   
4

   

    
   
5
import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;

    
   
6
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;

    
   
7

   

    
   
8
/**

    
   
9
 * This class provides the implementation of vectorized substring, with a single start index parameter.

    
   
10
 * If the start index is invalid (outside of the string boundaries), the output is an empty string.

    
   
11
 */

    
   
12
public class StringSubstrColStart extends VectorExpression {

    
   
13
  private final int startIdx;

    
   
14
  private final int colNum;

    
   
15
  private final int outputColumn;

    
   
16
  private static byte[] EMPTY_STRING;

    
   
17

   

    
   
18
  // Populating the Empty string bytes. Putting it as static since it should be immutable and can be shared

    
   
19
  static {

    
   
20
    try {

    
   
21
      EMPTY_STRING = "".getBytes("UTF-8");

    
   
22
    } catch(UnsupportedEncodingException e) {

    
   
23
      e.printStackTrace();

    
   
24
    }

    
   
25
  }

    
   
26

   

    
   
27
  /**
   * Creates a substring expression that reads strings from one batch column and writes the
   * resulting substrings to another.
   *
   * @param colNum index of the input column within the batch
   * @param startIdx start index of the substring
   * @param outputColumn index of the output column within the batch
   */
  public StringSubstrColStart(int colNum, int startIdx, int outputColumn) {
    this.outputColumn = outputColumn;
    this.startIdx = startIdx;
    this.colNum = colNum;
  }

    
   
32

   

    
   
33
  /**

    
   
34
   * Given the substring start index param it finds the starting offset of the passed in utf8 string byte array

    
   
35
   * that matches the index.

    
   
36
   * @param utf8String byte array that holds the utf8 string

    
   
37
   * @param start start offset of the byte array the string starts at

    
   
38
   * @param len length of the bytes the string holds in the byte array

    
   
39
   * @param substrStart the Start index for the substring operation

    
   
40
   */

    
   
41
  static int getSubstrStartOffset(byte[] utf8String, int start, int len, int substrStart) {

    
   
42
    int curIdx = -1;

    
   
43

   

    
   
44
    if (substrStart < 0) {

    
   
45
      int length = 0;

    
   
46
      for (int i = start; i != len; ++i) {

    
   
47
        if ((utf8String[i] & 0xc0) != 0x80) {

    
   
48
          ++length;

    
   
49
        }

    
   
50
      }

    
   
51

   

    
   
52
      if (-length > substrStart) {

    
   
53
        return -1;

    
   
54
      }

    
   
55

   

    
   
56
      substrStart = length + substrStart;

    
   
57
    }

    
   
58

   

    
   
59
    int end = start + len;

    
   
60
    for (int i = start; i != end; ++i) {

    
   
61
      if ((utf8String[i] & 0xc0) != 0x80) {

    
   
62
        ++curIdx;

    
   
63
        if (curIdx == substrStart) {

    
   
64
          return i;

    
   
65
        }

    
   
66
      }

    
   
67
    }

    
   
68
    return -1;

    
   
69
  }

    
   
70

   

    
   
71
  @Override

    
   
72
  public void evaluate(VectorizedRowBatch batch) {

    
   
73
    if (childExpressions != null) {

    
   
74
      super.evaluateChildren(batch);

    
   
75
    }

    
   
76

   

    
   
77
    BytesColumnVector inV = (BytesColumnVector) batch.cols[colNum];

    
   
78
    BytesColumnVector outV = (BytesColumnVector) batch.cols[outputColumn];

    
   
79

   

    
   
80
    int n = batch.size;

    
   
81

   

    
   
82
    if (n == 0) {

    
   
83
      return;

    
   
84
    }

    
   
85

   

    
   
86

   

    
   
87
    byte[][] vector = inV.vector;

    
   
88
    int[] sel = batch.selected;

    
   
89
    int[] len = inV.length;

    
   
90
    int[] start = inV.start;

    
   
91

   

    
   
92
    if (inV.isRepeating) {

    
   
93
      outV.isRepeating = true;

    
   
94
      if (!inV.noNulls && inV.isNull[0]) {

    
   
95
        outV.isNull[0] = true;

    
   
96
        outV.noNulls = false;

    
   
97
        outV.setRef(0, EMPTY_STRING, 0, EMPTY_STRING.length);

    
   
98
        return;

    
   
99
      } else {

    
   
100
        outV.noNulls = true;

    
   
101
        int offset = getSubstrStartOffset(vector[0], sel[0], len[0], startIdx);

    
   
102
        if (offset != -1) {

    
   
103
          outV.setRef(0, vector[0], offset, len[0] - offset);

    
   
104
        } else {

    
   
105
          outV.setRef(0, EMPTY_STRING, 0, EMPTY_STRING.length);

    
   
106
        }

    
   
107
      }

    
   
108
    } else {

    
   
109
      outV.isRepeating = false;

    
   
110
      if (batch.selectedInUse) {

    
   
111
        if (!inV.noNulls) {

    
   
112
          outV.noNulls = false;

    
   
113
          for (int i = 0; i != n; ++i) {

    
   
114
            int selected = sel[i];

    
   
115
            if (!inV.isNull[selected]) {

    
   
116
              int offset = getSubstrStartOffset(vector[selected], start[selected], len[selected], startIdx);

    
   
117
              outV.isNull[selected] = false;

    
   
118
              if (offset != -1) {

    
   
119
                outV.setRef(selected, vector[selected], offset, len[selected] - offset);

    
   
120
              } else {

    
   
121
                outV.setRef(selected, EMPTY_STRING, 0, EMPTY_STRING.length);

    
   
122
              }

    
   
123
            } else {

    
   
124
              outV.isNull[selected] = true;

    
   
125
            }

    
   
126
          }

    
   
127
        } else {

    
   
128
          outV.noNulls = true;

    
   
129
          for (int i = 0; i != n; ++i) {

    
   
130
            int selected = sel[i];

    
   
131
            int offset = getSubstrStartOffset(vector[selected], start[selected], len[selected], startIdx);

    
   
132
            if (offset != -1) {

    
   
133
              outV.setRef(selected, vector[selected], offset, len[selected] - offset);

    
   
134
            } else {

    
   
135
              outV.setRef(selected, EMPTY_STRING, 0, EMPTY_STRING.length);

    
   
136
            }

    
   
137
          }

    
   
138
        }

    
   
139
      } else {

    
   
140
        if (!inV.noNulls) {

    
   
141
          outV.noNulls = false;

    
   
142
          System.arraycopy(inV.isNull, 0, outV.isNull, 0, n);

    
   
143
          for (int i = 0; i != n; ++i) {

    
   
144
            if (!inV.isNull[i]) {

    
   
145
              int offset = getSubstrStartOffset(vector[i], start[i], len[i], startIdx);

    
   
146
              if (offset != -1) {

    
   
147
                outV.setRef(i, vector[i], offset, len[i] - offset);

    
   
148
              } else {

    
   
149
                outV.setRef(i, EMPTY_STRING, 0, EMPTY_STRING.length);

    
   
150
              }

    
   
151
            }

    
   
152
          }

    
   
153
        } else {

    
   
154
          outV.noNulls = true;

    
   
155
          for (int i = 0; i != n; ++i) {

    
   
156
            int offset = getSubstrStartOffset(vector[i], start[i], len[i], startIdx);

    
   
157
            if (offset != -1) {

    
   
158
              outV.setRef(i, vector[i], offset, len[i] - offset);

    
   
159
            } else {

    
   
160
              outV.setRef(i, EMPTY_STRING, 0, EMPTY_STRING.length);

    
   
161
            }

    
   
162
          }

    
   
163
        }

    
   
164
      }

    
   
165
    }

    
   
166
  }

    
   
167

   

    
   
168
  @Override

    
   
169
  public int getOutputColumn() {

    
   
170
    return outputColumn;

    
   
171
  }

    
   
172

   

    
   
173
  @Override

    
   
174
  public String getOutputType() {

    
   
175
    return "string";

    
   
176
  }

    
   
177
}
ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringSubstrColStartLen.java
New File
 
ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorStringExpressions.java
Revision 6e26412 New Change
 
  1. ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringSubstrColStart.java: Loading...
  2. ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringSubstrColStartLen.java: Loading...
  3. ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorStringExpressions.java: Loading...