Review Board 1.7.22


New code for VectorizedRowBatch to form basis of vectorized query execution

Review Request #10592 - Created April 17, 2013 and updated

Eric Hanson
vectorization
HIVE-4284
Reviewers
hive
hive-git
New code for VectorizedRowBatch to form basis of vectorized query execution

 
ql/src/java/org/apache/hadoop/hive/ql/exec/vector/BytesColumnVector.java
New File

    
   
1
/**

    
   
2
 * Licensed to the Apache Software Foundation (ASF) under one

    
   
3
 * or more contributor license agreements.  See the NOTICE file

    
   
4
 * distributed with this work for additional information

    
   
5
 * regarding copyright ownership.  The ASF licenses this file

    
   
6
 * to you under the Apache License, Version 2.0 (the

    
   
7
 * "License"); you may not use this file except in compliance

    
   
8
 * with the License.  You may obtain a copy of the License at

    
   
9
 *

    
   
10
 *     http://www.apache.org/licenses/LICENSE-2.0

    
   
11
 *

    
   
12
 * Unless required by applicable law or agreed to in writing, software

    
   
13
 * distributed under the License is distributed on an "AS IS" BASIS,

    
   
14
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

    
   
15
 * See the License for the specific language governing permissions and

    
   
16
 * limitations under the License.

    
   
17
 */

    
   
18

   

    
   
19
package org.apache.hadoop.hive.ql.exec.vector;

    
   
20

   

    
   
21
import org.apache.hadoop.io.LongWritable;

    
   
22
import org.apache.hadoop.io.Writable;

    
   
23

   

    
   
24
/**

    
   
25
 * This class supports string and binary data by value reference -- i.e. each field is 

    
   
26
 * explicitly present, as opposed to provided by a dictionary reference.

    
   
27
 * In some cases, all the values will be in the same byte array to begin with,

    
   
28
 * but this need not be the case. If each value is in a separate byte 

    
   
29
 * array to start with, or not all of the values are in the same original

    
   
30
 * byte array, you can still assign data by reference into this column vector.

    
   
31
 * This gives flexibility to use this in multiple situations. 

    
   
32
 * <p>

    
   
33
 * When setting data by reference, the caller

    
   
34
 * is responsible for allocating the byte arrays used to hold the data.

    
   
35
 * You can also set data by value, as long as you call the initBuffer() method first.

    
   
36
 * You can mix "by value" and "by reference" in the same column vector,

    
   
37
 * though that use is probably not typical.

    
   
38
 */

    
   
39
public class BytesColumnVector extends ColumnVector {

    
   
40
  public byte[][] vector; 

    
   
41
  public int[] start; // start offset of each field

    
   
42
  public int[] length; // length of each field

    
   
43
  // If the value repeats for every entry, then it is stored in vector[0]

    
   
44
  // and isRepeating from the superclass is set to true.

    
   
45
  private byte[] buffer; // optional buffer to use when actually copying in data

    
   
46
  private int nextFree; // next free position in buffer

    
   
47
  

    
   
48
  // Estimate that there will be 16 bytes per entry

    
   
49
  static final int defaultBufferSize = 16 * VectorizedRowBatch.defaultSize;

    
   
50
  

    
   
51
  /**

    
   
52
   * Use this constructor for normal operation.

    
   
53
   * All column vectors should be the default size normally.

    
   
54
   */

    
   
55
  public BytesColumnVector() {

    
   
56
    this(VectorizedRowBatch.defaultSize);

    
   
57
  }

    
   
58
  

    
   
59
  /**

    
   
60
   * Don't call this constructor except for testing purposes.

    
   
61
   * 

    
   
62
   * @param size  number of elements in the column vector

    
   
63
   */

    
   
64
  public BytesColumnVector(int size) {

    
   
65
    super(size);

    
   
66
    vector = new byte[size][];

    
   
67
    start = new int[size];

    
   
68
    length = new int[size]; 

    
   
69
  }

    
   
70
  

    
   
71
  /** Set a field by reference.

    
   
72
   *  

    
   
73
   * @param elementNum index within column vector to set

    
   
74
   * @param sourceBuf container of source data

    
   
75
   * @param start start byte position within source

    
   
76
   * @param length  length of source byte sequence

    
   
77
   */

    
   
78
  public void setRef(int elementNum, byte[] sourceBuf, int start, int length) {

    
   
79
    vector[elementNum] = sourceBuf;

    
   
80
    this.start[elementNum] = start;

    
   
81
    this.length[elementNum] = length;

    
   
82
  }

    
   
83
  

    
   
84
  /** 

    
   
85
   * You must call initBuffer first before using setVal().

    
   
86
   * Provide the estimated number of bytes needed to hold

    
   
87
   * a full column vector worth of byte string data.

    
   
88
   * 

    
   
89
   * @param estimatedValueSize  Estimated size of buffer space needed

    
   
90
   */

    
   
91
  public void initBuffer(int estimatedValueSize) {

    
   
92
    nextFree = 0;

    
   
93
    // if buffer is already allocated, keep using it, don't re-allocate

    
   
94
    if (buffer != null) {

    
   
95
      return;

    
   
96
    }

    
   
97
    // allocate 20% extra space to limit need to re-allocate

    
   
98
    int bufferSize = this.vector.length * (int)(estimatedValueSize * 1.2);

    
   
99
    if (bufferSize < defaultBufferSize) {

    
   
100
      bufferSize = defaultBufferSize;

    
   
101
    }

    
   
102
    buffer = new byte[bufferSize]; 

    
   
103
  }

    
   
104
  

    
   
105
  /**

    
   
106
   * Initialize buffer to default size

    
   
107
   */

    
   
108
  public void initBuffer() {

    
   
109
    initBuffer(0);

    
   
110
  }

    
   
111
  

    
   
112
  /**

    
   
113
   * @return amount of buffer space currently allocated

    
   
114
   */

    
   
115
  public int bufferSize() {

    
   
116
    if (buffer == null) {

    
   
117
      return 0;

    
   
118
    }

    
   
119
    return buffer.length;

    
   
120
  }

    
   
121
  

    
   
122
  /**

    
   
123
   * Set a field by actually copying in to a local buffer.

    
   
124
   * If you must actually copy data in to the array, use this method.

    
   
125
   * DO NOT USE this method unless it's not practical to set data by reference with setRef().

    
   
126
   * Setting data by reference tends to run a lot faster than copying data in.

    
   
127
   * 

    
   
128
   * @param elementNum index within column vector to set

    
   
129
   * @param sourceBuf container of source data

    
   
130
   * @param start start byte position within source

    
   
131
   * @param length  length of source byte sequence

    
   
132
   */

    
   
133
  public void setVal(int elementNum, byte[] sourceBuf, int start, int length) {

    
   
134
    if (nextFree + length > buffer.length) {

    
   
135
      increaseBufferSpace(length);

    
   
136
    }

    
   
137
    System.arraycopy(sourceBuf, start, buffer, nextFree, length);

    
   
138
    vector[elementNum] = buffer;

    
   
139
    this.start[elementNum] = nextFree;

    
   
140
    this.length[elementNum] = length;

    
   
141
    nextFree += length;

    
   
142
  }

    
   
143
  

    
   
144
  /**

    
   
145
   * Increase buffer space enough to accommodate next element.

    
   
146
   * This uses an exponential increase mechanism to rapidly 

    
   
147
   * increase buffer size to enough to hold all data.

    
   
148
   * As batches get re-loaded, buffer space allocated will quickly

    
   
149
   * stabilize.

    
   
150
   * 

    
   
151
   * @param nextElemLength size of next element to be added

    
   
152
   */

    
   
153
  public void increaseBufferSpace(int nextElemLength) {

    
   
154
    // Keep doubling buffer size until there will be enough space for next element.

    
   
155
    int newLength = 2 * buffer.length; 

    
   
156
    while(nextFree + nextElemLength > newLength) {

    
   
157
      newLength *= 2;

    
   
158
    }

    
   
159
    // Allocate new buffer, copy data to it, and set buffer to new buffer.

    
   
160
    byte[] newBuffer = new byte[newLength];

    
   
161
    System.arraycopy(buffer, 0, newBuffer, 0, nextFree);

    
   
162
    buffer = newBuffer;

    
   
163
  }

    
   
164

   

    
   
165
  @Override

    
   
166
  public Writable getWritableObject(int index) {

    
   
167
    // TODO finish this

    
   
168
    assert false : "not implemented";

    
   
169
    return null;

    
   
170
  }

    
   
171
  

    
   
172
}
ql/src/java/org/apache/hadoop/hive/ql/exec/vector/ColumnVector.java
New File
 
ql/src/java/org/apache/hadoop/hive/ql/exec/vector/DoubleColumnVector.java
New File
 
ql/src/java/org/apache/hadoop/hive/ql/exec/vector/LongColumnVector.java
New File
 
ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedRowBatch.java
New File
 
ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestVectorizedRowBatch.java
New File
 
  1. ql/src/java/org/apache/hadoop/hive/ql/exec/vector/BytesColumnVector.java: Loading...
  2. ql/src/java/org/apache/hadoop/hive/ql/exec/vector/ColumnVector.java: Loading...
  3. ql/src/java/org/apache/hadoop/hive/ql/exec/vector/DoubleColumnVector.java: Loading...
  4. ql/src/java/org/apache/hadoop/hive/ql/exec/vector/LongColumnVector.java: Loading...
  5. ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedRowBatch.java: Loading...
  6. ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestVectorizedRowBatch.java: Loading...