Review Board 1.7.22


New code for VectorizedRowBatch to form basis of vectorized query execution

Review Request #10592 - Created April 17, 2013 and updated

Eric Hanson
vectorization
HIVE-4284
Reviewers
hive
hive-git
New code for VectorizedRowBatch to form basis of vectorized query execution

 
ql/src/java/org/apache/hadoop/hive/ql/exec/vector/BytesColumnVector.java
New File

    
   
1
/**

    
   
2
 * Licensed to the Apache Software Foundation (ASF) under one

    
   
3
 * or more contributor license agreements.  See the NOTICE file

    
   
4
 * distributed with this work for additional information

    
   
5
 * regarding copyright ownership.  The ASF licenses this file

    
   
6
 * to you under the Apache License, Version 2.0 (the

    
   
7
 * "License"); you may not use this file except in compliance

    
   
8
 * with the License.  You may obtain a copy of the License at

    
   
9
 *

    
   
10
 *     http://www.apache.org/licenses/LICENSE-2.0

    
   
11
 *

    
   
12
 * Unless required by applicable law or agreed to in writing, software

    
   
13
 * distributed under the License is distributed on an "AS IS" BASIS,

    
   
14
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

    
   
15
 * See the License for the specific language governing permissions and

    
   
16
 * limitations under the License.

    
   
17
 */

    
   
18

   

    
   
19
package org.apache.hadoop.hive.ql.exec.vector;

    
   
20

   

    
   
21
import org.apache.hadoop.io.Writable;

    
   
22

   

    
   
23
/**

    
   
24
 * This class supports string and binary data by value reference -- i.e. each field is 

    
   
25
 * explicitly present, as opposed to provided by a dictionary reference.

    
   
26
 * In some cases, all the values will be in the same byte array to begin with,

    
   
27
 * but this need not be the case. If each value is in a separate byte 

    
   
28
 * array to start with, or not all of the values are in the same original

    
   
29
 * byte array, you can still assign data by reference into this column vector.

    
   
30
 * This gives flexibility to use this in multiple situations. 

    
   
31
 * <p>

    
   
32
 * When setting data by reference, the caller

    
   
33
 * is responsible for allocating the byte arrays used to hold the data.

    
   
34
 * You can also set data by value, as long as you call the initBuffer() method first.

    
   
35
 * You can mix "by value" and "by reference" in the same column vector,

    
   
36
 * though that use is probably not typical.

    
   
37
 */

    
   
38
public class BytesColumnVector extends ColumnVector {

    
   
39
  public byte[][] vector; 

    
   
40
  public int[] start;          // start offset of each field

    
   
41
  

    
   
42
  /*

    
   
43
   * The length of each field. If the value repeats for every entry, then it is stored 

    
   
44
   * in vector[0] and isRepeating from the superclass is set to true.

    
   
45
   */

    
   
46
  public int[] length; 

    
   
47
  private byte[] buffer;   // optional buffer to use when actually copying in data

    
   
48
  private int nextFree;    // next free position in buffer

    
   
49
  

    
   
50
  // Estimate that there will be 16 bytes per entry

    
   
51
  static final int DEFAULT_BUFFER_SIZE = 16 * VectorizedRowBatch.DEFAULT_SIZE;

    
   
52
  

    
   
53
  // Proportion of extra space to provide when allocating more buffer space. 

    
   
54
  static final float EXTRA_SPACE_FACTOR = (float) 1.2;

    
   
55
  

    
   
56
  /**

    
   
57
   * Use this constructor for normal operation.

    
   
58
   * All column vectors should be the default size normally.

    
   
59
   */

    
   
60
  public BytesColumnVector() {

    
   
61
    this(VectorizedRowBatch.DEFAULT_SIZE);

    
   
62
  }

    
   
63
  

    
   
64
  /**

    
   
65
   * Don't call this constructor except for testing purposes.

    
   
66
   * 

    
   
67
   * @param size  number of elements in the column vector

    
   
68
   */

    
   
69
  public BytesColumnVector(int size) {

    
   
70
    super(size);

    
   
71
    vector = new byte[size][];

    
   
72
    start = new int[size];

    
   
73
    length = new int[size]; 

    
   
74
  }

    
   
75
  

    
   
76
  /** Set a field by reference.

    
   
77
   *  

    
   
78
   * @param elementNum index within column vector to set

    
   
79
   * @param sourceBuf container of source data

    
   
80
   * @param start start byte position within source

    
   
81
   * @param length  length of source byte sequence

    
   
82
   */

    
   
83
  public void setRef(int elementNum, byte[] sourceBuf, int start, int length) {

    
   
84
    vector[elementNum] = sourceBuf;

    
   
85
    this.start[elementNum] = start;

    
   
86
    this.length[elementNum] = length;

    
   
87
  }

    
   
88
  

    
   
89
  /** 

    
   
90
   * You must call initBuffer first before using setVal().

    
   
91
   * Provide the estimated number of bytes needed to hold

    
   
92
   * a full column vector worth of byte string data.

    
   
93
   * 

    
   
94
   * @param estimatedValueSize  Estimated size of buffer space needed

    
   
95
   */

    
   
96
  public void initBuffer(int estimatedValueSize) {

    
   
97
    nextFree = 0;

    
   
98
    

    
   
99
    // if buffer is already allocated, keep using it, don't re-allocate

    
   
100
    if (buffer != null) {

    
   
101
      return;

    
   
102
    }

    
   
103
    

    
   
104
    // allocate a little extra space to limit need to re-allocate

    
   
105
    int bufferSize = this.vector.length * (int)(estimatedValueSize * EXTRA_SPACE_FACTOR);

    
   
106
    if (bufferSize < DEFAULT_BUFFER_SIZE) {

    
   
107
      bufferSize = DEFAULT_BUFFER_SIZE;

    
   
108
    }

    
   
109
    buffer = new byte[bufferSize]; 

    
   
110
  }

    
   
111
  

    
   
112
  /**

    
   
113
   * Initialize buffer to default size.

    
   
114
   */

    
   
115
  public void initBuffer() {

    
   
116
    initBuffer(0);

    
   
117
  }

    
   
118
  

    
   
119
  /**

    
   
120
   * @return amount of buffer space currently allocated

    
   
121
   */

    
   
122
  public int bufferSize() {

    
   
123
    if (buffer == null) {

    
   
124
      return 0;

    
   
125
    }

    
   
126
    return buffer.length;

    
   
127
  }

    
   
128
  

    
   
129
  /**

    
   
130
   * Set a field by actually copying in to a local buffer.

    
   
131
   * If you must actually copy data in to the array, use this method.

    
   
132
   * DO NOT USE this method unless it's not practical to set data by reference with setRef().

    
   
133
   * Setting data by reference tends to run a lot faster than copying data in.

    
   
134
   * 

    
   
135
   * @param elementNum index within column vector to set

    
   
136
   * @param sourceBuf container of source data

    
   
137
   * @param start start byte position within source

    
   
138
   * @param length  length of source byte sequence

    
   
139
   */

    
   
140
  public void setVal(int elementNum, byte[] sourceBuf, int start, int length) {

    
   
141
    if ((nextFree + length) > buffer.length) {

    
   
142
      increaseBufferSpace(length);

    
   
143
    }

    
   
144
    System.arraycopy(sourceBuf, start, buffer, nextFree, length);

    
   
145
    vector[elementNum] = buffer;

    
   
146
    this.start[elementNum] = nextFree;

    
   
147
    this.length[elementNum] = length;

    
   
148
    nextFree += length;

    
   
149
  }

    
   
150
  

    
   
151
  /**

    
   
152
   * Increase buffer space enough to accommodate next element.

    
   
153
   * This uses an exponential increase mechanism to rapidly 

    
   
154
   * increase buffer size to enough to hold all data.

    
   
155
   * As batches get re-loaded, buffer space allocated will quickly

    
   
156
   * stabilize.

    
   
157
   * 

    
   
158
   * @param nextElemLength size of next element to be added

    
   
159
   */

    
   
160
  public void increaseBufferSpace(int nextElemLength) {

    
   
161
    

    
   
162
    // Keep doubling buffer size until there will be enough space for next element.

    
   
163
    int newLength = 2 * buffer.length; 

    
   
164
    while((nextFree + nextElemLength) > newLength) {

    
   
165
      newLength *= 2;

    
   
166
    }

    
   
167
    

    
   
168
    // Allocate new buffer, copy data to it, and set buffer to new buffer.

    
   
169
    byte[] newBuffer = new byte[newLength];

    
   
170
    System.arraycopy(buffer, 0, newBuffer, 0, nextFree);

    
   
171
    buffer = newBuffer;

    
   
172
  }

    
   
173

   

    
   
174
  @Override

    
   
175
  public Writable getWritableObject(int index) {

    
   
176
    

    
   
177
    // TODO finish this

    
   
178
    throw new UnsupportedOperationException("unfinished");

    
   
179
  }

    
   
180
  

    
   
181
}
ql/src/java/org/apache/hadoop/hive/ql/exec/vector/ColumnVector.java
New File
 
ql/src/java/org/apache/hadoop/hive/ql/exec/vector/DoubleColumnVector.java
New File
 
ql/src/java/org/apache/hadoop/hive/ql/exec/vector/LongColumnVector.java
New File
 
ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedRowBatch.java
New File
 
ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestVectorizedRowBatch.java
New File
 
  1. ql/src/java/org/apache/hadoop/hive/ql/exec/vector/BytesColumnVector.java: Loading...
  2. ql/src/java/org/apache/hadoop/hive/ql/exec/vector/ColumnVector.java: Loading...
  3. ql/src/java/org/apache/hadoop/hive/ql/exec/vector/DoubleColumnVector.java: Loading...
  4. ql/src/java/org/apache/hadoop/hive/ql/exec/vector/LongColumnVector.java: Loading...
  5. ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedRowBatch.java: Loading...
  6. ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestVectorizedRowBatch.java: Loading...