Review Board 1.7.22


HIVE-4732 Reduce or eliminate the expensive Schema equals() check for AvroSerde

Review Request #12480 - Created July 11, 2013 and updated

Mohammad Islam
trunk
HIVE-4732
Reviewers
hive
ashutoshc, jghoman
hive-git
From our performance analysis, we found that AvroSerde's schema.equals() call consumed a substantial amount (nearly 40%) of time. This patch aims to minimize the number of schema.equals() calls by deferring the check as late as possible and performing it as few times as possible.

First, we added a unique ID to each record reader, which is then included in every AvroGenericRecordWritable. Then, we introduced two new data structures (one HashSet and one HashMap) to store intermediate data and avoid duplicate checks. The HashSet contains the IDs of all record readers that don't need any re-encoding. The HashMap, on the other hand, contains the re-encoders that have already been used; it works as a cache and allows re-encoders to be reused. With this change, our test shows a nearly 40% reduction in Avro record reading time.
 
   

 
ql/src/java/org/apache/hadoop/hive/ql/io/avro/AvroGenericRecordReader.java
Revision ed2a9af New Change
[20] 18 lines
[+20]
19

    
   
19

   
20

    
   
20

   
21
import java.io.IOException;
21
import java.io.IOException;
22
import java.util.Map;
22
import java.util.Map;
23
import java.util.Properties;
23
import java.util.Properties;

    
   
24
import java.util.UUID;
24

    
   
25

   
25
import org.apache.avro.Schema;
26
import org.apache.avro.Schema;
26
import org.apache.avro.file.DataFileReader;
27
import org.apache.avro.file.DataFileReader;
27
import org.apache.avro.generic.GenericData;
28
import org.apache.avro.generic.GenericData;
28
import org.apache.avro.generic.GenericDatumReader;
29
import org.apache.avro.generic.GenericDatumReader;
[+20] [20] 26 lines
[+20] [+] public class AvroGenericRecordReader implements
55

    
   
56

   
56
  final private org.apache.avro.file.FileReader<GenericRecord> reader;
57
  final private org.apache.avro.file.FileReader<GenericRecord> reader;
57
  final private long start;
58
  final private long start;
58
  final private long stop;
59
  final private long stop;
59
  protected JobConf jobConf;
60
  protected JobConf jobConf;

    
   
61
  /**

    
   
62
   * A unique ID for each record reader.

    
   
63
   */

    
   
64
  final private UUID recordReaderID;
60

    
   
65

   
61
  public AvroGenericRecordReader(JobConf job, FileSplit split, Reporter reporter) throws IOException {
66
  public AvroGenericRecordReader(JobConf job, FileSplit split, Reporter reporter) throws IOException {
62
    this.jobConf = job;
67
    this.jobConf = job;
63
    Schema latest;
68
    Schema latest;
64

    
   
69

   
[+20] [20] 11 lines
[+20] public class AvroGenericRecordReader implements
76

    
   
81

   
77
    this.reader = new DataFileReader<GenericRecord>(new FsInput(split.getPath(), job), gdr);
82
    this.reader = new DataFileReader<GenericRecord>(new FsInput(split.getPath(), job), gdr);
78
    this.reader.sync(split.getStart());
83
    this.reader.sync(split.getStart());
79
    this.start = reader.tell();
84
    this.start = reader.tell();
80
    this.stop = split.getStart() + split.getLength();
85
    this.stop = split.getStart() + split.getLength();

    
   
86
    this.recordReaderID = UUID.randomUUID();
81
  }
87
  }
82

    
   
88

   
83
  /**
89
  /**
84
   * Attempt to retrieve the reader schema.  We have a couple opportunities
90
   * Attempt to retrieve the reader schema.  We have a couple opportunities
85
   * to provide this, depending on whether or not we're just selecting data
91
   * to provide this, depending on whether or not we're just selecting data
[+20] [20] 60 lines
[+20] [+] public boolean next(NullWritable nullWritable, AvroGenericRecordWritable record) throws IOException {
146
      return false;
152
      return false;
147
    }
153
    }
148

    
   
154

   
149
    GenericData.Record r = (GenericData.Record)reader.next();
155
    GenericData.Record r = (GenericData.Record)reader.next();
150
    record.setRecord(r);
156
    record.setRecord(r);

    
   
157
    record.setRecordReaderID(recordReaderID);
151

    
   
158

   
152
    return true;
159
    return true;
153
  }
160
  }
154

    
   
161

   
155
  @Override
162
  @Override
[+20] [20] 30 lines
serde/src/java/org/apache/hadoop/hive/serde2/avro/AvroDeserializer.java
Revision e994411 New Change
 
serde/src/java/org/apache/hadoop/hive/serde2/avro/AvroGenericRecordWritable.java
Revision 66f0348 New Change
 
serde/src/test/org/apache/hadoop/hive/serde2/avro/TestAvroDeserializer.java
Revision 3828940 New Change
 
serde/src/test/org/apache/hadoop/hive/serde2/avro/TestSchemaReEncoder.java
Revision 9af751b New Change
 
serde/src/test/org/apache/hadoop/hive/serde2/avro/Utils.java
Revision 2b948eb New Change
 
  1. ql/src/java/org/apache/hadoop/hive/ql/io/avro/AvroGenericRecordReader.java: Loading...
  2. serde/src/java/org/apache/hadoop/hive/serde2/avro/AvroDeserializer.java: Loading...
  3. serde/src/java/org/apache/hadoop/hive/serde2/avro/AvroGenericRecordWritable.java: Loading...
  4. serde/src/test/org/apache/hadoop/hive/serde2/avro/TestAvroDeserializer.java: Loading...
  5. serde/src/test/org/apache/hadoop/hive/serde2/avro/TestSchemaReEncoder.java: Loading...
  6. serde/src/test/org/apache/hadoop/hive/serde2/avro/Utils.java: Loading...