Review Board 1.7.22


PIG-3015 Rewrite of AvroStorage

Review Request #8104 - Created Nov. 17, 2012 and updated

Joseph Adler
PIG-3015
Reviewers
pig
cheolsoo
pig-git
The current AvroStorage implementation has a lot of issues: it requires old versions of Avro, it copies data far more than necessary, and it's verbose and complicated. (One pet peeve of mine is that old versions of Avro don't support Snappy compression.)

I rewrote AvroStorage from scratch to fix these issues. In early tests, the new implementation is significantly faster, and the code is a lot simpler. Rewriting AvroStorage also enabled me to implement support for Trevni.

This is the latest version of the patch, complete with test cases for AvroStorage and with the new TrevniStorage. (Test cases for TrevniStorage are still missing.)

 
build.xml
Revision aa6e09d New Change
[20] 311 lines
[+20]
312
    </path>
312
    </path>
313

    
   
313

   
314
    <path id="test.classpath">
314
    <path id="test.classpath">
315
        <!-- need to put this first, otherwise junit-3 testcases can break -->
315
        <!-- need to put this first, otherwise junit-3 testcases can break -->
316
        <pathelement location="${ivy.lib.dir}/junit-3.8.1.jar"/>
316
        <pathelement location="${ivy.lib.dir}/junit-3.8.1.jar"/>

    
   
317
        <pathelement location="${ivy.lib.dir}/httpclient-4.1.jar"/> <!-- needed for avrostorage tests -->
317
        <pathelement location="${build.classes}"/>
318
    	<pathelement location="${build.classes}"/>
318
        <pathelement location="${test.src.dir}"/>
319
        <pathelement location="${test.src.dir}"/>
319
        <pathelement location="${piggybank.jarfile}"/>
320
        <pathelement location="${piggybank.jarfile}"/>
320
        <path refid="classpath"/>
321
        <path refid="classpath"/>
321
    </path>
322
    </path>
[+20] [20] 7 lines
[+20]
329
            <include name="jackson-core-asl-${jackson.version}.jar"/>
330
            <include name="jackson-core-asl-${jackson.version}.jar"/>
330
            <include name="joda-time-${joda-time.version}.jar"/>
331
            <include name="joda-time-${joda-time.version}.jar"/>
331
            <include name="guava-${guava.version}.jar"/>
332
            <include name="guava-${guava.version}.jar"/>
332
            <include name="automaton-${automaton.version}.jar"/>
333
            <include name="automaton-${automaton.version}.jar"/>
333
            <include name="jansi-${jansi.version}.jar"/>
334
            <include name="jansi-${jansi.version}.jar"/>

    
   
335
            <include name="avro-${avro.version}.jar"/> 	

    
   
336
            <include name="avro-mapred-${avro.version}.jar"/> 	

    
   
337
            <include name="trevni-core-${avro.version}.jar"/> 	

    
   
338
            <include name="trevni-avro-${avro.version}.jar"/>

    
   
339
        	<include name="snappy-java-1.0.5-M3.jar"/>
334
            <include name="asm*.jar"/>
340
            <include name="asm*.jar"/>
335
        </patternset>
341
        </patternset>
336
    </fileset>
342
    </fileset>
337

    
   
343

   
338
    <fileset dir="${ivy.lib.dir}" id="runtime.dependencies.jar">
344
    <fileset dir="${ivy.lib.dir}" id="runtime.dependencies.jar">
339
        <patternset id="pattern.runtime.dependencies.jar">
345
        <patternset id="pattern.runtime.dependencies.jar">
340
            <patternset refid="pattern.runtime.dependencies-withouthadoop.jar"/>
346
            <patternset refid="pattern.runtime.dependencies-withouthadoop.jar"/>
341
            <include name="hadoop-core-${hadoop-core.version}.jar"/>
347
            <include name="hadoop-core-${hadoop-core.version}.jar"/>
342
            <include name="hadoop-*-${hadoop-common.version}.jar"/>
348
            <include name="hadoop-*-${hadoop-common.version}.jar"/>
343
            <exclude name="hadoop-hdfs-${hadoop-hdfs.version}.jar"/>
349
            <exclude name="hadoop-hdfs-${hadoop-hdfs.version}.jar"/>
344
            <include name="junit-${junit.version}.jar"/>
350
            <include name="junit-${junit.version}.jar"/>
345
            <include name="jsch-${jsch.version}.jar"/>
351
            <include name="jsch-${jsch.version}.jar"/>
346
            <include name="protobuf-java-${protobuf-java.version}.jar"/>
352
            <include name="protobuf-java-${protobuf-java.version}.jar"/>
347
            <include name="avro-${avro.version}.jar"/>

   
348
            <include name="commons*.jar"/>
353
        	<include name="commons*.jar"/>
349
            <include name="log4j*.jar"/>
354
            <include name="log4j*.jar"/>
350
            <include name="slf4j*.jar"/>
355
            <include name="slf4j*.jar"/>
351
            <include name="jsp-api*.jar"/>
356
            <include name="jsp-api*.jar"/>

    
   
357
            <include name="avro-${avro.version}.jar"/>

    
   
358
            <include name="avro-mapred-${avro.version}.jar"/>	

    
   
359
            <include name="trevni-core-${avro.version}.jar"/>	

    
   
360
            <include name="trevni-avro-${avro.version}.jar"/>

    
   
361
        	<include name="snappy-java-1.0.5-M3.jar"/>
352
            <include name="asm*.jar"/>
362
            <include name="asm*.jar"/>
353
        </patternset>
363
        </patternset>
354
    </fileset>
364
    </fileset>
355

    
   
365

   
356
    <fileset dir="${ivy.lib.dir}" id="core.dependencies.jar">
366
    <fileset dir="${ivy.lib.dir}" id="core.dependencies.jar">
[+20] [20] 1151 lines
[+20]
1508
       <ivy:configure settingsid="${ant.project.name}.ivy.settings" file="${ivysettings.xml}" override='false'/>
1518
       <ivy:configure settingsid="${ant.project.name}.ivy.settings" file="${ivysettings.xml}" override='false'/>
1509
     </target>
1519
     </target>
1510

    
   
1520

   
1511
     <target name="ivy-resolve" depends="ivy-init" unless="ivy.resolved" description="Resolve Ivy dependencies">
1521
     <target name="ivy-resolve" depends="ivy-init" unless="ivy.resolved" description="Resolve Ivy dependencies">
1512
       <property name="ivy.resolved" value="true"/>
1522
       <property name="ivy.resolved" value="true"/>
1513
       <ivy:resolve settingsRef="${ant.project.name}.ivy.settings"/>
1523
       <ivy:resolve settingsRef="${ant.project.name}.ivy.settings" refresh="true"/>
1514
     </target>
1524
     </target>
1515

    
   
1525

   
1516
     <target name="ivy-compile" depends="ivy-resolve" description="Retrieve Ivy-managed artifacts for compile configuration">
1526
     <target name="ivy-compile" depends="ivy-resolve" description="Retrieve Ivy-managed artifacts for compile configuration">
1517
       <ivy:retrieve settingsRef="${ant.project.name}.ivy.settings"
1527
       <ivy:retrieve settingsRef="${ant.project.name}.ivy.settings"
1518
                 pattern="${build.ivy.lib.dir}/${ivy.artifact.retrieve.pattern}" conf="compile"/>
1528
                 pattern="${build.ivy.lib.dir}/${ivy.artifact.retrieve.pattern}" conf="compile"/>
[+20] [20] 120 lines
ivy.xml
Revision 3a1cb2e New Change
 
.eclipse.templates/.classpath
Revision a213e93 New Change
 
ivy/libraries.properties
Revision 629feb4 New Change
 
src/docs/src/documentation/content/xdocs/func.xml
Revision 9f8d740 New Change
 
src/org/apache/pig/backend/hadoop/executionengine/mapReduceLayer/JobControlCompiler.java
Revision 5b54490 New Change
 
src/org/apache/pig/backend/hadoop/executionengine/physicalLayer/relationalOperators/POStore.java
Revision 249aecb New Change
 
src/org/apache/pig/builtin/AvroStorage.java
New File
 
src/org/apache/pig/builtin/TrevniStorage.java
New File
 
src/org/apache/pig/impl/util/avro/AvroArrayReader.java
New File
 
src/org/apache/pig/impl/util/avro/AvroBagWrapper.java
New File
 
src/org/apache/pig/impl/util/avro/AvroMapWrapper.java
New File
 
src/org/apache/pig/impl/util/avro/AvroRecordReader.java
New File
 
src/org/apache/pig/impl/util/avro/AvroRecordWriter.java
New File
 
src/org/apache/pig/impl/util/avro/AvroStorageDataConversionUtilities.java
New File
 
src/org/apache/pig/impl/util/avro/AvroStorageSchemaConversionUtilities.java
New File
 
src/org/apache/pig/impl/util/avro/AvroTupleWrapper.java
New File
 
test/commit-tests
Revision c6fbbca New Change
 
test/unit-tests
Revision 7cede06 New Change
 
test/org/apache/pig/builtin/TestAvroStorage.java
New File
 
  1. build.xml: Loading...
  2. ivy.xml: Loading...
  3. .eclipse.templates/.classpath: Loading...
  4. ivy/libraries.properties: Loading...
  5. src/docs/src/documentation/content/xdocs/func.xml: Loading...
  6. src/org/apache/pig/backend/hadoop/executionengine/mapReduceLayer/JobControlCompiler.java: Loading...
  7. src/org/apache/pig/backend/hadoop/executionengine/physicalLayer/relationalOperators/POStore.java: Loading...
  8. src/org/apache/pig/builtin/AvroStorage.java: Loading...
  9. src/org/apache/pig/builtin/TrevniStorage.java: Loading...
  10. src/org/apache/pig/impl/util/avro/AvroArrayReader.java: Loading...
  11. src/org/apache/pig/impl/util/avro/AvroBagWrapper.java: Loading...
  12. src/org/apache/pig/impl/util/avro/AvroMapWrapper.java: Loading...
  13. src/org/apache/pig/impl/util/avro/AvroRecordReader.java: Loading...
  14. src/org/apache/pig/impl/util/avro/AvroRecordWriter.java: Loading...
  15. src/org/apache/pig/impl/util/avro/AvroStorageDataConversionUtilities.java: Loading...
  16. src/org/apache/pig/impl/util/avro/AvroStorageSchemaConversionUtilities.java: Loading...
  17. src/org/apache/pig/impl/util/avro/AvroTupleWrapper.java: Loading...
  18. test/commit-tests: Loading...
  19. test/unit-tests: Loading...
  20. test/org/apache/pig/builtin/TestAvroStorage.java: Loading...
This diff has been split across 3 pages: 1 2 3 >