Review Board 1.7.22


SchemaTuple in Pig

Review Request #4651 - Created April 5, 2012 and updated

Jonathan Coveney
PIG-2632
Reviewers
pig
julien
pig
This work builds on Dmitriy's PrimitiveTuple work. The idea is that, knowing the Schema on the frontend, we can code-generate Tuples which can be used for fun and profit. In rudimentary tests, the memory efficiency is 2-4x better, and it's ~15% smaller serialized (though this depends heavily on the data). I still need to do get/set tests, but assuming that it's on par with (or even faster than) Tuple, the memory gain is huge.

Need to clean up the code and add tests.

Right now, it generates a SchemaTuple for every inputSchema and outputSchema given to UDFs. The next step is to make a SchemaBag, where I think the serialization savings will be really huge.

Needs tests and comments, but I want the code to settle a bit.

 

Diff revision 6

This is not the most recent revision of the diff. The latest diff is revision 10. See what's changed.

1 2 3 4 5 6 7 8 9 10
1 2 3 4 5 6 7 8 9 10

  1. trunk/src/org/apache/pig/backend/hadoop/executionengine/mapReduceLayer/JobControlCompiler.java: Loading...
  2. trunk/src/org/apache/pig/backend/hadoop/executionengine/mapReduceLayer/PigGenericMapBase.java: Loading...
  3. trunk/src/org/apache/pig/backend/hadoop/executionengine/mapReduceLayer/PigGenericMapReduce.java: Loading...
  4. trunk/src/org/apache/pig/backend/hadoop/executionengine/mapReduceLayer/PigTupleDefaultRawComparator.java: Loading...
  5. trunk/src/org/apache/pig/backend/hadoop/executionengine/physicalLayer/PhysicalOperator.java: Loading...
  6. trunk/src/org/apache/pig/backend/hadoop/executionengine/physicalLayer/expressionOperators/POUserFunc.java: Loading...
  7. trunk/src/org/apache/pig/data/AppendableSchemaTuple.java: Loading...
  8. trunk/src/org/apache/pig/data/BinInterSedes.java: Loading...
  9. trunk/src/org/apache/pig/data/BinSedesTupleFactory.java: Loading...
  10. trunk/src/org/apache/pig/data/DataByteArray.java: Loading...
  11. trunk/src/org/apache/pig/data/FieldIsNullException.java: Loading...
  12. trunk/src/org/apache/pig/data/PBooleanTuple.java: Loading...
  13. trunk/src/org/apache/pig/data/PDoubleTuple.java: Loading...
  14. trunk/src/org/apache/pig/data/PFloatTuple.java: Loading...
  15. trunk/src/org/apache/pig/data/PIntTuple.java: Loading...
  16. trunk/src/org/apache/pig/data/PLongTuple.java: Loading...
  17. trunk/src/org/apache/pig/data/PStringTuple.java: Loading...
  18. trunk/src/org/apache/pig/data/PrimitiveFieldTuple.java: Loading...
  19. trunk/src/org/apache/pig/data/PrimitiveTuple.java: Loading...
  20. trunk/src/org/apache/pig/data/SchemaTuple.java: Loading...
This diff has been split across 2 pages: 1 2 >
trunk/src/org/apache/pig/backend/hadoop/executionengine/mapReduceLayer/JobControlCompiler.java
Revision 1351455 New Change
[20] 69 lines
[+20]
70
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POStore;
70
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POStore;
71
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.util.PlanHelper;
71
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.util.PlanHelper;
72
import org.apache.pig.backend.hadoop.executionengine.shims.HadoopShims;
72
import org.apache.pig.backend.hadoop.executionengine.shims.HadoopShims;
73
import org.apache.pig.data.BagFactory;
73
import org.apache.pig.data.BagFactory;
74
import org.apache.pig.data.DataType;
74
import org.apache.pig.data.DataType;

    
   
75
import org.apache.pig.data.SchemaTupleFrontend;
75
import org.apache.pig.data.Tuple;
76
import org.apache.pig.data.Tuple;
76
import org.apache.pig.data.TupleFactory;
77
import org.apache.pig.data.TupleFactory;
77
import org.apache.pig.impl.PigContext;
78
import org.apache.pig.impl.PigContext;
78
import org.apache.pig.impl.io.FileLocalizer;
79
import org.apache.pig.impl.io.FileLocalizer;
79
import org.apache.pig.impl.io.FileSpec;
80
import org.apache.pig.impl.io.FileSpec;
[+20] [20] 15 lines
[+20]
95
import org.apache.pig.impl.util.Pair;
96
import org.apache.pig.impl.util.Pair;
96
import org.apache.pig.impl.util.UDFContext;
97
import org.apache.pig.impl.util.UDFContext;
97
import org.apache.pig.impl.util.Utils;
98
import org.apache.pig.impl.util.Utils;
98
import org.apache.pig.tools.pigstats.ScriptState;
99
import org.apache.pig.tools.pigstats.ScriptState;
99

    
   
100

   
100

    
   

   
101
/**
101
/**
102
 * This is compiler class that takes an MROperPlan and converts
102
 * This is compiler class that takes an MROperPlan and converts
103
 * it into a JobControl object with the relevant dependency info
103
 * it into a JobControl object with the relevant dependency info
104
 * maintained. The JobControl Object is made up of Jobs each of
104
 * maintained. The JobControl Object is made up of Jobs each of
105
 * which has a JobConf. The MapReduceOper corresponds to a Job
105
 * which has a JobConf. The MapReduceOper corresponds to a Job
[+20] [20] 473 lines
[+20] [+] private Job getJob(MROperPlan plan, MapReduceOper mro, Configuration config, PigContext pigContext) throws JobCreationException{
579

    
   
579

   
580
            // Search to see if we have any UDFs that need to pack things into the
580
            // Search to see if we have any UDFs that need to pack things into the
581
            // distrubted cache.
581
            // distrubted cache.
582
            setupDistributedCacheForUdfs(mro, pigContext, conf);
582
            setupDistributedCacheForUdfs(mro, pigContext, conf);
583

    
   
583

   

    
   
584
            SchemaTupleFrontend.copyAllGeneratedToDistributedCache(pigContext, conf);

    
   
585

   
584
            POPackage pack = null;
586
            POPackage pack = null;
585
            if(mro.reducePlan.isEmpty()){
587
            if(mro.reducePlan.isEmpty()){
586
                //MapOnly Job
588
                //MapOnly Job
587
                nwJob.setMapperClass(PigMapOnly.Map.class);
589
                nwJob.setMapperClass(PigMapOnly.Map.class);
588
                nwJob.setNumReduceTasks(0);
590
                nwJob.setNumReduceTasks(0);
[+20] [20] 887 lines
trunk/src/org/apache/pig/backend/hadoop/executionengine/mapReduceLayer/PigGenericMapBase.java
Revision 1351455 New Change
 
trunk/src/org/apache/pig/backend/hadoop/executionengine/mapReduceLayer/PigGenericMapReduce.java
Revision 1351455 New Change
 
trunk/src/org/apache/pig/backend/hadoop/executionengine/mapReduceLayer/PigTupleDefaultRawComparator.java
Revision 1351455 New Change
 
trunk/src/org/apache/pig/backend/hadoop/executionengine/physicalLayer/PhysicalOperator.java
Revision 1351455 New Change
 
trunk/src/org/apache/pig/backend/hadoop/executionengine/physicalLayer/expressionOperators/POUserFunc.java
Revision 1351455 New Change
 
trunk/src/org/apache/pig/data/AppendableSchemaTuple.java
New File
 
trunk/src/org/apache/pig/data/BinInterSedes.java
Revision 1351455 New Change
 
trunk/src/org/apache/pig/data/BinSedesTupleFactory.java
Revision 1351455 New Change
 
trunk/src/org/apache/pig/data/DataByteArray.java
Revision 1351455 New Change
 
trunk/src/org/apache/pig/data/FieldIsNullException.java
New File
 
trunk/src/org/apache/pig/data/PBooleanTuple.java
Revision 1351455 New Change
 
trunk/src/org/apache/pig/data/PDoubleTuple.java
Revision 1351455 New Change
 
trunk/src/org/apache/pig/data/PFloatTuple.java
Revision 1351455 New Change
 
trunk/src/org/apache/pig/data/PIntTuple.java
Revision 1351455 New Change
 
trunk/src/org/apache/pig/data/PLongTuple.java
Revision 1351455 New Change
 
trunk/src/org/apache/pig/data/PStringTuple.java
Revision 1351455 New Change
 
trunk/src/org/apache/pig/data/PrimitiveFieldTuple.java
Revision 1351455 New Change
 
trunk/src/org/apache/pig/data/PrimitiveTuple.java
Revision 1351455 New Change
 
trunk/src/org/apache/pig/data/SchemaTuple.java
New File
 
  1. trunk/src/org/apache/pig/backend/hadoop/executionengine/mapReduceLayer/JobControlCompiler.java: Loading...
  2. trunk/src/org/apache/pig/backend/hadoop/executionengine/mapReduceLayer/PigGenericMapBase.java: Loading...
  3. trunk/src/org/apache/pig/backend/hadoop/executionengine/mapReduceLayer/PigGenericMapReduce.java: Loading...
  4. trunk/src/org/apache/pig/backend/hadoop/executionengine/mapReduceLayer/PigTupleDefaultRawComparator.java: Loading...
  5. trunk/src/org/apache/pig/backend/hadoop/executionengine/physicalLayer/PhysicalOperator.java: Loading...
  6. trunk/src/org/apache/pig/backend/hadoop/executionengine/physicalLayer/expressionOperators/POUserFunc.java: Loading...
  7. trunk/src/org/apache/pig/data/AppendableSchemaTuple.java: Loading...
  8. trunk/src/org/apache/pig/data/BinInterSedes.java: Loading...
  9. trunk/src/org/apache/pig/data/BinSedesTupleFactory.java: Loading...
  10. trunk/src/org/apache/pig/data/DataByteArray.java: Loading...
  11. trunk/src/org/apache/pig/data/FieldIsNullException.java: Loading...
  12. trunk/src/org/apache/pig/data/PBooleanTuple.java: Loading...
  13. trunk/src/org/apache/pig/data/PDoubleTuple.java: Loading...
  14. trunk/src/org/apache/pig/data/PFloatTuple.java: Loading...
  15. trunk/src/org/apache/pig/data/PIntTuple.java: Loading...
  16. trunk/src/org/apache/pig/data/PLongTuple.java: Loading...
  17. trunk/src/org/apache/pig/data/PStringTuple.java: Loading...
  18. trunk/src/org/apache/pig/data/PrimitiveFieldTuple.java: Loading...
  19. trunk/src/org/apache/pig/data/PrimitiveTuple.java: Loading...
  20. trunk/src/org/apache/pig/data/SchemaTuple.java: Loading...
This diff has been split across 2 pages: 1 2 >