Review Board 1.7.22


SchemaTuple in Pig

Review Request #4651 - Created April 5, 2012 and updated

Jonathan Coveney
PIG-2632
Reviewers
pig
julien
pig
This work builds on Dmitriy's PrimitiveTuple work. The idea is that, knowing the Schema on the frontend, we can code-generate Tuples which can be used for fun and profit. In rudimentary tests, the memory efficiency is 2-4x better, and the serialized form is ~15% smaller (though this depends heavily on the data). I still need to do get/set tests, but assuming that it's on par with (or even faster than) Tuple, the memory gain is huge.

Need to clean up the code and add tests.

Right now, it generates a SchemaTuple for every inputSchema and outputSchema given to UDFs. The next step is to make a SchemaBag, where I think the serialization savings will be huge.

Needs tests and comments, but I want the code to settle a bit.

 

Changes between revision 2 and 3

1 2 3 4 5 6 7 8 9 10
1 2 3 4 5 6 7 8 9 10

  1. trunk/src/org/apache/pig/data/SchemaTupleClassGenerator.java: Loading...
  2. trunk/src/org/apache/pig/data/TupleFactory.java: Loading...
trunk/src/org/apache/pig/data/SchemaTupleClassGenerator.java
Diff Revision 2 Diff Revision 3
[20] 249 lines
[+20] [+] public void prepare() {
250
            add("    } else {");
250
            add("    } else {");
251
            add("        int i = 0;");
251
            add("        int i = 0;");
252
            add("        boolean themNull;");
252
            add("        boolean themNull;");
253
        }
253
        }
254

    
   
254

   
255
        boolean compVal = false;

   
256
        boolean compTup = false;
255
        boolean compTup = false;
257
        boolean compStr = false;
256
        boolean compStr = false;
258
        boolean compIsNull = false;
257
        boolean compIsNull = false;
259
        boolean compByte = false;
258
        boolean compByte = false;
260

    
   
259

   
[+20] [20] 28 lines
[+20] [+] public void process(int fieldNum, Schema.FieldSchema fs) {
289
            } else if (isString()) {
288
            } else if (isString()) {
290
                if (!compStr) {
289
                if (!compStr) {
291
                   add("            String str;");
290
                   add("            String str;");
292
                   compStr = true;
291
                   compStr = true;
293
                }
292
                }
294
                add("        str = t.getString("+fieldNum+");");
293
                add("        try {");

    
   
294
                add("            str = t.getString("+fieldNum+");");

    
   
295
                add("        } catch (ExecException e) {");

    
   
296
                add("            throw new RuntimeException(\"Failed to retrieve String field "+fieldNum+" in tuple: \" + t, e);");

    
   
297
                add("        }");
295
                add("        compIsNull = str == null || themNull;");
298
                add("        compIsNull = str == null || themNull;");
296
                add("        if (pos_"+fieldNum+" == null && !compIsNull) {");
299
                add("        if (pos_"+fieldNum+" == null && !compIsNull) {");
297
                add("            return -1;");
300
                add("            return -1;");
298
                add("        } else if (!compIsNull) {");
301
                add("        } else if (!compIsNull) {");
299
                add("            i = pos_"+fieldNum+".compareTo(str);");
302
                add("            i = pos_"+fieldNum+".compareTo(str);");
300
                add("            if (i != 0)");
303
                add("            if (i != 0)");
301
                add("                return i;");
304
                add("                return i;");
302
                add("        } else if (pos_"+fieldNum+" != null) {");
305
                add("        } else if (pos_"+fieldNum+" != null) {");
303
                add("            return 1;");
306
                add("            return 1;");
304
                add("        }");
307
                add("        }");
305
            } else if (isBytearray()) {
308
            } else if (isBytearray()) {
306
                if (!compByte) {
309
                if (!compByte) {
307
                    add("            byte[] compBuf;");
310
                    add("        byte[] compBuf;");
308
                    compVal = true;
311
                    compByte = true;
309
                }
312
                }
310
                add("        compBuf = t.getBytes("+fieldNum+");");
313
                add("        try {");

    
   
314
                add("            compBuf = t.getBytes("+fieldNum+");");

    
   
315
                add("        } catch (ExecException e) {");

    
   
316
                add("            throw new RuntimeException(\"Failed to retrieve byte[] field "+fieldNum+" in tuple: \" + t, e);");

    
   
317
                add("        }");
311
                add("        compIsNull = compBuf == null || themNull;");
318
                add("        compIsNull = compBuf == null || themNull;");
312
                add("        if (pos_"+fieldNum+" == null && !compIsNull) {");
319
                add("        if (pos_"+fieldNum+" == null && !compIsNull) {");
313
                add("            return -1;");
320
                add("            return -1;");
314
                add("        } else if (!compIsNull) {");
321
                add("        } else if (!compIsNull) {");
315
                add("            i = new DataByteArray(pos_"+fieldNum+").compareTo(new DataByteArray(compBuf));");
322
                add("            i = new DataByteArray(pos_"+fieldNum+").compareTo(new DataByteArray(compBuf));");
[+20] [20] 999 lines
trunk/src/org/apache/pig/data/TupleFactory.java
Diff Revision 2 Diff Revision 3
 
  1. trunk/src/org/apache/pig/data/SchemaTupleClassGenerator.java: Loading...
  2. trunk/src/org/apache/pig/data/TupleFactory.java: Loading...