

Improve RCFile::sync(long) by 10x

Review Request #10795 - Created April 26, 2013

Submitter: Gopal V
Branch: trunk
Bugs: HIVE-4423
Reviewers (groups): hive
Reviewers (people): ashutoshc, haglein
Repository: hive-git
Speed up RCFile::sync() by reading large blocks of data from HDFS rather than using readByte() on the input stream.

This improves the loop behaviour and reduces the number of calls to the synchronized read() methods within HDFS, resulting in a 10x performance boost to this function.

In wall-clock terms, a call that previously took up to a second now completes in under 100 ms, because the data is read in 512-byte chunks instead of one byte at a time.
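
To make the approach concrete, here is a minimal, self-contained sketch of the same idea (illustration only, not the patch itself; the real change is in the diff below and works against Hive's reader and its input stream). It scans for a 16-byte sync marker by reading 512-byte chunks and carrying the last 16 bytes of each chunk into the front of the next buffer, so a marker that straddles a chunk boundary is still found. The class and method names (ChunkedSyncScan, findSync) are invented for this example.

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;

public class ChunkedSyncScan {

  static final int CHUNK = 512; // same default as io.bytes.per.checksum

  /** Returns the stream offset where sync starts, or -1 if it is not found. */
  static long findSync(InputStream in, byte[] sync) throws IOException {
    int prefix = sync.length;
    byte[] buffer = new byte[prefix + CHUNK];
    // Fill the carry-over area with a byte that can never begin a match,
    // so the first pass does not see false positives in the prefix.
    Arrays.fill(buffer, (byte) (~sync[0]));

    long chunkStart = 0; // stream offset of buffer[prefix]
    int n;
    while ((n = in.read(buffer, prefix, CHUNK)) > 0) {
      int valid = prefix + n; // carried bytes plus the bytes just read
      // A match may begin inside the carried prefix and finish in the new
      // chunk, so scanning starts at buffer[0], not buffer[prefix].
      for (int i = 0; i + sync.length <= valid; i++) {
        int j = 0;
        while (j < sync.length && sync[j] == buffer[i + j]) {
          j++;
        }
        if (j == sync.length) {
          return chunkStart + i - prefix; // stream offset of buffer[i]
        }
      }
      // Carry the last 'prefix' bytes forward and advance the offset.
      System.arraycopy(buffer, valid - prefix, buffer, 0, prefix);
      chunkStart += n;
    }
    return -1;
  }

  public static void main(String[] args) throws IOException {
    byte[] sync = new byte[16];
    Arrays.fill(sync, (byte) 0xAB);

    byte[] data = new byte[2000];
    // Place the marker so it straddles the first 512-byte chunk boundary.
    System.arraycopy(sync, 0, data, 505, sync.length);

    long off = findSync(new ByteArrayInputStream(data), sync);
    System.out.println("marker found at offset " + off); // prints 505
  }
}

The payoff is the same as in the patch: one bulk read per 512-byte chunk instead of one synchronized read() per byte, which is where the roughly 10x win on this code path comes from.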
Testing done:

ant test -Dtestcase=TestRCFile -Dmodule=ql
ant test -Dtestcase=TestCliDriver -Dqfile_regex=.*rcfile.* -Dmodule=ql

Benchmarked with count(1) on the store_sales RCFile table at scale=10:

before: 43.8, after: 39.5

Diff revision 1 (Latest)

ql/src/java/org/apache/hadoop/hive/ql/io/RCFile.java
Revision d3d98d0 (new change)
     public synchronized void sync(long position) throws IOException {
       ...
         return;
       }

       try {
         seek(position + 4); // skip escape
-        in.readFully(syncCheck);
-        int syncLen = sync.length;
-        for (int i = 0; in.getPos() < end; i++) {
-          int j = 0;
-          for (; j < syncLen; j++) {
-            if (sync[j] != syncCheck[(i + j) % syncLen]) {
-              break;
-            }
-          }
-          if (j == syncLen) {
-            in.seek(in.getPos() - SYNC_SIZE); // position before
-            // sync
-            return;
-          }
-          syncCheck[i % syncLen] = in.readByte();
-        }
+
+        int prefix = sync.length;
+        int n = conf.getInt("io.bytes.per.checksum", 512);
+        byte[] buffer = new byte[prefix+n];
+        n = (int)Math.min(n, end - in.getPos());
+        /* fill array with a pattern that will never match sync */
+        Arrays.fill(buffer, (byte)(~sync[0]));
+        while(n > 0 && (in.getPos() + n) <= end) {
+          position = in.getPos();
+          in.readFully(buffer, prefix, n);
+          /* the buffer has n+sync bytes */
+          for(int i = 0; i < n; i++) {
+            int j;
+            for(j = 0; j < sync.length && sync[j] == buffer[i+j]; j++) {
+              /* nothing */
+            }
+            if(j == sync.length) {
+              /* simplified from (position + (i - prefix) + sync.length) - SYNC_SIZE */
+              in.seek(position + i - SYNC_SIZE);
+              return;
+            }
+          }
+          /* move the last 16 bytes to the prefix area */
+          System.arraycopy(buffer, buffer.length - prefix - 1, buffer, 0, prefix);
+          n = (int)Math.min(n, end - in.getPos());
+        }
       } catch (ChecksumException e) { // checksum failure
         handleChecksumException(e);
       }
     }