Review Board 1.7.22


HBASE-5081 Distributed log splitting deleteNode races against splitLog retry

Review Request #3292 - Created Dec. 21, 2011 and discarded

Jimmy Xiang
0.92
HBASE-5081
Reviewers
hbase
lhofhansl, stack, tedyu
hbase-git
In this patch, after a task is done, we don't delete the node if the task has failed, so that when it is retried later on, there won't be a race problem.

Previously, the node was always deleted.
mvn -Dtest=TestDistributedLogSplitting clean test

Diff revision 4

This is not the most recent revision of the diff. The latest diff is revision 8. See what's changed.

1 2 3 4 5 6 7 8
1 2 3 4 5 6 7 8

  1. src/main/java/org/apache/hadoop/hbase/master/SplitLogManager.java: Loading...
src/main/java/org/apache/hadoop/hbase/master/SplitLogManager.java
Revision 667a8b1 New Change
[20] 329 lines
[+20] [+] private void setDone(String path, TerminationStatus status) {
330
      } else {
330
      } else {
331
        tot_mgr_log_split_err.incrementAndGet();
331
        tot_mgr_log_split_err.incrementAndGet();
332
        LOG.warn("Error splitting " + path);
332
        LOG.warn("Error splitting " + path);
333
      }
333
      }
334
    }
334
    }

    
   
335
    boolean safeToDeleteNode = true;
335
    Task task = tasks.get(path);
336
    Task task = tasks.get(path);
336
    if (task == null) {
337
    if (task == null) {
337
      if (!ZKSplitLog.isRescanNode(watcher, path)) {
338
      if (!ZKSplitLog.isRescanNode(watcher, path)) {
338
        tot_mgr_unacquired_orphan_done.incrementAndGet();
339
        tot_mgr_unacquired_orphan_done.incrementAndGet();
339
        LOG.debug("unacquired orphan task is done " + path);
340
        LOG.debug("unacquired orphan task is done " + path);
[+20] [20] 7 lines
[+20] private void setDone(String path, TerminationStatus status) {
347
        if (!task.isOrphan()) {
348
        if (!task.isOrphan()) {
348
          synchronized (task.batch) {
349
          synchronized (task.batch) {
349
            if (status == SUCCESS) {
350
            if (status == SUCCESS) {
350
              task.batch.done++;
351
              task.batch.done++;
351
            } else {
352
            } else {

    
   
353
              // Asynchronously deleting the node will cause a race

    
   
354
              // against split log retry.  In this case, we can leave the node there.

    
   
355
              safeToDeleteNode = false;
352
              task.batch.error++;
356
              task.batch.error++;
353
            }
357
            }
354
            task.batch.notify();
358
            task.batch.notify();
355
          }
359
          }
356
        }
360
        }
357
      }
361
      }
358
    }
362
    }
359
    // delete the task node in zk. Keep trying indefinitely - its an async
363
    // delete the task node in zk. Keep trying indefinitely - its an async
360
    // call and no one is blocked waiting for this node to be deleted. All
364
    // call and no one is blocked waiting for this node to be deleted. All
361
    // task names are unique (log.<timestamp>) there is no risk of deleting
365
    // task names are unique (log.<timestamp>) there is no risk of deleting
362
    // a future task.
366
    // a future task.  This is true if the task status is SUCCESS, if not,

    
   
367
    // it will race against split log retry. It will be safer to leave the

    
   
368
    // node there if the task is failed and it is not an orphan.

    
   
369
    if (safeToDeleteNode) {
363
    deleteNode(path, Long.MAX_VALUE);
370
      deleteNode(path, Long.MAX_VALUE);

    
   
371
    }
364
    return;
372
    return;
365
  }
373
  }
366

    
   
374

   
367
  private void createNode(String path, Long retry_count) {
375
  private void createNode(String path, Long retry_count) {
368
    ZKUtil.asyncCreate(this.watcher, path,
376
    ZKUtil.asyncCreate(this.watcher, path,
[+20] [20] 723 lines
  1. src/main/java/org/apache/hadoop/hbase/master/SplitLogManager.java: Loading...