Review Board 1.7.22


HBASE-5196 Failure in region split after PONR could cause region hole

Review Request #3488 - Created Jan. 13, 2012 and submitted

Jimmy Xiang
HBASE-5196
Reviewers
hbase
hbase-git
When the master starts up, this patch scans .META. for all offline split parents and fixes up any missing daughters, as the ServerShutdownHandler does.
I tested the fix on my real cluster and it does fix the problem.

I am working on a unit test now.
src/main/java/org/apache/hadoop/hbase/master/HMaster.java
Revision cb2f084 New Change
[20] 22 lines
[+20]
23
import java.lang.reflect.Constructor;
23
import java.lang.reflect.Constructor;
24
import java.lang.reflect.InvocationTargetException;
24
import java.lang.reflect.InvocationTargetException;
25
import java.net.InetAddress;
25
import java.net.InetAddress;
26
import java.net.InetSocketAddress;
26
import java.net.InetSocketAddress;
27
import java.util.ArrayList;
27
import java.util.ArrayList;

    
   
28
import java.util.HashMap;
28
import java.util.List;
29
import java.util.List;
29
import java.util.Map;
30
import java.util.Map;
30
import java.util.Set;
31
import java.util.Set;
31
import java.util.concurrent.atomic.AtomicReference;
32
import java.util.concurrent.atomic.AtomicReference;
32
import java.util.concurrent.Callable;
33
import java.util.concurrent.Callable;
[+20] [20] 33 lines
[+20]
66
import org.apache.hadoop.hbase.ipc.HBaseServer;
67
import org.apache.hadoop.hbase.ipc.HBaseServer;
67
import org.apache.hadoop.hbase.ipc.HMasterInterface;
68
import org.apache.hadoop.hbase.ipc.HMasterInterface;
68
import org.apache.hadoop.hbase.ipc.HMasterRegionInterface;
69
import org.apache.hadoop.hbase.ipc.HMasterRegionInterface;
69
import org.apache.hadoop.hbase.ipc.ProtocolSignature;
70
import org.apache.hadoop.hbase.ipc.ProtocolSignature;
70
import org.apache.hadoop.hbase.ipc.RpcServer;
71
import org.apache.hadoop.hbase.ipc.RpcServer;

    
   
72
import org.apache.hadoop.hbase.master.CatalogJanitor.SplitParentFirstComparator;
71
import org.apache.hadoop.hbase.master.handler.CreateTableHandler;
73
import org.apache.hadoop.hbase.master.handler.CreateTableHandler;
72
import org.apache.hadoop.hbase.master.handler.DeleteTableHandler;
74
import org.apache.hadoop.hbase.master.handler.DeleteTableHandler;
73
import org.apache.hadoop.hbase.master.handler.DisableTableHandler;
75
import org.apache.hadoop.hbase.master.handler.DisableTableHandler;
74
import org.apache.hadoop.hbase.master.handler.EnableTableHandler;
76
import org.apache.hadoop.hbase.master.handler.EnableTableHandler;
75
import org.apache.hadoop.hbase.master.handler.ModifyTableHandler;
77
import org.apache.hadoop.hbase.master.handler.ModifyTableHandler;

    
   
78
import org.apache.hadoop.hbase.master.handler.ServerShutdownHandler;
76
import org.apache.hadoop.hbase.master.handler.TableAddFamilyHandler;
79
import org.apache.hadoop.hbase.master.handler.TableAddFamilyHandler;
77
import org.apache.hadoop.hbase.master.handler.TableDeleteFamilyHandler;
80
import org.apache.hadoop.hbase.master.handler.TableDeleteFamilyHandler;
78
import org.apache.hadoop.hbase.master.handler.TableModifyFamilyHandler;
81
import org.apache.hadoop.hbase.master.handler.TableModifyFamilyHandler;
79
import org.apache.hadoop.hbase.master.metrics.MasterMetrics;
82
import org.apache.hadoop.hbase.master.metrics.MasterMetrics;
80
import org.apache.hadoop.hbase.monitoring.MemoryBoundedLogMessageBuffer;
83
import org.apache.hadoop.hbase.monitoring.MemoryBoundedLogMessageBuffer;
[+20] [20] 444 lines
[+20] [+] private void finishInitialization(MonitoredTask status)
525
    status.setStatus("Starting assignment manager");
528
    status.setStatus("Starting assignment manager");
526
    this.assignmentManager.joinCluster();
529
    this.assignmentManager.joinCluster();
527

    
   
530

   
528
    this.balancer.setClusterStatus(getClusterStatus());
531
    this.balancer.setClusterStatus(getClusterStatus());
529
    this.balancer.setMasterServices(this);
532
    this.balancer.setMasterServices(this);
530
    
533

   

    
   
534
    // Fixing up missing daughters if any

    
   
535
    status.setStatus("Fixing up missing daughters");

    
   
536
    fixupDaughters(status);

    
   
537

   
531
    // Start balancer and meta catalog janitor after meta and regions have
538
    // Start balancer and meta catalog janitor after meta and regions have
532
    // been assigned.
539
    // been assigned.
533
    status.setStatus("Starting balancer and catalog janitor");
540
    status.setStatus("Starting balancer and catalog janitor");
534
    this.balancerChore = getAndStartBalancerChore(this);
541
    this.balancerChore = getAndStartBalancerChore(this);
535
    this.catalogJanitorChore = new CatalogJanitor(this, this);
542
    this.catalogJanitorChore = new CatalogJanitor(this, this);
[+20] [20] 84 lines
[+20] private void finishInitialization(MonitoredTask status)
620
      ", location=" + catalogTracker.getMetaLocation());
627
      ", location=" + catalogTracker.getMetaLocation());
621
    status.setStatus("META and ROOT assigned.");
628
    status.setStatus("META and ROOT assigned.");
622
    return assigned;
629
    return assigned;
623
  }
630
  }
624

    
   
631

   

    
   
632
  void fixupDaughters(final MonitoredTask status) throws IOException {

    
   
633
    final Map<HRegionInfo, Result> offlineSplitParents =

    
   
634
      new HashMap<HRegionInfo, Result>();

    
   
635
    // This visitor collects offline split parents in the .META. table

    
   
636
    MetaReader.Visitor visitor = new MetaReader.Visitor() {

    
   
637
      @Override

    
   
638
      public boolean visit(Result r) throws IOException {

    
   
639
        if (r == null || r.isEmpty()) return true;

    
   
640
        HRegionInfo info =

    
   
641
          MetaReader.parseHRegionInfoFromCatalogResult(

    
   
642
            r, HConstants.REGIONINFO_QUALIFIER);

    
   
643
        if (info == null) return true; // Keep scanning

    
   
644
        if (info.isOffline() && info.isSplit()) {

    
   
645
          offlineSplitParents.put(info, r);

    
   
646
        }

    
   
647
        // Returning true means "keep scanning"

    
   
648
        return true;

    
   
649
      }

    
   
650
    };

    
   
651
    // Run full scan of .META. catalog table passing in our custom visitor

    
   
652
    MetaReader.fullScan(this.catalogTracker, visitor);

    
   
653
    // Now work on our list of found parents. See if any we can clean up.

    
   
654
    int fixups = 0;

    
   
655
    for (Map.Entry<HRegionInfo, Result> e : offlineSplitParents.entrySet()) {

    
   
656
      if (ServerShutdownHandler.fixupDaughters(

    
   
657
          e.getValue(), assignmentManager, catalogTracker)) {

    
   
658
        fixups++;

    
   
659
      }

    
   
660
    }

    
   
661
    if (fixups != 0) {

    
   
662
      LOG.info("Scanned the catalog and fixed up " + fixups +

    
   
663
        " missing daughter region(s)");

    
   
664
    }

    
   
665
  }

    
   
666

   
625
  /**
667
  /**
626
   * Expire a server if we find it is one of the online servers set.
668
   * Expire a server if we find it is one of the online servers set.
627
   * @param sn ServerName to check.
669
   * @param sn ServerName to check.
628
   * @return True if server was online and so we expired it as unreachable.
670
   * @return True if server was online and so we expired it as unreachable.
629
   */
671
   */
[+20] [20] 1103 lines
src/main/java/org/apache/hadoop/hbase/master/handler/ServerShutdownHandler.java
Revision 8f4f4b8 New Change
 
src/main/java/org/apache/hadoop/hbase/regionserver/SplitRequest.java
Revision 41f5dff New Change
 
  1. src/main/java/org/apache/hadoop/hbase/master/HMaster.java: Loading...
  2. src/main/java/org/apache/hadoop/hbase/master/handler/ServerShutdownHandler.java: Loading...
  3. src/main/java/org/apache/hadoop/hbase/regionserver/SplitRequest.java: Loading...