

HBASE-5528 Change retrying splitting log forever if throws IOException to numbered times, and abort master when retries exhausted

Review Request #4194 - Created March 6, 2012 and later updated

Submitter: shen chunhui
Branch: trunk
Reviewers: hbase
In the current log-splitting retry logic, the master retries forever if an IOException is thrown. I think we'd better change this to a bounded number of retries, and abort the master when the retries are exhausted.
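To summarize the shape of the change: instead of looping until splitting succeeds, the split is attempted a bounded number of times (the patch reads a new "hbase.hlog.split.retry.num" setting, default 2) and the master is aborted once the retries are exhausted. Below is a minimal, self-contained sketch of that control flow only; doSplit() and abortMaster() are hypothetical stand-ins for the real MasterFileSystem splitting code and HMaster.abort(), and the checkFileSystem()/halt path in the patch is omitted.

    import java.io.IOException;

    public class BoundedSplitRetrySketch {

      // Hypothetical stand-in for the actual HLog splitting work,
      // which surfaces failures as IOException.
      static void doSplit() throws IOException {
        throw new IOException("simulated split failure");
      }

      // Hypothetical stand-in for HMaster.abort(String, Throwable).
      static void abortMaster(String why, Throwable cause) {
        System.err.println("Aborting master: " + why + " (" + cause + ")");
      }

      public static void main(String[] args) {
        // In the patch this value comes from
        // conf.getInt("hbase.hlog.split.retry.num", 2).
        int retrySplittingNum = 2;
        do {
          try {
            doSplit();
            retrySplittingNum = 0;   // success: fall out of the loop
          } catch (IOException e) {
            if (retrySplittingNum > 0) {
              System.out.println("Retry splitting log, retries left = "
                  + retrySplittingNum);
            } else {
              // Retries exhausted: give up and abort the master.
              abortMaster("Failed splitting log after retry", e);
            }
          }
        } while (retrySplittingNum-- > 0);
      }
    }

With the default of 2, the split is attempted three times in total (one initial attempt plus two retries) before abortMaster() is reached, which mirrors the loop structure in the patch below.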

 
/src/main/java/org/apache/hadoop/hbase/master/MasterFileSystem.java
Revision 1297323 vs. New Change (unified view; unchanged regions elided)

[... 43 unchanged lines ...]

 import org.apache.hadoop.hbase.ServerName;
 import org.apache.hadoop.hbase.master.metrics.MasterMetrics;
 import org.apache.hadoop.hbase.regionserver.HRegion;
 import org.apache.hadoop.hbase.regionserver.wal.HLog;
 import org.apache.hadoop.hbase.regionserver.wal.HLogSplitter;
-import org.apache.hadoop.hbase.regionserver.wal.OrphanHLogAfterSplitException;
 import org.apache.hadoop.hbase.util.Bytes;
 import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
 import org.apache.hadoop.hbase.util.FSTableDescriptors;
 import org.apache.hadoop.hbase.util.FSUtils;

[... 126 unchanged lines, through public Path getRootDir() ...]

   public String getClusterId() {
     return clusterId;
   }

   /**
-   * Inspect the log directory to recover any log file without
-   * an active region server.
-   * @param onlineServers Set of online servers keyed by
-   * {@link ServerName}
+   * Inspect the log directory to recover any log file without an active region
+   * server.
+   * @param onlineServers Set of online servers keyed by {@link ServerName}
+   * @throws IOException
    */
-  void splitLogAfterStartup(final Set<ServerName> onlineServers) {
-    boolean retrySplitting = !conf.getBoolean("hbase.hlog.split.skip.errors",
-        HLog.SPLIT_SKIP_ERRORS_DEFAULT);
+  void splitLogAfterStartup(final Set<ServerName> onlineServers)
+      throws IOException {
     Path logsDirPath = new Path(this.rootdir, HConstants.HREGION_LOGDIR_NAME);
-    do {
-      List<ServerName> serverNames = new ArrayList<ServerName>();
-      try {
-        if (!this.fs.exists(logsDirPath)) return;
-        FileStatus[] logFolders = FSUtils.listStatus(this.fs, logsDirPath, null);
-
-        if (logFolders == null || logFolders.length == 0) {
-          LOG.debug("No log files to split, proceeding...");
-          return;
+    List<ServerName> serverNames = new ArrayList<ServerName>();
+    if (!this.fs.exists(logsDirPath))
+      return;
+    FileStatus[] logFolders = FSUtils.listStatus(this.fs, logsDirPath, null);
+
+    if (logFolders == null || logFolders.length == 0) {
+      LOG.debug("No log files to split, proceeding...");
+      return;

[... 13 lines elided ...]

-            LOG.info("Log folder " + status.getPath()
-                + " belongs to an existing region server");
-          }
-        }
-        splitLog(serverNames);
-        retrySplitting = false;
-      } catch (IOException ioe) {
-        LOG.warn("Failed splitting of " + serverNames, ioe);
-        if (!checkFileSystem()) {
-          LOG.warn("Bad Filesystem, exiting");
-          Runtime.getRuntime().halt(1);
-        }
-        try {
-          if (retrySplitting) {
-            Thread.sleep(conf.getInt(
-              "hbase.hlog.split.failure.retry.interval", 30 * 1000));
-          }
-        } catch (InterruptedException e) {
-          LOG.warn("Interrupted, aborting since cannot return w/o splitting");
-          Thread.currentThread().interrupt();
-          retrySplitting = false;
-          Runtime.getRuntime().halt(1);
-        }
-      }
-    } while (retrySplitting);
+        LOG.info("Log folder " + status.getPath()
+            + " belongs to an existing region server");
+      }
+    }
+    splitLog(serverNames);
   }

-  public void splitLog(final ServerName serverName) throws IOException {
+  public void splitLog(final ServerName serverName) {
     List<ServerName> serverNames = new ArrayList<ServerName>();
     serverNames.add(serverName);
     splitLog(serverNames);
   }

-  public void splitLog(final List<ServerName> serverNames) throws IOException {
+  public void splitLog(final List<ServerName> serverNames) {
     long splitTime = 0, splitLogSize = 0;
     List<Path> logDirs = new ArrayList<Path>();
-    for(ServerName serverName: serverNames){
-      Path logDir = new Path(this.rootdir,
-        HLog.getHLogDirectoryName(serverName.toString()));
-      Path splitDir = logDir.suffix(HLog.SPLITTING_EXT);
-      // rename the directory so a rogue RS doesn't create more HLogs
-      if (fs.exists(logDir)) {
+    int retrySplittingNum = conf.getInt("hbase.hlog.split.retry.num", 2);
+    do {
+      try {
+        for (ServerName serverName : serverNames) {
+          Path logDir = new Path(this.rootdir,
+              HLog.getHLogDirectoryName(serverName.toString()));
+          Path splitDir = logDir.suffix(HLog.SPLITTING_EXT);
+          // rename the directory so a rogue RS doesn't create more HLogs
+          if (fs.exists(logDir)) {

[... 11 lines elided ...]

-    if (logDirs.isEmpty()) {
-      LOG.info("No logs to split");
-      return;
-    }
-
-    if (distributedLogSplitting) {
-      splitLogManager.handleDeadWorkers(serverNames);
-      splitTime = EnvironmentEdgeManager.currentTimeMillis();
-      splitLogSize = splitLogManager.splitLogDistributed(logDirs);
-      splitTime = EnvironmentEdgeManager.currentTimeMillis() - splitTime;
-    } else {
-      for(Path logDir: logDirs){
-        // splitLogLock ensures that dead region servers' logs are processed
-        // one at a time
-        this.splitLogLock.lock();
-        try {
-          HLogSplitter splitter = HLogSplitter.createLogSplitter(
-            conf, rootdir, logDir, oldLogDir, this.fs);
-          try {
-            // If FS is in safe mode, just wait till out of it.
-            FSUtils.waitOnSafeMode(conf, conf.getInt(HConstants.THREAD_WAKE_FREQUENCY, 1000));
-            splitter.splitLog();
-          } catch (OrphanHLogAfterSplitException e) {
-            LOG.warn("Retrying splitting because of:", e);
-            //An HLogSplitter instance can only be used once.  Get new instance.
-            splitter = HLogSplitter.createLogSplitter(conf, rootdir, logDir,
-              oldLogDir, this.fs);
-            splitter.splitLog();
-          }
-          splitTime = splitter.getTime();
-          splitLogSize = splitter.getSize();
-        } finally {
-          this.splitLogLock.unlock();
-        }
-      }
-    }
+        if (logDirs.isEmpty()) {
+          LOG.info("No logs to split");
+          return;
+        }
+        if (distributedLogSplitting) {
+          splitLogManager.handleDeadWorkers(serverNames);
+          splitTime = EnvironmentEdgeManager.currentTimeMillis();
+          splitLogSize = splitLogManager.splitLogDistributed(logDirs);
+          splitTime = EnvironmentEdgeManager.currentTimeMillis() - splitTime;
+        } else {
+          for (Path logDir : logDirs) {
+            // splitLogLock ensures that dead region servers' logs are processed
+            // one at a time
+            this.splitLogLock.lock();
+            try {
+              HLogSplitter splitter = HLogSplitter.createLogSplitter(conf,
+                  rootdir, logDir, oldLogDir, this.fs);
+              // If FS is in safe mode, just wait till out of it.
+              FSUtils.waitOnSafeMode(conf,
+                  conf.getInt(HConstants.THREAD_WAKE_FREQUENCY, 1000));
+              splitter.splitLog();
+              splitTime = splitter.getTime();
+              splitLogSize = splitter.getSize();
+            } finally {
+              this.splitLogLock.unlock();
+            }
+          }
+        }
+        retrySplittingNum = 0;
+      } catch (IOException e) {
+        LOG.warn("Failed splitting log of" + serverNames, e);
+        if (!checkFileSystem()) {
+          LOG.warn("Bad Filesystem, exiting");
+          retrySplittingNum = 0;
+          Runtime.getRuntime().halt(1);
+        } else {
+          if (retrySplittingNum > 0) {
+            LOG.info("Retry splitting log, remanent times = "
+                + retrySplittingNum);
+          } else {
+            master.abort("Failed splitting log after retry", e);
+          }
+        }
+      }
+    } while (retrySplittingNum-- > 0);

     if (this.metrics != null) {
       this.metrics.addSplit(splitTime, splitLogSize);
     }
   }

[... 217 unchanged lines ...]
/src/main/java/org/apache/hadoop/hbase/master/handler/ServerShutdownHandler.java
Revision 1297323 vs. New Change (diff not expanded on this page)
 