/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.mapreduce;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.UUID;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.mapreduce.Import.CellImporter;
import org.apache.hadoop.hbase.mapreduce.Import.Importer;
import org.apache.hadoop.hbase.tool.LoadIncrementalHFiles;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.CommonFSUtils;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.yetus.audience.InterfaceAudience;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Tool used to copy a table to another one which can be on a different setup. It is also
 * configurable with a start and end time as well as a specification of the region server
 * implementation if different from the local cluster.
 */
@InterfaceAudience.Public
public class CopyTable extends Configured implements Tool {
  private static final Logger LOG = LoggerFactory.getLogger(CopyTable.class);

  static final String NAME = "copytable";

  // Scan time range, set from --starttime / --endtime.
  long startTime = 0;
  long endTime = HConstants.LATEST_TIMESTAMP;
  // Scan batch size (--batch) and scanner caching (--cacheRow); a non-positive cacheRow means
  // "use the configured hbase client scanner caching".
  int batch = Integer.MAX_VALUE;
  int cacheRow = -1;
  // Number of cell versions to read (--versions); negative means "do not set".
  int versions = -1;
  // Source table name (last positional argument when --snapshot is not given).
  String tableName = null;
  String startRow = null;
  String stopRow = null;
  // Destination table (--new.name); defaults to the source table name when omitted.
  String dstTableName = null;
  String peerAddress = null;
  // Raw value of --families: comma-separated "cf" or "sourceCf:destCf" entries.
  String families = null;
  boolean allCells = false;
  // NOTE(review): this flag is static but is written from the instance method doCommandLine(),
  // so once set it stays set for every CopyTable in the JVM. Kept static for compatibility.
  static boolean shuffle = false;

  boolean bulkload = false;
  // Directory the HFiles are written to when bulkload is enabled; set in createSubmittableJob().
  Path bulkloadDir = null;

  // When true, the last positional argument is a snapshot name instead of a table name.
  boolean readingSnapshot = false;
  String snapshot = null;

  private static final String JOB_NAME_CONF_KEY = "mapreduce.job.name";

  /**
   * Builds a path of the form {@code <working dir>/copytable/<random UUID>} on the current file
   * system, creating the {@code copytable} parent directory if it does not exist yet.
   * @param withDirCreated whether the UUID-named directory itself should be created as well
   * @return the (possibly not yet existing) unique directory
   * @throws IOException if a file system operation fails
   */
  private Path generateUniqTempDir(boolean withDirCreated) throws IOException {
    FileSystem fs = CommonFSUtils.getCurrentFileSystem(getConf());
    Path dir = new Path(fs.getWorkingDirectory(), NAME);
    if (!fs.exists(dir)) {
      fs.mkdirs(dir);
    }
    Path newDir = new Path(dir, UUID.randomUUID().toString());
    if (withDirCreated) {
      fs.mkdirs(newDir);
    }
    return newDir;
  }

  /**
   * Configures the map side of the job: the mapper is {@link CellImporter} for bulkload and
   * {@link Importer} otherwise, and the input is either the snapshot or the source table.
   * @param job  the job being configured
   * @param scan the scan describing the data to copy
   * @throws IOException if the job cannot be initialized
   */
  private void initCopyTableMapperReducerJob(Job job, Scan scan) throws IOException {
    Class<? extends TableMapper> mapper = bulkload ? CellImporter.class : Importer.class;
    if (readingSnapshot) {
      TableMapReduceUtil.initTableSnapshotMapperJob(snapshot, scan, mapper, null, null, job, true,
        generateUniqTempDir(true));
    } else {
      TableMapReduceUtil.initTableMapperJob(tableName, scan, mapper, null, null, job);
    }
  }

  /**
   * Sets up the actual job.
   * @param args The command line parameters.
   * @return The newly created job, or {@code null} if the command line could not be parsed or
   *         failed validation (a usage message has been printed in that case).
   * @throws IOException When setting up the job fails.
   */
  public Job createSubmittableJob(String[] args) throws IOException {
    if (!doCommandLine(args)) {
      return null;
    }

    String jobName = NAME + "_" + (tableName == null ? snapshot : tableName);
    Job job = Job.getInstance(getConf(), getConf().get(JOB_NAME_CONF_KEY, jobName));
    job.setJarByClass(CopyTable.class);
    Scan scan = new Scan();

    scan.setBatch(batch);
    scan.setCacheBlocks(false);

    if (cacheRow > 0) {
      scan.setCaching(cacheRow);
    } else {
      scan.setCaching(getConf().getInt(HConstants.HBASE_CLIENT_SCANNER_CACHING, 100));
    }

    scan.setTimeRange(startTime, endTime);

    if (allCells) {
      scan.setRaw(true);
    }
    if (shuffle) {
      job.getConfiguration().set(TableInputFormat.SHUFFLE_MAPS, "true");
    }
    if (versions >= 0) {
      scan.readVersions(versions);
    }

    if (startRow != null) {
      scan.withStartRow(Bytes.toBytesBinary(startRow));
    }

    if (stopRow != null) {
      scan.withStopRow(Bytes.toBytesBinary(stopRow));
    }

    if (families != null) {
      String[] fams = families.split(",");
      Map<String, String> cfRenameMap = new HashMap<>();
      for (String fam : fams) {
        String sourceCf;
        if (fam.contains(":")) {
          // fam looks like "sourceCfName:destCfName"
          String[] srcAndDest = fam.split(":", 2);
          sourceCf = srcAndDest[0];
          String destCf = srcAndDest[1];
          cfRenameMap.put(sourceCf, destCf);
        } else {
          // fam is just "sourceCf"
          sourceCf = fam;
        }
        scan.addFamily(Bytes.toBytes(sourceCf));
      }
      Import.configureCfRenaming(job.getConfiguration(), cfRenameMap);
    }
    job.setNumReduceTasks(0);

    // The map side is configured the same way for both output modes.
    initCopyTableMapperReducerJob(job, scan);

    if (bulkload) {
      // We need to split the inputs by destination tables so that output of Map can be bulk-loaded.
      TableInputFormat.configureSplitTable(job, TableName.valueOf(dstTableName));

      bulkloadDir = generateUniqTempDir(false);
      LOG.info("HFiles will be stored at {}", bulkloadDir);
      HFileOutputFormat2.setOutputPath(job, bulkloadDir);
      try (Connection conn = ConnectionFactory.createConnection(getConf());
        Admin admin = conn.getAdmin()) {
        HFileOutputFormat2.configureIncrementalLoadMap(job,
          admin.getDescriptor((TableName.valueOf(dstTableName))));
      }
    } else {
      TableMapReduceUtil.initTableReducerJob(dstTableName, null, job, null, peerAddress);
    }

    return job;
  }

  /**
   * Prints the usage text to standard error, preceded by an error line when one is given.
   * @param errorMsg Error message. Can be null.
   */
  private static void printUsage(final String errorMsg) {
    if (errorMsg != null && errorMsg.length() > 0) {
      System.err.println("ERROR: " + errorMsg);
    }
    System.err.println("Usage: CopyTable [general options] [--starttime=X] [--endtime=Y] "
      + "[--new.name=NEW] [--peer.adr=ADR] <tablename | snapshotName>");
    System.err.println();
    System.err.println("Options:");
    System.err.println(" rs.class hbase.regionserver.class of the peer cluster");
    System.err.println(" specify if different from current cluster");
    System.err.println(" rs.impl hbase.regionserver.impl of the peer cluster");
    System.err.println(" startrow the start row");
    System.err.println(" stoprow the stop row");
    System.err.println(" starttime beginning of the time range (unixtime in millis)");
    System.err.println(" without endtime means from starttime to forever");
    System.err.println(" endtime end of the time range. Ignored if no starttime specified.");
    System.err.println(" versions number of cell versions to copy");
    System.err.println(" new.name new table's name");
    System.err.println(" peer.adr Address of the peer cluster given in the format");
    System.err.println(" hbase.zookeeper.quorum:hbase.zookeeper.client"
      + ".port:zookeeper.znode.parent");
    System.err.println(" families comma-separated list of families to copy");
    System.err.println(" To copy from cf1 to cf2, give sourceCfName:destCfName. ");
    System.err.println(" To keep the same name, just give \"cfName\"");
    System.err.println(" all.cells also copy delete markers and deleted cells");
    System.err
      .println(" bulkload Write input into HFiles and bulk load to the destination " + "table");
    System.err.println(" snapshot Copy the data from snapshot to destination table.");
    System.err.println();
    System.err.println("Args:");
    System.err.println(" tablename Name of the table to copy");
    System.err.println();
    System.err.println("Examples:");
    System.err
      .println(" To copy 'TestTable' to a cluster that uses replication for a 1 hour window:");
    System.err.println(" $ hbase "
      + "org.apache.hadoop.hbase.mapreduce.CopyTable --starttime=1265875194289 --endtime=1265878794289 "
      + "--peer.adr=server1,server2,server3:2181:/hbase --families=myOldCf:myNewCf,cf2,cf3 TestTable ");
    System.err.println(" To copy data from 'sourceTableSnapshot' to 'destTable': ");
    System.err.println(" $ hbase org.apache.hadoop.hbase.mapreduce.CopyTable "
      + "--snapshot --new.name=destTable sourceTableSnapshot");
    System.err.println(" To copy data from 'sourceTableSnapshot' and bulk load to 'destTable': ");
    System.err.println(" $ hbase org.apache.hadoop.hbase.mapreduce.CopyTable "
      + "--new.name=destTable --snapshot --bulkload sourceTableSnapshot");
    System.err.println("For performance consider the following general option:\n"
      + " It is recommended that you set the following to >=100. A higher value uses more memory but\n"
      + " decreases the round trip time to the server and may increase performance.\n"
      + " -Dhbase.client.scanner.caching=100\n"
      + " The following should always be set to false, to prevent writing data twice, which may produce \n"
      + " inaccurate results.\n" + " -Dmapreduce.map.speculative=false");
  }

  /**
   * Parses the command line into this instance's fields and validates the combination of
   * options. On any problem a usage message is printed and {@code false} is returned.
   * @param args the command line arguments; the source table (or snapshot) name must be last
   * @return {@code true} if the arguments are valid and the job can be set up
   */
  private boolean doCommandLine(final String[] args) {
    if (args.length < 1) {
      printUsage(null);
      return false;
    }
    try {
      for (int i = 0; i < args.length; i++) {
        String cmd = args[i];
        if (cmd.equals("-h") || cmd.startsWith("--h")) {
          printUsage(null);
          return false;
        }

        final String startRowArgKey = "--startrow=";
        if (cmd.startsWith(startRowArgKey)) {
          startRow = cmd.substring(startRowArgKey.length());
          continue;
        }

        final String stopRowArgKey = "--stoprow=";
        if (cmd.startsWith(stopRowArgKey)) {
          stopRow = cmd.substring(stopRowArgKey.length());
          continue;
        }

        final String startTimeArgKey = "--starttime=";
        if (cmd.startsWith(startTimeArgKey)) {
          startTime = Long.parseLong(cmd.substring(startTimeArgKey.length()));
          continue;
        }

        final String endTimeArgKey = "--endtime=";
        if (cmd.startsWith(endTimeArgKey)) {
          endTime = Long.parseLong(cmd.substring(endTimeArgKey.length()));
          continue;
        }

        final String batchArgKey = "--batch=";
        if (cmd.startsWith(batchArgKey)) {
          batch = Integer.parseInt(cmd.substring(batchArgKey.length()));
          continue;
        }

        final String cacheRowArgKey = "--cacheRow=";
        if (cmd.startsWith(cacheRowArgKey)) {
          cacheRow = Integer.parseInt(cmd.substring(cacheRowArgKey.length()));
          continue;
        }

        final String versionsArgKey = "--versions=";
        if (cmd.startsWith(versionsArgKey)) {
          versions = Integer.parseInt(cmd.substring(versionsArgKey.length()));
          continue;
        }

        final String newNameArgKey = "--new.name=";
        if (cmd.startsWith(newNameArgKey)) {
          dstTableName = cmd.substring(newNameArgKey.length());
          continue;
        }

        final String peerAdrArgKey = "--peer.adr=";
        if (cmd.startsWith(peerAdrArgKey)) {
          peerAddress = cmd.substring(peerAdrArgKey.length());
          continue;
        }

        final String familiesArgKey = "--families=";
        if (cmd.startsWith(familiesArgKey)) {
          families = cmd.substring(familiesArgKey.length());
          continue;
        }

        if (cmd.startsWith("--all.cells")) {
          allCells = true;
          continue;
        }

        if (cmd.startsWith("--bulkload")) {
          bulkload = true;
          continue;
        }

        if (cmd.startsWith("--shuffle")) {
          shuffle = true;
          continue;
        }

        if (cmd.startsWith("--snapshot")) {
          readingSnapshot = true;
          continue;
        }

        // Anything that is not a recognized option is only accepted in the last position, where
        // it names the source snapshot or table.
        if (i == args.length - 1) {
          if (readingSnapshot) {
            snapshot = cmd;
          } else {
            tableName = cmd;
          }
        } else {
          printUsage("Invalid argument '" + cmd + "'");
          return false;
        }
      }
      if (dstTableName == null && peerAddress == null) {
        printUsage("At least a new table name or a peer address must be specified");
        return false;
      }
      if ((endTime != 0) && (startTime > endTime)) {
        printUsage("Invalid time range filter: starttime=" + startTime + " > endtime=" + endTime);
        return false;
      }

      if (bulkload && peerAddress != null) {
        printUsage("Remote bulkload is not supported!");
        return false;
      }

      if (readingSnapshot && peerAddress != null) {
        printUsage("Loading data from snapshot to remote peer cluster is not supported.");
        return false;
      }

      if (readingSnapshot && dstTableName == null) {
        printUsage("The --new.name=<table> for destination table should be "
          + "provided when copying data from snapshot .");
        return false;
      }

      if (readingSnapshot && snapshot == null) {
        printUsage("Snapshot shouldn't be null when --snapshot is enabled.");
        return false;
      }

      // The last argument was an option, so no source table was named. Fail fast with a usage
      // message rather than handing a null table name to the job setup.
      if (!readingSnapshot && tableName == null) {
        printUsage("The name of the table to copy must be given as the last argument.");
        return false;
      }

      // set dstTableName if necessary
      if (dstTableName == null) {
        dstTableName = tableName;
      }
    } catch (Exception e) {
      // Typically a NumberFormatException from one of the numeric options.
      LOG.error("Failed to parse the command line arguments", e);
      printUsage("Can't start because " + e.getMessage());
      return false;
    }
    return true;
  }

  /**
   * Main entry point.
   * @param args The command line parameters.
   * @throws Exception When running the job fails.
   */
  public static void main(String[] args) throws Exception {
    int ret = ToolRunner.run(HBaseConfiguration.create(), new CopyTable(), args);
    System.exit(ret);
  }

  /**
   * Creates and runs the copy job and, when bulkload is enabled, loads the generated HFiles
   * into the destination table and removes the HFile directory afterwards.
   * @param args The command line parameters.
   * @return 0 on success; 1 if the arguments are invalid, the job fails or the HFile directory
   *         cannot be deleted; otherwise the exit code of the bulk load
   * @throws Exception When running the job fails.
   */
  @Override
  public int run(String[] args) throws Exception {
    Job job = createSubmittableJob(args);
    if (job == null) {
      return 1;
    }
    if (!job.waitForCompletion(true)) {
      LOG.info("Map-reduce job failed!");
      if (bulkload) {
        LOG.info("Files are not bulkloaded!");
      }
      return 1;
    }
    int code = 0;
    if (bulkload) {
      LOG.info("Trying to bulk load data to destination table: {}", dstTableName);
      LOG.info("command: ./bin/hbase org.apache.hadoop.hbase.tool.LoadIncrementalHFiles {} {}",
        this.bulkloadDir.toString(), this.dstTableName);
      code = new LoadIncrementalHFiles(this.getConf())
        .run(new String[] { this.bulkloadDir.toString(), this.dstTableName });
      if (code == 0) {
        // bulkloadDir is deleted only LoadIncrementalHFiles was successful so that one can rerun
        // LoadIncrementalHFiles.
        FileSystem fs = CommonFSUtils.getCurrentFileSystem(getConf());
        if (!fs.delete(this.bulkloadDir, true)) {
          LOG.error("Deleting folder {} failed!", bulkloadDir);
          code = 1;
        }
      } else {
        LOG.error("Bulk load to {} exited with code {}; HFiles are kept in {}", dstTableName, code,
          bulkloadDir);
      }
    }
    return code;
  }
}