Commit e9cd9f74 authored by Matthew Oliver's avatar Matthew Oliver Committed by Tim Burke

sharder: Keep cleaving on empty shard ranges

When a container is being cleaved there is a possiblity that we're
dealing with an empty or near empty container created on a handoff node.
These containers may have a valid list of shard ranges, so would need
to cleave to the new shards.
Currently, when using a `cleave_batch_size` that is smaller then the
number of shard ranges on the cleaving container, these containers will
have to take a few shard passes to shard, even though there maybe
nothing in them.

This is worse if a really large container is sharding, and due to being
slow, error limitted a node causing a new container on a handoff
location. This empty container would have a large number of shard ranges
and could take a _very_ long time to shard away, slowing the process
down.

This patch eliminates the issue by detecting when no objects are
returned for a shard range. The `_cleave_shard_range` method now
returns 3 possible results:

  - CLEAVE_SUCCESS
  - CLEAVE_FAILED
  - CLEAVE_EMPTY

They all are pretty self explanitory. When `CLEAVE_EMPTY` is returned
the code will:

  - Log
  - Not replicate the empty temp shard container sitting in a
    handoff location
  - Not count the shard range in the `cleave_batch_size` count
  - Update the cleaving context so sharding can move forward

If there already is a shard range DB existing on a handoff node to use
then the sharder wont skip it, even if there are no objects, it'll
replicate it and treat it as normal, including using a `cleave_batch_size`
slot.

Change-Id: Id338f6c3187f93454bcdf025a32a073284a4a159
Closes-Bug: #1839355
parent 6fbbaa91
...@@ -40,6 +40,11 @@ from swift.container.backend import ContainerBroker, \ ...@@ -40,6 +40,11 @@ from swift.container.backend import ContainerBroker, \
from swift.container.replicator import ContainerReplicator from swift.container.replicator import ContainerReplicator
CLEAVE_SUCCESS = 0
CLEAVE_FAILED = 1
CLEAVE_EMPTY = 2
def sharding_enabled(broker): def sharding_enabled(broker):
# NB all shards will by default have been created with # NB all shards will by default have been created with
# X-Container-Sysmeta-Sharding set and will therefore be candidates for # X-Container-Sysmeta-Sharding set and will therefore be candidates for
...@@ -636,6 +641,7 @@ class ContainerSharder(ContainerReplicator): ...@@ -636,6 +641,7 @@ class ContainerSharder(ContainerReplicator):
""" """
part = self.ring.get_part(shard_range.account, shard_range.container) part = self.ring.get_part(shard_range.account, shard_range.container)
node = self.find_local_handoff_for_part(part) node = self.find_local_handoff_for_part(part)
put_timestamp = Timestamp.now().internal
if not node: if not node:
raise DeviceUnavailable( raise DeviceUnavailable(
'No mounted devices found suitable for creating shard broker ' 'No mounted devices found suitable for creating shard broker '
...@@ -644,7 +650,7 @@ class ContainerSharder(ContainerReplicator): ...@@ -644,7 +650,7 @@ class ContainerSharder(ContainerReplicator):
shard_broker = ContainerBroker.create_broker( shard_broker = ContainerBroker.create_broker(
os.path.join(self.root, node['device']), part, shard_range.account, os.path.join(self.root, node['device']), part, shard_range.account,
shard_range.container, epoch=shard_range.epoch, shard_range.container, epoch=shard_range.epoch,
storage_policy_index=policy_index) storage_policy_index=policy_index, put_timestamp=put_timestamp)
# Get the valid info into the broker.container, etc # Get the valid info into the broker.container, etc
shard_broker.get_info() shard_broker.get_info()
...@@ -654,7 +660,7 @@ class ContainerSharder(ContainerReplicator): ...@@ -654,7 +660,7 @@ class ContainerSharder(ContainerReplicator):
'X-Container-Sysmeta-Sharding': 'X-Container-Sysmeta-Sharding':
('True', Timestamp.now().internal)}) ('True', Timestamp.now().internal)})
return part, shard_broker, node['id'] return part, shard_broker, node['id'], put_timestamp
def _audit_root_container(self, broker): def _audit_root_container(self, broker):
# This is the root container, and therefore the tome of knowledge, # This is the root container, and therefore the tome of knowledge,
...@@ -844,7 +850,7 @@ class ContainerSharder(ContainerReplicator): ...@@ -844,7 +850,7 @@ class ContainerSharder(ContainerReplicator):
last_index = next_index = 0 last_index = next_index = 0
for obj in objs: for obj in objs:
if dest_shard_range is None: if dest_shard_range is None:
# no more destinations: yield remainder of batch and return # no more destinations: yield remainder of batch and bail
# NB there may be more batches of objects but none of them # NB there may be more batches of objects but none of them
# will be placed so no point fetching them # will be placed so no point fetching them
yield objs[last_index:], None, info yield objs[last_index:], None, info
...@@ -933,7 +939,7 @@ class ContainerSharder(ContainerReplicator): ...@@ -933,7 +939,7 @@ class ContainerSharder(ContainerReplicator):
continue continue
if dest_shard_range not in dest_brokers: if dest_shard_range not in dest_brokers:
part, dest_broker, node_id = self._get_shard_broker( part, dest_broker, node_id, _junk = self._get_shard_broker(
dest_shard_range, src_broker.root_path, policy_index) dest_shard_range, src_broker.root_path, policy_index)
# save the broker info that was sampled prior to the *first* # save the broker info that was sampled prior to the *first*
# yielded objects for this destination # yielded objects for this destination
...@@ -1162,12 +1168,15 @@ class ContainerSharder(ContainerReplicator): ...@@ -1162,12 +1168,15 @@ class ContainerSharder(ContainerReplicator):
start = time.time() start = time.time()
policy_index = broker.storage_policy_index policy_index = broker.storage_policy_index
try: try:
shard_part, shard_broker, node_id = self._get_shard_broker( shard_part, shard_broker, node_id, put_timestamp = \
shard_range, broker.root_path, policy_index) self._get_shard_broker(shard_range, broker.root_path,
policy_index)
except DeviceUnavailable as duex: except DeviceUnavailable as duex:
self.logger.warning(str(duex)) self.logger.warning(str(duex))
self._increment_stat('cleaved', 'failure', statsd=True) self._increment_stat('cleaved', 'failure', statsd=True)
return False return CLEAVE_FAILED
own_shard_range = broker.get_own_shard_range()
# only cleave from the retiring db - misplaced objects handler will # only cleave from the retiring db - misplaced objects handler will
# deal with any objects in the fresh db # deal with any objects in the fresh db
...@@ -1178,13 +1187,36 @@ class ContainerSharder(ContainerReplicator): ...@@ -1178,13 +1187,36 @@ class ContainerSharder(ContainerReplicator):
source_db_id = source_broker.get_info()['id'] source_db_id = source_broker.get_info()['id']
source_max_row = source_broker.get_max_row() source_max_row = source_broker.get_max_row()
sync_point = shard_broker.get_sync(source_db_id) sync_point = shard_broker.get_sync(source_db_id)
if sync_point < source_max_row: if sync_point < source_max_row or source_max_row == -1:
sync_from_row = max(cleaving_context.last_cleave_to_row or -1, sync_from_row = max(cleaving_context.last_cleave_to_row or -1,
sync_point) sync_point)
objects = None
for objects, info in self.yield_objects( for objects, info in self.yield_objects(
source_broker, shard_range, source_broker, shard_range,
since_row=sync_from_row): since_row=sync_from_row):
shard_broker.merge_items(objects) shard_broker.merge_items(objects)
if objects is None:
self.logger.info("Cleaving '%s': %r - zero objects found",
broker.path, shard_range)
if shard_broker.get_info()['put_timestamp'] == put_timestamp:
# This was just created; don't need to replicate this
# SR because there was nothing there. So cleanup and
# remove the shard_broker from its hand off location.
self.delete_db(shard_broker)
cleaving_context.cursor = shard_range.upper_str
cleaving_context.ranges_done += 1
cleaving_context.ranges_todo -= 1
if shard_range.upper >= own_shard_range.upper:
# cleaving complete
cleaving_context.cleaving_done = True
cleaving_context.store(broker)
# Because nothing was here we wont count it in the shard
# batch count.
return CLEAVE_EMPTY
# Else, it wasn't newly created by us, and
# we don't know what's in it or why. Let it get
# replicated and counted in the batch count.
# Note: the max row stored as a sync point is sampled *before* # Note: the max row stored as a sync point is sampled *before*
# objects are yielded to ensure that is less than or equal to # objects are yielded to ensure that is less than or equal to
# the last yielded row. Other sync points are also copied from the # the last yielded row. Other sync points are also copied from the
...@@ -1199,8 +1231,6 @@ class ContainerSharder(ContainerReplicator): ...@@ -1199,8 +1231,6 @@ class ContainerSharder(ContainerReplicator):
self.logger.debug("Cleaving '%s': %r - shard db already in sync", self.logger.debug("Cleaving '%s': %r - shard db already in sync",
broker.path, shard_range) broker.path, shard_range)
own_shard_range = broker.get_own_shard_range()
replication_quorum = self.existing_shard_replication_quorum replication_quorum = self.existing_shard_replication_quorum
if shard_range.includes(own_shard_range): if shard_range.includes(own_shard_range):
# When shrinking, include deleted own (donor) shard range in # When shrinking, include deleted own (donor) shard range in
...@@ -1242,7 +1272,7 @@ class ContainerSharder(ContainerReplicator): ...@@ -1242,7 +1272,7 @@ class ContainerSharder(ContainerReplicator):
'%s successes, %s required.', shard_range, broker.path, '%s successes, %s required.', shard_range, broker.path,
replication_successes, replication_quorum) replication_successes, replication_quorum)
self._increment_stat('cleaved', 'failure', statsd=True) self._increment_stat('cleaved', 'failure', statsd=True)
return False return CLEAVE_FAILED
elapsed = round(time.time() - start, 3) elapsed = round(time.time() - start, 3)
self._min_stat('cleaved', 'min_time', elapsed) self._min_stat('cleaved', 'min_time', elapsed)
...@@ -1259,7 +1289,7 @@ class ContainerSharder(ContainerReplicator): ...@@ -1259,7 +1289,7 @@ class ContainerSharder(ContainerReplicator):
'Cleaved %s for shard range %s in %gs.', 'Cleaved %s for shard range %s in %gs.',
broker.path, shard_range, elapsed) broker.path, shard_range, elapsed)
self._increment_stat('cleaved', 'success', statsd=True) self._increment_stat('cleaved', 'success', statsd=True)
return True return CLEAVE_SUCCESS
def _cleave(self, broker): def _cleave(self, broker):
# Returns True if misplaced objects have been moved and the entire # Returns True if misplaced objects have been moved and the entire
...@@ -1304,23 +1334,30 @@ class ContainerSharder(ContainerReplicator): ...@@ -1304,23 +1334,30 @@ class ContainerSharder(ContainerReplicator):
cleaving_context.ranges_todo, broker.path) cleaving_context.ranges_todo, broker.path)
ranges_done = [] ranges_done = []
for shard_range in ranges_todo[:self.cleave_batch_size]: for shard_range in ranges_todo:
if shard_range.state == ShardRange.FOUND: if shard_range.state == ShardRange.FOUND:
break break
elif shard_range.state in (ShardRange.CREATED, elif shard_range.state in (ShardRange.CREATED,
ShardRange.CLEAVED, ShardRange.CLEAVED,
ShardRange.ACTIVE): ShardRange.ACTIVE):
if self._cleave_shard_range( cleave_result = self._cleave_shard_range(
broker, cleaving_context, shard_range): broker, cleaving_context, shard_range)
if cleave_result == CLEAVE_SUCCESS:
ranges_done.append(shard_range) ranges_done.append(shard_range)
else: if len(ranges_done) == self.cleave_batch_size:
break
elif cleave_result == CLEAVE_FAILED:
break break
# else, no errors, but no rows found either. keep going,
# and don't count it against our batch size
else: else:
self.logger.warning('Unexpected shard range state for cleave', self.logger.warning('Unexpected shard range state for cleave',
shard_range.state) shard_range.state)
break break
if not ranges_done: if not ranges_done:
# _cleave_shard_range always store()s the context on success; make
# sure we *also* do that if we hit a failure right off the bat
cleaving_context.store(broker) cleaving_context.store(broker)
self.logger.debug( self.logger.debug(
'Cleaved %s shard ranges for %s', len(ranges_done), broker.path) 'Cleaved %s shard ranges for %s', len(ranges_done), broker.path)
......
...@@ -1321,6 +1321,65 @@ class TestSharder(BaseTestSharder): ...@@ -1321,6 +1321,65 @@ class TestSharder(BaseTestSharder):
self.assertEqual(8, context.cleave_to_row) self.assertEqual(8, context.cleave_to_row)
self.assertEqual(8, context.max_row) self.assertEqual(8, context.max_row)
def test_cleave_root_empty_db_with_ranges(self):
broker = self._make_broker()
broker.enable_sharding(Timestamp.now())
shard_bounds = (('', 'd'), ('d', 'x'), ('x', ''))
shard_ranges = self._make_shard_ranges(
shard_bounds, state=ShardRange.CREATED)
broker.merge_shard_ranges(shard_ranges)
self.assertTrue(broker.set_sharding_state())
sharder_conf = {'cleave_batch_size': 1}
with self._mock_sharder(sharder_conf) as sharder:
self.assertTrue(sharder._cleave(broker))
info_lines = sharder.logger.get_lines_for_level('info')
expected_zero_obj = [line for line in info_lines
if " - zero objects found" in line]
self.assertEqual(len(expected_zero_obj), len(shard_bounds))
cleaving_context = CleavingContext.load(broker)
# even though there is a cleave_batch_size of 1, we don't count empty
# ranges when cleaving seeing as they aren't replicated
self.assertEqual(cleaving_context.ranges_done, 3)
self.assertEqual(cleaving_context.ranges_todo, 0)
self.assertTrue(cleaving_context.cleaving_done)
def test_cleave_root_empty_db_with_pre_existing_shard_db_handoff(self):
broker = self._make_broker()
broker.enable_sharding(Timestamp.now())
shard_bounds = (('', 'd'), ('d', 'x'), ('x', ''))
shard_ranges = self._make_shard_ranges(
shard_bounds, state=ShardRange.CREATED)
broker.merge_shard_ranges(shard_ranges)
self.assertTrue(broker.set_sharding_state())
sharder_conf = {'cleave_batch_size': 1}
with self._mock_sharder(sharder_conf) as sharder:
# pre-create a shard broker on a handoff location. This will force
# the sharder to not skip it but instead force to replicate it and
# use up a cleave_batch_size count.
sharder._get_shard_broker(shard_ranges[0], broker.root_path,
0)
self.assertFalse(sharder._cleave(broker))
info_lines = sharder.logger.get_lines_for_level('info')
expected_zero_obj = [line for line in info_lines
if " - zero objects found" in line]
self.assertEqual(len(expected_zero_obj), 1)
cleaving_context = CleavingContext.load(broker)
# even though there is a cleave_batch_size of 1, we don't count empty
# ranges when cleaving seeing as they aren't replicated
self.assertEqual(cleaving_context.ranges_done, 1)
self.assertEqual(cleaving_context.ranges_todo, 2)
self.assertFalse(cleaving_context.cleaving_done)
def test_cleave_shard(self): def test_cleave_shard(self):
broker = self._make_broker(account='.shards_a', container='shard_c') broker = self._make_broker(account='.shards_a', container='shard_c')
own_shard_range = ShardRange( own_shard_range = ShardRange(
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment