1 // Copyright (C) 2019 The Android Open Source Project 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 #include <libsnapshot/snapshot.h> 16 17 #include <dirent.h> 18 #include <fcntl.h> 19 #include <math.h> 20 #include <sys/file.h> 21 #include <sys/types.h> 22 #include <sys/unistd.h> 23 24 #include <filesystem> 25 #include <optional> 26 #include <thread> 27 #include <unordered_set> 28 29 #include <android-base/file.h> 30 #include <android-base/logging.h> 31 #include <android-base/parseint.h> 32 #include <android-base/properties.h> 33 #include <android-base/strings.h> 34 #include <android-base/unique_fd.h> 35 #include <cutils/sockets.h> 36 #include <ext4_utils/ext4_utils.h> 37 #include <fs_mgr.h> 38 #include <fs_mgr/file_wait.h> 39 #include <fs_mgr_dm_linear.h> 40 #include <fstab/fstab.h> 41 #include <libdm/dm.h> 42 #include <libfiemap/image_manager.h> 43 #include <liblp/liblp.h> 44 45 #include <android/snapshot/snapshot.pb.h> 46 #include <libsnapshot/snapshot_stats.h> 47 #include "device_info.h" 48 #include "partition_cow_creator.h" 49 #include "snapshot_metadata_updater.h" 50 #include "snapshot_reader.h" 51 #include "utility.h" 52 53 namespace android { 54 namespace snapshot { 55 56 using aidl::android::hardware::boot::MergeStatus; 57 using android::base::unique_fd; 58 using android::dm::DeviceMapper; 59 using android::dm::DmDeviceState; 60 using android::dm::DmTable; 61 using android::dm::DmTargetLinear; 62 using android::dm::DmTargetSnapshot; 63 using android::dm::DmTargetUser; 64 using android::dm::kSectorSize; 65 using android::dm::SnapshotStorageMode; 66 using android::fiemap::FiemapStatus; 67 using android::fiemap::IImageManager; 68 using android::fs_mgr::CreateDmTable; 69 using android::fs_mgr::CreateLogicalPartition; 70 using android::fs_mgr::CreateLogicalPartitionParams; 71 using android::fs_mgr::GetPartitionGroupName; 72 using android::fs_mgr::GetPartitionName; 73 using android::fs_mgr::LpMetadata; 74 using android::fs_mgr::MetadataBuilder; 75 using android::fs_mgr::SlotNumberForSlotSuffix; 76 using chromeos_update_engine::DeltaArchiveManifest; 77 using chromeos_update_engine::Extent; 78 using chromeos_update_engine::FileDescriptor; 79 using chromeos_update_engine::PartitionUpdate; 80 template <typename T> 81 using RepeatedPtrField = google::protobuf::RepeatedPtrField<T>; 82 using std::chrono::duration_cast; 83 using namespace std::chrono_literals; 84 using namespace std::string_literals; 85 86 static constexpr char kBootIndicatorPath[] = "/metadata/ota/snapshot-boot"; 87 static constexpr char kRollbackIndicatorPath[] = "/metadata/ota/rollback-indicator"; 88 static constexpr auto kUpdateStateCheckInterval = 2s; 89 90 MergeFailureCode CheckMergeConsistency(const std::string& name, const SnapshotStatus& status); 91 92 // Note: IImageManager is an incomplete type in the header, so the default 93 // destructor doesn't work. ~SnapshotManager()94 SnapshotManager::~SnapshotManager() {} 95 New(IDeviceInfo * info)96 std::unique_ptr<SnapshotManager> SnapshotManager::New(IDeviceInfo* info) { 97 if (!info) { 98 info = new DeviceInfo(); 99 } 100 101 return std::unique_ptr<SnapshotManager>(new SnapshotManager(info)); 102 } 103 NewForFirstStageMount(IDeviceInfo * info)104 std::unique_ptr<SnapshotManager> SnapshotManager::NewForFirstStageMount(IDeviceInfo* info) { 105 if (!info) { 106 DeviceInfo* impl = new DeviceInfo(); 107 impl->set_first_stage_init(true); 108 info = impl; 109 } 110 auto sm = New(info); 111 112 // The first-stage version of snapuserd is explicitly started by init. Do 113 // not attempt to using it during tests (which run in normal AOSP). 114 if (!sm->device()->IsTestDevice()) { 115 sm->use_first_stage_snapuserd_ = true; 116 } 117 return sm; 118 } 119 SnapshotManager(IDeviceInfo * device)120 SnapshotManager::SnapshotManager(IDeviceInfo* device) 121 : dm_(device->GetDeviceMapper()), device_(device), metadata_dir_(device_->GetMetadataDir()) { 122 merge_consistency_checker_ = android::snapshot::CheckMergeConsistency; 123 } 124 GetCowName(const std::string & snapshot_name)125 static std::string GetCowName(const std::string& snapshot_name) { 126 return snapshot_name + "-cow"; 127 } 128 GetSnapshotDriver(LockedFile * lock)129 SnapshotManager::SnapshotDriver SnapshotManager::GetSnapshotDriver(LockedFile* lock) { 130 if (UpdateUsesUserSnapshots(lock)) { 131 return SnapshotManager::SnapshotDriver::DM_USER; 132 } else { 133 return SnapshotManager::SnapshotDriver::DM_SNAPSHOT; 134 } 135 } 136 GetDmUserCowName(const std::string & snapshot_name,SnapshotManager::SnapshotDriver driver)137 static std::string GetDmUserCowName(const std::string& snapshot_name, 138 SnapshotManager::SnapshotDriver driver) { 139 // dm-user block device will act as a snapshot device. We identify it with 140 // the same partition name so that when partitions can be mounted off 141 // dm-user. 142 143 switch (driver) { 144 case SnapshotManager::SnapshotDriver::DM_USER: { 145 return snapshot_name; 146 } 147 148 case SnapshotManager::SnapshotDriver::DM_SNAPSHOT: { 149 return snapshot_name + "-user-cow"; 150 } 151 152 default: { 153 LOG(ERROR) << "Invalid snapshot driver"; 154 return ""; 155 } 156 } 157 } 158 GetCowImageDeviceName(const std::string & snapshot_name)159 static std::string GetCowImageDeviceName(const std::string& snapshot_name) { 160 return snapshot_name + "-cow-img"; 161 } 162 GetBaseDeviceName(const std::string & partition_name)163 static std::string GetBaseDeviceName(const std::string& partition_name) { 164 return partition_name + "-base"; 165 } 166 GetSourceDeviceName(const std::string & partition_name)167 static std::string GetSourceDeviceName(const std::string& partition_name) { 168 return partition_name + "-src"; 169 } 170 BeginUpdate()171 bool SnapshotManager::BeginUpdate() { 172 bool needs_merge = false; 173 if (!TryCancelUpdate(&needs_merge)) { 174 return false; 175 } 176 if (needs_merge) { 177 LOG(INFO) << "Wait for merge (if any) before beginning a new update."; 178 auto state = ProcessUpdateState(); 179 LOG(INFO) << "Merged with state = " << state; 180 } 181 182 auto file = LockExclusive(); 183 if (!file) return false; 184 185 // Purge the ImageManager just in case there is a corrupt lp_metadata file 186 // lying around. (NB: no need to return false on an error, we can let the 187 // update try to progress.) 188 if (EnsureImageManager()) { 189 images_->RemoveAllImages(); 190 } 191 192 // Clear any cached metadata (this allows re-using one manager across tests). 193 old_partition_metadata_ = nullptr; 194 195 auto state = ReadUpdateState(file.get()); 196 if (state != UpdateState::None) { 197 LOG(ERROR) << "An update is already in progress, cannot begin a new update"; 198 return false; 199 } 200 return WriteUpdateState(file.get(), UpdateState::Initiated); 201 } 202 CancelUpdate()203 bool SnapshotManager::CancelUpdate() { 204 bool needs_merge = false; 205 if (!TryCancelUpdate(&needs_merge)) { 206 return false; 207 } 208 if (needs_merge) { 209 LOG(ERROR) << "Cannot cancel update after it has completed or started merging"; 210 } 211 return !needs_merge; 212 } 213 TryCancelUpdate(bool * needs_merge)214 bool SnapshotManager::TryCancelUpdate(bool* needs_merge) { 215 *needs_merge = false; 216 217 auto file = LockExclusive(); 218 if (!file) return false; 219 220 UpdateState state = ReadUpdateState(file.get()); 221 if (state == UpdateState::None) { 222 RemoveInvalidSnapshots(file.get()); 223 return true; 224 } 225 226 if (state == UpdateState::Initiated) { 227 LOG(INFO) << "Update has been initiated, now canceling"; 228 return RemoveAllUpdateState(file.get()); 229 } 230 231 if (state == UpdateState::Unverified) { 232 // We completed an update, but it can still be canceled if we haven't booted into it. 233 auto slot = GetCurrentSlot(); 234 if (slot != Slot::Target) { 235 LOG(INFO) << "Canceling previously completed updates (if any)"; 236 return RemoveAllUpdateState(file.get()); 237 } 238 } 239 *needs_merge = true; 240 return true; 241 } 242 ReadUpdateSourceSlotSuffix()243 std::string SnapshotManager::ReadUpdateSourceSlotSuffix() { 244 auto boot_file = GetSnapshotBootIndicatorPath(); 245 std::string contents; 246 if (!android::base::ReadFileToString(boot_file, &contents)) { 247 PLOG(WARNING) << "Cannot read " << boot_file; 248 return {}; 249 } 250 return contents; 251 } 252 GetCurrentSlot()253 SnapshotManager::Slot SnapshotManager::GetCurrentSlot() { 254 auto contents = ReadUpdateSourceSlotSuffix(); 255 if (contents.empty()) { 256 return Slot::Unknown; 257 } 258 if (device_->GetSlotSuffix() == contents) { 259 return Slot::Source; 260 } 261 return Slot::Target; 262 } 263 GetSnapshotSlotSuffix()264 std::string SnapshotManager::GetSnapshotSlotSuffix() { 265 switch (GetCurrentSlot()) { 266 case Slot::Target: 267 return device_->GetSlotSuffix(); 268 default: 269 return device_->GetOtherSlotSuffix(); 270 } 271 } 272 RemoveFileIfExists(const std::string & path)273 static bool RemoveFileIfExists(const std::string& path) { 274 std::string message; 275 if (!android::base::RemoveFileIfExists(path, &message)) { 276 LOG(ERROR) << "Remove failed: " << path << ": " << message; 277 return false; 278 } 279 return true; 280 } 281 RemoveAllUpdateState(LockedFile * lock,const std::function<bool ()> & prolog)282 bool SnapshotManager::RemoveAllUpdateState(LockedFile* lock, const std::function<bool()>& prolog) { 283 if (prolog && !prolog()) { 284 LOG(WARNING) << "Can't RemoveAllUpdateState: prolog failed."; 285 return false; 286 } 287 288 LOG(INFO) << "Removing all update state."; 289 290 if (!RemoveAllSnapshots(lock)) { 291 LOG(ERROR) << "Could not remove all snapshots"; 292 return false; 293 } 294 295 // It's okay if these fail: 296 // - For SnapshotBoot and Rollback, first-stage init performs a deeper check after 297 // reading the indicator file, so it's not a problem if it still exists 298 // after the update completes. 299 // - For ForwardMerge, FinishedSnapshotWrites asserts that the existence of the indicator 300 // matches the incoming update. 301 std::vector<std::string> files = { 302 GetSnapshotBootIndicatorPath(), 303 GetRollbackIndicatorPath(), 304 GetForwardMergeIndicatorPath(), 305 GetOldPartitionMetadataPath(), 306 }; 307 for (const auto& file : files) { 308 RemoveFileIfExists(file); 309 } 310 311 // If this fails, we'll keep trying to remove the update state (as the 312 // device reboots or starts a new update) until it finally succeeds. 313 return WriteUpdateState(lock, UpdateState::None); 314 } 315 FinishedSnapshotWrites(bool wipe)316 bool SnapshotManager::FinishedSnapshotWrites(bool wipe) { 317 auto lock = LockExclusive(); 318 if (!lock) return false; 319 320 auto update_state = ReadUpdateState(lock.get()); 321 if (update_state == UpdateState::Unverified) { 322 LOG(INFO) << "FinishedSnapshotWrites already called before. Ignored."; 323 return true; 324 } 325 326 if (update_state != UpdateState::Initiated) { 327 LOG(ERROR) << "Can only transition to the Unverified state from the Initiated state."; 328 return false; 329 } 330 331 if (!EnsureNoOverflowSnapshot(lock.get())) { 332 LOG(ERROR) << "Cannot ensure there are no overflow snapshots."; 333 return false; 334 } 335 336 if (!UpdateForwardMergeIndicator(wipe)) { 337 return false; 338 } 339 340 // This file is written on boot to detect whether a rollback occurred. It 341 // MUST NOT exist before rebooting, otherwise, we're at risk of deleting 342 // snapshots too early. 343 if (!RemoveFileIfExists(GetRollbackIndicatorPath())) { 344 return false; 345 } 346 347 // This file acts as both a quick indicator for init (it can use access(2) 348 // to decide how to do first-stage mounts), and it stores the old slot, so 349 // we can tell whether or not we performed a rollback. 350 auto contents = device_->GetSlotSuffix(); 351 auto boot_file = GetSnapshotBootIndicatorPath(); 352 if (!WriteStringToFileAtomic(contents, boot_file)) { 353 PLOG(ERROR) << "write failed: " << boot_file; 354 return false; 355 } 356 return WriteUpdateState(lock.get(), UpdateState::Unverified); 357 } 358 CreateSnapshot(LockedFile * lock,PartitionCowCreator * cow_creator,SnapshotStatus * status)359 bool SnapshotManager::CreateSnapshot(LockedFile* lock, PartitionCowCreator* cow_creator, 360 SnapshotStatus* status) { 361 CHECK(lock); 362 CHECK(lock->lock_mode() == LOCK_EX); 363 CHECK(status); 364 365 if (status->name().empty()) { 366 LOG(ERROR) << "SnapshotStatus has no name."; 367 return false; 368 } 369 // Check these sizes. Like liblp, we guarantee the partition size is 370 // respected, which means it has to be sector-aligned. (This guarantee is 371 // useful for locating avb footers correctly). The COW file size, however, 372 // can be arbitrarily larger than specified, so we can safely round it up. 373 if (status->device_size() % kSectorSize != 0) { 374 LOG(ERROR) << "Snapshot " << status->name() 375 << " device size is not a multiple of the sector size: " 376 << status->device_size(); 377 return false; 378 } 379 if (status->snapshot_size() % kSectorSize != 0) { 380 LOG(ERROR) << "Snapshot " << status->name() 381 << " snapshot size is not a multiple of the sector size: " 382 << status->snapshot_size(); 383 return false; 384 } 385 if (status->cow_partition_size() % kSectorSize != 0) { 386 LOG(ERROR) << "Snapshot " << status->name() 387 << " cow partition size is not a multiple of the sector size: " 388 << status->cow_partition_size(); 389 return false; 390 } 391 if (status->cow_file_size() % kSectorSize != 0) { 392 LOG(ERROR) << "Snapshot " << status->name() 393 << " cow file size is not a multiple of the sector size: " 394 << status->cow_file_size(); 395 return false; 396 } 397 398 status->set_state(SnapshotState::CREATED); 399 status->set_sectors_allocated(0); 400 status->set_metadata_sectors(0); 401 status->set_using_snapuserd(cow_creator->using_snapuserd); 402 status->set_compression_algorithm(cow_creator->compression_algorithm); 403 if (cow_creator->enable_threading) { 404 status->set_enable_threading(cow_creator->enable_threading); 405 } 406 if (cow_creator->batched_writes) { 407 status->set_batched_writes(cow_creator->batched_writes); 408 } 409 410 if (!WriteSnapshotStatus(lock, *status)) { 411 PLOG(ERROR) << "Could not write snapshot status: " << status->name(); 412 return false; 413 } 414 return true; 415 } 416 CreateCowImage(LockedFile * lock,const std::string & name)417 Return SnapshotManager::CreateCowImage(LockedFile* lock, const std::string& name) { 418 CHECK(lock); 419 CHECK(lock->lock_mode() == LOCK_EX); 420 if (!EnsureImageManager()) return Return::Error(); 421 422 SnapshotStatus status; 423 if (!ReadSnapshotStatus(lock, name, &status)) { 424 return Return::Error(); 425 } 426 427 // The COW file size should have been rounded up to the nearest sector in CreateSnapshot. 428 if (status.cow_file_size() % kSectorSize != 0) { 429 LOG(ERROR) << "Snapshot " << name << " COW file size is not a multiple of the sector size: " 430 << status.cow_file_size(); 431 return Return::Error(); 432 } 433 434 std::string cow_image_name = GetCowImageDeviceName(name); 435 int cow_flags = IImageManager::CREATE_IMAGE_DEFAULT; 436 return Return(images_->CreateBackingImage(cow_image_name, status.cow_file_size(), cow_flags)); 437 } 438 MapDmUserCow(LockedFile * lock,const std::string & name,const std::string & cow_file,const std::string & base_device,const std::string & base_path_merge,const std::chrono::milliseconds & timeout_ms,std::string * path)439 bool SnapshotManager::MapDmUserCow(LockedFile* lock, const std::string& name, 440 const std::string& cow_file, const std::string& base_device, 441 const std::string& base_path_merge, 442 const std::chrono::milliseconds& timeout_ms, std::string* path) { 443 CHECK(lock); 444 445 if (UpdateUsesUserSnapshots(lock)) { 446 SnapshotStatus status; 447 if (!ReadSnapshotStatus(lock, name, &status)) { 448 LOG(ERROR) << "MapDmUserCow: ReadSnapshotStatus failed..."; 449 return false; 450 } 451 452 if (status.state() == SnapshotState::NONE || 453 status.state() == SnapshotState::MERGE_COMPLETED) { 454 LOG(ERROR) << "Should not create a snapshot device for " << name 455 << " after merging has completed."; 456 return false; 457 } 458 459 SnapshotUpdateStatus update_status = ReadSnapshotUpdateStatus(lock); 460 if (update_status.state() == UpdateState::MergeCompleted || 461 update_status.state() == UpdateState::MergeNeedsReboot) { 462 LOG(ERROR) << "Should not create a snapshot device for " << name 463 << " after global merging has completed."; 464 return false; 465 } 466 } 467 468 // Use an extra decoration for first-stage init, so we can transition 469 // to a new table entry in second-stage. 470 std::string misc_name = name; 471 if (use_first_stage_snapuserd_) { 472 misc_name += "-init"; 473 } 474 475 if (!EnsureSnapuserdConnected()) { 476 return false; 477 } 478 479 uint64_t base_sectors = 0; 480 if (!UpdateUsesUserSnapshots(lock)) { 481 base_sectors = snapuserd_client_->InitDmUserCow(misc_name, cow_file, base_device); 482 if (base_sectors == 0) { 483 LOG(ERROR) << "Failed to retrieve base_sectors from Snapuserd"; 484 return false; 485 } 486 } else { 487 // For userspace snapshots, the size of the base device is taken as the 488 // size of the dm-user block device. Since there is no pseudo mapping 489 // created in the daemon, we no longer need to rely on the daemon for 490 // sizing the dm-user block device. 491 unique_fd fd(TEMP_FAILURE_RETRY(open(base_path_merge.c_str(), O_RDONLY | O_CLOEXEC))); 492 if (fd < 0) { 493 LOG(ERROR) << "Cannot open block device: " << base_path_merge; 494 return false; 495 } 496 497 uint64_t dev_sz = get_block_device_size(fd.get()); 498 if (!dev_sz) { 499 LOG(ERROR) << "Failed to find block device size: " << base_path_merge; 500 return false; 501 } 502 503 base_sectors = dev_sz >> 9; 504 } 505 506 DmTable table; 507 table.Emplace<DmTargetUser>(0, base_sectors, misc_name); 508 if (!dm_.CreateDevice(name, table, path, timeout_ms)) { 509 LOG(ERROR) << " dm-user: CreateDevice failed... "; 510 return false; 511 } 512 if (!WaitForDevice(*path, timeout_ms)) { 513 LOG(ERROR) << " dm-user: timeout: Failed to create block device for: " << name; 514 return false; 515 } 516 517 auto control_device = "/dev/dm-user/" + misc_name; 518 if (!WaitForDevice(control_device, timeout_ms)) { 519 return false; 520 } 521 522 if (UpdateUsesUserSnapshots(lock)) { 523 // Now that the dm-user device is created, initialize the daemon and 524 // spin up the worker threads. 525 if (!snapuserd_client_->InitDmUserCow(misc_name, cow_file, base_device, base_path_merge)) { 526 LOG(ERROR) << "InitDmUserCow failed"; 527 return false; 528 } 529 } 530 531 return snapuserd_client_->AttachDmUser(misc_name); 532 } 533 MapSnapshot(LockedFile * lock,const std::string & name,const std::string & base_device,const std::string & cow_device,const std::chrono::milliseconds & timeout_ms,std::string * dev_path)534 bool SnapshotManager::MapSnapshot(LockedFile* lock, const std::string& name, 535 const std::string& base_device, const std::string& cow_device, 536 const std::chrono::milliseconds& timeout_ms, 537 std::string* dev_path) { 538 CHECK(lock); 539 540 SnapshotStatus status; 541 if (!ReadSnapshotStatus(lock, name, &status)) { 542 return false; 543 } 544 if (status.state() == SnapshotState::NONE || status.state() == SnapshotState::MERGE_COMPLETED) { 545 LOG(ERROR) << "Should not create a snapshot device for " << name 546 << " after merging has completed."; 547 return false; 548 } 549 550 // Validate the block device size, as well as the requested snapshot size. 551 // Note that during first-stage init, we don't have the device paths. 552 if (android::base::StartsWith(base_device, "/")) { 553 unique_fd fd(open(base_device.c_str(), O_RDONLY | O_CLOEXEC)); 554 if (fd < 0) { 555 PLOG(ERROR) << "open failed: " << base_device; 556 return false; 557 } 558 auto dev_size = get_block_device_size(fd); 559 if (!dev_size) { 560 PLOG(ERROR) << "Could not determine block device size: " << base_device; 561 return false; 562 } 563 if (status.device_size() != dev_size) { 564 LOG(ERROR) << "Block device size for " << base_device << " does not match" 565 << "(expected " << status.device_size() << ", got " << dev_size << ")"; 566 return false; 567 } 568 } 569 if (status.device_size() % kSectorSize != 0) { 570 LOG(ERROR) << "invalid blockdev size for " << base_device << ": " << status.device_size(); 571 return false; 572 } 573 if (status.snapshot_size() % kSectorSize != 0 || 574 status.snapshot_size() > status.device_size()) { 575 LOG(ERROR) << "Invalid snapshot size for " << base_device << ": " << status.snapshot_size(); 576 return false; 577 } 578 if (status.device_size() != status.snapshot_size()) { 579 LOG(ERROR) << "Device size and snapshot size must be the same (device size = " 580 << status.device_size() << ", snapshot size = " << status.snapshot_size(); 581 return false; 582 } 583 584 uint64_t snapshot_sectors = status.snapshot_size() / kSectorSize; 585 586 // Note that merging is a global state. We do track whether individual devices 587 // have completed merging, but the start of the merge process is considered 588 // atomic. 589 SnapshotStorageMode mode; 590 SnapshotUpdateStatus update_status = ReadSnapshotUpdateStatus(lock); 591 switch (update_status.state()) { 592 case UpdateState::MergeCompleted: 593 case UpdateState::MergeNeedsReboot: 594 LOG(ERROR) << "Should not create a snapshot device for " << name 595 << " after global merging has completed."; 596 return false; 597 case UpdateState::Merging: 598 case UpdateState::MergeFailed: 599 // Note: MergeFailed indicates that a merge is in progress, but 600 // is possibly stalled. We still have to honor the merge. 601 if (DecideMergePhase(status) == update_status.merge_phase()) { 602 mode = SnapshotStorageMode::Merge; 603 } else { 604 mode = SnapshotStorageMode::Persistent; 605 } 606 break; 607 default: 608 mode = SnapshotStorageMode::Persistent; 609 break; 610 } 611 612 if (mode == SnapshotStorageMode::Persistent && status.state() == SnapshotState::MERGING) { 613 LOG(ERROR) << "Snapshot: " << name 614 << " has snapshot status Merging but mode set to Persistent." 615 << " Changing mode to Snapshot-Merge."; 616 mode = SnapshotStorageMode::Merge; 617 } 618 619 DmTable table; 620 table.Emplace<DmTargetSnapshot>(0, snapshot_sectors, base_device, cow_device, mode, 621 kSnapshotChunkSize); 622 if (!dm_.CreateDevice(name, table, dev_path, timeout_ms)) { 623 LOG(ERROR) << "Could not create snapshot device: " << name; 624 return false; 625 } 626 return true; 627 } 628 MapCowImage(const std::string & name,const std::chrono::milliseconds & timeout_ms)629 std::optional<std::string> SnapshotManager::MapCowImage( 630 const std::string& name, const std::chrono::milliseconds& timeout_ms) { 631 if (!EnsureImageManager()) return std::nullopt; 632 auto cow_image_name = GetCowImageDeviceName(name); 633 634 bool ok; 635 std::string cow_dev; 636 if (device_->IsRecovery() || device_->IsFirstStageInit()) { 637 const auto& opener = device_->GetPartitionOpener(); 638 ok = images_->MapImageWithDeviceMapper(opener, cow_image_name, &cow_dev); 639 } else { 640 ok = images_->MapImageDevice(cow_image_name, timeout_ms, &cow_dev); 641 } 642 643 if (ok) { 644 LOG(INFO) << "Mapped " << cow_image_name << " to " << cow_dev; 645 return cow_dev; 646 } 647 LOG(ERROR) << "Could not map image device: " << cow_image_name; 648 return std::nullopt; 649 } 650 MapSourceDevice(LockedFile * lock,const std::string & name,const std::chrono::milliseconds & timeout_ms,std::string * path)651 bool SnapshotManager::MapSourceDevice(LockedFile* lock, const std::string& name, 652 const std::chrono::milliseconds& timeout_ms, 653 std::string* path) { 654 CHECK(lock); 655 656 auto metadata = ReadOldPartitionMetadata(lock); 657 if (!metadata) { 658 LOG(ERROR) << "Could not map source device due to missing or corrupt metadata"; 659 return false; 660 } 661 662 auto old_name = GetOtherPartitionName(name); 663 auto slot_suffix = device_->GetSlotSuffix(); 664 auto slot = SlotNumberForSlotSuffix(slot_suffix); 665 666 CreateLogicalPartitionParams params = { 667 .block_device = device_->GetSuperDevice(slot), 668 .metadata = metadata, 669 .partition_name = old_name, 670 .timeout_ms = timeout_ms, 671 .device_name = GetSourceDeviceName(name), 672 .partition_opener = &device_->GetPartitionOpener(), 673 }; 674 if (!CreateLogicalPartition(std::move(params), path)) { 675 LOG(ERROR) << "Could not create source device for snapshot " << name; 676 return false; 677 } 678 return true; 679 } 680 UnmapSnapshot(LockedFile * lock,const std::string & name)681 bool SnapshotManager::UnmapSnapshot(LockedFile* lock, const std::string& name) { 682 CHECK(lock); 683 684 if (UpdateUsesUserSnapshots(lock)) { 685 if (!UnmapUserspaceSnapshotDevice(lock, name)) { 686 return false; 687 } 688 } else { 689 if (!DeleteDeviceIfExists(name)) { 690 LOG(ERROR) << "Could not delete snapshot device: " << name; 691 return false; 692 } 693 } 694 return true; 695 } 696 UnmapCowImage(const std::string & name)697 bool SnapshotManager::UnmapCowImage(const std::string& name) { 698 if (!EnsureImageManager()) return false; 699 return images_->UnmapImageIfExists(GetCowImageDeviceName(name)); 700 } 701 DeleteSnapshot(LockedFile * lock,const std::string & name)702 bool SnapshotManager::DeleteSnapshot(LockedFile* lock, const std::string& name) { 703 CHECK(lock); 704 CHECK(lock->lock_mode() == LOCK_EX); 705 if (!EnsureImageManager()) return false; 706 707 if (!UnmapCowDevices(lock, name)) { 708 return false; 709 } 710 711 // We can't delete snapshots in recovery. The only way we'd try is it we're 712 // completing or canceling a merge in preparation for a data wipe, in which 713 // case, we don't care if the file sticks around. 714 if (device_->IsRecovery()) { 715 LOG(INFO) << "Skipping delete of snapshot " << name << " in recovery."; 716 return true; 717 } 718 719 auto cow_image_name = GetCowImageDeviceName(name); 720 if (images_->BackingImageExists(cow_image_name)) { 721 if (!images_->DeleteBackingImage(cow_image_name)) { 722 return false; 723 } 724 } 725 726 std::string error; 727 auto file_path = GetSnapshotStatusFilePath(name); 728 if (!android::base::RemoveFileIfExists(file_path, &error)) { 729 LOG(ERROR) << "Failed to remove status file " << file_path << ": " << error; 730 return false; 731 } 732 return true; 733 } 734 InitiateMerge()735 bool SnapshotManager::InitiateMerge() { 736 auto lock = LockExclusive(); 737 if (!lock) return false; 738 739 UpdateState state = ReadUpdateState(lock.get()); 740 if (state != UpdateState::Unverified) { 741 LOG(ERROR) << "Cannot begin a merge if an update has not been verified"; 742 return false; 743 } 744 745 auto slot = GetCurrentSlot(); 746 if (slot != Slot::Target) { 747 LOG(ERROR) << "Device cannot merge while not booting from new slot"; 748 return false; 749 } 750 751 std::vector<std::string> snapshots; 752 if (!ListSnapshots(lock.get(), &snapshots)) { 753 LOG(ERROR) << "Could not list snapshots"; 754 return false; 755 } 756 757 auto other_suffix = device_->GetOtherSlotSuffix(); 758 759 for (const auto& snapshot : snapshots) { 760 if (android::base::EndsWith(snapshot, other_suffix)) { 761 // Allow the merge to continue, but log this unexpected case. 762 LOG(ERROR) << "Unexpected snapshot found during merge: " << snapshot; 763 continue; 764 } 765 766 // The device has to be mapped, since everything should be merged at 767 // the same time. This is a fairly serious error. We could forcefully 768 // map everything here, but it should have been mapped during first- 769 // stage init. 770 if (dm_.GetState(snapshot) == DmDeviceState::INVALID) { 771 LOG(ERROR) << "Cannot begin merge; device " << snapshot << " is not mapped."; 772 return false; 773 } 774 } 775 776 auto metadata = ReadCurrentMetadata(); 777 for (auto it = snapshots.begin(); it != snapshots.end();) { 778 switch (GetMetadataPartitionState(*metadata, *it)) { 779 case MetadataPartitionState::Flashed: 780 LOG(WARNING) << "Detected re-flashing for partition " << *it 781 << ". Skip merging it."; 782 [[fallthrough]]; 783 case MetadataPartitionState::None: { 784 LOG(WARNING) << "Deleting snapshot for partition " << *it; 785 if (!DeleteSnapshot(lock.get(), *it)) { 786 LOG(WARNING) << "Cannot delete snapshot for partition " << *it 787 << ". Skip merging it anyways."; 788 } 789 it = snapshots.erase(it); 790 } break; 791 case MetadataPartitionState::Updated: { 792 ++it; 793 } break; 794 } 795 } 796 797 bool using_snapuserd = false; 798 799 std::vector<std::string> first_merge_group; 800 801 DmTargetSnapshot::Status initial_target_values = {}; 802 for (const auto& snapshot : snapshots) { 803 if (!UpdateUsesUserSnapshots(lock.get())) { 804 DmTargetSnapshot::Status current_status; 805 if (!QuerySnapshotStatus(snapshot, nullptr, ¤t_status)) { 806 return false; 807 } 808 initial_target_values.sectors_allocated += current_status.sectors_allocated; 809 initial_target_values.total_sectors += current_status.total_sectors; 810 initial_target_values.metadata_sectors += current_status.metadata_sectors; 811 } 812 813 SnapshotStatus snapshot_status; 814 if (!ReadSnapshotStatus(lock.get(), snapshot, &snapshot_status)) { 815 return false; 816 } 817 818 using_snapuserd |= snapshot_status.using_snapuserd(); 819 if (DecideMergePhase(snapshot_status) == MergePhase::FIRST_PHASE) { 820 first_merge_group.emplace_back(snapshot); 821 } 822 } 823 824 SnapshotUpdateStatus initial_status = ReadSnapshotUpdateStatus(lock.get()); 825 initial_status.set_state(UpdateState::Merging); 826 initial_status.set_using_snapuserd(using_snapuserd); 827 828 if (!UpdateUsesUserSnapshots(lock.get())) { 829 initial_status.set_sectors_allocated(initial_target_values.sectors_allocated); 830 initial_status.set_total_sectors(initial_target_values.total_sectors); 831 initial_status.set_metadata_sectors(initial_target_values.metadata_sectors); 832 } 833 834 // If any partitions shrunk, we need to merge them before we merge any other 835 // partitions (see b/177935716). Otherwise, a merge from another partition 836 // may overwrite the source block of a copy operation. 837 const std::vector<std::string>* merge_group; 838 if (first_merge_group.empty()) { 839 merge_group = &snapshots; 840 initial_status.set_merge_phase(MergePhase::SECOND_PHASE); 841 } else { 842 merge_group = &first_merge_group; 843 initial_status.set_merge_phase(MergePhase::FIRST_PHASE); 844 } 845 846 // Point of no return - mark that we're starting a merge. From now on every 847 // eligible snapshot must be a merge target. 848 if (!WriteSnapshotUpdateStatus(lock.get(), initial_status)) { 849 return false; 850 } 851 852 auto reported_code = MergeFailureCode::Ok; 853 for (const auto& snapshot : *merge_group) { 854 // If this fails, we have no choice but to continue. Everything must 855 // be merged. This is not an ideal state to be in, but it is safe, 856 // because we the next boot will try again. 857 auto code = SwitchSnapshotToMerge(lock.get(), snapshot); 858 if (code != MergeFailureCode::Ok) { 859 LOG(ERROR) << "Failed to switch snapshot to a merge target: " << snapshot; 860 if (reported_code == MergeFailureCode::Ok) { 861 reported_code = code; 862 } 863 } 864 } 865 866 // If we couldn't switch everything to a merge target, pre-emptively mark 867 // this merge as failed. It will get acknowledged when WaitForMerge() is 868 // called. 869 if (reported_code != MergeFailureCode::Ok) { 870 WriteUpdateState(lock.get(), UpdateState::MergeFailed, reported_code); 871 } 872 873 // Return true no matter what, because a merge was initiated. 874 return true; 875 } 876 SwitchSnapshotToMerge(LockedFile * lock,const std::string & name)877 MergeFailureCode SnapshotManager::SwitchSnapshotToMerge(LockedFile* lock, const std::string& name) { 878 SnapshotStatus status; 879 if (!ReadSnapshotStatus(lock, name, &status)) { 880 return MergeFailureCode::ReadStatus; 881 } 882 if (status.state() != SnapshotState::CREATED) { 883 LOG(WARNING) << "Snapshot " << name 884 << " has unexpected state: " << SnapshotState_Name(status.state()); 885 } 886 887 if (UpdateUsesUserSnapshots(lock)) { 888 if (EnsureSnapuserdConnected()) { 889 // This is the point where we inform the daemon to initiate/resume 890 // the merge 891 if (!snapuserd_client_->InitiateMerge(name)) { 892 return MergeFailureCode::UnknownTable; 893 } 894 } else { 895 LOG(ERROR) << "Failed to connect to snapuserd daemon to initiate merge"; 896 return MergeFailureCode::UnknownTable; 897 } 898 } else { 899 // After this, we return true because we technically did switch to a merge 900 // target. Everything else we do here is just informational. 901 if (auto code = RewriteSnapshotDeviceTable(name); code != MergeFailureCode::Ok) { 902 return code; 903 } 904 } 905 906 status.set_state(SnapshotState::MERGING); 907 908 if (!UpdateUsesUserSnapshots(lock)) { 909 DmTargetSnapshot::Status dm_status; 910 if (!QuerySnapshotStatus(name, nullptr, &dm_status)) { 911 LOG(ERROR) << "Could not query merge status for snapshot: " << name; 912 } 913 status.set_sectors_allocated(dm_status.sectors_allocated); 914 status.set_metadata_sectors(dm_status.metadata_sectors); 915 } 916 917 if (!WriteSnapshotStatus(lock, status)) { 918 LOG(ERROR) << "Could not update status file for snapshot: " << name; 919 } 920 return MergeFailureCode::Ok; 921 } 922 RewriteSnapshotDeviceTable(const std::string & name)923 MergeFailureCode SnapshotManager::RewriteSnapshotDeviceTable(const std::string& name) { 924 std::vector<DeviceMapper::TargetInfo> old_targets; 925 if (!dm_.GetTableInfo(name, &old_targets)) { 926 LOG(ERROR) << "Could not read snapshot device table: " << name; 927 return MergeFailureCode::GetTableInfo; 928 } 929 if (old_targets.size() != 1 || DeviceMapper::GetTargetType(old_targets[0].spec) != "snapshot") { 930 LOG(ERROR) << "Unexpected device-mapper table for snapshot: " << name; 931 return MergeFailureCode::UnknownTable; 932 } 933 934 std::string base_device, cow_device; 935 if (!DmTargetSnapshot::GetDevicesFromParams(old_targets[0].data, &base_device, &cow_device)) { 936 LOG(ERROR) << "Could not derive underlying devices for snapshot: " << name; 937 return MergeFailureCode::GetTableParams; 938 } 939 940 DmTable table; 941 table.Emplace<DmTargetSnapshot>(0, old_targets[0].spec.length, base_device, cow_device, 942 SnapshotStorageMode::Merge, kSnapshotChunkSize); 943 if (!dm_.LoadTableAndActivate(name, table)) { 944 LOG(ERROR) << "Could not swap device-mapper tables on snapshot device " << name; 945 return MergeFailureCode::ActivateNewTable; 946 } 947 LOG(INFO) << "Successfully switched snapshot device to a merge target: " << name; 948 return MergeFailureCode::Ok; 949 } 950 GetSingleTarget(const std::string & dm_name,TableQuery query,DeviceMapper::TargetInfo * target)951 bool SnapshotManager::GetSingleTarget(const std::string& dm_name, TableQuery query, 952 DeviceMapper::TargetInfo* target) { 953 if (dm_.GetState(dm_name) == DmDeviceState::INVALID) { 954 return false; 955 } 956 957 std::vector<DeviceMapper::TargetInfo> targets; 958 bool result; 959 if (query == TableQuery::Status) { 960 result = dm_.GetTableStatus(dm_name, &targets); 961 } else { 962 result = dm_.GetTableInfo(dm_name, &targets); 963 } 964 if (!result) { 965 LOG(ERROR) << "Could not query device: " << dm_name; 966 return false; 967 } 968 if (targets.size() != 1) { 969 return false; 970 } 971 972 *target = std::move(targets[0]); 973 return true; 974 } 975 IsSnapshotDevice(const std::string & dm_name,TargetInfo * target)976 bool SnapshotManager::IsSnapshotDevice(const std::string& dm_name, TargetInfo* target) { 977 DeviceMapper::TargetInfo snap_target; 978 if (!GetSingleTarget(dm_name, TableQuery::Status, &snap_target)) { 979 return false; 980 } 981 auto type = DeviceMapper::GetTargetType(snap_target.spec); 982 983 // If this is not a user-snapshot device then it should either 984 // be a dm-snapshot or dm-snapshot-merge target 985 if (type != "user") { 986 if (type != "snapshot" && type != "snapshot-merge") { 987 return false; 988 } 989 } 990 991 if (target) { 992 *target = std::move(snap_target); 993 } 994 return true; 995 } 996 UpdateStateToStr(const enum UpdateState state)997 auto SnapshotManager::UpdateStateToStr(const enum UpdateState state) { 998 switch (state) { 999 case None: 1000 return "None"; 1001 case Initiated: 1002 return "Initiated"; 1003 case Unverified: 1004 return "Unverified"; 1005 case Merging: 1006 return "Merging"; 1007 case MergeNeedsReboot: 1008 return "MergeNeedsReboot"; 1009 case MergeCompleted: 1010 return "MergeCompleted"; 1011 case MergeFailed: 1012 return "MergeFailed"; 1013 case Cancelled: 1014 return "Cancelled"; 1015 default: 1016 return "Unknown"; 1017 } 1018 } 1019 QuerySnapshotStatus(const std::string & dm_name,std::string * target_type,DmTargetSnapshot::Status * status)1020 bool SnapshotManager::QuerySnapshotStatus(const std::string& dm_name, std::string* target_type, 1021 DmTargetSnapshot::Status* status) { 1022 DeviceMapper::TargetInfo target; 1023 if (!IsSnapshotDevice(dm_name, &target)) { 1024 LOG(ERROR) << "Device " << dm_name << " is not a snapshot or snapshot-merge device"; 1025 return false; 1026 } 1027 if (!DmTargetSnapshot::ParseStatusText(target.data, status)) { 1028 LOG(ERROR) << "Could not parse snapshot status text: " << dm_name; 1029 return false; 1030 } 1031 if (target_type) { 1032 *target_type = DeviceMapper::GetTargetType(target.spec); 1033 } 1034 if (!status->error.empty()) { 1035 LOG(ERROR) << "Snapshot: " << dm_name << " returned error code: " << status->error; 1036 return false; 1037 } 1038 return true; 1039 } 1040 1041 // Note that when a merge fails, we will *always* try again to complete the 1042 // merge each time the device boots. There is no harm in doing so, and if 1043 // the problem was transient, we might manage to get a new outcome. ProcessUpdateState(const std::function<bool ()> & callback,const std::function<bool ()> & before_cancel)1044 UpdateState SnapshotManager::ProcessUpdateState(const std::function<bool()>& callback, 1045 const std::function<bool()>& before_cancel) { 1046 while (true) { 1047 auto result = CheckMergeState(before_cancel); 1048 LOG(INFO) << "ProcessUpdateState handling state: " << UpdateStateToStr(result.state); 1049 1050 if (result.state == UpdateState::MergeFailed) { 1051 AcknowledgeMergeFailure(result.failure_code); 1052 } 1053 if (result.state != UpdateState::Merging) { 1054 // Either there is no merge, or the merge was finished, so no need 1055 // to keep waiting. 1056 return result.state; 1057 } 1058 1059 if (callback && !callback()) { 1060 return result.state; 1061 } 1062 1063 // This wait is not super time sensitive, so we have a relatively 1064 // low polling frequency. 1065 std::this_thread::sleep_for(kUpdateStateCheckInterval); 1066 } 1067 } 1068 CheckMergeState(const std::function<bool ()> & before_cancel)1069 auto SnapshotManager::CheckMergeState(const std::function<bool()>& before_cancel) -> MergeResult { 1070 auto lock = LockExclusive(); 1071 if (!lock) { 1072 return MergeResult(UpdateState::MergeFailed, MergeFailureCode::AcquireLock); 1073 } 1074 1075 auto result = CheckMergeState(lock.get(), before_cancel); 1076 LOG(INFO) << "CheckMergeState for snapshots returned: " << UpdateStateToStr(result.state); 1077 1078 if (result.state == UpdateState::MergeCompleted) { 1079 // Do this inside the same lock. Failures get acknowledged without the 1080 // lock, because flock() might have failed. 1081 AcknowledgeMergeSuccess(lock.get()); 1082 } else if (result.state == UpdateState::Cancelled) { 1083 if (!device_->IsRecovery() && !RemoveAllUpdateState(lock.get(), before_cancel)) { 1084 LOG(ERROR) << "Failed to remove all update state after acknowleding cancelled update."; 1085 } 1086 } 1087 return result; 1088 } 1089 CheckMergeState(LockedFile * lock,const std::function<bool ()> & before_cancel)1090 auto SnapshotManager::CheckMergeState(LockedFile* lock, const std::function<bool()>& before_cancel) 1091 -> MergeResult { 1092 SnapshotUpdateStatus update_status = ReadSnapshotUpdateStatus(lock); 1093 switch (update_status.state()) { 1094 case UpdateState::None: 1095 case UpdateState::MergeCompleted: 1096 // Harmless races are allowed between two callers of WaitForMerge, 1097 // so in both of these cases we just propagate the state. 1098 return MergeResult(update_status.state()); 1099 1100 case UpdateState::Merging: 1101 case UpdateState::MergeNeedsReboot: 1102 case UpdateState::MergeFailed: 1103 // We'll poll each snapshot below. Note that for the NeedsReboot 1104 // case, we always poll once to give cleanup another opportunity to 1105 // run. 1106 break; 1107 1108 case UpdateState::Unverified: 1109 // This is an edge case. Normally cancelled updates are detected 1110 // via the merge poll below, but if we never started a merge, we 1111 // need to also check here. 1112 if (HandleCancelledUpdate(lock, before_cancel)) { 1113 return MergeResult(UpdateState::Cancelled); 1114 } 1115 return MergeResult(update_status.state()); 1116 1117 default: 1118 return MergeResult(update_status.state()); 1119 } 1120 1121 std::vector<std::string> snapshots; 1122 if (!ListSnapshots(lock, &snapshots)) { 1123 return MergeResult(UpdateState::MergeFailed, MergeFailureCode::ListSnapshots); 1124 } 1125 1126 auto other_suffix = device_->GetOtherSlotSuffix(); 1127 1128 bool cancelled = false; 1129 bool merging = false; 1130 bool needs_reboot = false; 1131 bool wrong_phase = false; 1132 MergeFailureCode failure_code = MergeFailureCode::Ok; 1133 for (const auto& snapshot : snapshots) { 1134 if (android::base::EndsWith(snapshot, other_suffix)) { 1135 // This will have triggered an error message in InitiateMerge already. 1136 LOG(INFO) << "Skipping merge validation of unexpected snapshot: " << snapshot; 1137 continue; 1138 } 1139 1140 auto result = CheckTargetMergeState(lock, snapshot, update_status); 1141 LOG(INFO) << "CheckTargetMergeState for " << snapshot 1142 << " returned: " << UpdateStateToStr(result.state); 1143 1144 switch (result.state) { 1145 case UpdateState::MergeFailed: 1146 // Take the first failure code in case other failures compound. 1147 if (failure_code == MergeFailureCode::Ok) { 1148 failure_code = result.failure_code; 1149 } 1150 break; 1151 case UpdateState::Merging: 1152 merging = true; 1153 break; 1154 case UpdateState::MergeNeedsReboot: 1155 needs_reboot = true; 1156 break; 1157 case UpdateState::MergeCompleted: 1158 break; 1159 case UpdateState::Cancelled: 1160 cancelled = true; 1161 break; 1162 case UpdateState::None: 1163 wrong_phase = true; 1164 break; 1165 default: 1166 LOG(ERROR) << "Unknown merge status for \"" << snapshot << "\": " 1167 << "\"" << result.state << "\""; 1168 if (failure_code == MergeFailureCode::Ok) { 1169 failure_code = MergeFailureCode::UnexpectedMergeState; 1170 } 1171 break; 1172 } 1173 } 1174 1175 if (merging) { 1176 // Note that we handle "Merging" before we handle anything else. We 1177 // want to poll until *nothing* is merging if we can, so everything has 1178 // a chance to get marked as completed or failed. 1179 return MergeResult(UpdateState::Merging); 1180 } 1181 if (failure_code != MergeFailureCode::Ok) { 1182 // Note: since there are many drop-out cases for failure, we acknowledge 1183 // it in WaitForMerge rather than here and elsewhere. 1184 return MergeResult(UpdateState::MergeFailed, failure_code); 1185 } 1186 if (wrong_phase) { 1187 // If we got here, no other partitions are being merged, and nothing 1188 // failed to merge. It's safe to move to the next merge phase. 1189 auto code = MergeSecondPhaseSnapshots(lock); 1190 if (code != MergeFailureCode::Ok) { 1191 return MergeResult(UpdateState::MergeFailed, code); 1192 } 1193 return MergeResult(UpdateState::Merging); 1194 } 1195 if (needs_reboot) { 1196 WriteUpdateState(lock, UpdateState::MergeNeedsReboot); 1197 return MergeResult(UpdateState::MergeNeedsReboot); 1198 } 1199 if (cancelled) { 1200 // This is an edge case, that we handle as correctly as we sensibly can. 1201 // The underlying partition has changed behind update_engine, and we've 1202 // removed the snapshot as a result. The exact state of the update is 1203 // undefined now, but this can only happen on an unlocked device where 1204 // partitions can be flashed without wiping userdata. 1205 return MergeResult(UpdateState::Cancelled); 1206 } 1207 return MergeResult(UpdateState::MergeCompleted); 1208 } 1209 CheckTargetMergeState(LockedFile * lock,const std::string & name,const SnapshotUpdateStatus & update_status)1210 auto SnapshotManager::CheckTargetMergeState(LockedFile* lock, const std::string& name, 1211 const SnapshotUpdateStatus& update_status) 1212 -> MergeResult { 1213 SnapshotStatus snapshot_status; 1214 if (!ReadSnapshotStatus(lock, name, &snapshot_status)) { 1215 return MergeResult(UpdateState::MergeFailed, MergeFailureCode::ReadStatus); 1216 } 1217 1218 std::unique_ptr<LpMetadata> current_metadata; 1219 1220 if (!IsSnapshotDevice(name)) { 1221 if (!current_metadata) { 1222 current_metadata = ReadCurrentMetadata(); 1223 } 1224 1225 if (!current_metadata || 1226 GetMetadataPartitionState(*current_metadata, name) != MetadataPartitionState::Updated) { 1227 DeleteSnapshot(lock, name); 1228 return MergeResult(UpdateState::Cancelled); 1229 } 1230 1231 // During a check, we decided the merge was complete, but we were unable to 1232 // collapse the device-mapper stack and perform COW cleanup. If we haven't 1233 // rebooted after this check, the device will still be a snapshot-merge 1234 // target. If we have rebooted, the device will now be a linear target, 1235 // and we can try cleanup again. 1236 if (snapshot_status.state() == SnapshotState::MERGE_COMPLETED) { 1237 // NB: It's okay if this fails now, we gave cleanup our best effort. 1238 OnSnapshotMergeComplete(lock, name, snapshot_status); 1239 return MergeResult(UpdateState::MergeCompleted); 1240 } 1241 1242 LOG(ERROR) << "Expected snapshot or snapshot-merge for device: " << name; 1243 return MergeResult(UpdateState::MergeFailed, MergeFailureCode::UnknownTargetType); 1244 } 1245 1246 // This check is expensive so it is only enabled for debugging. 1247 DCHECK((current_metadata = ReadCurrentMetadata()) && 1248 GetMetadataPartitionState(*current_metadata, name) == MetadataPartitionState::Updated); 1249 1250 if (UpdateUsesUserSnapshots(lock)) { 1251 std::string merge_status; 1252 if (EnsureSnapuserdConnected()) { 1253 // Query the snapshot status from the daemon 1254 merge_status = snapuserd_client_->QuerySnapshotStatus(name); 1255 } else { 1256 MergeResult(UpdateState::MergeFailed, MergeFailureCode::QuerySnapshotStatus); 1257 } 1258 1259 if (merge_status == "snapshot-merge-failed") { 1260 return MergeResult(UpdateState::MergeFailed, MergeFailureCode::UnknownTargetType); 1261 } 1262 1263 // This is the case when device reboots during merge. Once the device boots, 1264 // snapuserd daemon will not resume merge immediately in first stage init. 1265 // This is slightly different as compared to dm-snapshot-merge; In this 1266 // case, metadata file will have "MERGING" state whereas the daemon will be 1267 // waiting to resume the merge. Thus, we resume the merge at this point. 1268 if (merge_status == "snapshot" && snapshot_status.state() == SnapshotState::MERGING) { 1269 if (!snapuserd_client_->InitiateMerge(name)) { 1270 return MergeResult(UpdateState::MergeFailed, MergeFailureCode::UnknownTargetType); 1271 } 1272 return MergeResult(UpdateState::Merging); 1273 } 1274 1275 if (merge_status == "snapshot" && 1276 DecideMergePhase(snapshot_status) == MergePhase::SECOND_PHASE && 1277 update_status.merge_phase() == MergePhase::FIRST_PHASE) { 1278 // The snapshot is not being merged because it's in the wrong phase. 1279 return MergeResult(UpdateState::None); 1280 } 1281 1282 if (merge_status == "snapshot-merge") { 1283 if (snapshot_status.state() == SnapshotState::MERGE_COMPLETED) { 1284 LOG(ERROR) << "Snapshot " << name 1285 << " is merging after being marked merge-complete."; 1286 return MergeResult(UpdateState::MergeFailed, 1287 MergeFailureCode::UnmergedSectorsAfterCompletion); 1288 } 1289 return MergeResult(UpdateState::Merging); 1290 } 1291 1292 if (merge_status != "snapshot-merge-complete") { 1293 LOG(ERROR) << "Snapshot " << name << " has incorrect status: " << merge_status; 1294 return MergeResult(UpdateState::MergeFailed, MergeFailureCode::ExpectedMergeTarget); 1295 } 1296 } else { 1297 // dm-snapshot in the kernel 1298 std::string target_type; 1299 DmTargetSnapshot::Status status; 1300 if (!QuerySnapshotStatus(name, &target_type, &status)) { 1301 return MergeResult(UpdateState::MergeFailed, MergeFailureCode::QuerySnapshotStatus); 1302 } 1303 if (target_type == "snapshot" && 1304 DecideMergePhase(snapshot_status) == MergePhase::SECOND_PHASE && 1305 update_status.merge_phase() == MergePhase::FIRST_PHASE) { 1306 // The snapshot is not being merged because it's in the wrong phase. 1307 return MergeResult(UpdateState::None); 1308 } 1309 if (target_type != "snapshot-merge") { 1310 // We can get here if we failed to rewrite the target type in 1311 // InitiateMerge(). If we failed to create the target in first-stage 1312 // init, boot would not succeed. 1313 LOG(ERROR) << "Snapshot " << name << " has incorrect target type: " << target_type; 1314 return MergeResult(UpdateState::MergeFailed, MergeFailureCode::ExpectedMergeTarget); 1315 } 1316 1317 // These two values are equal when merging is complete. 1318 if (status.sectors_allocated != status.metadata_sectors) { 1319 if (snapshot_status.state() == SnapshotState::MERGE_COMPLETED) { 1320 LOG(ERROR) << "Snapshot " << name 1321 << " is merging after being marked merge-complete."; 1322 return MergeResult(UpdateState::MergeFailed, 1323 MergeFailureCode::UnmergedSectorsAfterCompletion); 1324 } 1325 return MergeResult(UpdateState::Merging); 1326 } 1327 } 1328 1329 // Merge is complete at this point 1330 1331 auto code = CheckMergeConsistency(lock, name, snapshot_status); 1332 if (code != MergeFailureCode::Ok) { 1333 return MergeResult(UpdateState::MergeFailed, code); 1334 } 1335 1336 // Merging is done. First, update the status file to indicate the merge 1337 // is complete. We do this before calling OnSnapshotMergeComplete, even 1338 // though this means the write is potentially wasted work (since in the 1339 // ideal case we'll immediately delete the file). 1340 // 1341 // This makes it simpler to reason about the next reboot: no matter what 1342 // part of cleanup failed, first-stage init won't try to create another 1343 // snapshot device for this partition. 1344 snapshot_status.set_state(SnapshotState::MERGE_COMPLETED); 1345 if (!WriteSnapshotStatus(lock, snapshot_status)) { 1346 return MergeResult(UpdateState::MergeFailed, MergeFailureCode::WriteStatus); 1347 } 1348 if (!OnSnapshotMergeComplete(lock, name, snapshot_status)) { 1349 return MergeResult(UpdateState::MergeNeedsReboot); 1350 } 1351 return MergeResult(UpdateState::MergeCompleted, MergeFailureCode::Ok); 1352 } 1353 1354 // This returns the backing device, not the dm-user layer. GetMappedCowDeviceName(const std::string & snapshot,const SnapshotStatus & status)1355 static std::string GetMappedCowDeviceName(const std::string& snapshot, 1356 const SnapshotStatus& status) { 1357 // If no partition was created (the COW exists entirely on /data), the 1358 // device-mapper layering is different than if we had a partition. 1359 if (status.cow_partition_size() == 0) { 1360 return GetCowImageDeviceName(snapshot); 1361 } 1362 return GetCowName(snapshot); 1363 } 1364 CheckMergeConsistency(LockedFile * lock,const std::string & name,const SnapshotStatus & status)1365 MergeFailureCode SnapshotManager::CheckMergeConsistency(LockedFile* lock, const std::string& name, 1366 const SnapshotStatus& status) { 1367 CHECK(lock); 1368 1369 return merge_consistency_checker_(name, status); 1370 } 1371 CheckMergeConsistency(const std::string & name,const SnapshotStatus & status)1372 MergeFailureCode CheckMergeConsistency(const std::string& name, const SnapshotStatus& status) { 1373 if (!status.using_snapuserd()) { 1374 // Do not try to verify old-style COWs yet. 1375 return MergeFailureCode::Ok; 1376 } 1377 1378 auto& dm = DeviceMapper::Instance(); 1379 1380 std::string cow_image_name = GetMappedCowDeviceName(name, status); 1381 std::string cow_image_path; 1382 if (!dm.GetDmDevicePathByName(cow_image_name, &cow_image_path)) { 1383 LOG(ERROR) << "Failed to get path for cow device: " << cow_image_name; 1384 return MergeFailureCode::GetCowPathConsistencyCheck; 1385 } 1386 1387 // First pass, count # of ops. 1388 size_t num_ops = 0; 1389 { 1390 unique_fd fd(open(cow_image_path.c_str(), O_RDONLY | O_CLOEXEC)); 1391 if (fd < 0) { 1392 PLOG(ERROR) << "Failed to open " << cow_image_name; 1393 return MergeFailureCode::OpenCowConsistencyCheck; 1394 } 1395 1396 CowReader reader; 1397 if (!reader.Parse(std::move(fd))) { 1398 LOG(ERROR) << "Failed to parse cow " << cow_image_path; 1399 return MergeFailureCode::ParseCowConsistencyCheck; 1400 } 1401 1402 num_ops = reader.get_num_total_data_ops(); 1403 } 1404 1405 // Second pass, try as hard as we can to get the actual number of blocks 1406 // the system thinks is merged. 1407 unique_fd fd(open(cow_image_path.c_str(), O_RDONLY | O_DIRECT | O_SYNC | O_CLOEXEC)); 1408 if (fd < 0) { 1409 PLOG(ERROR) << "Failed to open direct " << cow_image_name; 1410 return MergeFailureCode::OpenCowDirectConsistencyCheck; 1411 } 1412 1413 void* addr; 1414 size_t page_size = getpagesize(); 1415 if (posix_memalign(&addr, page_size, page_size) < 0) { 1416 PLOG(ERROR) << "posix_memalign with page size " << page_size; 1417 return MergeFailureCode::MemAlignConsistencyCheck; 1418 } 1419 1420 // COWs are always at least 2MB, this is guaranteed in snapshot creation. 1421 std::unique_ptr<void, decltype(&::free)> buffer(addr, ::free); 1422 if (!android::base::ReadFully(fd, buffer.get(), page_size)) { 1423 PLOG(ERROR) << "Direct read failed " << cow_image_name; 1424 return MergeFailureCode::DirectReadConsistencyCheck; 1425 } 1426 1427 auto header = reinterpret_cast<CowHeader*>(buffer.get()); 1428 if (header->num_merge_ops != num_ops) { 1429 LOG(ERROR) << "COW consistency check failed, expected " << num_ops << " to be merged, " 1430 << "but " << header->num_merge_ops << " were actually recorded."; 1431 LOG(ERROR) << "Aborting merge progress for snapshot " << name 1432 << ", will try again next boot"; 1433 return MergeFailureCode::WrongMergeCountConsistencyCheck; 1434 } 1435 1436 return MergeFailureCode::Ok; 1437 } 1438 MergeSecondPhaseSnapshots(LockedFile * lock)1439 MergeFailureCode SnapshotManager::MergeSecondPhaseSnapshots(LockedFile* lock) { 1440 std::vector<std::string> snapshots; 1441 if (!ListSnapshots(lock, &snapshots)) { 1442 return MergeFailureCode::ListSnapshots; 1443 } 1444 1445 SnapshotUpdateStatus update_status = ReadSnapshotUpdateStatus(lock); 1446 CHECK(update_status.state() == UpdateState::Merging || 1447 update_status.state() == UpdateState::MergeFailed); 1448 CHECK(update_status.merge_phase() == MergePhase::FIRST_PHASE); 1449 1450 update_status.set_state(UpdateState::Merging); 1451 update_status.set_merge_phase(MergePhase::SECOND_PHASE); 1452 if (!WriteSnapshotUpdateStatus(lock, update_status)) { 1453 return MergeFailureCode::WriteStatus; 1454 } 1455 1456 MergeFailureCode result = MergeFailureCode::Ok; 1457 for (const auto& snapshot : snapshots) { 1458 SnapshotStatus snapshot_status; 1459 if (!ReadSnapshotStatus(lock, snapshot, &snapshot_status)) { 1460 return MergeFailureCode::ReadStatus; 1461 } 1462 if (DecideMergePhase(snapshot_status) != MergePhase::SECOND_PHASE) { 1463 continue; 1464 } 1465 auto code = SwitchSnapshotToMerge(lock, snapshot); 1466 if (code != MergeFailureCode::Ok) { 1467 LOG(ERROR) << "Failed to switch snapshot to a second-phase merge target: " << snapshot; 1468 if (result == MergeFailureCode::Ok) { 1469 result = code; 1470 } 1471 } 1472 } 1473 return result; 1474 } 1475 GetSnapshotBootIndicatorPath()1476 std::string SnapshotManager::GetSnapshotBootIndicatorPath() { 1477 return metadata_dir_ + "/" + android::base::Basename(kBootIndicatorPath); 1478 } 1479 GetRollbackIndicatorPath()1480 std::string SnapshotManager::GetRollbackIndicatorPath() { 1481 return metadata_dir_ + "/" + android::base::Basename(kRollbackIndicatorPath); 1482 } 1483 GetForwardMergeIndicatorPath()1484 std::string SnapshotManager::GetForwardMergeIndicatorPath() { 1485 return metadata_dir_ + "/allow-forward-merge"; 1486 } 1487 GetOldPartitionMetadataPath()1488 std::string SnapshotManager::GetOldPartitionMetadataPath() { 1489 return metadata_dir_ + "/old-partition-metadata"; 1490 } 1491 AcknowledgeMergeSuccess(LockedFile * lock)1492 void SnapshotManager::AcknowledgeMergeSuccess(LockedFile* lock) { 1493 // It's not possible to remove update state in recovery, so write an 1494 // indicator that cleanup is needed on reboot. If a factory data reset 1495 // was requested, it doesn't matter, everything will get wiped anyway. 1496 // To make testing easier we consider a /data wipe as cleaned up. 1497 if (device_->IsRecovery()) { 1498 WriteUpdateState(lock, UpdateState::MergeCompleted); 1499 return; 1500 } 1501 1502 RemoveAllUpdateState(lock); 1503 1504 if (UpdateUsesUserSnapshots(lock) && !device()->IsTestDevice()) { 1505 if (snapuserd_client_) { 1506 snapuserd_client_->DetachSnapuserd(); 1507 snapuserd_client_->RemoveTransitionedDaemonIndicator(); 1508 snapuserd_client_ = nullptr; 1509 } 1510 } 1511 } 1512 AcknowledgeMergeFailure(MergeFailureCode failure_code)1513 void SnapshotManager::AcknowledgeMergeFailure(MergeFailureCode failure_code) { 1514 // Log first, so worst case, we always have a record of why the calls below 1515 // were being made. 1516 LOG(ERROR) << "Merge could not be completed and will be marked as failed."; 1517 1518 auto lock = LockExclusive(); 1519 if (!lock) return; 1520 1521 // Since we released the lock in between WaitForMerge and here, it's 1522 // possible (1) the merge successfully completed or (2) was already 1523 // marked as a failure. So make sure to check the state again, and 1524 // only mark as a failure if appropriate. 1525 UpdateState state = ReadUpdateState(lock.get()); 1526 if (state != UpdateState::Merging && state != UpdateState::MergeNeedsReboot) { 1527 return; 1528 } 1529 1530 WriteUpdateState(lock.get(), UpdateState::MergeFailed, failure_code); 1531 } 1532 OnSnapshotMergeComplete(LockedFile * lock,const std::string & name,const SnapshotStatus & status)1533 bool SnapshotManager::OnSnapshotMergeComplete(LockedFile* lock, const std::string& name, 1534 const SnapshotStatus& status) { 1535 if (!UpdateUsesUserSnapshots(lock)) { 1536 if (IsSnapshotDevice(name)) { 1537 // We are extra-cautious here, to avoid deleting the wrong table. 1538 std::string target_type; 1539 DmTargetSnapshot::Status dm_status; 1540 if (!QuerySnapshotStatus(name, &target_type, &dm_status)) { 1541 return false; 1542 } 1543 if (target_type != "snapshot-merge") { 1544 LOG(ERROR) << "Unexpected target type " << target_type 1545 << " for snapshot device: " << name; 1546 return false; 1547 } 1548 if (dm_status.sectors_allocated != dm_status.metadata_sectors) { 1549 LOG(ERROR) << "Merge is unexpectedly incomplete for device " << name; 1550 return false; 1551 } 1552 if (!CollapseSnapshotDevice(lock, name, status)) { 1553 LOG(ERROR) << "Unable to collapse snapshot: " << name; 1554 return false; 1555 } 1556 } 1557 } else { 1558 // Just collapse the device - no need to query again as we just did 1559 // prior to calling this function 1560 if (!CollapseSnapshotDevice(lock, name, status)) { 1561 LOG(ERROR) << "Unable to collapse snapshot: " << name; 1562 return false; 1563 } 1564 } 1565 1566 // Note that collapsing is implicitly an Unmap, so we don't need to 1567 // unmap the snapshot. 1568 1569 if (!DeleteSnapshot(lock, name)) { 1570 LOG(ERROR) << "Could not delete snapshot: " << name; 1571 return false; 1572 } 1573 return true; 1574 } 1575 CollapseSnapshotDevice(LockedFile * lock,const std::string & name,const SnapshotStatus & status)1576 bool SnapshotManager::CollapseSnapshotDevice(LockedFile* lock, const std::string& name, 1577 const SnapshotStatus& status) { 1578 if (!UpdateUsesUserSnapshots(lock)) { 1579 // Verify we have a snapshot-merge device. 1580 DeviceMapper::TargetInfo target; 1581 if (!GetSingleTarget(name, TableQuery::Table, &target)) { 1582 return false; 1583 } 1584 if (DeviceMapper::GetTargetType(target.spec) != "snapshot-merge") { 1585 // This should be impossible, it was checked earlier. 1586 LOG(ERROR) << "Snapshot device has invalid target type: " << name; 1587 return false; 1588 } 1589 1590 std::string base_device, cow_device; 1591 if (!DmTargetSnapshot::GetDevicesFromParams(target.data, &base_device, &cow_device)) { 1592 LOG(ERROR) << "Could not parse snapshot device " << name 1593 << " parameters: " << target.data; 1594 return false; 1595 } 1596 } 1597 1598 uint64_t snapshot_sectors = status.snapshot_size() / kSectorSize; 1599 if (snapshot_sectors * kSectorSize != status.snapshot_size()) { 1600 LOG(ERROR) << "Snapshot " << name 1601 << " size is not sector aligned: " << status.snapshot_size(); 1602 return false; 1603 } 1604 1605 uint32_t slot = SlotNumberForSlotSuffix(device_->GetSlotSuffix()); 1606 // Create a DmTable that is identical to the base device. 1607 CreateLogicalPartitionParams base_device_params{ 1608 .block_device = device_->GetSuperDevice(slot), 1609 .metadata_slot = slot, 1610 .partition_name = name, 1611 .partition_opener = &device_->GetPartitionOpener(), 1612 }; 1613 DmTable table; 1614 if (!CreateDmTable(base_device_params, &table)) { 1615 LOG(ERROR) << "Could not create a DmTable for partition: " << name; 1616 return false; 1617 } 1618 1619 if (!dm_.LoadTableAndActivate(name, table)) { 1620 return false; 1621 } 1622 1623 if (!UpdateUsesUserSnapshots(lock)) { 1624 // Attempt to delete the snapshot device if one still exists. Nothing 1625 // should be depending on the device, and device-mapper should have 1626 // flushed remaining I/O. We could in theory replace with dm-zero (or 1627 // re-use the table above), but for now it's better to know why this 1628 // would fail. 1629 // 1630 // Furthermore, we should not be trying to unmap for userspace snapshot 1631 // as unmap will fail since dm-user itself was a snapshot device prior 1632 // to switching of tables. Unmap will fail as the device will be mounted 1633 // by system partitions 1634 if (status.using_snapuserd()) { 1635 auto dm_user_name = GetDmUserCowName(name, GetSnapshotDriver(lock)); 1636 UnmapDmUserDevice(dm_user_name); 1637 } 1638 } 1639 1640 // We can't delete base device immediately as daemon holds a reference. 1641 // Make sure we wait for all the worker threads to terminate and release 1642 // the reference 1643 if (UpdateUsesUserSnapshots(lock) && EnsureSnapuserdConnected()) { 1644 if (!snapuserd_client_->WaitForDeviceDelete(name)) { 1645 LOG(ERROR) << "Failed to wait for " << name << " control device to delete"; 1646 } 1647 } 1648 1649 auto base_name = GetBaseDeviceName(name); 1650 if (!DeleteDeviceIfExists(base_name)) { 1651 LOG(ERROR) << "Unable to delete base device for snapshot: " << base_name; 1652 } 1653 1654 if (!DeleteDeviceIfExists(GetSourceDeviceName(name), 4000ms)) { 1655 LOG(ERROR) << "Unable to delete source device for snapshot: " << GetSourceDeviceName(name); 1656 } 1657 1658 return true; 1659 } 1660 HandleCancelledUpdate(LockedFile * lock,const std::function<bool ()> & before_cancel)1661 bool SnapshotManager::HandleCancelledUpdate(LockedFile* lock, 1662 const std::function<bool()>& before_cancel) { 1663 auto slot = GetCurrentSlot(); 1664 if (slot == Slot::Unknown) { 1665 return false; 1666 } 1667 1668 // If all snapshots were reflashed, then cancel the entire update. 1669 if (AreAllSnapshotsCancelled(lock)) { 1670 LOG(WARNING) << "Detected re-flashing, cancelling unverified update."; 1671 return RemoveAllUpdateState(lock, before_cancel); 1672 } 1673 1674 // If update has been rolled back, then cancel the entire update. 1675 // Client (update_engine) is responsible for doing additional cleanup work on its own states 1676 // when ProcessUpdateState() returns UpdateState::Cancelled. 1677 auto current_slot = GetCurrentSlot(); 1678 if (current_slot != Slot::Source) { 1679 LOG(INFO) << "Update state is being processed while booting at " << current_slot 1680 << " slot, taking no action."; 1681 return false; 1682 } 1683 1684 // current_slot == Source. Attempt to detect rollbacks. 1685 if (access(GetRollbackIndicatorPath().c_str(), F_OK) != 0) { 1686 // This unverified update is not attempted. Take no action. 1687 PLOG(INFO) << "Rollback indicator not detected. " 1688 << "Update state is being processed before reboot, taking no action."; 1689 return false; 1690 } 1691 1692 LOG(WARNING) << "Detected rollback, cancelling unverified update."; 1693 return RemoveAllUpdateState(lock, before_cancel); 1694 } 1695 PerformInitTransition(InitTransition transition,std::vector<std::string> * snapuserd_argv)1696 bool SnapshotManager::PerformInitTransition(InitTransition transition, 1697 std::vector<std::string>* snapuserd_argv) { 1698 LOG(INFO) << "Performing transition for snapuserd."; 1699 1700 // Don't use EnsureSnapuserdConnected() because this is called from init, 1701 // and attempting to do so will deadlock. 1702 if (!snapuserd_client_ && transition != InitTransition::SELINUX_DETACH) { 1703 snapuserd_client_ = SnapuserdClient::Connect(kSnapuserdSocket, 10s); 1704 if (!snapuserd_client_) { 1705 LOG(ERROR) << "Unable to connect to snapuserd"; 1706 return false; 1707 } 1708 } 1709 1710 auto lock = LockExclusive(); 1711 if (!lock) return false; 1712 1713 std::vector<std::string> snapshots; 1714 if (!ListSnapshots(lock.get(), &snapshots)) { 1715 LOG(ERROR) << "Failed to list snapshots."; 1716 return false; 1717 } 1718 1719 if (UpdateUsesUserSnapshots(lock.get()) && transition == InitTransition::SELINUX_DETACH) { 1720 snapuserd_argv->emplace_back("-user_snapshot"); 1721 if (UpdateUsesIouring(lock.get())) { 1722 snapuserd_argv->emplace_back("-io_uring"); 1723 } 1724 } 1725 1726 size_t num_cows = 0; 1727 size_t ok_cows = 0; 1728 for (const auto& snapshot : snapshots) { 1729 std::string user_cow_name = GetDmUserCowName(snapshot, GetSnapshotDriver(lock.get())); 1730 1731 if (dm_.GetState(user_cow_name) == DmDeviceState::INVALID) { 1732 continue; 1733 } 1734 1735 DeviceMapper::TargetInfo target; 1736 if (!GetSingleTarget(user_cow_name, TableQuery::Table, &target)) { 1737 continue; 1738 } 1739 1740 auto target_type = DeviceMapper::GetTargetType(target.spec); 1741 if (target_type != "user") { 1742 LOG(ERROR) << "Unexpected target type for " << user_cow_name << ": " << target_type; 1743 continue; 1744 } 1745 1746 num_cows++; 1747 1748 SnapshotStatus snapshot_status; 1749 if (!ReadSnapshotStatus(lock.get(), snapshot, &snapshot_status)) { 1750 LOG(ERROR) << "Unable to read snapshot status: " << snapshot; 1751 continue; 1752 } 1753 1754 auto misc_name = user_cow_name; 1755 1756 std::string source_device_name; 1757 if (snapshot_status.old_partition_size() > 0) { 1758 source_device_name = GetSourceDeviceName(snapshot); 1759 } else { 1760 source_device_name = GetBaseDeviceName(snapshot); 1761 } 1762 1763 std::string source_device; 1764 if (!dm_.GetDmDevicePathByName(source_device_name, &source_device)) { 1765 LOG(ERROR) << "Could not get device path for " << GetSourceDeviceName(snapshot); 1766 continue; 1767 } 1768 1769 std::string base_path_merge; 1770 if (!dm_.GetDmDevicePathByName(GetBaseDeviceName(snapshot), &base_path_merge)) { 1771 LOG(ERROR) << "Could not get device path for " << GetSourceDeviceName(snapshot); 1772 continue; 1773 } 1774 1775 std::string cow_image_name = GetMappedCowDeviceName(snapshot, snapshot_status); 1776 1777 std::string cow_image_device; 1778 if (!dm_.GetDmDevicePathByName(cow_image_name, &cow_image_device)) { 1779 LOG(ERROR) << "Could not get device path for " << cow_image_name; 1780 continue; 1781 } 1782 1783 if (transition == InitTransition::SELINUX_DETACH) { 1784 if (!UpdateUsesUserSnapshots(lock.get())) { 1785 auto message = misc_name + "," + cow_image_device + "," + source_device; 1786 snapuserd_argv->emplace_back(std::move(message)); 1787 } else { 1788 auto message = misc_name + "," + cow_image_device + "," + source_device + "," + 1789 base_path_merge; 1790 snapuserd_argv->emplace_back(std::move(message)); 1791 } 1792 1793 // Do not attempt to connect to the new snapuserd yet, it hasn't 1794 // been started. We do however want to wait for the misc device 1795 // to have been created. 1796 ok_cows++; 1797 continue; 1798 } 1799 1800 DmTable table; 1801 table.Emplace<DmTargetUser>(0, target.spec.length, misc_name); 1802 if (!dm_.LoadTableAndActivate(user_cow_name, table)) { 1803 LOG(ERROR) << "Unable to swap tables for " << misc_name; 1804 continue; 1805 } 1806 1807 // Wait for ueventd to acknowledge and create the control device node. 1808 std::string control_device = "/dev/dm-user/" + misc_name; 1809 if (!WaitForDevice(control_device, 10s)) { 1810 LOG(ERROR) << "dm-user control device no found: " << misc_name; 1811 continue; 1812 } 1813 1814 uint64_t base_sectors; 1815 if (!UpdateUsesUserSnapshots(lock.get())) { 1816 base_sectors = 1817 snapuserd_client_->InitDmUserCow(misc_name, cow_image_device, source_device); 1818 } else { 1819 base_sectors = snapuserd_client_->InitDmUserCow(misc_name, cow_image_device, 1820 source_device, base_path_merge); 1821 } 1822 1823 if (base_sectors == 0) { 1824 // Unrecoverable as metadata reads from cow device failed 1825 LOG(FATAL) << "Failed to retrieve base_sectors from Snapuserd"; 1826 return false; 1827 } 1828 1829 CHECK(base_sectors <= target.spec.length); 1830 1831 if (!snapuserd_client_->AttachDmUser(misc_name)) { 1832 // This error is unrecoverable. We cannot proceed because reads to 1833 // the underlying device will fail. 1834 LOG(FATAL) << "Could not initialize snapuserd for " << user_cow_name; 1835 return false; 1836 } 1837 1838 ok_cows++; 1839 } 1840 1841 if (ok_cows != num_cows) { 1842 LOG(ERROR) << "Could not transition all snapuserd consumers."; 1843 return false; 1844 } 1845 return true; 1846 } 1847 ReadCurrentMetadata()1848 std::unique_ptr<LpMetadata> SnapshotManager::ReadCurrentMetadata() { 1849 const auto& opener = device_->GetPartitionOpener(); 1850 uint32_t slot = SlotNumberForSlotSuffix(device_->GetSlotSuffix()); 1851 auto super_device = device_->GetSuperDevice(slot); 1852 auto metadata = android::fs_mgr::ReadMetadata(opener, super_device, slot); 1853 if (!metadata) { 1854 LOG(ERROR) << "Could not read dynamic partition metadata for device: " << super_device; 1855 return nullptr; 1856 } 1857 return metadata; 1858 } 1859 GetMetadataPartitionState(const LpMetadata & metadata,const std::string & name)1860 SnapshotManager::MetadataPartitionState SnapshotManager::GetMetadataPartitionState( 1861 const LpMetadata& metadata, const std::string& name) { 1862 auto partition = android::fs_mgr::FindPartition(metadata, name); 1863 if (!partition) return MetadataPartitionState::None; 1864 if (partition->attributes & LP_PARTITION_ATTR_UPDATED) { 1865 return MetadataPartitionState::Updated; 1866 } 1867 return MetadataPartitionState::Flashed; 1868 } 1869 AreAllSnapshotsCancelled(LockedFile * lock)1870 bool SnapshotManager::AreAllSnapshotsCancelled(LockedFile* lock) { 1871 std::vector<std::string> snapshots; 1872 if (!ListSnapshots(lock, &snapshots)) { 1873 LOG(WARNING) << "Failed to list snapshots to determine whether device has been flashed " 1874 << "after applying an update. Assuming no snapshots."; 1875 // Let HandleCancelledUpdate resets UpdateState. 1876 return true; 1877 } 1878 1879 std::map<std::string, bool> flashing_status; 1880 1881 if (!GetSnapshotFlashingStatus(lock, snapshots, &flashing_status)) { 1882 LOG(WARNING) << "Failed to determine whether partitions have been flashed. Not" 1883 << "removing update states."; 1884 return false; 1885 } 1886 1887 bool all_snapshots_cancelled = std::all_of(flashing_status.begin(), flashing_status.end(), 1888 [](const auto& pair) { return pair.second; }); 1889 1890 if (all_snapshots_cancelled) { 1891 LOG(WARNING) << "All partitions are re-flashed after update, removing all update states."; 1892 } 1893 return all_snapshots_cancelled; 1894 } 1895 GetSnapshotFlashingStatus(LockedFile * lock,const std::vector<std::string> & snapshots,std::map<std::string,bool> * out)1896 bool SnapshotManager::GetSnapshotFlashingStatus(LockedFile* lock, 1897 const std::vector<std::string>& snapshots, 1898 std::map<std::string, bool>* out) { 1899 CHECK(lock); 1900 1901 auto source_slot_suffix = ReadUpdateSourceSlotSuffix(); 1902 if (source_slot_suffix.empty()) { 1903 return false; 1904 } 1905 uint32_t source_slot = SlotNumberForSlotSuffix(source_slot_suffix); 1906 uint32_t target_slot = (source_slot == 0) ? 1 : 0; 1907 1908 // Attempt to detect re-flashing on each partition. 1909 // - If all partitions are re-flashed, we can proceed to cancel the whole update. 1910 // - If only some of the partitions are re-flashed, snapshots for re-flashed partitions are 1911 // deleted. Caller is responsible for merging the rest of the snapshots. 1912 // - If none of the partitions are re-flashed, caller is responsible for merging the snapshots. 1913 // 1914 // Note that we use target slot metadata, since if an OTA has been applied 1915 // to the target slot, we can detect the UPDATED flag. Any kind of flash 1916 // operation against dynamic partitions ensures that all copies of the 1917 // metadata are in sync, so flashing all partitions on the source slot will 1918 // remove the UPDATED flag on the target slot as well. 1919 const auto& opener = device_->GetPartitionOpener(); 1920 auto super_device = device_->GetSuperDevice(target_slot); 1921 auto metadata = android::fs_mgr::ReadMetadata(opener, super_device, target_slot); 1922 if (!metadata) { 1923 return false; 1924 } 1925 1926 for (const auto& snapshot_name : snapshots) { 1927 if (GetMetadataPartitionState(*metadata, snapshot_name) == 1928 MetadataPartitionState::Updated) { 1929 out->emplace(snapshot_name, false); 1930 } else { 1931 // Delete snapshots for partitions that are re-flashed after the update. 1932 LOG(WARNING) << "Detected re-flashing of partition " << snapshot_name << "."; 1933 out->emplace(snapshot_name, true); 1934 } 1935 } 1936 return true; 1937 } 1938 RemoveInvalidSnapshots(LockedFile * lock)1939 void SnapshotManager::RemoveInvalidSnapshots(LockedFile* lock) { 1940 std::vector<std::string> snapshots; 1941 1942 // Remove the stale snapshot metadata 1943 // 1944 // We make sure that all the three cases 1945 // are valid before removing the snapshot metadata: 1946 // 1947 // 1: dm state is active 1948 // 2: Root fs is not mounted off as a snapshot device 1949 // 3: Snapshot slot suffix should match current device slot 1950 if (!ListSnapshots(lock, &snapshots, device_->GetSlotSuffix()) || snapshots.empty()) { 1951 return; 1952 } 1953 1954 // We indeed have some invalid snapshots 1955 for (const auto& name : snapshots) { 1956 if (dm_.GetState(name) == DmDeviceState::ACTIVE && !IsSnapshotDevice(name)) { 1957 if (!DeleteSnapshot(lock, name)) { 1958 LOG(ERROR) << "Failed to delete invalid snapshot: " << name; 1959 } else { 1960 LOG(INFO) << "Invalid snapshot: " << name << " deleted"; 1961 } 1962 } 1963 } 1964 } 1965 RemoveAllSnapshots(LockedFile * lock)1966 bool SnapshotManager::RemoveAllSnapshots(LockedFile* lock) { 1967 std::vector<std::string> snapshots; 1968 if (!ListSnapshots(lock, &snapshots)) { 1969 LOG(ERROR) << "Could not list snapshots"; 1970 return false; 1971 } 1972 1973 std::map<std::string, bool> flashing_status; 1974 if (!GetSnapshotFlashingStatus(lock, snapshots, &flashing_status)) { 1975 LOG(WARNING) << "Failed to get flashing status"; 1976 } 1977 1978 auto current_slot = GetCurrentSlot(); 1979 bool ok = true; 1980 bool has_mapped_cow_images = false; 1981 for (const auto& name : snapshots) { 1982 // If booting off source slot, it is okay to unmap and delete all the snapshots. 1983 // If boot indicator is missing, update state is None or Initiated, so 1984 // it is also okay to unmap and delete all the snapshots. 1985 // If booting off target slot, 1986 // - should not unmap because: 1987 // - In Android mode, snapshots are not mapped, but 1988 // filesystems are mounting off dm-linear targets directly. 1989 // - In recovery mode, assume nothing is mapped, so it is optional to unmap. 1990 // - If partition is flashed or unknown, it is okay to delete snapshots. 1991 // Otherwise (UPDATED flag), only delete snapshots if they are not mapped 1992 // as dm-snapshot (for example, after merge completes). 1993 bool should_unmap = current_slot != Slot::Target; 1994 bool should_delete = ShouldDeleteSnapshot(flashing_status, current_slot, name); 1995 if (should_unmap && android::base::EndsWith(name, device_->GetSlotSuffix())) { 1996 // Something very unexpected has happened - we want to unmap this 1997 // snapshot, but it's on the wrong slot. We can't unmap an active 1998 // partition. If this is not really a snapshot, skip the unmap 1999 // step. 2000 if (dm_.GetState(name) == DmDeviceState::INVALID || !IsSnapshotDevice(name)) { 2001 LOG(ERROR) << "Detected snapshot " << name << " on " << current_slot << " slot" 2002 << " for source partition; removing without unmap."; 2003 should_unmap = false; 2004 } 2005 } 2006 2007 bool partition_ok = true; 2008 if (should_unmap && !UnmapPartitionWithSnapshot(lock, name)) { 2009 partition_ok = false; 2010 } 2011 if (partition_ok && should_delete && !DeleteSnapshot(lock, name)) { 2012 partition_ok = false; 2013 } 2014 2015 if (!partition_ok) { 2016 // Remember whether or not we were able to unmap the cow image. 2017 auto cow_image_device = GetCowImageDeviceName(name); 2018 has_mapped_cow_images |= 2019 (EnsureImageManager() && images_->IsImageMapped(cow_image_device)); 2020 2021 ok = false; 2022 } 2023 } 2024 2025 if (ok || !has_mapped_cow_images) { 2026 // Delete any image artifacts as a precaution, in case an update is 2027 // being cancelled due to some corrupted state in an lp_metadata file. 2028 // Note that we do not do this if some cow images are still mapped, 2029 // since we must not remove backing storage if it's in use. 2030 if (!EnsureImageManager() || !images_->RemoveAllImages()) { 2031 LOG(ERROR) << "Could not remove all snapshot artifacts"; 2032 return false; 2033 } 2034 } 2035 return ok; 2036 } 2037 2038 // See comments in RemoveAllSnapshots(). ShouldDeleteSnapshot(const std::map<std::string,bool> & flashing_status,Slot current_slot,const std::string & name)2039 bool SnapshotManager::ShouldDeleteSnapshot(const std::map<std::string, bool>& flashing_status, 2040 Slot current_slot, const std::string& name) { 2041 if (current_slot != Slot::Target) { 2042 return true; 2043 } 2044 auto it = flashing_status.find(name); 2045 if (it == flashing_status.end()) { 2046 LOG(WARNING) << "Can't determine flashing status for " << name; 2047 return true; 2048 } 2049 if (it->second) { 2050 // partition flashed, okay to delete obsolete snapshots 2051 return true; 2052 } 2053 return !IsSnapshotDevice(name); 2054 } 2055 GetUpdateState(double * progress)2056 UpdateState SnapshotManager::GetUpdateState(double* progress) { 2057 // If we've never started an update, the state file won't exist. 2058 auto state_file = GetStateFilePath(); 2059 if (access(state_file.c_str(), F_OK) != 0 && errno == ENOENT) { 2060 return UpdateState::None; 2061 } 2062 2063 auto lock = LockShared(); 2064 if (!lock) { 2065 return UpdateState::None; 2066 } 2067 2068 SnapshotUpdateStatus update_status = ReadSnapshotUpdateStatus(lock.get()); 2069 auto state = update_status.state(); 2070 if (progress == nullptr) { 2071 return state; 2072 } 2073 2074 if (state == UpdateState::MergeCompleted) { 2075 *progress = 100.0; 2076 return state; 2077 } 2078 2079 *progress = 0.0; 2080 if (state != UpdateState::Merging) { 2081 return state; 2082 } 2083 2084 if (!UpdateUsesUserSnapshots(lock.get())) { 2085 // Sum all the snapshot states as if the system consists of a single huge 2086 // snapshots device, then compute the merge completion percentage of that 2087 // device. 2088 std::vector<std::string> snapshots; 2089 if (!ListSnapshots(lock.get(), &snapshots)) { 2090 LOG(ERROR) << "Could not list snapshots"; 2091 return state; 2092 } 2093 2094 DmTargetSnapshot::Status fake_snapshots_status = {}; 2095 for (const auto& snapshot : snapshots) { 2096 DmTargetSnapshot::Status current_status; 2097 2098 if (!IsSnapshotDevice(snapshot)) continue; 2099 if (!QuerySnapshotStatus(snapshot, nullptr, ¤t_status)) continue; 2100 2101 fake_snapshots_status.sectors_allocated += current_status.sectors_allocated; 2102 fake_snapshots_status.total_sectors += current_status.total_sectors; 2103 fake_snapshots_status.metadata_sectors += current_status.metadata_sectors; 2104 } 2105 2106 *progress = DmTargetSnapshot::MergePercent(fake_snapshots_status, 2107 update_status.sectors_allocated()); 2108 } else { 2109 if (EnsureSnapuserdConnected()) { 2110 *progress = snapuserd_client_->GetMergePercent(); 2111 } 2112 } 2113 2114 return state; 2115 } 2116 UpdateUsesCompression()2117 bool SnapshotManager::UpdateUsesCompression() { 2118 auto lock = LockShared(); 2119 if (!lock) return false; 2120 return UpdateUsesCompression(lock.get()); 2121 } 2122 UpdateUsesCompression(LockedFile * lock)2123 bool SnapshotManager::UpdateUsesCompression(LockedFile* lock) { 2124 // This returns true even if compression is "none", since update_engine is 2125 // really just trying to see if snapuserd is in use. 2126 SnapshotUpdateStatus update_status = ReadSnapshotUpdateStatus(lock); 2127 return update_status.using_snapuserd(); 2128 } 2129 UpdateUsesIouring(LockedFile * lock)2130 bool SnapshotManager::UpdateUsesIouring(LockedFile* lock) { 2131 SnapshotUpdateStatus update_status = ReadSnapshotUpdateStatus(lock); 2132 return update_status.io_uring_enabled(); 2133 } 2134 UpdateUsesUserSnapshots()2135 bool SnapshotManager::UpdateUsesUserSnapshots() { 2136 // This and the following function is constantly 2137 // invoked during snapshot merge. We want to avoid 2138 // constantly reading from disk. Hence, store this 2139 // value in memory. 2140 // 2141 // Furthermore, this value in the disk is set 2142 // only when OTA is applied and doesn't change 2143 // during merge phase. Hence, once we know that 2144 // the value is read from disk the very first time, 2145 // it is safe to read successive checks from memory. 2146 if (is_snapshot_userspace_.has_value()) { 2147 return is_snapshot_userspace_.value(); 2148 } 2149 2150 auto lock = LockShared(); 2151 if (!lock) return false; 2152 2153 return UpdateUsesUserSnapshots(lock.get()); 2154 } 2155 UpdateUsesUserSnapshots(LockedFile * lock)2156 bool SnapshotManager::UpdateUsesUserSnapshots(LockedFile* lock) { 2157 // See UpdateUsesUserSnapshots() 2158 if (is_snapshot_userspace_.has_value()) { 2159 return is_snapshot_userspace_.value(); 2160 } 2161 2162 SnapshotUpdateStatus update_status = ReadSnapshotUpdateStatus(lock); 2163 is_snapshot_userspace_ = update_status.userspace_snapshots(); 2164 return is_snapshot_userspace_.value(); 2165 } 2166 ListSnapshots(LockedFile * lock,std::vector<std::string> * snapshots,const std::string & suffix)2167 bool SnapshotManager::ListSnapshots(LockedFile* lock, std::vector<std::string>* snapshots, 2168 const std::string& suffix) { 2169 CHECK(lock); 2170 2171 auto dir_path = metadata_dir_ + "/snapshots"s; 2172 std::unique_ptr<DIR, decltype(&closedir)> dir(opendir(dir_path.c_str()), closedir); 2173 if (!dir) { 2174 PLOG(ERROR) << "opendir failed: " << dir_path; 2175 return false; 2176 } 2177 2178 struct dirent* dp; 2179 while ((dp = readdir(dir.get())) != nullptr) { 2180 if (dp->d_type != DT_REG) continue; 2181 2182 std::string name(dp->d_name); 2183 if (!suffix.empty() && !android::base::EndsWith(name, suffix)) { 2184 continue; 2185 } 2186 2187 // Insert system and product partition at the beginning so that 2188 // during snapshot-merge, these partitions are merged first. 2189 if (name == "system_a" || name == "system_b" || name == "product_a" || 2190 name == "product_b") { 2191 snapshots->insert(snapshots->begin(), std::move(name)); 2192 } else { 2193 snapshots->emplace_back(std::move(name)); 2194 } 2195 } 2196 2197 return true; 2198 } 2199 IsSnapshotManagerNeeded()2200 bool SnapshotManager::IsSnapshotManagerNeeded() { 2201 return access(kBootIndicatorPath, F_OK) == 0; 2202 } 2203 GetGlobalRollbackIndicatorPath()2204 std::string SnapshotManager::GetGlobalRollbackIndicatorPath() { 2205 return kRollbackIndicatorPath; 2206 } 2207 NeedSnapshotsInFirstStageMount()2208 bool SnapshotManager::NeedSnapshotsInFirstStageMount() { 2209 // If we fail to read, we'll wind up using CreateLogicalPartitions, which 2210 // will create devices that look like the old slot, except with extra 2211 // content at the end of each device. This will confuse dm-verity, and 2212 // ultimately we'll fail to boot. Why not make it a fatal error and have 2213 // the reason be clearer? Because the indicator file still exists, and 2214 // if this was FATAL, reverting to the old slot would be broken. 2215 auto slot = GetCurrentSlot(); 2216 2217 if (slot != Slot::Target) { 2218 if (slot == Slot::Source) { 2219 // Device is rebooting into the original slot, so mark this as a 2220 // rollback. 2221 auto path = GetRollbackIndicatorPath(); 2222 if (!android::base::WriteStringToFile("1", path)) { 2223 PLOG(ERROR) << "Unable to write rollback indicator: " << path; 2224 } else { 2225 LOG(INFO) << "Rollback detected, writing rollback indicator to " << path; 2226 } 2227 } 2228 LOG(INFO) << "Not booting from new slot. Will not mount snapshots."; 2229 return false; 2230 } 2231 2232 // If we can't read the update state, it's unlikely anything else will 2233 // succeed, so this is a fatal error. We'll eventually exhaust boot 2234 // attempts and revert to the old slot. 2235 auto lock = LockShared(); 2236 if (!lock) { 2237 LOG(FATAL) << "Could not read update state to determine snapshot status"; 2238 return false; 2239 } 2240 switch (ReadUpdateState(lock.get())) { 2241 case UpdateState::Unverified: 2242 case UpdateState::Merging: 2243 case UpdateState::MergeFailed: 2244 return true; 2245 default: 2246 return false; 2247 } 2248 } 2249 CreateLogicalAndSnapshotPartitions(const std::string & super_device,const std::chrono::milliseconds & timeout_ms)2250 bool SnapshotManager::CreateLogicalAndSnapshotPartitions( 2251 const std::string& super_device, const std::chrono::milliseconds& timeout_ms) { 2252 LOG(INFO) << "Creating logical partitions with snapshots as needed"; 2253 2254 auto lock = LockExclusive(); 2255 if (!lock) return false; 2256 2257 uint32_t slot = SlotNumberForSlotSuffix(device_->GetSlotSuffix()); 2258 return MapAllPartitions(lock.get(), super_device, slot, timeout_ms); 2259 } 2260 MapAllPartitions(LockedFile * lock,const std::string & super_device,uint32_t slot,const std::chrono::milliseconds & timeout_ms)2261 bool SnapshotManager::MapAllPartitions(LockedFile* lock, const std::string& super_device, 2262 uint32_t slot, const std::chrono::milliseconds& timeout_ms) { 2263 const auto& opener = device_->GetPartitionOpener(); 2264 auto metadata = android::fs_mgr::ReadMetadata(opener, super_device, slot); 2265 if (!metadata) { 2266 LOG(ERROR) << "Could not read dynamic partition metadata for device: " << super_device; 2267 return false; 2268 } 2269 2270 if (!EnsureImageManager()) { 2271 return false; 2272 } 2273 2274 for (const auto& partition : metadata->partitions) { 2275 if (GetPartitionGroupName(metadata->groups[partition.group_index]) == kCowGroupName) { 2276 LOG(INFO) << "Skip mapping partition " << GetPartitionName(partition) << " in group " 2277 << kCowGroupName; 2278 continue; 2279 } 2280 2281 CreateLogicalPartitionParams params = { 2282 .block_device = super_device, 2283 .metadata = metadata.get(), 2284 .partition = &partition, 2285 .timeout_ms = timeout_ms, 2286 .partition_opener = &opener, 2287 }; 2288 if (!MapPartitionWithSnapshot(lock, std::move(params), SnapshotContext::Mount, nullptr)) { 2289 return false; 2290 } 2291 } 2292 2293 LOG(INFO) << "Created logical partitions with snapshot."; 2294 return true; 2295 } 2296 GetRemainingTime(const std::chrono::milliseconds & timeout,const std::chrono::time_point<std::chrono::steady_clock> & begin)2297 static std::chrono::milliseconds GetRemainingTime( 2298 const std::chrono::milliseconds& timeout, 2299 const std::chrono::time_point<std::chrono::steady_clock>& begin) { 2300 // If no timeout is specified, execute all commands without specifying any timeout. 2301 if (timeout.count() == 0) return std::chrono::milliseconds(0); 2302 auto passed_time = std::chrono::steady_clock::now() - begin; 2303 auto remaining_time = timeout - duration_cast<std::chrono::milliseconds>(passed_time); 2304 if (remaining_time.count() <= 0) { 2305 LOG(ERROR) << "MapPartitionWithSnapshot has reached timeout " << timeout.count() << "ms (" 2306 << remaining_time.count() << "ms remaining)"; 2307 // Return min() instead of remaining_time here because 0 is treated as a special value for 2308 // no timeout, where the rest of the commands will still be executed. 2309 return std::chrono::milliseconds::min(); 2310 } 2311 return remaining_time; 2312 } 2313 MapPartitionWithSnapshot(LockedFile * lock,CreateLogicalPartitionParams params,SnapshotContext context,SnapshotPaths * paths)2314 bool SnapshotManager::MapPartitionWithSnapshot(LockedFile* lock, 2315 CreateLogicalPartitionParams params, 2316 SnapshotContext context, SnapshotPaths* paths) { 2317 auto begin = std::chrono::steady_clock::now(); 2318 2319 CHECK(lock); 2320 2321 if (params.GetPartitionName() != params.GetDeviceName()) { 2322 LOG(ERROR) << "Mapping snapshot with a different name is unsupported: partition_name = " 2323 << params.GetPartitionName() << ", device_name = " << params.GetDeviceName(); 2324 return false; 2325 } 2326 2327 // Fill out fields in CreateLogicalPartitionParams so that we have more information (e.g. by 2328 // reading super partition metadata). 2329 CreateLogicalPartitionParams::OwnedData params_owned_data; 2330 if (!params.InitDefaults(¶ms_owned_data)) { 2331 return false; 2332 } 2333 2334 if (!params.partition->num_extents) { 2335 LOG(INFO) << "Skipping zero-length logical partition: " << params.GetPartitionName(); 2336 return true; // leave path empty to indicate that nothing is mapped. 2337 } 2338 2339 // Determine if there is a live snapshot for the SnapshotStatus of the partition; i.e. if the 2340 // partition still has a snapshot that needs to be mapped. If no live snapshot or merge 2341 // completed, live_snapshot_status is set to nullopt. 2342 std::optional<SnapshotStatus> live_snapshot_status; 2343 do { 2344 if (!(params.partition->attributes & LP_PARTITION_ATTR_UPDATED)) { 2345 LOG(INFO) << "Detected re-flashing of partition, will skip snapshot: " 2346 << params.GetPartitionName(); 2347 break; 2348 } 2349 auto file_path = GetSnapshotStatusFilePath(params.GetPartitionName()); 2350 if (access(file_path.c_str(), F_OK) != 0) { 2351 if (errno != ENOENT) { 2352 PLOG(INFO) << "Can't map snapshot for " << params.GetPartitionName() 2353 << ": Can't access " << file_path; 2354 return false; 2355 } 2356 break; 2357 } 2358 live_snapshot_status = std::make_optional<SnapshotStatus>(); 2359 if (!ReadSnapshotStatus(lock, params.GetPartitionName(), &*live_snapshot_status)) { 2360 return false; 2361 } 2362 // No live snapshot if merge is completed. 2363 if (live_snapshot_status->state() == SnapshotState::MERGE_COMPLETED) { 2364 live_snapshot_status.reset(); 2365 } 2366 2367 if (live_snapshot_status->state() == SnapshotState::NONE || 2368 live_snapshot_status->cow_partition_size() + live_snapshot_status->cow_file_size() == 2369 0) { 2370 LOG(WARNING) << "Snapshot status for " << params.GetPartitionName() 2371 << " is invalid, ignoring: state = " 2372 << SnapshotState_Name(live_snapshot_status->state()) 2373 << ", cow_partition_size = " << live_snapshot_status->cow_partition_size() 2374 << ", cow_file_size = " << live_snapshot_status->cow_file_size(); 2375 live_snapshot_status.reset(); 2376 } 2377 } while (0); 2378 2379 if (live_snapshot_status.has_value()) { 2380 // dm-snapshot requires the base device to be writable. 2381 params.force_writable = true; 2382 // Map the base device with a different name to avoid collision. 2383 params.device_name = GetBaseDeviceName(params.GetPartitionName()); 2384 } 2385 2386 AutoDeviceList created_devices; 2387 2388 // Create the base device for the snapshot, or if there is no snapshot, the 2389 // device itself. This device consists of the real blocks in the super 2390 // partition that this logical partition occupies. 2391 std::string base_path; 2392 if (!CreateLogicalPartition(params, &base_path)) { 2393 LOG(ERROR) << "Could not create logical partition " << params.GetPartitionName() 2394 << " as device " << params.GetDeviceName(); 2395 return false; 2396 } 2397 created_devices.EmplaceBack<AutoUnmapDevice>(&dm_, params.GetDeviceName()); 2398 2399 if (paths) { 2400 paths->target_device = base_path; 2401 } 2402 2403 auto remaining_time = GetRemainingTime(params.timeout_ms, begin); 2404 if (remaining_time.count() < 0) { 2405 return false; 2406 } 2407 2408 // Wait for the base device to appear 2409 if (!WaitForDevice(base_path, remaining_time)) { 2410 return false; 2411 } 2412 2413 if (!live_snapshot_status.has_value()) { 2414 created_devices.Release(); 2415 return true; 2416 } 2417 2418 // We don't have ueventd in first-stage init, so use device major:minor 2419 // strings instead. 2420 std::string base_device; 2421 if (!dm_.GetDeviceString(params.GetDeviceName(), &base_device)) { 2422 LOG(ERROR) << "Could not determine major/minor for: " << params.GetDeviceName(); 2423 return false; 2424 } 2425 2426 remaining_time = GetRemainingTime(params.timeout_ms, begin); 2427 if (remaining_time.count() < 0) return false; 2428 2429 std::string cow_name; 2430 CreateLogicalPartitionParams cow_params = params; 2431 cow_params.timeout_ms = remaining_time; 2432 if (!MapCowDevices(lock, cow_params, *live_snapshot_status, &created_devices, &cow_name)) { 2433 return false; 2434 } 2435 std::string cow_device; 2436 if (!GetMappedImageDeviceStringOrPath(cow_name, &cow_device)) { 2437 LOG(ERROR) << "Could not determine major/minor for: " << cow_name; 2438 return false; 2439 } 2440 if (paths) { 2441 paths->cow_device_name = cow_name; 2442 } 2443 2444 remaining_time = GetRemainingTime(params.timeout_ms, begin); 2445 if (remaining_time.count() < 0) return false; 2446 2447 if (context == SnapshotContext::Update && live_snapshot_status->using_snapuserd()) { 2448 // Stop here, we can't run dm-user yet, the COW isn't built. 2449 created_devices.Release(); 2450 return true; 2451 } 2452 2453 if (live_snapshot_status->using_snapuserd()) { 2454 // Get the source device (eg the view of the partition from before it was resized). 2455 std::string source_device_path; 2456 if (live_snapshot_status->old_partition_size() > 0) { 2457 if (!MapSourceDevice(lock, params.GetPartitionName(), remaining_time, 2458 &source_device_path)) { 2459 LOG(ERROR) << "Could not map source device for: " << cow_name; 2460 return false; 2461 } 2462 2463 auto source_device = GetSourceDeviceName(params.GetPartitionName()); 2464 created_devices.EmplaceBack<AutoUnmapDevice>(&dm_, source_device); 2465 } else { 2466 source_device_path = base_path; 2467 } 2468 2469 if (!WaitForDevice(source_device_path, remaining_time)) { 2470 return false; 2471 } 2472 2473 std::string cow_path; 2474 if (!GetMappedImageDevicePath(cow_name, &cow_path)) { 2475 LOG(ERROR) << "Could not determine path for: " << cow_name; 2476 return false; 2477 } 2478 if (!WaitForDevice(cow_path, remaining_time)) { 2479 return false; 2480 } 2481 2482 auto name = GetDmUserCowName(params.GetPartitionName(), GetSnapshotDriver(lock)); 2483 2484 std::string new_cow_device; 2485 if (!MapDmUserCow(lock, name, cow_path, source_device_path, base_path, remaining_time, 2486 &new_cow_device)) { 2487 LOG(ERROR) << "Could not map dm-user device for partition " 2488 << params.GetPartitionName(); 2489 return false; 2490 } 2491 created_devices.EmplaceBack<AutoUnmapDevice>(&dm_, name); 2492 2493 remaining_time = GetRemainingTime(params.timeout_ms, begin); 2494 if (remaining_time.count() < 0) return false; 2495 2496 cow_device = new_cow_device; 2497 } 2498 2499 // For userspace snapshots, dm-user block device itself will act as a 2500 // snapshot device. There is one subtle difference - MapSnapshot will create 2501 // either snapshot target or snapshot-merge target based on the underlying 2502 // state of the snapshot device. If snapshot-merge target is created, merge 2503 // will immediately start in the kernel. 2504 // 2505 // This is no longer true with respect to userspace snapshots. When dm-user 2506 // block device is created, we just have the snapshots ready but daemon in 2507 // the user-space will not start the merge. We have to explicitly inform the 2508 // daemon to resume the merge. Check ProcessUpdateState() call stack. 2509 if (!UpdateUsesUserSnapshots(lock)) { 2510 std::string path; 2511 if (!MapSnapshot(lock, params.GetPartitionName(), base_device, cow_device, remaining_time, 2512 &path)) { 2513 LOG(ERROR) << "Could not map snapshot for partition: " << params.GetPartitionName(); 2514 return false; 2515 } 2516 // No need to add params.GetPartitionName() to created_devices since it is immediately 2517 // released. 2518 2519 if (paths) { 2520 paths->snapshot_device = path; 2521 } 2522 LOG(INFO) << "Mapped " << params.GetPartitionName() << " as snapshot device at " << path; 2523 } else { 2524 LOG(INFO) << "Mapped " << params.GetPartitionName() << " as snapshot device at " 2525 << cow_device; 2526 } 2527 2528 created_devices.Release(); 2529 2530 return true; 2531 } 2532 UnmapPartitionWithSnapshot(LockedFile * lock,const std::string & target_partition_name)2533 bool SnapshotManager::UnmapPartitionWithSnapshot(LockedFile* lock, 2534 const std::string& target_partition_name) { 2535 CHECK(lock); 2536 2537 if (!UnmapSnapshot(lock, target_partition_name)) { 2538 return false; 2539 } 2540 2541 if (!UnmapCowDevices(lock, target_partition_name)) { 2542 return false; 2543 } 2544 2545 auto base_name = GetBaseDeviceName(target_partition_name); 2546 if (!DeleteDeviceIfExists(base_name)) { 2547 LOG(ERROR) << "Cannot delete base device: " << base_name; 2548 return false; 2549 } 2550 2551 auto source_name = GetSourceDeviceName(target_partition_name); 2552 if (!DeleteDeviceIfExists(source_name)) { 2553 LOG(ERROR) << "Cannot delete source device: " << source_name; 2554 return false; 2555 } 2556 2557 LOG(INFO) << "Successfully unmapped snapshot " << target_partition_name; 2558 2559 return true; 2560 } 2561 MapCowDevices(LockedFile * lock,const CreateLogicalPartitionParams & params,const SnapshotStatus & snapshot_status,AutoDeviceList * created_devices,std::string * cow_name)2562 bool SnapshotManager::MapCowDevices(LockedFile* lock, const CreateLogicalPartitionParams& params, 2563 const SnapshotStatus& snapshot_status, 2564 AutoDeviceList* created_devices, std::string* cow_name) { 2565 CHECK(lock); 2566 CHECK(snapshot_status.cow_partition_size() + snapshot_status.cow_file_size() > 0); 2567 auto begin = std::chrono::steady_clock::now(); 2568 2569 std::string partition_name = params.GetPartitionName(); 2570 std::string cow_image_name = GetCowImageDeviceName(partition_name); 2571 *cow_name = GetCowName(partition_name); 2572 2573 // Map COW image if necessary. 2574 if (snapshot_status.cow_file_size() > 0) { 2575 if (!EnsureImageManager()) return false; 2576 auto remaining_time = GetRemainingTime(params.timeout_ms, begin); 2577 if (remaining_time.count() < 0) return false; 2578 2579 if (!MapCowImage(partition_name, remaining_time).has_value()) { 2580 LOG(ERROR) << "Could not map cow image for partition: " << partition_name; 2581 return false; 2582 } 2583 created_devices->EmplaceBack<AutoUnmapImage>(images_.get(), cow_image_name); 2584 2585 // If no COW partition exists, just return the image alone. 2586 if (snapshot_status.cow_partition_size() == 0) { 2587 *cow_name = std::move(cow_image_name); 2588 LOG(INFO) << "Mapped COW image for " << partition_name << " at " << *cow_name; 2589 return true; 2590 } 2591 } 2592 2593 auto remaining_time = GetRemainingTime(params.timeout_ms, begin); 2594 if (remaining_time.count() < 0) return false; 2595 2596 CHECK(snapshot_status.cow_partition_size() > 0); 2597 2598 // Create the DmTable for the COW device. It is the DmTable of the COW partition plus 2599 // COW image device as the last extent. 2600 CreateLogicalPartitionParams cow_partition_params = params; 2601 cow_partition_params.partition = nullptr; 2602 cow_partition_params.partition_name = *cow_name; 2603 cow_partition_params.device_name.clear(); 2604 DmTable table; 2605 if (!CreateDmTable(cow_partition_params, &table)) { 2606 return false; 2607 } 2608 // If the COW image exists, append it as the last extent. 2609 if (snapshot_status.cow_file_size() > 0) { 2610 std::string cow_image_device; 2611 if (!GetMappedImageDeviceStringOrPath(cow_image_name, &cow_image_device)) { 2612 LOG(ERROR) << "Cannot determine major/minor for: " << cow_image_name; 2613 return false; 2614 } 2615 auto cow_partition_sectors = snapshot_status.cow_partition_size() / kSectorSize; 2616 auto cow_image_sectors = snapshot_status.cow_file_size() / kSectorSize; 2617 table.Emplace<DmTargetLinear>(cow_partition_sectors, cow_image_sectors, cow_image_device, 2618 0); 2619 } 2620 2621 // We have created the DmTable now. Map it. 2622 std::string cow_path; 2623 if (!dm_.CreateDevice(*cow_name, table, &cow_path, remaining_time)) { 2624 LOG(ERROR) << "Could not create COW device: " << *cow_name; 2625 return false; 2626 } 2627 created_devices->EmplaceBack<AutoUnmapDevice>(&dm_, *cow_name); 2628 LOG(INFO) << "Mapped COW device for " << params.GetPartitionName() << " at " << cow_path; 2629 return true; 2630 } 2631 UnmapCowDevices(LockedFile * lock,const std::string & name)2632 bool SnapshotManager::UnmapCowDevices(LockedFile* lock, const std::string& name) { 2633 CHECK(lock); 2634 if (!EnsureImageManager()) return false; 2635 2636 if (UpdateUsesCompression(lock) && !UpdateUsesUserSnapshots(lock)) { 2637 auto dm_user_name = GetDmUserCowName(name, GetSnapshotDriver(lock)); 2638 if (!UnmapDmUserDevice(dm_user_name)) { 2639 return false; 2640 } 2641 } 2642 2643 if (!DeleteDeviceIfExists(GetCowName(name), 4000ms)) { 2644 LOG(ERROR) << "Cannot unmap: " << GetCowName(name); 2645 return false; 2646 } 2647 2648 std::string cow_image_name = GetCowImageDeviceName(name); 2649 if (!images_->UnmapImageIfExists(cow_image_name)) { 2650 LOG(ERROR) << "Cannot unmap image " << cow_image_name; 2651 return false; 2652 } 2653 return true; 2654 } 2655 UnmapDmUserDevice(const std::string & dm_user_name)2656 bool SnapshotManager::UnmapDmUserDevice(const std::string& dm_user_name) { 2657 if (dm_.GetState(dm_user_name) == DmDeviceState::INVALID) { 2658 return true; 2659 } 2660 2661 if (!DeleteDeviceIfExists(dm_user_name)) { 2662 LOG(ERROR) << "Cannot unmap " << dm_user_name; 2663 return false; 2664 } 2665 2666 if (EnsureSnapuserdConnected()) { 2667 if (!snapuserd_client_->WaitForDeviceDelete(dm_user_name)) { 2668 LOG(ERROR) << "Failed to wait for " << dm_user_name << " control device to delete"; 2669 return false; 2670 } 2671 } 2672 2673 // Ensure the control device is gone so we don't run into ABA problems. 2674 auto control_device = "/dev/dm-user/" + dm_user_name; 2675 if (!android::fs_mgr::WaitForFileDeleted(control_device, 10s)) { 2676 LOG(ERROR) << "Timed out waiting for " << control_device << " to unlink"; 2677 return false; 2678 } 2679 return true; 2680 } 2681 UnmapUserspaceSnapshotDevice(LockedFile * lock,const std::string & snapshot_name)2682 bool SnapshotManager::UnmapUserspaceSnapshotDevice(LockedFile* lock, 2683 const std::string& snapshot_name) { 2684 auto dm_user_name = GetDmUserCowName(snapshot_name, GetSnapshotDriver(lock)); 2685 if (dm_.GetState(dm_user_name) == DmDeviceState::INVALID) { 2686 return true; 2687 } 2688 2689 CHECK(lock); 2690 2691 SnapshotStatus snapshot_status; 2692 2693 if (!ReadSnapshotStatus(lock, snapshot_name, &snapshot_status)) { 2694 return false; 2695 } 2696 // If the merge is complete, then we switch dm tables which is equivalent 2697 // to unmap; hence, we can't be deleting the device 2698 // as the table would be mounted off partitions and will fail. 2699 if (snapshot_status.state() != SnapshotState::MERGE_COMPLETED) { 2700 if (!DeleteDeviceIfExists(dm_user_name)) { 2701 LOG(ERROR) << "Cannot unmap " << dm_user_name; 2702 return false; 2703 } 2704 } 2705 2706 if (EnsureSnapuserdConnected()) { 2707 if (!snapuserd_client_->WaitForDeviceDelete(dm_user_name)) { 2708 LOG(ERROR) << "Failed to wait for " << dm_user_name << " control device to delete"; 2709 return false; 2710 } 2711 } 2712 2713 // Ensure the control device is gone so we don't run into ABA problems. 2714 auto control_device = "/dev/dm-user/" + dm_user_name; 2715 if (!android::fs_mgr::WaitForFileDeleted(control_device, 10s)) { 2716 LOG(ERROR) << "Timed out waiting for " << control_device << " to unlink"; 2717 return false; 2718 } 2719 return true; 2720 } 2721 MapAllSnapshots(const std::chrono::milliseconds & timeout_ms)2722 bool SnapshotManager::MapAllSnapshots(const std::chrono::milliseconds& timeout_ms) { 2723 auto lock = LockExclusive(); 2724 if (!lock) return false; 2725 2726 auto state = ReadUpdateState(lock.get()); 2727 if (state == UpdateState::Unverified) { 2728 if (GetCurrentSlot() == Slot::Target) { 2729 LOG(ERROR) << "Cannot call MapAllSnapshots when booting from the target slot."; 2730 return false; 2731 } 2732 } else if (state != UpdateState::Initiated) { 2733 LOG(ERROR) << "Cannot call MapAllSnapshots from update state: " << state; 2734 return false; 2735 } 2736 2737 std::vector<std::string> snapshots; 2738 if (!ListSnapshots(lock.get(), &snapshots)) { 2739 return false; 2740 } 2741 2742 const auto& opener = device_->GetPartitionOpener(); 2743 auto slot_suffix = device_->GetOtherSlotSuffix(); 2744 auto slot_number = SlotNumberForSlotSuffix(slot_suffix); 2745 auto super_device = device_->GetSuperDevice(slot_number); 2746 auto metadata = android::fs_mgr::ReadMetadata(opener, super_device, slot_number); 2747 if (!metadata) { 2748 LOG(ERROR) << "MapAllSnapshots could not read dynamic partition metadata for device: " 2749 << super_device; 2750 return false; 2751 } 2752 2753 for (const auto& snapshot : snapshots) { 2754 if (!UnmapPartitionWithSnapshot(lock.get(), snapshot)) { 2755 LOG(ERROR) << "MapAllSnapshots could not unmap snapshot: " << snapshot; 2756 return false; 2757 } 2758 2759 CreateLogicalPartitionParams params = { 2760 .block_device = super_device, 2761 .metadata = metadata.get(), 2762 .partition_name = snapshot, 2763 .timeout_ms = timeout_ms, 2764 .partition_opener = &opener, 2765 }; 2766 if (!MapPartitionWithSnapshot(lock.get(), std::move(params), SnapshotContext::Mount, 2767 nullptr)) { 2768 LOG(ERROR) << "MapAllSnapshots failed to map: " << snapshot; 2769 return false; 2770 } 2771 } 2772 2773 LOG(INFO) << "MapAllSnapshots succeeded."; 2774 return true; 2775 } 2776 UnmapAllSnapshots()2777 bool SnapshotManager::UnmapAllSnapshots() { 2778 auto lock = LockExclusive(); 2779 if (!lock) return false; 2780 2781 return UnmapAllSnapshots(lock.get()); 2782 } 2783 UnmapAllSnapshots(LockedFile * lock)2784 bool SnapshotManager::UnmapAllSnapshots(LockedFile* lock) { 2785 std::vector<std::string> snapshots; 2786 if (!ListSnapshots(lock, &snapshots)) { 2787 return false; 2788 } 2789 2790 for (const auto& snapshot : snapshots) { 2791 if (!UnmapPartitionWithSnapshot(lock, snapshot)) { 2792 LOG(ERROR) << "Failed to unmap snapshot: " << snapshot; 2793 return false; 2794 } 2795 } 2796 2797 // Terminate the daemon and release the snapuserd_client_ object. 2798 // If we need to re-connect with the daemon, EnsureSnapuserdConnected() 2799 // will re-create the object and establish the socket connection. 2800 if (snapuserd_client_) { 2801 LOG(INFO) << "Shutdown snapuserd daemon"; 2802 snapuserd_client_->DetachSnapuserd(); 2803 snapuserd_client_ = nullptr; 2804 } 2805 2806 return true; 2807 } 2808 OpenFile(const std::string & file,int lock_flags)2809 auto SnapshotManager::OpenFile(const std::string& file, int lock_flags) 2810 -> std::unique_ptr<LockedFile> { 2811 unique_fd fd(open(file.c_str(), O_RDONLY | O_CLOEXEC | O_NOFOLLOW)); 2812 if (fd < 0) { 2813 PLOG(ERROR) << "Open failed: " << file; 2814 return nullptr; 2815 } 2816 if (lock_flags != 0 && TEMP_FAILURE_RETRY(flock(fd, lock_flags)) < 0) { 2817 PLOG(ERROR) << "Acquire flock failed: " << file; 2818 return nullptr; 2819 } 2820 // For simplicity, we want to CHECK that lock_mode == LOCK_EX, in some 2821 // calls, so strip extra flags. 2822 int lock_mode = lock_flags & (LOCK_EX | LOCK_SH); 2823 return std::make_unique<LockedFile>(file, std::move(fd), lock_mode); 2824 } 2825 ~LockedFile()2826 SnapshotManager::LockedFile::~LockedFile() { 2827 if (TEMP_FAILURE_RETRY(flock(fd_, LOCK_UN)) < 0) { 2828 PLOG(ERROR) << "Failed to unlock file: " << path_; 2829 } 2830 } 2831 GetStateFilePath() const2832 std::string SnapshotManager::GetStateFilePath() const { 2833 return metadata_dir_ + "/state"s; 2834 } 2835 GetMergeStateFilePath() const2836 std::string SnapshotManager::GetMergeStateFilePath() const { 2837 return metadata_dir_ + "/merge_state"s; 2838 } 2839 GetLockPath() const2840 std::string SnapshotManager::GetLockPath() const { 2841 return metadata_dir_; 2842 } 2843 OpenLock(int lock_flags)2844 std::unique_ptr<SnapshotManager::LockedFile> SnapshotManager::OpenLock(int lock_flags) { 2845 auto lock_file = GetLockPath(); 2846 return OpenFile(lock_file, lock_flags); 2847 } 2848 LockShared()2849 std::unique_ptr<SnapshotManager::LockedFile> SnapshotManager::LockShared() { 2850 return OpenLock(LOCK_SH); 2851 } 2852 LockExclusive()2853 std::unique_ptr<SnapshotManager::LockedFile> SnapshotManager::LockExclusive() { 2854 return OpenLock(LOCK_EX); 2855 } 2856 UpdateStateFromString(const std::string & contents)2857 static UpdateState UpdateStateFromString(const std::string& contents) { 2858 if (contents.empty() || contents == "none") { 2859 return UpdateState::None; 2860 } else if (contents == "initiated") { 2861 return UpdateState::Initiated; 2862 } else if (contents == "unverified") { 2863 return UpdateState::Unverified; 2864 } else if (contents == "merging") { 2865 return UpdateState::Merging; 2866 } else if (contents == "merge-completed") { 2867 return UpdateState::MergeCompleted; 2868 } else if (contents == "merge-needs-reboot") { 2869 return UpdateState::MergeNeedsReboot; 2870 } else if (contents == "merge-failed") { 2871 return UpdateState::MergeFailed; 2872 } else if (contents == "cancelled") { 2873 return UpdateState::Cancelled; 2874 } else { 2875 LOG(ERROR) << "Unknown merge state in update state file: \"" << contents << "\""; 2876 return UpdateState::None; 2877 } 2878 } 2879 operator <<(std::ostream & os,UpdateState state)2880 std::ostream& operator<<(std::ostream& os, UpdateState state) { 2881 switch (state) { 2882 case UpdateState::None: 2883 return os << "none"; 2884 case UpdateState::Initiated: 2885 return os << "initiated"; 2886 case UpdateState::Unverified: 2887 return os << "unverified"; 2888 case UpdateState::Merging: 2889 return os << "merging"; 2890 case UpdateState::MergeCompleted: 2891 return os << "merge-completed"; 2892 case UpdateState::MergeNeedsReboot: 2893 return os << "merge-needs-reboot"; 2894 case UpdateState::MergeFailed: 2895 return os << "merge-failed"; 2896 case UpdateState::Cancelled: 2897 return os << "cancelled"; 2898 default: 2899 LOG(ERROR) << "Unknown update state: " << static_cast<uint32_t>(state); 2900 return os; 2901 } 2902 } 2903 operator <<(std::ostream & os,MergePhase phase)2904 std::ostream& operator<<(std::ostream& os, MergePhase phase) { 2905 switch (phase) { 2906 case MergePhase::NO_MERGE: 2907 return os << "none"; 2908 case MergePhase::FIRST_PHASE: 2909 return os << "first"; 2910 case MergePhase::SECOND_PHASE: 2911 return os << "second"; 2912 default: 2913 LOG(ERROR) << "Unknown merge phase: " << static_cast<uint32_t>(phase); 2914 return os << "unknown(" << static_cast<uint32_t>(phase) << ")"; 2915 } 2916 } 2917 ReadUpdateState(LockedFile * lock)2918 UpdateState SnapshotManager::ReadUpdateState(LockedFile* lock) { 2919 SnapshotUpdateStatus status = ReadSnapshotUpdateStatus(lock); 2920 return status.state(); 2921 } 2922 ReadSnapshotUpdateStatus(LockedFile * lock)2923 SnapshotUpdateStatus SnapshotManager::ReadSnapshotUpdateStatus(LockedFile* lock) { 2924 CHECK(lock); 2925 2926 SnapshotUpdateStatus status = {}; 2927 std::string contents; 2928 if (!android::base::ReadFileToString(GetStateFilePath(), &contents)) { 2929 PLOG(ERROR) << "Read state file failed"; 2930 status.set_state(UpdateState::None); 2931 return status; 2932 } 2933 2934 if (!status.ParseFromString(contents)) { 2935 LOG(WARNING) << "Unable to parse state file as SnapshotUpdateStatus, using the old format"; 2936 2937 // Try to rollback to legacy file to support devices that are 2938 // currently using the old file format. 2939 // TODO(b/147409432) 2940 status.set_state(UpdateStateFromString(contents)); 2941 } 2942 2943 return status; 2944 } 2945 WriteUpdateState(LockedFile * lock,UpdateState state,MergeFailureCode failure_code)2946 bool SnapshotManager::WriteUpdateState(LockedFile* lock, UpdateState state, 2947 MergeFailureCode failure_code) { 2948 SnapshotUpdateStatus status; 2949 status.set_state(state); 2950 2951 switch (state) { 2952 case UpdateState::MergeFailed: 2953 status.set_merge_failure_code(failure_code); 2954 break; 2955 case UpdateState::Initiated: 2956 status.set_source_build_fingerprint( 2957 android::base::GetProperty("ro.build.fingerprint", "")); 2958 break; 2959 default: 2960 break; 2961 } 2962 2963 // If we're transitioning between two valid states (eg, we're not beginning 2964 // or ending an OTA), then make sure to propagate the compression bit and 2965 // build fingerprint. 2966 if (!(state == UpdateState::Initiated || state == UpdateState::None)) { 2967 SnapshotUpdateStatus old_status = ReadSnapshotUpdateStatus(lock); 2968 status.set_using_snapuserd(old_status.using_snapuserd()); 2969 status.set_source_build_fingerprint(old_status.source_build_fingerprint()); 2970 status.set_merge_phase(old_status.merge_phase()); 2971 status.set_userspace_snapshots(old_status.userspace_snapshots()); 2972 status.set_io_uring_enabled(old_status.io_uring_enabled()); 2973 } 2974 return WriteSnapshotUpdateStatus(lock, status); 2975 } 2976 WriteSnapshotUpdateStatus(LockedFile * lock,const SnapshotUpdateStatus & status)2977 bool SnapshotManager::WriteSnapshotUpdateStatus(LockedFile* lock, 2978 const SnapshotUpdateStatus& status) { 2979 CHECK(lock); 2980 CHECK(lock->lock_mode() == LOCK_EX); 2981 2982 std::string contents; 2983 if (!status.SerializeToString(&contents)) { 2984 LOG(ERROR) << "Unable to serialize SnapshotUpdateStatus."; 2985 return false; 2986 } 2987 2988 #ifdef LIBSNAPSHOT_USE_HAL 2989 auto merge_status = MergeStatus::UNKNOWN; 2990 switch (status.state()) { 2991 // The needs-reboot and completed cases imply that /data and /metadata 2992 // can be safely wiped, so we don't report a merge status. 2993 case UpdateState::None: 2994 case UpdateState::MergeNeedsReboot: 2995 case UpdateState::MergeCompleted: 2996 case UpdateState::Initiated: 2997 merge_status = MergeStatus::NONE; 2998 break; 2999 case UpdateState::Unverified: 3000 merge_status = MergeStatus::SNAPSHOTTED; 3001 break; 3002 case UpdateState::Merging: 3003 case UpdateState::MergeFailed: 3004 merge_status = MergeStatus::MERGING; 3005 break; 3006 default: 3007 // Note that Cancelled flows to here - it is never written, since 3008 // it only communicates a transient state to the caller. 3009 LOG(ERROR) << "Unexpected update status: " << status.state(); 3010 break; 3011 } 3012 3013 bool set_before_write = 3014 merge_status == MergeStatus::SNAPSHOTTED || merge_status == MergeStatus::MERGING; 3015 if (set_before_write && !device_->SetBootControlMergeStatus(merge_status)) { 3016 return false; 3017 } 3018 #endif 3019 3020 if (!WriteStringToFileAtomic(contents, GetStateFilePath())) { 3021 PLOG(ERROR) << "Could not write to state file"; 3022 return false; 3023 } 3024 3025 #ifdef LIBSNAPSHOT_USE_HAL 3026 if (!set_before_write && !device_->SetBootControlMergeStatus(merge_status)) { 3027 return false; 3028 } 3029 #endif 3030 return true; 3031 } 3032 GetSnapshotStatusFilePath(const std::string & name)3033 std::string SnapshotManager::GetSnapshotStatusFilePath(const std::string& name) { 3034 auto file = metadata_dir_ + "/snapshots/"s + name; 3035 return file; 3036 } 3037 ReadSnapshotStatus(LockedFile * lock,const std::string & name,SnapshotStatus * status)3038 bool SnapshotManager::ReadSnapshotStatus(LockedFile* lock, const std::string& name, 3039 SnapshotStatus* status) { 3040 CHECK(lock); 3041 auto path = GetSnapshotStatusFilePath(name); 3042 3043 unique_fd fd(open(path.c_str(), O_RDONLY | O_CLOEXEC | O_NOFOLLOW)); 3044 if (fd < 0) { 3045 PLOG(ERROR) << "Open failed: " << path; 3046 return false; 3047 } 3048 3049 if (!status->ParseFromFileDescriptor(fd.get())) { 3050 PLOG(ERROR) << "Unable to parse " << path << " as SnapshotStatus"; 3051 return false; 3052 } 3053 3054 if (status->name() != name) { 3055 LOG(WARNING) << "Found snapshot status named " << status->name() << " in " << path; 3056 status->set_name(name); 3057 } 3058 3059 return true; 3060 } 3061 WriteSnapshotStatus(LockedFile * lock,const SnapshotStatus & status)3062 bool SnapshotManager::WriteSnapshotStatus(LockedFile* lock, const SnapshotStatus& status) { 3063 // The caller must take an exclusive lock to modify snapshots. 3064 CHECK(lock); 3065 CHECK(lock->lock_mode() == LOCK_EX); 3066 CHECK(!status.name().empty()); 3067 3068 auto path = GetSnapshotStatusFilePath(status.name()); 3069 3070 std::string content; 3071 if (!status.SerializeToString(&content)) { 3072 LOG(ERROR) << "Unable to serialize SnapshotStatus for " << status.name(); 3073 return false; 3074 } 3075 3076 if (!WriteStringToFileAtomic(content, path)) { 3077 PLOG(ERROR) << "Unable to write SnapshotStatus to " << path; 3078 return false; 3079 } 3080 3081 return true; 3082 } 3083 EnsureImageManager()3084 bool SnapshotManager::EnsureImageManager() { 3085 if (images_) return true; 3086 3087 images_ = device_->OpenImageManager(); 3088 if (!images_) { 3089 LOG(ERROR) << "Could not open ImageManager"; 3090 return false; 3091 } 3092 return true; 3093 } 3094 EnsureSnapuserdConnected()3095 bool SnapshotManager::EnsureSnapuserdConnected() { 3096 if (snapuserd_client_) { 3097 return true; 3098 } 3099 3100 if (!use_first_stage_snapuserd_ && !EnsureSnapuserdStarted()) { 3101 return false; 3102 } 3103 3104 snapuserd_client_ = SnapuserdClient::Connect(kSnapuserdSocket, 10s); 3105 if (!snapuserd_client_) { 3106 LOG(ERROR) << "Unable to connect to snapuserd"; 3107 return false; 3108 } 3109 return true; 3110 } 3111 UnmapAndDeleteCowPartition(MetadataBuilder * current_metadata)3112 void SnapshotManager::UnmapAndDeleteCowPartition(MetadataBuilder* current_metadata) { 3113 std::vector<std::string> to_delete; 3114 for (auto* existing_cow_partition : current_metadata->ListPartitionsInGroup(kCowGroupName)) { 3115 if (!DeleteDeviceIfExists(existing_cow_partition->name())) { 3116 LOG(WARNING) << existing_cow_partition->name() 3117 << " cannot be unmapped and its space cannot be reclaimed"; 3118 continue; 3119 } 3120 to_delete.push_back(existing_cow_partition->name()); 3121 } 3122 for (const auto& name : to_delete) { 3123 current_metadata->RemovePartition(name); 3124 } 3125 } 3126 AddRequiredSpace(Return orig,const std::map<std::string,SnapshotStatus> & all_snapshot_status)3127 static Return AddRequiredSpace(Return orig, 3128 const std::map<std::string, SnapshotStatus>& all_snapshot_status) { 3129 if (orig.error_code() != Return::ErrorCode::NO_SPACE) { 3130 return orig; 3131 } 3132 uint64_t sum = 0; 3133 for (auto&& [name, status] : all_snapshot_status) { 3134 sum += status.cow_file_size(); 3135 } 3136 return Return::NoSpace(sum); 3137 } 3138 CreateUpdateSnapshots(const DeltaArchiveManifest & manifest)3139 Return SnapshotManager::CreateUpdateSnapshots(const DeltaArchiveManifest& manifest) { 3140 auto lock = LockExclusive(); 3141 if (!lock) return Return::Error(); 3142 3143 auto update_state = ReadUpdateState(lock.get()); 3144 if (update_state != UpdateState::Initiated) { 3145 LOG(ERROR) << "Cannot create update snapshots in state " << update_state; 3146 return Return::Error(); 3147 } 3148 3149 // TODO(b/134949511): remove this check. Right now, with overlayfs mounted, the scratch 3150 // partition takes up a big chunk of space in super, causing COW images to be created on 3151 // retrofit Virtual A/B devices. 3152 if (device_->IsOverlayfsSetup()) { 3153 LOG(ERROR) << "Cannot create update snapshots with overlayfs setup. Run `adb enable-verity`" 3154 << ", reboot, then try again."; 3155 return Return::Error(); 3156 } 3157 3158 const auto& opener = device_->GetPartitionOpener(); 3159 auto current_suffix = device_->GetSlotSuffix(); 3160 uint32_t current_slot = SlotNumberForSlotSuffix(current_suffix); 3161 auto target_suffix = device_->GetOtherSlotSuffix(); 3162 uint32_t target_slot = SlotNumberForSlotSuffix(target_suffix); 3163 auto current_super = device_->GetSuperDevice(current_slot); 3164 3165 auto current_metadata = MetadataBuilder::New(opener, current_super, current_slot); 3166 if (current_metadata == nullptr) { 3167 LOG(ERROR) << "Cannot create metadata builder."; 3168 return Return::Error(); 3169 } 3170 3171 auto target_metadata = 3172 MetadataBuilder::NewForUpdate(opener, current_super, current_slot, target_slot); 3173 if (target_metadata == nullptr) { 3174 LOG(ERROR) << "Cannot create target metadata builder."; 3175 return Return::Error(); 3176 } 3177 3178 // Delete partitions with target suffix in |current_metadata|. Otherwise, 3179 // partition_cow_creator recognizes these left-over partitions as used space. 3180 for (const auto& group_name : current_metadata->ListGroups()) { 3181 if (android::base::EndsWith(group_name, target_suffix)) { 3182 current_metadata->RemoveGroupAndPartitions(group_name); 3183 } 3184 } 3185 3186 SnapshotMetadataUpdater metadata_updater(target_metadata.get(), target_slot, manifest); 3187 if (!metadata_updater.Update()) { 3188 LOG(ERROR) << "Cannot calculate new metadata."; 3189 return Return::Error(); 3190 } 3191 3192 // Delete previous COW partitions in current_metadata so that PartitionCowCreator marks those as 3193 // free regions. 3194 UnmapAndDeleteCowPartition(current_metadata.get()); 3195 3196 // Check that all these metadata is not retrofit dynamic partitions. Snapshots on 3197 // devices with retrofit dynamic partitions does not make sense. 3198 // This ensures that current_metadata->GetFreeRegions() uses the same device 3199 // indices as target_metadata (i.e. 0 -> "super"). 3200 // This is also assumed in MapCowDevices() call below. 3201 CHECK(current_metadata->GetBlockDevicePartitionName(0) == LP_METADATA_DEFAULT_PARTITION_NAME && 3202 target_metadata->GetBlockDevicePartitionName(0) == LP_METADATA_DEFAULT_PARTITION_NAME); 3203 3204 std::map<std::string, SnapshotStatus> all_snapshot_status; 3205 3206 // In case of error, automatically delete devices that are created along the way. 3207 // Note that "lock" is destroyed after "created_devices", so it is safe to use |lock| for 3208 // these devices. 3209 AutoDeviceList created_devices; 3210 3211 const auto& dap_metadata = manifest.dynamic_partition_metadata(); 3212 CowOptions options; 3213 CowWriter writer(options); 3214 bool cow_format_support = true; 3215 if (dap_metadata.cow_version() < writer.GetCowVersion()) { 3216 cow_format_support = false; 3217 } 3218 3219 LOG(INFO) << " dap_metadata.cow_version(): " << dap_metadata.cow_version() 3220 << " writer.GetCowVersion(): " << writer.GetCowVersion(); 3221 3222 // Deduce supported features. 3223 bool userspace_snapshots = CanUseUserspaceSnapshots(); 3224 bool legacy_compression = GetLegacyCompressionEnabledProperty(); 3225 3226 std::string vabc_disable_reason; 3227 if (!dap_metadata.vabc_enabled()) { 3228 vabc_disable_reason = "not enabled metadata"; 3229 } else if (device_->IsRecovery()) { 3230 vabc_disable_reason = "recovery"; 3231 } else if (!cow_format_support) { 3232 vabc_disable_reason = "cow format not supported"; 3233 } else if (!KernelSupportsCompressedSnapshots()) { 3234 vabc_disable_reason = "kernel missing userspace block device support"; 3235 } 3236 3237 if (!vabc_disable_reason.empty()) { 3238 if (userspace_snapshots) { 3239 LOG(INFO) << "Userspace snapshots disabled: " << vabc_disable_reason; 3240 } 3241 if (legacy_compression) { 3242 LOG(INFO) << "Compression disabled: " << vabc_disable_reason; 3243 } 3244 userspace_snapshots = false; 3245 legacy_compression = false; 3246 } 3247 3248 const bool using_snapuserd = userspace_snapshots || legacy_compression; 3249 if (!using_snapuserd) { 3250 LOG(INFO) << "Using legacy Virtual A/B (dm-snapshot)"; 3251 } 3252 3253 std::string compression_algorithm; 3254 if (using_snapuserd) { 3255 compression_algorithm = dap_metadata.vabc_compression_param(); 3256 if (compression_algorithm.empty()) { 3257 // Older OTAs don't set an explicit compression type, so default to gz. 3258 compression_algorithm = "gz"; 3259 } 3260 } 3261 3262 PartitionCowCreator cow_creator{ 3263 .target_metadata = target_metadata.get(), 3264 .target_suffix = target_suffix, 3265 .target_partition = nullptr, 3266 .current_metadata = current_metadata.get(), 3267 .current_suffix = current_suffix, 3268 .update = nullptr, 3269 .extra_extents = {}, 3270 .using_snapuserd = using_snapuserd, 3271 .compression_algorithm = compression_algorithm, 3272 }; 3273 if (dap_metadata.vabc_feature_set().has_threaded()) { 3274 cow_creator.enable_threading = dap_metadata.vabc_feature_set().threaded(); 3275 } 3276 if (dap_metadata.vabc_feature_set().has_batch_writes()) { 3277 cow_creator.batched_writes = dap_metadata.vabc_feature_set().batch_writes(); 3278 } 3279 3280 auto ret = CreateUpdateSnapshotsInternal(lock.get(), manifest, &cow_creator, &created_devices, 3281 &all_snapshot_status); 3282 if (!ret.is_ok()) return ret; 3283 3284 auto exported_target_metadata = target_metadata->Export(); 3285 if (exported_target_metadata == nullptr) { 3286 LOG(ERROR) << "Cannot export target metadata"; 3287 return Return::Error(); 3288 } 3289 3290 ret = InitializeUpdateSnapshots(lock.get(), target_metadata.get(), 3291 exported_target_metadata.get(), target_suffix, 3292 all_snapshot_status); 3293 if (!ret.is_ok()) return ret; 3294 3295 if (!UpdatePartitionTable(opener, device_->GetSuperDevice(target_slot), 3296 *exported_target_metadata, target_slot)) { 3297 LOG(ERROR) << "Cannot write target metadata"; 3298 return Return::Error(); 3299 } 3300 3301 // If snapuserd is enabled, we need to retain a copy of the old metadata 3302 // so we can access original blocks in case they are moved around. We do 3303 // not want to rely on the old super metadata slot because we don't 3304 // guarantee its validity after the slot switch is successful. 3305 if (using_snapuserd) { 3306 auto metadata = current_metadata->Export(); 3307 if (!metadata) { 3308 LOG(ERROR) << "Could not export current metadata"; 3309 return Return::Error(); 3310 } 3311 3312 auto path = GetOldPartitionMetadataPath(); 3313 if (!android::fs_mgr::WriteToImageFile(path, *metadata.get())) { 3314 LOG(ERROR) << "Cannot write old metadata to " << path; 3315 return Return::Error(); 3316 } 3317 } 3318 3319 SnapshotUpdateStatus status = ReadSnapshotUpdateStatus(lock.get()); 3320 status.set_state(update_state); 3321 status.set_using_snapuserd(using_snapuserd); 3322 3323 if (userspace_snapshots) { 3324 status.set_userspace_snapshots(true); 3325 LOG(INFO) << "Virtual A/B using userspace snapshots"; 3326 3327 if (GetIouringEnabledProperty()) { 3328 status.set_io_uring_enabled(true); 3329 LOG(INFO) << "io_uring for snapshots enabled"; 3330 } 3331 } else if (legacy_compression) { 3332 LOG(INFO) << "Virtual A/B using legacy snapuserd"; 3333 } else { 3334 LOG(INFO) << "Virtual A/B using dm-snapshot"; 3335 } 3336 3337 is_snapshot_userspace_.emplace(userspace_snapshots); 3338 3339 if (!device()->IsTestDevice() && using_snapuserd) { 3340 // Terminate stale daemon if any 3341 std::unique_ptr<SnapuserdClient> snapuserd_client = std::move(snapuserd_client_); 3342 if (!snapuserd_client) { 3343 snapuserd_client = SnapuserdClient::Connect(kSnapuserdSocket, 5s); 3344 } 3345 if (snapuserd_client) { 3346 snapuserd_client->DetachSnapuserd(); 3347 snapuserd_client = nullptr; 3348 } 3349 } 3350 3351 if (!WriteSnapshotUpdateStatus(lock.get(), status)) { 3352 LOG(ERROR) << "Unable to write new update state"; 3353 return Return::Error(); 3354 } 3355 3356 created_devices.Release(); 3357 LOG(INFO) << "Successfully created all snapshots for target slot " << target_suffix; 3358 3359 return Return::Ok(); 3360 } 3361 CreateUpdateSnapshotsInternal(LockedFile * lock,const DeltaArchiveManifest & manifest,PartitionCowCreator * cow_creator,AutoDeviceList * created_devices,std::map<std::string,SnapshotStatus> * all_snapshot_status)3362 Return SnapshotManager::CreateUpdateSnapshotsInternal( 3363 LockedFile* lock, const DeltaArchiveManifest& manifest, PartitionCowCreator* cow_creator, 3364 AutoDeviceList* created_devices, 3365 std::map<std::string, SnapshotStatus>* all_snapshot_status) { 3366 CHECK(lock); 3367 3368 auto* target_metadata = cow_creator->target_metadata; 3369 const auto& target_suffix = cow_creator->target_suffix; 3370 3371 if (!target_metadata->AddGroup(kCowGroupName, 0)) { 3372 LOG(ERROR) << "Cannot add group " << kCowGroupName; 3373 return Return::Error(); 3374 } 3375 3376 std::map<std::string, const PartitionUpdate*> partition_map; 3377 std::map<std::string, std::vector<Extent>> extra_extents_map; 3378 for (const auto& partition_update : manifest.partitions()) { 3379 auto suffixed_name = partition_update.partition_name() + target_suffix; 3380 auto&& [it, inserted] = partition_map.emplace(suffixed_name, &partition_update); 3381 if (!inserted) { 3382 LOG(ERROR) << "Duplicated partition " << partition_update.partition_name() 3383 << " in update manifest."; 3384 return Return::Error(); 3385 } 3386 3387 auto& extra_extents = extra_extents_map[suffixed_name]; 3388 if (partition_update.has_hash_tree_extent()) { 3389 extra_extents.push_back(partition_update.hash_tree_extent()); 3390 } 3391 if (partition_update.has_fec_extent()) { 3392 extra_extents.push_back(partition_update.fec_extent()); 3393 } 3394 } 3395 3396 for (auto* target_partition : ListPartitionsWithSuffix(target_metadata, target_suffix)) { 3397 cow_creator->target_partition = target_partition; 3398 cow_creator->update = nullptr; 3399 auto iter = partition_map.find(target_partition->name()); 3400 if (iter != partition_map.end()) { 3401 cow_creator->update = iter->second; 3402 } else { 3403 LOG(INFO) << target_partition->name() 3404 << " isn't included in the payload, skipping the cow creation."; 3405 continue; 3406 } 3407 3408 cow_creator->extra_extents.clear(); 3409 auto extra_extents_it = extra_extents_map.find(target_partition->name()); 3410 if (extra_extents_it != extra_extents_map.end()) { 3411 cow_creator->extra_extents = std::move(extra_extents_it->second); 3412 } 3413 3414 // Compute the device sizes for the partition. 3415 auto cow_creator_ret = cow_creator->Run(); 3416 if (!cow_creator_ret.has_value()) { 3417 LOG(ERROR) << "PartitionCowCreator returned no value for " << target_partition->name(); 3418 return Return::Error(); 3419 } 3420 3421 LOG(INFO) << "For partition " << target_partition->name() 3422 << ", device size = " << cow_creator_ret->snapshot_status.device_size() 3423 << ", snapshot size = " << cow_creator_ret->snapshot_status.snapshot_size() 3424 << ", cow partition size = " 3425 << cow_creator_ret->snapshot_status.cow_partition_size() 3426 << ", cow file size = " << cow_creator_ret->snapshot_status.cow_file_size(); 3427 3428 // Delete any existing snapshot before re-creating one. 3429 if (!DeleteSnapshot(lock, target_partition->name())) { 3430 LOG(ERROR) << "Cannot delete existing snapshot before creating a new one for partition " 3431 << target_partition->name(); 3432 return Return::Error(); 3433 } 3434 3435 // It is possible that the whole partition uses free space in super, and snapshot / COW 3436 // would not be needed. In this case, skip the partition. 3437 bool needs_snapshot = cow_creator_ret->snapshot_status.snapshot_size() > 0; 3438 bool needs_cow = (cow_creator_ret->snapshot_status.cow_partition_size() + 3439 cow_creator_ret->snapshot_status.cow_file_size()) > 0; 3440 CHECK(needs_snapshot == needs_cow); 3441 3442 if (!needs_snapshot) { 3443 LOG(INFO) << "Skip creating snapshot for partition " << target_partition->name() 3444 << "because nothing needs to be snapshotted."; 3445 continue; 3446 } 3447 3448 // Find the original partition size. 3449 auto name = target_partition->name(); 3450 auto old_partition_name = 3451 name.substr(0, name.size() - target_suffix.size()) + cow_creator->current_suffix; 3452 auto old_partition = cow_creator->current_metadata->FindPartition(old_partition_name); 3453 if (old_partition) { 3454 cow_creator_ret->snapshot_status.set_old_partition_size(old_partition->size()); 3455 } 3456 3457 // Store these device sizes to snapshot status file. 3458 if (!CreateSnapshot(lock, cow_creator, &cow_creator_ret->snapshot_status)) { 3459 return Return::Error(); 3460 } 3461 created_devices->EmplaceBack<AutoDeleteSnapshot>(this, lock, target_partition->name()); 3462 3463 // Create the COW partition. That is, use any remaining free space in super partition before 3464 // creating the COW images. 3465 if (cow_creator_ret->snapshot_status.cow_partition_size() > 0) { 3466 CHECK(cow_creator_ret->snapshot_status.cow_partition_size() % kSectorSize == 0) 3467 << "cow_partition_size == " 3468 << cow_creator_ret->snapshot_status.cow_partition_size() 3469 << " is not a multiple of sector size " << kSectorSize; 3470 auto cow_partition = target_metadata->AddPartition(GetCowName(target_partition->name()), 3471 kCowGroupName, 0 /* flags */); 3472 if (cow_partition == nullptr) { 3473 return Return::Error(); 3474 } 3475 3476 if (!target_metadata->ResizePartition( 3477 cow_partition, cow_creator_ret->snapshot_status.cow_partition_size(), 3478 cow_creator_ret->cow_partition_usable_regions)) { 3479 LOG(ERROR) << "Cannot create COW partition on metadata with size " 3480 << cow_creator_ret->snapshot_status.cow_partition_size(); 3481 return Return::Error(); 3482 } 3483 // Only the in-memory target_metadata is modified; nothing to clean up if there is an 3484 // error in the future. 3485 } 3486 3487 all_snapshot_status->emplace(target_partition->name(), 3488 std::move(cow_creator_ret->snapshot_status)); 3489 3490 LOG(INFO) << "Successfully created snapshot partition for " << target_partition->name(); 3491 } 3492 3493 LOG(INFO) << "Allocating CoW images."; 3494 3495 for (auto&& [name, snapshot_status] : *all_snapshot_status) { 3496 // Create the backing COW image if necessary. 3497 if (snapshot_status.cow_file_size() > 0) { 3498 auto ret = CreateCowImage(lock, name); 3499 if (!ret.is_ok()) return AddRequiredSpace(ret, *all_snapshot_status); 3500 } 3501 3502 LOG(INFO) << "Successfully created snapshot for " << name; 3503 } 3504 3505 return Return::Ok(); 3506 } 3507 InitializeUpdateSnapshots(LockedFile * lock,MetadataBuilder * target_metadata,const LpMetadata * exported_target_metadata,const std::string & target_suffix,const std::map<std::string,SnapshotStatus> & all_snapshot_status)3508 Return SnapshotManager::InitializeUpdateSnapshots( 3509 LockedFile* lock, MetadataBuilder* target_metadata, 3510 const LpMetadata* exported_target_metadata, const std::string& target_suffix, 3511 const std::map<std::string, SnapshotStatus>& all_snapshot_status) { 3512 CHECK(lock); 3513 3514 CreateLogicalPartitionParams cow_params{ 3515 .block_device = LP_METADATA_DEFAULT_PARTITION_NAME, 3516 .metadata = exported_target_metadata, 3517 .timeout_ms = std::chrono::milliseconds::max(), 3518 .partition_opener = &device_->GetPartitionOpener(), 3519 }; 3520 for (auto* target_partition : ListPartitionsWithSuffix(target_metadata, target_suffix)) { 3521 AutoDeviceList created_devices_for_cow; 3522 3523 if (!UnmapPartitionWithSnapshot(lock, target_partition->name())) { 3524 LOG(ERROR) << "Cannot unmap existing COW devices before re-mapping them for zero-fill: " 3525 << target_partition->name(); 3526 return Return::Error(); 3527 } 3528 3529 auto it = all_snapshot_status.find(target_partition->name()); 3530 if (it == all_snapshot_status.end()) continue; 3531 cow_params.partition_name = target_partition->name(); 3532 std::string cow_name; 3533 if (!MapCowDevices(lock, cow_params, it->second, &created_devices_for_cow, &cow_name)) { 3534 return Return::Error(); 3535 } 3536 3537 std::string cow_path; 3538 if (!images_->GetMappedImageDevice(cow_name, &cow_path)) { 3539 LOG(ERROR) << "Cannot determine path for " << cow_name; 3540 return Return::Error(); 3541 } 3542 3543 if (it->second.using_snapuserd()) { 3544 unique_fd fd(open(cow_path.c_str(), O_RDWR | O_CLOEXEC)); 3545 if (fd < 0) { 3546 PLOG(ERROR) << "open " << cow_path << " failed for snapshot " 3547 << cow_params.partition_name; 3548 return Return::Error(); 3549 } 3550 3551 CowOptions options; 3552 if (device()->IsTestDevice()) { 3553 options.scratch_space = false; 3554 } 3555 options.compression = it->second.compression_algorithm(); 3556 3557 CowWriter writer(options); 3558 if (!writer.Initialize(fd) || !writer.Finalize()) { 3559 LOG(ERROR) << "Could not initialize COW device for " << target_partition->name(); 3560 return Return::Error(); 3561 } 3562 } else { 3563 auto ret = InitializeKernelCow(cow_path); 3564 if (!ret.is_ok()) { 3565 LOG(ERROR) << "Can't zero-fill COW device for " << target_partition->name() << ": " 3566 << cow_path; 3567 return AddRequiredSpace(ret, all_snapshot_status); 3568 } 3569 } 3570 // Let destructor of created_devices_for_cow to unmap the COW devices. 3571 }; 3572 return Return::Ok(); 3573 } 3574 MapUpdateSnapshot(const CreateLogicalPartitionParams & params,std::string * snapshot_path)3575 bool SnapshotManager::MapUpdateSnapshot(const CreateLogicalPartitionParams& params, 3576 std::string* snapshot_path) { 3577 auto lock = LockShared(); 3578 if (!lock) return false; 3579 if (!UnmapPartitionWithSnapshot(lock.get(), params.GetPartitionName())) { 3580 LOG(ERROR) << "Cannot unmap existing snapshot before re-mapping it: " 3581 << params.GetPartitionName(); 3582 return false; 3583 } 3584 3585 SnapshotStatus status; 3586 if (!ReadSnapshotStatus(lock.get(), params.GetPartitionName(), &status)) { 3587 return false; 3588 } 3589 if (status.using_snapuserd()) { 3590 LOG(ERROR) << "Cannot use MapUpdateSnapshot with snapuserd"; 3591 return false; 3592 } 3593 3594 SnapshotPaths paths; 3595 if (!MapPartitionWithSnapshot(lock.get(), params, SnapshotContext::Update, &paths)) { 3596 return false; 3597 } 3598 3599 if (!paths.snapshot_device.empty()) { 3600 *snapshot_path = paths.snapshot_device; 3601 } else { 3602 *snapshot_path = paths.target_device; 3603 } 3604 DCHECK(!snapshot_path->empty()); 3605 return true; 3606 } 3607 OpenSnapshotWriter(const android::fs_mgr::CreateLogicalPartitionParams & params,const std::optional<std::string> & source_device)3608 std::unique_ptr<ISnapshotWriter> SnapshotManager::OpenSnapshotWriter( 3609 const android::fs_mgr::CreateLogicalPartitionParams& params, 3610 const std::optional<std::string>& source_device) { 3611 #if defined(LIBSNAPSHOT_NO_COW_WRITE) 3612 (void)params; 3613 (void)source_device; 3614 3615 LOG(ERROR) << "Snapshots cannot be written in first-stage init or recovery"; 3616 return nullptr; 3617 #else 3618 // First unmap any existing mapping. 3619 auto lock = LockShared(); 3620 if (!lock) return nullptr; 3621 if (!UnmapPartitionWithSnapshot(lock.get(), params.GetPartitionName())) { 3622 LOG(ERROR) << "Cannot unmap existing snapshot before re-mapping it: " 3623 << params.GetPartitionName(); 3624 return nullptr; 3625 } 3626 3627 SnapshotPaths paths; 3628 if (!MapPartitionWithSnapshot(lock.get(), params, SnapshotContext::Update, &paths)) { 3629 return nullptr; 3630 } 3631 3632 SnapshotStatus status; 3633 if (!paths.cow_device_name.empty()) { 3634 if (!ReadSnapshotStatus(lock.get(), params.GetPartitionName(), &status)) { 3635 return nullptr; 3636 } 3637 } else { 3638 // Currently, partition_cow_creator always creates snapshots. The 3639 // reason is that if partition X shrinks while partition Y grows, we 3640 // cannot bindly write to the newly freed extents in X. This would 3641 // make the old slot unusable. So, the entire size of the target 3642 // partition is currently considered snapshottable. 3643 LOG(ERROR) << "No snapshot available for partition " << params.GetPartitionName(); 3644 return nullptr; 3645 } 3646 3647 if (status.using_snapuserd()) { 3648 return OpenCompressedSnapshotWriter(lock.get(), source_device, params.GetPartitionName(), 3649 status, paths); 3650 } 3651 return OpenKernelSnapshotWriter(lock.get(), source_device, params.GetPartitionName(), status, 3652 paths); 3653 #endif 3654 } 3655 3656 #if !defined(LIBSNAPSHOT_NO_COW_WRITE) 3657 std::unique_ptr<ISnapshotWriter> SnapshotManager::OpenCompressedSnapshotWriter( 3658 LockedFile* lock, const std::optional<std::string>& source_device, 3659 [[maybe_unused]] const std::string& partition_name, const SnapshotStatus& status, 3660 const SnapshotPaths& paths) { 3661 CHECK(lock); 3662 3663 CowOptions cow_options; 3664 cow_options.compression = status.compression_algorithm(); 3665 cow_options.max_blocks = {status.device_size() / cow_options.block_size}; 3666 cow_options.batch_write = status.batched_writes(); 3667 cow_options.num_compress_threads = status.enable_threading() ? 2 : 0; 3668 // Disable scratch space for vts tests 3669 if (device()->IsTestDevice()) { 3670 cow_options.scratch_space = false; 3671 } 3672 3673 // Currently we don't support partial snapshots, since partition_cow_creator 3674 // never creates this scenario. 3675 CHECK(status.snapshot_size() == status.device_size()); 3676 3677 auto writer = std::make_unique<CompressedSnapshotWriter>(cow_options); 3678 if (source_device) { 3679 writer->SetSourceDevice(*source_device); 3680 } 3681 3682 std::string cow_path; 3683 if (!GetMappedImageDevicePath(paths.cow_device_name, &cow_path)) { 3684 LOG(ERROR) << "Could not determine path for " << paths.cow_device_name; 3685 return nullptr; 3686 } 3687 3688 unique_fd cow_fd(open(cow_path.c_str(), O_RDWR | O_CLOEXEC)); 3689 if (cow_fd < 0) { 3690 PLOG(ERROR) << "OpenCompressedSnapshotWriter: open " << cow_path; 3691 return nullptr; 3692 } 3693 if (!writer->SetCowDevice(std::move(cow_fd))) { 3694 LOG(ERROR) << "Could not create COW writer from " << cow_path; 3695 return nullptr; 3696 } 3697 3698 return writer; 3699 } 3700 3701 std::unique_ptr<ISnapshotWriter> SnapshotManager::OpenKernelSnapshotWriter( 3702 LockedFile* lock, const std::optional<std::string>& source_device, 3703 [[maybe_unused]] const std::string& partition_name, const SnapshotStatus& status, 3704 const SnapshotPaths& paths) { 3705 CHECK(lock); 3706 3707 CowOptions cow_options; 3708 cow_options.max_blocks = {status.device_size() / cow_options.block_size}; 3709 3710 auto writer = std::make_unique<OnlineKernelSnapshotWriter>(cow_options); 3711 3712 std::string path = paths.snapshot_device.empty() ? paths.target_device : paths.snapshot_device; 3713 unique_fd fd(open(path.c_str(), O_RDWR | O_CLOEXEC)); 3714 if (fd < 0) { 3715 PLOG(ERROR) << "open failed: " << path; 3716 return nullptr; 3717 } 3718 3719 if (source_device) { 3720 writer->SetSourceDevice(*source_device); 3721 } 3722 3723 uint64_t cow_size = status.cow_partition_size() + status.cow_file_size(); 3724 writer->SetSnapshotDevice(std::move(fd), cow_size); 3725 3726 return writer; 3727 } 3728 #endif // !defined(LIBSNAPSHOT_NO_COW_WRITE) 3729 UnmapUpdateSnapshot(const std::string & target_partition_name)3730 bool SnapshotManager::UnmapUpdateSnapshot(const std::string& target_partition_name) { 3731 auto lock = LockShared(); 3732 if (!lock) return false; 3733 return UnmapPartitionWithSnapshot(lock.get(), target_partition_name); 3734 } 3735 UnmapAllPartitionsInRecovery()3736 bool SnapshotManager::UnmapAllPartitionsInRecovery() { 3737 auto lock = LockExclusive(); 3738 if (!lock) return false; 3739 3740 const auto& opener = device_->GetPartitionOpener(); 3741 uint32_t slot = SlotNumberForSlotSuffix(device_->GetSlotSuffix()); 3742 auto super_device = device_->GetSuperDevice(slot); 3743 auto metadata = android::fs_mgr::ReadMetadata(opener, super_device, slot); 3744 if (!metadata) { 3745 LOG(ERROR) << "Could not read dynamic partition metadata for device: " << super_device; 3746 return false; 3747 } 3748 3749 bool ok = true; 3750 for (const auto& partition : metadata->partitions) { 3751 auto partition_name = GetPartitionName(partition); 3752 ok &= UnmapPartitionWithSnapshot(lock.get(), partition_name); 3753 } 3754 return ok; 3755 } 3756 operator <<(std::ostream & os,SnapshotManager::Slot slot)3757 std::ostream& operator<<(std::ostream& os, SnapshotManager::Slot slot) { 3758 switch (slot) { 3759 case SnapshotManager::Slot::Unknown: 3760 return os << "unknown"; 3761 case SnapshotManager::Slot::Source: 3762 return os << "source"; 3763 case SnapshotManager::Slot::Target: 3764 return os << "target"; 3765 } 3766 } 3767 Dump(std::ostream & os)3768 bool SnapshotManager::Dump(std::ostream& os) { 3769 // Don't actually lock. Dump() is for debugging purposes only, so it is okay 3770 // if it is racy. 3771 auto file = OpenLock(0 /* lock flag */); 3772 if (!file) return false; 3773 3774 std::stringstream ss; 3775 3776 auto update_status = ReadSnapshotUpdateStatus(file.get()); 3777 3778 ss << "Update state: " << update_status.state() << std::endl; 3779 ss << "Using snapuserd: " << update_status.using_snapuserd() << std::endl; 3780 ss << "Using userspace snapshots: " << update_status.userspace_snapshots() << std::endl; 3781 ss << "Using io_uring: " << update_status.io_uring_enabled() << std::endl; 3782 ss << "Using XOR compression: " << GetXorCompressionEnabledProperty() << std::endl; 3783 ss << "Current slot: " << device_->GetSlotSuffix() << std::endl; 3784 ss << "Boot indicator: booting from " << GetCurrentSlot() << " slot" << std::endl; 3785 ss << "Rollback indicator: " 3786 << (access(GetRollbackIndicatorPath().c_str(), F_OK) == 0 ? "exists" : strerror(errno)) 3787 << std::endl; 3788 ss << "Forward merge indicator: " 3789 << (access(GetForwardMergeIndicatorPath().c_str(), F_OK) == 0 ? "exists" : strerror(errno)) 3790 << std::endl; 3791 ss << "Source build fingerprint: " << update_status.source_build_fingerprint() << std::endl; 3792 3793 if (update_status.state() == UpdateState::Merging) { 3794 ss << "Merge completion: "; 3795 if (!EnsureSnapuserdConnected()) { 3796 ss << "N/A"; 3797 } else { 3798 ss << snapuserd_client_->GetMergePercent() << "%"; 3799 } 3800 ss << std::endl; 3801 ss << "Merge phase: " << update_status.merge_phase() << std::endl; 3802 } 3803 3804 bool ok = true; 3805 std::vector<std::string> snapshots; 3806 if (!ListSnapshots(file.get(), &snapshots)) { 3807 LOG(ERROR) << "Could not list snapshots"; 3808 snapshots.clear(); 3809 ok = false; 3810 } 3811 for (const auto& name : snapshots) { 3812 ss << "Snapshot: " << name << std::endl; 3813 SnapshotStatus status; 3814 if (!ReadSnapshotStatus(file.get(), name, &status)) { 3815 ok = false; 3816 continue; 3817 } 3818 ss << " state: " << SnapshotState_Name(status.state()) << std::endl; 3819 ss << " device size (bytes): " << status.device_size() << std::endl; 3820 ss << " snapshot size (bytes): " << status.snapshot_size() << std::endl; 3821 ss << " cow partition size (bytes): " << status.cow_partition_size() << std::endl; 3822 ss << " cow file size (bytes): " << status.cow_file_size() << std::endl; 3823 ss << " allocated sectors: " << status.sectors_allocated() << std::endl; 3824 ss << " metadata sectors: " << status.metadata_sectors() << std::endl; 3825 ss << " compression: " << status.compression_algorithm() << std::endl; 3826 ss << " merge phase: " << DecideMergePhase(status) << std::endl; 3827 } 3828 os << ss.rdbuf(); 3829 return ok; 3830 } 3831 EnsureMetadataMounted()3832 std::unique_ptr<AutoDevice> SnapshotManager::EnsureMetadataMounted() { 3833 if (!device_->IsRecovery()) { 3834 // No need to mount anything in recovery. 3835 LOG(INFO) << "EnsureMetadataMounted does nothing in Android mode."; 3836 return std::unique_ptr<AutoUnmountDevice>(new AutoUnmountDevice()); 3837 } 3838 auto ret = AutoUnmountDevice::New(device_->GetMetadataDir()); 3839 if (ret == nullptr) return nullptr; 3840 3841 // In rescue mode, it is possible to erase and format metadata, but /metadata/ota is not 3842 // created to execute snapshot updates. Hence, subsequent calls is likely to fail because 3843 // Lock*() fails. By failing early and returning nullptr here, update_engine_sideload can 3844 // treat this case as if /metadata is not mounted. 3845 if (!LockShared()) { 3846 LOG(WARNING) << "/metadata is mounted, but errors occur when acquiring a shared lock. " 3847 "Subsequent calls to SnapshotManager will fail. Unmounting /metadata now."; 3848 return nullptr; 3849 } 3850 return ret; 3851 } 3852 HandleImminentDataWipe(const std::function<void ()> & callback)3853 bool SnapshotManager::HandleImminentDataWipe(const std::function<void()>& callback) { 3854 if (!device_->IsRecovery()) { 3855 LOG(ERROR) << "Data wipes are only allowed in recovery."; 3856 return false; 3857 } 3858 3859 auto mount = EnsureMetadataMounted(); 3860 if (!mount || !mount->HasDevice()) { 3861 // We allow the wipe to continue, because if we can't mount /metadata, 3862 // it is unlikely the device would have booted anyway. If there is no 3863 // metadata partition, then the device predates Virtual A/B. 3864 return true; 3865 } 3866 3867 // Check this early, so we don't accidentally start trying to populate 3868 // the state file in recovery. Note we don't call GetUpdateState since 3869 // we want errors in acquiring the lock to be propagated, instead of 3870 // returning UpdateState::None. 3871 auto state_file = GetStateFilePath(); 3872 if (access(state_file.c_str(), F_OK) != 0 && errno == ENOENT) { 3873 return true; 3874 } 3875 3876 auto slot_number = SlotNumberForSlotSuffix(device_->GetSlotSuffix()); 3877 auto super_path = device_->GetSuperDevice(slot_number); 3878 if (!CreateLogicalAndSnapshotPartitions(super_path, 20s)) { 3879 LOG(ERROR) << "Unable to map partitions to complete merge."; 3880 return false; 3881 } 3882 3883 auto process_callback = [&]() -> bool { 3884 if (callback) { 3885 callback(); 3886 } 3887 return true; 3888 }; 3889 3890 in_factory_data_reset_ = true; 3891 UpdateState state = 3892 ProcessUpdateStateOnDataWipe(true /* allow_forward_merge */, process_callback); 3893 in_factory_data_reset_ = false; 3894 3895 if (state == UpdateState::MergeFailed) { 3896 return false; 3897 } 3898 3899 // Nothing should be depending on partitions now, so unmap them all. 3900 if (!UnmapAllPartitionsInRecovery()) { 3901 LOG(ERROR) << "Unable to unmap all partitions; fastboot may fail to flash."; 3902 } 3903 3904 if (state != UpdateState::None) { 3905 auto lock = LockExclusive(); 3906 if (!lock) return false; 3907 3908 // Zap the update state so the bootloader doesn't think we're still 3909 // merging. It's okay if this fails, it's informative only at this 3910 // point. 3911 WriteUpdateState(lock.get(), UpdateState::None); 3912 } 3913 return true; 3914 } 3915 FinishMergeInRecovery()3916 bool SnapshotManager::FinishMergeInRecovery() { 3917 if (!device_->IsRecovery()) { 3918 LOG(ERROR) << "Data wipes are only allowed in recovery."; 3919 return false; 3920 } 3921 3922 auto mount = EnsureMetadataMounted(); 3923 if (!mount || !mount->HasDevice()) { 3924 return false; 3925 } 3926 3927 auto slot_number = SlotNumberForSlotSuffix(device_->GetSlotSuffix()); 3928 auto super_path = device_->GetSuperDevice(slot_number); 3929 if (!CreateLogicalAndSnapshotPartitions(super_path, 20s)) { 3930 LOG(ERROR) << "Unable to map partitions to complete merge."; 3931 return false; 3932 } 3933 3934 UpdateState state = ProcessUpdateState(); 3935 if (state != UpdateState::MergeCompleted) { 3936 LOG(ERROR) << "Merge returned unexpected status: " << state; 3937 return false; 3938 } 3939 3940 // Nothing should be depending on partitions now, so unmap them all. 3941 if (!UnmapAllPartitionsInRecovery()) { 3942 LOG(ERROR) << "Unable to unmap all partitions; fastboot may fail to flash."; 3943 } 3944 return true; 3945 } 3946 ProcessUpdateStateOnDataWipe(bool allow_forward_merge,const std::function<bool ()> & callback)3947 UpdateState SnapshotManager::ProcessUpdateStateOnDataWipe(bool allow_forward_merge, 3948 const std::function<bool()>& callback) { 3949 auto slot_number = SlotNumberForSlotSuffix(device_->GetSlotSuffix()); 3950 UpdateState state = ProcessUpdateState(callback); 3951 LOG(INFO) << "Update state in recovery: " << state; 3952 switch (state) { 3953 case UpdateState::MergeFailed: 3954 LOG(ERROR) << "Unrecoverable merge failure detected."; 3955 return state; 3956 case UpdateState::Unverified: { 3957 // If an OTA was just applied but has not yet started merging: 3958 // 3959 // - if forward merge is allowed, initiate merge and call 3960 // ProcessUpdateState again. 3961 // 3962 // - if forward merge is not allowed, we 3963 // have no choice but to revert slots, because the current slot will 3964 // immediately become unbootable. Rather than wait for the device 3965 // to reboot N times until a rollback, we proactively disable the 3966 // new slot instead. 3967 // 3968 // Since the rollback is inevitable, we don't treat a HAL failure 3969 // as an error here. 3970 auto slot = GetCurrentSlot(); 3971 if (slot == Slot::Target) { 3972 if (allow_forward_merge && 3973 access(GetForwardMergeIndicatorPath().c_str(), F_OK) == 0) { 3974 LOG(INFO) << "Forward merge allowed, initiating merge now."; 3975 3976 if (!InitiateMerge()) { 3977 LOG(ERROR) << "Failed to initiate merge on data wipe."; 3978 return UpdateState::MergeFailed; 3979 } 3980 return ProcessUpdateStateOnDataWipe(false /* allow_forward_merge */, callback); 3981 } 3982 3983 LOG(ERROR) << "Reverting to old slot since update will be deleted."; 3984 device_->SetSlotAsUnbootable(slot_number); 3985 } else { 3986 LOG(INFO) << "Booting from " << slot << " slot, no action is taken."; 3987 } 3988 break; 3989 } 3990 case UpdateState::MergeNeedsReboot: 3991 // We shouldn't get here, because nothing is depending on 3992 // logical partitions. 3993 LOG(ERROR) << "Unexpected merge-needs-reboot state in recovery."; 3994 break; 3995 default: 3996 break; 3997 } 3998 return state; 3999 } 4000 EnsureNoOverflowSnapshot(LockedFile * lock)4001 bool SnapshotManager::EnsureNoOverflowSnapshot(LockedFile* lock) { 4002 CHECK(lock); 4003 4004 std::vector<std::string> snapshots; 4005 if (!ListSnapshots(lock, &snapshots)) { 4006 LOG(ERROR) << "Could not list snapshots."; 4007 return false; 4008 } 4009 4010 for (const auto& snapshot : snapshots) { 4011 SnapshotStatus status; 4012 if (!ReadSnapshotStatus(lock, snapshot, &status)) { 4013 return false; 4014 } 4015 if (status.using_snapuserd()) { 4016 continue; 4017 } 4018 4019 std::vector<DeviceMapper::TargetInfo> targets; 4020 if (!dm_.GetTableStatus(snapshot, &targets)) { 4021 LOG(ERROR) << "Could not read snapshot device table: " << snapshot; 4022 return false; 4023 } 4024 if (targets.size() != 1) { 4025 LOG(ERROR) << "Unexpected device-mapper table for snapshot: " << snapshot 4026 << ", size = " << targets.size(); 4027 return false; 4028 } 4029 if (targets[0].IsOverflowSnapshot()) { 4030 LOG(ERROR) << "Detected overflow in snapshot " << snapshot 4031 << ", CoW device size computation is wrong!"; 4032 return false; 4033 } 4034 } 4035 4036 return true; 4037 } 4038 RecoveryCreateSnapshotDevices()4039 CreateResult SnapshotManager::RecoveryCreateSnapshotDevices() { 4040 if (!device_->IsRecovery()) { 4041 LOG(ERROR) << __func__ << " is only allowed in recovery."; 4042 return CreateResult::NOT_CREATED; 4043 } 4044 4045 auto mount = EnsureMetadataMounted(); 4046 if (!mount || !mount->HasDevice()) { 4047 LOG(ERROR) << "Couldn't mount Metadata."; 4048 return CreateResult::NOT_CREATED; 4049 } 4050 return RecoveryCreateSnapshotDevices(mount); 4051 } 4052 RecoveryCreateSnapshotDevices(const std::unique_ptr<AutoDevice> & metadata_device)4053 CreateResult SnapshotManager::RecoveryCreateSnapshotDevices( 4054 const std::unique_ptr<AutoDevice>& metadata_device) { 4055 if (!device_->IsRecovery()) { 4056 LOG(ERROR) << __func__ << " is only allowed in recovery."; 4057 return CreateResult::NOT_CREATED; 4058 } 4059 4060 if (metadata_device == nullptr || !metadata_device->HasDevice()) { 4061 LOG(ERROR) << "Metadata not mounted."; 4062 return CreateResult::NOT_CREATED; 4063 } 4064 4065 auto state_file = GetStateFilePath(); 4066 if (access(state_file.c_str(), F_OK) != 0 && errno == ENOENT) { 4067 LOG(ERROR) << "Couldn't access state file."; 4068 return CreateResult::NOT_CREATED; 4069 } 4070 4071 if (!NeedSnapshotsInFirstStageMount()) { 4072 return CreateResult::NOT_CREATED; 4073 } 4074 4075 auto slot_suffix = device_->GetOtherSlotSuffix(); 4076 auto slot_number = SlotNumberForSlotSuffix(slot_suffix); 4077 auto super_path = device_->GetSuperDevice(slot_number); 4078 if (!CreateLogicalAndSnapshotPartitions(super_path, 20s)) { 4079 LOG(ERROR) << "Unable to map partitions."; 4080 return CreateResult::ERROR; 4081 } 4082 return CreateResult::CREATED; 4083 } 4084 UpdateForwardMergeIndicator(bool wipe)4085 bool SnapshotManager::UpdateForwardMergeIndicator(bool wipe) { 4086 auto path = GetForwardMergeIndicatorPath(); 4087 4088 if (!wipe) { 4089 LOG(INFO) << "Wipe is not scheduled. Deleting forward merge indicator."; 4090 return RemoveFileIfExists(path); 4091 } 4092 4093 // TODO(b/152094219): Don't forward merge if no CoW file is allocated. 4094 4095 LOG(INFO) << "Wipe will be scheduled. Allowing forward merge of snapshots."; 4096 if (!android::base::WriteStringToFile("1", path)) { 4097 PLOG(ERROR) << "Unable to write forward merge indicator: " << path; 4098 return false; 4099 } 4100 4101 return true; 4102 } 4103 GetSnapshotMergeStatsInstance()4104 ISnapshotMergeStats* SnapshotManager::GetSnapshotMergeStatsInstance() { 4105 return SnapshotMergeStats::GetInstance(*this); 4106 } 4107 4108 // This is only to be used in recovery or normal Android (not first-stage init). 4109 // We don't guarantee dm paths are available in first-stage init, because ueventd 4110 // isn't running yet. GetMappedImageDevicePath(const std::string & device_name,std::string * device_path)4111 bool SnapshotManager::GetMappedImageDevicePath(const std::string& device_name, 4112 std::string* device_path) { 4113 // Try getting the device string if it is a device mapper device. 4114 if (dm_.GetState(device_name) != DmDeviceState::INVALID) { 4115 return dm_.GetDmDevicePathByName(device_name, device_path); 4116 } 4117 4118 // Otherwise, get path from IImageManager. 4119 return images_->GetMappedImageDevice(device_name, device_path); 4120 } 4121 GetMappedImageDeviceStringOrPath(const std::string & device_name,std::string * device_string_or_mapped_path)4122 bool SnapshotManager::GetMappedImageDeviceStringOrPath(const std::string& device_name, 4123 std::string* device_string_or_mapped_path) { 4124 // Try getting the device string if it is a device mapper device. 4125 if (dm_.GetState(device_name) != DmDeviceState::INVALID) { 4126 return dm_.GetDeviceString(device_name, device_string_or_mapped_path); 4127 } 4128 4129 // Otherwise, get path from IImageManager. 4130 if (!images_->GetMappedImageDevice(device_name, device_string_or_mapped_path)) { 4131 return false; 4132 } 4133 4134 LOG(WARNING) << "Calling GetMappedImageDevice with local image manager; device " 4135 << (device_string_or_mapped_path ? *device_string_or_mapped_path : "(nullptr)") 4136 << "may not be available in first stage init! "; 4137 return true; 4138 } 4139 WaitForDevice(const std::string & device,std::chrono::milliseconds timeout_ms)4140 bool SnapshotManager::WaitForDevice(const std::string& device, 4141 std::chrono::milliseconds timeout_ms) { 4142 if (!android::base::StartsWith(device, "/")) { 4143 return true; 4144 } 4145 4146 // In first-stage init, we rely on init setting a callback which can 4147 // regenerate uevents and populate /dev for us. 4148 if (uevent_regen_callback_) { 4149 if (!uevent_regen_callback_(device)) { 4150 LOG(ERROR) << "Failed to find device after regenerating uevents: " << device; 4151 return false; 4152 } 4153 return true; 4154 } 4155 4156 // Otherwise, the only kind of device we need to wait for is a dm-user 4157 // misc device. Normal calls to DeviceMapper::CreateDevice() guarantee 4158 // the path has been created. 4159 if (!android::base::StartsWith(device, "/dev/dm-user/")) { 4160 return true; 4161 } 4162 4163 if (timeout_ms.count() == 0) { 4164 LOG(ERROR) << "No timeout was specified to wait for device: " << device; 4165 return false; 4166 } 4167 if (!android::fs_mgr::WaitForFile(device, timeout_ms)) { 4168 LOG(ERROR) << "Timed out waiting for device to appear: " << device; 4169 return false; 4170 } 4171 return true; 4172 } 4173 IsSnapuserdRequired()4174 bool SnapshotManager::IsSnapuserdRequired() { 4175 auto lock = LockExclusive(); 4176 if (!lock) return false; 4177 4178 auto status = ReadSnapshotUpdateStatus(lock.get()); 4179 return status.state() != UpdateState::None && status.using_snapuserd(); 4180 } 4181 PrepareSnapuserdArgsForSelinux(std::vector<std::string> * snapuserd_argv)4182 bool SnapshotManager::PrepareSnapuserdArgsForSelinux(std::vector<std::string>* snapuserd_argv) { 4183 return PerformInitTransition(InitTransition::SELINUX_DETACH, snapuserd_argv); 4184 } 4185 DetachFirstStageSnapuserdForSelinux()4186 bool SnapshotManager::DetachFirstStageSnapuserdForSelinux() { 4187 LOG(INFO) << "Detaching first stage snapuserd"; 4188 4189 auto lock = LockExclusive(); 4190 if (!lock) return false; 4191 4192 std::vector<std::string> snapshots; 4193 if (!ListSnapshots(lock.get(), &snapshots)) { 4194 LOG(ERROR) << "Failed to list snapshots."; 4195 return false; 4196 } 4197 4198 size_t num_cows = 0; 4199 size_t ok_cows = 0; 4200 for (const auto& snapshot : snapshots) { 4201 std::string user_cow_name = GetDmUserCowName(snapshot, GetSnapshotDriver(lock.get())); 4202 4203 if (dm_.GetState(user_cow_name) == DmDeviceState::INVALID) { 4204 continue; 4205 } 4206 4207 DeviceMapper::TargetInfo target; 4208 if (!GetSingleTarget(user_cow_name, TableQuery::Table, &target)) { 4209 continue; 4210 } 4211 4212 auto target_type = DeviceMapper::GetTargetType(target.spec); 4213 if (target_type != "user") { 4214 LOG(ERROR) << "Unexpected target type for " << user_cow_name << ": " << target_type; 4215 continue; 4216 } 4217 4218 num_cows++; 4219 auto misc_name = user_cow_name; 4220 4221 DmTable table; 4222 table.Emplace<DmTargetUser>(0, target.spec.length, misc_name); 4223 if (!dm_.LoadTableAndActivate(user_cow_name, table)) { 4224 LOG(ERROR) << "Unable to swap tables for " << misc_name; 4225 continue; 4226 } 4227 4228 // Wait for ueventd to acknowledge and create the control device node. 4229 std::string control_device = "/dev/dm-user/" + misc_name; 4230 if (!WaitForDevice(control_device, 10s)) { 4231 LOG(ERROR) << "dm-user control device no found: " << misc_name; 4232 continue; 4233 } 4234 4235 ok_cows++; 4236 LOG(INFO) << "control device is ready: " << control_device; 4237 } 4238 4239 if (ok_cows != num_cows) { 4240 LOG(ERROR) << "Could not transition all snapuserd consumers."; 4241 return false; 4242 } 4243 4244 return true; 4245 } 4246 PerformSecondStageInitTransition()4247 bool SnapshotManager::PerformSecondStageInitTransition() { 4248 return PerformInitTransition(InitTransition::SECOND_STAGE); 4249 } 4250 ReadOldPartitionMetadata(LockedFile * lock)4251 const LpMetadata* SnapshotManager::ReadOldPartitionMetadata(LockedFile* lock) { 4252 CHECK(lock); 4253 4254 if (!old_partition_metadata_) { 4255 auto path = GetOldPartitionMetadataPath(); 4256 old_partition_metadata_ = android::fs_mgr::ReadFromImageFile(path); 4257 if (!old_partition_metadata_) { 4258 LOG(ERROR) << "Could not read old partition metadata from " << path; 4259 return nullptr; 4260 } 4261 } 4262 return old_partition_metadata_.get(); 4263 } 4264 DecideMergePhase(const SnapshotStatus & status)4265 MergePhase SnapshotManager::DecideMergePhase(const SnapshotStatus& status) { 4266 if (status.using_snapuserd() && status.device_size() < status.old_partition_size()) { 4267 return MergePhase::FIRST_PHASE; 4268 } 4269 return MergePhase::SECOND_PHASE; 4270 } 4271 UpdateCowStats(ISnapshotMergeStats * stats)4272 void SnapshotManager::UpdateCowStats(ISnapshotMergeStats* stats) { 4273 auto lock = LockExclusive(); 4274 if (!lock) return; 4275 4276 std::vector<std::string> snapshots; 4277 if (!ListSnapshots(lock.get(), &snapshots, GetSnapshotSlotSuffix())) { 4278 LOG(ERROR) << "Could not list snapshots"; 4279 return; 4280 } 4281 4282 uint64_t cow_file_size = 0; 4283 uint64_t total_cow_size = 0; 4284 uint64_t estimated_cow_size = 0; 4285 for (const auto& snapshot : snapshots) { 4286 SnapshotStatus status; 4287 if (!ReadSnapshotStatus(lock.get(), snapshot, &status)) { 4288 return; 4289 } 4290 4291 cow_file_size += status.cow_file_size(); 4292 total_cow_size += status.cow_file_size() + status.cow_partition_size(); 4293 estimated_cow_size += status.estimated_cow_size(); 4294 } 4295 4296 stats->report()->set_cow_file_size(cow_file_size); 4297 stats->report()->set_total_cow_size_bytes(total_cow_size); 4298 stats->report()->set_estimated_cow_size_bytes(estimated_cow_size); 4299 } 4300 SetMergeStatsFeatures(ISnapshotMergeStats * stats)4301 void SnapshotManager::SetMergeStatsFeatures(ISnapshotMergeStats* stats) { 4302 auto lock = LockExclusive(); 4303 if (!lock) return; 4304 4305 SnapshotUpdateStatus update_status = ReadSnapshotUpdateStatus(lock.get()); 4306 stats->report()->set_iouring_used(update_status.io_uring_enabled()); 4307 stats->report()->set_userspace_snapshots_used(update_status.userspace_snapshots()); 4308 stats->report()->set_xor_compression_used(GetXorCompressionEnabledProperty()); 4309 } 4310 DeleteDeviceIfExists(const std::string & name,const std::chrono::milliseconds & timeout_ms)4311 bool SnapshotManager::DeleteDeviceIfExists(const std::string& name, 4312 const std::chrono::milliseconds& timeout_ms) { 4313 auto start = std::chrono::steady_clock::now(); 4314 while (true) { 4315 if (dm_.DeleteDeviceIfExists(name)) { 4316 return true; 4317 } 4318 auto now = std::chrono::steady_clock::now(); 4319 auto elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(now - start); 4320 if (elapsed >= timeout_ms) { 4321 break; 4322 } 4323 std::this_thread::sleep_for(400ms); 4324 } 4325 4326 // Try to diagnose why this failed. First get the actual device path. 4327 std::string full_path; 4328 if (!dm_.GetDmDevicePathByName(name, &full_path)) { 4329 LOG(ERROR) << "Unable to diagnose DM_DEV_REMOVE failure."; 4330 return false; 4331 } 4332 4333 // Check for child dm-devices. 4334 std::string block_name = android::base::Basename(full_path); 4335 std::string sysfs_holders = "/sys/class/block/" + block_name + "/holders"; 4336 4337 std::error_code ec; 4338 std::filesystem::directory_iterator dir_iter(sysfs_holders, ec); 4339 if (auto begin = std::filesystem::begin(dir_iter); begin != std::filesystem::end(dir_iter)) { 4340 LOG(ERROR) << "Child device-mapper device still mapped: " << begin->path(); 4341 return false; 4342 } 4343 4344 // Check for mounted partitions. 4345 android::fs_mgr::Fstab fstab; 4346 android::fs_mgr::ReadFstabFromFile("/proc/mounts", &fstab); 4347 for (const auto& entry : fstab) { 4348 if (android::base::Basename(entry.blk_device) == block_name) { 4349 LOG(ERROR) << "Partition still mounted: " << entry.mount_point; 4350 return false; 4351 } 4352 } 4353 4354 // Check for detached mounted partitions. 4355 for (const auto& fs : std::filesystem::directory_iterator("/sys/fs", ec)) { 4356 std::string fs_type = android::base::Basename(fs.path().c_str()); 4357 if (!(fs_type == "ext4" || fs_type == "f2fs")) { 4358 continue; 4359 } 4360 4361 std::string path = fs.path().c_str() + "/"s + block_name; 4362 if (access(path.c_str(), F_OK) == 0) { 4363 LOG(ERROR) << "Block device was lazily unmounted and is still in-use: " << full_path 4364 << "; possibly open file descriptor or attached loop device."; 4365 return false; 4366 } 4367 } 4368 4369 LOG(ERROR) << "Device-mapper device " << name << "(" << full_path << ")" 4370 << " still in use." 4371 << " Probably a file descriptor was leaked or held open, or a loop device is" 4372 << " attached."; 4373 return false; 4374 } 4375 ReadMergeFailureCode()4376 MergeFailureCode SnapshotManager::ReadMergeFailureCode() { 4377 auto lock = LockExclusive(); 4378 if (!lock) return MergeFailureCode::AcquireLock; 4379 4380 SnapshotUpdateStatus status = ReadSnapshotUpdateStatus(lock.get()); 4381 if (status.state() != UpdateState::MergeFailed) { 4382 return MergeFailureCode::Ok; 4383 } 4384 return status.merge_failure_code(); 4385 } 4386 ReadSourceBuildFingerprint()4387 std::string SnapshotManager::ReadSourceBuildFingerprint() { 4388 auto lock = LockExclusive(); 4389 if (!lock) return {}; 4390 4391 SnapshotUpdateStatus status = ReadSnapshotUpdateStatus(lock.get()); 4392 return status.source_build_fingerprint(); 4393 } 4394 IsUserspaceSnapshotUpdateInProgress()4395 bool SnapshotManager::IsUserspaceSnapshotUpdateInProgress() { 4396 auto slot = GetCurrentSlot(); 4397 if (slot == Slot::Target) { 4398 if (IsSnapuserdRequired()) { 4399 return true; 4400 } 4401 } 4402 4403 return false; 4404 } 4405 4406 } // namespace snapshot 4407 } // namespace android 4408